From a9322f6488b432ddc1e89be88242c827c633fb63 Mon Sep 17 00:00:00 2001 From: Stefan Assmann Date: Wed, 11 Jun 2008 16:35:14 +0200 Subject: x86, pci: introduce pci=noioapicquirk kernel cmdline option Introduce pci=noioapicquirk kernel cmdline option to disable all boot interrupt quirks Signed-off-by: Stefan Assmann Signed-off-by: Olaf Dabrunz Signed-off-by: Ingo Molnar --- arch/x86/pci/common.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c index 940185ecaeda..bc6a101ed7ec 100644 --- a/arch/x86/pci/common.c +++ b/arch/x86/pci/common.c @@ -22,6 +22,7 @@ unsigned int pci_probe = PCI_PROBE_BIOS | PCI_PROBE_CONF1 | PCI_PROBE_CONF2 | static int pci_bf_sort; int pci_routeirq; +int noioapicquirk; int pcibios_last_bus = -1; unsigned long pirq_table_addr; struct pci_bus *pci_root_bus; @@ -495,6 +496,9 @@ char * __devinit pcibios_setup(char *str) } else if (!strcmp(str, "skip_isa_align")) { pci_probe |= PCI_CAN_SKIP_ISA_ALIGN; return NULL; + } else if (!strcmp(str, "noioapicquirk")) { + noioapicquirk = 1; + return NULL; } return str; } -- cgit v1.2.3 From 9197979b518573999d52d9e85bce1680682ed85c Mon Sep 17 00:00:00 2001 From: Stefan Assmann Date: Wed, 11 Jun 2008 16:35:15 +0200 Subject: x86, pci: introduce pci=ioapicreroute kernel cmdline option Introduce pci=ioapicreroute kernel cmdline option to enable rerouting of boot interrupts to the primary io-apic. 
Signed-off-by: Stefan Assmann Signed-off-by: Olaf Dabrunz Signed-off-by: Ingo Molnar --- Documentation/kernel-parameters.txt | 4 ++++ arch/x86/pci/common.c | 5 +++++ include/asm-x86/io_apic.h | 4 ++++ include/asm-x86/pci.h | 1 + 4 files changed, 14 insertions(+) (limited to 'arch/x86') diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 1aebe9dffbaa..df262b3c3d6e 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -1521,6 +1521,10 @@ and is between 256 and 4096 characters. It is defined in the file noioapicquirk [APIC] Disable all boot interrupt quirks. Safety option to keep boot IRQs enabled. This should never be necessary. + ioapicreroute [APIC] Enable rerouting of boot IRQs to the + primary IO-APIC for bridges that cannot disable + boot IRQs. This fixes a source of spurious IRQs + when the system masks IRQs. biosirq [X86-32] Use PCI BIOS calls to get the interrupt routing table. These calls are known to be buggy on several machines and they hang the machine diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c index bc6a101ed7ec..0a9eaa736d94 100644 --- a/arch/x86/pci/common.c +++ b/arch/x86/pci/common.c @@ -23,6 +23,7 @@ unsigned int pci_probe = PCI_PROBE_BIOS | PCI_PROBE_CONF1 | PCI_PROBE_CONF2 | static int pci_bf_sort; int pci_routeirq; int noioapicquirk; +int noioapicreroute = 1; int pcibios_last_bus = -1; unsigned long pirq_table_addr; struct pci_bus *pci_root_bus; @@ -499,6 +500,10 @@ char * __devinit pcibios_setup(char *str) } else if (!strcmp(str, "noioapicquirk")) { noioapicquirk = 1; return NULL; + } else if (!strcmp(str, "ioapicreroute")) { + if (noioapicreroute != -1) + noioapicreroute = 0; + return NULL; } return str; } diff --git a/include/asm-x86/io_apic.h b/include/asm-x86/io_apic.h index 8ca0110819f4..a39670ae17df 100644 --- a/include/asm-x86/io_apic.h +++ b/include/asm-x86/io_apic.h @@ -160,12 +160,16 @@ extern int skip_ioapic_setup; /* 1 if "noapic" boot 
option passed */ extern int noioapicquirk; +/* -1 if "noapic" boot option passed */ +extern int noioapicreroute; + /* 1 if the timer IRQ uses the '8259A Virtual Wire' mode */ extern int timer_through_8259; static inline void disable_ioapic_setup(void) { noioapicquirk = 1; + noioapicreroute = -1; skip_ioapic_setup = 1; } diff --git a/include/asm-x86/pci.h b/include/asm-x86/pci.h index 30eec93a845e..52a29f7668ef 100644 --- a/include/asm-x86/pci.h +++ b/include/asm-x86/pci.h @@ -20,6 +20,7 @@ struct pci_sysdata { extern int pci_routeirq; extern int noioapicquirk; +extern int ioapicreroute; /* scan a bus after allocating a pci_sysdata for it */ extern struct pci_bus *pci_scan_bus_on_node(int busno, struct pci_ops *ops, -- cgit v1.2.3 From 41b9eb264c8407655db57b60b4457fe1b2ec9977 Mon Sep 17 00:00:00 2001 From: Stefan Assmann Date: Tue, 15 Jul 2008 13:48:55 +0200 Subject: x86, pci: introduce config option for pci reroute quirks (was: [PATCH 0/3] Boot IRQ quirks for Broadcom and AMD/ATI) This is against linux-2.6-tip, branch pci-ioapic-boot-irq-quirks. From: Stefan Assmann Subject: Introduce config option for pci reroute quirks The config option X86_REROUTE_FOR_BROKEN_BOOT_IRQS is introduced to enable (or disable) the redirection of the interrupt handler to the boot interrupt line by default. Depending on the existence of interrupt masking / threaded interrupt handling in the kernel (vanilla, rt, ...) and the maturity of the rerouting patch, users can enable or disable the redirection by default. This means that the reroute quirk can be applied to any kernel without changing it. Interrupt sharing could be increased if this option is enabled. However this option is vital for threaded interrupt handling, as done by the RT kernel. It should simplify the consolidation with the RT kernel. The option can be overridden by either pci=ioapicreroute or pci=noioapicreroute. 
Signed-off-by: Stefan Assmann Signed-off-by: Olaf Dabrunz Cc: Jesse Barnes Cc: Jon Masters Cc: Ihno Krumreich Cc: Sven Dietrich Cc: Daniel Gollub Cc: Felix Foerster Signed-off-by: Ingo Molnar --- Documentation/kernel-parameters.txt | 4 ++++ arch/x86/Kconfig | 24 ++++++++++++++++++++++++ arch/x86/pci/common.c | 8 ++++++++ drivers/pci/quirks.c | 2 +- include/asm-x86/pci.h | 2 +- 5 files changed, 38 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index f5662b7a34d1..62b6e8067a5b 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -1536,6 +1536,10 @@ and is between 256 and 4096 characters. It is defined in the file primary IO-APIC for bridges that cannot disable boot IRQs. This fixes a source of spurious IRQs when the system masks IRQs. + noioapicreroute [APIC] Disable workaround that uses the + boot IRQ equivalent of an IRQ that connects to + a chipset where boot IRQs cannot be disabled. + The opposite of ioapicreroute. biosirq [X86-32] Use PCI BIOS calls to get the interrupt routing table. These calls are known to be buggy on several machines and they hang the machine diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 96e0c2ebc388..09521332636b 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -665,6 +665,30 @@ config X86_VISWS_APIC def_bool y depends on X86_32 && X86_VISWS +config X86_REROUTE_FOR_BROKEN_BOOT_IRQS + bool "Reroute for broken boot IRQs" + default n + depends on X86_IO_APIC + help + This option enables a workaround that fixes a source of + spurious interrupts. This is recommended when threaded + interrupt handling is used on systems where the generation of + superfluous "boot interrupts" cannot be disabled. + + Some chipsets generate a legacy INTx "boot IRQ" when the IRQ + entry in the chipset's IO-APIC is masked (as, e.g. the RT + kernel does during interrupt handling). 
On chipsets where this + boot IRQ generation cannot be disabled, this workaround keeps + the original IRQ line masked so that only the equivalent "boot + IRQ" is delivered to the CPUs. The workaround also tells the + kernel to set up the IRQ handler on the boot IRQ line. In this + way only one interrupt is delivered to the kernel. Otherwise + the spurious second interrupt may cause the kernel to bring + down (vital) interrupt lines. + + Only affects "broken" chipsets. Interrupt sharing may be + increased on these systems. + config X86_MCE bool "Machine Check Exception" depends on !X86_VOYAGER diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c index 1485a26ddcef..bb1a01f089e2 100644 --- a/arch/x86/pci/common.c +++ b/arch/x86/pci/common.c @@ -24,7 +24,11 @@ unsigned int pci_early_dump_regs; static int pci_bf_sort; int pci_routeirq; int noioapicquirk; +#ifdef CONFIG_X86_REROUTE_FOR_BROKEN_BOOT_IRQS +int noioapicreroute = 0; +#else int noioapicreroute = 1; +#endif int pcibios_last_bus = -1; unsigned long pirq_table_addr; struct pci_bus *pci_root_bus; @@ -528,6 +532,10 @@ char * __devinit pcibios_setup(char *str) if (noioapicreroute != -1) noioapicreroute = 0; return NULL; + } else if (!strcmp(str, "noioapicreroute")) { + if (noioapicreroute != -1) + noioapicreroute = 1; + return NULL; } return str; } diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c index 0911b0c60b64..c880dd0bbfb5 100644 --- a/drivers/pci/quirks.c +++ b/drivers/pci/quirks.c @@ -1397,7 +1397,7 @@ DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 0x260b, quirk_intel_pcie_pm); */ static void quirk_reroute_to_boot_interrupts_intel(struct pci_dev *dev) { - if (noioapicquirk) + if (noioapicquirk || noioapicreroute) return; dev->irq_reroute_variant = INTEL_IRQ_REROUTE_VARIANT; diff --git a/include/asm-x86/pci.h b/include/asm-x86/pci.h index 52a29f7668ef..9584d6d5eb93 100644 --- a/include/asm-x86/pci.h +++ b/include/asm-x86/pci.h @@ -20,7 +20,7 @@ struct pci_sysdata { extern int pci_routeirq; extern 
int noioapicquirk; -extern int ioapicreroute; +extern int noioapicreroute; /* scan a bus after allocating a pci_sysdata for it */ extern struct pci_bus *pci_scan_bus_on_node(int busno, struct pci_ops *ops, -- cgit v1.2.3 From b0f209898f1a177bd503d49215b8c6628797a81c Mon Sep 17 00:00:00 2001 From: Russ Anderson Date: Tue, 21 Oct 2008 14:09:51 -0500 Subject: x86, uv: use consistent names for region size and coherence id on x86 and ia64 Use consistent names for region size and coherence id on x86 and ia64. The SGI xp drivers are used on both ia64 and x86. Using the same names (sn_coherency_id, sn_region_size) simplifies the driver code. Signed-off-by: Russ Anderson Signed-off-by: Ingo Molnar --- arch/x86/kernel/bios_uv.c | 8 ++++---- arch/x86/kernel/genx2apic_uv_x.c | 4 ++-- include/asm-x86/uv/bios.h | 6 +++--- 3 files changed, 9 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/bios_uv.c b/arch/x86/kernel/bios_uv.c index f0dfe6f17e7e..7cefb7170e75 100644 --- a/arch/x86/kernel/bios_uv.c +++ b/arch/x86/kernel/bios_uv.c @@ -69,10 +69,10 @@ s64 uv_bios_call_reentrant(enum uv_bios_cmd which, u64 a1, u64 a2, u64 a3, long sn_partition_id; EXPORT_SYMBOL_GPL(sn_partition_id); -long uv_coherency_id; -EXPORT_SYMBOL_GPL(uv_coherency_id); -long uv_region_size; -EXPORT_SYMBOL_GPL(uv_region_size); +long sn_coherency_id; +EXPORT_SYMBOL_GPL(sn_coherency_id); +long sn_region_size; +EXPORT_SYMBOL_GPL(sn_region_size); int uv_type; diff --git a/arch/x86/kernel/genx2apic_uv_x.c b/arch/x86/kernel/genx2apic_uv_x.c index bfd532843df6..6cf35c8bd636 100644 --- a/arch/x86/kernel/genx2apic_uv_x.c +++ b/arch/x86/kernel/genx2apic_uv_x.c @@ -429,7 +429,7 @@ void __init uv_system_init(void) uv_bios_init(); uv_bios_get_sn_info(0, &uv_type, &sn_partition_id, - &uv_coherency_id, &uv_region_size); + &sn_coherency_id, &sn_region_size); uv_rtc_init(); for_each_present_cpu(cpu) { @@ -451,7 +451,7 @@ void __init uv_system_init(void) uv_cpu_hub_info(cpu)->gpa_mask = (1 <<
(m_val + n_val)) - 1; uv_cpu_hub_info(cpu)->gnode_upper = gnode_upper; uv_cpu_hub_info(cpu)->global_mmr_base = mmr_base; - uv_cpu_hub_info(cpu)->coherency_domain_number = uv_coherency_id; + uv_cpu_hub_info(cpu)->coherency_domain_number = sn_coherency_id; uv_node_to_blade[nid] = blade; uv_cpu_to_blade[cpu] = blade; max_pnode = max(pnode, max_pnode); diff --git a/include/asm-x86/uv/bios.h b/include/asm-x86/uv/bios.h index 215f1969c266..7b3d7022c639 100644 --- a/include/asm-x86/uv/bios.h +++ b/include/asm-x86/uv/bios.h @@ -85,9 +85,9 @@ extern void uv_bios_init(void); extern int uv_type; extern long sn_partition_id; -extern long uv_coherency_id; -extern long uv_region_size; -#define partition_coherence_id() (uv_coherency_id) +extern long sn_coherency_id; +extern long sn_region_size; +#define partition_coherence_id() (sn_coherency_id) extern struct kobject *sgi_uv_kobj; /* /sys/firmware/sgi_uv */ -- cgit v1.2.3 From 9e899816d126cc6f7d405c349f65363214fe7399 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Wed, 22 Oct 2008 12:33:16 +0200 Subject: x86, mm: enable GBPAGES option by default DIRECT_GBPAGES was under DEBUG_KERNEL && EXPERIMENTAL and disabled by default. Turn it on by default and put it under EMBEDDED. Signed-off-by: Nick Piggin Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 9 +++++++++ arch/x86/Kconfig.debug | 12 ------------ 2 files changed, 9 insertions(+), 12 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 5b9b12321ad1..c00aefcb47d5 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -946,6 +946,15 @@ config X86_PAE config ARCH_PHYS_ADDR_T_64BIT def_bool X86_64 || X86_PAE +config DIRECT_GBPAGES + bool "Enable 1GB pages for kernel pagetables" if EMBEDDED + default y + depends on X86_64 + help + Allow the kernel linear mapping to use 1GB pages on CPUs that + support it. This can improve the kernel's performance a tiny bit by + reducing TLB pressure. If in doubt, say "Y". 
+ # Common NUMA Features config NUMA bool "Numa Memory Allocation and Scheduler Support (EXPERIMENTAL)" diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug index 2a3dfbd5e677..567fe543e09c 100644 --- a/arch/x86/Kconfig.debug +++ b/arch/x86/Kconfig.debug @@ -114,18 +114,6 @@ config DEBUG_RODATA data. This is recommended so that we can catch kernel bugs sooner. If in doubt, say "Y". -config DIRECT_GBPAGES - bool "Enable gbpages-mapped kernel pagetables" - depends on DEBUG_KERNEL && EXPERIMENTAL && X86_64 - help - Enable gigabyte pages support (if the CPU supports it). This can - improve the kernel's performance a tiny bit by reducing TLB - pressure. - - This is experimental code. - - If in doubt, say "N". - config DEBUG_RODATA_TEST bool "Testcase for the DEBUG_RODATA feature" depends on DEBUG_RODATA -- cgit v1.2.3 From 63fb70859f987f2b3b8028fa467fd63336315e9c Mon Sep 17 00:00:00 2001 From: Sitsofe Wheeler Date: Sat, 11 Oct 2008 20:27:53 +0100 Subject: x86: change OPTIMIZE_INLINING help to say enabling makes smaller kernels Impact: clarify Kconfig help text The OPTIMIZE_INLINING help currently says "The gcc 4.x series have a rewritten inlining algorithm and disabling this option will generate a smaller kernel there." This contradicts other parts of the help text and my own tests: 5463127 2008-10-11 19:51 vmlinux.no-opt 5456152 2008-10-11 19:56 vmlinux.opt Reword text to say that enabling OPTIMIZE_INLINING will lead to smaller kernels with gcc 4.x or later. Signed-off-by: Sitsofe Wheeler Signed-off-by: Ingo Molnar --- arch/x86/Kconfig.debug | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug index 2a3dfbd5e677..2be1e6b8e18b 100644 --- a/arch/x86/Kconfig.debug +++ b/arch/x86/Kconfig.debug @@ -307,10 +307,10 @@ config OPTIMIZE_INLINING developers have marked 'inline'. 
Doing so takes away freedom from gcc to do what it thinks is best, which is desirable for the gcc 3.x series of compilers. The gcc 4.x series have a rewritten inlining algorithm and - disabling this option will generate a smaller kernel there. Hopefully - this algorithm is so good that allowing gcc4 to make the decision can - become the default in the future, until then this option is there to - test gcc for this. + enabling this option will generate a smaller kernel there. Hopefully + this algorithm is so good that allowing gcc 4.x and above to make the + decision will become the default in the future. Until then this option + is there to test gcc for this. If unsure, say N. -- cgit v1.2.3 From 8479d94e9f6a44b5050cbacf653272a561fbe0d0 Mon Sep 17 00:00:00 2001 From: Mikael Pettersson Date: Mon, 27 Oct 2008 09:30:57 +0100 Subject: x86, signals: remove duplicated register setup code in ia32 signal delivery Impact: cleanup, no functionality changed ia32_setup_rt_frame() has a duplicated code block labelled "Make -mregparm=3 work" for setting up the register parameters to the user-mode signal handler. This is harmless but ugly. Remove the redundant assignments. 
Signed-off-by: Mikael Pettersson Signed-off-by: Ingo Molnar --- arch/x86/ia32/ia32_signal.c | 5 ----- 1 file changed, 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c index 4bc02b23674b..e82ebd652263 100644 --- a/arch/x86/ia32/ia32_signal.c +++ b/arch/x86/ia32/ia32_signal.c @@ -572,11 +572,6 @@ int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, regs->dx = (unsigned long) &frame->info; regs->cx = (unsigned long) &frame->uc; - /* Make -mregparm=3 work */ - regs->ax = sig; - regs->dx = (unsigned long) &frame->info; - regs->cx = (unsigned long) &frame->uc; - loadsegment(ds, __USER32_DS); loadsegment(es, __USER32_DS); -- cgit v1.2.3 From 7f1baa063e2582dd52d83bb31508e9e84468c666 Mon Sep 17 00:00:00 2001 From: Mike Travis Date: Fri, 24 Oct 2008 15:24:29 -0700 Subject: x86/uv: provide a System Activity Indicator driver Impact: start per CPU heartbeat LED timers on SGI UV systems The SGI UV system has no LEDS but uses one of the system controller regs to indicate the online internal state of the cpu. There is a heartbeat bit indicating that the cpu is responding to interrupts, and an idle bit indicating whether the cpu is idle when the heartbeat interrupt occurs. The current period is one second. When a cpu panics, an error code is written by BIOS to this same reg. This patchset provides the following: * x86_64: Add base functionality for writing to the specific SCIR's for each cpu. * heartbeat: Invert "heartbeat" bit to indicate the cpu is "interruptible". If the current thread is the idle thread, then indicate system is "idle". * if hotplug enabled, all bits are set (0xff) when the cpu is disabled. 
Signed-off-by: Mike Travis Signed-off-by: Ingo Molnar --- arch/x86/include/asm/uv/uv_hub.h | 63 +++++++++++++++++++++++- arch/x86/kernel/genx2apic_uv_x.c | 102 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 164 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/uv/uv_hub.h b/arch/x86/include/asm/uv/uv_hub.h index c6ad93e315c8..400776dba9b5 100644 --- a/arch/x86/include/asm/uv/uv_hub.h +++ b/arch/x86/include/asm/uv/uv_hub.h @@ -112,6 +112,16 @@ */ #define UV_MAX_NASID_VALUE (UV_MAX_NUMALINK_NODES * 2) +struct uv_scir_s { + struct timer_list timer; + unsigned long offset; + unsigned long last; + unsigned long idle_on; + unsigned long idle_off; + unsigned char state; + unsigned char enabled; +}; + /* * The following defines attributes of the HUB chip. These attributes are * frequently referenced and are kept in the per-cpu data areas of each cpu. @@ -130,7 +140,9 @@ struct uv_hub_info_s { unsigned char blade_processor_id; unsigned char m_val; unsigned char n_val; + struct uv_scir_s scir; }; + DECLARE_PER_CPU(struct uv_hub_info_s, __uv_hub_info); #define uv_hub_info (&__get_cpu_var(__uv_hub_info)) #define uv_cpu_hub_info(cpu) (&per_cpu(__uv_hub_info, cpu)) @@ -162,6 +174,30 @@ DECLARE_PER_CPU(struct uv_hub_info_s, __uv_hub_info); #define UV_APIC_PNODE_SHIFT 6 +/* Local Bus from cpu's perspective */ +#define LOCAL_BUS_BASE 0x1c00000 +#define LOCAL_BUS_SIZE (4 * 1024 * 1024) + +/* + * System Controller Interface Reg + * + * Note there are NO leds on a UV system. This register is only + * used by the system controller to monitor system-wide operation. + * There are 64 regs per node. With Nahelem cpus (2 cores per node, + * 8 cpus per core, 2 threads per cpu) there are 32 cpu threads on + * a node. 
+ * + * The window is located at top of ACPI MMR space + */ +#define SCIR_WINDOW_COUNT 64 +#define SCIR_LOCAL_MMR_BASE (LOCAL_BUS_BASE + \ + LOCAL_BUS_SIZE - \ + SCIR_WINDOW_COUNT) + +#define SCIR_CPU_HEARTBEAT 0x01 /* timer interrupt */ +#define SCIR_CPU_ACTIVITY 0x02 /* not idle */ +#define SCIR_CPU_HB_INTERVAL (HZ) /* once per second */ + /* * Macros for converting between kernel virtual addresses, socket local physical * addresses, and UV global physical addresses. @@ -276,6 +312,16 @@ static inline void uv_write_local_mmr(unsigned long offset, unsigned long val) *uv_local_mmr_address(offset) = val; } +static inline unsigned char uv_read_local_mmr8(unsigned long offset) +{ + return *((unsigned char *)uv_local_mmr_address(offset)); +} + +static inline void uv_write_local_mmr8(unsigned long offset, unsigned char val) +{ + *((unsigned char *)uv_local_mmr_address(offset)) = val; +} + /* * Structures and definitions for converting between cpu, node, pnode, and blade * numbers. @@ -350,5 +396,20 @@ static inline int uv_num_possible_blades(void) return uv_possible_blades; } -#endif /* _ASM_X86_UV_UV_HUB_H */ +/* Update SCIR state */ +static inline void uv_set_scir_bits(unsigned char value) +{ + if (uv_hub_info->scir.state != value) { + uv_hub_info->scir.state = value; + uv_write_local_mmr8(uv_hub_info->scir.offset, value); + } +} +static inline void uv_set_cpu_scir_bits(int cpu, unsigned char value) +{ + if (uv_cpu_hub_info(cpu)->scir.state != value) { + uv_cpu_hub_info(cpu)->scir.state = value; + uv_write_local_mmr8(uv_cpu_hub_info(cpu)->scir.offset, value); + } +} +#endif /* _ASM_X86_UV_UV_HUB_H */ diff --git a/arch/x86/kernel/genx2apic_uv_x.c b/arch/x86/kernel/genx2apic_uv_x.c index dc6b46961523..84367d84bb10 100644 --- a/arch/x86/kernel/genx2apic_uv_x.c +++ b/arch/x86/kernel/genx2apic_uv_x.c @@ -10,6 +10,7 @@ #include #include +#include #include #include #include @@ -18,6 +19,8 @@ #include #include #include +#include +#include #include #include #include @@ -356,6 
+359,103 @@ static __init void uv_rtc_init(void) sn_rtc_cycles_per_second = ticks_per_sec; } +/* + * percpu heartbeat timer + */ +static void uv_heartbeat(unsigned long ignored) +{ + struct timer_list *timer = &uv_hub_info->scir.timer; + unsigned char bits = uv_hub_info->scir.state; + + /* flip heartbeat bit */ + bits ^= SCIR_CPU_HEARTBEAT; + + /* are we the idle thread? */ + if (current->pid == 0) + bits &= ~SCIR_CPU_ACTIVITY; + else + bits |= SCIR_CPU_ACTIVITY; + + /* update system controller interface reg */ + uv_set_scir_bits(bits); + + /* enable next timer period */ + mod_timer(timer, jiffies + SCIR_CPU_HB_INTERVAL); +} + +static void __cpuinit uv_heartbeat_enable(int cpu) +{ + if (!uv_cpu_hub_info(cpu)->scir.enabled) { + struct timer_list *timer = &uv_cpu_hub_info(cpu)->scir.timer; + + uv_set_cpu_scir_bits(cpu, SCIR_CPU_HEARTBEAT|SCIR_CPU_ACTIVITY); + setup_timer(timer, uv_heartbeat, cpu); + timer->expires = jiffies + SCIR_CPU_HB_INTERVAL; + add_timer_on(timer, cpu); + uv_cpu_hub_info(cpu)->scir.enabled = 1; + } + + /* check boot cpu */ + if (!uv_cpu_hub_info(0)->scir.enabled) + uv_heartbeat_enable(0); +} + +static void __cpuinit uv_heartbeat_disable(int cpu) +{ + if (uv_cpu_hub_info(cpu)->scir.enabled) { + uv_cpu_hub_info(cpu)->scir.enabled = 0; + del_timer(&uv_cpu_hub_info(cpu)->scir.timer); + } + uv_set_cpu_scir_bits(cpu, 0xff); +} + +#ifdef CONFIG_HOTPLUG_CPU +/* + * cpu hotplug notifier + */ +static __cpuinit int uv_scir_cpu_notify(struct notifier_block *self, + unsigned long action, void *hcpu) +{ + long cpu = (long)hcpu; + + switch (action) { + case CPU_ONLINE: + uv_heartbeat_enable(cpu); + break; + case CPU_DOWN_PREPARE: + uv_heartbeat_disable(cpu); + break; + default: + break; + } + return NOTIFY_OK; +} + +static __init void uv_scir_register_cpu_notifier(void) +{ + hotcpu_notifier(uv_scir_cpu_notify, 0); +} + +#else /* !CONFIG_HOTPLUG_CPU */ + +static __init void uv_scir_register_cpu_notifier(void) +{ +} + +static __init int uv_init_heartbeat(void) 
+{ + int cpu; + + if (is_uv_system()) + for_each_online_cpu(cpu) + uv_heartbeat_enable(cpu); + return 0; +} + +late_initcall(uv_init_heartbeat); + +#endif /* !CONFIG_HOTPLUG_CPU */ + /* * Called on each cpu to initialize the per_cpu UV data area. * ZZZ hotplug not supported yet @@ -452,6 +552,7 @@ void __init uv_system_init(void) uv_cpu_hub_info(cpu)->gnode_upper = gnode_upper; uv_cpu_hub_info(cpu)->global_mmr_base = mmr_base; uv_cpu_hub_info(cpu)->coherency_domain_number = sn_coherency_id; + uv_cpu_hub_info(cpu)->scir.offset = SCIR_LOCAL_MMR_BASE + lcpu; uv_node_to_blade[nid] = blade; uv_cpu_to_blade[cpu] = blade; max_pnode = max(pnode, max_pnode); @@ -468,4 +569,5 @@ void __init uv_system_init(void) map_mmioh_high(max_pnode); uv_cpu_init(); + uv_scir_register_cpu_notifier(); } -- cgit v1.2.3 From 709110bd5624094992579f5311541f2e2b7ce58a Mon Sep 17 00:00:00 2001 From: Hiroshi Shimamoto Date: Thu, 23 Oct 2008 17:14:25 -0700 Subject: x86: signal: cosmetic unification of restore_sigcontext() Impact: cleanup Make restore_sigcontext() the same. 
Signed-off-by: Hiroshi Shimamoto Signed-off-by: Ingo Molnar --- arch/x86/kernel/signal_32.c | 22 ++++++++++++++++++++++ arch/x86/kernel/signal_64.c | 15 +++++++++++++++ 2 files changed, 37 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/signal_32.c b/arch/x86/kernel/signal_32.c index d6dd057d0f22..85a0d37cdae9 100644 --- a/arch/x86/kernel/signal_32.c +++ b/arch/x86/kernel/signal_32.c @@ -149,14 +149,36 @@ restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, /* Always make any pending restarted system calls return -EINTR */ current_thread_info()->restart_block.fn = do_no_restart_syscall; +#ifdef CONFIG_X86_32 GET_SEG(gs); COPY_SEG(fs); COPY_SEG(es); COPY_SEG(ds); +#endif /* CONFIG_X86_32 */ + COPY(di); COPY(si); COPY(bp); COPY(sp); COPY(bx); COPY(dx); COPY(cx); COPY(ip); + +#ifdef CONFIG_X86_64 + COPY(r8); + COPY(r9); + COPY(r10); + COPY(r11); + COPY(r12); + COPY(r13); + COPY(r14); + COPY(r15); +#endif /* CONFIG_X86_64 */ + +#ifdef CONFIG_X86_32 COPY_SEG_STRICT(cs); COPY_SEG_STRICT(ss); +#else /* !CONFIG_X86_32 */ + /* Kernel saves and restores only the CS segment register on signals, + * which is the bare minimum needed to allow mixed 32/64-bit code. + * App's signal handler can save/restore other segments if needed. 
*/ + COPY_SEG_STRICT(cs); +#endif /* CONFIG_X86_32 */ err |= __get_user(tmpflags, &sc->flags); regs->flags = (regs->flags & ~FIX_EFLAGS) | (tmpflags & FIX_EFLAGS); diff --git a/arch/x86/kernel/signal_64.c b/arch/x86/kernel/signal_64.c index a5c9627f4db9..9c469da7f9e8 100644 --- a/arch/x86/kernel/signal_64.c +++ b/arch/x86/kernel/signal_64.c @@ -76,8 +76,17 @@ restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, /* Always make any pending restarted system calls return -EINTR */ current_thread_info()->restart_block.fn = do_no_restart_syscall; +#ifdef CONFIG_X86_32 + GET_SEG(gs); + COPY_SEG(fs); + COPY_SEG(es); + COPY_SEG(ds); +#endif /* CONFIG_X86_32 */ + COPY(di); COPY(si); COPY(bp); COPY(sp); COPY(bx); COPY(dx); COPY(cx); COPY(ip); + +#ifdef CONFIG_X86_64 COPY(r8); COPY(r9); COPY(r10); @@ -86,11 +95,17 @@ restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, COPY(r13); COPY(r14); COPY(r15); +#endif /* CONFIG_X86_64 */ +#ifdef CONFIG_X86_32 + COPY_SEG_STRICT(cs); + COPY_SEG_STRICT(ss); +#else /* !CONFIG_X86_32 */ /* Kernel saves and restores only the CS segment register on signals, * which is the bare minimum needed to allow mixed 32/64-bit code. * App's signal handler can save/restore other segments if needed. */ COPY_SEG_STRICT(cs); +#endif /* CONFIG_X86_32 */ err |= __get_user(tmpflags, &sc->flags); regs->flags = (regs->flags & ~FIX_EFLAGS) | (tmpflags & FIX_EFLAGS); -- cgit v1.2.3 From fd4a2030a358b4818646031049d9631bd45b9915 Mon Sep 17 00:00:00 2001 From: Hiroshi Shimamoto Date: Thu, 23 Oct 2008 17:15:28 -0700 Subject: x86: signal_64.c: get_stack() doesn't need entire regs Impact: cleanup get_stack() uses sp only, entire regs is not needed. 
Signed-off-by: Hiroshi Shimamoto Signed-off-by: Ingo Molnar --- arch/x86/kernel/signal_64.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/signal_64.c b/arch/x86/kernel/signal_64.c index 9c469da7f9e8..3d0deb336745 100644 --- a/arch/x86/kernel/signal_64.c +++ b/arch/x86/kernel/signal_64.c @@ -199,12 +199,10 @@ setup_sigcontext(struct sigcontext __user *sc, struct pt_regs *regs, */ static void __user * -get_stack(struct k_sigaction *ka, struct pt_regs *regs, unsigned long size) +get_stack(struct k_sigaction *ka, unsigned long sp, unsigned long size) { - unsigned long sp; - /* Default to using normal stack - redzone*/ - sp = regs->sp - 128; + sp -= 128; /* This is the X/Open sanctioned signal stack switching. */ if (ka->sa.sa_flags & SA_ONSTACK) { @@ -224,14 +222,14 @@ static int __setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, struct task_struct *me = current; if (used_math()) { - fp = get_stack(ka, regs, sig_xstate_size); + fp = get_stack(ka, regs->sp, sig_xstate_size); frame = (void __user *)round_down( (unsigned long)fp - sizeof(struct rt_sigframe), 16) - 8; if (save_i387_xstate(fp) < 0) return -EFAULT; } else - frame = get_stack(ka, regs, sizeof(struct rt_sigframe)) - 8; + frame = get_stack(ka, regs->sp, sizeof(struct rt_sigframe)) - 8; if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame))) return -EFAULT; -- cgit v1.2.3 From c63dfefd48d92b1db3400fe8de4886a519ac3949 Mon Sep 17 00:00:00 2001 From: Dan McGee Date: Thu, 23 Oct 2008 15:44:02 -0500 Subject: x86: remove dead IRQBALANCE code Impact: cleanup CONFIG_IRQBALANCE was removed in commit 8b8e8c1bf; this ifdef was still around. 
Signed-off-by: Dan McGee Signed-off-by: Ingo Molnar --- arch/x86/include/asm/irq.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/irq.h b/arch/x86/include/asm/irq.h index bae0eda95486..28e409fc73f3 100644 --- a/arch/x86/include/asm/irq.h +++ b/arch/x86/include/asm/irq.h @@ -31,10 +31,6 @@ static inline int irq_canonicalize(int irq) # endif #endif -#ifdef CONFIG_IRQBALANCE -extern int irqbalance_disable(char *str); -#endif - #ifdef CONFIG_HOTPLUG_CPU #include extern void fixup_irqs(cpumask_t map); -- cgit v1.2.3 From 04d2aac33eb54fd3084140f2db130530d71e97c6 Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Sun, 5 Oct 2008 11:08:10 -0700 Subject: x86: corruption-check: fix some style issues Impact: cleanup Before moving the code to its own file, fix some style issues in the corruption check code. Signed-off-by: Arjan van de Ven Signed-off-by: Ingo Molnar --- arch/x86/kernel/setup.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 0fa6790c1dd3..4f38e0305b07 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -662,7 +662,7 @@ static void __init setup_bios_corruption_check(void) corruption_check_size = round_up(corruption_check_size, PAGE_SIZE); - while(addr < corruption_check_size && num_scan_areas < MAX_SCAN_AREAS) { + while (addr < corruption_check_size && num_scan_areas < MAX_SCAN_AREAS) { u64 size; addr = find_e820_area_size(addr, &size, PAGE_SIZE); @@ -701,11 +701,11 @@ void check_for_bios_corruption(void) if (!memory_corruption_check) return; - for(i = 0; i < num_scan_areas; i++) { + for (i = 0; i < num_scan_areas; i++) { unsigned long *addr = __va(scan_areas[i].addr); unsigned long size = scan_areas[i].size; - for(; size; addr++, size -= sizeof(unsigned long)) { + for (; size; addr++, size -= sizeof(unsigned long)) { if (!*addr) continue; printk(KERN_ERR "Corrupted low memory at %p
(%lx phys) = %08lx\n", @@ -721,7 +721,8 @@ void check_for_bios_corruption(void) static void periodic_check_for_corruption(unsigned long data) { check_for_bios_corruption(); - mod_timer(&periodic_check_timer, round_jiffies(jiffies + corruption_check_period*HZ)); + mod_timer(&periodic_check_timer, + round_jiffies(jiffies + corruption_check_period*HZ)); } void start_periodic_check_for_corruption(void) -- cgit v1.2.3 From 6784f7d0a5016a397d38be1134e63fc784c1ca8e Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Sun, 5 Oct 2008 11:33:42 -0700 Subject: x86: corruption check: move the corruption checks into their own file Impact: cleanup The corruption check code is rather sizable and it's likely to grow over time when we add checks for more types of corruptions (there are a few candidates in kerneloops.org that I want to add checks for)... so let's move it to its own file Signed-off-by: Arjan van de Ven Signed-off-by: Ingo Molnar --- arch/x86/include/asm/setup.h | 4 ++ arch/x86/kernel/Makefile | 1 + arch/x86/kernel/check.c | 158 +++++++++++++++++++++++++++++++++++++++++++ arch/x86/kernel/setup.c | 152 ----------------------------------------- 4 files changed, 163 insertions(+), 152 deletions(-) create mode 100644 arch/x86/kernel/check.c (limited to 'arch/x86') diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h index f12d37237465..1ed8b2e80727 100644 --- a/arch/x86/include/asm/setup.h +++ b/arch/x86/include/asm/setup.h @@ -8,6 +8,10 @@ /* Interrupt control for vSMPowered x86_64 systems */ void vsmp_init(void); + +void setup_bios_corruption_check(void); + + #ifdef CONFIG_X86_VISWS extern void visws_early_detect(void); extern int is_visws_box(void); diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index d7e5a58ee22f..31fbcaf3df70 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -35,6 +35,7 @@ obj-y += bootflag.o e820.o obj-y += pci-dma.o quirks.o i8237.o topology.o kdebugfs.o obj-y += alternative.o i8253.o
pci-nommu.o obj-y += tsc.o io_delay.o rtc.o +obj-y += check.o obj-$(CONFIG_X86_TRAMPOLINE) += trampoline.o obj-y += process.o diff --git a/arch/x86/kernel/check.c b/arch/x86/kernel/check.c new file mode 100644 index 000000000000..5056703e1b05 --- /dev/null +++ b/arch/x86/kernel/check.c @@ -0,0 +1,158 @@ +#include +#include + +#include +#include + +/* + * Some BIOSes seem to corrupt the low 64k of memory during events + * like suspend/resume and unplugging an HDMI cable. Reserve all + * remaining free memory in that area and fill it with a distinct + * pattern. + */ +#ifdef CONFIG_X86_CHECK_BIOS_CORRUPTION +#define MAX_SCAN_AREAS 8 + +static int __read_mostly memory_corruption_check = -1; + +static unsigned __read_mostly corruption_check_size = 64*1024; +static unsigned __read_mostly corruption_check_period = 60; /* seconds */ + +static struct e820entry scan_areas[MAX_SCAN_AREAS]; +static int num_scan_areas; + + +static int set_corruption_check(char *arg) +{ + char *end; + + memory_corruption_check = simple_strtol(arg, &end, 10); + + return (*end == 0) ? 0 : -EINVAL; +} +early_param("memory_corruption_check", set_corruption_check); + +static int set_corruption_check_period(char *arg) +{ + char *end; + + corruption_check_period = simple_strtoul(arg, &end, 10); + + return (*end == 0) ? 0 : -EINVAL; +} +early_param("memory_corruption_check_period", set_corruption_check_period); + +static int set_corruption_check_size(char *arg) +{ + char *end; + unsigned size; + + size = memparse(arg, &end); + + if (*end == '\0') + corruption_check_size = size; + + return (size == corruption_check_size) ? 
0 : -EINVAL; +} +early_param("memory_corruption_check_size", set_corruption_check_size); + + +void __init setup_bios_corruption_check(void) +{ + u64 addr = PAGE_SIZE; /* assume first page is reserved anyway */ + + if (memory_corruption_check == -1) { + memory_corruption_check = +#ifdef CONFIG_X86_BOOTPARAM_MEMORY_CORRUPTION_CHECK + 1 +#else + 0 +#endif + ; + } + + if (corruption_check_size == 0) + memory_corruption_check = 0; + + if (!memory_corruption_check) + return; + + corruption_check_size = round_up(corruption_check_size, PAGE_SIZE); + + while (addr < corruption_check_size && num_scan_areas < MAX_SCAN_AREAS) { + u64 size; + addr = find_e820_area_size(addr, &size, PAGE_SIZE); + + if (addr == 0) + break; + + if ((addr + size) > corruption_check_size) + size = corruption_check_size - addr; + + if (size == 0) + break; + + e820_update_range(addr, size, E820_RAM, E820_RESERVED); + scan_areas[num_scan_areas].addr = addr; + scan_areas[num_scan_areas].size = size; + num_scan_areas++; + + /* Assume we've already mapped this early memory */ + memset(__va(addr), 0, size); + + addr += size; + } + + printk(KERN_INFO "Scanning %d areas for low memory corruption\n", + num_scan_areas); + update_e820(); +} + +static struct timer_list periodic_check_timer; + +void check_for_bios_corruption(void) +{ + int i; + int corruption = 0; + + if (!memory_corruption_check) + return; + + for (i = 0; i < num_scan_areas; i++) { + unsigned long *addr = __va(scan_areas[i].addr); + unsigned long size = scan_areas[i].size; + + for (; size; addr++, size -= sizeof(unsigned long)) { + if (!*addr) + continue; + printk(KERN_ERR "Corrupted low memory at %p (%lx phys) = %08lx\n", + addr, __pa(addr), *addr); + corruption = 1; + *addr = 0; + } + } + + WARN(corruption, KERN_ERR "Memory corruption detected in low memory\n"); +} + +static void periodic_check_for_corruption(unsigned long data) +{ + check_for_bios_corruption(); + mod_timer(&periodic_check_timer, + round_jiffies(jiffies + 
corruption_check_period*HZ)); +} + +void start_periodic_check_for_corruption(void) +{ + if (!memory_corruption_check || corruption_check_period == 0) + return; + + printk(KERN_INFO "Scanning for low memory corruption every %d seconds\n", + corruption_check_period); + + init_timer(&periodic_check_timer); + periodic_check_timer.function = &periodic_check_for_corruption; + periodic_check_for_corruption(0); +} +#endif + diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 4f38e0305b07..af690aa593a9 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -587,158 +587,6 @@ static struct x86_quirks default_x86_quirks __initdata; struct x86_quirks *x86_quirks __initdata = &default_x86_quirks; -/* - * Some BIOSes seem to corrupt the low 64k of memory during events - * like suspend/resume and unplugging an HDMI cable. Reserve all - * remaining free memory in that area and fill it with a distinct - * pattern. - */ -#ifdef CONFIG_X86_CHECK_BIOS_CORRUPTION -#define MAX_SCAN_AREAS 8 - -static int __read_mostly memory_corruption_check = -1; - -static unsigned __read_mostly corruption_check_size = 64*1024; -static unsigned __read_mostly corruption_check_period = 60; /* seconds */ - -static struct e820entry scan_areas[MAX_SCAN_AREAS]; -static int num_scan_areas; - - -static int set_corruption_check(char *arg) -{ - char *end; - - memory_corruption_check = simple_strtol(arg, &end, 10); - - return (*end == 0) ? 0 : -EINVAL; -} -early_param("memory_corruption_check", set_corruption_check); - -static int set_corruption_check_period(char *arg) -{ - char *end; - - corruption_check_period = simple_strtoul(arg, &end, 10); - - return (*end == 0) ? 0 : -EINVAL; -} -early_param("memory_corruption_check_period", set_corruption_check_period); - -static int set_corruption_check_size(char *arg) -{ - char *end; - unsigned size; - - size = memparse(arg, &end); - - if (*end == '\0') - corruption_check_size = size; - - return (size == corruption_check_size) ? 
0 : -EINVAL; -} -early_param("memory_corruption_check_size", set_corruption_check_size); - - -static void __init setup_bios_corruption_check(void) -{ - u64 addr = PAGE_SIZE; /* assume first page is reserved anyway */ - - if (memory_corruption_check == -1) { - memory_corruption_check = -#ifdef CONFIG_X86_BOOTPARAM_MEMORY_CORRUPTION_CHECK - 1 -#else - 0 -#endif - ; - } - - if (corruption_check_size == 0) - memory_corruption_check = 0; - - if (!memory_corruption_check) - return; - - corruption_check_size = round_up(corruption_check_size, PAGE_SIZE); - - while (addr < corruption_check_size && num_scan_areas < MAX_SCAN_AREAS) { - u64 size; - addr = find_e820_area_size(addr, &size, PAGE_SIZE); - - if (addr == 0) - break; - - if ((addr + size) > corruption_check_size) - size = corruption_check_size - addr; - - if (size == 0) - break; - - e820_update_range(addr, size, E820_RAM, E820_RESERVED); - scan_areas[num_scan_areas].addr = addr; - scan_areas[num_scan_areas].size = size; - num_scan_areas++; - - /* Assume we've already mapped this early memory */ - memset(__va(addr), 0, size); - - addr += size; - } - - printk(KERN_INFO "Scanning %d areas for low memory corruption\n", - num_scan_areas); - update_e820(); -} - -static struct timer_list periodic_check_timer; - -void check_for_bios_corruption(void) -{ - int i; - int corruption = 0; - - if (!memory_corruption_check) - return; - - for (i = 0; i < num_scan_areas; i++) { - unsigned long *addr = __va(scan_areas[i].addr); - unsigned long size = scan_areas[i].size; - - for (; size; addr++, size -= sizeof(unsigned long)) { - if (!*addr) - continue; - printk(KERN_ERR "Corrupted low memory at %p (%lx phys) = %08lx\n", - addr, __pa(addr), *addr); - corruption = 1; - *addr = 0; - } - } - - WARN(corruption, KERN_ERR "Memory corruption detected in low memory\n"); -} - -static void periodic_check_for_corruption(unsigned long data) -{ - check_for_bios_corruption(); - mod_timer(&periodic_check_timer, - round_jiffies(jiffies + 
corruption_check_period*HZ)); -} - -void start_periodic_check_for_corruption(void) -{ - if (!memory_corruption_check || corruption_check_period == 0) - return; - - printk(KERN_INFO "Scanning for low memory corruption every %d seconds\n", - corruption_check_period); - - init_timer(&periodic_check_timer); - periodic_check_timer.function = &periodic_check_for_corruption; - periodic_check_for_corruption(0); -} -#endif - static int __init dmi_low_memory_corruption(const struct dmi_system_id *d) { printk(KERN_NOTICE -- cgit v1.2.3 From 304e629bf4a3150a0bf6556fc45c52c5c082340f Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Sun, 5 Oct 2008 12:09:03 -0700 Subject: x86: corruption check: run the corruption checks from a work queue Impact: change the implementation of the debug feature the periodic corruption checks are better off run from a work queue; there's nothing time critical about them and this way the amount of interrupt-context work is reduced. Signed-off-by: Arjan van de Ven Signed-off-by: Ingo Molnar --- arch/x86/kernel/check.c | 27 +++++++++++++++++---------- arch/x86/mm/init_32.c | 2 -- arch/x86/mm/init_64.c | 2 -- 3 files changed, 17 insertions(+), 14 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/check.c b/arch/x86/kernel/check.c index 5056703e1b05..55eed1752b43 100644 --- a/arch/x86/kernel/check.c +++ b/arch/x86/kernel/check.c @@ -1,6 +1,7 @@ #include #include - +#include +#include #include #include @@ -108,13 +109,14 @@ void __init setup_bios_corruption_check(void) update_e820(); } -static struct timer_list periodic_check_timer; void check_for_bios_corruption(void) { int i; int corruption = 0; + printk("dot\n"); + if (!memory_corruption_check) return; @@ -135,24 +137,29 @@ void check_for_bios_corruption(void) WARN(corruption, KERN_ERR "Memory corruption detected in low memory\n"); } -static void periodic_check_for_corruption(unsigned long data) +static void check_corruption(struct work_struct *dummy); +static 
DECLARE_DELAYED_WORK(bios_check_work, check_corruption); + +static void check_corruption(struct work_struct *dummy) { check_for_bios_corruption(); - mod_timer(&periodic_check_timer, - round_jiffies(jiffies + corruption_check_period*HZ)); + schedule_delayed_work(&bios_check_work, + round_jiffies_relative(corruption_check_period*HZ)); } -void start_periodic_check_for_corruption(void) +static int start_periodic_check_for_corruption(void) { if (!memory_corruption_check || corruption_check_period == 0) - return; + return 0; printk(KERN_INFO "Scanning for low memory corruption every %d seconds\n", corruption_check_period); - init_timer(&periodic_check_timer); - periodic_check_timer.function = &periodic_check_for_corruption; - periodic_check_for_corruption(0); + /* First time we run the checks right away */ + schedule_delayed_work(&bios_check_work, 0); + return 0; } + +module_init(start_periodic_check_for_corruption); #endif diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 8396868e82c5..5e6377560ff1 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -970,8 +970,6 @@ void __init mem_init(void) int codesize, reservedpages, datasize, initsize; int tmp; - start_periodic_check_for_corruption(); - #ifdef CONFIG_FLATMEM BUG_ON(!mem_map); #endif diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index b8e461d49412..d6ef1589b95a 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -879,8 +879,6 @@ void __init mem_init(void) { long codesize, reservedpages, datasize, initsize; - start_periodic_check_for_corruption(); - pci_iommu_alloc(); /* clear_bss() already clear the empty_zero_page */ -- cgit v1.2.3 From b43d196c4d3fe46d6dda7c987c47792612b80b1b Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Sun, 5 Oct 2008 12:21:32 -0700 Subject: x86: corruption-check: some post-move cleanups Impact: cleanup now that the code is moved and converted to a work queue, there's some minor cleanups that can be done. 
Signed-off-by: Arjan van de Ven Signed-off-by: Ingo Molnar --- arch/x86/kernel/Makefile | 3 ++- arch/x86/kernel/check.c | 12 ++++-------- 2 files changed, 6 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 31fbcaf3df70..f63a8034fb82 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -35,7 +35,6 @@ obj-y += bootflag.o e820.o obj-y += pci-dma.o quirks.o i8237.o topology.o kdebugfs.o obj-y += alternative.o i8253.o pci-nommu.o obj-y += tsc.o io_delay.o rtc.o -obj-y += check.o obj-$(CONFIG_X86_TRAMPOLINE) += trampoline.o obj-y += process.o @@ -105,6 +104,8 @@ microcode-$(CONFIG_MICROCODE_INTEL) += microcode_intel.o microcode-$(CONFIG_MICROCODE_AMD) += microcode_amd.o obj-$(CONFIG_MICROCODE) += microcode.o +obj-$(CONFIG_X86_CHECK_BIOS_CORRUPTION) += check.o + ### # 64 bit specific files ifeq ($(CONFIG_X86_64),y) diff --git a/arch/x86/kernel/check.c b/arch/x86/kernel/check.c index 55eed1752b43..2ac0ab71412a 100644 --- a/arch/x86/kernel/check.c +++ b/arch/x86/kernel/check.c @@ -11,7 +11,6 @@ * remaining free memory in that area and fill it with a distinct * pattern. 
*/ -#ifdef CONFIG_X86_CHECK_BIOS_CORRUPTION #define MAX_SCAN_AREAS 8 static int __read_mostly memory_corruption_check = -1; @@ -23,7 +22,7 @@ static struct e820entry scan_areas[MAX_SCAN_AREAS]; static int num_scan_areas; -static int set_corruption_check(char *arg) +static __init int set_corruption_check(char *arg) { char *end; @@ -33,7 +32,7 @@ static int set_corruption_check(char *arg) } early_param("memory_corruption_check", set_corruption_check); -static int set_corruption_check_period(char *arg) +static __init int set_corruption_check_period(char *arg) { char *end; @@ -43,7 +42,7 @@ static int set_corruption_check_period(char *arg) } early_param("memory_corruption_check_period", set_corruption_check_period); -static int set_corruption_check_size(char *arg) +static __init int set_corruption_check_size(char *arg) { char *end; unsigned size; @@ -115,8 +114,6 @@ void check_for_bios_corruption(void) int i; int corruption = 0; - printk("dot\n"); - if (!memory_corruption_check) return; @@ -134,7 +131,7 @@ void check_for_bios_corruption(void) } } - WARN(corruption, KERN_ERR "Memory corruption detected in low memory\n"); + WARN_ONCE(corruption, KERN_ERR "Memory corruption detected in low memory\n"); } static void check_corruption(struct work_struct *dummy); @@ -161,5 +158,4 @@ static int start_periodic_check_for_corruption(void) } module_init(start_periodic_check_for_corruption); -#endif -- cgit v1.2.3 From 69a72a0e9337aad8c730e8e9942d5aa022bc4c5c Mon Sep 17 00:00:00 2001 From: Mike Travis Date: Mon, 27 Oct 2008 07:51:20 -0700 Subject: x86/uv: update SCIR driver to use the idle_cpu() function Impact: cleanup Change UV heartbeat function to use idle_cpu to determine cpu's "idleness". Realign uv_hub definitions. 
Signed-off-by: Mike Travis Signed-off-by: Ingo Molnar --- arch/x86/include/asm/uv/uv_hub.h | 26 +++++++++++++------------- arch/x86/kernel/genx2apic_uv_x.c | 4 ++-- 2 files changed, 15 insertions(+), 15 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/uv/uv_hub.h b/arch/x86/include/asm/uv/uv_hub.h index 400776dba9b5..0ee12928e9ee 100644 --- a/arch/x86/include/asm/uv/uv_hub.h +++ b/arch/x86/include/asm/uv/uv_hub.h @@ -128,19 +128,19 @@ struct uv_scir_s { * They are kept together in a struct to minimize cache misses. */ struct uv_hub_info_s { - unsigned long global_mmr_base; - unsigned long gpa_mask; - unsigned long gnode_upper; - unsigned long lowmem_remap_top; - unsigned long lowmem_remap_base; - unsigned short pnode; - unsigned short pnode_mask; - unsigned short coherency_domain_number; - unsigned short numa_blade_id; - unsigned char blade_processor_id; - unsigned char m_val; - unsigned char n_val; - struct uv_scir_s scir; + unsigned long global_mmr_base; + unsigned long gpa_mask; + unsigned long gnode_upper; + unsigned long lowmem_remap_top; + unsigned long lowmem_remap_base; + unsigned short pnode; + unsigned short pnode_mask; + unsigned short coherency_domain_number; + unsigned short numa_blade_id; + unsigned char blade_processor_id; + unsigned char m_val; + unsigned char n_val; + struct uv_scir_s scir; }; DECLARE_PER_CPU(struct uv_hub_info_s, __uv_hub_info); diff --git a/arch/x86/kernel/genx2apic_uv_x.c b/arch/x86/kernel/genx2apic_uv_x.c index 84367d84bb10..85fb7dd48f67 100644 --- a/arch/x86/kernel/genx2apic_uv_x.c +++ b/arch/x86/kernel/genx2apic_uv_x.c @@ -370,8 +370,8 @@ static void uv_heartbeat(unsigned long ignored) /* flip heartbeat bit */ bits ^= SCIR_CPU_HEARTBEAT; - /* are we the idle thread? */ - if (current->pid == 0) + /* is this cpu idle?
*/ + if (idle_cpu(raw_smp_processor_id())) bits &= ~SCIR_CPU_ACTIVITY; else bits |= SCIR_CPU_ACTIVITY; -- cgit v1.2.3 From 30604bb410b53efa9c93ee8f03d7aa7494094faa Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Tue, 14 Oct 2008 18:59:18 -0700 Subject: x86: break up mtrr_cleanup() into several small functions. Ingo said mtrr_cleanup() is big and ugly. so break it up into more functions and make it more readable. Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/mtrr/main.c | 346 ++++++++++++++++++++-------------------- 1 file changed, 171 insertions(+), 175 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c index c78c04821ea1..1159e269e596 100644 --- a/arch/x86/kernel/cpu/mtrr/main.c +++ b/arch/x86/kernel/cpu/mtrr/main.c @@ -803,6 +803,7 @@ x86_get_mtrr_mem_range(struct res_range *range, int nr_range, } static struct res_range __initdata range[RANGE_NUM]; +static int __initdata nr_range; #ifdef CONFIG_MTRR_SANITIZER @@ -1206,39 +1207,43 @@ struct mtrr_cleanup_result { #define PSHIFT (PAGE_SHIFT - 10) static struct mtrr_cleanup_result __initdata result[NUM_RESULT]; -static struct res_range __initdata range_new[RANGE_NUM]; static unsigned long __initdata min_loss_pfn[RANGE_NUM]; -static int __init mtrr_cleanup(unsigned address_bits) +static void __init print_out_mtrr_range_state(void) { - unsigned long extra_remove_base, extra_remove_size; - unsigned long base, size, def, dummy; - mtrr_type type; - int nr_range, nr_range_new; - u64 chunk_size, gran_size; - unsigned long range_sums, range_sums_new; - int index_good; - int num_reg_good; int i; + char start_factor = 'K', size_factor = 'K'; + unsigned long start_base, size_base; + mtrr_type type; - /* extra one for all 0 */ - int num[MTRR_NUM_TYPES + 1]; + for (i = 0; i < num_var_ranges; i++) { - if (!is_cpu(INTEL) || enable_mtrr_cleanup < 1) - return 0; - rdmsr(MTRRdefType_MSR, def, dummy); - def &= 0xff; - if (def != 
MTRR_TYPE_UNCACHABLE) - return 0; + size_base = range_state[i].size_pfn << (PAGE_SHIFT - 10); + if (!size_base) + continue; - /* get it and store it aside */ - memset(range_state, 0, sizeof(range_state)); - for (i = 0; i < num_var_ranges; i++) { - mtrr_if->get(i, &base, &size, &type); - range_state[i].base_pfn = base; - range_state[i].size_pfn = size; - range_state[i].type = type; + size_base = to_size_factor(size_base, &size_factor), + start_base = range_state[i].base_pfn << (PAGE_SHIFT - 10); + start_base = to_size_factor(start_base, &start_factor), + type = range_state[i].type; + + printk(KERN_DEBUG "reg %d, base: %ld%cB, range: %ld%cB, type %s\n", + i, start_base, start_factor, + size_base, size_factor, + (type == MTRR_TYPE_UNCACHABLE) ? "UC" : + ((type == MTRR_TYPE_WRPROT) ? "WP" : + ((type == MTRR_TYPE_WRBACK) ? "WB" : "Other")) + ); } +} + +static int __init mtrr_need_cleanup(void) +{ + int i; + mtrr_type type; + unsigned long size; + /* extra one for all 0 */ + int num[MTRR_NUM_TYPES + 1]; /* check entries number */ memset(num, 0, sizeof(num)); @@ -1263,29 +1268,133 @@ static int __init mtrr_cleanup(unsigned address_bits) num_var_ranges - num[MTRR_NUM_TYPES]) return 0; - /* print original var MTRRs at first, for debugging: */ - printk(KERN_DEBUG "original variable MTRRs\n"); - for (i = 0; i < num_var_ranges; i++) { - char start_factor = 'K', size_factor = 'K'; - unsigned long start_base, size_base; + return 1; +} - size_base = range_state[i].size_pfn << (PAGE_SHIFT - 10); - if (!size_base) - continue; +static unsigned long __initdata range_sums; +static void __init mtrr_calc_range_state(u64 chunk_size, u64 gran_size, + unsigned long extra_remove_base, + unsigned long extra_remove_size, + int i) +{ + int num_reg; + static struct res_range range_new[RANGE_NUM]; + static int nr_range_new; + unsigned long range_sums_new; + + /* convert ranges to var ranges state */ + num_reg = x86_setup_var_mtrrs(range, nr_range, + chunk_size, gran_size); + + /* we got new 
setting in range_state, check it */ + memset(range_new, 0, sizeof(range_new)); + nr_range_new = x86_get_mtrr_mem_range(range_new, 0, + extra_remove_base, extra_remove_size); + range_sums_new = sum_ranges(range_new, nr_range_new); + + result[i].chunk_sizek = chunk_size >> 10; + result[i].gran_sizek = gran_size >> 10; + result[i].num_reg = num_reg; + if (range_sums < range_sums_new) { + result[i].lose_cover_sizek = + (range_sums_new - range_sums) << PSHIFT; + result[i].bad = 1; + } else + result[i].lose_cover_sizek = + (range_sums - range_sums_new) << PSHIFT; - size_base = to_size_factor(size_base, &size_factor), - start_base = range_state[i].base_pfn << (PAGE_SHIFT - 10); - start_base = to_size_factor(start_base, &start_factor), - type = range_state[i].type; + /* double check it */ + if (!result[i].bad && !result[i].lose_cover_sizek) { + if (nr_range_new != nr_range || + memcmp(range, range_new, sizeof(range))) + result[i].bad = 1; + } - printk(KERN_DEBUG "reg %d, base: %ld%cB, range: %ld%cB, type %s\n", - i, start_base, start_factor, - size_base, size_factor, - (type == MTRR_TYPE_UNCACHABLE) ? "UC" : - ((type == MTRR_TYPE_WRPROT) ? "WP" : - ((type == MTRR_TYPE_WRBACK) ? "WB" : "Other")) - ); + if (!result[i].bad && (range_sums - range_sums_new < + min_loss_pfn[num_reg])) { + min_loss_pfn[num_reg] = + range_sums - range_sums_new; } +} + +static void __init mtrr_print_out_one_result(int i) +{ + char gran_factor, chunk_factor, lose_factor; + unsigned long gran_base, chunk_base, lose_base; + + gran_base = to_size_factor(result[i].gran_sizek, &gran_factor), + chunk_base = to_size_factor(result[i].chunk_sizek, &chunk_factor), + lose_base = to_size_factor(result[i].lose_cover_sizek, &lose_factor), + printk(KERN_INFO "%sgran_size: %ld%c \tchunk_size: %ld%c \t", + result[i].bad ? "*BAD*" : " ", + gran_base, gran_factor, chunk_base, chunk_factor); + printk(KERN_CONT "num_reg: %d \tlose cover RAM: %s%ld%c\n", + result[i].num_reg, result[i].bad ? 
"-" : "", + lose_base, lose_factor); +} + +static int __init mtrr_search_optimal_index(void) +{ + int i; + int num_reg_good; + int index_good; + + if (nr_mtrr_spare_reg >= num_var_ranges) + nr_mtrr_spare_reg = num_var_ranges - 1; + num_reg_good = -1; + for (i = num_var_ranges - nr_mtrr_spare_reg; i > 0; i--) { + if (!min_loss_pfn[i]) + num_reg_good = i; + } + + index_good = -1; + if (num_reg_good != -1) { + for (i = 0; i < NUM_RESULT; i++) { + if (!result[i].bad && + result[i].num_reg == num_reg_good && + !result[i].lose_cover_sizek) { + index_good = i; + break; + } + } + } + + return index_good; +} + + +static int __init mtrr_cleanup(unsigned address_bits) +{ + unsigned long extra_remove_base, extra_remove_size; + unsigned long base, size, def, dummy; + mtrr_type type; + u64 chunk_size, gran_size; + int index_good; + int i; + + if (!is_cpu(INTEL) || enable_mtrr_cleanup < 1) + return 0; + rdmsr(MTRRdefType_MSR, def, dummy); + def &= 0xff; + if (def != MTRR_TYPE_UNCACHABLE) + return 0; + + /* get it and store it aside */ + memset(range_state, 0, sizeof(range_state)); + for (i = 0; i < num_var_ranges; i++) { + mtrr_if->get(i, &base, &size, &type); + range_state[i].base_pfn = base; + range_state[i].size_pfn = size; + range_state[i].type = type; + } + + /* check if we need handle it and can handle it */ + if (!mtrr_need_cleanup()) + return 0; + + /* print original var MTRRs at first, for debugging: */ + printk(KERN_DEBUG "original variable MTRRs\n"); + print_out_mtrr_range_state(); memset(range, 0, sizeof(range)); extra_remove_size = 0; @@ -1309,176 +1418,64 @@ static int __init mtrr_cleanup(unsigned address_bits) range_sums >> (20 - PAGE_SHIFT)); if (mtrr_chunk_size && mtrr_gran_size) { - int num_reg; - char gran_factor, chunk_factor, lose_factor; - unsigned long gran_base, chunk_base, lose_base; - - debug_print++; - /* convert ranges to var ranges state */ - num_reg = x86_setup_var_mtrrs(range, nr_range, mtrr_chunk_size, - mtrr_gran_size); + i = 0; + 
mtrr_calc_range_state(mtrr_chunk_size, mtrr_gran_size, + extra_remove_base, extra_remove_size, i); - /* we got new setting in range_state, check it */ - memset(range_new, 0, sizeof(range_new)); - nr_range_new = x86_get_mtrr_mem_range(range_new, 0, - extra_remove_base, - extra_remove_size); - range_sums_new = sum_ranges(range_new, nr_range_new); + mtrr_print_out_one_result(i); - i = 0; - result[i].chunk_sizek = mtrr_chunk_size >> 10; - result[i].gran_sizek = mtrr_gran_size >> 10; - result[i].num_reg = num_reg; - if (range_sums < range_sums_new) { - result[i].lose_cover_sizek = - (range_sums_new - range_sums) << PSHIFT; - result[i].bad = 1; - } else - result[i].lose_cover_sizek = - (range_sums - range_sums_new) << PSHIFT; - - gran_base = to_size_factor(result[i].gran_sizek, &gran_factor), - chunk_base = to_size_factor(result[i].chunk_sizek, &chunk_factor), - lose_base = to_size_factor(result[i].lose_cover_sizek, &lose_factor), - printk(KERN_INFO "%sgran_size: %ld%c \tchunk_size: %ld%c \t", - result[i].bad?"*BAD*":" ", - gran_base, gran_factor, chunk_base, chunk_factor); - printk(KERN_CONT "num_reg: %d \tlose cover RAM: %s%ld%c\n", - result[i].num_reg, result[i].bad?"-":"", - lose_base, lose_factor); if (!result[i].bad) { set_var_mtrr_all(address_bits); return 1; } printk(KERN_INFO "invalid mtrr_gran_size or mtrr_chunk_size, " "will find optimal one\n"); - debug_print--; - memset(result, 0, sizeof(result[0])); } i = 0; memset(min_loss_pfn, 0xff, sizeof(min_loss_pfn)); memset(result, 0, sizeof(result)); for (gran_size = (1ULL<<16); gran_size < (1ULL<<32); gran_size <<= 1) { - char gran_factor; - unsigned long gran_base; - - if (debug_print) - gran_base = to_size_factor(gran_size >> 10, &gran_factor); for (chunk_size = gran_size; chunk_size < (1ULL<<32); chunk_size <<= 1) { - int num_reg; - if (debug_print) { - char chunk_factor; - unsigned long chunk_base; - - chunk_base = to_size_factor(chunk_size>>10, &chunk_factor), - printk(KERN_INFO "\n"); - printk(KERN_INFO 
"gran_size: %ld%c chunk_size: %ld%c \n", - gran_base, gran_factor, chunk_base, chunk_factor); - } if (i >= NUM_RESULT) continue; - /* convert ranges to var ranges state */ - num_reg = x86_setup_var_mtrrs(range, nr_range, - chunk_size, gran_size); - - /* we got new setting in range_state, check it */ - memset(range_new, 0, sizeof(range_new)); - nr_range_new = x86_get_mtrr_mem_range(range_new, 0, - extra_remove_base, extra_remove_size); - range_sums_new = sum_ranges(range_new, nr_range_new); - - result[i].chunk_sizek = chunk_size >> 10; - result[i].gran_sizek = gran_size >> 10; - result[i].num_reg = num_reg; - if (range_sums < range_sums_new) { - result[i].lose_cover_sizek = - (range_sums_new - range_sums) << PSHIFT; - result[i].bad = 1; - } else - result[i].lose_cover_sizek = - (range_sums - range_sums_new) << PSHIFT; - - /* double check it */ - if (!result[i].bad && !result[i].lose_cover_sizek) { - if (nr_range_new != nr_range || - memcmp(range, range_new, sizeof(range))) - result[i].bad = 1; + mtrr_calc_range_state(chunk_size, gran_size, + extra_remove_base, extra_remove_size, i); + if (debug_print) { + mtrr_print_out_one_result(i); + printk(KERN_INFO "\n"); } - if (!result[i].bad && (range_sums - range_sums_new < - min_loss_pfn[num_reg])) { - min_loss_pfn[num_reg] = - range_sums - range_sums_new; - } i++; } } - /* print out all */ - for (i = 0; i < NUM_RESULT; i++) { - char gran_factor, chunk_factor, lose_factor; - unsigned long gran_base, chunk_base, lose_base; - - gran_base = to_size_factor(result[i].gran_sizek, &gran_factor), - chunk_base = to_size_factor(result[i].chunk_sizek, &chunk_factor), - lose_base = to_size_factor(result[i].lose_cover_sizek, &lose_factor), - printk(KERN_INFO "%sgran_size: %ld%c \tchunk_size: %ld%c \t", - result[i].bad?"*BAD*":" ", - gran_base, gran_factor, chunk_base, chunk_factor); - printk(KERN_CONT "num_reg: %d \tlose cover RAM: %s%ld%c\n", - result[i].num_reg, result[i].bad?"-":"", - lose_base, lose_factor); - } - /* try to find 
the optimal index */ - if (nr_mtrr_spare_reg >= num_var_ranges) - nr_mtrr_spare_reg = num_var_ranges - 1; - num_reg_good = -1; - for (i = num_var_ranges - nr_mtrr_spare_reg; i > 0; i--) { - if (!min_loss_pfn[i]) - num_reg_good = i; - } - - index_good = -1; - if (num_reg_good != -1) { - for (i = 0; i < NUM_RESULT; i++) { - if (!result[i].bad && - result[i].num_reg == num_reg_good && - !result[i].lose_cover_sizek) { - index_good = i; - break; - } - } - } + index_good = mtrr_search_optimal_index(); if (index_good != -1) { - char gran_factor, chunk_factor, lose_factor; - unsigned long gran_base, chunk_base, lose_base; - printk(KERN_INFO "Found optimal setting for mtrr clean up\n"); i = index_good; - gran_base = to_size_factor(result[i].gran_sizek, &gran_factor), - chunk_base = to_size_factor(result[i].chunk_sizek, &chunk_factor), - lose_base = to_size_factor(result[i].lose_cover_sizek, &lose_factor), - printk(KERN_INFO "gran_size: %ld%c \tchunk_size: %ld%c \t", - gran_base, gran_factor, chunk_base, chunk_factor); - printk(KERN_CONT "num_reg: %d \tlose RAM: %ld%c\n", - result[i].num_reg, lose_base, lose_factor); + mtrr_print_out_one_result(i); + /* convert ranges to var ranges state */ chunk_size = result[i].chunk_sizek; chunk_size <<= 10; gran_size = result[i].gran_sizek; gran_size <<= 10; - debug_print++; x86_setup_var_mtrrs(range, nr_range, chunk_size, gran_size); - debug_print--; set_var_mtrr_all(address_bits); + printk(KERN_DEBUG "New variable MTRRs\n"); + print_out_mtrr_range_state(); return 1; + } else { + /* print out all */ + for (i = 0; i < NUM_RESULT; i++) + mtrr_print_out_one_result(i); } printk(KERN_INFO "mtrr_cleanup: can not find optimal value\n"); @@ -1562,7 +1559,6 @@ int __init mtrr_trim_uncached_memory(unsigned long end_pfn) { unsigned long i, base, size, highest_pfn = 0, def, dummy; mtrr_type type; - int nr_range; u64 total_trim_size; /* extra one for all 0 */ -- cgit v1.2.3 From d4f1b10365d4f03dd802433e0014cf503e6e930c Mon Sep 17 00:00:00 2001 From: 
Jike Song Date: Fri, 17 Oct 2008 13:25:07 +0800 Subject: x86: clean up comments wrt. rd{msr|tsc|pmc} The rdmsr instruction (et al.) for i386 and x86-64 is semantically the same. The only difference is how gcc interprets constraint "A" for these targets. Signed-off-by: Jike Song Signed-off-by: Ingo Molnar --- arch/x86/include/asm/msr.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/msr.h b/arch/x86/include/asm/msr.h index 46be2fa7ac26..478a9245aae1 100644 --- a/arch/x86/include/asm/msr.h +++ b/arch/x86/include/asm/msr.h @@ -22,10 +22,10 @@ static inline unsigned long long native_read_tscp(unsigned int *aux) } /* - * i386 calling convention returns 64-bit value in edx:eax, while - * x86_64 returns at rax. Also, the "A" constraint does not really - * mean rdx:rax in x86_64, so we need specialized behaviour for each - * architecture + * both i386 and x86_64 returns 64-bit value in edx:eax, but gcc's "A" + * constraint has different meanings. For i386, "A" means exactly + * edx:eax, while for x86_64 it doesn't mean rdx:rax or edx:eax. Instead, + * it means rax *or* rdx.
*/ #ifdef CONFIG_X86_64 #define DECLARE_ARGS(val, low, high) unsigned low, high -- cgit v1.2.3 From ad38dab01323a01e825555fc46863b73cd0efdc7 Mon Sep 17 00:00:00 2001 From: Harvey Harrison Date: Mon, 27 Oct 2008 13:30:56 -0700 Subject: x86: use the new byteorder headers Impact: cleanup, no functionality changed Signed-off-by: Harvey Harrison Signed-off-by: Ingo Molnar --- arch/x86/include/asm/byteorder.h | 74 ++++++++++++++++------------------------ 1 file changed, 29 insertions(+), 45 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/byteorder.h b/arch/x86/include/asm/byteorder.h index e02ae2d89acf..f110ad417df3 100644 --- a/arch/x86/include/asm/byteorder.h +++ b/arch/x86/include/asm/byteorder.h @@ -4,26 +4,33 @@ #include #include -#ifdef __GNUC__ +#define __LITTLE_ENDIAN -#ifdef __i386__ - -static inline __attribute_const__ __u32 ___arch__swab32(__u32 x) +static inline __attribute_const__ __u32 __arch_swab32(__u32 val) { -#ifdef CONFIG_X86_BSWAP - asm("bswap %0" : "=r" (x) : "0" (x)); -#else +#ifdef __i386__ +# ifdef CONFIG_X86_BSWAP + asm("bswap %0" : "=r" (val) : "0" (val)); +# else asm("xchgb %b0,%h0\n\t" /* swap lower bytes */ "rorl $16,%0\n\t" /* swap words */ "xchgb %b0,%h0" /* swap higher bytes */ - : "=q" (x) - : "0" (x)); + : "=q" (val) + : "0" (val)); +# endif + +#else /* __i386__ */ + asm("bswapl %0" + : "=r" (val) + : "0" (val)); #endif - return x; + return val; } +#define __arch_swab32 __arch_swab32 -static inline __attribute_const__ __u64 ___arch__swab64(__u64 val) +static inline __attribute_const__ __u64 __arch_swab64(__u64 val) { +#ifdef __i386__ union { struct { __u32 a; @@ -32,50 +39,27 @@ static inline __attribute_const__ __u64 ___arch__swab64(__u64 val) __u64 u; } v; v.u = val; -#ifdef CONFIG_X86_BSWAP +# ifdef CONFIG_X86_BSWAP asm("bswapl %0 ; bswapl %1 ; xchgl %0,%1" : "=r" (v.s.a), "=r" (v.s.b) : "0" (v.s.a), "1" (v.s.b)); -#else - v.s.a = ___arch__swab32(v.s.a); - v.s.b = ___arch__swab32(v.s.b); +# else + v.s.a = 
__arch_swab32(v.s.a); + v.s.b = __arch_swab32(v.s.b); asm("xchgl %0,%1" : "=r" (v.s.a), "=r" (v.s.b) : "0" (v.s.a), "1" (v.s.b)); -#endif +# endif return v.u; -} - #else /* __i386__ */ - -static inline __attribute_const__ __u64 ___arch__swab64(__u64 x) -{ asm("bswapq %0" - : "=r" (x) - : "0" (x)); - return x; -} - -static inline __attribute_const__ __u32 ___arch__swab32(__u32 x) -{ - asm("bswapl %0" - : "=r" (x) - : "0" (x)); - return x; -} - + : "=r" (val) + : "0" (val)); + return val; #endif +} +#define __arch_swab64 __arch_swab64 -/* Do not define swab16. Gcc is smart enough to recognize "C" version and - convert it into rotation or exhange. */ - -#define __arch__swab64(x) ___arch__swab64(x) -#define __arch__swab32(x) ___arch__swab32(x) - -#define __BYTEORDER_HAS_U64__ - -#endif /* __GNUC__ */ - -#include +#include #endif /* _ASM_X86_BYTEORDER_H */ -- cgit v1.2.3 From 96bf84b71255b0ee4fcee91e9acd1b5e73030eaf Mon Sep 17 00:00:00 2001 From: Hiroshi Shimamoto Date: Wed, 29 Oct 2008 18:44:08 -0700 Subject: x86: signal: cosmetic unification of signr_convert() Impact: cleanup Make signr_convert() same. 
Signed-off-by: Hiroshi Shimamoto Signed-off-by: Ingo Molnar --- arch/x86/kernel/signal_32.c | 2 ++ arch/x86/kernel/signal_64.c | 6 ++++++ 2 files changed, 8 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/signal_32.c b/arch/x86/kernel/signal_32.c index 85a0d37cdae9..abf0df700fd0 100644 --- a/arch/x86/kernel/signal_32.c +++ b/arch/x86/kernel/signal_32.c @@ -503,10 +503,12 @@ static int __setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, */ static int signr_convert(int sig) { +#ifdef CONFIG_X86_32 struct thread_info *info = current_thread_info(); if (info->exec_domain && info->exec_domain->signal_invmap && sig < 32) return info->exec_domain->signal_invmap[sig]; +#endif /* CONFIG_X86_32 */ return sig; } diff --git a/arch/x86/kernel/signal_64.c b/arch/x86/kernel/signal_64.c index 3d0deb336745..a4b46e6392b1 100644 --- a/arch/x86/kernel/signal_64.c +++ b/arch/x86/kernel/signal_64.c @@ -295,6 +295,12 @@ static int __setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, */ static int signr_convert(int sig) { +#ifdef CONFIG_X86_32 + struct thread_info *info = current_thread_info(); + + if (info->exec_domain && info->exec_domain->signal_invmap && sig < 32) + return info->exec_domain->signal_invmap[sig]; +#endif /* CONFIG_X86_32 */ return sig; } -- cgit v1.2.3 From cabf503588961d202a33b3fd872767e9f6abbef7 Mon Sep 17 00:00:00 2001 From: Hiroshi Shimamoto Date: Wed, 29 Oct 2008 18:46:07 -0700 Subject: x86: signal: cosmetic unification of macros for setup_rt_frame() Impact: cleanup Add #ifdef directive for unification. 
Signed-off-by: Hiroshi Shimamoto Signed-off-by: Ingo Molnar --- arch/x86/kernel/signal_32.c | 12 ++++++++++++ arch/x86/kernel/signal_64.c | 14 ++++++++++++-- 2 files changed, 24 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/signal_32.c b/arch/x86/kernel/signal_32.c index abf0df700fd0..6f3b9a9cc123 100644 --- a/arch/x86/kernel/signal_32.c +++ b/arch/x86/kernel/signal_32.c @@ -512,10 +512,22 @@ static int signr_convert(int sig) return sig; } +#ifdef CONFIG_X86_32 + #define is_ia32 1 #define ia32_setup_frame __setup_frame #define ia32_setup_rt_frame __setup_rt_frame +#else /* !CONFIG_X86_32 */ + +#ifdef CONFIG_IA32_EMULATION +#define is_ia32 test_thread_flag(TIF_IA32) +#else /* !CONFIG_IA32_EMULATION */ +#define is_ia32 0 +#endif /* CONFIG_IA32_EMULATION */ + +#endif /* CONFIG_X86_32 */ + static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, sigset_t *set, struct pt_regs *regs) diff --git a/arch/x86/kernel/signal_64.c b/arch/x86/kernel/signal_64.c index a4b46e6392b1..49df79e05111 100644 --- a/arch/x86/kernel/signal_64.c +++ b/arch/x86/kernel/signal_64.c @@ -304,11 +304,21 @@ static int signr_convert(int sig) return sig; } +#ifdef CONFIG_X86_32 + +#define is_ia32 1 +#define ia32_setup_frame __setup_frame +#define ia32_setup_rt_frame __setup_rt_frame + +#else /* !CONFIG_X86_32 */ + #ifdef CONFIG_IA32_EMULATION #define is_ia32 test_thread_flag(TIF_IA32) -#else +#else /* !CONFIG_IA32_EMULATION */ #define is_ia32 0 -#endif +#endif /* CONFIG_IA32_EMULATION */ + +#endif /* CONFIG_X86_32 */ static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, -- cgit v1.2.3 From 57917752f51bcead3bb6c83d74137fbe342504ec Mon Sep 17 00:00:00 2001 From: Hiroshi Shimamoto Date: Wed, 29 Oct 2008 18:46:40 -0700 Subject: x86: signal: cosmetic unification of NR_restart_syscall Impact: cleanup Add #ifdef directive to unify NR_restart_syscall. 
Signed-off-by: Hiroshi Shimamoto Signed-off-by: Ingo Molnar --- arch/x86/kernel/signal_32.c | 6 ++++++ arch/x86/kernel/signal_64.c | 5 +++++ 2 files changed, 11 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/signal_32.c b/arch/x86/kernel/signal_32.c index 6f3b9a9cc123..a0efc1b3c4c9 100644 --- a/arch/x86/kernel/signal_32.c +++ b/arch/x86/kernel/signal_32.c @@ -628,7 +628,13 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka, return 0; } +#ifdef CONFIG_X86_32 #define NR_restart_syscall __NR_restart_syscall +#else /* !CONFIG_X86_32 */ +#define NR_restart_syscall \ + test_thread_flag(TIF_IA32) ? __NR_ia32_restart_syscall : __NR_restart_syscall +#endif /* CONFIG_X86_32 */ + /* * Note that 'init' is a special process: it doesn't get signals it doesn't * want to handle. Thus you cannot kill init even with a SIGKILL even by diff --git a/arch/x86/kernel/signal_64.c b/arch/x86/kernel/signal_64.c index 49df79e05111..83990db82f74 100644 --- a/arch/x86/kernel/signal_64.c +++ b/arch/x86/kernel/signal_64.c @@ -420,8 +420,13 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka, return 0; } +#ifdef CONFIG_X86_32 +#define NR_restart_syscall __NR_restart_syscall +#else /* !CONFIG_X86_32 */ #define NR_restart_syscall \ test_thread_flag(TIF_IA32) ? __NR_ia32_restart_syscall : __NR_restart_syscall +#endif /* CONFIG_X86_32 */ + /* * Note that 'init' is a special process: it doesn't get signals it doesn't * want to handle. 
Thus you cannot kill init even with a SIGKILL even by -- cgit v1.2.3 From 7a5276889cfa96619bf863c87581005f46139986 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Thu, 30 Oct 2008 10:38:24 +0000 Subject: x86: simplify X86_MPPARSE config option Impact: cleanup Signed-off-by: Jan Beulich Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 350bee1d54dc..f843de13e242 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -235,21 +235,13 @@ config X86_FIND_SMP_CONFIG def_bool y depends on X86_MPPARSE || X86_VOYAGER -if ACPI config X86_MPPARSE - def_bool y - bool "Enable MPS table" + bool "Enable MPS table" if ACPI + default y depends on X86_LOCAL_APIC help For old smp systems that do not have proper acpi support. Newer systems (esp with 64bit cpus) with acpi support, MADT and DSDT will override it -endif - -if !ACPI -config X86_MPPARSE - def_bool y - depends on X86_LOCAL_APIC -endif choice prompt "Subarchitecture Type" -- cgit v1.2.3 From b062f841b569791d3054e975cd85f48562161565 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Thu, 30 Oct 2008 19:16:46 +0300 Subject: x86: nmi - add sensible names to nmi_watchdog boot param Impact: introduce nmi_watchdog=lapic and nmi_watchdog=ioapic aliases Add sensible names as "lapic" and "ioapic" to nmi_watchdog boot parameter. Sometimes it is not that easy to recall what exactly nmi_watchdog=1 does mean so we allow the using of symbolic names here. Old numeric values remain valid. 
Signed-off-by: Cyrill Gorcunov Signed-off-by: Ingo Molnar --- arch/x86/kernel/nmi.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c index 2c97f07f1c2c..c4869e4532a3 100644 --- a/arch/x86/kernel/nmi.c +++ b/arch/x86/kernel/nmi.c @@ -199,12 +199,17 @@ static int __init setup_nmi_watchdog(char *str) ++str; } - get_option(&str, &nmi); - - if (nmi >= NMI_INVALID) - return 0; + if (!strncmp(str, "lapic", 5)) + nmi_watchdog = NMI_LOCAL_APIC; + else if (!strncmp(str, "ioapic", 6)) + nmi_watchdog = NMI_IO_APIC; + else { + get_option(&str, &nmi); + if (nmi >= NMI_INVALID) + return 0; + nmi_watchdog = nmi; + } - nmi_watchdog = nmi; return 1; } __setup("nmi_watchdog=", setup_nmi_watchdog); -- cgit v1.2.3 From 1cbd8b3fdcf56a3c39a7596512095c9e33221fa1 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Thu, 30 Oct 2008 10:45:36 +0000 Subject: x86: add two missing unwind annotations Impact: improve debuginfo Signed-off-by: Jan Beulich Signed-off-by: Ingo Molnar --- arch/x86/kernel/entry_64.S | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index b86f332c96a6..ddeeb1052583 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -255,6 +255,7 @@ ENTRY(ret_from_fork) call schedule_tail GET_THREAD_INFO(%rcx) testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT),TI_flags(%rcx) + CFI_REMEMBER_STATE jnz rff_trace rff_action: RESTORE_REST @@ -264,6 +265,7 @@ rff_action: jnz int_ret_from_sys_call RESTORE_TOP_OF_STACK %rdi,ARGOFFSET jmp ret_from_sys_call + CFI_RESTORE_STATE rff_trace: movq %rsp,%rdi call syscall_trace_leave -- cgit v1.2.3 From a376f30a95a796cde81d6dffde0f5243c8bd8f92 Mon Sep 17 00:00:00 2001 From: Zhaolei Date: Fri, 31 Oct 2008 17:43:04 +0800 Subject: x86: avoid duplicate running of pud_offset and pmd_offset in one_md_table_init() Impact: simplify implementation, cleanup If 
!(pgd_val(*pgd) & _PAGE_PRESENT) in PAE mode, we need not get value of pmd_table again. Signed-off-by: Zhao Lei Signed-off-by: Ingo Molnar --- arch/x86/mm/init_32.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 8396868e82c5..7f8a2daa3fde 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -102,6 +102,8 @@ static pmd_t * __init one_md_table_init(pgd_t *pgd) set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT)); pud = pud_offset(pgd, 0); BUG_ON(pmd_table != pmd_offset(pud, 0)); + + return pmd_table; } #endif pud = pud_offset(pgd, 0); -- cgit v1.2.3 From b2bcc7b299f37037b4a78dc1538e5d6508ae8110 Mon Sep 17 00:00:00 2001 From: Alok Kataria Date: Fri, 31 Oct 2008 11:59:53 -0700 Subject: x86: add a synthetic TSC_RELIABLE feature bit Impact: None, bit reservation only Add a synthetic TSC_RELIABLE feature bit which will be used to mark TSC as reliable so that we could skip all the runtime checks for TSC stability, which have false positives in virtual environments. Signed-off-by: Alok N Kataria Signed-off-by: Dan Hecht Signed-off-by: H. 
Peter Anvin --- arch/x86/include/asm/cpufeature.h | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index cfdf8c2c5c31..e490a7932a0d 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -92,6 +92,7 @@ #define X86_FEATURE_NOPL (3*32+20) /* The NOPL (0F 1F) instructions */ #define X86_FEATURE_AMDC1E (3*32+21) /* AMD C1E detected */ #define X86_FEATURE_XTOPOLOGY (3*32+22) /* cpu topology enum extensions */ +#define X86_FEATURE_TSC_RELIABLE (3*32+23) /* TSC is known to be reliable */ /* Intel-defined CPU features, CPUID level 0x00000001 (ecx), word 4 */ #define X86_FEATURE_XMM3 (4*32+ 0) /* "pni" SSE-3 */ -- cgit v1.2.3 From 49ab56ac6e1b907b7dadb72a4012460359feaf0e Mon Sep 17 00:00:00 2001 From: Alok Kataria Date: Sat, 1 Nov 2008 18:34:37 -0700 Subject: x86: add X86_FEATURE_HYPERVISOR feature bit Impact: Number declaration only. Add X86_FEATURE_HYPERVISOR bit (CPUID level 1, ECX, bit 31). Signed-off-by: H. 
Peter Anvin --- arch/x86/include/asm/cpufeature.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index e490a7932a0d..694d1f8f1bee 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -118,6 +118,7 @@ #define X86_FEATURE_XSAVE (4*32+26) /* XSAVE/XRSTOR/XSETBV/XGETBV */ #define X86_FEATURE_OSXSAVE (4*32+27) /* "" XSAVE enabled in the OS */ #define X86_FEATURE_AVX (4*32+28) /* Advanced Vector Extensions */ +#define X86_FEATURE_HYPERVISOR (4*32+31) /* Running on a hypervisor */ /* VIA/Cyrix/Centaur-defined CPU features, CPUID level 0xC0000001, word 5 */ #define X86_FEATURE_XSTORE (5*32+ 2) /* "rng" RNG present (xstore) */ @@ -238,6 +239,7 @@ extern const char * const x86_power_flags[32]; #define cpu_has_xmm4_2 boot_cpu_has(X86_FEATURE_XMM4_2) #define cpu_has_x2apic boot_cpu_has(X86_FEATURE_X2APIC) #define cpu_has_xsave boot_cpu_has(X86_FEATURE_XSAVE) +#define cpu_has_hypervisor boot_cpu_has(X86_FEATURE_HYPERVISOR) #if defined(CONFIG_X86_INVLPG) || defined(CONFIG_X86_64) # define cpu_has_invlpg 1 -- cgit v1.2.3 From 88b094fb8d4fe43b7025ea8d487059e8813e02cd Mon Sep 17 00:00:00 2001 From: Alok Kataria Date: Mon, 27 Oct 2008 10:41:46 -0700 Subject: x86: Hypervisor detection and get tsc_freq from hypervisor Impact: Changes timebase calibration on Vmware. v3->v2 : Abstract the hypervisor detection and feature (tsc_freq) request behind a hypervisor.c file v2->v1 : Add a x86_hyper_vendor field to the cpuinfo_x86 structure. This avoids multiple calls to the hypervisor detection function. This patch adds function to detect if we are running under VMware. The current way to check if we are on VMware is following, # check if "hypervisor present bit" is set, if so read the 0x40000000 cpuid leaf and check for "VMwareVMware" signature. 
# if the above fails, check the DMI vendors name for "VMware" string if we find one we query the VMware hypervisor port to check if we are under VMware. The DMI + "VMware hypervisor port check" is needed for older VMware products, which don't implement the hypervisor signature cpuid leaf. Also note that since we are checking for the DMI signature the hypervisor port should never be accessed on native hardware. This patch also adds a hypervisor_get_tsc_freq function, instead of calibrating the frequency which can be error prone in virtualized environment, we ask the hypervisor for it. We get the frequency from the hypervisor by accessing the hypervisor port if we are running on VMware. Other hypervisors too can add code to the generic routine to get frequency on their platform. Signed-off-by: Alok N Kataria Signed-off-by: Dan Hecht Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/hypervisor.h | 26 ++++++++++++ arch/x86/include/asm/processor.h | 4 ++ arch/x86/include/asm/vmware.h | 26 ++++++++++++ arch/x86/kernel/cpu/Makefile | 1 + arch/x86/kernel/cpu/common.c | 2 + arch/x86/kernel/cpu/hypervisor.c | 48 +++++++++++++++++++++ arch/x86/kernel/cpu/vmware.c | 88 +++++++++++++++++++++++++++++++++++++++ arch/x86/kernel/setup.c | 7 ++++ arch/x86/kernel/tsc.c | 9 +++- 9 files changed, 210 insertions(+), 1 deletion(-) create mode 100644 arch/x86/include/asm/hypervisor.h create mode 100644 arch/x86/include/asm/vmware.h create mode 100644 arch/x86/kernel/cpu/hypervisor.c create mode 100644 arch/x86/kernel/cpu/vmware.c (limited to 'arch/x86') diff --git a/arch/x86/include/asm/hypervisor.h b/arch/x86/include/asm/hypervisor.h new file mode 100644 index 000000000000..369f5c5d09a1 --- /dev/null +++ b/arch/x86/include/asm/hypervisor.h @@ -0,0 +1,26 @@ +/* + * Copyright (C) 2008, VMware, Inc. 
+ * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or + * NON INFRINGEMENT. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + */ +#ifndef ASM_X86__HYPERVISOR_H +#define ASM_X86__HYPERVISOR_H + +extern unsigned long get_hypervisor_tsc_freq(void); +extern void init_hypervisor(struct cpuinfo_x86 *c); + +#endif diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 5ca01e383269..a570eafa4755 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -110,6 +110,7 @@ struct cpuinfo_x86 { /* Index into per_cpu list: */ u16 cpu_index; #endif + unsigned int x86_hyper_vendor; } __attribute__((__aligned__(SMP_CACHE_BYTES))); #define X86_VENDOR_INTEL 0 @@ -123,6 +124,9 @@ struct cpuinfo_x86 { #define X86_VENDOR_UNKNOWN 0xff +#define X86_HYPER_VENDOR_NONE 0 +#define X86_HYPER_VENDOR_VMWARE 1 + /* * capabilities of CPUs */ diff --git a/arch/x86/include/asm/vmware.h b/arch/x86/include/asm/vmware.h new file mode 100644 index 000000000000..02dfea5aebc4 --- /dev/null +++ b/arch/x86/include/asm/vmware.h @@ -0,0 +1,26 @@ +/* + * Copyright (C) 2008, VMware, Inc. 
+ * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or + * NON INFRINGEMENT. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + */ +#ifndef ASM_X86__VMWARE_H +#define ASM_X86__VMWARE_H + +extern unsigned long vmware_get_tsc_khz(void); +extern int vmware_platform(void); + +#endif diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index 82ec6075c057..a5c04e88777e 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -4,6 +4,7 @@ obj-y := intel_cacheinfo.o addon_cpuid_features.o obj-y += proc.o capflags.o powerflags.o common.o +obj-y += vmware.o hypervisor.o obj-$(CONFIG_X86_32) += bugs.o cmpxchg.o obj-$(CONFIG_X86_64) += bugs_64.o diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index b9c9ea0217a9..b88595c36254 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -36,6 +36,7 @@ #include #include #include +#include #include "cpu.h" @@ -703,6 +704,7 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c) detect_ht(c); #endif + init_hypervisor(c); /* * On SMP, boot_cpu_data holds the common feature set between * all CPUs; so make sure that we indicate which features are diff --git a/arch/x86/kernel/cpu/hypervisor.c b/arch/x86/kernel/cpu/hypervisor.c new file mode 100644 index 000000000000..7bd55064ffe9 --- /dev/null +++ 
b/arch/x86/kernel/cpu/hypervisor.c @@ -0,0 +1,48 @@ +/* + * Common hypervisor code + * + * Copyright (C) 2008, VMware, Inc. + * Author : Alok N Kataria + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or + * NON INFRINGEMENT. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + */ + +#include +#include + +static inline void __cpuinit +detect_hypervisor_vendor(struct cpuinfo_x86 *c) +{ + if (vmware_platform()) { + c->x86_hyper_vendor = X86_HYPER_VENDOR_VMWARE; + } else { + c->x86_hyper_vendor = X86_HYPER_VENDOR_NONE; + } +} + +unsigned long get_hypervisor_tsc_freq(void) +{ + if (boot_cpu_data.x86_hyper_vendor == X86_HYPER_VENDOR_VMWARE) + return vmware_get_tsc_khz(); + return 0; +} + +void __cpuinit init_hypervisor(struct cpuinfo_x86 *c) +{ + detect_hypervisor_vendor(c); +} + diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c new file mode 100644 index 000000000000..d5d1b75a4b77 --- /dev/null +++ b/arch/x86/kernel/cpu/vmware.c @@ -0,0 +1,88 @@ +/* + * VMware Detection code. + * + * Copyright (C) 2008, VMware, Inc. + * Author : Alok N Kataria + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or + * NON INFRINGEMENT. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + */ + +#include +#include + +#define CPUID_VMWARE_INFO_LEAF 0x40000000 +#define VMWARE_HYPERVISOR_MAGIC 0x564D5868 +#define VMWARE_HYPERVISOR_PORT 0x5658 + +#define VMWARE_PORT_CMD_GETVERSION 10 +#define VMWARE_PORT_CMD_GETHZ 45 + +#define VMWARE_PORT(cmd, eax, ebx, ecx, edx) \ + __asm__("inl (%%dx)" : \ + "=a"(eax), "=c"(ecx), "=d"(edx), "=b"(ebx) : \ + "0"(VMWARE_HYPERVISOR_MAGIC), \ + "1"(VMWARE_PORT_CMD_##cmd), \ + "2"(VMWARE_HYPERVISOR_PORT), "3"(0) : \ + "memory"); + +static inline int __vmware_platform(void) +{ + uint32_t eax, ebx, ecx, edx; + VMWARE_PORT(GETVERSION, eax, ebx, ecx, edx); + return eax != (uint32_t)-1 && ebx == VMWARE_HYPERVISOR_MAGIC; +} + +static unsigned long __vmware_get_tsc_khz(void) +{ + uint64_t tsc_hz; + uint32_t eax, ebx, ecx, edx; + + VMWARE_PORT(GETHZ, eax, ebx, ecx, edx); + + if (eax == (uint32_t)-1) + return 0; + tsc_hz = eax | (((uint64_t)ebx) << 32); + do_div(tsc_hz, 1000); + BUG_ON(tsc_hz >> 32); + return tsc_hz; +} + +int vmware_platform(void) +{ + if (cpu_has_hypervisor) { + unsigned int eax, ebx, ecx, edx; + char hyper_vendor_id[13]; + + cpuid(CPUID_VMWARE_INFO_LEAF, &eax, &ebx, &ecx, &edx); + memcpy(hyper_vendor_id + 0, &ebx, 4); + memcpy(hyper_vendor_id + 4, &ecx, 4); + memcpy(hyper_vendor_id + 8, &edx, 4); + hyper_vendor_id[12] = '\0'; + if (!strcmp(hyper_vendor_id, "VMwareVMware")) + return 1; + } else if (dmi_available && dmi_name_in_vendors("VMware") && + __vmware_platform()) + return 1; + + return 0; +} + +unsigned 
long vmware_get_tsc_khz(void) +{ + BUG_ON(!vmware_platform()); + return __vmware_get_tsc_khz(); +} diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 0fa6790c1dd3..f44dadfb32cf 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -98,6 +98,7 @@ #include #include +#include #include #include @@ -909,6 +910,12 @@ void __init setup_arch(char **cmdline_p) dmi_check_system(bad_bios_dmi_table); + /* + * VMware detection requires dmi to be available, so this + * needs to be done after dmi_scan_machine, for the BP. + */ + init_hypervisor(&boot_cpu_data); + #ifdef CONFIG_X86_32 probe_roms(); #endif diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 62348e4fd8d1..6dbf0bcb44a8 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -15,6 +15,7 @@ #include #include #include +#include unsigned int cpu_khz; /* TSC clocks / usec, not used here */ EXPORT_SYMBOL(cpu_khz); @@ -352,9 +353,15 @@ unsigned long native_calibrate_tsc(void) { u64 tsc1, tsc2, delta, ref1, ref2; unsigned long tsc_pit_min = ULONG_MAX, tsc_ref_min = ULONG_MAX; - unsigned long flags, latch, ms, fast_calibrate; + unsigned long flags, latch, ms, fast_calibrate, tsc_khz; int hpet = is_hpet_enabled(), i, loopmin; + tsc_khz = get_hypervisor_tsc_freq(); + if (tsc_khz) { + printk(KERN_INFO "TSC: Frequency read from the hypervisor\n"); + return tsc_khz; + } + local_irq_save(flags); fast_calibrate = quick_pit_calibrate(); local_irq_restore(flags); -- cgit v1.2.3 From eca0cd028bdf0f6aaceb0d023e9c7501079a7dda Mon Sep 17 00:00:00 2001 From: Alok Kataria Date: Fri, 31 Oct 2008 12:01:58 -0700 Subject: x86: Add a synthetic TSC_RELIABLE feature bit. Impact: Changes timebase calibration on Vmware. Use the synthetic TSC_RELIABLE bit to workaround virtualization anomalies. Virtual TSCs can be kept nearly in sync, but because the virtual TSC offset is set by software, it's not perfect. So, the TSC synchronization test can fail. 
Even then the TSC can be used as a clocksource since the VMware platform exports a reliable TSC to the guest for timekeeping purposes. Use this bit to check if we need to skip the TSC sync checks. Along with this also set the CONSTANT_TSC bit when on VMware, since we still want to use TSC as clocksource on VM running over hardware which has unsynchronized TSC's (opteron's), since the hypervisor will take care of providing consistent TSC to the guest. Signed-off-by: Alok N Kataria Signed-off-by: Dan Hecht Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/vmware.h | 1 + arch/x86/kernel/cpu/hypervisor.c | 11 ++++++++++- arch/x86/kernel/cpu/vmware.c | 18 ++++++++++++++++++ arch/x86/kernel/tsc_sync.c | 8 +++++++- 4 files changed, 36 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/vmware.h b/arch/x86/include/asm/vmware.h index 02dfea5aebc4..c11b7e100d83 100644 --- a/arch/x86/include/asm/vmware.h +++ b/arch/x86/include/asm/vmware.h @@ -22,5 +22,6 @@ extern unsigned long vmware_get_tsc_khz(void); extern int vmware_platform(void); +extern void vmware_set_feature_bits(struct cpuinfo_x86 *c); #endif diff --git a/arch/x86/kernel/cpu/hypervisor.c b/arch/x86/kernel/cpu/hypervisor.c index 7bd55064ffe9..35ae2b75226d 100644 --- a/arch/x86/kernel/cpu/hypervisor.c +++ b/arch/x86/kernel/cpu/hypervisor.c @@ -41,8 +41,17 @@ unsigned long get_hypervisor_tsc_freq(void) return 0; } +static inline void __cpuinit +hypervisor_set_feature_bits(struct cpuinfo_x86 *c) +{ + if (boot_cpu_data.x86_hyper_vendor == X86_HYPER_VENDOR_VMWARE) { + vmware_set_feature_bits(c); + return; + } +} + void __cpuinit init_hypervisor(struct cpuinfo_x86 *c) { detect_hypervisor_vendor(c); + hypervisor_set_feature_bits(c); } - diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c index d5d1b75a4b77..2ac4394fcb90 100644 --- a/arch/x86/kernel/cpu/vmware.c +++ b/arch/x86/kernel/cpu/vmware.c @@ -86,3 +86,21 @@ unsigned long vmware_get_tsc_khz(void) 
BUG_ON(!vmware_platform()); return __vmware_get_tsc_khz(); } + +/* + * VMware hypervisor takes care of exporting a reliable TSC to the guest. + * Still, due to timing difference when running on virtual cpus, the TSC can + * be marked as unstable in some cases. For example, the TSC sync check at + * bootup can fail due to a marginal offset between vcpus' TSCs (though the + * TSCs do not drift from each other). Also, the ACPI PM timer clocksource + * is not suitable as a watchdog when running on a hypervisor because the + * kernel may miss a wrap of the counter if the vcpu is descheduled for a + * long time. To skip these checks at runtime we set these capability bits, + * so that the kernel could just trust the hypervisor with providing a + * reliable virtual TSC that is suitable for timekeeping. + */ +void __cpuinit vmware_set_feature_bits(struct cpuinfo_x86 *c) +{ + set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); + set_cpu_cap(c, X86_FEATURE_TSC_RELIABLE); +} diff --git a/arch/x86/kernel/tsc_sync.c b/arch/x86/kernel/tsc_sync.c index 9ffb01c31c40..5977c40a138f 100644 --- a/arch/x86/kernel/tsc_sync.c +++ b/arch/x86/kernel/tsc_sync.c @@ -108,6 +108,12 @@ void __cpuinit check_tsc_sync_source(int cpu) if (unsynchronized_tsc()) return; + if (boot_cpu_has(X86_FEATURE_TSC_RELIABLE)) { + printk(KERN_INFO + "Skipping synchronization checks as TSC is reliable.\n"); + return; + } + printk(KERN_INFO "checking TSC synchronization [CPU#%d -> CPU#%d]:", smp_processor_id(), cpu); @@ -161,7 +167,7 @@ void __cpuinit check_tsc_sync_target(void) { int cpus = 2; - if (unsynchronized_tsc()) + if (unsynchronized_tsc() || boot_cpu_has(X86_FEATURE_TSC_RELIABLE)) return; /* -- cgit v1.2.3 From 395628ef4ea12ff0748099f145363b5e33c69acb Mon Sep 17 00:00:00 2001 From: Alok Kataria Date: Fri, 24 Oct 2008 17:22:01 -0700 Subject: x86: Skip verification by the watchdog for TSC clocksource. Impact: Changes timekeeping on Vmware (or with tsc=reliable). 
This is achieved by resetting the CLOCK_SOURCE_MUST_VERIFY flag. We add a tsc=reliable commandline option to enable this. This enables legacy hardware without HPET, LAPIC, or ACPI timers to enter high-resolution timer mode. Along with that, this has been extended to be used in virtualization environments too. Now we also set this flag if the X86_FEATURE_TSC_RELIABLE bit is set. This is important since there is a wrap-around problem with the acpi_pm timer. The acpi_pm counter is just 24 bits and this can overflow in ~4 seconds. With the NO_HZ kernels in virtualized environment, there can be situations when the guest is descheduled for longer duration, as a result we may miss the wrap of the acpi counter. When TSC is used as a clocksource and acpi_pm timer is being used as the watchdog clocksource this error in acpi_pm results in TSC being marked as unstable, and essentially results in time dropping in chunks of 4 seconds whenever this wrap is missed. Since the virtualized TSC is reliable on VMware, we should always use the TSCs clocksource on VMware, so we skip the verification at runtime, by checking for the feature bit. Since we reset the flag for mgeode systems too, I have combined the mgeode case with the feature bit check. Signed-off-by: Jeff Hansen Signed-off-by: Alok N Kataria Signed-off-by: Dan Hecht Signed-off-by: H. Peter Anvin --- Documentation/kernel-parameters.txt | 7 +++++++ arch/x86/kernel/tsc.c | 33 +++++++++++++++++++++------------ 2 files changed, 28 insertions(+), 12 deletions(-) (limited to 'arch/x86') diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 1bbcaa8982b6..dc6b06f67fca 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -2267,6 +2267,13 @@ and is between 256 and 4096 characters. It is defined in the file Format: ,,,,,,,, + tsc= Disable clocksource-must-verify flag for TSC. 
+ Format: + [x86] reliable: mark tsc clocksource as reliable, this + disables clocksource verification at runtime. + Used to enable high-resolution timer mode on older + hardware, and in virtualized environment. + turbografx.map[2|3]= [HW,JOY] TurboGraFX parallel port interface Format: diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 6dbf0bcb44a8..ee01cd96b5e1 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -32,6 +32,7 @@ static int tsc_unstable; erroneous rdtsc usage on !cpu_has_tsc processors */ static int tsc_disabled = -1; +static int tsc_clocksource_reliable; /* * Scheduler clock - returns current time in nanosec units. */ @@ -99,6 +100,15 @@ int __init notsc_setup(char *str) __setup("notsc", notsc_setup); +static int __init tsc_setup(char *str) +{ + if (!strcmp(str, "reliable")) + tsc_clocksource_reliable = 1; + return 1; +} + +__setup("tsc=", tsc_setup); + #define MAX_RETRIES 5 #define SMI_TRESHOLD 50000 @@ -738,24 +748,21 @@ static struct dmi_system_id __initdata bad_tsc_dmi_table[] = { {} }; -/* - * Geode_LX - the OLPC CPU has a possibly a very reliable TSC - */ +static void __init check_system_tsc_reliable(void) +{ #ifdef CONFIG_MGEODE_LX -/* RTSC counts during suspend */ + /* RTSC counts during suspend */ #define RTSC_SUSP 0x100 - -static void __init check_geode_tsc_reliable(void) -{ unsigned long res_low, res_high; rdmsr_safe(MSR_GEODE_BUSCONT_CONF0, &res_low, &res_high); + /* Geode_LX - the OLPC CPU has a possibly a very reliable TSC */ if (res_low & RTSC_SUSP) - clocksource_tsc.flags &= ~CLOCK_SOURCE_MUST_VERIFY; -} -#else -static inline void check_geode_tsc_reliable(void) { } + tsc_clocksource_reliable = 1; #endif + if (boot_cpu_has(X86_FEATURE_TSC_RELIABLE)) + tsc_clocksource_reliable = 1; +} /* * Make an educated guess if the TSC is trustworthy and synchronized @@ -790,6 +797,8 @@ static void __init init_tsc_clocksource(void) { clocksource_tsc.mult = clocksource_khz2mult(tsc_khz, clocksource_tsc.shift); + if 
(tsc_clocksource_reliable) + clocksource_tsc.flags &= ~CLOCK_SOURCE_MUST_VERIFY; /* lower the rating if we already know its unstable: */ if (check_tsc_unstable()) { clocksource_tsc.rating = 0; @@ -850,7 +859,7 @@ void __init tsc_init(void) if (unsynchronized_tsc()) mark_tsc_unstable("TSCs unsynchronized"); - check_geode_tsc_reliable(); + check_system_tsc_reliable(); init_tsc_clocksource(); } -- cgit v1.2.3 From 3555105333ae55414d0fe051557bd7dc590f5255 Mon Sep 17 00:00:00 2001 From: Gary Hade Date: Fri, 31 Oct 2008 10:52:03 -0700 Subject: x86: add memory hotremove config option Impact: enable CONFIG_MEMORY_HOTREMOVE feature on x86. (default-off) Memory hotremove functionality can currently be configured into the ia64, powerpc, and s390 kernels. This patch makes it possible to configure the memory hotremove functionality into the x86 kernel as well. Signed-off-by: Badari Pulavarty Signed-off-by: Gary Hade Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index c00aefcb47d5..25e711526116 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1492,6 +1492,10 @@ config ARCH_ENABLE_MEMORY_HOTPLUG def_bool y depends on X86_64 || (X86_32 && HIGHMEM) +config ARCH_ENABLE_MEMORY_HOTREMOVE + def_bool y + depends on MEMORY_HOTPLUG + config HAVE_ARCH_EARLY_PFN_TO_NID def_bool X86_64 depends on NUMA -- cgit v1.2.3 From 6bdbfe99916398dbb28d83833cc04757110f2738 Mon Sep 17 00:00:00 2001 From: Alok Kataria Date: Mon, 3 Nov 2008 11:31:28 -0800 Subject: x86: VMware: Fix vmware_get_tsc code Impact: Fix possible failure to calibrate the TSC on Vmware near 4 GHz The current version of the code to get the tsc frequency from the VMware hypervisor, will be broken on processor with frequency (4G-1) HZ, because on such processors eax will have UINT_MAX and that would be legitimate. 
We instead check that EBX did change to decide if we were able to read the frequency from the hypervisor. Signed-off-by: Alok N Kataria Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/vmware.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c index 2ac4394fcb90..a0905ecfe7d2 100644 --- a/arch/x86/kernel/cpu/vmware.c +++ b/arch/x86/kernel/cpu/vmware.c @@ -36,7 +36,7 @@ "=a"(eax), "=c"(ecx), "=d"(edx), "=b"(ebx) : \ "0"(VMWARE_HYPERVISOR_MAGIC), \ "1"(VMWARE_PORT_CMD_##cmd), \ - "2"(VMWARE_HYPERVISOR_PORT), "3"(0) : \ + "2"(VMWARE_HYPERVISOR_PORT), "3"(UINT_MAX) : \ "memory"); static inline int __vmware_platform(void) @@ -53,7 +53,7 @@ static unsigned long __vmware_get_tsc_khz(void) VMWARE_PORT(GETHZ, eax, ebx, ecx, edx); - if (eax == (uint32_t)-1) + if (ebx == UINT_MAX) return 0; tsc_hz = eax | (((uint64_t)ebx) << 32); do_div(tsc_hz, 1000); -- cgit v1.2.3 From 124ffe1456d6efea5b32cc6d36e3fa434cdc84d9 Mon Sep 17 00:00:00 2001 From: Hiroshi Shimamoto Date: Mon, 3 Nov 2008 19:23:01 -0800 Subject: x86: signal_64: remove unused code in __setup_rt_frame() Impact: cleanup sizeof(*set) is always 8 on x86_64. 
Signed-off-by: Hiroshi Shimamoto Signed-off-by: Ingo Molnar --- arch/x86/kernel/signal_64.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/signal_64.c b/arch/x86/kernel/signal_64.c index 83990db82f74..cfbb60a5f9d2 100644 --- a/arch/x86/kernel/signal_64.c +++ b/arch/x86/kernel/signal_64.c @@ -251,11 +251,7 @@ static int __setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, err |= __put_user(me->sas_ss_size, &frame->uc.uc_stack.ss_size); err |= setup_sigcontext(&frame->uc.uc_mcontext, regs, set->sig[0], me); err |= __put_user(fp, &frame->uc.uc_mcontext.fpstate); - if (sizeof(*set) == 16) { - __put_user(set->sig[0], &frame->uc.uc_sigmask.sig[0]); - __put_user(set->sig[1], &frame->uc.uc_sigmask.sig[1]); - } else - err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set)); + err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set)); /* Set up to return from userspace. If provided, use a stub already in userspace. */ -- cgit v1.2.3 From 6cf87efbc7a3676e0ad7c9622ec6aec244a593bc Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 4 Nov 2008 10:42:23 +0100 Subject: x86 debug: mark early_printk.o as notrace Impact: do not do function-tracing in the early-printk code this is useful when earlyprintk=vga,keep is used to debug tracer plugins. 
Signed-off-by: Ingo Molnar --- arch/x86/kernel/Makefile | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index e489ff9cb3e2..943fe6026c64 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -12,6 +12,7 @@ CFLAGS_REMOVE_tsc.o = -pg CFLAGS_REMOVE_rtc.o = -pg CFLAGS_REMOVE_paravirt-spinlocks.o = -pg CFLAGS_REMOVE_ftrace.o = -pg +CFLAGS_REMOVE_early_printk.o = -pg endif # -- cgit v1.2.3 From fd8cd7e1919fc1c27fe2fdccd2a1cd32f791ef0f Mon Sep 17 00:00:00 2001 From: Alok Kataria Date: Mon, 3 Nov 2008 15:50:38 -0800 Subject: x86: vmware: look for DMI string in the product serial key Impact: Should permit VMware detection on older platforms where the vendor is changed. Could theoretically cause a regression if some weird serial number scheme contains the string "VMware" by pure chance. Seems unlikely, especially with the mixed case. In some user configured cases, VMware may choose not to put a VMware specific DMI string, but the product serial key is always there and is VMware specific. Add an interface to check the serial key when checking for VMware in the DMI information. Signed-off-by: Alok N Kataria Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/vmware.c | 7 ++++++- drivers/firmware/dmi_scan.c | 11 +++++++++++ include/linux/dmi.h | 2 ++ 3 files changed, 19 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c index a0905ecfe7d2..c034bda842d9 100644 --- a/arch/x86/kernel/cpu/vmware.c +++ b/arch/x86/kernel/cpu/vmware.c @@ -61,6 +61,11 @@ static unsigned long __vmware_get_tsc_khz(void) return tsc_hz; } +/* + * While checking the dmi string infomation, just checking the product + * serial key should be enough, as this will always have a VMware + * specific string when running under VMware hypervisor.
+ */ int vmware_platform(void) { if (cpu_has_hypervisor) { @@ -74,7 +79,7 @@ int vmware_platform(void) hyper_vendor_id[12] = '\0'; if (!strcmp(hyper_vendor_id, "VMwareVMware")) return 1; - } else if (dmi_available && dmi_name_in_vendors("VMware") && + } else if (dmi_available && dmi_name_in_serial("VMware") && __vmware_platform()) return 1; diff --git a/drivers/firmware/dmi_scan.c b/drivers/firmware/dmi_scan.c index 3e526b6d00cb..d66d41282907 100644 --- a/drivers/firmware/dmi_scan.c +++ b/drivers/firmware/dmi_scan.c @@ -467,6 +467,17 @@ const char *dmi_get_system_info(int field) } EXPORT_SYMBOL(dmi_get_system_info); +/** + * dmi_name_in_serial - Check if string is in the DMI product serial + * information. + */ +int dmi_name_in_serial(const char *str) +{ + int f = DMI_PRODUCT_SERIAL; + if (dmi_ident[f] && strstr(dmi_ident[f], str)) + return 1; + return 0; +} /** * dmi_name_in_vendors - Check if string is anywhere in the DMI vendor information. diff --git a/include/linux/dmi.h b/include/linux/dmi.h index e5084eb5943a..2bfda178f274 100644 --- a/include/linux/dmi.h +++ b/include/linux/dmi.h @@ -44,6 +44,7 @@ extern const struct dmi_device * dmi_find_device(int type, const char *name, extern void dmi_scan_machine(void); extern int dmi_get_year(int field); extern int dmi_name_in_vendors(const char *str); +extern int dmi_name_in_serial(const char *str); extern int dmi_available; extern int dmi_walk(void (*decode)(const struct dmi_header *)); @@ -56,6 +57,7 @@ static inline const struct dmi_device * dmi_find_device(int type, const char *na static inline void dmi_scan_machine(void) { return; } static inline int dmi_get_year(int year) { return 0; } static inline int dmi_name_in_vendors(const char *s) { return 0; } +static inline int dmi_name_in_serial(const char *s) { return 0; } #define dmi_available 0 static inline int dmi_walk(void (*decode)(const struct dmi_header *)) { return -1; } -- cgit v1.2.3 From 838e8bb71dc0c892bf8f84abd3c709d8fe3a8d3c Mon Sep 17 00:00:00 2001 
From: Uros Bizjak Date: Fri, 24 Oct 2008 16:53:33 +0200 Subject: x86: Implement change_bit with immediate operand as "lock xorb" Impact: Minor optimization. Implement change_bit with immediate bit count as "lock xorb". This is similar to "lock orb" and "lock andb" for set_bit and clear_bit functions. Signed-off-by: Uros Bizjak Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/bitops.h | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/bitops.h b/arch/x86/include/asm/bitops.h index 360010322711..9fa9dcdf344b 100644 --- a/arch/x86/include/asm/bitops.h +++ b/arch/x86/include/asm/bitops.h @@ -168,7 +168,15 @@ static inline void __change_bit(int nr, volatile unsigned long *addr) */ static inline void change_bit(int nr, volatile unsigned long *addr) { - asm volatile(LOCK_PREFIX "btc %1,%0" : ADDR : "Ir" (nr)); + if (IS_IMMEDIATE(nr)) { + asm volatile(LOCK_PREFIX "xorb %1,%0" + : CONST_MASK_ADDR(nr, addr) + : "iq" ((u8)CONST_MASK(nr))); + } else { + asm volatile(LOCK_PREFIX "btc %1,%0" + : BITOP_ADDR(addr) + : "Ir" (nr)); + } } /** -- cgit v1.2.3 From 64ccf2f9a70a06ba56cd8cedfa610b4e77181587 Mon Sep 17 00:00:00 2001 From: Russ Anderson Date: Wed, 5 Nov 2008 22:11:56 -0600 Subject: x86: uv: Add UV watchlist bios call Add UV bios calls to allocate and free watchlists. Signed-off-by: Russ Anderson Signed-off-by: H. 
Peter Anvin --- arch/x86/include/asm/uv/bios.h | 17 ++++++++++++++++- arch/x86/kernel/bios_uv.c | 33 +++++++++++++++++++++++++++++++++ 2 files changed, 49 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/uv/bios.h b/arch/x86/include/asm/uv/bios.h index 51cadc645e6f..58105c5b0b4e 100644 --- a/arch/x86/include/asm/uv/bios.h +++ b/arch/x86/include/asm/uv/bios.h @@ -32,7 +32,9 @@ enum uv_bios_cmd { UV_BIOS_COMMON, UV_BIOS_GET_SN_INFO, - UV_BIOS_FREQ_BASE + UV_BIOS_FREQ_BASE, + UV_BIOS_WATCHLIST_ALLOC, + UV_BIOS_WATCHLIST_FREE }; /* @@ -71,6 +73,15 @@ union partition_info_u { }; }; +union uv_watchlist_u { + u64 val; + struct { + u64 blade : 16, + size : 32, + filler : 16; + }; +}; + /* * bios calls have 6 parameters */ @@ -80,9 +91,13 @@ extern s64 uv_bios_call_reentrant(enum uv_bios_cmd, u64, u64, u64, u64, u64); extern s64 uv_bios_get_sn_info(int, int *, long *, long *, long *); extern s64 uv_bios_freq_base(u64, u64 *); +extern int uv_bios_mq_watchlist_alloc(int, void *, unsigned int, + unsigned long *); +extern int uv_bios_mq_watchlist_free(int, int); extern void uv_bios_init(void); +extern unsigned long sn_rtc_cycles_per_second; extern int uv_type; extern long sn_partition_id; extern long sn_coherency_id; diff --git a/arch/x86/kernel/bios_uv.c b/arch/x86/kernel/bios_uv.c index 7cefb7170e75..4c02b2799216 100644 --- a/arch/x86/kernel/bios_uv.c +++ b/arch/x86/kernel/bios_uv.c @@ -100,6 +100,39 @@ s64 uv_bios_get_sn_info(int fc, int *uvtype, long *partid, long *coher, return ret; } +int +uv_bios_mq_watchlist_alloc(int blade, void *mq, unsigned int mq_size, + unsigned long *intr_mmr_offset) +{ + union uv_watchlist_u size_blade; + unsigned long addr; + u64 watchlist; + s64 ret; + + addr = (unsigned long)mq; + size_blade.size = mq_size; + size_blade.blade = blade; + + /* + * bios returns watchlist number or negative error number. 
+ */ + ret = (int)uv_bios_call_irqsave(UV_BIOS_WATCHLIST_ALLOC, addr, + size_blade.val, (u64)intr_mmr_offset, + (u64)&watchlist, 0); + if (ret < BIOS_STATUS_SUCCESS) + return ret; + + return watchlist; +} +EXPORT_SYMBOL_GPL(uv_bios_mq_watchlist_alloc); + +int +uv_bios_mq_watchlist_free(int blade, int watchlist_num) +{ + return (int)uv_bios_call_irqsave(UV_BIOS_WATCHLIST_FREE, + blade, watchlist_num, 0, 0, 0); +} +EXPORT_SYMBOL_GPL(uv_bios_mq_watchlist_free); s64 uv_bios_freq_base(u64 clock_type, u64 *ticks_per_second) { -- cgit v1.2.3 From e8929c8a6acbecbd629b8e3f2d1a2546ec4ebdfc Mon Sep 17 00:00:00 2001 From: Russ Anderson Date: Wed, 5 Nov 2008 22:13:44 -0600 Subject: x86: uv: Add UV memory protection bios call Add UV bios call to change memory protections. Signed-off-by: Russ Anderson Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/uv/bios.h | 10 +++++++++- arch/x86/kernel/bios_uv.c | 8 ++++++++ 2 files changed, 17 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/uv/bios.h b/arch/x86/include/asm/uv/bios.h index 58105c5b0b4e..a301a56d4157 100644 --- a/arch/x86/include/asm/uv/bios.h +++ b/arch/x86/include/asm/uv/bios.h @@ -34,7 +34,8 @@ enum uv_bios_cmd { UV_BIOS_GET_SN_INFO, UV_BIOS_FREQ_BASE, UV_BIOS_WATCHLIST_ALLOC, - UV_BIOS_WATCHLIST_FREE + UV_BIOS_WATCHLIST_FREE, + UV_BIOS_MEMPROTECT }; /* @@ -82,6 +83,12 @@ union uv_watchlist_u { }; }; +enum uv_memprotect { + UV_MEMPROT_RESTRICT_ACCESS, + UV_MEMPROT_ALLOW_AMO, + UV_MEMPROT_ALLOW_RW +}; + /* * bios calls have 6 parameters */ @@ -94,6 +101,7 @@ extern s64 uv_bios_freq_base(u64, u64 *); extern int uv_bios_mq_watchlist_alloc(int, void *, unsigned int, unsigned long *); extern int uv_bios_mq_watchlist_free(int, int); +extern s64 uv_bios_change_memprotect(u64, u64, enum uv_memprotect); extern void uv_bios_init(void); diff --git a/arch/x86/kernel/bios_uv.c b/arch/x86/kernel/bios_uv.c index 4c02b2799216..7cf6fc3d1c10 100644 --- a/arch/x86/kernel/bios_uv.c +++ 
b/arch/x86/kernel/bios_uv.c @@ -134,6 +134,14 @@ uv_bios_mq_watchlist_free(int blade, int watchlist_num) } EXPORT_SYMBOL_GPL(uv_bios_mq_watchlist_free); +s64 +uv_bios_change_memprotect(u64 paddr, u64 len, enum uv_memprotect perms) +{ + return uv_bios_call_irqsave(UV_BIOS_MEMPROTECT, paddr, len, + perms, 0, 0); +} +EXPORT_SYMBOL_GPL(uv_bios_change_memprotect); + s64 uv_bios_freq_base(u64 clock_type, u64 *ticks_per_second) { return uv_bios_call(UV_BIOS_FREQ_BASE, clock_type, -- cgit v1.2.3 From 23c357003b3671cdfb17bc4d5383589e74b71511 Mon Sep 17 00:00:00 2001 From: Russ Anderson Date: Wed, 5 Nov 2008 22:15:13 -0600 Subject: x86: uv: Add UV reserved page bios call Add UV bios call to get the address of the reserved page. Signed-off-by: Russ Anderson Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/uv/bios.h | 5 ++++- arch/x86/kernel/bios_uv.c | 11 +++++++++++ 2 files changed, 15 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/uv/bios.h b/arch/x86/include/asm/uv/bios.h index a301a56d4157..da1c4e8e78fc 100644 --- a/arch/x86/include/asm/uv/bios.h +++ b/arch/x86/include/asm/uv/bios.h @@ -35,13 +35,15 @@ enum uv_bios_cmd { UV_BIOS_FREQ_BASE, UV_BIOS_WATCHLIST_ALLOC, UV_BIOS_WATCHLIST_FREE, - UV_BIOS_MEMPROTECT + UV_BIOS_MEMPROTECT, + UV_BIOS_GET_PARTITION_ADDR }; /* * Status values returned from a BIOS call. 
*/ enum { + BIOS_STATUS_MORE_PASSES = 1, BIOS_STATUS_SUCCESS = 0, BIOS_STATUS_UNIMPLEMENTED = -ENOSYS, BIOS_STATUS_EINVAL = -EINVAL, @@ -102,6 +104,7 @@ extern int uv_bios_mq_watchlist_alloc(int, void *, unsigned int, unsigned long *); extern int uv_bios_mq_watchlist_free(int, int); extern s64 uv_bios_change_memprotect(u64, u64, enum uv_memprotect); +extern s64 uv_bios_reserved_page_pa(u64, u64 *, u64 *, u64 *); extern void uv_bios_init(void); diff --git a/arch/x86/kernel/bios_uv.c b/arch/x86/kernel/bios_uv.c index 7cf6fc3d1c10..d22d0f1bbea0 100644 --- a/arch/x86/kernel/bios_uv.c +++ b/arch/x86/kernel/bios_uv.c @@ -142,6 +142,17 @@ uv_bios_change_memprotect(u64 paddr, u64 len, enum uv_memprotect perms) } EXPORT_SYMBOL_GPL(uv_bios_change_memprotect); +s64 +uv_bios_reserved_page_pa(u64 buf, u64 *cookie, u64 *addr, u64 *len) +{ + s64 ret; + + ret = uv_bios_call_irqsave(UV_BIOS_GET_PARTITION_ADDR, (u64)cookie, + (u64)addr, buf, (u64)len, 0); + return ret; +} +EXPORT_SYMBOL_GPL(uv_bios_reserved_page_pa); + s64 uv_bios_freq_base(u64 clock_type, u64 *ticks_per_second) { return uv_bios_call(UV_BIOS_FREQ_BASE, clock_type, -- cgit v1.2.3 From 4b33669e817a01dd99ff91df330d504ccfb2e99c Mon Sep 17 00:00:00 2001 From: Hiroshi Shimamoto Date: Wed, 5 Nov 2008 18:30:25 -0800 Subject: x86: signal_32: do save_i387_xstate() at get_sigframe() Impact: cleanup move calling save_i387_xstate() into get_sigframe() from setup_sigcontext() like 64bit. 
Signed-off-by: Hiroshi Shimamoto Signed-off-by: Ingo Molnar --- arch/x86/kernel/signal_32.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/signal_32.c b/arch/x86/kernel/signal_32.c index a0efc1b3c4c9..6a05c74b4084 100644 --- a/arch/x86/kernel/signal_32.c +++ b/arch/x86/kernel/signal_32.c @@ -303,11 +303,7 @@ setup_sigcontext(struct sigcontext __user *sc, void __user *fpstate, err |= __put_user(regs->sp, &sc->sp_at_signal); err |= __put_user(regs->ss, (unsigned int __user *)&sc->ss); - tmp = save_i387_xstate(fpstate); - if (tmp < 0) - err = 1; - else - err |= __put_user(tmp ? fpstate : NULL, &sc->fpstate); + err |= __put_user(fpstate, &sc->fpstate); /* non-iBCS2 extensions.. */ err |= __put_user(mask, &sc->oldmask); @@ -350,6 +346,8 @@ get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size, if (used_math()) { sp = sp - sig_xstate_size; *fpstate = (struct _fpstate *) sp; + if (save_i387_xstate(*fpstate) < 0) + return (void __user *)-1L; } sp -= frame_size; -- cgit v1.2.3 From 99ea1b93bf80a287dd70499b96d9c4d06f320ff2 Mon Sep 17 00:00:00 2001 From: Hiroshi Shimamoto Date: Wed, 5 Nov 2008 18:32:54 -0800 Subject: x86: ia32_signal: do save_i387_xstate_ia32 at get_sigframe() Impact: cleanup move calling save_i387_xstate_ia32() into get_sigframe() from setup_sigcontext(). 
Signed-off-by: Hiroshi Shimamoto Signed-off-by: Ingo Molnar --- arch/x86/ia32/ia32_signal.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c index 4bc02b23674b..47ddc23f4f54 100644 --- a/arch/x86/ia32/ia32_signal.c +++ b/arch/x86/ia32/ia32_signal.c @@ -367,12 +367,7 @@ static int ia32_setup_sigcontext(struct sigcontext_ia32 __user *sc, err |= __put_user(regs->flags, &sc->flags); err |= __put_user(regs->sp, &sc->sp_at_signal); - tmp = save_i387_xstate_ia32(fpstate); - if (tmp < 0) - err = -EFAULT; - else - err |= __put_user(ptr_to_compat(tmp ? fpstate : NULL), - &sc->fpstate); + err |= __put_user(ptr_to_compat(fpstate), &sc->fpstate); /* non-iBCS2 extensions.. */ err |= __put_user(mask, &sc->oldmask); @@ -408,6 +403,8 @@ static void __user *get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, if (used_math()) { sp = sp - sig_xstate_ia32_size; *fpstate = (struct _fpstate_ia32 *) sp; + if (save_i387_xstate_ia32(*fpstate) < 0) + return (void __user *) -1L; } sp -= frame_size; -- cgit v1.2.3 From ee7d523c124a186ce3a886868de9cd1d8bc991f3 Mon Sep 17 00:00:00 2001 From: Hiroshi Shimamoto Date: Wed, 5 Nov 2008 18:33:35 -0800 Subject: x86: signal_64: setup fpstate in setup_sigcontext() Impact: cleanup set fpstate field of signal context at setup_sigcontext(). 
Signed-off-by: Hiroshi Shimamoto Signed-off-by: Ingo Molnar --- arch/x86/kernel/signal_64.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/signal_64.c b/arch/x86/kernel/signal_64.c index cfbb60a5f9d2..97d26fa62ac1 100644 --- a/arch/x86/kernel/signal_64.c +++ b/arch/x86/kernel/signal_64.c @@ -159,8 +159,9 @@ asmlinkage long sys_rt_sigreturn(struct pt_regs *regs) */ static inline int -setup_sigcontext(struct sigcontext __user *sc, struct pt_regs *regs, - unsigned long mask, struct task_struct *me) +setup_sigcontext(struct sigcontext __user *sc, void __user *fpstate, + struct pt_regs *regs, + unsigned long mask, struct task_struct *me) { int err = 0; @@ -188,6 +189,7 @@ setup_sigcontext(struct sigcontext __user *sc, struct pt_regs *regs, err |= __put_user(me->thread.error_code, &sc->err); err |= __put_user(regs->ip, &sc->ip); err |= __put_user(regs->flags, &sc->flags); + err |= __put_user(fpstate, &sc->fpstate); err |= __put_user(mask, &sc->oldmask); err |= __put_user(me->thread.cr2, &sc->cr2); @@ -249,8 +251,8 @@ static int __setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, err |= __put_user(sas_ss_flags(regs->sp), &frame->uc.uc_stack.ss_flags); err |= __put_user(me->sas_ss_size, &frame->uc.uc_stack.ss_size); - err |= setup_sigcontext(&frame->uc.uc_mcontext, regs, set->sig[0], me); - err |= __put_user(fp, &frame->uc.uc_mcontext.fpstate); + err |= setup_sigcontext(&frame->uc.uc_mcontext, fp, + regs, set->sig[0], me); err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set)); /* Set up to return from userspace. If provided, use a stub -- cgit v1.2.3 From 8735b7d0a2a6246faa406a8cdd1376bd0e689ba3 Mon Sep 17 00:00:00 2001 From: Hiroshi Shimamoto Date: Wed, 5 Nov 2008 18:34:35 -0800 Subject: x86: signal_64: make setup_sigcontext() similar Impact: cleanup remove passing task struct. 
Signed-off-by: Hiroshi Shimamoto Signed-off-by: Ingo Molnar --- arch/x86/kernel/signal_64.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/signal_64.c b/arch/x86/kernel/signal_64.c index 97d26fa62ac1..3868c2a21793 100644 --- a/arch/x86/kernel/signal_64.c +++ b/arch/x86/kernel/signal_64.c @@ -160,8 +160,7 @@ asmlinkage long sys_rt_sigreturn(struct pt_regs *regs) static inline int setup_sigcontext(struct sigcontext __user *sc, void __user *fpstate, - struct pt_regs *regs, - unsigned long mask, struct task_struct *me) + struct pt_regs *regs, unsigned long mask) { int err = 0; @@ -185,13 +184,13 @@ setup_sigcontext(struct sigcontext __user *sc, void __user *fpstate, err |= __put_user(regs->r13, &sc->r13); err |= __put_user(regs->r14, &sc->r14); err |= __put_user(regs->r15, &sc->r15); - err |= __put_user(me->thread.trap_no, &sc->trapno); - err |= __put_user(me->thread.error_code, &sc->err); + err |= __put_user(current->thread.trap_no, &sc->trapno); + err |= __put_user(current->thread.error_code, &sc->err); err |= __put_user(regs->ip, &sc->ip); err |= __put_user(regs->flags, &sc->flags); err |= __put_user(fpstate, &sc->fpstate); err |= __put_user(mask, &sc->oldmask); - err |= __put_user(me->thread.cr2, &sc->cr2); + err |= __put_user(current->thread.cr2, &sc->cr2); return err; } @@ -251,8 +250,7 @@ static int __setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, err |= __put_user(sas_ss_flags(regs->sp), &frame->uc.uc_stack.ss_flags); err |= __put_user(me->sas_ss_size, &frame->uc.uc_stack.ss_size); - err |= setup_sigcontext(&frame->uc.uc_mcontext, fp, - regs, set->sig[0], me); + err |= setup_sigcontext(&frame->uc.uc_mcontext, fp, regs, set->sig[0]); err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set)); /* Set up to return from userspace. 
If provided, use a stub -- cgit v1.2.3 From fd51b2d7d5df932767b89e00d0871a38a2c53e74 Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Wed, 5 Nov 2008 02:27:19 +0900 Subject: x86: update CONFIG_NUMA description Impact: clarify/update CONFIG_NUMA text The CONFIG_NUMA description talks about somewhat outdated things, so the following changes are better: o CONFIG_NUMA is no longer EXPERIMENTAL o Opteron is no longer the only processor with a NUMA topology on x86_64; the Intel Core i7 has one too. Signed-off-by: KOSAKI Motohiro Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 350bee1d54dc..38ae04bf6514 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -951,22 +951,26 @@ config ARCH_PHYS_ADDR_T_64BIT # Common NUMA Features config NUMA - bool "Numa Memory Allocation and Scheduler Support (EXPERIMENTAL)" + bool "Numa Memory Allocation and Scheduler Support" depends on SMP depends on X86_64 || (X86_32 && HIGHMEM64G && (X86_NUMAQ || X86_BIGSMP || X86_SUMMIT && ACPI) && EXPERIMENTAL) default n if X86_PC default y if (X86_NUMAQ || X86_SUMMIT || X86_BIGSMP) help Enable NUMA (Non Uniform Memory Access) support. + The kernel will try to allocate memory used by a CPU on the local memory controller of the CPU and add some more NUMA awareness to the kernel. - For 32-bit this is currently highly experimental and should be only - used for kernel development. It might also cause boot failures. - For 64-bit this is recommended on all multiprocessor Opteron systems. - If the system is EM64T, you should say N unless your system is - EM64T NUMA. + For 64-bit this is recommended if the system is Intel Core 7i + (or later), AMD Opteron, or EM64T NUMA. + + For 32-bit this is only needed on (rare) 32-bit-only platforms + that support NUMA topologies, such as NUMAQ / Summit, or if you + boot a 32-bit kernel on a 64-bit NUMA platform.
+ + Otherwise, you should say N. comment "NUMA (Summit) requires SMP, 64GB highmem support, ACPI" depends on X86_32 && X86_SUMMIT && (!HIGHMEM64G || !ACPI) -- cgit v1.2.3 From 15002fa9bf3a79ac9dcafba7ff308586936088b2 Mon Sep 17 00:00:00 2001 From: Hiroshi Shimamoto Date: Fri, 7 Nov 2008 19:25:36 -0800 Subject: x86: signal: cosmetic unification of setup_sigcontext() Impact: cleanup Make setup_sigcontext() same. Signed-off-by: Hiroshi Shimamoto Signed-off-by: Ingo Molnar --- arch/x86/kernel/signal_32.c | 32 ++++++++++++++++++++++++++++---- arch/x86/kernel/signal_64.c | 33 ++++++++++++++++++++++++++++----- 2 files changed, 56 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/signal_32.c b/arch/x86/kernel/signal_32.c index 6a05c74b4084..27a5c8174322 100644 --- a/arch/x86/kernel/signal_32.c +++ b/arch/x86/kernel/signal_32.c @@ -279,14 +279,20 @@ static int setup_sigcontext(struct sigcontext __user *sc, void __user *fpstate, struct pt_regs *regs, unsigned long mask) { - int tmp, err = 0; + int err = 0; - err |= __put_user(regs->fs, (unsigned int __user *)&sc->fs); - savesegment(gs, tmp); - err |= __put_user(tmp, (unsigned int __user *)&sc->gs); +#ifdef CONFIG_X86_32 + { + unsigned int tmp; + savesegment(gs, tmp); + err |= __put_user(tmp, (unsigned int __user *)&sc->gs); + } + err |= __put_user(regs->fs, (unsigned int __user *)&sc->fs); err |= __put_user(regs->es, (unsigned int __user *)&sc->es); err |= __put_user(regs->ds, (unsigned int __user *)&sc->ds); +#endif /* CONFIG_X86_32 */ + err |= __put_user(regs->di, &sc->di); err |= __put_user(regs->si, &sc->si); err |= __put_user(regs->bp, &sc->bp); @@ -295,13 +301,31 @@ setup_sigcontext(struct sigcontext __user *sc, void __user *fpstate, err |= __put_user(regs->dx, &sc->dx); err |= __put_user(regs->cx, &sc->cx); err |= __put_user(regs->ax, &sc->ax); +#ifdef CONFIG_X86_64 + err |= __put_user(regs->r8, &sc->r8); + err |= __put_user(regs->r9, &sc->r9); + err |= __put_user(regs->r10, &sc->r10); 
+ err |= __put_user(regs->r11, &sc->r11); + err |= __put_user(regs->r12, &sc->r12); + err |= __put_user(regs->r13, &sc->r13); + err |= __put_user(regs->r14, &sc->r14); + err |= __put_user(regs->r15, &sc->r15); +#endif /* CONFIG_X86_64 */ + err |= __put_user(current->thread.trap_no, &sc->trapno); err |= __put_user(current->thread.error_code, &sc->err); err |= __put_user(regs->ip, &sc->ip); +#ifdef CONFIG_X86_32 err |= __put_user(regs->cs, (unsigned int __user *)&sc->cs); err |= __put_user(regs->flags, &sc->flags); err |= __put_user(regs->sp, &sc->sp_at_signal); err |= __put_user(regs->ss, (unsigned int __user *)&sc->ss); +#else /* !CONFIG_X86_32 */ + err |= __put_user(regs->flags, &sc->flags); + err |= __put_user(regs->cs, &sc->cs); + err |= __put_user(0, &sc->gs); + err |= __put_user(0, &sc->fs); +#endif /* CONFIG_X86_32 */ err |= __put_user(fpstate, &sc->fpstate); diff --git a/arch/x86/kernel/signal_64.c b/arch/x86/kernel/signal_64.c index 3868c2a21793..d2307e41fbdb 100644 --- a/arch/x86/kernel/signal_64.c +++ b/arch/x86/kernel/signal_64.c @@ -157,16 +157,23 @@ asmlinkage long sys_rt_sigreturn(struct pt_regs *regs) /* * Set up a signal frame. 
*/ - -static inline int +static int setup_sigcontext(struct sigcontext __user *sc, void __user *fpstate, struct pt_regs *regs, unsigned long mask) { int err = 0; - err |= __put_user(regs->cs, &sc->cs); - err |= __put_user(0, &sc->gs); - err |= __put_user(0, &sc->fs); +#ifdef CONFIG_X86_32 + { + unsigned int tmp; + + savesegment(gs, tmp); + err |= __put_user(tmp, (unsigned int __user *)&sc->gs); + } + err |= __put_user(regs->fs, (unsigned int __user *)&sc->fs); + err |= __put_user(regs->es, (unsigned int __user *)&sc->es); + err |= __put_user(regs->ds, (unsigned int __user *)&sc->ds); +#endif /* CONFIG_X86_32 */ err |= __put_user(regs->di, &sc->di); err |= __put_user(regs->si, &sc->si); @@ -176,6 +183,7 @@ setup_sigcontext(struct sigcontext __user *sc, void __user *fpstate, err |= __put_user(regs->dx, &sc->dx); err |= __put_user(regs->cx, &sc->cx); err |= __put_user(regs->ax, &sc->ax); +#ifdef CONFIG_X86_64 err |= __put_user(regs->r8, &sc->r8); err |= __put_user(regs->r9, &sc->r9); err |= __put_user(regs->r10, &sc->r10); @@ -184,11 +192,26 @@ setup_sigcontext(struct sigcontext __user *sc, void __user *fpstate, err |= __put_user(regs->r13, &sc->r13); err |= __put_user(regs->r14, &sc->r14); err |= __put_user(regs->r15, &sc->r15); +#endif /* CONFIG_X86_64 */ + err |= __put_user(current->thread.trap_no, &sc->trapno); err |= __put_user(current->thread.error_code, &sc->err); err |= __put_user(regs->ip, &sc->ip); +#ifdef CONFIG_X86_32 + err |= __put_user(regs->cs, (unsigned int __user *)&sc->cs); err |= __put_user(regs->flags, &sc->flags); + err |= __put_user(regs->sp, &sc->sp_at_signal); + err |= __put_user(regs->ss, (unsigned int __user *)&sc->ss); +#else /* !CONFIG_X86_32 */ + err |= __put_user(regs->flags, &sc->flags); + err |= __put_user(regs->cs, &sc->cs); + err |= __put_user(0, &sc->gs); + err |= __put_user(0, &sc->fs); +#endif /* CONFIG_X86_32 */ + err |= __put_user(fpstate, &sc->fpstate); + + /* non-iBCS2 extensions.. 
*/ err |= __put_user(mask, &sc->oldmask); err |= __put_user(current->thread.cr2, &sc->cr2); -- cgit v1.2.3 From cb9e35dce94a1b9c59d46224e8a94377d673e204 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 8 Nov 2008 20:27:00 +0100 Subject: x86: clean up rdtsc_barrier() use Impact: cleanup Move rdtsc_barrier() use to vsyscall_64.c where it's relied on, and point out its role in the context of its use. Signed-off-by: Ingo Molnar --- arch/x86/include/asm/tsc.h | 6 +----- arch/x86/kernel/vsyscall_64.c | 9 +++++++++ 2 files changed, 10 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/tsc.h b/arch/x86/include/asm/tsc.h index 9cd83a8e40d5..700aeb8d2098 100644 --- a/arch/x86/include/asm/tsc.h +++ b/arch/x86/include/asm/tsc.h @@ -44,11 +44,7 @@ static __always_inline cycles_t vget_cycles(void) if (!cpu_has_tsc) return 0; #endif - rdtsc_barrier(); - cycles = (cycles_t)__native_read_tsc(); - rdtsc_barrier(); - - return cycles; + return (cycles_t)__native_read_tsc(); } extern void tsc_init(void); diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c index 0b8b6690a86d..ebf2f12900f5 100644 --- a/arch/x86/kernel/vsyscall_64.c +++ b/arch/x86/kernel/vsyscall_64.c @@ -128,7 +128,16 @@ static __always_inline void do_vgettimeofday(struct timeval * tv) gettimeofday(tv,NULL); return; } + + /* + * Surround the RDTSC by barriers, to make sure it's not + * speculated to outside the seqlock critical section and + * does not cause time warps: + */ + rdtsc_barrier(); now = vread(); + rdtsc_barrier(); + base = __vsyscall_gtod_data.clock.cycle_last; mask = __vsyscall_gtod_data.clock.mask; mult = __vsyscall_gtod_data.clock.mult; -- cgit v1.2.3 From 4fcc50abdffb517cee36cec9cb22138d84fb62d0 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 9 Nov 2008 08:10:03 +0100 Subject: x86: clean up vget_cycles() Impact: remove unused variable I forgot to remove the now unused "cycles_t cycles" local variable from vget_cycles() - which triggers
build warnings as tsc.h is included in a number of files. Remove it. Signed-off-by: Ingo Molnar --- arch/x86/include/asm/tsc.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/tsc.h b/arch/x86/include/asm/tsc.h index 700aeb8d2098..38ae163cc91b 100644 --- a/arch/x86/include/asm/tsc.h +++ b/arch/x86/include/asm/tsc.h @@ -34,8 +34,6 @@ static inline cycles_t get_cycles(void) static __always_inline cycles_t vget_cycles(void) { - cycles_t cycles; - /* * We only do VDSOs on TSC capable CPUs, so this shouldnt * access boot_cpu_data (which is not VDSO-safe): -- cgit v1.2.3 From 19f47c634ea8c5a10ff7bb1a08c52fd0f49bc54c Mon Sep 17 00:00:00 2001 From: Harvey Harrison Date: Sun, 9 Nov 2008 21:28:15 -0800 Subject: x86: x86_32 has its own irq_regs definition Impact: cleanup Arches that have their own irq_regs definition are expected to define ARCH_HAS_OWN_IRQ_REGS or else a generic (unused) set will also be defined in lib/irq_regs.c Sparse noticed the unused generic one had no prototype: lib/irq_regs.c:15:1: warning: symbol 'per_cpu____irq_regs' was not declared. Should it be static? 
Signed-off-by: Harvey Harrison Signed-off-by: Ingo Molnar --- arch/x86/include/asm/irq_regs_32.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/irq_regs_32.h b/arch/x86/include/asm/irq_regs_32.h index af2f02d27fc7..86afd7473457 100644 --- a/arch/x86/include/asm/irq_regs_32.h +++ b/arch/x86/include/asm/irq_regs_32.h @@ -9,6 +9,8 @@ #include +#define ARCH_HAS_OWN_IRQ_REGS + DECLARE_PER_CPU(struct pt_regs *, irq_regs); static inline struct pt_regs *get_irq_regs(void) -- cgit v1.2.3 From a3d732f93785da17e0137210deadb4616f5536fc Mon Sep 17 00:00:00 2001 From: Cliff Wickman Date: Mon, 10 Nov 2008 16:16:31 -0600 Subject: x86, UV: fix redundant creation of sgi_uv Impact: fix double entry creation in /proc There is a collision between two UV functions: both uv_ptc_init() and gru_proc_init() try to make /proc/sgi_uv. So move its creation to a single place: uv_system_init() Signed-off-by: Cliff Wickman Signed-off-by: Ingo Molnar --- arch/x86/kernel/genx2apic_uv_x.c | 2 ++ arch/x86/kernel/tlb_uv.c | 4 ---- drivers/misc/sgi-gru/gruprocfs.c | 1 - 3 files changed, 2 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/genx2apic_uv_x.c b/arch/x86/kernel/genx2apic_uv_x.c index 85fb7dd48f67..d7213a1cb784 100644 --- a/arch/x86/kernel/genx2apic_uv_x.c +++ b/arch/x86/kernel/genx2apic_uv_x.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include @@ -570,4 +571,5 @@ void __init uv_system_init(void) uv_cpu_init(); uv_scir_register_cpu_notifier(); + proc_mkdir("sgi_uv", NULL); } diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c index 04431f34fd16..6a00e5faaa74 100644 --- a/arch/x86/kernel/tlb_uv.c +++ b/arch/x86/kernel/tlb_uv.c @@ -566,14 +566,10 @@ static int __init uv_ptc_init(void) if (!is_uv_system()) return 0; - if (!proc_mkdir("sgi_uv", NULL)) - return -EINVAL; - proc_uv_ptc = create_proc_entry(UV_PTC_BASENAME, 0444, NULL); if (!proc_uv_ptc) { printk(KERN_ERR
"unable to create %s proc entry\n", UV_PTC_BASENAME); - remove_proc_entry("sgi_uv", NULL); return -EINVAL; } proc_uv_ptc->proc_fops = &proc_uv_ptc_operations; diff --git a/drivers/misc/sgi-gru/gruprocfs.c b/drivers/misc/sgi-gru/gruprocfs.c index 533923f83f1a..73b0ca061bb5 100644 --- a/drivers/misc/sgi-gru/gruprocfs.c +++ b/drivers/misc/sgi-gru/gruprocfs.c @@ -317,7 +317,6 @@ int gru_proc_init(void) { struct proc_entry *p; - proc_mkdir("sgi_uv", NULL); proc_gru = proc_mkdir("sgi_uv/gru", NULL); for (p = proc_files; p->name; p++) -- cgit v1.2.3 From c280ea5e4c6ba0b38ed6b005150fe16a660e903b Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 8 Nov 2008 13:29:45 +0100 Subject: x86: fix documentation typo in arch/x86/Kconfig Impact: documentation update Chris Snook pointed out that it's Core i7, not Core 7i. Reported-by: Chris Snook Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 38ae04bf6514..bacac556b189 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -963,7 +963,7 @@ config NUMA local memory controller of the CPU and add some more NUMA awareness to the kernel. - For 64-bit this is recommended if the system is Intel Core 7i + For 64-bit this is recommended if the system is Intel Core i7 (or later), AMD Opteron, or EM64T NUMA. For 32-bit this is only needed on (rare) 32-bit-only platforms -- cgit v1.2.3 From 4687518c4cb7807fbeff21770e309080f9eb7f2f Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Tue, 11 Nov 2008 13:03:07 -0800 Subject: x86: 32 bit: interrupt stub consistency with 64 bit Don't generate interrupt stubs for interrupt vectors below FIRST_EXTERNAL_VECTOR, and make the table of interrupt vectors (interrupt[]) __initconst. Both of these changes both conserve memory and improve consistency with 64 bits. Signed-off-by: H. 
Peter Anvin --- arch/x86/include/asm/hw_irq.h | 2 +- arch/x86/kernel/entry_32.S | 6 +++--- arch/x86/kernel/irqinit_32.c | 2 +- arch/x86/lguest/boot.c | 3 ++- 4 files changed, 7 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h index b97aecb0b61d..27d33f92afe2 100644 --- a/arch/x86/include/asm/hw_irq.h +++ b/arch/x86/include/asm/hw_irq.h @@ -110,7 +110,7 @@ extern asmlinkage void smp_invalidate_interrupt(struct pt_regs *); #endif #ifdef CONFIG_X86_32 -extern void (*const interrupt[NR_VECTORS])(void); +extern void (*__initconst interrupt[NR_VECTORS-FIRST_EXTERNAL_VECTOR])(void); #endif typedef int vector_irq_t[NR_VECTORS]; diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index 28b597ef9ca1..4aea95652cff 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -622,16 +622,16 @@ END(syscall_badsys) * Build the entry stubs and pointer table with * some assembler magic. */ -.section .rodata,"a" +.section .init.rodata,"a" ENTRY(interrupt) .text ENTRY(irq_entries_start) RING0_INT_FRAME -vector=0 +vector=FIRST_EXTERNAL_VECTOR .rept NR_VECTORS ALIGN - .if vector + .if vector != FIRST_EXTERNAL_VECTOR CFI_ADJUST_CFA_OFFSET -4 .endif 1: pushl $~(vector) diff --git a/arch/x86/kernel/irqinit_32.c b/arch/x86/kernel/irqinit_32.c index 845aa9803e80..607db63044a5 100644 --- a/arch/x86/kernel/irqinit_32.c +++ b/arch/x86/kernel/irqinit_32.c @@ -129,7 +129,7 @@ void __init native_init_IRQ(void) for (i = FIRST_EXTERNAL_VECTOR; i < NR_VECTORS; i++) { /* SYSCALL_VECTOR was reserved in trap_init. 
*/ if (i != SYSCALL_VECTOR) - set_intr_gate(i, interrupt[i]); + set_intr_gate(i, interrupt[i-FIRST_EXTERNAL_VECTOR]); } diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c index a5d8e1ace1cf..50a779264bb1 100644 --- a/arch/x86/lguest/boot.c +++ b/arch/x86/lguest/boot.c @@ -590,7 +590,8 @@ static void __init lguest_init_IRQ(void) * a straightforward 1 to 1 mapping, so force that here. */ __get_cpu_var(vector_irq)[vector] = i; if (vector != SYSCALL_VECTOR) { - set_intr_gate(vector, interrupt[vector]); + set_intr_gate(vector, + interrupt[vector-FIRST_EXTERNAL_VECTOR]); set_irq_chip_and_handler_name(i, &lguest_irq_controller, handle_level_irq, "level"); -- cgit v1.2.3 From b7c6244f13d37592003b46e12500a90e9781ad9d Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Tue, 11 Nov 2008 13:24:58 -0800 Subject: x86: 32 bits: shrink and align IRQ stubs Shrink the IRQ stubs on 32 bits down to just over four bytes per (we fit seven into a 32-byte chunk.) This shrinks the total icache consumption of the IRQ stubs down to an even kilobyte, if all of them are in active use. The downside is that we end up with a double jump, which could have a negative effect on some pipelines. The double jump is always inside the same cacheline on any modern chips (the exception being 486/Elan/Geode which have only 16-byte cachelines, but are unlikely to have too many interrupt sources.) To get the most effect, cache-align the IRQ stubs. Signed-off-by: H. Peter Anvin --- arch/x86/kernel/entry_32.S | 34 ++++++++++++++++++++++------------ 1 file changed, 22 insertions(+), 12 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index 4aea95652cff..dae81b9fd451 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -619,28 +619,37 @@ END(syscall_badsys) 27:; /* - * Build the entry stubs and pointer table with - * some assembler magic. + * Build the entry stubs and pointer table with some assembler magic. 
+ * We pack 7 stubs into a single 32-byte chunk, which will fit in a + * single cache line on all modern x86 implementations. */ .section .init.rodata,"a" ENTRY(interrupt) .text - + .p2align 5 + .p2align CONFIG_X86_L1_CACHE_SHIFT ENTRY(irq_entries_start) RING0_INT_FRAME vector=FIRST_EXTERNAL_VECTOR -.rept NR_VECTORS - ALIGN - .if vector != FIRST_EXTERNAL_VECTOR +.rept (NR_VECTORS-FIRST_EXTERNAL_VECTOR+6)/7 + .balign 32 + .rept 7 + .if vector < NR_VECTORS + .if vector != FIRST_EXTERNAL_VECTOR CFI_ADJUST_CFA_OFFSET -4 - .endif -1: pushl $~(vector) + .endif +1: pushl $(~vector+0x80) /* Note: always in signed byte range */ CFI_ADJUST_CFA_OFFSET 4 - jmp common_interrupt - .previous + .if ((vector-FIRST_EXTERNAL_VECTOR)%7) != 6 + jmp 2f + .endif + .previous .long 1b - .text + .text vector=vector+1 + .endif + .endr +2: jmp common_interrupt .endr END(irq_entries_start) @@ -652,8 +661,9 @@ END(interrupt) * the CPU automatically disables interrupts when executing an IRQ vector, * so IRQ-flags tracing has to follow that: */ - ALIGN + .p2align CONFIG_X86_L1_CACHE_SHIFT common_interrupt: + addl $-0x80,(%esp) /* Adjust vector into the [-256,-1] range */ SAVE_ALL TRACE_IRQS_OFF movl %esp,%eax -- cgit v1.2.3 From 939b787130bf22887a09d8fd2641a094dcef8c22 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Tue, 11 Nov 2008 13:51:52 -0800 Subject: x86: 64 bits: shrink and align IRQ stubs Move the IRQ stub generation to assembly to simplify it and for consistency with 32 bits. Doing it in a C file with asm() statements doesn't help clarity, and it prevents some optimizations. Shrink the IRQ stubs down to just over four bytes per (we fit seven into a 32-byte chunk.) This shrinks the total icache consumption of the IRQ stubs down to an even kilobyte, if all of them are in active use. The downside is that we end up with a double jump, which could have a negative effect on some pipelines. The double jump is always inside the same cacheline on any modern chips. 
To get the most effect, cache-align the IRQ stubs. This makes the 64-bit code match changes already done to the 32-bit code, and should open up irqinit*.c for unification. Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/hw_irq.h | 2 -- arch/x86/kernel/entry_64.S | 48 +++++++++++++++++++++++++++++-- arch/x86/kernel/irqinit_64.c | 66 ------------------------------------------- 3 files changed, 45 insertions(+), 71 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h index 27d33f92afe2..8de644b6b959 100644 --- a/arch/x86/include/asm/hw_irq.h +++ b/arch/x86/include/asm/hw_irq.h @@ -109,9 +109,7 @@ extern asmlinkage void smp_invalidate_interrupt(struct pt_regs *); #endif #endif -#ifdef CONFIG_X86_32 extern void (*__initconst interrupt[NR_VECTORS-FIRST_EXTERNAL_VECTOR])(void); -#endif typedef int vector_irq_t[NR_VECTORS]; DECLARE_PER_CPU(vector_irq_t, vector_irq); diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index b86f332c96a6..9b2aeaac9a6b 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -627,6 +627,46 @@ END(stub_rt_sigreturn) vector already pushed) */ #define XCPT_FRAME _frame ORIG_RAX +/* + * Build the entry stubs and pointer table with some assembler magic. + * We pack 7 stubs into a single 32-byte chunk, which will fit in a + * single cache line on all modern x86 implementations. 
+ */ + .section .init.rodata,"a" +ENTRY(interrupt) + .text + .p2align 5 + .p2align CONFIG_X86_L1_CACHE_SHIFT +ENTRY(irq_entries_start) + INTR_FRAME +vector=FIRST_EXTERNAL_VECTOR +.rept (NR_VECTORS-FIRST_EXTERNAL_VECTOR+6)/7 + .balign 32 + .rept 7 + .if vector < NR_VECTORS + .if vector != FIRST_EXTERNAL_VECTOR + CFI_ADJUST_CFA_OFFSET -8 + .endif +1: pushq $(~vector+0x80) /* Note: always in signed byte range */ + CFI_ADJUST_CFA_OFFSET 8 + .if ((vector-FIRST_EXTERNAL_VECTOR)%7) != 6 + jmp 2f + .endif + .previous + .quad 1b + .text +vector=vector+1 + .endif + .endr +2: jmp common_interrupt +.endr + CFI_ENDPROC +END(irq_entries_start) + +.previous +END(interrupt) +.previous + /* * Interrupt entry/exit. * @@ -635,11 +675,12 @@ END(stub_rt_sigreturn) * Entry runs with interrupts off. */ -/* 0(%rsp): interrupt number */ +/* 0(%rsp): ~(interrupt number)+0x80 */ .macro interrupt func + addq $-0x80,(%rsp) /* Adjust vector to [-256,-1] range */ cld SAVE_ARGS - leaq -ARGOFFSET(%rsp),%rdi # arg1 for handler + leaq -ARGOFFSET(%rsp),%rdi /* arg1 for handler */ pushq %rbp /* * Save rbp twice: One is for marking the stack frame, as usual, and the @@ -670,7 +711,8 @@ END(stub_rt_sigreturn) call \func .endm -ENTRY(common_interrupt) + .p2align CONFIG_X86_L1_CACHE_SHIFT +common_interrupt: XCPT_FRAME interrupt do_IRQ /* 0(%rsp): oldrsp-ARGOFFSET */ diff --git a/arch/x86/kernel/irqinit_64.c b/arch/x86/kernel/irqinit_64.c index ff0235391285..8670b3ce626e 100644 --- a/arch/x86/kernel/irqinit_64.c +++ b/arch/x86/kernel/irqinit_64.c @@ -23,41 +23,6 @@ #include #include -/* - * Common place to define all x86 IRQ vectors - * - * This builds up the IRQ handler stubs using some ugly macros in irq.h - * - * These macros create the low-level assembly IRQ routines that save - * register context and call do_IRQ(). do_IRQ() then does all the - * operations that are needed to keep the AT (or SMP IOAPIC) - * interrupt-controller happy. 
- */ - -#define IRQ_NAME2(nr) nr##_interrupt(void) -#define IRQ_NAME(nr) IRQ_NAME2(IRQ##nr) - -/* - * SMP has a few special interrupts for IPI messages - */ - -#define BUILD_IRQ(nr) \ - asmlinkage void IRQ_NAME(nr); \ - asm("\n.text\n.p2align\n" \ - "IRQ" #nr "_interrupt:\n\t" \ - "push $~(" #nr ") ; " \ - "jmp common_interrupt\n" \ - ".previous"); - -#define BI(x,y) \ - BUILD_IRQ(x##y) - -#define BUILD_16_IRQS(x) \ - BI(x,0) BI(x,1) BI(x,2) BI(x,3) \ - BI(x,4) BI(x,5) BI(x,6) BI(x,7) \ - BI(x,8) BI(x,9) BI(x,a) BI(x,b) \ - BI(x,c) BI(x,d) BI(x,e) BI(x,f) - /* * ISA PIC or low IO-APIC triggered (INTA-cycle or APIC) interrupts: * (these are usually mapped to vectors 0x30-0x3f) @@ -73,37 +38,6 @@ * * (these are usually mapped into the 0x30-0xff vector range) */ - BUILD_16_IRQS(0x2) BUILD_16_IRQS(0x3) -BUILD_16_IRQS(0x4) BUILD_16_IRQS(0x5) BUILD_16_IRQS(0x6) BUILD_16_IRQS(0x7) -BUILD_16_IRQS(0x8) BUILD_16_IRQS(0x9) BUILD_16_IRQS(0xa) BUILD_16_IRQS(0xb) -BUILD_16_IRQS(0xc) BUILD_16_IRQS(0xd) BUILD_16_IRQS(0xe) BUILD_16_IRQS(0xf) - -#undef BUILD_16_IRQS -#undef BI - - -#define IRQ(x,y) \ - IRQ##x##y##_interrupt - -#define IRQLIST_16(x) \ - IRQ(x,0), IRQ(x,1), IRQ(x,2), IRQ(x,3), \ - IRQ(x,4), IRQ(x,5), IRQ(x,6), IRQ(x,7), \ - IRQ(x,8), IRQ(x,9), IRQ(x,a), IRQ(x,b), \ - IRQ(x,c), IRQ(x,d), IRQ(x,e), IRQ(x,f) - -/* for the irq vectors */ -static void (*__initdata interrupt[NR_VECTORS - FIRST_EXTERNAL_VECTOR])(void) = { - IRQLIST_16(0x2), IRQLIST_16(0x3), - IRQLIST_16(0x4), IRQLIST_16(0x5), IRQLIST_16(0x6), IRQLIST_16(0x7), - IRQLIST_16(0x8), IRQLIST_16(0x9), IRQLIST_16(0xa), IRQLIST_16(0xb), - IRQLIST_16(0xc), IRQLIST_16(0xd), IRQLIST_16(0xe), IRQLIST_16(0xf) -}; - -#undef IRQ -#undef IRQLIST_16 - - - /* * IRQ2 is cascade interrupt to second interrupt controller -- cgit v1.2.3 From 4a61204856e8b28e9f5489a7875cb3a60afd1e43 Mon Sep 17 00:00:00 2001 From: Hiroshi Shimamoto Date: Tue, 11 Nov 2008 19:09:29 -0800 Subject: x86: signal_32: introduce retcode and rt_retcode Impact: 
cleanup Introduce retcode and rt_retcode to replace setting up frame->retcode. Signed-off-by: Hiroshi Shimamoto Signed-off-by: Ingo Molnar --- arch/x86/kernel/signal_32.c | 30 ++++++++++++++++++++++++------ 1 file changed, 24 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/signal_32.c b/arch/x86/kernel/signal_32.c index 27a5c8174322..514171ac0d03 100644 --- a/arch/x86/kernel/signal_32.c +++ b/arch/x86/kernel/signal_32.c @@ -45,6 +45,28 @@ # define FIX_EFLAGS __FIX_EFLAGS #endif +static const struct { + u16 poplmovl; + u32 val; + u16 int80; +} __attribute__((packed)) retcode = { + 0xb858, /* popl %eax; movl $..., %eax */ + __NR_sigreturn, + 0x80cd, /* int $0x80 */ +}; + +static const struct { + u8 movl; + u32 val; + u16 int80; + u8 pad; +} __attribute__((packed)) rt_retcode = { + 0xb8, /* movl $..., %eax */ + __NR_rt_sigreturn, + 0x80cd, /* int $0x80 */ + 0 +}; + /* * Atomically swap in the new signal mask, and wait for a signal. */ @@ -427,9 +449,7 @@ __setup_frame(int sig, struct k_sigaction *ka, sigset_t *set, * reasons and because gdb uses it as a signature to notice * signal handler stack frames. */ - err |= __put_user(0xb858, (short __user *)(frame->retcode+0)); - err |= __put_user(__NR_sigreturn, (int __user *)(frame->retcode+2)); - err |= __put_user(0x80cd, (short __user *)(frame->retcode+6)); + err |= __put_user(*((u64 *)&retcode), (u64 *)frame->retcode); if (err) return -EFAULT; @@ -498,9 +518,7 @@ static int __setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, * reasons and because gdb uses it as a signature to notice * signal handler stack frames. 
*/ - err |= __put_user(0xb8, (char __user *)(frame->retcode+0)); - err |= __put_user(__NR_rt_sigreturn, (int __user *)(frame->retcode+1)); - err |= __put_user(0x80cd, (short __user *)(frame->retcode+5)); + err |= __put_user(*((u64 *)&rt_retcode), (u64 *)frame->retcode); if (err) return -EFAULT; -- cgit v1.2.3 From 9cc3c49ed1b1dbf6e50de4055a6773bd162f24b7 Mon Sep 17 00:00:00 2001 From: Hiroshi Shimamoto Date: Tue, 11 Nov 2008 19:11:39 -0800 Subject: x86: ia32_signal: remove unnecessary padding Impact: reduce structure padding Remove unnecessary paddings, this saves 4 bytes. Signed-off-by: Hiroshi Shimamoto Signed-off-by: Ingo Molnar --- arch/x86/ia32/ia32_signal.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c index 47ddc23f4f54..e2d0bc779bf7 100644 --- a/arch/x86/ia32/ia32_signal.c +++ b/arch/x86/ia32/ia32_signal.c @@ -427,12 +427,10 @@ int ia32_setup_frame(int sig, struct k_sigaction *ka, u16 poplmovl; u32 val; u16 int80; - u16 pad; } __attribute__((packed)) code = { 0xb858, /* popl %eax ; movl $...,%eax */ __NR_ia32_sigreturn, 0x80cd, /* int $0x80 */ - 0, }; frame = get_sigframe(ka, regs, sizeof(*frame), &fpstate); @@ -508,8 +506,7 @@ int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, u8 movl; u32 val; u16 int80; - u16 pad; - u8 pad2; + u8 pad; } __attribute__((packed)) code = { 0xb8, __NR_ia32_rt_sigreturn, -- cgit v1.2.3 From 8665596ec05498525014436520b316ba174a068a Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Wed, 12 Nov 2008 10:27:35 -0800 Subject: x86: fix up the new IRQ code for older versions of gas Older versions of gas don't implement the C-style != operator, they instead want the Pascal-style <> operator. Change != to <> so we don't break compilation with those old versions of gas. Signed-off-by: H. 
Peter Anvin --- arch/x86/kernel/entry_32.S | 4 ++-- arch/x86/kernel/entry_64.S | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index dae81b9fd451..bd02ec77edc4 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -635,12 +635,12 @@ vector=FIRST_EXTERNAL_VECTOR .balign 32 .rept 7 .if vector < NR_VECTORS - .if vector != FIRST_EXTERNAL_VECTOR + .if vector <> FIRST_EXTERNAL_VECTOR CFI_ADJUST_CFA_OFFSET -4 .endif 1: pushl $(~vector+0x80) /* Note: always in signed byte range */ CFI_ADJUST_CFA_OFFSET 4 - .if ((vector-FIRST_EXTERNAL_VECTOR)%7) != 6 + .if ((vector-FIRST_EXTERNAL_VECTOR)%7) <> 6 jmp 2f .endif .previous diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 9b2aeaac9a6b..2b42362a85b2 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -644,12 +644,12 @@ vector=FIRST_EXTERNAL_VECTOR .balign 32 .rept 7 .if vector < NR_VECTORS - .if vector != FIRST_EXTERNAL_VECTOR + .if vector <> FIRST_EXTERNAL_VECTOR CFI_ADJUST_CFA_OFFSET -8 .endif 1: pushq $(~vector+0x80) /* Note: always in signed byte range */ CFI_ADJUST_CFA_OFFSET 8 - .if ((vector-FIRST_EXTERNAL_VECTOR)%7) != 6 + .if ((vector-FIRST_EXTERNAL_VECTOR)%7) <> 6 jmp 2f .endif .previous -- cgit v1.2.3 From 8652cb4b0d87accbe78725fd2a13be2787059649 Mon Sep 17 00:00:00 2001 From: Prarit Bhargava Date: Wed, 12 Nov 2008 13:35:00 -0500 Subject: x86: warn of incorrect cpu_khz on AMD systems Impact: add debug check If none of the perfctrs are free when calculating cpu_khz we default to using ctr 3 (ie, we just choose 3). This may lead to an incorrect tsc freq value which can cause the system to be unstable. To aid in future debugging, WARN the user of a potential problem. 
Signed-off-by: Prarit Bhargava Signed-off-by: Ingo Molnar --- arch/x86/kernel/time_64.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/time_64.c b/arch/x86/kernel/time_64.c index cb19d650c216..418a095c5796 100644 --- a/arch/x86/kernel/time_64.c +++ b/arch/x86/kernel/time_64.c @@ -80,6 +80,8 @@ unsigned long __init calibrate_cpu(void) break; no_ctr_free = (i == 4); if (no_ctr_free) { + WARN(1, KERN_WARNING "Warning: AMD perfctrs busy ... " + "cpu_khz value may be incorrect.\n"); i = 3; rdmsrl(MSR_K7_EVNTSEL3, evntsel3); wrmsrl(MSR_K7_EVNTSEL3, 0); -- cgit v1.2.3 From 350b4da71f8326b9319ada7b701f2bce2e1285b7 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 14 Nov 2008 10:38:40 +1100 Subject: CRED: Wrap task credential accesses in the x86 arch Wrap access to task credentials so that they can be separated more easily from the task_struct during the introduction of COW creds. Change most current->(|e|s|fs)[ug]id to current_(|e|s|fs)[ug]id(). Change some task->e?[ug]id to task_e?[ug]id(). In some places it makes more sense to use RCU directly rather than a convenient wrapper; these will be addressed by later patches. Signed-off-by: David Howells Reviewed-by: James Morris Acked-by: Serge Hallyn Cc: Thomas Gleixner Cc: Ingo Molnar Cc: H. Peter Anvin Signed-off-by: James Morris --- arch/x86/mm/fault.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 31e8730fa246..3a1b6ef4f05d 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -393,7 +393,7 @@ static void show_fault_oops(struct pt_regs *regs, unsigned long error_code, if (pte && pte_present(*pte) && !pte_exec(*pte)) printk(KERN_CRIT "kernel tried to execute " "NX-protected page - exploit attempt? 
" - "(uid: %d)\n", current->uid); + "(uid: %d)\n", current_uid()); } #endif -- cgit v1.2.3 From a6f76f23d297f70e2a6b3ec607f7aeeea9e37e8d Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 14 Nov 2008 10:39:24 +1100 Subject: CRED: Make execve() take advantage of copy-on-write credentials Make execve() take advantage of copy-on-write credentials, allowing it to set up the credentials in advance, and then commit the whole lot after the point of no return. This patch and the preceding patches have been tested with the LTP SELinux testsuite. This patch makes several logical sets of alteration: (1) execve(). The credential bits from struct linux_binprm are, for the most part, replaced with a single credentials pointer (bprm->cred). This means that all the creds can be calculated in advance and then applied at the point of no return with no possibility of failure. I would like to replace bprm->cap_effective with: cap_isclear(bprm->cap_effective) but this seems impossible due to special behaviour for processes of pid 1 (they always retain their parent's capability masks where normally they'd be changed - see cap_bprm_set_creds()). The following sequence of events now happens: (a) At the start of do_execve, the current task's cred_exec_mutex is locked to prevent PTRACE_ATTACH from obsoleting the calculation of creds that we make. (a) prepare_exec_creds() is then called to make a copy of the current task's credentials and prepare it. This copy is then assigned to bprm->cred. This renders security_bprm_alloc() and security_bprm_free() unnecessary, and so they've been removed. (b) The determination of unsafe execution is now performed immediately after (a) rather than later on in the code. The result is stored in bprm->unsafe for future reference. (c) prepare_binprm() is called, possibly multiple times. (i) This applies the result of set[ug]id binaries to the new creds attached to bprm->cred. 
Personality bit clearance is recorded, but now deferred on the basis that the exec procedure may yet fail. (ii) This then calls the new security_bprm_set_creds(). This should calculate the new LSM and capability credentials into *bprm->cred. This folds together security_bprm_set() and parts of security_bprm_apply_creds() (these two have been removed). Anything that might fail must be done at this point. (iii) bprm->cred_prepared is set to 1. bprm->cred_prepared is 0 on the first pass of the security calculations, and 1 on all subsequent passes. This allows SELinux in (ii) to base its calculations only on the initial script and not on the interpreter. (d) flush_old_exec() is called to commit the task to execution. This performs the following steps with regard to credentials: (i) Clear pdeath_signal and set dumpable on certain circumstances that may not be covered by commit_creds(). (ii) Clear any bits in current->personality that were deferred from (c.i). (e) install_exec_creds() [compute_creds() as was] is called to install the new credentials. This performs the following steps with regard to credentials: (i) Calls security_bprm_committing_creds() to apply any security requirements, such as flushing unauthorised files in SELinux, that must be done before the credentials are changed. This is made up of bits of security_bprm_apply_creds() and security_bprm_post_apply_creds(), both of which have been removed. This function is not allowed to fail; anything that might fail must have been done in (c.ii). (ii) Calls commit_creds() to apply the new credentials in a single assignment (more or less). Possibly pdeath_signal and dumpable should be part of struct creds. (iii) Unlocks the task's cred_replace_mutex, thus allowing PTRACE_ATTACH to take place. (iv) Clears The bprm->cred pointer as the credentials it was holding are now immutable. (v) Calls security_bprm_committed_creds() to apply any security alterations that must be done after the creds have been changed. 
SELinux uses this to flush signals and signal handlers. (f) If an error occurs before (d.i), bprm_free() will call abort_creds() to destroy the proposed new credentials and will then unlock cred_replace_mutex. No changes to the credentials will have been made. (2) LSM interface. A number of functions have been changed, added or removed: (*) security_bprm_alloc(), ->bprm_alloc_security() (*) security_bprm_free(), ->bprm_free_security() Removed in favour of preparing new credentials and modifying those. (*) security_bprm_apply_creds(), ->bprm_apply_creds() (*) security_bprm_post_apply_creds(), ->bprm_post_apply_creds() Removed; split between security_bprm_set_creds(), security_bprm_committing_creds() and security_bprm_committed_creds(). (*) security_bprm_set(), ->bprm_set_security() Removed; folded into security_bprm_set_creds(). (*) security_bprm_set_creds(), ->bprm_set_creds() New. The new credentials in bprm->creds should be checked and set up as appropriate. bprm->cred_prepared is 0 on the first call, 1 on the second and subsequent calls. (*) security_bprm_committing_creds(), ->bprm_committing_creds() (*) security_bprm_committed_creds(), ->bprm_committed_creds() New. Apply the security effects of the new credentials. This includes closing unauthorised files in SELinux. This function may not fail. When the former is called, the creds haven't yet been applied to the process; when the latter is called, they have. The former may access bprm->cred, the latter may not. (3) SELinux. SELinux has a number of changes, in addition to those to support the LSM interface changes mentioned above: (a) The bprm_security_struct struct has been removed in favour of using the credentials-under-construction approach. (c) flush_unauthorized_files() now takes a cred pointer and passes it on to inode_has_perm(), file_has_perm() and dentry_open(). 
Signed-off-by: David Howells Acked-by: James Morris Acked-by: Serge Hallyn Signed-off-by: James Morris --- arch/x86/ia32/ia32_aout.c | 2 +- fs/binfmt_aout.c | 2 +- fs/binfmt_elf.c | 2 +- fs/binfmt_elf_fdpic.c | 2 +- fs/binfmt_flat.c | 2 +- fs/binfmt_som.c | 2 +- fs/compat.c | 42 +++--- fs/exec.c | 149 +++++++++++--------- fs/internal.h | 6 + include/linux/audit.h | 16 --- include/linux/binfmts.h | 16 ++- include/linux/cred.h | 3 +- include/linux/key.h | 2 - include/linux/security.h | 103 +++++--------- kernel/cred.c | 46 ++++++- security/capability.c | 19 +-- security/commoncap.c | 152 ++++++++++---------- security/keys/process_keys.c | 42 ------ security/root_plug.c | 13 +- security/security.c | 26 ++-- security/selinux/hooks.c | 283 ++++++++++++++++---------------------- security/selinux/include/objsec.h | 11 -- security/smack/smack_lsm.c | 3 +- 23 files changed, 429 insertions(+), 515 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/ia32/ia32_aout.c b/arch/x86/ia32/ia32_aout.c index 127ec3f07214..2a4d073d2cf1 100644 --- a/arch/x86/ia32/ia32_aout.c +++ b/arch/x86/ia32/ia32_aout.c @@ -327,7 +327,7 @@ static int load_aout_binary(struct linux_binprm *bprm, struct pt_regs *regs) current->mm->cached_hole_size = 0; current->mm->mmap = NULL; - compute_creds(bprm); + install_exec_creds(bprm); current->flags &= ~PF_FORKNOEXEC; if (N_MAGIC(ex) == OMAGIC) { diff --git a/fs/binfmt_aout.c b/fs/binfmt_aout.c index 204cfd1d7676..f1f3f4192a60 100644 --- a/fs/binfmt_aout.c +++ b/fs/binfmt_aout.c @@ -320,7 +320,7 @@ static int load_aout_binary(struct linux_binprm * bprm, struct pt_regs * regs) current->mm->free_area_cache = current->mm->mmap_base; current->mm->cached_hole_size = 0; - compute_creds(bprm); + install_exec_creds(bprm); current->flags &= ~PF_FORKNOEXEC; #ifdef __sparc__ if (N_MAGIC(ex) == NMAGIC) { diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 9142ff5dc8e6..f458c1217c5e 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -956,7 +956,7 @@ static int 
load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs) } #endif /* ARCH_HAS_SETUP_ADDITIONAL_PAGES */ - compute_creds(bprm); + install_exec_creds(bprm); current->flags &= ~PF_FORKNOEXEC; retval = create_elf_tables(bprm, &loc->elf_ex, load_addr, interp_load_addr); diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c index 45dabd59936f..aa5b43205e37 100644 --- a/fs/binfmt_elf_fdpic.c +++ b/fs/binfmt_elf_fdpic.c @@ -404,7 +404,7 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm, current->mm->start_stack = current->mm->start_brk + stack_size; #endif - compute_creds(bprm); + install_exec_creds(bprm); current->flags &= ~PF_FORKNOEXEC; if (create_elf_fdpic_tables(bprm, current->mm, &exec_params, &interp_params) < 0) diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c index ccb781a6a804..7bbd5c6b3725 100644 --- a/fs/binfmt_flat.c +++ b/fs/binfmt_flat.c @@ -880,7 +880,7 @@ static int load_flat_binary(struct linux_binprm * bprm, struct pt_regs * regs) (libinfo.lib_list[j].loaded)? 
libinfo.lib_list[j].start_data:UNLOADED_LIB; - compute_creds(bprm); + install_exec_creds(bprm); current->flags &= ~PF_FORKNOEXEC; set_binfmt(&flat_format); diff --git a/fs/binfmt_som.c b/fs/binfmt_som.c index 74e587a52796..08644a61616e 100644 --- a/fs/binfmt_som.c +++ b/fs/binfmt_som.c @@ -255,7 +255,7 @@ load_som_binary(struct linux_binprm * bprm, struct pt_regs * regs) kfree(hpuxhdr); set_binfmt(&som_format); - compute_creds(bprm); + install_exec_creds(bprm); setup_arg_pages(bprm, STACK_TOP, EXSTACK_DEFAULT); create_som_tables(bprm); diff --git a/fs/compat.c b/fs/compat.c index e5f49f538502..d1ece79b6411 100644 --- a/fs/compat.c +++ b/fs/compat.c @@ -1393,10 +1393,20 @@ int compat_do_execve(char * filename, if (!bprm) goto out_ret; + retval = mutex_lock_interruptible(&current->cred_exec_mutex); + if (retval < 0) + goto out_free; + + retval = -ENOMEM; + bprm->cred = prepare_exec_creds(); + if (!bprm->cred) + goto out_unlock; + check_unsafe_exec(bprm); + file = open_exec(filename); retval = PTR_ERR(file); if (IS_ERR(file)) - goto out_kfree; + goto out_unlock; sched_exec(); @@ -1410,14 +1420,10 @@ int compat_do_execve(char * filename, bprm->argc = compat_count(argv, MAX_ARG_STRINGS); if ((retval = bprm->argc) < 0) - goto out_mm; + goto out; bprm->envc = compat_count(envp, MAX_ARG_STRINGS); if ((retval = bprm->envc) < 0) - goto out_mm; - - retval = security_bprm_alloc(bprm); - if (retval) goto out; retval = prepare_binprm(bprm); @@ -1438,19 +1444,16 @@ int compat_do_execve(char * filename, goto out; retval = search_binary_handler(bprm, regs); - if (retval >= 0) { - /* execve success */ - security_bprm_free(bprm); - acct_update_integrals(current); - free_bprm(bprm); - return retval; - } + if (retval < 0) + goto out; -out: - if (bprm->security) - security_bprm_free(bprm); + /* execve succeeded */ + mutex_unlock(&current->cred_exec_mutex); + acct_update_integrals(current); + free_bprm(bprm); + return retval; -out_mm: +out: if (bprm->mm) mmput(bprm->mm); @@ -1460,7 +1463,10 @@
out_file: fput(bprm->file); } -out_kfree: +out_unlock: + mutex_unlock(&current->cred_exec_mutex); + +out_free: free_bprm(bprm); out_ret: diff --git a/fs/exec.c b/fs/exec.c index 9bd3559ddece..32f13e299417 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -55,6 +55,7 @@ #include #include #include +#include "internal.h" #ifdef __alpha__ /* for /sbin/loader handling in search_binary_handler() */ @@ -1007,15 +1008,17 @@ int flush_old_exec(struct linux_binprm * bprm) */ current->mm->task_size = TASK_SIZE; - if (bprm->e_uid != current_euid() || - bprm->e_gid != current_egid()) { - set_dumpable(current->mm, suid_dumpable); + /* install the new credentials */ + if (bprm->cred->uid != current_euid() || + bprm->cred->gid != current_egid()) { current->pdeath_signal = 0; } else if (file_permission(bprm->file, MAY_READ) || - (bprm->interp_flags & BINPRM_FLAGS_ENFORCE_NONDUMP)) { + bprm->interp_flags & BINPRM_FLAGS_ENFORCE_NONDUMP) { set_dumpable(current->mm, suid_dumpable); } + current->personality &= ~bprm->per_clear; + /* An exec changes our domain. 
We are no longer part of the thread group */ @@ -1032,13 +1035,50 @@ out: EXPORT_SYMBOL(flush_old_exec); +/* + * install the new credentials for this executable + */ +void install_exec_creds(struct linux_binprm *bprm) +{ + security_bprm_committing_creds(bprm); + + commit_creds(bprm->cred); + bprm->cred = NULL; + + /* cred_exec_mutex must be held at least to this point to prevent + * ptrace_attach() from altering our determination of the task's + * credentials; any time after this it may be unlocked */ + + security_bprm_committed_creds(bprm); +} +EXPORT_SYMBOL(install_exec_creds); + +/* + * determine how safe it is to execute the proposed program + * - the caller must hold current->cred_exec_mutex to protect against + * PTRACE_ATTACH + */ +void check_unsafe_exec(struct linux_binprm *bprm) +{ + struct task_struct *p = current; + + bprm->unsafe = tracehook_unsafe_exec(p); + + if (atomic_read(&p->fs->count) > 1 || + atomic_read(&p->files->count) > 1 || + atomic_read(&p->sighand->count) > 1) + bprm->unsafe |= LSM_UNSAFE_SHARE; +} + /* * Fill the binprm structure from the inode. * Check permissions, then read the first 128 (BINPRM_BUF_SIZE) bytes + * + * This may be called multiple times for binary chains (scripts for example). */ int prepare_binprm(struct linux_binprm *bprm) { - int mode; + umode_t mode; struct inode * inode = bprm->file->f_path.dentry->d_inode; int retval; @@ -1046,14 +1086,15 @@ int prepare_binprm(struct linux_binprm *bprm) if (bprm->file->f_op == NULL) return -EACCES; - bprm->e_uid = current_euid(); - bprm->e_gid = current_egid(); + /* clear any previous set[ug]id data from a previous binary */ + bprm->cred->euid = current_euid(); + bprm->cred->egid = current_egid(); - if(!(bprm->file->f_path.mnt->mnt_flags & MNT_NOSUID)) { + if (!(bprm->file->f_path.mnt->mnt_flags & MNT_NOSUID)) { /* Set-uid? 
*/ if (mode & S_ISUID) { - current->personality &= ~PER_CLEAR_ON_SETID; - bprm->e_uid = inode->i_uid; + bprm->per_clear |= PER_CLEAR_ON_SETID; + bprm->cred->euid = inode->i_uid; } /* Set-gid? */ @@ -1063,50 +1104,23 @@ int prepare_binprm(struct linux_binprm *bprm) * executable. */ if ((mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP)) { - current->personality &= ~PER_CLEAR_ON_SETID; - bprm->e_gid = inode->i_gid; + bprm->per_clear |= PER_CLEAR_ON_SETID; + bprm->cred->egid = inode->i_gid; } } /* fill in binprm security blob */ - retval = security_bprm_set(bprm); + retval = security_bprm_set_creds(bprm); if (retval) return retval; + bprm->cred_prepared = 1; - memset(bprm->buf,0,BINPRM_BUF_SIZE); - return kernel_read(bprm->file,0,bprm->buf,BINPRM_BUF_SIZE); + memset(bprm->buf, 0, BINPRM_BUF_SIZE); + return kernel_read(bprm->file, 0, bprm->buf, BINPRM_BUF_SIZE); } EXPORT_SYMBOL(prepare_binprm); -static int unsafe_exec(struct task_struct *p) -{ - int unsafe = tracehook_unsafe_exec(p); - - if (atomic_read(&p->fs->count) > 1 || - atomic_read(&p->files->count) > 1 || - atomic_read(&p->sighand->count) > 1) - unsafe |= LSM_UNSAFE_SHARE; - - return unsafe; -} - -void compute_creds(struct linux_binprm *bprm) -{ - int unsafe; - - if (bprm->e_uid != current_uid()) - current->pdeath_signal = 0; - exec_keys(current); - - task_lock(current); - unsafe = unsafe_exec(current); - security_bprm_apply_creds(bprm, unsafe); - task_unlock(current); - security_bprm_post_apply_creds(bprm); -} -EXPORT_SYMBOL(compute_creds); - /* * Arguments are '\0' separated strings found at the location bprm->p * points to; chop off the first by relocating brpm->p to right after @@ -1259,6 +1273,8 @@ EXPORT_SYMBOL(search_binary_handler); void free_bprm(struct linux_binprm *bprm) { free_arg_pages(bprm); + if (bprm->cred) + abort_creds(bprm->cred); kfree(bprm); } @@ -1284,10 +1300,20 @@ int do_execve(char * filename, if (!bprm) goto out_files; + retval = mutex_lock_interruptible(&current->cred_exec_mutex); + if 
(retval < 0) + goto out_free; + + retval = -ENOMEM; + bprm->cred = prepare_exec_creds(); + if (!bprm->cred) + goto out_unlock; + check_unsafe_exec(bprm); + file = open_exec(filename); retval = PTR_ERR(file); if (IS_ERR(file)) - goto out_kfree; + goto out_unlock; sched_exec(); @@ -1301,14 +1327,10 @@ int do_execve(char * filename, bprm->argc = count(argv, MAX_ARG_STRINGS); if ((retval = bprm->argc) < 0) - goto out_mm; + goto out; bprm->envc = count(envp, MAX_ARG_STRINGS); if ((retval = bprm->envc) < 0) - goto out_mm; - - retval = security_bprm_alloc(bprm); - if (retval) goto out; retval = prepare_binprm(bprm); @@ -1330,21 +1352,18 @@ int do_execve(char * filename, current->flags &= ~PF_KTHREAD; retval = search_binary_handler(bprm,regs); - if (retval >= 0) { - /* execve success */ - security_bprm_free(bprm); - acct_update_integrals(current); - free_bprm(bprm); - if (displaced) - put_files_struct(displaced); - return retval; - } + if (retval < 0) + goto out; -out: - if (bprm->security) - security_bprm_free(bprm); + /* execve succeeded */ + mutex_unlock(&current->cred_exec_mutex); + acct_update_integrals(current); + free_bprm(bprm); + if (displaced) + put_files_struct(displaced); + return retval; -out_mm: +out: if (bprm->mm) mmput (bprm->mm); @@ -1353,7 +1372,11 @@ out_file: allow_write_access(bprm->file); fput(bprm->file); } -out_kfree: + +out_unlock: + mutex_unlock(&current->cred_exec_mutex); + +out_free: free_bprm(bprm); out_files: diff --git a/fs/internal.h b/fs/internal.h index 80aa9a023372..53af885f1732 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -10,6 +10,7 @@ */ struct super_block; +struct linux_binprm; /* * block_dev.c @@ -39,6 +40,11 @@ static inline int sb_is_blkdev_sb(struct super_block *sb) */ extern void __init chrdev_init(void); +/* + * exec.c + */ +extern void check_unsafe_exec(struct linux_binprm *); + /* * namespace.c */ diff --git a/include/linux/audit.h b/include/linux/audit.h index 0b2fcb698a63..e8ce2c4c7ac7 100644 --- a/include/linux/audit.h +++ 
b/include/linux/audit.h @@ -508,22 +508,6 @@ static inline int audit_mq_getsetattr(mqd_t mqdes, struct mq_attr *mqstat) return 0; } -/* - * ieieeeeee, an audit function without a return code! - * - * This function might fail! I decided that it didn't matter. We are too late - * to fail the syscall and the information isn't REQUIRED for any purpose. It's - * just nice to have. We should be able to look at past audit logs to figure - * out this process's current cap set along with the fcaps from the PATH record - * and use that to come up with the final set. Yeah, its ugly, but all the info - * is still in the audit log. So I'm not going to bother mentioning we failed - * if we couldn't allocate memory. - * - * If someone changes their mind they could create the aux record earlier and - * then search here and use that earlier allocation. But I don't wanna. - * - * -Eric - */ static inline int audit_log_bprm_fcaps(struct linux_binprm *bprm, const struct cred *new, const struct cred *old) diff --git a/include/linux/binfmts.h b/include/linux/binfmts.h index 7394b5b349ff..6cbfbe297180 100644 --- a/include/linux/binfmts.h +++ b/include/linux/binfmts.h @@ -35,16 +35,20 @@ struct linux_binprm{ struct mm_struct *mm; unsigned long p; /* current top of mem */ unsigned int sh_bang:1, - misc_bang:1; + misc_bang:1, + cred_prepared:1,/* true if creds already prepared (multiple + * preps happen for interpreters) */ + cap_effective:1;/* true if has elevated effective capabilities, + * false if not; except for init which inherits + * its parent's caps anyway */ #ifdef __alpha__ unsigned int taso:1; #endif unsigned int recursion_depth; struct file * file; - int e_uid, e_gid; - kernel_cap_t cap_post_exec_permitted; - bool cap_effective; - void *security; + struct cred *cred; /* new credentials */ + int unsafe; /* how unsafe this exec is (mask of LSM_UNSAFE_*) */ + unsigned int per_clear; /* bits to clear in current->personality */ int argc, envc; char * filename; /* Name of binary as 
seen by procps */ char * interp; /* Name of the binary really executed. Most @@ -101,7 +105,7 @@ extern int setup_arg_pages(struct linux_binprm * bprm, int executable_stack); extern int bprm_mm_init(struct linux_binprm *bprm); extern int copy_strings_kernel(int argc,char ** argv,struct linux_binprm *bprm); -extern void compute_creds(struct linux_binprm *binprm); +extern void install_exec_creds(struct linux_binprm *bprm); extern int do_coredump(long signr, int exit_code, struct pt_regs * regs); extern int set_binfmt(struct linux_binfmt *new); extern void free_bprm(struct linux_binprm *); diff --git a/include/linux/cred.h b/include/linux/cred.h index eaf6fa695a04..8edb4d1d5427 100644 --- a/include/linux/cred.h +++ b/include/linux/cred.h @@ -84,8 +84,6 @@ struct thread_group_cred { struct key *process_keyring; /* keyring private to this process */ struct rcu_head rcu; /* RCU deletion hook */ }; - -extern void release_tgcred(struct cred *cred); #endif /* @@ -144,6 +142,7 @@ struct cred { extern void __put_cred(struct cred *); extern int copy_creds(struct task_struct *, unsigned long); extern struct cred *prepare_creds(void); +extern struct cred *prepare_exec_creds(void); extern struct cred *prepare_usermodehelper_creds(void); extern int commit_creds(struct cred *); extern void abort_creds(struct cred *); diff --git a/include/linux/key.h b/include/linux/key.h index 69ecf0934b02..21d32a142c00 100644 --- a/include/linux/key.h +++ b/include/linux/key.h @@ -278,7 +278,6 @@ extern ctl_table key_sysctls[]; * the userspace interface */ extern int install_thread_keyring_to_cred(struct cred *cred); -extern int exec_keys(struct task_struct *tsk); extern void key_fsuid_changed(struct task_struct *tsk); extern void key_fsgid_changed(struct task_struct *tsk); extern void key_init(void); @@ -294,7 +293,6 @@ extern void key_init(void); #define make_key_ref(k, p) NULL #define key_ref_to_ptr(k) NULL #define is_key_possessed(k) 0 -#define exec_keys(t) do { } while(0) #define 
key_fsuid_changed(t) do { } while(0) #define key_fsgid_changed(t) do { } while(0) #define key_init() do { } while(0) diff --git a/include/linux/security.h b/include/linux/security.h index 68be11251447..56a0eed65673 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -57,8 +57,7 @@ extern int cap_capset(struct cred *new, const struct cred *old, const kernel_cap_t *effective, const kernel_cap_t *inheritable, const kernel_cap_t *permitted); -extern int cap_bprm_set_security(struct linux_binprm *bprm); -extern int cap_bprm_apply_creds(struct linux_binprm *bprm, int unsafe); +extern int cap_bprm_set_creds(struct linux_binprm *bprm); extern int cap_bprm_secureexec(struct linux_binprm *bprm); extern int cap_inode_setxattr(struct dentry *dentry, const char *name, const void *value, size_t size, int flags); @@ -110,7 +109,7 @@ extern unsigned long mmap_min_addr; struct sched_param; struct request_sock; -/* bprm_apply_creds unsafe reasons */ +/* bprm->unsafe reasons */ #define LSM_UNSAFE_SHARE 1 #define LSM_UNSAFE_PTRACE 2 #define LSM_UNSAFE_PTRACE_CAP 4 @@ -154,36 +153,7 @@ static inline void security_free_mnt_opts(struct security_mnt_opts *opts) * * Security hooks for program execution operations. * - * @bprm_alloc_security: - * Allocate and attach a security structure to the @bprm->security field. - * The security field is initialized to NULL when the bprm structure is - * allocated. - * @bprm contains the linux_binprm structure to be modified. - * Return 0 if operation was successful. - * @bprm_free_security: - * @bprm contains the linux_binprm structure to be modified. - * Deallocate and clear the @bprm->security field. - * @bprm_apply_creds: - * Compute and set the security attributes of a process being transformed - * by an execve operation based on the old attributes (current->security) - * and the information saved in @bprm->security by the set_security hook. 
- * Since this function may return an error, in which case the process will - * be killed. However, it can leave the security attributes of the - * process unchanged if an access failure occurs at this point. - * bprm_apply_creds is called under task_lock. @unsafe indicates various - * reasons why it may be unsafe to change security state. - * @bprm contains the linux_binprm structure. - * @bprm_post_apply_creds: - * Runs after bprm_apply_creds with the task_lock dropped, so that - * functions which cannot be called safely under the task_lock can - * be used. This hook is a good place to perform state changes on - * the process such as closing open file descriptors to which access - * is no longer granted if the attributes were changed. - * Note that a security module might need to save state between - * bprm_apply_creds and bprm_post_apply_creds to store the decision - * on whether the process may proceed. - * @bprm contains the linux_binprm structure. - * @bprm_set_security: + * @bprm_set_creds: * Save security information in the bprm->security field, typically based * on information about the bprm->file, for later use by the apply_creds * hook. This hook may also optionally check permissions (e.g. for @@ -196,15 +166,30 @@ static inline void security_free_mnt_opts(struct security_mnt_opts *opts) * @bprm contains the linux_binprm structure. * Return 0 if the hook is successful and permission is granted. * @bprm_check_security: - * This hook mediates the point when a search for a binary handler will - * begin. It allows a check the @bprm->security value which is set in - * the preceding set_security call. The primary difference from - * set_security is that the argv list and envp list are reliably - * available in @bprm. This hook may be called multiple times - * during a single execve; and in each pass set_security is called - * first. + * This hook mediates the point when a search for a binary handler will + * begin. 
It allows a check the @bprm->security value which is set in the + * preceding set_creds call. The primary difference from set_creds is + * that the argv list and envp list are reliably available in @bprm. This + * hook may be called multiple times during a single execve; and in each + * pass set_creds is called first. * @bprm contains the linux_binprm structure. * Return 0 if the hook is successful and permission is granted. + * @bprm_committing_creds: + * Prepare to install the new security attributes of a process being + * transformed by an execve operation, based on the old credentials + * pointed to by @current->cred and the information set in @bprm->cred by + * the bprm_set_creds hook. @bprm points to the linux_binprm structure. + * This hook is a good place to perform state changes on the process such + * as closing open file descriptors to which access will no longer be + * granted when the attributes are changed. This is called immediately + * before commit_creds(). + * @bprm_committed_creds: + * Tidy up after the installation of the new security attributes of a + * process being transformed by an execve operation. The new credentials + * have, by this point, been set to @current->cred. @bprm points to the + * linux_binprm structure. This hook is a good place to perform state + * changes on the process such as clearing out non-inheritable signal + * state. This is called immediately after commit_creds(). * @bprm_secureexec: * Return a boolean value (0 or 1) indicating whether a "secure exec" * is required. 
The flag is passed in the auxiliary table @@ -1301,13 +1286,11 @@ struct security_operations { int (*settime) (struct timespec *ts, struct timezone *tz); int (*vm_enough_memory) (struct mm_struct *mm, long pages); - int (*bprm_alloc_security) (struct linux_binprm *bprm); - void (*bprm_free_security) (struct linux_binprm *bprm); - int (*bprm_apply_creds) (struct linux_binprm *bprm, int unsafe); - void (*bprm_post_apply_creds) (struct linux_binprm *bprm); - int (*bprm_set_security) (struct linux_binprm *bprm); + int (*bprm_set_creds) (struct linux_binprm *bprm); int (*bprm_check_security) (struct linux_binprm *bprm); int (*bprm_secureexec) (struct linux_binprm *bprm); + void (*bprm_committing_creds) (struct linux_binprm *bprm); + void (*bprm_committed_creds) (struct linux_binprm *bprm); int (*sb_alloc_security) (struct super_block *sb); void (*sb_free_security) (struct super_block *sb); @@ -1569,12 +1552,10 @@ int security_settime(struct timespec *ts, struct timezone *tz); int security_vm_enough_memory(long pages); int security_vm_enough_memory_mm(struct mm_struct *mm, long pages); int security_vm_enough_memory_kern(long pages); -int security_bprm_alloc(struct linux_binprm *bprm); -void security_bprm_free(struct linux_binprm *bprm); -int security_bprm_apply_creds(struct linux_binprm *bprm, int unsafe); -void security_bprm_post_apply_creds(struct linux_binprm *bprm); -int security_bprm_set(struct linux_binprm *bprm); +int security_bprm_set_creds(struct linux_binprm *bprm); int security_bprm_check(struct linux_binprm *bprm); +void security_bprm_committing_creds(struct linux_binprm *bprm); +void security_bprm_committed_creds(struct linux_binprm *bprm); int security_bprm_secureexec(struct linux_binprm *bprm); int security_sb_alloc(struct super_block *sb); void security_sb_free(struct super_block *sb); @@ -1812,32 +1793,22 @@ static inline int security_vm_enough_memory_mm(struct mm_struct *mm, long pages) return cap_vm_enough_memory(mm, pages); } -static inline int 
security_bprm_alloc(struct linux_binprm *bprm) -{ - return 0; -} - -static inline void security_bprm_free(struct linux_binprm *bprm) -{ } - -static inline int security_bprm_apply_creds(struct linux_binprm *bprm, int unsafe) +static inline int security_bprm_set_creds(struct linux_binprm *bprm) { - return cap_bprm_apply_creds(bprm, unsafe); + return cap_bprm_set_creds(bprm); } -static inline void security_bprm_post_apply_creds(struct linux_binprm *bprm) +static inline int security_bprm_check(struct linux_binprm *bprm) { - return; + return 0; } -static inline int security_bprm_set(struct linux_binprm *bprm) +static inline void security_bprm_committing_creds(struct linux_binprm *bprm) { - return cap_bprm_set_security(bprm); } -static inline int security_bprm_check(struct linux_binprm *bprm) +static inline void security_bprm_committed_creds(struct linux_binprm *bprm) { - return 0; } static inline int security_bprm_secureexec(struct linux_binprm *bprm) diff --git a/kernel/cred.c b/kernel/cred.c index cb6b5eda978d..e6fcdd67b2ec 100644 --- a/kernel/cred.c +++ b/kernel/cred.c @@ -68,7 +68,7 @@ static void release_tgcred_rcu(struct rcu_head *rcu) /* * Release a set of thread group credentials. 
*/ -void release_tgcred(struct cred *cred) +static void release_tgcred(struct cred *cred) { #ifdef CONFIG_KEYS struct thread_group_cred *tgcred = cred->tgcred; @@ -163,6 +163,50 @@ error: } EXPORT_SYMBOL(prepare_creds); +/* + * Prepare credentials for current to perform an execve() + * - The caller must hold current->cred_exec_mutex + */ +struct cred *prepare_exec_creds(void) +{ + struct thread_group_cred *tgcred = NULL; + struct cred *new; + +#ifdef CONFIG_KEYS + tgcred = kmalloc(sizeof(*tgcred), GFP_KERNEL); + if (!tgcred) + return NULL; +#endif + + new = prepare_creds(); + if (!new) { + kfree(tgcred); + return new; + } + +#ifdef CONFIG_KEYS + /* newly exec'd tasks don't get a thread keyring */ + key_put(new->thread_keyring); + new->thread_keyring = NULL; + + /* create a new per-thread-group creds for all this set of threads to + * share */ + memcpy(tgcred, new->tgcred, sizeof(struct thread_group_cred)); + + atomic_set(&tgcred->usage, 1); + spin_lock_init(&tgcred->lock); + + /* inherit the session keyring; new process keyring */ + key_get(tgcred->session_keyring); + tgcred->process_keyring = NULL; + + release_tgcred(new); + new->tgcred = tgcred; +#endif + + return new; +} + /* * prepare new credentials for the usermode helper dispatcher */ diff --git a/security/capability.c b/security/capability.c index efeb6d9e0e6a..185804f99ad1 100644 --- a/security/capability.c +++ b/security/capability.c @@ -32,24 +32,19 @@ static int cap_quota_on(struct dentry *dentry) return 0; } -static int cap_bprm_alloc_security(struct linux_binprm *bprm) +static int cap_bprm_check_security (struct linux_binprm *bprm) { return 0; } -static void cap_bprm_free_security(struct linux_binprm *bprm) +static void cap_bprm_committing_creds(struct linux_binprm *bprm) { } -static void cap_bprm_post_apply_creds(struct linux_binprm *bprm) +static void cap_bprm_committed_creds(struct linux_binprm *bprm) { } -static int cap_bprm_check_security(struct linux_binprm *bprm) -{ - return 0; -} - static int 
cap_sb_alloc_security(struct super_block *sb) { return 0; @@ -827,11 +822,9 @@ void security_fixup_ops(struct security_operations *ops) set_to_cap_if_null(ops, syslog); set_to_cap_if_null(ops, settime); set_to_cap_if_null(ops, vm_enough_memory); - set_to_cap_if_null(ops, bprm_alloc_security); - set_to_cap_if_null(ops, bprm_free_security); - set_to_cap_if_null(ops, bprm_apply_creds); - set_to_cap_if_null(ops, bprm_post_apply_creds); - set_to_cap_if_null(ops, bprm_set_security); + set_to_cap_if_null(ops, bprm_set_creds); + set_to_cap_if_null(ops, bprm_committing_creds); + set_to_cap_if_null(ops, bprm_committed_creds); set_to_cap_if_null(ops, bprm_check_security); set_to_cap_if_null(ops, bprm_secureexec); set_to_cap_if_null(ops, sb_alloc_security); diff --git a/security/commoncap.c b/security/commoncap.c index b5419273f92d..51dfa11e8e56 100644 --- a/security/commoncap.c +++ b/security/commoncap.c @@ -167,7 +167,7 @@ int cap_capset(struct cred *new, static inline void bprm_clear_caps(struct linux_binprm *bprm) { - cap_clear(bprm->cap_post_exec_permitted); + cap_clear(bprm->cred->cap_permitted); bprm->cap_effective = false; } @@ -198,15 +198,15 @@ int cap_inode_killpriv(struct dentry *dentry) } static inline int bprm_caps_from_vfs_caps(struct cpu_vfs_cap_data *caps, - struct linux_binprm *bprm) + struct linux_binprm *bprm, + bool *effective) { + struct cred *new = bprm->cred; unsigned i; int ret = 0; if (caps->magic_etc & VFS_CAP_FLAGS_EFFECTIVE) - bprm->cap_effective = true; - else - bprm->cap_effective = false; + *effective = true; CAP_FOR_EACH_U32(i) { __u32 permitted = caps->permitted.cap[i]; @@ -215,16 +215,13 @@ static inline int bprm_caps_from_vfs_caps(struct cpu_vfs_cap_data *caps, /* * pP' = (X & fP) | (pI & fI) */ - bprm->cap_post_exec_permitted.cap[i] = - (current->cred->cap_bset.cap[i] & permitted) | - (current->cred->cap_inheritable.cap[i] & inheritable); + new->cap_permitted.cap[i] = + (new->cap_bset.cap[i] & permitted) | + (new->cap_inheritable.cap[i] & 
inheritable); - if (permitted & ~bprm->cap_post_exec_permitted.cap[i]) { - /* - * insufficient to execute correctly - */ + if (permitted & ~new->cap_permitted.cap[i]) + /* insufficient to execute correctly */ ret = -EPERM; - } } /* @@ -232,7 +229,7 @@ static inline int bprm_caps_from_vfs_caps(struct cpu_vfs_cap_data *caps, * do not have enough capabilities, we return an error if they are * missing some "forced" (aka file-permitted) capabilities. */ - return bprm->cap_effective ? ret : 0; + return *effective ? ret : 0; } int get_vfs_caps_from_disk(const struct dentry *dentry, struct cpu_vfs_cap_data *cpu_caps) @@ -250,10 +247,9 @@ int get_vfs_caps_from_disk(const struct dentry *dentry, struct cpu_vfs_cap_data size = inode->i_op->getxattr((struct dentry *)dentry, XATTR_NAME_CAPS, &caps, XATTR_CAPS_SZ); - if (size == -ENODATA || size == -EOPNOTSUPP) { + if (size == -ENODATA || size == -EOPNOTSUPP) /* no data, that's ok */ return -ENODATA; - } if (size < 0) return size; @@ -262,7 +258,7 @@ int get_vfs_caps_from_disk(const struct dentry *dentry, struct cpu_vfs_cap_data cpu_caps->magic_etc = magic_etc = le32_to_cpu(caps.magic_etc); - switch ((magic_etc & VFS_CAP_REVISION_MASK)) { + switch (magic_etc & VFS_CAP_REVISION_MASK) { case VFS_CAP_REVISION_1: if (size != XATTR_CAPS_SZ_1) return -EINVAL; @@ -283,11 +279,12 @@ int get_vfs_caps_from_disk(const struct dentry *dentry, struct cpu_vfs_cap_data cpu_caps->permitted.cap[i] = le32_to_cpu(caps.data[i].permitted); cpu_caps->inheritable.cap[i] = le32_to_cpu(caps.data[i].inheritable); } + return 0; } /* Locate any VFS capabilities: */ -static int get_file_caps(struct linux_binprm *bprm) +static int get_file_caps(struct linux_binprm *bprm, bool *effective) { struct dentry *dentry; int rc = 0; @@ -313,7 +310,10 @@ static int get_file_caps(struct linux_binprm *bprm) goto out; } - rc = bprm_caps_from_vfs_caps(&vcaps, bprm); + rc = bprm_caps_from_vfs_caps(&vcaps, bprm, effective); + if (rc == -EINVAL) + printk(KERN_NOTICE "%s: 
cap_from_disk returned %d for %s\n", + __func__, rc, bprm->filename); out: dput(dentry); @@ -334,18 +334,27 @@ int cap_inode_killpriv(struct dentry *dentry) return 0; } -static inline int get_file_caps(struct linux_binprm *bprm) +static inline int get_file_caps(struct linux_binprm *bprm, bool *effective) { bprm_clear_caps(bprm); return 0; } #endif -int cap_bprm_set_security (struct linux_binprm *bprm) +/* + * set up the new credentials for an exec'd task + */ +int cap_bprm_set_creds(struct linux_binprm *bprm) { + const struct cred *old = current_cred(); + struct cred *new = bprm->cred; + bool effective; int ret; - ret = get_file_caps(bprm); + effective = false; + ret = get_file_caps(bprm, &effective); + if (ret < 0) + return ret; if (!issecure(SECURE_NOROOT)) { /* @@ -353,63 +362,47 @@ int cap_bprm_set_security (struct linux_binprm *bprm) * executables under compatibility mode, we override the * capability sets for the file. * - * If only the real uid is 0, we do not set the effective - * bit. + * If only the real uid is 0, we do not set the effective bit. 
*/ - if (bprm->e_uid == 0 || current_uid() == 0) { + if (new->euid == 0 || new->uid == 0) { /* pP' = (cap_bset & ~0) | (pI & ~0) */ - bprm->cap_post_exec_permitted = cap_combine( - current->cred->cap_bset, - current->cred->cap_inheritable); - bprm->cap_effective = (bprm->e_uid == 0); - ret = 0; + new->cap_permitted = cap_combine(old->cap_bset, + old->cap_inheritable); } + if (new->euid == 0) + effective = true; } - return ret; -} - -int cap_bprm_apply_creds (struct linux_binprm *bprm, int unsafe) -{ - const struct cred *old = current_cred(); - struct cred *new; - - new = prepare_creds(); - if (!new) - return -ENOMEM; - - if (bprm->e_uid != old->uid || bprm->e_gid != old->gid || - !cap_issubset(bprm->cap_post_exec_permitted, - old->cap_permitted)) { - set_dumpable(current->mm, suid_dumpable); - current->pdeath_signal = 0; - - if (unsafe & ~LSM_UNSAFE_PTRACE_CAP) { - if (!capable(CAP_SETUID)) { - bprm->e_uid = old->uid; - bprm->e_gid = old->gid; - } - if (cap_limit_ptraced_target()) { - bprm->cap_post_exec_permitted = cap_intersect( - bprm->cap_post_exec_permitted, - new->cap_permitted); - } + /* Don't let someone trace a set[ug]id/setpcap binary with the revised + * credentials unless they have the appropriate permit + */ + if ((new->euid != old->uid || + new->egid != old->gid || + !cap_issubset(new->cap_permitted, old->cap_permitted)) && + bprm->unsafe & ~LSM_UNSAFE_PTRACE_CAP) { + /* downgrade; they get no more than they had, and maybe less */ + if (!capable(CAP_SETUID)) { + new->euid = new->uid; + new->egid = new->gid; } + if (cap_limit_ptraced_target()) + new->cap_permitted = cap_intersect(new->cap_permitted, + old->cap_permitted); } - new->suid = new->euid = new->fsuid = bprm->e_uid; - new->sgid = new->egid = new->fsgid = bprm->e_gid; + new->suid = new->fsuid = new->euid; + new->sgid = new->fsgid = new->egid; - /* For init, we want to retain the capabilities set - * in the init_task struct. 
Thus we skip the usual - * capability rules */ + /* For init, we want to retain the capabilities set in the initial + * task. Thus we skip the usual capability rules + */ if (!is_global_init(current)) { - new->cap_permitted = bprm->cap_post_exec_permitted; - if (bprm->cap_effective) - new->cap_effective = bprm->cap_post_exec_permitted; + if (effective) + new->cap_effective = new->cap_permitted; else cap_clear(new->cap_effective); } + bprm->cap_effective = effective; /* * Audit candidate if current->cap_effective is set @@ -425,23 +418,31 @@ int cap_bprm_apply_creds (struct linux_binprm *bprm, int unsafe) */ if (!cap_isclear(new->cap_effective)) { if (!cap_issubset(CAP_FULL_SET, new->cap_effective) || - bprm->e_uid != 0 || new->uid != 0 || - issecure(SECURE_NOROOT)) - audit_log_bprm_fcaps(bprm, new, old); + new->euid != 0 || new->uid != 0 || + issecure(SECURE_NOROOT)) { + ret = audit_log_bprm_fcaps(bprm, new, old); + if (ret < 0) + return ret; + } } new->securebits &= ~issecure_mask(SECURE_KEEP_CAPS); - return commit_creds(new); + return 0; } -int cap_bprm_secureexec (struct linux_binprm *bprm) +/* + * determine whether a secure execution is required + * - the creds have been committed at this point, and are no longer available + * through bprm + */ +int cap_bprm_secureexec(struct linux_binprm *bprm) { const struct cred *cred = current_cred(); if (cred->uid != 0) { if (bprm->cap_effective) return 1; - if (!cap_isclear(bprm->cap_post_exec_permitted)) + if (!cap_isclear(cred->cap_permitted)) return 1; } @@ -477,7 +478,7 @@ int cap_inode_removexattr(struct dentry *dentry, const char *name) } /* moved from kernel/sys.c. */ -/* +/* * cap_emulate_setxuid() fixes the effective / permitted capabilities of * a process after a call to setuid, setreuid, or setresuid. * @@ -491,10 +492,10 @@ int cap_inode_removexattr(struct dentry *dentry, const char *name) * 3) When set*uiding _from_ euid != 0 _to_ euid == 0, the effective * capabilities are set to the permitted capabilities. 
* - * fsuid is handled elsewhere. fsuid == 0 and {r,e,s}uid!= 0 should + * fsuid is handled elsewhere. fsuid == 0 and {r,e,s}uid!= 0 should * never happen. * - * -astor + * -astor * * cevans - New behaviour, Oct '99 * A process may, via prctl(), elect to keep its capabilities when it @@ -751,4 +752,3 @@ int cap_vm_enough_memory(struct mm_struct *mm, long pages) cap_sys_admin = 1; return __vm_enough_memory(mm, pages, cap_sys_admin); } - diff --git a/security/keys/process_keys.c b/security/keys/process_keys.c index df329f684a65..2f5d89e92b85 100644 --- a/security/keys/process_keys.c +++ b/security/keys/process_keys.c @@ -274,48 +274,6 @@ static int install_session_keyring(struct key *keyring) return commit_creds(new); } -/*****************************************************************************/ -/* - * deal with execve() - */ -int exec_keys(struct task_struct *tsk) -{ - struct thread_group_cred *tgcred = NULL; - struct cred *new; - -#ifdef CONFIG_KEYS - tgcred = kmalloc(sizeof(*tgcred), GFP_KERNEL); - if (!tgcred) - return -ENOMEM; -#endif - - new = prepare_creds(); - if (new < 0) - return -ENOMEM; - - /* newly exec'd tasks don't get a thread keyring */ - key_put(new->thread_keyring); - new->thread_keyring = NULL; - - /* create a new per-thread-group creds for all this set of threads to - * share */ - memcpy(tgcred, new->tgcred, sizeof(struct thread_group_cred)); - - atomic_set(&tgcred->usage, 1); - spin_lock_init(&tgcred->lock); - - /* inherit the session keyring; new process keyring */ - key_get(tgcred->session_keyring); - tgcred->process_keyring = NULL; - - release_tgcred(new); - new->tgcred = tgcred; - - commit_creds(new); - return 0; - -} /* end exec_keys() */ - /*****************************************************************************/ /* * the filesystem user ID changed diff --git a/security/root_plug.c b/security/root_plug.c index c3f68b5b372d..40fb4f15e27b 100644 --- a/security/root_plug.c +++ b/security/root_plug.c @@ -55,9 +55,9 @@ static int 
rootplug_bprm_check_security (struct linux_binprm *bprm) struct usb_device *dev; root_dbg("file %s, e_uid = %d, e_gid = %d\n", - bprm->filename, bprm->e_uid, bprm->e_gid); + bprm->filename, bprm->cred->euid, bprm->cred->egid); - if (bprm->e_gid == 0) { + if (bprm->cred->egid == 0) { dev = usb_find_device(vendor_id, product_id); if (!dev) { root_dbg("e_gid = 0, and device not found, " @@ -75,15 +75,12 @@ static struct security_operations rootplug_security_ops = { .ptrace_may_access = cap_ptrace_may_access, .ptrace_traceme = cap_ptrace_traceme, .capget = cap_capget, - .capset_check = cap_capset_check, - .capset_set = cap_capset_set, + .capset = cap_capset, .capable = cap_capable, - .bprm_apply_creds = cap_bprm_apply_creds, - .bprm_set_security = cap_bprm_set_security, + .bprm_set_creds = cap_bprm_set_creds, - .task_post_setuid = cap_task_post_setuid, - .task_reparent_to_init = cap_task_reparent_to_init, + .task_fix_setuid = cap_task_fix_setuid, .task_prctl = cap_task_prctl, .bprm_check_security = rootplug_bprm_check_security, diff --git a/security/security.c b/security/security.c index a55d739c6864..dc5babb2d6d8 100644 --- a/security/security.c +++ b/security/security.c @@ -213,34 +213,24 @@ int security_vm_enough_memory_kern(long pages) return security_ops->vm_enough_memory(current->mm, pages); } -int security_bprm_alloc(struct linux_binprm *bprm) +int security_bprm_set_creds(struct linux_binprm *bprm) { - return security_ops->bprm_alloc_security(bprm); + return security_ops->bprm_set_creds(bprm); } -void security_bprm_free(struct linux_binprm *bprm) -{ - security_ops->bprm_free_security(bprm); -} - -int security_bprm_apply_creds(struct linux_binprm *bprm, int unsafe) -{ - return security_ops->bprm_apply_creds(bprm, unsafe); -} - -void security_bprm_post_apply_creds(struct linux_binprm *bprm) +int security_bprm_check(struct linux_binprm *bprm) { - security_ops->bprm_post_apply_creds(bprm); + return security_ops->bprm_check_security(bprm); } -int 
security_bprm_set(struct linux_binprm *bprm) +void security_bprm_committing_creds(struct linux_binprm *bprm) { - return security_ops->bprm_set_security(bprm); + return security_ops->bprm_committing_creds(bprm); } -int security_bprm_check(struct linux_binprm *bprm) +void security_bprm_committed_creds(struct linux_binprm *bprm) { - return security_ops->bprm_check_security(bprm); + return security_ops->bprm_committed_creds(bprm); } int security_bprm_secureexec(struct linux_binprm *bprm) diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index c71bba78872f..21a592184633 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -2029,59 +2029,45 @@ static int selinux_vm_enough_memory(struct mm_struct *mm, long pages) /* binprm security operations */ -static int selinux_bprm_alloc_security(struct linux_binprm *bprm) +static int selinux_bprm_set_creds(struct linux_binprm *bprm) { - struct bprm_security_struct *bsec; - - bsec = kzalloc(sizeof(struct bprm_security_struct), GFP_KERNEL); - if (!bsec) - return -ENOMEM; - - bsec->sid = SECINITSID_UNLABELED; - bsec->set = 0; - - bprm->security = bsec; - return 0; -} - -static int selinux_bprm_set_security(struct linux_binprm *bprm) -{ - struct task_security_struct *tsec; - struct inode *inode = bprm->file->f_path.dentry->d_inode; + const struct task_security_struct *old_tsec; + struct task_security_struct *new_tsec; struct inode_security_struct *isec; - struct bprm_security_struct *bsec; - u32 newsid; struct avc_audit_data ad; + struct inode *inode = bprm->file->f_path.dentry->d_inode; int rc; - rc = secondary_ops->bprm_set_security(bprm); + rc = secondary_ops->bprm_set_creds(bprm); if (rc) return rc; - bsec = bprm->security; - - if (bsec->set) + /* SELinux context only depends on initial program or script and not + * the script interpreter */ + if (bprm->cred_prepared) return 0; - tsec = current_security(); + old_tsec = current_security(); + new_tsec = bprm->cred->security; isec = inode->i_security; 
/* Default to the current task SID. */ - bsec->sid = tsec->sid; + new_tsec->sid = old_tsec->sid; + new_tsec->osid = old_tsec->sid; /* Reset fs, key, and sock SIDs on execve. */ - tsec->create_sid = 0; - tsec->keycreate_sid = 0; - tsec->sockcreate_sid = 0; + new_tsec->create_sid = 0; + new_tsec->keycreate_sid = 0; + new_tsec->sockcreate_sid = 0; - if (tsec->exec_sid) { - newsid = tsec->exec_sid; + if (old_tsec->exec_sid) { + new_tsec->sid = old_tsec->exec_sid; /* Reset exec SID on execve. */ - tsec->exec_sid = 0; + new_tsec->exec_sid = 0; } else { /* Check for a default transition on this program. */ - rc = security_transition_sid(tsec->sid, isec->sid, - SECCLASS_PROCESS, &newsid); + rc = security_transition_sid(old_tsec->sid, isec->sid, + SECCLASS_PROCESS, &new_tsec->sid); if (rc) return rc; } @@ -2090,33 +2076,63 @@ static int selinux_bprm_set_security(struct linux_binprm *bprm) ad.u.fs.path = bprm->file->f_path; if (bprm->file->f_path.mnt->mnt_flags & MNT_NOSUID) - newsid = tsec->sid; + new_tsec->sid = old_tsec->sid; - if (tsec->sid == newsid) { - rc = avc_has_perm(tsec->sid, isec->sid, + if (new_tsec->sid == old_tsec->sid) { + rc = avc_has_perm(old_tsec->sid, isec->sid, SECCLASS_FILE, FILE__EXECUTE_NO_TRANS, &ad); if (rc) return rc; } else { /* Check permissions for the transition. 
*/ - rc = avc_has_perm(tsec->sid, newsid, + rc = avc_has_perm(old_tsec->sid, new_tsec->sid, SECCLASS_PROCESS, PROCESS__TRANSITION, &ad); if (rc) return rc; - rc = avc_has_perm(newsid, isec->sid, + rc = avc_has_perm(new_tsec->sid, isec->sid, SECCLASS_FILE, FILE__ENTRYPOINT, &ad); if (rc) return rc; - /* Clear any possibly unsafe personality bits on exec: */ - current->personality &= ~PER_CLEAR_ON_SETID; + /* Check for shared state */ + if (bprm->unsafe & LSM_UNSAFE_SHARE) { + rc = avc_has_perm(old_tsec->sid, new_tsec->sid, + SECCLASS_PROCESS, PROCESS__SHARE, + NULL); + if (rc) + return -EPERM; + } + + /* Make sure that anyone attempting to ptrace over a task that + * changes its SID has the appropriate permit */ + if (bprm->unsafe & + (LSM_UNSAFE_PTRACE | LSM_UNSAFE_PTRACE_CAP)) { + struct task_struct *tracer; + struct task_security_struct *sec; + u32 ptsid = 0; + + rcu_read_lock(); + tracer = tracehook_tracer_task(current); + if (likely(tracer != NULL)) { + sec = __task_cred(tracer)->security; + ptsid = sec->sid; + } + rcu_read_unlock(); + + if (ptsid != 0) { + rc = avc_has_perm(ptsid, new_tsec->sid, + SECCLASS_PROCESS, + PROCESS__PTRACE, NULL); + if (rc) + return -EPERM; + } + } - /* Set the security field to the new SID. */ - bsec->sid = newsid; + /* Clear any possibly unsafe personality bits on exec: */ + bprm->per_clear |= PER_CLEAR_ON_SETID; } - bsec->set = 1; return 0; } @@ -2125,7 +2141,6 @@ static int selinux_bprm_check_security(struct linux_binprm *bprm) return secondary_ops->bprm_check_security(bprm); } - static int selinux_bprm_secureexec(struct linux_binprm *bprm) { const struct cred *cred = current_cred(); @@ -2141,19 +2156,13 @@ static int selinux_bprm_secureexec(struct linux_binprm *bprm) the noatsecure permission is granted between the two SIDs, i.e. ahp returns 0. 
*/ atsecure = avc_has_perm(osid, sid, - SECCLASS_PROCESS, - PROCESS__NOATSECURE, NULL); + SECCLASS_PROCESS, + PROCESS__NOATSECURE, NULL); } return (atsecure || secondary_ops->bprm_secureexec(bprm)); } -static void selinux_bprm_free_security(struct linux_binprm *bprm) -{ - kfree(bprm->security); - bprm->security = NULL; -} - extern struct vfsmount *selinuxfs_mount; extern struct dentry *selinux_null; @@ -2252,108 +2261,78 @@ static inline void flush_unauthorized_files(const struct cred *cred, spin_unlock(&files->file_lock); } -static int selinux_bprm_apply_creds(struct linux_binprm *bprm, int unsafe) +/* + * Prepare a process for imminent new credential changes due to exec + */ +static void selinux_bprm_committing_creds(struct linux_binprm *bprm) { - struct task_security_struct *tsec; - struct bprm_security_struct *bsec; - struct cred *new; - u32 sid; - int rc; - - rc = secondary_ops->bprm_apply_creds(bprm, unsafe); - if (rc < 0) - return rc; - - new = prepare_creds(); - if (!new) - return -ENOMEM; + struct task_security_struct *new_tsec; + struct rlimit *rlim, *initrlim; + int rc, i; - tsec = new->security; + secondary_ops->bprm_committing_creds(bprm); - bsec = bprm->security; - sid = bsec->sid; - - tsec->osid = tsec->sid; - bsec->unsafe = 0; - if (tsec->sid != sid) { - /* Check for shared state. If not ok, leave SID - unchanged and kill. */ - if (unsafe & LSM_UNSAFE_SHARE) { - rc = avc_has_perm(tsec->sid, sid, SECCLASS_PROCESS, - PROCESS__SHARE, NULL); - if (rc) { - bsec->unsafe = 1; - goto out; - } - } + new_tsec = bprm->cred->security; + if (new_tsec->sid == new_tsec->osid) + return; - /* Check for ptracing, and update the task SID if ok. - Otherwise, leave SID unchanged and kill. */ - if (unsafe & (LSM_UNSAFE_PTRACE | LSM_UNSAFE_PTRACE_CAP)) { - struct task_struct *tracer; - struct task_security_struct *sec; - u32 ptsid = 0; + /* Close files for which the new task SID is not authorized. 
*/ + flush_unauthorized_files(bprm->cred, current->files); - rcu_read_lock(); - tracer = tracehook_tracer_task(current); - if (likely(tracer != NULL)) { - sec = __task_cred(tracer)->security; - ptsid = sec->sid; - } - rcu_read_unlock(); + /* Always clear parent death signal on SID transitions. */ + current->pdeath_signal = 0; - if (ptsid != 0) { - rc = avc_has_perm(ptsid, sid, SECCLASS_PROCESS, - PROCESS__PTRACE, NULL); - if (rc) { - bsec->unsafe = 1; - goto out; - } - } + /* Check whether the new SID can inherit resource limits from the old + * SID. If not, reset all soft limits to the lower of the current + * task's hard limit and the init task's soft limit. + * + * Note that the setting of hard limits (even to lower them) can be + * controlled by the setrlimit check. The inclusion of the init task's + * soft limit into the computation is to avoid resetting soft limits + * higher than the default soft limit for cases where the default is + * lower than the hard limit, e.g. RLIMIT_CORE or RLIMIT_STACK. 
+ */ + rc = avc_has_perm(new_tsec->osid, new_tsec->sid, SECCLASS_PROCESS, + PROCESS__RLIMITINH, NULL); + if (rc) { + for (i = 0; i < RLIM_NLIMITS; i++) { + rlim = current->signal->rlim + i; + initrlim = init_task.signal->rlim + i; + rlim->rlim_cur = min(rlim->rlim_max, initrlim->rlim_cur); } - tsec->sid = sid; + update_rlimit_cpu(rlim->rlim_cur); } - -out: - commit_creds(new); - return 0; } /* - * called after apply_creds without the task lock held + * Clean up the process immediately after the installation of new credentials + * due to exec */ -static void selinux_bprm_post_apply_creds(struct linux_binprm *bprm) +static void selinux_bprm_committed_creds(struct linux_binprm *bprm) { - const struct cred *cred = current_cred(); - struct task_security_struct *tsec; - struct rlimit *rlim, *initrlim; + const struct task_security_struct *tsec = current_security(); struct itimerval itimer; - struct bprm_security_struct *bsec; struct sighand_struct *psig; + u32 osid, sid; int rc, i; unsigned long flags; - tsec = current_security(); - bsec = bprm->security; + secondary_ops->bprm_committed_creds(bprm); - if (bsec->unsafe) { - force_sig_specific(SIGKILL, current); - return; - } - if (tsec->osid == tsec->sid) + osid = tsec->osid; + sid = tsec->sid; + + if (sid == osid) return; - /* Close files for which the new task SID is not authorized. */ - flush_unauthorized_files(cred, current->files); - - /* Check whether the new SID can inherit signal state - from the old SID. If not, clear itimers to avoid - subsequent signal generation and flush and unblock - signals. This must occur _after_ the task SID has - been updated so that any kill done after the flush - will be checked against the new SID. */ - rc = avc_has_perm(tsec->osid, tsec->sid, SECCLASS_PROCESS, - PROCESS__SIGINH, NULL); + /* Check whether the new SID can inherit signal state from the old SID. + * If not, clear itimers to avoid subsequent signal generation and + * flush and unblock signals. 
+ * + * This must occur _after_ the task SID has been updated so that any + * kill done after the flush will be checked against the new SID. + */ + rc = avc_has_perm(osid, sid, SECCLASS_PROCESS, PROCESS__SIGINH, NULL); if (rc) { memset(&itimer, 0, sizeof itimer); for (i = 0; i < 3; i++) @@ -2366,32 +2345,8 @@ static void selinux_bprm_post_apply_creds(struct linux_binprm *bprm) spin_unlock_irq(¤t->sighand->siglock); } - /* Always clear parent death signal on SID transitions. */ - current->pdeath_signal = 0; - - /* Check whether the new SID can inherit resource limits - from the old SID. If not, reset all soft limits to - the lower of the current task's hard limit and the init - task's soft limit. Note that the setting of hard limits - (even to lower them) can be controlled by the setrlimit - check. The inclusion of the init task's soft limit into - the computation is to avoid resetting soft limits higher - than the default soft limit for cases where the default - is lower than the hard limit, e.g. RLIMIT_CORE or - RLIMIT_STACK.*/ - rc = avc_has_perm(tsec->osid, tsec->sid, SECCLASS_PROCESS, - PROCESS__RLIMITINH, NULL); - if (rc) { - for (i = 0; i < RLIM_NLIMITS; i++) { - rlim = current->signal->rlim + i; - initrlim = init_task.signal->rlim+i; - rlim->rlim_cur = min(rlim->rlim_max, initrlim->rlim_cur); - } - update_rlimit_cpu(rlim->rlim_cur); - } - - /* Wake up the parent if it is waiting so that it can - recheck wait permission to the new task SID. */ + /* Wake up the parent if it is waiting so that it can recheck + * wait permission to the new task SID. 
*/ read_lock_irq(&tasklist_lock); psig = current->parent->sighand; spin_lock_irqsave(&psig->siglock, flags); @@ -5556,12 +5511,10 @@ static struct security_operations selinux_ops = { .netlink_send = selinux_netlink_send, .netlink_recv = selinux_netlink_recv, - .bprm_alloc_security = selinux_bprm_alloc_security, - .bprm_free_security = selinux_bprm_free_security, - .bprm_apply_creds = selinux_bprm_apply_creds, - .bprm_post_apply_creds = selinux_bprm_post_apply_creds, - .bprm_set_security = selinux_bprm_set_security, + .bprm_set_creds = selinux_bprm_set_creds, .bprm_check_security = selinux_bprm_check_security, + .bprm_committing_creds = selinux_bprm_committing_creds, + .bprm_committed_creds = selinux_bprm_committed_creds, .bprm_secureexec = selinux_bprm_secureexec, .sb_alloc_security = selinux_sb_alloc_security, diff --git a/security/selinux/include/objsec.h b/security/selinux/include/objsec.h index f8be8d7fa26d..3cc45168f674 100644 --- a/security/selinux/include/objsec.h +++ b/security/selinux/include/objsec.h @@ -77,17 +77,6 @@ struct ipc_security_struct { u32 sid; /* SID of IPC resource */ }; -struct bprm_security_struct { - u32 sid; /* SID for transformed process */ - unsigned char set; - - /* - * unsafe is used to share failure information from bprm_apply_creds() - * to bprm_post_apply_creds(). 
- */ - char unsafe; -}; - struct netif_security_struct { int ifindex; /* device index */ u32 sid; /* SID for this interface */ diff --git a/security/smack/smack_lsm.c b/security/smack/smack_lsm.c index e952b397153d..de396742abf4 100644 --- a/security/smack/smack_lsm.c +++ b/security/smack/smack_lsm.c @@ -2596,8 +2596,7 @@ struct security_operations smack_ops = { .settime = cap_settime, .vm_enough_memory = cap_vm_enough_memory, - .bprm_apply_creds = cap_bprm_apply_creds, - .bprm_set_security = cap_bprm_set_security, + .bprm_set_creds = cap_bprm_set_creds, .bprm_secureexec = cap_bprm_secureexec, .sb_alloc_security = smack_sb_alloc_security, -- cgit v1.2.3 From 722024dbb74f3ea316c285c0a71a4512e113b0c4 Mon Sep 17 00:00:00 2001 From: Alexander van Heukelum Date: Thu, 13 Nov 2008 13:50:20 +0100 Subject: x86: irq: fix apicinterrupts on 64 bits Impact: Fix interrupt via the apicinterrupt macro Checkin 939b787130bf22887a09d8fd2641a094dcef8c22 changed the "interrupt" macro, but the "interrupt" macro is also invoked indirectly from the "apicinterrupt" macro. The "apicinterrupt" macro probably should have its own collection of systematic stubs for the same reason the main IRQ code does; as is it is a huge amount of replicated code. Signed-off-by: Alexander van Heukelum Signed-off-by: H. Peter Anvin --- arch/x86/kernel/entry_64.S | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 2b42362a85b2..369de6973c58 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -675,9 +675,8 @@ END(interrupt) * Entry runs with interrupts off. 
*/ -/* 0(%rsp): ~(interrupt number)+0x80 */ +/* 0(%rsp): ~(interrupt number) */ .macro interrupt func - addq $-0x80,(%rsp) /* Adjust vector to [-256,-1] range */ cld SAVE_ARGS leaq -ARGOFFSET(%rsp),%rdi /* arg1 for handler */ @@ -711,9 +710,14 @@ END(interrupt) call \func .endm + /* + * The interrupt stubs push (~vector+0x80) onto the stack and + * then jump to common_interrupt. + */ .p2align CONFIG_X86_L1_CACHE_SHIFT common_interrupt: XCPT_FRAME + addq $-0x80,(%rsp) /* Adjust vector to [-256,-1] range */ interrupt do_IRQ /* 0(%rsp): oldrsp-ARGOFFSET */ ret_from_intr: -- cgit v1.2.3 From 0bd7b79851d0f74b24a9ce87d088f2e7c718f668 Mon Sep 17 00:00:00 2001 From: Alexander van Heukelum Date: Sun, 16 Nov 2008 15:29:00 +0100 Subject: x86: entry_64.S: remove whitespace at end of lines Impact: cleanup All blame goes to: color white,red "[^[:graph:]]+$" in .nanorc ;). Signed-off-by: Alexander van Heukelum Signed-off-by: Ingo Molnar --- arch/x86/kernel/entry_64.S | 190 ++++++++++++++++++++++----------------------- 1 file changed, 95 insertions(+), 95 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index b86f332c96a6..54927784bab9 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -11,15 +11,15 @@ * * NOTE: This code handles signal-recognition, which happens every time * after an interrupt and after each system call. - * - * Normal syscalls and interrupts don't save a full stack frame, this is + * + * Normal syscalls and interrupts don't save a full stack frame, this is * only done for syscall tracing, signals or fork/exec et.al. - * - * A note on terminology: - * - top of stack: Architecture defined interrupt frame from SS to RIP - * at the top of the kernel process stack. + * + * A note on terminology: + * - top of stack: Architecture defined interrupt frame from SS to RIP + * at the top of the kernel process stack. * - partial stack frame: partially saved registers upto R11. 
- * - full stack frame: Like partial stack frame, but all register saved. + * - full stack frame: Like partial stack frame, but all register saved. * * Some macro usage: * - CFI macros are used to generate dwarf2 unwind information for better @@ -142,7 +142,7 @@ END(mcount) #ifndef CONFIG_PREEMPT #define retint_kernel retint_restore_args -#endif +#endif #ifdef CONFIG_PARAVIRT ENTRY(native_usergs_sysret64) @@ -161,14 +161,14 @@ ENTRY(native_usergs_sysret64) .endm /* - * C code is not supposed to know about undefined top of stack. Every time - * a C function with an pt_regs argument is called from the SYSCALL based + * C code is not supposed to know about undefined top of stack. Every time + * a C function with an pt_regs argument is called from the SYSCALL based * fast path FIXUP_TOP_OF_STACK is needed. * RESTORE_TOP_OF_STACK syncs the syscall state after any possible ptregs * manipulation. - */ - - /* %rsp:at FRAMEEND */ + */ + + /* %rsp:at FRAMEEND */ .macro FIXUP_TOP_OF_STACK tmp movq %gs:pda_oldrsp,\tmp movq \tmp,RSP(%rsp) @@ -244,8 +244,8 @@ ENTRY(native_usergs_sysret64) .endm /* * A newly forked process directly context switches into this. - */ -/* rdi: prev */ + */ +/* rdi: prev */ ENTRY(ret_from_fork) CFI_DEFAULT_STACK push kernel_eflags(%rip) @@ -256,7 +256,7 @@ ENTRY(ret_from_fork) GET_THREAD_INFO(%rcx) testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT),TI_flags(%rcx) jnz rff_trace -rff_action: +rff_action: RESTORE_REST testl $3,CS-ARGOFFSET(%rsp) # from kernel_thread? je int_ret_from_sys_call @@ -267,7 +267,7 @@ rff_action: rff_trace: movq %rsp,%rdi call syscall_trace_leave - GET_THREAD_INFO(%rcx) + GET_THREAD_INFO(%rcx) jmp rff_action CFI_ENDPROC END(ret_from_fork) @@ -278,20 +278,20 @@ END(ret_from_fork) * SYSCALL does not save anything on the stack and does not change the * stack pointer. 
*/ - + /* - * Register setup: + * Register setup: * rax system call number * rdi arg0 - * rcx return address for syscall/sysret, C arg3 + * rcx return address for syscall/sysret, C arg3 * rsi arg1 - * rdx arg2 + * rdx arg2 * r10 arg3 (--> moved to rcx for C) * r8 arg4 * r9 arg5 * r11 eflags for syscall/sysret, temporary for C - * r12-r15,rbp,rbx saved by C code, not touched. - * + * r12-r15,rbp,rbx saved by C code, not touched. + * * Interrupts are off on entry. * Only called from user space. * @@ -301,7 +301,7 @@ END(ret_from_fork) * When user can change the frames always force IRET. That is because * it deals with uncanonical addresses better. SYSRET has trouble * with them due to bugs in both AMD and Intel CPUs. - */ + */ ENTRY(system_call) CFI_STARTPROC simple @@ -317,7 +317,7 @@ ENTRY(system_call) */ ENTRY(system_call_after_swapgs) - movq %rsp,%gs:pda_oldrsp + movq %rsp,%gs:pda_oldrsp movq %gs:pda_kernelstack,%rsp /* * No need to follow this irqs off/on section - it's straight @@ -325,7 +325,7 @@ ENTRY(system_call_after_swapgs) */ ENABLE_INTERRUPTS(CLBR_NONE) SAVE_ARGS 8,1 - movq %rax,ORIG_RAX-ARGOFFSET(%rsp) + movq %rax,ORIG_RAX-ARGOFFSET(%rsp) movq %rcx,RIP-ARGOFFSET(%rsp) CFI_REL_OFFSET rip,RIP-ARGOFFSET GET_THREAD_INFO(%rcx) @@ -339,19 +339,19 @@ system_call_fastpath: movq %rax,RAX-ARGOFFSET(%rsp) /* * Syscall return path ending with SYSRET (fast path) - * Has incomplete stack frame and undefined top of stack. - */ + * Has incomplete stack frame and undefined top of stack. 
+ */ ret_from_sys_call: movl $_TIF_ALLWORK_MASK,%edi /* edi: flagmask */ -sysret_check: +sysret_check: LOCKDEP_SYS_EXIT GET_THREAD_INFO(%rcx) DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF movl TI_flags(%rcx),%edx andl %edi,%edx - jnz sysret_careful + jnz sysret_careful CFI_REMEMBER_STATE /* * sysretq will re-enable interrupts: @@ -366,7 +366,7 @@ sysret_check: CFI_RESTORE_STATE /* Handle reschedules */ - /* edx: work, edi: workmask */ + /* edx: work, edi: workmask */ sysret_careful: bt $TIF_NEED_RESCHED,%edx jnc sysret_signal @@ -379,7 +379,7 @@ sysret_careful: CFI_ADJUST_CFA_OFFSET -8 jmp sysret_check - /* Handle a signal */ + /* Handle a signal */ sysret_signal: TRACE_IRQS_ON ENABLE_INTERRUPTS(CLBR_NONE) @@ -398,7 +398,7 @@ sysret_signal: DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF jmp int_with_check - + badsys: movq $-ENOSYS,RAX-ARGOFFSET(%rsp) jmp ret_from_sys_call @@ -437,7 +437,7 @@ sysret_audit: #endif /* CONFIG_AUDITSYSCALL */ /* Do syscall tracing */ -tracesys: +tracesys: #ifdef CONFIG_AUDITSYSCALL testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags(%rcx) jz auditsys @@ -460,8 +460,8 @@ tracesys: call *sys_call_table(,%rax,8) movq %rax,RAX-ARGOFFSET(%rsp) /* Use IRET because user could have changed frame */ - -/* + +/* * Syscall return path ending with IRET. * Has correct top of stack, but partial stack frame. 
*/ @@ -505,18 +505,18 @@ int_very_careful: TRACE_IRQS_ON ENABLE_INTERRUPTS(CLBR_NONE) SAVE_REST - /* Check for syscall exit trace */ + /* Check for syscall exit trace */ testl $_TIF_WORK_SYSCALL_EXIT,%edx jz int_signal pushq %rdi CFI_ADJUST_CFA_OFFSET 8 - leaq 8(%rsp),%rdi # &ptregs -> arg1 + leaq 8(%rsp),%rdi # &ptregs -> arg1 call syscall_trace_leave popq %rdi CFI_ADJUST_CFA_OFFSET -8 andl $~(_TIF_WORK_SYSCALL_EXIT|_TIF_SYSCALL_EMU),%edi jmp int_restore_rest - + int_signal: testl $_TIF_DO_NOTIFY_MASK,%edx jz 1f @@ -531,11 +531,11 @@ int_restore_rest: jmp int_with_check CFI_ENDPROC END(system_call) - -/* + +/* * Certain special system calls that need to save a complete full stack frame. - */ - + */ + .macro PTREGSCALL label,func,arg .globl \label \label: @@ -572,7 +572,7 @@ ENTRY(ptregscall_common) ret CFI_ENDPROC END(ptregscall_common) - + ENTRY(stub_execve) CFI_STARTPROC popq %r11 @@ -588,11 +588,11 @@ ENTRY(stub_execve) jmp int_ret_from_sys_call CFI_ENDPROC END(stub_execve) - + /* * sigreturn is special because it needs to restore all registers on return. * This cannot be done with SYSRET, so use the IRET return path instead. - */ + */ ENTRY(stub_rt_sigreturn) CFI_STARTPROC addq $8, %rsp @@ -685,12 +685,12 @@ exit_intr: GET_THREAD_INFO(%rcx) testl $3,CS-ARGOFFSET(%rsp) je retint_kernel - + /* Interrupt came from user space */ /* * Has a correct top of stack, but a partial stack frame * %rcx: thread info. Interrupts off. 
- */ + */ retint_with_reschedule: movl $_TIF_WORK_MASK,%edi retint_check: @@ -763,20 +763,20 @@ retint_careful: pushq %rdi CFI_ADJUST_CFA_OFFSET 8 call schedule - popq %rdi + popq %rdi CFI_ADJUST_CFA_OFFSET -8 GET_THREAD_INFO(%rcx) DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF jmp retint_check - + retint_signal: testl $_TIF_DO_NOTIFY_MASK,%edx jz retint_swapgs TRACE_IRQS_ON ENABLE_INTERRUPTS(CLBR_NONE) SAVE_REST - movq $-1,ORIG_RAX(%rsp) + movq $-1,ORIG_RAX(%rsp) xorl %esi,%esi # oldset movq %rsp,%rdi # &pt_regs call do_notify_resume @@ -798,14 +798,14 @@ ENTRY(retint_kernel) jnc retint_restore_args call preempt_schedule_irq jmp exit_intr -#endif +#endif CFI_ENDPROC END(common_interrupt) - + /* * APIC interrupts. - */ + */ .macro apicinterrupt num,func INTR_FRAME pushq $~(\num) @@ -823,14 +823,14 @@ ENTRY(threshold_interrupt) apicinterrupt THRESHOLD_APIC_VECTOR,mce_threshold_interrupt END(threshold_interrupt) -#ifdef CONFIG_SMP +#ifdef CONFIG_SMP ENTRY(reschedule_interrupt) apicinterrupt RESCHEDULE_VECTOR,smp_reschedule_interrupt END(reschedule_interrupt) .macro INVALIDATE_ENTRY num ENTRY(invalidate_interrupt\num) - apicinterrupt INVALIDATE_TLB_VECTOR_START+\num,smp_invalidate_interrupt + apicinterrupt INVALIDATE_TLB_VECTOR_START+\num,smp_invalidate_interrupt END(invalidate_interrupt\num) .endm @@ -869,22 +869,22 @@ END(error_interrupt) ENTRY(spurious_interrupt) apicinterrupt SPURIOUS_APIC_VECTOR,smp_spurious_interrupt END(spurious_interrupt) - + /* * Exception entry points. - */ + */ .macro zeroentry sym INTR_FRAME PARAVIRT_ADJUST_EXCEPTION_FRAME - pushq $0 /* push error code/oldrax */ + pushq $0 /* push error code/oldrax */ CFI_ADJUST_CFA_OFFSET 8 - pushq %rax /* push real oldrax to the rdi slot */ + pushq %rax /* push real oldrax to the rdi slot */ CFI_ADJUST_CFA_OFFSET 8 CFI_REL_OFFSET rax,0 leaq \sym(%rip),%rax jmp error_entry CFI_ENDPROC - .endm + .endm .macro errorentry sym XCPT_FRAME @@ -998,13 +998,13 @@ paranoid_schedule\trace: /* * Exception entry point. 
This expects an error code/orig_rax on the stack - * and the exception handler in %rax. - */ + * and the exception handler in %rax. + */ KPROBE_ENTRY(error_entry) _frame RDI CFI_REL_OFFSET rax,0 /* rdi slot contains rax, oldrax contains error code */ - cld + cld subq $14*8,%rsp CFI_ADJUST_CFA_OFFSET (14*8) movq %rsi,13*8(%rsp) @@ -1015,7 +1015,7 @@ KPROBE_ENTRY(error_entry) CFI_REL_OFFSET rdx,RDX movq %rcx,11*8(%rsp) CFI_REL_OFFSET rcx,RCX - movq %rsi,10*8(%rsp) /* store rax */ + movq %rsi,10*8(%rsp) /* store rax */ CFI_REL_OFFSET rax,RAX movq %r8, 9*8(%rsp) CFI_REL_OFFSET r8,R8 @@ -1025,29 +1025,29 @@ KPROBE_ENTRY(error_entry) CFI_REL_OFFSET r10,R10 movq %r11,6*8(%rsp) CFI_REL_OFFSET r11,R11 - movq %rbx,5*8(%rsp) + movq %rbx,5*8(%rsp) CFI_REL_OFFSET rbx,RBX - movq %rbp,4*8(%rsp) + movq %rbp,4*8(%rsp) CFI_REL_OFFSET rbp,RBP - movq %r12,3*8(%rsp) + movq %r12,3*8(%rsp) CFI_REL_OFFSET r12,R12 - movq %r13,2*8(%rsp) + movq %r13,2*8(%rsp) CFI_REL_OFFSET r13,R13 - movq %r14,1*8(%rsp) + movq %r14,1*8(%rsp) CFI_REL_OFFSET r14,R14 - movq %r15,(%rsp) + movq %r15,(%rsp) CFI_REL_OFFSET r15,R15 - xorl %ebx,%ebx + xorl %ebx,%ebx testl $3,CS(%rsp) je error_kernelspace -error_swapgs: +error_swapgs: SWAPGS error_sti: TRACE_IRQS_OFF - movq %rdi,RDI(%rsp) + movq %rdi,RDI(%rsp) CFI_REL_OFFSET rdi,RDI movq %rsp,%rdi - movq ORIG_RAX(%rsp),%rsi /* get error code */ + movq ORIG_RAX(%rsp),%rsi /* get error code */ movq $-1,ORIG_RAX(%rsp) call *%rax /* ebx: no swapgs flag (1: don't need swapgs, 0: need it) */ @@ -1056,7 +1056,7 @@ error_exit: RESTORE_REST DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF - GET_THREAD_INFO(%rcx) + GET_THREAD_INFO(%rcx) testl %eax,%eax jne retint_kernel LOCKDEP_SYS_EXIT_IRQ @@ -1072,7 +1072,7 @@ error_kernelspace: /* There are two places in the kernel that can potentially fault with usergs. Handle them here. The exception handlers after iret run with kernel gs again, so don't set the user space flag. 
- B stepping K8s sometimes report an truncated RIP for IRET + B stepping K8s sometimes report an truncated RIP for IRET exceptions returning to compat mode. Check for these here too. */ leaq irq_return(%rip),%rcx cmpq %rcx,RIP(%rsp) @@ -1084,17 +1084,17 @@ error_kernelspace: je error_swapgs jmp error_sti KPROBE_END(error_entry) - + /* Reload gs selector with exception handling */ - /* edi: new selector */ + /* edi: new selector */ ENTRY(native_load_gs_index) CFI_STARTPROC pushf CFI_ADJUST_CFA_OFFSET 8 DISABLE_INTERRUPTS(CLBR_ANY | ~(CLBR_RDI)) SWAPGS -gs_change: - movl %edi,%gs +gs_change: + movl %edi,%gs 2: mfence /* workaround */ SWAPGS popf @@ -1102,20 +1102,20 @@ gs_change: ret CFI_ENDPROC ENDPROC(native_load_gs_index) - + .section __ex_table,"a" .align 8 .quad gs_change,bad_gs .previous .section .fixup,"ax" /* running with kernelgs */ -bad_gs: +bad_gs: SWAPGS /* switch back to user gs */ xorl %eax,%eax movl %eax,%gs jmp 2b - .previous - + .previous + /* * Create a kernel thread. * @@ -1138,7 +1138,7 @@ ENTRY(kernel_thread) xorl %r8d,%r8d xorl %r9d,%r9d - + # clone now call do_fork movq %rax,RAX(%rsp) @@ -1149,14 +1149,14 @@ ENTRY(kernel_thread) * so internally to the x86_64 port you can rely on kernel_thread() * not to reschedule the child before returning, this avoids the need * of hacks for example to fork off the per-CPU idle tasks. 
- * [Hopefully no generic code relies on the reschedule -AK] + * [Hopefully no generic code relies on the reschedule -AK] */ RESTORE_ALL UNFAKE_STACK_FRAME ret CFI_ENDPROC ENDPROC(kernel_thread) - + child_rip: pushq $0 # fake return address CFI_STARTPROC @@ -1191,10 +1191,10 @@ ENDPROC(child_rip) ENTRY(kernel_execve) CFI_STARTPROC FAKE_STACK_FRAME $0 - SAVE_ALL + SAVE_ALL movq %rsp,%rcx call sys_execve - movq %rax, RAX(%rsp) + movq %rax, RAX(%rsp) RESTORE_REST testq %rax,%rax je int_ret_from_sys_call @@ -1213,7 +1213,7 @@ ENTRY(coprocessor_error) END(coprocessor_error) ENTRY(simd_coprocessor_error) - zeroentry do_simd_coprocessor_error + zeroentry do_simd_coprocessor_error END(simd_coprocessor_error) ENTRY(device_not_available) @@ -1225,12 +1225,12 @@ KPROBE_ENTRY(debug) INTR_FRAME PARAVIRT_ADJUST_EXCEPTION_FRAME pushq $0 - CFI_ADJUST_CFA_OFFSET 8 + CFI_ADJUST_CFA_OFFSET 8 paranoidentry do_debug, DEBUG_STACK paranoidexit KPROBE_END(debug) - /* runs on exception stack */ + /* runs on exception stack */ KPROBE_ENTRY(nmi) INTR_FRAME PARAVIRT_ADJUST_EXCEPTION_FRAME @@ -1264,7 +1264,7 @@ ENTRY(bounds) END(bounds) ENTRY(invalid_op) - zeroentry do_invalid_op + zeroentry do_invalid_op END(invalid_op) ENTRY(coprocessor_segment_overrun) @@ -1319,7 +1319,7 @@ ENTRY(machine_check) INTR_FRAME PARAVIRT_ADJUST_EXCEPTION_FRAME pushq $0 - CFI_ADJUST_CFA_OFFSET 8 + CFI_ADJUST_CFA_OFFSET 8 paranoidentry do_machine_check jmp paranoid_exit1 CFI_ENDPROC -- cgit v1.2.3 From a1afd01c175324656d0e8f1c82ea94b474953c04 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Tue, 18 Nov 2008 12:44:21 +0100 Subject: x86: default to SWIOTLB=y on x86_64 Impact: fixes korg bugzilla 11980 A kernel for a 64bit x86 system should always contain the swiotlb code in case it is booted on a machine without any hardware IOMMU supported by the kernel and more than 4GB of RAM. This patch changes Kconfig to always compile swiotlb into the kernel for x86_64. 
Signed-off-by: Joerg Roedel Cc: stable@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 93224b569187..669c6d588bde 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -566,7 +566,7 @@ config AMD_IOMMU # need this always selected by IOMMU for the VIA workaround config SWIOTLB - bool + def_bool y if X86_64 help Support for software bounce buffers used on x86-64 systems which don't have a hardware IOMMU (e.g. the current generation -- cgit v1.2.3 From b78a5b5260abf90d574911e7c7b8d35d5b48d6c0 Mon Sep 17 00:00:00 2001 From: Hiroshi Shimamoto Date: Mon, 17 Nov 2008 15:44:50 -0800 Subject: x86: ia32_signal: cleanup macro COPY Impact: cleanup No need to use temporary variable in this case. Signed-off-by: Hiroshi Shimamoto Signed-off-by: Ingo Molnar --- arch/x86/ia32/ia32_signal.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c index e2d0bc779bf7..610a17774ea2 100644 --- a/arch/x86/ia32/ia32_signal.c +++ b/arch/x86/ia32/ia32_signal.c @@ -197,10 +197,8 @@ struct rt_sigframe /* fp state follows here */ }; -#define COPY(x) { \ - unsigned int reg; \ - err |= __get_user(reg, &sc->x); \ - regs->x = reg; \ +#define COPY(x) { \ + err |= __get_user(regs->x, &sc->x); \ } #define RELOAD_SEG(seg,mask) \ -- cgit v1.2.3 From d71a68dca54756049e0eae62458a1705bf680d09 Mon Sep 17 00:00:00 2001 From: Hiroshi Shimamoto Date: Mon, 17 Nov 2008 15:47:06 -0800 Subject: x86: ia32_signal: introduce COPY_SEG_CPL3 Impact: cleanup Introduce COPY_SEG_CPL3 for ia32_restore_sigcontext(). 
Signed-off-by: Hiroshi Shimamoto Signed-off-by: Ingo Molnar --- arch/x86/ia32/ia32_signal.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c index 610a17774ea2..fe44c314c9c0 100644 --- a/arch/x86/ia32/ia32_signal.c +++ b/arch/x86/ia32/ia32_signal.c @@ -201,6 +201,12 @@ struct rt_sigframe err |= __get_user(regs->x, &sc->x); \ } +#define COPY_SEG_CPL3(seg) { \ + unsigned short tmp; \ + err |= __get_user(tmp, &sc->seg); \ + regs->seg = tmp | 3; \ +} + #define RELOAD_SEG(seg,mask) \ { unsigned int cur; \ unsigned short pre; \ @@ -246,10 +252,8 @@ static int ia32_restore_sigcontext(struct pt_regs *regs, COPY(dx); COPY(cx); COPY(ip); /* Don't touch extended registers */ - err |= __get_user(regs->cs, &sc->cs); - regs->cs |= 3; - err |= __get_user(regs->ss, &sc->ss); - regs->ss |= 3; + COPY_SEG_CPL3(cs); + COPY_SEG_CPL3(ss); err |= __get_user(tmpflags, &sc->flags); regs->flags = (regs->flags & ~FIX_EFLAGS) | (tmpflags & FIX_EFLAGS); -- cgit v1.2.3 From 8c6e5ce0fd67c57ad5e19d1718e1250214e855db Mon Sep 17 00:00:00 2001 From: Hiroshi Shimamoto Date: Mon, 17 Nov 2008 15:47:48 -0800 Subject: x86: ia32_signal: cleanup macro RELOAD_SEG Impact: cleanup Remove mask parameter because it's always 3. Cleanup coding styles. 
Signed-off-by: Hiroshi Shimamoto Reviewed-by: WANG Cong Signed-off-by: Ingo Molnar --- arch/x86/ia32/ia32_signal.c | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c index fe44c314c9c0..2c56e6857d1a 100644 --- a/arch/x86/ia32/ia32_signal.c +++ b/arch/x86/ia32/ia32_signal.c @@ -207,13 +207,14 @@ struct rt_sigframe regs->seg = tmp | 3; \ } -#define RELOAD_SEG(seg,mask) \ - { unsigned int cur; \ - unsigned short pre; \ - err |= __get_user(pre, &sc->seg); \ - savesegment(seg, cur); \ - pre |= mask; \ - if (pre != cur) loadsegment(seg, pre); } +#define RELOAD_SEG(seg) { \ + unsigned int cur, pre; \ + err |= __get_user(pre, &sc->seg); \ + savesegment(seg, cur); \ + pre |= 3; \ + if (pre != cur) \ + loadsegment(seg, pre); \ +} static int ia32_restore_sigcontext(struct pt_regs *regs, struct sigcontext_ia32 __user *sc, @@ -244,9 +245,9 @@ static int ia32_restore_sigcontext(struct pt_regs *regs, if (gs != oldgs) load_gs_index(gs); - RELOAD_SEG(fs, 3); - RELOAD_SEG(ds, 3); - RELOAD_SEG(es, 3); + RELOAD_SEG(fs); + RELOAD_SEG(ds); + RELOAD_SEG(es); COPY(di); COPY(si); COPY(bp); COPY(sp); COPY(bx); COPY(dx); COPY(cx); COPY(ip); -- cgit v1.2.3 From 047ce93581ca122442ed3c13a62a645249a7db1d Mon Sep 17 00:00:00 2001 From: Hiroshi Shimamoto Date: Mon, 17 Nov 2008 15:48:27 -0800 Subject: x86: ia32_signal: remove using temporary variable Impact: cleanup No need to use temporary variable. Also rename the variable same as arch/x86/kernel/signal_32.c. 
Signed-off-by: Hiroshi Shimamoto Reviewed-by: WANG Cong Signed-off-by: Ingo Molnar --- arch/x86/ia32/ia32_signal.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c index 2c56e6857d1a..e591e381611b 100644 --- a/arch/x86/ia32/ia32_signal.c +++ b/arch/x86/ia32/ia32_signal.c @@ -218,7 +218,7 @@ struct rt_sigframe static int ia32_restore_sigcontext(struct pt_regs *regs, struct sigcontext_ia32 __user *sc, - unsigned int *peax) + unsigned int *pax) { unsigned int tmpflags, gs, oldgs, err = 0; void __user *buf; @@ -265,9 +265,7 @@ static int ia32_restore_sigcontext(struct pt_regs *regs, buf = compat_ptr(tmp); err |= restore_i387_xstate_ia32(buf); - err |= __get_user(tmp, &sc->ax); - *peax = tmp; - + err |= __get_user(*pax, &sc->ax); return err; } -- cgit v1.2.3 From 64977609e316c86fad513d9bf0afff998581e59d Mon Sep 17 00:00:00 2001 From: Hiroshi Shimamoto Date: Mon, 17 Nov 2008 15:49:14 -0800 Subject: x86: ia32_signal: change order of storing in setup_sigcontext() Impact: cleanup Change order of storing to match the sigcontext_ia32. And add casting to make this code same as arch/x86/kernel/signal_32.c. 
Signed-off-by: Hiroshi Shimamoto Signed-off-by: Ingo Molnar --- arch/x86/ia32/ia32_signal.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c index e591e381611b..1267977e7708 100644 --- a/arch/x86/ia32/ia32_signal.c +++ b/arch/x86/ia32/ia32_signal.c @@ -360,13 +360,13 @@ static int ia32_setup_sigcontext(struct sigcontext_ia32 __user *sc, err |= __put_user(regs->dx, &sc->dx); err |= __put_user(regs->cx, &sc->cx); err |= __put_user(regs->ax, &sc->ax); - err |= __put_user(regs->cs, &sc->cs); - err |= __put_user(regs->ss, &sc->ss); err |= __put_user(current->thread.trap_no, &sc->trapno); err |= __put_user(current->thread.error_code, &sc->err); err |= __put_user(regs->ip, &sc->ip); + err |= __put_user(regs->cs, (unsigned int __user *)&sc->cs); err |= __put_user(regs->flags, &sc->flags); err |= __put_user(regs->sp, &sc->sp_at_signal); + err |= __put_user(regs->ss, (unsigned int __user *)&sc->ss); err |= __put_user(ptr_to_compat(fpstate), &sc->fpstate); -- cgit v1.2.3 From 77be80e437fec44f8b7a620314b7d7b605b8d93b Mon Sep 17 00:00:00 2001 From: "Richard A. Holden III" Date: Wed, 19 Nov 2008 16:05:14 -0700 Subject: x86: fix arch/x86/kernel/genx2apic_uv_x.c build warning when !CONFIG_HOTPLUG_CPU Impact: cleanup, reduce size of the kernel image a bit Fix: arch/x86/kernel/genx2apic_uv_x.c:403: warning: 'uv_heartbeat_disable' defined but not used the function is only used when CONFIG_HOTPLUG_CPU is defined. Signed-off-by: Richard A. 
Holden III Signed-off-by: Ingo Molnar --- arch/x86/kernel/genx2apic_uv_x.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/genx2apic_uv_x.c b/arch/x86/kernel/genx2apic_uv_x.c index f02bbe5d0178..221299f4509f 100644 --- a/arch/x86/kernel/genx2apic_uv_x.c +++ b/arch/x86/kernel/genx2apic_uv_x.c @@ -400,6 +400,7 @@ static void __cpuinit uv_heartbeat_enable(int cpu) uv_heartbeat_enable(0); } +#ifdef CONFIG_HOTPLUG_CPU static void __cpuinit uv_heartbeat_disable(int cpu) { if (uv_cpu_hub_info(cpu)->scir.enabled) { @@ -409,7 +410,6 @@ static void __cpuinit uv_heartbeat_disable(int cpu) uv_set_cpu_scir_bits(cpu, 0xff); } -#ifdef CONFIG_HOTPLUG_CPU /* * cpu hotplug notifier */ -- cgit v1.2.3 From bb5574608a8375026510b4f983ffbb06ece33fe2 Mon Sep 17 00:00:00 2001 From: "Richard A. Holden III" Date: Wed, 19 Nov 2008 16:05:15 -0700 Subject: x86: fix arch/x86/kernel/setup.c build warning when !CONFIG_X86_RESERVE_LOW_64K Impact: cleanup Fix: arch/x86/kernel/setup.c:592: warning: 'dmi_low_memory_corruption' defined but not used this is only used if CONFIG_X86_RESERVE_LOW_64K is defined. Signed-off-by: Richard A. 
Holden III Signed-off-by: Ingo Molnar --- arch/x86/kernel/setup.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index e6c51433247d..13a5f592ac28 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -587,6 +587,7 @@ static struct x86_quirks default_x86_quirks __initdata; struct x86_quirks *x86_quirks __initdata = &default_x86_quirks; +#ifdef CONFIG_X86_RESERVE_LOW_64K static int __init dmi_low_memory_corruption(const struct dmi_system_id *d) { printk(KERN_NOTICE @@ -598,6 +599,7 @@ static int __init dmi_low_memory_corruption(const struct dmi_system_id *d) return 0; } +#endif /* List of systems that have known low memory corruption BIOS problems */ static struct dmi_system_id __initdata bad_bios_dmi_table[] = { -- cgit v1.2.3 From d99015b1abbad743aa049b439c1e1dede6d0fa49 Mon Sep 17 00:00:00 2001 From: Alexander van Heukelum Date: Wed, 19 Nov 2008 01:18:11 +0100 Subject: x86: move entry_64.S register saving out of the macros Here is a combined patch that moves "save_args" out-of-line for the interrupt macro and moves "error_entry" mostly out-of-line for the zeroentry and errorentry macros. The save_args function becomes really straightforward and easy to understand, with the possible exception of the stack switch code, which now needs to copy the return address of to the calling function. Normal interrupts arrive with ((~vector)-0x80) on the stack, which gets adjusted in common_interrupt: : (5) addq $0xffffffffffffff80,(%rsp) /* -> ~(vector) */ (4) sub $0x50,%rsp /* space for registers */ (5) callq ffffffff80211290 (5) callq ffffffff80214290 : ... 
An apic interrupt stub now look like this: : (5) pushq $0xffffffffffffff05 /* ~(vector) */ (4) sub $0x50,%rsp /* space for registers */ (5) callq ffffffff80211290 (5) callq ffffffff80212b8f (5) jmpq ffffffff80211f93 Similarly the exception handler register saving function becomes simpler, without the need of any parameter shuffling. The stub for an exception without errorcode looks like this: : (6) callq *0x1cad12(%rip) # ffffffff803dd448 (2) pushq $0xffffffffffffffff /* no syscall */ (4) sub $0x78,%rsp /* space for registers */ (5) callq ffffffff8030e3b0 (3) mov %rsp,%rdi /* pt_regs pointer */ (2) xor %esi,%esi /* no error code */ (5) callq ffffffff80213446 (5) jmpq ffffffff8030e460 And one for an exception with errorcode like this: : (6) callq *0x1cab92(%rip) # ffffffff803dd448 (4) sub $0x78,%rsp /* space for registers */ (5) callq ffffffff8030e3b0 (3) mov %rsp,%rdi /* pt_regs pointer */ (5) mov 0x78(%rsp),%rsi /* load error code */ (9) movq $0xffffffffffffffff,0x78(%rsp) /* no syscall */ (5) callq ffffffff80213209 (5) jmpq ffffffff8030e460 Unfortunately, this last type is more than 32 bytes. But the total space savings due to this patch is about 2500 bytes on an smp-configuration, and I think the code is clearer than it was before. The tested kernels were non-paravirt ones (i.e., without the indirect call at the top of the exception handlers). Anyhow, I tested this patch on top of a recent -tip. The machine was an 2x4-core Xeon at 2333MHz. Measured where the delays between (almost-)adjacent rdtsc instructions. The graphs show how much time is spent outside of the program as a function of the measured delay. The area under the graph represents the total time spent outside the program. Eight instances of the rdtsctest were started, each pinned to a single cpu. The histogams are added. For each kernel two measurements were done: one in mostly idle condition, the other while running "bonnie++ -f", bound to cpu 0. Each measurement took 40 minutes runtime. 
See the attached graphs for the results. The graphs overlap almost everywhere, but there are small differences. Signed-off-by: Alexander van Heukelum Signed-off-by: Ingo Molnar --- arch/x86/kernel/entry_64.S | 300 +++++++++++++++++++++++++-------------------- 1 file changed, 166 insertions(+), 134 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index dbf06a0ef3d5..5a12432ccdf9 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -242,6 +242,78 @@ ENTRY(native_usergs_sysret64) CFI_REL_OFFSET rsp,RSP /*CFI_REL_OFFSET ss,SS*/ .endm + +/* + * initial frame state for interrupts and exceptions + */ + .macro _frame ref + CFI_STARTPROC simple + CFI_SIGNAL_FRAME + CFI_DEF_CFA rsp,SS+8-\ref + /*CFI_REL_OFFSET ss,SS-\ref*/ + CFI_REL_OFFSET rsp,RSP-\ref + /*CFI_REL_OFFSET rflags,EFLAGS-\ref*/ + /*CFI_REL_OFFSET cs,CS-\ref*/ + CFI_REL_OFFSET rip,RIP-\ref + .endm + +/* + * initial frame state for interrupts (and exceptions without error code) + */ +#define INTR_FRAME _frame RIP +/* + * initial frame state for exceptions with error code (and interrupts + * with vector already pushed) + */ +#define XCPT_FRAME _frame ORIG_RAX + +/* save partial stack frame */ +ENTRY(save_args) + XCPT_FRAME + cld + movq %rdi, 8*8+16(%rsp) + CFI_REL_OFFSET rdi, 8*8+16 + movq %rsi, 7*8+16(%rsp) + CFI_REL_OFFSET rsi, 7*8+16 + movq %rdx, 6*8+16(%rsp) + CFI_REL_OFFSET rdx, 6*8+16 + movq %rcx, 5*8+16(%rsp) + CFI_REL_OFFSET rcx, 5*8+16 + movq %rax, 4*8+16(%rsp) + CFI_REL_OFFSET rax, 4*8+16 + movq %r8, 3*8+16(%rsp) + CFI_REL_OFFSET r8, 3*8+16 + movq %r9, 2*8+16(%rsp) + CFI_REL_OFFSET r9, 2*8+16 + movq %r10, 1*8+16(%rsp) + CFI_REL_OFFSET r10, 1*8+16 + movq %r11, 0*8+16(%rsp) + CFI_REL_OFFSET r11, 0*8+16 + leaq -ARGOFFSET+16(%rsp),%rdi /* arg1 for handler */ + movq %rbp, 8(%rsp) /* push %rbp */ + leaq 8(%rsp), %rbp /* mov %rsp, %ebp */ + testl $3, CS(%rdi) + je 1f + SWAPGS + /* + * irqcount is used to check if a CPU is already on 
an interrupt stack + * or not. While this is essentially redundant with preempt_count it is + * a little cheaper to use a separate counter in the PDA (short of + * moving irq_enter into assembly, which would be too much work) + */ +1: incl %gs:pda_irqcount + jne 2f + pop %rax /* move return address... */ + mov %gs:pda_irqstackptr,%rsp + push %rax /* ... to the new stack */ + /* + * We entered an interrupt context - irqs are off: + */ +2: TRACE_IRQS_OFF + ret + CFI_ENDPROC +END(save_args) + /* * A newly forked process directly context switches into this. */ @@ -607,26 +679,6 @@ ENTRY(stub_rt_sigreturn) CFI_ENDPROC END(stub_rt_sigreturn) -/* - * initial frame state for interrupts and exceptions - */ - .macro _frame ref - CFI_STARTPROC simple - CFI_SIGNAL_FRAME - CFI_DEF_CFA rsp,SS+8-\ref - /*CFI_REL_OFFSET ss,SS-\ref*/ - CFI_REL_OFFSET rsp,RSP-\ref - /*CFI_REL_OFFSET rflags,EFLAGS-\ref*/ - /*CFI_REL_OFFSET cs,CS-\ref*/ - CFI_REL_OFFSET rip,RIP-\ref - .endm - -/* initial frame state for interrupts (and exceptions without error code) */ -#define INTR_FRAME _frame RIP -/* initial frame state for exceptions with error code (and interrupts with - vector already pushed) */ -#define XCPT_FRAME _frame ORIG_RAX - /* * Build the entry stubs and pointer table with some assembler magic. * We pack 7 stubs into a single 32-byte chunk, which will fit in a @@ -667,46 +719,19 @@ END(irq_entries_start) END(interrupt) .previous -/* +/* * Interrupt entry/exit. * * Interrupt entry points save only callee clobbered registers in fast path. - * - * Entry runs with interrupts off. - */ + * + * Entry runs with interrupts off. + */ /* 0(%rsp): ~(interrupt number) */ .macro interrupt func - cld - SAVE_ARGS - leaq -ARGOFFSET(%rsp),%rdi /* arg1 for handler */ - pushq %rbp - /* - * Save rbp twice: One is for marking the stack frame, as usual, and the - * other, to fill pt_regs properly. This is because bx comes right - * before the last saved register in that structure, and not bp. 
If the - * base pointer were in the place bx is today, this would not be needed. - */ - movq %rbp, -8(%rsp) - CFI_ADJUST_CFA_OFFSET 8 - CFI_REL_OFFSET rbp, 0 - movq %rsp,%rbp - CFI_DEF_CFA_REGISTER rbp - testl $3,CS(%rdi) - je 1f - SWAPGS - /* irqcount is used to check if a CPU is already on an interrupt - stack or not. While this is essentially redundant with preempt_count - it is a little cheaper to use a separate counter in the PDA - (short of moving irq_enter into assembly, which would be too - much work) */ -1: incl %gs:pda_irqcount - cmoveq %gs:pda_irqstackptr,%rsp - push %rbp # backlink for old unwinder - /* - * We entered an interrupt context - irqs are off: - */ - TRACE_IRQS_OFF + subq $10*8, %rsp + CFI_ADJUST_CFA_OFFSET 10*8 + call save_args call \func .endm @@ -852,6 +877,8 @@ END(common_interrupt) /* * APIC interrupts. */ + .p2align 5 + .macro apicinterrupt num,func INTR_FRAME pushq $~(\num) @@ -922,24 +949,29 @@ END(spurious_interrupt) .macro zeroentry sym INTR_FRAME PARAVIRT_ADJUST_EXCEPTION_FRAME - pushq $0 /* push error code/oldrax */ + pushq $-1 /* ORIG_RAX: no syscall to restart */ CFI_ADJUST_CFA_OFFSET 8 - pushq %rax /* push real oldrax to the rdi slot */ - CFI_ADJUST_CFA_OFFSET 8 - CFI_REL_OFFSET rax,0 - leaq \sym(%rip),%rax - jmp error_entry + subq $15*8,%rsp + CFI_ADJUST_CFA_OFFSET 15*8 + call error_entry + movq %rsp,%rdi /* pt_regs pointer */ + xorl %esi,%esi /* no error code */ + call \sym + jmp error_exit /* %ebx: no swapgs flag */ CFI_ENDPROC .endm .macro errorentry sym XCPT_FRAME PARAVIRT_ADJUST_EXCEPTION_FRAME - pushq %rax - CFI_ADJUST_CFA_OFFSET 8 - CFI_REL_OFFSET rax,0 - leaq \sym(%rip),%rax - jmp error_entry + subq $15*8,%rsp + CFI_ADJUST_CFA_OFFSET 15*8 + call error_entry + movq %rsp,%rdi /* pt_regs pointer */ + movq ORIG_RAX(%rsp),%rsi /* get error code */ + movq $-1,ORIG_RAX(%rsp) /* no syscall to restart */ + call \sym + jmp error_exit /* %ebx: no swapgs flag */ CFI_ENDPROC .endm @@ -1043,93 +1075,93 @@ paranoid_schedule\trace: 
.endm /* - * Exception entry point. This expects an error code/orig_rax on the stack - * and the exception handler in %rax. + * Exception entry point. This expects an error code/orig_rax on the stack. + * returns in "no swapgs flag" in %ebx. */ KPROBE_ENTRY(error_entry) _frame RDI - CFI_REL_OFFSET rax,0 - /* rdi slot contains rax, oldrax contains error code */ + CFI_ADJUST_CFA_OFFSET 15*8 + /* oldrax contains error code */ cld - subq $14*8,%rsp - CFI_ADJUST_CFA_OFFSET (14*8) - movq %rsi,13*8(%rsp) - CFI_REL_OFFSET rsi,RSI - movq 14*8(%rsp),%rsi /* load rax from rdi slot */ - CFI_REGISTER rax,rsi - movq %rdx,12*8(%rsp) - CFI_REL_OFFSET rdx,RDX - movq %rcx,11*8(%rsp) - CFI_REL_OFFSET rcx,RCX - movq %rsi,10*8(%rsp) /* store rax */ - CFI_REL_OFFSET rax,RAX - movq %r8, 9*8(%rsp) - CFI_REL_OFFSET r8,R8 - movq %r9, 8*8(%rsp) - CFI_REL_OFFSET r9,R9 - movq %r10,7*8(%rsp) - CFI_REL_OFFSET r10,R10 - movq %r11,6*8(%rsp) - CFI_REL_OFFSET r11,R11 - movq %rbx,5*8(%rsp) - CFI_REL_OFFSET rbx,RBX - movq %rbp,4*8(%rsp) - CFI_REL_OFFSET rbp,RBP - movq %r12,3*8(%rsp) - CFI_REL_OFFSET r12,R12 - movq %r13,2*8(%rsp) - CFI_REL_OFFSET r13,R13 - movq %r14,1*8(%rsp) - CFI_REL_OFFSET r14,R14 - movq %r15,(%rsp) - CFI_REL_OFFSET r15,R15 + movq %rdi,14*8+8(%rsp) + CFI_REL_OFFSET rdi,RDI+8 + movq %rsi,13*8+8(%rsp) + CFI_REL_OFFSET rsi,RSI+8 + movq %rdx,12*8+8(%rsp) + CFI_REL_OFFSET rdx,RDX+8 + movq %rcx,11*8+8(%rsp) + CFI_REL_OFFSET rcx,RCX+8 + movq %rax,10*8+8(%rsp) + CFI_REL_OFFSET rax,RAX+8 + movq %r8, 9*8+8(%rsp) + CFI_REL_OFFSET r8,R8+8 + movq %r9, 8*8+8(%rsp) + CFI_REL_OFFSET r9,R9+8 + movq %r10,7*8+8(%rsp) + CFI_REL_OFFSET r10,R10+8 + movq %r11,6*8+8(%rsp) + CFI_REL_OFFSET r11,R11+8 + movq %rbx,5*8+8(%rsp) + CFI_REL_OFFSET rbx,RBX+8 + movq %rbp,4*8+8(%rsp) + CFI_REL_OFFSET rbp,RBP+8 + movq %r12,3*8+8(%rsp) + CFI_REL_OFFSET r12,R12+8 + movq %r13,2*8+8(%rsp) + CFI_REL_OFFSET r13,R13+8 + movq %r14,1*8+8(%rsp) + CFI_REL_OFFSET r14,R14+8 + movq %r15,0*8+8(%rsp) + CFI_REL_OFFSET r15,R15+8 xorl 
%ebx,%ebx - testl $3,CS(%rsp) - je error_kernelspace + testl $3,CS+8(%rsp) + je error_kernelspace error_swapgs: SWAPGS error_sti: TRACE_IRQS_OFF - movq %rdi,RDI(%rsp) - CFI_REL_OFFSET rdi,RDI - movq %rsp,%rdi - movq ORIG_RAX(%rsp),%rsi /* get error code */ - movq $-1,ORIG_RAX(%rsp) - call *%rax - /* ebx: no swapgs flag (1: don't need swapgs, 0: need it) */ -error_exit: + ret + CFI_ENDPROC + +/* + * There are two places in the kernel that can potentially fault with + * usergs. Handle them here. The exception handlers after iret run with + * kernel gs again, so don't set the user space flag. B stepping K8s + * sometimes report an truncated RIP for IRET exceptions returning to + * compat mode. Check for these here too. + */ +error_kernelspace: + incl %ebx + leaq irq_return(%rip),%rcx + cmpq %rcx,RIP+8(%rsp) + je error_swapgs + movl %ecx,%ecx /* zero extend */ + cmpq %rcx,RIP+8(%rsp) + je error_swapgs + cmpq $gs_change,RIP+8(%rsp) + je error_swapgs + jmp error_sti +KPROBE_END(error_entry) + + +/* ebx: no swapgs flag (1: don't need swapgs, 0: need it) */ +KPROBE_ENTRY(error_exit) + _frame R15 movl %ebx,%eax RESTORE_REST DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF GET_THREAD_INFO(%rcx) testl %eax,%eax - jne retint_kernel + jne retint_kernel LOCKDEP_SYS_EXIT_IRQ - movl TI_flags(%rcx),%edx - movl $_TIF_WORK_MASK,%edi - andl %edi,%edx - jnz retint_careful + movl TI_flags(%rcx),%edx + movl $_TIF_WORK_MASK,%edi + andl %edi,%edx + jnz retint_careful jmp retint_swapgs CFI_ENDPROC - -error_kernelspace: - incl %ebx - /* There are two places in the kernel that can potentially fault with - usergs. Handle them here. The exception handlers after - iret run with kernel gs again, so don't set the user space flag. - B stepping K8s sometimes report an truncated RIP for IRET - exceptions returning to compat mode. Check for these here too. 
*/ - leaq irq_return(%rip),%rcx - cmpq %rcx,RIP(%rsp) - je error_swapgs - movl %ecx,%ecx /* zero extend */ - cmpq %rcx,RIP(%rsp) - je error_swapgs - cmpq $gs_change,RIP(%rsp) - je error_swapgs - jmp error_sti -KPROBE_END(error_entry) +KPROBE_END(error_exit) /* Reload gs selector with exception handling */ /* edi: new selector */ -- cgit v1.2.3 From dcd072e26055de600cecdc3f7a1e083ecd55c2e4 Mon Sep 17 00:00:00 2001 From: Alexander van Heukelum Date: Thu, 20 Nov 2008 14:40:11 +0100 Subject: x86: clean up after: move entry_64.S register saving out of the macros This add-on patch to x86: move entry_64.S register saving out of the macros visually cleans up the appearance of the code by introducing some basic helper macro's. It also adds some cfi annotations which were missing. Signed-off-by: Alexander van Heukelum Signed-off-by: Ingo Molnar --- arch/x86/kernel/entry_64.S | 220 +++++++++++++++++++++++---------------------- 1 file changed, 112 insertions(+), 108 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 5a12432ccdf9..7a04f696121d 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -60,6 +60,23 @@ #define __AUDIT_ARCH_LE 0x40000000 .code64 +/* + * Some macro's to hide the most frequently occuring CFI annotations. 
+ */ + .macro CFI_PUSHQ reg + pushq \reg + CFI_ADJUST_CFA_OFFSET 8 + .endm + + .macro CFI_POPQ reg + popq \reg + CFI_ADJUST_CFA_OFFSET -8 + .endm + + .macro CFI_MOVQ reg offset=0 + movq %\reg, \offset(%rsp) + CFI_REL_OFFSET \reg, \offset + .endm #ifdef CONFIG_FUNCTION_TRACER #ifdef CONFIG_DYNAMIC_FTRACE @@ -213,84 +230,84 @@ ENTRY(native_usergs_sysret64) CFI_ADJUST_CFA_OFFSET -(6*8) .endm - .macro CFI_DEFAULT_STACK start=1 +/* + * initial frame state for interrupts (and exceptions without error code) + */ + .macro EMPTY_FRAME start=1 offset=0 .if \start - CFI_STARTPROC simple + CFI_STARTPROC simple CFI_SIGNAL_FRAME - CFI_DEF_CFA rsp,SS+8 + CFI_DEF_CFA rsp,8+\offset .else - CFI_DEF_CFA_OFFSET SS+8 + CFI_DEF_CFA_OFFSET 8+\offset .endif - CFI_REL_OFFSET r15,R15 - CFI_REL_OFFSET r14,R14 - CFI_REL_OFFSET r13,R13 - CFI_REL_OFFSET r12,R12 - CFI_REL_OFFSET rbp,RBP - CFI_REL_OFFSET rbx,RBX - CFI_REL_OFFSET r11,R11 - CFI_REL_OFFSET r10,R10 - CFI_REL_OFFSET r9,R9 - CFI_REL_OFFSET r8,R8 - CFI_REL_OFFSET rax,RAX - CFI_REL_OFFSET rcx,RCX - CFI_REL_OFFSET rdx,RDX - CFI_REL_OFFSET rsi,RSI - CFI_REL_OFFSET rdi,RDI - CFI_REL_OFFSET rip,RIP - /*CFI_REL_OFFSET cs,CS*/ - /*CFI_REL_OFFSET rflags,EFLAGS*/ - CFI_REL_OFFSET rsp,RSP - /*CFI_REL_OFFSET ss,SS*/ .endm /* - * initial frame state for interrupts and exceptions + * initial frame state for interrupts (and exceptions without error code) */ - .macro _frame ref - CFI_STARTPROC simple - CFI_SIGNAL_FRAME - CFI_DEF_CFA rsp,SS+8-\ref - /*CFI_REL_OFFSET ss,SS-\ref*/ - CFI_REL_OFFSET rsp,RSP-\ref - /*CFI_REL_OFFSET rflags,EFLAGS-\ref*/ - /*CFI_REL_OFFSET cs,CS-\ref*/ - CFI_REL_OFFSET rip,RIP-\ref + .macro INTR_FRAME start=1 offset=0 + EMPTY_FRAME \start, (SS+8-RIP)+\offset + /*CFI_REL_OFFSET ss, SS-RIP+\offset*/ + CFI_REL_OFFSET rsp, RSP-RIP+\offset + /*CFI_REL_OFFSET rflags, EFLAGS-RIP+\offset*/ + /*CFI_REL_OFFSET cs, CS-RIP+\offset*/ + CFI_REL_OFFSET rip, RIP-RIP+\offset .endm -/* - * initial frame state for interrupts (and exceptions 
without error code) - */ -#define INTR_FRAME _frame RIP /* * initial frame state for exceptions with error code (and interrupts * with vector already pushed) */ -#define XCPT_FRAME _frame ORIG_RAX + .macro XCPT_FRAME start=1 offset=0 + INTR_FRAME \start, (RIP-ORIG_RAX)+\offset + /*CFI_REL_OFFSET orig_rax, ORIG_RAX-ORIG_RAX*/ + .endm + +/* + * frame that enables calling into C. + */ + .macro PARTIAL_FRAME start=1 offset=0 + XCPT_FRAME \start, (ORIG_RAX-ARGOFFSET)+\offset + CFI_REL_OFFSET rdi, (RDI-ARGOFFSET)+\offset + CFI_REL_OFFSET rsi, (RSI-ARGOFFSET)+\offset + CFI_REL_OFFSET rdx, (RDX-ARGOFFSET)+\offset + CFI_REL_OFFSET rcx, (RCX-ARGOFFSET)+\offset + CFI_REL_OFFSET rax, (RAX-ARGOFFSET)+\offset + CFI_REL_OFFSET r8, (R8-ARGOFFSET)+\offset + CFI_REL_OFFSET r9, (R9-ARGOFFSET)+\offset + CFI_REL_OFFSET r10, (R10-ARGOFFSET)+\offset + CFI_REL_OFFSET r11, (R11-ARGOFFSET)+\offset + .endm + +/* + * frame that enables passing a complete pt_regs to a C function. + */ + .macro DEFAULT_FRAME start=1 offset=0 + PARTIAL_FRAME \start, (R11-R15)+\offset + CFI_REL_OFFSET rbx, RBX+\offset + CFI_REL_OFFSET rbp, RBP+\offset + CFI_REL_OFFSET r12, R12+\offset + CFI_REL_OFFSET r13, R13+\offset + CFI_REL_OFFSET r14, R14+\offset + CFI_REL_OFFSET r15, R15+\offset + .endm /* save partial stack frame */ ENTRY(save_args) XCPT_FRAME cld - movq %rdi, 8*8+16(%rsp) - CFI_REL_OFFSET rdi, 8*8+16 - movq %rsi, 7*8+16(%rsp) - CFI_REL_OFFSET rsi, 7*8+16 - movq %rdx, 6*8+16(%rsp) - CFI_REL_OFFSET rdx, 6*8+16 - movq %rcx, 5*8+16(%rsp) - CFI_REL_OFFSET rcx, 5*8+16 - movq %rax, 4*8+16(%rsp) - CFI_REL_OFFSET rax, 4*8+16 - movq %r8, 3*8+16(%rsp) - CFI_REL_OFFSET r8, 3*8+16 - movq %r9, 2*8+16(%rsp) - CFI_REL_OFFSET r9, 2*8+16 - movq %r10, 1*8+16(%rsp) - CFI_REL_OFFSET r10, 1*8+16 - movq %r11, 0*8+16(%rsp) - CFI_REL_OFFSET r11, 0*8+16 + CFI_MOVQ rdi, (RDI-ARGOFFSET)+16 + CFI_MOVQ rsi, (RSI-ARGOFFSET)+16 + CFI_MOVQ rdx, (RDX-ARGOFFSET)+16 + CFI_MOVQ rcx, (RCX-ARGOFFSET)+16 + CFI_MOVQ rax, (RAX-ARGOFFSET)+16 + 
CFI_MOVQ r8, (R8-ARGOFFSET)+16 + CFI_MOVQ r9, (R9-ARGOFFSET)+16 + CFI_MOVQ r10, (R10-ARGOFFSET)+16 + CFI_MOVQ r11, (R11-ARGOFFSET)+16 leaq -ARGOFFSET+16(%rsp),%rdi /* arg1 for handler */ - movq %rbp, 8(%rsp) /* push %rbp */ + CFI_MOVQ rbp, 8 /* push %rbp */ leaq 8(%rsp), %rbp /* mov %rsp, %ebp */ testl $3, CS(%rdi) je 1f @@ -303,9 +320,10 @@ ENTRY(save_args) */ 1: incl %gs:pda_irqcount jne 2f - pop %rax /* move return address... */ + CFI_POPQ %rax /* move return address... */ mov %gs:pda_irqstackptr,%rsp - push %rax /* ... to the new stack */ + EMPTY_FRAME 0 + CFI_PUSHQ %rax /* ... to the new stack */ /* * We entered an interrupt context - irqs are off: */ @@ -319,7 +337,7 @@ END(save_args) */ /* rdi: prev */ ENTRY(ret_from_fork) - CFI_DEFAULT_STACK + DEFAULT_FRAME push kernel_eflags(%rip) CFI_ADJUST_CFA_OFFSET 8 popf # reset kernel eflags @@ -732,6 +750,7 @@ END(interrupt) subq $10*8, %rsp CFI_ADJUST_CFA_OFFSET 10*8 call save_args + PARTIAL_FRAME 0 call \func .endm @@ -949,11 +968,11 @@ END(spurious_interrupt) .macro zeroentry sym INTR_FRAME PARAVIRT_ADJUST_EXCEPTION_FRAME - pushq $-1 /* ORIG_RAX: no syscall to restart */ - CFI_ADJUST_CFA_OFFSET 8 + CFI_PUSHQ $-1 /* ORIG_RAX: no syscall to restart */ subq $15*8,%rsp CFI_ADJUST_CFA_OFFSET 15*8 call error_entry + DEFAULT_FRAME 0 movq %rsp,%rdi /* pt_regs pointer */ xorl %esi,%esi /* no error code */ call \sym @@ -967,6 +986,7 @@ END(spurious_interrupt) subq $15*8,%rsp CFI_ADJUST_CFA_OFFSET 15*8 call error_entry + DEFAULT_FRAME 0 movq %rsp,%rdi /* pt_regs pointer */ movq ORIG_RAX(%rsp),%rsi /* get error code */ movq $-1,ORIG_RAX(%rsp) /* no syscall to restart */ @@ -1079,40 +1099,25 @@ paranoid_schedule\trace: * returns in "no swapgs flag" in %ebx. 
*/ KPROBE_ENTRY(error_entry) - _frame RDI + XCPT_FRAME CFI_ADJUST_CFA_OFFSET 15*8 /* oldrax contains error code */ cld - movq %rdi,14*8+8(%rsp) - CFI_REL_OFFSET rdi,RDI+8 - movq %rsi,13*8+8(%rsp) - CFI_REL_OFFSET rsi,RSI+8 - movq %rdx,12*8+8(%rsp) - CFI_REL_OFFSET rdx,RDX+8 - movq %rcx,11*8+8(%rsp) - CFI_REL_OFFSET rcx,RCX+8 - movq %rax,10*8+8(%rsp) - CFI_REL_OFFSET rax,RAX+8 - movq %r8, 9*8+8(%rsp) - CFI_REL_OFFSET r8,R8+8 - movq %r9, 8*8+8(%rsp) - CFI_REL_OFFSET r9,R9+8 - movq %r10,7*8+8(%rsp) - CFI_REL_OFFSET r10,R10+8 - movq %r11,6*8+8(%rsp) - CFI_REL_OFFSET r11,R11+8 - movq %rbx,5*8+8(%rsp) - CFI_REL_OFFSET rbx,RBX+8 - movq %rbp,4*8+8(%rsp) - CFI_REL_OFFSET rbp,RBP+8 - movq %r12,3*8+8(%rsp) - CFI_REL_OFFSET r12,R12+8 - movq %r13,2*8+8(%rsp) - CFI_REL_OFFSET r13,R13+8 - movq %r14,1*8+8(%rsp) - CFI_REL_OFFSET r14,R14+8 - movq %r15,0*8+8(%rsp) - CFI_REL_OFFSET r15,R15+8 + CFI_MOVQ rdi, RDI+8 + CFI_MOVQ rsi, RSI+8 + CFI_MOVQ rdx, RDX+8 + CFI_MOVQ rcx, RCX+8 + CFI_MOVQ rax, RAX+8 + CFI_MOVQ r8, R8+8 + CFI_MOVQ r9, R9+8 + CFI_MOVQ r10, R10+8 + CFI_MOVQ r11, R11+8 + CFI_MOVQ rbx, RBX+8 + CFI_MOVQ rbp, RBP+8 + CFI_MOVQ r12, R12+8 + CFI_MOVQ r13, R13+8 + CFI_MOVQ r14, R14+8 + CFI_MOVQ r15, R15+8 xorl %ebx,%ebx testl $3,CS+8(%rsp) je error_kernelspace @@ -1146,7 +1151,7 @@ KPROBE_END(error_entry) /* ebx: no swapgs flag (1: don't need swapgs, 0: need it) */ KPROBE_ENTRY(error_exit) - _frame R15 + DEFAULT_FRAME movl %ebx,%eax RESTORE_REST DISABLE_INTERRUPTS(CLBR_NONE) @@ -1455,7 +1460,7 @@ ENTRY(xen_do_hypervisor_callback) # do_hypervisor_callback(struct *pt_regs) see the correct pointer to the pt_regs */ movq %rdi, %rsp # we don't return, adjust the stack frame CFI_ENDPROC - CFI_DEFAULT_STACK + DEFAULT_FRAME 11: incl %gs:pda_irqcount movq %rsp,%rbp CFI_DEF_CFA_REGISTER rbp @@ -1483,10 +1488,13 @@ END(do_hypervisor_callback) # with its current contents: any discrepancy means we in category 1. 
*/ ENTRY(xen_failsafe_callback) - framesz = (RIP-0x30) /* workaround buggy gas */ - _frame framesz - CFI_REL_OFFSET rcx, 0 - CFI_REL_OFFSET r11, 8 + INTR_FRAME 1 (6*8) + /*CFI_REL_OFFSET gs,GS*/ + /*CFI_REL_OFFSET fs,FS*/ + /*CFI_REL_OFFSET es,ES*/ + /*CFI_REL_OFFSET ds,DS*/ + CFI_REL_OFFSET r11,8 + CFI_REL_OFFSET rcx,0 movw %ds,%cx cmpw %cx,0x10(%rsp) CFI_REMEMBER_STATE @@ -1507,12 +1515,9 @@ ENTRY(xen_failsafe_callback) CFI_RESTORE r11 addq $0x30,%rsp CFI_ADJUST_CFA_OFFSET -0x30 - pushq $0 - CFI_ADJUST_CFA_OFFSET 8 - pushq %r11 - CFI_ADJUST_CFA_OFFSET 8 - pushq %rcx - CFI_ADJUST_CFA_OFFSET 8 + CFI_PUSHQ $0 /* RIP */ + CFI_PUSHQ %r11 + CFI_PUSHQ %rcx jmp general_protection CFI_RESTORE_STATE 1: /* Segment mismatch => Category 1 (Bad segment). Retry the IRET. */ @@ -1522,8 +1527,7 @@ ENTRY(xen_failsafe_callback) CFI_RESTORE r11 addq $0x30,%rsp CFI_ADJUST_CFA_OFFSET -0x30 - pushq $0 - CFI_ADJUST_CFA_OFFSET 8 + CFI_PUSHQ $0 SAVE_ALL jmp error_exit CFI_ENDPROC -- cgit v1.2.3 From 3ddd972d970fdabbe6515aa2f95e0ef2c8df903d Mon Sep 17 00:00:00 2001 From: Hiroshi Shimamoto Date: Thu, 20 Nov 2008 18:32:17 -0800 Subject: x86: signal: rename COPY_SEG_STRICT to COPY_SEG_CPL3 Impact: cleanup Rename macro COPY_SEG_STRICT to COPY_SEG_CPL3, as suggested by hpa. 
Signed-off-by: Hiroshi Shimamoto Signed-off-by: Ingo Molnar --- arch/x86/kernel/signal_32.c | 8 ++++---- arch/x86/kernel/signal_64.c | 8 ++++---- 2 files changed, 8 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/signal_32.c b/arch/x86/kernel/signal_32.c index 514171ac0d03..c2aabeba27a5 100644 --- a/arch/x86/kernel/signal_32.c +++ b/arch/x86/kernel/signal_32.c @@ -145,7 +145,7 @@ asmlinkage int sys_sigaltstack(unsigned long bx) regs->seg = tmp; \ } -#define COPY_SEG_STRICT(seg) { \ +#define COPY_SEG_CPL3(seg) { \ unsigned short tmp; \ err |= __get_user(tmp, &sc->seg); \ regs->seg = tmp | 3; \ @@ -193,13 +193,13 @@ restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, #endif /* CONFIG_X86_64 */ #ifdef CONFIG_X86_32 - COPY_SEG_STRICT(cs); - COPY_SEG_STRICT(ss); + COPY_SEG_CPL3(cs); + COPY_SEG_CPL3(ss); #else /* !CONFIG_X86_32 */ /* Kernel saves and restores only the CS segment register on signals, * which is the bare minimum needed to allow mixed 32/64-bit code. * App's signal handler can save/restore other segments if needed. 
*/ - COPY_SEG_STRICT(cs); + COPY_SEG_CPL3(cs); #endif /* CONFIG_X86_32 */ err |= __get_user(tmpflags, &sc->flags); diff --git a/arch/x86/kernel/signal_64.c b/arch/x86/kernel/signal_64.c index d2307e41fbdb..3d54d366ccb2 100644 --- a/arch/x86/kernel/signal_64.c +++ b/arch/x86/kernel/signal_64.c @@ -56,7 +56,7 @@ sys_sigaltstack(const stack_t __user *uss, stack_t __user *uoss, err |= __get_user(regs->x, &sc->x); \ } -#define COPY_SEG_STRICT(seg) { \ +#define COPY_SEG_CPL3(seg) { \ unsigned short tmp; \ err |= __get_user(tmp, &sc->seg); \ regs->seg = tmp | 3; \ @@ -98,13 +98,13 @@ restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, #endif /* CONFIG_X86_64 */ #ifdef CONFIG_X86_32 - COPY_SEG_STRICT(cs); - COPY_SEG_STRICT(ss); + COPY_SEG_CPL3(cs); + COPY_SEG_CPL3(ss); #else /* !CONFIG_X86_32 */ /* Kernel saves and restores only the CS segment register on signals, * which is the bare minimum needed to allow mixed 32/64-bit code. * App's signal handler can save/restore other segments if needed. 
*/ - COPY_SEG_STRICT(cs); + COPY_SEG_CPL3(cs); #endif /* CONFIG_X86_32 */ err |= __get_user(tmpflags, &sc->flags); -- cgit v1.2.3 From e8a0e27662186f8856a0a6242e7a8386c9a64a53 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 21 Nov 2008 15:11:32 +0100 Subject: x86: clean up after: move entry_64.S register saving out of the macros, fix Impact: build fix The earlier change breaks builds with older binutils (2.16.1): arch/x86/kernel/entry_64.S: Assembler messages: arch/x86/kernel/entry_64.S:282: Error: too many positional arguments arch/x86/kernel/entry_64.S:283: Error: too many positional arguments arch/x86/kernel/entry_64.S:284: Error: too many positional arguments arch/x86/kernel/entry_64.S:285: Error: too many positional arguments arch/x86/kernel/entry_64.S:286: Error: too many positional arguments arch/x86/kernel/entry_64.S:287: Error: too many positional arguments arch/x86/kernel/entry_64.S:288: Error: too many positional arguments arch/x86/kernel/entry_64.S:289: Error: too many positional arguments arch/x86/kernel/entry_64.S:290: Error: too many positional arguments Took some time to figure out the detail that GAS chokes on: it's negative offsets. Rearrange the calculations to make sure we never go negative.
Signed-off-by: Ingo Molnar --- arch/x86/kernel/entry_64.S | 54 +++++++++++++++++++++++----------------------- 1 file changed, 27 insertions(+), 27 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 7a04f696121d..4e3d83678f85 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -247,12 +247,12 @@ ENTRY(native_usergs_sysret64) * initial frame state for interrupts (and exceptions without error code) */ .macro INTR_FRAME start=1 offset=0 - EMPTY_FRAME \start, (SS+8-RIP)+\offset - /*CFI_REL_OFFSET ss, SS-RIP+\offset*/ - CFI_REL_OFFSET rsp, RSP-RIP+\offset - /*CFI_REL_OFFSET rflags, EFLAGS-RIP+\offset*/ - /*CFI_REL_OFFSET cs, CS-RIP+\offset*/ - CFI_REL_OFFSET rip, RIP-RIP+\offset + EMPTY_FRAME \start, SS+8+\offset-RIP + /*CFI_REL_OFFSET ss, SS+\offset-RIP*/ + CFI_REL_OFFSET rsp, RSP+\offset-RIP + /*CFI_REL_OFFSET rflags, EFLAGS+\offset-RIP*/ + /*CFI_REL_OFFSET cs, CS+\offset-RIP*/ + CFI_REL_OFFSET rip, RIP+\offset-RIP .endm /* @@ -260,7 +260,7 @@ ENTRY(native_usergs_sysret64) * with vector already pushed) */ .macro XCPT_FRAME start=1 offset=0 - INTR_FRAME \start, (RIP-ORIG_RAX)+\offset + INTR_FRAME \start, RIP+\offset-ORIG_RAX /*CFI_REL_OFFSET orig_rax, ORIG_RAX-ORIG_RAX*/ .endm @@ -268,23 +268,23 @@ ENTRY(native_usergs_sysret64) * frame that enables calling into C. 
*/ .macro PARTIAL_FRAME start=1 offset=0 - XCPT_FRAME \start, (ORIG_RAX-ARGOFFSET)+\offset - CFI_REL_OFFSET rdi, (RDI-ARGOFFSET)+\offset - CFI_REL_OFFSET rsi, (RSI-ARGOFFSET)+\offset - CFI_REL_OFFSET rdx, (RDX-ARGOFFSET)+\offset - CFI_REL_OFFSET rcx, (RCX-ARGOFFSET)+\offset - CFI_REL_OFFSET rax, (RAX-ARGOFFSET)+\offset - CFI_REL_OFFSET r8, (R8-ARGOFFSET)+\offset - CFI_REL_OFFSET r9, (R9-ARGOFFSET)+\offset - CFI_REL_OFFSET r10, (R10-ARGOFFSET)+\offset - CFI_REL_OFFSET r11, (R11-ARGOFFSET)+\offset + XCPT_FRAME \start, ORIG_RAX+\offset-ARGOFFSET + CFI_REL_OFFSET rdi, RDI+\offset-ARGOFFSET + CFI_REL_OFFSET rsi, RSI+\offset-ARGOFFSET + CFI_REL_OFFSET rdx, RDX+\offset-ARGOFFSET + CFI_REL_OFFSET rcx, RCX+\offset-ARGOFFSET + CFI_REL_OFFSET rax, RAX+\offset-ARGOFFSET + CFI_REL_OFFSET r8, R8+\offset-ARGOFFSET + CFI_REL_OFFSET r9, R9+\offset-ARGOFFSET + CFI_REL_OFFSET r10, R10+\offset-ARGOFFSET + CFI_REL_OFFSET r11, R11+\offset-ARGOFFSET .endm /* * frame that enables passing a complete pt_regs to a C function. 
*/ .macro DEFAULT_FRAME start=1 offset=0 - PARTIAL_FRAME \start, (R11-R15)+\offset + PARTIAL_FRAME \start, R11+\offset-R15 CFI_REL_OFFSET rbx, RBX+\offset CFI_REL_OFFSET rbp, RBP+\offset CFI_REL_OFFSET r12, R12+\offset @@ -297,15 +297,15 @@ ENTRY(native_usergs_sysret64) ENTRY(save_args) XCPT_FRAME cld - CFI_MOVQ rdi, (RDI-ARGOFFSET)+16 - CFI_MOVQ rsi, (RSI-ARGOFFSET)+16 - CFI_MOVQ rdx, (RDX-ARGOFFSET)+16 - CFI_MOVQ rcx, (RCX-ARGOFFSET)+16 - CFI_MOVQ rax, (RAX-ARGOFFSET)+16 - CFI_MOVQ r8, (R8-ARGOFFSET)+16 - CFI_MOVQ r9, (R9-ARGOFFSET)+16 - CFI_MOVQ r10, (R10-ARGOFFSET)+16 - CFI_MOVQ r11, (R11-ARGOFFSET)+16 + CFI_MOVQ rdi, RDI+16-ARGOFFSET + CFI_MOVQ rsi, RSI+16-ARGOFFSET + CFI_MOVQ rdx, RDX+16-ARGOFFSET + CFI_MOVQ rcx, RCX+16-ARGOFFSET + CFI_MOVQ rax, RAX+16-ARGOFFSET + CFI_MOVQ r8, R8+16-ARGOFFSET + CFI_MOVQ r9, R9+16-ARGOFFSET + CFI_MOVQ r10, R10+16-ARGOFFSET + CFI_MOVQ r11, R11+16-ARGOFFSET leaq -ARGOFFSET+16(%rsp),%rdi /* arg1 for handler */ CFI_MOVQ rbp, 8 /* push %rbp */ leaq 8(%rsp), %rbp /* mov %rsp, %ebp */ -- cgit v1.2.3 From 14ae22ba2b8bb3d53fb795f9b8074aa39ef7b6cd Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 21 Nov 2008 15:20:47 +0100 Subject: x86: entry_64.S: rename Impact: cleanup Rename: CFI_PUSHQ => pushq_cfi CFI_POPQ => popq_cfi CFI_MOVQ => movq_cfi To make it blend better into regular assembly code. Signed-off-by: Ingo Molnar --- arch/x86/kernel/entry_64.S | 71 +++++++++++++++++++++++----------------------- 1 file changed, 36 insertions(+), 35 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 4e3d83678f85..92c5e18340db 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -63,17 +63,17 @@ /* * Some macro's to hide the most frequently occuring CFI annotations. 
*/ - .macro CFI_PUSHQ reg + .macro pushq_cfi reg pushq \reg CFI_ADJUST_CFA_OFFSET 8 .endm - .macro CFI_POPQ reg + .macro popq_cfi reg popq \reg CFI_ADJUST_CFA_OFFSET -8 .endm - .macro CFI_MOVQ reg offset=0 + .macro movq_cfi reg offset=0 movq %\reg, \offset(%rsp) CFI_REL_OFFSET \reg, \offset .endm @@ -297,17 +297,18 @@ ENTRY(native_usergs_sysret64) ENTRY(save_args) XCPT_FRAME cld - CFI_MOVQ rdi, RDI+16-ARGOFFSET - CFI_MOVQ rsi, RSI+16-ARGOFFSET - CFI_MOVQ rdx, RDX+16-ARGOFFSET - CFI_MOVQ rcx, RCX+16-ARGOFFSET - CFI_MOVQ rax, RAX+16-ARGOFFSET - CFI_MOVQ r8, R8+16-ARGOFFSET - CFI_MOVQ r9, R9+16-ARGOFFSET - CFI_MOVQ r10, R10+16-ARGOFFSET - CFI_MOVQ r11, R11+16-ARGOFFSET + movq_cfi rdi, RDI+16-ARGOFFSET + movq_cfi rsi, RSI+16-ARGOFFSET + movq_cfi rdx, RDX+16-ARGOFFSET + movq_cfi rcx, RCX+16-ARGOFFSET + movq_cfi rax, RAX+16-ARGOFFSET + movq_cfi r8, R8+16-ARGOFFSET + movq_cfi r9, R9+16-ARGOFFSET + movq_cfi r10, R10+16-ARGOFFSET + movq_cfi r11, R11+16-ARGOFFSET + leaq -ARGOFFSET+16(%rsp),%rdi /* arg1 for handler */ - CFI_MOVQ rbp, 8 /* push %rbp */ + movq_cfi rbp, 8 /* push %rbp */ leaq 8(%rsp), %rbp /* mov %rsp, %ebp */ testl $3, CS(%rdi) je 1f @@ -320,10 +321,10 @@ ENTRY(save_args) */ 1: incl %gs:pda_irqcount jne 2f - CFI_POPQ %rax /* move return address... */ + popq_cfi %rax /* move return address... */ mov %gs:pda_irqstackptr,%rsp EMPTY_FRAME 0 - CFI_PUSHQ %rax /* ... to the new stack */ + pushq_cfi %rax /* ... 
to the new stack */ /* * We entered an interrupt context - irqs are off: */ @@ -968,7 +969,7 @@ END(spurious_interrupt) .macro zeroentry sym INTR_FRAME PARAVIRT_ADJUST_EXCEPTION_FRAME - CFI_PUSHQ $-1 /* ORIG_RAX: no syscall to restart */ + pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */ subq $15*8,%rsp CFI_ADJUST_CFA_OFFSET 15*8 call error_entry @@ -1103,21 +1104,21 @@ KPROBE_ENTRY(error_entry) CFI_ADJUST_CFA_OFFSET 15*8 /* oldrax contains error code */ cld - CFI_MOVQ rdi, RDI+8 - CFI_MOVQ rsi, RSI+8 - CFI_MOVQ rdx, RDX+8 - CFI_MOVQ rcx, RCX+8 - CFI_MOVQ rax, RAX+8 - CFI_MOVQ r8, R8+8 - CFI_MOVQ r9, R9+8 - CFI_MOVQ r10, R10+8 - CFI_MOVQ r11, R11+8 - CFI_MOVQ rbx, RBX+8 - CFI_MOVQ rbp, RBP+8 - CFI_MOVQ r12, R12+8 - CFI_MOVQ r13, R13+8 - CFI_MOVQ r14, R14+8 - CFI_MOVQ r15, R15+8 + movq_cfi rdi, RDI+8 + movq_cfi rsi, RSI+8 + movq_cfi rdx, RDX+8 + movq_cfi rcx, RCX+8 + movq_cfi rax, RAX+8 + movq_cfi r8, R8+8 + movq_cfi r9, R9+8 + movq_cfi r10, R10+8 + movq_cfi r11, R11+8 + movq_cfi rbx, RBX+8 + movq_cfi rbp, RBP+8 + movq_cfi r12, R12+8 + movq_cfi r13, R13+8 + movq_cfi r14, R14+8 + movq_cfi r15, R15+8 xorl %ebx,%ebx testl $3,CS+8(%rsp) je error_kernelspace @@ -1515,9 +1516,9 @@ ENTRY(xen_failsafe_callback) CFI_RESTORE r11 addq $0x30,%rsp CFI_ADJUST_CFA_OFFSET -0x30 - CFI_PUSHQ $0 /* RIP */ - CFI_PUSHQ %r11 - CFI_PUSHQ %rcx + pushq_cfi $0 /* RIP */ + pushq_cfi %r11 + pushq_cfi %rcx jmp general_protection CFI_RESTORE_STATE 1: /* Segment mismatch => Category 1 (Bad segment). Retry the IRET. 
*/ @@ -1527,7 +1528,7 @@ ENTRY(xen_failsafe_callback) CFI_RESTORE r11 addq $0x30,%rsp CFI_ADJUST_CFA_OFFSET -0x30 - CFI_PUSHQ $0 + pushq_cfi $0 SAVE_ALL jmp error_exit CFI_ENDPROC -- cgit v1.2.3 From c002a1e6b6b6f07ae04e68987054bf1f2150ae48 Mon Sep 17 00:00:00 2001 From: Alexander van Heukelum Date: Fri, 21 Nov 2008 16:41:55 +0100 Subject: x86: introduce save_rest and restructure the PTREGSCALL macro in entry_64.S Impact: cleanup The save_rest function completes a partial stack frame for use by the PTREGSCALL macro. This also avoids the indirect call in PTREGSCALLs. This adds the macro movq_cfi_restore to hide the CFI_RESTORE annotation when restoring a register from the stack frame. Signed-off-by: Alexander van Heukelum Signed-off-by: Ingo Molnar --- arch/x86/kernel/entry_64.S | 91 +++++++++++++++++++++++++++------------------- 1 file changed, 53 insertions(+), 38 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 92c5e18340db..ef95c45b9269 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -78,6 +78,11 @@ CFI_REL_OFFSET \reg, \offset .endm + .macro movq_cfi_restore offset reg + movq \offset(%rsp), %\reg + CFI_RESTORE \reg + .endm + #ifdef CONFIG_FUNCTION_TRACER #ifdef CONFIG_DYNAMIC_FTRACE ENTRY(mcount) @@ -186,21 +191,21 @@ ENTRY(native_usergs_sysret64) */ /* %rsp:at FRAMEEND */ - .macro FIXUP_TOP_OF_STACK tmp - movq %gs:pda_oldrsp,\tmp - movq \tmp,RSP(%rsp) - movq $__USER_DS,SS(%rsp) - movq $__USER_CS,CS(%rsp) - movq $-1,RCX(%rsp) - movq R11(%rsp),\tmp /* get eflags */ - movq \tmp,EFLAGS(%rsp) + .macro FIXUP_TOP_OF_STACK tmp offset=0 + movq %gs:pda_oldrsp,\tmp + movq \tmp,RSP+\offset(%rsp) + movq $__USER_DS,SS+\offset(%rsp) + movq $__USER_CS,CS+\offset(%rsp) + movq $-1,RCX+\offset(%rsp) + movq R11+\offset(%rsp),\tmp /* get eflags */ + movq \tmp,EFLAGS+\offset(%rsp) .endm - .macro RESTORE_TOP_OF_STACK tmp,offset=0 - movq RSP-\offset(%rsp),\tmp - movq \tmp,%gs:pda_oldrsp - movq 
EFLAGS-\offset(%rsp),\tmp - movq \tmp,R11-\offset(%rsp) + .macro RESTORE_TOP_OF_STACK tmp offset=0 + movq RSP+\offset(%rsp),\tmp + movq \tmp,%gs:pda_oldrsp + movq EFLAGS+\offset(%rsp),\tmp + movq \tmp,R11+\offset(%rsp) .endm .macro FAKE_STACK_FRAME child_rip @@ -333,6 +338,21 @@ ENTRY(save_args) CFI_ENDPROC END(save_args) +ENTRY(save_rest) + PARTIAL_FRAME 1 REST_SKIP+8 + movq 5*8+16(%rsp), %r11 /* save return address */ + movq_cfi rbx, RBX+16 + movq_cfi rbp, RBP+16 + movq_cfi r12, R12+16 + movq_cfi r13, R13+16 + movq_cfi r14, R14+16 + movq_cfi r15, R15+16 + movq %r11, 8(%rsp) /* return address */ + FIXUP_TOP_OF_STACK %r11, 16 + ret + CFI_ENDPROC +END(save_rest) + /* * A newly forked process directly context switches into this. */ @@ -353,7 +373,7 @@ rff_action: je int_ret_from_sys_call testl $_TIF_IA32,TI_flags(%rcx) jnz int_ret_from_sys_call - RESTORE_TOP_OF_STACK %rdi,ARGOFFSET + RESTORE_TOP_OF_STACK %rdi, -ARGOFFSET jmp ret_from_sys_call rff_trace: movq %rsp,%rdi @@ -626,18 +646,20 @@ END(system_call) /* * Certain special system calls that need to save a complete full stack frame. 
*/ - .macro PTREGSCALL label,func,arg - .globl \label -\label: - leaq \func(%rip),%rax - leaq -ARGOFFSET+8(%rsp),\arg /* 8 for return address */ - jmp ptregscall_common +ENTRY(\label) + PARTIAL_FRAME 1 8 /* offset 8: return address */ + subq $REST_SKIP, %rsp + CFI_ADJUST_CFA_OFFSET REST_SKIP + call save_rest + DEFAULT_FRAME 0 8 /* offset 8: return address */ + leaq 8(%rsp), \arg /* pt_regs pointer */ + call \func + jmp ptregscall_common + CFI_ENDPROC END(\label) .endm - CFI_STARTPROC - PTREGSCALL stub_clone, sys_clone, %r8 PTREGSCALL stub_fork, sys_fork, %rdi PTREGSCALL stub_vfork, sys_vfork, %rdi @@ -645,22 +667,15 @@ END(\label) PTREGSCALL stub_iopl, sys_iopl, %rsi ENTRY(ptregscall_common) - popq %r11 - CFI_ADJUST_CFA_OFFSET -8 - CFI_REGISTER rip, r11 - SAVE_REST - movq %r11, %r15 - CFI_REGISTER rip, r15 - FIXUP_TOP_OF_STACK %r11 - call *%rax - RESTORE_TOP_OF_STACK %r11 - movq %r15, %r11 - CFI_REGISTER rip, r11 - RESTORE_REST - pushq %r11 - CFI_ADJUST_CFA_OFFSET 8 - CFI_REL_OFFSET rip, 0 - ret + DEFAULT_FRAME 1 8 /* offset 8: return address */ + RESTORE_TOP_OF_STACK %r11, 8 + movq_cfi_restore R15+8, r15 + movq_cfi_restore R14+8, r14 + movq_cfi_restore R13+8, r13 + movq_cfi_restore R12+8, r12 + movq_cfi_restore RBP+8, rbp + movq_cfi_restore RBX+8, rbx + ret $REST_SKIP /* pop extended registers */ CFI_ENDPROC END(ptregscall_common) -- cgit v1.2.3 From e2f6bc25b98dbb10d809ee50262b43fcae67840a Mon Sep 17 00:00:00 2001 From: Alexander van Heukelum Date: Fri, 21 Nov 2008 16:43:18 +0100 Subject: x86: entry_64.S: factor out save_paranoid and paranoid_exit Impact: cleanup, shrink kernel image size Also expand the paranoid_exit0 macro into nmi_exit inside the nmi stub in the case of enabled irq-tracing. This gives a few hundred bytes code size reduction. 
Signed-off-by: Alexander van Heukelum Signed-off-by: Ingo Molnar --- arch/x86/kernel/entry_64.S | 151 ++++++++++++++++++++++++++++++--------------- 1 file changed, 102 insertions(+), 49 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index ef95c45b9269..fad777b11366 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -353,6 +353,36 @@ ENTRY(save_rest) CFI_ENDPROC END(save_rest) +/* save complete stack frame */ +ENTRY(save_paranoid) + XCPT_FRAME 1 RDI+8 + cld + movq_cfi rdi, RDI+8 + movq_cfi rsi, RSI+8 + movq_cfi rdx, RDX+8 + movq_cfi rcx, RCX+8 + movq_cfi rax, RAX+8 + movq_cfi r8, R8+8 + movq_cfi r9, R9+8 + movq_cfi r10, R10+8 + movq_cfi r11, R11+8 + movq_cfi rbx, RBX+8 + movq_cfi rbp, RBP+8 + movq_cfi r12, R12+8 + movq_cfi r13, R13+8 + movq_cfi r14, R14+8 + movq_cfi r15, R15+8 + movl $1,%ebx + movl $MSR_GS_BASE,%ecx + rdmsr + testl %edx,%edx + js 1f /* negative -> in kernel */ + SWAPGS + xorl %ebx,%ebx +1: ret + CFI_ENDPROC +END(save_paranoid) + /* * A newly forked process directly context switches into this. */ @@ -1012,24 +1042,15 @@ END(spurious_interrupt) .endm /* error code is on the stack already */ - /* handle NMI like exceptions that can happen everywhere */ - .macro paranoidentry sym, ist=0, irqtrace=1 - SAVE_ALL - cld - movl $1,%ebx - movl $MSR_GS_BASE,%ecx - rdmsr - testl %edx,%edx - js 1f - SWAPGS - xorl %ebx,%ebx -1: + .macro paranoidentry sym ist=0 + subq $15*8, %rsp + CFI_ADJUST_CFA_OFFSET 15*8 + call save_paranoid + DEFAULT_FRAME 0 .if \ist movq %gs:pda_data_offset, %rbp .endif - .if \irqtrace TRACE_IRQS_OFF - .endif movq %rsp,%rdi movq ORIG_RAX(%rsp),%rsi movq $-1,ORIG_RAX(%rsp) @@ -1041,9 +1062,7 @@ END(spurious_interrupt) addq $EXCEPTION_STKSZ, per_cpu__init_tss + TSS_ist + (\ist - 1) * 8(%rbp) .endif DISABLE_INTERRUPTS(CLBR_NONE) - .if \irqtrace TRACE_IRQS_OFF - .endif .endm /* @@ -1058,57 +1077,48 @@ END(spurious_interrupt) * is fundamentally NMI-unsafe. 
(we cannot change the soft and * hard flags at once, atomically) */ - .macro paranoidexit trace=1 + /* ebx: no swapgs flag */ -paranoid_exit\trace: +KPROBE_ENTRY(paranoid_exit) + INTR_FRAME testl %ebx,%ebx /* swapgs needed? */ - jnz paranoid_restore\trace + jnz paranoid_restore testl $3,CS(%rsp) - jnz paranoid_userspace\trace -paranoid_swapgs\trace: - .if \trace + jnz paranoid_userspace +paranoid_swapgs: TRACE_IRQS_IRETQ 0 - .endif SWAPGS_UNSAFE_STACK -paranoid_restore\trace: +paranoid_restore: RESTORE_ALL 8 jmp irq_return -paranoid_userspace\trace: +paranoid_userspace: GET_THREAD_INFO(%rcx) movl TI_flags(%rcx),%ebx andl $_TIF_WORK_MASK,%ebx - jz paranoid_swapgs\trace + jz paranoid_swapgs movq %rsp,%rdi /* &pt_regs */ call sync_regs movq %rax,%rsp /* switch stack for scheduling */ testl $_TIF_NEED_RESCHED,%ebx - jnz paranoid_schedule\trace + jnz paranoid_schedule movl %ebx,%edx /* arg3: thread flags */ - .if \trace TRACE_IRQS_ON - .endif ENABLE_INTERRUPTS(CLBR_NONE) xorl %esi,%esi /* arg2: oldset */ movq %rsp,%rdi /* arg1: &pt_regs */ call do_notify_resume DISABLE_INTERRUPTS(CLBR_NONE) - .if \trace TRACE_IRQS_OFF - .endif - jmp paranoid_userspace\trace -paranoid_schedule\trace: - .if \trace + jmp paranoid_userspace +paranoid_schedule: TRACE_IRQS_ON - .endif ENABLE_INTERRUPTS(CLBR_ANY) call schedule DISABLE_INTERRUPTS(CLBR_ANY) - .if \trace TRACE_IRQS_OFF - .endif - jmp paranoid_userspace\trace + jmp paranoid_userspace CFI_ENDPROC - .endm +END(paranoid_exit) /* * Exception entry point. This expects an error code/orig_rax on the stack. 
@@ -1326,20 +1336,63 @@ KPROBE_ENTRY(debug) pushq $0 CFI_ADJUST_CFA_OFFSET 8 paranoidentry do_debug, DEBUG_STACK - paranoidexit + jmp paranoid_exit + CFI_ENDPROC KPROBE_END(debug) /* runs on exception stack */ KPROBE_ENTRY(nmi) INTR_FRAME PARAVIRT_ADJUST_EXCEPTION_FRAME - pushq $-1 - CFI_ADJUST_CFA_OFFSET 8 - paranoidentry do_nmi, 0, 0 + pushq_cfi $-1 + subq $15*8, %rsp + CFI_ADJUST_CFA_OFFSET 15*8 + call save_paranoid + DEFAULT_FRAME 0 + /* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */ + movq %rsp,%rdi + movq ORIG_RAX(%rsp),%rsi + movq $-1,ORIG_RAX(%rsp) + call do_nmi + DISABLE_INTERRUPTS(CLBR_NONE) #ifdef CONFIG_TRACE_IRQFLAGS - paranoidexit 0 + /* paranoidexit; without TRACE_IRQS_OFF */ + /* ebx: no swapgs flag */ +nmi_exit: + testl %ebx,%ebx /* swapgs needed? */ + jnz nmi_restore + testl $3,CS(%rsp) + jnz nmi_userspace +nmi_swapgs: + SWAPGS_UNSAFE_STACK +nmi_restore: + RESTORE_ALL 8 + jmp irq_return +nmi_userspace: + GET_THREAD_INFO(%rcx) + movl TI_flags(%rcx),%ebx + andl $_TIF_WORK_MASK,%ebx + jz nmi_swapgs + movq %rsp,%rdi /* &pt_regs */ + call sync_regs + movq %rax,%rsp /* switch stack for scheduling */ + testl $_TIF_NEED_RESCHED,%ebx + jnz nmi_schedule + movl %ebx,%edx /* arg3: thread flags */ + ENABLE_INTERRUPTS(CLBR_NONE) + xorl %esi,%esi /* arg2: oldset */ + movq %rsp,%rdi /* arg1: &pt_regs */ + call do_notify_resume + DISABLE_INTERRUPTS(CLBR_NONE) + jmp nmi_userspace +nmi_schedule: + ENABLE_INTERRUPTS(CLBR_ANY) + call schedule + DISABLE_INTERRUPTS(CLBR_ANY) + jmp nmi_userspace + CFI_ENDPROC #else - jmp paranoid_exit1 + jmp paranoid_exit CFI_ENDPROC #endif KPROBE_END(nmi) @@ -1350,7 +1403,7 @@ KPROBE_ENTRY(int3) pushq $0 CFI_ADJUST_CFA_OFFSET 8 paranoidentry do_int3, DEBUG_STACK - jmp paranoid_exit1 + jmp paranoid_exit CFI_ENDPROC KPROBE_END(int3) @@ -1375,7 +1428,7 @@ ENTRY(double_fault) XCPT_FRAME PARAVIRT_ADJUST_EXCEPTION_FRAME paranoidentry do_double_fault - jmp paranoid_exit1 + jmp paranoid_exit CFI_ENDPROC END(double_fault) @@ -1392,7 +1445,7 
@@ ENTRY(stack_segment) XCPT_FRAME PARAVIRT_ADJUST_EXCEPTION_FRAME paranoidentry do_stack_segment - jmp paranoid_exit1 + jmp paranoid_exit CFI_ENDPROC END(stack_segment) @@ -1420,7 +1473,7 @@ ENTRY(machine_check) pushq $0 CFI_ADJUST_CFA_OFFSET 8 paranoidentry do_machine_check - jmp paranoid_exit1 + jmp paranoid_exit CFI_ENDPROC END(machine_check) #endif -- cgit v1.2.3 From b8b1d08bf6fe7c09e6cb2294bc0e5e964b361241 Mon Sep 17 00:00:00 2001 From: Alexander van Heukelum Date: Fri, 21 Nov 2008 16:44:28 +0100 Subject: x86: entry_64.S: split out some macro's and move common code to paranoid_exit Impact: cleanup DISABLE_INTERRUPTS(CLBR_NONE)/TRACE_IRQS_OFF is now always executed just before paranoid_exit. Move it there. Split out paranoidzeroentry, paranoiderrorentry, and paranoidzeroentry_ist to get more readable macro's. Signed-off-by: Alexander van Heukelum Signed-off-by: Ingo Molnar --- arch/x86/kernel/entry_64.S | 102 ++++++++++++++++++++++----------------------- 1 file changed, 51 insertions(+), 51 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index fad777b11366..692c1da61905 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -1026,6 +1026,39 @@ END(spurious_interrupt) CFI_ENDPROC .endm + .macro paranoidzeroentry sym + INTR_FRAME + PARAVIRT_ADJUST_EXCEPTION_FRAME + pushq $-1 /* ORIG_RAX: no syscall to restart */ + CFI_ADJUST_CFA_OFFSET 8 + subq $15*8, %rsp + call save_paranoid + TRACE_IRQS_OFF + movq %rsp,%rdi /* pt_regs pointer */ + xorl %esi,%esi /* no error code */ + call \sym + jmp paranoid_exit /* %ebx: no swapgs flag */ + CFI_ENDPROC + .endm + + .macro paranoidzeroentry_ist sym ist + INTR_FRAME + PARAVIRT_ADJUST_EXCEPTION_FRAME + pushq $-1 /* ORIG_RAX: no syscall to restart */ + CFI_ADJUST_CFA_OFFSET 8 + subq $15*8, %rsp + call save_paranoid + TRACE_IRQS_OFF + movq %rsp,%rdi /* pt_regs pointer */ + xorl %esi,%esi /* no error code */ + movq %gs:pda_data_offset, %rbp + subq 
$EXCEPTION_STKSZ, per_cpu__init_tss + TSS_ist + (\ist - 1) * 8(%rbp) + call \sym + addq $EXCEPTION_STKSZ, per_cpu__init_tss + TSS_ist + (\ist - 1) * 8(%rbp) + jmp paranoid_exit /* %ebx: no swapgs flag */ + CFI_ENDPROC + .endm + .macro errorentry sym XCPT_FRAME PARAVIRT_ADJUST_EXCEPTION_FRAME @@ -1042,27 +1075,20 @@ END(spurious_interrupt) .endm /* error code is on the stack already */ - .macro paranoidentry sym ist=0 - subq $15*8, %rsp + .macro paranoiderrorentry sym + XCPT_FRAME + PARAVIRT_ADJUST_EXCEPTION_FRAME + subq $15*8,%rsp CFI_ADJUST_CFA_OFFSET 15*8 call save_paranoid DEFAULT_FRAME 0 - .if \ist - movq %gs:pda_data_offset, %rbp - .endif TRACE_IRQS_OFF - movq %rsp,%rdi - movq ORIG_RAX(%rsp),%rsi - movq $-1,ORIG_RAX(%rsp) - .if \ist - subq $EXCEPTION_STKSZ, per_cpu__init_tss + TSS_ist + (\ist - 1) * 8(%rbp) - .endif + movq %rsp,%rdi /* pt_regs pointer */ + movq ORIG_RAX(%rsp),%rsi /* get error code */ + movq $-1,ORIG_RAX(%rsp) /* no syscall to restart */ call \sym - .if \ist - addq $EXCEPTION_STKSZ, per_cpu__init_tss + TSS_ist + (\ist - 1) * 8(%rbp) - .endif - DISABLE_INTERRUPTS(CLBR_NONE) - TRACE_IRQS_OFF + jmp paranoid_exit /* %ebx: no swapgs flag */ + CFI_ENDPROC .endm /* @@ -1081,6 +1107,8 @@ END(spurious_interrupt) /* ebx: no swapgs flag */ KPROBE_ENTRY(paranoid_exit) INTR_FRAME + DISABLE_INTERRUPTS(CLBR_NONE) + TRACE_IRQS_OFF testl %ebx,%ebx /* swapgs needed? 
*/ jnz paranoid_restore testl $3,CS(%rsp) @@ -1331,13 +1359,7 @@ END(device_not_available) /* runs on exception stack */ KPROBE_ENTRY(debug) - INTR_FRAME - PARAVIRT_ADJUST_EXCEPTION_FRAME - pushq $0 - CFI_ADJUST_CFA_OFFSET 8 - paranoidentry do_debug, DEBUG_STACK - jmp paranoid_exit - CFI_ENDPROC + paranoidzeroentry_ist do_debug, DEBUG_STACK KPROBE_END(debug) /* runs on exception stack */ @@ -1351,14 +1373,12 @@ KPROBE_ENTRY(nmi) DEFAULT_FRAME 0 /* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */ movq %rsp,%rdi - movq ORIG_RAX(%rsp),%rsi - movq $-1,ORIG_RAX(%rsp) + movq $-1,%rsi call do_nmi - DISABLE_INTERRUPTS(CLBR_NONE) #ifdef CONFIG_TRACE_IRQFLAGS /* paranoidexit; without TRACE_IRQS_OFF */ /* ebx: no swapgs flag */ -nmi_exit: + DISABLE_INTERRUPTS(CLBR_NONE) testl %ebx,%ebx /* swapgs needed? */ jnz nmi_restore testl $3,CS(%rsp) @@ -1398,13 +1418,7 @@ nmi_schedule: KPROBE_END(nmi) KPROBE_ENTRY(int3) - INTR_FRAME - PARAVIRT_ADJUST_EXCEPTION_FRAME - pushq $0 - CFI_ADJUST_CFA_OFFSET 8 - paranoidentry do_int3, DEBUG_STACK - jmp paranoid_exit - CFI_ENDPROC + paranoidzeroentry_ist do_int3, DEBUG_STACK KPROBE_END(int3) ENTRY(overflow) @@ -1425,11 +1439,7 @@ END(coprocessor_segment_overrun) /* runs on exception stack */ ENTRY(double_fault) - XCPT_FRAME - PARAVIRT_ADJUST_EXCEPTION_FRAME - paranoidentry do_double_fault - jmp paranoid_exit - CFI_ENDPROC + paranoiderrorentry do_double_fault END(double_fault) ENTRY(invalid_TSS) @@ -1442,11 +1452,7 @@ END(segment_not_present) /* runs on exception stack */ ENTRY(stack_segment) - XCPT_FRAME - PARAVIRT_ADJUST_EXCEPTION_FRAME - paranoidentry do_stack_segment - jmp paranoid_exit - CFI_ENDPROC + paranoiderrorentry do_stack_segment END(stack_segment) KPROBE_ENTRY(general_protection) @@ -1468,13 +1474,7 @@ END(spurious_interrupt_bug) #ifdef CONFIG_X86_MCE /* runs on exception stack */ ENTRY(machine_check) - INTR_FRAME - PARAVIRT_ADJUST_EXCEPTION_FRAME - pushq $0 - CFI_ADJUST_CFA_OFFSET 8 - paranoidentry do_machine_check - jmp 
paranoid_exit - CFI_ENDPROC + paranoidzeroentry do_machine_check END(machine_check) #endif -- cgit v1.2.3 From c81084114f6ff957bc6b5a0048350479c1c1f7b3 Mon Sep 17 00:00:00 2001 From: Alexander van Heukelum Date: Fri, 21 Nov 2008 22:59:52 +0100 Subject: x86: split out some macro's and move common code to paranoid_exit, fix Impact: fix bootup crash Even though it tested fine for me, there was still a bug in the first patch: I have overlooked a call to ptregscall_common. This patch fixes that, I think, but the code is never executed for me while running a debian install... (I tested this by putting an "1:jmp 1b" in there.) Signed-off-by: Ingo Molnar --- arch/x86/kernel/entry_64.S | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 692c1da61905..e5ddf573ded2 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -529,10 +529,13 @@ sysret_signal: jc sysret_audit #endif /* edx: work flags (arg3) */ - leaq do_notify_resume(%rip),%rax leaq -ARGOFFSET(%rsp),%rdi # &pt_regs -> arg1 xorl %esi,%esi # oldset -> arg2 - call ptregscall_common + SAVE_REST + FIXUP_TOP_OF_STACK %r11 + call do_notify_resume + RESTORE_TOP_OF_STACK %r11 + RESTORE_REST movl $_TIF_WORK_MASK,%edi /* Use IRET because user could have changed frame. This works because ptregscall_common has called FIXUP_TOP_OF_STACK. */ -- cgit v1.2.3 From 3aeb95d5b7839708a8d8e11aa274ee4d0d4042cc Mon Sep 17 00:00:00 2001 From: jia zhang Date: Sun, 23 Nov 2008 09:51:41 +0800 Subject: x86_64: fix the check in stack_overflow_check Impact: make stack overflow debug check and printout narrower stack_overflow_check() should consider the stack usage of pt_regs, and thus it could warn us in advance. Additionally, it looks better for the warning time to start at INITIAL_JIFFIES. 
Assuming that rsp gets close to the check point before an interrupt arrives: when the interrupt really happens, thread_info will be partly overwritten. Signed-off-by: jia zhang Signed-off-by: Ingo Molnar --- arch/x86/kernel/irq_64.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c index 60eb84eb77a0..b842fc82be15 100644 --- a/arch/x86/kernel/irq_64.c +++ b/arch/x86/kernel/irq_64.c @@ -29,11 +29,12 @@ static inline void stack_overflow_check(struct pt_regs *regs) { u64 curbase = (u64)task_stack_page(current); - static unsigned long warned = -60*HZ; + static unsigned long warned = INITIAL_JIFFIES - 60*HZ; if (regs->sp >= curbase && regs->sp <= curbase + THREAD_SIZE && - regs->sp < curbase + sizeof(struct thread_info) + 128 && - time_after(jiffies, warned + 60*HZ)) { + regs->sp < curbase + sizeof(struct thread_info) + + sizeof(struct pt_regs) + 128 && + time_after(jiffies, warned + 60*HZ)) { printk("do_IRQ: %s near stack overflow (cur:%Lx,sp:%lx)\n", current->comm, curbase, regs->sp); show_stack(NULL,NULL); -- cgit v1.2.3 From f377fa123d0ec621e8e361ecc3f2a8ee70e81a2e Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 23 Nov 2008 09:02:26 +0100 Subject: x86: clean up stack overflow debug check Impact: cleanup Simplify the irq-sampled stack overflow debug check: - eliminate an #ifdef - use WARN_ONCE() to emit a single warning (all bets are off after the first such warning anyway) Signed-off-by: Ingo Molnar --- arch/x86/kernel/irq_64.c | 25 ++++++++++--------------- 1 file changed, 10 insertions(+), 15 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c index b842fc82be15..1d3d0e71b044 100644 --- a/arch/x86/kernel/irq_64.c +++ b/arch/x86/kernel/irq_64.c @@ -18,7 +18,6 @@ #include #include -#ifdef CONFIG_DEBUG_STACKOVERFLOW /* * Probabilistic stack overflow check: * @@ -28,20 +27,18 @@ */ static inline void
stack_overflow_check(struct pt_regs *regs) { +#ifdef CONFIG_DEBUG_STACKOVERFLOW u64 curbase = (u64)task_stack_page(current); - static unsigned long warned = INITIAL_JIFFIES - 60*HZ; - - if (regs->sp >= curbase && regs->sp <= curbase + THREAD_SIZE && - regs->sp < curbase + sizeof(struct thread_info) + - sizeof(struct pt_regs) + 128 && - time_after(jiffies, warned + 60*HZ)) { - printk("do_IRQ: %s near stack overflow (cur:%Lx,sp:%lx)\n", - current->comm, curbase, regs->sp); - show_stack(NULL,NULL); - warned = jiffies; - } -} + + WARN_ONCE(regs->sp >= curbase && + regs->sp <= curbase + THREAD_SIZE && + regs->sp < curbase + sizeof(struct thread_info) + + sizeof(struct pt_regs) + 128, + + "do_IRQ: %s near stack overflow (cur:%Lx,sp:%lx)\n", + current->comm, curbase, regs->sp); #endif +} /* * do_IRQ handles all normal device IRQ's (the special @@ -61,9 +58,7 @@ asmlinkage unsigned int do_IRQ(struct pt_regs *regs) irq_enter(); irq = __get_cpu_var(vector_irq)[vector]; -#ifdef CONFIG_DEBUG_STACKOVERFLOW stack_overflow_check(regs); -#endif desc = irq_to_desc(irq); if (likely(desc)) -- cgit v1.2.3 From 5c9b3a0c7b8be3cdef3d7418f0a49127e7cdc998 Mon Sep 17 00:00:00 2001 From: Hiroshi Shimamoto Date: Fri, 21 Nov 2008 17:36:41 -0800 Subject: x86: signal: cosmetic unification of including headers Impact: cleanup Make the headers portion of signal_32.c and signal_64.c the same. 
Signed-off-by: Hiroshi Shimamoto Signed-off-by: Ingo Molnar --- arch/x86/kernel/signal_32.c | 27 +++++++++++++++------------ arch/x86/kernel/signal_64.c | 7 ++++++- 2 files changed, 21 insertions(+), 13 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/signal_32.c b/arch/x86/kernel/signal_32.c index c2aabeba27a5..0ff8d8750a7d 100644 --- a/arch/x86/kernel/signal_32.c +++ b/arch/x86/kernel/signal_32.c @@ -4,29 +4,32 @@ * 1997-11-28 Modified for POSIX.1b signals by Richard Henderson * 2000-06-20 Pentium III FXSR, SSE support by Gareth Hughes */ -#include -#include -#include -#include +#include +#include +#include #include -#include #include -#include -#include #include -#include #include +#include #include -#include -#include -#include +#include +#include +#include +#include #include #include -#include #include #include + +#ifdef CONFIG_X86_64 +#include +#include +#include +#endif /* CONFIG_X86_64 */ + #include #include diff --git a/arch/x86/kernel/signal_64.c b/arch/x86/kernel/signal_64.c index 3d54d366ccb2..c52244ac19fc 100644 --- a/arch/x86/kernel/signal_64.c +++ b/arch/x86/kernel/signal_64.c @@ -19,17 +19,22 @@ #include #include #include -#include #include #include #include #include +#include + +#ifdef CONFIG_X86_64 #include #include #include +#endif /* CONFIG_X86_64 */ + #include #include + #include "sigframe.h" #define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP))) -- cgit v1.2.3 From 666ac7be049ec290625e65d5922ff59f7bdec527 Mon Sep 17 00:00:00 2001 From: Hiroshi Shimamoto Date: Fri, 21 Nov 2008 17:38:25 -0800 Subject: x86: signal: cosmetic unification of sys_sigaltstack() Impact: cleanup Add #ifdef directive for unification. 
Signed-off-by: Hiroshi Shimamoto Signed-off-by: Ingo Molnar --- arch/x86/kernel/signal_32.c | 9 +++++++++ arch/x86/kernel/signal_64.c | 15 +++++++++++++++ 2 files changed, 24 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/signal_32.c b/arch/x86/kernel/signal_32.c index 0ff8d8750a7d..d9909881ac66 100644 --- a/arch/x86/kernel/signal_32.c +++ b/arch/x86/kernel/signal_32.c @@ -125,6 +125,7 @@ sys_sigaction(int sig, const struct old_sigaction __user *act, return ret; } +#ifdef CONFIG_X86_32 asmlinkage int sys_sigaltstack(unsigned long bx) { /* @@ -137,6 +138,14 @@ asmlinkage int sys_sigaltstack(unsigned long bx) return do_sigaltstack(uss, uoss, regs->sp); } +#else /* !CONFIG_X86_32 */ +asmlinkage long +sys_sigaltstack(const stack_t __user *uss, stack_t __user *uoss, + struct pt_regs *regs) +{ + return do_sigaltstack(uss, uoss, regs->sp); +} +#endif /* CONFIG_X86_32 */ #define COPY(x) { \ err |= __get_user(regs->x, &sc->x); \ diff --git a/arch/x86/kernel/signal_64.c b/arch/x86/kernel/signal_64.c index c52244ac19fc..b6e4fe03a36b 100644 --- a/arch/x86/kernel/signal_64.c +++ b/arch/x86/kernel/signal_64.c @@ -50,12 +50,27 @@ # define FIX_EFLAGS __FIX_EFLAGS #endif +#ifdef CONFIG_X86_32 +asmlinkage int sys_sigaltstack(unsigned long bx) +{ + /* + * This is needed to make gcc realize it doesn't own the + * "struct pt_regs" + */ + struct pt_regs *regs = (struct pt_regs *)&bx; + const stack_t __user *uss = (const stack_t __user *)bx; + stack_t __user *uoss = (stack_t __user *)regs->cx; + + return do_sigaltstack(uss, uoss, regs->sp); +} +#else /* !CONFIG_X86_32 */ asmlinkage long sys_sigaltstack(const stack_t __user *uss, stack_t __user *uoss, struct pt_regs *regs) { return do_sigaltstack(uss, uoss, regs->sp); } +#endif /* CONFIG_X86_32 */ #define COPY(x) { \ err |= __get_user(regs->x, &sc->x); \ -- cgit v1.2.3 From 2456d738ef051f85170bf018faef63f83fa84eb5 Mon Sep 17 00:00:00 2001 From: Hiroshi Shimamoto Date: Fri, 21 Nov 2008 17:38:57 -0800 Subject: x86: 
signal: cosmetic unification of sys_rt_sigreturn() Impact: cleanup Add #ifdef directive for unification. Signed-off-by: Hiroshi Shimamoto Signed-off-by: Ingo Molnar --- arch/x86/kernel/signal_32.c | 7 +++++++ arch/x86/kernel/signal_64.c | 9 +++++++++ 2 files changed, 16 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/signal_32.c b/arch/x86/kernel/signal_32.c index d9909881ac66..f7dd6c44c042 100644 --- a/arch/x86/kernel/signal_32.c +++ b/arch/x86/kernel/signal_32.c @@ -299,12 +299,19 @@ badframe: return 0; } +#ifdef CONFIG_X86_32 asmlinkage int sys_rt_sigreturn(unsigned long __unused) { struct pt_regs *regs = (struct pt_regs *)&__unused; return do_rt_sigreturn(regs); } +#else /* !CONFIG_X86_32 */ +asmlinkage long sys_rt_sigreturn(struct pt_regs *regs) +{ + return do_rt_sigreturn(regs); +} +#endif /* CONFIG_X86_32 */ /* * Set up a signal frame. diff --git a/arch/x86/kernel/signal_64.c b/arch/x86/kernel/signal_64.c index b6e4fe03a36b..32718f5e4f61 100644 --- a/arch/x86/kernel/signal_64.c +++ b/arch/x86/kernel/signal_64.c @@ -169,10 +169,19 @@ badframe: return 0; } +#ifdef CONFIG_X86_32 +asmlinkage int sys_rt_sigreturn(unsigned long __unused) +{ + struct pt_regs *regs = (struct pt_regs *)&__unused; + + return do_rt_sigreturn(regs); +} +#else /* !CONFIG_X86_32 */ asmlinkage long sys_rt_sigreturn(struct pt_regs *regs) { return do_rt_sigreturn(regs); } +#endif /* CONFIG_X86_32 */ /* * Set up a signal frame. -- cgit v1.2.3 From c450d7805b2c5cac8846c5f490fddfd9030d2207 Mon Sep 17 00:00:00 2001 From: Hannes Eder Date: Fri, 21 Nov 2008 23:17:09 +0100 Subject: x86: vmware - fix sparse warnings Impact: fix sparse build warning Fix the following sparse warnings: arch/x86/kernel/cpu/vmware.c:69:5: warning: symbol 'vmware_platform' was not declared. Should it be static? arch/x86/kernel/cpu/vmware.c:89:15: warning: symbol 'vmware_get_tsc_khz' was not declared. Should it be static? 
arch/x86/kernel/cpu/vmware.c:107:16: warning: symbol 'vmware_set_feature_bits' was not declared. Should it be static? Signed-off-by: Hannes Eder Cc: "Alok N Kataria" Cc: "Dan Hecht" Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/vmware.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c index c034bda842d9..284c399e3234 100644 --- a/arch/x86/kernel/cpu/vmware.c +++ b/arch/x86/kernel/cpu/vmware.c @@ -23,6 +23,7 @@ #include #include +#include #define CPUID_VMWARE_INFO_LEAF 0x40000000 #define VMWARE_HYPERVISOR_MAGIC 0x564D5868 -- cgit v1.2.3 From 4e42ebd57b2e727b28bf5f6068e95cd19b0e807b Mon Sep 17 00:00:00 2001 From: Hannes Eder Date: Fri, 21 Nov 2008 22:56:17 +0100 Subject: x86: hypervisor - fix sparse warnings Impact: fix sparse build warning Fix the following sparse warnings: arch/x86/kernel/cpu/hypervisor.c:37:15: warning: symbol 'get_hypervisor_tsc_freq' was not declared. Should it be static? arch/x86/kernel/cpu/hypervisor.c:53:16: warning: symbol 'init_hypervisor' was not declared. Should it be static? Signed-off-by: Hannes Eder Cc: "Alok N Kataria" Cc: "Dan Hecht" Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/hypervisor.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/hypervisor.c b/arch/x86/kernel/cpu/hypervisor.c index 35ae2b75226d..fb5b86af0b01 100644 --- a/arch/x86/kernel/cpu/hypervisor.c +++ b/arch/x86/kernel/cpu/hypervisor.c @@ -23,6 +23,7 @@ #include #include +#include static inline void __cpuinit detect_hypervisor_vendor(struct cpuinfo_x86 *c) -- cgit v1.2.3 From 8a2503fa4a6fae8ee42140b339f37373fc6acaae Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Sun, 23 Nov 2008 14:53:43 +0300 Subject: x86: move dwarf2 related macro to dwarf2.h Impact: cleanup Move recently introduced dwarf2 macros to dwarf2.h file. It allow us to not duplicate them in assembly files. 
Active usage of _cfi macros don't make assembly files more obvious to understand but we already have a lot of macros there which requires to search the definitions of them *anyway*. But at least it make every cfi usage one line shorter. Also some code alignment is done. Signed-off-by: Cyrill Gorcunov Signed-off-by: Ingo Molnar --- arch/x86/include/asm/dwarf2.h | 97 +++++++++++++++++++++++++++++-------------- arch/x86/kernel/entry_64.S | 23 ---------- 2 files changed, 66 insertions(+), 54 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/dwarf2.h b/arch/x86/include/asm/dwarf2.h index 804b6e6be929..3afc5e87cfdd 100644 --- a/arch/x86/include/asm/dwarf2.h +++ b/arch/x86/include/asm/dwarf2.h @@ -6,56 +6,91 @@ #endif /* - Macros for dwarf2 CFI unwind table entries. - See "as.info" for details on these pseudo ops. Unfortunately - they are only supported in very new binutils, so define them - away for older version. + * Macros for dwarf2 CFI unwind table entries. + * See "as.info" for details on these pseudo ops. Unfortunately + * they are only supported in very new binutils, so define them + * away for older version. 
*/ #ifdef CONFIG_AS_CFI -#define CFI_STARTPROC .cfi_startproc -#define CFI_ENDPROC .cfi_endproc -#define CFI_DEF_CFA .cfi_def_cfa -#define CFI_DEF_CFA_REGISTER .cfi_def_cfa_register -#define CFI_DEF_CFA_OFFSET .cfi_def_cfa_offset -#define CFI_ADJUST_CFA_OFFSET .cfi_adjust_cfa_offset -#define CFI_OFFSET .cfi_offset -#define CFI_REL_OFFSET .cfi_rel_offset -#define CFI_REGISTER .cfi_register -#define CFI_RESTORE .cfi_restore -#define CFI_REMEMBER_STATE .cfi_remember_state -#define CFI_RESTORE_STATE .cfi_restore_state -#define CFI_UNDEFINED .cfi_undefined +#define CFI_STARTPROC .cfi_startproc +#define CFI_ENDPROC .cfi_endproc +#define CFI_DEF_CFA .cfi_def_cfa +#define CFI_DEF_CFA_REGISTER .cfi_def_cfa_register +#define CFI_DEF_CFA_OFFSET .cfi_def_cfa_offset +#define CFI_ADJUST_CFA_OFFSET .cfi_adjust_cfa_offset +#define CFI_OFFSET .cfi_offset +#define CFI_REL_OFFSET .cfi_rel_offset +#define CFI_REGISTER .cfi_register +#define CFI_RESTORE .cfi_restore +#define CFI_REMEMBER_STATE .cfi_remember_state +#define CFI_RESTORE_STATE .cfi_restore_state +#define CFI_UNDEFINED .cfi_undefined #ifdef CONFIG_AS_CFI_SIGNAL_FRAME -#define CFI_SIGNAL_FRAME .cfi_signal_frame +#define CFI_SIGNAL_FRAME .cfi_signal_frame #else #define CFI_SIGNAL_FRAME #endif #else -/* Due to the structure of pre-exisiting code, don't use assembler line - comment character # to ignore the arguments. Instead, use a dummy macro. */ +/* + * Due to the structure of pre-exisiting code, don't use assembler line + * comment character # to ignore the arguments. Instead, use a dummy macro. 
+ */ .macro cfi_ignore a=0, b=0, c=0, d=0 .endm -#define CFI_STARTPROC cfi_ignore -#define CFI_ENDPROC cfi_ignore -#define CFI_DEF_CFA cfi_ignore +#define CFI_STARTPROC cfi_ignore +#define CFI_ENDPROC cfi_ignore +#define CFI_DEF_CFA cfi_ignore #define CFI_DEF_CFA_REGISTER cfi_ignore #define CFI_DEF_CFA_OFFSET cfi_ignore #define CFI_ADJUST_CFA_OFFSET cfi_ignore -#define CFI_OFFSET cfi_ignore -#define CFI_REL_OFFSET cfi_ignore -#define CFI_REGISTER cfi_ignore -#define CFI_RESTORE cfi_ignore -#define CFI_REMEMBER_STATE cfi_ignore -#define CFI_RESTORE_STATE cfi_ignore -#define CFI_UNDEFINED cfi_ignore -#define CFI_SIGNAL_FRAME cfi_ignore +#define CFI_OFFSET cfi_ignore +#define CFI_REL_OFFSET cfi_ignore +#define CFI_REGISTER cfi_ignore +#define CFI_RESTORE cfi_ignore +#define CFI_REMEMBER_STATE cfi_ignore +#define CFI_RESTORE_STATE cfi_ignore +#define CFI_UNDEFINED cfi_ignore +#define CFI_SIGNAL_FRAME cfi_ignore #endif +/* + * An attempt to make CFI annotations more or less + * correct and shorter. It is implied that you know + * what you're doing if you use them. + */ +#ifdef __ASSEMBLY__ +#ifdef CONFIG_X86_64 + .macro pushq_cfi reg + pushq \reg + CFI_ADJUST_CFA_OFFSET 8 + .endm + + .macro popq_cfi reg + popq \reg + CFI_ADJUST_CFA_OFFSET -8 + .endm + + .macro movq_cfi reg offset=0 + movq %\reg, \offset(%rsp) + CFI_REL_OFFSET \reg, \offset + .endm + + .macro movq_cfi_restore offset reg + movq \offset(%rsp), %\reg + CFI_RESTORE \reg + .endm +#else /*!CONFIG_X86_64*/ + + /* 32bit defenitions are missed yet */ + +#endif /*!CONFIG_X86_64*/ +#endif /*__ASSEMBLY__*/ + #endif /* _ASM_X86_DWARF2_H */ diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index e5ddf573ded2..249eb604e71b 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -60,29 +60,6 @@ #define __AUDIT_ARCH_LE 0x40000000 .code64 -/* - * Some macro's to hide the most frequently occuring CFI annotations. 
- */ - .macro pushq_cfi reg - pushq \reg - CFI_ADJUST_CFA_OFFSET 8 - .endm - - .macro popq_cfi reg - popq \reg - CFI_ADJUST_CFA_OFFSET -8 - .endm - - .macro movq_cfi reg offset=0 - movq %\reg, \offset(%rsp) - CFI_REL_OFFSET \reg, \offset - .endm - - .macro movq_cfi_restore offset reg - movq \offset(%rsp), %\reg - CFI_RESTORE \reg - .endm - #ifdef CONFIG_FUNCTION_TRACER #ifdef CONFIG_DYNAMIC_FTRACE ENTRY(mcount) -- cgit v1.2.3 From 050dc6944b9ca2186f4729ab44e0da3743933941 Mon Sep 17 00:00:00 2001 From: Hannes Eder Date: Sun, 23 Nov 2008 13:35:48 +0100 Subject: x86: remove duplicate #define from 'cpufeature.h' Impact: cleanup Remove duplicate #define from 'cpufeature.h'. This also fixes the following sparse warning: arch/x86/kernel/cpu/capflags.c:54:3: warning: Initializer entry defined twice arch/x86/kernel/cpu/capflags.c:58:3: also defined here Signed-off-by: Hannes Eder Signed-off-by: Ingo Molnar --- arch/x86/include/asm/cpufeature.h | 1 - 1 file changed, 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index 694d1f8f1bee..5bce8ed02b44 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -80,7 +80,6 @@ #define X86_FEATURE_UP (3*32+ 9) /* smp kernel running on up */ #define X86_FEATURE_FXSAVE_LEAK (3*32+10) /* "" FXSAVE leaks FOP/FIP/FOP */ #define X86_FEATURE_ARCH_PERFMON (3*32+11) /* Intel Architectural PerfMon */ -#define X86_FEATURE_NOPL (3*32+20) /* The NOPL (0F 1F) instructions */ #define X86_FEATURE_PEBS (3*32+12) /* Precise-Event Based Sampling */ #define X86_FEATURE_BTS (3*32+13) /* Branch Trace Store */ #define X86_FEATURE_SYSCALL32 (3*32+14) /* "" syscall in ia32 userspace */ -- cgit v1.2.3 From 322648d1ba75280d62f114d47048beb0b35f5047 Mon Sep 17 00:00:00 2001 From: Alexander van Heukelum Date: Sun, 23 Nov 2008 10:08:28 +0100 Subject: x86: include ENTRY/END in entry handlers in entry_64.S Impact: cleanup of entry_64.S Except for the order and the 
place of the functions, this patch should not change the generated code. Signed-off-by: Alexander van Heukelum Signed-off-by: Ingo Molnar --- arch/x86/kernel/entry_64.S | 259 +++++++++++++++++++-------------------------- 1 file changed, 109 insertions(+), 150 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 249eb604e71b..1a856c0b21a8 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -922,76 +922,70 @@ END(common_interrupt) /* * APIC interrupts. */ - .p2align 5 - - .macro apicinterrupt num,func +.macro apicinterrupt num sym do_sym +ENTRY(\sym) INTR_FRAME pushq $~(\num) CFI_ADJUST_CFA_OFFSET 8 - interrupt \func + interrupt \do_sym jmp ret_from_intr CFI_ENDPROC - .endm +END(\sym) +.endm -ENTRY(thermal_interrupt) - apicinterrupt THERMAL_APIC_VECTOR,smp_thermal_interrupt -END(thermal_interrupt) +#ifdef CONFIG_SMP +apicinterrupt IRQ_MOVE_CLEANUP_VECTOR \ + irq_move_cleanup_interrupt smp_irq_move_cleanup_interrupt +#endif -ENTRY(threshold_interrupt) - apicinterrupt THRESHOLD_APIC_VECTOR,mce_threshold_interrupt -END(threshold_interrupt) +apicinterrupt 220 \ + uv_bau_message_intr1 uv_bau_message_interrupt +apicinterrupt LOCAL_TIMER_VECTOR \ + apic_timer_interrupt smp_apic_timer_interrupt #ifdef CONFIG_SMP -ENTRY(reschedule_interrupt) - apicinterrupt RESCHEDULE_VECTOR,smp_reschedule_interrupt -END(reschedule_interrupt) - - .macro INVALIDATE_ENTRY num -ENTRY(invalidate_interrupt\num) - apicinterrupt INVALIDATE_TLB_VECTOR_START+\num,smp_invalidate_interrupt -END(invalidate_interrupt\num) - .endm - - INVALIDATE_ENTRY 0 - INVALIDATE_ENTRY 1 - INVALIDATE_ENTRY 2 - INVALIDATE_ENTRY 3 - INVALIDATE_ENTRY 4 - INVALIDATE_ENTRY 5 - INVALIDATE_ENTRY 6 - INVALIDATE_ENTRY 7 - -ENTRY(call_function_interrupt) - apicinterrupt CALL_FUNCTION_VECTOR,smp_call_function_interrupt -END(call_function_interrupt) -ENTRY(call_function_single_interrupt) - apicinterrupt 
CALL_FUNCTION_SINGLE_VECTOR,smp_call_function_single_interrupt -END(call_function_single_interrupt) -ENTRY(irq_move_cleanup_interrupt) - apicinterrupt IRQ_MOVE_CLEANUP_VECTOR,smp_irq_move_cleanup_interrupt -END(irq_move_cleanup_interrupt) +apicinterrupt INVALIDATE_TLB_VECTOR_START+0 \ + invalidate_interrupt0 smp_invalidate_interrupt +apicinterrupt INVALIDATE_TLB_VECTOR_START+1 \ + invalidate_interrupt1 smp_invalidate_interrupt +apicinterrupt INVALIDATE_TLB_VECTOR_START+2 \ + invalidate_interrupt2 smp_invalidate_interrupt +apicinterrupt INVALIDATE_TLB_VECTOR_START+3 \ + invalidate_interrupt3 smp_invalidate_interrupt +apicinterrupt INVALIDATE_TLB_VECTOR_START+4 \ + invalidate_interrupt4 smp_invalidate_interrupt +apicinterrupt INVALIDATE_TLB_VECTOR_START+5 \ + invalidate_interrupt5 smp_invalidate_interrupt +apicinterrupt INVALIDATE_TLB_VECTOR_START+6 \ + invalidate_interrupt6 smp_invalidate_interrupt +apicinterrupt INVALIDATE_TLB_VECTOR_START+7 \ + invalidate_interrupt7 smp_invalidate_interrupt #endif -ENTRY(apic_timer_interrupt) - apicinterrupt LOCAL_TIMER_VECTOR,smp_apic_timer_interrupt -END(apic_timer_interrupt) - -ENTRY(uv_bau_message_intr1) - apicinterrupt 220,uv_bau_message_interrupt -END(uv_bau_message_intr1) +apicinterrupt THRESHOLD_APIC_VECTOR \ + threshold_interrupt mce_threshold_interrupt +apicinterrupt THERMAL_APIC_VECTOR \ + thermal_interrupt smp_thermal_interrupt -ENTRY(error_interrupt) - apicinterrupt ERROR_APIC_VECTOR,smp_error_interrupt -END(error_interrupt) +#ifdef CONFIG_SMP +apicinterrupt CALL_FUNCTION_SINGLE_VECTOR \ + call_function_single_interrupt smp_call_function_single_interrupt +apicinterrupt CALL_FUNCTION_VECTOR \ + call_function_interrupt smp_call_function_interrupt +apicinterrupt RESCHEDULE_VECTOR \ + reschedule_interrupt smp_reschedule_interrupt +#endif -ENTRY(spurious_interrupt) - apicinterrupt SPURIOUS_APIC_VECTOR,smp_spurious_interrupt -END(spurious_interrupt) +apicinterrupt ERROR_APIC_VECTOR \ + error_interrupt smp_error_interrupt 
+apicinterrupt SPURIOUS_APIC_VECTOR \ + spurious_interrupt smp_spurious_interrupt /* * Exception entry points. */ - .macro zeroentry sym +.macro zeroentry sym do_sym +ENTRY(\sym) INTR_FRAME PARAVIRT_ADJUST_EXCEPTION_FRAME pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */ @@ -1001,12 +995,14 @@ END(spurious_interrupt) DEFAULT_FRAME 0 movq %rsp,%rdi /* pt_regs pointer */ xorl %esi,%esi /* no error code */ - call \sym + call \do_sym jmp error_exit /* %ebx: no swapgs flag */ CFI_ENDPROC - .endm +END(\sym) +.endm - .macro paranoidzeroentry sym +.macro paranoidzeroentry sym do_sym +KPROBE_ENTRY(\sym) INTR_FRAME PARAVIRT_ADJUST_EXCEPTION_FRAME pushq $-1 /* ORIG_RAX: no syscall to restart */ @@ -1016,12 +1012,14 @@ END(spurious_interrupt) TRACE_IRQS_OFF movq %rsp,%rdi /* pt_regs pointer */ xorl %esi,%esi /* no error code */ - call \sym + call \do_sym jmp paranoid_exit /* %ebx: no swapgs flag */ CFI_ENDPROC - .endm +KPROBE_END(\sym) +.endm - .macro paranoidzeroentry_ist sym ist +.macro paranoidzeroentry_ist sym do_sym ist +KPROBE_ENTRY(\sym) INTR_FRAME PARAVIRT_ADJUST_EXCEPTION_FRAME pushq $-1 /* ORIG_RAX: no syscall to restart */ @@ -1033,13 +1031,19 @@ END(spurious_interrupt) xorl %esi,%esi /* no error code */ movq %gs:pda_data_offset, %rbp subq $EXCEPTION_STKSZ, per_cpu__init_tss + TSS_ist + (\ist - 1) * 8(%rbp) - call \sym + call \do_sym addq $EXCEPTION_STKSZ, per_cpu__init_tss + TSS_ist + (\ist - 1) * 8(%rbp) jmp paranoid_exit /* %ebx: no swapgs flag */ CFI_ENDPROC - .endm +KPROBE_END(\sym) +.endm - .macro errorentry sym +.macro errorentry sym do_sym entry=0 +.if \entry +KPROBE_ENTRY(\sym) +.else +ENTRY(\sym) +.endif XCPT_FRAME PARAVIRT_ADJUST_EXCEPTION_FRAME subq $15*8,%rsp @@ -1049,13 +1053,23 @@ END(spurious_interrupt) movq %rsp,%rdi /* pt_regs pointer */ movq ORIG_RAX(%rsp),%rsi /* get error code */ movq $-1,ORIG_RAX(%rsp) /* no syscall to restart */ - call \sym + call \do_sym jmp error_exit /* %ebx: no swapgs flag */ CFI_ENDPROC - .endm +.if \entry 
+KPROBE_END(\sym) +.else +END(\sym) +.endif +.endm /* error code is on the stack already */ - .macro paranoiderrorentry sym +.macro paranoiderrorentry sym do_sym entry=1 +.if \entry +KPROBE_ENTRY(\sym) +.else +ENTRY(\sym) +.endif XCPT_FRAME PARAVIRT_ADJUST_EXCEPTION_FRAME subq $15*8,%rsp @@ -1066,10 +1080,37 @@ END(spurious_interrupt) movq %rsp,%rdi /* pt_regs pointer */ movq ORIG_RAX(%rsp),%rsi /* get error code */ movq $-1,ORIG_RAX(%rsp) /* no syscall to restart */ - call \sym + call \do_sym jmp paranoid_exit /* %ebx: no swapgs flag */ CFI_ENDPROC - .endm +.if \entry +KPROBE_END(\sym) +.else +END(\sym) +.endif +.endm + +zeroentry divide_error do_divide_error +paranoidzeroentry_ist debug do_debug DEBUG_STACK +paranoidzeroentry_ist int3 do_int3 DEBUG_STACK +zeroentry overflow do_overflow +zeroentry bounds do_bounds +zeroentry invalid_op do_invalid_op +zeroentry device_not_available do_device_not_available +paranoiderrorentry double_fault do_double_fault 0 +zeroentry coprocessor_segment_overrun do_coprocessor_segment_overrun +errorentry invalid_TSS do_invalid_TSS +errorentry segment_not_present do_segment_not_present +paranoiderrorentry stack_segment do_stack_segment +errorentry general_protection do_general_protection 1 +errorentry page_fault do_page_fault 1 +zeroentry spurious_interrupt_bug do_spurious_interrupt_bug +zeroentry coprocessor_error do_coprocessor_error +errorentry alignment_check do_alignment_check +#ifdef CONFIG_X86_MCE +paranoidzeroentry machine_check do_machine_check +#endif +zeroentry simd_coprocessor_error do_simd_coprocessor_error /* * "Paranoid" exit path from exception stack. 
@@ -1321,26 +1362,7 @@ ENTRY(kernel_execve) CFI_ENDPROC ENDPROC(kernel_execve) -KPROBE_ENTRY(page_fault) - errorentry do_page_fault -KPROBE_END(page_fault) -ENTRY(coprocessor_error) - zeroentry do_coprocessor_error -END(coprocessor_error) - -ENTRY(simd_coprocessor_error) - zeroentry do_simd_coprocessor_error -END(simd_coprocessor_error) - -ENTRY(device_not_available) - zeroentry do_device_not_available -END(device_not_available) - - /* runs on exception stack */ -KPROBE_ENTRY(debug) - paranoidzeroentry_ist do_debug, DEBUG_STACK -KPROBE_END(debug) /* runs on exception stack */ KPROBE_ENTRY(nmi) @@ -1397,67 +1419,6 @@ nmi_schedule: #endif KPROBE_END(nmi) -KPROBE_ENTRY(int3) - paranoidzeroentry_ist do_int3, DEBUG_STACK -KPROBE_END(int3) - -ENTRY(overflow) - zeroentry do_overflow -END(overflow) - -ENTRY(bounds) - zeroentry do_bounds -END(bounds) - -ENTRY(invalid_op) - zeroentry do_invalid_op -END(invalid_op) - -ENTRY(coprocessor_segment_overrun) - zeroentry do_coprocessor_segment_overrun -END(coprocessor_segment_overrun) - - /* runs on exception stack */ -ENTRY(double_fault) - paranoiderrorentry do_double_fault -END(double_fault) - -ENTRY(invalid_TSS) - errorentry do_invalid_TSS -END(invalid_TSS) - -ENTRY(segment_not_present) - errorentry do_segment_not_present -END(segment_not_present) - - /* runs on exception stack */ -ENTRY(stack_segment) - paranoiderrorentry do_stack_segment -END(stack_segment) - -KPROBE_ENTRY(general_protection) - errorentry do_general_protection -KPROBE_END(general_protection) - -ENTRY(alignment_check) - errorentry do_alignment_check -END(alignment_check) - -ENTRY(divide_error) - zeroentry do_divide_error -END(divide_error) - -ENTRY(spurious_interrupt_bug) - zeroentry do_spurious_interrupt_bug -END(spurious_interrupt_bug) - -#ifdef CONFIG_X86_MCE - /* runs on exception stack */ -ENTRY(machine_check) - paranoidzeroentry do_machine_check -END(machine_check) -#endif - /* Call softirq on interrupt stack. Interrupts are off. 
*/ ENTRY(call_softirq) CFI_STARTPROC @@ -1486,9 +1447,7 @@ KPROBE_ENTRY(ignore_sysret) ENDPROC(ignore_sysret) #ifdef CONFIG_XEN -ENTRY(xen_hypervisor_callback) - zeroentry xen_do_hypervisor_callback -END(xen_hypervisor_callback) +zeroentry xen_hypervisor_callback xen_do_hypervisor_callback /* # A note on the "critical region" in our callback handler. -- cgit v1.2.3 From 6efdcfaf16cc4fc76651603e083cf3ec4bd1e6de Mon Sep 17 00:00:00 2001 From: Alexander van Heukelum Date: Sun, 23 Nov 2008 10:15:32 +0100 Subject: x86: KPROBE_ENTRY should be paired wth KPROBE_END Impact: move some code out of .kprobes.text KPROBE_ENTRY switches code generation to .kprobes.text, and KPROBE_END uses .popsection to get back to the previous section (.text, normally). Also replace ENDPROC by END, for consistency. Signed-off-by: Alexander van Heukelum Signed-off-by: Ingo Molnar --- arch/x86/kernel/entry_64.S | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 1a856c0b21a8..f2d546e16354 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -1167,7 +1167,7 @@ paranoid_schedule: TRACE_IRQS_OFF jmp paranoid_userspace CFI_ENDPROC -END(paranoid_exit) +KPROBE_END(paranoid_exit) /* * Exception entry point. This expects an error code/orig_rax on the stack. @@ -1259,7 +1259,7 @@ gs_change: CFI_ADJUST_CFA_OFFSET -8 ret CFI_ENDPROC -ENDPROC(native_load_gs_index) +END(native_load_gs_index) .section __ex_table,"a" .align 8 @@ -1313,7 +1313,7 @@ ENTRY(kernel_thread) UNFAKE_STACK_FRAME ret CFI_ENDPROC -ENDPROC(kernel_thread) +END(kernel_thread) child_rip: pushq $0 # fake return address @@ -1329,7 +1329,7 @@ child_rip: mov %eax, %edi call do_exit CFI_ENDPROC -ENDPROC(child_rip) +END(child_rip) /* * execve(). This function needs to use IRET, not SYSRET, to set up all state properly. 
@@ -1360,9 +1360,7 @@ ENTRY(kernel_execve) UNFAKE_STACK_FRAME ret CFI_ENDPROC -ENDPROC(kernel_execve) - - +END(kernel_execve) /* runs on exception stack */ KPROBE_ENTRY(nmi) @@ -1437,14 +1435,14 @@ ENTRY(call_softirq) decl %gs:pda_irqcount ret CFI_ENDPROC -ENDPROC(call_softirq) +END(call_softirq) KPROBE_ENTRY(ignore_sysret) CFI_STARTPROC mov $-ENOSYS,%eax sysret CFI_ENDPROC -ENDPROC(ignore_sysret) +KPROBE_END(ignore_sysret) #ifdef CONFIG_XEN zeroentry xen_hypervisor_callback xen_do_hypervisor_callback -- cgit v1.2.3 From 3b6c52b5b634ae41d762cb174465272d69198160 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Sun, 23 Nov 2008 20:21:39 +0300 Subject: x86: introduce ENTRY(KPROBE_ENTRY)_X86 assembly helpers to catch unbalanced declaration v3 Impact: make ENTRY()/END() macros more capable It's useful to catch unbalanced or messed or mixed declarations of ENTRY and KPROBES. These macros would help a bit. For example the following code would compile without problems ENTRY_X86(mcount) retq END_X86(mcount) But if you forget and mess the following form ENTRY_X86(mcount) retq END(mcount) ENTRY_X86(ftrace_caller) The assembler will issue the following message: Error: ENTRY_X86/KPROBE_X86 unbalanced,missed,mixed Actually the checking is performed at every _X86 macro so maybe it's a good idea to put ENTRY_KPROBE_FINAL_X86 at the end of .S file to be sure you didn't miss anything.
Signed-off-by: Cyrill Gorcunov Cc: Alexander van Heukelum Signed-off-by: Ingo Molnar --- arch/x86/include/asm/linkage.h | 60 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 60 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/linkage.h b/arch/x86/include/asm/linkage.h index f61ee8f937e4..5d98d0b68ffc 100644 --- a/arch/x86/include/asm/linkage.h +++ b/arch/x86/include/asm/linkage.h @@ -57,5 +57,65 @@ #define __ALIGN_STR ".align 16,0x90" #endif +/* + * to check ENTRY_X86/END_X86 and + * KPROBE_ENTRY_X86/KPROBE_END_X86 + * unbalanced-missed-mixed appearance + */ +#define __set_entry_x86 .set ENTRY_X86_IN, 0 +#define __unset_entry_x86 .set ENTRY_X86_IN, 1 +#define __set_kprobe_x86 .set KPROBE_X86_IN, 0 +#define __unset_kprobe_x86 .set KPROBE_X86_IN, 1 + +#define __macro_err_x86 .error "ENTRY_X86/KPROBE_X86 unbalanced,missed,mixed" + +#define __check_entry_x86 \ + .ifdef ENTRY_X86_IN; \ + .ifeq ENTRY_X86_IN; \ + __macro_err_x86; \ + .abort; \ + .endif; \ + .endif + +#define __check_kprobe_x86 \ + .ifdef KPROBE_X86_IN; \ + .ifeq KPROBE_X86_IN; \ + __macro_err_x86; \ + .abort; \ + .endif; \ + .endif + +#define __check_entry_kprobe_x86 \ + __check_entry_x86; \ + __check_kprobe_x86 + +#define ENTRY_KPROBE_FINAL_X86 __check_entry_kprobe_x86 + +#define ENTRY_X86(name) \ + __check_entry_kprobe_x86; \ + __set_entry_x86; \ + .globl name; \ + __ALIGN; \ + name: + +#define END_X86(name) \ + __unset_entry_x86; \ + __check_entry_kprobe_x86; \ + .size name, .-name + +#define KPROBE_ENTRY_X86(name) \ + __check_entry_kprobe_x86; \ + __set_kprobe_x86; \ + .pushsection .kprobes.text, "ax"; \ + .globl name; \ + __ALIGN; \ + name: + +#define KPROBE_END_X86(name) \ + __unset_kprobe_x86; \ + __check_entry_kprobe_x86; \ + .size name, .-name; \ + .popsection + #endif /* _ASM_X86_LINKAGE_H */ -- cgit v1.2.3 From a1a00b58855ccdbedf556b4f5638d5208b454472 Mon Sep 17 00:00:00 2001 From: Hannes Eder Date: Sun, 23 Nov 2008 19:37:09 +0100 Subject: x86: boot - fix sparse 
warnings Impact: make global variables static Fix these sparse warnings: arch/x86/boot/video.c:233:3: warning: symbol 'saved' was not declared. Should it be static? arch/x86/boot/video-vga.c:37:13: warning: symbol 'video_vga' was not declared. Should it be static? Signed-off-by: Hannes Eder Signed-off-by: Ingo Molnar --- arch/x86/boot/video-vga.c | 4 ++-- arch/x86/boot/video.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/boot/video-vga.c b/arch/x86/boot/video-vga.c index b939cb476dec..5d4742ed4aa2 100644 --- a/arch/x86/boot/video-vga.c +++ b/arch/x86/boot/video-vga.c @@ -34,7 +34,7 @@ static struct mode_info cga_modes[] = { { VIDEO_80x25, 80, 25, 0 }, }; -__videocard video_vga; +static __videocard video_vga; /* Set basic 80x25 mode */ static u8 vga_set_basic_mode(void) @@ -259,7 +259,7 @@ static int vga_probe(void) return mode_count[adapter]; } -__videocard video_vga = { +static __videocard video_vga = { .card_name = "VGA", .probe = vga_probe, .set_mode = vga_set_mode, diff --git a/arch/x86/boot/video.c b/arch/x86/boot/video.c index 83598b23093a..3bef2c1febe9 100644 --- a/arch/x86/boot/video.c +++ b/arch/x86/boot/video.c @@ -226,7 +226,7 @@ static unsigned int mode_menu(void) #ifdef CONFIG_VIDEO_RETAIN /* Save screen content to the heap */ -struct saved_screen { +static struct saved_screen { int x, y; int curx, cury; u16 *data; -- cgit v1.2.3 From 5f5db591326779a80cfe490c5d6b6ce9fac08b31 Mon Sep 17 00:00:00 2001 From: jia zhang Date: Sun, 23 Nov 2008 22:47:10 +0800 Subject: x86, debug: remove the confusing entry in call trace Impact: improve backtrace quality avoid the confusion in call trace because of the lack of padding at the tail of function. When do_exit gets called, the return address behind call instruction is pushed into stack. If something get wrong in do_exit, for x86_64, the entry "kernel_execve +0x00/0xXX" rather than "child_rip +0xYY/0xZZ" is in the call trace. 
That looks confusing, so add a ud2 to make the return address still part of the original call site. (This also catches any instances of us returning from that function somehow.) Signed-off-by: jia zhang Acked-by: Alexander van Heukelum Signed-off-by: Ingo Molnar --- arch/x86/kernel/entry_32.S | 1 + arch/x86/kernel/entry_64.S | 1 + 2 files changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index 28b597ef9ca1..f6402c4ba10d 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -1051,6 +1051,7 @@ ENTRY(kernel_thread_helper) push %eax CFI_ADJUST_CFA_OFFSET 4 call do_exit + ud2 # padding for call trace CFI_ENDPROC ENDPROC(kernel_thread_helper) diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index ddeeb1052583..4a16bf31c783 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -1172,6 +1172,7 @@ child_rip: # exit mov %eax, %edi call do_exit + ud2 # padding for call trace CFI_ENDPROC ENDPROC(child_rip) -- cgit v1.2.3 From 3b71e9e307b3406aa29960a7428247f8a48b810c Mon Sep 17 00:00:00 2001 From: Hannes Eder Date: Sun, 23 Nov 2008 20:19:33 +0100 Subject: x86: HPET: fix sparse warning Impact: make global variable static Fix this sparse warning: arch/x86/kernel/hpet.c:36:18: warning: symbol 'hpet_num_timers' was not declared. Should it be static?
Signed-off-by: Hannes Eder Signed-off-by: Ingo Molnar --- arch/x86/kernel/hpet.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index 067d8de913f6..15fcaacc1f84 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -33,7 +33,7 @@ * HPET address is set in acpi/boot.c, when an ACPI entry exists */ unsigned long hpet_address; -unsigned long hpet_num_timers; +static unsigned long hpet_num_timers; static void __iomem *hpet_virt_address; struct hpet_dev { -- cgit v1.2.3 From e45f2c07742d447597df001c878bc4a8aafcde37 Mon Sep 17 00:00:00 2001 From: "Denis V. Lunev" Date: Mon, 24 Nov 2008 11:28:36 +0300 Subject: x86: correct link to HPET timer specification Impact: update documentation / help text Original link is dead. Signed-off-by: Denis V. Lunev Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 2 +- drivers/char/hpet.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index ac22bb7719f7..19f0d97829ee 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -482,7 +482,7 @@ config HPET_TIMER The HPET provides a stable time base on SMP systems, unlike the TSC, but it is more expensive to access, as it is off-chip. You can find the HPET spec at - . + . You can safely choose Y here. However, HPET will only be activated if the platform and the BIOS support this feature. diff --git a/drivers/char/hpet.c b/drivers/char/hpet.c index 53fdc7ff3870..32b8bbf5003e 100644 --- a/drivers/char/hpet.c +++ b/drivers/char/hpet.c @@ -46,7 +46,7 @@ /* * The High Precision Event Timer driver. * This driver is closely modelled after the rtc.c driver. 
- * http://www.intel.com/hardwaredesign/hpetspec.htm + * http://www.intel.com/hardwaredesign/hpetspec_1.pdf */ #define HPET_USER_FREQ (64) #define HPET_DRIFT (500) -- cgit v1.2.3 From ad07e914e681f18ec0eaba60db17f497ee7e7e78 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 24 Nov 2008 11:33:12 +0100 Subject: x86 defconfig: increase CONFIG_LOG_BUF_SHIFT Impact: double the defconfig printk buffer Booting defconfigs produces more output than 128K so the output is truncated - double it to 256K. Signed-off-by: Ingo Molnar --- arch/x86/configs/i386_defconfig | 2 +- arch/x86/configs/x86_64_defconfig | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/configs/i386_defconfig b/arch/x86/configs/i386_defconfig index 13b8c86ae985..71fc39c70782 100644 --- a/arch/x86/configs/i386_defconfig +++ b/arch/x86/configs/i386_defconfig @@ -77,7 +77,7 @@ CONFIG_AUDIT=y CONFIG_AUDITSYSCALL=y CONFIG_AUDIT_TREE=y # CONFIG_IKCONFIG is not set -CONFIG_LOG_BUF_SHIFT=17 +CONFIG_LOG_BUF_SHIFT=18 CONFIG_CGROUPS=y # CONFIG_CGROUP_DEBUG is not set CONFIG_CGROUP_NS=y diff --git a/arch/x86/configs/x86_64_defconfig b/arch/x86/configs/x86_64_defconfig index f0a03d7a7d63..b38bbabc1706 100644 --- a/arch/x86/configs/x86_64_defconfig +++ b/arch/x86/configs/x86_64_defconfig @@ -77,7 +77,7 @@ CONFIG_AUDIT=y CONFIG_AUDITSYSCALL=y CONFIG_AUDIT_TREE=y # CONFIG_IKCONFIG is not set -CONFIG_LOG_BUF_SHIFT=17 +CONFIG_LOG_BUF_SHIFT=18 CONFIG_CGROUPS=y # CONFIG_CGROUP_DEBUG is not set CONFIG_CGROUP_NS=y -- cgit v1.2.3 From e951e4af2e399c46891004d4931333d2d8d520ab Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 25 Nov 2008 08:42:01 +0100 Subject: x86: fix unused variable warning in arch/x86/kernel/hpet.c MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Impact: fix build warning this warning: arch/x86/kernel/hpet.c:36: warning: ‘hpet_num_timers’ defined but not used Triggers because hpet_num_timers is unused in the 
!CONFIG_PCI_MSI case. Signed-off-by: Ingo Molnar --- arch/x86/kernel/hpet.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index 15fcaacc1f84..3f0a3edf0a57 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -33,7 +33,9 @@ * HPET address is set in acpi/boot.c, when an ACPI entry exists */ unsigned long hpet_address; +#ifdef CONFIG_PCI_MSI static unsigned long hpet_num_timers; +#endif static void __iomem *hpet_virt_address; struct hpet_dev { -- cgit v1.2.3 From 2601657d223d82053d4e1fe1063091401e6b860a Mon Sep 17 00:00:00 2001 From: Hiroshi Shimamoto Date: Mon, 24 Nov 2008 18:21:37 -0800 Subject: x86: signal: move {setup|restore}_sigcontext() Impact: cleanup Move {setup|restore}_sigcontext() declaration onto head of file. Signed-off-by: Hiroshi Shimamoto Signed-off-by: Ingo Molnar --- arch/x86/kernel/signal_32.c | 271 ++++++++++++++++++++++---------------------- arch/x86/kernel/signal_64.c | 148 ++++++++++++------------ 2 files changed, 210 insertions(+), 209 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/signal_32.c b/arch/x86/kernel/signal_32.c index f7dd6c44c042..b3f30d2a2178 100644 --- a/arch/x86/kernel/signal_32.c +++ b/arch/x86/kernel/signal_32.c @@ -70,6 +70,142 @@ static const struct { 0 }; +#define COPY(x) { \ + err |= __get_user(regs->x, &sc->x); \ +} + +#define COPY_SEG(seg) { \ + unsigned short tmp; \ + err |= __get_user(tmp, &sc->seg); \ + regs->seg = tmp; \ +} + +#define COPY_SEG_CPL3(seg) { \ + unsigned short tmp; \ + err |= __get_user(tmp, &sc->seg); \ + regs->seg = tmp | 3; \ +} + +#define GET_SEG(seg) { \ + unsigned short tmp; \ + err |= __get_user(tmp, &sc->seg); \ + loadsegment(seg, tmp); \ +} + +static int +restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, + unsigned long *pax) +{ + void __user *buf; + unsigned int tmpflags; + unsigned int err = 0; + + /* Always make any pending restarted system calls return -EINTR 
*/ + current_thread_info()->restart_block.fn = do_no_restart_syscall; + +#ifdef CONFIG_X86_32 + GET_SEG(gs); + COPY_SEG(fs); + COPY_SEG(es); + COPY_SEG(ds); +#endif /* CONFIG_X86_32 */ + + COPY(di); COPY(si); COPY(bp); COPY(sp); COPY(bx); + COPY(dx); COPY(cx); COPY(ip); + +#ifdef CONFIG_X86_64 + COPY(r8); + COPY(r9); + COPY(r10); + COPY(r11); + COPY(r12); + COPY(r13); + COPY(r14); + COPY(r15); +#endif /* CONFIG_X86_64 */ + +#ifdef CONFIG_X86_32 + COPY_SEG_CPL3(cs); + COPY_SEG_CPL3(ss); +#else /* !CONFIG_X86_32 */ + /* Kernel saves and restores only the CS segment register on signals, + * which is the bare minimum needed to allow mixed 32/64-bit code. + * App's signal handler can save/restore other segments if needed. */ + COPY_SEG_CPL3(cs); +#endif /* CONFIG_X86_32 */ + + err |= __get_user(tmpflags, &sc->flags); + regs->flags = (regs->flags & ~FIX_EFLAGS) | (tmpflags & FIX_EFLAGS); + regs->orig_ax = -1; /* disable syscall checks */ + + err |= __get_user(buf, &sc->fpstate); + err |= restore_i387_xstate(buf); + + err |= __get_user(*pax, &sc->ax); + return err; +} + +static int +setup_sigcontext(struct sigcontext __user *sc, void __user *fpstate, + struct pt_regs *regs, unsigned long mask) +{ + int err = 0; + +#ifdef CONFIG_X86_32 + { + unsigned int tmp; + + savesegment(gs, tmp); + err |= __put_user(tmp, (unsigned int __user *)&sc->gs); + } + err |= __put_user(regs->fs, (unsigned int __user *)&sc->fs); + err |= __put_user(regs->es, (unsigned int __user *)&sc->es); + err |= __put_user(regs->ds, (unsigned int __user *)&sc->ds); +#endif /* CONFIG_X86_32 */ + + err |= __put_user(regs->di, &sc->di); + err |= __put_user(regs->si, &sc->si); + err |= __put_user(regs->bp, &sc->bp); + err |= __put_user(regs->sp, &sc->sp); + err |= __put_user(regs->bx, &sc->bx); + err |= __put_user(regs->dx, &sc->dx); + err |= __put_user(regs->cx, &sc->cx); + err |= __put_user(regs->ax, &sc->ax); +#ifdef CONFIG_X86_64 + err |= __put_user(regs->r8, &sc->r8); + err |= __put_user(regs->r9, 
&sc->r9); + err |= __put_user(regs->r10, &sc->r10); + err |= __put_user(regs->r11, &sc->r11); + err |= __put_user(regs->r12, &sc->r12); + err |= __put_user(regs->r13, &sc->r13); + err |= __put_user(regs->r14, &sc->r14); + err |= __put_user(regs->r15, &sc->r15); +#endif /* CONFIG_X86_64 */ + + err |= __put_user(current->thread.trap_no, &sc->trapno); + err |= __put_user(current->thread.error_code, &sc->err); + err |= __put_user(regs->ip, &sc->ip); +#ifdef CONFIG_X86_32 + err |= __put_user(regs->cs, (unsigned int __user *)&sc->cs); + err |= __put_user(regs->flags, &sc->flags); + err |= __put_user(regs->sp, &sc->sp_at_signal); + err |= __put_user(regs->ss, (unsigned int __user *)&sc->ss); +#else /* !CONFIG_X86_32 */ + err |= __put_user(regs->flags, &sc->flags); + err |= __put_user(regs->cs, &sc->cs); + err |= __put_user(0, &sc->gs); + err |= __put_user(0, &sc->fs); +#endif /* CONFIG_X86_32 */ + + err |= __put_user(fpstate, &sc->fpstate); + + /* non-iBCS2 extensions.. */ + err |= __put_user(mask, &sc->oldmask); + err |= __put_user(current->thread.cr2, &sc->cr2); + + return err; +} + /* * Atomically swap in the new signal mask, and wait for a signal. */ @@ -147,84 +283,9 @@ sys_sigaltstack(const stack_t __user *uss, stack_t __user *uoss, } #endif /* CONFIG_X86_32 */ -#define COPY(x) { \ - err |= __get_user(regs->x, &sc->x); \ -} - -#define COPY_SEG(seg) { \ - unsigned short tmp; \ - err |= __get_user(tmp, &sc->seg); \ - regs->seg = tmp; \ -} - -#define COPY_SEG_CPL3(seg) { \ - unsigned short tmp; \ - err |= __get_user(tmp, &sc->seg); \ - regs->seg = tmp | 3; \ -} - -#define GET_SEG(seg) { \ - unsigned short tmp; \ - err |= __get_user(tmp, &sc->seg); \ - loadsegment(seg, tmp); \ -} - /* * Do a signal return; undo the signal stack. 
*/ -static int -restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, - unsigned long *pax) -{ - void __user *buf; - unsigned int tmpflags; - unsigned int err = 0; - - /* Always make any pending restarted system calls return -EINTR */ - current_thread_info()->restart_block.fn = do_no_restart_syscall; - -#ifdef CONFIG_X86_32 - GET_SEG(gs); - COPY_SEG(fs); - COPY_SEG(es); - COPY_SEG(ds); -#endif /* CONFIG_X86_32 */ - - COPY(di); COPY(si); COPY(bp); COPY(sp); COPY(bx); - COPY(dx); COPY(cx); COPY(ip); - -#ifdef CONFIG_X86_64 - COPY(r8); - COPY(r9); - COPY(r10); - COPY(r11); - COPY(r12); - COPY(r13); - COPY(r14); - COPY(r15); -#endif /* CONFIG_X86_64 */ - -#ifdef CONFIG_X86_32 - COPY_SEG_CPL3(cs); - COPY_SEG_CPL3(ss); -#else /* !CONFIG_X86_32 */ - /* Kernel saves and restores only the CS segment register on signals, - * which is the bare minimum needed to allow mixed 32/64-bit code. - * App's signal handler can save/restore other segments if needed. */ - COPY_SEG_CPL3(cs); -#endif /* CONFIG_X86_32 */ - - err |= __get_user(tmpflags, &sc->flags); - regs->flags = (regs->flags & ~FIX_EFLAGS) | (tmpflags & FIX_EFLAGS); - regs->orig_ax = -1; /* disable syscall checks */ - - err |= __get_user(buf, &sc->fpstate); - err |= restore_i387_xstate(buf); - - err |= __get_user(*pax, &sc->ax); - return err; -} - asmlinkage unsigned long sys_sigreturn(unsigned long __unused) { struct sigframe __user *frame; @@ -316,66 +377,6 @@ asmlinkage long sys_rt_sigreturn(struct pt_regs *regs) /* * Set up a signal frame. 
*/ -static int -setup_sigcontext(struct sigcontext __user *sc, void __user *fpstate, - struct pt_regs *regs, unsigned long mask) -{ - int err = 0; - -#ifdef CONFIG_X86_32 - { - unsigned int tmp; - - savesegment(gs, tmp); - err |= __put_user(tmp, (unsigned int __user *)&sc->gs); - } - err |= __put_user(regs->fs, (unsigned int __user *)&sc->fs); - err |= __put_user(regs->es, (unsigned int __user *)&sc->es); - err |= __put_user(regs->ds, (unsigned int __user *)&sc->ds); -#endif /* CONFIG_X86_32 */ - - err |= __put_user(regs->di, &sc->di); - err |= __put_user(regs->si, &sc->si); - err |= __put_user(regs->bp, &sc->bp); - err |= __put_user(regs->sp, &sc->sp); - err |= __put_user(regs->bx, &sc->bx); - err |= __put_user(regs->dx, &sc->dx); - err |= __put_user(regs->cx, &sc->cx); - err |= __put_user(regs->ax, &sc->ax); -#ifdef CONFIG_X86_64 - err |= __put_user(regs->r8, &sc->r8); - err |= __put_user(regs->r9, &sc->r9); - err |= __put_user(regs->r10, &sc->r10); - err |= __put_user(regs->r11, &sc->r11); - err |= __put_user(regs->r12, &sc->r12); - err |= __put_user(regs->r13, &sc->r13); - err |= __put_user(regs->r14, &sc->r14); - err |= __put_user(regs->r15, &sc->r15); -#endif /* CONFIG_X86_64 */ - - err |= __put_user(current->thread.trap_no, &sc->trapno); - err |= __put_user(current->thread.error_code, &sc->err); - err |= __put_user(regs->ip, &sc->ip); -#ifdef CONFIG_X86_32 - err |= __put_user(regs->cs, (unsigned int __user *)&sc->cs); - err |= __put_user(regs->flags, &sc->flags); - err |= __put_user(regs->sp, &sc->sp_at_signal); - err |= __put_user(regs->ss, (unsigned int __user *)&sc->ss); -#else /* !CONFIG_X86_32 */ - err |= __put_user(regs->flags, &sc->flags); - err |= __put_user(regs->cs, &sc->cs); - err |= __put_user(0, &sc->gs); - err |= __put_user(0, &sc->fs); -#endif /* CONFIG_X86_32 */ - - err |= __put_user(fpstate, &sc->fpstate); - - /* non-iBCS2 extensions.. 
*/ - err |= __put_user(mask, &sc->oldmask); - err |= __put_user(current->thread.cr2, &sc->cr2); - - return err; -} /* * Determine which stack to use.. diff --git a/arch/x86/kernel/signal_64.c b/arch/x86/kernel/signal_64.c index 32718f5e4f61..771c8fcc8b0d 100644 --- a/arch/x86/kernel/signal_64.c +++ b/arch/x86/kernel/signal_64.c @@ -50,28 +50,6 @@ # define FIX_EFLAGS __FIX_EFLAGS #endif -#ifdef CONFIG_X86_32 -asmlinkage int sys_sigaltstack(unsigned long bx) -{ - /* - * This is needed to make gcc realize it doesn't own the - * "struct pt_regs" - */ - struct pt_regs *regs = (struct pt_regs *)&bx; - const stack_t __user *uss = (const stack_t __user *)bx; - stack_t __user *uoss = (stack_t __user *)regs->cx; - - return do_sigaltstack(uss, uoss, regs->sp); -} -#else /* !CONFIG_X86_32 */ -asmlinkage long -sys_sigaltstack(const stack_t __user *uss, stack_t __user *uoss, - struct pt_regs *regs) -{ - return do_sigaltstack(uss, uoss, regs->sp); -} -#endif /* CONFIG_X86_32 */ - #define COPY(x) { \ err |= __get_user(regs->x, &sc->x); \ } @@ -82,9 +60,6 @@ sys_sigaltstack(const stack_t __user *uss, stack_t __user *uoss, regs->seg = tmp | 3; \ } -/* - * Do a signal return; undo the signal stack. 
- */ static int restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, unsigned long *pax) @@ -138,54 +113,6 @@ restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, return err; } -static long do_rt_sigreturn(struct pt_regs *regs) -{ - struct rt_sigframe __user *frame; - unsigned long ax; - sigset_t set; - - frame = (struct rt_sigframe __user *)(regs->sp - sizeof(long)); - if (!access_ok(VERIFY_READ, frame, sizeof(*frame))) - goto badframe; - if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set))) - goto badframe; - - sigdelsetmask(&set, ~_BLOCKABLE); - spin_lock_irq(&current->sighand->siglock); - current->blocked = set; - recalc_sigpending(); - spin_unlock_irq(&current->sighand->siglock); - - if (restore_sigcontext(regs, &frame->uc.uc_mcontext, &ax)) - goto badframe; - - if (do_sigaltstack(&frame->uc.uc_stack, NULL, regs->sp) == -EFAULT) - goto badframe; - - return ax; - -badframe: - signal_fault(regs, frame, "rt_sigreturn"); - return 0; -} - -#ifdef CONFIG_X86_32 -asmlinkage int sys_rt_sigreturn(unsigned long __unused) -{ - struct pt_regs *regs = (struct pt_regs *)&__unused; - - return do_rt_sigreturn(regs); -} -#else /* !CONFIG_X86_32 */ -asmlinkage long sys_rt_sigreturn(struct pt_regs *regs) -{ - return do_rt_sigreturn(regs); -} -#endif /* CONFIG_X86_32 */ - -/* - * Set up a signal frame.
- */ static int setup_sigcontext(struct sigcontext __user *sc, void __user *fpstate, struct pt_regs *regs, unsigned long mask) @@ -247,10 +174,83 @@ setup_sigcontext(struct sigcontext __user *sc, void __user *fpstate, return err; } +#ifdef CONFIG_X86_32 +asmlinkage int sys_sigaltstack(unsigned long bx) +{ + /* + * This is needed to make gcc realize it doesn't own the + * "struct pt_regs" + */ + struct pt_regs *regs = (struct pt_regs *)&bx; + const stack_t __user *uss = (const stack_t __user *)bx; + stack_t __user *uoss = (stack_t __user *)regs->cx; + + return do_sigaltstack(uss, uoss, regs->sp); +} +#else /* !CONFIG_X86_32 */ +asmlinkage long +sys_sigaltstack(const stack_t __user *uss, stack_t __user *uoss, + struct pt_regs *regs) +{ + return do_sigaltstack(uss, uoss, regs->sp); +} +#endif /* CONFIG_X86_32 */ + /* - * Determine which stack to use.. + * Do a signal return; undo the signal stack. + */ +static long do_rt_sigreturn(struct pt_regs *regs) +{ + struct rt_sigframe __user *frame; + unsigned long ax; + sigset_t set; + + frame = (struct rt_sigframe __user *)(regs->sp - sizeof(long)); + if (!access_ok(VERIFY_READ, frame, sizeof(*frame))) + goto badframe; + if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set))) + goto badframe; + + sigdelsetmask(&set, ~_BLOCKABLE); + spin_lock_irq(&current->sighand->siglock); + current->blocked = set; + recalc_sigpending(); + spin_unlock_irq(&current->sighand->siglock); + + if (restore_sigcontext(regs, &frame->uc.uc_mcontext, &ax)) + goto badframe; + + if (do_sigaltstack(&frame->uc.uc_stack, NULL, regs->sp) == -EFAULT) + goto badframe; + + return ax; + +badframe: + signal_fault(regs, frame, "rt_sigreturn"); + return 0; +} + +#ifdef CONFIG_X86_32 +asmlinkage int sys_rt_sigreturn(unsigned long __unused) +{ + struct pt_regs *regs = (struct pt_regs *)&__unused; + + return do_rt_sigreturn(regs); +} +#else /* !CONFIG_X86_32 */ +asmlinkage long sys_rt_sigreturn(struct pt_regs *regs) +{ + return do_rt_sigreturn(regs); +} +#endif /*
CONFIG_X86_32 */ + +/* + * Set up a signal frame. */ +/* + * Determine which stack to use.. + */ static void __user * get_stack(struct k_sigaction *ka, unsigned long sp, unsigned long size) { -- cgit v1.2.3 From bfeb91a9435889ef4fe7bfbb4b673f625e69e790 Mon Sep 17 00:00:00 2001 From: Hiroshi Shimamoto Date: Mon, 24 Nov 2008 18:23:12 -0800 Subject: x86: signal: cosmetic unification of __setup_sigframe() and __setup_rt_sigframe() Impact: cleanup Add #ifdef directive to unify __setup_sigframe() and __setup_rt_sigframe(). Move them after {setup|restore}_sigcontext() declaration. Signed-off-by: Hiroshi Shimamoto Signed-off-by: Ingo Molnar --- arch/x86/kernel/signal_32.c | 469 ++++++++++++++++++++++++++------------------ arch/x86/kernel/signal_64.c | 309 ++++++++++++++++++++++++----- 2 files changed, 536 insertions(+), 242 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/signal_32.c b/arch/x86/kernel/signal_32.c index b3f30d2a2178..e9f71298e746 100644 --- a/arch/x86/kernel/signal_32.c +++ b/arch/x86/kernel/signal_32.c @@ -48,28 +48,6 @@ # define FIX_EFLAGS __FIX_EFLAGS #endif -static const struct { - u16 poplmovl; - u32 val; - u16 int80; -} __attribute__((packed)) retcode = { - 0xb858, /* popl %eax; movl $..., %eax */ - __NR_sigreturn, - 0x80cd, /* int $0x80 */ -}; - -static const struct { - u8 movl; - u32 val; - u16 int80; - u8 pad; -} __attribute__((packed)) rt_retcode = { - 0xb8, /* movl $..., %eax */ - __NR_rt_sigreturn, - 0x80cd, /* int $0x80 */ - 0 -}; - #define COPY(x) { \ err |= __get_user(regs->x, &sc->x); \ } @@ -207,176 +185,30 @@ setup_sigcontext(struct sigcontext __user *sc, void __user *fpstate, } /* - * Atomically swap in the new signal mask, and wait for a signal. + * Set up a signal frame. 
*/ -asmlinkage int -sys_sigsuspend(int history0, int history1, old_sigset_t mask) -{ - mask &= _BLOCKABLE; - spin_lock_irq(&current->sighand->siglock); - current->saved_sigmask = current->blocked; - siginitset(&current->blocked, mask); - recalc_sigpending(); - spin_unlock_irq(&current->sighand->siglock); - - current->state = TASK_INTERRUPTIBLE; - schedule(); - set_restore_sigmask(); - - return -ERESTARTNOHAND; -} - -asmlinkage int -sys_sigaction(int sig, const struct old_sigaction __user *act, - struct old_sigaction __user *oact) -{ - struct k_sigaction new_ka, old_ka; - int ret; - - if (act) { - old_sigset_t mask; - - if (!access_ok(VERIFY_READ, act, sizeof(*act)) || - __get_user(new_ka.sa.sa_handler, &act->sa_handler) || - __get_user(new_ka.sa.sa_restorer, &act->sa_restorer)) - return -EFAULT; - - __get_user(new_ka.sa.sa_flags, &act->sa_flags); - __get_user(mask, &act->sa_mask); - siginitset(&new_ka.sa.sa_mask, mask); - } - - ret = do_sigaction(sig, act ? &new_ka : NULL, oact ? &old_ka : NULL); - - if (!ret && oact) { - if (!access_ok(VERIFY_WRITE, oact, sizeof(*oact)) || - __put_user(old_ka.sa.sa_handler, &oact->sa_handler) || - __put_user(old_ka.sa.sa_restorer, &oact->sa_restorer)) - return -EFAULT; - - __put_user(old_ka.sa.sa_flags, &oact->sa_flags); - __put_user(old_ka.sa.sa_mask.sig[0], &oact->sa_mask); - } - - return ret; -} - #ifdef CONFIG_X86_32 -asmlinkage int sys_sigaltstack(unsigned long bx) -{ - /* - * This is needed to make gcc realize it doesn't own the - * "struct pt_regs" - */ - struct pt_regs *regs = (struct pt_regs *)&bx; - const stack_t __user *uss = (const stack_t __user *)bx; - stack_t __user *uoss = (stack_t __user *)regs->cx; - - return do_sigaltstack(uss, uoss, regs->sp); -} -#else /* !CONFIG_X86_32 */ -asmlinkage long -sys_sigaltstack(const stack_t __user *uss, stack_t __user *uoss, - struct pt_regs *regs) -{ - return do_sigaltstack(uss, uoss, regs->sp); -} -#endif /* CONFIG_X86_32 */ - -/* - * Do a signal return; undo the signal stack.
- */ -asmlinkage unsigned long sys_sigreturn(unsigned long __unused) -{ - struct sigframe __user *frame; - struct pt_regs *regs; - unsigned long ax; - sigset_t set; - - regs = (struct pt_regs *) &__unused; - frame = (struct sigframe __user *)(regs->sp - 8); - - if (!access_ok(VERIFY_READ, frame, sizeof(*frame))) - goto badframe; - if (__get_user(set.sig[0], &frame->sc.oldmask) || (_NSIG_WORDS > 1 - && __copy_from_user(&set.sig[1], &frame->extramask, - sizeof(frame->extramask)))) - goto badframe; - - sigdelsetmask(&set, ~_BLOCKABLE); - spin_lock_irq(&current->sighand->siglock); - current->blocked = set; - recalc_sigpending(); - spin_unlock_irq(&current->sighand->siglock); - - if (restore_sigcontext(regs, &frame->sc, &ax)) - goto badframe; - return ax; - -badframe: - if (show_unhandled_signals && printk_ratelimit()) { - printk("%s%s[%d] bad frame in sigreturn frame:" - "%p ip:%lx sp:%lx oeax:%lx", - task_pid_nr(current) > 1 ? KERN_INFO : KERN_EMERG, - current->comm, task_pid_nr(current), frame, regs->ip, - regs->sp, regs->orig_ax); - print_vma_addr(" in ", regs->ip); - printk(KERN_CONT "\n"); - } - - force_sig(SIGSEGV, current); - - return 0; -} - -static long do_rt_sigreturn(struct pt_regs *regs) -{ - struct rt_sigframe __user *frame; - unsigned long ax; - sigset_t set; - - frame = (struct rt_sigframe __user *)(regs->sp - sizeof(long)); - if (!access_ok(VERIFY_READ, frame, sizeof(*frame))) - goto badframe; - if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set))) - goto badframe; - - sigdelsetmask(&set, ~_BLOCKABLE); - spin_lock_irq(&current->sighand->siglock); - current->blocked = set; - recalc_sigpending(); - spin_unlock_irq(&current->sighand->siglock); - - if (restore_sigcontext(regs, &frame->uc.uc_mcontext, &ax)) - goto badframe; - - if (do_sigaltstack(&frame->uc.uc_stack, NULL, regs->sp) == -EFAULT) - goto badframe; - - return ax; - -badframe: - signal_fault(regs, frame, "rt_sigreturn"); - return 0; -} - -#ifdef CONFIG_X86_32 -asmlinkage int sys_rt_sigreturn(unsigned long
__unused) -{ - struct pt_regs *regs = (struct pt_regs *)&__unused; - - return do_rt_sigreturn(regs); -} -#else /* !CONFIG_X86_32 */ -asmlinkage long sys_rt_sigreturn(struct pt_regs *regs) -{ - return do_rt_sigreturn(regs); -} -#endif /* CONFIG_X86_32 */ +static const struct { + u16 poplmovl; + u32 val; + u16 int80; +} __attribute__((packed)) retcode = { + 0xb858, /* popl %eax; movl $..., %eax */ + __NR_sigreturn, + 0x80cd, /* int $0x80 */ +}; -/* - * Set up a signal frame. - */ +static const struct { + u8 movl; + u32 val; + u16 int80; + u8 pad; +} __attribute__((packed)) rt_retcode = { + 0xb8, /* movl $..., %eax */ + __NR_rt_sigreturn, + 0x80cd, /* int $0x80 */ + 0 +}; /* * Determine which stack to use.. @@ -557,6 +389,265 @@ static int __setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, return 0; } +#else /* !CONFIG_X86_32 */ +/* + * Determine which stack to use.. + */ +static void __user * +get_stack(struct k_sigaction *ka, unsigned long sp, unsigned long size) +{ + /* Default to using normal stack - redzone*/ + sp -= 128; + + /* This is the X/Open sanctioned signal stack switching. 
*/ + if (ka->sa.sa_flags & SA_ONSTACK) { + if (sas_ss_flags(sp) == 0) + sp = current->sas_ss_sp + current->sas_ss_size; + } + + return (void __user *)round_down(sp - size, 64); +} + +static int __setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, + sigset_t *set, struct pt_regs *regs) +{ + struct rt_sigframe __user *frame; + void __user *fp = NULL; + int err = 0; + struct task_struct *me = current; + + if (used_math()) { + fp = get_stack(ka, regs->sp, sig_xstate_size); + frame = (void __user *)round_down( + (unsigned long)fp - sizeof(struct rt_sigframe), 16) - 8; + + if (save_i387_xstate(fp) < 0) + return -EFAULT; + } else + frame = get_stack(ka, regs->sp, sizeof(struct rt_sigframe)) - 8; + + if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame))) + return -EFAULT; + + if (ka->sa.sa_flags & SA_SIGINFO) { + if (copy_siginfo_to_user(&frame->info, info)) + return -EFAULT; + } + + /* Create the ucontext. */ + if (cpu_has_xsave) + err |= __put_user(UC_FP_XSTATE, &frame->uc.uc_flags); + else + err |= __put_user(0, &frame->uc.uc_flags); + err |= __put_user(0, &frame->uc.uc_link); + err |= __put_user(me->sas_ss_sp, &frame->uc.uc_stack.ss_sp); + err |= __put_user(sas_ss_flags(regs->sp), + &frame->uc.uc_stack.ss_flags); + err |= __put_user(me->sas_ss_size, &frame->uc.uc_stack.ss_size); + err |= setup_sigcontext(&frame->uc.uc_mcontext, fp, regs, set->sig[0]); + err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set)); + + /* Set up to return from userspace. If provided, use a stub + already in userspace. */ + /* x86-64 should always use SA_RESTORER. 
*/ + if (ka->sa.sa_flags & SA_RESTORER) { + err |= __put_user(ka->sa.sa_restorer, &frame->pretcode); + } else { + /* could use a vstub here */ + return -EFAULT; + } + + if (err) + return -EFAULT; + + /* Set up registers for signal handler */ + regs->di = sig; + /* In case the signal handler was declared without prototypes */ + regs->ax = 0; + + /* This also works for non SA_SIGINFO handlers because they expect the + next argument after the signal number on the stack. */ + regs->si = (unsigned long)&frame->info; + regs->dx = (unsigned long)&frame->uc; + regs->ip = (unsigned long) ka->sa.sa_handler; + + regs->sp = (unsigned long)frame; + + /* Set up the CS register to run signal handlers in 64-bit mode, + even if the handler happens to be interrupting 32-bit code. */ + regs->cs = __USER_CS; + + return 0; +} +#endif /* CONFIG_X86_32 */ + +/* + * Atomically swap in the new signal mask, and wait for a signal. + */ +asmlinkage int +sys_sigsuspend(int history0, int history1, old_sigset_t mask) +{ + mask &= _BLOCKABLE; + spin_lock_irq(&current->sighand->siglock); + current->saved_sigmask = current->blocked; + siginitset(&current->blocked, mask); + recalc_sigpending(); + spin_unlock_irq(&current->sighand->siglock); + + current->state = TASK_INTERRUPTIBLE; + schedule(); + set_restore_sigmask(); + + return -ERESTARTNOHAND; +} + +asmlinkage int +sys_sigaction(int sig, const struct old_sigaction __user *act, + struct old_sigaction __user *oact) +{ + struct k_sigaction new_ka, old_ka; + int ret; + + if (act) { + old_sigset_t mask; + + if (!access_ok(VERIFY_READ, act, sizeof(*act)) || + __get_user(new_ka.sa.sa_handler, &act->sa_handler) || + __get_user(new_ka.sa.sa_restorer, &act->sa_restorer)) + return -EFAULT; + + __get_user(new_ka.sa.sa_flags, &act->sa_flags); + __get_user(mask, &act->sa_mask); + siginitset(&new_ka.sa.sa_mask, mask); + } + + ret = do_sigaction(sig, act ? &new_ka : NULL, oact ?
&old_ka : NULL); + + if (!ret && oact) { + if (!access_ok(VERIFY_WRITE, oact, sizeof(*oact)) || + __put_user(old_ka.sa.sa_handler, &oact->sa_handler) || + __put_user(old_ka.sa.sa_restorer, &oact->sa_restorer)) + return -EFAULT; + + __put_user(old_ka.sa.sa_flags, &oact->sa_flags); + __put_user(old_ka.sa.sa_mask.sig[0], &oact->sa_mask); + } + + return ret; +} + +#ifdef CONFIG_X86_32 +asmlinkage int sys_sigaltstack(unsigned long bx) +{ + /* + * This is needed to make gcc realize it doesn't own the + * "struct pt_regs" + */ + struct pt_regs *regs = (struct pt_regs *)&bx; + const stack_t __user *uss = (const stack_t __user *)bx; + stack_t __user *uoss = (stack_t __user *)regs->cx; + + return do_sigaltstack(uss, uoss, regs->sp); +} +#else /* !CONFIG_X86_32 */ +asmlinkage long +sys_sigaltstack(const stack_t __user *uss, stack_t __user *uoss, + struct pt_regs *regs) +{ + return do_sigaltstack(uss, uoss, regs->sp); +} +#endif /* CONFIG_X86_32 */ + +/* + * Do a signal return; undo the signal stack. + */ +asmlinkage unsigned long sys_sigreturn(unsigned long __unused) +{ + struct sigframe __user *frame; + struct pt_regs *regs; + unsigned long ax; + sigset_t set; + + regs = (struct pt_regs *) &__unused; + frame = (struct sigframe __user *)(regs->sp - 8); + + if (!access_ok(VERIFY_READ, frame, sizeof(*frame))) + goto badframe; + if (__get_user(set.sig[0], &frame->sc.oldmask) || (_NSIG_WORDS > 1 + && __copy_from_user(&set.sig[1], &frame->extramask, + sizeof(frame->extramask)))) + goto badframe; + + sigdelsetmask(&set, ~_BLOCKABLE); + spin_lock_irq(&current->sighand->siglock); + current->blocked = set; + recalc_sigpending(); + spin_unlock_irq(&current->sighand->siglock); + + if (restore_sigcontext(regs, &frame->sc, &ax)) + goto badframe; + return ax; + +badframe: + if (show_unhandled_signals && printk_ratelimit()) { + printk("%s%s[%d] bad frame in sigreturn frame:" + "%p ip:%lx sp:%lx oeax:%lx", + task_pid_nr(current) > 1 ?
KERN_INFO : KERN_EMERG, + current->comm, task_pid_nr(current), frame, regs->ip, + regs->sp, regs->orig_ax); + print_vma_addr(" in ", regs->ip); + printk(KERN_CONT "\n"); + } + + force_sig(SIGSEGV, current); + + return 0; +} + +static long do_rt_sigreturn(struct pt_regs *regs) +{ + struct rt_sigframe __user *frame; + unsigned long ax; + sigset_t set; + + frame = (struct rt_sigframe __user *)(regs->sp - sizeof(long)); + if (!access_ok(VERIFY_READ, frame, sizeof(*frame))) + goto badframe; + if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set))) + goto badframe; + + sigdelsetmask(&set, ~_BLOCKABLE); + spin_lock_irq(&current->sighand->siglock); + current->blocked = set; + recalc_sigpending(); + spin_unlock_irq(&current->sighand->siglock); + + if (restore_sigcontext(regs, &frame->uc.uc_mcontext, &ax)) + goto badframe; + + if (do_sigaltstack(&frame->uc.uc_stack, NULL, regs->sp) == -EFAULT) + goto badframe; + + return ax; + +badframe: + signal_fault(regs, frame, "rt_sigreturn"); + return 0; +} + +#ifdef CONFIG_X86_32 +asmlinkage int sys_rt_sigreturn(unsigned long __unused) +{ + struct pt_regs *regs = (struct pt_regs *)&__unused; + + return do_rt_sigreturn(regs); +} +#else /* !CONFIG_X86_32 */ +asmlinkage long sys_rt_sigreturn(struct pt_regs *regs) +{ + return do_rt_sigreturn(regs); +} +#endif /* CONFIG_X86_32 */ /* * OK, we're invoking a handler: diff --git a/arch/x86/kernel/signal_64.c b/arch/x86/kernel/signal_64.c index 771c8fcc8b0d..2da7e6e60807 100644 --- a/arch/x86/kernel/signal_64.c +++ b/arch/x86/kernel/signal_64.c @@ -174,80 +174,212 @@ setup_sigcontext(struct sigcontext __user *sc, void __user *fpstate, return err; } +/* + * Set up a signal frame.
+ */ #ifdef CONFIG_X86_32 -asmlinkage int sys_sigaltstack(unsigned long bx) +static const struct { + u16 poplmovl; + u32 val; + u16 int80; +} __attribute__((packed)) retcode = { + 0xb858, /* popl %eax; movl $..., %eax */ + __NR_sigreturn, + 0x80cd, /* int $0x80 */ +}; + +static const struct { + u8 movl; + u32 val; + u16 int80; + u8 pad; +} __attribute__((packed)) rt_retcode = { + 0xb8, /* movl $..., %eax */ + __NR_rt_sigreturn, + 0x80cd, /* int $0x80 */ + 0 +}; + +/* + * Determine which stack to use.. + */ +static inline void __user * +get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size, + void **fpstate) { + unsigned long sp; + + /* Default to using normal stack */ + sp = regs->sp; + /* - * This is needed to make gcc realize it doesn't own the - * "struct pt_regs" + * If we are on the alternate signal stack and would overflow it, don't. + * Return an always-bogus address instead so we will die with SIGSEGV. */ - struct pt_regs *regs = (struct pt_regs *)&bx; - const stack_t __user *uss = (const stack_t __user *)bx; - stack_t __user *uoss = (stack_t __user *)regs->cx; + if (on_sig_stack(sp) && !likely(on_sig_stack(sp - frame_size))) + return (void __user *) -1L; - return do_sigaltstack(uss, uoss, regs->sp); -} -#else /* !CONFIG_X86_32 */ -asmlinkage long -sys_sigaltstack(const stack_t __user *uss, stack_t __user *uoss, - struct pt_regs *regs) -{ - return do_sigaltstack(uss, uoss, regs->sp); + /* This is the X/Open sanctioned signal stack switching. */ + if (ka->sa.sa_flags & SA_ONSTACK) { + if (sas_ss_flags(sp) == 0) + sp = current->sas_ss_sp + current->sas_ss_size; + } else { + /* This is the legacy signal stack switching. 
*/ + if ((regs->ss & 0xffff) != __USER_DS && + !(ka->sa.sa_flags & SA_RESTORER) && + ka->sa.sa_restorer) + sp = (unsigned long) ka->sa.sa_restorer; + } + + if (used_math()) { + sp = sp - sig_xstate_size; + *fpstate = (struct _fpstate *) sp; + if (save_i387_xstate(*fpstate) < 0) + return (void __user *)-1L; + } + + sp -= frame_size; + /* + * Align the stack pointer according to the i386 ABI, + * i.e. so that on function entry ((sp + 4) & 15) == 0. + */ + sp = ((sp + 4) & -16ul) - 4; + + return (void __user *) sp; } -#endif /* CONFIG_X86_32 */ -/* - * Do a signal return; undo the signal stack. - */ -static long do_rt_sigreturn(struct pt_regs *regs) +static int +__setup_frame(int sig, struct k_sigaction *ka, sigset_t *set, + struct pt_regs *regs) { - struct rt_sigframe __user *frame; - unsigned long ax; - sigset_t set; + struct sigframe __user *frame; + void __user *restorer; + int err = 0; + void __user *fpstate = NULL; - frame = (struct rt_sigframe __user *)(regs->sp - sizeof(long)); - if (!access_ok(VERIFY_READ, frame, sizeof(*frame))) - goto badframe; - if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set))) - goto badframe; + frame = get_sigframe(ka, regs, sizeof(*frame), &fpstate); - sigdelsetmask(&set, ~_BLOCKABLE); - spin_lock_irq(&current->sighand->siglock); - current->blocked = set; - recalc_sigpending(); - spin_unlock_irq(&current->sighand->siglock); + if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame))) + return -EFAULT; - if (restore_sigcontext(regs, &frame->uc.uc_mcontext, &ax)) - goto badframe; + if (__put_user(sig, &frame->sig)) + return -EFAULT; - if (do_sigaltstack(&frame->uc.uc_stack, NULL, regs->sp) == -EFAULT) - goto badframe; + if (setup_sigcontext(&frame->sc, fpstate, regs, set->sig[0])) + return -EFAULT; - return ax; + if (_NSIG_WORDS > 1) { + if (__copy_to_user(&frame->extramask, &set->sig[1], + sizeof(frame->extramask))) + return -EFAULT; + } + + if (current->mm->context.vdso) + restorer = VDSO32_SYMBOL(current->mm->context.vdso, sigreturn); +
else + restorer = &frame->retcode; + if (ka->sa.sa_flags & SA_RESTORER) + restorer = ka->sa.sa_restorer; + + /* Set up to return from userspace. */ + err |= __put_user(restorer, &frame->pretcode); + + /* + * This is popl %eax ; movl $__NR_sigreturn, %eax ; int $0x80 + * + * WE DO NOT USE IT ANY MORE! It's only left here for historical + * reasons and because gdb uses it as a signature to notice + * signal handler stack frames. + */ + err |= __put_user(*((u64 *)&retcode), (u64 *)frame->retcode); + + if (err) + return -EFAULT; + + /* Set up registers for signal handler */ + regs->sp = (unsigned long)frame; + regs->ip = (unsigned long)ka->sa.sa_handler; + regs->ax = (unsigned long)sig; + regs->dx = 0; + regs->cx = 0; + + regs->ds = __USER_DS; + regs->es = __USER_DS; + regs->ss = __USER_DS; + regs->cs = __USER_CS; -badframe: - signal_fault(regs, frame, "rt_sigreturn"); return 0; } -#ifdef CONFIG_X86_32 -asmlinkage int sys_rt_sigreturn(unsigned long __unused) +static int __setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, + sigset_t *set, struct pt_regs *regs) { - struct pt_regs *regs = (struct pt_regs *)&__unused; + struct rt_sigframe __user *frame; + void __user *restorer; + int err = 0; + void __user *fpstate = NULL; - return do_rt_sigreturn(regs); -} -#else /* !CONFIG_X86_32 */ -asmlinkage long sys_rt_sigreturn(struct pt_regs *regs) -{ - return do_rt_sigreturn(regs); -} -#endif /* CONFIG_X86_32 */ + frame = get_sigframe(ka, regs, sizeof(*frame), &fpstate); -/* - * Set up a signal frame. - */ + if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame))) + return -EFAULT; + + err |= __put_user(sig, &frame->sig); + err |= __put_user(&frame->info, &frame->pinfo); + err |= __put_user(&frame->uc, &frame->puc); + err |= copy_siginfo_to_user(&frame->info, info); + if (err) + return -EFAULT; + + /* Create the ucontext. 
*/ + if (cpu_has_xsave) + err |= __put_user(UC_FP_XSTATE, &frame->uc.uc_flags); + else + err |= __put_user(0, &frame->uc.uc_flags); + err |= __put_user(0, &frame->uc.uc_link); + err |= __put_user(current->sas_ss_sp, &frame->uc.uc_stack.ss_sp); + err |= __put_user(sas_ss_flags(regs->sp), + &frame->uc.uc_stack.ss_flags); + err |= __put_user(current->sas_ss_size, &frame->uc.uc_stack.ss_size); + err |= setup_sigcontext(&frame->uc.uc_mcontext, fpstate, + regs, set->sig[0]); + err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set)); + if (err) + return -EFAULT; + + /* Set up to return from userspace. */ + restorer = VDSO32_SYMBOL(current->mm->context.vdso, rt_sigreturn); + if (ka->sa.sa_flags & SA_RESTORER) + restorer = ka->sa.sa_restorer; + err |= __put_user(restorer, &frame->pretcode); + + /* + * This is movl $__NR_rt_sigreturn, %ax ; int $0x80 + * + * WE DO NOT USE IT ANY MORE! It's only left here for historical + * reasons and because gdb uses it as a signature to notice + * signal handler stack frames. + */ + err |= __put_user(*((u64 *)&rt_retcode), (u64 *)frame->retcode); + if (err) + return -EFAULT; + + /* Set up registers for signal handler */ + regs->sp = (unsigned long)frame; + regs->ip = (unsigned long)ka->sa.sa_handler; + regs->ax = (unsigned long)sig; + regs->dx = (unsigned long)&frame->info; + regs->cx = (unsigned long)&frame->uc; + + regs->ds = __USER_DS; + regs->es = __USER_DS; + regs->ss = __USER_DS; + regs->cs = __USER_CS; + + return 0; +} +#else /* !CONFIG_X86_32 */ /* * Determine which stack to use.. 
*/ @@ -337,6 +469,77 @@ static int __setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, return 0; } +#endif /* CONFIG_X86_32 */ + +#ifdef CONFIG_X86_32 +asmlinkage int sys_sigaltstack(unsigned long bx) +{ + /* + * This is needed to make gcc realize it doesn't own the + * "struct pt_regs" + */ + struct pt_regs *regs = (struct pt_regs *)&bx; + const stack_t __user *uss = (const stack_t __user *)bx; + stack_t __user *uoss = (stack_t __user *)regs->cx; + + return do_sigaltstack(uss, uoss, regs->sp); +} +#else /* !CONFIG_X86_32 */ +asmlinkage long +sys_sigaltstack(const stack_t __user *uss, stack_t __user *uoss, + struct pt_regs *regs) +{ + return do_sigaltstack(uss, uoss, regs->sp); +} +#endif /* CONFIG_X86_32 */ + +/* + * Do a signal return; undo the signal stack. + */ +static long do_rt_sigreturn(struct pt_regs *regs) +{ + struct rt_sigframe __user *frame; + unsigned long ax; + sigset_t set; + + frame = (struct rt_sigframe __user *)(regs->sp - sizeof(long)); + if (!access_ok(VERIFY_READ, frame, sizeof(*frame))) + goto badframe; + if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set))) + goto badframe; + + sigdelsetmask(&set, ~_BLOCKABLE); + spin_lock_irq(&current->sighand->siglock); + current->blocked = set; + recalc_sigpending(); + spin_unlock_irq(&current->sighand->siglock); + + if (restore_sigcontext(regs, &frame->uc.uc_mcontext, &ax)) + goto badframe; + + if (do_sigaltstack(&frame->uc.uc_stack, NULL, regs->sp) == -EFAULT) + goto badframe; + + return ax; + +badframe: + signal_fault(regs, frame, "rt_sigreturn"); + return 0; +} + +#ifdef CONFIG_X86_32 +asmlinkage int sys_rt_sigreturn(unsigned long __unused) +{ + struct pt_regs *regs = (struct pt_regs *)&__unused; + + return do_rt_sigreturn(regs); +} +#else /* !CONFIG_X86_32 */ +asmlinkage long sys_rt_sigreturn(struct pt_regs *regs) +{ + return do_rt_sigreturn(regs); +} +#endif /* CONFIG_X86_32 */ /* * OK, we're invoking a handler -- cgit v1.2.3 From e5fa2d063cf2ca38eae5fb3469315db669d5c041 Mon Sep 17
00:00:00 2001 From: Hiroshi Shimamoto Date: Mon, 24 Nov 2008 18:24:11 -0800 Subject: x86: signal: unify signal_{32|64}.c, prepare Impact: cleanup Add #ifdef directive for 32-bit only code. Signed-off-by: Hiroshi Shimamoto Signed-off-by: Ingo Molnar --- arch/x86/kernel/signal_32.c | 6 +++ arch/x86/kernel/signal_64.c | 116 +++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 121 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/signal_32.c b/arch/x86/kernel/signal_32.c index e9f71298e746..b1f4d34e0a38 100644 --- a/arch/x86/kernel/signal_32.c +++ b/arch/x86/kernel/signal_32.c @@ -1,8 +1,10 @@ /* * Copyright (C) 1991, 1992 Linus Torvalds + * Copyright (C) 2000, 2001, 2002 Andi Kleen SuSE Labs * * 1997-11-28 Modified for POSIX.1b signals by Richard Henderson * 2000-06-20 Pentium III FXSR, SSE support by Gareth Hughes + * 2000-2002 x86-64 support by Andi Kleen */ #include @@ -481,6 +483,7 @@ static int __setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, } #endif /* CONFIG_X86_32 */ +#ifdef CONFIG_X86_32 /* * Atomically swap in the new signal mask, and wait for a signal. */ @@ -535,6 +538,7 @@ sys_sigaction(int sig, const struct old_sigaction __user *act, return ret; } +#endif /* CONFIG_X86_32 */ #ifdef CONFIG_X86_32 asmlinkage int sys_sigaltstack(unsigned long bx) @@ -561,6 +565,7 @@ sys_sigaltstack(const stack_t __user *uss, stack_t __user *uoss, /* * Do a signal return; undo the signal stack. 
*/ +#ifdef CONFIG_X86_32 asmlinkage unsigned long sys_sigreturn(unsigned long __unused) { struct sigframe __user *frame; @@ -603,6 +608,7 @@ badframe: return 0; } +#endif /* CONFIG_X86_32 */ static long do_rt_sigreturn(struct pt_regs *regs) { diff --git a/arch/x86/kernel/signal_64.c b/arch/x86/kernel/signal_64.c index 2da7e6e60807..b1f4d34e0a38 100644 --- a/arch/x86/kernel/signal_64.c +++ b/arch/x86/kernel/signal_64.c @@ -54,12 +54,24 @@ err |= __get_user(regs->x, &sc->x); \ } +#define COPY_SEG(seg) { \ + unsigned short tmp; \ + err |= __get_user(tmp, &sc->seg); \ + regs->seg = tmp; \ +} + #define COPY_SEG_CPL3(seg) { \ unsigned short tmp; \ err |= __get_user(tmp, &sc->seg); \ regs->seg = tmp | 3; \ } +#define GET_SEG(seg) { \ + unsigned short tmp; \ + err |= __get_user(tmp, &sc->seg); \ + loadsegment(seg, tmp); \ +} + static int restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, unsigned long *pax) @@ -471,6 +483,63 @@ static int __setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, } #endif /* CONFIG_X86_32 */ +#ifdef CONFIG_X86_32 +/* + * Atomically swap in the new signal mask, and wait for a signal. 
+ */ +asmlinkage int +sys_sigsuspend(int history0, int history1, old_sigset_t mask) +{ + mask &= _BLOCKABLE; + spin_lock_irq(&current->sighand->siglock); + current->saved_sigmask = current->blocked; + siginitset(&current->blocked, mask); + recalc_sigpending(); + spin_unlock_irq(&current->sighand->siglock); + + current->state = TASK_INTERRUPTIBLE; + schedule(); + set_restore_sigmask(); + + return -ERESTARTNOHAND; +} + +asmlinkage int +sys_sigaction(int sig, const struct old_sigaction __user *act, + struct old_sigaction __user *oact) +{ + struct k_sigaction new_ka, old_ka; + int ret; + + if (act) { + old_sigset_t mask; + + if (!access_ok(VERIFY_READ, act, sizeof(*act)) || + __get_user(new_ka.sa.sa_handler, &act->sa_handler) || + __get_user(new_ka.sa.sa_restorer, &act->sa_restorer)) + return -EFAULT; + + __get_user(new_ka.sa.sa_flags, &act->sa_flags); + __get_user(mask, &act->sa_mask); + siginitset(&new_ka.sa.sa_mask, mask); + } + + ret = do_sigaction(sig, act ? &new_ka : NULL, oact ? &old_ka : NULL); + + if (!ret && oact) { + if (!access_ok(VERIFY_WRITE, oact, sizeof(*oact)) || + __put_user(old_ka.sa.sa_handler, &oact->sa_handler) || + __put_user(old_ka.sa.sa_restorer, &oact->sa_restorer)) + return -EFAULT; + + __put_user(old_ka.sa.sa_flags, &oact->sa_flags); + __put_user(old_ka.sa.sa_mask.sig[0], &oact->sa_mask); + } + + return ret; +} +#endif /* CONFIG_X86_32 */ + #ifdef CONFIG_X86_32 asmlinkage int sys_sigaltstack(unsigned long bx) { @@ -496,6 +565,51 @@ sys_sigaltstack(const stack_t __user *uss, stack_t __user *uoss, /* * Do a signal return; undo the signal stack.
*/ +#ifdef CONFIG_X86_32 +asmlinkage unsigned long sys_sigreturn(unsigned long __unused) +{ + struct sigframe __user *frame; + struct pt_regs *regs; + unsigned long ax; + sigset_t set; + + regs = (struct pt_regs *) &__unused; + frame = (struct sigframe __user *)(regs->sp - 8); + + if (!access_ok(VERIFY_READ, frame, sizeof(*frame))) + goto badframe; + if (__get_user(set.sig[0], &frame->sc.oldmask) || (_NSIG_WORDS > 1 + && __copy_from_user(&set.sig[1], &frame->extramask, + sizeof(frame->extramask)))) + goto badframe; + + sigdelsetmask(&set, ~_BLOCKABLE); + spin_lock_irq(&current->sighand->siglock); + current->blocked = set; + recalc_sigpending(); + spin_unlock_irq(&current->sighand->siglock); + + if (restore_sigcontext(regs, &frame->sc, &ax)) + goto badframe; + return ax; + +badframe: + if (show_unhandled_signals && printk_ratelimit()) { + printk("%s%s[%d] bad frame in sigreturn frame:" + "%p ip:%lx sp:%lx oeax:%lx", + task_pid_nr(current) > 1 ? KERN_INFO : KERN_EMERG, + current->comm, task_pid_nr(current), frame, regs->ip, + regs->sp, regs->orig_ax); + print_vma_addr(" in ", regs->ip); + printk(KERN_CONT "\n"); + } + + force_sig(SIGSEGV, current); + + return 0; +} +#endif /* CONFIG_X86_32 */ + static long do_rt_sigreturn(struct pt_regs *regs) { struct rt_sigframe __user *frame; @@ -542,7 +656,7 @@ asmlinkage long sys_rt_sigreturn(struct pt_regs *regs) #endif /* CONFIG_X86_32 */ /* - * OK, we're invoking a handler + * OK, we're invoking a handler: */ static int signr_convert(int sig) { -- cgit v1.2.3 From 5ceb40da9bacc8b056805d72efb1a52502d56b6b Mon Sep 17 00:00:00 2001 From: Hiroshi Shimamoto Date: Mon, 24 Nov 2008 18:24:11 -0800 Subject: x86: signal: unify signal_{32|64}.c Impact: cleanup Unify signal_{32|64}.c! Mechanic unification - the two files are the same.
Signed-off-by: Hiroshi Shimamoto Signed-off-by: Ingo Molnar --- arch/x86/kernel/Makefile | 2 +- arch/x86/kernel/signal.c | 915 ++++++++++++++++++++++++++++++++++++++++++++ arch/x86/kernel/signal_32.c | 915 -------------------------------------------- arch/x86/kernel/signal_64.c | 915 -------------------------------------------- 4 files changed, 916 insertions(+), 1831 deletions(-) create mode 100644 arch/x86/kernel/signal.c delete mode 100644 arch/x86/kernel/signal_32.c delete mode 100644 arch/x86/kernel/signal_64.c (limited to 'arch/x86') diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index d7e5a58ee22f..ef28c210ebf8 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -22,7 +22,7 @@ CFLAGS_vsyscall_64.o := $(PROFILING) -g0 $(nostackp) CFLAGS_hpet.o := $(nostackp) CFLAGS_tsc.o := $(nostackp) -obj-y := process_$(BITS).o signal_$(BITS).o entry_$(BITS).o +obj-y := process_$(BITS).o signal.o entry_$(BITS).o obj-y += traps.o irq.o irq_$(BITS).o dumpstack_$(BITS).o obj-y += time_$(BITS).o ioport.o ldt.o obj-y += setup.o i8259.o irqinit_$(BITS).o setup_percpu.o diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c new file mode 100644 index 000000000000..b1f4d34e0a38 --- /dev/null +++ b/arch/x86/kernel/signal.c @@ -0,0 +1,915 @@ +/* + * Copyright (C) 1991, 1992 Linus Torvalds + * Copyright (C) 2000, 2001, 2002 Andi Kleen SuSE Labs + * + * 1997-11-28 Modified for POSIX.1b signals by Richard Henderson + * 2000-06-20 Pentium III FXSR, SSE support by Gareth Hughes + * 2000-2002 x86-64 support by Andi Kleen + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#ifdef CONFIG_X86_64 +#include +#include +#include +#endif /* CONFIG_X86_64 */ + +#include +#include + +#include "sigframe.h" + +#define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP))) + +#define __FIX_EFLAGS (X86_EFLAGS_AC | X86_EFLAGS_OF | \ 
+ X86_EFLAGS_DF | X86_EFLAGS_TF | X86_EFLAGS_SF | \ + X86_EFLAGS_ZF | X86_EFLAGS_AF | X86_EFLAGS_PF | \ + X86_EFLAGS_CF) + +#ifdef CONFIG_X86_32 +# define FIX_EFLAGS (__FIX_EFLAGS | X86_EFLAGS_RF) +#else +# define FIX_EFLAGS __FIX_EFLAGS +#endif + +#define COPY(x) { \ + err |= __get_user(regs->x, &sc->x); \ +} + +#define COPY_SEG(seg) { \ + unsigned short tmp; \ + err |= __get_user(tmp, &sc->seg); \ + regs->seg = tmp; \ +} + +#define COPY_SEG_CPL3(seg) { \ + unsigned short tmp; \ + err |= __get_user(tmp, &sc->seg); \ + regs->seg = tmp | 3; \ +} + +#define GET_SEG(seg) { \ + unsigned short tmp; \ + err |= __get_user(tmp, &sc->seg); \ + loadsegment(seg, tmp); \ +} + +static int +restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, + unsigned long *pax) +{ + void __user *buf; + unsigned int tmpflags; + unsigned int err = 0; + + /* Always make any pending restarted system calls return -EINTR */ + current_thread_info()->restart_block.fn = do_no_restart_syscall; + +#ifdef CONFIG_X86_32 + GET_SEG(gs); + COPY_SEG(fs); + COPY_SEG(es); + COPY_SEG(ds); +#endif /* CONFIG_X86_32 */ + + COPY(di); COPY(si); COPY(bp); COPY(sp); COPY(bx); + COPY(dx); COPY(cx); COPY(ip); + +#ifdef CONFIG_X86_64 + COPY(r8); + COPY(r9); + COPY(r10); + COPY(r11); + COPY(r12); + COPY(r13); + COPY(r14); + COPY(r15); +#endif /* CONFIG_X86_64 */ + +#ifdef CONFIG_X86_32 + COPY_SEG_CPL3(cs); + COPY_SEG_CPL3(ss); +#else /* !CONFIG_X86_32 */ + /* Kernel saves and restores only the CS segment register on signals, + * which is the bare minimum needed to allow mixed 32/64-bit code. + * App's signal handler can save/restore other segments if needed. 
*/ + COPY_SEG_CPL3(cs); +#endif /* CONFIG_X86_32 */ + + err |= __get_user(tmpflags, &sc->flags); + regs->flags = (regs->flags & ~FIX_EFLAGS) | (tmpflags & FIX_EFLAGS); + regs->orig_ax = -1; /* disable syscall checks */ + + err |= __get_user(buf, &sc->fpstate); + err |= restore_i387_xstate(buf); + + err |= __get_user(*pax, &sc->ax); + return err; +} + +static int +setup_sigcontext(struct sigcontext __user *sc, void __user *fpstate, + struct pt_regs *regs, unsigned long mask) +{ + int err = 0; + +#ifdef CONFIG_X86_32 + { + unsigned int tmp; + + savesegment(gs, tmp); + err |= __put_user(tmp, (unsigned int __user *)&sc->gs); + } + err |= __put_user(regs->fs, (unsigned int __user *)&sc->fs); + err |= __put_user(regs->es, (unsigned int __user *)&sc->es); + err |= __put_user(regs->ds, (unsigned int __user *)&sc->ds); +#endif /* CONFIG_X86_32 */ + + err |= __put_user(regs->di, &sc->di); + err |= __put_user(regs->si, &sc->si); + err |= __put_user(regs->bp, &sc->bp); + err |= __put_user(regs->sp, &sc->sp); + err |= __put_user(regs->bx, &sc->bx); + err |= __put_user(regs->dx, &sc->dx); + err |= __put_user(regs->cx, &sc->cx); + err |= __put_user(regs->ax, &sc->ax); +#ifdef CONFIG_X86_64 + err |= __put_user(regs->r8, &sc->r8); + err |= __put_user(regs->r9, &sc->r9); + err |= __put_user(regs->r10, &sc->r10); + err |= __put_user(regs->r11, &sc->r11); + err |= __put_user(regs->r12, &sc->r12); + err |= __put_user(regs->r13, &sc->r13); + err |= __put_user(regs->r14, &sc->r14); + err |= __put_user(regs->r15, &sc->r15); +#endif /* CONFIG_X86_64 */ + + err |= __put_user(current->thread.trap_no, &sc->trapno); + err |= __put_user(current->thread.error_code, &sc->err); + err |= __put_user(regs->ip, &sc->ip); +#ifdef CONFIG_X86_32 + err |= __put_user(regs->cs, (unsigned int __user *)&sc->cs); + err |= __put_user(regs->flags, &sc->flags); + err |= __put_user(regs->sp, &sc->sp_at_signal); + err |= __put_user(regs->ss, (unsigned int __user *)&sc->ss); +#else /* !CONFIG_X86_32 */ + err |= 
__put_user(regs->flags, &sc->flags); + err |= __put_user(regs->cs, &sc->cs); + err |= __put_user(0, &sc->gs); + err |= __put_user(0, &sc->fs); +#endif /* CONFIG_X86_32 */ + + err |= __put_user(fpstate, &sc->fpstate); + + /* non-iBCS2 extensions.. */ + err |= __put_user(mask, &sc->oldmask); + err |= __put_user(current->thread.cr2, &sc->cr2); + + return err; +} + +/* + * Set up a signal frame. + */ +#ifdef CONFIG_X86_32 +static const struct { + u16 poplmovl; + u32 val; + u16 int80; +} __attribute__((packed)) retcode = { + 0xb858, /* popl %eax; movl $..., %eax */ + __NR_sigreturn, + 0x80cd, /* int $0x80 */ +}; + +static const struct { + u8 movl; + u32 val; + u16 int80; + u8 pad; +} __attribute__((packed)) rt_retcode = { + 0xb8, /* movl $..., %eax */ + __NR_rt_sigreturn, + 0x80cd, /* int $0x80 */ + 0 +}; + +/* + * Determine which stack to use.. + */ +static inline void __user * +get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size, + void **fpstate) +{ + unsigned long sp; + + /* Default to using normal stack */ + sp = regs->sp; + + /* + * If we are on the alternate signal stack and would overflow it, don't. + * Return an always-bogus address instead so we will die with SIGSEGV. + */ + if (on_sig_stack(sp) && !likely(on_sig_stack(sp - frame_size))) + return (void __user *) -1L; + + /* This is the X/Open sanctioned signal stack switching. */ + if (ka->sa.sa_flags & SA_ONSTACK) { + if (sas_ss_flags(sp) == 0) + sp = current->sas_ss_sp + current->sas_ss_size; + } else { + /* This is the legacy signal stack switching. */ + if ((regs->ss & 0xffff) != __USER_DS && + !(ka->sa.sa_flags & SA_RESTORER) && + ka->sa.sa_restorer) + sp = (unsigned long) ka->sa.sa_restorer; + } + + if (used_math()) { + sp = sp - sig_xstate_size; + *fpstate = (struct _fpstate *) sp; + if (save_i387_xstate(*fpstate) < 0) + return (void __user *)-1L; + } + + sp -= frame_size; + /* + * Align the stack pointer according to the i386 ABI, + * i.e. 
so that on function entry ((sp + 4) & 15) == 0. + */ + sp = ((sp + 4) & -16ul) - 4; + + return (void __user *) sp; +} + +static int +__setup_frame(int sig, struct k_sigaction *ka, sigset_t *set, + struct pt_regs *regs) +{ + struct sigframe __user *frame; + void __user *restorer; + int err = 0; + void __user *fpstate = NULL; + + frame = get_sigframe(ka, regs, sizeof(*frame), &fpstate); + + if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame))) + return -EFAULT; + + if (__put_user(sig, &frame->sig)) + return -EFAULT; + + if (setup_sigcontext(&frame->sc, fpstate, regs, set->sig[0])) + return -EFAULT; + + if (_NSIG_WORDS > 1) { + if (__copy_to_user(&frame->extramask, &set->sig[1], + sizeof(frame->extramask))) + return -EFAULT; + } + + if (current->mm->context.vdso) + restorer = VDSO32_SYMBOL(current->mm->context.vdso, sigreturn); + else + restorer = &frame->retcode; + if (ka->sa.sa_flags & SA_RESTORER) + restorer = ka->sa.sa_restorer; + + /* Set up to return from userspace. */ + err |= __put_user(restorer, &frame->pretcode); + + /* + * This is popl %eax ; movl $__NR_sigreturn, %eax ; int $0x80 + * + * WE DO NOT USE IT ANY MORE! It's only left here for historical + * reasons and because gdb uses it as a signature to notice + * signal handler stack frames. 
+ */ + err |= __put_user(*((u64 *)&retcode), (u64 *)frame->retcode); + + if (err) + return -EFAULT; + + /* Set up registers for signal handler */ + regs->sp = (unsigned long)frame; + regs->ip = (unsigned long)ka->sa.sa_handler; + regs->ax = (unsigned long)sig; + regs->dx = 0; + regs->cx = 0; + + regs->ds = __USER_DS; + regs->es = __USER_DS; + regs->ss = __USER_DS; + regs->cs = __USER_CS; + + return 0; +} + +static int __setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, + sigset_t *set, struct pt_regs *regs) +{ + struct rt_sigframe __user *frame; + void __user *restorer; + int err = 0; + void __user *fpstate = NULL; + + frame = get_sigframe(ka, regs, sizeof(*frame), &fpstate); + + if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame))) + return -EFAULT; + + err |= __put_user(sig, &frame->sig); + err |= __put_user(&frame->info, &frame->pinfo); + err |= __put_user(&frame->uc, &frame->puc); + err |= copy_siginfo_to_user(&frame->info, info); + if (err) + return -EFAULT; + + /* Create the ucontext. */ + if (cpu_has_xsave) + err |= __put_user(UC_FP_XSTATE, &frame->uc.uc_flags); + else + err |= __put_user(0, &frame->uc.uc_flags); + err |= __put_user(0, &frame->uc.uc_link); + err |= __put_user(current->sas_ss_sp, &frame->uc.uc_stack.ss_sp); + err |= __put_user(sas_ss_flags(regs->sp), + &frame->uc.uc_stack.ss_flags); + err |= __put_user(current->sas_ss_size, &frame->uc.uc_stack.ss_size); + err |= setup_sigcontext(&frame->uc.uc_mcontext, fpstate, + regs, set->sig[0]); + err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set)); + if (err) + return -EFAULT; + + /* Set up to return from userspace. */ + restorer = VDSO32_SYMBOL(current->mm->context.vdso, rt_sigreturn); + if (ka->sa.sa_flags & SA_RESTORER) + restorer = ka->sa.sa_restorer; + err |= __put_user(restorer, &frame->pretcode); + + /* + * This is movl $__NR_rt_sigreturn, %ax ; int $0x80 + * + * WE DO NOT USE IT ANY MORE! 
It's only left here for historical + * reasons and because gdb uses it as a signature to notice + * signal handler stack frames. + */ + err |= __put_user(*((u64 *)&rt_retcode), (u64 *)frame->retcode); + + if (err) + return -EFAULT; + + /* Set up registers for signal handler */ + regs->sp = (unsigned long)frame; + regs->ip = (unsigned long)ka->sa.sa_handler; + regs->ax = (unsigned long)sig; + regs->dx = (unsigned long)&frame->info; + regs->cx = (unsigned long)&frame->uc; + + regs->ds = __USER_DS; + regs->es = __USER_DS; + regs->ss = __USER_DS; + regs->cs = __USER_CS; + + return 0; +} +#else /* !CONFIG_X86_32 */ +/* + * Determine which stack to use.. + */ +static void __user * +get_stack(struct k_sigaction *ka, unsigned long sp, unsigned long size) +{ + /* Default to using normal stack - redzone*/ + sp -= 128; + + /* This is the X/Open sanctioned signal stack switching. */ + if (ka->sa.sa_flags & SA_ONSTACK) { + if (sas_ss_flags(sp) == 0) + sp = current->sas_ss_sp + current->sas_ss_size; + } + + return (void __user *)round_down(sp - size, 64); +} + +static int __setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, + sigset_t *set, struct pt_regs *regs) +{ + struct rt_sigframe __user *frame; + void __user *fp = NULL; + int err = 0; + struct task_struct *me = current; + + if (used_math()) { + fp = get_stack(ka, regs->sp, sig_xstate_size); + frame = (void __user *)round_down( + (unsigned long)fp - sizeof(struct rt_sigframe), 16) - 8; + + if (save_i387_xstate(fp) < 0) + return -EFAULT; + } else + frame = get_stack(ka, regs->sp, sizeof(struct rt_sigframe)) - 8; + + if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame))) + return -EFAULT; + + if (ka->sa.sa_flags & SA_SIGINFO) { + if (copy_siginfo_to_user(&frame->info, info)) + return -EFAULT; + } + + /* Create the ucontext. 
*/ + if (cpu_has_xsave) + err |= __put_user(UC_FP_XSTATE, &frame->uc.uc_flags); + else + err |= __put_user(0, &frame->uc.uc_flags); + err |= __put_user(0, &frame->uc.uc_link); + err |= __put_user(me->sas_ss_sp, &frame->uc.uc_stack.ss_sp); + err |= __put_user(sas_ss_flags(regs->sp), + &frame->uc.uc_stack.ss_flags); + err |= __put_user(me->sas_ss_size, &frame->uc.uc_stack.ss_size); + err |= setup_sigcontext(&frame->uc.uc_mcontext, fp, regs, set->sig[0]); + err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set)); + + /* Set up to return from userspace. If provided, use a stub + already in userspace. */ + /* x86-64 should always use SA_RESTORER. */ + if (ka->sa.sa_flags & SA_RESTORER) { + err |= __put_user(ka->sa.sa_restorer, &frame->pretcode); + } else { + /* could use a vstub here */ + return -EFAULT; + } + + if (err) + return -EFAULT; + + /* Set up registers for signal handler */ + regs->di = sig; + /* In case the signal handler was declared without prototypes */ + regs->ax = 0; + + /* This also works for non SA_SIGINFO handlers because they expect the + next argument after the signal number on the stack. */ + regs->si = (unsigned long)&frame->info; + regs->dx = (unsigned long)&frame->uc; + regs->ip = (unsigned long) ka->sa.sa_handler; + + regs->sp = (unsigned long)frame; + + /* Set up the CS register to run signal handlers in 64-bit mode, + even if the handler happens to be interrupting 32-bit code. */ + regs->cs = __USER_CS; + + return 0; +} +#endif /* CONFIG_X86_32 */ + +#ifdef CONFIG_X86_32 +/* + * Atomically swap in the new signal mask, and wait for a signal. 
+ */ +asmlinkage int +sys_sigsuspend(int history0, int history1, old_sigset_t mask) +{ + mask &= _BLOCKABLE; + spin_lock_irq(&current->sighand->siglock); + current->saved_sigmask = current->blocked; + siginitset(&current->blocked, mask); + recalc_sigpending(); + spin_unlock_irq(&current->sighand->siglock); + + current->state = TASK_INTERRUPTIBLE; + schedule(); + set_restore_sigmask(); + + return -ERESTARTNOHAND; +} + +asmlinkage int +sys_sigaction(int sig, const struct old_sigaction __user *act, + struct old_sigaction __user *oact) +{ + struct k_sigaction new_ka, old_ka; + int ret; + + if (act) { + old_sigset_t mask; + + if (!access_ok(VERIFY_READ, act, sizeof(*act)) || + __get_user(new_ka.sa.sa_handler, &act->sa_handler) || + __get_user(new_ka.sa.sa_restorer, &act->sa_restorer)) + return -EFAULT; + + __get_user(new_ka.sa.sa_flags, &act->sa_flags); + __get_user(mask, &act->sa_mask); + siginitset(&new_ka.sa.sa_mask, mask); + } + + ret = do_sigaction(sig, act ? &new_ka : NULL, oact ? &old_ka : NULL); + + if (!ret && oact) { + if (!access_ok(VERIFY_WRITE, oact, sizeof(*oact)) || + __put_user(old_ka.sa.sa_handler, &oact->sa_handler) || + __put_user(old_ka.sa.sa_restorer, &oact->sa_restorer)) + return -EFAULT; + + __put_user(old_ka.sa.sa_flags, &oact->sa_flags); + __put_user(old_ka.sa.sa_mask.sig[0], &oact->sa_mask); + } + + return ret; +} +#endif /* CONFIG_X86_32 */ + +#ifdef CONFIG_X86_32 +asmlinkage int sys_sigaltstack(unsigned long bx) +{ + /* + * This is needed to make gcc realize it doesn't own the + * "struct pt_regs" + */ + struct pt_regs *regs = (struct pt_regs *)&bx; + const stack_t __user *uss = (const stack_t __user *)bx; + stack_t __user *uoss = (stack_t __user *)regs->cx; + + return do_sigaltstack(uss, uoss, regs->sp); +} +#else /* !CONFIG_X86_32 */ +asmlinkage long +sys_sigaltstack(const stack_t __user *uss, stack_t __user *uoss, + struct pt_regs *regs) +{ + return do_sigaltstack(uss, uoss, regs->sp); +} +#endif /* CONFIG_X86_32 */ + +/* + * Do a signal return; undo the
signal stack. + */ +#ifdef CONFIG_X86_32 +asmlinkage unsigned long sys_sigreturn(unsigned long __unused) +{ + struct sigframe __user *frame; + struct pt_regs *regs; + unsigned long ax; + sigset_t set; + + regs = (struct pt_regs *) &__unused; + frame = (struct sigframe __user *)(regs->sp - 8); + + if (!access_ok(VERIFY_READ, frame, sizeof(*frame))) + goto badframe; + if (__get_user(set.sig[0], &frame->sc.oldmask) || (_NSIG_WORDS > 1 + && __copy_from_user(&set.sig[1], &frame->extramask, + sizeof(frame->extramask)))) + goto badframe; + + sigdelsetmask(&set, ~_BLOCKABLE); + spin_lock_irq(&current->sighand->siglock); + current->blocked = set; + recalc_sigpending(); + spin_unlock_irq(&current->sighand->siglock); + + if (restore_sigcontext(regs, &frame->sc, &ax)) + goto badframe; + return ax; + +badframe: + if (show_unhandled_signals && printk_ratelimit()) { + printk("%s%s[%d] bad frame in sigreturn frame:" + "%p ip:%lx sp:%lx oeax:%lx", + task_pid_nr(current) > 1 ? KERN_INFO : KERN_EMERG, + current->comm, task_pid_nr(current), frame, regs->ip, + regs->sp, regs->orig_ax); + print_vma_addr(" in ", regs->ip); + printk(KERN_CONT "\n"); + } + + force_sig(SIGSEGV, current); + + return 0; +} +#endif /* CONFIG_X86_32 */ + +static long do_rt_sigreturn(struct pt_regs *regs) +{ + struct rt_sigframe __user *frame; + unsigned long ax; + sigset_t set; + + frame = (struct rt_sigframe __user *)(regs->sp - sizeof(long)); + if (!access_ok(VERIFY_READ, frame, sizeof(*frame))) + goto badframe; + if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set))) + goto badframe; + + sigdelsetmask(&set, ~_BLOCKABLE); + spin_lock_irq(&current->sighand->siglock); + current->blocked = set; + recalc_sigpending(); + spin_unlock_irq(&current->sighand->siglock); + + if (restore_sigcontext(regs, &frame->uc.uc_mcontext, &ax)) + goto badframe; + + if (do_sigaltstack(&frame->uc.uc_stack, NULL, regs->sp) == -EFAULT) + goto badframe; + + return ax; + +badframe: + signal_fault(regs, frame, "rt_sigreturn"); + return 0; +} + +#ifdef
CONFIG_X86_32 +asmlinkage int sys_rt_sigreturn(unsigned long __unused) +{ + struct pt_regs *regs = (struct pt_regs *)&__unused; + + return do_rt_sigreturn(regs); +} +#else /* !CONFIG_X86_32 */ +asmlinkage long sys_rt_sigreturn(struct pt_regs *regs) +{ + return do_rt_sigreturn(regs); +} +#endif /* CONFIG_X86_32 */ + +/* + * OK, we're invoking a handler: + */ +static int signr_convert(int sig) +{ +#ifdef CONFIG_X86_32 + struct thread_info *info = current_thread_info(); + + if (info->exec_domain && info->exec_domain->signal_invmap && sig < 32) + return info->exec_domain->signal_invmap[sig]; +#endif /* CONFIG_X86_32 */ + return sig; +} + +#ifdef CONFIG_X86_32 + +#define is_ia32 1 +#define ia32_setup_frame __setup_frame +#define ia32_setup_rt_frame __setup_rt_frame + +#else /* !CONFIG_X86_32 */ + +#ifdef CONFIG_IA32_EMULATION +#define is_ia32 test_thread_flag(TIF_IA32) +#else /* !CONFIG_IA32_EMULATION */ +#define is_ia32 0 +#endif /* CONFIG_IA32_EMULATION */ + +#endif /* CONFIG_X86_32 */ + +static int +setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, + sigset_t *set, struct pt_regs *regs) +{ + int usig = signr_convert(sig); + int ret; + + /* Set up the stack frame */ + if (is_ia32) { + if (ka->sa.sa_flags & SA_SIGINFO) + ret = ia32_setup_rt_frame(usig, ka, info, set, regs); + else + ret = ia32_setup_frame(usig, ka, set, regs); + } else + ret = __setup_rt_frame(sig, ka, info, set, regs); + + if (ret) { + force_sigsegv(sig, current); + return -EFAULT; + } + + return ret; +} + +static int +handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka, + sigset_t *oldset, struct pt_regs *regs) +{ + int ret; + + /* Are we from a system call? */ + if (syscall_get_nr(current, regs) >= 0) { + /* If so, check system call restarting.. 
*/ + switch (syscall_get_error(current, regs)) { + case -ERESTART_RESTARTBLOCK: + case -ERESTARTNOHAND: + regs->ax = -EINTR; + break; + + case -ERESTARTSYS: + if (!(ka->sa.sa_flags & SA_RESTART)) { + regs->ax = -EINTR; + break; + } + /* fallthrough */ + case -ERESTARTNOINTR: + regs->ax = regs->orig_ax; + regs->ip -= 2; + break; + } + } + + /* + * If TF is set due to a debugger (TIF_FORCED_TF), clear the TF + * flag so that register information in the sigcontext is correct. + */ + if (unlikely(regs->flags & X86_EFLAGS_TF) && + likely(test_and_clear_thread_flag(TIF_FORCED_TF))) + regs->flags &= ~X86_EFLAGS_TF; + + ret = setup_rt_frame(sig, ka, info, oldset, regs); + + if (ret) + return ret; + +#ifdef CONFIG_X86_64 + /* + * This has nothing to do with segment registers, + * despite the name. This magic affects uaccess.h + * macros' behavior. Reset it to the normal setting. + */ + set_fs(USER_DS); +#endif + + /* + * Clear the direction flag as per the ABI for function entry. + */ + regs->flags &= ~X86_EFLAGS_DF; + + /* + * Clear TF when entering the signal handler, but + * notify any tracer that was single-stepping it. + * The tracer may want to single-step inside the + * handler too. + */ + regs->flags &= ~X86_EFLAGS_TF; + + spin_lock_irq(&current->sighand->siglock); + sigorsets(&current->blocked, &current->blocked, &ka->sa.sa_mask); + if (!(ka->sa.sa_flags & SA_NODEFER)) + sigaddset(&current->blocked, sig); + recalc_sigpending(); + spin_unlock_irq(&current->sighand->siglock); + + tracehook_signal_handler(sig, info, ka, regs, + test_thread_flag(TIF_SINGLESTEP)); + + return 0; +} + +#ifdef CONFIG_X86_32 +#define NR_restart_syscall __NR_restart_syscall +#else /* !CONFIG_X86_32 */ +#define NR_restart_syscall \ + test_thread_flag(TIF_IA32) ? __NR_ia32_restart_syscall : __NR_restart_syscall +#endif /* CONFIG_X86_32 */ + +/* + * Note that 'init' is a special process: it doesn't get signals it doesn't + * want to handle. Thus you cannot kill init even with a SIGKILL even by + * mistake.
+ */ +static void do_signal(struct pt_regs *regs) +{ + struct k_sigaction ka; + siginfo_t info; + int signr; + sigset_t *oldset; + + /* + * We want the common case to go fast, which is why we may in certain + * cases get here from kernel mode. Just return without doing anything + * if so. + * X86_32: vm86 regs switched out by assembly code before reaching + * here, so testing against kernel CS suffices. + */ + if (!user_mode(regs)) + return; + + if (current_thread_info()->status & TS_RESTORE_SIGMASK) + oldset = &current->saved_sigmask; + else + oldset = &current->blocked; + + signr = get_signal_to_deliver(&info, &ka, regs, NULL); + if (signr > 0) { + /* + * Re-enable any watchpoints before delivering the + * signal to user space. The processor register will + * have been cleared if the watchpoint triggered + * inside the kernel. + */ + if (current->thread.debugreg7) + set_debugreg(current->thread.debugreg7, 7); + + /* Whee! Actually deliver the signal. */ + if (handle_signal(signr, &info, &ka, oldset, regs) == 0) { + /* + * A signal was successfully delivered; the saved + * sigmask will have been stored in the signal frame, + * and will be restored by sigreturn, so we can simply + * clear the TS_RESTORE_SIGMASK flag. + */ + current_thread_info()->status &= ~TS_RESTORE_SIGMASK; + } + return; + } + + /* Did we come from a system call? */ + if (syscall_get_nr(current, regs) >= 0) { + /* Restart the system call - no handlers present */ + switch (syscall_get_error(current, regs)) { + case -ERESTARTNOHAND: + case -ERESTARTSYS: + case -ERESTARTNOINTR: + regs->ax = regs->orig_ax; + regs->ip -= 2; + break; + + case -ERESTART_RESTARTBLOCK: + regs->ax = NR_restart_syscall; + regs->ip -= 2; + break; + } + } + + /* + * If there's no signal to deliver, we just put the saved sigmask + * back.
+ */ + if (current_thread_info()->status & TS_RESTORE_SIGMASK) { + current_thread_info()->status &= ~TS_RESTORE_SIGMASK; + sigprocmask(SIG_SETMASK, &current->saved_sigmask, NULL); + } +} + +/* + * notification of userspace execution resumption + * - triggered by the TIF_WORK_MASK flags + */ +void +do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags) +{ +#if defined(CONFIG_X86_64) && defined(CONFIG_X86_MCE) + /* notify userspace of pending MCEs */ + if (thread_info_flags & _TIF_MCE_NOTIFY) + mce_notify_user(); +#endif /* CONFIG_X86_64 && CONFIG_X86_MCE */ + + /* deal with pending signal delivery */ + if (thread_info_flags & _TIF_SIGPENDING) + do_signal(regs); + + if (thread_info_flags & _TIF_NOTIFY_RESUME) { + clear_thread_flag(TIF_NOTIFY_RESUME); + tracehook_notify_resume(regs); + } + +#ifdef CONFIG_X86_32 + clear_thread_flag(TIF_IRET); +#endif /* CONFIG_X86_32 */ +} + +void signal_fault(struct pt_regs *regs, void __user *frame, char *where) +{ + struct task_struct *me = current; + + if (show_unhandled_signals && printk_ratelimit()) { + printk(KERN_INFO + "%s[%d] bad frame in %s frame:%p ip:%lx sp:%lx orax:%lx", + me->comm, me->pid, where, frame, + regs->ip, regs->sp, regs->orig_ax); + print_vma_addr(" in ", regs->ip); + printk(KERN_CONT "\n"); + } + + force_sig(SIGSEGV, me); +} diff --git a/arch/x86/kernel/signal_32.c b/arch/x86/kernel/signal_32.c deleted file mode 100644 index b1f4d34e0a38..000000000000 --- a/arch/x86/kernel/signal_32.c +++ /dev/null @@ -1,915 +0,0 @@ -/* - * Copyright (C) 1991, 1992 Linus Torvalds - * Copyright (C) 2000, 2001, 2002 Andi Kleen SuSE Labs - * - * 1997-11-28 Modified for POSIX.1b signals by Richard Henderson - * 2000-06-20 Pentium III FXSR, SSE support by Gareth Hughes - * 2000-2002 x86-64 support by Andi Kleen - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include - -#ifdef CONFIG_X86_64
-#include -#include -#include -#endif /* CONFIG_X86_64 */ - -#include -#include - -#include "sigframe.h" - -#define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP))) - -#define __FIX_EFLAGS (X86_EFLAGS_AC | X86_EFLAGS_OF | \ - X86_EFLAGS_DF | X86_EFLAGS_TF | X86_EFLAGS_SF | \ - X86_EFLAGS_ZF | X86_EFLAGS_AF | X86_EFLAGS_PF | \ - X86_EFLAGS_CF) - -#ifdef CONFIG_X86_32 -# define FIX_EFLAGS (__FIX_EFLAGS | X86_EFLAGS_RF) -#else -# define FIX_EFLAGS __FIX_EFLAGS -#endif - -#define COPY(x) { \ - err |= __get_user(regs->x, &sc->x); \ -} - -#define COPY_SEG(seg) { \ - unsigned short tmp; \ - err |= __get_user(tmp, &sc->seg); \ - regs->seg = tmp; \ -} - -#define COPY_SEG_CPL3(seg) { \ - unsigned short tmp; \ - err |= __get_user(tmp, &sc->seg); \ - regs->seg = tmp | 3; \ -} - -#define GET_SEG(seg) { \ - unsigned short tmp; \ - err |= __get_user(tmp, &sc->seg); \ - loadsegment(seg, tmp); \ -} - -static int -restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, - unsigned long *pax) -{ - void __user *buf; - unsigned int tmpflags; - unsigned int err = 0; - - /* Always make any pending restarted system calls return -EINTR */ - current_thread_info()->restart_block.fn = do_no_restart_syscall; - -#ifdef CONFIG_X86_32 - GET_SEG(gs); - COPY_SEG(fs); - COPY_SEG(es); - COPY_SEG(ds); -#endif /* CONFIG_X86_32 */ - - COPY(di); COPY(si); COPY(bp); COPY(sp); COPY(bx); - COPY(dx); COPY(cx); COPY(ip); - -#ifdef CONFIG_X86_64 - COPY(r8); - COPY(r9); - COPY(r10); - COPY(r11); - COPY(r12); - COPY(r13); - COPY(r14); - COPY(r15); -#endif /* CONFIG_X86_64 */ - -#ifdef CONFIG_X86_32 - COPY_SEG_CPL3(cs); - COPY_SEG_CPL3(ss); -#else /* !CONFIG_X86_32 */ - /* Kernel saves and restores only the CS segment register on signals, - * which is the bare minimum needed to allow mixed 32/64-bit code. - * App's signal handler can save/restore other segments if needed. 
*/ - COPY_SEG_CPL3(cs); -#endif /* CONFIG_X86_32 */ - - err |= __get_user(tmpflags, &sc->flags); - regs->flags = (regs->flags & ~FIX_EFLAGS) | (tmpflags & FIX_EFLAGS); - regs->orig_ax = -1; /* disable syscall checks */ - - err |= __get_user(buf, &sc->fpstate); - err |= restore_i387_xstate(buf); - - err |= __get_user(*pax, &sc->ax); - return err; -} - -static int -setup_sigcontext(struct sigcontext __user *sc, void __user *fpstate, - struct pt_regs *regs, unsigned long mask) -{ - int err = 0; - -#ifdef CONFIG_X86_32 - { - unsigned int tmp; - - savesegment(gs, tmp); - err |= __put_user(tmp, (unsigned int __user *)&sc->gs); - } - err |= __put_user(regs->fs, (unsigned int __user *)&sc->fs); - err |= __put_user(regs->es, (unsigned int __user *)&sc->es); - err |= __put_user(regs->ds, (unsigned int __user *)&sc->ds); -#endif /* CONFIG_X86_32 */ - - err |= __put_user(regs->di, &sc->di); - err |= __put_user(regs->si, &sc->si); - err |= __put_user(regs->bp, &sc->bp); - err |= __put_user(regs->sp, &sc->sp); - err |= __put_user(regs->bx, &sc->bx); - err |= __put_user(regs->dx, &sc->dx); - err |= __put_user(regs->cx, &sc->cx); - err |= __put_user(regs->ax, &sc->ax); -#ifdef CONFIG_X86_64 - err |= __put_user(regs->r8, &sc->r8); - err |= __put_user(regs->r9, &sc->r9); - err |= __put_user(regs->r10, &sc->r10); - err |= __put_user(regs->r11, &sc->r11); - err |= __put_user(regs->r12, &sc->r12); - err |= __put_user(regs->r13, &sc->r13); - err |= __put_user(regs->r14, &sc->r14); - err |= __put_user(regs->r15, &sc->r15); -#endif /* CONFIG_X86_64 */ - - err |= __put_user(current->thread.trap_no, &sc->trapno); - err |= __put_user(current->thread.error_code, &sc->err); - err |= __put_user(regs->ip, &sc->ip); -#ifdef CONFIG_X86_32 - err |= __put_user(regs->cs, (unsigned int __user *)&sc->cs); - err |= __put_user(regs->flags, &sc->flags); - err |= __put_user(regs->sp, &sc->sp_at_signal); - err |= __put_user(regs->ss, (unsigned int __user *)&sc->ss); -#else /* !CONFIG_X86_32 */ - err |= 
__put_user(regs->flags, &sc->flags); - err |= __put_user(regs->cs, &sc->cs); - err |= __put_user(0, &sc->gs); - err |= __put_user(0, &sc->fs); -#endif /* CONFIG_X86_32 */ - - err |= __put_user(fpstate, &sc->fpstate); - - /* non-iBCS2 extensions.. */ - err |= __put_user(mask, &sc->oldmask); - err |= __put_user(current->thread.cr2, &sc->cr2); - - return err; -} - -/* - * Set up a signal frame. - */ -#ifdef CONFIG_X86_32 -static const struct { - u16 poplmovl; - u32 val; - u16 int80; -} __attribute__((packed)) retcode = { - 0xb858, /* popl %eax; movl $..., %eax */ - __NR_sigreturn, - 0x80cd, /* int $0x80 */ -}; - -static const struct { - u8 movl; - u32 val; - u16 int80; - u8 pad; -} __attribute__((packed)) rt_retcode = { - 0xb8, /* movl $..., %eax */ - __NR_rt_sigreturn, - 0x80cd, /* int $0x80 */ - 0 -}; - -/* - * Determine which stack to use.. - */ -static inline void __user * -get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size, - void **fpstate) -{ - unsigned long sp; - - /* Default to using normal stack */ - sp = regs->sp; - - /* - * If we are on the alternate signal stack and would overflow it, don't. - * Return an always-bogus address instead so we will die with SIGSEGV. - */ - if (on_sig_stack(sp) && !likely(on_sig_stack(sp - frame_size))) - return (void __user *) -1L; - - /* This is the X/Open sanctioned signal stack switching. */ - if (ka->sa.sa_flags & SA_ONSTACK) { - if (sas_ss_flags(sp) == 0) - sp = current->sas_ss_sp + current->sas_ss_size; - } else { - /* This is the legacy signal stack switching. */ - if ((regs->ss & 0xffff) != __USER_DS && - !(ka->sa.sa_flags & SA_RESTORER) && - ka->sa.sa_restorer) - sp = (unsigned long) ka->sa.sa_restorer; - } - - if (used_math()) { - sp = sp - sig_xstate_size; - *fpstate = (struct _fpstate *) sp; - if (save_i387_xstate(*fpstate) < 0) - return (void __user *)-1L; - } - - sp -= frame_size; - /* - * Align the stack pointer according to the i386 ABI, - * i.e. 
so that on function entry ((sp + 4) & 15) == 0. - */ - sp = ((sp + 4) & -16ul) - 4; - - return (void __user *) sp; -} - -static int -__setup_frame(int sig, struct k_sigaction *ka, sigset_t *set, - struct pt_regs *regs) -{ - struct sigframe __user *frame; - void __user *restorer; - int err = 0; - void __user *fpstate = NULL; - - frame = get_sigframe(ka, regs, sizeof(*frame), &fpstate); - - if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame))) - return -EFAULT; - - if (__put_user(sig, &frame->sig)) - return -EFAULT; - - if (setup_sigcontext(&frame->sc, fpstate, regs, set->sig[0])) - return -EFAULT; - - if (_NSIG_WORDS > 1) { - if (__copy_to_user(&frame->extramask, &set->sig[1], - sizeof(frame->extramask))) - return -EFAULT; - } - - if (current->mm->context.vdso) - restorer = VDSO32_SYMBOL(current->mm->context.vdso, sigreturn); - else - restorer = &frame->retcode; - if (ka->sa.sa_flags & SA_RESTORER) - restorer = ka->sa.sa_restorer; - - /* Set up to return from userspace. */ - err |= __put_user(restorer, &frame->pretcode); - - /* - * This is popl %eax ; movl $__NR_sigreturn, %eax ; int $0x80 - * - * WE DO NOT USE IT ANY MORE! It's only left here for historical - * reasons and because gdb uses it as a signature to notice - * signal handler stack frames. 
- */ - err |= __put_user(*((u64 *)&retcode), (u64 *)frame->retcode); - - if (err) - return -EFAULT; - - /* Set up registers for signal handler */ - regs->sp = (unsigned long)frame; - regs->ip = (unsigned long)ka->sa.sa_handler; - regs->ax = (unsigned long)sig; - regs->dx = 0; - regs->cx = 0; - - regs->ds = __USER_DS; - regs->es = __USER_DS; - regs->ss = __USER_DS; - regs->cs = __USER_CS; - - return 0; -} - -static int __setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, - sigset_t *set, struct pt_regs *regs) -{ - struct rt_sigframe __user *frame; - void __user *restorer; - int err = 0; - void __user *fpstate = NULL; - - frame = get_sigframe(ka, regs, sizeof(*frame), &fpstate); - - if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame))) - return -EFAULT; - - err |= __put_user(sig, &frame->sig); - err |= __put_user(&frame->info, &frame->pinfo); - err |= __put_user(&frame->uc, &frame->puc); - err |= copy_siginfo_to_user(&frame->info, info); - if (err) - return -EFAULT; - - /* Create the ucontext. */ - if (cpu_has_xsave) - err |= __put_user(UC_FP_XSTATE, &frame->uc.uc_flags); - else - err |= __put_user(0, &frame->uc.uc_flags); - err |= __put_user(0, &frame->uc.uc_link); - err |= __put_user(current->sas_ss_sp, &frame->uc.uc_stack.ss_sp); - err |= __put_user(sas_ss_flags(regs->sp), - &frame->uc.uc_stack.ss_flags); - err |= __put_user(current->sas_ss_size, &frame->uc.uc_stack.ss_size); - err |= setup_sigcontext(&frame->uc.uc_mcontext, fpstate, - regs, set->sig[0]); - err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set)); - if (err) - return -EFAULT; - - /* Set up to return from userspace. */ - restorer = VDSO32_SYMBOL(current->mm->context.vdso, rt_sigreturn); - if (ka->sa.sa_flags & SA_RESTORER) - restorer = ka->sa.sa_restorer; - err |= __put_user(restorer, &frame->pretcode); - - /* - * This is movl $__NR_rt_sigreturn, %ax ; int $0x80 - * - * WE DO NOT USE IT ANY MORE! 
It's only left here for historical - * reasons and because gdb uses it as a signature to notice - * signal handler stack frames. - */ - err |= __put_user(*((u64 *)&rt_retcode), (u64 *)frame->retcode); - - if (err) - return -EFAULT; - - /* Set up registers for signal handler */ - regs->sp = (unsigned long)frame; - regs->ip = (unsigned long)ka->sa.sa_handler; - regs->ax = (unsigned long)sig; - regs->dx = (unsigned long)&frame->info; - regs->cx = (unsigned long)&frame->uc; - - regs->ds = __USER_DS; - regs->es = __USER_DS; - regs->ss = __USER_DS; - regs->cs = __USER_CS; - - return 0; -} -#else /* !CONFIG_X86_32 */ -/* - * Determine which stack to use.. - */ -static void __user * -get_stack(struct k_sigaction *ka, unsigned long sp, unsigned long size) -{ - /* Default to using normal stack - redzone*/ - sp -= 128; - - /* This is the X/Open sanctioned signal stack switching. */ - if (ka->sa.sa_flags & SA_ONSTACK) { - if (sas_ss_flags(sp) == 0) - sp = current->sas_ss_sp + current->sas_ss_size; - } - - return (void __user *)round_down(sp - size, 64); -} - -static int __setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, - sigset_t *set, struct pt_regs *regs) -{ - struct rt_sigframe __user *frame; - void __user *fp = NULL; - int err = 0; - struct task_struct *me = current; - - if (used_math()) { - fp = get_stack(ka, regs->sp, sig_xstate_size); - frame = (void __user *)round_down( - (unsigned long)fp - sizeof(struct rt_sigframe), 16) - 8; - - if (save_i387_xstate(fp) < 0) - return -EFAULT; - } else - frame = get_stack(ka, regs->sp, sizeof(struct rt_sigframe)) - 8; - - if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame))) - return -EFAULT; - - if (ka->sa.sa_flags & SA_SIGINFO) { - if (copy_siginfo_to_user(&frame->info, info)) - return -EFAULT; - } - - /* Create the ucontext. 
*/ - if (cpu_has_xsave) - err |= __put_user(UC_FP_XSTATE, &frame->uc.uc_flags); - else - err |= __put_user(0, &frame->uc.uc_flags); - err |= __put_user(0, &frame->uc.uc_link); - err |= __put_user(me->sas_ss_sp, &frame->uc.uc_stack.ss_sp); - err |= __put_user(sas_ss_flags(regs->sp), - &frame->uc.uc_stack.ss_flags); - err |= __put_user(me->sas_ss_size, &frame->uc.uc_stack.ss_size); - err |= setup_sigcontext(&frame->uc.uc_mcontext, fp, regs, set->sig[0]); - err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set)); - - /* Set up to return from userspace. If provided, use a stub - already in userspace. */ - /* x86-64 should always use SA_RESTORER. */ - if (ka->sa.sa_flags & SA_RESTORER) { - err |= __put_user(ka->sa.sa_restorer, &frame->pretcode); - } else { - /* could use a vstub here */ - return -EFAULT; - } - - if (err) - return -EFAULT; - - /* Set up registers for signal handler */ - regs->di = sig; - /* In case the signal handler was declared without prototypes */ - regs->ax = 0; - - /* This also works for non SA_SIGINFO handlers because they expect the - next argument after the signal number on the stack. */ - regs->si = (unsigned long)&frame->info; - regs->dx = (unsigned long)&frame->uc; - regs->ip = (unsigned long) ka->sa.sa_handler; - - regs->sp = (unsigned long)frame; - - /* Set up the CS register to run signal handlers in 64-bit mode, - even if the handler happens to be interrupting 32-bit code. */ - regs->cs = __USER_CS; - - return 0; -} -#endif /* CONFIG_X86_32 */ - -#ifdef CONFIG_X86_32 -/* - * Atomically swap in the new signal mask, and wait for a signal. 
- */ -asmlinkage int -sys_sigsuspend(int history0, int history1, old_sigset_t mask) -{ - mask &= _BLOCKABLE; - spin_lock_irq(&current->sighand->siglock); - current->saved_sigmask = current->blocked; - siginitset(&current->blocked, mask); - recalc_sigpending(); - spin_unlock_irq(&current->sighand->siglock); - - current->state = TASK_INTERRUPTIBLE; - schedule(); - set_restore_sigmask(); - - return -ERESTARTNOHAND; -} - -asmlinkage int -sys_sigaction(int sig, const struct old_sigaction __user *act, - struct old_sigaction __user *oact) -{ - struct k_sigaction new_ka, old_ka; - int ret; - - if (act) { - old_sigset_t mask; - - if (!access_ok(VERIFY_READ, act, sizeof(*act)) || - __get_user(new_ka.sa.sa_handler, &act->sa_handler) || - __get_user(new_ka.sa.sa_restorer, &act->sa_restorer)) - return -EFAULT; - - __get_user(new_ka.sa.sa_flags, &act->sa_flags); - __get_user(mask, &act->sa_mask); - siginitset(&new_ka.sa.sa_mask, mask); - } - - ret = do_sigaction(sig, act ? &new_ka : NULL, oact ? &old_ka : NULL); - - if (!ret && oact) { - if (!access_ok(VERIFY_WRITE, oact, sizeof(*oact)) || - __put_user(old_ka.sa.sa_handler, &oact->sa_handler) || - __put_user(old_ka.sa.sa_restorer, &oact->sa_restorer)) - return -EFAULT; - - __put_user(old_ka.sa.sa_flags, &oact->sa_flags); - __put_user(old_ka.sa.sa_mask.sig[0], &oact->sa_mask); - } - - return ret; -} -#endif /* CONFIG_X86_32 */ - -#ifdef CONFIG_X86_32 -asmlinkage int sys_sigaltstack(unsigned long bx) -{ - /* - * This is needed to make gcc realize it doesn't own the - * "struct pt_regs" - */ - struct pt_regs *regs = (struct pt_regs *)&bx; - const stack_t __user *uss = (const stack_t __user *)bx; - stack_t __user *uoss = (stack_t __user *)regs->cx; - - return do_sigaltstack(uss, uoss, regs->sp); -} -#else /* !CONFIG_X86_32 */ -asmlinkage long -sys_sigaltstack(const stack_t __user *uss, stack_t __user *uoss, - struct pt_regs *regs) -{ - return do_sigaltstack(uss, uoss, regs->sp); -} -#endif /* CONFIG_X86_32 */ - -/* - * Do a signal return; undo the
signal stack. - */ -#ifdef CONFIG_X86_32 -asmlinkage unsigned long sys_sigreturn(unsigned long __unused) -{ - struct sigframe __user *frame; - struct pt_regs *regs; - unsigned long ax; - sigset_t set; - - regs = (struct pt_regs *) &__unused; - frame = (struct sigframe __user *)(regs->sp - 8); - - if (!access_ok(VERIFY_READ, frame, sizeof(*frame))) - goto badframe; - if (__get_user(set.sig[0], &frame->sc.oldmask) || (_NSIG_WORDS > 1 - && __copy_from_user(&set.sig[1], &frame->extramask, - sizeof(frame->extramask)))) - goto badframe; - - sigdelsetmask(&set, ~_BLOCKABLE); - spin_lock_irq(&current->sighand->siglock); - current->blocked = set; - recalc_sigpending(); - spin_unlock_irq(&current->sighand->siglock); - - if (restore_sigcontext(regs, &frame->sc, &ax)) - goto badframe; - return ax; - -badframe: - if (show_unhandled_signals && printk_ratelimit()) { - printk("%s%s[%d] bad frame in sigreturn frame:" - "%p ip:%lx sp:%lx oeax:%lx", - task_pid_nr(current) > 1 ? KERN_INFO : KERN_EMERG, - current->comm, task_pid_nr(current), frame, regs->ip, - regs->sp, regs->orig_ax); - print_vma_addr(" in ", regs->ip); - printk(KERN_CONT "\n"); - } - - force_sig(SIGSEGV, current); - - return 0; -} -#endif /* CONFIG_X86_32 */ - -static long do_rt_sigreturn(struct pt_regs *regs) -{ - struct rt_sigframe __user *frame; - unsigned long ax; - sigset_t set; - - frame = (struct rt_sigframe __user *)(regs->sp - sizeof(long)); - if (!access_ok(VERIFY_READ, frame, sizeof(*frame))) - goto badframe; - if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set))) - goto badframe; - - sigdelsetmask(&set, ~_BLOCKABLE); - spin_lock_irq(&current->sighand->siglock); - current->blocked = set; - recalc_sigpending(); - spin_unlock_irq(&current->sighand->siglock); - - if (restore_sigcontext(regs, &frame->uc.uc_mcontext, &ax)) - goto badframe; - - if (do_sigaltstack(&frame->uc.uc_stack, NULL, regs->sp) == -EFAULT) - goto badframe; - - return ax; - -badframe: - signal_fault(regs, frame, "rt_sigreturn"); - return 0; -} - -#ifdef
CONFIG_X86_32 -asmlinkage int sys_rt_sigreturn(unsigned long __unused) -{ - struct pt_regs *regs = (struct pt_regs *)&__unused; - - return do_rt_sigreturn(regs); -} -#else /* !CONFIG_X86_32 */ -asmlinkage long sys_rt_sigreturn(struct pt_regs *regs) -{ - return do_rt_sigreturn(regs); -} -#endif /* CONFIG_X86_32 */ - -/* - * OK, we're invoking a handler: - */ -static int signr_convert(int sig) -{ -#ifdef CONFIG_X86_32 - struct thread_info *info = current_thread_info(); - - if (info->exec_domain && info->exec_domain->signal_invmap && sig < 32) - return info->exec_domain->signal_invmap[sig]; -#endif /* CONFIG_X86_32 */ - return sig; -} - -#ifdef CONFIG_X86_32 - -#define is_ia32 1 -#define ia32_setup_frame __setup_frame -#define ia32_setup_rt_frame __setup_rt_frame - -#else /* !CONFIG_X86_32 */ - -#ifdef CONFIG_IA32_EMULATION -#define is_ia32 test_thread_flag(TIF_IA32) -#else /* !CONFIG_IA32_EMULATION */ -#define is_ia32 0 -#endif /* CONFIG_IA32_EMULATION */ - -#endif /* CONFIG_X86_32 */ - -static int -setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, - sigset_t *set, struct pt_regs *regs) -{ - int usig = signr_convert(sig); - int ret; - - /* Set up the stack frame */ - if (is_ia32) { - if (ka->sa.sa_flags & SA_SIGINFO) - ret = ia32_setup_rt_frame(usig, ka, info, set, regs); - else - ret = ia32_setup_frame(usig, ka, set, regs); - } else - ret = __setup_rt_frame(sig, ka, info, set, regs); - - if (ret) { - force_sigsegv(sig, current); - return -EFAULT; - } - - return ret; -} - -static int -handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka, - sigset_t *oldset, struct pt_regs *regs) -{ - int ret; - - /* Are we from a system call? */ - if (syscall_get_nr(current, regs) >= 0) { - /* If so, check system call restarting.. 
*/ - switch (syscall_get_error(current, regs)) { - case -ERESTART_RESTARTBLOCK: - case -ERESTARTNOHAND: - regs->ax = -EINTR; - break; - - case -ERESTARTSYS: - if (!(ka->sa.sa_flags & SA_RESTART)) { - regs->ax = -EINTR; - break; - } - /* fallthrough */ - case -ERESTARTNOINTR: - regs->ax = regs->orig_ax; - regs->ip -= 2; - break; - } - } - - /* - * If TF is set due to a debugger (TIF_FORCED_TF), clear the TF - * flag so that register information in the sigcontext is correct. - */ - if (unlikely(regs->flags & X86_EFLAGS_TF) && - likely(test_and_clear_thread_flag(TIF_FORCED_TF))) - regs->flags &= ~X86_EFLAGS_TF; - - ret = setup_rt_frame(sig, ka, info, oldset, regs); - - if (ret) - return ret; - -#ifdef CONFIG_X86_64 - /* - * This has nothing to do with segment registers, - * despite the name. This magic affects uaccess.h - * macros' behavior. Reset it to the normal setting. - */ - set_fs(USER_DS); -#endif - - /* - * Clear the direction flag as per the ABI for function entry. - */ - regs->flags &= ~X86_EFLAGS_DF; - - /* - * Clear TF when entering the signal handler, but - * notify any tracer that was single-stepping it. - * The tracer may want to single-step inside the - * handler too. - */ - regs->flags &= ~X86_EFLAGS_TF; - - spin_lock_irq(&current->sighand->siglock); - sigorsets(&current->blocked, &current->blocked, &ka->sa.sa_mask); - if (!(ka->sa.sa_flags & SA_NODEFER)) - sigaddset(&current->blocked, sig); - recalc_sigpending(); - spin_unlock_irq(&current->sighand->siglock); - - tracehook_signal_handler(sig, info, ka, regs, - test_thread_flag(TIF_SINGLESTEP)); - - return 0; -} - -#ifdef CONFIG_X86_32 -#define NR_restart_syscall __NR_restart_syscall -#else /* !CONFIG_X86_32 */ -#define NR_restart_syscall \ - test_thread_flag(TIF_IA32) ? __NR_ia32_restart_syscall : __NR_restart_syscall -#endif /* CONFIG_X86_32 */ - -/* - * Note that 'init' is a special process: it doesn't get signals it doesn't - * want to handle. Thus you cannot kill init even with a SIGKILL even by - * mistake.
- */ -static void do_signal(struct pt_regs *regs) -{ - struct k_sigaction ka; - siginfo_t info; - int signr; - sigset_t *oldset; - - /* - * We want the common case to go fast, which is why we may in certain - * cases get here from kernel mode. Just return without doing anything - * if so. - * X86_32: vm86 regs switched out by assembly code before reaching - * here, so testing against kernel CS suffices. - */ - if (!user_mode(regs)) - return; - - if (current_thread_info()->status & TS_RESTORE_SIGMASK) - oldset = &current->saved_sigmask; - else - oldset = &current->blocked; - - signr = get_signal_to_deliver(&info, &ka, regs, NULL); - if (signr > 0) { - /* - * Re-enable any watchpoints before delivering the - * signal to user space. The processor register will - * have been cleared if the watchpoint triggered - * inside the kernel. - */ - if (current->thread.debugreg7) - set_debugreg(current->thread.debugreg7, 7); - - /* Whee! Actually deliver the signal. */ - if (handle_signal(signr, &info, &ka, oldset, regs) == 0) { - /* - * A signal was successfully delivered; the saved - * sigmask will have been stored in the signal frame, - * and will be restored by sigreturn, so we can simply - * clear the TS_RESTORE_SIGMASK flag. - */ - current_thread_info()->status &= ~TS_RESTORE_SIGMASK; - } - return; - } - - /* Did we come from a system call? */ - if (syscall_get_nr(current, regs) >= 0) { - /* Restart the system call - no handlers present */ - switch (syscall_get_error(current, regs)) { - case -ERESTARTNOHAND: - case -ERESTARTSYS: - case -ERESTARTNOINTR: - regs->ax = regs->orig_ax; - regs->ip -= 2; - break; - - case -ERESTART_RESTARTBLOCK: - regs->ax = NR_restart_syscall; - regs->ip -= 2; - break; - } - } - - /* - * If there's no signal to deliver, we just put the saved sigmask - * back.
- */ - if (current_thread_info()->status & TS_RESTORE_SIGMASK) { - current_thread_info()->status &= ~TS_RESTORE_SIGMASK; - sigprocmask(SIG_SETMASK, &current->saved_sigmask, NULL); - } -} - -/* - * notification of userspace execution resumption - * - triggered by the TIF_WORK_MASK flags - */ -void -do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags) -{ -#if defined(CONFIG_X86_64) && defined(CONFIG_X86_MCE) - /* notify userspace of pending MCEs */ - if (thread_info_flags & _TIF_MCE_NOTIFY) - mce_notify_user(); -#endif /* CONFIG_X86_64 && CONFIG_X86_MCE */ - - /* deal with pending signal delivery */ - if (thread_info_flags & _TIF_SIGPENDING) - do_signal(regs); - - if (thread_info_flags & _TIF_NOTIFY_RESUME) { - clear_thread_flag(TIF_NOTIFY_RESUME); - tracehook_notify_resume(regs); - } - -#ifdef CONFIG_X86_32 - clear_thread_flag(TIF_IRET); -#endif /* CONFIG_X86_32 */ -} - -void signal_fault(struct pt_regs *regs, void __user *frame, char *where) -{ - struct task_struct *me = current; - - if (show_unhandled_signals && printk_ratelimit()) { - printk(KERN_INFO - "%s[%d] bad frame in %s frame:%p ip:%lx sp:%lx orax:%lx", - me->comm, me->pid, where, frame, - regs->ip, regs->sp, regs->orig_ax); - print_vma_addr(" in ", regs->ip); - printk(KERN_CONT "\n"); - } - - force_sig(SIGSEGV, me); -} diff --git a/arch/x86/kernel/signal_64.c b/arch/x86/kernel/signal_64.c deleted file mode 100644 index b1f4d34e0a38..000000000000 --- a/arch/x86/kernel/signal_64.c +++ /dev/null @@ -1,915 +0,0 @@ -/* - * Copyright (C) 1991, 1992 Linus Torvalds - * Copyright (C) 2000, 2001, 2002 Andi Kleen SuSE Labs - * - * 1997-11-28 Modified for POSIX.1b signals by Richard Henderson - * 2000-06-20 Pentium III FXSR, SSE support by Gareth Hughes - * 2000-2002 x86-64 support by Andi Kleen - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include - -#ifdef CONFIG_X86_64
-#include -#include -#include -#endif /* CONFIG_X86_64 */ - -#include -#include - -#include "sigframe.h" - -#define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP))) - -#define __FIX_EFLAGS (X86_EFLAGS_AC | X86_EFLAGS_OF | \ - X86_EFLAGS_DF | X86_EFLAGS_TF | X86_EFLAGS_SF | \ - X86_EFLAGS_ZF | X86_EFLAGS_AF | X86_EFLAGS_PF | \ - X86_EFLAGS_CF) - -#ifdef CONFIG_X86_32 -# define FIX_EFLAGS (__FIX_EFLAGS | X86_EFLAGS_RF) -#else -# define FIX_EFLAGS __FIX_EFLAGS -#endif - -#define COPY(x) { \ - err |= __get_user(regs->x, &sc->x); \ -} - -#define COPY_SEG(seg) { \ - unsigned short tmp; \ - err |= __get_user(tmp, &sc->seg); \ - regs->seg = tmp; \ -} - -#define COPY_SEG_CPL3(seg) { \ - unsigned short tmp; \ - err |= __get_user(tmp, &sc->seg); \ - regs->seg = tmp | 3; \ -} - -#define GET_SEG(seg) { \ - unsigned short tmp; \ - err |= __get_user(tmp, &sc->seg); \ - loadsegment(seg, tmp); \ -} - -static int -restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, - unsigned long *pax) -{ - void __user *buf; - unsigned int tmpflags; - unsigned int err = 0; - - /* Always make any pending restarted system calls return -EINTR */ - current_thread_info()->restart_block.fn = do_no_restart_syscall; - -#ifdef CONFIG_X86_32 - GET_SEG(gs); - COPY_SEG(fs); - COPY_SEG(es); - COPY_SEG(ds); -#endif /* CONFIG_X86_32 */ - - COPY(di); COPY(si); COPY(bp); COPY(sp); COPY(bx); - COPY(dx); COPY(cx); COPY(ip); - -#ifdef CONFIG_X86_64 - COPY(r8); - COPY(r9); - COPY(r10); - COPY(r11); - COPY(r12); - COPY(r13); - COPY(r14); - COPY(r15); -#endif /* CONFIG_X86_64 */ - -#ifdef CONFIG_X86_32 - COPY_SEG_CPL3(cs); - COPY_SEG_CPL3(ss); -#else /* !CONFIG_X86_32 */ - /* Kernel saves and restores only the CS segment register on signals, - * which is the bare minimum needed to allow mixed 32/64-bit code. - * App's signal handler can save/restore other segments if needed. 
*/ - COPY_SEG_CPL3(cs); -#endif /* CONFIG_X86_32 */ - - err |= __get_user(tmpflags, &sc->flags); - regs->flags = (regs->flags & ~FIX_EFLAGS) | (tmpflags & FIX_EFLAGS); - regs->orig_ax = -1; /* disable syscall checks */ - - err |= __get_user(buf, &sc->fpstate); - err |= restore_i387_xstate(buf); - - err |= __get_user(*pax, &sc->ax); - return err; -} - -static int -setup_sigcontext(struct sigcontext __user *sc, void __user *fpstate, - struct pt_regs *regs, unsigned long mask) -{ - int err = 0; - -#ifdef CONFIG_X86_32 - { - unsigned int tmp; - - savesegment(gs, tmp); - err |= __put_user(tmp, (unsigned int __user *)&sc->gs); - } - err |= __put_user(regs->fs, (unsigned int __user *)&sc->fs); - err |= __put_user(regs->es, (unsigned int __user *)&sc->es); - err |= __put_user(regs->ds, (unsigned int __user *)&sc->ds); -#endif /* CONFIG_X86_32 */ - - err |= __put_user(regs->di, &sc->di); - err |= __put_user(regs->si, &sc->si); - err |= __put_user(regs->bp, &sc->bp); - err |= __put_user(regs->sp, &sc->sp); - err |= __put_user(regs->bx, &sc->bx); - err |= __put_user(regs->dx, &sc->dx); - err |= __put_user(regs->cx, &sc->cx); - err |= __put_user(regs->ax, &sc->ax); -#ifdef CONFIG_X86_64 - err |= __put_user(regs->r8, &sc->r8); - err |= __put_user(regs->r9, &sc->r9); - err |= __put_user(regs->r10, &sc->r10); - err |= __put_user(regs->r11, &sc->r11); - err |= __put_user(regs->r12, &sc->r12); - err |= __put_user(regs->r13, &sc->r13); - err |= __put_user(regs->r14, &sc->r14); - err |= __put_user(regs->r15, &sc->r15); -#endif /* CONFIG_X86_64 */ - - err |= __put_user(current->thread.trap_no, &sc->trapno); - err |= __put_user(current->thread.error_code, &sc->err); - err |= __put_user(regs->ip, &sc->ip); -#ifdef CONFIG_X86_32 - err |= __put_user(regs->cs, (unsigned int __user *)&sc->cs); - err |= __put_user(regs->flags, &sc->flags); - err |= __put_user(regs->sp, &sc->sp_at_signal); - err |= __put_user(regs->ss, (unsigned int __user *)&sc->ss); -#else /* !CONFIG_X86_32 */ - err |= 
__put_user(regs->flags, &sc->flags); - err |= __put_user(regs->cs, &sc->cs); - err |= __put_user(0, &sc->gs); - err |= __put_user(0, &sc->fs); -#endif /* CONFIG_X86_32 */ - - err |= __put_user(fpstate, &sc->fpstate); - - /* non-iBCS2 extensions.. */ - err |= __put_user(mask, &sc->oldmask); - err |= __put_user(current->thread.cr2, &sc->cr2); - - return err; -} - -/* - * Set up a signal frame. - */ -#ifdef CONFIG_X86_32 -static const struct { - u16 poplmovl; - u32 val; - u16 int80; -} __attribute__((packed)) retcode = { - 0xb858, /* popl %eax; movl $..., %eax */ - __NR_sigreturn, - 0x80cd, /* int $0x80 */ -}; - -static const struct { - u8 movl; - u32 val; - u16 int80; - u8 pad; -} __attribute__((packed)) rt_retcode = { - 0xb8, /* movl $..., %eax */ - __NR_rt_sigreturn, - 0x80cd, /* int $0x80 */ - 0 -}; - -/* - * Determine which stack to use.. - */ -static inline void __user * -get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size, - void **fpstate) -{ - unsigned long sp; - - /* Default to using normal stack */ - sp = regs->sp; - - /* - * If we are on the alternate signal stack and would overflow it, don't. - * Return an always-bogus address instead so we will die with SIGSEGV. - */ - if (on_sig_stack(sp) && !likely(on_sig_stack(sp - frame_size))) - return (void __user *) -1L; - - /* This is the X/Open sanctioned signal stack switching. */ - if (ka->sa.sa_flags & SA_ONSTACK) { - if (sas_ss_flags(sp) == 0) - sp = current->sas_ss_sp + current->sas_ss_size; - } else { - /* This is the legacy signal stack switching. */ - if ((regs->ss & 0xffff) != __USER_DS && - !(ka->sa.sa_flags & SA_RESTORER) && - ka->sa.sa_restorer) - sp = (unsigned long) ka->sa.sa_restorer; - } - - if (used_math()) { - sp = sp - sig_xstate_size; - *fpstate = (struct _fpstate *) sp; - if (save_i387_xstate(*fpstate) < 0) - return (void __user *)-1L; - } - - sp -= frame_size; - /* - * Align the stack pointer according to the i386 ABI, - * i.e. 
so that on function entry ((sp + 4) & 15) == 0. - */ - sp = ((sp + 4) & -16ul) - 4; - - return (void __user *) sp; -} - -static int -__setup_frame(int sig, struct k_sigaction *ka, sigset_t *set, - struct pt_regs *regs) -{ - struct sigframe __user *frame; - void __user *restorer; - int err = 0; - void __user *fpstate = NULL; - - frame = get_sigframe(ka, regs, sizeof(*frame), &fpstate); - - if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame))) - return -EFAULT; - - if (__put_user(sig, &frame->sig)) - return -EFAULT; - - if (setup_sigcontext(&frame->sc, fpstate, regs, set->sig[0])) - return -EFAULT; - - if (_NSIG_WORDS > 1) { - if (__copy_to_user(&frame->extramask, &set->sig[1], - sizeof(frame->extramask))) - return -EFAULT; - } - - if (current->mm->context.vdso) - restorer = VDSO32_SYMBOL(current->mm->context.vdso, sigreturn); - else - restorer = &frame->retcode; - if (ka->sa.sa_flags & SA_RESTORER) - restorer = ka->sa.sa_restorer; - - /* Set up to return from userspace. */ - err |= __put_user(restorer, &frame->pretcode); - - /* - * This is popl %eax ; movl $__NR_sigreturn, %eax ; int $0x80 - * - * WE DO NOT USE IT ANY MORE! It's only left here for historical - * reasons and because gdb uses it as a signature to notice - * signal handler stack frames. 
- */ - err |= __put_user(*((u64 *)&retcode), (u64 *)frame->retcode); - - if (err) - return -EFAULT; - - /* Set up registers for signal handler */ - regs->sp = (unsigned long)frame; - regs->ip = (unsigned long)ka->sa.sa_handler; - regs->ax = (unsigned long)sig; - regs->dx = 0; - regs->cx = 0; - - regs->ds = __USER_DS; - regs->es = __USER_DS; - regs->ss = __USER_DS; - regs->cs = __USER_CS; - - return 0; -} - -static int __setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, - sigset_t *set, struct pt_regs *regs) -{ - struct rt_sigframe __user *frame; - void __user *restorer; - int err = 0; - void __user *fpstate = NULL; - - frame = get_sigframe(ka, regs, sizeof(*frame), &fpstate); - - if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame))) - return -EFAULT; - - err |= __put_user(sig, &frame->sig); - err |= __put_user(&frame->info, &frame->pinfo); - err |= __put_user(&frame->uc, &frame->puc); - err |= copy_siginfo_to_user(&frame->info, info); - if (err) - return -EFAULT; - - /* Create the ucontext. */ - if (cpu_has_xsave) - err |= __put_user(UC_FP_XSTATE, &frame->uc.uc_flags); - else - err |= __put_user(0, &frame->uc.uc_flags); - err |= __put_user(0, &frame->uc.uc_link); - err |= __put_user(current->sas_ss_sp, &frame->uc.uc_stack.ss_sp); - err |= __put_user(sas_ss_flags(regs->sp), - &frame->uc.uc_stack.ss_flags); - err |= __put_user(current->sas_ss_size, &frame->uc.uc_stack.ss_size); - err |= setup_sigcontext(&frame->uc.uc_mcontext, fpstate, - regs, set->sig[0]); - err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set)); - if (err) - return -EFAULT; - - /* Set up to return from userspace. */ - restorer = VDSO32_SYMBOL(current->mm->context.vdso, rt_sigreturn); - if (ka->sa.sa_flags & SA_RESTORER) - restorer = ka->sa.sa_restorer; - err |= __put_user(restorer, &frame->pretcode); - - /* - * This is movl $__NR_rt_sigreturn, %ax ; int $0x80 - * - * WE DO NOT USE IT ANY MORE! 
It's only left here for historical - * reasons and because gdb uses it as a signature to notice - * signal handler stack frames. - */ - err |= __put_user(*((u64 *)&rt_retcode), (u64 *)frame->retcode); - - if (err) - return -EFAULT; - - /* Set up registers for signal handler */ - regs->sp = (unsigned long)frame; - regs->ip = (unsigned long)ka->sa.sa_handler; - regs->ax = (unsigned long)sig; - regs->dx = (unsigned long)&frame->info; - regs->cx = (unsigned long)&frame->uc; - - regs->ds = __USER_DS; - regs->es = __USER_DS; - regs->ss = __USER_DS; - regs->cs = __USER_CS; - - return 0; -} -#else /* !CONFIG_X86_32 */ -/* - * Determine which stack to use.. - */ -static void __user * -get_stack(struct k_sigaction *ka, unsigned long sp, unsigned long size) -{ - /* Default to using normal stack - redzone*/ - sp -= 128; - - /* This is the X/Open sanctioned signal stack switching. */ - if (ka->sa.sa_flags & SA_ONSTACK) { - if (sas_ss_flags(sp) == 0) - sp = current->sas_ss_sp + current->sas_ss_size; - } - - return (void __user *)round_down(sp - size, 64); -} - -static int __setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, - sigset_t *set, struct pt_regs *regs) -{ - struct rt_sigframe __user *frame; - void __user *fp = NULL; - int err = 0; - struct task_struct *me = current; - - if (used_math()) { - fp = get_stack(ka, regs->sp, sig_xstate_size); - frame = (void __user *)round_down( - (unsigned long)fp - sizeof(struct rt_sigframe), 16) - 8; - - if (save_i387_xstate(fp) < 0) - return -EFAULT; - } else - frame = get_stack(ka, regs->sp, sizeof(struct rt_sigframe)) - 8; - - if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame))) - return -EFAULT; - - if (ka->sa.sa_flags & SA_SIGINFO) { - if (copy_siginfo_to_user(&frame->info, info)) - return -EFAULT; - } - - /* Create the ucontext. 
*/ - if (cpu_has_xsave) - err |= __put_user(UC_FP_XSTATE, &frame->uc.uc_flags); - else - err |= __put_user(0, &frame->uc.uc_flags); - err |= __put_user(0, &frame->uc.uc_link); - err |= __put_user(me->sas_ss_sp, &frame->uc.uc_stack.ss_sp); - err |= __put_user(sas_ss_flags(regs->sp), - &frame->uc.uc_stack.ss_flags); - err |= __put_user(me->sas_ss_size, &frame->uc.uc_stack.ss_size); - err |= setup_sigcontext(&frame->uc.uc_mcontext, fp, regs, set->sig[0]); - err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set)); - - /* Set up to return from userspace. If provided, use a stub - already in userspace. */ - /* x86-64 should always use SA_RESTORER. */ - if (ka->sa.sa_flags & SA_RESTORER) { - err |= __put_user(ka->sa.sa_restorer, &frame->pretcode); - } else { - /* could use a vstub here */ - return -EFAULT; - } - - if (err) - return -EFAULT; - - /* Set up registers for signal handler */ - regs->di = sig; - /* In case the signal handler was declared without prototypes */ - regs->ax = 0; - - /* This also works for non SA_SIGINFO handlers because they expect the - next argument after the signal number on the stack. */ - regs->si = (unsigned long)&frame->info; - regs->dx = (unsigned long)&frame->uc; - regs->ip = (unsigned long) ka->sa.sa_handler; - - regs->sp = (unsigned long)frame; - - /* Set up the CS register to run signal handlers in 64-bit mode, - even if the handler happens to be interrupting 32-bit code. */ - regs->cs = __USER_CS; - - return 0; -} -#endif /* CONFIG_X86_32 */ - -#ifdef CONFIG_X86_32 -/* - * Atomically swap in the new signal mask, and wait for a signal. 
- */ -asmlinkage int -sys_sigsuspend(int history0, int history1, old_sigset_t mask) -{ - mask &= _BLOCKABLE; - spin_lock_irq(&current->sighand->siglock); - current->saved_sigmask = current->blocked; - siginitset(&current->blocked, mask); - recalc_sigpending(); - spin_unlock_irq(&current->sighand->siglock); - - current->state = TASK_INTERRUPTIBLE; - schedule(); - set_restore_sigmask(); - - return -ERESTARTNOHAND; -} - -asmlinkage int -sys_sigaction(int sig, const struct old_sigaction __user *act, - struct old_sigaction __user *oact) -{ - struct k_sigaction new_ka, old_ka; - int ret; - - if (act) { - old_sigset_t mask; - - if (!access_ok(VERIFY_READ, act, sizeof(*act)) || - __get_user(new_ka.sa.sa_handler, &act->sa_handler) || - __get_user(new_ka.sa.sa_restorer, &act->sa_restorer)) - return -EFAULT; - - __get_user(new_ka.sa.sa_flags, &act->sa_flags); - __get_user(mask, &act->sa_mask); - siginitset(&new_ka.sa.sa_mask, mask); - } - - ret = do_sigaction(sig, act ? &new_ka : NULL, oact ? &old_ka : NULL); - - if (!ret && oact) { - if (!access_ok(VERIFY_WRITE, oact, sizeof(*oact)) || - __put_user(old_ka.sa.sa_handler, &oact->sa_handler) || - __put_user(old_ka.sa.sa_restorer, &oact->sa_restorer)) - return -EFAULT; - - __put_user(old_ka.sa.sa_flags, &oact->sa_flags); - __put_user(old_ka.sa.sa_mask.sig[0], &oact->sa_mask); - } - - return ret; -} -#endif /* CONFIG_X86_32 */ - -#ifdef CONFIG_X86_32 -asmlinkage int sys_sigaltstack(unsigned long bx) -{ - /* - * This is needed to make gcc realize it doesn't own the - * "struct pt_regs" - */ - struct pt_regs *regs = (struct pt_regs *)&bx; - const stack_t __user *uss = (const stack_t __user *)bx; - stack_t __user *uoss = (stack_t __user *)regs->cx; - - return do_sigaltstack(uss, uoss, regs->sp); -} -#else /* !CONFIG_X86_32 */ -asmlinkage long -sys_sigaltstack(const stack_t __user *uss, stack_t __user *uoss, - struct pt_regs *regs) -{ - return do_sigaltstack(uss, uoss, regs->sp); -} -#endif /* CONFIG_X86_32 */ - -/* - * Do a signal return; undo the 
signal stack. - */ -#ifdef CONFIG_X86_32 -asmlinkage unsigned long sys_sigreturn(unsigned long __unused) -{ - struct sigframe __user *frame; - struct pt_regs *regs; - unsigned long ax; - sigset_t set; - - regs = (struct pt_regs *) &__unused; - frame = (struct sigframe __user *)(regs->sp - 8); - - if (!access_ok(VERIFY_READ, frame, sizeof(*frame))) - goto badframe; - if (__get_user(set.sig[0], &frame->sc.oldmask) || (_NSIG_WORDS > 1 - && __copy_from_user(&set.sig[1], &frame->extramask, - sizeof(frame->extramask)))) - goto badframe; - - sigdelsetmask(&set, ~_BLOCKABLE); - spin_lock_irq(&current->sighand->siglock); - current->blocked = set; - recalc_sigpending(); - spin_unlock_irq(&current->sighand->siglock); - - if (restore_sigcontext(regs, &frame->sc, &ax)) - goto badframe; - return ax; - -badframe: - if (show_unhandled_signals && printk_ratelimit()) { - printk("%s%s[%d] bad frame in sigreturn frame:" - "%p ip:%lx sp:%lx oeax:%lx", - task_pid_nr(current) > 1 ? KERN_INFO : KERN_EMERG, - current->comm, task_pid_nr(current), frame, regs->ip, - regs->sp, regs->orig_ax); - print_vma_addr(" in ", regs->ip); - printk(KERN_CONT "\n"); - } - - force_sig(SIGSEGV, current); - - return 0; -} -#endif /* CONFIG_X86_32 */ - -static long do_rt_sigreturn(struct pt_regs *regs) -{ - struct rt_sigframe __user *frame; - unsigned long ax; - sigset_t set; - - frame = (struct rt_sigframe __user *)(regs->sp - sizeof(long)); - if (!access_ok(VERIFY_READ, frame, sizeof(*frame))) - goto badframe; - if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set))) - goto badframe; - - sigdelsetmask(&set, ~_BLOCKABLE); - spin_lock_irq(&current->sighand->siglock); - current->blocked = set; - recalc_sigpending(); - spin_unlock_irq(&current->sighand->siglock); - - if (restore_sigcontext(regs, &frame->uc.uc_mcontext, &ax)) - goto badframe; - - if (do_sigaltstack(&frame->uc.uc_stack, NULL, regs->sp) == -EFAULT) - goto badframe; - - return ax; - -badframe: - signal_fault(regs, frame, "rt_sigreturn"); - return 0; -} - -#ifdef 
CONFIG_X86_32 -asmlinkage int sys_rt_sigreturn(unsigned long __unused) -{ - struct pt_regs *regs = (struct pt_regs *)&__unused; - - return do_rt_sigreturn(regs); -} -#else /* !CONFIG_X86_32 */ -asmlinkage long sys_rt_sigreturn(struct pt_regs *regs) -{ - return do_rt_sigreturn(regs); -} -#endif /* CONFIG_X86_32 */ - -/* - * OK, we're invoking a handler: - */ -static int signr_convert(int sig) -{ -#ifdef CONFIG_X86_32 - struct thread_info *info = current_thread_info(); - - if (info->exec_domain && info->exec_domain->signal_invmap && sig < 32) - return info->exec_domain->signal_invmap[sig]; -#endif /* CONFIG_X86_32 */ - return sig; -} - -#ifdef CONFIG_X86_32 - -#define is_ia32 1 -#define ia32_setup_frame __setup_frame -#define ia32_setup_rt_frame __setup_rt_frame - -#else /* !CONFIG_X86_32 */ - -#ifdef CONFIG_IA32_EMULATION -#define is_ia32 test_thread_flag(TIF_IA32) -#else /* !CONFIG_IA32_EMULATION */ -#define is_ia32 0 -#endif /* CONFIG_IA32_EMULATION */ - -#endif /* CONFIG_X86_32 */ - -static int -setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, - sigset_t *set, struct pt_regs *regs) -{ - int usig = signr_convert(sig); - int ret; - - /* Set up the stack frame */ - if (is_ia32) { - if (ka->sa.sa_flags & SA_SIGINFO) - ret = ia32_setup_rt_frame(usig, ka, info, set, regs); - else - ret = ia32_setup_frame(usig, ka, set, regs); - } else - ret = __setup_rt_frame(sig, ka, info, set, regs); - - if (ret) { - force_sigsegv(sig, current); - return -EFAULT; - } - - return ret; -} - -static int -handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka, - sigset_t *oldset, struct pt_regs *regs) -{ - int ret; - - /* Are we from a system call? */ - if (syscall_get_nr(current, regs) >= 0) { - /* If so, check system call restarting.. 
*/ - switch (syscall_get_error(current, regs)) { - case -ERESTART_RESTARTBLOCK: - case -ERESTARTNOHAND: - regs->ax = -EINTR; - break; - - case -ERESTARTSYS: - if (!(ka->sa.sa_flags & SA_RESTART)) { - regs->ax = -EINTR; - break; - } - /* fallthrough */ - case -ERESTARTNOINTR: - regs->ax = regs->orig_ax; - regs->ip -= 2; - break; - } - } - - /* - * If TF is set due to a debugger (TIF_FORCED_TF), clear the TF - * flag so that register information in the sigcontext is correct. - */ - if (unlikely(regs->flags & X86_EFLAGS_TF) && - likely(test_and_clear_thread_flag(TIF_FORCED_TF))) - regs->flags &= ~X86_EFLAGS_TF; - - ret = setup_rt_frame(sig, ka, info, oldset, regs); - - if (ret) - return ret; - -#ifdef CONFIG_X86_64 - /* - * This has nothing to do with segment registers, - * despite the name. This magic affects uaccess.h - * macros' behavior. Reset it to the normal setting. - */ - set_fs(USER_DS); -#endif - - /* - * Clear the direction flag as per the ABI for function entry. - */ - regs->flags &= ~X86_EFLAGS_DF; - - /* - * Clear TF when entering the signal handler, but - * notify any tracer that was single-stepping it. - * The tracer may want to single-step inside the - * handler too. - */ - regs->flags &= ~X86_EFLAGS_TF; - - spin_lock_irq(&current->sighand->siglock); - sigorsets(&current->blocked, &current->blocked, &ka->sa.sa_mask); - if (!(ka->sa.sa_flags & SA_NODEFER)) - sigaddset(&current->blocked, sig); - recalc_sigpending(); - spin_unlock_irq(&current->sighand->siglock); - - tracehook_signal_handler(sig, info, ka, regs, - test_thread_flag(TIF_SINGLESTEP)); - - return 0; -} - -#ifdef CONFIG_X86_32 -#define NR_restart_syscall __NR_restart_syscall -#else /* !CONFIG_X86_32 */ -#define NR_restart_syscall \ - test_thread_flag(TIF_IA32) ? __NR_ia32_restart_syscall : __NR_restart_syscall -#endif /* CONFIG_X86_32 */ - -/* - * Note that 'init' is a special process: it doesn't get signals it doesn't - * want to handle. Thus you cannot kill init even with a SIGKILL even by - * mistake. 
- */ -static void do_signal(struct pt_regs *regs) -{ - struct k_sigaction ka; - siginfo_t info; - int signr; - sigset_t *oldset; - - /* - * We want the common case to go fast, which is why we may in certain - * cases get here from kernel mode. Just return without doing anything - * if so. - * X86_32: vm86 regs switched out by assembly code before reaching - * here, so testing against kernel CS suffices. - */ - if (!user_mode(regs)) - return; - - if (current_thread_info()->status & TS_RESTORE_SIGMASK) - oldset = &current->saved_sigmask; - else - oldset = &current->blocked; - - signr = get_signal_to_deliver(&info, &ka, regs, NULL); - if (signr > 0) { - /* - * Re-enable any watchpoints before delivering the - * signal to user space. The processor register will - * have been cleared if the watchpoint triggered - * inside the kernel. - */ - if (current->thread.debugreg7) - set_debugreg(current->thread.debugreg7, 7); - - /* Whee! Actually deliver the signal. */ - if (handle_signal(signr, &info, &ka, oldset, regs) == 0) { - /* - * A signal was successfully delivered; the saved - * sigmask will have been stored in the signal frame, - * and will be restored by sigreturn, so we can simply - * clear the TS_RESTORE_SIGMASK flag. - */ - current_thread_info()->status &= ~TS_RESTORE_SIGMASK; - } - return; - } - - /* Did we come from a system call? */ - if (syscall_get_nr(current, regs) >= 0) { - /* Restart the system call - no handlers present */ - switch (syscall_get_error(current, regs)) { - case -ERESTARTNOHAND: - case -ERESTARTSYS: - case -ERESTARTNOINTR: - regs->ax = regs->orig_ax; - regs->ip -= 2; - break; - - case -ERESTART_RESTARTBLOCK: - regs->ax = NR_restart_syscall; - regs->ip -= 2; - break; - } - } - - /* - * If there's no signal to deliver, we just put the saved sigmask - * back. 
- */ - if (current_thread_info()->status & TS_RESTORE_SIGMASK) { - current_thread_info()->status &= ~TS_RESTORE_SIGMASK; - sigprocmask(SIG_SETMASK, &current->saved_sigmask, NULL); - } -} - -/* - * notification of userspace execution resumption - * - triggered by the TIF_WORK_MASK flags - */ -void -do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags) -{ -#if defined(CONFIG_X86_64) && defined(CONFIG_X86_MCE) - /* notify userspace of pending MCEs */ - if (thread_info_flags & _TIF_MCE_NOTIFY) - mce_notify_user(); -#endif /* CONFIG_X86_64 && CONFIG_X86_MCE */ - - /* deal with pending signal delivery */ - if (thread_info_flags & _TIF_SIGPENDING) - do_signal(regs); - - if (thread_info_flags & _TIF_NOTIFY_RESUME) { - clear_thread_flag(TIF_NOTIFY_RESUME); - tracehook_notify_resume(regs); - } - -#ifdef CONFIG_X86_32 - clear_thread_flag(TIF_IRET); -#endif /* CONFIG_X86_32 */ -} - -void signal_fault(struct pt_regs *regs, void __user *frame, char *where) -{ - struct task_struct *me = current; - - if (show_unhandled_signals && printk_ratelimit()) { - printk(KERN_INFO - "%s[%d] bad frame in %s frame:%p ip:%lx sp:%lx orax:%lx", - me->comm, me->pid, where, frame, - regs->ip, regs->sp, regs->orig_ax); - print_vma_addr(" in ", regs->ip); - printk(KERN_CONT "\n"); - } - - force_sig(SIGSEGV, me); -} -- cgit v1.2.3 From 4db646b1af8fdcf01d690d29eeae44cd937edb0d Mon Sep 17 00:00:00 2001 From: Hannes Eder Date: Sun, 23 Nov 2008 20:49:52 +0100 Subject: x86: microcode: fix sparse warnings Impact: make global variables and a function static Fix following sparse warnings: arch/x86/kernel/microcode_core.c:102:22: warning: symbol 'microcode_ops' was not declared. Should it be static? arch/x86/kernel/microcode_core.c:206:24: warning: symbol 'microcode_pdev' was not declared. Should it be static? arch/x86/kernel/microcode_core.c:322:6: warning: symbol 'microcode_update_cpu' was not declared. Should it be static? 
arch/x86/kernel/microcode_intel.c:468:22: warning: symbol 'microcode_intel_ops' was not declared. Should it be static? Signed-off-by: Hannes Eder Signed-off-by: Ingo Molnar --- arch/x86/kernel/microcode_core.c | 6 +++--- arch/x86/kernel/microcode_intel.c | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c index 82fb2809ce32..5b711a534495 100644 --- a/arch/x86/kernel/microcode_core.c +++ b/arch/x86/kernel/microcode_core.c @@ -99,7 +99,7 @@ MODULE_LICENSE("GPL"); #define MICROCODE_VERSION "2.00" -struct microcode_ops *microcode_ops; +static struct microcode_ops *microcode_ops; /* no concurrent ->write()s are allowed on /dev/cpu/microcode */ static DEFINE_MUTEX(microcode_mutex); @@ -203,7 +203,7 @@ MODULE_ALIAS_MISCDEV(MICROCODE_MINOR); #endif /* fake device for request_firmware */ -struct platform_device *microcode_pdev; +static struct platform_device *microcode_pdev; static ssize_t reload_store(struct sys_device *dev, struct sysdev_attribute *attr, @@ -319,7 +319,7 @@ static int microcode_resume_cpu(int cpu) return 0; } -void microcode_update_cpu(int cpu) +static void microcode_update_cpu(int cpu) { struct ucode_cpu_info *uci = ucode_cpu_info + cpu; int err = 0; diff --git a/arch/x86/kernel/microcode_intel.c b/arch/x86/kernel/microcode_intel.c index 622dc4a21784..c34c820ee486 100644 --- a/arch/x86/kernel/microcode_intel.c +++ b/arch/x86/kernel/microcode_intel.c @@ -465,7 +465,7 @@ static void microcode_fini_cpu(int cpu) uci->mc = NULL; } -struct microcode_ops microcode_intel_ops = { +static struct microcode_ops microcode_intel_ops = { .request_microcode_user = request_microcode_user, .request_microcode_fw = request_microcode_fw, .collect_cpu_info = collect_cpu_info, -- cgit v1.2.3 From ddeb8f2149de280d54f0c8910cead42e6042b2cb Mon Sep 17 00:00:00 2001 From: Alexander van Heukelum Date: Mon, 24 Nov 2008 13:24:28 +0100 Subject: x86_64: get rid of the use of 
KPROBE_ENTRY / KPROBE_END Impact: clean up assembly macros and annotations - with some object impact entry_64.S is the only user of KPROBE_ENTRY / KPROBE_END on x86_64. This patch reorders entry_64.S and explicitly generates a separate section for functions that need the protection. The generated code before and after the patch is equal. Implicitly changing sections in assembly files makes it more difficult to follow why the assembler is doing certain things. For example, .p2align 5 KPROBE_ENTRY(...) was not doing what you would expect. Other section changes (__ex_table, .fixup, .init.rodata) are done explicitly already. Signed-off-by: Alexander van Heukelum Acked-by: Jan Beulich Signed-off-by: Ingo Molnar --- arch/x86/kernel/entry_64.S | 444 ++++++++++++++++++++++----------------------- 1 file changed, 220 insertions(+), 224 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index f2d546e16354..38fcd0517c31 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -1002,7 +1002,7 @@ END(\sym) .endm .macro paranoidzeroentry sym do_sym -KPROBE_ENTRY(\sym) +ENTRY(\sym) INTR_FRAME PARAVIRT_ADJUST_EXCEPTION_FRAME pushq $-1 /* ORIG_RAX: no syscall to restart */ @@ -1015,11 +1015,11 @@ KPROBE_ENTRY(\sym) call \do_sym jmp paranoid_exit /* %ebx: no swapgs flag */ CFI_ENDPROC -KPROBE_END(\sym) +END(\sym) .endm .macro paranoidzeroentry_ist sym do_sym ist -KPROBE_ENTRY(\sym) +ENTRY(\sym) INTR_FRAME PARAVIRT_ADJUST_EXCEPTION_FRAME pushq $-1 /* ORIG_RAX: no syscall to restart */ @@ -1035,15 +1035,11 @@ KPROBE_ENTRY(\sym) addq $EXCEPTION_STKSZ, per_cpu__init_tss + TSS_ist + (\ist - 1) * 8(%rbp) jmp paranoid_exit /* %ebx: no swapgs flag */ CFI_ENDPROC -KPROBE_END(\sym) +END(\sym) .endm -.macro errorentry sym do_sym entry=0 -.if \entry -KPROBE_ENTRY(\sym) -.else +.macro errorentry sym do_sym ENTRY(\sym) -.endif XCPT_FRAME PARAVIRT_ADJUST_EXCEPTION_FRAME subq $15*8,%rsp @@ -1056,20 +1052,12 @@ ENTRY(\sym) call 
\do_sym jmp error_exit /* %ebx: no swapgs flag */ CFI_ENDPROC -.if \entry -KPROBE_END(\sym) -.else END(\sym) -.endif .endm /* error code is on the stack already */ -.macro paranoiderrorentry sym do_sym entry=1 -.if \entry -KPROBE_ENTRY(\sym) -.else +.macro paranoiderrorentry sym do_sym ENTRY(\sym) -.endif XCPT_FRAME PARAVIRT_ADJUST_EXCEPTION_FRAME subq $15*8,%rsp @@ -1083,166 +1071,23 @@ ENTRY(\sym) call \do_sym jmp paranoid_exit /* %ebx: no swapgs flag */ CFI_ENDPROC -.if \entry -KPROBE_END(\sym) -.else END(\sym) -.endif .endm zeroentry divide_error do_divide_error -paranoidzeroentry_ist debug do_debug DEBUG_STACK -paranoidzeroentry_ist int3 do_int3 DEBUG_STACK zeroentry overflow do_overflow zeroentry bounds do_bounds zeroentry invalid_op do_invalid_op zeroentry device_not_available do_device_not_available -paranoiderrorentry double_fault do_double_fault 0 +paranoiderrorentry double_fault do_double_fault zeroentry coprocessor_segment_overrun do_coprocessor_segment_overrun errorentry invalid_TSS do_invalid_TSS errorentry segment_not_present do_segment_not_present -paranoiderrorentry stack_segment do_stack_segment -errorentry general_protection do_general_protection 1 -errorentry page_fault do_page_fault 1 zeroentry spurious_interrupt_bug do_spurious_interrupt_bug zeroentry coprocessor_error do_coprocessor_error errorentry alignment_check do_alignment_check -#ifdef CONFIG_X86_MCE -paranoidzeroentry machine_check do_machine_check -#endif zeroentry simd_coprocessor_error do_simd_coprocessor_error - /* - * "Paranoid" exit path from exception stack. - * Paranoid because this is used by NMIs and cannot take - * any kernel state for granted. - * We don't do kernel preemption checks here, because only - * NMI should be common and it does not enable IRQs and - * cannot get reschedule ticks. - * - * "trace" is 0 for the NMI handler only, because irq-tracing - * is fundamentally NMI-unsafe. 
(we cannot change the soft and - * hard flags at once, atomically) - */ - - /* ebx: no swapgs flag */ -KPROBE_ENTRY(paranoid_exit) - INTR_FRAME - DISABLE_INTERRUPTS(CLBR_NONE) - TRACE_IRQS_OFF - testl %ebx,%ebx /* swapgs needed? */ - jnz paranoid_restore - testl $3,CS(%rsp) - jnz paranoid_userspace -paranoid_swapgs: - TRACE_IRQS_IRETQ 0 - SWAPGS_UNSAFE_STACK -paranoid_restore: - RESTORE_ALL 8 - jmp irq_return -paranoid_userspace: - GET_THREAD_INFO(%rcx) - movl TI_flags(%rcx),%ebx - andl $_TIF_WORK_MASK,%ebx - jz paranoid_swapgs - movq %rsp,%rdi /* &pt_regs */ - call sync_regs - movq %rax,%rsp /* switch stack for scheduling */ - testl $_TIF_NEED_RESCHED,%ebx - jnz paranoid_schedule - movl %ebx,%edx /* arg3: thread flags */ - TRACE_IRQS_ON - ENABLE_INTERRUPTS(CLBR_NONE) - xorl %esi,%esi /* arg2: oldset */ - movq %rsp,%rdi /* arg1: &pt_regs */ - call do_notify_resume - DISABLE_INTERRUPTS(CLBR_NONE) - TRACE_IRQS_OFF - jmp paranoid_userspace -paranoid_schedule: - TRACE_IRQS_ON - ENABLE_INTERRUPTS(CLBR_ANY) - call schedule - DISABLE_INTERRUPTS(CLBR_ANY) - TRACE_IRQS_OFF - jmp paranoid_userspace - CFI_ENDPROC -KPROBE_END(paranoid_exit) - -/* - * Exception entry point. This expects an error code/orig_rax on the stack. - * returns in "no swapgs flag" in %ebx. - */ -KPROBE_ENTRY(error_entry) - XCPT_FRAME - CFI_ADJUST_CFA_OFFSET 15*8 - /* oldrax contains error code */ - cld - movq_cfi rdi, RDI+8 - movq_cfi rsi, RSI+8 - movq_cfi rdx, RDX+8 - movq_cfi rcx, RCX+8 - movq_cfi rax, RAX+8 - movq_cfi r8, R8+8 - movq_cfi r9, R9+8 - movq_cfi r10, R10+8 - movq_cfi r11, R11+8 - movq_cfi rbx, RBX+8 - movq_cfi rbp, RBP+8 - movq_cfi r12, R12+8 - movq_cfi r13, R13+8 - movq_cfi r14, R14+8 - movq_cfi r15, R15+8 - xorl %ebx,%ebx - testl $3,CS+8(%rsp) - je error_kernelspace -error_swapgs: - SWAPGS -error_sti: - TRACE_IRQS_OFF - ret - CFI_ENDPROC - -/* - * There are two places in the kernel that can potentially fault with - * usergs. Handle them here. 
The exception handlers after iret run with - * kernel gs again, so don't set the user space flag. B stepping K8s - * sometimes report an truncated RIP for IRET exceptions returning to - * compat mode. Check for these here too. - */ -error_kernelspace: - incl %ebx - leaq irq_return(%rip),%rcx - cmpq %rcx,RIP+8(%rsp) - je error_swapgs - movl %ecx,%ecx /* zero extend */ - cmpq %rcx,RIP+8(%rsp) - je error_swapgs - cmpq $gs_change,RIP+8(%rsp) - je error_swapgs - jmp error_sti -KPROBE_END(error_entry) - - -/* ebx: no swapgs flag (1: don't need swapgs, 0: need it) */ -KPROBE_ENTRY(error_exit) - DEFAULT_FRAME - movl %ebx,%eax - RESTORE_REST - DISABLE_INTERRUPTS(CLBR_NONE) - TRACE_IRQS_OFF - GET_THREAD_INFO(%rcx) - testl %eax,%eax - jne retint_kernel - LOCKDEP_SYS_EXIT_IRQ - movl TI_flags(%rcx),%edx - movl $_TIF_WORK_MASK,%edi - andl %edi,%edx - jnz retint_careful - jmp retint_swapgs - CFI_ENDPROC -KPROBE_END(error_exit) - /* Reload gs selector with exception handling */ /* edi: new selector */ ENTRY(native_load_gs_index) @@ -1362,61 +1207,6 @@ ENTRY(kernel_execve) CFI_ENDPROC END(kernel_execve) - /* runs on exception stack */ -KPROBE_ENTRY(nmi) - INTR_FRAME - PARAVIRT_ADJUST_EXCEPTION_FRAME - pushq_cfi $-1 - subq $15*8, %rsp - CFI_ADJUST_CFA_OFFSET 15*8 - call save_paranoid - DEFAULT_FRAME 0 - /* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */ - movq %rsp,%rdi - movq $-1,%rsi - call do_nmi -#ifdef CONFIG_TRACE_IRQFLAGS - /* paranoidexit; without TRACE_IRQS_OFF */ - /* ebx: no swapgs flag */ - DISABLE_INTERRUPTS(CLBR_NONE) - testl %ebx,%ebx /* swapgs needed? 
*/ - jnz nmi_restore - testl $3,CS(%rsp) - jnz nmi_userspace -nmi_swapgs: - SWAPGS_UNSAFE_STACK -nmi_restore: - RESTORE_ALL 8 - jmp irq_return -nmi_userspace: - GET_THREAD_INFO(%rcx) - movl TI_flags(%rcx),%ebx - andl $_TIF_WORK_MASK,%ebx - jz nmi_swapgs - movq %rsp,%rdi /* &pt_regs */ - call sync_regs - movq %rax,%rsp /* switch stack for scheduling */ - testl $_TIF_NEED_RESCHED,%ebx - jnz nmi_schedule - movl %ebx,%edx /* arg3: thread flags */ - ENABLE_INTERRUPTS(CLBR_NONE) - xorl %esi,%esi /* arg2: oldset */ - movq %rsp,%rdi /* arg1: &pt_regs */ - call do_notify_resume - DISABLE_INTERRUPTS(CLBR_NONE) - jmp nmi_userspace -nmi_schedule: - ENABLE_INTERRUPTS(CLBR_ANY) - call schedule - DISABLE_INTERRUPTS(CLBR_ANY) - jmp nmi_userspace - CFI_ENDPROC -#else - jmp paranoid_exit - CFI_ENDPROC -#endif -KPROBE_END(nmi) - /* Call softirq on interrupt stack. Interrupts are off. */ ENTRY(call_softirq) CFI_STARTPROC @@ -1437,13 +1227,6 @@ ENTRY(call_softirq) CFI_ENDPROC END(call_softirq) -KPROBE_ENTRY(ignore_sysret) - CFI_STARTPROC - mov $-ENOSYS,%eax - sysret - CFI_ENDPROC -KPROBE_END(ignore_sysret) - #ifdef CONFIG_XEN zeroentry xen_hypervisor_callback xen_do_hypervisor_callback @@ -1540,3 +1323,216 @@ ENTRY(xen_failsafe_callback) END(xen_failsafe_callback) #endif /* CONFIG_XEN */ + +/* + * Some functions should be protected against kprobes + */ + .pushsection .kprobes.text, "ax" + +paranoidzeroentry_ist debug do_debug DEBUG_STACK +paranoidzeroentry_ist int3 do_int3 DEBUG_STACK +paranoiderrorentry stack_segment do_stack_segment +errorentry general_protection do_general_protection +errorentry page_fault do_page_fault +#ifdef CONFIG_X86_MCE +paranoidzeroentry machine_check do_machine_check +#endif + + /* + * "Paranoid" exit path from exception stack. + * Paranoid because this is used by NMIs and cannot take + * any kernel state for granted. 
+ * We don't do kernel preemption checks here, because only + * NMI should be common and it does not enable IRQs and + * cannot get reschedule ticks. + * + * "trace" is 0 for the NMI handler only, because irq-tracing + * is fundamentally NMI-unsafe. (we cannot change the soft and + * hard flags at once, atomically) + */ + + /* ebx: no swapgs flag */ +ENTRY(paranoid_exit) + INTR_FRAME + DISABLE_INTERRUPTS(CLBR_NONE) + TRACE_IRQS_OFF + testl %ebx,%ebx /* swapgs needed? */ + jnz paranoid_restore + testl $3,CS(%rsp) + jnz paranoid_userspace +paranoid_swapgs: + TRACE_IRQS_IRETQ 0 + SWAPGS_UNSAFE_STACK +paranoid_restore: + RESTORE_ALL 8 + jmp irq_return +paranoid_userspace: + GET_THREAD_INFO(%rcx) + movl TI_flags(%rcx),%ebx + andl $_TIF_WORK_MASK,%ebx + jz paranoid_swapgs + movq %rsp,%rdi /* &pt_regs */ + call sync_regs + movq %rax,%rsp /* switch stack for scheduling */ + testl $_TIF_NEED_RESCHED,%ebx + jnz paranoid_schedule + movl %ebx,%edx /* arg3: thread flags */ + TRACE_IRQS_ON + ENABLE_INTERRUPTS(CLBR_NONE) + xorl %esi,%esi /* arg2: oldset */ + movq %rsp,%rdi /* arg1: &pt_regs */ + call do_notify_resume + DISABLE_INTERRUPTS(CLBR_NONE) + TRACE_IRQS_OFF + jmp paranoid_userspace +paranoid_schedule: + TRACE_IRQS_ON + ENABLE_INTERRUPTS(CLBR_ANY) + call schedule + DISABLE_INTERRUPTS(CLBR_ANY) + TRACE_IRQS_OFF + jmp paranoid_userspace + CFI_ENDPROC +END(paranoid_exit) + +/* + * Exception entry point. This expects an error code/orig_rax on the stack. + * returns in "no swapgs flag" in %ebx. 
+ */ +ENTRY(error_entry) + XCPT_FRAME + CFI_ADJUST_CFA_OFFSET 15*8 + /* oldrax contains error code */ + cld + movq_cfi rdi, RDI+8 + movq_cfi rsi, RSI+8 + movq_cfi rdx, RDX+8 + movq_cfi rcx, RCX+8 + movq_cfi rax, RAX+8 + movq_cfi r8, R8+8 + movq_cfi r9, R9+8 + movq_cfi r10, R10+8 + movq_cfi r11, R11+8 + movq_cfi rbx, RBX+8 + movq_cfi rbp, RBP+8 + movq_cfi r12, R12+8 + movq_cfi r13, R13+8 + movq_cfi r14, R14+8 + movq_cfi r15, R15+8 + xorl %ebx,%ebx + testl $3,CS+8(%rsp) + je error_kernelspace +error_swapgs: + SWAPGS +error_sti: + TRACE_IRQS_OFF + ret + CFI_ENDPROC + +/* + * There are two places in the kernel that can potentially fault with + * usergs. Handle them here. The exception handlers after iret run with + * kernel gs again, so don't set the user space flag. B stepping K8s + * sometimes report an truncated RIP for IRET exceptions returning to + * compat mode. Check for these here too. + */ +error_kernelspace: + incl %ebx + leaq irq_return(%rip),%rcx + cmpq %rcx,RIP+8(%rsp) + je error_swapgs + movl %ecx,%ecx /* zero extend */ + cmpq %rcx,RIP+8(%rsp) + je error_swapgs + cmpq $gs_change,RIP+8(%rsp) + je error_swapgs + jmp error_sti +END(error_entry) + + +/* ebx: no swapgs flag (1: don't need swapgs, 0: need it) */ +ENTRY(error_exit) + DEFAULT_FRAME + movl %ebx,%eax + RESTORE_REST + DISABLE_INTERRUPTS(CLBR_NONE) + TRACE_IRQS_OFF + GET_THREAD_INFO(%rcx) + testl %eax,%eax + jne retint_kernel + LOCKDEP_SYS_EXIT_IRQ + movl TI_flags(%rcx),%edx + movl $_TIF_WORK_MASK,%edi + andl %edi,%edx + jnz retint_careful + jmp retint_swapgs + CFI_ENDPROC +END(error_exit) + + + /* runs on exception stack */ +ENTRY(nmi) + INTR_FRAME + PARAVIRT_ADJUST_EXCEPTION_FRAME + pushq_cfi $-1 + subq $15*8, %rsp + CFI_ADJUST_CFA_OFFSET 15*8 + call save_paranoid + DEFAULT_FRAME 0 + /* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */ + movq %rsp,%rdi + movq $-1,%rsi + call do_nmi +#ifdef CONFIG_TRACE_IRQFLAGS + /* paranoidexit; without TRACE_IRQS_OFF */ + /* ebx: no swapgs flag */ + 
DISABLE_INTERRUPTS(CLBR_NONE) + testl %ebx,%ebx /* swapgs needed? */ + jnz nmi_restore + testl $3,CS(%rsp) + jnz nmi_userspace +nmi_swapgs: + SWAPGS_UNSAFE_STACK +nmi_restore: + RESTORE_ALL 8 + jmp irq_return +nmi_userspace: + GET_THREAD_INFO(%rcx) + movl TI_flags(%rcx),%ebx + andl $_TIF_WORK_MASK,%ebx + jz nmi_swapgs + movq %rsp,%rdi /* &pt_regs */ + call sync_regs + movq %rax,%rsp /* switch stack for scheduling */ + testl $_TIF_NEED_RESCHED,%ebx + jnz nmi_schedule + movl %ebx,%edx /* arg3: thread flags */ + ENABLE_INTERRUPTS(CLBR_NONE) + xorl %esi,%esi /* arg2: oldset */ + movq %rsp,%rdi /* arg1: &pt_regs */ + call do_notify_resume + DISABLE_INTERRUPTS(CLBR_NONE) + jmp nmi_userspace +nmi_schedule: + ENABLE_INTERRUPTS(CLBR_ANY) + call schedule + DISABLE_INTERRUPTS(CLBR_ANY) + jmp nmi_userspace + CFI_ENDPROC +#else + jmp paranoid_exit + CFI_ENDPROC +#endif +END(nmi) + +ENTRY(ignore_sysret) + CFI_STARTPROC + mov $-ENOSYS,%eax + sysret + CFI_ENDPROC +END(ignore_sysret) + +/* + * End of kprobes section + */ + .popsection -- cgit v1.2.3 From d211af055d0c12dc3416c2886e6fbdc6eb74a381 Mon Sep 17 00:00:00 2001 From: Alexander van Heukelum Date: Mon, 24 Nov 2008 15:38:45 +0100 Subject: i386: get rid of the use of KPROBE_ENTRY / KPROBE_END entry_32.S is now the only user of KPROBE_ENTRY / KPROBE_END, treewide. This patch reorders entry_32.S and explicitly generates a separate section for functions that need the protection. The generated code before and after the patch is equal. The KPROBE_ENTRY and KPROBE_END macros are removed too. 
Signed-off-by: Alexander van Heukelum Signed-off-by: Ingo Molnar --- arch/x86/kernel/entry_32.S | 438 +++++++++++++++++++++++---------------------- include/linux/linkage.h | 8 - 2 files changed, 224 insertions(+), 222 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index bd02ec77edc4..6e96028d1a9c 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -688,65 +688,6 @@ ENDPROC(name) /* The include is where all of the SMP etc. interrupts come from */ #include "entry_arch.h" -KPROBE_ENTRY(page_fault) - RING0_EC_FRAME - pushl $do_page_fault - CFI_ADJUST_CFA_OFFSET 4 - ALIGN -error_code: - /* the function address is in %fs's slot on the stack */ - pushl %es - CFI_ADJUST_CFA_OFFSET 4 - /*CFI_REL_OFFSET es, 0*/ - pushl %ds - CFI_ADJUST_CFA_OFFSET 4 - /*CFI_REL_OFFSET ds, 0*/ - pushl %eax - CFI_ADJUST_CFA_OFFSET 4 - CFI_REL_OFFSET eax, 0 - pushl %ebp - CFI_ADJUST_CFA_OFFSET 4 - CFI_REL_OFFSET ebp, 0 - pushl %edi - CFI_ADJUST_CFA_OFFSET 4 - CFI_REL_OFFSET edi, 0 - pushl %esi - CFI_ADJUST_CFA_OFFSET 4 - CFI_REL_OFFSET esi, 0 - pushl %edx - CFI_ADJUST_CFA_OFFSET 4 - CFI_REL_OFFSET edx, 0 - pushl %ecx - CFI_ADJUST_CFA_OFFSET 4 - CFI_REL_OFFSET ecx, 0 - pushl %ebx - CFI_ADJUST_CFA_OFFSET 4 - CFI_REL_OFFSET ebx, 0 - cld - pushl %fs - CFI_ADJUST_CFA_OFFSET 4 - /*CFI_REL_OFFSET fs, 0*/ - movl $(__KERNEL_PERCPU), %ecx - movl %ecx, %fs - UNWIND_ESPFIX_STACK - popl %ecx - CFI_ADJUST_CFA_OFFSET -4 - /*CFI_REGISTER es, ecx*/ - movl PT_FS(%esp), %edi # get the function address - movl PT_ORIG_EAX(%esp), %edx # get the error code - movl $-1, PT_ORIG_EAX(%esp) # no syscall to restart - mov %ecx, PT_FS(%esp) - /*CFI_REL_OFFSET fs, ES*/ - movl $(__USER_DS), %ecx - movl %ecx, %ds - movl %ecx, %es - TRACE_IRQS_OFF - movl %esp,%eax # pt_regs pointer - call *%edi - jmp ret_from_exception - CFI_ENDPROC -KPROBE_END(page_fault) - ENTRY(coprocessor_error) RING0_INT_FRAME pushl $0 @@ -777,140 +718,6 @@ 
ENTRY(device_not_available) CFI_ENDPROC END(device_not_available) -/* - * Debug traps and NMI can happen at the one SYSENTER instruction - * that sets up the real kernel stack. Check here, since we can't - * allow the wrong stack to be used. - * - * "TSS_sysenter_sp0+12" is because the NMI/debug handler will have - * already pushed 3 words if it hits on the sysenter instruction: - * eflags, cs and eip. - * - * We just load the right stack, and push the three (known) values - * by hand onto the new stack - while updating the return eip past - * the instruction that would have done it for sysenter. - */ -#define FIX_STACK(offset, ok, label) \ - cmpw $__KERNEL_CS,4(%esp); \ - jne ok; \ -label: \ - movl TSS_sysenter_sp0+offset(%esp),%esp; \ - CFI_DEF_CFA esp, 0; \ - CFI_UNDEFINED eip; \ - pushfl; \ - CFI_ADJUST_CFA_OFFSET 4; \ - pushl $__KERNEL_CS; \ - CFI_ADJUST_CFA_OFFSET 4; \ - pushl $sysenter_past_esp; \ - CFI_ADJUST_CFA_OFFSET 4; \ - CFI_REL_OFFSET eip, 0 - -KPROBE_ENTRY(debug) - RING0_INT_FRAME - cmpl $ia32_sysenter_target,(%esp) - jne debug_stack_correct - FIX_STACK(12, debug_stack_correct, debug_esp_fix_insn) -debug_stack_correct: - pushl $-1 # mark this as an int - CFI_ADJUST_CFA_OFFSET 4 - SAVE_ALL - TRACE_IRQS_OFF - xorl %edx,%edx # error code 0 - movl %esp,%eax # pt_regs pointer - call do_debug - jmp ret_from_exception - CFI_ENDPROC -KPROBE_END(debug) - -/* - * NMI is doubly nasty. It can happen _while_ we're handling - * a debug fault, and the debug fault hasn't yet been able to - * clear up the stack. So we first check whether we got an - * NMI on the sysenter entry path, but after that we need to - * check whether we got an NMI on the debug path where the debug - * fault happened on the sysenter path. 
- */ -KPROBE_ENTRY(nmi) - RING0_INT_FRAME - pushl %eax - CFI_ADJUST_CFA_OFFSET 4 - movl %ss, %eax - cmpw $__ESPFIX_SS, %ax - popl %eax - CFI_ADJUST_CFA_OFFSET -4 - je nmi_espfix_stack - cmpl $ia32_sysenter_target,(%esp) - je nmi_stack_fixup - pushl %eax - CFI_ADJUST_CFA_OFFSET 4 - movl %esp,%eax - /* Do not access memory above the end of our stack page, - * it might not exist. - */ - andl $(THREAD_SIZE-1),%eax - cmpl $(THREAD_SIZE-20),%eax - popl %eax - CFI_ADJUST_CFA_OFFSET -4 - jae nmi_stack_correct - cmpl $ia32_sysenter_target,12(%esp) - je nmi_debug_stack_check -nmi_stack_correct: - /* We have a RING0_INT_FRAME here */ - pushl %eax - CFI_ADJUST_CFA_OFFSET 4 - SAVE_ALL - TRACE_IRQS_OFF - xorl %edx,%edx # zero error code - movl %esp,%eax # pt_regs pointer - call do_nmi - jmp restore_nocheck_notrace - CFI_ENDPROC - -nmi_stack_fixup: - RING0_INT_FRAME - FIX_STACK(12,nmi_stack_correct, 1) - jmp nmi_stack_correct - -nmi_debug_stack_check: - /* We have a RING0_INT_FRAME here */ - cmpw $__KERNEL_CS,16(%esp) - jne nmi_stack_correct - cmpl $debug,(%esp) - jb nmi_stack_correct - cmpl $debug_esp_fix_insn,(%esp) - ja nmi_stack_correct - FIX_STACK(24,nmi_stack_correct, 1) - jmp nmi_stack_correct - -nmi_espfix_stack: - /* We have a RING0_INT_FRAME here. 
- * - * create the pointer to lss back - */ - pushl %ss - CFI_ADJUST_CFA_OFFSET 4 - pushl %esp - CFI_ADJUST_CFA_OFFSET 4 - addw $4, (%esp) - /* copy the iret frame of 12 bytes */ - .rept 3 - pushl 16(%esp) - CFI_ADJUST_CFA_OFFSET 4 - .endr - pushl %eax - CFI_ADJUST_CFA_OFFSET 4 - SAVE_ALL - TRACE_IRQS_OFF - FIXUP_ESPFIX_STACK # %eax == %esp - xorl %edx,%edx # zero error code - call do_nmi - RESTORE_REGS - lss 12+4(%esp), %esp # back to espfix stack - CFI_ADJUST_CFA_OFFSET -24 - jmp irq_return - CFI_ENDPROC -KPROBE_END(nmi) - #ifdef CONFIG_PARAVIRT ENTRY(native_iret) iret @@ -926,19 +733,6 @@ ENTRY(native_irq_enable_sysexit) END(native_irq_enable_sysexit) #endif -KPROBE_ENTRY(int3) - RING0_INT_FRAME - pushl $-1 # mark this as an int - CFI_ADJUST_CFA_OFFSET 4 - SAVE_ALL - TRACE_IRQS_OFF - xorl %edx,%edx # zero error code - movl %esp,%eax # pt_regs pointer - call do_int3 - jmp ret_from_exception - CFI_ENDPROC -KPROBE_END(int3) - ENTRY(overflow) RING0_INT_FRAME pushl $0 @@ -1003,14 +797,6 @@ ENTRY(stack_segment) CFI_ENDPROC END(stack_segment) -KPROBE_ENTRY(general_protection) - RING0_EC_FRAME - pushl $do_general_protection - CFI_ADJUST_CFA_OFFSET 4 - jmp error_code - CFI_ENDPROC -KPROBE_END(general_protection) - ENTRY(alignment_check) RING0_EC_FRAME pushl $do_alignment_check @@ -1220,3 +1006,227 @@ END(mcount) #include "syscall_table_32.S" syscall_table_size=(.-sys_call_table) + +/* + * Some functions should be protected against kprobes + */ + .pushsection .kprobes.text, "ax" + +ENTRY(page_fault) + RING0_EC_FRAME + pushl $do_page_fault + CFI_ADJUST_CFA_OFFSET 4 + ALIGN +error_code: + /* the function address is in %fs's slot on the stack */ + pushl %es + CFI_ADJUST_CFA_OFFSET 4 + /*CFI_REL_OFFSET es, 0*/ + pushl %ds + CFI_ADJUST_CFA_OFFSET 4 + /*CFI_REL_OFFSET ds, 0*/ + pushl %eax + CFI_ADJUST_CFA_OFFSET 4 + CFI_REL_OFFSET eax, 0 + pushl %ebp + CFI_ADJUST_CFA_OFFSET 4 + CFI_REL_OFFSET ebp, 0 + pushl %edi + CFI_ADJUST_CFA_OFFSET 4 + CFI_REL_OFFSET edi, 0 + pushl %esi + 
CFI_ADJUST_CFA_OFFSET 4 + CFI_REL_OFFSET esi, 0 + pushl %edx + CFI_ADJUST_CFA_OFFSET 4 + CFI_REL_OFFSET edx, 0 + pushl %ecx + CFI_ADJUST_CFA_OFFSET 4 + CFI_REL_OFFSET ecx, 0 + pushl %ebx + CFI_ADJUST_CFA_OFFSET 4 + CFI_REL_OFFSET ebx, 0 + cld + pushl %fs + CFI_ADJUST_CFA_OFFSET 4 + /*CFI_REL_OFFSET fs, 0*/ + movl $(__KERNEL_PERCPU), %ecx + movl %ecx, %fs + UNWIND_ESPFIX_STACK + popl %ecx + CFI_ADJUST_CFA_OFFSET -4 + /*CFI_REGISTER es, ecx*/ + movl PT_FS(%esp), %edi # get the function address + movl PT_ORIG_EAX(%esp), %edx # get the error code + movl $-1, PT_ORIG_EAX(%esp) # no syscall to restart + mov %ecx, PT_FS(%esp) + /*CFI_REL_OFFSET fs, ES*/ + movl $(__USER_DS), %ecx + movl %ecx, %ds + movl %ecx, %es + TRACE_IRQS_OFF + movl %esp,%eax # pt_regs pointer + call *%edi + jmp ret_from_exception + CFI_ENDPROC +END(page_fault) + +/* + * Debug traps and NMI can happen at the one SYSENTER instruction + * that sets up the real kernel stack. Check here, since we can't + * allow the wrong stack to be used. + * + * "TSS_sysenter_sp0+12" is because the NMI/debug handler will have + * already pushed 3 words if it hits on the sysenter instruction: + * eflags, cs and eip. + * + * We just load the right stack, and push the three (known) values + * by hand onto the new stack - while updating the return eip past + * the instruction that would have done it for sysenter. 
+ */ +#define FIX_STACK(offset, ok, label) \ + cmpw $__KERNEL_CS,4(%esp); \ + jne ok; \ +label: \ + movl TSS_sysenter_sp0+offset(%esp),%esp; \ + CFI_DEF_CFA esp, 0; \ + CFI_UNDEFINED eip; \ + pushfl; \ + CFI_ADJUST_CFA_OFFSET 4; \ + pushl $__KERNEL_CS; \ + CFI_ADJUST_CFA_OFFSET 4; \ + pushl $sysenter_past_esp; \ + CFI_ADJUST_CFA_OFFSET 4; \ + CFI_REL_OFFSET eip, 0 + +ENTRY(debug) + RING0_INT_FRAME + cmpl $ia32_sysenter_target,(%esp) + jne debug_stack_correct + FIX_STACK(12, debug_stack_correct, debug_esp_fix_insn) +debug_stack_correct: + pushl $-1 # mark this as an int + CFI_ADJUST_CFA_OFFSET 4 + SAVE_ALL + TRACE_IRQS_OFF + xorl %edx,%edx # error code 0 + movl %esp,%eax # pt_regs pointer + call do_debug + jmp ret_from_exception + CFI_ENDPROC +END(debug) + +/* + * NMI is doubly nasty. It can happen _while_ we're handling + * a debug fault, and the debug fault hasn't yet been able to + * clear up the stack. So we first check whether we got an + * NMI on the sysenter entry path, but after that we need to + * check whether we got an NMI on the debug path where the debug + * fault happened on the sysenter path. + */ +ENTRY(nmi) + RING0_INT_FRAME + pushl %eax + CFI_ADJUST_CFA_OFFSET 4 + movl %ss, %eax + cmpw $__ESPFIX_SS, %ax + popl %eax + CFI_ADJUST_CFA_OFFSET -4 + je nmi_espfix_stack + cmpl $ia32_sysenter_target,(%esp) + je nmi_stack_fixup + pushl %eax + CFI_ADJUST_CFA_OFFSET 4 + movl %esp,%eax + /* Do not access memory above the end of our stack page, + * it might not exist. 
+ */ + andl $(THREAD_SIZE-1),%eax + cmpl $(THREAD_SIZE-20),%eax + popl %eax + CFI_ADJUST_CFA_OFFSET -4 + jae nmi_stack_correct + cmpl $ia32_sysenter_target,12(%esp) + je nmi_debug_stack_check +nmi_stack_correct: + /* We have a RING0_INT_FRAME here */ + pushl %eax + CFI_ADJUST_CFA_OFFSET 4 + SAVE_ALL + TRACE_IRQS_OFF + xorl %edx,%edx # zero error code + movl %esp,%eax # pt_regs pointer + call do_nmi + jmp restore_nocheck_notrace + CFI_ENDPROC + +nmi_stack_fixup: + RING0_INT_FRAME + FIX_STACK(12,nmi_stack_correct, 1) + jmp nmi_stack_correct + +nmi_debug_stack_check: + /* We have a RING0_INT_FRAME here */ + cmpw $__KERNEL_CS,16(%esp) + jne nmi_stack_correct + cmpl $debug,(%esp) + jb nmi_stack_correct + cmpl $debug_esp_fix_insn,(%esp) + ja nmi_stack_correct + FIX_STACK(24,nmi_stack_correct, 1) + jmp nmi_stack_correct + +nmi_espfix_stack: + /* We have a RING0_INT_FRAME here. + * + * create the pointer to lss back + */ + pushl %ss + CFI_ADJUST_CFA_OFFSET 4 + pushl %esp + CFI_ADJUST_CFA_OFFSET 4 + addw $4, (%esp) + /* copy the iret frame of 12 bytes */ + .rept 3 + pushl 16(%esp) + CFI_ADJUST_CFA_OFFSET 4 + .endr + pushl %eax + CFI_ADJUST_CFA_OFFSET 4 + SAVE_ALL + TRACE_IRQS_OFF + FIXUP_ESPFIX_STACK # %eax == %esp + xorl %edx,%edx # zero error code + call do_nmi + RESTORE_REGS + lss 12+4(%esp), %esp # back to espfix stack + CFI_ADJUST_CFA_OFFSET -24 + jmp irq_return + CFI_ENDPROC +END(nmi) + +ENTRY(int3) + RING0_INT_FRAME + pushl $-1 # mark this as an int + CFI_ADJUST_CFA_OFFSET 4 + SAVE_ALL + TRACE_IRQS_OFF + xorl %edx,%edx # zero error code + movl %esp,%eax # pt_regs pointer + call do_int3 + jmp ret_from_exception + CFI_ENDPROC +END(int3) + +ENTRY(general_protection) + RING0_EC_FRAME + pushl $do_general_protection + CFI_ADJUST_CFA_OFFSET 4 + jmp error_code + CFI_ENDPROC +END(general_protection) + +/* + * End of kprobes section + */ + .popsection diff --git a/include/linux/linkage.h b/include/linux/linkage.h index 9fd1f859021b..fee9e59649c1 100644 --- 
a/include/linux/linkage.h +++ b/include/linux/linkage.h @@ -64,14 +64,6 @@ name: #endif -#define KPROBE_ENTRY(name) \ - .pushsection .kprobes.text, "ax"; \ - ENTRY(name) - -#define KPROBE_END(name) \ - END(name); \ - .popsection - #ifndef END #define END(name) \ .size name, .-name -- cgit v1.2.3 From 33454539f386a2beb38269bea5fff82b3d56b0e9 Mon Sep 17 00:00:00 2001 From: "gorcunov@gmail.com" Date: Wed, 26 Nov 2008 22:17:02 +0300 Subject: x86: entry_64.S - use X86_EFLAGS_IF instead of hardcoded number Impact: cleanup Signed-off-by: Cyrill Gorcunov Acked-by: Alexander van Heukelum Signed-off-by: Ingo Molnar --- arch/x86/kernel/entry_64.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 38fcd0517c31..1c309d546518 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -194,7 +194,7 @@ ENTRY(native_usergs_sysret64) pushq %rax /* rsp */ CFI_ADJUST_CFA_OFFSET 8 CFI_REL_OFFSET rsp,0 - pushq $(1<<9) /* eflags - interrupts on */ + pushq $X86_EFLAGS_IF /* eflags - interrupts on */ CFI_ADJUST_CFA_OFFSET 8 /*CFI_REL_OFFSET rflags,0*/ pushq $__KERNEL_CS /* cs */ -- cgit v1.2.3 From c2c631e318091118587f3b766347d259c9265b8b Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Wed, 26 Nov 2008 22:17:00 +0300 Subject: x86: entry_64.S - use ENTRY to define child_rip child_rip is not called by its name but rather indirectly, so make it global and aligned.
Signed-off-by: Cyrill Gorcunov Acked-by: Alexander van Heukelum Signed-off-by: Ingo Molnar --- arch/x86/kernel/entry_64.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 1c309d546518..0a910a7f85f5 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -1160,7 +1160,7 @@ ENTRY(kernel_thread) CFI_ENDPROC END(kernel_thread) -child_rip: +ENTRY(child_rip) pushq $0 # fake return address CFI_STARTPROC /* -- cgit v1.2.3 From 1d9b16d1690fe5edb1c907fe4746681cf026cdf3 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Thu, 27 Nov 2008 18:39:15 +0100 Subject: x86: move GART specific stuff from iommu.h to gart.h Impact: cleanup Signed-off-by: Joerg Roedel Signed-off-by: Ingo Molnar --- arch/x86/include/asm/gart.h | 33 +++++++++++++++++++++++++++++++++ arch/x86/include/asm/iommu.h | 33 --------------------------------- arch/x86/kernel/amd_iommu.c | 1 + arch/x86/kernel/amd_iommu_init.c | 1 + arch/x86/kernel/early-quirks.c | 1 + arch/x86/kernel/pci-dma.c | 1 + arch/x86/kernel/setup.c | 1 + 7 files changed, 38 insertions(+), 33 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/gart.h b/arch/x86/include/asm/gart.h index 74252264433d..6cfdafa409d8 100644 --- a/arch/x86/include/asm/gart.h +++ b/arch/x86/include/asm/gart.h @@ -29,6 +29,39 @@ extern int fix_aperture; #define AMD64_GARTCACHECTL 0x9c #define AMD64_GARTEN (1<<0) +#ifdef CONFIG_GART_IOMMU +extern int gart_iommu_aperture; +extern int gart_iommu_aperture_allowed; +extern int gart_iommu_aperture_disabled; + +extern void early_gart_iommu_check(void); +extern void gart_iommu_init(void); +extern void gart_iommu_shutdown(void); +extern void __init gart_parse_options(char *); +extern void gart_iommu_hole_init(void); + +#else +#define gart_iommu_aperture 0 +#define gart_iommu_aperture_allowed 0 +#define gart_iommu_aperture_disabled 1 + +static inline void early_gart_iommu_check(void) +{ +} 
+static inline void gart_iommu_init(void) +{ +} +static inline void gart_iommu_shutdown(void) +{ +} +static inline void gart_parse_options(char *options) +{ +} +static inline void gart_iommu_hole_init(void) +{ +} +#endif + extern int agp_amd64_init(void); static inline void enable_gart_translation(struct pci_dev *dev, u64 addr) diff --git a/arch/x86/include/asm/iommu.h b/arch/x86/include/asm/iommu.h index 0b500c5b6446..295b13193f4d 100644 --- a/arch/x86/include/asm/iommu.h +++ b/arch/x86/include/asm/iommu.h @@ -12,37 +12,4 @@ extern unsigned long iommu_nr_pages(unsigned long addr, unsigned long len); /* 10 seconds */ #define DMAR_OPERATION_TIMEOUT ((cycles_t) tsc_khz*10*1000) -#ifdef CONFIG_GART_IOMMU -extern int gart_iommu_aperture; -extern int gart_iommu_aperture_allowed; -extern int gart_iommu_aperture_disabled; - -extern void early_gart_iommu_check(void); -extern void gart_iommu_init(void); -extern void gart_iommu_shutdown(void); -extern void __init gart_parse_options(char *); -extern void gart_iommu_hole_init(void); - -#else -#define gart_iommu_aperture 0 -#define gart_iommu_aperture_allowed 0 -#define gart_iommu_aperture_disabled 1 - -static inline void early_gart_iommu_check(void) -{ -} -static inline void gart_iommu_init(void) -{ -} -static inline void gart_iommu_shutdown(void) -{ -} -static inline void gart_parse_options(char *options) -{ -} -static inline void gart_iommu_hole_init(void) -{ -} -#endif - #endif /* _ASM_X86_IOMMU_H */ diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index 331b318304eb..172e0dc4641e 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -24,6 +24,7 @@ #include #include #include +#include #include #include diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c index 0cdcda35a05f..7685f0774a8f 100644 --- a/arch/x86/kernel/amd_iommu_init.c +++ b/arch/x86/kernel/amd_iommu_init.c @@ -28,6 +28,7 @@ #include #include #include +#include /* * definitions for the 
ACPI scanning code diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c index 1b894b72c0f5..744aa7fc49d5 100644 --- a/arch/x86/kernel/early-quirks.c +++ b/arch/x86/kernel/early-quirks.c @@ -17,6 +17,7 @@ #include #include #include +#include static void __init fix_hypertransport_config(int num, int slot, int func) { diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index 192624820217..12eeb4bfcdeb 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -6,6 +6,7 @@ #include #include #include +#include #include #include diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 0fa6790c1dd3..67d5979e654e 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -93,6 +93,7 @@ #include #include #include +#include #include #include -- cgit v1.2.3 From 5ae3a139cf4fc2349f1dfa1993a66c1dcc119468 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Thu, 27 Nov 2008 00:02:10 +0300 Subject: x86: uv bau interrupt -- use proper interrupt number Signed-off-by: Cyrill Gorcunov Acked-by: Cliff Wickman Signed-off-by: Ingo Molnar --- arch/x86/kernel/entry_64.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 0a910a7f85f5..57d7f7a5ad2f 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -938,7 +938,7 @@ apicinterrupt IRQ_MOVE_CLEANUP_VECTOR \ irq_move_cleanup_interrupt smp_irq_move_cleanup_interrupt #endif -apicinterrupt 220 \ +apicinterrupt UV_BAU_MESSAGE \ uv_bau_message_intr1 uv_bau_message_interrupt apicinterrupt LOCAL_TIMER_VECTOR \ apic_timer_interrupt smp_apic_timer_interrupt -- cgit v1.2.3 From 9f1e87ea3ecb3c46c21f6a1a202ec82f99ed2473 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Thu, 27 Nov 2008 21:10:08 +0300 Subject: x86: entry_64.S - trivial: space, comments fixup Impact: cleanup Signed-off-by: Cyrill Gorcunov Signed-off-by: Ingo Molnar --- 
arch/x86/kernel/entry_64.S | 94 +++++++++++++++++++++++----------------------- 1 file changed, 48 insertions(+), 46 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 57d7f7a5ad2f..08c0c9777a09 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -1020,7 +1020,7 @@ END(\sym) .macro paranoidzeroentry_ist sym do_sym ist ENTRY(\sym) - INTR_FRAME + INTR_FRAME PARAVIRT_ADJUST_EXCEPTION_FRAME pushq $-1 /* ORIG_RAX: no syscall to restart */ CFI_ADJUST_CFA_OFFSET 8 @@ -1088,36 +1088,36 @@ zeroentry coprocessor_error do_coprocessor_error errorentry alignment_check do_alignment_check zeroentry simd_coprocessor_error do_simd_coprocessor_error - /* Reload gs selector with exception handling */ - /* edi: new selector */ + /* Reload gs selector with exception handling */ + /* edi: new selector */ ENTRY(native_load_gs_index) CFI_STARTPROC pushf CFI_ADJUST_CFA_OFFSET 8 DISABLE_INTERRUPTS(CLBR_ANY | ~(CLBR_RDI)) - SWAPGS + SWAPGS gs_change: - movl %edi,%gs + movl %edi,%gs 2: mfence /* workaround */ SWAPGS - popf + popf CFI_ADJUST_CFA_OFFSET -8 - ret + ret CFI_ENDPROC END(native_load_gs_index) - .section __ex_table,"a" - .align 8 - .quad gs_change,bad_gs - .previous - .section .fixup,"ax" + .section __ex_table,"a" + .align 8 + .quad gs_change,bad_gs + .previous + .section .fixup,"ax" /* running with kernelgs */ bad_gs: SWAPGS /* switch back to user gs */ xorl %eax,%eax - movl %eax,%gs - jmp 2b - .previous + movl %eax,%gs + jmp 2b + .previous /* * Create a kernel thread. @@ -1152,7 +1152,7 @@ ENTRY(kernel_thread) * so internally to the x86_64 port you can rely on kernel_thread() * not to reschedule the child before returning, this avoids the need * of hacks for example to fork off the per-CPU idle tasks. 
- * [Hopefully no generic code relies on the reschedule -AK] + * [Hopefully no generic code relies on the reschedule -AK] */ RESTORE_ALL UNFAKE_STACK_FRAME @@ -1231,22 +1231,24 @@ END(call_softirq) zeroentry xen_hypervisor_callback xen_do_hypervisor_callback /* -# A note on the "critical region" in our callback handler. -# We want to avoid stacking callback handlers due to events occurring -# during handling of the last event. To do this, we keep events disabled -# until we've done all processing. HOWEVER, we must enable events before -# popping the stack frame (can't be done atomically) and so it would still -# be possible to get enough handler activations to overflow the stack. -# Although unlikely, bugs of that kind are hard to track down, so we'd -# like to avoid the possibility. -# So, on entry to the handler we detect whether we interrupted an -# existing activation in its critical region -- if so, we pop the current -# activation and restart the handler using the previous one. -*/ + * A note on the "critical region" in our callback handler. + * We want to avoid stacking callback handlers due to events occurring + * during handling of the last event. To do this, we keep events disabled + * until we've done all processing. HOWEVER, we must enable events before + * popping the stack frame (can't be done atomically) and so it would still + * be possible to get enough handler activations to overflow the stack. + * Although unlikely, bugs of that kind are hard to track down, so we'd + * like to avoid the possibility. + * So, on entry to the handler we detect whether we interrupted an + * existing activation in its critical region -- if so, we pop the current + * activation and restart the handler using the previous one. 
+ */ ENTRY(xen_do_hypervisor_callback) # do_hypervisor_callback(struct *pt_regs) CFI_STARTPROC -/* Since we don't modify %rdi, evtchn_do_upall(struct *pt_regs) will - see the correct pointer to the pt_regs */ +/* + * Since we don't modify %rdi, evtchn_do_upall(struct *pt_regs) will + * see the correct pointer to the pt_regs + */ movq %rdi, %rsp # we don't return, adjust the stack frame CFI_ENDPROC DEFAULT_FRAME @@ -1264,18 +1266,18 @@ ENTRY(xen_do_hypervisor_callback) # do_hypervisor_callback(struct *pt_regs) END(do_hypervisor_callback) /* -# Hypervisor uses this for application faults while it executes. -# We get here for two reasons: -# 1. Fault while reloading DS, ES, FS or GS -# 2. Fault while executing IRET -# Category 1 we do not need to fix up as Xen has already reloaded all segment -# registers that could be reloaded and zeroed the others. -# Category 2 we fix up by killing the current process. We cannot use the -# normal Linux return path in this case because if we use the IRET hypercall -# to pop the stack frame we end up in an infinite loop of failsafe callbacks. -# We distinguish between categories by comparing each saved segment register -# with its current contents: any discrepancy means we in category 1. -*/ + * Hypervisor uses this for application faults while it executes. + * We get here for two reasons: + * 1. Fault while reloading DS, ES, FS or GS + * 2. Fault while executing IRET + * Category 1 we do not need to fix up as Xen has already reloaded all segment + * registers that could be reloaded and zeroed the others. + * Category 2 we fix up by killing the current process. We cannot use the + * normal Linux return path in this case because if we use the IRET hypercall + * to pop the stack frame we end up in an infinite loop of failsafe callbacks. + * We distinguish between categories by comparing each saved segment register + * with its current contents: any discrepancy means we in category 1. 
+ */ ENTRY(xen_failsafe_callback) INTR_FRAME 1 (6*8) /*CFI_REL_OFFSET gs,GS*/ @@ -1339,8 +1341,8 @@ paranoidzeroentry machine_check do_machine_check #endif /* - * "Paranoid" exit path from exception stack. - * Paranoid because this is used by NMIs and cannot take + * "Paranoid" exit path from exception stack. + * Paranoid because this is used by NMIs and cannot take * any kernel state for granted. * We don't do kernel preemption checks here, because only * NMI should be common and it does not enable IRQs and @@ -1445,7 +1447,7 @@ error_kernelspace: cmpq %rcx,RIP+8(%rsp) je error_swapgs cmpq $gs_change,RIP+8(%rsp) - je error_swapgs + je error_swapgs jmp error_sti END(error_entry) @@ -1521,7 +1523,7 @@ nmi_schedule: CFI_ENDPROC #else jmp paranoid_exit - CFI_ENDPROC + CFI_ENDPROC #endif END(nmi) -- cgit v1.2.3 From 5b3eec0c80038c8739ccd465b897a35c0dff1cc4 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 27 Nov 2008 14:41:21 +0100 Subject: x86: ret_from_fork - get rid of jump back Impact: remove dead code If we take a closer look at the rff_trace/rff_action ret_from_fork code, we have to realize that it does all the wrong things: for example it checks the TIF flag - while later on jumping back to the ret-from-syscall path - duplicating the check needlessly. But checking for _TIF_SYSCALL_TRACE is completely unnecessary here because we clear that flag for every freshly forked task. So the whole "tracing" code here, for which there is a out of line jump optimization that makes it even harder to read, is in reality completely dead code ... 
Reported-by: Cyrill Gorcunov Signed-off-by: Ingo Molnar Tested-by: Cyrill Gorcunov --- arch/x86/kernel/entry_64.S | 31 ++++++++++++++++--------------- 1 file changed, 16 insertions(+), 15 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index e41734a537bd..3194636a4293 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -361,34 +361,35 @@ ENTRY(save_paranoid) END(save_paranoid) /* - * A newly forked process directly context switches into this. + * A newly forked process directly context switches into this address. + * + * rdi: prev task we switched from */ -/* rdi: prev */ ENTRY(ret_from_fork) DEFAULT_FRAME + push kernel_eflags(%rip) CFI_ADJUST_CFA_OFFSET 8 - popf # reset kernel eflags + popf # reset kernel eflags CFI_ADJUST_CFA_OFFSET -8 - call schedule_tail + + call schedule_tail # rdi: 'prev' task parameter + GET_THREAD_INFO(%rcx) - testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT),TI_flags(%rcx) + CFI_REMEMBER_STATE - jnz rff_trace -rff_action: RESTORE_REST - testl $3,CS-ARGOFFSET(%rsp) # from kernel_thread? + + testl $3, CS-ARGOFFSET(%rsp) # from kernel_thread? je int_ret_from_sys_call - testl $_TIF_IA32,TI_flags(%rcx) + + testl $_TIF_IA32, TI_flags(%rcx) # 32-bit compat task needs IRET jnz int_ret_from_sys_call + RESTORE_TOP_OF_STACK %rdi, -ARGOFFSET - jmp ret_from_sys_call + jmp ret_from_sys_call # go to the SYSRET fastpath + CFI_RESTORE_STATE -rff_trace: - movq %rsp,%rdi - call syscall_trace_leave - GET_THREAD_INFO(%rcx) - jmp rff_action CFI_ENDPROC END(ret_from_fork) -- cgit v1.2.3 From 8caac56305cef98f9357b060a77939d17699937d Mon Sep 17 00:00:00 2001 From: Pavel Machek Date: Wed, 26 Nov 2008 17:15:27 +0100 Subject: aperture_64.c: clarify that too small aperture is valid reason for this code Impact: update comment Clarify that too small aperture is valid reason for this code. 
Signed-off-by: Pavel Machek Signed-off-by: Ingo Molnar --- arch/x86/kernel/aperture_64.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c index 9a32b37ee2ee..676debfc1702 100644 --- a/arch/x86/kernel/aperture_64.c +++ b/arch/x86/kernel/aperture_64.c @@ -1,8 +1,9 @@ /* * Firmware replacement code. * - * Work around broken BIOSes that don't set an aperture or only set the - * aperture in the AGP bridge. + * Work around broken BIOSes that don't set an aperture, only set the + * aperture in the AGP bridge, or set too small aperture. + * * If all fails map the aperture over some low memory. This is cheaper than * doing bounce buffering. The memory is lost. This is done at early boot * because only the bootmem allocator can allocate 32+MB. -- cgit v1.2.3 From 4385cecf1f5866fb33fc95e2ee26a44e9b6f6be2 Mon Sep 17 00:00:00 2001 From: Jiri Slaby Date: Sat, 29 Nov 2008 22:33:16 +0100 Subject: x86: intel_cacheinfo, minor show_type cleanup Impact: cleanup Signed-off-by: Jiri Slaby Cc: Jiri Slaby Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/intel_cacheinfo.c | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c index 3f46afbb1cf1..68b5d8681cbb 100644 --- a/arch/x86/kernel/cpu/intel_cacheinfo.c +++ b/arch/x86/kernel/cpu/intel_cacheinfo.c @@ -644,20 +644,17 @@ static inline ssize_t show_shared_cpu_list(struct _cpuid4_info *leaf, char *buf) return show_shared_cpu_map_func(leaf, 1, buf); } -static ssize_t show_type(struct _cpuid4_info *this_leaf, char *buf) { - switch(this_leaf->eax.split.type) { - case CACHE_TYPE_DATA: +static ssize_t show_type(struct _cpuid4_info *this_leaf, char *buf) +{ + switch (this_leaf->eax.split.type) { + case CACHE_TYPE_DATA: return sprintf(buf, "Data\n"); - break; - case CACHE_TYPE_INST: + case CACHE_TYPE_INST: 
return sprintf(buf, "Instruction\n"); - break; - case CACHE_TYPE_UNIFIED: + case CACHE_TYPE_UNIFIED: return sprintf(buf, "Unified\n"); - break; - default: + default: return sprintf(buf, "Unknown\n"); - break; } } -- cgit v1.2.3 From 2c5643b1c5c7fbb13f340d4c58944d9642f41796 Mon Sep 17 00:00:00 2001 From: Hitoshi Mitake Date: Sun, 30 Nov 2008 17:16:04 +0900 Subject: x86: provide readq()/writeq() on 32-bit too Impact: add new API for drivers Add implementation of readq/writeq to x86_32, and add config value to the x86 architecture to determine existence of readq/writeq. Signed-off-by: Hitoshi Mitake Acked-by: Sam Ravnborg Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 2 ++ arch/x86/include/asm/io.h | 24 ++++++++++++++++++++++++ 2 files changed, 26 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index ac22bb7719f7..a7d50f5d118c 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -19,6 +19,8 @@ config X86_64 config X86 def_bool y select HAVE_AOUT if X86_32 + select HAVE_READQ + select HAVE_WRITEQ select HAVE_UNSTABLE_SCHED_CLOCK select HAVE_IDE select HAVE_OPROFILE diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h index ac2abc88cd95..25946449df4f 100644 --- a/arch/x86/include/asm/io.h +++ b/arch/x86/include/asm/io.h @@ -4,6 +4,7 @@ #define ARCH_HAS_IOREMAP_WC #include +#include #define build_mmio_read(name, size, type, reg, barrier) \ static inline type name(const volatile void __iomem *addr) \ @@ -57,6 +58,29 @@ build_mmio_write(__writeq, "q", unsigned long, "r", ) /* Let people know we have them */ #define readq readq #define writeq writeq + +#else /* CONFIG_X86_32 from here */ + +static inline __u64 readq(const volatile void __iomem *addr) +{ + const volatile u32 __iomem *p = addr; + u32 l, h; + + l = readl(p); + h = readl(p + 1); + + return l + ((u64)h << 32); +} + +static inline void writeq(__u64 val, volatile void __iomem *addr) +{ + writel(val, addr); + writel(val >> 32, addr+4); +} + +#define readq 
readq +#define writeq writeq + #endif extern int iommu_bio_merge; -- cgit v1.2.3 From a0b1131e479e5af32eefac8bc54c9742e23d638e Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 30 Nov 2008 09:33:55 +0100 Subject: x86: provide readq()/writeq() on 32-bit too, cleanup Impact: cleanup Signed-off-by: Ingo Molnar --- arch/x86/include/asm/io.h | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h index 25946449df4f..3ccfaf610c89 100644 --- a/arch/x86/include/asm/io.h +++ b/arch/x86/include/asm/io.h @@ -55,21 +55,17 @@ build_mmio_write(__writeq, "q", unsigned long, "r", ) #define __raw_readq __readq #define __raw_writeq writeq -/* Let people know we have them */ -#define readq readq -#define writeq writeq - #else /* CONFIG_X86_32 from here */ static inline __u64 readq(const volatile void __iomem *addr) { const volatile u32 __iomem *p = addr; - u32 l, h; + u32 low, high; - l = readl(p); - h = readl(p + 1); + low = readl(p); + high = readl(p + 1); - return l + ((u64)h << 32); + return low + ((u64)high << 32); } static inline void writeq(__u64 val, volatile void __iomem *addr) @@ -78,11 +74,12 @@ static inline void writeq(__u64 val, volatile void __iomem *addr) writel(val >> 32, addr+4); } +#endif + +/* Let people know that we have them */ #define readq readq #define writeq writeq -#endif - extern int iommu_bio_merge; #ifdef CONFIG_X86_32 -- cgit v1.2.3 From 93093d099e5dd0c258fd530c12668e828c20df41 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 30 Nov 2008 10:20:20 +0100 Subject: x86: provide readq()/writeq() on 32-bit too, complete if HAVE_READQ/HAVE_WRITEQ are defined, the full range of readq/writeq APIs has to be provided to drivers: drivers/infiniband/hw/amso1100/c2.c: In function 'c2_tx_ring_alloc': drivers/infiniband/hw/amso1100/c2.c:133: error: implicit declaration of function '__raw_writeq' So provide them on 32-bit as well. 
Also, map all the APIs to the strongest ordering variant. It's way too easy to mess such details up in drivers and the difference between "memory" and "" constrained asm() constructs is in the noise range. Signed-off-by: Ingo Molnar --- arch/x86/include/asm/io.h | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h index 3ccfaf610c89..33513b9a67f3 100644 --- a/arch/x86/include/asm/io.h +++ b/arch/x86/include/asm/io.h @@ -46,16 +46,11 @@ build_mmio_write(__writel, "l", unsigned int, "r", ) #define mmiowb() barrier() #ifdef CONFIG_X86_64 + build_mmio_read(readq, "q", unsigned long, "=r", :"memory") -build_mmio_read(__readq, "q", unsigned long, "=r", ) build_mmio_write(writeq, "q", unsigned long, "r", :"memory") -build_mmio_write(__writeq, "q", unsigned long, "r", ) - -#define readq_relaxed(a) __readq(a) -#define __raw_readq __readq -#define __raw_writeq writeq -#else /* CONFIG_X86_32 from here */ +#else static inline __u64 readq(const volatile void __iomem *addr) { @@ -76,9 +71,14 @@ static inline void writeq(__u64 val, volatile void __iomem *addr) #endif +#define readq_relaxed(a) readq(a) + +#define __raw_readq(a) readq(a) +#define __raw_writeq(val, addr) writeq(val, addr) + /* Let people know that we have them */ -#define readq readq -#define writeq writeq +#define readq readq +#define writeq writeq extern int iommu_bio_merge; -- cgit v1.2.3 From 50cec5c51c18301ff60262fdbe920f4a907c9d81 Mon Sep 17 00:00:00 2001 From: FUJITA Tomonori Date: Tue, 2 Dec 2008 02:17:15 +0900 Subject: x86: fix dma_mapping_error for 32bit x86, cleanup This removes ifdef CONFIG_X86_64 in dma_mapping_error(): 1) Xen people plan to use swiotlb on X86_32 for Dom0 support. swiotlb uses ops->mapping_error so X86_32 also needs to check ops->mapping_error. 2) Removing #ifdef hack is almost always a good thing. 
Signed-off-by: FUJITA Tomonori Signed-off-by: Ingo Molnar --- arch/x86/include/asm/dma-mapping.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/dma-mapping.h b/arch/x86/include/asm/dma-mapping.h index 097794ff6b79..dc22c0733282 100644 --- a/arch/x86/include/asm/dma-mapping.h +++ b/arch/x86/include/asm/dma-mapping.h @@ -71,12 +71,10 @@ static inline struct dma_mapping_ops *get_dma_ops(struct device *dev) /* Make sure we keep the same behaviour */ static inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr) { -#ifdef CONFIG_X86_64 struct dma_mapping_ops *ops = get_dma_ops(dev); if (ops->mapping_error) return ops->mapping_error(dev, dma_addr); -#endif return (dma_addr == bad_dma_address); } -- cgit v1.2.3 From dcb7731a185efbf3d800618d874af99895df5afb Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Tue, 2 Dec 2008 20:16:03 +0100 Subject: x86: fix broken flushing in GART nofullflush path Impact: remove stale IOTLB entries In the non-default nofullflush case the GART is only flushed when next_bit wraps around. But it can happen that an unmap operation unmaps memory which is behind the current next_bit location. If these addresses are reused it may result in stale GART IO/TLB entries. Fix this by setting the GART next_bit always behind an unmapped location. 
Signed-off-by: Joerg Roedel Signed-off-by: Ingo Molnar --- arch/x86/kernel/pci-gart_64.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c index a42b02b4df68..ba7ad83e20a8 100644 --- a/arch/x86/kernel/pci-gart_64.c +++ b/arch/x86/kernel/pci-gart_64.c @@ -123,6 +123,8 @@ static void free_iommu(unsigned long offset, int size) spin_lock_irqsave(&iommu_bitmap_lock, flags); iommu_area_free(iommu_gart_bitmap, offset, size); + if (offset >= next_bit) + next_bit = offset + size; spin_unlock_irqrestore(&iommu_bitmap_lock, flags); } -- cgit v1.2.3 From 181de82ee3ffda1175f89d50c991dae31b79280c Mon Sep 17 00:00:00 2001 From: FUJITA Tomonori Date: Wed, 3 Dec 2008 14:53:04 +0900 Subject: x86: remove dead BIO_VMERGE_BOUNDARY definition Impact: cleanup, remove dead code The block layer dropped the virtual merge feature (b8b3e16cfe6435d961f6aaebcfd52a1ff2a988c5). BIO_VMERGE_BOUNDARY definition is meaningless now. Signed-off-by: FUJITA Tomonori Acked-by: Jens Axboe Signed-off-by: Ingo Molnar --- arch/x86/include/asm/io.h | 2 -- arch/x86/include/asm/io_64.h | 2 -- arch/x86/kernel/pci-dma.c | 6 ------ 3 files changed, 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h index 33513b9a67f3..05cfed4485fa 100644 --- a/arch/x86/include/asm/io.h +++ b/arch/x86/include/asm/io.h @@ -80,8 +80,6 @@ static inline void writeq(__u64 val, volatile void __iomem *addr) #define readq readq #define writeq writeq -extern int iommu_bio_merge; - #ifdef CONFIG_X86_32 # include "io_32.h" #else diff --git a/arch/x86/include/asm/io_64.h b/arch/x86/include/asm/io_64.h index fea325a1122f..563c16270ba6 100644 --- a/arch/x86/include/asm/io_64.h +++ b/arch/x86/include/asm/io_64.h @@ -232,8 +232,6 @@ void memset_io(volatile void __iomem *a, int b, size_t c); #define flush_write_buffers() -#define BIO_VMERGE_BOUNDARY iommu_bio_merge - /* * Convert a virtual cached pointer to 
an uncached pointer */ diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index 12eeb4bfcdeb..da93c65f8f0b 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -31,11 +31,6 @@ int no_iommu __read_mostly; /* Set this to 1 if there is a HW IOMMU in the system */ int iommu_detected __read_mostly = 0; -/* This tells the BIO block layer to assume merging. Default to off - because we cannot guarantee merging later. */ -int iommu_bio_merge __read_mostly = 0; -EXPORT_SYMBOL(iommu_bio_merge); - dma_addr_t bad_dma_address __read_mostly = 0; EXPORT_SYMBOL(bad_dma_address); @@ -189,7 +184,6 @@ static __init int iommu_setup(char *p) } if (!strncmp(p, "biomerge", 8)) { - iommu_bio_merge = 4096; iommu_merge = 1; force_iommu = 1; } -- cgit v1.2.3 From affa219b60a11b3295637a97f5b1b8ef231490fc Mon Sep 17 00:00:00 2001 From: Joe Korty Date: Wed, 3 Dec 2008 18:58:19 -0500 Subject: x86: change thread_info's flag field back to 32 bits Impact: pack struct thread_info more tightly Change x86_64's thread_info 'flags' field back to __u32. This was changed to 'unsigned long' when the thread_info*.h for i386 and x86_64 were merged. Change it back. We can do this as only 27 bits of 'flags' are actually used. This change actually packs down thread_info by 64 bits: 32 bits are saved by the smaller flags, and 32 bits are saved by the following 'mm_segment_t field' becoming naturally 64-bit aligned. 
Signed-off-by: Joe Korty Signed-off-by: Ingo Molnar --- arch/x86/include/asm/thread_info.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index e44d379faad2..8dbc57390d25 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -24,7 +24,7 @@ struct exec_domain; struct thread_info { struct task_struct *task; /* main task structure */ struct exec_domain *exec_domain; /* execution domain */ - unsigned long flags; /* low level flags */ + __u32 flags; /* low level flags */ __u32 status; /* thread synchronous flags */ __u32 cpu; /* current CPU */ int preempt_count; /* 0 => preemptable, -- cgit v1.2.3 From 55c395b47042e12d5c25aa07f271f56ffe44f793 Mon Sep 17 00:00:00 2001 From: Michael Tokarev Date: Fri, 5 Dec 2008 14:42:20 +0300 Subject: x86: fix missing space in printk Just come across this when booting on an old hw.. Looks somewhat ugly, that single missing space ;) Signed-off-by: Michael Tokarev Signed-off-by: Ingo Molnar --- arch/x86/kernel/smpboot.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 7b1093397319..1a3c3253f0ed 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -1086,8 +1086,10 @@ static int __init smp_sanity_check(unsigned max_cpus) #endif if (!physid_isset(hard_smp_processor_id(), phys_cpu_present_map)) { - printk(KERN_WARNING "weird, boot CPU (#%d) not listed" - "by the BIOS.\n", hard_smp_processor_id()); + printk(KERN_WARNING + "weird, boot CPU (#%d) not listed by the BIOS.\n", + hard_smp_processor_id()); + physid_set(hard_smp_processor_id(), phys_cpu_present_map); } -- cgit v1.2.3 From a0286c94f07636380082608196d41dd725a83229 Mon Sep 17 00:00:00 2001 From: Michael Tokarev Date: Fri, 5 Dec 2008 15:47:29 +0300 Subject: x86: fix missing space in printk, #2 Impact: clean up 
printk Signed-off-by: Michael Tokarev Signed-off-by: Ingo Molnar --- arch/x86/kernel/pci-dma.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index 192624820217..dc572994703d 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -300,8 +300,8 @@ fs_initcall(pci_iommu_init); static __devinit void via_no_dac(struct pci_dev *dev) { if ((dev->class >> 8) == PCI_CLASS_BRIDGE_PCI && forbid_dac == 0) { - printk(KERN_INFO "PCI: VIA PCI bridge detected." - "Disabling DAC.\n"); + printk(KERN_INFO + "PCI: VIA PCI bridge detected. Disabling DAC.\n"); forbid_dac = 1; } } -- cgit v1.2.3 From 3e1e9002aa8b32bd4c95ac6c8fad376b7a8127fb Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Mon, 8 Dec 2008 00:50:22 +0100 Subject: x86: change static allocation of trampoline area Impact: fix trampoline sizing bug, save space While debugging a suspend-to-RAM related issue it occured to me that if the trampoline code had grown past 4 KB, we would have been allocating too little memory for it, since the 4 KB size of the trampoline is hardcoded into arch/x86/kernel/e820.c . Change that by making the kernel compute the trampoline size and allocate as much memory as necessary. Signed-off-by: Rafael J. Wysocki Signed-off-by: Ingo Molnar --- arch/x86/include/asm/trampoline.h | 7 +++++++ arch/x86/kernel/e820.c | 16 ---------------- arch/x86/kernel/head32.c | 3 +++ arch/x86/kernel/head64.c | 3 +++ arch/x86/kernel/trampoline.c | 19 +++++++++++++++++-- 5 files changed, 30 insertions(+), 18 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/trampoline.h b/arch/x86/include/asm/trampoline.h index fa0d79facdbc..780ba0ab94f9 100644 --- a/arch/x86/include/asm/trampoline.h +++ b/arch/x86/include/asm/trampoline.h @@ -3,6 +3,7 @@ #ifndef __ASSEMBLY__ +#ifdef CONFIG_X86_TRAMPOLINE /* * Trampoline 80x86 program as an array. 
*/ @@ -13,8 +14,14 @@ extern unsigned char *trampoline_base; extern unsigned long init_rsp; extern unsigned long initial_code; +#define TRAMPOLINE_SIZE roundup(trampoline_end - trampoline_data, PAGE_SIZE) #define TRAMPOLINE_BASE 0x6000 + extern unsigned long setup_trampoline(void); +extern void __init reserve_trampoline_memory(void); +#else +static inline void reserve_trampoline_memory(void) {}; +#endif /* CONFIG_X86_TRAMPOLINE */ #endif /* __ASSEMBLY__ */ diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 7aafeb5263ef..65a13943e098 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -677,22 +677,6 @@ struct early_res { }; static struct early_res early_res[MAX_EARLY_RES] __initdata = { { 0, PAGE_SIZE, "BIOS data page" }, /* BIOS data page */ -#if defined(CONFIG_X86_64) && defined(CONFIG_X86_TRAMPOLINE) - { TRAMPOLINE_BASE, TRAMPOLINE_BASE + 2 * PAGE_SIZE, "TRAMPOLINE" }, -#endif -#if defined(CONFIG_X86_32) && defined(CONFIG_SMP) - /* - * But first pinch a few for the stack/trampoline stuff - * FIXME: Don't need the extra page at 4K, but need to fix - * trampoline before removing it. (see the GDT stuff) - */ - { PAGE_SIZE, PAGE_SIZE + PAGE_SIZE, "EX TRAMPOLINE" }, - /* - * Has to be in very low memory so we can execute - * real-mode AP code. 
- */ - { TRAMPOLINE_BASE, TRAMPOLINE_BASE + PAGE_SIZE, "TRAMPOLINE" }, -#endif {} }; diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c index fa1d25dd83e3..ac108d1fe182 100644 --- a/arch/x86/kernel/head32.c +++ b/arch/x86/kernel/head32.c @@ -12,9 +12,12 @@ #include #include #include +#include void __init i386_start_kernel(void) { + reserve_trampoline_memory(); + reserve_early(__pa_symbol(&_text), __pa_symbol(&_end), "TEXT DATA BSS"); #ifdef CONFIG_BLK_DEV_INITRD diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index d16084f90649..388e05a5fc17 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -24,6 +24,7 @@ #include #include #include +#include /* boot cpu pda */ static struct x8664_pda _boot_cpu_pda __read_mostly; @@ -120,6 +121,8 @@ void __init x86_64_start_reservations(char *real_mode_data) { copy_bootdata(__va(real_mode_data)); + reserve_trampoline_memory(); + reserve_early(__pa_symbol(&_text), __pa_symbol(&_end), "TEXT DATA BSS"); #ifdef CONFIG_BLK_DEV_INITRD diff --git a/arch/x86/kernel/trampoline.c b/arch/x86/kernel/trampoline.c index 1106fac6024d..808031a5ba19 100644 --- a/arch/x86/kernel/trampoline.c +++ b/arch/x86/kernel/trampoline.c @@ -1,10 +1,26 @@ #include #include +#include /* ready for x86_64 and x86 */ unsigned char *trampoline_base = __va(TRAMPOLINE_BASE); +void __init reserve_trampoline_memory(void) +{ +#ifdef CONFIG_X86_32 + /* + * But first pinch a few for the stack/trampoline stuff + * FIXME: Don't need the extra page at 4K, but need to fix + * trampoline before removing it. (see the GDT stuff) + */ + reserve_early(PAGE_SIZE, PAGE_SIZE + PAGE_SIZE, "EX TRAMPOLINE"); +#endif + /* Has to be in very low memory so we can execute real-mode AP code. */ + reserve_early(TRAMPOLINE_BASE, TRAMPOLINE_BASE + TRAMPOLINE_SIZE, + "TRAMPOLINE"); +} + /* * Currently trivial. Write the real->protected mode * bootstrap into the page concerned. 
The caller @@ -12,7 +28,6 @@ unsigned char *trampoline_base = __va(TRAMPOLINE_BASE); */ unsigned long setup_trampoline(void) { - memcpy(trampoline_base, trampoline_data, - trampoline_end - trampoline_data); + memcpy(trampoline_base, trampoline_data, TRAMPOLINE_SIZE); return virt_to_phys(trampoline_base); } -- cgit v1.2.3 From 4217458dafaa57d8e26a46f5d05ab8c53cf64191 Mon Sep 17 00:00:00 2001 From: Hiroshi Shimamoto Date: Fri, 5 Dec 2008 17:17:09 -0800 Subject: x86: signal: change type of paramter for sys_rt_sigreturn() Impact: cleanup on 32-bit Peter pointed this parameter can be changed. Signed-off-by: Hiroshi Shimamoto Signed-off-by: Ingo Molnar --- arch/x86/include/asm/syscalls.h | 2 +- arch/x86/kernel/signal.c | 6 ++---- 2 files changed, 3 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/syscalls.h b/arch/x86/include/asm/syscalls.h index 87803da44010..3a5252c4b8d6 100644 --- a/arch/x86/include/asm/syscalls.h +++ b/arch/x86/include/asm/syscalls.h @@ -33,7 +33,7 @@ asmlinkage int sys_sigaction(int, const struct old_sigaction __user *, struct old_sigaction __user *); asmlinkage int sys_sigaltstack(unsigned long); asmlinkage unsigned long sys_sigreturn(unsigned long); -asmlinkage int sys_rt_sigreturn(unsigned long); +asmlinkage int sys_rt_sigreturn(struct pt_regs); /* kernel/ioport.c */ asmlinkage long sys_iopl(unsigned long); diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index b1f4d34e0a38..b1cc6da64208 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -642,11 +642,9 @@ badframe: } #ifdef CONFIG_X86_32 -asmlinkage int sys_rt_sigreturn(unsigned long __unused) +asmlinkage int sys_rt_sigreturn(struct pt_regs regs) { - struct pt_regs *regs = (struct pt_regs *)&__unused; - - return do_rt_sigreturn(regs); + return do_rt_sigreturn(&regs); } #else /* !CONFIG_X86_32 */ asmlinkage long sys_rt_sigreturn(struct pt_regs *regs) -- cgit v1.2.3 From fd13f6c85144bb2026c534a35be1d7cb7628a64a Mon Sep 17
00:00:00 2001 From: Robert Richter Date: Sun, 19 Oct 2008 21:00:09 +0200 Subject: oprofile: comment cleanup This fixes the coding style of some comments. Signed-off-by: Robert Richter --- arch/x86/oprofile/op_model_amd.c | 31 ++++++++++++++++--------------- drivers/oprofile/cpu_buffer.c | 25 ++++++++++++++++--------- 2 files changed, 32 insertions(+), 24 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/oprofile/op_model_amd.c b/arch/x86/oprofile/op_model_amd.c index 509513760a6e..fb67e1999d85 100644 --- a/arch/x86/oprofile/op_model_amd.c +++ b/arch/x86/oprofile/op_model_amd.c @@ -65,8 +65,10 @@ static unsigned long reset_value[NUM_COUNTERS]; #define IBS_FETCH_BEGIN 3 #define IBS_OP_BEGIN 4 -/* The function interface needs to be fixed, something like add - data. Should then be added to linux/oprofile.h. */ +/* + * The function interface needs to be fixed, something like add + * data. Should then be added to linux/oprofile.h. + */ extern void oprofile_add_ibs_sample(struct pt_regs *const regs, unsigned int *const ibs_sample, int ibs_code); @@ -106,7 +108,7 @@ struct ibs_op_sample { /* * unitialize the APIC for the IBS interrupts if needed on AMD Family10h+ -*/ + */ static void clear_ibs_nmi(void); static int ibs_allowed; /* AMD Family10h and later */ @@ -223,7 +225,7 @@ op_amd_handle_ibs(struct pt_regs * const regs, (unsigned int *)&ibs_fetch, IBS_FETCH_BEGIN); - /*reenable the IRQ */ + /* reenable the IRQ */ rdmsr(MSR_AMD64_IBSFETCHCTL, low, high); high &= ~IBS_FETCH_HIGH_VALID_BIT; high |= IBS_FETCH_HIGH_ENABLE; @@ -331,8 +333,10 @@ static void op_amd_stop(struct op_msrs const * const msrs) unsigned int low, high; int i; - /* Subtle: stop on all counters to avoid race with - * setting our pm callback */ + /* + * Subtle: stop on all counters to avoid race with setting our + * pm callback + */ for (i = 0 ; i < NUM_COUNTERS ; ++i) { if (!reset_value[i]) continue; @@ -343,13 +347,15 @@ static void op_amd_stop(struct op_msrs const * const msrs) #ifdef 
CONFIG_OPROFILE_IBS if (ibs_allowed && ibs_config.fetch_enabled) { - low = 0; /* clear max count and enable */ + /* clear max count and enable */ + low = 0; high = 0; wrmsr(MSR_AMD64_IBSFETCHCTL, low, high); } if (ibs_allowed && ibs_config.op_enabled) { - low = 0; /* clear max count and enable */ + /* clear max count and enable */ + low = 0; high = 0; wrmsr(MSR_AMD64_IBSOPCTL, low, high); } @@ -443,10 +449,7 @@ static int pfm_amd64_setup_eilvt(void) return 0; } -/* - * initialize the APIC for the IBS interrupts - * if available (AMD Family10h rev B0 and later) - */ +/* initialize the APIC for the IBS interrupts if available */ static void setup_ibs(void) { ibs_allowed = boot_cpu_has(X86_FEATURE_IBS); @@ -463,9 +466,7 @@ static void setup_ibs(void) } -/* - * unitialize the APIC for the IBS interrupts if needed on AMD Family10h - * rev B0 and later */ +/* uninitialize the APIC for the IBS interrupts if needed */ static void clear_ibs_nmi(void) { if (ibs_allowed) diff --git a/drivers/oprofile/cpu_buffer.c b/drivers/oprofile/cpu_buffer.c index 01d38e78cde1..3958107723fb 100644 --- a/drivers/oprofile/cpu_buffer.c +++ b/drivers/oprofile/cpu_buffer.c @@ -127,9 +127,10 @@ void end_cpu_work(void) /* Resets the cpu buffer to a sane state. 
*/ void cpu_buffer_reset(struct oprofile_cpu_buffer *cpu_buf) { - /* reset these to invalid values; the next sample - * collected will populate the buffer with proper - * values to initialize the buffer + /* + * reset these to invalid values; the next sample collected + * will populate the buffer with proper values to initialize + * the buffer */ cpu_buf->last_is_kernel = -1; cpu_buf->last_task = NULL; @@ -151,8 +152,10 @@ static void increment_head(struct oprofile_cpu_buffer *b) { unsigned long new_head = b->head_pos + 1; - /* Ensure anything written to the slot before we - * increment is visible */ + /* + * Ensure anything written to the slot before we increment is + * visible + */ wmb(); if (new_head < b->buffer_size) @@ -253,8 +256,10 @@ void oprofile_add_ext_sample(unsigned long pc, struct pt_regs * const regs, if (!oprofile_begin_trace(cpu_buf)) return; - /* if log_sample() fail we can't backtrace since we lost the source - * of this event */ + /* + * if log_sample() fail we can't backtrace since we lost the + * source of this event + */ if (log_sample(cpu_buf, pc, is_kernel, event)) oprofile_ops.backtrace(regs, backtrace_depth); oprofile_end_trace(cpu_buf); @@ -338,8 +343,10 @@ void oprofile_add_trace(unsigned long pc) return; } - /* broken frame can give an eip with the same value as an escape code, - * abort the trace if we get it */ + /* + * broken frame can give an eip with the same value as an + * escape code, abort the trace if we get it + */ if (pc == ESCAPE_CODE) { cpu_buf->tracing = 0; cpu_buf->backtrace_aborted++; -- cgit v1.2.3 From cdc1834d1aa2e5b574a25e66f82625b44cdd0d8f Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Fri, 26 Sep 2008 22:18:44 -0400 Subject: oprofile: whitspace changes only Signed-off-by: Robert Richter --- arch/x86/oprofile/op_model_amd.c | 4 ++-- drivers/oprofile/cpu_buffer.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/oprofile/op_model_amd.c 
b/arch/x86/oprofile/op_model_amd.c index fb67e1999d85..f71bd218b488 100644 --- a/arch/x86/oprofile/op_model_amd.c +++ b/arch/x86/oprofile/op_model_amd.c @@ -70,8 +70,8 @@ static unsigned long reset_value[NUM_COUNTERS]; * data. Should then be added to linux/oprofile.h. */ extern void -oprofile_add_ibs_sample(struct pt_regs *const regs, - unsigned int *const ibs_sample, int ibs_code); +oprofile_add_ibs_sample(struct pt_regs * const regs, + unsigned int * const ibs_sample, int ibs_code); struct ibs_fetch_sample { /* MSRC001_1031 IBS Fetch Linear Address Register */ diff --git a/drivers/oprofile/cpu_buffer.c b/drivers/oprofile/cpu_buffer.c index 3958107723fb..2c4d54187b90 100644 --- a/drivers/oprofile/cpu_buffer.c +++ b/drivers/oprofile/cpu_buffer.c @@ -277,8 +277,8 @@ void oprofile_add_sample(struct pt_regs * const regs, unsigned long event) #define MAX_IBS_SAMPLE_SIZE 14 -void oprofile_add_ibs_sample(struct pt_regs *const regs, - unsigned int *const ibs_sample, int ibs_code) +void oprofile_add_ibs_sample(struct pt_regs * const regs, + unsigned int * const ibs_sample, int ibs_code) { int is_kernel = !user_mode(regs); struct oprofile_cpu_buffer *cpu_buf = &__get_cpu_var(cpu_buffer); -- cgit v1.2.3 From 9fa6812dbab9207f7af52c3d0417f1f9eb89c386 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Mon, 24 Nov 2008 14:21:03 +0100 Subject: x86/oprofile: reordering IBS code in op_model_amd.c This is part of the cpu buffer rework. 
Signed-off-by: Robert Richter --- arch/x86/oprofile/op_model_amd.c | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/oprofile/op_model_amd.c b/arch/x86/oprofile/op_model_amd.c index f71bd218b488..8ff657b3ff89 100644 --- a/arch/x86/oprofile/op_model_amd.c +++ b/arch/x86/oprofile/op_model_amd.c @@ -376,18 +376,7 @@ static void op_amd_shutdown(struct op_msrs const * const msrs) } } -#ifndef CONFIG_OPROFILE_IBS - -/* no IBS support */ - -static int op_amd_init(struct oprofile_operations *ops) -{ - return 0; -} - -static void op_amd_exit(void) {} - -#else +#ifdef CONFIG_OPROFILE_IBS static u8 ibs_eilvt_off; @@ -531,7 +520,18 @@ static void op_amd_exit(void) clear_ibs_nmi(); } -#endif +#else + +/* no IBS support */ + +static int op_amd_init(struct oprofile_operations *ops) +{ + return 0; +} + +static void op_amd_exit(void) {} + +#endif /* CONFIG_OPROFILE_IBS */ struct op_x86_model_spec const op_amd_spec = { .init = op_amd_init, -- cgit v1.2.3 From fe615cbf34fc6a1c53c359417da4696328a488ed Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Mon, 24 Nov 2008 14:58:03 +0100 Subject: x86/oprofile: cleanup IBS init/exit functions in op_model_amd.c Implementation of pairwise init/exit funcions for IBS and IBS NMI setup. There are also some function renames and the removal of forward function declarations. 
Signed-off-by: Robert Richter --- arch/x86/oprofile/op_model_amd.c | 32 +++++++++++++++++--------------- 1 file changed, 17 insertions(+), 15 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/oprofile/op_model_amd.c b/arch/x86/oprofile/op_model_amd.c index 8ff657b3ff89..98658f25f542 100644 --- a/arch/x86/oprofile/op_model_amd.c +++ b/arch/x86/oprofile/op_model_amd.c @@ -106,11 +106,6 @@ struct ibs_op_sample { unsigned int ibs_dc_phys_high; }; -/* - * unitialize the APIC for the IBS interrupts if needed on AMD Family10h+ - */ -static void clear_ibs_nmi(void); - static int ibs_allowed; /* AMD Family10h and later */ struct op_ibs_config { @@ -390,7 +385,7 @@ static inline void apic_clear_ibs_nmi_per_cpu(void *arg) setup_APIC_eilvt_ibs(0, APIC_EILVT_MSG_FIX, 1); } -static int pfm_amd64_setup_eilvt(void) +static int init_ibs_nmi(void) { #define IBSCTL_LVTOFFSETVAL (1 << 8) #define IBSCTL 0x1cc @@ -438,15 +433,22 @@ static int pfm_amd64_setup_eilvt(void) return 0; } +/* uninitialize the APIC for the IBS interrupts if needed */ +static void clear_ibs_nmi(void) +{ + if (ibs_allowed) + on_each_cpu(apic_clear_ibs_nmi_per_cpu, NULL, 1); +} + /* initialize the APIC for the IBS interrupts if available */ -static void setup_ibs(void) +static void ibs_init(void) { ibs_allowed = boot_cpu_has(X86_FEATURE_IBS); if (!ibs_allowed) return; - if (pfm_amd64_setup_eilvt()) { + if (init_ibs_nmi()) { ibs_allowed = 0; return; } @@ -454,12 +456,12 @@ static void setup_ibs(void) printk(KERN_INFO "oprofile: AMD IBS detected\n"); } - -/* uninitialize the APIC for the IBS interrupts if needed */ -static void clear_ibs_nmi(void) +static void ibs_exit(void) { - if (ibs_allowed) - on_each_cpu(apic_clear_ibs_nmi_per_cpu, NULL, 1); + if (!ibs_allowed) + return; + + clear_ibs_nmi(); } static int (*create_arch_files)(struct super_block *sb, struct dentry *root); @@ -509,7 +511,7 @@ static int setup_ibs_files(struct super_block *sb, struct dentry *root) static int op_amd_init(struct 
oprofile_operations *ops) { - setup_ibs(); + ibs_init(); create_arch_files = ops->create_files; ops->create_files = setup_ibs_files; return 0; @@ -517,7 +519,7 @@ static int op_amd_init(struct oprofile_operations *ops) static void op_amd_exit(void) { - clear_ibs_nmi(); + ibs_exit(); } #else -- cgit v1.2.3 From b0884e25fe361f2ca228808fb5fd1b74cb04e711 Mon Sep 17 00:00:00 2001 From: Markus Metzger Date: Thu, 11 Dec 2008 13:45:23 +0100 Subject: x86, bts: turn BUG_ON into WARN_ON_ONCE Impact: make the ds code more debuggable Turn BUG_ON's into WARN_ON_ONCE. Signed-off-by: Markus Metzger Signed-off-by: Ingo Molnar --- arch/x86/kernel/ds.c | 4 ++-- arch/x86/kernel/ptrace.c | 5 +++-- 2 files changed, 5 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/ds.c b/arch/x86/kernel/ds.c index 19a8c2c0389f..095306988667 100644 --- a/arch/x86/kernel/ds.c +++ b/arch/x86/kernel/ds.c @@ -452,7 +452,7 @@ struct pebs_tracer *ds_request_pebs(struct task_struct *task, static void ds_release(struct ds_tracer *tracer, enum ds_qualifier qual) { - BUG_ON(tracer->context->owner[qual] != tracer); + WARN_ON_ONCE(tracer->context->owner[qual] != tracer); tracer->context->owner[qual] = NULL; put_tracer(tracer->context->task); @@ -774,7 +774,7 @@ ds_configure(const struct ds_configuration *cfg) printk(KERN_INFO "DS available\n"); - BUG_ON(MAX_SIZEOF_DS < ds_cfg.sizeof_ds); + WARN_ON_ONCE(MAX_SIZEOF_DS < ds_cfg.sizeof_ds); } void __cpuinit ds_init_intel(struct cpuinfo_x86 *c) diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 2c8ec1ba75e6..b2998fe1166b 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -878,7 +878,8 @@ static int ptrace_bts_write_record(struct task_struct *child, { unsigned char bts_record[BTS_MAX_RECORD_SIZE]; - BUG_ON(BTS_MAX_RECORD_SIZE < bts_cfg.sizeof_bts); + if (BTS_MAX_RECORD_SIZE < bts_cfg.sizeof_bts) + return -EOVERFLOW; memset(bts_record, 0, bts_cfg.sizeof_bts); switch (in->qualifier) { @@ -1133,7 
+1134,7 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data) ret = ds_get_bts_index(child->bts, &size); if (ret == 0) { - BUG_ON(size != (int) size); + WARN_ON_ONCE(size != (int) size); ret = (int) size; } break; -- cgit v1.2.3 From c2724775ce57c98b8af9694857b941dc61056516 Mon Sep 17 00:00:00 2001 From: Markus Metzger Date: Thu, 11 Dec 2008 13:49:59 +0100 Subject: x86, bts: provide in-kernel branch-trace interface Impact: cleanup Move the BTS bits from ptrace.c into ds.c. Signed-off-by: Markus Metzger Signed-off-by: Ingo Molnar --- arch/x86/include/asm/ds.h | 241 ++++++----- arch/x86/include/asm/processor.h | 13 + arch/x86/include/asm/ptrace.h | 36 -- arch/x86/include/asm/thread_info.h | 5 +- arch/x86/kernel/cpu/intel.c | 4 - arch/x86/kernel/ds.c | 857 +++++++++++++++++++++++-------------- arch/x86/kernel/process_32.c | 59 +-- arch/x86/kernel/process_64.c | 50 +-- arch/x86/kernel/ptrace.c | 416 +++++------------- include/linux/sched.h | 1 + 10 files changed, 811 insertions(+), 871 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/ds.h b/arch/x86/include/asm/ds.h index 99b6c39774a4..ee0ea3a96c11 100644 --- a/arch/x86/include/asm/ds.h +++ b/arch/x86/include/asm/ds.h @@ -6,13 +6,13 @@ * precise-event based sampling (PEBS). * * It manages: - * - per-thread and per-cpu allocation of BTS and PEBS + * - DS and BTS hardware configuration * - buffer overflow handling (to be done) * - buffer access * - * It assumes: - * - get_task_struct on all traced tasks - * - current is allowed to trace tasks + * It does not do: + * - security checking (is the caller allowed to trace the task) + * - buffer allocation (memory accounting) * * * Copyright (C) 2007-2008 Intel Corporation. 
@@ -31,6 +31,7 @@ #ifdef CONFIG_X86_DS struct task_struct; +struct ds_context; struct ds_tracer; struct bts_tracer; struct pebs_tracer; @@ -38,6 +39,38 @@ struct pebs_tracer; typedef void (*bts_ovfl_callback_t)(struct bts_tracer *); typedef void (*pebs_ovfl_callback_t)(struct pebs_tracer *); + +/* + * A list of features plus corresponding macros to talk about them in + * the ds_request function's flags parameter. + * + * We use the enum to index an array of corresponding control bits; + * we use the macro to index a flags bit-vector. + */ +enum ds_feature { + dsf_bts = 0, + dsf_bts_kernel, +#define BTS_KERNEL (1 << dsf_bts_kernel) + /* trace kernel-mode branches */ + + dsf_bts_user, +#define BTS_USER (1 << dsf_bts_user) + /* trace user-mode branches */ + + dsf_bts_overflow, + dsf_bts_max, + dsf_pebs = dsf_bts_max, + + dsf_pebs_max, + dsf_ctl_max = dsf_pebs_max, + dsf_bts_timestamps = dsf_ctl_max, +#define BTS_TIMESTAMPS (1 << dsf_bts_timestamps) + /* add timestamps into BTS trace */ + +#define BTS_USER_FLAGS (BTS_KERNEL | BTS_USER | BTS_TIMESTAMPS) +}; + + /* * Request BTS or PEBS * @@ -58,92 +91,135 @@ typedef void (*pebs_ovfl_callback_t)(struct pebs_tracer *); * NULL if cyclic buffer requested * th: the interrupt threshold in records from the end of the buffer; * -1 if no interrupt threshold is requested. 
+ * flags: a bit-mask of the above flags */ extern struct bts_tracer *ds_request_bts(struct task_struct *task, void *base, size_t size, - bts_ovfl_callback_t ovfl, size_t th); + bts_ovfl_callback_t ovfl, + size_t th, unsigned int flags); extern struct pebs_tracer *ds_request_pebs(struct task_struct *task, void *base, size_t size, pebs_ovfl_callback_t ovfl, - size_t th); + size_t th, unsigned int flags); /* * Release BTS or PEBS resources - * - * Returns 0 on success; -Eerrno otherwise + * Suspend and resume BTS or PEBS tracing * * tracer: the tracer handle returned from ds_request_~() */ -extern int ds_release_bts(struct bts_tracer *tracer); -extern int ds_release_pebs(struct pebs_tracer *tracer); +extern void ds_release_bts(struct bts_tracer *tracer); +extern void ds_suspend_bts(struct bts_tracer *tracer); +extern void ds_resume_bts(struct bts_tracer *tracer); +extern void ds_release_pebs(struct pebs_tracer *tracer); +extern void ds_suspend_pebs(struct pebs_tracer *tracer); +extern void ds_resume_pebs(struct pebs_tracer *tracer); + /* - * Get the (array) index of the write pointer. - * (assuming an array of BTS/PEBS records) - * - * Returns 0 on success; -Eerrno on error + * The raw DS buffer state as it is used for BTS and PEBS recording. * - * tracer: the tracer handle returned from ds_request_~() - * pos (out): will hold the result + * This is the low-level, arch-dependent interface for working + * directly on the raw trace data. 
*/ -extern int ds_get_bts_index(struct bts_tracer *tracer, size_t *pos); -extern int ds_get_pebs_index(struct pebs_tracer *tracer, size_t *pos); +struct ds_trace { + /* the number of bts/pebs records */ + size_t n; + /* the size of a bts/pebs record in bytes */ + size_t size; + /* pointers into the raw buffer: + - to the first entry */ + void *begin; + /* - one beyond the last entry */ + void *end; + /* - one beyond the newest entry */ + void *top; + /* - the interrupt threshold */ + void *ith; + /* flags given on ds_request() */ + unsigned int flags; +}; /* - * Get the (array) index one record beyond the end of the array. - * (assuming an array of BTS/PEBS records) - * - * Returns 0 on success; -Eerrno on error - * - * tracer: the tracer handle returned from ds_request_~() - * pos (out): will hold the result + * An arch-independent view on branch trace data. */ -extern int ds_get_bts_end(struct bts_tracer *tracer, size_t *pos); -extern int ds_get_pebs_end(struct pebs_tracer *tracer, size_t *pos); +enum bts_qualifier { + bts_invalid, +#define BTS_INVALID bts_invalid + + bts_branch, +#define BTS_BRANCH bts_branch + + bts_task_arrives, +#define BTS_TASK_ARRIVES bts_task_arrives + + bts_task_departs, +#define BTS_TASK_DEPARTS bts_task_departs + + bts_qual_bit_size = 4, + bts_qual_max = (1 << bts_qual_bit_size), +}; + +struct bts_struct { + __u64 qualifier; + union { + /* BTS_BRANCH */ + struct { + __u64 from; + __u64 to; + } lbr; + /* BTS_TASK_ARRIVES or BTS_TASK_DEPARTS */ + struct { + __u64 jiffies; + pid_t pid; + } timestamp; + } variant; +}; + /* - * Provide a pointer to the BTS/PEBS record at parameter index. - * (assuming an array of BTS/PEBS records) - * - * The pointer points directly into the buffer. The user is - * responsible for copying the record. - * - * Returns the size of a single record on success; -Eerrno on error + * The BTS state. 
* - * tracer: the tracer handle returned from ds_request_~() - * index: the index of the requested record - * record (out): pointer to the requested record + * This gives access to the raw DS state and adds functions to provide + * an arch-independent view of the BTS data. */ -extern int ds_access_bts(struct bts_tracer *tracer, - size_t index, const void **record); -extern int ds_access_pebs(struct pebs_tracer *tracer, - size_t index, const void **record); +struct bts_trace { + struct ds_trace ds; + + int (*read)(struct bts_tracer *tracer, const void *at, + struct bts_struct *out); + int (*write)(struct bts_tracer *tracer, const struct bts_struct *in); +}; + /* - * Write one or more BTS/PEBS records at the write pointer index and - * advance the write pointer. + * The PEBS state. * - * If size is not a multiple of the record size, trailing bytes are - * zeroed out. - * - * May result in one or more overflow notifications. - * - * If called during overflow handling, that is, with index >= - * interrupt threshold, the write will wrap around. + * This gives access to the raw DS state and the PEBS-specific counter + * reset value. + */ +struct pebs_trace { + struct ds_trace ds; + + /* the PEBS reset value */ + unsigned long long reset_value; +}; + + +/* + * Read the BTS or PEBS trace. * - * An overflow notification is given if and when the interrupt - * threshold is reached during or after the write. + * Returns a view on the trace collected for the parameter tracer. * - * Returns the number of bytes written or -Eerrno. + * The view remains valid as long as the traced task is not running or + * the tracer is suspended. + * Writes into the trace buffer are not reflected. 
* * tracer: the tracer handle returned from ds_request_~() - * buffer: the buffer to write - * size: the size of the buffer */ -extern int ds_write_bts(struct bts_tracer *tracer, - const void *buffer, size_t size); -extern int ds_write_pebs(struct pebs_tracer *tracer, - const void *buffer, size_t size); +extern const struct bts_trace *ds_read_bts(struct bts_tracer *tracer); +extern const struct pebs_trace *ds_read_pebs(struct pebs_tracer *tracer); + /* * Reset the write pointer of the BTS/PEBS buffer. @@ -155,27 +231,6 @@ extern int ds_write_pebs(struct pebs_tracer *tracer, extern int ds_reset_bts(struct bts_tracer *tracer); extern int ds_reset_pebs(struct pebs_tracer *tracer); -/* - * Clear the BTS/PEBS buffer and reset the write pointer. - * The entire buffer will be zeroed out. - * - * Returns 0 on success; -Eerrno on error - * - * tracer: the tracer handle returned from ds_request_~() - */ -extern int ds_clear_bts(struct bts_tracer *tracer); -extern int ds_clear_pebs(struct pebs_tracer *tracer); - -/* - * Provide the PEBS counter reset value. - * - * Returns 0 on success; -Eerrno on error - * - * tracer: the tracer handle returned from ds_request_pebs() - * value (out): the counter reset value - */ -extern int ds_get_pebs_reset(struct pebs_tracer *tracer, u64 *value); - /* * Set the PEBS counter reset value. * @@ -192,35 +247,17 @@ extern int ds_set_pebs_reset(struct pebs_tracer *tracer, u64 value); struct cpuinfo_x86; extern void __cpuinit ds_init_intel(struct cpuinfo_x86 *); - - /* - * The DS context - part of struct thread_struct. 
+ * Context switch work */ -#define MAX_SIZEOF_DS (12 * 8) - -struct ds_context { - /* pointer to the DS configuration; goes into MSR_IA32_DS_AREA */ - unsigned char ds[MAX_SIZEOF_DS]; - /* the owner of the BTS and PEBS configuration, respectively */ - struct ds_tracer *owner[2]; - /* use count */ - unsigned long count; - /* a pointer to the context location inside the thread_struct - * or the per_cpu context array */ - struct ds_context **this; - /* a pointer to the task owning this context, or NULL, if the - * context is owned by a cpu */ - struct task_struct *task; -}; - -/* called by exit_thread() to free leftover contexts */ -extern void ds_free(struct ds_context *context); +extern void ds_switch_to(struct task_struct *prev, struct task_struct *next); #else /* CONFIG_X86_DS */ struct cpuinfo_x86; static inline void __cpuinit ds_init_intel(struct cpuinfo_x86 *ignored) {} +static inline void ds_switch_to(struct task_struct *prev, + struct task_struct *next) {} #endif /* CONFIG_X86_DS */ #endif /* _ASM_X86_DS_H */ diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 5ca01e383269..aa5914f8e501 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -752,6 +752,19 @@ extern void switch_to_new_gdt(void); extern void cpu_init(void); extern void init_gdt(int cpu); +static inline unsigned long get_debugctlmsr(void) +{ + unsigned long debugctlmsr = 0; + +#ifndef CONFIG_X86_DEBUGCTLMSR + if (boot_cpu_data.x86 < 6) + return 0; +#endif + rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctlmsr); + + return debugctlmsr; +} + static inline void update_debugctlmsr(unsigned long debugctlmsr) { #ifndef CONFIG_X86_DEBUGCTLMSR diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h index eefb0594b058..fbf744215911 100644 --- a/arch/x86/include/asm/ptrace.h +++ b/arch/x86/include/asm/ptrace.h @@ -6,7 +6,6 @@ #include #ifdef __KERNEL__ -#include /* the DS BTS struct is used for ptrace too */ #include #endif @@ 
-128,34 +127,6 @@ struct pt_regs { #endif /* !__i386__ */ -#ifdef CONFIG_X86_PTRACE_BTS -/* a branch trace record entry - * - * In order to unify the interface between various processor versions, - * we use the below data structure for all processors. - */ -enum bts_qualifier { - BTS_INVALID = 0, - BTS_BRANCH, - BTS_TASK_ARRIVES, - BTS_TASK_DEPARTS -}; - -struct bts_struct { - __u64 qualifier; - union { - /* BTS_BRANCH */ - struct { - __u64 from_ip; - __u64 to_ip; - } lbr; - /* BTS_TASK_ARRIVES or - BTS_TASK_DEPARTS */ - __u64 jiffies; - } variant; -}; -#endif /* CONFIG_X86_PTRACE_BTS */ - #ifdef __KERNEL__ #include @@ -163,13 +134,6 @@ struct bts_struct { struct cpuinfo_x86; struct task_struct; -#ifdef CONFIG_X86_PTRACE_BTS -extern void __cpuinit ptrace_bts_init_intel(struct cpuinfo_x86 *); -extern void ptrace_bts_take_timestamp(struct task_struct *, enum bts_qualifier); -#else -#define ptrace_bts_init_intel(config) do {} while (0) -#endif /* CONFIG_X86_PTRACE_BTS */ - extern unsigned long profile_pc(struct pt_regs *regs); extern unsigned long diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index 0921b4018c11..bf8113d16a33 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -93,7 +93,6 @@ struct thread_info { #define TIF_FORCED_TF 24 /* true if TF in eflags artificially */ #define TIF_DEBUGCTLMSR 25 /* uses thread_struct.debugctlmsr */ #define TIF_DS_AREA_MSR 26 /* uses thread_struct.ds_area_msr */ -#define TIF_BTS_TRACE_TS 27 /* record scheduling event timestamps */ #define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE) #define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME) @@ -115,7 +114,6 @@ struct thread_info { #define _TIF_FORCED_TF (1 << TIF_FORCED_TF) #define _TIF_DEBUGCTLMSR (1 << TIF_DEBUGCTLMSR) #define _TIF_DS_AREA_MSR (1 << TIF_DS_AREA_MSR) -#define _TIF_BTS_TRACE_TS (1 << TIF_BTS_TRACE_TS) /* work to do in syscall_trace_enter() */ #define _TIF_WORK_SYSCALL_ENTRY \ @@ -141,8 +139,7 
@@ struct thread_info { /* flags to check in __switch_to() */ #define _TIF_WORK_CTXSW \ - (_TIF_IO_BITMAP|_TIF_DEBUGCTLMSR|_TIF_DS_AREA_MSR|_TIF_BTS_TRACE_TS| \ - _TIF_NOTSC) + (_TIF_IO_BITMAP|_TIF_DEBUGCTLMSR|_TIF_DS_AREA_MSR|_TIF_NOTSC) #define _TIF_WORK_CTXSW_PREV _TIF_WORK_CTXSW #define _TIF_WORK_CTXSW_NEXT (_TIF_WORK_CTXSW|_TIF_DEBUG) diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index 816f27f289b1..cd413d9a0218 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -11,7 +11,6 @@ #include #include #include -#include #include #include @@ -309,9 +308,6 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c) set_cpu_cap(c, X86_FEATURE_P3); #endif - if (cpu_has_bts) - ptrace_bts_init_intel(c); - detect_extended_topology(c); if (!cpu_has(c, X86_FEATURE_XTOPOLOGY)) { /* diff --git a/arch/x86/kernel/ds.c b/arch/x86/kernel/ds.c index 095306988667..f0583005b75e 100644 --- a/arch/x86/kernel/ds.c +++ b/arch/x86/kernel/ds.c @@ -6,13 +6,13 @@ * precise-event based sampling (PEBS). * * It manages: - * - per-thread and per-cpu allocation of BTS and PEBS + * - DS and BTS hardware configuration * - buffer overflow handling (to be done) * - buffer access * - * It assumes: - * - get_task_struct on all traced tasks - * - current is allowed to trace tasks + * It does not do: + * - security checking (is the caller allowed to trace the task) + * - buffer allocation (memory accounting) * * * Copyright (C) 2007-2008 Intel Corporation. @@ -34,15 +34,30 @@ * The configuration for a particular DS hardware implementation. */ struct ds_configuration { - /* the size of the DS structure in bytes */ - unsigned char sizeof_ds; - /* the size of one pointer-typed field in the DS structure in bytes; - this covers the first 8 fields related to buffer management. 
*/ + /* the name of the configuration */ + const char *name; + /* the size of one pointer-typed field in the DS structure and + in the BTS and PEBS buffers in bytes; + this covers the first 8 DS fields related to buffer management. */ unsigned char sizeof_field; /* the size of a BTS/PEBS record in bytes */ unsigned char sizeof_rec[2]; + /* a series of bit-masks to control various features indexed + * by enum ds_feature */ + unsigned long ctl[dsf_ctl_max]; }; -static struct ds_configuration ds_cfg; +static DEFINE_PER_CPU(struct ds_configuration, ds_cfg_array); + +#define ds_cfg per_cpu(ds_cfg_array, smp_processor_id()) + +#define MAX_SIZEOF_DS (12 * 8) /* maximal size of a DS configuration */ +#define MAX_SIZEOF_BTS (3 * 8) /* maximal size of a BTS record */ +#define DS_ALIGNMENT (1 << 3) /* BTS and PEBS buffer alignment */ + +#define BTS_CONTROL \ + (ds_cfg.ctl[dsf_bts] | ds_cfg.ctl[dsf_bts_kernel] | ds_cfg.ctl[dsf_bts_user] |\ + ds_cfg.ctl[dsf_bts_overflow]) + /* * A BTS or PEBS tracer. @@ -61,6 +76,8 @@ struct ds_tracer { struct bts_tracer { /* the common DS part */ struct ds_tracer ds; + /* the trace including the DS configuration */ + struct bts_trace trace; /* buffer overflow notification function */ bts_ovfl_callback_t ovfl; }; @@ -68,6 +85,8 @@ struct bts_tracer { struct pebs_tracer { /* the common DS part */ struct ds_tracer ds; + /* the trace including the DS configuration */ + struct pebs_trace trace; /* buffer overflow notification function */ pebs_ovfl_callback_t ovfl; }; @@ -134,13 +153,11 @@ static inline void ds_set(unsigned char *base, enum ds_qualifier qual, (*(unsigned long *)base) = value; } -#define DS_ALIGNMENT (1 << 3) /* BTS and PEBS buffer alignment */ - /* * Locking is done only for allocating BTS or PEBS resources. 
*/ -static spinlock_t ds_lock = __SPIN_LOCK_UNLOCKED(ds_lock); +static DEFINE_SPINLOCK(ds_lock); /* @@ -156,27 +173,32 @@ static spinlock_t ds_lock = __SPIN_LOCK_UNLOCKED(ds_lock); * >0 number of per-thread tracers * <0 number of per-cpu tracers * - * The below functions to get and put tracers and to check the - * allocation type require the ds_lock to be held by the caller. - * * Tracers essentially gives the number of ds contexts for a certain * type of allocation. */ -static long tracers; +static atomic_t tracers = ATOMIC_INIT(0); static inline void get_tracer(struct task_struct *task) { - tracers += (task ? 1 : -1); + if (task) + atomic_inc(&tracers); + else + atomic_dec(&tracers); } static inline void put_tracer(struct task_struct *task) { - tracers -= (task ? 1 : -1); + if (task) + atomic_dec(&tracers); + else + atomic_inc(&tracers); } static inline int check_tracer(struct task_struct *task) { - return (task ? (tracers >= 0) : (tracers <= 0)); + return task ? + (atomic_read(&tracers) >= 0) : + (atomic_read(&tracers) <= 0); } @@ -190,14 +212,30 @@ static inline int check_tracer(struct task_struct *task) * Contexts are use-counted. They are allocated on first access and * deallocated when the last user puts the context. 
*/ -static DEFINE_PER_CPU(struct ds_context *, system_context); +struct ds_context { + /* pointer to the DS configuration; goes into MSR_IA32_DS_AREA */ + unsigned char ds[MAX_SIZEOF_DS]; + /* the owner of the BTS and PEBS configuration, respectively */ + struct bts_tracer *bts_master; + struct pebs_tracer *pebs_master; + /* use count */ + unsigned long count; + /* a pointer to the context location inside the thread_struct + * or the per_cpu context array */ + struct ds_context **this; + /* a pointer to the task owning this context, or NULL, if the + * context is owned by a cpu */ + struct task_struct *task; +}; + +static DEFINE_PER_CPU(struct ds_context *, system_context_array); -#define this_system_context per_cpu(system_context, smp_processor_id()) +#define system_context per_cpu(system_context_array, smp_processor_id()) static inline struct ds_context *ds_get_context(struct task_struct *task) { struct ds_context **p_context = - (task ? &task->thread.ds_ctx : &this_system_context); + (task ? 
&task->thread.ds_ctx : &system_context); struct ds_context *context = *p_context; unsigned long irq; @@ -225,10 +263,22 @@ static inline struct ds_context *ds_get_context(struct task_struct *task) wrmsrl(MSR_IA32_DS_AREA, (unsigned long)context->ds); } + + context->count++; + + spin_unlock_irqrestore(&ds_lock, irq); + } else { + spin_lock_irqsave(&ds_lock, irq); + + context = *p_context; + if (context) + context->count++; + spin_unlock_irqrestore(&ds_lock, irq); - } - context->count++; + if (!context) + context = ds_get_context(task); + } return context; } @@ -242,8 +292,10 @@ static inline void ds_put_context(struct ds_context *context) spin_lock_irqsave(&ds_lock, irq); - if (--context->count) - goto out; + if (--context->count) { + spin_unlock_irqrestore(&ds_lock, irq); + return; + } *(context->this) = NULL; @@ -253,14 +305,14 @@ static inline void ds_put_context(struct ds_context *context) if (!context->task || (context->task == current)) wrmsrl(MSR_IA32_DS_AREA, 0); - kfree(context); - out: spin_unlock_irqrestore(&ds_lock, irq); + + kfree(context); } /* - * Handle a buffer overflow + * Call the tracer's callback on a buffer overflow. 
* * context: the ds context * qual: the buffer type @@ -268,30 +320,244 @@ static inline void ds_put_context(struct ds_context *context) static void ds_overflow(struct ds_context *context, enum ds_qualifier qual) { switch (qual) { - case ds_bts: { - struct bts_tracer *tracer = - container_of(context->owner[qual], - struct bts_tracer, ds); - if (tracer->ovfl) - tracer->ovfl(tracer); - } + case ds_bts: + if (context->bts_master && + context->bts_master->ovfl) + context->bts_master->ovfl(context->bts_master); + break; + case ds_pebs: + if (context->pebs_master && + context->pebs_master->ovfl) + context->pebs_master->ovfl(context->pebs_master); break; - case ds_pebs: { - struct pebs_tracer *tracer = - container_of(context->owner[qual], - struct pebs_tracer, ds); - if (tracer->ovfl) - tracer->ovfl(tracer); } +} + + +/* + * Write raw data into the BTS or PEBS buffer. + * + * The remainder of any partially written record is zeroed out. + * + * context: the DS context + * qual: the buffer type + * record: the data to write + * size: the size of the data + */ +static int ds_write(struct ds_context *context, enum ds_qualifier qual, + const void *record, size_t size) +{ + int bytes_written = 0; + + if (!record) + return -EINVAL; + + while (size) { + unsigned long base, index, end, write_end, int_th; + unsigned long write_size, adj_write_size; + + /* + * write as much as possible without producing an + * overflow interrupt. + * + * interrupt_threshold must either be + * - bigger than absolute_maximum or + * - point to a record between buffer_base and absolute_maximum + * + * index points to a valid record. 
+ */ + base = ds_get(context->ds, qual, ds_buffer_base); + index = ds_get(context->ds, qual, ds_index); + end = ds_get(context->ds, qual, ds_absolute_maximum); + int_th = ds_get(context->ds, qual, ds_interrupt_threshold); + + write_end = min(end, int_th); + + /* if we are already beyond the interrupt threshold, + * we fill the entire buffer */ + if (write_end <= index) + write_end = end; + + if (write_end <= index) + break; + + write_size = min((unsigned long) size, write_end - index); + memcpy((void *)index, record, write_size); + + record = (const char *)record + write_size; + size -= write_size; + bytes_written += write_size; + + adj_write_size = write_size / ds_cfg.sizeof_rec[qual]; + adj_write_size *= ds_cfg.sizeof_rec[qual]; + + /* zero out trailing bytes */ + memset((char *)index + write_size, 0, + adj_write_size - write_size); + index += adj_write_size; + + if (index >= end) + index = base; + ds_set(context->ds, qual, ds_index, index); + + if (index >= int_th) + ds_overflow(context, qual); + } + + return bytes_written; +} + + +/* + * Branch Trace Store (BTS) uses the following format. Different + * architectures vary in the size of those fields. + * - source linear address + * - destination linear address + * - flags + * + * Later architectures use 64bit pointers throughout, whereas earlier + * architectures use 32bit pointers in 32bit mode. + * + * We compute the base address for the first 8 fields based on: + * - the field size stored in the DS configuration + * - the relative field position + * + * In order to store additional information in the BTS buffer, we use + * a special source address to indicate that the record requires + * special interpretation. + * + * Netburst indicated via a bit in the flags field whether the branch + * was predicted; this is ignored. 
+ * + * We use two levels of abstraction: + * - the raw data level defined here + * - an arch-independent level defined in ds.h + */ + +enum bts_field { + bts_from, + bts_to, + bts_flags, + + bts_qual = bts_from, + bts_jiffies = bts_to, + bts_pid = bts_flags, + + bts_qual_mask = (bts_qual_max - 1), + bts_escape = ((unsigned long)-1 & ~bts_qual_mask) +}; + +static inline unsigned long bts_get(const char *base, enum bts_field field) +{ + base += (ds_cfg.sizeof_field * field); + return *(unsigned long *)base; +} + +static inline void bts_set(char *base, enum bts_field field, unsigned long val) +{ + base += (ds_cfg.sizeof_field * field);; + (*(unsigned long *)base) = val; +} + + +/* + * The raw BTS data is architecture dependent. + * + * For higher-level users, we give an arch-independent view. + * - ds.h defines struct bts_struct + * - bts_read translates one raw bts record into a bts_struct + * - bts_write translates one bts_struct into the raw format and + * writes it into the top of the parameter tracer's buffer. 
+ * + * return: bytes read/written on success; -Eerrno, otherwise + */ +static int bts_read(struct bts_tracer *tracer, const void *at, + struct bts_struct *out) +{ + if (!tracer) + return -EINVAL; + + if (at < tracer->trace.ds.begin) + return -EINVAL; + + if (tracer->trace.ds.end < (at + tracer->trace.ds.size)) + return -EINVAL; + + memset(out, 0, sizeof(*out)); + if ((bts_get(at, bts_qual) & ~bts_qual_mask) == bts_escape) { + out->qualifier = (bts_get(at, bts_qual) & bts_qual_mask); + out->variant.timestamp.jiffies = bts_get(at, bts_jiffies); + out->variant.timestamp.pid = bts_get(at, bts_pid); + } else { + out->qualifier = bts_branch; + out->variant.lbr.from = bts_get(at, bts_from); + out->variant.lbr.to = bts_get(at, bts_to); + } + + return ds_cfg.sizeof_rec[ds_bts]; +} + +static int bts_write(struct bts_tracer *tracer, const struct bts_struct *in) +{ + unsigned char raw[MAX_SIZEOF_BTS]; + + if (!tracer) + return -EINVAL; + + if (MAX_SIZEOF_BTS < ds_cfg.sizeof_rec[ds_bts]) + return -EOVERFLOW; + + switch (in->qualifier) { + case bts_invalid: + bts_set(raw, bts_from, 0); + bts_set(raw, bts_to, 0); + bts_set(raw, bts_flags, 0); + break; + case bts_branch: + bts_set(raw, bts_from, in->variant.lbr.from); + bts_set(raw, bts_to, in->variant.lbr.to); + bts_set(raw, bts_flags, 0); + break; + case bts_task_arrives: + case bts_task_departs: + bts_set(raw, bts_qual, (bts_escape | in->qualifier)); + bts_set(raw, bts_jiffies, in->variant.timestamp.jiffies); + bts_set(raw, bts_pid, in->variant.timestamp.pid); break; + default: + return -EINVAL; } + + return ds_write(tracer->ds.context, ds_bts, raw, + ds_cfg.sizeof_rec[ds_bts]); } -static void ds_install_ds_config(struct ds_context *context, - enum ds_qualifier qual, - void *base, size_t size, size_t ith) +static void ds_write_config(struct ds_context *context, + struct ds_trace *cfg, enum ds_qualifier qual) +{ + unsigned char *ds = context->ds; + + ds_set(ds, qual, ds_buffer_base, (unsigned long)cfg->begin); + ds_set(ds, 
qual, ds_index, (unsigned long)cfg->top); + ds_set(ds, qual, ds_absolute_maximum, (unsigned long)cfg->end); + ds_set(ds, qual, ds_interrupt_threshold, (unsigned long)cfg->ith); +} + +static void ds_read_config(struct ds_context *context, + struct ds_trace *cfg, enum ds_qualifier qual) { + unsigned char *ds = context->ds; + + cfg->begin = (void *)ds_get(ds, qual, ds_buffer_base); + cfg->top = (void *)ds_get(ds, qual, ds_index); + cfg->end = (void *)ds_get(ds, qual, ds_absolute_maximum); + cfg->ith = (void *)ds_get(ds, qual, ds_interrupt_threshold); +} + +static void ds_init_ds_trace(struct ds_trace *trace, enum ds_qualifier qual, + void *base, size_t size, size_t ith, + unsigned int flags) { unsigned long buffer, adj; /* adjust the buffer address and size to meet alignment @@ -308,32 +574,30 @@ static void ds_install_ds_config(struct ds_context *context, buffer += adj; size -= adj; - size /= ds_cfg.sizeof_rec[qual]; - size *= ds_cfg.sizeof_rec[qual]; + trace->n = size / ds_cfg.sizeof_rec[qual]; + trace->size = ds_cfg.sizeof_rec[qual]; - ds_set(context->ds, qual, ds_buffer_base, buffer); - ds_set(context->ds, qual, ds_index, buffer); - ds_set(context->ds, qual, ds_absolute_maximum, buffer + size); + size = (trace->n * trace->size); + trace->begin = (void *)buffer; + trace->top = trace->begin; + trace->end = (void *)(buffer + size); /* The value for 'no threshold' is -1, which will set the * threshold outside of the buffer, just like we want it. 
*/ - ds_set(context->ds, qual, - ds_interrupt_threshold, buffer + size - ith); + trace->ith = (void *)(buffer + size - ith); + + trace->flags = flags; } -static int ds_request(struct ds_tracer *tracer, enum ds_qualifier qual, - struct task_struct *task, - void *base, size_t size, size_t th) + +static int ds_request(struct ds_tracer *tracer, struct ds_trace *trace, + enum ds_qualifier qual, struct task_struct *task, + void *base, size_t size, size_t th, unsigned int flags) { struct ds_context *context; - unsigned long irq; int error; - error = -EOPNOTSUPP; - if (!ds_cfg.sizeof_ds) - goto out; - error = -EINVAL; if (!base) goto out; @@ -360,43 +624,26 @@ static int ds_request(struct ds_tracer *tracer, enum ds_qualifier qual, goto out; tracer->context = context; + ds_init_ds_trace(trace, qual, base, size, th, flags); - spin_lock_irqsave(&ds_lock, irq); - - error = -EPERM; - if (!check_tracer(task)) - goto out_unlock; - get_tracer(task); - - error = -EPERM; - if (context->owner[qual]) - goto out_put_tracer; - context->owner[qual] = tracer; - - spin_unlock_irqrestore(&ds_lock, irq); - - - ds_install_ds_config(context, qual, base, size, th); - - return 0; - - out_put_tracer: - put_tracer(task); - out_unlock: - spin_unlock_irqrestore(&ds_lock, irq); - ds_put_context(context); - tracer->context = NULL; + error = 0; out: return error; } struct bts_tracer *ds_request_bts(struct task_struct *task, void *base, size_t size, - bts_ovfl_callback_t ovfl, size_t th) + bts_ovfl_callback_t ovfl, size_t th, + unsigned int flags) { struct bts_tracer *tracer; + unsigned long irq; int error; + error = -EOPNOTSUPP; + if (!ds_cfg.ctl[dsf_bts]) + goto out; + /* buffer overflow notification is not yet implemented */ error = -EOPNOTSUPP; if (ovfl) @@ -408,12 +655,40 @@ struct bts_tracer *ds_request_bts(struct task_struct *task, goto out; tracer->ovfl = ovfl; - error = ds_request(&tracer->ds, ds_bts, task, base, size, th); + error = ds_request(&tracer->ds, &tracer->trace.ds, + ds_bts, task, 
base, size, th, flags); if (error < 0) goto out_tracer; + + spin_lock_irqsave(&ds_lock, irq); + + error = -EPERM; + if (!check_tracer(task)) + goto out_unlock; + get_tracer(task); + + error = -EPERM; + if (tracer->ds.context->bts_master) + goto out_put_tracer; + tracer->ds.context->bts_master = tracer; + + spin_unlock_irqrestore(&ds_lock, irq); + + + tracer->trace.read = bts_read; + tracer->trace.write = bts_write; + + ds_write_config(tracer->ds.context, &tracer->trace.ds, ds_bts); + ds_resume_bts(tracer); + return tracer; + out_put_tracer: + put_tracer(task); + out_unlock: + spin_unlock_irqrestore(&ds_lock, irq); + ds_put_context(tracer->ds.context); out_tracer: kfree(tracer); out: @@ -422,9 +697,11 @@ struct bts_tracer *ds_request_bts(struct task_struct *task, struct pebs_tracer *ds_request_pebs(struct task_struct *task, void *base, size_t size, - pebs_ovfl_callback_t ovfl, size_t th) + pebs_ovfl_callback_t ovfl, size_t th, + unsigned int flags) { struct pebs_tracer *tracer; + unsigned long irq; int error; /* buffer overflow notification is not yet implemented */ @@ -438,300 +715,171 @@ struct pebs_tracer *ds_request_pebs(struct task_struct *task, goto out; tracer->ovfl = ovfl; - error = ds_request(&tracer->ds, ds_pebs, task, base, size, th); + error = ds_request(&tracer->ds, &tracer->trace.ds, + ds_pebs, task, base, size, th, flags); if (error < 0) goto out_tracer; + spin_lock_irqsave(&ds_lock, irq); + + error = -EPERM; + if (!check_tracer(task)) + goto out_unlock; + get_tracer(task); + + error = -EPERM; + if (tracer->ds.context->pebs_master) + goto out_put_tracer; + tracer->ds.context->pebs_master = tracer; + + spin_unlock_irqrestore(&ds_lock, irq); + + ds_write_config(tracer->ds.context, &tracer->trace.ds, ds_bts); + ds_resume_pebs(tracer); + return tracer; + out_put_tracer: + put_tracer(task); + out_unlock: + spin_unlock_irqrestore(&ds_lock, irq); + ds_put_context(tracer->ds.context); out_tracer: kfree(tracer); out: return ERR_PTR(error); } -static void 
ds_release(struct ds_tracer *tracer, enum ds_qualifier qual) -{ - WARN_ON_ONCE(tracer->context->owner[qual] != tracer); - tracer->context->owner[qual] = NULL; - - put_tracer(tracer->context->task); - ds_put_context(tracer->context); -} - -int ds_release_bts(struct bts_tracer *tracer) +void ds_release_bts(struct bts_tracer *tracer) { if (!tracer) - return -EINVAL; + return; - ds_release(&tracer->ds, ds_bts); - kfree(tracer); + ds_suspend_bts(tracer); - return 0; -} + WARN_ON_ONCE(tracer->ds.context->bts_master != tracer); + tracer->ds.context->bts_master = NULL; -int ds_release_pebs(struct pebs_tracer *tracer) -{ - if (!tracer) - return -EINVAL; + put_tracer(tracer->ds.context->task); + ds_put_context(tracer->ds.context); - ds_release(&tracer->ds, ds_pebs); kfree(tracer); - - return 0; -} - -static size_t ds_get_index(struct ds_context *context, enum ds_qualifier qual) -{ - unsigned long base, index; - - base = ds_get(context->ds, qual, ds_buffer_base); - index = ds_get(context->ds, qual, ds_index); - - return (index - base) / ds_cfg.sizeof_rec[qual]; } -int ds_get_bts_index(struct bts_tracer *tracer, size_t *pos) +void ds_suspend_bts(struct bts_tracer *tracer) { - if (!tracer) - return -EINVAL; + struct task_struct *task; - if (!pos) - return -EINVAL; - - *pos = ds_get_index(tracer->ds.context, ds_bts); - - return 0; -} - -int ds_get_pebs_index(struct pebs_tracer *tracer, size_t *pos) -{ if (!tracer) - return -EINVAL; + return; - if (!pos) - return -EINVAL; + task = tracer->ds.context->task; - *pos = ds_get_index(tracer->ds.context, ds_pebs); + if (!task || (task == current)) + update_debugctlmsr(get_debugctlmsr() & ~BTS_CONTROL); - return 0; -} + if (task) { + task->thread.debugctlmsr &= ~BTS_CONTROL; -static size_t ds_get_end(struct ds_context *context, enum ds_qualifier qual) -{ - unsigned long base, max; - - base = ds_get(context->ds, qual, ds_buffer_base); - max = ds_get(context->ds, qual, ds_absolute_maximum); - - return (max - base) / 
ds_cfg.sizeof_rec[qual]; + if (!task->thread.debugctlmsr) + clear_tsk_thread_flag(task, TIF_DEBUGCTLMSR); + } } -int ds_get_bts_end(struct bts_tracer *tracer, size_t *pos) +void ds_resume_bts(struct bts_tracer *tracer) { - if (!tracer) - return -EINVAL; - - if (!pos) - return -EINVAL; - - *pos = ds_get_end(tracer->ds.context, ds_bts); - - return 0; -} + struct task_struct *task; + unsigned long control; -int ds_get_pebs_end(struct pebs_tracer *tracer, size_t *pos) -{ if (!tracer) - return -EINVAL; - - if (!pos) - return -EINVAL; - - *pos = ds_get_end(tracer->ds.context, ds_pebs); - - return 0; -} - -static int ds_access(struct ds_context *context, enum ds_qualifier qual, - size_t index, const void **record) -{ - unsigned long base, idx; - - if (!record) - return -EINVAL; - - base = ds_get(context->ds, qual, ds_buffer_base); - idx = base + (index * ds_cfg.sizeof_rec[qual]); - - if (idx > ds_get(context->ds, qual, ds_absolute_maximum)) - return -EINVAL; + return; - *record = (const void *)idx; + task = tracer->ds.context->task; - return ds_cfg.sizeof_rec[qual]; -} + control = ds_cfg.ctl[dsf_bts]; + if (!(tracer->trace.ds.flags & BTS_KERNEL)) + control |= ds_cfg.ctl[dsf_bts_kernel]; + if (!(tracer->trace.ds.flags & BTS_USER)) + control |= ds_cfg.ctl[dsf_bts_user]; -int ds_access_bts(struct bts_tracer *tracer, size_t index, - const void **record) -{ - if (!tracer) - return -EINVAL; + if (task) { + task->thread.debugctlmsr |= control; + set_tsk_thread_flag(task, TIF_DEBUGCTLMSR); + } - return ds_access(tracer->ds.context, ds_bts, index, record); + if (!task || (task == current)) + update_debugctlmsr(get_debugctlmsr() | control); } -int ds_access_pebs(struct pebs_tracer *tracer, size_t index, - const void **record) +void ds_release_pebs(struct pebs_tracer *tracer) { if (!tracer) - return -EINVAL; - - return ds_access(tracer->ds.context, ds_pebs, index, record); -} - -static int ds_write(struct ds_context *context, enum ds_qualifier qual, - const void *record, size_t 
size) -{ - int bytes_written = 0; - - if (!record) - return -EINVAL; - - while (size) { - unsigned long base, index, end, write_end, int_th; - unsigned long write_size, adj_write_size; - - /* - * write as much as possible without producing an - * overflow interrupt. - * - * interrupt_threshold must either be - * - bigger than absolute_maximum or - * - point to a record between buffer_base and absolute_maximum - * - * index points to a valid record. - */ - base = ds_get(context->ds, qual, ds_buffer_base); - index = ds_get(context->ds, qual, ds_index); - end = ds_get(context->ds, qual, ds_absolute_maximum); - int_th = ds_get(context->ds, qual, ds_interrupt_threshold); - - write_end = min(end, int_th); - - /* if we are already beyond the interrupt threshold, - * we fill the entire buffer */ - if (write_end <= index) - write_end = end; - - if (write_end <= index) - break; - - write_size = min((unsigned long) size, write_end - index); - memcpy((void *)index, record, write_size); - - record = (const char *)record + write_size; - size -= write_size; - bytes_written += write_size; - - adj_write_size = write_size / ds_cfg.sizeof_rec[qual]; - adj_write_size *= ds_cfg.sizeof_rec[qual]; - - /* zero out trailing bytes */ - memset((char *)index + write_size, 0, - adj_write_size - write_size); - index += adj_write_size; + return; - if (index >= end) - index = base; - ds_set(context->ds, qual, ds_index, index); + ds_suspend_pebs(tracer); - if (index >= int_th) - ds_overflow(context, qual); - } + WARN_ON_ONCE(tracer->ds.context->pebs_master != tracer); + tracer->ds.context->pebs_master = NULL; - return bytes_written; -} + put_tracer(tracer->ds.context->task); + ds_put_context(tracer->ds.context); -int ds_write_bts(struct bts_tracer *tracer, const void *record, size_t size) -{ - if (!tracer) - return -EINVAL; - - return ds_write(tracer->ds.context, ds_bts, record, size); + kfree(tracer); } -int ds_write_pebs(struct pebs_tracer *tracer, const void *record, size_t size) +void 
ds_suspend_pebs(struct pebs_tracer *tracer) { - if (!tracer) - return -EINVAL; - return ds_write(tracer->ds.context, ds_pebs, record, size); } -static void ds_reset_or_clear(struct ds_context *context, - enum ds_qualifier qual, int clear) +void ds_resume_pebs(struct pebs_tracer *tracer) { - unsigned long base, end; - - base = ds_get(context->ds, qual, ds_buffer_base); - end = ds_get(context->ds, qual, ds_absolute_maximum); - - if (clear) - memset((void *)base, 0, end - base); - ds_set(context->ds, qual, ds_index, base); } -int ds_reset_bts(struct bts_tracer *tracer) +const struct bts_trace *ds_read_bts(struct bts_tracer *tracer) { if (!tracer) - return -EINVAL; - - ds_reset_or_clear(tracer->ds.context, ds_bts, /* clear = */ 0); + return NULL; - return 0; + ds_read_config(tracer->ds.context, &tracer->trace.ds, ds_bts); + return &tracer->trace; } -int ds_reset_pebs(struct pebs_tracer *tracer) +const struct pebs_trace *ds_read_pebs(struct pebs_tracer *tracer) { if (!tracer) - return -EINVAL; + return NULL; - ds_reset_or_clear(tracer->ds.context, ds_pebs, /* clear = */ 0); + ds_read_config(tracer->ds.context, &tracer->trace.ds, ds_pebs); + tracer->trace.reset_value = + *(u64 *)(tracer->ds.context->ds + (ds_cfg.sizeof_field * 8)); - return 0; + return &tracer->trace; } -int ds_clear_bts(struct bts_tracer *tracer) +int ds_reset_bts(struct bts_tracer *tracer) { if (!tracer) return -EINVAL; - ds_reset_or_clear(tracer->ds.context, ds_bts, /* clear = */ 1); - - return 0; -} - -int ds_clear_pebs(struct pebs_tracer *tracer) -{ - if (!tracer) - return -EINVAL; + tracer->trace.ds.top = tracer->trace.ds.begin; - ds_reset_or_clear(tracer->ds.context, ds_pebs, /* clear = */ 1); + ds_set(tracer->ds.context->ds, ds_bts, ds_index, + (unsigned long)tracer->trace.ds.top); return 0; } -int ds_get_pebs_reset(struct pebs_tracer *tracer, u64 *value) +int ds_reset_pebs(struct pebs_tracer *tracer) { if (!tracer) return -EINVAL; - if (!value) - return -EINVAL; + tracer->trace.ds.top = 
tracer->trace.ds.begin; - *value = *(u64 *)(tracer->ds.context->ds + (ds_cfg.sizeof_field * 8)); + ds_set(tracer->ds.context->ds, ds_bts, ds_index, + (unsigned long)tracer->trace.ds.top); return 0; } @@ -746,35 +894,59 @@ int ds_set_pebs_reset(struct pebs_tracer *tracer, u64 value) return 0; } -static const struct ds_configuration ds_cfg_var = { - .sizeof_ds = sizeof(long) * 12, - .sizeof_field = sizeof(long), - .sizeof_rec[ds_bts] = sizeof(long) * 3, +static const struct ds_configuration ds_cfg_netburst = { + .name = "netburst", + .ctl[dsf_bts] = (1 << 2) | (1 << 3), + .ctl[dsf_bts_kernel] = (1 << 5), + .ctl[dsf_bts_user] = (1 << 6), + + .sizeof_field = sizeof(long), + .sizeof_rec[ds_bts] = sizeof(long) * 3, #ifdef __i386__ - .sizeof_rec[ds_pebs] = sizeof(long) * 10 + .sizeof_rec[ds_pebs] = sizeof(long) * 10, #else - .sizeof_rec[ds_pebs] = sizeof(long) * 18 + .sizeof_rec[ds_pebs] = sizeof(long) * 18, #endif }; -static const struct ds_configuration ds_cfg_64 = { - .sizeof_ds = 8 * 12, - .sizeof_field = 8, - .sizeof_rec[ds_bts] = 8 * 3, +static const struct ds_configuration ds_cfg_pentium_m = { + .name = "pentium m", + .ctl[dsf_bts] = (1 << 6) | (1 << 7), + + .sizeof_field = sizeof(long), + .sizeof_rec[ds_bts] = sizeof(long) * 3, #ifdef __i386__ - .sizeof_rec[ds_pebs] = 8 * 10 + .sizeof_rec[ds_pebs] = sizeof(long) * 10, #else - .sizeof_rec[ds_pebs] = 8 * 18 + .sizeof_rec[ds_pebs] = sizeof(long) * 18, #endif }; +static const struct ds_configuration ds_cfg_core2 = { + .name = "core 2", + .ctl[dsf_bts] = (1 << 6) | (1 << 7), + .ctl[dsf_bts_kernel] = (1 << 9), + .ctl[dsf_bts_user] = (1 << 10), + + .sizeof_field = 8, + .sizeof_rec[ds_bts] = 8 * 3, + .sizeof_rec[ds_pebs] = 8 * 18, +}; -static inline void +static void ds_configure(const struct ds_configuration *cfg) { + memset(&ds_cfg, 0, sizeof(ds_cfg)); ds_cfg = *cfg; - printk(KERN_INFO "DS available\n"); + printk(KERN_INFO "[ds] using %s configuration\n", ds_cfg.name); + + if (!cpu_has_bts) { + ds_cfg.ctl[dsf_bts] = 0; 
+ printk(KERN_INFO "[ds] bts not available\n"); + } + if (!cpu_has_pebs) + printk(KERN_INFO "[ds] pebs not available\n"); - WARN_ON_ONCE(MAX_SIZEOF_DS < ds_cfg.sizeof_ds); + WARN_ON_ONCE(MAX_SIZEOF_DS < (12 * ds_cfg.sizeof_field)); } void __cpuinit ds_init_intel(struct cpuinfo_x86 *c) @@ -787,10 +959,10 @@ void __cpuinit ds_init_intel(struct cpuinfo_x86 *c) break; case 0xD: case 0xE: /* Pentium M */ - ds_configure(&ds_cfg_var); + ds_configure(&ds_cfg_pentium_m); break; default: /* Core2, Atom, ... */ - ds_configure(&ds_cfg_64); + ds_configure(&ds_cfg_core2); break; } break; @@ -799,7 +971,7 @@ void __cpuinit ds_init_intel(struct cpuinfo_x86 *c) case 0x0: case 0x1: case 0x2: /* Netburst */ - ds_configure(&ds_cfg_var); + ds_configure(&ds_cfg_netburst); break; default: /* sorry, don't know about them */ @@ -812,14 +984,41 @@ void __cpuinit ds_init_intel(struct cpuinfo_x86 *c) } } -void ds_free(struct ds_context *context) +/* + * Change the DS configuration from tracing prev to tracing next. + */ +void ds_switch_to(struct task_struct *prev, struct task_struct *next) { - /* This is called when the task owning the parameter context - * is dying. There should not be any user of that context left - * to disturb us, anymore. 
*/ - unsigned long leftovers = context->count; - while (leftovers--) { - put_tracer(context->task); - ds_put_context(context); + struct ds_context *prev_ctx = prev->thread.ds_ctx; + struct ds_context *next_ctx = next->thread.ds_ctx; + + if (prev_ctx) { + update_debugctlmsr(0); + + if (prev_ctx->bts_master && + (prev_ctx->bts_master->trace.ds.flags & BTS_TIMESTAMPS)) { + struct bts_struct ts = { + .qualifier = bts_task_departs, + .variant.timestamp.jiffies = jiffies_64, + .variant.timestamp.pid = prev->pid + }; + bts_write(prev_ctx->bts_master, &ts); + } + } + + if (next_ctx) { + if (next_ctx->bts_master && + (next_ctx->bts_master->trace.ds.flags & BTS_TIMESTAMPS)) { + struct bts_struct ts = { + .qualifier = bts_task_arrives, + .variant.timestamp.jiffies = jiffies_64, + .variant.timestamp.pid = next->pid + }; + bts_write(next_ctx->bts_master, &ts); + } + + wrmsrl(MSR_IA32_DS_AREA, (unsigned long)next_ctx->ds); } + + update_debugctlmsr(next->thread.debugctlmsr); } diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 24c2276aa453..605eff9a8ac0 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -252,11 +252,14 @@ void exit_thread(void) put_cpu(); } #ifdef CONFIG_X86_DS - /* Free any DS contexts that have not been properly released. */ - if (unlikely(current->thread.ds_ctx)) { - /* we clear debugctl to make sure DS is not used. */ - update_debugctlmsr(0); - ds_free(current->thread.ds_ctx); + /* Free any BTS tracers that have not been properly released. 
*/ + if (unlikely(current->bts)) { + ds_release_bts(current->bts); + current->bts = NULL; + + kfree(current->bts_buffer); + current->bts_buffer = NULL; + current->bts_size = 0; } #endif /* CONFIG_X86_DS */ } @@ -420,48 +423,19 @@ int set_tsc_mode(unsigned int val) return 0; } -#ifdef CONFIG_X86_DS -static int update_debugctl(struct thread_struct *prev, - struct thread_struct *next, unsigned long debugctl) -{ - unsigned long ds_prev = 0; - unsigned long ds_next = 0; - - if (prev->ds_ctx) - ds_prev = (unsigned long)prev->ds_ctx->ds; - if (next->ds_ctx) - ds_next = (unsigned long)next->ds_ctx->ds; - - if (ds_next != ds_prev) { - /* we clear debugctl to make sure DS - * is not in use when we change it */ - debugctl = 0; - update_debugctlmsr(0); - wrmsr(MSR_IA32_DS_AREA, ds_next, 0); - } - return debugctl; -} -#else -static int update_debugctl(struct thread_struct *prev, - struct thread_struct *next, unsigned long debugctl) -{ - return debugctl; -} -#endif /* CONFIG_X86_DS */ - static noinline void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p, struct tss_struct *tss) { struct thread_struct *prev, *next; - unsigned long debugctl; prev = &prev_p->thread; next = &next_p->thread; - debugctl = update_debugctl(prev, next, prev->debugctlmsr); - - if (next->debugctlmsr != debugctl) + if (test_tsk_thread_flag(next_p, TIF_DS_AREA_MSR) || + test_tsk_thread_flag(prev_p, TIF_DS_AREA_MSR)) + ds_switch_to(prev_p, next_p); + else if (next->debugctlmsr != prev->debugctlmsr) update_debugctlmsr(next->debugctlmsr); if (test_tsk_thread_flag(next_p, TIF_DEBUG)) { @@ -483,15 +457,6 @@ __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p, hard_enable_TSC(); } -#ifdef CONFIG_X86_PTRACE_BTS - if (test_tsk_thread_flag(prev_p, TIF_BTS_TRACE_TS)) - ptrace_bts_take_timestamp(prev_p, BTS_TASK_DEPARTS); - - if (test_tsk_thread_flag(next_p, TIF_BTS_TRACE_TS)) - ptrace_bts_take_timestamp(next_p, BTS_TASK_ARRIVES); -#endif /* CONFIG_X86_PTRACE_BTS */ - - if 
(!test_tsk_thread_flag(next_p, TIF_IO_BITMAP)) { /* * Disable the bitmap via an invalid offset. We still cache diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index fbb321d53d34..1cfd2a4bf853 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -237,11 +237,14 @@ void exit_thread(void) put_cpu(); } #ifdef CONFIG_X86_DS - /* Free any DS contexts that have not been properly released. */ - if (unlikely(t->ds_ctx)) { - /* we clear debugctl to make sure DS is not used. */ - update_debugctlmsr(0); - ds_free(t->ds_ctx); + /* Free any BTS tracers that have not been properly released. */ + if (unlikely(current->bts)) { + ds_release_bts(current->bts); + current->bts = NULL; + + kfree(current->bts_buffer); + current->bts_buffer = NULL; + current->bts_size = 0; } #endif /* CONFIG_X86_DS */ } @@ -471,35 +474,14 @@ static inline void __switch_to_xtra(struct task_struct *prev_p, struct tss_struct *tss) { struct thread_struct *prev, *next; - unsigned long debugctl; prev = &prev_p->thread, next = &next_p->thread; - debugctl = prev->debugctlmsr; - -#ifdef CONFIG_X86_DS - { - unsigned long ds_prev = 0, ds_next = 0; - - if (prev->ds_ctx) - ds_prev = (unsigned long)prev->ds_ctx->ds; - if (next->ds_ctx) - ds_next = (unsigned long)next->ds_ctx->ds; - - if (ds_next != ds_prev) { - /* - * We clear debugctl to make sure DS - * is not in use when we change it: - */ - debugctl = 0; - update_debugctlmsr(0); - wrmsrl(MSR_IA32_DS_AREA, ds_next); - } - } -#endif /* CONFIG_X86_DS */ - - if (next->debugctlmsr != debugctl) + if (test_tsk_thread_flag(next_p, TIF_DS_AREA_MSR) || + test_tsk_thread_flag(prev_p, TIF_DS_AREA_MSR)) + ds_switch_to(prev_p, next_p); + else if (next->debugctlmsr != prev->debugctlmsr) update_debugctlmsr(next->debugctlmsr); if (test_tsk_thread_flag(next_p, TIF_DEBUG)) { @@ -534,14 +516,6 @@ static inline void __switch_to_xtra(struct task_struct *prev_p, */ memset(tss->io_bitmap, 0xff, prev->io_bitmap_max); } - -#ifdef 
CONFIG_X86_PTRACE_BTS - if (test_tsk_thread_flag(prev_p, TIF_BTS_TRACE_TS)) - ptrace_bts_take_timestamp(prev_p, BTS_TASK_DEPARTS); - - if (test_tsk_thread_flag(next_p, TIF_BTS_TRACE_TS)) - ptrace_bts_take_timestamp(next_p, BTS_TASK_ARRIVES); -#endif /* CONFIG_X86_PTRACE_BTS */ } /* diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index b2998fe1166b..45e9855da2d2 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -581,153 +581,73 @@ static int ioperm_get(struct task_struct *target, } #ifdef CONFIG_X86_PTRACE_BTS -/* - * The configuration for a particular BTS hardware implementation. - */ -struct bts_configuration { - /* the size of a BTS record in bytes; at most BTS_MAX_RECORD_SIZE */ - unsigned char sizeof_bts; - /* the size of a field in the BTS record in bytes */ - unsigned char sizeof_field; - /* a bitmask to enable/disable BTS in DEBUGCTL MSR */ - unsigned long debugctl_mask; -}; -static struct bts_configuration bts_cfg; - -#define BTS_MAX_RECORD_SIZE (8 * 3) - - -/* - * Branch Trace Store (BTS) uses the following format. Different - * architectures vary in the size of those fields. - * - source linear address - * - destination linear address - * - flags - * - * Later architectures use 64bit pointers throughout, whereas earlier - * architectures use 32bit pointers in 32bit mode. - * - * We compute the base address for the first 8 fields based on: - * - the field size stored in the DS configuration - * - the relative field position - * - * In order to store additional information in the BTS buffer, we use - * a special source address to indicate that the record requires - * special interpretation. - * - * Netburst indicated via a bit in the flags field whether the branch - * was predicted; this is ignored. 
- */ - -enum bts_field { - bts_from = 0, - bts_to, - bts_flags, - - bts_escape = (unsigned long)-1, - bts_qual = bts_to, - bts_jiffies = bts_flags -}; - -static inline unsigned long bts_get(const char *base, enum bts_field field) -{ - base += (bts_cfg.sizeof_field * field); - return *(unsigned long *)base; -} - -static inline void bts_set(char *base, enum bts_field field, unsigned long val) -{ - base += (bts_cfg.sizeof_field * field);; - (*(unsigned long *)base) = val; -} - -/* - * Translate a BTS record from the raw format into the bts_struct format - * - * out (out): bts_struct interpretation - * raw: raw BTS record - */ -static void ptrace_bts_translate_record(struct bts_struct *out, const void *raw) -{ - memset(out, 0, sizeof(*out)); - if (bts_get(raw, bts_from) == bts_escape) { - out->qualifier = bts_get(raw, bts_qual); - out->variant.jiffies = bts_get(raw, bts_jiffies); - } else { - out->qualifier = BTS_BRANCH; - out->variant.lbr.from_ip = bts_get(raw, bts_from); - out->variant.lbr.to_ip = bts_get(raw, bts_to); - } -} - static int ptrace_bts_read_record(struct task_struct *child, size_t index, struct bts_struct __user *out) { - struct bts_struct ret; - const void *bts_record; - size_t bts_index, bts_end; + const struct bts_trace *trace; + struct bts_struct bts; + const unsigned char *at; int error; - error = ds_get_bts_end(child->bts, &bts_end); - if (error < 0) - return error; - - if (bts_end <= index) - return -EINVAL; + trace = ds_read_bts(child->bts); + if (!trace) + return -EPERM; - error = ds_get_bts_index(child->bts, &bts_index); - if (error < 0) - return error; + at = trace->ds.top - ((index + 1) * trace->ds.size); + if ((void *)at < trace->ds.begin) + at += (trace->ds.n * trace->ds.size); - /* translate the ptrace bts index into the ds bts index */ - bts_index += bts_end - (index + 1); - if (bts_end <= bts_index) - bts_index -= bts_end; + if (!trace->read) + return -EOPNOTSUPP; - error = ds_access_bts(child->bts, bts_index, &bts_record); + error = 
trace->read(child->bts, at, &bts); if (error < 0) return error; - ptrace_bts_translate_record(&ret, bts_record); - - if (copy_to_user(out, &ret, sizeof(ret))) + if (copy_to_user(out, &bts, sizeof(bts))) return -EFAULT; - return sizeof(ret); + return sizeof(bts); } static int ptrace_bts_drain(struct task_struct *child, long size, struct bts_struct __user *out) { - struct bts_struct ret; - const unsigned char *raw; - size_t end, i; - int error; + const struct bts_trace *trace; + const unsigned char *at; + int error, drained = 0; - error = ds_get_bts_index(child->bts, &end); - if (error < 0) - return error; + trace = ds_read_bts(child->bts); + if (!trace) + return -EPERM; - if (size < (end * sizeof(struct bts_struct))) + if (!trace->read) + return -EOPNOTSUPP; + + if (size < (trace->ds.top - trace->ds.begin)) return -EIO; - error = ds_access_bts(child->bts, 0, (const void **)&raw); - if (error < 0) - return error; + for (at = trace->ds.begin; (void *)at < trace->ds.top; + out++, drained++, at += trace->ds.size) { + struct bts_struct bts; + int error; - for (i = 0; i < end; i++, out++, raw += bts_cfg.sizeof_bts) { - ptrace_bts_translate_record(&ret, raw); + error = trace->read(child->bts, at, &bts); + if (error < 0) + return error; - if (copy_to_user(out, &ret, sizeof(ret))) + if (copy_to_user(out, &bts, sizeof(bts))) return -EFAULT; } - error = ds_clear_bts(child->bts); + memset(trace->ds.begin, 0, trace->ds.n * trace->ds.size); + + error = ds_reset_bts(child->bts); if (error < 0) return error; - return end; + return drained; } static int ptrace_bts_config(struct task_struct *child, @@ -735,136 +655,89 @@ static int ptrace_bts_config(struct task_struct *child, const struct ptrace_bts_config __user *ucfg) { struct ptrace_bts_config cfg; - int error = 0; - - error = -EOPNOTSUPP; - if (!bts_cfg.sizeof_bts) - goto errout; + unsigned int flags = 0; - error = -EIO; if (cfg_size < sizeof(cfg)) - goto errout; + return -EIO; - error = -EFAULT; if (copy_from_user(&cfg, ucfg, 
sizeof(cfg))) - goto errout; - - error = -EINVAL; - if ((cfg.flags & PTRACE_BTS_O_SIGNAL) && - !(cfg.flags & PTRACE_BTS_O_ALLOC)) - goto errout; - - if (cfg.flags & PTRACE_BTS_O_ALLOC) { - bts_ovfl_callback_t ovfl = NULL; - unsigned int sig = 0; - - error = -EINVAL; - if (cfg.size < (10 * bts_cfg.sizeof_bts)) - goto errout; + return -EFAULT; - if (cfg.flags & PTRACE_BTS_O_SIGNAL) { - if (!cfg.signal) - goto errout; + if (child->bts) { + ds_release_bts(child->bts); + child->bts = NULL; + } - error = -EOPNOTSUPP; - goto errout; + if (cfg.flags & PTRACE_BTS_O_SIGNAL) { + if (!cfg.signal) + return -EINVAL; - sig = cfg.signal; - } + return -EOPNOTSUPP; - if (child->bts) { - (void)ds_release_bts(child->bts); - kfree(child->bts_buffer); + child->thread.bts_ovfl_signal = cfg.signal; + } - child->bts = NULL; - child->bts_buffer = NULL; - } + if ((cfg.flags & PTRACE_BTS_O_ALLOC) && + (cfg.size != child->bts_size)) { + kfree(child->bts_buffer); - error = -ENOMEM; + child->bts_size = cfg.size; child->bts_buffer = kzalloc(cfg.size, GFP_KERNEL); - if (!child->bts_buffer) - goto errout; - - child->bts = ds_request_bts(child, child->bts_buffer, cfg.size, - ovfl, /* th = */ (size_t)-1); - if (IS_ERR(child->bts)) { - error = PTR_ERR(child->bts); - kfree(child->bts_buffer); - child->bts = NULL; - child->bts_buffer = NULL; - goto errout; + if (!child->bts_buffer) { + child->bts_size = 0; + return -ENOMEM; } - - child->thread.bts_ovfl_signal = sig; } - error = -EINVAL; - if (!child->thread.ds_ctx && cfg.flags) - goto errout; - if (cfg.flags & PTRACE_BTS_O_TRACE) - child->thread.debugctlmsr |= bts_cfg.debugctl_mask; - else - child->thread.debugctlmsr &= ~bts_cfg.debugctl_mask; + flags |= BTS_USER; if (cfg.flags & PTRACE_BTS_O_SCHED) - set_tsk_thread_flag(child, TIF_BTS_TRACE_TS); - else - clear_tsk_thread_flag(child, TIF_BTS_TRACE_TS); + flags |= BTS_TIMESTAMPS; - error = sizeof(cfg); + child->bts = ds_request_bts(child, child->bts_buffer, child->bts_size, + /* ovfl = */ NULL, /* th = 
*/ (size_t)-1, + flags); + if (IS_ERR(child->bts)) { + int error = PTR_ERR(child->bts); -out: - if (child->thread.debugctlmsr) - set_tsk_thread_flag(child, TIF_DEBUGCTLMSR); - else - clear_tsk_thread_flag(child, TIF_DEBUGCTLMSR); + kfree(child->bts_buffer); + child->bts = NULL; + child->bts_buffer = NULL; + child->bts_size = 0; - return error; + return error; + } -errout: - child->thread.debugctlmsr &= ~bts_cfg.debugctl_mask; - clear_tsk_thread_flag(child, TIF_BTS_TRACE_TS); - goto out; + return sizeof(cfg); } static int ptrace_bts_status(struct task_struct *child, long cfg_size, struct ptrace_bts_config __user *ucfg) { + const struct bts_trace *trace; struct ptrace_bts_config cfg; - size_t end; - const void *base, *max; - int error; if (cfg_size < sizeof(cfg)) return -EIO; - error = ds_get_bts_end(child->bts, &end); - if (error < 0) - return error; - - error = ds_access_bts(child->bts, /* index = */ 0, &base); - if (error < 0) - return error; - - error = ds_access_bts(child->bts, /* index = */ end, &max); - if (error < 0) - return error; + trace = ds_read_bts(child->bts); + if (!trace) + return -EPERM; memset(&cfg, 0, sizeof(cfg)); - cfg.size = (max - base); + cfg.size = trace->ds.end - trace->ds.begin; cfg.signal = child->thread.bts_ovfl_signal; cfg.bts_size = sizeof(struct bts_struct); if (cfg.signal) cfg.flags |= PTRACE_BTS_O_SIGNAL; - if (test_tsk_thread_flag(child, TIF_DEBUGCTLMSR) && - child->thread.debugctlmsr & bts_cfg.debugctl_mask) + if (trace->ds.flags & BTS_USER) cfg.flags |= PTRACE_BTS_O_TRACE; - if (test_tsk_thread_flag(child, TIF_BTS_TRACE_TS)) + if (trace->ds.flags & BTS_TIMESTAMPS) cfg.flags |= PTRACE_BTS_O_SCHED; if (copy_to_user(ucfg, &cfg, sizeof(cfg))) @@ -873,105 +746,28 @@ static int ptrace_bts_status(struct task_struct *child, return sizeof(cfg); } -static int ptrace_bts_write_record(struct task_struct *child, - const struct bts_struct *in) +static int ptrace_bts_clear(struct task_struct *child) { - unsigned char 
bts_record[BTS_MAX_RECORD_SIZE]; + const struct bts_trace *trace; - if (BTS_MAX_RECORD_SIZE < bts_cfg.sizeof_bts) - return -EOVERFLOW; + trace = ds_read_bts(child->bts); + if (!trace) + return -EPERM; - memset(bts_record, 0, bts_cfg.sizeof_bts); - switch (in->qualifier) { - case BTS_INVALID: - break; + memset(trace->ds.begin, 0, trace->ds.n * trace->ds.size); - case BTS_BRANCH: - bts_set(bts_record, bts_from, in->variant.lbr.from_ip); - bts_set(bts_record, bts_to, in->variant.lbr.to_ip); - break; - - case BTS_TASK_ARRIVES: - case BTS_TASK_DEPARTS: - bts_set(bts_record, bts_from, bts_escape); - bts_set(bts_record, bts_qual, in->qualifier); - bts_set(bts_record, bts_jiffies, in->variant.jiffies); - break; - - default: - return -EINVAL; - } - - return ds_write_bts(child->bts, bts_record, bts_cfg.sizeof_bts); + return ds_reset_bts(child->bts); } -void ptrace_bts_take_timestamp(struct task_struct *tsk, - enum bts_qualifier qualifier) +static int ptrace_bts_size(struct task_struct *child) { - struct bts_struct rec = { - .qualifier = qualifier, - .variant.jiffies = jiffies_64 - }; - - ptrace_bts_write_record(tsk, &rec); -} - -static const struct bts_configuration bts_cfg_netburst = { - .sizeof_bts = sizeof(long) * 3, - .sizeof_field = sizeof(long), - .debugctl_mask = (1<<2)|(1<<3)|(1<<5) -}; + const struct bts_trace *trace; -static const struct bts_configuration bts_cfg_pentium_m = { - .sizeof_bts = sizeof(long) * 3, - .sizeof_field = sizeof(long), - .debugctl_mask = (1<<6)|(1<<7) -}; + trace = ds_read_bts(child->bts); + if (!trace) + return -EPERM; -static const struct bts_configuration bts_cfg_core2 = { - .sizeof_bts = 8 * 3, - .sizeof_field = 8, - .debugctl_mask = (1<<6)|(1<<7)|(1<<9) -}; - -static inline void bts_configure(const struct bts_configuration *cfg) -{ - bts_cfg = *cfg; -} - -void __cpuinit ptrace_bts_init_intel(struct cpuinfo_x86 *c) -{ - switch (c->x86) { - case 0x6: - switch (c->x86_model) { - case 0 ... 
0xC: - /* sorry, don't know about them */ - break; - case 0xD: - case 0xE: /* Pentium M */ - bts_configure(&bts_cfg_pentium_m); - break; - default: /* Core2, Atom, ... */ - bts_configure(&bts_cfg_core2); - break; - } - break; - case 0xF: - switch (c->x86_model) { - case 0x0: - case 0x1: - case 0x2: /* Netburst */ - bts_configure(&bts_cfg_netburst); - break; - default: - /* sorry, don't know about them */ - break; - } - break; - default: - /* sorry, don't know about them */ - break; - } + return (trace->ds.top - trace->ds.begin) / trace->ds.size; } #endif /* CONFIG_X86_PTRACE_BTS */ @@ -988,15 +784,12 @@ void ptrace_disable(struct task_struct *child) #endif #ifdef CONFIG_X86_PTRACE_BTS if (child->bts) { - (void)ds_release_bts(child->bts); + ds_release_bts(child->bts); + child->bts = NULL; + kfree(child->bts_buffer); child->bts_buffer = NULL; - - child->thread.debugctlmsr &= ~bts_cfg.debugctl_mask; - if (!child->thread.debugctlmsr) - clear_tsk_thread_flag(child, TIF_DEBUGCTLMSR); - - clear_tsk_thread_flag(child, TIF_BTS_TRACE_TS); + child->bts_size = 0; } #endif /* CONFIG_X86_PTRACE_BTS */ } @@ -1129,16 +922,9 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data) (child, data, (struct ptrace_bts_config __user *)addr); break; - case PTRACE_BTS_SIZE: { - size_t size; - - ret = ds_get_bts_index(child->bts, &size); - if (ret == 0) { - WARN_ON_ONCE(size != (int) size); - ret = (int) size; - } + case PTRACE_BTS_SIZE: + ret = ptrace_bts_size(child); break; - } case PTRACE_BTS_GET: ret = ptrace_bts_read_record @@ -1146,7 +932,7 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data) break; case PTRACE_BTS_CLEAR: - ret = ds_clear_bts(child->bts); + ret = ptrace_bts_clear(child); break; case PTRACE_BTS_DRAIN: @@ -1409,6 +1195,14 @@ long compat_arch_ptrace(struct task_struct *child, compat_long_t request, case PTRACE_GET_THREAD_AREA: case PTRACE_SET_THREAD_AREA: +#ifdef CONFIG_X86_PTRACE_BTS + case PTRACE_BTS_CONFIG: + 
case PTRACE_BTS_STATUS: + case PTRACE_BTS_SIZE: + case PTRACE_BTS_GET: + case PTRACE_BTS_CLEAR: + case PTRACE_BTS_DRAIN: +#endif /* CONFIG_X86_PTRACE_BTS */ return arch_ptrace(child, request, addr, data); default: diff --git a/include/linux/sched.h b/include/linux/sched.h index 4b81fc5f7731..dc5ea65dc716 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1176,6 +1176,7 @@ struct task_struct { * The buffer to hold the BTS data. */ void *bts_buffer; + size_t bts_size; #endif /* CONFIG_X86_PTRACE_BTS */ /* PID/PID hash table linkage. */ -- cgit v1.2.3 From ffc2238af8431d930d2c15f16feecf1fd6d75642 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 12 Dec 2008 08:21:19 +0100 Subject: x86, bts: fix build error Impact: build fix arch/x86/kernel/ds.c: In function 'ds_request': arch/x86/kernel/ds.c:236: sorry, unimplemented: inlining failed in call to 'ds_get_context': recursive inlining but the recursion here is scary ... Signed-off-by: Ingo Molnar --- arch/x86/kernel/ds.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/ds.c b/arch/x86/kernel/ds.c index f0583005b75e..dc1e7123ea4e 100644 --- a/arch/x86/kernel/ds.c +++ b/arch/x86/kernel/ds.c @@ -232,7 +232,7 @@ static DEFINE_PER_CPU(struct ds_context *, system_context_array); #define system_context per_cpu(system_context_array, smp_processor_id()) -static inline struct ds_context *ds_get_context(struct task_struct *task) +static struct ds_context *ds_get_context(struct task_struct *task) { struct ds_context **p_context = (task ? 
&task->thread.ds_ctx : &system_context); -- cgit v1.2.3 From 8808500f26a61757cb414da76b271bbd09d5958c Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 12 Dec 2008 09:20:12 +0100 Subject: x86: soften multi-BAR mapping sanity check warning message Impact: make debug warning less scary The ioremap() time multi-BAR map warning has been causing false positives: http://lkml.org/lkml/2008/12/10/432 http://lkml.org/lkml/2008/12/11/136 So make it less scary by making it once-per-boot, by making it KERN_INFO and by adding this text: "Info: mapping multiple BARs. Your kernel is fine." Signed-off-by: Ingo Molnar --- arch/x86/mm/ioremap.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index d4c4307ff3e0..bd85d42819e1 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -223,7 +223,8 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr, * Check if the request spans more than any BAR in the iomem resource * tree. */ - WARN_ON(iomem_map_sanity_check(phys_addr, size)); + WARN_ONCE(iomem_map_sanity_check(phys_addr, size), + KERN_INFO "Info: mapping multiple BARs. Your kernel is fine."); /* * Don't allow anybody to remap normal RAM that we're using.. -- cgit v1.2.3 From 85072bd55219231b8ca5d9d3fa3492eb4fa6635f Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 12 Dec 2008 11:08:42 +0100 Subject: x86, debug: remove EBDA debug printk Remove leftover EBDA debug message. 
Reported-by: Andrew Morton Signed-off-by: Ingo Molnar --- arch/x86/kernel/head.c | 1 - 1 file changed, 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/head.c b/arch/x86/kernel/head.c index 1dcb0f13897e..3e66bd364a9d 100644 --- a/arch/x86/kernel/head.c +++ b/arch/x86/kernel/head.c @@ -35,7 +35,6 @@ void __init reserve_ebda_region(void) /* start of EBDA area */ ebda_addr = get_bios_ebda(); - printk(KERN_INFO "BIOS EBDA/lowmem at: %08x/%08x\n", ebda_addr, lowmem); /* Fixup: bios puts an EBDA in the top 64K segment */ /* of conventional memory, but does not adjust lowmem. */ -- cgit v1.2.3 From a0343e823184070f55364d8359f832dcb33c57c7 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 9 Dec 2008 23:53:16 +0100 Subject: tracing/function-graph-tracer: add a new .irqentry.text section Impact: let the function-graph-tracer be aware of the irq entrypoints Add a new .irqentry.text section to store the irq entrypoint functions inside the same section. This way, the tracer will be able to signal an interrupt triggering on output by recognizing these entrypoints. Also, make this section recordable for dynamic tracing. 
Signed-off-by: Frederic Weisbecker Signed-off-by: Ingo Molnar --- arch/x86/kernel/vmlinux_64.lds.S | 1 + include/asm-generic/vmlinux.lds.h | 10 ++++++++++ scripts/recordmcount.pl | 1 + 3 files changed, 12 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/vmlinux_64.lds.S b/arch/x86/kernel/vmlinux_64.lds.S index 46e05447405b..1a614c0e6bef 100644 --- a/arch/x86/kernel/vmlinux_64.lds.S +++ b/arch/x86/kernel/vmlinux_64.lds.S @@ -35,6 +35,7 @@ SECTIONS SCHED_TEXT LOCK_TEXT KPROBES_TEXT + IRQENTRY_TEXT *(.fixup) *(.gnu.warning) _etext = .; /* End of text section */ diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index eba835a2c2cd..c61fab1dd2f8 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -288,6 +288,16 @@ *(.kprobes.text) \ VMLINUX_SYMBOL(__kprobes_text_end) = .; +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +#define IRQENTRY_TEXT \ + ALIGN_FUNCTION(); \ + VMLINUX_SYMBOL(__irqentry_text_start) = .; \ + *(.irqentry.text) \ + VMLINUX_SYMBOL(__irqentry_text_end) = .; +#else +#define IRQENTRY_TEXT +#endif + /* Section used for early init (in .S files) */ #define HEAD_TEXT *(.head.text) diff --git a/scripts/recordmcount.pl b/scripts/recordmcount.pl index 0b1dc9f9bb06..fe831412bea9 100755 --- a/scripts/recordmcount.pl +++ b/scripts/recordmcount.pl @@ -114,6 +114,7 @@ my %text_sections = ( ".text" => 1, ".sched.text" => 1, ".spinlock.text" => 1, + ".irqentry.text" => 1, ); $objdump = "objdump" if ((length $objdump) == 0); -- cgit v1.2.3 From bcbc4f20b52c2c40c43a4d2337707dcdfe81bc3a Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 9 Dec 2008 23:54:20 +0100 Subject: tracing/function-graph-tracer: annotate do_IRQ and smp_apic_timer_interrupt Impact: move most important x86 irq entry-points to a separate subsection Annotate do_IRQ and smp_apic_timer_interrupt to put them into the .irqentry.text subsection. 
These functions will thus be recognized as hardirq entrypoints for the function-graph-tracer. We could also annotate other irq entries; the others are far less important, but they can be added on request. Signed-off-by: Frederic Weisbecker Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic.c | 3 ++- arch/x86/kernel/irq_64.c | 3 ++- include/linux/ftrace.h | 11 +++++++++++ 3 files changed, 15 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic.c b/arch/x86/kernel/apic.c index 16f94879b525..b946ac19753b 100644 --- a/arch/x86/kernel/apic.c +++ b/arch/x86/kernel/apic.c @@ -30,6 +30,7 @@ #include #include #include +#include #include #include @@ -800,7 +801,7 @@ static void local_apic_timer_interrupt(void) * [ if a single-CPU system runs an SMP kernel then we call the local * interrupt as well. Thus we cannot inline the local irq ... ] */ -void smp_apic_timer_interrupt(struct pt_regs *regs) +void __irq_entry smp_apic_timer_interrupt(struct pt_regs *regs) { struct pt_regs *old_regs = set_irq_regs(regs); diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c index 60eb84eb77a0..11c65e811ffe 100644 --- a/arch/x86/kernel/irq_64.c +++ b/arch/x86/kernel/irq_64.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include @@ -47,7 +48,7 @@ static inline void stack_overflow_check(struct pt_regs *regs) * SMP cross-CPU interrupts have their own specific * handlers). */ -asmlinkage unsigned int do_IRQ(struct pt_regs *regs) +asmlinkage unsigned int __irq_entry do_IRQ(struct pt_regs *regs) { struct pt_regs *old_regs = set_irq_regs(regs); struct irq_desc *desc; diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 11cac81eed08..44020f31bd81 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -377,6 +377,16 @@ struct ftrace_graph_ret { */ #define __notrace_funcgraph notrace +/* + * We want to which function is an entrypoint of a hardirq. + * That will help us to put a signal on output.
+ */ +#define __irq_entry __attribute__((__section__(".irqentry.text"))) + +/* Limits of hardirq entrypoints */ +extern char __irqentry_text_start[]; +extern char __irqentry_text_end[]; + #define FTRACE_RETFUNC_DEPTH 50 #define FTRACE_RETSTACK_ALLOC_SIZE 32 /* Type of the callback handlers for tracing function graph*/ @@ -414,6 +424,7 @@ static inline void unpause_graph_tracing(void) #else #define __notrace_funcgraph +#define __irq_entry static inline void ftrace_graph_init_task(struct task_struct *t) { } static inline void ftrace_graph_exit_task(struct task_struct *t) { } -- cgit v1.2.3 From 16855f878d7127a8bb3925753463485f3071ad76 Mon Sep 17 00:00:00 2001 From: Hiroshi Shimamoto Date: Mon, 8 Dec 2008 19:18:38 -0800 Subject: x86: uaccess: return value of __{get|put}_user() can be int Impact: cleanup The type of return value of __{get|put}_user() can be int. There is no user to refer the return value of __{get|put}_user() as long. This reduces code size a bit on 64-bit. $ size vmlinux.* text data bss dec hex filename 4509265 479988 673588 5662841 566879 vmlinux.new 4511462 479988 673588 5665038 56710e vmlinux.old Signed-off-by: Hiroshi Shimamoto Signed-off-by: Ingo Molnar --- arch/x86/include/asm/uaccess.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h index 35c54921b2e4..580c3ee6c58c 100644 --- a/arch/x86/include/asm/uaccess.h +++ b/arch/x86/include/asm/uaccess.h @@ -350,14 +350,14 @@ do { \ #define __put_user_nocheck(x, ptr, size) \ ({ \ - long __pu_err; \ + int __pu_err; \ __put_user_size((x), (ptr), (size), __pu_err, -EFAULT); \ __pu_err; \ }) #define __get_user_nocheck(x, ptr, size) \ ({ \ - long __gu_err; \ + int __gu_err; \ unsigned long __gu_val; \ __get_user_size(__gu_val, (ptr), (size), __gu_err, -EFAULT); \ (x) = (__force __typeof__(*(ptr)))__gu_val; \ -- cgit v1.2.3 From 8f2466f45f75e3cbe3aa2b69d33fd9d6e343b9cc Mon Sep 17 00:00:00 2001 From: 
Hiroshi Shimamoto Date: Mon, 8 Dec 2008 19:19:07 -0800 Subject: x86: kill #ifdef for exit_idle() Impact: cleanup Introduce helper inline function in arch/x86/include/asm/idle.h to remove #ifdefs around exit_idle(). Signed-off-by: Hiroshi Shimamoto Signed-off-by: Ingo Molnar --- arch/x86/include/asm/idle.h | 5 +++++ arch/x86/kernel/apic.c | 6 ------ arch/x86/kernel/io_apic.c | 3 +-- 3 files changed, 6 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/idle.h b/arch/x86/include/asm/idle.h index 44c89c3a23e9..38d87379e270 100644 --- a/arch/x86/include/asm/idle.h +++ b/arch/x86/include/asm/idle.h @@ -8,8 +8,13 @@ struct notifier_block; void idle_notifier_register(struct notifier_block *n); void idle_notifier_unregister(struct notifier_block *n); +#ifdef CONFIG_X86_64 void enter_idle(void); void exit_idle(void); +#else /* !CONFIG_X86_64 */ +static inline void enter_idle(void) { } +static inline void exit_idle(void) { } +#endif /* CONFIG_X86_64 */ void c1e_remove_cpu(int cpu); diff --git a/arch/x86/kernel/apic.c b/arch/x86/kernel/apic.c index 16f94879b525..0fd083713f62 100644 --- a/arch/x86/kernel/apic.c +++ b/arch/x86/kernel/apic.c @@ -814,9 +814,7 @@ void smp_apic_timer_interrupt(struct pt_regs *regs) * Besides, if we don't timer interrupts ignore the global * interrupt lock, which is the WrongThing (tm) to do. */ -#ifdef CONFIG_X86_64 exit_idle(); -#endif irq_enter(); local_apic_timer_interrupt(); irq_exit(); @@ -1682,9 +1680,7 @@ void smp_spurious_interrupt(struct pt_regs *regs) { u32 v; -#ifdef CONFIG_X86_64 exit_idle(); -#endif irq_enter(); /* * Check if this really is a spurious interrupt and ACK it @@ -1713,9 +1709,7 @@ void smp_error_interrupt(struct pt_regs *regs) { u32 v, v1; -#ifdef CONFIG_X86_64 exit_idle(); -#endif irq_enter(); /* First tickle the hardware, only then report what went on. 
-- REW */ v = apic_read(APIC_ESR); diff --git a/arch/x86/kernel/io_apic.c b/arch/x86/kernel/io_apic.c index 9043251210fb..679e7bbbbcd6 100644 --- a/arch/x86/kernel/io_apic.c +++ b/arch/x86/kernel/io_apic.c @@ -2216,10 +2216,9 @@ static void set_ir_ioapic_affinity_irq(unsigned int irq, cpumask_t mask) asmlinkage void smp_irq_move_cleanup_interrupt(void) { unsigned vector, me; + ack_APIC_irq(); -#ifdef CONFIG_X86_64 exit_idle(); -#endif irq_enter(); me = smp_processor_id(); -- cgit v1.2.3 From 915b0d0104b72fd36af088ba4b11b5690bc96a6c Mon Sep 17 00:00:00 2001 From: Hiroshi Shimamoto Date: Mon, 8 Dec 2008 19:19:26 -0800 Subject: x86: hardirq: introduce inc_irq_stat() Impact: cleanup Introduce inc_irq_stat() macro and unify irq_stat accounting code. Signed-off-by: Hiroshi Shimamoto Signed-off-by: Ingo Molnar --- arch/x86/include/asm/hardirq_32.h | 2 ++ arch/x86/include/asm/hardirq_64.h | 2 ++ arch/x86/kernel/apic.c | 13 +++---------- arch/x86/kernel/smp.c | 18 +++--------------- arch/x86/kernel/traps.c | 6 +----- 5 files changed, 11 insertions(+), 30 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/hardirq_32.h b/arch/x86/include/asm/hardirq_32.h index 5ca135e72f2b..cf7954d1405f 100644 --- a/arch/x86/include/asm/hardirq_32.h +++ b/arch/x86/include/asm/hardirq_32.h @@ -22,6 +22,8 @@ DECLARE_PER_CPU(irq_cpustat_t, irq_stat); #define __ARCH_IRQ_STAT #define __IRQ_STAT(cpu, member) (per_cpu(irq_stat, cpu).member) +#define inc_irq_stat(member) (__get_cpu_var(irq_stat).member++) + void ack_bad_irq(unsigned int irq); #include diff --git a/arch/x86/include/asm/hardirq_64.h b/arch/x86/include/asm/hardirq_64.h index 1ba381fc51d3..b5a6b5d56704 100644 --- a/arch/x86/include/asm/hardirq_64.h +++ b/arch/x86/include/asm/hardirq_64.h @@ -11,6 +11,8 @@ #define __ARCH_IRQ_STAT 1 +#define inc_irq_stat(member) add_pda(member, 1) + #define local_softirq_pending() read_pda(__softirq_pending) #define __ARCH_SET_SOFTIRQ_PENDING 1 diff --git a/arch/x86/kernel/apic.c 
b/arch/x86/kernel/apic.c index 16f94879b525..1771dd746811 100644 --- a/arch/x86/kernel/apic.c +++ b/arch/x86/kernel/apic.c @@ -783,11 +783,7 @@ static void local_apic_timer_interrupt(void) /* * the NMI deadlock-detector uses this. */ -#ifdef CONFIG_X86_64 - add_pda(apic_timer_irqs, 1); -#else - per_cpu(irq_stat, cpu).apic_timer_irqs++; -#endif + inc_irq_stat(apic_timer_irqs); evt->event_handler(evt); } @@ -1695,14 +1691,11 @@ void smp_spurious_interrupt(struct pt_regs *regs) if (v & (1 << (SPURIOUS_APIC_VECTOR & 0x1f))) ack_APIC_irq(); -#ifdef CONFIG_X86_64 - add_pda(irq_spurious_count, 1); -#else + inc_irq_stat(irq_spurious_count); + /* see sw-dev-man vol 3, chapter 7.4.13.5 */ printk(KERN_INFO "spurious APIC interrupt on CPU#%d, " "should never happen.\n", smp_processor_id()); - __get_cpu_var(irq_stat).irq_spurious_count++; -#endif irq_exit(); } diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c index 18f9b19f5f8f..d18537ce2c79 100644 --- a/arch/x86/kernel/smp.c +++ b/arch/x86/kernel/smp.c @@ -178,11 +178,7 @@ static void native_smp_send_stop(void) void smp_reschedule_interrupt(struct pt_regs *regs) { ack_APIC_irq(); -#ifdef CONFIG_X86_32 - __get_cpu_var(irq_stat).irq_resched_count++; -#else - add_pda(irq_resched_count, 1); -#endif + inc_irq_stat(irq_resched_count); } void smp_call_function_interrupt(struct pt_regs *regs) @@ -190,11 +186,7 @@ void smp_call_function_interrupt(struct pt_regs *regs) ack_APIC_irq(); irq_enter(); generic_smp_call_function_interrupt(); -#ifdef CONFIG_X86_32 - __get_cpu_var(irq_stat).irq_call_count++; -#else - add_pda(irq_call_count, 1); -#endif + inc_irq_stat(irq_call_count); irq_exit(); } @@ -203,11 +195,7 @@ void smp_call_function_single_interrupt(struct pt_regs *regs) ack_APIC_irq(); irq_enter(); generic_smp_call_function_single_interrupt(); -#ifdef CONFIG_X86_32 - __get_cpu_var(irq_stat).irq_call_count++; -#else - add_pda(irq_call_count, 1); -#endif + inc_irq_stat(irq_call_count); irq_exit(); } diff --git 
a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 04d242ab0161..d815293e6d94 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -481,11 +481,7 @@ do_nmi(struct pt_regs *regs, long error_code) { nmi_enter(); -#ifdef CONFIG_X86_32 - { int cpu; cpu = smp_processor_id(); ++nmi_count(cpu); } -#else - add_pda(__nmi_count, 1); -#endif + inc_irq_stat(__nmi_count); if (!ignore_nmis) default_do_nmi(regs); -- cgit v1.2.3 From 2bed8446819a7c5033aa1da138d9f230ae212edc Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 12 Dec 2008 12:13:36 +0100 Subject: tracing/function-graph-tracer: add a new .irqentry.text section, fix Impact: build fix 32-bit x86 needs this section too. Signed-off-by: Ingo Molnar --- arch/x86/kernel/vmlinux_32.lds.S | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/vmlinux_32.lds.S b/arch/x86/kernel/vmlinux_32.lds.S index a9b8560adbc2..82c67559dde7 100644 --- a/arch/x86/kernel/vmlinux_32.lds.S +++ b/arch/x86/kernel/vmlinux_32.lds.S @@ -44,6 +44,7 @@ SECTIONS SCHED_TEXT LOCK_TEXT KPROBES_TEXT + IRQENTRY_TEXT *(.fixup) *(.gnu.warning) _etext = .; /* End of text section */ -- cgit v1.2.3 From 9470565579f29486f4ed0ffa50774268b64994b0 Mon Sep 17 00:00:00 2001 From: Dave Jones Date: Mon, 1 Dec 2008 14:13:50 -0800 Subject: x86: remove init_mm export as planned for 2.6.26 Impact: remove deprecated export Signed-off-by: Dave Jones Signed-off-by: Andrew Morton Signed-off-by: Thomas Gleixner --- Documentation/feature-removal-schedule.txt | 12 ------------ arch/x86/kernel/init_task.c | 1 - 2 files changed, 13 deletions(-) (limited to 'arch/x86') diff --git a/Documentation/feature-removal-schedule.txt b/Documentation/feature-removal-schedule.txt index c28a2ac88f9d..1a8af7354e79 100644 --- a/Documentation/feature-removal-schedule.txt +++ b/Documentation/feature-removal-schedule.txt @@ -244,18 +244,6 @@ Who: Michael Buesch --------------------------- -What: init_mm export -When: 2.6.26 -Why: Not used 
in-tree. The current out-of-tree users used it to - work around problems in the CPA code which should be resolved - by now. One usecase was described to provide verification code - of the CPA operation. That's a good idea in general, but such - code / infrastructure should be in the kernel and not in some - out-of-tree driver. -Who: Thomas Gleixner - ----------------------------- - What: usedac i386 kernel parameter When: 2.6.27 Why: replaced by allowdac and no dac combination diff --git a/arch/x86/kernel/init_task.c b/arch/x86/kernel/init_task.c index a4f93b4120c1..d39918076bb4 100644 --- a/arch/x86/kernel/init_task.c +++ b/arch/x86/kernel/init_task.c @@ -14,7 +14,6 @@ static struct fs_struct init_fs = INIT_FS; static struct signal_struct init_signals = INIT_SIGNALS(init_signals); static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); struct mm_struct init_mm = INIT_MM(init_mm); -EXPORT_UNUSED_SYMBOL(init_mm); /* will be removed in 2.6.26 */ /* * Initial thread structure. -- cgit v1.2.3 From fd28a5b58dddf5cb5df162ae5c8797a63171c31d Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 21 Oct 2008 14:05:00 +0200 Subject: x86: remove simnow earlyprintk support Impact: remove obsolete code The later versions of SimNow! actually all have serial console emulation, so the direct interface isn't needed anymore. So remove the undocumented simnow earlyprintk console. Signed-off-by: Andi Kleen Signed-off-by: Thomas Gleixner --- arch/x86/kernel/early_printk.c | 47 ------------------------------------------ 1 file changed, 47 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/early_printk.c b/arch/x86/kernel/early_printk.c index 34ad997d3834..23b138e31e9c 100644 --- a/arch/x86/kernel/early_printk.c +++ b/arch/x86/kernel/early_printk.c @@ -875,49 +875,6 @@ static struct console early_dbgp_console = { }; #endif -/* Console interface to a host file on AMD's SimNow! 
*/ - -static int simnow_fd; - -enum { - MAGIC1 = 0xBACCD00A, - MAGIC2 = 0xCA110000, - XOPEN = 5, - XWRITE = 4, -}; - -static noinline long simnow(long cmd, long a, long b, long c) -{ - long ret; - - asm volatile("cpuid" : - "=a" (ret) : - "b" (a), "c" (b), "d" (c), "0" (MAGIC1), "D" (cmd + MAGIC2)); - return ret; -} - -static void __init simnow_init(char *str) -{ - char *fn = "klog"; - - if (*str == '=') - fn = ++str; - /* error ignored */ - simnow_fd = simnow(XOPEN, (unsigned long)fn, O_WRONLY|O_APPEND|O_CREAT, 0644); -} - -static void simnow_write(struct console *con, const char *s, unsigned n) -{ - simnow(XWRITE, simnow_fd, (unsigned long)s, n); -} - -static struct console simnow_console = { - .name = "simnow", - .write = simnow_write, - .flags = CON_PRINTBUFFER, - .index = -1, -}; - /* Direct interface for emergencies */ static struct console *early_console = &early_vga_console; static int __initdata early_console_initialized; @@ -960,10 +917,6 @@ static int __init setup_early_printk(char *buf) max_ypos = boot_params.screen_info.orig_video_lines; current_ypos = boot_params.screen_info.orig_y; early_console = &early_vga_console; - } else if (!strncmp(buf, "simnow", 6)) { - simnow_init(buf + 6); - early_console = &simnow_console; - keep_early = 1; #ifdef CONFIG_EARLY_PRINTK_DBGP } else if (!strncmp(buf, "dbgp", 4)) { if (early_dbgp_init(buf+4) < 0) -- cgit v1.2.3 From 205516c12dbba003c26b42cfb41e598631300106 Mon Sep 17 00:00:00 2001 From: Ken Chen Date: Tue, 16 Dec 2008 00:32:21 -0800 Subject: x86: convert rdtscll() to use __native_read_tsc Impact: micro-optimization Is there any reason why x86 rdtscll have to use the out of line function instead of inline __native_read_tsc()? native_read_tsc and __native_read_tsc is essentially the same functions. Patch to let x86 rdtscll() to use the inline version of read_tsc. 
Signed-off-by: Ken Chen Signed-off-by: Ingo Molnar --- arch/x86/include/asm/msr.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/msr.h b/arch/x86/include/asm/msr.h index c2a812ebde89..42f639b991b4 100644 --- a/arch/x86/include/asm/msr.h +++ b/arch/x86/include/asm/msr.h @@ -181,10 +181,10 @@ static inline int rdmsrl_amd_safe(unsigned msr, unsigned long long *p) } #define rdtscl(low) \ - ((low) = (u32)native_read_tsc()) + ((low) = (u32)__native_read_tsc()) #define rdtscll(val) \ - ((val) = native_read_tsc()) + ((val) = __native_read_tsc()) #define rdpmc(counter, low, high) \ do { \ -- cgit v1.2.3 From cc1dc6d039ced64c2f8b8457bf1cccf4ecfc5942 Mon Sep 17 00:00:00 2001 From: Markus Metzger Date: Tue, 16 Dec 2008 15:51:03 +0100 Subject: x86, bts: remove recursion from get_context Impact: cleanup Optimistically allocate a DS context. It is extremely unlikely that one already existed. This simplifies the code a lot. Signed-off-by: Markus Metzger Signed-off-by: Ingo Molnar --- arch/x86/kernel/ds.c | 58 ++++++++++++++++++++++------------------------------ 1 file changed, 25 insertions(+), 33 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/ds.c b/arch/x86/kernel/ds.c index dc1e7123ea4e..0dc795951d73 100644 --- a/arch/x86/kernel/ds.c +++ b/arch/x86/kernel/ds.c @@ -232,53 +232,45 @@ static DEFINE_PER_CPU(struct ds_context *, system_context_array); #define system_context per_cpu(system_context_array, smp_processor_id()) -static struct ds_context *ds_get_context(struct task_struct *task) + +static inline struct ds_context *ds_get_context(struct task_struct *task) { struct ds_context **p_context = (task ? 
&task->thread.ds_ctx : &system_context); - struct ds_context *context = *p_context; + struct ds_context *context = NULL; + struct ds_context *new_context = NULL; unsigned long irq; - if (!context) { - context = kzalloc(sizeof(*context), GFP_KERNEL); - if (!context) - return NULL; - - spin_lock_irqsave(&ds_lock, irq); - - if (*p_context) { - kfree(context); + /* Chances are small that we already have a context. */ + new_context = kzalloc(sizeof(*new_context), GFP_KERNEL); + if (!new_context) + return NULL; - context = *p_context; - } else { - *p_context = context; + spin_lock_irqsave(&ds_lock, irq); - context->this = p_context; - context->task = task; + context = *p_context; + if (!context) { + context = new_context; - if (task) - set_tsk_thread_flag(task, TIF_DS_AREA_MSR); + context->this = p_context; + context->task = task; + context->count = 0; - if (!task || (task == current)) - wrmsrl(MSR_IA32_DS_AREA, - (unsigned long)context->ds); - } + if (task) + set_tsk_thread_flag(task, TIF_DS_AREA_MSR); - context->count++; + if (!task || (task == current)) + wrmsrl(MSR_IA32_DS_AREA, (unsigned long)context->ds); - spin_unlock_irqrestore(&ds_lock, irq); - } else { - spin_lock_irqsave(&ds_lock, irq); + *p_context = context; + } - context = *p_context; - if (context) - context->count++; + context->count++; - spin_unlock_irqrestore(&ds_lock, irq); + spin_unlock_irqrestore(&ds_lock, irq); - if (!context) - context = ds_get_context(task); - } + if (context != new_context) + kfree(new_context); return context; } -- cgit v1.2.3 From d072c25f531c6513994960401d2c7f059434c0d2 Mon Sep 17 00:00:00 2001 From: Markus Metzger Date: Tue, 16 Dec 2008 15:53:11 +0100 Subject: x86, bts: correctly report invalid bts records Impact: change the reporting of empty BTS records Correctly report a cleared BTS record as invalid. Used to be reported as branch from 0 to 0. 
Signed-off-by: Markus Metzger Signed-off-by: Ingo Molnar --- arch/x86/kernel/ds.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/ds.c b/arch/x86/kernel/ds.c index 0dc795951d73..98d271e60e08 100644 --- a/arch/x86/kernel/ds.c +++ b/arch/x86/kernel/ds.c @@ -484,6 +484,9 @@ static int bts_read(struct bts_tracer *tracer, const void *at, out->qualifier = bts_branch; out->variant.lbr.from = bts_get(at, bts_from); out->variant.lbr.to = bts_get(at, bts_to); + + if (!out->variant.lbr.from && !out->variant.lbr.to) + out->qualifier = bts_invalid; } return ds_cfg.sizeof_rec[ds_bts]; -- cgit v1.2.3 From 1796316a8b028a148be48ba5d4e7be493a39d173 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Tue, 16 Dec 2008 11:35:24 +0000 Subject: x86: consolidate __swp_XXX() macros Impact: cleanup, code robustization The __swp_...() macros silently relied upon which bits are used for _PAGE_FILE and _PAGE_PROTNONE. After having changed _PAGE_PROTNONE in our Xen kernel to no longer overlap _PAGE_PAT, live locks and crashes were reported that could have been avoided if these macros properly used the symbolic constants. Since, as pointed out earlier, for Xen Dom0 support mainline likewise will need to eliminate the conflict between _PAGE_PAT and _PAGE_PROTNONE, this patch does all the necessary adjustments, plus it introduces a mechanism to check consistency between MAX_SWAPFILES_SHIFT and the actual encoding macros. This also fixes a latent bug in that x86-64 used a 6-bit mask in __swp_type(), and if MAX_SWAPFILES_SHIFT was increased beyond 5 in (the seemingly unrelated) linux/swap.h, this would have resulted in a collision with _PAGE_FILE. Non-PAE 32-bit code gets similarly adjusted for its pte_to_pgoff() and pgoff_to_pte() calculations. 
Signed-off-by: Jan Beulich Signed-off-by: Ingo Molnar --- arch/x86/include/asm/pgtable-2level.h | 50 ++++++++++++++++++++++++++++------- arch/x86/include/asm/pgtable-3level.h | 1 + arch/x86/include/asm/pgtable.h | 14 +++++----- arch/x86/include/asm/pgtable_64.h | 20 +++++++++++--- mm/swapfile.c | 9 +++++++ 5 files changed, 75 insertions(+), 19 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/pgtable-2level.h b/arch/x86/include/asm/pgtable-2level.h index b17edfd23628..e0d199fe1d83 100644 --- a/arch/x86/include/asm/pgtable-2level.h +++ b/arch/x86/include/asm/pgtable-2level.h @@ -56,23 +56,55 @@ static inline pte_t native_ptep_get_and_clear(pte_t *xp) #define pte_none(x) (!(x).pte_low) /* - * Bits 0, 6 and 7 are taken, split up the 29 bits of offset - * into this range: + * Bits _PAGE_BIT_PRESENT, _PAGE_BIT_FILE and _PAGE_BIT_PROTNONE are taken, + * split up the 29 bits of offset into this range: */ #define PTE_FILE_MAX_BITS 29 +#define PTE_FILE_SHIFT1 (_PAGE_BIT_PRESENT + 1) +#if _PAGE_BIT_FILE < _PAGE_BIT_PROTNONE +#define PTE_FILE_SHIFT2 (_PAGE_BIT_FILE + 1) +#define PTE_FILE_SHIFT3 (_PAGE_BIT_PROTNONE + 1) +#else +#define PTE_FILE_SHIFT2 (_PAGE_BIT_PROTNONE + 1) +#define PTE_FILE_SHIFT3 (_PAGE_BIT_FILE + 1) +#endif +#define PTE_FILE_BITS1 (PTE_FILE_SHIFT2 - PTE_FILE_SHIFT1 - 1) +#define PTE_FILE_BITS2 (PTE_FILE_SHIFT3 - PTE_FILE_SHIFT2 - 1) #define pte_to_pgoff(pte) \ - ((((pte).pte_low >> 1) & 0x1f) + (((pte).pte_low >> 8) << 5)) + ((((pte).pte_low >> PTE_FILE_SHIFT1) \ + & ((1U << PTE_FILE_BITS1) - 1)) \ + + ((((pte).pte_low >> PTE_FILE_SHIFT2) \ + & ((1U << PTE_FILE_BITS2) - 1)) << PTE_FILE_BITS1) \ + + (((pte).pte_low >> PTE_FILE_SHIFT3) \ + << (PTE_FILE_BITS1 + PTE_FILE_BITS2))) #define pgoff_to_pte(off) \ - ((pte_t) { .pte_low = (((off) & 0x1f) << 1) + \ - (((off) >> 5) << 8) + _PAGE_FILE }) + ((pte_t) { .pte_low = \ + (((off) & ((1U << PTE_FILE_BITS1) - 1)) << PTE_FILE_SHIFT1) \ + + ((((off) >> PTE_FILE_BITS1) & ((1U << PTE_FILE_BITS2) 
- 1)) \ + << PTE_FILE_SHIFT2) \ + + (((off) >> (PTE_FILE_BITS1 + PTE_FILE_BITS2)) \ + << PTE_FILE_SHIFT3) \ + + _PAGE_FILE }) /* Encode and de-code a swap entry */ -#define __swp_type(x) (((x).val >> 1) & 0x1f) -#define __swp_offset(x) ((x).val >> 8) -#define __swp_entry(type, offset) \ - ((swp_entry_t) { ((type) << 1) | ((offset) << 8) }) +#if _PAGE_BIT_FILE < _PAGE_BIT_PROTNONE +#define SWP_TYPE_BITS (_PAGE_BIT_FILE - _PAGE_BIT_PRESENT - 1) +#define SWP_OFFSET_SHIFT (_PAGE_BIT_PROTNONE + 1) +#else +#define SWP_TYPE_BITS (_PAGE_BIT_PROTNONE - _PAGE_BIT_PRESENT - 1) +#define SWP_OFFSET_SHIFT (_PAGE_BIT_FILE + 1) +#endif + +#define MAX_SWAPFILES_CHECK() BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > SWP_TYPE_BITS) + +#define __swp_type(x) (((x).val >> (_PAGE_BIT_PRESENT + 1)) \ + & ((1U << SWP_TYPE_BITS) - 1)) +#define __swp_offset(x) ((x).val >> SWP_OFFSET_SHIFT) +#define __swp_entry(type, offset) ((swp_entry_t) { \ + ((type) << (_PAGE_BIT_PRESENT + 1)) \ + | ((offset) << SWP_OFFSET_SHIFT) }) #define __pte_to_swp_entry(pte) ((swp_entry_t) { (pte).pte_low }) #define __swp_entry_to_pte(x) ((pte_t) { .pte = (x).val }) diff --git a/arch/x86/include/asm/pgtable-3level.h b/arch/x86/include/asm/pgtable-3level.h index 52597aeadfff..447da43cddb3 100644 --- a/arch/x86/include/asm/pgtable-3level.h +++ b/arch/x86/include/asm/pgtable-3level.h @@ -166,6 +166,7 @@ static inline int pte_none(pte_t pte) #define PTE_FILE_MAX_BITS 32 /* Encode and de-code a swap entry */ +#define MAX_SWAPFILES_CHECK() BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > 5) #define __swp_type(x) (((x).val) & 0x1f) #define __swp_offset(x) ((x).val >> 5) #define __swp_entry(type, offset) ((swp_entry_t){(type) | (offset) << 5}) diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index c012f3b11671..b7c2ecdb7658 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -10,7 +10,6 @@ #define _PAGE_BIT_PCD 4 /* page cache disabled */ #define _PAGE_BIT_ACCESSED 5 /* was accessed (raised 
by CPU) */ #define _PAGE_BIT_DIRTY 6 /* was written to (raised by CPU) */ -#define _PAGE_BIT_FILE 6 #define _PAGE_BIT_PSE 7 /* 4 MB (or 2MB) page */ #define _PAGE_BIT_PAT 7 /* on 4KB pages */ #define _PAGE_BIT_GLOBAL 8 /* Global TLB entry PPro+ */ @@ -22,6 +21,12 @@ #define _PAGE_BIT_CPA_TEST _PAGE_BIT_UNUSED1 #define _PAGE_BIT_NX 63 /* No execute: only valid after cpuid check */ +/* If _PAGE_BIT_PRESENT is clear, we use these: */ +/* - if the user mapped it with PROT_NONE; pte_present gives true */ +#define _PAGE_BIT_PROTNONE _PAGE_BIT_GLOBAL +/* - set: nonlinear file mapping, saved PTE; unset:swap */ +#define _PAGE_BIT_FILE _PAGE_BIT_DIRTY + #define _PAGE_PRESENT (_AT(pteval_t, 1) << _PAGE_BIT_PRESENT) #define _PAGE_RW (_AT(pteval_t, 1) << _PAGE_BIT_RW) #define _PAGE_USER (_AT(pteval_t, 1) << _PAGE_BIT_USER) @@ -46,11 +51,8 @@ #define _PAGE_NX (_AT(pteval_t, 0)) #endif -/* If _PAGE_PRESENT is clear, we use these: */ -#define _PAGE_FILE _PAGE_DIRTY /* nonlinear file mapping, - * saved PTE; unset:swap */ -#define _PAGE_PROTNONE _PAGE_PSE /* if the user mapped it with PROT_NONE; - pte_present gives true */ +#define _PAGE_FILE (_AT(pteval_t, 1) << _PAGE_BIT_FILE) +#define _PAGE_PROTNONE (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE) #define _PAGE_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | \ _PAGE_ACCESSED | _PAGE_DIRTY) diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h index 545a0e042bb2..65b6be6677c7 100644 --- a/arch/x86/include/asm/pgtable_64.h +++ b/arch/x86/include/asm/pgtable_64.h @@ -250,10 +250,22 @@ static inline int pud_large(pud_t pte) extern int direct_gbpages; /* Encode and de-code a swap entry */ -#define __swp_type(x) (((x).val >> 1) & 0x3f) -#define __swp_offset(x) ((x).val >> 8) -#define __swp_entry(type, offset) ((swp_entry_t) { ((type) << 1) | \ - ((offset) << 8) }) +#if _PAGE_BIT_FILE < _PAGE_BIT_PROTNONE +#define SWP_TYPE_BITS (_PAGE_BIT_FILE - _PAGE_BIT_PRESENT - 1) +#define SWP_OFFSET_SHIFT (_PAGE_BIT_PROTNONE + 
1) +#else +#define SWP_TYPE_BITS (_PAGE_BIT_PROTNONE - _PAGE_BIT_PRESENT - 1) +#define SWP_OFFSET_SHIFT (_PAGE_BIT_FILE + 1) +#endif + +#define MAX_SWAPFILES_CHECK() BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > SWP_TYPE_BITS) + +#define __swp_type(x) (((x).val >> (_PAGE_BIT_PRESENT + 1)) \ + & ((1U << SWP_TYPE_BITS) - 1)) +#define __swp_offset(x) ((x).val >> SWP_OFFSET_SHIFT) +#define __swp_entry(type, offset) ((swp_entry_t) { \ + ((type) << (_PAGE_BIT_PRESENT + 1)) \ + | ((offset) << SWP_OFFSET_SHIFT) }) #define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val((pte)) }) #define __swp_entry_to_pte(x) ((pte_t) { .pte = (x).val }) diff --git a/mm/swapfile.c b/mm/swapfile.c index 90cb67a5417c..54a9f87e5162 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1462,6 +1462,15 @@ static int __init procswaps_init(void) __initcall(procswaps_init); #endif /* CONFIG_PROC_FS */ +#ifdef MAX_SWAPFILES_CHECK +static int __init max_swapfiles_check(void) +{ + MAX_SWAPFILES_CHECK(); + return 0; +} +late_initcall(max_swapfiles_check); +#endif + /* * Written 01/25/92 by Simmule Turner, heavily changed by Linus. * -- cgit v1.2.3 From b93a531e315e97ef00367099e6b5f19651936e20 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Tue, 16 Dec 2008 11:40:27 +0000 Subject: allow bug table entries to use relative pointers (and use it on x86-64) Impact: reduce bug table size This allows reducing the bug table size by half. Perhaps there are other 64-bit architectures that could also make use of this. 
Signed-off-by: Jan Beulich Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 4 ++++ arch/x86/include/asm/bug.h | 2 +- include/asm-generic/bug.h | 8 ++++++++ lib/bug.c | 19 +++++++++++++++++-- 4 files changed, 30 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index ac22bb7719f7..ab98cca84e1b 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -87,6 +87,10 @@ config GENERIC_IOMAP config GENERIC_BUG def_bool y depends on BUG + select GENERIC_BUG_RELATIVE_POINTERS if X86_64 + +config GENERIC_BUG_RELATIVE_POINTERS + bool config GENERIC_HWEIGHT def_bool y diff --git a/arch/x86/include/asm/bug.h b/arch/x86/include/asm/bug.h index 3def2065fcea..d9cf1cd156d2 100644 --- a/arch/x86/include/asm/bug.h +++ b/arch/x86/include/asm/bug.h @@ -9,7 +9,7 @@ #ifdef CONFIG_X86_32 # define __BUG_C0 "2:\t.long 1b, %c0\n" #else -# define __BUG_C0 "2:\t.quad 1b, %c0\n" +# define __BUG_C0 "2:\t.long 1b - 2b, %c0 - 2b\n" #endif #define BUG() \ diff --git a/include/asm-generic/bug.h b/include/asm-generic/bug.h index 12c07c1866b2..4c794d73fb84 100644 --- a/include/asm-generic/bug.h +++ b/include/asm-generic/bug.h @@ -8,9 +8,17 @@ #ifdef CONFIG_GENERIC_BUG #ifndef __ASSEMBLY__ struct bug_entry { +#ifndef CONFIG_GENERIC_BUG_RELATIVE_POINTERS unsigned long bug_addr; +#else + signed int bug_addr_disp; +#endif #ifdef CONFIG_DEBUG_BUGVERBOSE +#ifndef CONFIG_GENERIC_BUG_RELATIVE_POINTERS const char *file; +#else + signed int file_disp; +#endif unsigned short line; #endif unsigned short flags; diff --git a/lib/bug.c b/lib/bug.c index bfeafd60ee9f..300e41afbf97 100644 --- a/lib/bug.c +++ b/lib/bug.c @@ -5,6 +5,8 @@ CONFIG_BUG - emit BUG traps. Nothing happens without this. CONFIG_GENERIC_BUG - enable this code. + CONFIG_GENERIC_BUG_RELATIVE_POINTERS - use 32-bit pointers relative to + the containing struct bug_entry for bug_addr and file. 
CONFIG_DEBUG_BUGVERBOSE - emit full file+line information for each BUG CONFIG_BUG and CONFIG_DEBUG_BUGVERBOSE are potentially user-settable @@ -43,6 +45,15 @@ extern const struct bug_entry __start___bug_table[], __stop___bug_table[]; +static inline unsigned long bug_addr(const struct bug_entry *bug) +{ +#ifndef CONFIG_GENERIC_BUG_RELATIVE_POINTERS + return bug->bug_addr; +#else + return (unsigned long)bug + bug->bug_addr_disp; +#endif +} + #ifdef CONFIG_MODULES static LIST_HEAD(module_bug_list); @@ -55,7 +66,7 @@ static const struct bug_entry *module_find_bug(unsigned long bugaddr) unsigned i; for (i = 0; i < mod->num_bugs; ++i, ++bug) - if (bugaddr == bug->bug_addr) + if (bugaddr == bug_addr(bug)) return bug; } return NULL; @@ -108,7 +119,7 @@ const struct bug_entry *find_bug(unsigned long bugaddr) const struct bug_entry *bug; for (bug = __start___bug_table; bug < __stop___bug_table; ++bug) - if (bugaddr == bug->bug_addr) + if (bugaddr == bug_addr(bug)) return bug; return module_find_bug(bugaddr); @@ -133,7 +144,11 @@ enum bug_trap_type report_bug(unsigned long bugaddr, struct pt_regs *regs) if (bug) { #ifdef CONFIG_DEBUG_BUGVERBOSE +#ifndef CONFIG_GENERIC_BUG_RELATIVE_POINTERS file = bug->file; +#else + file = (const char *)bug + bug->file_disp; +#endif line = bug->line; #endif warning = (bug->flags & BUGFLAG_WARNING) != 0; -- cgit v1.2.3 From d6be89ad660c5d03edef91715093d447025df59b Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Tue, 16 Dec 2008 11:42:45 +0000 Subject: x86, 32-bit: simplify alloc_low_page() Impact: cleanup Neither of the callers really needs the physical address this function returns, so eliminate the pointless argument. 
Signed-off-by: Jan Beulich Signed-off-by: Ingo Molnar --- arch/x86/mm/init_32.c | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 3ffed259883e..333c9e79d46f 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -67,7 +67,7 @@ static unsigned long __meminitdata table_top; static int __initdata after_init_bootmem; -static __init void *alloc_low_page(unsigned long *phys) +static __init void *alloc_low_page(void) { unsigned long pfn = table_end++; void *adr; @@ -77,7 +77,6 @@ static __init void *alloc_low_page(unsigned long *phys) adr = __va(pfn * PAGE_SIZE); memset(adr, 0, PAGE_SIZE); - *phys = pfn * PAGE_SIZE; return adr; } @@ -92,12 +91,11 @@ static pmd_t * __init one_md_table_init(pgd_t *pgd) pmd_t *pmd_table; #ifdef CONFIG_X86_PAE - unsigned long phys; if (!(pgd_val(*pgd) & _PAGE_PRESENT)) { if (after_init_bootmem) pmd_table = (pmd_t *)alloc_bootmem_low_pages(PAGE_SIZE); else - pmd_table = (pmd_t *)alloc_low_page(&phys); + pmd_table = (pmd_t *)alloc_low_page(); paravirt_alloc_pmd(&init_mm, __pa(pmd_table) >> PAGE_SHIFT); set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT)); pud = pud_offset(pgd, 0); @@ -128,10 +126,8 @@ static pte_t * __init one_page_table_init(pmd_t *pmd) if (!page_table) page_table = (pte_t *)alloc_bootmem_low_pages(PAGE_SIZE); - } else { - unsigned long phys; - page_table = (pte_t *)alloc_low_page(&phys); - } + } else + page_table = (pte_t *)alloc_low_page(); paravirt_alloc_pte(&init_mm, __pa(page_table) >> PAGE_SHIFT); set_pmd(pmd, __pmd(__pa(page_table) | _PAGE_TABLE)); -- cgit v1.2.3 From beeb4195cbc80b7489631361b7ed38b7518af433 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Tue, 16 Dec 2008 11:45:56 +0000 Subject: x86, 32-bit: add some compile time checks to mem_init() Some of the inconsistencies checked for at run time can be detected at build time already, so duplicate the checks done at run time to also be done at build 
time. Signed-off-by: Jan Beulich Signed-off-by: Ingo Molnar --- arch/x86/mm/init_32.c | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index c483f4242079..d3a45d54547a 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -1040,11 +1040,25 @@ void __init mem_init(void) (unsigned long)&_text, (unsigned long)&_etext, ((unsigned long)&_etext - (unsigned long)&_text) >> 10); + /* + * Check boundaries twice: Some fundamental inconsistencies can + * be detected at build time already. + */ +#define __FIXADDR_TOP (-PAGE_SIZE) +#ifdef CONFIG_HIGHMEM + BUILD_BUG_ON(PKMAP_BASE + LAST_PKMAP*PAGE_SIZE > FIXADDR_START); + BUILD_BUG_ON(VMALLOC_END > PKMAP_BASE); +#endif +#define high_memory (-128UL << 20) + BUILD_BUG_ON(VMALLOC_START >= VMALLOC_END); +#undef high_memory +#undef __FIXADDR_TOP + #ifdef CONFIG_HIGHMEM BUG_ON(PKMAP_BASE + LAST_PKMAP*PAGE_SIZE > FIXADDR_START); BUG_ON(VMALLOC_END > PKMAP_BASE); #endif - BUG_ON(VMALLOC_START > VMALLOC_END); + BUG_ON(VMALLOC_START >= VMALLOC_END); BUG_ON((unsigned long)high_memory > VMALLOC_START); if (boot_cpu_data.wp_works_ok < 0) -- cgit v1.2.3 From cfc319833b5b359bf3bce99564dbac00af7925ac Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Tue, 16 Dec 2008 11:46:58 +0000 Subject: x86, 32-bit: improve lazy TLB handling code Impact: micro-optimize the 32-bit TLB flush code Use the faster x86_{read,write}_percpu() accessors here. 
Signed-off-by: Jan Beulich Signed-off-by: Ingo Molnar --- arch/x86/include/asm/mmu_context_32.h | 13 ++++++------- arch/x86/kernel/tlb_32.c | 11 +++++------ 2 files changed, 11 insertions(+), 13 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/mmu_context_32.h b/arch/x86/include/asm/mmu_context_32.h index 8e10015781fb..7e98ce1d2c0e 100644 --- a/arch/x86/include/asm/mmu_context_32.h +++ b/arch/x86/include/asm/mmu_context_32.h @@ -4,9 +4,8 @@ static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk) { #ifdef CONFIG_SMP - unsigned cpu = smp_processor_id(); - if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_OK) - per_cpu(cpu_tlbstate, cpu).state = TLBSTATE_LAZY; + if (x86_read_percpu(cpu_tlbstate.state) == TLBSTATE_OK) + x86_write_percpu(cpu_tlbstate.state, TLBSTATE_LAZY); #endif } @@ -20,8 +19,8 @@ static inline void switch_mm(struct mm_struct *prev, /* stop flush ipis for the previous mm */ cpu_clear(cpu, prev->cpu_vm_mask); #ifdef CONFIG_SMP - per_cpu(cpu_tlbstate, cpu).state = TLBSTATE_OK; - per_cpu(cpu_tlbstate, cpu).active_mm = next; + x86_write_percpu(cpu_tlbstate.state, TLBSTATE_OK); + x86_write_percpu(cpu_tlbstate.active_mm, next); #endif cpu_set(cpu, next->cpu_vm_mask); @@ -36,8 +35,8 @@ static inline void switch_mm(struct mm_struct *prev, } #ifdef CONFIG_SMP else { - per_cpu(cpu_tlbstate, cpu).state = TLBSTATE_OK; - BUG_ON(per_cpu(cpu_tlbstate, cpu).active_mm != next); + x86_write_percpu(cpu_tlbstate.state, TLBSTATE_OK); + BUG_ON(x86_read_percpu(cpu_tlbstate.active_mm) != next); if (!cpu_test_and_set(cpu, next->cpu_vm_mask)) { /* We were in lazy tlb mode and leave_mm disabled diff --git a/arch/x86/kernel/tlb_32.c b/arch/x86/kernel/tlb_32.c index f4049f3513b6..4290d918b58a 100644 --- a/arch/x86/kernel/tlb_32.c +++ b/arch/x86/kernel/tlb_32.c @@ -34,9 +34,8 @@ static DEFINE_SPINLOCK(tlbstate_lock); */ void leave_mm(int cpu) { - if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_OK) - BUG(); - cpu_clear(cpu, 
per_cpu(cpu_tlbstate, cpu).active_mm->cpu_vm_mask); + BUG_ON(x86_read_percpu(cpu_tlbstate.state) == TLBSTATE_OK); + cpu_clear(cpu, x86_read_percpu(cpu_tlbstate.active_mm)->cpu_vm_mask); load_cr3(swapper_pg_dir); } EXPORT_SYMBOL_GPL(leave_mm); @@ -104,8 +103,8 @@ void smp_invalidate_interrupt(struct pt_regs *regs) * BUG(); */ - if (flush_mm == per_cpu(cpu_tlbstate, cpu).active_mm) { - if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_OK) { + if (flush_mm == x86_read_percpu(cpu_tlbstate.active_mm)) { + if (x86_read_percpu(cpu_tlbstate.state) == TLBSTATE_OK) { if (flush_va == TLB_FLUSH_ALL) local_flush_tlb(); else @@ -238,7 +237,7 @@ static void do_flush_tlb_all(void *info) unsigned long cpu = smp_processor_id(); __flush_tlb_all(); - if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_LAZY) + if (x86_read_percpu(cpu_tlbstate.state) == TLBSTATE_LAZY) leave_mm(cpu); } -- cgit v1.2.3 From 83fd5cc6481c6b7fa8b45f8a7e0aa7120213430b Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Tue, 16 Dec 2008 19:17:11 +0100 Subject: AMD IOMMU: allocate rlookup_table with __GFP_ZERO Impact: fix bug which can lead to panic in prealloc_protection_domains() Signed-off-by: Joerg Roedel --- arch/x86/kernel/amd_iommu_init.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c index 30ae2701b3df..c90a15eba5c5 100644 --- a/arch/x86/kernel/amd_iommu_init.c +++ b/arch/x86/kernel/amd_iommu_init.c @@ -1074,7 +1074,8 @@ int __init amd_iommu_init(void) goto free; /* IOMMU rlookup table - find the IOMMU for a specific device */ - amd_iommu_rlookup_table = (void *)__get_free_pages(GFP_KERNEL, + amd_iommu_rlookup_table = (void *)__get_free_pages( + GFP_KERNEL | __GFP_ZERO, get_order(rlookup_table_size)); if (amd_iommu_rlookup_table == NULL) goto free; -- cgit v1.2.3 From b6fd6f26733e864fba2ea3eb1d716e23d2e66f3a Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 16 Dec 2008 19:23:36 +0100 Subject: 
x86, mm: limit MAXMEM on 64-bit on 64-bit x86 the physical memory limit is controlled by the sparsemem bits - which are 44 bits right now. But MAXMEM (the max pfn number e820 parsing will allow to enter our sizing routines) is set to 0x00003fffffffffff, i.e. 46 bits - that's too large because it overlaps into the vmalloc range. So couple MAXMEM to MAX_PHYSMEM_BITS, and add a comment that the maximum of MAX_PHYSMEM_BITS is 45 bits. Signed-off-by: Ingo Molnar --- arch/x86/include/asm/pgtable_64.h | 2 +- arch/x86/include/asm/sparsemem.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h index 65b6be6677c7..c54ba69608bd 100644 --- a/arch/x86/include/asm/pgtable_64.h +++ b/arch/x86/include/asm/pgtable_64.h @@ -146,7 +146,7 @@ static inline void native_pgd_clear(pgd_t *pgd) #define PGDIR_MASK (~(PGDIR_SIZE - 1)) -#define MAXMEM _AC(0x00003fffffffffff, UL) +#define MAXMEM _AC(__AC(1, UL) << MAX_PHYSMEM_BITS, UL) #define VMALLOC_START _AC(0xffffc20000000000, UL) #define VMALLOC_END _AC(0xffffe1ffffffffff, UL) #define VMEMMAP_START _AC(0xffffe20000000000, UL) diff --git a/arch/x86/include/asm/sparsemem.h b/arch/x86/include/asm/sparsemem.h index be44f7dab395..e3cc3c063ec5 100644 --- a/arch/x86/include/asm/sparsemem.h +++ b/arch/x86/include/asm/sparsemem.h @@ -27,7 +27,7 @@ #else /* CONFIG_X86_32 */ # define SECTION_SIZE_BITS 27 /* matt - 128 is convenient right now */ # define MAX_PHYSADDR_BITS 44 -# define MAX_PHYSMEM_BITS 44 +# define MAX_PHYSMEM_BITS 44 /* Can be max 45 bits */ #endif #endif /* CONFIG_SPARSEMEM */ -- cgit v1.2.3 From 3c763fd77e66e55d029052da31df0abd9920cb1e Mon Sep 17 00:00:00 2001 From: Andreas Herrmann Date: Tue, 16 Dec 2008 19:07:47 +0100 Subject: x86: microcode_amd: fix wrong handling of equivalent CPU id Impact: fix bug resulting in non-loaded AMD microcode mc_header->processor_rev_id is a 2 byte value. 
Similar is true for equiv_cpu in an equiv_cpu_entry -- only 2 bytes are of interest. Signed-off-by: Andreas Herrmann Signed-off-by: Ingo Molnar --- arch/x86/kernel/microcode_amd.c | 23 ++++++----------------- 1 file changed, 6 insertions(+), 17 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c index 5f8e5d75a254..b5bc81470bcf 100644 --- a/arch/x86/kernel/microcode_amd.c +++ b/arch/x86/kernel/microcode_amd.c @@ -62,7 +62,7 @@ struct microcode_header_amd { unsigned int mc_patch_data_checksum; unsigned int nb_dev_id; unsigned int sb_dev_id; - unsigned char processor_rev_id[2]; + u16 processor_rev_id; unsigned char nb_rev_id; unsigned char sb_rev_id; unsigned char bios_api_rev; @@ -125,7 +125,7 @@ static int get_matching_microcode(int cpu, void *mc, int rev) while (equiv_cpu_table[i].installed_cpu != 0) { if (current_cpu_id == equiv_cpu_table[i].installed_cpu) { - equiv_cpu_id = equiv_cpu_table[i].equiv_cpu; + equiv_cpu_id = equiv_cpu_table[i].equiv_cpu & 0xffff; break; } i++; @@ -137,21 +137,10 @@ static int get_matching_microcode(int cpu, void *mc, int rev) return 0; } - if ((mc_header->processor_rev_id[0]) != (equiv_cpu_id & 0xff)) { - printk(KERN_ERR - "microcode: CPU%d patch does not match " - "(patch is %x, cpu extended is %x) \n", - cpu, mc_header->processor_rev_id[0], - (equiv_cpu_id & 0xff)); - return 0; - } - - if ((mc_header->processor_rev_id[1]) != ((equiv_cpu_id >> 16) & 0xff)) { - printk(KERN_ERR "microcode: CPU%d patch does not match " - "(patch is %x, cpu base id is %x) \n", - cpu, mc_header->processor_rev_id[1], - ((equiv_cpu_id >> 16) & 0xff)); - + if (mc_header->processor_rev_id != equiv_cpu_id) { + printk(KERN_ERR "microcode: CPU%d patch does not match " + "(processor_rev_id: %x, eqiv_cpu_id: %x)\n", + cpu, mc_header->processor_rev_id, equiv_cpu_id); return 0; } -- cgit v1.2.3 From 2a3282a77b02fb47576ffbdb4867c8c6eeb83ed5 Mon Sep 17 00:00:00 2001 From: Andreas Herrmann Date: Tue, 16 
Dec 2008 19:08:53 +0100 Subject: x86: microcode_amd: fix typos and trailing whitespaces in log messages Impact: fix printk typos Signed-off-by: Andreas Herrmann Signed-off-by: Ingo Molnar --- arch/x86/kernel/microcode_amd.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c index b5bc81470bcf..83a9fa321d9b 100644 --- a/arch/x86/kernel/microcode_amd.c +++ b/arch/x86/kernel/microcode_amd.c @@ -10,7 +10,7 @@ * This driver allows to upgrade microcode on AMD * family 0x10 and 0x11 processors. * - * Licensed unter the terms of the GNU General Public + * Licensed under the terms of the GNU General Public * License version 2. See file COPYING for details. */ @@ -133,7 +133,7 @@ static int get_matching_microcode(int cpu, void *mc, int rev) if (!equiv_cpu_id) { printk(KERN_ERR "microcode: CPU%d cpu_id " - "not found in equivalent cpu table \n", cpu); + "not found in equivalent cpu table\n", cpu); return 0; } @@ -151,7 +151,7 @@ static int get_matching_microcode(int cpu, void *mc, int rev) NULL); if ((!nb_pci_dev) || (mc_header->nb_rev_id != nb_pci_dev->revision)) { - printk(KERN_ERR "microcode: CPU%d NB mismatch \n", cpu); + printk(KERN_ERR "microcode: CPU%d NB mismatch\n", cpu); pci_dev_put(nb_pci_dev); return 0; } @@ -165,7 +165,7 @@ static int get_matching_microcode(int cpu, void *mc, int rev) NULL); if ((!sb_pci_dev) || (mc_header->sb_rev_id != sb_pci_dev->revision)) { - printk(KERN_ERR "microcode: CPU%d SB mismatch \n", cpu); + printk(KERN_ERR "microcode: CPU%d SB mismatch\n", cpu); pci_dev_put(sb_pci_dev); return 0; } @@ -219,7 +219,7 @@ static void apply_microcode_amd(int cpu) } printk(KERN_INFO "microcode: CPU%d updated from revision " - "0x%x to 0x%x \n", + "0x%x to 0x%x\n", cpu_num, uci->cpu_sig.rev, mc_amd->hdr.patch_id); uci->cpu_sig.rev = rev; @@ -282,7 +282,7 @@ static int install_equiv_cpu_table(u8 *buf, if (buf_pos[1] != 
UCODE_EQUIV_CPU_TABLE_TYPE || !size) { printk(KERN_ERR "microcode: error! " - "Wrong microcode equivalnet cpu table\n"); + "Wrong microcode equivalent cpu table\n"); return 0; } -- cgit v1.2.3 From be957763b01905d33b53cdd25c8df110f94f499a Mon Sep 17 00:00:00 2001 From: Andreas Herrmann Date: Tue, 16 Dec 2008 19:11:23 +0100 Subject: x86: microcode_amd: fix checkpatch warnings/errors Impact: cleanup Signed-off-by: Andreas Herrmann Signed-off-by: Ingo Molnar --- arch/x86/kernel/microcode_amd.c | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c index 83a9fa321d9b..a8a0ec600554 100644 --- a/arch/x86/kernel/microcode_amd.c +++ b/arch/x86/kernel/microcode_amd.c @@ -32,9 +32,9 @@ #include #include #include +#include #include -#include #include #include @@ -225,7 +225,7 @@ static void apply_microcode_amd(int cpu) uci->cpu_sig.rev = rev; } -static void * get_next_ucode(u8 *buf, unsigned int size, +static void *get_next_ucode(u8 *buf, unsigned int size, int (*get_ucode_data)(void *, const void *, size_t), unsigned int *mc_size) { @@ -256,7 +256,8 @@ static void * get_next_ucode(u8 *buf, unsigned int size, mc = vmalloc(UCODE_MAX_SIZE); if (mc) { memset(mc, 0, UCODE_MAX_SIZE); - if (get_ucode_data(mc, buf + UCODE_CONTAINER_SECTION_HDR, total_size)) { + if (get_ucode_data(mc, buf + UCODE_CONTAINER_SECTION_HDR, + total_size)) { vfree(mc); mc = NULL; } else @@ -332,7 +333,8 @@ static int generic_load_microcode(int cpu, void *data, size_t size, unsigned int uninitialized_var(mc_size); struct microcode_header_amd *mc_header; - mc = get_next_ucode(ucode_ptr, leftover, get_ucode_data, &mc_size); + mc = get_next_ucode(ucode_ptr, leftover, get_ucode_data, + &mc_size); if (!mc) break; @@ -342,7 +344,7 @@ static int generic_load_microcode(int cpu, void *data, size_t size, vfree(new_mc); new_rev = mc_header->patch_id; new_mc = mc; - } else + } else vfree(mc); 
ucode_ptr += mc_size; @@ -354,9 +356,9 @@ static int generic_load_microcode(int cpu, void *data, size_t size, if (uci->mc) vfree(uci->mc); uci->mc = new_mc; - pr_debug("microcode: CPU%d found a matching microcode update with" - " version 0x%x (current=0x%x)\n", - cpu, new_rev, uci->cpu_sig.rev); + pr_debug("microcode: CPU%d found a matching microcode " + "update with version 0x%x (current=0x%x)\n", + cpu, new_rev, uci->cpu_sig.rev); } else vfree(new_mc); } @@ -383,7 +385,8 @@ static int request_microcode_fw(int cpu, struct device *device) ret = request_firmware(&firmware, fw_name, device); if (ret) { - printk(KERN_ERR "microcode: ucode data file %s load failed\n", fw_name); + printk(KERN_ERR "microcode: ucode data file %s load failed\n", + fw_name); return ret; } @@ -397,8 +400,8 @@ static int request_microcode_fw(int cpu, struct device *device) static int request_microcode_user(int cpu, const void __user *buf, size_t size) { - printk(KERN_WARNING "microcode: AMD microcode update via /dev/cpu/microcode" - "is not supported\n"); + printk(KERN_WARNING "microcode: AMD microcode update via " + "/dev/cpu/microcode is not supported\n"); return -1; } -- cgit v1.2.3 From 8c135206c826095c852c16d94a0a74eeaf05c90d Mon Sep 17 00:00:00 2001 From: Andreas Herrmann Date: Tue, 16 Dec 2008 19:13:00 +0100 Subject: x86: microcode_amd: fix compile warning MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Impact: fix build warning CC arch/x86/kernel/microcode_amd.o arch/x86/kernel/microcode_amd.c: In function ‘request_microcode_fw’: arch/x86/kernel/microcode_amd.c:393: warning: passing argument 2 of ‘generic_load_microcode’ discards qualifiers from pointer target type (Respect "const" qualifier of firmware->data.) 
Signed-off-by: Andreas Herrmann Signed-off-by: Ingo Molnar --- arch/x86/kernel/microcode_amd.c | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c index a8a0ec600554..89b386c901fd 100644 --- a/arch/x86/kernel/microcode_amd.c +++ b/arch/x86/kernel/microcode_amd.c @@ -225,8 +225,8 @@ static void apply_microcode_amd(int cpu) uci->cpu_sig.rev = rev; } -static void *get_next_ucode(u8 *buf, unsigned int size, - int (*get_ucode_data)(void *, const void *, size_t), +static void *get_next_ucode(const u8 *buf, unsigned int size, + int (*get_ucode_data)(void *, const u8 *, size_t), unsigned int *mc_size) { unsigned int total_size; @@ -268,8 +268,8 @@ static void *get_next_ucode(u8 *buf, unsigned int size, } -static int install_equiv_cpu_table(u8 *buf, - int (*get_ucode_data)(void *, const void *, size_t)) +static int install_equiv_cpu_table(const u8 *buf, + int (*get_ucode_data)(void *, const u8 *, size_t)) { #define UCODE_CONTAINER_HEADER_SIZE 12 u8 *container_hdr[UCODE_CONTAINER_HEADER_SIZE]; @@ -311,11 +311,13 @@ static void free_equiv_cpu_table(void) } } -static int generic_load_microcode(int cpu, void *data, size_t size, - int (*get_ucode_data)(void *, const void *, size_t)) +static int generic_load_microcode(int cpu, const u8 *data, size_t size, + int (*get_ucode_data)(void *, const u8 *, size_t)) { struct ucode_cpu_info *uci = ucode_cpu_info + cpu; - u8 *ucode_ptr = data, *new_mc = NULL, *mc; + const u8 *ucode_ptr = data; + void *new_mc = NULL; + void *mc; int new_rev = uci->cpu_sig.rev; unsigned int leftover; unsigned long offset; @@ -368,7 +370,7 @@ static int generic_load_microcode(int cpu, void *data, size_t size, return (int)leftover; } -static int get_ucode_fw(void *to, const void *from, size_t n) +static int get_ucode_fw(void *to, const u8 *from, size_t n) { memcpy(to, from, n); return 0; @@ -390,7 +392,7 @@ static int 
request_microcode_fw(int cpu, struct device *device) return ret; } - ret = generic_load_microcode(cpu, (void*)firmware->data, firmware->size, + ret = generic_load_microcode(cpu, firmware->data, firmware->size, &get_ucode_fw); release_firmware(firmware); -- cgit v1.2.3 From 0657d9ebff186dcdb17e582dcb909028775a7707 Mon Sep 17 00:00:00 2001 From: Andreas Herrmann Date: Tue, 16 Dec 2008 19:14:05 +0100 Subject: x86: microcode_amd: don't pass superfluous function pointer for get_ucode_data Impact: cleanup Signed-off-by: Andreas Herrmann Signed-off-by: Ingo Molnar --- arch/x86/kernel/microcode_amd.c | 29 ++++++++++++----------------- 1 file changed, 12 insertions(+), 17 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c index 89b386c901fd..c7f225c7e481 100644 --- a/arch/x86/kernel/microcode_amd.c +++ b/arch/x86/kernel/microcode_amd.c @@ -225,9 +225,14 @@ static void apply_microcode_amd(int cpu) uci->cpu_sig.rev = rev; } +static int get_ucode_data(void *to, const u8 *from, size_t n) +{ + memcpy(to, from, n); + return 0; +} + static void *get_next_ucode(const u8 *buf, unsigned int size, - int (*get_ucode_data)(void *, const u8 *, size_t), - unsigned int *mc_size) + unsigned int *mc_size) { unsigned int total_size; #define UCODE_CONTAINER_SECTION_HDR 8 @@ -268,8 +273,7 @@ static void *get_next_ucode(const u8 *buf, unsigned int size, } -static int install_equiv_cpu_table(const u8 *buf, - int (*get_ucode_data)(void *, const u8 *, size_t)) +static int install_equiv_cpu_table(const u8 *buf) { #define UCODE_CONTAINER_HEADER_SIZE 12 u8 *container_hdr[UCODE_CONTAINER_HEADER_SIZE]; @@ -311,8 +315,7 @@ static void free_equiv_cpu_table(void) } } -static int generic_load_microcode(int cpu, const u8 *data, size_t size, - int (*get_ucode_data)(void *, const u8 *, size_t)) +static int generic_load_microcode(int cpu, const u8 *data, size_t size) { struct ucode_cpu_info *uci = ucode_cpu_info + cpu; const u8 *ucode_ptr = data; 
@@ -322,7 +325,7 @@ static int generic_load_microcode(int cpu, const u8 *data, size_t size, unsigned int leftover; unsigned long offset; - offset = install_equiv_cpu_table(ucode_ptr, get_ucode_data); + offset = install_equiv_cpu_table(ucode_ptr); if (!offset) { printk(KERN_ERR "microcode: installing equivalent cpu table failed\n"); return -EINVAL; @@ -335,8 +338,7 @@ static int generic_load_microcode(int cpu, const u8 *data, size_t size, unsigned int uninitialized_var(mc_size); struct microcode_header_amd *mc_header; - mc = get_next_ucode(ucode_ptr, leftover, get_ucode_data, - &mc_size); + mc = get_next_ucode(ucode_ptr, leftover, &mc_size); if (!mc) break; @@ -370,12 +372,6 @@ static int generic_load_microcode(int cpu, const u8 *data, size_t size, return (int)leftover; } -static int get_ucode_fw(void *to, const u8 *from, size_t n) -{ - memcpy(to, from, n); - return 0; -} - static int request_microcode_fw(int cpu, struct device *device) { const char *fw_name = "amd-ucode/microcode_amd.bin"; @@ -392,8 +388,7 @@ static int request_microcode_fw(int cpu, struct device *device) return ret; } - ret = generic_load_microcode(cpu, firmware->data, firmware->size, - &get_ucode_fw); + ret = generic_load_microcode(cpu, firmware->data, firmware->size); release_firmware(firmware); -- cgit v1.2.3 From 29d0887ffd084cde9d6a1286cb82b71701a974dd Mon Sep 17 00:00:00 2001 From: Andreas Herrmann Date: Tue, 16 Dec 2008 19:16:34 +0100 Subject: x86: microcode_amd: replace inline asm by common rdmsr/wrmsr functions Impact: cleanup Signed-off-by: Andreas Herrmann Signed-off-by: Ingo Molnar --- arch/x86/include/asm/msr-index.h | 2 ++ arch/x86/kernel/microcode_amd.c | 23 +++++------------------ 2 files changed, 7 insertions(+), 18 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index e38859d577a1..cb58643947b9 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -85,7 +85,9 @@ /* AMD64 MSRs. 
Not complete. See the architecture manual for a more complete list. */ +#define MSR_AMD64_PATCH_LEVEL 0x0000008b #define MSR_AMD64_NB_CFG 0xc001001f +#define MSR_AMD64_PATCH_LOADER 0xc0010020 #define MSR_AMD64_IBSFETCHCTL 0xc0011030 #define MSR_AMD64_IBSFETCHLINAD 0xc0011031 #define MSR_AMD64_IBSFETCHPHYSAD 0xc0011032 diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c index c7f225c7e481..2856955ddab1 100644 --- a/arch/x86/kernel/microcode_amd.c +++ b/arch/x86/kernel/microcode_amd.c @@ -93,6 +93,7 @@ static struct equiv_cpu_entry *equiv_cpu_table; static int collect_cpu_info_amd(int cpu, struct cpu_signature *csig) { struct cpuinfo_x86 *c = &cpu_data(cpu); + u32 dummy; memset(csig, 0, sizeof(*csig)); @@ -102,9 +103,7 @@ static int collect_cpu_info_amd(int cpu, struct cpu_signature *csig) return -1; } - asm volatile("movl %1, %%ecx; rdmsr" - : "=a" (csig->rev) - : "i" (0x0000008B) : "ecx"); + rdmsr(MSR_AMD64_PATCH_LEVEL, csig->rev, dummy); printk(KERN_INFO "microcode: collect_cpu_info_amd : patch_id=0x%x\n", csig->rev); @@ -181,12 +180,10 @@ static int get_matching_microcode(int cpu, void *mc, int rev) static void apply_microcode_amd(int cpu) { unsigned long flags; - unsigned int eax, edx; - unsigned int rev; + u32 rev, dummy; int cpu_num = raw_smp_processor_id(); struct ucode_cpu_info *uci = ucode_cpu_info + cpu_num; struct microcode_amd *mc_amd = uci->mc; - unsigned long addr; /* We should bind the task to the CPU */ BUG_ON(cpu_num != cpu); @@ -195,19 +192,9 @@ static void apply_microcode_amd(int cpu) return; spin_lock_irqsave(&microcode_update_lock, flags); - - addr = (unsigned long)&mc_amd->hdr.data_code; - edx = (unsigned int)(((unsigned long)upper_32_bits(addr))); - eax = (unsigned int)(((unsigned long)lower_32_bits(addr))); - - asm volatile("movl %0, %%ecx; wrmsr" : - : "i" (0xc0010020), "a" (eax), "d" (edx) : "ecx"); - + wrmsrl(MSR_AMD64_PATCH_LOADER, &mc_amd->hdr.data_code); /* get patch id after patching */ - asm volatile("movl %1,
%%ecx; rdmsr" - : "=a" (rev) - : "i" (0x0000008B) : "ecx"); - + rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy); spin_unlock_irqrestore(&microcode_update_lock, flags); /* check current patch id and patch's id for match */ -- cgit v1.2.3 From 6cc9b6d94b6fee23b0671970f67d297fa76b68b3 Mon Sep 17 00:00:00 2001 From: Andreas Herrmann Date: Tue, 16 Dec 2008 19:17:45 +0100 Subject: x86: microcode_amd: consolidate macro definitions Impact: cleanup Signed-off-by: Andreas Herrmann Signed-off-by: Ingo Molnar --- arch/x86/kernel/microcode_amd.c | 16 +++------------- 1 file changed, 3 insertions(+), 13 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c index 2856955ddab1..e68e723490a3 100644 --- a/arch/x86/kernel/microcode_amd.c +++ b/arch/x86/kernel/microcode_amd.c @@ -75,15 +75,9 @@ struct microcode_amd { unsigned int mpb[0]; }; -#define UCODE_MAX_SIZE (2048) -#define DEFAULT_UCODE_DATASIZE (896) -#define MC_HEADER_SIZE (sizeof(struct microcode_header_amd)) -#define DEFAULT_UCODE_TOTALSIZE (DEFAULT_UCODE_DATASIZE + MC_HEADER_SIZE) -#define DWSIZE (sizeof(u32)) -/* For now we support a fixed ucode total size only */ -#define get_totalsize(mc) \ - ((((struct microcode_amd *)mc)->hdr.mc_patch_data_len * 28) \ - + MC_HEADER_SIZE) +#define UCODE_MAX_SIZE 2048 +#define UCODE_CONTAINER_SECTION_HDR 8 +#define UCODE_CONTAINER_HEADER_SIZE 12 /* serialize access to the physical write */ static DEFINE_SPINLOCK(microcode_update_lock); @@ -222,7 +216,6 @@ static void *get_next_ucode(const u8 *buf, unsigned int size, unsigned int *mc_size) { unsigned int total_size; -#define UCODE_CONTAINER_SECTION_HDR 8 u8 section_hdr[UCODE_CONTAINER_SECTION_HDR]; void *mc; @@ -255,14 +248,12 @@ static void *get_next_ucode(const u8 *buf, unsigned int size, } else *mc_size = total_size + UCODE_CONTAINER_SECTION_HDR; } -#undef UCODE_CONTAINER_SECTION_HDR return mc; } static int install_equiv_cpu_table(const u8 *buf) { -#define UCODE_CONTAINER_HEADER_SIZE
12 u8 *container_hdr[UCODE_CONTAINER_HEADER_SIZE]; unsigned int *buf_pos = (unsigned int *)container_hdr; unsigned long size; @@ -291,7 +282,6 @@ static int install_equiv_cpu_table(const u8 *buf) } return size + UCODE_CONTAINER_HEADER_SIZE; /* add header length */ -#undef UCODE_CONTAINER_HEADER_SIZE } static void free_equiv_cpu_table(void) -- cgit v1.2.3 From 98415301ea2dd389539ab429bcfa9da07219eabc Mon Sep 17 00:00:00 2001 From: Andreas Herrmann Date: Tue, 16 Dec 2008 19:20:21 +0100 Subject: x86: microcode_amd: remove (wrong) chipset deivce ID checks Impact: remove dead/incorrect code Currently there is no chipset specific ucode. The checks are incorrect anyway (e.g. pci device IDs are 16 bit and not 8 bit). Thus I remove the stuff for the time being and will reintroduce it if it's foreseeable that it is really needed. Signed-off-by: Andreas Herrmann Signed-off-by: Ingo Molnar --- arch/x86/kernel/microcode_amd.c | 32 +++++--------------------------- 1 file changed, 5 insertions(+), 27 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c index e68e723490a3..2e8af6ef3da9 100644 --- a/arch/x86/kernel/microcode_amd.c +++ b/arch/x86/kernel/microcode_amd.c @@ -108,7 +108,6 @@ static int collect_cpu_info_amd(int cpu, struct cpu_signature *csig) static int get_matching_microcode(int cpu, void *mc, int rev) { struct microcode_header_amd *mc_header = mc; - struct pci_dev *nb_pci_dev, *sb_pci_dev; unsigned int current_cpu_id; unsigned int equiv_cpu_id = 0x00; unsigned int i = 0; @@ -137,32 +136,11 @@ static int get_matching_microcode(int cpu, void *mc, int rev) return 0; } - /* ucode may be northbridge specific */ - if (mc_header->nb_dev_id) { - nb_pci_dev = pci_get_device(PCI_VENDOR_ID_AMD, - (mc_header->nb_dev_id & 0xff), - NULL); - if ((!nb_pci_dev) || - (mc_header->nb_rev_id != nb_pci_dev->revision)) { - printk(KERN_ERR "microcode: CPU%d NB mismatch\n", cpu); - pci_dev_put(nb_pci_dev); - return 0; - } - 
pci_dev_put(nb_pci_dev); - } - - /* ucode may be southbridge specific */ - if (mc_header->sb_dev_id) { - sb_pci_dev = pci_get_device(PCI_VENDOR_ID_AMD, - (mc_header->sb_dev_id & 0xff), - NULL); - if ((!sb_pci_dev) || - (mc_header->sb_rev_id != sb_pci_dev->revision)) { - printk(KERN_ERR "microcode: CPU%d SB mismatch\n", cpu); - pci_dev_put(sb_pci_dev); - return 0; - } - pci_dev_put(sb_pci_dev); + /* ucode might be chipset specific -- currently we don't support this */ + if (mc_header->nb_dev_id || mc_header->sb_dev_id) { + printk(KERN_WARNING "microcode: CPU%d loading of chipset " + "specific code not yet supported\n", cpu); + return 0; } if (mc_header->patch_id <= rev) -- cgit v1.2.3 From 5549b94bc74c3e7edd44e0aeb7d9f773e82d2d20 Mon Sep 17 00:00:00 2001 From: Andreas Herrmann Date: Tue, 16 Dec 2008 19:21:30 +0100 Subject: x86: microcode_amd: use 'packed' attribute for structs Impact: cleanup Signed-off-by: Andreas Herrmann Signed-off-by: Ingo Molnar --- arch/x86/kernel/microcode_amd.c | 45 +++++++++++++++++++++-------------------- 1 file changed, 23 insertions(+), 22 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c index 2e8af6ef3da9..e1ce650f276b 100644 --- a/arch/x86/kernel/microcode_amd.c +++ b/arch/x86/kernel/microcode_amd.c @@ -47,28 +47,29 @@ MODULE_LICENSE("GPL v2"); #define UCODE_UCODE_TYPE 0x00000001 struct equiv_cpu_entry { - unsigned int installed_cpu; - unsigned int fixed_errata_mask; - unsigned int fixed_errata_compare; - unsigned int equiv_cpu; -}; + u32 installed_cpu; + u32 fixed_errata_mask; + u32 fixed_errata_compare; + u16 equiv_cpu; + u16 res; +} __attribute__((packed)); struct microcode_header_amd { - unsigned int data_code; - unsigned int patch_id; - unsigned char mc_patch_data_id[2]; - unsigned char mc_patch_data_len; - unsigned char init_flag; - unsigned int mc_patch_data_checksum; - unsigned int nb_dev_id; - unsigned int sb_dev_id; - u16 processor_rev_id; - unsigned char 
nb_rev_id; - unsigned char sb_rev_id; - unsigned char bios_api_rev; - unsigned char reserved1[3]; - unsigned int match_reg[8]; -}; + u32 data_code; + u32 patch_id; + u16 mc_patch_data_id; + u8 mc_patch_data_len; + u8 init_flag; + u32 mc_patch_data_checksum; + u32 nb_dev_id; + u32 sb_dev_id; + u16 processor_rev_id; + u8 nb_rev_id; + u8 sb_rev_id; + u8 bios_api_rev; + u8 reserved1[3]; + u32 match_reg[8]; +} __attribute__((packed)); struct microcode_amd { struct microcode_header_amd hdr; @@ -109,7 +110,7 @@ static int get_matching_microcode(int cpu, void *mc, int rev) { struct microcode_header_amd *mc_header = mc; unsigned int current_cpu_id; - unsigned int equiv_cpu_id = 0x00; + u16 equiv_cpu_id = 0; unsigned int i = 0; BUG_ON(equiv_cpu_table == NULL); @@ -117,7 +118,7 @@ static int get_matching_microcode(int cpu, void *mc, int rev) while (equiv_cpu_table[i].installed_cpu != 0) { if (current_cpu_id == equiv_cpu_table[i].installed_cpu) { - equiv_cpu_id = equiv_cpu_table[i].equiv_cpu & 0xffff; + equiv_cpu_id = equiv_cpu_table[i].equiv_cpu; break; } i++; -- cgit v1.2.3 From df23cab563912ba43f7e9bc8ac517e5a2ddc9cd2 Mon Sep 17 00:00:00 2001 From: Andreas Herrmann Date: Tue, 16 Dec 2008 19:22:36 +0100 Subject: x86: microcode_amd: modify log messages Impact: change microcode printk content Change log level and provide (at least I tried to;-) consistent, short, meaningful content. 
Signed-off-by: Andreas Herrmann Cc: Dmitry Adamushko Signed-off-by: Ingo Molnar --- arch/x86/kernel/microcode_amd.c | 58 ++++++++++++++++++----------------------- 1 file changed, 26 insertions(+), 32 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c index e1ce650f276b..24c256f4e50a 100644 --- a/arch/x86/kernel/microcode_amd.c +++ b/arch/x86/kernel/microcode_amd.c @@ -91,18 +91,13 @@ static int collect_cpu_info_amd(int cpu, struct cpu_signature *csig) u32 dummy; memset(csig, 0, sizeof(*csig)); - if (c->x86_vendor != X86_VENDOR_AMD || c->x86 < 0x10) { - printk(KERN_ERR "microcode: CPU%d not a capable AMD processor\n", - cpu); + printk(KERN_WARNING "microcode: CPU%d: AMD CPU family 0x%x not " + "supported\n", cpu, c->x86); return -1; } - rdmsr(MSR_AMD64_PATCH_LEVEL, csig->rev, dummy); - - printk(KERN_INFO "microcode: collect_cpu_info_amd : patch_id=0x%x\n", - csig->rev); - + printk(KERN_INFO "microcode: CPU%d: patch_level=0x%x\n", cpu, csig->rev); return 0; } @@ -125,21 +120,21 @@ static int get_matching_microcode(int cpu, void *mc, int rev) } if (!equiv_cpu_id) { - printk(KERN_ERR "microcode: CPU%d cpu_id " - "not found in equivalent cpu table\n", cpu); + printk(KERN_WARNING "microcode: CPU%d: cpu revision " + "not listed in equivalent cpu table\n", cpu); return 0; } if (mc_header->processor_rev_id != equiv_cpu_id) { - printk(KERN_ERR "microcode: CPU%d patch does not match " - "(processor_rev_id: %x, eqiv_cpu_id: %x)\n", + printk(KERN_ERR "microcode: CPU%d: patch mismatch " + "(processor_rev_id: %x, equiv_cpu_id: %x)\n", cpu, mc_header->processor_rev_id, equiv_cpu_id); return 0; } /* ucode might be chipset specific -- currently we don't support this */ if (mc_header->nb_dev_id || mc_header->sb_dev_id) { - printk(KERN_WARNING "microcode: CPU%d loading of chipset " + printk(KERN_ERR "microcode: CPU%d: loading of chipset " "specific code not yet supported\n", cpu); return 0; } @@ -172,15 +167,13 @@ static 
void apply_microcode_amd(int cpu) /* check current patch id and patch's id for match */ if (rev != mc_amd->hdr.patch_id) { - printk(KERN_ERR "microcode: CPU%d update from revision " - "0x%x to 0x%x failed\n", cpu_num, - mc_amd->hdr.patch_id, rev); + printk(KERN_ERR "microcode: CPU%d: update failed " + "(for patch_level=0x%x)\n", cpu, mc_amd->hdr.patch_id); return; } - printk(KERN_INFO "microcode: CPU%d updated from revision " - "0x%x to 0x%x\n", - cpu_num, uci->cpu_sig.rev, mc_amd->hdr.patch_id); + printk(KERN_INFO "microcode: CPU%d: updated (new patch_level=0x%x)\n", + cpu, rev); uci->cpu_sig.rev = rev; } @@ -202,18 +195,18 @@ static void *get_next_ucode(const u8 *buf, unsigned int size, return NULL; if (section_hdr[0] != UCODE_UCODE_TYPE) { - printk(KERN_ERR "microcode: error! " - "Wrong microcode payload type field\n"); + printk(KERN_ERR "microcode: error: invalid type field in " + "container file section header\n"); return NULL; } total_size = (unsigned long) (section_hdr[4] + (section_hdr[5] << 8)); - printk(KERN_INFO "microcode: size %u, total_size %u\n", - size, total_size); + printk(KERN_DEBUG "microcode: size %u, total_size %u\n", + size, total_size); if (total_size > size || total_size > UCODE_MAX_SIZE) { - printk(KERN_ERR "microcode: error! Bad data in microcode data file\n"); + printk(KERN_ERR "microcode: error: size mismatch\n"); return NULL; } @@ -243,14 +236,15 @@ static int install_equiv_cpu_table(const u8 *buf) size = buf_pos[2]; if (buf_pos[1] != UCODE_EQUIV_CPU_TABLE_TYPE || !size) { - printk(KERN_ERR "microcode: error! 
" - "Wrong microcode equivalent cpu table\n"); + printk(KERN_ERR "microcode: error: invalid type field in " + "container file section header\n"); return 0; } equiv_cpu_table = (struct equiv_cpu_entry *) vmalloc(size); if (!equiv_cpu_table) { - printk(KERN_ERR "microcode: error, can't allocate memory for equiv CPU table\n"); + printk(KERN_ERR "microcode: failed to allocate " + "equivalent CPU table\n"); return 0; } @@ -283,7 +277,8 @@ static int generic_load_microcode(int cpu, const u8 *data, size_t size) offset = install_equiv_cpu_table(ucode_ptr); if (!offset) { - printk(KERN_ERR "microcode: installing equivalent cpu table failed\n"); + printk(KERN_ERR "microcode: failed to create " + "equivalent cpu table\n"); return -EINVAL; } @@ -339,8 +334,7 @@ static int request_microcode_fw(int cpu, struct device *device) ret = request_firmware(&firmware, fw_name, device); if (ret) { - printk(KERN_ERR "microcode: ucode data file %s load failed\n", - fw_name); + printk(KERN_ERR "microcode: failed to load file %s\n", fw_name); return ret; } @@ -353,8 +347,8 @@ static int request_microcode_fw(int cpu, struct device *device) static int request_microcode_user(int cpu, const void __user *buf, size_t size) { - printk(KERN_WARNING "microcode: AMD microcode update via " - "/dev/cpu/microcode is not supported\n"); + printk(KERN_INFO "microcode: AMD microcode update via " + "/dev/cpu/microcode not supported\n"); return -1; } -- cgit v1.2.3 From bacbe9994541c70aa3abd1a013ac738e58d4bfb2 Mon Sep 17 00:00:00 2001 From: Janne Kulmala Date: Tue, 16 Dec 2008 13:39:57 +0200 Subject: x86: enable HPET on Fujitsu u9200 Impact: auto-enable HPET on Fujitsu u9200 HPET timer is listed in the ACPI table, but needs a quirk entry in order to work. Unfortunately, the quirk code runs after first HPET hpet_enable() which has already determined that the timer doesn't work (reads 0xFFFFFFFF). This patch allows hpet_enable() to be called again after running the quirk code. 
Signed-off-by: Janne Kulmala Signed-off-by: Ingo Molnar --- arch/x86/kernel/hpet.c | 2 +- arch/x86/kernel/quirks.c | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index 067d8de913f6..84089dc8fd1d 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -811,7 +811,7 @@ int __init hpet_enable(void) out_nohpet: hpet_clear_mapping(); - boot_hpet_disable = 1; + hpet_address = 0; return 0; } diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c index 67465ed89310..309949e9e1c1 100644 --- a/arch/x86/kernel/quirks.c +++ b/arch/x86/kernel/quirks.c @@ -168,6 +168,8 @@ DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH7_31, ich_force_enable_hpet); DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH8_1, ich_force_enable_hpet); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH8_4, + ich_force_enable_hpet); DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH9_7, ich_force_enable_hpet); -- cgit v1.2.3 From d4377974062122d6d9be0bbd8a910a0954714194 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 16 Dec 2008 20:59:24 +0100 Subject: x86: support always running TSC on Intel CPUs, add cpufeature definition Impact: add new synthetic-cpuid bit definition add X86_FEATURE_NONSTOP_TSC to the cpufeature bits - this is in preparation of Venki's always-running-TSC patch. 
Signed-off-by: Ingo Molnar --- arch/x86/include/asm/cpufeature.h | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index 5bce8ed02b44..ea408dcba513 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -92,6 +92,7 @@ #define X86_FEATURE_AMDC1E (3*32+21) /* AMD C1E detected */ #define X86_FEATURE_XTOPOLOGY (3*32+22) /* cpu topology enum extensions */ #define X86_FEATURE_TSC_RELIABLE (3*32+23) /* TSC is known to be reliable */ +#define X86_FEATURE_NONSTOP_TSC (3*32+24) /* TSC does not stop in C states */ /* Intel-defined CPU features, CPUID level 0x00000001 (ecx), word 4 */ #define X86_FEATURE_XMM3 (4*32+ 0) /* "pni" SSE-3 */ -- cgit v1.2.3 From 40fb17152c50a69dc304dd632131c2f41281ce44 Mon Sep 17 00:00:00 2001 From: Venki Pallipadi Date: Mon, 17 Nov 2008 16:11:37 -0800 Subject: x86: support always running TSC on Intel CPUs Impact: reward non-stop TSCs with good TSC-based clocksources, etc. Add support for CPUID_0x80000007_Bit8 on Intel CPUs as well. This bit means that the TSC is invariant with C/P/T states and always runs at constant frequency. With Intel CPUs, we have 3 classes * CPUs where TSC runs at constant rate and does not stop in C-states * CPUs where TSC runs at constant rate, but will stop in deep C-states * CPUs where TSC rate will vary based on P/T-states and TSC will stop in deep C-states. To cover these 3, one feature bit (CONSTANT_TSC) is not enough. So, add a second bit (NONSTOP_TSC). CONSTANT_TSC indicates that the TSC runs at constant frequency irrespective of P/T-states, and NONSTOP_TSC indicates that TSC does not stop in deep C-states. CPUID_0x80000007_Bit8 indicates both these feature bits can be set. We still have CONSTANT_TSC _set_ and NONSTOP_TSC _not_set_ on some older Intel CPUs, based on model checks. We can use TSC on such CPUs for time, as long as those CPUs do not support/enter deep C-states.
Signed-off-by: Venkatesh Pallipadi Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/amd.c | 9 +++++++-- arch/x86/kernel/cpu/intel.c | 10 ++++++++++ arch/x86/kernel/process.c | 2 +- drivers/acpi/processor_idle.c | 6 +++--- 4 files changed, 21 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 8f1e31db2ad5..7c878f6aa919 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -283,9 +283,14 @@ static void __cpuinit early_init_amd(struct cpuinfo_x86 *c) { early_init_amd_mc(c); - /* c->x86_power is 8000_0007 edx. Bit 8 is constant TSC */ - if (c->x86_power & (1<<8)) + /* + * c->x86_power is 8000_0007 edx. Bit 8 is TSC runs at constant rate + * with P/T states and does not stop in deep C-states + */ + if (c->x86_power & (1 << 8)) { set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); + set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC); + } #ifdef CONFIG_X86_64 set_cpu_cap(c, X86_FEATURE_SYSCALL32); diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index cce0b6118d55..caec59437a22 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -41,6 +41,16 @@ static void __cpuinit early_init_intel(struct cpuinfo_x86 *c) if (c->x86 == 15 && c->x86_cache_alignment == 64) c->x86_cache_alignment = 128; #endif + + /* + * c->x86_power is 8000_0007 edx. 
Bit 8 is TSC runs at constant rate + * with P/T states and does not stop in deep C-states + */ + if (c->x86_power & (1 << 8)) { + set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); + set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC); + } + } #ifdef CONFIG_X86_32 diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index c622772744d8..18c70fedba32 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -270,7 +270,7 @@ static void c1e_idle(void) rdmsr(MSR_K8_INT_PENDING_MSG, lo, hi); if (lo & K8_INTP_C1E_ACTIVE_MASK) { c1e_detected = 1; - if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) + if (!boot_cpu_has(X86_FEATURE_NONSTOP_TSC)) mark_tsc_unstable("TSC halt in AMD C1E"); printk(KERN_INFO "System has AMD C1E enabled\n"); set_cpu_cap(&boot_cpu_data, X86_FEATURE_AMDC1E); diff --git a/drivers/acpi/processor_idle.c b/drivers/acpi/processor_idle.c index 5f8d746a9b81..38aca048e951 100644 --- a/drivers/acpi/processor_idle.c +++ b/drivers/acpi/processor_idle.c @@ -374,15 +374,15 @@ static int tsc_halts_in_c(int state) { switch (boot_cpu_data.x86_vendor) { case X86_VENDOR_AMD: + case X86_VENDOR_INTEL: /* * AMD Fam10h TSC will tick in all * C/P/S0/S1 states when this bit is set. 
*/ - if (boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) + if (boot_cpu_has(X86_FEATURE_NONSTOP_TSC)) return 0; + /*FALL THROUGH*/ - case X86_VENDOR_INTEL: - /* Several cases known where TSC halts in C2 too */ default: return state > ACPI_STATE_C1; } -- cgit v1.2.3 From f63c2f248959366cd11bfa476f866737047cf663 Mon Sep 17 00:00:00 2001 From: Tej Date: Tue, 16 Dec 2008 11:56:06 -0800 Subject: xen: whitespace/checkpatch cleanup Impact: cleanup Signed-off-by: Tej Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Ingo Molnar --- arch/x86/xen/enlighten.c | 16 +++++++++------- arch/x86/xen/mmu.c | 17 ++++++++++------- arch/x86/xen/multicalls.c | 2 +- arch/x86/xen/setup.c | 9 +++++---- 4 files changed, 25 insertions(+), 19 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 5e4686d70f62..86cd2f829683 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -793,7 +793,7 @@ static int xen_write_msr_safe(unsigned int msr, unsigned low, unsigned high) ret = 0; - switch(msr) { + switch (msr) { #ifdef CONFIG_X86_64 unsigned which; u64 base; @@ -1453,7 +1453,7 @@ static __init void xen_map_identity_early(pmd_t *pmd, unsigned long max_pfn) ident_pte = 0; pfn = 0; - for(pmdidx = 0; pmdidx < PTRS_PER_PMD && pfn < max_pfn; pmdidx++) { + for (pmdidx = 0; pmdidx < PTRS_PER_PMD && pfn < max_pfn; pmdidx++) { pte_t *pte_page; /* Reuse or allocate a page of ptes */ @@ -1471,7 +1471,7 @@ static __init void xen_map_identity_early(pmd_t *pmd, unsigned long max_pfn) } /* Install mappings */ - for(pteidx = 0; pteidx < PTRS_PER_PTE; pteidx++, pfn++) { + for (pteidx = 0; pteidx < PTRS_PER_PTE; pteidx++, pfn++) { pte_t pte; if (pfn > max_pfn_mapped) @@ -1485,7 +1485,7 @@ static __init void xen_map_identity_early(pmd_t *pmd, unsigned long max_pfn) } } - for(pteidx = 0; pteidx < ident_pte; pteidx += PTRS_PER_PTE) + for (pteidx = 0; pteidx < ident_pte; pteidx += PTRS_PER_PTE) set_page_prot(&level1_ident_pgt[pteidx], PAGE_KERNEL_RO); 
set_page_prot(pmd, PAGE_KERNEL_RO); @@ -1499,7 +1499,7 @@ static void convert_pfn_mfn(void *v) /* All levels are converted the same way, so just treat them as ptes. */ - for(i = 0; i < PTRS_PER_PTE; i++) + for (i = 0; i < PTRS_PER_PTE; i++) pte[i] = xen_make_pte(pte[i].pte); } @@ -1514,7 +1514,8 @@ static void convert_pfn_mfn(void *v) * of the physical mapping once some sort of allocator has been set * up. */ -static __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn) +static __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, + unsigned long max_pfn) { pud_t *l3; pmd_t *l2; @@ -1577,7 +1578,8 @@ static __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pf #else /* !CONFIG_X86_64 */ static pmd_t level2_kernel_pgt[PTRS_PER_PMD] __page_aligned_bss; -static __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn) +static __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, + unsigned long max_pfn) { pmd_t *kernel_pmd; diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 636ef4caa52d..773d68d3e912 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -154,13 +154,13 @@ void xen_setup_mfn_list_list(void) { unsigned pfn, idx; - for(pfn = 0; pfn < MAX_DOMAIN_PAGES; pfn += P2M_ENTRIES_PER_PAGE) { + for (pfn = 0; pfn < MAX_DOMAIN_PAGES; pfn += P2M_ENTRIES_PER_PAGE) { unsigned topidx = p2m_top_index(pfn); p2m_top_mfn[topidx] = virt_to_mfn(p2m_top[topidx]); } - for(idx = 0; idx < ARRAY_SIZE(p2m_top_mfn_list); idx++) { + for (idx = 0; idx < ARRAY_SIZE(p2m_top_mfn_list); idx++) { unsigned topidx = idx * P2M_ENTRIES_PER_PAGE; p2m_top_mfn_list[idx] = virt_to_mfn(&p2m_top_mfn[topidx]); } @@ -179,7 +179,7 @@ void __init xen_build_dynamic_phys_to_machine(void) unsigned long max_pfn = min(MAX_DOMAIN_PAGES, xen_start_info->nr_pages); unsigned pfn; - for(pfn = 0; pfn < max_pfn; pfn += P2M_ENTRIES_PER_PAGE) { + for (pfn = 0; pfn < max_pfn; pfn += P2M_ENTRIES_PER_PAGE) { unsigned topidx = p2m_top_index(pfn); 
p2m_top[topidx] = &mfn_list[pfn]; @@ -207,7 +207,7 @@ static void alloc_p2m(unsigned long **pp, unsigned long *mfnp) p = (void *)__get_free_page(GFP_KERNEL | __GFP_NOFAIL); BUG_ON(p == NULL); - for(i = 0; i < P2M_ENTRIES_PER_PAGE; i++) + for (i = 0; i < P2M_ENTRIES_PER_PAGE; i++) p[i] = INVALID_P2M_ENTRY; if (cmpxchg(pp, p2m_missing, p) != p2m_missing) @@ -407,7 +407,8 @@ out: preempt_enable(); } -pte_t xen_ptep_modify_prot_start(struct mm_struct *mm, unsigned long addr, pte_t *ptep) +pte_t xen_ptep_modify_prot_start(struct mm_struct *mm, + unsigned long addr, pte_t *ptep) { /* Just return the pte as-is. We preserve the bits on commit */ return *ptep; @@ -878,7 +879,8 @@ static void __xen_pgd_pin(struct mm_struct *mm, pgd_t *pgd) if (user_pgd) { xen_pin_page(mm, virt_to_page(user_pgd), PT_PGD); - xen_do_pin(MMUEXT_PIN_L4_TABLE, PFN_DOWN(__pa(user_pgd))); + xen_do_pin(MMUEXT_PIN_L4_TABLE, + PFN_DOWN(__pa(user_pgd))); } } #else /* CONFIG_X86_32 */ @@ -993,7 +995,8 @@ static void __xen_pgd_unpin(struct mm_struct *mm, pgd_t *pgd) pgd_t *user_pgd = xen_get_user_pgd(pgd); if (user_pgd) { - xen_do_pin(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(user_pgd))); + xen_do_pin(MMUEXT_UNPIN_TABLE, + PFN_DOWN(__pa(user_pgd))); xen_unpin_page(mm, virt_to_page(user_pgd), PT_PGD); } } diff --git a/arch/x86/xen/multicalls.c b/arch/x86/xen/multicalls.c index 8ea8a0d0b0de..c738644b5435 100644 --- a/arch/x86/xen/multicalls.c +++ b/arch/x86/xen/multicalls.c @@ -154,7 +154,7 @@ void xen_mc_flush(void) ret, smp_processor_id()); dump_stack(); for (i = 0; i < b->mcidx; i++) { - printk(" call %2d/%d: op=%lu arg=[%lx] result=%ld\n", + printk(KERN_DEBUG " call %2d/%d: op=%lu arg=[%lx] result=%ld\n", i+1, b->mcidx, b->debug[i].op, b->debug[i].args[0], diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index d67901083888..15c6c68db6a2 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -28,6 +28,9 @@ /* These are code, but not functions. 
Defined in entry.S */ extern const char xen_hypervisor_callback[]; extern const char xen_failsafe_callback[]; +extern void xen_sysenter_target(void); +extern void xen_syscall_target(void); +extern void xen_syscall32_target(void); /** @@ -110,7 +113,6 @@ static __cpuinit int register_callback(unsigned type, const void *func) void __cpuinit xen_enable_sysenter(void) { - extern void xen_sysenter_target(void); int ret; unsigned sysenter_feature; @@ -132,8 +134,6 @@ void __cpuinit xen_enable_syscall(void) { #ifdef CONFIG_X86_64 int ret; - extern void xen_syscall_target(void); - extern void xen_syscall32_target(void); ret = register_callback(CALLBACKTYPE_syscall, xen_syscall_target); if (ret != 0) { @@ -160,7 +160,8 @@ void __init xen_arch_setup(void) HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_writable_pagetables); if (!xen_feature(XENFEAT_auto_translated_physmap)) - HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_pae_extended_cr3); + HYPERVISOR_vm_assist(VMASST_CMD_enable, + VMASST_TYPE_pae_extended_cr3); if (register_callback(CALLBACKTYPE_event, xen_hypervisor_callback) || register_callback(CALLBACKTYPE_failsafe, xen_failsafe_callback)) -- cgit v1.2.3 From aab02f0ae20b8fe0fe891e9f107c6e392256ca01 Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Date: Mon, 15 Dec 2008 22:23:54 +0530 Subject: x86: process_64.c declare __switch_to() and sys_arch_prctl before they get used Impact: cleanup In asm/system.h moved out __switch_to from CONFIG_X86_32 as it is common for both 32 and 64 bit. 
In asm/prctl.h defined sys_arch_prctl Signed-off-by: Jaswinder Singh Signed-off-by: Ingo Molnar --- arch/x86/include/asm/prctl.h | 3 +++ arch/x86/include/asm/system.h | 4 ++-- 2 files changed, 5 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/prctl.h b/arch/x86/include/asm/prctl.h index fe681147a4f7..a8894647dd9a 100644 --- a/arch/x86/include/asm/prctl.h +++ b/arch/x86/include/asm/prctl.h @@ -6,5 +6,8 @@ #define ARCH_GET_FS 0x1003 #define ARCH_GET_GS 0x1004 +#ifdef CONFIG_X86_64 +extern long sys_arch_prctl(int, unsigned long); +#endif /* CONFIG_X86_64 */ #endif /* _ASM_X86_PRCTL_H */ diff --git a/arch/x86/include/asm/system.h b/arch/x86/include/asm/system.h index 2ed3f0f44ff7..59555f48bf4c 100644 --- a/arch/x86/include/asm/system.h +++ b/arch/x86/include/asm/system.h @@ -17,12 +17,12 @@ # define AT_VECTOR_SIZE_ARCH 1 #endif -#ifdef CONFIG_X86_32 - struct task_struct; /* one of the stranger aspects of C forward declarations */ struct task_struct *__switch_to(struct task_struct *prev, struct task_struct *next); +#ifdef CONFIG_X86_32 + /* * Saving eflags is important. It switches not only IOPL between tasks, * it also protects other tasks from NT leaking through sysenter etc. -- cgit v1.2.3 From 7b5b50f1be9e07714cfaa620d102c8daf3cdd814 Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Date: Mon, 15 Dec 2008 22:24:48 +0530 Subject: x86: signal.c declare do_notify_resume before they get used Impact: cleanup In asm/signal.h moved out do_notify_resume from __i386__ as it is common for both 32 and 64 bit.
Signed-off-by: Jaswinder Singh Signed-off-by: Ingo Molnar arch/x86/include/asm/signal.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) --- arch/x86/include/asm/signal.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/signal.h b/arch/x86/include/asm/signal.h index 96ac44f275da..7761a5d554bb 100644 --- a/arch/x86/include/asm/signal.h +++ b/arch/x86/include/asm/signal.h @@ -121,6 +121,10 @@ typedef unsigned long sigset_t; #ifndef __ASSEMBLY__ +# ifdef __KERNEL__ +extern void do_notify_resume(struct pt_regs *, void *, __u32); +# endif /* __KERNEL__ */ + #ifdef __i386__ # ifdef __KERNEL__ struct old_sigaction { @@ -141,8 +145,6 @@ struct k_sigaction { struct sigaction sa; }; -extern void do_notify_resume(struct pt_regs *, void *, __u32); - # else /* __KERNEL__ */ /* Here we must cater to libcs that poke about in kernel headers. */ -- cgit v1.2.3 From c0195b6da08c4ddd8c8ea830f6c3c40bc7f82071 Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Date: Mon, 15 Dec 2008 22:26:30 +0530 Subject: x86: ldt.c declare sys_modify_ldt before they get used Impact: cleanup In asm/syscalls.h moved out sys_modify_ldt from CONFIG_X86_32 as it is common for both 32 and 64 bit. 
Signed-off-by: Jaswinder Singh Signed-off-by: Ingo Molnar --- arch/x86/include/asm/syscalls.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/syscalls.h b/arch/x86/include/asm/syscalls.h index 87803da44010..75d4a6afc36f 100644 --- a/arch/x86/include/asm/syscalls.h +++ b/arch/x86/include/asm/syscalls.h @@ -19,6 +19,9 @@ /* kernel/ioport.c */ asmlinkage long sys_ioperm(unsigned long, unsigned long, int); +/* kernel/ldt.c */ +asmlinkage int sys_modify_ldt(int, void __user *, unsigned long); + /* X86_32 only */ #ifdef CONFIG_X86_32 /* kernel/process_32.c */ @@ -38,9 +41,6 @@ asmlinkage int sys_rt_sigreturn(unsigned long); /* kernel/ioport.c */ asmlinkage long sys_iopl(unsigned long); -/* kernel/ldt.c */ -asmlinkage int sys_modify_ldt(int, void __user *, unsigned long); - /* kernel/sys_i386_32.c */ asmlinkage long sys_mmap2(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long); -- cgit v1.2.3 From a9b43c7d9890066709609df849959009645c1a19 Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Date: Mon, 15 Dec 2008 23:11:10 +0530 Subject: x86: setup.c find_and_reserve_crashkernel should be static Impact: cleanup, reduce kernel size a bit Signed-off-by: Jaswinder Singh Signed-off-by: Ingo Molnar --- arch/x86/kernel/setup.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 9d5674f7b6cc..81f5d22747ae 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -448,6 +448,7 @@ static void __init reserve_early_setup_data(void) * @size: Size of the crashkernel memory to reserve. * Returns the base address on success, and -1ULL on failure. 
*/ +static unsigned long long __init find_and_reserve_crashkernel(unsigned long long size) { const unsigned long long alignment = 16<<20; /* 16M */ -- cgit v1.2.3 From a79b7a2a758c39315344f0d86b5adb21d90d786e Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 16 Dec 2008 12:17:25 -0800 Subject: x86: remove unused iommu_nr_pages Impact: cleanup, remove dead code The last usage was removed by the patch set culminating in | commit e3c449f526cebb8d287241c7e82faafd9709668b | Author: Joerg Roedel | Date: Wed Oct 15 22:02:11 2008 -0700 | | x86, AMD IOMMU: convert driver to generic iommu_num_pages function Signed-off-by: Ian Campbell Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Ingo Molnar --- arch/x86/include/asm/iommu.h | 2 -- arch/x86/kernel/pci-dma.c | 7 ------- 2 files changed, 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/iommu.h b/arch/x86/include/asm/iommu.h index 0b500c5b6446..35276ec5925b 100644 --- a/arch/x86/include/asm/iommu.h +++ b/arch/x86/include/asm/iommu.h @@ -7,8 +7,6 @@ extern struct dma_mapping_ops nommu_dma_ops; extern int force_iommu, no_iommu; extern int iommu_detected; -extern unsigned long iommu_nr_pages(unsigned long addr, unsigned long len); - /* 10 seconds */ #define DMAR_OPERATION_TIMEOUT ((cycles_t) tsc_khz*10*1000) diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index 192624820217..e150ad4f0ccc 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -125,13 +125,6 @@ void __init pci_iommu_alloc(void) pci_swiotlb_init(); } -unsigned long iommu_nr_pages(unsigned long addr, unsigned long len) -{ - unsigned long size = roundup((addr & ~PAGE_MASK) + len, PAGE_SIZE); - - return size >> PAGE_SHIFT; -} -EXPORT_SYMBOL(iommu_nr_pages); #endif void *dma_generic_alloc_coherent(struct device *dev, size_t size, -- cgit v1.2.3 From ecbf29cdb3990c83d90d0c4187c89fb2ce423367 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 16 Dec 2008 12:37:07 -0800 Subject: xen: 
clean up asm/xen/hypervisor.h Impact: cleanup hypervisor.h had accumulated a lot of crud, including lots of spurious #includes. Clean it all up, and go around fixing up everything else accordingly. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Ingo Molnar --- arch/x86/include/asm/xen/hypercall.h | 6 ++++++ arch/x86/include/asm/xen/hypervisor.h | 39 +++++++---------------------------- arch/x86/include/asm/xen/page.h | 5 +++++ arch/x86/xen/enlighten.c | 1 + drivers/xen/balloon.c | 4 +++- drivers/xen/features.c | 6 +++++- drivers/xen/grant-table.c | 1 + include/xen/interface/event_channel.h | 2 ++ 8 files changed, 31 insertions(+), 33 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/xen/hypercall.h b/arch/x86/include/asm/xen/hypercall.h index 3f6000d95fe2..5e79ca694326 100644 --- a/arch/x86/include/asm/xen/hypercall.h +++ b/arch/x86/include/asm/xen/hypercall.h @@ -33,8 +33,14 @@ #ifndef _ASM_X86_XEN_HYPERCALL_H #define _ASM_X86_XEN_HYPERCALL_H +#include +#include #include #include +#include + +#include +#include #include #include diff --git a/arch/x86/include/asm/xen/hypervisor.h b/arch/x86/include/asm/xen/hypervisor.h index a38d25ac87d2..81fbd735aec4 100644 --- a/arch/x86/include/asm/xen/hypervisor.h +++ b/arch/x86/include/asm/xen/hypervisor.h @@ -33,39 +33,10 @@ #ifndef _ASM_X86_XEN_HYPERVISOR_H #define _ASM_X86_XEN_HYPERVISOR_H -#include -#include - -#include -#include - -#include -#include -#include -#if defined(__i386__) -# ifdef CONFIG_X86_PAE -# include -# else -# include -# endif -#endif -#include - /* arch/i386/kernel/setup.c */ extern struct shared_info *HYPERVISOR_shared_info; extern struct start_info *xen_start_info; -/* arch/i386/mach-xen/evtchn.c */ -/* Force a proper event-channel callback from Xen. */ -extern void force_evtchn_callback(void); - -/* Turn jiffies into Xen system time. 
*/ -u64 jiffies_to_st(unsigned long jiffies); - - -#define MULTI_UVMFLAGS_INDEX 3 -#define MULTI_UVMDOMID_INDEX 4 - enum xen_domain_type { XEN_NATIVE, XEN_PV_DOMAIN, @@ -74,9 +45,15 @@ enum xen_domain_type { extern enum xen_domain_type xen_domain_type; +#ifdef CONFIG_XEN #define xen_domain() (xen_domain_type != XEN_NATIVE) -#define xen_pv_domain() (xen_domain_type == XEN_PV_DOMAIN) +#else +#define xen_domain() (0) +#endif + +#define xen_pv_domain() (xen_domain() && xen_domain_type == XEN_PV_DOMAIN) +#define xen_hvm_domain() (xen_domain() && xen_domain_type == XEN_HVM_DOMAIN) + #define xen_initial_domain() (xen_pv_domain() && xen_start_info->flags & SIF_INITDOMAIN) -#define xen_hvm_domain() (xen_domain_type == XEN_HVM_DOMAIN) #endif /* _ASM_X86_XEN_HYPERVISOR_H */ diff --git a/arch/x86/include/asm/xen/page.h b/arch/x86/include/asm/xen/page.h index bc628998a1b9..7ef617ef1df3 100644 --- a/arch/x86/include/asm/xen/page.h +++ b/arch/x86/include/asm/xen/page.h @@ -1,11 +1,16 @@ #ifndef _ASM_X86_XEN_PAGE_H #define _ASM_X86_XEN_PAGE_H +#include +#include +#include #include #include +#include #include +#include #include /* Xen machine address */ diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 86cd2f829683..bea215230b20 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -28,6 +28,7 @@ #include #include +#include #include #include #include diff --git a/drivers/xen/balloon.c b/drivers/xen/balloon.c index 526c191e84ea..8dc7109d61b7 100644 --- a/drivers/xen/balloon.c +++ b/drivers/xen/balloon.c @@ -44,13 +44,15 @@ #include #include -#include #include #include #include #include #include +#include +#include +#include #include #include #include diff --git a/drivers/xen/features.c b/drivers/xen/features.c index 0707714e40d6..99eda169c779 100644 --- a/drivers/xen/features.c +++ b/drivers/xen/features.c @@ -8,7 +8,11 @@ #include #include #include -#include + +#include + +#include +#include #include u8 xen_features[XENFEAT_NR_SUBMAPS * 32] 
__read_mostly; diff --git a/drivers/xen/grant-table.c b/drivers/xen/grant-table.c index 06592b9da83c..7d8f531fb8e8 100644 --- a/drivers/xen/grant-table.c +++ b/drivers/xen/grant-table.c @@ -40,6 +40,7 @@ #include #include #include +#include #include #include diff --git a/include/xen/interface/event_channel.h b/include/xen/interface/event_channel.h index 919b5bdcb2bd..2090881c3650 100644 --- a/include/xen/interface/event_channel.h +++ b/include/xen/interface/event_channel.h @@ -9,6 +9,8 @@ #ifndef __XEN_PUBLIC_EVENT_CHANNEL_H__ #define __XEN_PUBLIC_EVENT_CHANNEL_H__ +#include + typedef uint32_t evtchn_port_t; DEFINE_GUEST_HANDLE(evtchn_port_t); -- cgit v1.2.3 From 39c04b55240342d0742ac48538d3d8c71bfc0a94 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 16 Dec 2008 12:32:23 -0800 Subject: x86: make sure we really have an hpet mapping before using it Impact: prepare the hpet code for Xen dom0 booting When booting in Xen dom0, the hpet isn't really accessible, so make sure the mapping is non-NULL before use. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Ingo Molnar --- arch/x86/kernel/hpet.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index 84089dc8fd1d..a1f6ed5e1a05 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -834,10 +834,11 @@ static __init int hpet_late_init(void) hpet_address = force_hpet_address; hpet_enable(); - if (!hpet_virt_address) - return -ENODEV; } + if (!hpet_virt_address) + return -ENODEV; + hpet_reserve_platform_timers(hpet_readl(HPET_ID)); for_each_online_cpu(cpu) { -- cgit v1.2.3 From 8ae936690972dfcad73d0dde1095b9f32af5ee95 Mon Sep 17 00:00:00 2001 From: Hiroshi Shimamoto Date: Fri, 12 Dec 2008 15:52:26 -0800 Subject: x86: hardirq: use inc_irq_stat() in non-unified functions Impact: cleanup Replace incrementing irq stat with inc_irq_stat() in non-unified functions. 
Signed-off-by: Hiroshi Shimamoto Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/mcheck/mce_amd_64.c | 2 +- arch/x86/kernel/cpu/mcheck/mce_intel_64.c | 2 +- arch/x86/kernel/time_32.c | 2 +- arch/x86/kernel/time_64.c | 2 +- arch/x86/kernel/tlb_32.c | 2 +- arch/x86/kernel/tlb_64.c | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c index 5eb390a4b2e9..748c8f9e7a05 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c @@ -237,7 +237,7 @@ asmlinkage void mce_threshold_interrupt(void) } } out: - add_pda(irq_threshold_count, 1); + inc_irq_stat(irq_threshold_count); irq_exit(); } diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c index c17eaf5dd6dd..4b48f251fd39 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c @@ -26,7 +26,7 @@ asmlinkage void smp_thermal_interrupt(void) if (therm_throt_process(msr_val & 1)) mce_log_therm_throt_event(smp_processor_id(), msr_val); - add_pda(irq_thermal_count, 1); + inc_irq_stat(irq_thermal_count); irq_exit(); } diff --git a/arch/x86/kernel/time_32.c b/arch/x86/kernel/time_32.c index 77b400f06ea2..65309e4cb1c0 100644 --- a/arch/x86/kernel/time_32.c +++ b/arch/x86/kernel/time_32.c @@ -75,7 +75,7 @@ EXPORT_SYMBOL(profile_pc); irqreturn_t timer_interrupt(int irq, void *dev_id) { /* Keep nmi watchdog up to date */ - per_cpu(irq_stat, smp_processor_id()).irq0_irqs++; + inc_irq_stat(irq0_irqs); #ifdef CONFIG_X86_IO_APIC if (timer_ack) { diff --git a/arch/x86/kernel/time_64.c b/arch/x86/kernel/time_64.c index 418a095c5796..1749cacde8b9 100644 --- a/arch/x86/kernel/time_64.c +++ b/arch/x86/kernel/time_64.c @@ -51,7 +51,7 @@ EXPORT_SYMBOL(profile_pc); irqreturn_t timer_interrupt(int irq, void *dev_id) { - add_pda(irq0_irqs, 1); + inc_irq_stat(irq0_irqs); 
global_clock_event->event_handler(global_clock_event); diff --git a/arch/x86/kernel/tlb_32.c b/arch/x86/kernel/tlb_32.c index f4049f3513b6..f374f83fca42 100644 --- a/arch/x86/kernel/tlb_32.c +++ b/arch/x86/kernel/tlb_32.c @@ -119,7 +119,7 @@ void smp_invalidate_interrupt(struct pt_regs *regs) smp_mb__after_clear_bit(); out: put_cpu_no_resched(); - __get_cpu_var(irq_stat).irq_tlb_count++; + inc_irq_stat(irq_tlb_count); } void native_flush_tlb_others(const cpumask_t *cpumaskp, struct mm_struct *mm, diff --git a/arch/x86/kernel/tlb_64.c b/arch/x86/kernel/tlb_64.c index 8f919ca69494..29887d7081a9 100644 --- a/arch/x86/kernel/tlb_64.c +++ b/arch/x86/kernel/tlb_64.c @@ -154,7 +154,7 @@ asmlinkage void smp_invalidate_interrupt(struct pt_regs *regs) out: ack_APIC_irq(); cpu_clear(cpu, f->flush_cpumask); - add_pda(irq_tlb_count, 1); + inc_irq_stat(irq_tlb_count); } void native_flush_tlb_others(const cpumask_t *cpumaskp, struct mm_struct *mm, -- cgit v1.2.3 From fde9071167c4624281553b23232aa8b81e71c790 Mon Sep 17 00:00:00 2001 From: Zachary Amsden Date: Fri, 12 Dec 2008 11:26:35 -0800 Subject: x86: clean up dead code in vmi_32.c Impact: cleanup, remove dead debug code I ran across some old debugging code in vmi paravirt-ops code that was already dead, but still potentially useful. After reviewing recent changes to the way kernel page tables are allocated and initialized, and the lack of bugs caught by this debugging code, I've concluded it is now totally useless to have around, and it's already been #if 0'd for quite some time. There's no rush to get this in mainline, but it's also totally harmless, so I'll let the x86 maintainers decide where it should be tucked. I've been out of the mainstream dev loop for a couple months, so apologies if I haven't got any protocol changes in order. 
Remove mummified remains found in vmi_32.c Signed-off-by: Zachary Amsden Signed-off-by: Ingo Molnar --- arch/x86/kernel/vmi_32.c | 119 ----------------------------------------------- 1 file changed, 119 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/vmi_32.c b/arch/x86/kernel/vmi_32.c index 8b6c393ab9fd..8087e0cd877d 100644 --- a/arch/x86/kernel/vmi_32.c +++ b/arch/x86/kernel/vmi_32.c @@ -266,109 +266,6 @@ static void vmi_nop(void) { } -#ifdef CONFIG_DEBUG_PAGE_TYPE - -#ifdef CONFIG_X86_PAE -#define MAX_BOOT_PTS (2048+4+1) -#else -#define MAX_BOOT_PTS (1024+1) -#endif - -/* - * During boot, mem_map is not yet available in paging_init, so stash - * all the boot page allocations here. - */ -static struct { - u32 pfn; - int type; -} boot_page_allocations[MAX_BOOT_PTS]; -static int num_boot_page_allocations; -static int boot_allocations_applied; - -void vmi_apply_boot_page_allocations(void) -{ - int i; - BUG_ON(!mem_map); - for (i = 0; i < num_boot_page_allocations; i++) { - struct page *page = pfn_to_page(boot_page_allocations[i].pfn); - page->type = boot_page_allocations[i].type; - page->type = boot_page_allocations[i].type & - ~(VMI_PAGE_ZEROED | VMI_PAGE_CLONE); - } - boot_allocations_applied = 1; -} - -static void record_page_type(u32 pfn, int type) -{ - BUG_ON(num_boot_page_allocations >= MAX_BOOT_PTS); - boot_page_allocations[num_boot_page_allocations].pfn = pfn; - boot_page_allocations[num_boot_page_allocations].type = type; - num_boot_page_allocations++; -} - -static void check_zeroed_page(u32 pfn, int type, struct page *page) -{ - u32 *ptr; - int i; - int limit = PAGE_SIZE / sizeof(int); - - if (page_address(page)) - ptr = (u32 *)page_address(page); - else - ptr = (u32 *)__va(pfn << PAGE_SHIFT); - /* - * When cloning the root in non-PAE mode, only the userspace - * pdes need to be zeroed. 
- */ - if (type & VMI_PAGE_CLONE) - limit = KERNEL_PGD_BOUNDARY; - for (i = 0; i < limit; i++) - BUG_ON(ptr[i]); -} - -/* - * We stash the page type into struct page so we can verify the page - * types are used properly. - */ -static void vmi_set_page_type(u32 pfn, int type) -{ - /* PAE can have multiple roots per page - don't track */ - if (PTRS_PER_PMD > 1 && (type & VMI_PAGE_PDP)) - return; - - if (boot_allocations_applied) { - struct page *page = pfn_to_page(pfn); - if (type != VMI_PAGE_NORMAL) - BUG_ON(page->type); - else - BUG_ON(page->type == VMI_PAGE_NORMAL); - page->type = type & ~(VMI_PAGE_ZEROED | VMI_PAGE_CLONE); - if (type & VMI_PAGE_ZEROED) - check_zeroed_page(pfn, type, page); - } else { - record_page_type(pfn, type); - } -} - -static void vmi_check_page_type(u32 pfn, int type) -{ - /* PAE can have multiple roots per page - skip checks */ - if (PTRS_PER_PMD > 1 && (type & VMI_PAGE_PDP)) - return; - - type &= ~(VMI_PAGE_ZEROED | VMI_PAGE_CLONE); - if (boot_allocations_applied) { - struct page *page = pfn_to_page(pfn); - BUG_ON((page->type ^ type) & VMI_PAGE_PAE); - BUG_ON(type == VMI_PAGE_NORMAL && page->type); - BUG_ON((type & page->type) == 0); - } -} -#else -#define vmi_set_page_type(p,t) do { } while (0) -#define vmi_check_page_type(p,t) do { } while (0) -#endif - #ifdef CONFIG_HIGHPTE static void *vmi_kmap_atomic_pte(struct page *page, enum km_type type) { @@ -395,7 +292,6 @@ static void *vmi_kmap_atomic_pte(struct page *page, enum km_type type) static void vmi_allocate_pte(struct mm_struct *mm, unsigned long pfn) { - vmi_set_page_type(pfn, VMI_PAGE_L1); vmi_ops.allocate_page(pfn, VMI_PAGE_L1, 0, 0, 0); } @@ -406,27 +302,22 @@ static void vmi_allocate_pmd(struct mm_struct *mm, unsigned long pfn) * It is called only for swapper_pg_dir, which already has * data on it. 
*/ - vmi_set_page_type(pfn, VMI_PAGE_L2); vmi_ops.allocate_page(pfn, VMI_PAGE_L2, 0, 0, 0); } static void vmi_allocate_pmd_clone(unsigned long pfn, unsigned long clonepfn, unsigned long start, unsigned long count) { - vmi_set_page_type(pfn, VMI_PAGE_L2 | VMI_PAGE_CLONE); - vmi_check_page_type(clonepfn, VMI_PAGE_L2); vmi_ops.allocate_page(pfn, VMI_PAGE_L2 | VMI_PAGE_CLONE, clonepfn, start, count); } static void vmi_release_pte(unsigned long pfn) { vmi_ops.release_page(pfn, VMI_PAGE_L1); - vmi_set_page_type(pfn, VMI_PAGE_NORMAL); } static void vmi_release_pmd(unsigned long pfn) { vmi_ops.release_page(pfn, VMI_PAGE_L2); - vmi_set_page_type(pfn, VMI_PAGE_NORMAL); } /* @@ -450,26 +341,22 @@ static void vmi_release_pmd(unsigned long pfn) static void vmi_update_pte(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { - vmi_check_page_type(__pa(ptep) >> PAGE_SHIFT, VMI_PAGE_PTE); vmi_ops.update_pte(ptep, vmi_flags_addr(mm, addr, VMI_PAGE_PT, 0)); } static void vmi_update_pte_defer(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { - vmi_check_page_type(__pa(ptep) >> PAGE_SHIFT, VMI_PAGE_PTE); vmi_ops.update_pte(ptep, vmi_flags_addr_defer(mm, addr, VMI_PAGE_PT, 0)); } static void vmi_set_pte(pte_t *ptep, pte_t pte) { /* XXX because of set_pmd_pte, this can be called on PT or PD layers */ - vmi_check_page_type(__pa(ptep) >> PAGE_SHIFT, VMI_PAGE_PTE | VMI_PAGE_PD); vmi_ops.set_pte(pte, ptep, VMI_PAGE_PT); } static void vmi_set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte) { - vmi_check_page_type(__pa(ptep) >> PAGE_SHIFT, VMI_PAGE_PTE); vmi_ops.set_pte(pte, ptep, vmi_flags_addr(mm, addr, VMI_PAGE_PT, 0)); } @@ -477,10 +364,8 @@ static void vmi_set_pmd(pmd_t *pmdp, pmd_t pmdval) { #ifdef CONFIG_X86_PAE const pte_t pte = { .pte = pmdval.pmd }; - vmi_check_page_type(__pa(pmdp) >> PAGE_SHIFT, VMI_PAGE_PMD); #else const pte_t pte = { pmdval.pud.pgd.pgd }; - vmi_check_page_type(__pa(pmdp) >> PAGE_SHIFT, VMI_PAGE_PGD); #endif vmi_ops.set_pte(pte, 
(pte_t *)pmdp, VMI_PAGE_PD); } @@ -502,7 +387,6 @@ static void vmi_set_pte_atomic(pte_t *ptep, pte_t pteval) static void vmi_set_pte_present(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte) { - vmi_check_page_type(__pa(ptep) >> PAGE_SHIFT, VMI_PAGE_PTE); vmi_ops.set_pte(pte, ptep, vmi_flags_addr_defer(mm, addr, VMI_PAGE_PT, 1)); } @@ -510,21 +394,18 @@ static void vmi_set_pud(pud_t *pudp, pud_t pudval) { /* Um, eww */ const pte_t pte = { .pte = pudval.pgd.pgd }; - vmi_check_page_type(__pa(pudp) >> PAGE_SHIFT, VMI_PAGE_PGD); vmi_ops.set_pte(pte, (pte_t *)pudp, VMI_PAGE_PDP); } static void vmi_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { const pte_t pte = { .pte = 0 }; - vmi_check_page_type(__pa(ptep) >> PAGE_SHIFT, VMI_PAGE_PTE); vmi_ops.set_pte(pte, ptep, vmi_flags_addr(mm, addr, VMI_PAGE_PT, 0)); } static void vmi_pmd_clear(pmd_t *pmd) { const pte_t pte = { .pte = 0 }; - vmi_check_page_type(__pa(pmd) >> PAGE_SHIFT, VMI_PAGE_PMD); vmi_ops.set_pte(pte, (pte_t *)pmd, VMI_PAGE_PD); } #endif -- cgit v1.2.3 From 189f67c4408806563a1f061f5c8bf184a6658477 Mon Sep 17 00:00:00 2001 From: Jack Steiner Date: Fri, 12 Dec 2008 14:50:40 -0600 Subject: x86: UV fix for global physical addresses Impact: fix UV boot crash This fixes a UV bug related to generating global memory addresses on partitioned systems. Partitioned systems do not have physical memory at address 0. Instead, a chunk of high memory is remapped by the chipset so that it appears to be at address 0. This remapping is INVISIBLE to most of the OS. The only OS functions that need to be aware of the remapping are functions that directly interface to the chipset. The GRU is one example. Also, delete a couple of unused macros related to global memory addresses. 
Signed-off-by: Jack Steiner Signed-off-by: Ingo Molnar --- arch/x86/include/asm/uv/uv_hub.h | 16 ++-------------- arch/x86/kernel/genx2apic_uv_x.c | 3 +-- 2 files changed, 3 insertions(+), 16 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/uv/uv_hub.h b/arch/x86/include/asm/uv/uv_hub.h index 52aa943c634f..777327ef05c1 100644 --- a/arch/x86/include/asm/uv/uv_hub.h +++ b/arch/x86/include/asm/uv/uv_hub.h @@ -210,7 +210,7 @@ DECLARE_PER_CPU(struct uv_hub_info_s, __uv_hub_info); static inline unsigned long uv_soc_phys_ram_to_gpa(unsigned long paddr) { if (paddr < uv_hub_info->lowmem_remap_top) - paddr += uv_hub_info->lowmem_remap_base; + paddr |= uv_hub_info->lowmem_remap_base; return paddr | uv_hub_info->gnode_upper; } @@ -218,19 +218,7 @@ static inline unsigned long uv_soc_phys_ram_to_gpa(unsigned long paddr) /* socket virtual --> UV global physical address */ static inline unsigned long uv_gpa(void *v) { - return __pa(v) | uv_hub_info->gnode_upper; -} - -/* socket virtual --> UV global physical address */ -static inline void *uv_vgpa(void *v) -{ - return (void *)uv_gpa(v); -} - -/* UV global physical address --> socket virtual */ -static inline void *uv_va(unsigned long gpa) -{ - return __va(gpa & uv_hub_info->gpa_mask); + return uv_soc_phys_ram_to_gpa(__pa(v)); } /* pnode, offset --> socket virtual */ diff --git a/arch/x86/kernel/genx2apic_uv_x.c b/arch/x86/kernel/genx2apic_uv_x.c index 221299f4509f..dece17289731 100644 --- a/arch/x86/kernel/genx2apic_uv_x.c +++ b/arch/x86/kernel/genx2apic_uv_x.c @@ -540,8 +540,7 @@ void __init uv_system_init(void) uv_blade_info[blade].nr_possible_cpus++; uv_cpu_hub_info(cpu)->lowmem_remap_base = lowmem_redir_base; - uv_cpu_hub_info(cpu)->lowmem_remap_top = - lowmem_redir_base + lowmem_redir_size; + uv_cpu_hub_info(cpu)->lowmem_remap_top = lowmem_redir_size; uv_cpu_hub_info(cpu)->m_val = m_val; uv_cpu_hub_info(cpu)->n_val = m_val; uv_cpu_hub_info(cpu)->numa_blade_id = blade; -- cgit v1.2.3 From 
cf9b303e55da810255638c0b616b1a3f7eda9320 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Mon, 15 Dec 2008 23:33:10 +0100 Subject: x86: re-enable MCE on secondary CPUS after suspend/resume Impact: fix disabled MCE after resume Don't prevent multiple initialization of MCEs. Back from early prehistory mcheck_init() has a reentry check. Presumably that was needed in very old kernels to prevent it entering twice. But as Andreas points out this prevents CPU hotplug (and therefore resume) to correctly reinitialize MCEs when a AP boots again after being offlined. Just drop the check. Reported-by: Andreas Herrmann Signed-off-by: Andi Kleen Tested-by: Andreas Herrmann Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/mcheck/mce_64.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c index 4b031a4ac856..1c838032fd37 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_64.c @@ -510,12 +510,9 @@ static void __cpuinit mce_cpu_features(struct cpuinfo_x86 *c) */ void __cpuinit mcheck_init(struct cpuinfo_x86 *c) { - static cpumask_t mce_cpus = CPU_MASK_NONE; - mce_cpu_quirks(c); if (mce_dont_init || - cpu_test_and_set(smp_processor_id(), mce_cpus) || !mce_available(c)) return; -- cgit v1.2.3 From c8182f0016fb65a721c4fbe487909a2d56178135 Mon Sep 17 00:00:00 2001 From: Russ Anderson Date: Fri, 12 Dec 2008 11:07:00 -0600 Subject: sgi-xp: xpc needs to pass the physical address, not virtual Impact: fix crash xpc needs to pass the physical address, not virtual. Testing uncovered this problem. The virtual address happens to work most of the time due to the way bios was masking off the node bits. Passing the physical address makes it work all of the time. 
Signed-off-by: Russ Anderson Acked-by: Dean Nelson Signed-off-by: Ingo Molnar --- arch/x86/include/asm/uv/bios.h | 2 +- arch/x86/kernel/bios_uv.c | 4 +--- drivers/misc/sgi-xp/xpc_uv.c | 8 ++++---- 3 files changed, 6 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/uv/bios.h b/arch/x86/include/asm/uv/bios.h index da1c4e8e78fc..7ed17ff502b9 100644 --- a/arch/x86/include/asm/uv/bios.h +++ b/arch/x86/include/asm/uv/bios.h @@ -100,7 +100,7 @@ extern s64 uv_bios_call_reentrant(enum uv_bios_cmd, u64, u64, u64, u64, u64); extern s64 uv_bios_get_sn_info(int, int *, long *, long *, long *); extern s64 uv_bios_freq_base(u64, u64 *); -extern int uv_bios_mq_watchlist_alloc(int, void *, unsigned int, +extern int uv_bios_mq_watchlist_alloc(int, unsigned long, unsigned int, unsigned long *); extern int uv_bios_mq_watchlist_free(int, int); extern s64 uv_bios_change_memprotect(u64, u64, enum uv_memprotect); diff --git a/arch/x86/kernel/bios_uv.c b/arch/x86/kernel/bios_uv.c index d22d0f1bbea0..2a0a2a3cac26 100644 --- a/arch/x86/kernel/bios_uv.c +++ b/arch/x86/kernel/bios_uv.c @@ -101,15 +101,13 @@ s64 uv_bios_get_sn_info(int fc, int *uvtype, long *partid, long *coher, } int -uv_bios_mq_watchlist_alloc(int blade, void *mq, unsigned int mq_size, +uv_bios_mq_watchlist_alloc(int blade, unsigned long addr, unsigned int mq_size, unsigned long *intr_mmr_offset) { union uv_watchlist_u size_blade; - unsigned long addr; u64 watchlist; s64 ret; - addr = (unsigned long)mq; size_blade.size = mq_size; size_blade.blade = blade; diff --git a/drivers/misc/sgi-xp/xpc_uv.c b/drivers/misc/sgi-xp/xpc_uv.c index 684b2dd17583..91a55b1b1037 100644 --- a/drivers/misc/sgi-xp/xpc_uv.c +++ b/drivers/misc/sgi-xp/xpc_uv.c @@ -119,16 +119,16 @@ xpc_gru_mq_watchlist_alloc_uv(struct xpc_gru_mq_uv *mq) int ret; #if defined CONFIG_X86_64 - ret = uv_bios_mq_watchlist_alloc(mq->mmr_blade, mq->address, mq->order, - &mq->mmr_offset); + ret = uv_bios_mq_watchlist_alloc(mq->mmr_blade, 
uv_gpa(mq->address), + mq->order, &mq->mmr_offset); if (ret < 0) { dev_err(xpc_part, "uv_bios_mq_watchlist_alloc() failed, " "ret=%d\n", ret); return ret; } #elif defined CONFIG_IA64_GENERIC || defined CONFIG_IA64_SGI_UV - ret = sn_mq_watchlist_alloc(mq->mmr_blade, mq->address, mq->order, - &mq->mmr_offset); + ret = sn_mq_watchlist_alloc(mq->mmr_blade, uv_gpa(mq->address), + mq->order, &mq->mmr_offset); if (ret < 0) { dev_err(xpc_part, "sn_mq_watchlist_alloc() failed, ret=%d\n", ret); -- cgit v1.2.3 From ae417bb487e3bb88dc862b83b4bf00d87ba67ec8 Mon Sep 17 00:00:00 2001 From: Hiroshi Shimamoto Date: Tue, 16 Dec 2008 14:02:16 -0800 Subject: x86: signal: use signal_fault() in sys_sigreturn() Impact: cleanup Call signal_fault() in error route of sys_sigreturn(). Change log level to KERN_EMERG if current is init. Signed-off-by: Hiroshi Shimamoto Signed-off-by: Ingo Molnar --- arch/x86/kernel/signal.c | 15 +++------------ 1 file changed, 3 insertions(+), 12 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index b1cc6da64208..2725a294d734 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -594,17 +594,7 @@ asmlinkage unsigned long sys_sigreturn(unsigned long __unused) return ax; badframe: - if (show_unhandled_signals && printk_ratelimit()) { - printk("%s%s[%d] bad frame in sigreturn frame:" - "%p ip:%lx sp:%lx oeax:%lx", - task_pid_nr(current) > 1 ? KERN_INFO : KERN_EMERG, - current->comm, task_pid_nr(current), frame, regs->ip, - regs->sp, regs->orig_ax); - print_vma_addr(" in ", regs->ip); - printk(KERN_CONT "\n"); - } - - force_sig(SIGSEGV, current); + signal_fault(regs, frame, "sigreturn"); return 0; } @@ -901,8 +891,9 @@ void signal_fault(struct pt_regs *regs, void __user *frame, char *where) struct task_struct *me = current; if (show_unhandled_signals && printk_ratelimit()) { - printk(KERN_INFO + printk("%s" "%s[%d] bad frame in %s frame:%p ip:%lx sp:%lx orax:%lx", + task_pid_nr(current) > 1 ? 
KERN_INFO : KERN_EMERG, me->comm, me->pid, where, frame, regs->ip, regs->sp, regs->orig_ax); print_vma_addr(" in ", regs->ip); -- cgit v1.2.3 From d0b48ca189523b638d8674fa41e94d1950a17038 Mon Sep 17 00:00:00 2001 From: Hiroshi Shimamoto Date: Tue, 16 Dec 2008 14:03:36 -0800 Subject: x86: ia32_signal: use __put_user() instead of __copy_to_user() Impact: cleanup __put_user() can be used for constant size 8, like arch/x86/kernel/signal.c. Signed-off-by: Hiroshi Shimamoto Signed-off-by: Ingo Molnar --- arch/x86/ia32/ia32_signal.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c index 1267977e7708..e4f2a5045743 100644 --- a/arch/x86/ia32/ia32_signal.c +++ b/arch/x86/ia32/ia32_signal.c @@ -467,7 +467,7 @@ int ia32_setup_frame(int sig, struct k_sigaction *ka, * These are actually not used anymore, but left because some * gdb versions depend on them as a marker. */ - err |= __copy_to_user(frame->retcode, &code, 8); + err |= __put_user(*((u64 *)&code), (u64 *)frame->retcode); if (err) return -EFAULT; @@ -554,7 +554,7 @@ int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, * Not actually used anymore, but left because some gdb * versions need it. */ - err |= __copy_to_user(frame->retcode, &code, 8); + err |= __put_user(*((u64 *)&code), (u64 *)frame->retcode); if (err) return -EFAULT; -- cgit v1.2.3 From 8bee3f0a662ad9c3d6bb705b0530a3b90f089c55 Mon Sep 17 00:00:00 2001 From: Hiroshi Shimamoto Date: Tue, 16 Dec 2008 14:04:43 -0800 Subject: x86: ia32_signal: use proper macro __USER32_DS Impact: cleanup Use __USER32_DS instead of __USER_DS in ia32_signal.c. No impact, because __USER32_DS is defined __USER_DS. 
Signed-off-by: Hiroshi Shimamoto Signed-off-by: Ingo Molnar --- arch/x86/ia32/ia32_signal.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c index e4f2a5045743..9c99c429a20d 100644 --- a/arch/x86/ia32/ia32_signal.c +++ b/arch/x86/ia32/ia32_signal.c @@ -396,7 +396,7 @@ static void __user *get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, } /* This is the legacy signal stack switching. */ - else if ((regs->ss & 0xffff) != __USER_DS && + else if ((regs->ss & 0xffff) != __USER32_DS && !(ka->sa.sa_flags & SA_RESTORER) && ka->sa.sa_restorer) sp = (unsigned long) ka->sa.sa_restorer; -- cgit v1.2.3 From d680fe44775ed17a80035462d9898f5e77bfd7dd Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Sat, 13 Dec 2008 00:09:08 +0300 Subject: x86: entry_64 - introduce FTRACE_ frame macro v2 Impact: clean up Introduce MCOUNT_SAVE/RESTORE_FRAME which allow us to save a number of lines on source level. Also fix a comment in ftrace.h. 
Signed-off-by: Ingo Molnar --- arch/x86/include/asm/ftrace.h | 29 +++++++++++++++++++++- arch/x86/kernel/entry_64.S | 57 ++++++------------------------------------- 2 files changed, 35 insertions(+), 51 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/ftrace.h b/arch/x86/include/asm/ftrace.h index 7e61b4ceb9a4..b55b4a7fbefd 100644 --- a/arch/x86/include/asm/ftrace.h +++ b/arch/x86/include/asm/ftrace.h @@ -1,6 +1,33 @@ #ifndef _ASM_X86_FTRACE_H #define _ASM_X86_FTRACE_H +#ifdef __ASSEMBLY__ + + .macro MCOUNT_SAVE_FRAME + /* taken from glibc */ + subq $0x38, %rsp + movq %rax, (%rsp) + movq %rcx, 8(%rsp) + movq %rdx, 16(%rsp) + movq %rsi, 24(%rsp) + movq %rdi, 32(%rsp) + movq %r8, 40(%rsp) + movq %r9, 48(%rsp) + .endm + + .macro MCOUNT_RESTORE_FRAME + movq 48(%rsp), %r9 + movq 40(%rsp), %r8 + movq 32(%rsp), %rdi + movq 24(%rsp), %rsi + movq 16(%rsp), %rdx + movq 8(%rsp), %rcx + movq (%rsp), %rax + addq $0x38, %rsp + .endm + +#endif + #ifdef CONFIG_FUNCTION_TRACER #define MCOUNT_ADDR ((long)(mcount)) #define MCOUNT_INSN_SIZE 5 /* sizeof mcount call */ @@ -46,7 +73,7 @@ struct ftrace_ret_stack { /* * Primary handler of a function return. * It relays on ftrace_return_to_handler. 
- * Defined in entry32.S + * Defined in entry_32/64.S */ extern void return_to_handler(void); diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 54e0bbdccb99..303dd84d2a98 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -71,15 +71,7 @@ ENTRY(ftrace_caller) cmpl $0, function_trace_stop jne ftrace_stub - /* taken from glibc */ - subq $0x38, %rsp - movq %rax, (%rsp) - movq %rcx, 8(%rsp) - movq %rdx, 16(%rsp) - movq %rsi, 24(%rsp) - movq %rdi, 32(%rsp) - movq %r8, 40(%rsp) - movq %r9, 48(%rsp) + MCOUNT_SAVE_FRAME movq 0x38(%rsp), %rdi movq 8(%rbp), %rsi @@ -89,14 +81,7 @@ ENTRY(ftrace_caller) ftrace_call: call ftrace_stub - movq 48(%rsp), %r9 - movq 40(%rsp), %r8 - movq 32(%rsp), %rdi - movq 24(%rsp), %rsi - movq 16(%rsp), %rdx - movq 8(%rsp), %rcx - movq (%rsp), %rax - addq $0x38, %rsp + MCOUNT_RESTORE_FRAME #ifdef CONFIG_FUNCTION_GRAPH_TRACER .globl ftrace_graph_call @@ -130,15 +115,7 @@ ftrace_stub: retq trace: - /* taken from glibc */ - subq $0x38, %rsp - movq %rax, (%rsp) - movq %rcx, 8(%rsp) - movq %rdx, 16(%rsp) - movq %rsi, 24(%rsp) - movq %rdi, 32(%rsp) - movq %r8, 40(%rsp) - movq %r9, 48(%rsp) + MCOUNT_SAVE_FRAME movq 0x38(%rsp), %rdi movq 8(%rbp), %rsi @@ -146,14 +123,7 @@ trace: call *ftrace_trace_function - movq 48(%rsp), %r9 - movq 40(%rsp), %r8 - movq 32(%rsp), %rdi - movq 24(%rsp), %rsi - movq 16(%rsp), %rdx - movq 8(%rsp), %rcx - movq (%rsp), %rax - addq $0x38, %rsp + MCOUNT_RESTORE_FRAME jmp ftrace_stub END(mcount) @@ -165,14 +135,7 @@ ENTRY(ftrace_graph_caller) cmpl $0, function_trace_stop jne ftrace_stub - subq $0x38, %rsp - movq %rax, (%rsp) - movq %rcx, 8(%rsp) - movq %rdx, 16(%rsp) - movq %rsi, 24(%rsp) - movq %rdi, 32(%rsp) - movq %r8, 40(%rsp) - movq %r9, 48(%rsp) + MCOUNT_SAVE_FRAME leaq 8(%rbp), %rdi movq 0x38(%rsp), %rsi @@ -180,14 +143,8 @@ ENTRY(ftrace_graph_caller) call prepare_ftrace_return - movq 48(%rsp), %r9 - movq 40(%rsp), %r8 - movq 32(%rsp), %rdi - movq 24(%rsp), %rsi - movq 
16(%rsp), %rdx - movq 8(%rsp), %rcx - movq (%rsp), %rax - addq $0x38, %rsp + MCOUNT_RESTORE_FRAME + retq END(ftrace_graph_caller) -- cgit v1.2.3 From cf558d25e5c9f70fa0279c9b7b8b4aed7cae9bd4 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 17 Dec 2008 15:06:01 +0100 Subject: AMD IOMMU: set cmd buffer pointers to zero manually Impact: set cmd buffer head and tail pointers to zero in case nobody else did Signed-off-by: Joerg Roedel --- arch/x86/kernel/amd_iommu_init.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c index c90a15eba5c5..c6cc22815d35 100644 --- a/arch/x86/kernel/amd_iommu_init.c +++ b/arch/x86/kernel/amd_iommu_init.c @@ -427,6 +427,10 @@ static u8 * __init alloc_command_buffer(struct amd_iommu *iommu) memcpy_toio(iommu->mmio_base + MMIO_CMD_BUF_OFFSET, &entry, sizeof(entry)); + /* set head and tail to zero manually */ + writel(0x00, iommu->mmio_base + MMIO_CMD_HEAD_OFFSET); + writel(0x00, iommu->mmio_base + MMIO_CMD_TAIL_OFFSET); + iommu_feature_enable(iommu, CONTROL_CMDBUF_EN); return cmd_buf; -- cgit v1.2.3 From 84df81759590ad16b0024cf46b3423cca76b2e07 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 17 Dec 2008 16:36:44 +0100 Subject: AMD IOMMU: panic if completion wait loop fails Impact: prevents data corruption after a failed completion wait loop Signed-off-by: Joerg Roedel --- arch/x86/kernel/amd_iommu.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index a7b6dec6fc3f..0a60d60ed036 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -235,8 +235,9 @@ static int iommu_completion_wait(struct amd_iommu *iommu) status &= ~MMIO_STATUS_COM_WAIT_INT_MASK; writel(status, iommu->mmio_base + MMIO_STATUS_OFFSET); - if (unlikely((i == EXIT_LOOP_COUNT) && printk_ratelimit())) - printk(KERN_WARNING "AMD IOMMU: Completion 
wait loop failed\n"); + if (unlikely(i == EXIT_LOOP_COUNT)) + panic("AMD IOMMU: Completion wait loop failed\n"); + out: spin_unlock_irqrestore(&iommu->lock, flags); -- cgit v1.2.3 From 8ce7996009bab7b2d23e7af7ad831fed7eb6faa1 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 16 Dec 2008 12:17:35 -0800 Subject: x86: add swiotlb allocation functions Add x86-specific swiotlb allocation functions. These are purely default for the moment. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Ingo Molnar --- arch/x86/kernel/pci-swiotlb_64.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/pci-swiotlb_64.c b/arch/x86/kernel/pci-swiotlb_64.c index 3c539d111abb..f47a097a135b 100644 --- a/arch/x86/kernel/pci-swiotlb_64.c +++ b/arch/x86/kernel/pci-swiotlb_64.c @@ -3,6 +3,8 @@ #include #include #include +#include +#include #include #include @@ -11,6 +13,16 @@ int swiotlb __read_mostly; +void *swiotlb_alloc_boot(size_t size, unsigned long nslabs) +{ + return alloc_bootmem_low_pages(size); +} + +void *swiotlb_alloc(unsigned order, unsigned long nslabs) +{ + return (void *)__get_free_pages(GFP_DMA | __GFP_NOWARN, order); +} + static dma_addr_t swiotlb_map_single_phys(struct device *hwdev, phys_addr_t paddr, size_t size, int direction) -- cgit v1.2.3 From cfb80c9eae8c7ed8f2ee81090062d15ead51cbe8 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 16 Dec 2008 12:17:36 -0800 Subject: x86: unify pci iommu setup and allow swiotlb to compile for 32 bit swiotlb on 32 bit will be used by Xen domain 0 support. 
Signed-off-by: Ian Campbell Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Ingo Molnar --- arch/x86/include/asm/dma-mapping.h | 2 +- arch/x86/include/asm/pci.h | 2 ++ arch/x86/include/asm/pci_64.h | 1 - arch/x86/kernel/Makefile | 3 ++- arch/x86/kernel/pci-dma.c | 6 ++++-- arch/x86/kernel/pci-swiotlb_64.c | 2 ++ arch/x86/mm/init_32.c | 3 +++ 7 files changed, 14 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/dma-mapping.h b/arch/x86/include/asm/dma-mapping.h index 097794ff6b79..3b43a65894c4 100644 --- a/arch/x86/include/asm/dma-mapping.h +++ b/arch/x86/include/asm/dma-mapping.h @@ -65,7 +65,7 @@ static inline struct dma_mapping_ops *get_dma_ops(struct device *dev) return dma_ops; else return dev->archdata.dma_ops; -#endif /* _ASM_X86_DMA_MAPPING_H */ +#endif } /* Make sure we keep the same behaviour */ diff --git a/arch/x86/include/asm/pci.h b/arch/x86/include/asm/pci.h index 875b38edf193..50ac542c9382 100644 --- a/arch/x86/include/asm/pci.h +++ b/arch/x86/include/asm/pci.h @@ -82,6 +82,8 @@ static inline void pci_dma_burst_advice(struct pci_dev *pdev, static inline void early_quirks(void) { } #endif +extern void pci_iommu_alloc(void); + #endif /* __KERNEL__ */ #ifdef CONFIG_X86_32 diff --git a/arch/x86/include/asm/pci_64.h b/arch/x86/include/asm/pci_64.h index d02d936840a3..4da207982777 100644 --- a/arch/x86/include/asm/pci_64.h +++ b/arch/x86/include/asm/pci_64.h @@ -23,7 +23,6 @@ extern int (*pci_config_write)(int seg, int bus, int dev, int fn, int reg, int len, u32 value); extern void dma32_reserve_bootmem(void); -extern void pci_iommu_alloc(void); /* The PCI address space does equal the physical memory * address space. 
The networking and block device layers use diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index b62a7667828e..a9c656f2d661 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -105,6 +105,8 @@ microcode-$(CONFIG_MICROCODE_INTEL) += microcode_intel.o microcode-$(CONFIG_MICROCODE_AMD) += microcode_amd.o obj-$(CONFIG_MICROCODE) += microcode.o +obj-$(CONFIG_SWIOTLB) += pci-swiotlb_64.o # NB rename without _64 + ### # 64 bit specific files ifeq ($(CONFIG_X86_64),y) @@ -118,7 +120,6 @@ ifeq ($(CONFIG_X86_64),y) obj-$(CONFIG_GART_IOMMU) += pci-gart_64.o aperture_64.o obj-$(CONFIG_CALGARY_IOMMU) += pci-calgary_64.o tce_64.o obj-$(CONFIG_AMD_IOMMU) += amd_iommu_init.o amd_iommu.o - obj-$(CONFIG_SWIOTLB) += pci-swiotlb_64.o obj-$(CONFIG_PCI_MMCONFIG) += mmconf-fam10h_64.o endif diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index e150ad4f0ccc..00e07447a5bd 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -105,11 +105,15 @@ static void __init dma32_free_bootmem(void) dma32_bootmem_ptr = NULL; dma32_bootmem_size = 0; } +#endif void __init pci_iommu_alloc(void) { +#ifdef CONFIG_X86_64 /* free the range so iommu could get some range less than 4G */ dma32_free_bootmem(); +#endif + /* * The order of these functions is important for * fall-back/fail-over reasons @@ -125,8 +129,6 @@ void __init pci_iommu_alloc(void) pci_swiotlb_init(); } -#endif - void *dma_generic_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_addr, gfp_t flag) { diff --git a/arch/x86/kernel/pci-swiotlb_64.c b/arch/x86/kernel/pci-swiotlb_64.c index f47a097a135b..a991afea6700 100644 --- a/arch/x86/kernel/pci-swiotlb_64.c +++ b/arch/x86/kernel/pci-swiotlb_64.c @@ -62,8 +62,10 @@ struct dma_mapping_ops swiotlb_dma_ops = { void __init pci_swiotlb_init(void) { /* don't initialize swiotlb if iommu=off (no_iommu=1) */ +#ifdef CONFIG_X86_64 if (!iommu_detected && !no_iommu && max_pfn > MAX_DMA32_PFN) swiotlb = 1; +#endif if 
(swiotlb_force) swiotlb = 1; if (swiotlb) { diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index c483f4242079..2b4b14fc0c04 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include @@ -971,6 +972,8 @@ void __init mem_init(void) start_periodic_check_for_corruption(); + pci_iommu_alloc(); + #ifdef CONFIG_FLATMEM BUG_ON(!mem_map); #endif -- cgit v1.2.3 From 1d32251e846ccbcf9d2da041dffd1199f94b2a3b Mon Sep 17 00:00:00 2001 From: Ian Campbell Date: Tue, 16 Dec 2008 12:17:37 -0800 Subject: x86/swiotlb: add default phys<->bus conversion Xen will override these later on. Signed-off-by: Ian Campbell Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Ingo Molnar --- arch/x86/kernel/pci-swiotlb_64.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/pci-swiotlb_64.c b/arch/x86/kernel/pci-swiotlb_64.c index a991afea6700..93a8371f2c22 100644 --- a/arch/x86/kernel/pci-swiotlb_64.c +++ b/arch/x86/kernel/pci-swiotlb_64.c @@ -23,6 +23,16 @@ void *swiotlb_alloc(unsigned order, unsigned long nslabs) return (void *)__get_free_pages(GFP_DMA | __GFP_NOWARN, order); } +dma_addr_t swiotlb_phys_to_bus(phys_addr_t paddr) +{ + return paddr; +} + +phys_addr_t swiotlb_bus_to_phys(dma_addr_t baddr) +{ + return baddr; +} + static dma_addr_t swiotlb_map_single_phys(struct device *hwdev, phys_addr_t paddr, size_t size, int direction) -- cgit v1.2.3 From a08636690d06b2e36cfb4c2b3ee133a81c47e1e0 Mon Sep 17 00:00:00 2001 From: Ian Campbell Date: Tue, 16 Dec 2008 12:17:38 -0800 Subject: x86/swiotlb: add default swiotlb_arch_range_needs_mapping Xen will override these later on. 
Signed-off-by: Ian Campbell Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Ingo Molnar --- arch/x86/kernel/pci-swiotlb_64.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/pci-swiotlb_64.c b/arch/x86/kernel/pci-swiotlb_64.c index 93a8371f2c22..242c3440687f 100644 --- a/arch/x86/kernel/pci-swiotlb_64.c +++ b/arch/x86/kernel/pci-swiotlb_64.c @@ -33,6 +33,11 @@ phys_addr_t swiotlb_bus_to_phys(dma_addr_t baddr) return baddr; } +int __weak swiotlb_arch_range_needs_mapping(void *ptr, size_t size) +{ + return 0; +} + static dma_addr_t swiotlb_map_single_phys(struct device *hwdev, phys_addr_t paddr, size_t size, int direction) -- cgit v1.2.3 From f5223763a664da16771211f9d293e18cb242b246 Mon Sep 17 00:00:00 2001 From: Hiroshi Shimamoto Date: Wed, 17 Dec 2008 18:47:17 -0800 Subject: x86: signal: move ia32 func declarations into arch/x86/kernel/signal.c Impact: cleanup Move declarations of ia32_setup_rt_frame() and ia32_setup_frame() into arch/x86/kernel/signal.c. This is for future use of sigframe.h. 
Signed-off-by: Hiroshi Shimamoto Signed-off-by: Ingo Molnar --- arch/x86/kernel/sigframe.h | 5 ----- arch/x86/kernel/signal.c | 5 +++++ 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/sigframe.h b/arch/x86/kernel/sigframe.h index cc673aa55ce4..6dd7e2b70a4b 100644 --- a/arch/x86/kernel/sigframe.h +++ b/arch/x86/kernel/sigframe.h @@ -34,9 +34,4 @@ struct rt_sigframe { struct siginfo info; /* fp state follows here */ }; - -int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, - sigset_t *set, struct pt_regs *regs); -int ia32_setup_frame(int sig, struct k_sigaction *ka, - sigset_t *set, struct pt_regs *regs); #endif diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index 2725a294d734..848c2d64a289 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -671,6 +671,11 @@ static int signr_convert(int sig) #define is_ia32 0 #endif /* CONFIG_IA32_EMULATION */ +int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, + sigset_t *set, struct pt_regs *regs); +int ia32_setup_frame(int sig, struct k_sigaction *ka, + sigset_t *set, struct pt_regs *regs); + #endif /* CONFIG_X86_32 */ static int -- cgit v1.2.3 From a5c56eb36f999ae0ecac278e51fd1cf8feb16c2f Mon Sep 17 00:00:00 2001 From: Hiroshi Shimamoto Date: Wed, 17 Dec 2008 18:49:55 -0800 Subject: x86: signal: rename sigframe and rt_sigframe on 32-bit Impact: cleanup, prepare to move sigframe.h On 32-bit, rename struct sigframe to struct sigframe_ia32, struct rt_sigframe to struct rt_sigframe_ia32 and several structures. And add helper macros to access the above data in arch/x86/kernel/signal.c.
Signed-off-by: Hiroshi Shimamoto Signed-off-by: Ingo Molnar --- arch/x86/kernel/sigframe.h | 28 +++++++++++++++++----------- 1 file changed, 17 insertions(+), 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/sigframe.h b/arch/x86/kernel/sigframe.h index 6dd7e2b70a4b..6718ed04b05b 100644 --- a/arch/x86/kernel/sigframe.h +++ b/arch/x86/kernel/sigframe.h @@ -1,8 +1,14 @@ #ifdef CONFIG_X86_32 -struct sigframe { - char __user *pretcode; +#define sigframe_ia32 sigframe +#define rt_sigframe_ia32 rt_sigframe +#define sigcontext_ia32 sigcontext +#define _fpstate_ia32 _fpstate +#define ucontext_ia32 ucontext + +struct sigframe_ia32 { + u32 pretcode; int sig; - struct sigcontext sc; + struct sigcontext_ia32 sc; /* * fpstate is unused. fpstate is moved/allocated after * retcode[] below. This movement allows to have the FP state and the @@ -11,27 +17,27 @@ struct sigframe { * the offset of extramask[] in the sigframe and thus prevent any * legacy application accessing/modifying it. */ - struct _fpstate fpstate_unused; + struct _fpstate_ia32 fpstate_unused; unsigned long extramask[_NSIG_WORDS-1]; char retcode[8]; /* fp state follows here */ }; -struct rt_sigframe { - char __user *pretcode; +struct rt_sigframe_ia32 { + u32 pretcode; int sig; - struct siginfo __user *pinfo; - void __user *puc; + u32 pinfo; + u32 puc; struct siginfo info; - struct ucontext uc; + struct ucontext_ia32 uc; char retcode[8]; /* fp state follows here */ }; -#else +#else /* !CONFIG_X86_32 */ struct rt_sigframe { char __user *pretcode; struct ucontext uc; struct siginfo info; /* fp state follows here */ }; -#endif +#endif /* CONFIG_X86_32 */ -- cgit v1.2.3 From 41af86fad3c40646b9748279e3862781e937a5d2 Mon Sep 17 00:00:00 2001 From: Hiroshi Shimamoto Date: Wed, 17 Dec 2008 18:50:32 -0800 Subject: x86: signal: move sigframe.h to arch/x86/include/asm Impact: cleanup, move header file Move arch/x86/kernel/sigframe.h to arch/x86/include/asm/sigframe.h. 
It will be used in arch/x86/ia32/ia32_signal.c. Signed-off-by: Hiroshi Shimamoto Signed-off-by: Ingo Molnar --- arch/x86/include/asm/sigframe.h | 43 ++++++++++++++++++++++++++++++++++++++++ arch/x86/kernel/asm-offsets_32.c | 2 +- arch/x86/kernel/sigframe.h | 43 ---------------------------------------- arch/x86/kernel/signal.c | 2 +- 4 files changed, 45 insertions(+), 45 deletions(-) create mode 100644 arch/x86/include/asm/sigframe.h delete mode 100644 arch/x86/kernel/sigframe.h (limited to 'arch/x86') diff --git a/arch/x86/include/asm/sigframe.h b/arch/x86/include/asm/sigframe.h new file mode 100644 index 000000000000..6718ed04b05b --- /dev/null +++ b/arch/x86/include/asm/sigframe.h @@ -0,0 +1,43 @@ +#ifdef CONFIG_X86_32 +#define sigframe_ia32 sigframe +#define rt_sigframe_ia32 rt_sigframe +#define sigcontext_ia32 sigcontext +#define _fpstate_ia32 _fpstate +#define ucontext_ia32 ucontext + +struct sigframe_ia32 { + u32 pretcode; + int sig; + struct sigcontext_ia32 sc; + /* + * fpstate is unused. fpstate is moved/allocated after + * retcode[] below. This movement allows to have the FP state and the + * future state extensions (xsave) stay together. + * And at the same time retaining the unused fpstate, prevents changing + * the offset of extramask[] in the sigframe and thus prevent any + * legacy application accessing/modifying it. 
+ */ + struct _fpstate_ia32 fpstate_unused; + unsigned long extramask[_NSIG_WORDS-1]; + char retcode[8]; + /* fp state follows here */ +}; + +struct rt_sigframe_ia32 { + u32 pretcode; + int sig; + u32 pinfo; + u32 puc; + struct siginfo info; + struct ucontext_ia32 uc; + char retcode[8]; + /* fp state follows here */ +}; +#else /* !CONFIG_X86_32 */ +struct rt_sigframe { + char __user *pretcode; + struct ucontext uc; + struct siginfo info; + /* fp state follows here */ +}; +#endif /* CONFIG_X86_32 */ diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c index 6649d09ad88f..ee4df08feee6 100644 --- a/arch/x86/kernel/asm-offsets_32.c +++ b/arch/x86/kernel/asm-offsets_32.c @@ -11,7 +11,7 @@ #include #include #include -#include "sigframe.h" +#include #include #include #include diff --git a/arch/x86/kernel/sigframe.h b/arch/x86/kernel/sigframe.h deleted file mode 100644 index 6718ed04b05b..000000000000 --- a/arch/x86/kernel/sigframe.h +++ /dev/null @@ -1,43 +0,0 @@ -#ifdef CONFIG_X86_32 -#define sigframe_ia32 sigframe -#define rt_sigframe_ia32 rt_sigframe -#define sigcontext_ia32 sigcontext -#define _fpstate_ia32 _fpstate -#define ucontext_ia32 ucontext - -struct sigframe_ia32 { - u32 pretcode; - int sig; - struct sigcontext_ia32 sc; - /* - * fpstate is unused. fpstate is moved/allocated after - * retcode[] below. This movement allows to have the FP state and the - * future state extensions (xsave) stay together. - * And at the same time retaining the unused fpstate, prevents changing - * the offset of extramask[] in the sigframe and thus prevent any - * legacy application accessing/modifying it. 
- */ - struct _fpstate_ia32 fpstate_unused; - unsigned long extramask[_NSIG_WORDS-1]; - char retcode[8]; - /* fp state follows here */ -}; - -struct rt_sigframe_ia32 { - u32 pretcode; - int sig; - u32 pinfo; - u32 puc; - struct siginfo info; - struct ucontext_ia32 uc; - char retcode[8]; - /* fp state follows here */ -}; -#else /* !CONFIG_X86_32 */ -struct rt_sigframe { - char __user *pretcode; - struct ucontext uc; - struct siginfo info; - /* fp state follows here */ -}; -#endif /* CONFIG_X86_32 */ diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index 848c2d64a289..89bb7668041d 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -35,7 +35,7 @@ #include #include -#include "sigframe.h" +#include #define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP))) -- cgit v1.2.3 From c85c2ff877c9305f801f7d5b9e6382cb05a03d45 Mon Sep 17 00:00:00 2001 From: Hiroshi Shimamoto Date: Wed, 17 Dec 2008 18:51:08 -0800 Subject: x86: signal: prepare to include from ia32_signal.c Impact: cleanup, prepare to use from ia32_signal.c Make struct sigframe_ia32 and rt_sigframe_ia32 visible to ia32_signal.c. Signed-off-by: Hiroshi Shimamoto Signed-off-by: Ingo Molnar --- arch/x86/include/asm/sigframe.h | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/sigframe.h b/arch/x86/include/asm/sigframe.h index 6718ed04b05b..491a0878c3aa 100644 --- a/arch/x86/include/asm/sigframe.h +++ b/arch/x86/include/asm/sigframe.h @@ -4,7 +4,15 @@ #define sigcontext_ia32 sigcontext #define _fpstate_ia32 _fpstate #define ucontext_ia32 ucontext +#else /* !CONFIG_X86_32 */ + +#ifdef CONFIG_IA32_EMULATION +#include +#endif /* CONFIG_IA32_EMULATION */ + +#endif /* CONFIG_X86_32 */ +#if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION) struct sigframe_ia32 { u32 pretcode; int sig; @@ -18,7 +26,11 @@ struct sigframe_ia32 { * legacy application accessing/modifying it. 
*/ struct _fpstate_ia32 fpstate_unused; +#ifdef CONFIG_IA32_EMULATION + unsigned int extramask[_COMPAT_NSIG_WORDS-1]; +#else /* !CONFIG_IA32_EMULATION */ unsigned long extramask[_NSIG_WORDS-1]; +#endif /* CONFIG_IA32_EMULATION */ char retcode[8]; /* fp state follows here */ }; @@ -28,16 +40,22 @@ struct rt_sigframe_ia32 { int sig; u32 pinfo; u32 puc; +#ifdef CONFIG_IA32_EMULATION + compat_siginfo_t info; +#else /* !CONFIG_IA32_EMULATION */ struct siginfo info; +#endif /* CONFIG_IA32_EMULATION */ struct ucontext_ia32 uc; char retcode[8]; /* fp state follows here */ }; -#else /* !CONFIG_X86_32 */ +#endif /* defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION) */ + +#ifdef CONFIG_X86_64 struct rt_sigframe { char __user *pretcode; struct ucontext uc; struct siginfo info; /* fp state follows here */ }; -#endif /* CONFIG_X86_32 */ +#endif /* CONFIG_X86_64 */ -- cgit v1.2.3 From 3b0d29ee1c73b6b90bfddd10f7b8e86632b6b694 Mon Sep 17 00:00:00 2001 From: Hiroshi Shimamoto Date: Wed, 17 Dec 2008 18:51:46 -0800 Subject: x86: ia32_signal: rename struct sigframe and rt_sigframe Impact: cleanup, prepare to include sigframe.h Rename struct sigframe to struct sigframe_ia32 and struct rt_sigframe to struct rt_sigframe_ia32. Signed-off-by: Hiroshi Shimamoto Signed-off-by: Ingo Molnar --- arch/x86/ia32/ia32_signal.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c index 9c99c429a20d..334a4aa2e75b 100644 --- a/arch/x86/ia32/ia32_signal.c +++ b/arch/x86/ia32/ia32_signal.c @@ -174,7 +174,7 @@ asmlinkage long sys32_sigaltstack(const stack_ia32_t __user *uss_ptr, * Do a signal return; undo the signal stack. 
*/ -struct sigframe +struct sigframe_ia32 { u32 pretcode; int sig; @@ -185,7 +185,7 @@ struct sigframe /* fp state follows here */ }; -struct rt_sigframe +struct rt_sigframe_ia32 { u32 pretcode; int sig; @@ -271,7 +271,7 @@ static int ia32_restore_sigcontext(struct pt_regs *regs, asmlinkage long sys32_sigreturn(struct pt_regs *regs) { - struct sigframe __user *frame = (struct sigframe __user *)(regs->sp-8); + struct sigframe_ia32 __user *frame = (struct sigframe_ia32 __user *)(regs->sp-8); sigset_t set; unsigned int ax; @@ -301,12 +301,12 @@ badframe: asmlinkage long sys32_rt_sigreturn(struct pt_regs *regs) { - struct rt_sigframe __user *frame; + struct rt_sigframe_ia32 __user *frame; sigset_t set; unsigned int ax; struct pt_regs tregs; - frame = (struct rt_sigframe __user *)(regs->sp - 4); + frame = (struct rt_sigframe_ia32 __user *)(regs->sp - 4); if (!access_ok(VERIFY_READ, frame, sizeof(*frame))) goto badframe; @@ -418,7 +418,7 @@ static void __user *get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, int ia32_setup_frame(int sig, struct k_sigaction *ka, compat_sigset_t *set, struct pt_regs *regs) { - struct sigframe __user *frame; + struct sigframe_ia32 __user *frame; void __user *restorer; int err = 0; void __user *fpstate = NULL; @@ -497,7 +497,7 @@ int ia32_setup_frame(int sig, struct k_sigaction *ka, int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, compat_sigset_t *set, struct pt_regs *regs) { - struct rt_sigframe __user *frame; + struct rt_sigframe_ia32 __user *frame; void __user *restorer; int err = 0; void __user *fpstate = NULL; -- cgit v1.2.3 From d98f9d84422c393103dc7569dc8444bac628f7ac Mon Sep 17 00:00:00 2001 From: Hiroshi Shimamoto Date: Wed, 17 Dec 2008 18:52:45 -0800 Subject: x86: ia32_signal: use sigframe.h Impact: cleanup Use arch/x86/include/asm/sigframe.h instead of defining redundant structures. 
Signed-off-by: Hiroshi Shimamoto Signed-off-by: Ingo Molnar --- arch/x86/ia32/ia32_signal.c | 26 ++------------------------ 1 file changed, 2 insertions(+), 24 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c index 334a4aa2e75b..3b3878a63bc2 100644 --- a/arch/x86/ia32/ia32_signal.c +++ b/arch/x86/ia32/ia32_signal.c @@ -32,6 +32,8 @@ #include #include +#include + #define DEBUG_SIG 0 #define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP))) @@ -173,30 +175,6 @@ asmlinkage long sys32_sigaltstack(const stack_ia32_t __user *uss_ptr, /* * Do a signal return; undo the signal stack. */ - -struct sigframe_ia32 -{ - u32 pretcode; - int sig; - struct sigcontext_ia32 sc; - struct _fpstate_ia32 fpstate_unused; /* look at kernel/sigframe.h */ - unsigned int extramask[_COMPAT_NSIG_WORDS-1]; - char retcode[8]; - /* fp state follows here */ -}; - -struct rt_sigframe_ia32 -{ - u32 pretcode; - int sig; - u32 pinfo; - u32 puc; - compat_siginfo_t info; - struct ucontext_ia32 uc; - char retcode[8]; - /* fp state follows here */ -}; - #define COPY(x) { \ err |= __get_user(regs->x, &sc->x); \ } -- cgit v1.2.3 From 55aab5f49e384a361668d112eefdb33e90779af9 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Wed, 17 Dec 2008 12:52:34 -0700 Subject: x86 gart: don't complain if no AMD GART found Impact: remove annoying bootup printk It's perfectly normal for no AMD GART to be present, e.g., if you have Intel CPUs. None of the other iommu_init() functions makes noise when it finds nothing. 
Signed-off-by: Bjorn Helgaas Acked-by: Joerg Roedel Signed-off-by: Ingo Molnar --- arch/x86/kernel/pci-gart_64.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c index ba7ad83e20a8..a35eaa379ff6 100644 --- a/arch/x86/kernel/pci-gart_64.c +++ b/arch/x86/kernel/pci-gart_64.c @@ -745,10 +745,8 @@ void __init gart_iommu_init(void) unsigned long scratch; long i; - if (cache_k8_northbridges() < 0 || num_k8_northbridges == 0) { - printk(KERN_INFO "PCI-GART: No AMD GART found.\n"); + if (cache_k8_northbridges() < 0 || num_k8_northbridges == 0) return; - } #ifndef CONFIG_AGP_AMD64 no_agp = 1; -- cgit v1.2.3 From 57a37505d19f4dfeee26f0fd7ea38ed6f1d10cbe Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Date: Wed, 17 Dec 2008 23:17:21 +0530 Subject: x86: time_64.c timer_interrupt() should be static Impact: cleanup, reduce kernel size a bit Signed-off-by: Jaswinder Singh Signed-off-by: Ingo Molnar --- arch/x86/kernel/time_64.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/time_64.c b/arch/x86/kernel/time_64.c index cb19d650c216..083a4a5bb00f 100644 --- a/arch/x86/kernel/time_64.c +++ b/arch/x86/kernel/time_64.c @@ -49,7 +49,7 @@ unsigned long profile_pc(struct pt_regs *regs) } EXPORT_SYMBOL(profile_pc); -irqreturn_t timer_interrupt(int irq, void *dev_id) +static irqreturn_t timer_interrupt(int irq, void *dev_id) { add_pda(irq0_irqs, 1); -- cgit v1.2.3 From 7c9c160c54fc545efc23881344593868e5f717bd Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Date: Wed, 17 Dec 2008 23:18:52 +0530 Subject: x86: tls.c declare sys_set_thread_area and sys_get_thread_area before they get used Impact: cleanup In asm/syscalls.h move out sys_set_thread_area() and sys_get_thread_area() as they are common for both 32 and 64 bit. 
Signed-off-by: Jaswinder Singh Signed-off-by: Ingo Molnar --- arch/x86/include/asm/syscalls.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/syscalls.h b/arch/x86/include/asm/syscalls.h index 75d4a6afc36f..c0b0bda754ee 100644 --- a/arch/x86/include/asm/syscalls.h +++ b/arch/x86/include/asm/syscalls.h @@ -22,6 +22,10 @@ asmlinkage long sys_ioperm(unsigned long, unsigned long, int); /* kernel/ldt.c */ asmlinkage int sys_modify_ldt(int, void __user *, unsigned long); +/* kernel/tls.c */ +asmlinkage int sys_set_thread_area(struct user_desc __user *); +asmlinkage int sys_get_thread_area(struct user_desc __user *); + /* X86_32 only */ #ifdef CONFIG_X86_32 /* kernel/process_32.c */ @@ -54,10 +58,6 @@ asmlinkage int sys_uname(struct old_utsname __user *); struct oldold_utsname; asmlinkage int sys_olduname(struct oldold_utsname __user *); -/* kernel/tls.c */ -asmlinkage int sys_set_thread_area(struct user_desc __user *); -asmlinkage int sys_get_thread_area(struct user_desc __user *); - /* kernel/vm86_32.c */ asmlinkage int sys_vm86old(struct pt_regs); asmlinkage int sys_vm86(struct pt_regs); -- cgit v1.2.3 From f269b07e862c395d6981ab2c05d6bc34b0249e90 Mon Sep 17 00:00:00 2001 From: Vegard Nossum Date: Thu, 18 Dec 2008 18:35:06 +0100 Subject: x86: revert CONFIG_RELOCATABLE=y defconfig change This commit: commit 5cb04df8d3f03e37a19f2502591a84156be71772 Author: Ingo Molnar Date: Sun May 4 19:49:04 2008 +0200 x86: defconfig updates changed CONFIG_RELOCATABLE from n to y, which may lead to a mismatch between the vmlinux debug information and the runtime location of the kernel, even when the bootloader does not relocate the kernel. Revert the specific change. Works for me with GRUB and qemu. 
Reference: http://lkml.org/lkml/2008/11/25/243 Signed-off-by: Vegard Nossum Signed-off-by: Ingo Molnar --- arch/x86/configs/i386_defconfig | 2 +- arch/x86/configs/x86_64_defconfig | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/configs/i386_defconfig b/arch/x86/configs/i386_defconfig index 71fc39c70782..b30a08ed8eb4 100644 --- a/arch/x86/configs/i386_defconfig +++ b/arch/x86/configs/i386_defconfig @@ -298,7 +298,7 @@ CONFIG_KEXEC=y CONFIG_CRASH_DUMP=y # CONFIG_KEXEC_JUMP is not set CONFIG_PHYSICAL_START=0x1000000 -CONFIG_RELOCATABLE=y +# CONFIG_RELOCATABLE is not set CONFIG_PHYSICAL_ALIGN=0x200000 CONFIG_HOTPLUG_CPU=y # CONFIG_COMPAT_VDSO is not set diff --git a/arch/x86/configs/x86_64_defconfig b/arch/x86/configs/x86_64_defconfig index b38bbabc1706..0e7dbc0a3e46 100644 --- a/arch/x86/configs/x86_64_defconfig +++ b/arch/x86/configs/x86_64_defconfig @@ -298,7 +298,7 @@ CONFIG_SCHED_HRTICK=y CONFIG_KEXEC=y CONFIG_CRASH_DUMP=y CONFIG_PHYSICAL_START=0x1000000 -CONFIG_RELOCATABLE=y +# CONFIG_RELOCATABLE is not set CONFIG_PHYSICAL_ALIGN=0x200000 CONFIG_HOTPLUG_CPU=y # CONFIG_COMPAT_VDSO is not set -- cgit v1.2.3 From 5c2628e8b4f670d0954053444289e2b018be957a Mon Sep 17 00:00:00 2001 From: Hiroshi Shimamoto Date: Thu, 18 Dec 2008 09:18:35 -0800 Subject: x86: sigframe.h: add guard macro Impact: cleanup Add missing guard macro _ASM_X86_SIGFRAME_H. 
Signed-off-by: Hiroshi Shimamoto Signed-off-by: Ingo Molnar --- arch/x86/include/asm/sigframe.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/sigframe.h b/arch/x86/include/asm/sigframe.h index 491a0878c3aa..3bd0f4276000 100644 --- a/arch/x86/include/asm/sigframe.h +++ b/arch/x86/include/asm/sigframe.h @@ -1,3 +1,6 @@ +#ifndef _ASM_X86_SIGFRAME_H +#define _ASM_X86_SIGFRAME_H + #ifdef CONFIG_X86_32 #define sigframe_ia32 sigframe #define rt_sigframe_ia32 rt_sigframe @@ -59,3 +62,5 @@ struct rt_sigframe { /* fp state follows here */ }; #endif /* CONFIG_X86_64 */ + +#endif /* _ASM_X86_SIGFRAME_H */ -- cgit v1.2.3 From f0bc2202e0373eb8e9b1ddbec930e2e681357db8 Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Date: Wed, 17 Dec 2008 23:20:05 +0530 Subject: x86: process.c declare c1e_remove_cpu before they get used Impact: cleanup, avoid sparse warning Included asm/idle.h for c1e_remove_cpu() declaration. Fixes this sparse warning: CHECK arch/x86/kernel/process.c arch/x86/kernel/process.c:284:6: warning: symbol 'c1e_remove_cpu' was not declared. Should it be static? Signed-off-by: Jaswinder Singh Signed-off-by: Ingo Molnar --- arch/x86/kernel/process.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index c622772744d8..b06100f1d612 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -1,6 +1,7 @@ #include #include #include +#include #include #include #include -- cgit v1.2.3 From 5899329b19100c0b82dc78e9b21ed8b920c9ffb3 Mon Sep 17 00:00:00 2001 From: "venkatesh.pallipadi@intel.com" Date: Thu, 18 Dec 2008 11:41:30 -0800 Subject: x86: PAT: implement track/untrack of pfnmap regions for x86 - v3 Impact: New mm functionality. Hookup remap_pfn_range and vm_insert_pfn and corresponding copy and free routines with reserve and free tracking. reserve and free here only takes care of non RAM region mapping. 
For RAM region, driver should use set_memory_[uc|wc|wb] to set the cache type and then setup the mapping for user pte. We can bypass below reserve/free in that case. Signed-off-by: Venkatesh Pallipadi Signed-off-by: Suresh Siddha Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/pgtable.h | 10 ++ arch/x86/mm/pat.c | 236 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 246 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index c012f3b11671..7dcd94c29044 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -219,6 +219,11 @@ static inline unsigned long pte_pfn(pte_t pte) return (pte_val(pte) & PTE_PFN_MASK) >> PAGE_SHIFT; } +static inline u64 pte_pa(pte_t pte) +{ + return pte_val(pte) & PTE_PFN_MASK; +} + #define pte_page(pte) pfn_to_page(pte_pfn(pte)) static inline int pmd_large(pmd_t pte) @@ -328,6 +333,11 @@ static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot) #define canon_pgprot(p) __pgprot(pgprot_val(p) & __supported_pte_mask) +/* Indicate that x86 has its own track and untrack pfn vma functions */ +#define track_pfn_vma_new track_pfn_vma_new +#define track_pfn_vma_copy track_pfn_vma_copy +#define untrack_pfn_vma untrack_pfn_vma + #ifndef __ASSEMBLY__ #define __HAVE_PHYS_MEM_ACCESS_PROT struct file; diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index eb1bf000d12e..1069ffecf77d 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -596,6 +596,242 @@ void unmap_devmem(unsigned long pfn, unsigned long size, pgprot_t vma_prot) free_memtype(addr, addr + size); } +/* + * Internal interface to reserve a range of physical memory with prot. + * Reserved non RAM regions only and after successful reserve_memtype, + * this func also keeps identity mapping (if any) in sync with this new prot. 
+ */ +static int reserve_pfn_range(u64 paddr, unsigned long size, pgprot_t vma_prot) +{ + int is_ram = 0; + int id_sz, ret; + unsigned long flags; + unsigned long want_flags = (pgprot_val(vma_prot) & _PAGE_CACHE_MASK); + + is_ram = pagerange_is_ram(paddr, paddr + size); + + if (is_ram != 0) { + /* + * For mapping RAM pages, drivers need to call + * set_memory_[uc|wc|wb] directly, for reserve and free, before + * setting up the PTE. + */ + WARN_ON_ONCE(1); + return 0; + } + + ret = reserve_memtype(paddr, paddr + size, want_flags, &flags); + if (ret) + return ret; + + if (flags != want_flags) { + free_memtype(paddr, paddr + size); + printk(KERN_ERR + "%s:%d map pfn expected mapping type %s for %Lx-%Lx, got %s\n", + current->comm, current->pid, + cattr_name(want_flags), + (unsigned long long)paddr, + (unsigned long long)(paddr + size), + cattr_name(flags)); + return -EINVAL; + } + + /* Need to keep identity mapping in sync */ + if (paddr >= __pa(high_memory)) + return 0; + + id_sz = (__pa(high_memory) < paddr + size) ? + __pa(high_memory) - paddr : + size; + + if (ioremap_change_attr((unsigned long)__va(paddr), id_sz, flags) < 0) { + free_memtype(paddr, paddr + size); + printk(KERN_ERR + "%s:%d reserve_pfn_range ioremap_change_attr failed %s " + "for %Lx-%Lx\n", + current->comm, current->pid, + cattr_name(flags), + (unsigned long long)paddr, + (unsigned long long)(paddr + size)); + return -EINVAL; + } + return 0; +} + +/* + * Internal interface to free a range of physical memory. + * Frees non RAM regions only. + */ +static void free_pfn_range(u64 paddr, unsigned long size) +{ + int is_ram; + + is_ram = pagerange_is_ram(paddr, paddr + size); + if (is_ram == 0) + free_memtype(paddr, paddr + size); +} + +/* + * track_pfn_vma_copy is called when vma that is covering the pfnmap gets + * copied through copy_page_range(). 
+ * + * If the vma has a linear pfn mapping for the entire range, we get the prot + * from pte and reserve the entire vma range with single reserve_pfn_range call. + * Otherwise, we reserve the entire vma range, by going through the PTEs page + * by page to get physical address and protection. + */ +int track_pfn_vma_copy(struct vm_area_struct *vma) +{ + int retval = 0; + unsigned long i, j; + u64 paddr; + pgprot_t prot; + pte_t pte; + unsigned long vma_start = vma->vm_start; + unsigned long vma_end = vma->vm_end; + unsigned long vma_size = vma_end - vma_start; + + if (!pat_enabled) + return 0; + + if (is_linear_pfn_mapping(vma)) { + /* + * reserve the whole chunk starting from vm_pgoff, + * But, we have to get the protection from pte. + */ + if (follow_pfnmap_pte(vma, vma_start, &pte)) { + WARN_ON_ONCE(1); + return -1; + } + prot = pte_pgprot(pte); + paddr = (u64)vma->vm_pgoff << PAGE_SHIFT; + return reserve_pfn_range(paddr, vma_size, prot); + } + + /* reserve entire vma page by page, using pfn and prot from pte */ + for (i = 0; i < vma_size; i += PAGE_SIZE) { + if (follow_pfnmap_pte(vma, vma_start + i, &pte)) + continue; + + paddr = pte_pa(pte); + prot = pte_pgprot(pte); + retval = reserve_pfn_range(paddr, PAGE_SIZE, prot); + if (retval) + goto cleanup_ret; + } + return 0; + +cleanup_ret: + /* Reserve error: Cleanup partial reservation and return error */ + for (j = 0; j < i; j += PAGE_SIZE) { + if (follow_pfnmap_pte(vma, vma_start + j, &pte)) + continue; + + paddr = pte_pa(pte); + free_pfn_range(paddr, PAGE_SIZE); + } + + return retval; +} + +/* + * track_pfn_vma_new is called when a _new_ pfn mapping is being established + * for physical range indicated by pfn and size. + * + * prot is passed in as a parameter for the new mapping. If the vma has a + * linear pfn mapping for the entire range reserve the entire vma range with + * single reserve_pfn_range call. + * Otherwise, we look at the pfn and size and reserve only the specified range + * page by page.
+ * + * Note that this function can be called with caller trying to map only a + * subrange/page inside the vma. + */ +int track_pfn_vma_new(struct vm_area_struct *vma, pgprot_t prot, + unsigned long pfn, unsigned long size) +{ + int retval = 0; + unsigned long i, j; + u64 base_paddr; + u64 paddr; + unsigned long vma_start = vma->vm_start; + unsigned long vma_end = vma->vm_end; + unsigned long vma_size = vma_end - vma_start; + + if (!pat_enabled) + return 0; + + if (is_linear_pfn_mapping(vma)) { + /* reserve the whole chunk starting from vm_pgoff */ + paddr = (u64)vma->vm_pgoff << PAGE_SHIFT; + return reserve_pfn_range(paddr, vma_size, prot); + } + + /* reserve page by page using pfn and size */ + base_paddr = (u64)pfn << PAGE_SHIFT; + for (i = 0; i < size; i += PAGE_SIZE) { + paddr = base_paddr + i; + retval = reserve_pfn_range(paddr, PAGE_SIZE, prot); + if (retval) + goto cleanup_ret; + } + return 0; + +cleanup_ret: + /* Reserve error: Cleanup partial reservation and return error */ + for (j = 0; j < i; j += PAGE_SIZE) { + paddr = base_paddr + j; + free_pfn_range(paddr, PAGE_SIZE); + } + + return retval; +} + +/* + * untrack_pfn_vma is called while unmapping a pfnmap for a region. + * untrack can be called for a specific region indicated by pfn and size or + * can be for the entire vma (in which case size can be zero). 
+ */ +void untrack_pfn_vma(struct vm_area_struct *vma, unsigned long pfn, + unsigned long size) +{ + unsigned long i; + u64 paddr; + unsigned long vma_start = vma->vm_start; + unsigned long vma_end = vma->vm_end; + unsigned long vma_size = vma_end - vma_start; + + if (!pat_enabled) + return; + + if (is_linear_pfn_mapping(vma)) { + /* free the whole chunk starting from vm_pgoff */ + paddr = (u64)vma->vm_pgoff << PAGE_SHIFT; + free_pfn_range(paddr, vma_size); + return; + } + + if (size != 0 && size != vma_size) { + /* free page by page, using pfn and size */ + paddr = (u64)pfn << PAGE_SHIFT; + for (i = 0; i < size; i += PAGE_SIZE) { + paddr = paddr + i; + free_pfn_range(paddr, PAGE_SIZE); + } + } else { + /* free entire vma, page by page, using the pfn from pte */ + for (i = 0; i < vma_size; i += PAGE_SIZE) { + pte_t pte; + + if (follow_pfnmap_pte(vma, vma_start + i, &pte)) + continue; + + paddr = pte_pa(pte); + free_pfn_range(paddr, PAGE_SIZE); + } + } +} + #if defined(CONFIG_DEBUG_FS) && defined(CONFIG_X86_PAT) /* get Nth element of the linked list */ -- cgit v1.2.3 From 8a7b12f70fb135a1b1d865687de3edcdc780f6d1 Mon Sep 17 00:00:00 2001 From: "venkatesh.pallipadi@intel.com" Date: Thu, 18 Dec 2008 11:41:31 -0800 Subject: x86: PAT: change pgprot_noncached to uc_minus instead of strong uc - v3 Impact: mm behavior change. Make pgprot_noncached uc_minus instead of strong UC. This will make pgprot_noncached to be in line with ioremap_nocache() and all the other APIs that map page uc_minus on uc request. Signed-off-by: Venkatesh Pallipadi Signed-off-by: Suresh Siddha Signed-off-by: H. 
Peter Anvin --- arch/x86/include/asm/pgtable.h | 8 ++++++++ arch/x86/include/asm/pgtable_32.h | 9 --------- arch/x86/include/asm/pgtable_64.h | 6 ------ 3 files changed, 8 insertions(+), 15 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 7dcd94c29044..6968d4f6be3e 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -158,6 +158,14 @@ #define PGD_IDENT_ATTR 0x001 /* PRESENT (no other attributes) */ #endif +/* + * Macro to mark a page protection value as UC- + */ +#define pgprot_noncached(prot) \ + ((boot_cpu_data.x86 > 3) \ + ? (__pgprot(pgprot_val(prot) | _PAGE_CACHE_UC_MINUS)) \ + : (prot)) + #ifndef __ASSEMBLY__ /* diff --git a/arch/x86/include/asm/pgtable_32.h b/arch/x86/include/asm/pgtable_32.h index f9d5889b336b..72b020deb46b 100644 --- a/arch/x86/include/asm/pgtable_32.h +++ b/arch/x86/include/asm/pgtable_32.h @@ -100,15 +100,6 @@ extern unsigned long pg0[]; # include #endif -/* - * Macro to mark a page protection value as "uncacheable". - * On processors which do not support it, this is a no-op. - */ -#define pgprot_noncached(prot) \ - ((boot_cpu_data.x86 > 3) \ - ? (__pgprot(pgprot_val(prot) | _PAGE_PCD | _PAGE_PWT)) \ - : (prot)) - /* * Conversion functions: convert a page and protection to a page entry, * and a page entry and page directory to the page they refer to. diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h index 545a0e042bb2..4798a4033e3a 100644 --- a/arch/x86/include/asm/pgtable_64.h +++ b/arch/x86/include/asm/pgtable_64.h @@ -176,12 +176,6 @@ static inline int pmd_bad(pmd_t pmd) #define pages_to_mb(x) ((x) >> (20 - PAGE_SHIFT)) /* FIXME: is this right? */ -/* - * Macro to mark a page protection value as "uncacheable". 
- */ -#define pgprot_noncached(prot) \ - (__pgprot(pgprot_val((prot)) | _PAGE_PCD | _PAGE_PWT)) - /* * Conversion functions: convert a page and protection to a page entry, * and a page entry and page directory to the page they refer to. -- cgit v1.2.3 From 2520bd3123c00272f818a176c92d03c7d0a113d6 Mon Sep 17 00:00:00 2001 From: "venkatesh.pallipadi@intel.com" Date: Thu, 18 Dec 2008 11:41:32 -0800 Subject: x86: PAT: add pgprot_writecombine() interface for drivers - v3 Impact: New mm functionality. Add pgprot_writecombine. pgprot_writecombine will be aliased to pgprot_noncached when not supported by the architecture. Signed-off-by: Venkatesh Pallipadi Signed-off-by: Suresh Siddha Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/pgtable.h | 3 +++ arch/x86/mm/pat.c | 8 ++++++++ include/asm-generic/pgtable.h | 4 ++++ 3 files changed, 15 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 6968d4f6be3e..579f8ceee948 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -168,6 +168,9 @@ #ifndef __ASSEMBLY__ +#define pgprot_writecombine pgprot_writecombine +extern pgprot_t pgprot_writecombine(pgprot_t prot); + /* * ZERO_PAGE is a global shared page that is always zero: used * for zero-mapped memory areas etc.. 
diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index 1069ffecf77d..d5254bae84f4 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -832,6 +832,14 @@ void untrack_pfn_vma(struct vm_area_struct *vma, unsigned long pfn, } } +pgprot_t pgprot_writecombine(pgprot_t prot) +{ + if (pat_enabled) + return __pgprot(pgprot_val(prot) | _PAGE_CACHE_WC); + else + return pgprot_noncached(prot); +} + #if defined(CONFIG_DEBUG_FS) && defined(CONFIG_X86_PAT) /* get Nth element of the linked list */ diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h index ef87f889ef62..b84633801fb6 100644 --- a/include/asm-generic/pgtable.h +++ b/include/asm-generic/pgtable.h @@ -129,6 +129,10 @@ static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addres #define move_pte(pte, prot, old_addr, new_addr) (pte) #endif +#ifndef pgprot_writecombine +#define pgprot_writecombine pgprot_noncached +#endif + /* * When walking page tables, get the address of the next boundary, * or the end address of the range if that comes earlier. 
Although no -- cgit v1.2.3 From d1769d5475176124af04fa69848b022c98c4bc37 Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Date: Fri, 19 Dec 2008 00:03:56 +0530 Subject: x86: traps.c declare functions before they get used Impact: cleanup In asm/traps.h :- do_double_fault : added under X86_64 sync_regs : added under X86_64 math_error : moved out from X86_32 as it is common for both 32 and 64 bit math_emulate : moved from X86_32 as it is common for both 32 and 64 bit smp_thermal_interrupt : added under X86_64 mce_threshold_interrupt : added under X86_64 Signed-off-by: Jaswinder Singh Signed-off-by: Ingo Molnar --- arch/x86/include/asm/traps.h | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h index 45dee286e45c..2ee0a3bceedf 100644 --- a/arch/x86/include/asm/traps.h +++ b/arch/x86/include/asm/traps.h @@ -46,6 +46,10 @@ dotraplinkage void do_coprocessor_segment_overrun(struct pt_regs *, long); dotraplinkage void do_invalid_TSS(struct pt_regs *, long); dotraplinkage void do_segment_not_present(struct pt_regs *, long); dotraplinkage void do_stack_segment(struct pt_regs *, long); +#ifdef CONFIG_X86_64 +dotraplinkage void do_double_fault(struct pt_regs *, long); +asmlinkage __kprobes struct pt_regs *sync_regs(struct pt_regs *); +#endif dotraplinkage void do_general_protection(struct pt_regs *, long); dotraplinkage void do_page_fault(struct pt_regs *, unsigned long); dotraplinkage void do_spurious_interrupt_bug(struct pt_regs *, long); @@ -72,10 +76,13 @@ static inline int get_si_code(unsigned long condition) extern int panic_on_unrecovered_nmi; extern int kstack_depth_to_print; -#ifdef CONFIG_X86_32 void math_error(void __user *); -unsigned long patch_espfix_desc(unsigned long, unsigned long); asmlinkage void math_emulate(long); +#ifdef CONFIG_X86_32 +unsigned long patch_espfix_desc(unsigned long, unsigned long); +#else +asmlinkage void smp_thermal_interrupt(void); 
+asmlinkage void mce_threshold_interrupt(void); #endif #endif /* _ASM_X86_TRAPS_H */ -- cgit v1.2.3 From b2fa739c06931d167b6d2aa7b514ab7f30d04dc0 Mon Sep 17 00:00:00 2001 From: Hiroshi Shimamoto Date: Thu, 18 Dec 2008 14:43:34 -0800 Subject: x86: sigframe.h: include headers for dependency Impact: cleanup Include following headers for dependency. asm/sigcontext.h asm/siginfo.h asm/ucontext.h Signed-off-by: Hiroshi Shimamoto Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/sigframe.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/sigframe.h b/arch/x86/include/asm/sigframe.h index 3bd0f4276000..4e0fe26d27d3 100644 --- a/arch/x86/include/asm/sigframe.h +++ b/arch/x86/include/asm/sigframe.h @@ -1,6 +1,10 @@ #ifndef _ASM_X86_SIGFRAME_H #define _ASM_X86_SIGFRAME_H +#include <asm/sigcontext.h> +#include <asm/siginfo.h> +#include <asm/ucontext.h> + #ifdef CONFIG_X86_32 #define sigframe_ia32 sigframe #define rt_sigframe_ia32 rt_sigframe -- cgit v1.2.3 From 8869a2e5d3a66d5b63b948052d60cd13ede8b735 Mon Sep 17 00:00:00 2001 From: Hiroshi Shimamoto Date: Thu, 18 Dec 2008 14:46:52 -0800 Subject: x86: asm-offset_64: use rt_sigframe_ia32 Impact: cleanup Use rt_sigframe_ia32 instead of rt_sigframe32. Signed-off-by: Hiroshi Shimamoto Signed-off-by: H.
Peter Anvin --- arch/x86/kernel/asm-offsets_64.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c index 7fcf63d22f8b..1d41d3f1edbc 100644 --- a/arch/x86/kernel/asm-offsets_64.c +++ b/arch/x86/kernel/asm-offsets_64.c @@ -20,6 +20,8 @@ #include +#include + #define __NO_STUBS 1 #undef __SYSCALL #undef _ASM_X86_UNISTD_64_H @@ -87,7 +89,7 @@ int main(void) BLANK(); #undef ENTRY DEFINE(IA32_RT_SIGFRAME_sigcontext, - offsetof (struct rt_sigframe32, uc.uc_mcontext)); + offsetof (struct rt_sigframe_ia32, uc.uc_mcontext)); BLANK(); #endif DEFINE(pbe_address, offsetof(struct pbe, address)); -- cgit v1.2.3 From 9f221495997d180df51ce4d8296669445dd3e7b3 Mon Sep 17 00:00:00 2001 From: Hiroshi Shimamoto Date: Thu, 18 Dec 2008 14:47:37 -0800 Subject: x86: ia32.h: remove unused struct sigfram32 and rt_sigframe32 Impact: cleanup Remove struct sigfram32 and rt_sigframe32 because there is no user. Signed-off-by: Hiroshi Shimamoto Signed-off-by: H. 
Peter Anvin --- arch/x86/include/asm/ia32.h | 18 ------------------ 1 file changed, 18 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/ia32.h b/arch/x86/include/asm/ia32.h index 97989c0e534c..50ca486fd88c 100644 --- a/arch/x86/include/asm/ia32.h +++ b/arch/x86/include/asm/ia32.h @@ -129,24 +129,6 @@ typedef struct compat_siginfo { } _sifields; } compat_siginfo_t; -struct sigframe32 { - u32 pretcode; - int sig; - struct sigcontext_ia32 sc; - struct _fpstate_ia32 fpstate; - unsigned int extramask[_COMPAT_NSIG_WORDS-1]; -}; - -struct rt_sigframe32 { - u32 pretcode; - int sig; - u32 pinfo; - u32 puc; - compat_siginfo_t info; - struct ucontext_ia32 uc; - struct _fpstate_ia32 fpstate; -}; - struct ustat32 { __u32 f_tfree; compat_ino_t f_tinode; -- cgit v1.2.3 From f34a10bd9f8cc95ebdc69a079db195636b2e22e0 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 19 Dec 2008 01:36:14 +0100 Subject: x86: fix warning in arch/x86/kernel/microcode_amd.c MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit this warning: arch/x86/kernel/microcode_amd.c: In function ‘apply_microcode_amd’: arch/x86/kernel/microcode_amd.c:163: warning: cast from pointer to integer of different size arch/x86/kernel/microcode_amd.c:163: warning: cast from pointer to integer of different size triggers because we want to pass the address to the microcode MSR, which is 64-bit even on 32-bit. Cast it explicitly to express this. 
Signed-off-by: Ingo Molnar --- arch/x86/kernel/microcode_amd.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c index 24c256f4e50a..c25fdb382292 100644 --- a/arch/x86/kernel/microcode_amd.c +++ b/arch/x86/kernel/microcode_amd.c @@ -160,7 +160,7 @@ static void apply_microcode_amd(int cpu) return; spin_lock_irqsave(&microcode_update_lock, flags); - wrmsrl(MSR_AMD64_PATCH_LOADER, &mc_amd->hdr.data_code); + wrmsrl(MSR_AMD64_PATCH_LOADER, (u64)(long)&mc_amd->hdr.data_code); /* get patch id after patching */ rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy); spin_unlock_irqrestore(&microcode_update_lock, flags); @@ -372,3 +372,4 @@ struct microcode_ops * __init init_amd_microcode(void) { return &microcode_amd_ops; } + -- cgit v1.2.3 From 345077cd98ff5532b2d1158013c3fec7b1ae85ec Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Thu, 18 Dec 2008 18:09:21 -0800 Subject: x86: fix intel x86_64 llc_shared_map/cpu_llc_id anomolies Impact: fix wrong cache sharing detection on platforms supporting > 8 bit apicid's In the presence of extended topology enumeration leaf 0xb provided by cpuid, 32bit extended initial_apicid in cpuinfo_x86 struct will be updated by detect_extended_topology(). At this instance, we should also reinit the apicid (which could also potentially be extended to 32bit). Without this there will potentially be duplicate apicid's populated in the per cpu's cpuinfo_x86 struct, resulting in wrong cache sharing topology etc detected by init_intel_cacheinfo().
Reported-by: Dimitri Sivanich Signed-off-by: Suresh Siddha Acked-by: Dimitri Sivanich Signed-off-by: Ingo Molnar Cc: --- arch/x86/kernel/cpu/addon_cpuid_features.c | 8 ++++++++ arch/x86/kernel/cpu/intel.c | 8 +++++++- 2 files changed, 15 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/addon_cpuid_features.c b/arch/x86/kernel/cpu/addon_cpuid_features.c index ef8f831af823..2cf23634b6d9 100644 --- a/arch/x86/kernel/cpu/addon_cpuid_features.c +++ b/arch/x86/kernel/cpu/addon_cpuid_features.c @@ -120,9 +120,17 @@ void __cpuinit detect_extended_topology(struct cpuinfo_x86 *c) c->cpu_core_id = phys_pkg_id(c->initial_apicid, ht_mask_width) & core_select_mask; c->phys_proc_id = phys_pkg_id(c->initial_apicid, core_plus_mask_width); + /* + * Reinit the apicid, now that we have extended initial_apicid. + */ + c->apicid = phys_pkg_id(c->initial_apicid, 0); #else c->cpu_core_id = phys_pkg_id(ht_mask_width) & core_select_mask; c->phys_proc_id = phys_pkg_id(core_plus_mask_width); + /* + * Reinit the apicid, now that we have extended initial_apicid. + */ + c->apicid = phys_pkg_id(0); #endif c->x86_max_cores = (core_level_siblings / smp_num_siblings); diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index caec59437a22..b21c37c060a2 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -252,6 +252,13 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c) intel_workarounds(c); + /* + * Detect the extended topology information if available. 
This + * will reinitialise the initial_apicid which will be used + * in init_intel_cacheinfo() + */ + detect_extended_topology(c); + l2 = init_intel_cacheinfo(c); if (c->cpuid_level > 9) { unsigned eax = cpuid_eax(10); @@ -323,7 +330,6 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c) #endif - detect_extended_topology(c); if (!cpu_has(c, X86_FEATURE_XTOPOLOGY)) { /* * let's use the legacy cpuid vector 0x1 and 0x4 for topology -- cgit v1.2.3 From 9bb482476c6c9d1ae033306440c51ceac93ea80c Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Tue, 16 Dec 2008 11:30:08 +0000 Subject: allow stripping of generated symbols under CONFIG_KALLSYMS_ALL Building upon parts of the module stripping patch, this patch introduces similar stripping for vmlinux when CONFIG_KALLSYMS_ALL=y. Using CONFIG_KALLSYMS_STRIP_GENERATED reduces the overhead of CONFIG_KALLSYMS_ALL from 245k/310k to 65k/80k for the (i386/x86-64) kernels I tested with. The patch also does away with the need to special case the kallsyms- internal symbols by making them available even in the first linking stage. While it is a generated file, the patch includes the changes to scripts/genksyms/keywords.c_shipped, as I'm unsure what the procedure here is. 
Signed-off-by: Jan Beulich Signed-off-by: Sam Ravnborg --- Makefile | 47 ++++++--- arch/x86/scripts/strip-symbols | 1 + init/Kconfig | 7 ++ kernel/kallsyms.c | 16 ++- scripts/genksyms/keywords.c_shipped | 189 ++++++++++++++++++------------------ scripts/genksyms/keywords.gperf | 2 + scripts/kallsyms.c | 21 ++-- 7 files changed, 155 insertions(+), 128 deletions(-) create mode 100644 arch/x86/scripts/strip-symbols (limited to 'arch/x86') diff --git a/Makefile b/Makefile index 5dd0ed3b12c6..b3d1c8f1f4ce 100644 --- a/Makefile +++ b/Makefile @@ -604,6 +604,9 @@ export INSTALL_PATH ?= /boot MODLIB = $(INSTALL_MOD_PATH)/lib/modules/$(KERNELRELEASE) export MODLIB +strip-symbols := $(srctree)/scripts/strip-symbols \ + $(wildcard $(srctree)/arch/$(ARCH)/scripts/strip-symbols) + # # INSTALL_MOD_STRIP, if defined, will cause modules to be stripped while # they get installed. If INSTALL_MOD_STRIP is '1', then the default @@ -611,8 +614,10 @@ export MODLIB # be used as the option(s) to the objcopy command. 
ifdef INSTALL_MOD_STRIP ifeq ($(INSTALL_MOD_STRIP),1) -mod_strip_cmd = $(OBJCOPY) --strip-debug --strip-symbols \ - $(srctree)/scripts/strip-symbols --wildcard +mod_strip_cmd = $(OBJCOPY) --strip-debug +ifeq ($(CONFIG_KALLSYMS_ALL),$(CONFIG_KALLSYMS_STRIP_GENERATED)) +mod_strip_cmd += --wildcard $(addprefix --strip-symbols ,$(strip-symbols)) +endif else mod_strip_cmd = $(OBJCOPY) $(INSTALL_MOD_STRIP) endif # INSTALL_MOD_STRIP=1 @@ -747,6 +752,7 @@ last_kallsyms := 2 endif kallsyms.o := .tmp_kallsyms$(last_kallsyms).o +kallsyms.h := $(wildcard include/config/kallsyms/*.h) $(wildcard include/config/kallsyms/*/*.h) define verify_kallsyms $(Q)$(if $($(quiet)cmd_sysmap), \ @@ -771,24 +777,41 @@ endef # Generate .S file with all kernel symbols quiet_cmd_kallsyms = KSYM $@ - cmd_kallsyms = $(NM) -n $< | $(KALLSYMS) \ - $(if $(CONFIG_KALLSYMS_ALL),--all-symbols) > $@ + cmd_kallsyms = { test $* -eq 0 || $(NM) -n $<; } \ + | $(KALLSYMS) $(if $(CONFIG_KALLSYMS_ALL),--all-symbols) >$@ -.tmp_kallsyms1.o .tmp_kallsyms2.o .tmp_kallsyms3.o: %.o: %.S scripts FORCE +quiet_cmd_kstrip = STRIP $@ + cmd_kstrip = $(OBJCOPY) --wildcard $(addprefix --strip$(if $(CONFIG_RELOCATABLE),-unneeded)-symbols ,$(filter %/scripts/strip-symbols,$^)) $< $@ + +$(foreach n,0 1 2 3,.tmp_kallsyms$(n).o): KBUILD_AFLAGS += -Wa,--strip-local-absolute +$(foreach n,0 1 2 3,.tmp_kallsyms$(n).o): %.o: %.S scripts FORCE $(call if_changed_dep,as_o_S) -.tmp_kallsyms%.S: .tmp_vmlinux% $(KALLSYMS) +ifeq ($(CONFIG_KALLSYMS_STRIP_GENERATED),y) +strip-ext := .stripped +endif + +.tmp_kallsyms%.S: .tmp_vmlinux%$(strip-ext) $(KALLSYMS) $(kallsyms.h) $(call cmd,kallsyms) +# make -jN seems to have problems with intermediate files, see bug #3330. 
+.SECONDARY: $(foreach n,1 2 3,.tmp_vmlinux$(n).stripped) +.tmp_vmlinux%.stripped: .tmp_vmlinux% $(strip-symbols) $(kallsyms.h) + $(call cmd,kstrip) + +ifneq ($(CONFIG_DEBUG_INFO),y) +.tmp_vmlinux%: LDFLAGS_vmlinux += -S +endif # .tmp_vmlinux1 must be complete except kallsyms, so update vmlinux version -.tmp_vmlinux1: $(vmlinux-lds) $(vmlinux-all) FORCE - $(call if_changed_rule,ksym_ld) +.tmp_vmlinux%: $(vmlinux-lds) $(vmlinux-all) FORCE + $(if $(filter 1,$*),$(call if_changed_rule,ksym_ld),$(call if_changed,vmlinux__)) -.tmp_vmlinux2: $(vmlinux-lds) $(vmlinux-all) .tmp_kallsyms1.o FORCE - $(call if_changed,vmlinux__) +.tmp_vmlinux0$(strip-ext): + $(Q)echo "placeholder" >$@ -.tmp_vmlinux3: $(vmlinux-lds) $(vmlinux-all) .tmp_kallsyms2.o FORCE - $(call if_changed,vmlinux__) +.tmp_vmlinux1: .tmp_kallsyms0.o +.tmp_vmlinux2: .tmp_kallsyms1.o +.tmp_vmlinux3: .tmp_kallsyms2.o # Needs to visit scripts/ before $(KALLSYMS) can be used. $(KALLSYMS): scripts ; diff --git a/arch/x86/scripts/strip-symbols b/arch/x86/scripts/strip-symbols new file mode 100644 index 000000000000..a2f1ccb827c7 --- /dev/null +++ b/arch/x86/scripts/strip-symbols @@ -0,0 +1 @@ +__cpu_vendor_dev_X86_VENDOR_* diff --git a/init/Kconfig b/init/Kconfig index f763762d544a..0f5af409fef1 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -588,6 +588,13 @@ config KALLSYMS_ALL Say N. +config KALLSYMS_STRIP_GENERATED + bool "Strip machine generated symbols from kallsyms" + depends on KALLSYMS_ALL + default y + help + Say N if you want kallsyms to retain even machine generated symbols. 
+ config KALLSYMS_EXTRA_PASS bool "Do an extra kallsyms pass" depends on KALLSYMS diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index 7b8b0f21a5b1..e694afa0eb8c 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -30,20 +30,19 @@ #define all_var 0 #endif -/* These will be re-linked against their real values during the second link stage */ -extern const unsigned long kallsyms_addresses[] __attribute__((weak)); -extern const u8 kallsyms_names[] __attribute__((weak)); +extern const unsigned long kallsyms_addresses[]; +extern const u8 kallsyms_names[]; /* tell the compiler that the count isn't in the small data section if the arch * has one (eg: FRV) */ extern const unsigned long kallsyms_num_syms -__attribute__((weak, section(".rodata"))); + __attribute__((__section__(".rodata"))); -extern const u8 kallsyms_token_table[] __attribute__((weak)); -extern const u16 kallsyms_token_index[] __attribute__((weak)); +extern const u8 kallsyms_token_table[]; +extern const u16 kallsyms_token_index[]; -extern const unsigned long kallsyms_markers[] __attribute__((weak)); +extern const unsigned long kallsyms_markers[]; static inline int is_kernel_inittext(unsigned long addr) { @@ -168,9 +167,6 @@ static unsigned long get_symbol_pos(unsigned long addr, unsigned long symbol_start = 0, symbol_end = 0; unsigned long i, low, high, mid; - /* This kernel should never had been booted. 
*/ - BUG_ON(!kallsyms_addresses); - /* do a binary search on the sorted kallsyms_addresses array */ low = 0; high = kallsyms_num_syms; diff --git a/scripts/genksyms/keywords.c_shipped b/scripts/genksyms/keywords.c_shipped index 971e0113ae7a..83484fe93ede 100644 --- a/scripts/genksyms/keywords.c_shipped +++ b/scripts/genksyms/keywords.c_shipped @@ -1,4 +1,4 @@ -/* ANSI-C code produced by gperf version 3.0.2 */ +/* ANSI-C code produced by gperf version 3.0.1 */ /* Command-line: gperf -L ANSI-C -a -C -E -g -H is_reserved_hash -k '1,3,$' -N is_reserved_word -p -t scripts/genksyms/keywords.gperf */ #if !((' ' == 32) && ('!' == 33) && ('"' == 34) && ('#' == 35) \ @@ -32,7 +32,7 @@ #line 3 "scripts/genksyms/keywords.gperf" struct resword { const char *name; int token; }; -/* maximum key range = 62, duplicates = 0 */ +/* maximum key range = 64, duplicates = 0 */ #ifdef __GNUC__ __inline @@ -46,32 +46,32 @@ is_reserved_hash (register const char *str, register unsigned int len) { static const unsigned char asso_values[] = { - 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, - 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, - 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, - 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, - 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, - 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, - 65, 65, 65, 65, 65, 65, 65, 65, 65, 5, - 65, 65, 65, 65, 65, 65, 35, 65, 65, 65, - 0, 65, 65, 65, 65, 65, 65, 65, 65, 65, - 65, 65, 65, 65, 65, 0, 65, 0, 65, 5, - 20, 15, 10, 30, 65, 15, 65, 65, 20, 0, - 10, 35, 20, 65, 10, 5, 0, 10, 5, 65, - 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, - 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, - 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, - 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, - 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, - 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, - 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, - 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, - 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, - 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, - 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, - 65, 65, 
65, 65, 65, 65, 65, 65, 65, 65, - 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, - 65, 65, 65, 65, 65, 65 + 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, + 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, + 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, + 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, + 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, + 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, + 67, 67, 67, 67, 67, 67, 67, 67, 67, 0, + 67, 67, 67, 67, 67, 67, 15, 67, 67, 67, + 0, 67, 67, 67, 67, 67, 67, 67, 67, 67, + 67, 67, 67, 67, 67, 0, 67, 0, 67, 5, + 25, 20, 15, 30, 67, 15, 67, 67, 10, 0, + 10, 40, 20, 67, 10, 5, 0, 10, 15, 67, + 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, + 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, + 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, + 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, + 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, + 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, + 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, + 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, + 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, + 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, + 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, + 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, + 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, + 67, 67, 67, 67, 67, 67 }; return len + asso_values[(unsigned char)str[2]] + asso_values[(unsigned char)str[0]] + asso_values[(unsigned char)str[len - 1]]; } @@ -84,116 +84,119 @@ is_reserved_word (register const char *str, register unsigned int len) { enum { - TOTAL_KEYWORDS = 43, + TOTAL_KEYWORDS = 45, MIN_WORD_LENGTH = 3, MAX_WORD_LENGTH = 24, MIN_HASH_VALUE = 3, - MAX_HASH_VALUE = 64 + MAX_HASH_VALUE = 66 }; static const struct resword wordlist[] = { {""}, {""}, {""}, -#line 26 "scripts/genksyms/keywords.gperf" +#line 28 "scripts/genksyms/keywords.gperf" {"asm", ASM_KEYW}, {""}, -#line 8 "scripts/genksyms/keywords.gperf" +#line 10 "scripts/genksyms/keywords.gperf" {"__asm", ASM_KEYW}, {""}, -#line 9 "scripts/genksyms/keywords.gperf" +#line 11 "scripts/genksyms/keywords.gperf" {"__asm__", ASM_KEYW}, {""}, {""}, -#line 52 
"scripts/genksyms/keywords.gperf" +#line 54 "scripts/genksyms/keywords.gperf" {"__typeof__", TYPEOF_KEYW}, {""}, -#line 12 "scripts/genksyms/keywords.gperf" +#line 14 "scripts/genksyms/keywords.gperf" {"__const", CONST_KEYW}, -#line 11 "scripts/genksyms/keywords.gperf" - {"__attribute__", ATTRIBUTE_KEYW}, #line 13 "scripts/genksyms/keywords.gperf" + {"__attribute__", ATTRIBUTE_KEYW}, +#line 15 "scripts/genksyms/keywords.gperf" {"__const__", CONST_KEYW}, -#line 18 "scripts/genksyms/keywords.gperf" +#line 20 "scripts/genksyms/keywords.gperf" {"__signed__", SIGNED_KEYW}, -#line 44 "scripts/genksyms/keywords.gperf" +#line 46 "scripts/genksyms/keywords.gperf" {"static", STATIC_KEYW}, -#line 20 "scripts/genksyms/keywords.gperf" - {"__volatile__", VOLATILE_KEYW}, -#line 39 "scripts/genksyms/keywords.gperf" + {""}, +#line 41 "scripts/genksyms/keywords.gperf" {"int", INT_KEYW}, -#line 32 "scripts/genksyms/keywords.gperf" +#line 34 "scripts/genksyms/keywords.gperf" {"char", CHAR_KEYW}, -#line 33 "scripts/genksyms/keywords.gperf" +#line 35 "scripts/genksyms/keywords.gperf" {"const", CONST_KEYW}, -#line 45 "scripts/genksyms/keywords.gperf" +#line 47 "scripts/genksyms/keywords.gperf" {"struct", STRUCT_KEYW}, -#line 24 "scripts/genksyms/keywords.gperf" +#line 26 "scripts/genksyms/keywords.gperf" {"__restrict__", RESTRICT_KEYW}, -#line 25 "scripts/genksyms/keywords.gperf" +#line 27 "scripts/genksyms/keywords.gperf" {"restrict", RESTRICT_KEYW}, -#line 23 "scripts/genksyms/keywords.gperf" - {"_restrict", RESTRICT_KEYW}, -#line 16 "scripts/genksyms/keywords.gperf" +#line 7 "scripts/genksyms/keywords.gperf" + {"EXPORT_SYMBOL_GPL_FUTURE", EXPORT_SYMBOL_KEYW}, +#line 18 "scripts/genksyms/keywords.gperf" {"__inline__", INLINE_KEYW}, -#line 10 "scripts/genksyms/keywords.gperf" - {"__attribute", ATTRIBUTE_KEYW}, {""}, -#line 14 "scripts/genksyms/keywords.gperf" +#line 22 "scripts/genksyms/keywords.gperf" + {"__volatile__", VOLATILE_KEYW}, +#line 5 "scripts/genksyms/keywords.gperf" + 
{"EXPORT_SYMBOL", EXPORT_SYMBOL_KEYW}, +#line 25 "scripts/genksyms/keywords.gperf" + {"_restrict", RESTRICT_KEYW}, + {""}, +#line 12 "scripts/genksyms/keywords.gperf" + {"__attribute", ATTRIBUTE_KEYW}, +#line 6 "scripts/genksyms/keywords.gperf" + {"EXPORT_SYMBOL_GPL", EXPORT_SYMBOL_KEYW}, +#line 16 "scripts/genksyms/keywords.gperf" {"__extension__", EXTENSION_KEYW}, -#line 35 "scripts/genksyms/keywords.gperf" +#line 37 "scripts/genksyms/keywords.gperf" {"enum", ENUM_KEYW}, -#line 19 "scripts/genksyms/keywords.gperf" - {"__volatile", VOLATILE_KEYW}, -#line 36 "scripts/genksyms/keywords.gperf" +#line 8 "scripts/genksyms/keywords.gperf" + {"EXPORT_UNUSED_SYMBOL", EXPORT_SYMBOL_KEYW}, +#line 38 "scripts/genksyms/keywords.gperf" {"extern", EXTERN_KEYW}, {""}, -#line 17 "scripts/genksyms/keywords.gperf" +#line 19 "scripts/genksyms/keywords.gperf" {"__signed", SIGNED_KEYW}, -#line 7 "scripts/genksyms/keywords.gperf" - {"EXPORT_SYMBOL_GPL_FUTURE", EXPORT_SYMBOL_KEYW}, - {""}, -#line 51 "scripts/genksyms/keywords.gperf" +#line 9 "scripts/genksyms/keywords.gperf" + {"EXPORT_UNUSED_SYMBOL_GPL", EXPORT_SYMBOL_KEYW}, +#line 49 "scripts/genksyms/keywords.gperf" + {"union", UNION_KEYW}, +#line 53 "scripts/genksyms/keywords.gperf" {"typeof", TYPEOF_KEYW}, -#line 46 "scripts/genksyms/keywords.gperf" +#line 48 "scripts/genksyms/keywords.gperf" {"typedef", TYPEDEF_KEYW}, -#line 15 "scripts/genksyms/keywords.gperf" +#line 17 "scripts/genksyms/keywords.gperf" {"__inline", INLINE_KEYW}, -#line 31 "scripts/genksyms/keywords.gperf" +#line 33 "scripts/genksyms/keywords.gperf" {"auto", AUTO_KEYW}, -#line 47 "scripts/genksyms/keywords.gperf" - {"union", UNION_KEYW}, - {""}, {""}, -#line 48 "scripts/genksyms/keywords.gperf" - {"unsigned", UNSIGNED_KEYW}, -#line 49 "scripts/genksyms/keywords.gperf" - {"void", VOID_KEYW}, -#line 42 "scripts/genksyms/keywords.gperf" - {"short", SHORT_KEYW}, +#line 21 "scripts/genksyms/keywords.gperf" + {"__volatile", VOLATILE_KEYW}, {""}, {""}, #line 50 
"scripts/genksyms/keywords.gperf" - {"volatile", VOLATILE_KEYW}, - {""}, -#line 37 "scripts/genksyms/keywords.gperf" - {"float", FLOAT_KEYW}, -#line 34 "scripts/genksyms/keywords.gperf" - {"double", DOUBLE_KEYW}, + {"unsigned", UNSIGNED_KEYW}, {""}, -#line 5 "scripts/genksyms/keywords.gperf" - {"EXPORT_SYMBOL", EXPORT_SYMBOL_KEYW}, - {""}, {""}, -#line 38 "scripts/genksyms/keywords.gperf" +#line 44 "scripts/genksyms/keywords.gperf" + {"short", SHORT_KEYW}, +#line 40 "scripts/genksyms/keywords.gperf" {"inline", INLINE_KEYW}, -#line 6 "scripts/genksyms/keywords.gperf" - {"EXPORT_SYMBOL_GPL", EXPORT_SYMBOL_KEYW}, -#line 41 "scripts/genksyms/keywords.gperf" - {"register", REGISTER_KEYW}, {""}, -#line 22 "scripts/genksyms/keywords.gperf" +#line 52 "scripts/genksyms/keywords.gperf" + {"volatile", VOLATILE_KEYW}, +#line 42 "scripts/genksyms/keywords.gperf" + {"long", LONG_KEYW}, +#line 24 "scripts/genksyms/keywords.gperf" {"_Bool", BOOL_KEYW}, -#line 43 "scripts/genksyms/keywords.gperf" - {"signed", SIGNED_KEYW}, {""}, {""}, -#line 40 "scripts/genksyms/keywords.gperf" - {"long", LONG_KEYW} +#line 43 "scripts/genksyms/keywords.gperf" + {"register", REGISTER_KEYW}, +#line 51 "scripts/genksyms/keywords.gperf" + {"void", VOID_KEYW}, +#line 39 "scripts/genksyms/keywords.gperf" + {"float", FLOAT_KEYW}, +#line 36 "scripts/genksyms/keywords.gperf" + {"double", DOUBLE_KEYW}, + {""}, {""}, {""}, {""}, +#line 45 "scripts/genksyms/keywords.gperf" + {"signed", SIGNED_KEYW} }; if (len <= MAX_WORD_LENGTH && len >= MIN_WORD_LENGTH) diff --git a/scripts/genksyms/keywords.gperf b/scripts/genksyms/keywords.gperf index 5ef3733225fb..8abe7ab8d88f 100644 --- a/scripts/genksyms/keywords.gperf +++ b/scripts/genksyms/keywords.gperf @@ -5,6 +5,8 @@ struct resword { const char *name; int token; } EXPORT_SYMBOL, EXPORT_SYMBOL_KEYW EXPORT_SYMBOL_GPL, EXPORT_SYMBOL_KEYW EXPORT_SYMBOL_GPL_FUTURE, EXPORT_SYMBOL_KEYW +EXPORT_UNUSED_SYMBOL, EXPORT_SYMBOL_KEYW +EXPORT_UNUSED_SYMBOL_GPL, EXPORT_SYMBOL_KEYW 
__asm, ASM_KEYW __asm__, ASM_KEYW __attribute, ATTRIBUTE_KEYW diff --git a/scripts/kallsyms.c b/scripts/kallsyms.c index ad2434b26970..92758120a767 100644 --- a/scripts/kallsyms.c +++ b/scripts/kallsyms.c @@ -130,18 +130,9 @@ static int read_symbol(FILE *in, struct sym_entry *s) static int symbol_valid(struct sym_entry *s) { /* Symbols which vary between passes. Passes 1 and 2 must have - * identical symbol lists. The kallsyms_* symbols below are only added - * after pass 1, they would be included in pass 2 when --all-symbols is - * specified so exclude them to get a stable symbol list. + * identical symbol lists. */ static char *special_symbols[] = { - "kallsyms_addresses", - "kallsyms_num_syms", - "kallsyms_names", - "kallsyms_markers", - "kallsyms_token_table", - "kallsyms_token_index", - /* Exclude linker generated symbols which vary between passes */ "_SDA_BASE_", /* ppc */ "_SDA2_BASE_", /* ppc */ @@ -173,7 +164,9 @@ static int symbol_valid(struct sym_entry *s) } /* Exclude symbols which vary between passes. */ - if (strstr((char *)s->sym + offset, "_compiled.")) + if (strstr((char *)s->sym + offset, "_compiled.") || + strncmp((char*)s->sym + offset, "__compound_literal.", 19) == 0 || + strncmp((char*)s->sym + offset, "__compound_literal$", 19) == 0) return 0; for (i = 0; special_symbols[i]; i++) @@ -550,8 +543,10 @@ int main(int argc, char **argv) usage(); read_map(stdin); - sort_symbols(); - optimize_token_table(); + if (table_cnt) { + sort_symbols(); + optimize_token_table(); + } write_src(); return 0; -- cgit v1.2.3 From b909895739427874c089bc0e03dc119f99cab2dd Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Fri, 19 Dec 2008 13:48:34 -0800 Subject: sparseirq: fix numa_migrate_irq_desc dependency and comments Impact: reduce kconfig variable scope and clean up Bartlomiej pointed out that the config dependencies and comments are not right. 
update it depend to NUMA, and fix some comments Reported-by: Bartlomiej Zolnierkiewicz Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 2 +- arch/x86/kernel/io_apic.c | 2 +- kernel/irq/numa_migrate.c | 11 +++-------- 3 files changed, 5 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 60a008857a38..5c243826334a 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -250,7 +250,7 @@ config SPARSE_IRQ config NUMA_MIGRATE_IRQ_DESC bool "Move irq desc when changing irq smp_affinity" - depends on SPARSE_IRQ && SMP + depends on SPARSE_IRQ && NUMA default n help This enables moving irq_desc to cpu/node that irq will use handled. diff --git a/arch/x86/kernel/io_apic.c b/arch/x86/kernel/io_apic.c index bfe1245b1a3e..a74887b416cc 100644 --- a/arch/x86/kernel/io_apic.c +++ b/arch/x86/kernel/io_apic.c @@ -2471,7 +2471,7 @@ static void irq_complete_move(struct irq_desc **descp) if (likely(!cfg->move_desc_pending)) return; - /* domain is not change, but affinity is changed */ + /* domain has not changed, but affinity did */ me = smp_processor_id(); if (cpu_isset(me, desc->affinity)) { *descp = desc = move_irq_desc(desc, me); diff --git a/kernel/irq/numa_migrate.c b/kernel/irq/numa_migrate.c index 0178e2296990..089c3746358a 100644 --- a/kernel/irq/numa_migrate.c +++ b/kernel/irq/numa_migrate.c @@ -1,13 +1,8 @@ /* - * linux/kernel/irq/handle.c - * - * Copyright (C) 1992, 1998-2006 Linus Torvalds, Ingo Molnar - * Copyright (C) 2005-2006, Thomas Gleixner, Russell King - * - * This file contains the core interrupt handling code. - * - * Detailed information is available in Documentation/DocBook/genericirq + * NUMA irq-desc migration code * + * Migrate IRQ data structures (irq_desc, chip_data, etc.) over to + * the new "home node" of the IRQ. 
*/ #include -- cgit v1.2.3 From 34945ede31071ac7d72270cc6c1893323f392b3f Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Date: Fri, 19 Dec 2008 22:33:52 +0530 Subject: x86: common.c boot_cpu_stack and boot_exception_stacks should be static Impact: cleanup, avoid sparse warnings, reduce kernel size a bit Fixes these sparse warnings: arch/x86/kernel/cpu/common.c:869:6: warning: symbol 'boot_cpu_stack' was not declared. Should it be static? arch/x86/kernel/cpu/common.c:910:6: warning: symbol 'boot_exception_stacks' was not declared. Should it be static? Signed-off-by: Jaswinder Singh Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/common.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index b9c9ea0217a9..aba49c782fd6 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -862,7 +862,7 @@ EXPORT_SYMBOL(_cpu_pda); struct desc_ptr idt_descr = { 256 * 16 - 1, (unsigned long) idt_table }; -char boot_cpu_stack[IRQSTACKSIZE] __page_aligned_bss; +static char boot_cpu_stack[IRQSTACKSIZE] __page_aligned_bss; void __cpuinit pda_init(int cpu) { @@ -903,8 +903,8 @@ void __cpuinit pda_init(int cpu) } } -char boot_exception_stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + - DEBUG_STKSZ] __page_aligned_bss; +static char boot_exception_stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + + DEBUG_STKSZ] __page_aligned_bss; extern asmlinkage void ignore_sysret(void); -- cgit v1.2.3 From 8403295e0fa460f6240e2d781e25dc29189f33c7 Mon Sep 17 00:00:00 2001 From: Hiroshi Shimamoto Date: Fri, 19 Dec 2008 14:25:50 -0800 Subject: x86: ia32_signal: remove unnecessary declaration Impact: cleanup No need to declare do_signal(). 
Signed-off-by: Hiroshi Shimamoto Signed-off-by: Ingo Molnar --- arch/x86/ia32/ia32_signal.c | 1 - 1 file changed, 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c index 3b3878a63bc2..09513f8a2896 100644 --- a/arch/x86/ia32/ia32_signal.c +++ b/arch/x86/ia32/ia32_signal.c @@ -43,7 +43,6 @@ X86_EFLAGS_ZF | X86_EFLAGS_AF | X86_EFLAGS_PF | \ X86_EFLAGS_CF) -asmlinkage int do_signal(struct pt_regs *regs, sigset_t *oldset); void signal_fault(struct pt_regs *regs, void __user *frame, char *where); int copy_siginfo_to_user32(compat_siginfo_t __user *to, siginfo_t *from) -- cgit v1.2.3 From 982d789ab76c8a11426852fec2fdf2f412e21c0c Mon Sep 17 00:00:00 2001 From: "venkatesh.pallipadi@intel.com" Date: Fri, 19 Dec 2008 13:47:28 -0800 Subject: x86: PAT: remove follow_pfnmap_pte in favor of follow_phys Impact: Cleanup - removes a new function in favor of a recently modified older one. Replace follow_pfnmap_pte in pat code with follow_phys. follow_phys also returns protection eliminating the need of pte_pgprot call. Using follow_phys also eliminates the need for pte_pa. Signed-off-by: Venkatesh Pallipadi Signed-off-by: Suresh Siddha Signed-off-by: H.
Peter Anvin --- arch/x86/include/asm/pgtable.h | 5 ----- arch/x86/mm/pat.c | 30 +++++++++++------------------ include/linux/mm.h | 3 --- mm/memory.c | 43 ------------------------------------------ 4 files changed, 11 insertions(+), 70 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 579f8ceee948..2aa792bbd7e0 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -230,11 +230,6 @@ static inline unsigned long pte_pfn(pte_t pte) return (pte_val(pte) & PTE_PFN_MASK) >> PAGE_SHIFT; } -static inline u64 pte_pa(pte_t pte) -{ - return pte_val(pte) & PTE_PFN_MASK; -} - #define pte_page(pte) pfn_to_page(pte_pfn(pte)) static inline int pmd_large(pmd_t pte) diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index d5254bae84f4..541bcc944a5b 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -685,8 +685,7 @@ int track_pfn_vma_copy(struct vm_area_struct *vma) int retval = 0; unsigned long i, j; u64 paddr; - pgprot_t prot; - pte_t pte; + unsigned long prot; unsigned long vma_start = vma->vm_start; unsigned long vma_end = vma->vm_end; unsigned long vma_size = vma_end - vma_start; @@ -696,26 +695,22 @@ int track_pfn_vma_copy(struct vm_area_struct *vma) if (is_linear_pfn_mapping(vma)) { /* - * reserve the whole chunk starting from vm_pgoff, - * But, we have to get the protection from pte. + * reserve the whole chunk covered by vma. We need the + * starting address and protection from pte. 
*/ - if (follow_pfnmap_pte(vma, vma_start, &pte)) { + if (follow_phys(vma, vma_start, 0, &prot, &paddr)) { WARN_ON_ONCE(1); - return -1; + return -EINVAL; } - prot = pte_pgprot(pte); - paddr = (u64)vma->vm_pgoff << PAGE_SHIFT; - return reserve_pfn_range(paddr, vma_size, prot); + return reserve_pfn_range(paddr, vma_size, __pgprot(prot)); } /* reserve entire vma page by page, using pfn and prot from pte */ for (i = 0; i < vma_size; i += PAGE_SIZE) { - if (follow_pfnmap_pte(vma, vma_start + i, &pte)) + if (follow_phys(vma, vma_start + i, 0, &prot, &paddr)) continue; - paddr = pte_pa(pte); - prot = pte_pgprot(pte); - retval = reserve_pfn_range(paddr, PAGE_SIZE, prot); + retval = reserve_pfn_range(paddr, PAGE_SIZE, __pgprot(prot)); if (retval) goto cleanup_ret; } @@ -724,10 +719,9 @@ int track_pfn_vma_copy(struct vm_area_struct *vma) cleanup_ret: /* Reserve error: Cleanup partial reservation and return error */ for (j = 0; j < i; j += PAGE_SIZE) { - if (follow_pfnmap_pte(vma, vma_start + j, &pte)) + if (follow_phys(vma, vma_start + j, 0, &prot, &paddr)) continue; - paddr = pte_pa(pte); free_pfn_range(paddr, PAGE_SIZE); } @@ -797,6 +791,7 @@ void untrack_pfn_vma(struct vm_area_struct *vma, unsigned long pfn, { unsigned long i; u64 paddr; + unsigned long prot; unsigned long vma_start = vma->vm_start; unsigned long vma_end = vma->vm_end; unsigned long vma_size = vma_end - vma_start; @@ -821,12 +816,9 @@ void untrack_pfn_vma(struct vm_area_struct *vma, unsigned long pfn, } else { /* free entire vma, page by page, using the pfn from pte */ for (i = 0; i < vma_size; i += PAGE_SIZE) { - pte_t pte; - - if (follow_pfnmap_pte(vma, vma_start + i, &pte)) + if (follow_phys(vma, vma_start + i, 0, &prot, &paddr)) continue; - paddr = pte_pa(pte); free_pfn_range(paddr, PAGE_SIZE); } } diff --git a/include/linux/mm.h b/include/linux/mm.h index 2f6e2f886d4b..36f9b3fa5e15 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1239,9 +1239,6 @@ struct page *follow_page(struct 
vm_area_struct *, unsigned long address, #define FOLL_GET 0x04 /* do get_page on page */ #define FOLL_ANON 0x08 /* give ZERO_PAGE if no pgtable */ -int follow_pfnmap_pte(struct vm_area_struct *vma, - unsigned long address, pte_t *ret_ptep); - typedef int (*pte_fn_t)(pte_t *pte, pgtable_t token, unsigned long addr, void *data); extern int apply_to_page_range(struct mm_struct *mm, unsigned long address, diff --git a/mm/memory.c b/mm/memory.c index 79f28e35d4fc..6b29f39a5a3e 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1168,49 +1168,6 @@ no_page_table: return page; } -int follow_pfnmap_pte(struct vm_area_struct *vma, unsigned long address, - pte_t *ret_ptep) -{ - pgd_t *pgd; - pud_t *pud; - pmd_t *pmd; - pte_t *ptep, pte; - spinlock_t *ptl; - struct page *page; - struct mm_struct *mm = vma->vm_mm; - - if (!is_pfn_mapping(vma)) - goto err; - - page = NULL; - pgd = pgd_offset(mm, address); - if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd))) - goto err; - - pud = pud_offset(pgd, address); - if (pud_none(*pud) || unlikely(pud_bad(*pud))) - goto err; - - pmd = pmd_offset(pud, address); - if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd))) - goto err; - - ptep = pte_offset_map_lock(mm, pmd, address, &ptl); - - pte = *ptep; - if (!pte_present(pte)) - goto err_unlock; - - *ret_ptep = pte; - pte_unmap_unlock(ptep, ptl); - return 0; - -err_unlock: - pte_unmap_unlock(ptep, ptl); -err: - return -EINVAL; -} - /* Can we do the FOLL_ANON optimization? */ static inline int use_zero_page(struct vm_area_struct *vma) { -- cgit v1.2.3 From 34801ba9bf0381fcf0e2b08179d2c07f2c6ede74 Mon Sep 17 00:00:00 2001 From: "venkatesh.pallipadi@intel.com" Date: Fri, 19 Dec 2008 13:47:29 -0800 Subject: x86: PAT: move track untrack pfnmap stubs to asm-generic Impact: Cleanup and branch hints only. Move the track and untrack pfn stub routines from memory.c to asm-generic. Also add unlikely to pfnmap related calls in fork and exit path. 
Signed-off-by: Venkatesh Pallipadi Signed-off-by: Suresh Siddha Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/pgtable.h | 6 ++---- include/asm-generic/pgtable.h | 46 ++++++++++++++++++++++++++++++++++++++++ include/linux/mm.h | 6 ------ mm/memory.c | 48 ++---------------------------------------- 4 files changed, 50 insertions(+), 56 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 2aa792bbd7e0..875192bf72cb 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -339,12 +339,10 @@ static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot) #define canon_pgprot(p) __pgprot(pgprot_val(p) & __supported_pte_mask) +#ifndef __ASSEMBLY__ /* Indicate that x86 has its own track and untrack pfn vma functions */ -#define track_pfn_vma_new track_pfn_vma_new -#define track_pfn_vma_copy track_pfn_vma_copy -#define untrack_pfn_vma untrack_pfn_vma +#define __HAVE_PFNMAP_TRACKING -#ifndef __ASSEMBLY__ #define __HAVE_PHYS_MEM_ACCESS_PROT struct file; pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn, diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h index b84633801fb6..72ebe91005a8 100644 --- a/include/asm-generic/pgtable.h +++ b/include/asm-generic/pgtable.h @@ -293,6 +293,52 @@ static inline void ptep_modify_prot_commit(struct mm_struct *mm, #define arch_flush_lazy_cpu_mode() do {} while (0) #endif +#ifndef __HAVE_PFNMAP_TRACKING +/* + * Interface that can be used by architecture code to keep track of + * memory type of pfn mappings (remap_pfn_range, vm_insert_pfn) + * + * track_pfn_vma_new is called when a _new_ pfn mapping is being established + * for physical range indicated by pfn and size. 
+ */ +static inline int track_pfn_vma_new(struct vm_area_struct *vma, pgprot_t prot, + unsigned long pfn, unsigned long size) +{ + return 0; +} + +/* + * Interface that can be used by architecture code to keep track of + * memory type of pfn mappings (remap_pfn_range, vm_insert_pfn) + * + * track_pfn_vma_copy is called when vma that is covering the pfnmap gets + * copied through copy_page_range(). + */ +static inline int track_pfn_vma_copy(struct vm_area_struct *vma) +{ + return 0; +} + +/* + * Interface that can be used by architecture code to keep track of + * memory type of pfn mappings (remap_pfn_range, vm_insert_pfn) + * + * untrack_pfn_vma is called while unmapping a pfnmap for a region. + * untrack can be called for a specific region indicated by pfn and size or + * can be for the entire vma (in which case size can be zero). + */ +static inline void untrack_pfn_vma(struct vm_area_struct *vma, + unsigned long pfn, unsigned long size) +{ +} +#else +extern int track_pfn_vma_new(struct vm_area_struct *vma, pgprot_t prot, + unsigned long pfn, unsigned long size); +extern int track_pfn_vma_copy(struct vm_area_struct *vma); +extern void untrack_pfn_vma(struct vm_area_struct *vma, unsigned long pfn, + unsigned long size); +#endif + #endif /* !__ASSEMBLY__ */ #endif /* _ASM_GENERIC_PGTABLE_H */ diff --git a/include/linux/mm.h b/include/linux/mm.h index 36f9b3fa5e15..d3ddd735e375 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -163,12 +163,6 @@ static inline int is_pfn_mapping(struct vm_area_struct *vma) return (vma->vm_flags & VM_PFNMAP); } -extern int track_pfn_vma_new(struct vm_area_struct *vma, pgprot_t prot, - unsigned long pfn, unsigned long size); -extern int track_pfn_vma_copy(struct vm_area_struct *vma); -extern void untrack_pfn_vma(struct vm_area_struct *vma, unsigned long pfn, - unsigned long size); - /* * vm_fault is filled by the the pagefault handler and passed to the vma's * ->fault function. 
The vma's ->fault is responsible for returning a bitmask diff --git a/mm/memory.c b/mm/memory.c index 6b29f39a5a3e..f01b7eed6e16 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -99,50 +99,6 @@ int randomize_va_space __read_mostly = 2; #endif -#ifndef track_pfn_vma_new -/* - * Interface that can be used by architecture code to keep track of - * memory type of pfn mappings (remap_pfn_range, vm_insert_pfn) - * - * track_pfn_vma_new is called when a _new_ pfn mapping is being established - * for physical range indicated by pfn and size. - */ -int track_pfn_vma_new(struct vm_area_struct *vma, pgprot_t prot, - unsigned long pfn, unsigned long size) -{ - return 0; -} -#endif - -#ifndef track_pfn_vma_copy -/* - * Interface that can be used by architecture code to keep track of - * memory type of pfn mappings (remap_pfn_range, vm_insert_pfn) - * - * track_pfn_vma_copy is called when vma that is covering the pfnmap gets - * copied through copy_page_range(). - */ -int track_pfn_vma_copy(struct vm_area_struct *vma) -{ - return 0; -} -#endif - -#ifndef untrack_pfn_vma -/* - * Interface that can be used by architecture code to keep track of - * memory type of pfn mappings (remap_pfn_range, vm_insert_pfn) - * - * untrack_pfn_vma is called while unmapping a pfnmap for a region. - * untrack can be called for a specific region indicated by pfn and size or - * can be for the entire vma (in which case size can be zero). 
- */ -void untrack_pfn_vma(struct vm_area_struct *vma, unsigned long pfn, - unsigned long size) -{ -} -#endif - static int __init disable_randmaps(char *s) { randomize_va_space = 0; @@ -713,7 +669,7 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, if (is_vm_hugetlb_page(vma)) return copy_hugetlb_page_range(dst_mm, src_mm, vma); - if (is_pfn_mapping(vma)) { + if (unlikely(is_pfn_mapping(vma))) { /* * We do not free on error cases below as remove_vma * gets called on error from higher level routine @@ -969,7 +925,7 @@ unsigned long unmap_vmas(struct mmu_gather **tlbp, if (vma->vm_flags & VM_ACCOUNT) *nr_accounted += (end - start) >> PAGE_SHIFT; - if (is_pfn_mapping(vma)) + if (unlikely(is_pfn_mapping(vma))) untrack_pfn_vma(vma, 0, 0); while (start != end) { -- cgit v1.2.3 From bf53de907dfdaac178c92d774aae7370d7b97d20 Mon Sep 17 00:00:00 2001 From: Markus Metzger Date: Fri, 19 Dec 2008 15:10:24 +0100 Subject: x86, bts: add fork and exit handling Impact: introduce new ptrace facility Add arch_ptrace_untrace() function that is called when the tracer detaches (either voluntarily or when the tracing task dies); ptrace_disable() is only called on a voluntary detach. Add ptrace_fork() and arch_ptrace_fork(). They are called when a traced task is forked. Clear DS and BTS related fields on fork. Release DS resources and reclaim memory in ptrace_untrace(). This releases resources already when the tracing task dies. We used to do that when the traced task dies. 
Signed-off-by: Markus Metzger Signed-off-by: Ingo Molnar --- arch/x86/include/asm/ds.h | 9 ++++++++ arch/x86/include/asm/ptrace.h | 7 ++++++ arch/x86/kernel/ds.c | 11 ++++++++++ arch/x86/kernel/process_32.c | 20 ++++++++--------- arch/x86/kernel/process_64.c | 20 ++++++++--------- arch/x86/kernel/ptrace.c | 50 ++++++++++++++++++++++++++++++++++--------- include/linux/ptrace.h | 22 +++++++++++++++++++ kernel/fork.c | 2 ++ kernel/ptrace.c | 12 +++++++++++ 9 files changed, 121 insertions(+), 32 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/ds.h b/arch/x86/include/asm/ds.h index ee0ea3a96c11..a8f672ba100c 100644 --- a/arch/x86/include/asm/ds.h +++ b/arch/x86/include/asm/ds.h @@ -252,12 +252,21 @@ extern void __cpuinit ds_init_intel(struct cpuinfo_x86 *); */ extern void ds_switch_to(struct task_struct *prev, struct task_struct *next); +/* + * Task clone/init and cleanup work + */ +extern void ds_copy_thread(struct task_struct *tsk, struct task_struct *father); +extern void ds_exit_thread(struct task_struct *tsk); + #else /* CONFIG_X86_DS */ struct cpuinfo_x86; static inline void __cpuinit ds_init_intel(struct cpuinfo_x86 *ignored) {} static inline void ds_switch_to(struct task_struct *prev, struct task_struct *next) {} +static inline void ds_copy_thread(struct task_struct *tsk, + struct task_struct *father) {} +static inline void ds_exit_thread(struct task_struct *tsk) {} #endif /* CONFIG_X86_DS */ #endif /* _ASM_X86_DS_H */ diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h index fbf744215911..6d34d954c228 100644 --- a/arch/x86/include/asm/ptrace.h +++ b/arch/x86/include/asm/ptrace.h @@ -235,6 +235,13 @@ extern int do_get_thread_area(struct task_struct *p, int idx, extern int do_set_thread_area(struct task_struct *p, int idx, struct user_desc __user *info, int can_allocate); +extern void x86_ptrace_untrace(struct task_struct *); +extern void x86_ptrace_fork(struct task_struct *child, + unsigned long clone_flags); + 
+#define arch_ptrace_untrace(tsk) x86_ptrace_untrace(tsk) +#define arch_ptrace_fork(child, flags) x86_ptrace_fork(child, flags) + #endif /* __KERNEL__ */ #endif /* !__ASSEMBLY__ */ diff --git a/arch/x86/kernel/ds.c b/arch/x86/kernel/ds.c index 98d271e60e08..da91701a2348 100644 --- a/arch/x86/kernel/ds.c +++ b/arch/x86/kernel/ds.c @@ -1017,3 +1017,14 @@ void ds_switch_to(struct task_struct *prev, struct task_struct *next) update_debugctlmsr(next->thread.debugctlmsr); } + +void ds_copy_thread(struct task_struct *tsk, struct task_struct *father) +{ + clear_tsk_thread_flag(tsk, TIF_DS_AREA_MSR); + tsk->thread.ds_ctx = NULL; +} + +void ds_exit_thread(struct task_struct *tsk) +{ + WARN_ON(tsk->thread.ds_ctx); +} diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 605eff9a8ac0..3ba155d24884 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -60,6 +60,7 @@ #include #include #include +#include asmlinkage void ret_from_fork(void) __asm__("ret_from_fork"); @@ -251,17 +252,8 @@ void exit_thread(void) tss->x86_tss.io_bitmap_base = INVALID_IO_BITMAP_OFFSET; put_cpu(); } -#ifdef CONFIG_X86_DS - /* Free any BTS tracers that have not been properly released. 
*/ - if (unlikely(current->bts)) { - ds_release_bts(current->bts); - current->bts = NULL; - - kfree(current->bts_buffer); - current->bts_buffer = NULL; - current->bts_size = 0; - } -#endif /* CONFIG_X86_DS */ + + ds_exit_thread(current); } void flush_thread(void) @@ -343,6 +335,12 @@ int copy_thread(int nr, unsigned long clone_flags, unsigned long sp, kfree(p->thread.io_bitmap_ptr); p->thread.io_bitmap_max = 0; } + + ds_copy_thread(p, current); + + clear_tsk_thread_flag(p, TIF_DEBUGCTLMSR); + p->thread.debugctlmsr = 0; + return err; } diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 1cfd2a4bf853..416fb9282f4f 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -53,6 +53,7 @@ #include #include #include +#include asmlinkage extern void ret_from_fork(void); @@ -236,17 +237,8 @@ void exit_thread(void) t->io_bitmap_max = 0; put_cpu(); } -#ifdef CONFIG_X86_DS - /* Free any BTS tracers that have not been properly released. */ - if (unlikely(current->bts)) { - ds_release_bts(current->bts); - current->bts = NULL; - - kfree(current->bts_buffer); - current->bts_buffer = NULL; - current->bts_size = 0; - } -#endif /* CONFIG_X86_DS */ + + ds_exit_thread(current); } void flush_thread(void) @@ -376,6 +368,12 @@ int copy_thread(int nr, unsigned long clone_flags, unsigned long sp, if (err) goto out; } + + ds_copy_thread(p, me); + + clear_tsk_thread_flag(p, TIF_DEBUGCTLMSR); + p->thread.debugctlmsr = 0; + err = 0; out: if (err && p->thread.io_bitmap_ptr) { diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 45e9855da2d2..6ad2bb607650 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -769,8 +769,47 @@ static int ptrace_bts_size(struct task_struct *child) return (trace->ds.top - trace->ds.begin) / trace->ds.size; } + +static void ptrace_bts_fork(struct task_struct *tsk) +{ + tsk->bts = NULL; + tsk->bts_buffer = NULL; + tsk->bts_size = 0; + tsk->thread.bts_ovfl_signal = 0; +} + +static 
void ptrace_bts_untrace(struct task_struct *child) +{ + if (unlikely(child->bts)) { + ds_release_bts(child->bts); + child->bts = NULL; + + kfree(child->bts_buffer); + child->bts_buffer = NULL; + child->bts_size = 0; + } +} + +static void ptrace_bts_detach(struct task_struct *child) +{ + ptrace_bts_untrace(child); +} +#else +static inline void ptrace_bts_fork(struct task_struct *tsk) {} +static inline void ptrace_bts_detach(struct task_struct *child) {} +static inline void ptrace_bts_untrace(struct task_struct *child) {} #endif /* CONFIG_X86_PTRACE_BTS */ +void x86_ptrace_fork(struct task_struct *child, unsigned long clone_flags) +{ + ptrace_bts_fork(child); +} + +void x86_ptrace_untrace(struct task_struct *child) +{ + ptrace_bts_untrace(child); +} + /* * Called by kernel/ptrace.c when detaching.. * @@ -782,16 +821,7 @@ void ptrace_disable(struct task_struct *child) #ifdef TIF_SYSCALL_EMU clear_tsk_thread_flag(child, TIF_SYSCALL_EMU); #endif -#ifdef CONFIG_X86_PTRACE_BTS - if (child->bts) { - ds_release_bts(child->bts); - child->bts = NULL; - - kfree(child->bts_buffer); - child->bts_buffer = NULL; - child->bts_size = 0; - } -#endif /* CONFIG_X86_PTRACE_BTS */ + ptrace_bts_detach(child); } #if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION diff --git a/include/linux/ptrace.h b/include/linux/ptrace.h index 22641d5d45df..98b93ca4db06 100644 --- a/include/linux/ptrace.h +++ b/include/linux/ptrace.h @@ -94,6 +94,7 @@ extern void ptrace_notify(int exit_code); extern void __ptrace_link(struct task_struct *child, struct task_struct *new_parent); extern void __ptrace_unlink(struct task_struct *child); +extern void ptrace_fork(struct task_struct *task, unsigned long clone_flags); #define PTRACE_MODE_READ 1 #define PTRACE_MODE_ATTACH 2 /* Returns 0 on success, -errno on denial. 
*/ @@ -313,6 +314,27 @@ static inline void user_enable_block_step(struct task_struct *task) #define arch_ptrace_stop(code, info) do { } while (0) #endif +#ifndef arch_ptrace_untrace +/* + * Do machine-specific work before untracing child. + * + * This is called for a normal detach as well as from ptrace_exit() + * when the tracing task dies. + * + * Called with write_lock(&tasklist_lock) held. + */ +#define arch_ptrace_untrace(task) do { } while (0) +#endif + +#ifndef arch_ptrace_fork +/* + * Do machine-specific work to initialize a new task. + * + * This is called from copy_process(). + */ +#define arch_ptrace_fork(child, clone_flags) do { } while (0) +#endif + extern int task_current_syscall(struct task_struct *target, long *callno, unsigned long args[6], unsigned int maxargs, unsigned long *sp, unsigned long *pc); diff --git a/kernel/fork.c b/kernel/fork.c index 7b93da72d4a2..65ce60adc8e8 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1096,6 +1096,8 @@ static struct task_struct *copy_process(unsigned long clone_flags, #ifdef CONFIG_DEBUG_MUTEXES p->blocked_on = NULL; /* not blocked yet */ #endif + if (unlikely(ptrace_reparented(current))) + ptrace_fork(p, clone_flags); /* Perform scheduler related setup. Assign this task to a CPU. */ sched_fork(p, clone_flags); diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 4c8bcd7dd8e0..100a71cfdaba 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -25,6 +25,17 @@ #include #include + +/* + * Initialize a new task whose father had been ptraced. + * + * Called from copy_process(). + */ +void ptrace_fork(struct task_struct *child, unsigned long clone_flags) +{ + arch_ptrace_fork(child, clone_flags); +} + /* * ptrace a task: make the debugger its new parent and * move it to the ptrace list. 
@@ -72,6 +83,7 @@ void __ptrace_unlink(struct task_struct *child) child->parent = child->real_parent; list_del_init(&child->ptrace_entry); + arch_ptrace_untrace(child); if (task_is_traced(child)) ptrace_untrace(child); } -- cgit v1.2.3 From c5dee6177f4bd2095aab7d9be9f6ebdddd6deee9 Mon Sep 17 00:00:00 2001 From: Markus Metzger Date: Fri, 19 Dec 2008 15:17:02 +0100 Subject: x86, bts: memory accounting Impact: move the BTS buffer accounting to the mlock bucket Add alloc_locked_buffer() and free_locked_buffer() functions to mm/mlock.c to kalloc a buffer and account the locked memory to current. Account the memory for the BTS buffer to the tracer. Signed-off-by: Markus Metzger Signed-off-by: Ingo Molnar --- arch/x86/kernel/ptrace.c | 45 ++++++++++++++++++++++++++++++++++----------- include/linux/mm.h | 2 ++ mm/mlock.c | 45 +++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 81 insertions(+), 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 6ad2bb607650..0a5df5f82fb9 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -650,6 +650,24 @@ static int ptrace_bts_drain(struct task_struct *child, return drained; } +static int ptrace_bts_allocate_buffer(struct task_struct *child, size_t size) +{ + child->bts_buffer = alloc_locked_buffer(size); + if (!child->bts_buffer) + return -ENOMEM; + + child->bts_size = size; + + return 0; +} + +static void ptrace_bts_free_buffer(struct task_struct *child) +{ + free_locked_buffer(child->bts_buffer, child->bts_size); + child->bts_buffer = NULL; + child->bts_size = 0; +} + static int ptrace_bts_config(struct task_struct *child, long cfg_size, const struct ptrace_bts_config __user *ucfg) @@ -679,14 +697,13 @@ static int ptrace_bts_config(struct task_struct *child, if ((cfg.flags & PTRACE_BTS_O_ALLOC) && (cfg.size != child->bts_size)) { - kfree(child->bts_buffer); + int error; - child->bts_size = cfg.size; - child->bts_buffer = kzalloc(cfg.size, 
GFP_KERNEL); - if (!child->bts_buffer) { - child->bts_size = 0; - return -ENOMEM; - } + ptrace_bts_free_buffer(child); + + error = ptrace_bts_allocate_buffer(child, cfg.size); + if (error < 0) + return error; } if (cfg.flags & PTRACE_BTS_O_TRACE) @@ -701,10 +718,8 @@ static int ptrace_bts_config(struct task_struct *child, if (IS_ERR(child->bts)) { int error = PTR_ERR(child->bts); - kfree(child->bts_buffer); + ptrace_bts_free_buffer(child); child->bts = NULL; - child->bts_buffer = NULL; - child->bts_size = 0; return error; } @@ -784,6 +799,9 @@ static void ptrace_bts_untrace(struct task_struct *child) ds_release_bts(child->bts); child->bts = NULL; + /* We cannot update total_vm and locked_vm since + child's mm is already gone. But we can reclaim the + memory. */ kfree(child->bts_buffer); child->bts_buffer = NULL; child->bts_size = 0; @@ -792,7 +810,12 @@ static void ptrace_bts_untrace(struct task_struct *child) static void ptrace_bts_detach(struct task_struct *child) { - ptrace_bts_untrace(child); + if (unlikely(child->bts)) { + ds_release_bts(child->bts); + child->bts = NULL; + + ptrace_bts_free_buffer(child); + } } #else static inline void ptrace_bts_fork(struct task_struct *tsk) {} diff --git a/include/linux/mm.h b/include/linux/mm.h index ffee2f743418..9979d3fab6e7 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1286,5 +1286,7 @@ int vmemmap_populate_basepages(struct page *start_page, int vmemmap_populate(struct page *start_page, unsigned long pages, int node); void vmemmap_populate_print_last(void); +extern void *alloc_locked_buffer(size_t size); +extern void free_locked_buffer(void *buffer, size_t size); #endif /* __KERNEL__ */ #endif /* _LINUX_MM_H */ diff --git a/mm/mlock.c b/mm/mlock.c index 1ada366570cb..3035a56e7616 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -667,3 +667,48 @@ void user_shm_unlock(size_t size, struct user_struct *user) spin_unlock(&shmlock_user_lock); free_uid(user); } + +void *alloc_locked_buffer(size_t size) +{ + unsigned 
long rlim, vm, pgsz; + void *buffer = NULL; + + pgsz = PAGE_ALIGN(size) >> PAGE_SHIFT; + + down_write(&current->mm->mmap_sem); + + rlim = current->signal->rlim[RLIMIT_AS].rlim_cur >> PAGE_SHIFT; + vm = current->mm->total_vm + pgsz; + if (rlim < vm) + goto out; + + rlim = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur >> PAGE_SHIFT; + vm = current->mm->locked_vm + pgsz; + if (rlim < vm) + goto out; + + buffer = kzalloc(size, GFP_KERNEL); + if (!buffer) + goto out; + + current->mm->total_vm += pgsz; + current->mm->locked_vm += pgsz; + + out: + up_write(&current->mm->mmap_sem); + return buffer; +} + +void free_locked_buffer(void *buffer, size_t size) +{ + unsigned long pgsz = PAGE_ALIGN(size) >> PAGE_SHIFT; + + down_write(&current->mm->mmap_sem); + + current->mm->total_vm -= pgsz; + current->mm->locked_vm -= pgsz; + + up_write(&current->mm->mmap_sem); + + kfree(buffer); +} -- cgit v1.2.3 From 280a9ca5d0663b185ddc4443052076c29652a328 Mon Sep 17 00:00:00 2001 From: Dmitry Adamushko Date: Sat, 20 Dec 2008 00:15:24 +0100 Subject: x86: fix resume (S2R) broken by Intel microcode module, on A110L Impact: fix deadlock This is in response to the following bug report: Bug-Entry : http://bugzilla.kernel.org/show_bug.cgi?id=12100 Subject : resume (S2R) broken by Intel microcode module, on A110L Submitter : Andreas Mohr Date : 2008-11-25 08:48 (19 days old) Handled-By : Dmitry Adamushko [ The deadlock scenario has been discovered by Andreas Mohr ] I think I might have a logical explanation why the system: (http://bugzilla.kernel.org/show_bug.cgi?id=12100) might hang upon resuming, OTOH it should have likely hanged each and every time. (1) possible deadlock in microcode_resume_cpu() if either 'if' section is taken; (2) now, I don't see it in spec. and can't experimentally verify it (newer ucodes don't seem to be available for my Core2duo)...
but logically-wise, I'd think that when read upon resuming, the 'microcode revision' (MSR 0x8B) should be back to its original one (we need to reload ucode anyway so it doesn't seem logical if a cpu doesn't drop the version)... if so, the comparison with memcmp() for the full 'struct cpu_signature' is wrong... and that's how one of the aforementioned 'if' sections might have been triggered - leading to a deadlock. Obviously, in my tests I simulated loading/resuming with the ucode of the same version (just to see that the file is loaded/re-loaded upon resuming) so this issue has never popped up. I'd appreciate if someone with an appropriate system might give a try to the 2nd patch (titled "fix a comparison && deadlock..."). In any case, the deadlock situation is a must-have fix. Reported-by: Andreas Mohr Signed-off-by: Dmitry Adamushko Tested-by: Andreas Mohr Signed-off-by: Ingo Molnar Cc: Signed-off-by: Ingo Molnar --- arch/x86/kernel/microcode_core.c | 19 ++++++++++++++----- arch/x86/kernel/microcode_intel.c | 6 ++++++ 2 files changed, 20 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c index 82fb2809ce32..c4b5b24e0217 100644 --- a/arch/x86/kernel/microcode_core.c +++ b/arch/x86/kernel/microcode_core.c @@ -272,13 +272,18 @@ static struct attribute_group mc_attr_group = { .name = "microcode", }; -static void microcode_fini_cpu(int cpu) +static void __microcode_fini_cpu(int cpu) { struct ucode_cpu_info *uci = ucode_cpu_info + cpu; - mutex_lock(&microcode_mutex); microcode_ops->microcode_fini_cpu(cpu); uci->valid = 0; +} + +static void microcode_fini_cpu(int cpu) +{ + mutex_lock(&microcode_mutex); + __microcode_fini_cpu(cpu); mutex_unlock(&microcode_mutex); } @@ -306,12 +311,16 @@ static int microcode_resume_cpu(int cpu) * to this cpu (a bit of paranoia): */ if (microcode_ops->collect_cpu_info(cpu, &nsig)) { - microcode_fini_cpu(cpu); + __microcode_fini_cpu(cpu); + printk(KERN_ERR "failed to
collect_cpu_info for resuming cpu #%d\n", + cpu); return -1; } - if (memcmp(&nsig, &uci->cpu_sig, sizeof(nsig))) { - microcode_fini_cpu(cpu); + if ((nsig.sig != uci->cpu_sig.sig) || (nsig.pf != uci->cpu_sig.pf)) { + __microcode_fini_cpu(cpu); + printk(KERN_ERR "cached ucode doesn't match the resuming cpu #%d\n", + cpu); /* Should we look for a new ucode here? */ return 1; } diff --git a/arch/x86/kernel/microcode_intel.c b/arch/x86/kernel/microcode_intel.c index 622dc4a21784..a8e62792d171 100644 --- a/arch/x86/kernel/microcode_intel.c +++ b/arch/x86/kernel/microcode_intel.c @@ -155,6 +155,7 @@ static DEFINE_SPINLOCK(microcode_update_lock); static int collect_cpu_info(int cpu_num, struct cpu_signature *csig) { struct cpuinfo_x86 *c = &cpu_data(cpu_num); + unsigned long flags; unsigned int val[2]; memset(csig, 0, sizeof(*csig)); @@ -174,11 +175,16 @@ static int collect_cpu_info(int cpu_num, struct cpu_signature *csig) csig->pf = 1 << ((val[1] >> 18) & 7); } + /* serialize access to the physical write to MSR 0x79 */ + spin_lock_irqsave(&microcode_update_lock, flags); + wrmsr(MSR_IA32_UCODE_REV, 0, 0); /* see notes above for revision 1.07. Apparent chip bug */ sync_core(); /* get the current revision from MSR 0x8B */ rdmsr(MSR_IA32_UCODE_REV, val[0], csig->rev); + spin_unlock_irqrestore(&microcode_update_lock, flags); + pr_debug("microcode: collect_cpu_info : sig=0x%x, pf=0x%x, rev=0x%x\n", csig->sig, csig->pf, csig->rev); -- cgit v1.2.3 From adf77bac052bb5bf0722b2ce2af9fefc5b2d2a71 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Mon, 22 Dec 2008 17:56:05 -0800 Subject: x86: prioritize the FPU traps for the error code In the case of multiple FPU errors, prioritize the error codes, instead of returning __SI_FAULT, which ends up pushing a 0 as the error code to userspace, a POSIX violation.
For i386, we will simply return if there are no errors at all; for x86-64 this is probably a "can't happen" (and the code should be unified), but for this patch, return __SI_FAULT|SI_KERNEL if this ever happens. Signed-off-by: H. Peter Anvin --- arch/x86/kernel/traps.c | 34 +++++++++++++++------------------- 1 file changed, 15 insertions(+), 19 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 04d242ab0161..c320c29255c2 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -664,7 +664,7 @@ void math_error(void __user *ip) { struct task_struct *task; siginfo_t info; - unsigned short cwd, swd; + unsigned short cwd, swd, err; /* * Save the info for the exception handler and clear the error. @@ -675,7 +675,6 @@ void math_error(void __user *ip) task->thread.error_code = 0; info.si_signo = SIGFPE; info.si_errno = 0; - info.si_code = __SI_FAULT; info.si_addr = ip; /* * (~cwd & swd) will mask out exceptions that are not set to unmasked @@ -689,34 +688,31 @@ void math_error(void __user *ip) */ cwd = get_fpu_cwd(task); swd = get_fpu_swd(task); - switch (swd & ~cwd & 0x3f) { - case 0x000: /* No unmasked exception */ -#ifdef CONFIG_X86_32 + + err = swd & ~cwd & 0x3f; + +#if CONFIG_X86_32 + if (!err) return; #endif - default: /* Multiple exceptions */ - break; - case 0x001: /* Invalid Op */ + + if (err & 0x001) { /* Invalid op */ /* * swd & 0x240 == 0x040: Stack Underflow * swd & 0x240 == 0x240: Stack Overflow * User must clear the SF bit (0x40) if set */ info.si_code = FPE_FLTINV; - break; - case 0x002: /* Denormalize */ - case 0x010: /* Underflow */ - info.si_code = FPE_FLTUND; - break; - case 0x004: /* Zero Divide */ + } else if (err & 0x004) { /* Divide by Zero */ info.si_code = FPE_FLTDIV; - break; - case 0x008: /* Overflow */ + } else if (err & 0x008) { /* Overflow */ info.si_code = FPE_FLTOVF; - break; - case 0x020: /* Precision */ + } else if (err & 0x012) { /* Denormal, Underflow */ + info.si_code 
= FPE_FLTUND; + } else if (err & 0x020) { /* Precision */ info.si_code = FPE_FLTRES; - break; + } else { + info.si_code = __SI_FAULT|SI_KERNEL; /* WTF? */ } force_sig_info(SIGFPE, &info, task); } -- cgit v1.2.3 From c1c15b65ec30275575dac9322aae607075769fbc Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Tue, 23 Dec 2008 10:10:40 -0800 Subject: x86: PAT: fix address types in track_pfn_vma_new() Impact: cleanup, fix warning This warning: arch/x86/mm/pat.c: In function track_pfn_vma_copy: arch/x86/mm/pat.c:701: warning: passing argument 5 of follow_phys from incompatible pointer type Triggers because physical addresses are resource_size_t, not u64. This really matters when calling an interface like follow_phys() which takes a pointer to a physical address -- although on x86, being littleendian, it would generally work anyway as long as the memory region wasn't completely uninitialized. Signed-off-by: H. Peter Anvin Signed-off-by: Ingo Molnar --- arch/x86/mm/pat.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index 541bcc944a5b..85cbd3cd3723 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -684,7 +684,7 @@ int track_pfn_vma_copy(struct vm_area_struct *vma) { int retval = 0; unsigned long i, j; - u64 paddr; + resource_size_t paddr; unsigned long prot; unsigned long vma_start = vma->vm_start; unsigned long vma_end = vma->vm_end; @@ -746,8 +746,8 @@ int track_pfn_vma_new(struct vm_area_struct *vma, pgprot_t prot, { int retval = 0; unsigned long i, j; - u64 base_paddr; - u64 paddr; + resource_size_t base_paddr; + resource_size_t paddr; unsigned long vma_start = vma->vm_start; unsigned long vma_end = vma->vm_end; unsigned long vma_size = vma_end - vma_start; @@ -757,12 +757,12 @@ int track_pfn_vma_new(struct vm_area_struct *vma, pgprot_t prot, if (is_linear_pfn_mapping(vma)) { /* reserve the whole chunk starting from vm_pgoff */ - paddr = (u64)vma->vm_pgoff << 
PAGE_SHIFT; + paddr = (resource_size_t)vma->vm_pgoff << PAGE_SHIFT; return reserve_pfn_range(paddr, vma_size, prot); } /* reserve page by page using pfn and size */ - base_paddr = (u64)pfn << PAGE_SHIFT; + base_paddr = (resource_size_t)pfn << PAGE_SHIFT; for (i = 0; i < size; i += PAGE_SIZE) { paddr = base_paddr + i; retval = reserve_pfn_range(paddr, PAGE_SIZE, prot); @@ -790,7 +790,7 @@ void untrack_pfn_vma(struct vm_area_struct *vma, unsigned long pfn, unsigned long size) { unsigned long i; - u64 paddr; + resource_size_t paddr; unsigned long prot; unsigned long vma_start = vma->vm_start; unsigned long vma_end = vma->vm_end; @@ -801,14 +801,14 @@ void untrack_pfn_vma(struct vm_area_struct *vma, unsigned long pfn, if (is_linear_pfn_mapping(vma)) { /* free the whole chunk starting from vm_pgoff */ - paddr = (u64)vma->vm_pgoff << PAGE_SHIFT; + paddr = (resource_size_t)vma->vm_pgoff << PAGE_SHIFT; free_pfn_range(paddr, vma_size); return; } if (size != 0 && size != vma_size) { /* free page by page, using pfn and size */ - paddr = (u64)pfn << PAGE_SHIFT; + paddr = (resource_size_t)pfn << PAGE_SHIFT; for (i = 0; i < size; i += PAGE_SIZE) { paddr = paddr + i; free_pfn_range(paddr, PAGE_SIZE); -- cgit v1.2.3 From 40f15ad8aadff5ebb621b17a6f303ad2cd3f847d Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 24 Dec 2008 10:49:51 +0100 Subject: x86: disable X86_PTRACE_BTS there's a new ptrace arch level feature in .28: config X86_PTRACE_BTS bool "Branch Trace Store" it has broken fork() handling: the old DS area gets copied over into a new task without clearing it. Fixes exist but they came too late: c5dee61: x86, bts: memory accounting bf53de9: x86, bts: add fork and exit handling and are queued up for v2.6.29. This shows that the facility is still not tested well enough to release into a stable kernel - disable it for now and reactivate in .29. In .29 the hardware-branch-tracer will use the DS/BTS facilities too - hopefully resulting in better code. 
Signed-off-by: Ingo Molnar --- arch/x86/Kconfig.cpu | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu index b815664fe370..8e99073b9e0f 100644 --- a/arch/x86/Kconfig.cpu +++ b/arch/x86/Kconfig.cpu @@ -520,6 +520,7 @@ config X86_PTRACE_BTS bool "Branch Trace Store" default y depends on X86_DEBUGCTLMSR + depends on BROKEN help This adds a ptrace interface to the hardware's branch trace store. -- cgit v1.2.3 From 67be403d897f818b1a5ecc201967b0ee6a0332f9 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 24 Dec 2008 21:08:37 +0100 Subject: Revert "x86: disable X86_PTRACE_BTS" This reverts commit 40f15ad8aadff5ebb621b17a6f303ad2cd3f847d. The CONFIG_X86_PTRACE_BTS bugs have been fixed via: c5dee61: x86, bts: memory accounting bf53de9: x86, bts: add fork and exit handling Signed-off-by: Ingo Molnar --- arch/x86/Kconfig.cpu | 1 - 1 file changed, 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu index b54903efb39e..85a78575956c 100644 --- a/arch/x86/Kconfig.cpu +++ b/arch/x86/Kconfig.cpu @@ -521,7 +521,6 @@ config X86_PTRACE_BTS bool "Branch Trace Store" default y depends on X86_DEBUGCTLMSR - depends on BROKEN help This adds a ptrace interface to the hardware's branch trace store. -- cgit v1.2.3 From 1c06da81a5d042d5fba67c4c533b16ae62a174ab Mon Sep 17 00:00:00 2001 From: Kent Liu Date: Fri, 31 Oct 2008 16:52:58 +0800 Subject: crypto: crc32c-intel - Update copyright head The original copyright head for crc32c-intel.c is incorrect. Please merge the patch to update it. 
Signed-Off-By: Kent Liu Signed-off-by: Herbert Xu --- arch/x86/crypto/crc32c-intel.c | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/crypto/crc32c-intel.c b/arch/x86/crypto/crc32c-intel.c index 070afc5b6c94..a2c539cc52b7 100644 --- a/arch/x86/crypto/crc32c-intel.c +++ b/arch/x86/crypto/crc32c-intel.c @@ -6,13 +6,22 @@ * Intel(R) 64 and IA-32 Architectures Software Developer's Manual * Volume 2A: Instruction Set Reference, A-M * - * Copyright (c) 2008 Austin Zhang - * Copyright (c) 2008 Kent Liu + * Copyright (C) 2008 Intel Corporation + * Authors: Austin Zhang + * Kent Liu * * This program is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License as published by the Free - * Software Foundation; either version 2 of the License, or (at your option) - * any later version. + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. * */ #include @@ -194,4 +203,3 @@ MODULE_LICENSE("GPL"); MODULE_ALIAS("crc32c"); MODULE_ALIAS("crc32c-intel"); - -- cgit v1.2.3 From b7e8bdadce6317eb13c13b9451d7114614aa1450 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Thu, 6 Nov 2008 16:56:41 +0800 Subject: crypto: crc32c-intel - Switch to shash This patch changes crc32c-intel to the new shash interface. 
Signed-off-by: Herbert Xu --- arch/x86/crypto/crc32c-intel.c | 101 +++++++++++++++++++---------------------- 1 file changed, 47 insertions(+), 54 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/crypto/crc32c-intel.c b/arch/x86/crypto/crc32c-intel.c index a2c539cc52b7..b9d00261703c 100644 --- a/arch/x86/crypto/crc32c-intel.c +++ b/arch/x86/crypto/crc32c-intel.c @@ -84,99 +84,92 @@ static u32 __pure crc32c_intel_le_hw(u32 crc, unsigned char const *p, size_t len * If your algorithm starts with ~0, then XOR with ~0 before you set * the seed. */ -static int crc32c_intel_setkey(struct crypto_ahash *hash, const u8 *key, +static int crc32c_intel_setkey(struct crypto_shash *hash, const u8 *key, unsigned int keylen) { - u32 *mctx = crypto_ahash_ctx(hash); + u32 *mctx = crypto_shash_ctx(hash); if (keylen != sizeof(u32)) { - crypto_ahash_set_flags(hash, CRYPTO_TFM_RES_BAD_KEY_LEN); + crypto_shash_set_flags(hash, CRYPTO_TFM_RES_BAD_KEY_LEN); return -EINVAL; } *mctx = le32_to_cpup((__le32 *)key); return 0; } -static int crc32c_intel_init(struct ahash_request *req) +static int crc32c_intel_init(struct shash_desc *desc) { - u32 *mctx = crypto_ahash_ctx(crypto_ahash_reqtfm(req)); - u32 *crcp = ahash_request_ctx(req); + u32 *mctx = crypto_shash_ctx(desc->tfm); + u32 *crcp = shash_desc_ctx(desc); *crcp = *mctx; return 0; } -static int crc32c_intel_update(struct ahash_request *req) +static int crc32c_intel_update(struct shash_desc *desc, const u8 *data, + unsigned int len) { - struct crypto_hash_walk walk; - u32 *crcp = ahash_request_ctx(req); - u32 crc = *crcp; - int nbytes; + u32 *crcp = shash_desc_ctx(desc); - for (nbytes = crypto_hash_walk_first(req, &walk); nbytes; - nbytes = crypto_hash_walk_done(&walk, 0)) - crc = crc32c_intel_le_hw(crc, walk.data, nbytes); - - *crcp = crc; + *crcp = crc32c_intel_le_hw(*crcp, data, len); return 0; } -static int crc32c_intel_final(struct ahash_request *req) +static int __crc32c_intel_finup(u32 *crcp, const u8 *data, unsigned int len, 
+ u8 *out) { - u32 *crcp = ahash_request_ctx(req); - - *(__le32 *)req->result = ~cpu_to_le32p(crcp); + *(__le32 *)out = ~cpu_to_le32(crc32c_intel_le_hw(*crcp, data, len)); return 0; } -static int crc32c_intel_digest(struct ahash_request *req) +static int crc32c_intel_finup(struct shash_desc *desc, const u8 *data, + unsigned int len, u8 *out) { - struct crypto_hash_walk walk; - u32 *mctx = crypto_ahash_ctx(crypto_ahash_reqtfm(req)); - u32 crc = *mctx; - int nbytes; + return __crc32c_intel_finup(shash_desc_ctx(desc), data, len, out); +} - for (nbytes = crypto_hash_walk_first(req, &walk); nbytes; - nbytes = crypto_hash_walk_done(&walk, 0)) - crc = crc32c_intel_le_hw(crc, walk.data, nbytes); +static int crc32c_intel_final(struct shash_desc *desc, u8 *out) +{ + u32 *crcp = shash_desc_ctx(desc); - *(__le32 *)req->result = ~cpu_to_le32(crc); + *(__le32 *)out = ~cpu_to_le32p(crcp); return 0; } +static int crc32c_intel_digest(struct shash_desc *desc, const u8 *data, + unsigned int len, u8 *out) +{ + return __crc32c_intel_finup(crypto_shash_ctx(desc->tfm), data, len, + out); +} + static int crc32c_intel_cra_init(struct crypto_tfm *tfm) { u32 *key = crypto_tfm_ctx(tfm); *key = ~0; - tfm->crt_ahash.reqsize = sizeof(u32); - return 0; } -static struct crypto_alg alg = { - .cra_name = "crc32c", - .cra_driver_name = "crc32c-intel", - .cra_priority = 200, - .cra_flags = CRYPTO_ALG_TYPE_AHASH, - .cra_blocksize = CHKSUM_BLOCK_SIZE, - .cra_alignmask = 3, - .cra_ctxsize = sizeof(u32), - .cra_module = THIS_MODULE, - .cra_list = LIST_HEAD_INIT(alg.cra_list), - .cra_init = crc32c_intel_cra_init, - .cra_type = &crypto_ahash_type, - .cra_u = { - .ahash = { - .digestsize = CHKSUM_DIGEST_SIZE, - .setkey = crc32c_intel_setkey, - .init = crc32c_intel_init, - .update = crc32c_intel_update, - .final = crc32c_intel_final, - .digest = crc32c_intel_digest, - } +static struct shash_alg alg = { + .setkey = crc32c_intel_setkey, + .init = crc32c_intel_init, + .update = crc32c_intel_update, + .final = 
crc32c_intel_final, + .finup = crc32c_intel_finup, + .digest = crc32c_intel_digest, + .descsize = sizeof(u32), + .digestsize = CHKSUM_DIGEST_SIZE, + .base = { + .cra_name = "crc32c", + .cra_driver_name = "crc32c-intel", + .cra_priority = 200, + .cra_blocksize = CHKSUM_BLOCK_SIZE, + .cra_ctxsize = sizeof(u32), + .cra_module = THIS_MODULE, + .cra_init = crc32c_intel_cra_init, } }; @@ -184,14 +177,14 @@ static struct crypto_alg alg = { static int __init crc32c_intel_mod_init(void) { if (cpu_has_xmm4_2) - return crypto_register_alg(&alg); + return crypto_register_shash(&alg); else return -ENODEV; } static void __exit crc32c_intel_mod_fini(void) { - crypto_unregister_alg(&alg); + crypto_unregister_shash(&alg); } module_init(crc32c_intel_mod_init); -- cgit v1.2.3 From 0ca59dd948a51c95d5a366d35f897bc5ef9df55d Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 24 Dec 2008 23:30:02 +0100 Subject: tracing/ftrace: don't trace on early stage of a secondary cpu boot, v3 Impact: fix a crash/hard-reboot on certain configs while enabling cpu runtime On some archs, the boot of a secondary cpu can have an early fragile state. On x86-64, the pda is not initialized on the first stage of a cpu boot but it is needed to get the cpu number and the current task pointer. This data is needed during tracing. As they were dereferenced at this stage, we got a crash while tracing a cpu being enabled at runtime. Some other archs like ia64 can have such kind of issue too. Changes on v2: We dropped the previous solution of a per-arch called function to guess the current state of a cpu. That could slow down the tracing. This patch removes the -pg flag on arch/x86/kernel/cpu/common.c where the low level cpu boot functions exist, on start_secondary() and a helper function used at this stage. 
Signed-off-by: Frederic Weisbecker Acked-by: Steven Rostedt Signed-off-by: Ingo Molnar --- arch/x86/include/asm/msr.h | 3 ++- arch/x86/kernel/cpu/Makefile | 5 +++++ arch/x86/kernel/smpboot.c | 2 +- 3 files changed, 8 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/msr.h b/arch/x86/include/asm/msr.h index c2a812ebde89..b8a1799ea871 100644 --- a/arch/x86/include/asm/msr.h +++ b/arch/x86/include/asm/msr.h @@ -85,7 +85,8 @@ static inline void native_write_msr(unsigned int msr, asm volatile("wrmsr" : : "c" (msr), "a"(low), "d" (high) : "memory"); } -static inline int native_write_msr_safe(unsigned int msr, +/* Can be uninlined because referenced by paravirt */ +notrace static inline int native_write_msr_safe(unsigned int msr, unsigned low, unsigned high) { int err; diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index 82ec6075c057..4ae495a313f3 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -2,6 +2,11 @@ # Makefile for x86-compatible CPU details and quirks # +# Don't trace early stages of a secondary CPU boot +ifdef CONFIG_FUNCTION_TRACER +CFLAGS_REMOVE_common.o = -pg +endif + obj-y := intel_cacheinfo.o addon_cpuid_features.o obj-y += proc.o capflags.o powerflags.o common.o diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index f71f96fc9e62..f6174d229024 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -287,7 +287,7 @@ static int __cpuinitdata unsafe_smp; /* * Activate a secondary processor. 
*/ -static void __cpuinit start_secondary(void *unused) +notrace static void __cpuinit start_secondary(void *unused) { /* * Don't put *anything* before cpu_init(), SMP booting is too -- cgit v1.2.3 From 1fcccb008be12ea823aaa392758e1e41fb82de9a Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Date: Tue, 23 Dec 2008 21:50:11 +0530 Subject: x86: traps.c replace #if CONFIG_X86_32 with #ifdef CONFIG_X86_32 Impact: cleanup, avoid warning on X86_64 Fixes this warning on X86_64: CC arch/x86/kernel/traps.o arch/x86/kernel/traps.c:695:5: warning: "CONFIG_X86_32" is not defined Signed-off-by: Jaswinder Singh Signed-off-by: Ingo Molnar --- arch/x86/kernel/traps.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index c320c29255c2..f37cee75ab58 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -691,7 +691,7 @@ void math_error(void __user *ip) err = swd & ~cwd & 0x3f; -#if CONFIG_X86_32 +#ifdef CONFIG_X86_32 if (!err) return; #endif -- cgit v1.2.3 From 4e17fee24a39448f3a20e9cf98887b7665825848 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 25 Dec 2008 12:04:17 +0100 Subject: x86: turn CONFIG_SPARSE_IRQ off by default New feature - lets disable it by default first - can flip it around later. Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 1 - 1 file changed, 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 5c243826334a..d14a8806227d 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -241,7 +241,6 @@ config X86_HAS_BOOT_CPU_ID config SPARSE_IRQ bool "Support sparse irq numbering" depends on PCI_MSI || HT_IRQ - default y help This enables support for sparse irq, esp for msi/msi-x. You may need if you have lots of cards supports msi-x installed. 
-- cgit v1.2.3 From fc5243d98ac2575ad14a974b3c097e9ba874c03d Mon Sep 17 00:00:00 2001 From: Martin Schwidefsky Date: Thu, 25 Dec 2008 13:38:35 +0100 Subject: [S390] arch_setup_additional_pages arguments arch_setup_additional_pages currently gets two arguments, the binary format description and an indication if the process uses an executable stack or not. The second argument is not used by anybody, it could be removed without replacement. What actually does make sense is to pass an indication if the process uses the elf interpreter or not. The glibc code will not use anything from the vdso if the process does not use the dynamic linker, so for statically linked binaries the architecture backend can choose not to map the vdso. Acked-by: Ingo Molnar Signed-off-by: Martin Schwidefsky --- arch/powerpc/include/asm/elf.h | 2 +- arch/powerpc/kernel/vdso.c | 3 +-- arch/sh/include/asm/elf.h | 2 +- arch/sh/kernel/vsyscall/vsyscall.c | 3 +-- arch/x86/include/asm/elf.h | 2 +- arch/x86/vdso/vdso32-setup.c | 2 +- arch/x86/vdso/vma.c | 2 +- fs/binfmt_elf.c | 2 +- 8 files changed, 8 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/powerpc/include/asm/elf.h b/arch/powerpc/include/asm/elf.h index d812929390e4..cd46f023ec6d 100644 --- a/arch/powerpc/include/asm/elf.h +++ b/arch/powerpc/include/asm/elf.h @@ -267,7 +267,7 @@ extern int ucache_bsize; #define ARCH_HAS_SETUP_ADDITIONAL_PAGES struct linux_binprm; extern int arch_setup_additional_pages(struct linux_binprm *bprm, - int executable_stack); + int uses_interp); #define VDSO_AUX_ENT(a,b) NEW_AUX_ENT(a,b); #endif /* __KERNEL__ */ diff --git a/arch/powerpc/kernel/vdso.c b/arch/powerpc/kernel/vdso.c index 65639a43e644..f7ec7d0888fe 100644 --- a/arch/powerpc/kernel/vdso.c +++ b/arch/powerpc/kernel/vdso.c @@ -184,8 +184,7 @@ static void dump_vdso_pages(struct vm_area_struct * vma) * This is called from binfmt_elf, we create the special vma for the * vDSO and insert it into the mm struct tree */ -int
arch_setup_additional_pages(struct linux_binprm *bprm, - int executable_stack) +int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) { struct mm_struct *mm = current->mm; struct page **vdso_pagelist; diff --git a/arch/sh/include/asm/elf.h b/arch/sh/include/asm/elf.h index 9eb9036a1bdc..9381397ebeb8 100644 --- a/arch/sh/include/asm/elf.h +++ b/arch/sh/include/asm/elf.h @@ -204,7 +204,7 @@ do { \ #define ARCH_HAS_SETUP_ADDITIONAL_PAGES struct linux_binprm; extern int arch_setup_additional_pages(struct linux_binprm *bprm, - int executable_stack); + int uses_interp); extern unsigned int vdso_enabled; extern void __kernel_vsyscall; diff --git a/arch/sh/kernel/vsyscall/vsyscall.c b/arch/sh/kernel/vsyscall/vsyscall.c index 95f4de0800ec..3f7e415be86a 100644 --- a/arch/sh/kernel/vsyscall/vsyscall.c +++ b/arch/sh/kernel/vsyscall/vsyscall.c @@ -59,8 +59,7 @@ int __init vsyscall_init(void) } /* Setup a VMA at program startup for the vsyscall page */ -int arch_setup_additional_pages(struct linux_binprm *bprm, - int executable_stack) +int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) { struct mm_struct *mm = current->mm; unsigned long addr; diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h index 40ca1bea7916..f51a3ddde01a 100644 --- a/arch/x86/include/asm/elf.h +++ b/arch/x86/include/asm/elf.h @@ -325,7 +325,7 @@ struct linux_binprm; #define ARCH_HAS_SETUP_ADDITIONAL_PAGES 1 extern int arch_setup_additional_pages(struct linux_binprm *bprm, - int executable_stack); + int uses_interp); extern int syscall32_setup_pages(struct linux_binprm *, int exstack); #define compat_arch_setup_additional_pages syscall32_setup_pages diff --git a/arch/x86/vdso/vdso32-setup.c b/arch/x86/vdso/vdso32-setup.c index 513f330c5832..1241f118ab56 100644 --- a/arch/x86/vdso/vdso32-setup.c +++ b/arch/x86/vdso/vdso32-setup.c @@ -310,7 +310,7 @@ int __init sysenter_setup(void) } /* Setup a VMA at program startup for the vsyscall page */ -int 
arch_setup_additional_pages(struct linux_binprm *bprm, int exstack) +int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) { struct mm_struct *mm = current->mm; unsigned long addr; diff --git a/arch/x86/vdso/vma.c b/arch/x86/vdso/vma.c index 257ba4a10abf..9c98cc6ba978 100644 --- a/arch/x86/vdso/vma.c +++ b/arch/x86/vdso/vma.c @@ -98,7 +98,7 @@ static unsigned long vdso_addr(unsigned long start, unsigned len) /* Setup a VMA at program startup for the vsyscall page. Not called for compat tasks */ -int arch_setup_additional_pages(struct linux_binprm *bprm, int exstack) +int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) { struct mm_struct *mm = current->mm; unsigned long addr; diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 8fcfa398d350..95a76ff9e01b 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -949,7 +949,7 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs) set_binfmt(&elf_format); #ifdef ARCH_HAS_SETUP_ADDITIONAL_PAGES - retval = arch_setup_additional_pages(bprm, executable_stack); + retval = arch_setup_additional_pages(bprm, !!elf_interpreter); if (retval < 0) { send_sig(SIGKILL, current, 0); goto out; -- cgit v1.2.3 From 973656fe1afb4adf95d7b9ab75d4660cd3821ea1 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 25 Dec 2008 16:26:47 +0100 Subject: x86, sparseirq: clean up Kconfig entry Impact: improve help text Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index d14a8806227d..e4c038abb71c 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -242,10 +242,14 @@ config SPARSE_IRQ bool "Support sparse irq numbering" depends on PCI_MSI || HT_IRQ help - This enables support for sparse irq, esp for msi/msi-x. You may need - if you have lots of cards supports msi-x installed. + This enables support for sparse irqs. 
This is useful for distro + kernels that want to define a high CONFIG_NR_CPUS value but still + want to have low kernel memory footprint on smaller machines. - If you don't know what to do here, say Y. + ( Sparse IRQs can also be beneficial on NUMA boxes, as they spread + out the irq_desc[] array in a more NUMA-friendly way. ) + + If you don't know what to do here, say N. config NUMA_MIGRATE_IRQ_DESC bool "Move irq desc when changing irq smp_affinity" -- cgit v1.2.3 From 393d68fb9929817cde7ab31c82d66fcb28ad35fc Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Fri, 26 Dec 2008 22:23:38 +1030 Subject: cpumask: x86: Introduce cpumask_of_{node,pcibus} to replace {node,pcibus}_to_cpumask Impact: New APIs The old node_to_cpumask/node_to_pcibus returned a cpumask_t: these return a pointer to a struct cpumask. Part of removing cpumasks from the stack. Also makes __pcibus_to_node take a const pointer. Signed-off-by: Rusty Russell Acked-by: Ingo Molnar --- arch/x86/include/asm/pci.h | 10 ++++++++-- arch/x86/include/asm/topology.h | 35 +++++++++++++++++++++++------------ arch/x86/kernel/setup_percpu.c | 8 ++++---- 3 files changed, 35 insertions(+), 18 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/pci.h b/arch/x86/include/asm/pci.h index 875b38edf193..52d80d3d94f3 100644 --- a/arch/x86/include/asm/pci.h +++ b/arch/x86/include/asm/pci.h @@ -98,9 +98,9 @@ static inline void early_quirks(void) { } #ifdef CONFIG_NUMA /* Returns the node based on pci bus */ -static inline int __pcibus_to_node(struct pci_bus *bus) +static inline int __pcibus_to_node(const struct pci_bus *bus) { - struct pci_sysdata *sd = bus->sysdata; + const struct pci_sysdata *sd = bus->sysdata; return sd->node; } @@ -109,6 +109,12 @@ static inline cpumask_t __pcibus_to_cpumask(struct pci_bus *bus) { return node_to_cpumask(__pcibus_to_node(bus)); } + +static inline const struct cpumask * +cpumask_of_pcibus(const struct pci_bus *bus) +{ + return cpumask_of_node(__pcibus_to_node(bus)); +} 
#endif #endif /* _ASM_X86_PCI_H */ diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h index ff386ff50ed7..45da5dc50fc8 100644 --- a/arch/x86/include/asm/topology.h +++ b/arch/x86/include/asm/topology.h @@ -61,13 +61,19 @@ static inline int cpu_to_node(int cpu) * * Side note: this function creates the returned cpumask on the stack * so with a high NR_CPUS count, excessive stack space is used. The - * node_to_cpumask_ptr function should be used whenever possible. + * cpumask_of_node function should be used whenever possible. */ static inline cpumask_t node_to_cpumask(int node) { return node_to_cpumask_map[node]; } +/* Returns a bitmask of CPUs on Node 'node'. */ +static inline const struct cpumask *cpumask_of_node(int node) +{ + return &node_to_cpumask_map[node]; +} + #else /* CONFIG_X86_64 */ /* Mappings between node number and cpus on that node. */ @@ -82,7 +88,7 @@ DECLARE_EARLY_PER_CPU(int, x86_cpu_to_node_map); #ifdef CONFIG_DEBUG_PER_CPU_MAPS extern int cpu_to_node(int cpu); extern int early_cpu_to_node(int cpu); -extern const cpumask_t *_node_to_cpumask_ptr(int node); +extern const cpumask_t *cpumask_of_node(int node); extern cpumask_t node_to_cpumask(int node); #else /* !CONFIG_DEBUG_PER_CPU_MAPS */ @@ -103,7 +109,7 @@ static inline int early_cpu_to_node(int cpu) } /* Returns a pointer to the cpumask of CPUs on Node 'node'. 
*/ -static inline const cpumask_t *_node_to_cpumask_ptr(int node) +static inline const cpumask_t *cpumask_of_node(int node) { return &node_to_cpumask_map[node]; } @@ -116,12 +122,15 @@ static inline cpumask_t node_to_cpumask(int node) #endif /* !CONFIG_DEBUG_PER_CPU_MAPS */ -/* Replace default node_to_cpumask_ptr with optimized version */ +/* + * Replace default node_to_cpumask_ptr with optimized version + * Deprecated: use "const struct cpumask *mask = cpumask_of_node(node)" + */ #define node_to_cpumask_ptr(v, node) \ - const cpumask_t *v = _node_to_cpumask_ptr(node) + const cpumask_t *v = cpumask_of_node(node) #define node_to_cpumask_ptr_next(v, node) \ - v = _node_to_cpumask_ptr(node) + v = cpumask_of_node(node) #endif /* CONFIG_X86_64 */ @@ -187,7 +196,7 @@ extern int __node_distance(int, int); #define cpu_to_node(cpu) 0 #define early_cpu_to_node(cpu) 0 -static inline const cpumask_t *_node_to_cpumask_ptr(int node) +static inline const cpumask_t *cpumask_of_node(int node) { return &cpu_online_map; } @@ -200,12 +209,15 @@ static inline int node_to_first_cpu(int node) return first_cpu(cpu_online_map); } -/* Replace default node_to_cpumask_ptr with optimized version */ +/* + * Replace default node_to_cpumask_ptr with optimized version + * Deprecated: use "const struct cpumask *mask = cpumask_of_node(node)" + */ #define node_to_cpumask_ptr(v, node) \ - const cpumask_t *v = _node_to_cpumask_ptr(node) + const cpumask_t *v = cpumask_of_node(node) #define node_to_cpumask_ptr_next(v, node) \ - v = _node_to_cpumask_ptr(node) + v = cpumask_of_node(node) #endif #include @@ -214,8 +226,7 @@ static inline int node_to_first_cpu(int node) /* Returns the number of the first CPU on Node 'node'. 
*/ static inline int node_to_first_cpu(int node) { - node_to_cpumask_ptr(mask, node); - return first_cpu(*mask); + return cpumask_first(cpumask_of_node(node)); } #endif diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index 1c2084291f97..8e8b1193add5 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -334,25 +334,25 @@ static const cpumask_t cpu_mask_none; /* * Returns a pointer to the bitmask of CPUs on Node 'node'. */ -const cpumask_t *_node_to_cpumask_ptr(int node) +const cpumask_t *cpumask_of_node(int node) { if (node_to_cpumask_map == NULL) { printk(KERN_WARNING - "_node_to_cpumask_ptr(%d): no node_to_cpumask_map!\n", + "cpumask_of_node(%d): no node_to_cpumask_map!\n", node); dump_stack(); return (const cpumask_t *)&cpu_online_map; } if (node >= nr_node_ids) { printk(KERN_WARNING - "_node_to_cpumask_ptr(%d): node > nr_node_ids(%d)\n", + "cpumask_of_node(%d): node > nr_node_ids(%d)\n", node, nr_node_ids); dump_stack(); return &cpu_mask_none; } return &node_to_cpumask_map[node]; } -EXPORT_SYMBOL(_node_to_cpumask_ptr); +EXPORT_SYMBOL(cpumask_of_node); /* * Returns a bitmask of CPUs on Node 'node'. -- cgit v1.2.3 From 030bb203e01db12e3f2866799f4f03a114d06349 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Fri, 26 Dec 2008 22:23:41 +1030 Subject: cpumask: cpu_coregroup_mask(): x86 Impact: New API Like cpu_coregroup_map, but returns a (const) pointer. 
Signed-off-by: Rusty Russell Signed-off-by: Mike Travis Cc: Ingo Molnar --- arch/x86/include/asm/topology.h | 1 + arch/x86/kernel/smpboot.c | 11 ++++++++--- 2 files changed, 9 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h index 45da5dc50fc8..168203c0c316 100644 --- a/arch/x86/include/asm/topology.h +++ b/arch/x86/include/asm/topology.h @@ -231,6 +231,7 @@ static inline int node_to_first_cpu(int node) #endif extern cpumask_t cpu_coregroup_map(int cpu); +extern const struct cpumask *cpu_coregroup_mask(int cpu); #ifdef ENABLE_TOPO_DEFINES #define topology_physical_package_id(cpu) (cpu_data(cpu).phys_proc_id) diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 468c2f9d47ae..d5274b6b088e 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -497,7 +497,7 @@ void __cpuinit set_cpu_sibling_map(int cpu) } /* maps the cpu to the sched domain representing multi-core */ -cpumask_t cpu_coregroup_map(int cpu) +const struct cpumask *cpu_coregroup_mask(int cpu) { struct cpuinfo_x86 *c = &cpu_data(cpu); /* @@ -505,9 +505,14 @@ cpumask_t cpu_coregroup_map(int cpu) * And for power savings, we return cpu_core_map */ if (sched_mc_power_savings || sched_smt_power_savings) - return per_cpu(cpu_core_map, cpu); + return &per_cpu(cpu_core_map, cpu); else - return c->llc_shared_map; + return &c->llc_shared_map; +} + +cpumask_t cpu_coregroup_map(int cpu) +{ + return *cpu_coregroup_mask(cpu); } static void impress_friends(void) -- cgit v1.2.3 From 58a24566449892dda409b9ad92c2e56c76c5670c Mon Sep 17 00:00:00 2001 From: Matias Zabaljauregui Date: Mon, 29 Sep 2008 01:40:07 -0300 Subject: lguest: move the initial guest page table creation code to the host This patch moves the initial guest page table creation code to the host, so the launcher keeps working with PAE enabled configs. 
Signed-off-by: Matias Zabaljauregui Signed-off-by: Rusty Russell --- Documentation/lguest/lguest.c | 60 ++++------------------------------ arch/x86/lguest/i386_head.S | 15 --------- drivers/lguest/lg.h | 2 +- drivers/lguest/lguest_user.c | 13 +++----- drivers/lguest/page_tables.c | 72 +++++++++++++++++++++++++++++++++++++++-- include/linux/lguest_launcher.h | 2 +- 6 files changed, 83 insertions(+), 81 deletions(-) (limited to 'arch/x86') diff --git a/Documentation/lguest/lguest.c b/Documentation/lguest/lguest.c index aa2574ca94c7..f2dbbf3bdeab 100644 --- a/Documentation/lguest/lguest.c +++ b/Documentation/lguest/lguest.c @@ -481,51 +481,6 @@ static unsigned long load_initrd(const char *name, unsigned long mem) /* We return the initrd size. */ return len; } - -/* Once we know how much memory we have we can construct simple linear page - * tables which set virtual == physical which will get the Guest far enough - * into the boot to create its own. - * - * We lay them out of the way, just below the initrd (which is why we need to - * know its size here). */ -static unsigned long setup_pagetables(unsigned long mem, - unsigned long initrd_size) -{ - unsigned long *pgdir, *linear; - unsigned int mapped_pages, i, linear_pages; - unsigned int ptes_per_page = getpagesize()/sizeof(void *); - - mapped_pages = mem/getpagesize(); - - /* Each PTE page can map ptes_per_page pages: how many do we need? */ - linear_pages = (mapped_pages + ptes_per_page-1)/ptes_per_page; - - /* We put the toplevel page directory page at the top of memory. */ - pgdir = from_guest_phys(mem) - initrd_size - getpagesize(); - - /* Now we use the next linear_pages pages as pte pages */ - linear = (void *)pgdir - linear_pages*getpagesize(); - - /* Linear mapping is easy: put every page's address into the mapping in - * order. PAGE_PRESENT contains the flags Present, Writable and - * Executable. 
*/ - for (i = 0; i < mapped_pages; i++) - linear[i] = ((i * getpagesize()) | PAGE_PRESENT); - - /* The top level points to the linear page table pages above. */ - for (i = 0; i < mapped_pages; i += ptes_per_page) { - pgdir[i/ptes_per_page] - = ((to_guest_phys(linear) + i*sizeof(void *)) - | PAGE_PRESENT); - } - - verbose("Linear mapping of %u pages in %u pte pages at %#lx\n", - mapped_pages, linear_pages, to_guest_phys(linear)); - - /* We return the top level (guest-physical) address: the kernel needs - * to know where it is. */ - return to_guest_phys(pgdir); -} /*:*/ /* Simple routine to roll all the commandline arguments together with spaces @@ -548,13 +503,13 @@ static void concat(char *dst, char *args[]) /*L:185 This is where we actually tell the kernel to initialize the Guest. We * saw the arguments it expects when we looked at initialize() in lguest_user.c: - * the base of Guest "physical" memory, the top physical page to allow, the - * top level pagetable and the entry point for the Guest. */ -static int tell_kernel(unsigned long pgdir, unsigned long start) + * the base of Guest "physical" memory, the top physical page to allow and the + * entry point for the Guest. */ +static int tell_kernel(unsigned long start) { unsigned long args[] = { LHREQ_INITIALIZE, (unsigned long)guest_base, - guest_limit / getpagesize(), pgdir, start }; + guest_limit / getpagesize(), start }; int fd; verbose("Guest: %p - %p (%#lx)\n", @@ -1941,7 +1896,7 @@ int main(int argc, char *argv[]) { /* Memory, top-level pagetable, code startpoint and size of the * (optional) initrd. */ - unsigned long mem = 0, pgdir, start, initrd_size = 0; + unsigned long mem = 0, start, initrd_size = 0; /* Two temporaries and the /dev/lguest file descriptor. */ int i, c, lguest_fd; /* The boot information for the Guest. */ @@ -2040,9 +1995,6 @@ int main(int argc, char *argv[]) boot->hdr.type_of_loader = 0xFF; } - /* Set up the initial linear pagetables, starting below the initrd. 
*/ - pgdir = setup_pagetables(mem, initrd_size); - /* The Linux boot header contains an "E820" memory map: ours is a * simple, single region. */ boot->e820_entries = 1; @@ -2064,7 +2016,7 @@ int main(int argc, char *argv[]) /* We tell the kernel to initialize the Guest: this returns the open * /dev/lguest file descriptor. */ - lguest_fd = tell_kernel(pgdir, start); + lguest_fd = tell_kernel(start); /* We clone off a thread, which wakes the Launcher whenever one of the * input file descriptors needs attention. We call this the Waker, and diff --git a/arch/x86/lguest/i386_head.S b/arch/x86/lguest/i386_head.S index 5c7cef34c9e7..10b9bd35a8ff 100644 --- a/arch/x86/lguest/i386_head.S +++ b/arch/x86/lguest/i386_head.S @@ -30,21 +30,6 @@ ENTRY(lguest_entry) movl $lguest_data - __PAGE_OFFSET, %edx int $LGUEST_TRAP_ENTRY - /* The Host put the toplevel pagetable in lguest_data.pgdir. The movsl - * instruction uses %esi implicitly as the source for the copy we're - * about to do. */ - movl lguest_data - __PAGE_OFFSET + LGUEST_DATA_pgdir, %esi - - /* Copy first 32 entries of page directory to __PAGE_OFFSET entries. - * This means the first 128M of kernel memory will be mapped at - * PAGE_OFFSET where the kernel expects to run. This will get it far - * enough through boot to switch to its own pagetables. */ - movl $32, %ecx - movl %esi, %edi - addl $((__PAGE_OFFSET >> 22) * 4), %edi - rep - movsl - /* Set up the initial stack so we can run C code. 
*/ movl $(init_thread_union+THREAD_SIZE),%esp diff --git a/drivers/lguest/lg.h b/drivers/lguest/lg.h index 5faefeaf6790..f2c641e0bdde 100644 --- a/drivers/lguest/lg.h +++ b/drivers/lguest/lg.h @@ -164,7 +164,7 @@ void copy_gdt(const struct lg_cpu *cpu, struct desc_struct *gdt); void copy_gdt_tls(const struct lg_cpu *cpu, struct desc_struct *gdt); /* page_tables.c: */ -int init_guest_pagetable(struct lguest *lg, unsigned long pgtable); +int init_guest_pagetable(struct lguest *lg); void free_guest_pagetable(struct lguest *lg); void guest_new_pagetable(struct lg_cpu *cpu, unsigned long pgtable); void guest_set_pmd(struct lguest *lg, unsigned long gpgdir, u32 i); diff --git a/drivers/lguest/lguest_user.c b/drivers/lguest/lguest_user.c index e73a000473cc..34bc017b8b3c 100644 --- a/drivers/lguest/lguest_user.c +++ b/drivers/lguest/lguest_user.c @@ -146,7 +146,7 @@ static int lg_cpu_start(struct lg_cpu *cpu, unsigned id, unsigned long start_ip) return 0; } -/*L:020 The initialization write supplies 4 pointer sized (32 or 64 bit) +/*L:020 The initialization write supplies 3 pointer sized (32 or 64 bit) * values (in addition to the LHREQ_INITIALIZE value). These are: * * base: The start of the Guest-physical memory inside the Launcher memory. @@ -155,9 +155,6 @@ static int lg_cpu_start(struct lg_cpu *cpu, unsigned id, unsigned long start_ip) * allowed to access. The Guest memory lives inside the Launcher, so it sets * this to ensure the Guest can only reach its own memory. * - * pgdir: The (Guest-physical) address of the top of the initial Guest - * pagetables (which are set up by the Launcher). - * * start: The first instruction to execute ("eip" in x86-speak). */ static int initialize(struct file *file, const unsigned long __user *input) @@ -166,7 +163,7 @@ static int initialize(struct file *file, const unsigned long __user *input) * Guest. 
*/ struct lguest *lg; int err; - unsigned long args[4]; + unsigned long args[3]; /* We grab the Big Lguest lock, which protects against multiple * simultaneous initializations. */ @@ -192,14 +189,14 @@ static int initialize(struct file *file, const unsigned long __user *input) lg->mem_base = (void __user *)args[0]; lg->pfn_limit = args[1]; - /* This is the first cpu (cpu 0) and it will start booting at args[3] */ - err = lg_cpu_start(&lg->cpus[0], 0, args[3]); + /* This is the first cpu (cpu 0) and it will start booting at args[2] */ + err = lg_cpu_start(&lg->cpus[0], 0, args[2]); if (err) goto release_guest; /* Initialize the Guest's shadow page tables, using the toplevel * address the Launcher gave us. This allocates memory, so can fail. */ - err = init_guest_pagetable(lg, args[2]); + err = init_guest_pagetable(lg); if (err) goto free_regs; diff --git a/drivers/lguest/page_tables.c b/drivers/lguest/page_tables.c index 81d0c6053447..576a8318221c 100644 --- a/drivers/lguest/page_tables.c +++ b/drivers/lguest/page_tables.c @@ -14,6 +14,7 @@ #include #include #include +#include #include "lg.h" /*M:008 We hold reference to pages, which prevents them from being swapped. @@ -581,15 +582,82 @@ void guest_set_pmd(struct lguest *lg, unsigned long gpgdir, u32 idx) release_pgd(lg, lg->pgdirs[pgdir].pgdir + idx); } +/* Once we know how much memory we have we can construct simple identity + * (which set virtual == physical) and linear mappings + * which will get the Guest far enough into the boot to create its own. + * + * We lay them out of the way, just below the initrd (which is why we need to + * know its size here). 
*/ +static unsigned long setup_pagetables(struct lguest *lg, + unsigned long mem, + unsigned long initrd_size) +{ + pgd_t __user *pgdir; + pte_t __user *linear; + unsigned int mapped_pages, i, linear_pages, phys_linear; + unsigned long mem_base = (unsigned long)lg->mem_base; + + /* We have mapped_pages frames to map, so we need + * linear_pages page tables to map them. */ + mapped_pages = mem / PAGE_SIZE; + linear_pages = (mapped_pages + PTRS_PER_PTE - 1) / PTRS_PER_PTE; + + /* We put the toplevel page directory page at the top of memory. */ + pgdir = (pgd_t *)(mem + mem_base - initrd_size - PAGE_SIZE); + + /* Now we use the next linear_pages pages as pte pages */ + linear = (void *)pgdir - linear_pages * PAGE_SIZE; + + /* Linear mapping is easy: put every page's address into the + * mapping in order. */ + for (i = 0; i < mapped_pages; i++) { + pte_t pte; + pte = pfn_pte(i, __pgprot(_PAGE_PRESENT|_PAGE_RW|_PAGE_USER)); + if (copy_to_user(&linear[i], &pte, sizeof(pte)) != 0) + return -EFAULT; + } + + /* The top level points to the linear page table pages above. + * We setup the identity and linear mappings here. */ + phys_linear = (unsigned long)linear - mem_base; + for (i = 0; i < mapped_pages; i += PTRS_PER_PTE) { + pgd_t pgd; + pgd = __pgd((phys_linear + i * sizeof(pte_t)) | + (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER)); + + if (copy_to_user(&pgdir[i / PTRS_PER_PTE], &pgd, sizeof(pgd)) + || copy_to_user(&pgdir[pgd_index(PAGE_OFFSET) + + i / PTRS_PER_PTE], + &pgd, sizeof(pgd))) + return -EFAULT; + } + + /* We return the top level (guest-physical) address: remember where + * this is. */ + return (unsigned long)pgdir - mem_base; +} + /*H:500 (vii) Setting up the page tables initially. * * When a Guest is first created, the Launcher tells us where the toplevel of * its first page table is. 
We set some things up here: */ -int init_guest_pagetable(struct lguest *lg, unsigned long pgtable) +int init_guest_pagetable(struct lguest *lg) { + u64 mem; + u32 initrd_size; + struct boot_params __user *boot = (struct boot_params *)lg->mem_base; + + /* Get the Guest memory size and the ramdisk size from the boot header + * located at lg->mem_base (Guest address 0). */ + if (copy_from_user(&mem, &boot->e820_map[0].size, sizeof(mem)) + || get_user(initrd_size, &boot->hdr.ramdisk_size)) + return -EFAULT; + /* We start on the first shadow page table, and give it a blank PGD * page. */ - lg->pgdirs[0].gpgdir = pgtable; + lg->pgdirs[0].gpgdir = setup_pagetables(lg, mem, initrd_size); + if (IS_ERR_VALUE(lg->pgdirs[0].gpgdir)) + return lg->pgdirs[0].gpgdir; lg->pgdirs[0].pgdir = (pgd_t *)get_zeroed_page(GFP_KERNEL); if (!lg->pgdirs[0].pgdir) return -ENOMEM; diff --git a/include/linux/lguest_launcher.h b/include/linux/lguest_launcher.h index bd0eba760522..a53407a4165c 100644 --- a/include/linux/lguest_launcher.h +++ b/include/linux/lguest_launcher.h @@ -54,7 +54,7 @@ struct lguest_vqconfig { /* Write command first word is a request. */ enum lguest_req { - LHREQ_INITIALIZE, /* + base, pfnlimit, pgdir, start */ + LHREQ_INITIALIZE, /* + base, pfnlimit, start */ LHREQ_GETDMA, /* No longer used */ LHREQ_IRQ, /* + irq */ LHREQ_BREAK, /* + on/off flag (on blocks until someone does off) */ -- cgit v1.2.3