From 1d3381ebf42de1b6f8c118732893cb5bdc37edcd Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Thu, 13 Mar 2008 16:59:12 -0700 Subject: x86: convert mtrr/generic.c to kernel-doc Convert function comment blocks to kernel-doc notation. Signed-off-by: Randy Dunlap Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/kernel/cpu/mtrr/generic.c | 42 +++++++++++++++++++++----------------- 1 file changed, 23 insertions(+), 19 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c index 103d61a59b19..3e18db4cefee 100644 --- a/arch/x86/kernel/cpu/mtrr/generic.c +++ b/arch/x86/kernel/cpu/mtrr/generic.c @@ -176,12 +176,13 @@ static inline void k8_enable_fixed_iorrs(void) } /** - * Checks and updates an fixed-range MTRR if it differs from the value it - * should have. If K8 extentions are wanted, update the K8 SYSCFG MSR also. - * see AMD publication no. 24593, chapter 7.8.1, page 233 for more information - * \param msr MSR address of the MTTR which should be checked and updated - * \param changed pointer which indicates whether the MTRR needed to be changed - * \param msrwords pointer to the MSR values which the MSR should have + * set_fixed_range - checks & updates a fixed-range MTRR if it differs from the value it should have + * @msr: MSR address of the MTTR which should be checked and updated + * @changed: pointer which indicates whether the MTRR needed to be changed + * @msrwords: pointer to the MSR values which the MSR should have + * + * If K8 extentions are wanted, update the K8 SYSCFG MSR also. + * See AMD publication no. 24593, chapter 7.8.1, page 233 for more information. */ static void set_fixed_range(int msr, bool *changed, unsigned int *msrwords) { @@ -199,12 +200,15 @@ static void set_fixed_range(int msr, bool *changed, unsigned int *msrwords) } } +/** + * generic_get_free_region - Get a free MTRR. + * @base: The starting (base) address of the region. + * @size: The size (in bytes) of the region. + * @replace_reg: mtrr index to be replaced; set to invalid value if none. + * + * Returns: The index of the region on success, else negative on error. + */ int generic_get_free_region(unsigned long base, unsigned long size, int replace_reg) -/* [SUMMARY] Get a free MTRR. - The starting (base) address of the region. - The size (in bytes) of the region. - [RETURNS] The index of the region on success, else -1 on error. -*/ { int i, max; mtrr_type ltype; @@ -249,8 +253,8 @@ static void generic_get_mtrr(unsigned int reg, unsigned long *base, } /** - * Checks and updates the fixed-range MTRRs if they differ from the saved set - * \param frs pointer to fixed-range MTRR values, saved by get_fixed_ranges() + * set_fixed_ranges - checks & updates the fixed-range MTRRs if they differ from the saved set + * @frs: pointer to fixed-range MTRR values, saved by get_fixed_ranges() */ static int set_fixed_ranges(mtrr_type * frs) { @@ -294,13 +298,13 @@ static bool set_mtrr_var_ranges(unsigned int index, struct mtrr_var_range *vr) static u32 deftype_lo, deftype_hi; +/** + * set_mtrr_state - Set the MTRR state for this CPU. + * + * NOTE: The CPU must already be in a safe state for MTRR changes. + * RETURNS: 0 if no changes made, else a mask indicating what was changed. + */ static unsigned long set_mtrr_state(void) -/* [SUMMARY] Set the MTRR state for this CPU. - The MTRR state information to read. - Some relevant CPU context. - [NOTE] The CPU must already be in a safe state for MTRR changes. - [RETURNS] 0 if no changes made, else a mask indication what was changed. -*/ { unsigned int i; unsigned long change_mask = 0; -- cgit v1.2.3 From 3c274c2909e17aa0afeded4cd4520b7357357ca0 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 21 Mar 2008 10:06:32 +0100 Subject: x86: add dmi quirk for io_delay reported by mereandor@gmail.com, in: http://bugzilla.kernel.org/show_bug.cgi?id=6307 Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/kernel/io_delay.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/io_delay.c b/arch/x86/kernel/io_delay.c index c706a3061553..5921e5f0a640 100644 --- a/arch/x86/kernel/io_delay.c +++ b/arch/x86/kernel/io_delay.c @@ -76,6 +76,14 @@ static struct dmi_system_id __initdata io_delay_0xed_port_dmi_table[] = { DMI_MATCH(DMI_BOARD_NAME, "30B9") } }, + { + .callback = dmi_io_delay_0xed_port, + .ident = "HP Pavilion dv6000", + .matches = { + DMI_MATCH(DMI_BOARD_VENDOR, "Quanta"), + DMI_MATCH(DMI_BOARD_NAME, "30B8") + } + }, { .callback = dmi_io_delay_0xed_port, .ident = "HP Pavilion tx1000", -- cgit v1.2.3 From 475613b9e374bf0c15340eb166a962da04aa02e8 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Sun, 24 Feb 2008 23:23:09 -0800 Subject: x86: fix memoryless node oops during boot fix oops during boot reported in this thread: http://lkml.org/lkml/2008/2/6/65 enable booting on memoryless nodes. Reported-by: Kamalesh Babulal Signed-off-by: Ingo Molnar --- arch/x86/kernel/setup_64.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/setup_64.c b/arch/x86/kernel/setup_64.c index 7637dc91c79b..f4f7ecfb898c 100644 --- a/arch/x86/kernel/setup_64.c +++ b/arch/x86/kernel/setup_64.c @@ -801,7 +801,7 @@ static void __cpuinit srat_detect_node(void) /* Don't do the funky fallback heuristics the AMD version employs for now. */ node = apicid_to_node[apicid]; - if (node == NUMA_NO_NODE) + if (node == NUMA_NO_NODE || !node_online(node)) node = first_node(node_online_map); numa_set_node(cpu, node); -- cgit v1.2.3 From c6e8256a7b15033bc5d7797e25c7e053040c4c7c Mon Sep 17 00:00:00 2001 From: Stephan Diestelhorst Date: Mon, 10 Mar 2008 16:05:41 +0100 Subject: x86, cpufreq: fix Speedfreq-SMI call that clobbers ECX I have found that using SMI to change the cpu's frequency on my DELL Latitude L400 clobbers the ECX register in speedstep_set_state, causing unneccessary retries because the "state" variable has changed silently (GCC assumes it is still present in ECX). play safe and avoid gcc caching any register across IO port accesses that trigger SMIs. Signed-off by: Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/cpufreq/speedstep-smi.c | 39 ++++++++++++++++++----------- 1 file changed, 24 insertions(+), 15 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-smi.c b/arch/x86/kernel/cpu/cpufreq/speedstep-smi.c index f2b5a621d27b..8a85c93bd62a 100644 --- a/arch/x86/kernel/cpu/cpufreq/speedstep-smi.c +++ b/arch/x86/kernel/cpu/cpufreq/speedstep-smi.c @@ -63,7 +63,7 @@ static struct cpufreq_frequency_table speedstep_freqs[] = { */ static int speedstep_smi_ownership (void) { - u32 command, result, magic; + u32 command, result, magic, dummy; u32 function = GET_SPEEDSTEP_OWNER; unsigned char magic_data[] = "Copyright (c) 1999 Intel Corporation"; @@ -73,8 +73,11 @@ static int speedstep_smi_ownership (void) dprintk("trying to obtain ownership with command %x at port %x\n", command, smi_port); __asm__ __volatile__( + "push %%ebp\n" "out %%al, (%%dx)\n" - : "=D" (result) + "pop %%ebp\n" + : "=D" (result), "=a" (dummy), "=b" (dummy), "=c" (dummy), "=d" (dummy), + "=S" (dummy) : "a" (command), "b" (function), "c" (0), "d" (smi_port), "D" (0), "S" (magic) : "memory" @@ -96,7 +99,7 @@ static int speedstep_smi_ownership (void) */ static int speedstep_smi_get_freqs (unsigned int *low, unsigned int *high) { - u32 command, result = 0, edi, high_mhz, low_mhz; + u32 command, result = 0, edi, high_mhz, low_mhz, dummy; u32 state=0; u32 function = GET_SPEEDSTEP_FREQS; @@ -109,10 +112,12 @@ static int speedstep_smi_get_freqs (unsigned int *low, unsigned int *high) dprintk("trying to determine frequencies with command %x at port %x\n", command, smi_port); - __asm__ __volatile__("movl $0, %%edi\n" + __asm__ __volatile__( + "push %%ebp\n" "out %%al, (%%dx)\n" - : "=a" (result), "=b" (high_mhz), "=c" (low_mhz), "=d" (state), "=D" (edi) - : "a" (command), "b" (function), "c" (state), "d" (smi_port), "S" (0) + "pop %%ebp" + : "=a" (result), "=b" (high_mhz), "=c" (low_mhz), "=d" (state), "=D" (edi), "=S" (dummy) + : "a" (command), "b" (function), "c" (state), "d" (smi_port), "S" (0), "D" (0) ); dprintk("result %x, low_freq %u, high_freq %u\n", result, low_mhz, high_mhz); @@ -135,16 +140,18 @@ static int speedstep_smi_get_freqs (unsigned int *low, unsigned int *high) static int speedstep_get_state (void) { u32 function=GET_SPEEDSTEP_STATE; - u32 result, state, edi, command; + u32 result, state, edi, command, dummy; command = (smi_sig & 0xffffff00) | (smi_cmd & 0xff); dprintk("trying to determine current setting with command %x at port %x\n", command, smi_port); - __asm__ __volatile__("movl $0, %%edi\n" + __asm__ __volatile__( + "push %%ebp\n" "out %%al, (%%dx)\n" - : "=a" (result), "=b" (state), "=D" (edi) - : "a" (command), "b" (function), "c" (0), "d" (smi_port), "S" (0) + "pop %%ebp\n" + : "=a" (result), "=b" (state), "=D" (edi), "=c" (dummy), "=d" (dummy), "=S" (dummy) + : "a" (command), "b" (function), "c" (0), "d" (smi_port), "S" (0), "D" (0) ); dprintk("state is %x, result is %x\n", state, result); @@ -160,7 +167,7 @@ static int speedstep_get_state (void) */ static void speedstep_set_state (unsigned int state) { - unsigned int result = 0, command, new_state; + unsigned int result = 0, command, new_state, dummy; unsigned long flags; unsigned int function=SET_SPEEDSTEP_STATE; unsigned int retry = 0; @@ -182,10 +189,12 @@ static void speedstep_set_state (unsigned int state) } retry++; __asm__ __volatile__( - "movl $0, %%edi\n" + "push %%ebp\n" "out %%al, (%%dx)\n" - : "=b" (new_state), "=D" (result) - : "a" (command), "b" (function), "c" (state), "d" (smi_port), "S" (0) + "pop %%ebp" + : "=b" (new_state), "=D" (result), "=c" (dummy), "=a" (dummy), + "=d" (dummy), "=S" (dummy) + : "a" (command), "b" (function), "c" (state), "d" (smi_port), "S" (0), "D" (0) ); } while ((new_state != state) && (retry <= SMI_TRIES)); @@ -195,7 +204,7 @@ static void speedstep_set_state (unsigned int state) if (new_state == state) { dprintk("change to %u MHz succeeded after %u tries with result %u\n", (speedstep_freqs[new_state].frequency / 1000), retry, result); } else { - printk(KERN_ERR "cpufreq: change failed with new_state %u and result %u\n", new_state, result); + printk(KERN_ERR "cpufreq: change to state %u failed with new_state %u and result %u\n", state, new_state, result); } return; -- cgit v1.2.3 From 923a0cf82f2b504e316642e2d152d38b6c0be4ba Mon Sep 17 00:00:00 2001 From: Andres Salomon Date: Wed, 26 Mar 2008 14:13:01 -0400 Subject: x86: GEODE: add missing module.h include On Wed, 26 Mar 2008 11:56:22 -0600 Jordan Crouse wrote: > On 26/03/08 14:31 +0100, Stefan Pfetzing wrote: > > Hello Jordan, > > > > I just tried to build your geodwdt driver for the geode watchdog. Therefore > > I pulled your repository from http://git.infradead.org/geode.git (or more, > > the git url). > > > > I tried to build the geodewdt driver as a module - which didn't work, and > > it failed with the same problem as earlier mentioned on lkmk [1]. I also > > checked the fix [2], but that seems to be already in your (or linus) tree - > > and so I'm unsure what the problem is. > > > > [1] http://kerneltrap.org/mailarchive/linux-kernel/2008/2/17/884074 > > [2] http://kerneltrap.org/mailarchive/linux-kernel/2008/2/17/884174 > > > > Building directly into the kernel seems to work. > > > > Maybe you have some idea? > > Hmm - that is strange. Exporting the symbols should work. I recommend > starting over with a clean tree. > > CCing Andres - any thoughts? > > Jordan > Er, yeah. The patch below should fix it. This should probably go into 2.6.25. Oops, EXPORT_SYMBOL_GPL wasn't being declared due to this header being missing. Signed-off-by: Andres Salomon Signed-off-by: Ingo Molnar --- arch/x86/kernel/mfgpt_32.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/mfgpt_32.c b/arch/x86/kernel/mfgpt_32.c index 027fc067b399..b402c0f3f192 100644 --- a/arch/x86/kernel/mfgpt_32.c +++ b/arch/x86/kernel/mfgpt_32.c @@ -30,6 +30,7 @@ #include #include +#include #include static struct mfgpt_timer_t { -- cgit v1.2.3 From 76c324182bbd29dfe4298ca65efb15be18055df1 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Sun, 23 Mar 2008 00:16:49 -0700 Subject: x86: fix trim mtrr not to setup_memory two times we could call find_max_pfn() directly instead of setup_memory() to get max_pfn needed for mtrr trimming. otherwise setup_memory() is called two times... that is duplicated... [ mingo@elte.hu: both Thomas and me simulated a double call to setup_bootmem_allocator() and can confirm that it is a real bug which can hang in certain configs. It's not been reported yet but that is probably due to the relatively scarce nature of MTRR-trimming systems. ] Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/setup_32.c | 9 ++++----- arch/x86/mm/discontig_32.c | 1 - 2 files changed, 4 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/setup_32.c b/arch/x86/kernel/setup_32.c index a1d7071a51c9..2b3e5d45176b 100644 --- a/arch/x86/kernel/setup_32.c +++ b/arch/x86/kernel/setup_32.c @@ -406,8 +406,6 @@ static unsigned long __init setup_memory(void) */ min_low_pfn = PFN_UP(init_pg_tables_end); - find_max_pfn(); - max_low_pfn = find_max_low_pfn(); #ifdef CONFIG_HIGHMEM @@ -764,12 +762,13 @@ void __init setup_arch(char **cmdline_p) if (efi_enabled) efi_init(); - max_low_pfn = setup_memory(); - /* update e820 for memory not covered by WB MTRRs */ + find_max_pfn(); mtrr_bp_init(); if (mtrr_trim_uncached_memory(max_pfn)) - max_low_pfn = setup_memory(); + find_max_pfn(); + + max_low_pfn = setup_memory(); #ifdef CONFIG_VMI /* diff --git a/arch/x86/mm/discontig_32.c b/arch/x86/mm/discontig_32.c index c394ca0720b8..8e25e06ff730 100644 --- a/arch/x86/mm/discontig_32.c +++ b/arch/x86/mm/discontig_32.c @@ -324,7 +324,6 @@ unsigned long __init setup_memory(void) * this space and use it to adjust the boundary between ZONE_NORMAL * and ZONE_HIGHMEM. */ - find_max_pfn(); get_memcfg_numa(); kva_pages = calculate_numa_remap_pages(); -- cgit v1.2.3 From d546b67a940eb42a99f56b86c5cd8d47c8348c2a Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Tue, 25 Mar 2008 17:39:12 -0700 Subject: x86: fix performance drop for glx fix the 3D performance drop reported at: http://bugzilla.kernel.org/show_bug.cgi?id=10328 fb drivers are using ioremap()/ioremap_nocache(), followed by mtrr_add with WC attribute. Recent changes in page attribute code made both ioremap()/ioremap_nocache() mappings as UC (instead of previous UC-). This breaks the graphics performance, as the effective memory type is UC instead of expected WC. The correct way to fix this is to add ioremap_wc() (which uses UC- in the absence of PAT kernel support and WC with PAT) and change all the fb drivers to use this new ioremap_wc() API. We can take this correct and longer route for post 2.6.25. For now, revert back to the UC- behavior for ioremap/ioremap_nocache. Signed-off-by: Suresh Siddha Signed-off-by: Venkatesh Pallipadi Cc: Arjan van de Ven Signed-off-by: Ingo Molnar --- arch/x86/mm/ioremap.c | 6 +++++- arch/x86/mm/pageattr.c | 2 +- include/asm-x86/pgtable.h | 2 ++ 3 files changed, 8 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index 4afaba0ed722..794895c6dcc9 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -137,7 +137,11 @@ static void __iomem *__ioremap(resource_size_t phys_addr, unsigned long size, switch (mode) { case IOR_MODE_UNCACHED: default: - prot = PAGE_KERNEL_NOCACHE; + /* + * FIXME: we will use UC MINUS for now, as video fb drivers + * depend on it. Upcoming ioremap_wc() will fix this behavior. + */ + prot = PAGE_KERNEL_UC_MINUS; break; case IOR_MODE_CACHED: prot = PAGE_KERNEL; diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 14e48b5a94ba..7b79f6be4e7d 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -771,7 +771,7 @@ static inline int change_page_attr_clear(unsigned long addr, int numpages, int set_memory_uc(unsigned long addr, int numpages) { return change_page_attr_set(addr, numpages, - __pgprot(_PAGE_PCD | _PAGE_PWT)); + __pgprot(_PAGE_PCD)); } EXPORT_SYMBOL(set_memory_uc); diff --git a/include/asm-x86/pgtable.h b/include/asm-x86/pgtable.h index 174b87738714..9cf472aeb9ce 100644 --- a/include/asm-x86/pgtable.h +++ b/include/asm-x86/pgtable.h @@ -85,6 +85,7 @@ extern pteval_t __PAGE_KERNEL, __PAGE_KERNEL_EXEC; #define __PAGE_KERNEL_RX (__PAGE_KERNEL_EXEC & ~_PAGE_RW) #define __PAGE_KERNEL_EXEC_NOCACHE (__PAGE_KERNEL_EXEC | _PAGE_PCD | _PAGE_PWT) #define __PAGE_KERNEL_NOCACHE (__PAGE_KERNEL | _PAGE_PCD | _PAGE_PWT) +#define __PAGE_KERNEL_UC_MINUS (__PAGE_KERNEL | _PAGE_PCD) #define __PAGE_KERNEL_VSYSCALL (__PAGE_KERNEL_RX | _PAGE_USER) #define __PAGE_KERNEL_VSYSCALL_NOCACHE (__PAGE_KERNEL_VSYSCALL | _PAGE_PCD | _PAGE_PWT) #define __PAGE_KERNEL_LARGE (__PAGE_KERNEL | _PAGE_PSE) @@ -101,6 +102,7 @@ extern pteval_t __PAGE_KERNEL, __PAGE_KERNEL_EXEC; #define PAGE_KERNEL_EXEC MAKE_GLOBAL(__PAGE_KERNEL_EXEC) #define PAGE_KERNEL_RX MAKE_GLOBAL(__PAGE_KERNEL_RX) #define PAGE_KERNEL_NOCACHE MAKE_GLOBAL(__PAGE_KERNEL_NOCACHE) +#define PAGE_KERNEL_UC_MINUS MAKE_GLOBAL(__PAGE_KERNEL_UC_MINUS) #define PAGE_KERNEL_EXEC_NOCACHE MAKE_GLOBAL(__PAGE_KERNEL_EXEC_NOCACHE) #define PAGE_KERNEL_LARGE MAKE_GLOBAL(__PAGE_KERNEL_LARGE) #define PAGE_KERNEL_LARGE_EXEC MAKE_GLOBAL(__PAGE_KERNEL_LARGE_EXEC) -- cgit v1.2.3 From bc713dcf35c427ae8377fb9a4d1b7f891054ce13 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 27 Mar 2008 15:58:28 +0100 Subject: x86: fix prefetch workaround some early Athlon XP's and Opterons generate bogus faults on prefetch instructions. The workaround for this regressed over .24 - reinstate it. Signed-off-by: Ingo Molnar --- arch/x86/mm/fault.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index fdc667422df9..c0c82bc143c9 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -92,7 +92,8 @@ static int is_prefetch(struct pt_regs *regs, unsigned long addr, unsigned char *max_instr; #ifdef CONFIG_X86_32 - if (!(__supported_pte_mask & _PAGE_NX)) + /* Catch an obscure case of prefetch inside an NX page: */ + if ((__supported_pte_mask & _PAGE_NX) && (error_code & 16)) return 0; #endif -- cgit v1.2.3 From d8d4f157b8d828bc837f0eb2ee4a2dd40dbdd572 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Tue, 4 Mar 2008 15:05:39 -0800 Subject: x86: ptrace.c: fix defined-but-unused warnings arch/x86/kernel/ptrace.c:548: warning: 'ptrace_bts_get_size' defined but not used arch/x86/kernel/ptrace.c:558: warning: 'ptrace_bts_read_record' defined but not used arch/x86/kernel/ptrace.c:607: warning: 'ptrace_bts_clear' defined but not used arch/x86/kernel/ptrace.c:617: warning: 'ptrace_bts_drain' defined but not used arch/x86/kernel/ptrace.c:720: warning: 'ptrace_bts_config' defined but not used arch/x86/kernel/ptrace.c:788: warning: 'ptrace_bts_status' defined but not used Cc: Roland McGrath Signed-off-by: Andrew Morton Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/kernel/ptrace.c | 169 ++++++++++++++++++++++++----------------------- 1 file changed, 85 insertions(+), 84 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index d5904eef1d31..eb92ccbb3502 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -600,21 +600,6 @@ static int ptrace_bts_read_record(struct task_struct *child, return sizeof(ret); } -static int ptrace_bts_write_record(struct task_struct *child, - const struct bts_struct *in) -{ - int retval; - - if (!child->thread.ds_area_msr) - return -ENXIO; - - retval = ds_write_bts((void *)child->thread.ds_area_msr, in); - if (retval) - return retval; - - return sizeof(*in); -} - static int ptrace_bts_clear(struct task_struct *child) { if (!child->thread.ds_area_msr) @@ -657,75 +642,6 @@ static int ptrace_bts_drain(struct task_struct *child, return end; } -static int ptrace_bts_realloc(struct task_struct *child, - int size, int reduce_size) -{ - unsigned long rlim, vm; - int ret, old_size; - - if (size < 0) - return -EINVAL; - - old_size = ds_get_bts_size((void *)child->thread.ds_area_msr); - if (old_size < 0) - return old_size; - - ret = ds_free((void **)&child->thread.ds_area_msr); - if (ret < 0) - goto out; - - size >>= PAGE_SHIFT; - old_size >>= PAGE_SHIFT; - - current->mm->total_vm -= old_size; - current->mm->locked_vm -= old_size; - - if (size == 0) - goto out; - - rlim = current->signal->rlim[RLIMIT_AS].rlim_cur >> PAGE_SHIFT; - vm = current->mm->total_vm + size; - if (rlim < vm) { - ret = -ENOMEM; - - if (!reduce_size) - goto out; - - size = rlim - current->mm->total_vm; - if (size <= 0) - goto out; - } - - rlim = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur >> PAGE_SHIFT; - vm = current->mm->locked_vm + size; - if (rlim < vm) { - ret = -ENOMEM; - - if (!reduce_size) - goto out; - - size = rlim - current->mm->locked_vm; - if (size <= 0) - goto out; - } - - ret = ds_allocate((void **)&child->thread.ds_area_msr, - size << PAGE_SHIFT); - if (ret < 0) - goto out; - - current->mm->total_vm += size; - current->mm->locked_vm += size; - -out: - if (child->thread.ds_area_msr) - set_tsk_thread_flag(child, TIF_DS_AREA_MSR); - else - clear_tsk_thread_flag(child, TIF_DS_AREA_MSR); - - return ret; -} - static int ptrace_bts_config(struct task_struct *child, long cfg_size, const struct ptrace_bts_config __user *ucfg) @@ -828,6 +744,91 @@ static int ptrace_bts_status(struct task_struct *child, return sizeof(cfg); } + +static int ptrace_bts_write_record(struct task_struct *child, + const struct bts_struct *in) +{ + int retval; + + if (!child->thread.ds_area_msr) + return -ENXIO; + + retval = ds_write_bts((void *)child->thread.ds_area_msr, in); + if (retval) + return retval; + + return sizeof(*in); +} + +static int ptrace_bts_realloc(struct task_struct *child, + int size, int reduce_size) +{ + unsigned long rlim, vm; + int ret, old_size; + + if (size < 0) + return -EINVAL; + + old_size = ds_get_bts_size((void *)child->thread.ds_area_msr); + if (old_size < 0) + return old_size; + + ret = ds_free((void **)&child->thread.ds_area_msr); + if (ret < 0) + goto out; + + size >>= PAGE_SHIFT; + old_size >>= PAGE_SHIFT; + + current->mm->total_vm -= old_size; + current->mm->locked_vm -= old_size; + + if (size == 0) + goto out; + + rlim = current->signal->rlim[RLIMIT_AS].rlim_cur >> PAGE_SHIFT; + vm = current->mm->total_vm + size; + if (rlim < vm) { + ret = -ENOMEM; + + if (!reduce_size) + goto out; + + size = rlim - current->mm->total_vm; + if (size <= 0) + goto out; + } + + rlim = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur >> PAGE_SHIFT; + vm = current->mm->locked_vm + size; + if (rlim < vm) { + ret = -ENOMEM; + + if (!reduce_size) + goto out; + + size = rlim - current->mm->locked_vm; + if (size <= 0) + goto out; + } + + ret = ds_allocate((void **)&child->thread.ds_area_msr, + size << PAGE_SHIFT); + if (ret < 0) + goto out; + + current->mm->total_vm += size; + current->mm->locked_vm += size; + +out: + if (child->thread.ds_area_msr) + set_tsk_thread_flag(child, TIF_DS_AREA_MSR); + else + clear_tsk_thread_flag(child, TIF_DS_AREA_MSR); + + return ret; +} + void ptrace_bts_take_timestamp(struct task_struct *tsk, enum bts_qualifier qualifier) { -- cgit v1.2.3 From b2ef749720a97053d60605a7456772a1752164cc Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Wed, 26 Mar 2008 22:39:15 +0100 Subject: rdc321x: GPIO routines bugfixes This patch fixes the use of GPIO routines which are in the PCI configuration space of the RDC321x, therefore reading/writing to this space without spinlock protection can be problematic. We also now request and free GPIOs and support the MGB100 board, previous code was very AR525W-centric. Signed-off-by: Volker Weiss Signed-off-by: Florian Fainelli Signed-off-by: Ingo Molnar --- arch/x86/mach-rdc321x/gpio.c | 199 +++++++++++++++++++++------- arch/x86/mach-rdc321x/platform.c | 2 + include/asm-x86/mach-rdc321x/gpio.h | 9 +- include/asm-x86/mach-rdc321x/rdc321x_defs.h | 8 +- 4 files changed, 165 insertions(+), 53 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mach-rdc321x/gpio.c b/arch/x86/mach-rdc321x/gpio.c index 031269163bd6..247f33d3a407 100644 --- a/arch/x86/mach-rdc321x/gpio.c +++ b/arch/x86/mach-rdc321x/gpio.c @@ -1,91 +1,194 @@ /* - * Copyright (C) 2007, OpenWrt.org, Florian Fainelli - * RDC321x architecture specific GPIO support + * GPIO support for RDC SoC R3210/R8610 + * + * Copyright (C) 2007, Florian Fainelli + * Copyright (C) 2008, Volker Weiss + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. * - * This program is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License as published by the - * Free Software Foundation; either version 2 of the License, or (at your - * option) any later version. */ -#include -#include + +#include #include #include #include -#include +#include #include -static inline int rdc_gpio_is_valid(unsigned gpio) + +/* spin lock to protect our private copy of GPIO data register plus + the access to PCI conf registers. */ +static DEFINE_SPINLOCK(gpio_lock); + +/* copy of GPIO data registers */ +static u32 gpio_data_reg1; +static u32 gpio_data_reg2; + +static u32 gpio_request_data[2]; + + +static inline void rdc321x_conf_write(unsigned addr, u32 value) { - return (gpio <= RDC_MAX_GPIO); + outl((1 << 31) | (7 << 11) | addr, RDC3210_CFGREG_ADDR); + outl(value, RDC3210_CFGREG_DATA); } -static unsigned int rdc_gpio_read(unsigned gpio) +static inline void rdc321x_conf_or(unsigned addr, u32 value) { - unsigned int val; - - val = 0x80000000 | (7 << 11) | ((gpio&0x20?0x84:0x48)); - outl(val, RDC3210_CFGREG_ADDR); - udelay(10); - val = inl(RDC3210_CFGREG_DATA); - val |= (0x1 << (gpio & 0x1F)); - outl(val, RDC3210_CFGREG_DATA); - udelay(10); - val = 0x80000000 | (7 << 11) | ((gpio&0x20?0x88:0x4C)); - outl(val, RDC3210_CFGREG_ADDR); - udelay(10); - val = inl(RDC3210_CFGREG_DATA); - - return val; + outl((1 << 31) | (7 << 11) | addr, RDC3210_CFGREG_ADDR); + value |= inl(RDC3210_CFGREG_DATA); + outl(value, RDC3210_CFGREG_DATA); } -static void rdc_gpio_write(unsigned int val) +static inline u32 rdc321x_conf_read(unsigned addr) { - if (val) { - outl(val, RDC3210_CFGREG_DATA); - udelay(10); - } + outl((1 << 31) | (7 << 11) | addr, RDC3210_CFGREG_ADDR); + + return inl(RDC3210_CFGREG_DATA); } -int rdc_gpio_get_value(unsigned gpio) +/* configure pin as GPIO */ +static void rdc321x_configure_gpio(unsigned gpio) +{ + unsigned long flags; + + spin_lock_irqsave(&gpio_lock, flags); + rdc321x_conf_or(gpio < 32 + ? RDC321X_GPIO_CTRL_REG1 : RDC321X_GPIO_CTRL_REG2, + 1 << (gpio & 0x1f)); + spin_unlock_irqrestore(&gpio_lock, flags); +} + +/* initially setup the 2 copies of the gpio data registers. + This function must be called by the platform setup code. */ +void __init rdc321x_gpio_setup() +{ + /* this might not be, what others (BIOS, bootloader, etc.) + wrote to these registers before, but it's a good guess. Still + better than just using 0xffffffff. */ + + gpio_data_reg1 = rdc321x_conf_read(RDC321X_GPIO_DATA_REG1); + gpio_data_reg2 = rdc321x_conf_read(RDC321X_GPIO_DATA_REG2); +} + +/* determine, if gpio number is valid */ +static inline int rdc321x_is_gpio(unsigned gpio) +{ + return gpio <= RDC321X_MAX_GPIO; +} + +/* request GPIO */ +int rdc_gpio_request(unsigned gpio, const char *label) { - if (rdc_gpio_is_valid(gpio)) - return (int)rdc_gpio_read(gpio); - else + unsigned long flags; + + if (!rdc321x_is_gpio(gpio)) return -EINVAL; + + spin_lock_irqsave(&gpio_lock, flags); + if (gpio_request_data[(gpio & 0x20) ? 1 : 0] & (1 << (gpio & 0x1f))) + goto inuse; + gpio_request_data[(gpio & 0x20) ? 1 : 0] |= (1 << (gpio & 0x1f)); + spin_unlock_irqrestore(&gpio_lock, flags); + + return 0; +inuse: + spin_unlock_irqrestore(&gpio_lock, flags); + return -EINVAL; } -EXPORT_SYMBOL(rdc_gpio_get_value); +EXPORT_SYMBOL(rdc_gpio_request); -void rdc_gpio_set_value(unsigned gpio, int value) +/* release previously-claimed GPIO */ +void rdc_gpio_free(unsigned gpio) { - unsigned int val; + unsigned long flags; - if (!rdc_gpio_is_valid(gpio)) + if (!rdc321x_is_gpio(gpio)) return; - val = rdc_gpio_read(gpio); + spin_lock_irqsave(&gpio_lock, flags); + gpio_request_data[(gpio & 0x20) ? 1 : 0] &= ~(1 << (gpio & 0x1f)); + spin_unlock_irqrestore(&gpio_lock, flags); +} +EXPORT_SYMBOL(rdc_gpio_free); + +/* read GPIO pin */ +int rdc_gpio_get_value(unsigned gpio) +{ + u32 reg; + unsigned long flags; + + spin_lock_irqsave(&gpio_lock, flags); + reg = rdc321x_conf_read(gpio < 32 + ? RDC321X_GPIO_DATA_REG1 : RDC321X_GPIO_DATA_REG2); + spin_unlock_irqrestore(&gpio_lock, flags); - if (value) - val &= ~(0x1 << (gpio & 0x1F)); - else - val |= (0x1 << (gpio & 0x1F)); + return (1 << (gpio & 0x1f)) & reg ? 1 : 0; +} +EXPORT_SYMBOL(rdc_gpio_get_value); - rdc_gpio_write(val); +/* set GPIO pin to value */ +void rdc_gpio_set_value(unsigned gpio, int value) +{ + unsigned long flags; + u32 reg; + + reg = 1 << (gpio & 0x1f); + if (gpio < 32) { + spin_lock_irqsave(&gpio_lock, flags); + if (value) + gpio_data_reg1 |= reg; + else + gpio_data_reg1 &= ~reg; + rdc321x_conf_write(RDC321X_GPIO_DATA_REG1, gpio_data_reg1); + spin_unlock_irqrestore(&gpio_lock, flags); + } else { + spin_lock_irqsave(&gpio_lock, flags); + if (value) + gpio_data_reg2 |= reg; + else + gpio_data_reg2 &= ~reg; + rdc321x_conf_write(RDC321X_GPIO_DATA_REG2, gpio_data_reg2); + spin_unlock_irqrestore(&gpio_lock, flags); + } } EXPORT_SYMBOL(rdc_gpio_set_value); +/* configure GPIO pin as input */ int rdc_gpio_direction_input(unsigned gpio) { + if (!rdc321x_is_gpio(gpio)) + return -EINVAL; + + rdc321x_configure_gpio(gpio); + return 0; } EXPORT_SYMBOL(rdc_gpio_direction_input); +/* configure GPIO pin as output and set value */ int rdc_gpio_direction_output(unsigned gpio, int value) { + if (!rdc321x_is_gpio(gpio)) + return -EINVAL; + + gpio_set_value(gpio, value); + rdc321x_configure_gpio(gpio); + return 0; } EXPORT_SYMBOL(rdc_gpio_direction_output); - - diff --git a/arch/x86/mach-rdc321x/platform.c b/arch/x86/mach-rdc321x/platform.c index dda6024a5862..a037041817c7 100644 --- a/arch/x86/mach-rdc321x/platform.c +++ b/arch/x86/mach-rdc321x/platform.c @@ -62,6 +62,8 @@ static struct platform_device *rdc321x_devs[] = { static int __init rdc_board_setup(void) { + rdc321x_gpio_setup(); + return platform_add_devices(rdc321x_devs, ARRAY_SIZE(rdc321x_devs)); } diff --git a/include/asm-x86/mach-rdc321x/gpio.h b/include/asm-x86/mach-rdc321x/gpio.h index db31b929b990..acce0b7d397b 100644 --- a/include/asm-x86/mach-rdc321x/gpio.h +++ b/include/asm-x86/mach-rdc321x/gpio.h @@ -5,19 +5,20 @@ extern int rdc_gpio_get_value(unsigned gpio); extern void rdc_gpio_set_value(unsigned gpio, int value); extern int rdc_gpio_direction_input(unsigned gpio); extern int rdc_gpio_direction_output(unsigned gpio, int value); - +extern int rdc_gpio_request(unsigned gpio, const char *label); +extern void rdc_gpio_free(unsigned gpio); +extern void __init rdc321x_gpio_setup(void); /* Wrappers for the arch-neutral GPIO API */ static inline int gpio_request(unsigned gpio, const char *label) { - /* Not yet implemented */ - return 0; + return rdc_gpio_request(gpio, label); } static inline void gpio_free(unsigned gpio) { - /* Not yet implemented */ + rdc_gpio_free(gpio); } static inline int gpio_direction_input(unsigned gpio) diff --git a/include/asm-x86/mach-rdc321x/rdc321x_defs.h b/include/asm-x86/mach-rdc321x/rdc321x_defs.h index 838ba8f64fd3..c8e9c8bed3d0 100644 --- a/include/asm-x86/mach-rdc321x/rdc321x_defs.h +++ b/include/asm-x86/mach-rdc321x/rdc321x_defs.h @@ -3,4 +3,10 @@ /* General purpose configuration and data registers */ #define RDC3210_CFGREG_ADDR 0x0CF8 #define RDC3210_CFGREG_DATA 0x0CFC -#define RDC_MAX_GPIO 0x3A + +#define RDC321X_GPIO_CTRL_REG1 0x48 +#define RDC321X_GPIO_CTRL_REG2 0x84 +#define RDC321X_GPIO_DATA_REG1 0x4c +#define RDC321X_GPIO_DATA_REG2 0x88 + +#define RDC321X_MAX_GPIO 58 -- cgit v1.2.3 From 25e59881f109dc6378ebc463ae4c2de907435de3 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Wed, 26 Mar 2008 21:03:04 -0700 Subject: x86: stricter check in follow_huge_addr() The first page of the compound page is determined in follow_huge_addr() but then PageCompound() only checks if the page is part of a compound page. PageHead() allows checking if this is indeed the first page of the compound. Cc: Jeremy Fitzhardinge Signed-off-by: Christoph Lameter Signed-off-by: Ingo Molnar --- arch/x86/mm/hugetlbpage.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c index 4fbafb4bc2f0..0b3d567e686d 100644 --- a/arch/x86/mm/hugetlbpage.c +++ b/arch/x86/mm/hugetlbpage.c @@ -178,7 +178,7 @@ follow_huge_addr(struct mm_struct *mm, unsigned long address, int write) page = &pte_page(*pte)[vpfn % (HPAGE_SIZE/PAGE_SIZE)]; - WARN_ON(!PageCompound(page)); + WARN_ON(!PageHead(page)); return page; } -- cgit v1.2.3 From 04c44a080d2f699a3042d4e743f7ad2ffae9d538 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Mon, 17 Mar 2008 16:36:52 -0700 Subject: xen: fix RMW when unmasking events xen_irq_enable_direct and xen_sysexit were using "andw $0x00ff, XEN_vcpu_info_pending(vcpu)" to unmask events and test for pending ones in one instuction. Unfortunately, the pending flag must be modified with a locked operation since it can be set by another CPU, and the unlocked form of this operation was causing the pending flag to get lost, allowing the processor to return to usermode with pending events and ultimately deadlock. The simple fix would be to make it a locked operation, but that's rather costly and unnecessary. The fix here is to split the mask-clearing and pending-testing into two instructions; the interrupt window between them is of no concern because either way pending or new events will be processed. This should fix lingering bugs in using direct vcpu structure access too. [ Stable: needed in 2.6.24.x ] Signed-off-by: Jeremy Fitzhardinge Cc: Stable Signed-off-by: Ingo Molnar --- arch/x86/xen/enlighten.c | 2 +- arch/x86/xen/xen-asm.S | 9 +++++++-- 2 files changed, 8 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 8b9ee27805fd..1a20318c8cff 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -95,7 +95,7 @@ struct shared_info *HYPERVISOR_shared_info = (void *)&dummy_shared_info; * * 0: not available, 1: available */ -static int have_vcpu_info_placement = 0; +static int have_vcpu_info_placement = 1; static void __init xen_vcpu_setup(int cpu) { diff --git a/arch/x86/xen/xen-asm.S b/arch/x86/xen/xen-asm.S index 1a43b60c0c62..6b7190449d07 100644 --- a/arch/x86/xen/xen-asm.S +++ b/arch/x86/xen/xen-asm.S @@ -33,12 +33,17 @@ events, then enter the hypervisor to get them handled. */ ENTRY(xen_irq_enable_direct) - /* Clear mask and test pending */ - andw $0x00ff, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_pending + /* Unmask events */ + movb $0, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_mask + /* Preempt here doesn't matter because that will deal with any pending interrupts. The pending check may end up being run on the wrong CPU, but that doesn't hurt. */ + + /* Test for pending */ + testb $0xff, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_pending jz 1f + 2: call check_events 1: ENDPATCH(xen_irq_enable_direct) -- cgit v1.2.3 From 2e8fe719b57bbdc9e313daed1204bb55fed3ed44 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Mon, 17 Mar 2008 16:36:53 -0700 Subject: xen: fix UP setup of shared_info We need to set up the shared_info pointer once we've mapped the real shared_info into its fixmap slot. That needs to happen once the general pagetable setup has been done. Previously, the UP shared_info was set up one in xen_start_kernel, but that was left pointing to the dummy shared info. Unfortunately there's no really good place to do a later setup of the shared_info in UP, so just do it once the pagetable setup has been done. [ Stable: needed in 2.6.24.x ] Signed-off-by: Jeremy Fitzhardinge Cc: Stable Kernel Signed-off-by: Ingo Molnar --- arch/x86/xen/enlighten.c | 45 +++++++++++++++++++++++++-------------------- 1 file changed, 25 insertions(+), 20 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 1a20318c8cff..de4e6f05840b 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -103,6 +103,7 @@ static void __init xen_vcpu_setup(int cpu) int err; struct vcpu_info *vcpup; + BUG_ON(HYPERVISOR_shared_info == &dummy_shared_info); per_cpu(xen_vcpu, cpu) = &HYPERVISOR_shared_info->vcpu_info[cpu]; if (!have_vcpu_info_placement) @@ -805,33 +806,43 @@ static __init void xen_pagetable_setup_start(pgd_t *base) PFN_DOWN(__pa(xen_start_info->pt_base))); } -static __init void xen_pagetable_setup_done(pgd_t *base) +static __init void setup_shared_info(void) { - /* This will work as long as patching hasn't happened yet - (which it hasn't) */ - pv_mmu_ops.alloc_pt = xen_alloc_pt; - pv_mmu_ops.alloc_pd = xen_alloc_pd; - pv_mmu_ops.release_pt = xen_release_pt; - pv_mmu_ops.release_pd = xen_release_pt; - pv_mmu_ops.set_pte = xen_set_pte; - if (!xen_feature(XENFEAT_auto_translated_physmap)) { + unsigned long addr = fix_to_virt(FIX_PARAVIRT_BOOTMAP); + /* * Create a mapping for the shared info page. * Should be set_fixmap(), but shared_info is a machine * address with no corresponding pseudo-phys address. */ - set_pte_mfn(fix_to_virt(FIX_PARAVIRT_BOOTMAP), + set_pte_mfn(addr, PFN_DOWN(xen_start_info->shared_info), PAGE_KERNEL); - HYPERVISOR_shared_info = - (struct shared_info *)fix_to_virt(FIX_PARAVIRT_BOOTMAP); - + HYPERVISOR_shared_info = (struct shared_info *)addr; } else HYPERVISOR_shared_info = (struct shared_info *)__va(xen_start_info->shared_info); +#ifndef CONFIG_SMP + /* In UP this is as good a place as any to set up shared info */ + xen_setup_vcpu_info_placement(); +#endif +} + +static __init void xen_pagetable_setup_done(pgd_t *base) +{ + /* This will work as long as patching hasn't happened yet + (which it hasn't) */ + pv_mmu_ops.alloc_pt = xen_alloc_pt; + pv_mmu_ops.alloc_pd = xen_alloc_pd; + pv_mmu_ops.release_pt = xen_release_pt; + pv_mmu_ops.release_pd = xen_release_pt; + pv_mmu_ops.set_pte = xen_set_pte; + + setup_shared_info(); + /* Actually pin the pagetable down, but we can't set PG_pinned yet because the page structures don't exist yet. */ { @@ -1182,15 +1193,9 @@ asmlinkage void __init xen_start_kernel(void) x86_write_percpu(xen_cr3, __pa(pgd)); x86_write_percpu(xen_current_cr3, __pa(pgd)); -#ifdef CONFIG_SMP /* Don't do the full vcpu_info placement stuff until we have a - possible map. */ + possible map and a non-dummy shared_info. */ per_cpu(xen_vcpu, 0) = &HYPERVISOR_shared_info->vcpu_info[0]; -#else - /* May as well do it now, since there's no good time to call - it later on UP. */ - xen_setup_vcpu_info_placement(); -#endif pv_info.kernel_rpl = 1; if (xen_feature(XENFEAT_supervisor_mode_kernel)) -- cgit v1.2.3 From 3085354de635179d70c240e6d942bcbd1d93056c Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 27 Mar 2008 21:29:09 +0100 Subject: x86: prefetch fix #2 Linus noticed a second bug and an uncleanliness: - we'd return on any instruction fetch fault - we'd use both the value of 16 and the PF_INSTR symbol which are the same and make no sense the cleanup nicely unifies this piece of logic. Signed-off-by: Ingo Molnar --- arch/x86/mm/fault.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index c0c82bc143c9..ec08d8389850 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -91,13 +91,10 @@ static int is_prefetch(struct pt_regs *regs, unsigned long addr, int prefetch = 0; unsigned char *max_instr; -#ifdef CONFIG_X86_32 - /* Catch an obscure case of prefetch inside an NX page: */ - if ((__supported_pte_mask & _PAGE_NX) && (error_code & 16)) - return 0; -#endif - - /* If it was a exec fault on NX page, ignore */ + /* + * If it was a exec (instruction fetch) fault on NX page, then + * do not ignore the fault: + */ if (error_code & PF_INSTR) return 0; -- cgit v1.2.3 From a6bd8e13034dd7d60b6f14217096efa192d0adc1 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Fri, 28 Mar 2008 11:05:53 -0500 Subject: lguest: comment documentation update. Took some cycles to re-read the Lguest Journey end-to-end, fix some rot and tighten some phrases. Only comments change. No new jokes, but a couple of recycled old jokes. Signed-off-by: Rusty Russell --- Documentation/lguest/lguest.c | 69 ++++++++++++---------- arch/x86/lguest/boot.c | 108 +++++++++++++++++++--------------- arch/x86/lguest/i386_head.S | 15 +++-- drivers/lguest/core.c | 18 +++--- drivers/lguest/hypercalls.c | 11 +++- drivers/lguest/interrupts_and_traps.c | 7 +-- drivers/lguest/lguest_device.c | 11 ++-- drivers/lguest/lguest_user.c | 30 +++++++--- drivers/lguest/page_tables.c | 32 +++++----- drivers/lguest/x86/core.c | 33 +++++++---- drivers/lguest/x86/switcher_32.S | 8 +-- include/asm-x86/lguest_hcall.h | 2 +- include/linux/lguest_launcher.h | 6 +- 13 files changed, 208 insertions(+), 142 deletions(-) (limited to 'arch/x86') diff --git a/Documentation/lguest/lguest.c b/Documentation/lguest/lguest.c index d45c7f682b1b..4c1fc65a8b3d 100644 --- a/Documentation/lguest/lguest.c +++ b/Documentation/lguest/lguest.c @@ -1,7 +1,7 @@ /*P:100 This is the Launcher code, a simple program which lays out the - * "physical" memory for the new Guest by mapping the kernel image and the - * virtual devices, then reads repeatedly from /dev/lguest to run the Guest. -:*/ + * "physical" memory for the new Guest by mapping the kernel image and + * the virtual devices, then opens /dev/lguest to tell the kernel + * about the Guest and control it. :*/ #define _LARGEFILE64_SOURCE #define _GNU_SOURCE #include @@ -43,7 +43,7 @@ #include "linux/virtio_console.h" #include "linux/virtio_ring.h" #include "asm-x86/bootparam.h" -/*L:110 We can ignore the 38 include files we need for this program, but I do +/*L:110 We can ignore the 39 include files we need for this program, but I do * want to draw attention to the use of kernel-style types. * * As Linus said, "C is a Spartan language, and so should your naming be." I @@ -320,7 +320,7 @@ static unsigned long map_elf(int elf_fd, const Elf32_Ehdr *ehdr) err(1, "Reading program headers"); /* Try all the headers: there are usually only three. A read-only one, - * a read-write one, and a "note" section which isn't loadable. */ + * a read-write one, and a "note" section which we don't load. */ for (i = 0; i < ehdr->e_phnum; i++) { /* If this isn't a loadable segment, we ignore it */ if (phdr[i].p_type != PT_LOAD) @@ -387,7 +387,7 @@ static unsigned long load_kernel(int fd) if (memcmp(hdr.e_ident, ELFMAG, SELFMAG) == 0) return map_elf(fd, &hdr); - /* Otherwise we assume it's a bzImage, and try to unpack it */ + /* Otherwise we assume it's a bzImage, and try to load it. */ return load_bzimage(fd); } @@ -433,12 +433,12 @@ static unsigned long load_initrd(const char *name, unsigned long mem) return len; } -/* Once we know how much memory we have, we can construct simple linear page +/* Once we know how much memory we have we can construct simple linear page * tables which set virtual == physical which will get the Guest far enough * into the boot to create its own. * * We lay them out of the way, just below the initrd (which is why we need to - * know its size). */ + * know its size here). */ static unsigned long setup_pagetables(unsigned long mem, unsigned long initrd_size) { @@ -850,7 +850,8 @@ static void handle_console_output(int fd, struct virtqueue *vq) * * Handling output for network is also simple: we get all the output buffers * and write them (ignoring the first element) to this device's file descriptor - * (stdout). */ + * (/dev/net/tun). + */ static void handle_net_output(int fd, struct virtqueue *vq) { unsigned int head, out, in; @@ -924,7 +925,7 @@ static void enable_fd(int fd, struct virtqueue *vq) write(waker_fd, &vq->dev->fd, sizeof(vq->dev->fd)); } -/* Resetting a device is fairly easy. */ +/* When the Guest asks us to reset a device, it's is fairly easy. */ static void reset_device(struct device *dev) { struct virtqueue *vq; @@ -1003,8 +1004,8 @@ static void handle_input(int fd) if (select(devices.max_infd+1, &fds, NULL, NULL, &poll) == 0) break; - /* Otherwise, call the device(s) which have readable - * file descriptors and a method of handling them. */ + /* Otherwise, call the device(s) which have readable file + * descriptors and a method of handling them. */ for (i = devices.dev; i; i = i->next) { if (i->handle_input && FD_ISSET(i->fd, &fds)) { int dev_fd; @@ -1015,8 +1016,7 @@ static void handle_input(int fd) * should no longer service it. Networking and * console do this when there's no input * buffers to deliver into. Console also uses - * it when it discovers that stdin is - * closed. */ + * it when it discovers that stdin is closed. */ FD_CLR(i->fd, &devices.infds); /* Tell waker to ignore it too, by sending a * negative fd number (-1, since 0 is a valid @@ -1033,7 +1033,8 @@ static void handle_input(int fd) * * All devices need a descriptor so the Guest knows it exists, and a "struct * device" so the Launcher can keep track of it. We have common helper - * routines to allocate and manage them. */ + * routines to allocate and manage them. + */ /* The layout of the device page is a "struct lguest_device_desc" followed by a * number of virtqueue descriptors, then two sets of feature bits, then an @@ -1078,7 +1079,7 @@ static void add_virtqueue(struct device *dev, unsigned int num_descs, struct virtqueue **i, *vq = malloc(sizeof(*vq)); void *p; - /* First we need some pages for this virtqueue. */ + /* First we need some memory for this virtqueue. */ pages = (vring_size(num_descs, getpagesize()) + getpagesize() - 1) / getpagesize(); p = get_pages(pages); @@ -1122,7 +1123,7 @@ static void add_virtqueue(struct device *dev, unsigned int num_descs, } /* The first half of the feature bitmask is for us to advertise features. The - * second half if for the Guest to accept features. */ + * second half is for the Guest to accept features. */ static void add_feature(struct device *dev, unsigned bit) { u8 *features = get_feature_bits(dev); @@ -1151,7 +1152,9 @@ static void set_config(struct device *dev, unsigned len, const void *conf) } /* This routine does all the creation and setup of a new device, including - * calling new_dev_desc() to allocate the descriptor and device memory. */ + * calling new_dev_desc() to allocate the descriptor and device memory. + * + * See what I mean about userspace being boring? */ static struct device *new_device(const char *name, u16 type, int fd, bool (*handle_input)(int, struct device *)) { @@ -1492,7 +1495,10 @@ static int io_thread(void *_dev) while (read(vblk->workpipe[0], &c, 1) == 1) { /* We acknowledge each request immediately to reduce latency, * rather than waiting until we've done them all. I haven't - * measured to see if it makes any difference. */ + * measured to see if it makes any difference. + * + * That would be an interesting test, wouldn't it? You could + * also try having more than one I/O thread. */ while (service_io(dev)) write(vblk->done_fd, &c, 1); } @@ -1500,7 +1506,7 @@ static int io_thread(void *_dev) } /* Now we've seen the I/O thread, we return to the Launcher to see what happens - * when the thread tells us it's completed some I/O. */ + * when that thread tells us it's completed some I/O. */ static bool handle_io_finish(int fd, struct device *dev) { char c; @@ -1572,11 +1578,12 @@ static void setup_block_file(const char *filename) * more work. */ pipe(vblk->workpipe); - /* Create stack for thread and run it */ + /* Create stack for thread and run it. Since stack grows upwards, we + * point the stack pointer to the end of this region. */ stack = malloc(32768); /* SIGCHLD - We dont "wait" for our cloned thread, so prevent it from * becoming a zombie. */ - if (clone(io_thread, stack + 32768, CLONE_VM | SIGCHLD, dev) == -1) + if (clone(io_thread, stack + 32768, CLONE_VM | SIGCHLD, dev) == -1) err(1, "Creating clone"); /* We don't need to keep the I/O thread's end of the pipes open. */ @@ -1586,14 +1593,14 @@ static void setup_block_file(const char *filename) verbose("device %u: virtblock %llu sectors\n", devices.device_num, le64_to_cpu(conf.capacity)); } -/* That's the end of device setup. :*/ +/* That's the end of device setup. */ -/* Reboot */ +/*L:230 Reboot is pretty easy: clean up and exec() the Launcher afresh. */ static void __attribute__((noreturn)) restart_guest(void) { unsigned int i; - /* Closing pipes causes the waker thread and io_threads to die, and + /* Closing pipes causes the Waker thread and io_threads to die, and * closing /dev/lguest cleans up the Guest. Since we don't track all * open fds, we simply close everything beyond stderr. */ for (i = 3; i < FD_SETSIZE; i++) @@ -1602,7 +1609,7 @@ static void __attribute__((noreturn)) restart_guest(void) err(1, "Could not exec %s", main_args[0]); } -/*L:220 Finally we reach the core of the Launcher, which runs the Guest, serves +/*L:220 Finally we reach the core of the Launcher which runs the Guest, serves * its input and output, and finally, lays it to rest. */ static void __attribute__((noreturn)) run_guest(int lguest_fd) { @@ -1643,7 +1650,7 @@ static void __attribute__((noreturn)) run_guest(int lguest_fd) err(1, "Resetting break"); } } -/* +/*L:240 * This is the end of the Launcher. The good news: we are over halfway * through! The bad news: the most fiendish part of the code still lies ahead * of us. @@ -1690,8 +1697,8 @@ int main(int argc, char *argv[]) * device receive input from a file descriptor, we keep an fdset * (infds) and the maximum fd number (max_infd) with the head of the * list. We also keep a pointer to the last device. Finally, we keep - * the next interrupt number to hand out (1: remember that 0 is used by - * the timer). */ + * the next interrupt number to use for devices (1: remember that 0 is + * used by the timer). */ FD_ZERO(&devices.infds); devices.max_infd = -1; devices.lastdev = NULL; @@ -1792,8 +1799,8 @@ int main(int argc, char *argv[]) lguest_fd = tell_kernel(pgdir, start); /* We fork off a child process, which wakes the Launcher whenever one - * of the input file descriptors needs attention. Otherwise we would - * run the Guest until it tries to output something. */ + * of the input file descriptors needs attention. We call this the + * Waker, and we'll cover it in a moment. */ waker_fd = setup_waker(lguest_fd); /* Finally, run the Guest. This doesn't return. */ diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c index a104c532ff70..3335b4595efd 100644 --- a/arch/x86/lguest/boot.c +++ b/arch/x86/lguest/boot.c @@ -10,21 +10,19 @@ * (such as the example in Documentation/lguest/lguest.c) is called the * Launcher. * - * Secondly, we only run specially modified Guests, not normal kernels. When - * you set CONFIG_LGUEST to 'y' or 'm', this automatically sets - * CONFIG_LGUEST_GUEST=y, which compiles this file into the kernel so it knows - * how to be a Guest. This means that you can use the same kernel you boot - * normally (ie. as a Host) as a Guest. + * Secondly, we only run specially modified Guests, not normal kernels: setting + * CONFIG_LGUEST_GUEST to "y" compiles this file into the kernel so it knows + * how to be a Guest at boot time. This means that you can use the same kernel + * you boot normally (ie. as a Host) as a Guest. * * These Guests know that they cannot do privileged operations, such as disable * interrupts, and that they have to ask the Host to do such things explicitly. * This file consists of all the replacements for such low-level native * hardware operations: these special Guest versions call the Host. * - * So how does the kernel know it's a Guest? The Guest starts at a special - * entry point marked with a magic string, which sets up a few things then - * calls here. We replace the native functions various "paravirt" structures - * with our Guest versions, then boot like normal. :*/ + * So how does the kernel know it's a Guest? We'll see that later, but let's + * just say that we end up here where we replace the native functions various + * "paravirt" structures with our Guest versions, then boot like normal. :*/ /* * Copyright (C) 2006, Rusty Russell IBM Corporation. @@ -134,7 +132,7 @@ static void async_hcall(unsigned long call, unsigned long arg1, * lguest_leave_lazy_mode(). * * So, when we're in lazy mode, we call async_hcall() to store the call for - * future processing. */ + * future processing: */ static void lazy_hcall(unsigned long call, unsigned long arg1, unsigned long arg2, @@ -147,7 +145,7 @@ static void lazy_hcall(unsigned long call, } /* When lazy mode is turned off reset the per-cpu lazy mode variable and then - * issue a hypercall to flush any stored calls. */ + * issue the do-nothing hypercall to flush any stored calls. */ static void lguest_leave_lazy_mode(void) { paravirt_leave_lazy(paravirt_get_lazy_mode()); @@ -164,7 +162,7 @@ static void lguest_leave_lazy_mode(void) * * So instead we keep an "irq_enabled" field inside our "struct lguest_data", * which the Guest can update with a single instruction. The Host knows to - * check there when it wants to deliver an interrupt. + * check there before it tries to deliver an interrupt. */ /* save_flags() is expected to return the processor state (ie. "flags"). The @@ -196,10 +194,15 @@ static void irq_enable(void) /*M:003 Note that we don't check for outstanding interrupts when we re-enable * them (or when we unmask an interrupt). This seems to work for the moment, * since interrupts are rare and we'll just get the interrupt on the next timer - * tick, but when we turn on CONFIG_NO_HZ, we should revisit this. One way + * tick, but now we can run with CONFIG_NO_HZ, we should revisit this. One way * would be to put the "irq_enabled" field in a page by itself, and have the * Host write-protect it when an interrupt comes in when irqs are disabled. - * There will then be a page fault as soon as interrupts are re-enabled. :*/ + * There will then be a page fault as soon as interrupts are re-enabled. + * + * A better method is to implement soft interrupt disable generally for x86: + * instead of disabling interrupts, we set a flag. If an interrupt does come + * in, we then disable them for real. This is uncommon, so we could simply use + * a hypercall for interrupt control and not worry about efficiency. :*/ /*G:034 * The Interrupt Descriptor Table (IDT). @@ -212,6 +215,10 @@ static void irq_enable(void) static void lguest_write_idt_entry(gate_desc *dt, int entrynum, const gate_desc *g) { + /* The gate_desc structure is 8 bytes long: we hand it to the Host in + * two 32-bit chunks. The whole 32-bit kernel used to hand descriptors + * around like this; typesafety wasn't a big concern in Linux's early + * years. */ u32 *desc = (u32 *)g; /* Keep the local copy up to date. */ native_write_idt_entry(dt, entrynum, g); @@ -243,7 +250,8 @@ static void lguest_load_idt(const struct desc_ptr *desc) * * This is the opposite of the IDT code where we have a LOAD_IDT_ENTRY * hypercall and use that repeatedly to load a new IDT. I don't think it - * really matters, but wouldn't it be nice if they were the same? + * really matters, but wouldn't it be nice if they were the same? Wouldn't + * it be even better if you were the one to send the patch to fix it? */ static void lguest_load_gdt(const struct desc_ptr *desc) { @@ -298,9 +306,9 @@ static void lguest_load_tr_desc(void) /* The "cpuid" instruction is a way of querying both the CPU identity * (manufacturer, model, etc) and its features. It was introduced before the - * Pentium in 1993 and keeps getting extended by both Intel and AMD. As you - * might imagine, after a decade and a half this treatment, it is now a giant - * ball of hair. Its entry in the current Intel manual runs to 28 pages. + * Pentium in 1993 and keeps getting extended by both Intel, AMD and others. + * As you might imagine, after a decade and a half this treatment, it is now a + * giant ball of hair. Its entry in the current Intel manual runs to 28 pages. * * This instruction even it has its own Wikipedia entry. The Wikipedia entry * has been translated into 4 languages. I am not making this up! @@ -594,17 +602,17 @@ static unsigned long lguest_get_wallclock(void) return lguest_data.time.tv_sec; } -/* The TSC is a Time Stamp Counter. The Host tells us what speed it runs at, - * or 0 if it's unusable as a reliable clock source. This matches what we want - * here: if we return 0 from this function, the x86 TSC clock will not register - * itself. */ +/* The TSC is an Intel thing called the Time Stamp Counter. The Host tells us + * what speed it runs at, or 0 if it's unusable as a reliable clock source. + * This matches what we want here: if we return 0 from this function, the x86 + * TSC clock will give up and not register itself. */ static unsigned long lguest_cpu_khz(void) { return lguest_data.tsc_khz; } -/* If we can't use the TSC, the kernel falls back to our "lguest_clock", where - * we read the time value given to us by the Host. */ +/* If we can't use the TSC, the kernel falls back to our lower-priority + * "lguest_clock", where we read the time value given to us by the Host. */ static cycle_t lguest_clock_read(void) { unsigned long sec, nsec; @@ -648,12 +656,16 @@ static struct clocksource lguest_clock = { static int lguest_clockevent_set_next_event(unsigned long delta, struct clock_event_device *evt) { + /* FIXME: I don't think this can ever happen, but James tells me he had + * to put this code in. Maybe we should remove it now. Anyone? */ if (delta < LG_CLOCK_MIN_DELTA) { if (printk_ratelimit()) printk(KERN_DEBUG "%s: small delta %lu ns\n", __FUNCTION__, delta); return -ETIME; } + + /* Please wake us this far in the future. */ hcall(LHCALL_SET_CLOCKEVENT, delta, 0, 0); return 0; } @@ -738,7 +750,7 @@ static void lguest_time_init(void) * will not tolerate us trying to use that), the stack pointer, and the number * of pages in the stack. */ static void lguest_load_sp0(struct tss_struct *tss, - struct thread_struct *thread) + struct thread_struct *thread) { lazy_hcall(LHCALL_SET_STACK, __KERNEL_DS|0x1, thread->sp0, THREAD_SIZE/PAGE_SIZE); @@ -786,9 +798,8 @@ static void lguest_safe_halt(void) hcall(LHCALL_HALT, 0, 0, 0); } -/* Perhaps CRASH isn't the best name for this hypercall, but we use it to get a - * message out when we're crashing as well as elegant termination like powering - * off. +/* The SHUTDOWN hypercall takes a string to describe what's happening, and + * an argument which says whether this to restart (reboot) the Guest or not. * * Note that the Host always prefers that the Guest speak in physical addresses * rather than virtual addresses, so we use __pa() here. */ @@ -816,8 +827,9 @@ static struct notifier_block paniced = { /* Setting up memory is fairly easy. */ static __init char *lguest_memory_setup(void) { - /* We do this here and not earlier because lockcheck barfs if we do it - * before start_kernel() */ + /* We do this here and not earlier because lockcheck used to barf if we + * did it before start_kernel(). I think we fixed that, so it'd be + * nice to move it back to lguest_init. Patch welcome... */ atomic_notifier_chain_register(&panic_notifier_list, &paniced); /* The Linux bootloader header contains an "e820" memory map: the @@ -850,12 +862,19 @@ static __init int early_put_chars(u32 vtermno, const char *buf, int count) return len; } +/* Rebooting also tells the Host we're finished, but the RESTART flag tells the + * Launcher to reboot us. */ +static void lguest_restart(char *reason) +{ + hcall(LHCALL_SHUTDOWN, __pa(reason), LGUEST_SHUTDOWN_RESTART, 0); +} + /*G:050 * Patching (Powerfully Placating Performance Pedants) * - * We have already seen that pv_ops structures let us replace simple - * native instructions with calls to the appropriate back end all throughout - * the kernel. This allows the same kernel to run as a Guest and as a native + * We have already seen that pv_ops structures let us replace simple native + * instructions with calls to the appropriate back end all throughout the + * kernel. This allows the same kernel to run as a Guest and as a native * kernel, but it's slow because of all the indirect branches. * * Remember that David Wheeler quote about "Any problem in computer science can @@ -908,14 +927,9 @@ static unsigned lguest_patch(u8 type, u16 clobber, void *ibuf, return insn_len; } -static void lguest_restart(char *reason) -{ - hcall(LHCALL_SHUTDOWN, __pa(reason), LGUEST_SHUTDOWN_RESTART, 0); -} - -/*G:030 Once we get to lguest_init(), we know we're a Guest. The pv_ops - * structures in the kernel provide points for (almost) every routine we have - * to override to avoid privileged instructions. */ +/*G:030 Once we get to lguest_init(), we know we're a Guest. The various + * pv_ops structures in the kernel provide points for (almost) every routine we + * have to override to avoid privileged instructions. */ __init void lguest_init(void) { /* We're under lguest, paravirt is enabled, and we're running at @@ -1003,9 +1017,9 @@ __init void lguest_init(void) * the normal data segment to get through booting. */ asm volatile ("mov %0, %%fs" : : "r" (__KERNEL_DS) : "memory"); - /* The Host uses the top of the Guest's virtual address space for the - * Host<->Guest Switcher, and it tells us how big that is in - * lguest_data.reserve_mem, set up on the LGUEST_INIT hypercall. */ + /* The Host<->Guest Switcher lives at the top of our address space, and + * the Host told us how big it is when we made LGUEST_INIT hypercall: + * it put the answer in lguest_data.reserve_mem */ reserve_top_address(lguest_data.reserve_mem); /* If we don't initialize the lock dependency checker now, it crashes @@ -1027,6 +1041,7 @@ __init void lguest_init(void) /* Math is always hard! */ new_cpu_data.hard_math = 1; + /* We don't have features. We have puppies! Puppies! */ #ifdef CONFIG_X86_MCE mce_disabled = 1; #endif @@ -1044,10 +1059,11 @@ __init void lguest_init(void) virtio_cons_early_init(early_put_chars); /* Last of all, we set the power management poweroff hook to point to - * the Guest routine to power off. */ + * the Guest routine to power off, and the reboot hook to our restart + * routine. */ pm_power_off = lguest_power_off; - machine_ops.restart = lguest_restart; + /* Now we're set up, call start_kernel() in init/main.c and we proceed * to boot as normal. It never returns. */ start_kernel(); diff --git a/arch/x86/lguest/i386_head.S b/arch/x86/lguest/i386_head.S index 95b6fbcded63..5c7cef34c9e7 100644 --- a/arch/x86/lguest/i386_head.S +++ b/arch/x86/lguest/i386_head.S @@ -5,13 +5,20 @@ #include #include -/*G:020 This is where we begin: head.S notes that the boot header's platform - * type field is "1" (lguest), so calls us here. +/*G:020 Our story starts with the kernel booting into startup_32 in + * arch/x86/kernel/head_32.S. It expects a boot header, which is created by + * the bootloader (the Launcher in our case). + * + * The startup_32 function does very little: it clears the uninitialized global + * C variables which we expect to be zero (ie. BSS) and then copies the boot + * header and kernel command line somewhere safe. Finally it checks the + * 'hardware_subarch' field. This was introduced in 2.6.24 for lguest and Xen: + * if it's set to '1' (lguest's assigned number), then it calls us here. * * WARNING: be very careful here! We're running at addresses equal to physical * addesses (around 0), not above PAGE_OFFSET as most code expectes * (eg. 0xC0000000). Jumps are relative, so they're OK, but we can't touch any - * data. + * data without remembering to subtract __PAGE_OFFSET! * * The .section line puts this code in .init.text so it will be discarded after * boot. */ @@ -24,7 +31,7 @@ ENTRY(lguest_entry) int $LGUEST_TRAP_ENTRY /* The Host put the toplevel pagetable in lguest_data.pgdir. The movsl - * instruction uses %esi implicitly as the source for the copy we' + * instruction uses %esi implicitly as the source for the copy we're * about to do. */ movl lguest_data - __PAGE_OFFSET + LGUEST_DATA_pgdir, %esi diff --git a/drivers/lguest/core.c b/drivers/lguest/core.c index c632c08cbbdc..5eea4356d703 100644 --- a/drivers/lguest/core.c +++ b/drivers/lguest/core.c @@ -1,8 +1,6 @@ /*P:400 This contains run_guest() which actually calls into the Host<->Guest * Switcher and analyzes the return, such as determining if the Guest wants the - * Host to do something. This file also contains useful helper routines, and a - * couple of non-obvious setup and teardown pieces which were implemented after - * days of debugging pain. :*/ + * Host to do something. This file also contains useful helper routines. :*/ #include #include #include @@ -49,8 +47,8 @@ static __init int map_switcher(void) * easy. */ - /* We allocate an array of "struct page"s. map_vm_area() wants the - * pages in this form, rather than just an array of pointers. */ + /* We allocate an array of struct page pointers. map_vm_area() wants + * this, rather than just an array of pages. */ switcher_page = kmalloc(sizeof(switcher_page[0])*TOTAL_SWITCHER_PAGES, GFP_KERNEL); if (!switcher_page) { @@ -172,7 +170,7 @@ void __lgread(struct lg_cpu *cpu, void *b, unsigned long addr, unsigned bytes) } } -/* This is the write (copy into guest) version. */ +/* This is the write (copy into Guest) version. */ void __lgwrite(struct lg_cpu *cpu, unsigned long addr, const void *b, unsigned bytes) { @@ -209,9 +207,9 @@ int run_guest(struct lg_cpu *cpu, unsigned long __user *user) if (cpu->break_out) return -EAGAIN; - /* Check if there are any interrupts which can be delivered - * now: if so, this sets up the hander to be executed when we - * next run the Guest. */ + /* Check if there are any interrupts which can be delivered now: + * if so, this sets up the hander to be executed when we next + * run the Guest. */ maybe_do_interrupt(cpu); /* All long-lived kernel loops need to check with this horrible @@ -246,8 +244,10 @@ int run_guest(struct lg_cpu *cpu, unsigned long __user *user) lguest_arch_handle_trap(cpu); } + /* Special case: Guest is 'dead' but wants a reboot. */ if (cpu->lg->dead == ERR_PTR(-ERESTART)) return -ERESTART; + /* The Guest is dead => "No such file or directory" */ return -ENOENT; } diff --git a/drivers/lguest/hypercalls.c b/drivers/lguest/hypercalls.c index 0f2cb4fd7c69..54d66f05fefa 100644 --- a/drivers/lguest/hypercalls.c +++ b/drivers/lguest/hypercalls.c @@ -29,7 +29,7 @@ #include "lg.h" /*H:120 This is the core hypercall routine: where the Guest gets what it wants. - * Or gets killed. Or, in the case of LHCALL_CRASH, both. */ + * Or gets killed. Or, in the case of LHCALL_SHUTDOWN, both. */ static void do_hcall(struct lg_cpu *cpu, struct hcall_args *args) { switch (args->arg0) { @@ -190,6 +190,13 @@ static void initialize(struct lg_cpu *cpu) * pagetable. */ guest_pagetable_clear_all(cpu); } +/*:*/ + +/*M:013 If a Guest reads from a page (so creates a mapping) that it has never + * written to, and then the Launcher writes to it (ie. the output of a virtual + * device), the Guest will still see the old page. In practice, this never + * happens: why would the Guest read a page which it has never written to? But + * a similar scenario might one day bite us, so it's worth mentioning. :*/ /*H:100 * Hypercalls @@ -227,7 +234,7 @@ void do_hypercalls(struct lg_cpu *cpu) * However, if we are signalled or the Guest sends I/O to the * Launcher, the run_guest() loop will exit without running the * Guest. When it comes back it would try to re-run the - * hypercall. */ + * hypercall. Finding that bug sucked. */ cpu->hcall = NULL; } } diff --git a/drivers/lguest/interrupts_and_traps.c b/drivers/lguest/interrupts_and_traps.c index 32e97c1858e5..0414ddf87587 100644 --- a/drivers/lguest/interrupts_and_traps.c +++ b/drivers/lguest/interrupts_and_traps.c @@ -144,7 +144,6 @@ void maybe_do_interrupt(struct lg_cpu *cpu) if (copy_from_user(&blk, cpu->lg->lguest_data->blocked_interrupts, sizeof(blk))) return; - bitmap_andnot(blk, cpu->irqs_pending, blk, LGUEST_IRQS); /* Find the first interrupt. */ @@ -237,9 +236,9 @@ void free_interrupts(void) clear_bit(syscall_vector, used_vectors); } -/*H:220 Now we've got the routines to deliver interrupts, delivering traps - * like page fault is easy. The only trick is that Intel decided that some - * traps should have error codes: */ +/*H:220 Now we've got the routines to deliver interrupts, delivering traps like + * page fault is easy. The only trick is that Intel decided that some traps + * should have error codes: */ static int has_err(unsigned int trap) { return (trap == 8 || (trap >= 10 && trap <= 14) || trap == 17); diff --git a/drivers/lguest/lguest_device.c b/drivers/lguest/lguest_device.c index 1b2ec0bf5eb1..2bc9bf7e88e5 100644 --- a/drivers/lguest/lguest_device.c +++ b/drivers/lguest/lguest_device.c @@ -1,10 +1,10 @@ /*P:050 Lguest guests use a very simple method to describe devices. It's a - * series of device descriptors contained just above the top of normal + * series of device descriptors contained just above the top of normal Guest * memory. * * We use the standard "virtio" device infrastructure, which provides us with a * console, a network and a block driver. Each one expects some configuration - * information and a "virtqueue" mechanism to send and receive data. :*/ + * information and a "virtqueue" or two to send and receive data. :*/ #include #include #include @@ -53,7 +53,7 @@ struct lguest_device { * Device configurations * * The configuration information for a device consists of one or more - * virtqueues, a feature bitmaks, and some configuration bytes. The + * virtqueues, a feature bitmap, and some configuration bytes. The * configuration bytes don't really matter to us: the Launcher sets them up, and * the driver will look at them during setup. * @@ -179,7 +179,7 @@ struct lguest_vq_info }; /* When the virtio_ring code wants to prod the Host, it calls us here and we - * make a hypercall. We hand the page number of the virtqueue so the Host + * make a hypercall. We hand the physical address of the virtqueue so the Host * knows which virtqueue we're talking about. */ static void lg_notify(struct virtqueue *vq) { @@ -199,7 +199,8 @@ static void lg_notify(struct virtqueue *vq) * allocate its own pages and tell the Host where they are, but for lguest it's * simpler for the Host to simply tell us where the pages are. * - * So we provide devices with a "find virtqueue and set it up" function. */ + * So we provide drivers with a "find the Nth virtqueue and set it up" + * function. */ static struct virtqueue *lg_find_vq(struct virtio_device *vdev, unsigned index, void (*callback)(struct virtqueue *vq)) diff --git a/drivers/lguest/lguest_user.c b/drivers/lguest/lguest_user.c index 2221485b0773..564e425d71dd 100644 --- a/drivers/lguest/lguest_user.c +++ b/drivers/lguest/lguest_user.c @@ -73,7 +73,7 @@ static ssize_t read(struct file *file, char __user *user, size_t size,loff_t*o) if (current != cpu->tsk) return -EPERM; - /* If the guest is already dead, we indicate why */ + /* If the Guest is already dead, we indicate why */ if (lg->dead) { size_t len; @@ -88,7 +88,7 @@ static ssize_t read(struct file *file, char __user *user, size_t size,loff_t*o) return len; } - /* If we returned from read() last time because the Guest notified, + /* If we returned from read() last time because the Guest sent I/O, * clear the flag. */ if (cpu->pending_notify) cpu->pending_notify = 0; @@ -97,14 +97,20 @@ static ssize_t read(struct file *file, char __user *user, size_t size,loff_t*o) return run_guest(cpu, (unsigned long __user *)user); } +/*L:025 This actually initializes a CPU. For the moment, a Guest is only + * uniprocessor, so "id" is always 0. */ static int lg_cpu_start(struct lg_cpu *cpu, unsigned id, unsigned long start_ip) { + /* We have a limited number the number of CPUs in the lguest struct. */ if (id >= NR_CPUS) return -EINVAL; + /* Set up this CPU's id, and pointer back to the lguest struct. */ cpu->id = id; cpu->lg = container_of((cpu - id), struct lguest, cpus[0]); cpu->lg->nr_cpus++; + + /* Each CPU has a timer it can set. */ init_clockdev(cpu); /* We need a complete page for the Guest registers: they are accessible @@ -120,11 +126,11 @@ static int lg_cpu_start(struct lg_cpu *cpu, unsigned id, unsigned long start_ip) * address. */ lguest_arch_setup_regs(cpu, start_ip); - /* Initialize the queue for the waker to wait on */ + /* Initialize the queue for the Waker to wait on */ init_waitqueue_head(&cpu->break_wq); /* We keep a pointer to the Launcher task (ie. current task) for when - * other Guests want to wake this one (inter-Guest I/O). */ + * other Guests want to wake this one (eg. console input). */ cpu->tsk = current; /* We need to keep a pointer to the Launcher's memory map, because if @@ -136,6 +142,7 @@ static int lg_cpu_start(struct lg_cpu *cpu, unsigned id, unsigned long start_ip) * when the same Guest runs on the same CPU twice. */ cpu->last_pages = NULL; + /* No error == success. */ return 0; } @@ -185,14 +192,13 @@ static int initialize(struct file *file, const unsigned long __user *input) lg->mem_base = (void __user *)(long)args[0]; lg->pfn_limit = args[1]; - /* This is the first cpu */ + /* This is the first cpu (cpu 0) and it will start booting at args[3] */ err = lg_cpu_start(&lg->cpus[0], 0, args[3]); if (err) goto release_guest; /* Initialize the Guest's shadow page tables, using the toplevel - * address the Launcher gave us. This allocates memory, so can - * fail. */ + * address the Launcher gave us. This allocates memory, so can fail. */ err = init_guest_pagetable(lg, args[2]); if (err) goto free_regs; @@ -218,11 +224,16 @@ unlock: /*L:010 The first operation the Launcher does must be a write. All writes * start with an unsigned long number: for the first write this must be * LHREQ_INITIALIZE to set up the Guest. After that the Launcher can use - * writes of other values to send interrupts. */ + * writes of other values to send interrupts. + * + * Note that we overload the "offset" in the /dev/lguest file to indicate what + * CPU number we're dealing with. Currently this is always 0, since we only + * support uniprocessor Guests, but you can see the beginnings of SMP support + * here. */ static ssize_t write(struct file *file, const char __user *in, size_t size, loff_t *off) { - /* Once the guest is initialized, we hold the "struct lguest" in the + /* Once the Guest is initialized, we hold the "struct lguest" in the * file private data. */ struct lguest *lg = file->private_data; const unsigned long __user *input = (const unsigned long __user *)in; @@ -230,6 +241,7 @@ static ssize_t write(struct file *file, const char __user *in, struct lg_cpu *uninitialized_var(cpu); unsigned int cpu_id = *off; + /* The first value tells us what this request is. */ if (get_user(req, input) != 0) return -EFAULT; input++; diff --git a/drivers/lguest/page_tables.c b/drivers/lguest/page_tables.c index a7f64a9d67e0..d93500f24fbb 100644 --- a/drivers/lguest/page_tables.c +++ b/drivers/lguest/page_tables.c @@ -2,8 +2,8 @@ * previous encounters. It's functional, and as neat as it can be in the * circumstances, but be wary, for these things are subtle and break easily. * The Guest provides a virtual to physical mapping, but we can neither trust - * it nor use it: we verify and convert it here to point the hardware to the - * actual Guest pages when running the Guest. :*/ + * it nor use it: we verify and convert it here then point the CPU to the + * converted Guest pages when running the Guest. :*/ /* Copyright (C) Rusty Russell IBM Corporation 2006. * GPL v2 and any later version */ @@ -106,6 +106,11 @@ static unsigned long gpte_addr(pgd_t gpgd, unsigned long vaddr) BUG_ON(!(pgd_flags(gpgd) & _PAGE_PRESENT)); return gpage + ((vaddr>>PAGE_SHIFT) % PTRS_PER_PTE) * sizeof(pte_t); } +/*:*/ + +/*M:014 get_pfn is slow; it takes the mmap sem and calls get_user_pages. We + * could probably try to grab batches of pages here as an optimization + * (ie. pre-faulting). :*/ /*H:350 This routine takes a page number given by the Guest and converts it to * an actual, physical page number. It can fail for several reasons: the @@ -113,8 +118,8 @@ static unsigned long gpte_addr(pgd_t gpgd, unsigned long vaddr) * and the page is read-only, or the write flag was set and the page was * shared so had to be copied, but we ran out of memory. * - * This holds a reference to the page, so release_pte() is careful to - * put that back. */ + * This holds a reference to the page, so release_pte() is careful to put that + * back. */ static unsigned long get_pfn(unsigned long virtpfn, int write) { struct page *page; @@ -532,13 +537,13 @@ static void do_set_pte(struct lg_cpu *cpu, int idx, * all processes. So when the page table above that address changes, we update * all the page tables, not just the current one. This is rare. * - * The benefit is that when we have to track a new page table, we can copy keep - * all the kernel mappings. This speeds up context switch immensely. */ + * The benefit is that when we have to track a new page table, we can keep all + * the kernel mappings. This speeds up context switch immensely. */ void guest_set_pte(struct lg_cpu *cpu, unsigned long gpgdir, unsigned long vaddr, pte_t gpte) { - /* Kernel mappings must be changed on all top levels. Slow, but - * doesn't happen often. */ + /* Kernel mappings must be changed on all top levels. Slow, but doesn't + * happen often. */ if (vaddr >= cpu->lg->kernel_address) { unsigned int i; for (i = 0; i < ARRAY_SIZE(cpu->lg->pgdirs); i++) @@ -704,12 +709,11 @@ static __init void populate_switcher_pte_page(unsigned int cpu, /* We've made it through the page table code. Perhaps our tired brains are * still processing the details, or perhaps we're simply glad it's over. * - * If nothing else, note that all this complexity in juggling shadow page - * tables in sync with the Guest's page tables is for one reason: for most - * Guests this page table dance determines how bad performance will be. This - * is why Xen uses exotic direct Guest pagetable manipulation, and why both - * Intel and AMD have implemented shadow page table support directly into - * hardware. + * If nothing else, note that all this complexity in juggling shadow page tables + * in sync with the Guest's page tables is for one reason: for most Guests this + * page table dance determines how bad performance will be. This is why Xen + * uses exotic direct Guest pagetable manipulation, and why both Intel and AMD + * have implemented shadow page table support directly into hardware. * * There is just one file remaining in the Host. */ diff --git a/drivers/lguest/x86/core.c b/drivers/lguest/x86/core.c index 635187812d52..5126d5d9ea0e 100644 --- a/drivers/lguest/x86/core.c +++ b/drivers/lguest/x86/core.c @@ -17,6 +17,13 @@ * along with this program; if not, write to the Free Software * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ +/*P:450 This file contains the x86-specific lguest code. It used to be all + * mixed in with drivers/lguest/core.c but several foolhardy code slashers + * wrestled most of the dependencies out to here in preparation for porting + * lguest to other architectures (see what I mean by foolhardy?). + * + * This also contains a couple of non-obvious setup and teardown pieces which + * were implemented after days of debugging pain. :*/ #include #include #include @@ -157,6 +164,8 @@ static void run_guest_once(struct lg_cpu *cpu, struct lguest_pages *pages) * also simplify copy_in_guest_info(). Note that we'd still need to restore * things when we exit to Launcher userspace, but that's fairly easy. * + * We could also try using this hooks for PGE, but that might be too expensive. + * * The hooks were designed for KVM, but we can also put them to good use. :*/ /*H:040 This is the i386-specific code to setup and run the Guest. Interrupts @@ -182,7 +191,7 @@ void lguest_arch_run_guest(struct lg_cpu *cpu) * was doing. */ run_guest_once(cpu, lguest_pages(raw_smp_processor_id())); - /* Note that the "regs" pointer contains two extra entries which are + /* Note that the "regs" structure contains two extra entries which are * not really registers: a trap number which says what interrupt or * trap made the switcher code come back, and an error code which some * traps set. */ @@ -293,11 +302,10 @@ void lguest_arch_handle_trap(struct lg_cpu *cpu) break; case 14: /* We've intercepted a Page Fault. */ /* The Guest accessed a virtual address that wasn't mapped. - * This happens a lot: we don't actually set up most of the - * page tables for the Guest at all when we start: as it runs - * it asks for more and more, and we set them up as - * required. In this case, we don't even tell the Guest that - * the fault happened. + * This happens a lot: we don't actually set up most of the page + * tables for the Guest at all when we start: as it runs it asks + * for more and more, and we set them up as required. In this + * case, we don't even tell the Guest that the fault happened. * * The errcode tells whether this was a read or a write, and * whether kernel or userspace code. */ @@ -342,7 +350,7 @@ void lguest_arch_handle_trap(struct lg_cpu *cpu) if (!deliver_trap(cpu, cpu->regs->trapnum)) /* If the Guest doesn't have a handler (either it hasn't * registered any yet, or it's one of the faults we don't let - * it handle), it dies with a cryptic error message. */ + * it handle), it dies with this cryptic error message. */ kill_guest(cpu, "unhandled trap %li at %#lx (%#lx)", cpu->regs->trapnum, cpu->regs->eip, cpu->regs->trapnum == 14 ? cpu->arch.last_pagefault @@ -375,8 +383,8 @@ void __init lguest_arch_host_init(void) * The only exception is the interrupt handlers in switcher.S: their * addresses are placed in a table (default_idt_entries), so we need to * update the table with the new addresses. switcher_offset() is a - * convenience function which returns the distance between the builtin - * switcher code and the high-mapped copy we just made. */ + * convenience function which returns the distance between the + * compiled-in switcher code and the high-mapped copy we just made. */ for (i = 0; i < IDT_ENTRIES; i++) default_idt_entries[i] += switcher_offset(); @@ -416,7 +424,7 @@ void __init lguest_arch_host_init(void) state->guest_gdt_desc.address = (long)&state->guest_gdt; /* We know where we want the stack to be when the Guest enters - * the switcher: in pages->regs. The stack grows upwards, so + * the Switcher: in pages->regs. The stack grows upwards, so * we start it at the end of that structure. */ state->guest_tss.sp0 = (long)(&pages->regs + 1); /* And this is the GDT entry to use for the stack: we keep a @@ -513,8 +521,8 @@ int lguest_arch_init_hypercalls(struct lg_cpu *cpu) { u32 tsc_speed; - /* The pointer to the Guest's "struct lguest_data" is the only - * argument. We check that address now. */ + /* The pointer to the Guest's "struct lguest_data" is the only argument. + * We check that address now. */ if (!lguest_address_ok(cpu->lg, cpu->hcall->arg1, sizeof(*cpu->lg->lguest_data))) return -EFAULT; @@ -546,6 +554,7 @@ int lguest_arch_init_hypercalls(struct lg_cpu *cpu) return 0; } +/*:*/ /*L:030 lguest_arch_setup_regs() * diff --git a/drivers/lguest/x86/switcher_32.S b/drivers/lguest/x86/switcher_32.S index 0af8baaa0d4a..3fc15318a80f 100644 --- a/drivers/lguest/x86/switcher_32.S +++ b/drivers/lguest/x86/switcher_32.S @@ -1,6 +1,6 @@ -/*P:900 This is the Switcher: code which sits at 0xFFC00000 to do the low-level - * Guest<->Host switch. It is as simple as it can be made, but it's naturally - * very specific to x86. +/*P:900 This is the Switcher: code which sits at 0xFFC00000 astride both the + * Host and Guest to do the low-level Guest<->Host switch. It is as simple as + * it can be made, but it's naturally very specific to x86. * * You have now completed Preparation. If this has whet your appetite; if you * are feeling invigorated and refreshed then the next, more challenging stage @@ -189,7 +189,7 @@ ENTRY(switch_to_guest) // Interrupts are turned back on: we are Guest. iret -// We treat two paths to switch back to the Host +// We tread two paths to switch back to the Host // Yet both must save Guest state and restore Host // So we put the routine in a macro. #define SWITCH_TO_HOST \ diff --git a/include/asm-x86/lguest_hcall.h b/include/asm-x86/lguest_hcall.h index 758b9a5d4539..f239e7069cab 100644 --- a/include/asm-x86/lguest_hcall.h +++ b/include/asm-x86/lguest_hcall.h @@ -27,7 +27,7 @@ #ifndef __ASSEMBLY__ #include -/*G:031 First, how does our Guest contact the Host to ask for privileged +/*G:031 But first, how does our Guest contact the Host to ask for privileged * operations? There are two ways: the direct way is to make a "hypercall", * to make requests of the Host Itself. * diff --git a/include/linux/lguest_launcher.h b/include/linux/lguest_launcher.h index 589be3e1f3ac..e7217dc58f39 100644 --- a/include/linux/lguest_launcher.h +++ b/include/linux/lguest_launcher.h @@ -16,6 +16,10 @@ * a new device, we simply need to write a new virtio driver and create support * for it in the Launcher: this code won't need to change. * + * Virtio devices are also used by kvm, so we can simply reuse their optimized + * device drivers. And one day when everyone uses virtio, my plan will be + * complete. Bwahahahah! + * * Devices are described by a simplified ID, a status byte, and some "config" * bytes which describe this device's configuration. This is placed by the * Launcher just above the top of physical memory: @@ -26,7 +30,7 @@ struct lguest_device_desc { /* The number of virtqueues (first in config array) */ __u8 num_vq; /* The number of bytes of feature bits. Multiply by 2: one for host - * features and one for guest acknowledgements. */ + * features and one for Guest acknowledgements. */ __u8 feature_len; /* The number of bytes of the config array after virtqueues. */ __u8 config_len; -- cgit v1.2.3 From 9c312058b2e530722c7bd30c1b6f26eea35dc5fe Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Fri, 28 Mar 2008 11:47:34 -0700 Subject: Avoid false positive warnings in kmap_atomic_prot() with DEBUG_HIGHMEM I believe http://bugzilla.kernel.org/show_bug.cgi?id=10318 is a false positive. There's no way in which networking will be using highmem pages here, so it won't be taking the KM_USER0 kmap slot, so there's no point in performing these checks. Cc: Pawel Staszewski Cc: Ingo Molnar Acked-by: Christoph Lameter Cc: "David S. Miller" Signed-off-by: Andrew Morton [ Really sad. We lose almost all real-life coverage of the debug tests with this patch. Now it will only report problems for the cases where people actually end up using a HIGHMEM page, not when they just _might_ use one. - Linus ] Signed-off-by: Linus Torvalds --- arch/x86/mm/highmem_32.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/highmem_32.c b/arch/x86/mm/highmem_32.c index 3d936f232704..9cf33d3ee5bc 100644 --- a/arch/x86/mm/highmem_32.c +++ b/arch/x86/mm/highmem_32.c @@ -73,15 +73,15 @@ void *kmap_atomic_prot(struct page *page, enum km_type type, pgprot_t prot) { enum fixed_addresses idx; unsigned long vaddr; - /* even !CONFIG_PREEMPT needs this, for in_atomic in do_page_fault */ - - debug_kmap_atomic_prot(type); + /* even !CONFIG_PREEMPT needs this, for in_atomic in do_page_fault */ pagefault_disable(); if (!PageHighMem(page)) return page_address(page); + debug_kmap_atomic_prot(type); + idx = type + KM_TYPE_NR*smp_processor_id(); vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); BUG_ON(!pte_none(*(kmap_pte-idx))); -- cgit v1.2.3 From 629c8b4cdb354518308663aff2f719e02f69ffbe Mon Sep 17 00:00:00 2001 From: Ken'ichi Ohmichi Date: Wed, 2 Apr 2008 13:04:50 -0700 Subject: vmcoreinfo: add the symbol "phys_base" Fix the problem that makedumpfile sometimes fails on x86_64 machine. This patch adds the symbol "phys_base" to a vmcoreinfo data. The vmcoreinfo data has the minimum debugging information only for dump filtering. makedumpfile (dump filtering command) gets it to distinguish unnecessary pages, and makedumpfile creates a small dumpfile. On x86_64 kernel which compiled with CONFIG_PHYSICAL_START=0x0 and CONFIG_RELOCATABLE=y, makedumpfile fails like the following: # makedumpfile -d31 /proc/vmcore dumpfile The kernel version is not supported. The created dumpfile may be incomplete. _exclude_free_page: Can't get next online node. makedumpfile Failed. # The cause is the lack of the symbol "phys_base" in a vmcoreinfo data. If the symbol "phys_base" does not exist, makedumpfile considers an x86_64 kernel as non relocatable. As the result, makedumpfile misunderstands the physical address where the kernel is loaded, and it cannot translate a kernel virtual address to physical address correctly. To fix this problem, this patch adds the symbol "phys_base" to a vmcoreinfo data. Signed-off-by: Ken'ichi Ohmichi Cc: "Eric W. Biederman" Cc: Acked-by: Vivek Goyal Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/kernel/machine_kexec_64.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c index 236d2f8f7ddc..576a03db4511 100644 --- a/arch/x86/kernel/machine_kexec_64.c +++ b/arch/x86/kernel/machine_kexec_64.c @@ -233,6 +233,7 @@ NORET_TYPE void machine_kexec(struct kimage *image) void arch_crash_save_vmcoreinfo(void) { + VMCOREINFO_SYMBOL(phys_base); VMCOREINFO_SYMBOL(init_level4_pgt); #ifdef CONFIG_NUMA -- cgit v1.2.3 From 4ba51fd75cc3789be83f0d6f878dabbb0cb19bca Mon Sep 17 00:00:00 2001 From: Roland McGrath Date: Thu, 3 Apr 2008 14:18:55 -0700 Subject: x86 ptrace: avoid unnecessary wrmsr This avoids using wrmsr on MSR_IA32_DEBUGCTLMSR when it's not needed. No wrmsr ever needs to be done if noone has ever used block stepping. Without this change, using ptrace on 2.6.25 on an x86 KVM guest will tickle KVM's missing support for the MSR and crash the guest kernel. Though host KVM is the buggy one, this makes for a regression in the guest behavior from 2.6.24->2.6.25 that we can easily avoid. I also corrected some bad whitespace. Signed-off-by: Roland McGrath Cc: Ingo Molnar Cc: Thomas Gleixner Signed-off-by: Linus Torvalds --- arch/x86/kernel/step.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/step.c b/arch/x86/kernel/step.c index 9d406cdc847f..071ff4798236 100644 --- a/arch/x86/kernel/step.c +++ b/arch/x86/kernel/step.c @@ -140,6 +140,9 @@ static int enable_single_step(struct task_struct *child) */ static void write_debugctlmsr(struct task_struct *child, unsigned long val) { + if (child->thread.debugctlmsr == val) + return; + child->thread.debugctlmsr = val; if (child != current) @@ -165,11 +168,11 @@ static void enable_step(struct task_struct *child, bool block) write_debugctlmsr(child, child->thread.debugctlmsr | DEBUGCTLMSR_BTF); } else { - write_debugctlmsr(child, - child->thread.debugctlmsr & ~DEBUGCTLMSR_BTF); + write_debugctlmsr(child, + child->thread.debugctlmsr & ~DEBUGCTLMSR_BTF); - if (!child->thread.debugctlmsr) - clear_tsk_thread_flag(child, TIF_DEBUGCTLMSR); + if (!child->thread.debugctlmsr) + clear_tsk_thread_flag(child, TIF_DEBUGCTLMSR); } } -- cgit v1.2.3 From 4f14bdef41e599e218d71e3d0abf339d65e9b480 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 27 Mar 2008 23:37:58 +0100 Subject: x86: fix nmi_watchdog=2 on Pentium-D CPUs implement nmi_watchdog=2 on this class of CPUs: cpu family : 15 model : 6 model name : Intel(R) Pentium(R) D CPU 3.00GHz the watchdog's ->setup() method is safe anyway, so if the CPU cannot support it we'll bail out safely. Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perfctr-watchdog.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perfctr-watchdog.c b/arch/x86/kernel/cpu/perfctr-watchdog.c index 9b838324b818..19a359472ae1 100644 --- a/arch/x86/kernel/cpu/perfctr-watchdog.c +++ b/arch/x86/kernel/cpu/perfctr-watchdog.c @@ -652,9 +652,6 @@ static void probe_nmi_watchdog(void) wd_ops = &p6_wd_ops; break; case 15: - if (boot_cpu_data.x86_model > 0x4) - return; - wd_ops = &p4_wd_ops; break; default: -- cgit v1.2.3 From 9c9b81f77330ddc003a2de2f35fa6a20410c1a62 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 27 Mar 2008 23:39:42 +0100 Subject: x86: print message if nmi_watchdog=2 cannot be enabled right now if there's no CPU support for nmi_watchdog=2 we'll just refuse it silently. print a useful warning. Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perfctr-watchdog.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perfctr-watchdog.c b/arch/x86/kernel/cpu/perfctr-watchdog.c index 19a359472ae1..b943e10ad814 100644 --- a/arch/x86/kernel/cpu/perfctr-watchdog.c +++ b/arch/x86/kernel/cpu/perfctr-watchdog.c @@ -667,8 +667,10 @@ int lapic_watchdog_init(unsigned nmi_hz) { if (!wd_ops) { probe_nmi_watchdog(); - if (!wd_ops) + if (!wd_ops) { + printk(KERN_INFO "NMI watchdog: CPU not supported\n"); return -1; + } if (!wd_ops->reserve()) { printk(KERN_ERR -- cgit v1.2.3 From 8f59610de2fb244b5bc1a3feafd328a8d4d511d6 Mon Sep 17 00:00:00 2001 From: Pavel Machek Date: Tue, 1 Apr 2008 14:24:03 +0200 Subject: x86, agpgart: scary messages are fortunately obsolete Fix obsolete printks in aperture-64. We used not to handle missing agpgart, but we handle it okay now. Signed-off-by: Pavel Machek Signed-off-by: Ingo Molnar --- arch/x86/kernel/pci-gart_64.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c index faf3229f8fb3..700e4647dd30 100644 --- a/arch/x86/kernel/pci-gart_64.c +++ b/arch/x86/kernel/pci-gart_64.c @@ -615,8 +615,8 @@ static __init int init_k8_gatt(struct agp_kern_info *info) nommu: /* Should not happen anymore */ - printk(KERN_ERR "PCI-DMA: More than 4GB of RAM and no IOMMU\n" - KERN_ERR "PCI-DMA: 32bit PCI IO may malfunction.\n"); + printk(KERN_WARNING "PCI-DMA: More than 4GB of RAM and no IOMMU\n" + KERN_WARNING "falling back to iommu=soft.\n"); return -1; } @@ -692,9 +692,9 @@ void __init gart_iommu_init(void) !gart_iommu_aperture || (no_agp && init_k8_gatt(&info) < 0)) { if (end_pfn > MAX_DMA32_PFN) { - printk(KERN_ERR "WARNING more than 4GB of memory " - "but GART IOMMU not available.\n" - KERN_ERR "WARNING 32bit PCI may malfunction.\n"); + printk(KERN_WARNING "More than 4GB of memory " + "but GART IOMMU not available.\n" + KERN_WARNING "falling back to iommu=soft.\n"); } return; } -- cgit v1.2.3 From f64337062c09c2c318fbcbf44ed1d739e8bc72ab Mon Sep 17 00:00:00 2001 From: Mark McLoughlin Date: Wed, 2 Apr 2008 15:36:36 +0100 Subject: xen: refactor xen_{alloc,release}_{pt,pd}() Signed-off-by: Mark McLoughlin Cc: xen-devel@lists.xensource.com Cc: Mark McLoughlin Cc: Jeremy Fitzhardinge Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/xen/enlighten.c | 27 ++++++++++++++++++++------- arch/x86/xen/mmu.c | 7 ------- arch/x86/xen/mmu.h | 7 +++++++ 3 files changed, 27 insertions(+), 14 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index de4e6f05840b..16e2f8096a1a 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -667,10 +667,10 @@ static void xen_release_pt_init(u32 pfn) make_lowmem_page_readwrite(__va(PFN_PHYS(pfn))); } -static void pin_pagetable_pfn(unsigned level, unsigned long pfn) +static void pin_pagetable_pfn(unsigned cmd, unsigned long pfn) { struct mmuext_op op; - op.cmd = level; + op.cmd = cmd; op.arg1.mfn = pfn_to_mfn(pfn); if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF)) BUG(); @@ -687,7 +687,10 @@ static void xen_alloc_ptpage(struct mm_struct *mm, u32 pfn, unsigned level) if (!PageHighMem(page)) { make_lowmem_page_readonly(__va(PFN_PHYS(pfn))); - pin_pagetable_pfn(level, pfn); + if (level == PT_PTE) + pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn); + else if (level == PT_PMD) + pin_pagetable_pfn(MMUEXT_PIN_L2_TABLE, pfn); } else /* make sure there are no stray mappings of this page */ @@ -697,16 +700,16 @@ static void xen_alloc_ptpage(struct mm_struct *mm, u32 pfn, unsigned level) static void xen_alloc_pt(struct mm_struct *mm, u32 pfn) { - xen_alloc_ptpage(mm, pfn, MMUEXT_PIN_L1_TABLE); + xen_alloc_ptpage(mm, pfn, PT_PTE); } static void xen_alloc_pd(struct mm_struct *mm, u32 pfn) { - xen_alloc_ptpage(mm, pfn, MMUEXT_PIN_L2_TABLE); + xen_alloc_ptpage(mm, pfn, PT_PMD); } /* This should never happen until we're OK to use struct page */ -static void xen_release_pt(u32 pfn) +static void xen_release_ptpage(u32 pfn, unsigned level) { struct page *page = pfn_to_page(pfn); @@ -718,6 +721,16 @@ static void xen_release_pt(u32 pfn) } } +static void xen_release_pt(u32 pfn) +{ + xen_release_ptpage(pfn, PT_PTE); +} + +static void xen_release_pd(u32 pfn) +{ + xen_release_ptpage(pfn, PT_PMD); +} + #ifdef CONFIG_HIGHPTE static void *xen_kmap_atomic_pte(struct page *page, enum km_type type) { @@ -838,7 +851,7 @@ static __init void xen_pagetable_setup_done(pgd_t *base) pv_mmu_ops.alloc_pt = xen_alloc_pt; pv_mmu_ops.alloc_pd = xen_alloc_pd; pv_mmu_ops.release_pt = xen_release_pt; - pv_mmu_ops.release_pd = xen_release_pt; + pv_mmu_ops.release_pd = xen_release_pd; pv_mmu_ops.set_pte = xen_set_pte; setup_shared_info(); diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 0144395448ae..2a054ef2a3da 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -310,13 +310,6 @@ pgd_t xen_make_pgd(unsigned long pgd) } #endif /* CONFIG_X86_PAE */ -enum pt_level { - PT_PGD, - PT_PUD, - PT_PMD, - PT_PTE -}; - /* (Yet another) pagetable walker. This one is intended for pinning a pagetable. This means that it walks a pagetable and calls the diff --git a/arch/x86/xen/mmu.h b/arch/x86/xen/mmu.h index c9ff27f3ac3a..b5e189b1519d 100644 --- a/arch/x86/xen/mmu.h +++ b/arch/x86/xen/mmu.h @@ -3,6 +3,13 @@ #include #include +enum pt_level { + PT_PGD, + PT_PUD, + PT_PMD, + PT_PTE +}; + /* * Page-directory addresses above 4GB do not fit into architectural %cr3. * When accessing %cr3, or equivalent field in vcpu_guest_context, guests -- cgit v1.2.3 From a684d69d15a8fafede7c5c0daac8c646bbee805c Mon Sep 17 00:00:00 2001 From: Mark McLoughlin Date: Wed, 2 Apr 2008 15:36:37 +0100 Subject: xen: Do not pin/unpin PMD pages i.e. with this simple test case: int fd = open("/dev/zero", O_RDONLY); munmap(mmap((void *)0x40000000, 0x1000_LEN, PROT_READ, MAP_PRIVATE, fd, 0), 0x1000); close(fd); we currently get: kernel BUG at arch/x86/xen/enlighten.c:678! ... EIP is at xen_release_pt+0x79/0xa9 ... Call Trace: [] ? __pmd_free_tlb+0x1a/0x75 [] ? free_pgd_range+0x1d2/0x2b5 [] ? free_pgtables+0x7e/0x93 [] ? unmap_region+0xb9/0xf5 [] ? do_munmap+0x193/0x1f5 [] ? sys_munmap+0x30/0x3f [] ? syscall_call+0x7/0xb ======================= and xen complains: (XEN) mm.c:2241:d4 Mfn 1cc37 not pinned Further details at: https://bugzilla.redhat.com/436453 Signed-off-by: Mark McLoughlin Cc: xen-devel@lists.xensource.com Cc: Mark McLoughlin Cc: Jeremy Fitzhardinge Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/xen/enlighten.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 16e2f8096a1a..f16b056e5c56 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -689,8 +689,6 @@ static void xen_alloc_ptpage(struct mm_struct *mm, u32 pfn, unsigned level) make_lowmem_page_readonly(__va(PFN_PHYS(pfn))); if (level == PT_PTE) pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn); - else if (level == PT_PMD) - pin_pagetable_pfn(MMUEXT_PIN_L2_TABLE, pfn); } else /* make sure there are no stray mappings of this page */ @@ -715,7 +713,8 @@ static void xen_release_ptpage(u32 pfn, unsigned level) if (PagePinned(page)) { if (!PageHighMem(page)) { - pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn); + if (level == PT_PTE) + pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn); make_lowmem_page_readwrite(__va(PFN_PHYS(pfn))); } } -- cgit v1.2.3 From c946c7de49a9ba50bc205d6359b41bbc8f01174c Mon Sep 17 00:00:00 2001 From: Mark McLoughlin Date: Wed, 2 Apr 2008 15:36:38 +0100 Subject: xen: Clear PG_pinned in release_{pt,pd}() Signed-off-by: Mark McLoughlin Cc: xen-devel@lists.xensource.com Cc: Mark McLoughlin Cc: Jeremy Fitzhardinge Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/xen/enlighten.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index f16b056e5c56..27ee26aedf94 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -717,6 +717,7 @@ static void xen_release_ptpage(u32 pfn, unsigned level) pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn); make_lowmem_page_readwrite(__va(PFN_PHYS(pfn))); } + ClearPagePinned(page); } } -- cgit v1.2.3 From 47001d603375f857a7fab0e9c095d964a1ea0039 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 1 Apr 2008 19:45:18 +0200 Subject: x86: tsc prevent time going backwards We already catch most of the TSC problems by sanity checks, but there is a subtle bug which has been in the code for ever. This can cause time jumps in the range of hours. This was reported in: http://lkml.org/lkml/2007/8/23/96 and http://lkml.org/lkml/2008/3/31/23 I was able to reproduce the problem with a gettimeofday loop test on a dual core and a quad core machine which both have sychronized TSCs. The TSCs seems not to be perfectly in sync though, but the kernel is not able to detect the slight delta in the sync check. Still there exists an extremly small window where this delta can be observed with a real big time jump. So far I was only able to reproduce this with the vsyscall gettimeofday implementation, but in theory this might be observable with the syscall based version as well. CPU 0 updates the clock source variables under xtime/vyscall lock and CPU1, where the TSC is slighty behind CPU0, is reading the time right after the seqlock was unlocked. The clocksource reference data was updated with the TSC from CPU0 and the value which is read from TSC on CPU1 is less than the reference data. This results in a huge delta value due to the unsigned subtraction of the TSC value and the reference value. This algorithm can not be changed due to the support of wrapping clock sources like pm timer. The huge delta is converted to nanoseconds and added to xtime, which is then observable by the caller. The next gettimeofday call on CPU1 will show the correct time again as now the TSC has advanced above the reference value. To prevent this TSC specific wreckage we need to compare the TSC value against the reference value and return the latter when it is larger than the actual TSC value. I pondered to mark the TSC unstable when the readout is smaller than the reference value, but this would render an otherwise good and fast clocksource unusable without a real good reason. Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/kernel/tsc_32.c | 15 ++++++++++++++- arch/x86/kernel/tsc_64.c | 23 ++++++++++++++++++++--- 2 files changed, 34 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/tsc_32.c b/arch/x86/kernel/tsc_32.c index f14cfd9d1f94..d7498b34c8e9 100644 --- a/arch/x86/kernel/tsc_32.c +++ b/arch/x86/kernel/tsc_32.c @@ -287,14 +287,27 @@ core_initcall(cpufreq_tsc); /* clock source code */ static unsigned long current_tsc_khz = 0; +static struct clocksource clocksource_tsc; +/* + * We compare the TSC to the cycle_last value in the clocksource + * structure to avoid a nasty time-warp issue. This can be observed in + * a very small window right after one CPU updated cycle_last under + * xtime lock and the other CPU reads a TSC value which is smaller + * than the cycle_last reference value due to a TSC which is slighty + * behind. This delta is nowhere else observable, but in that case it + * results in a forward time jump in the range of hours due to the + * unsigned delta calculation of the time keeping core code, which is + * necessary to support wrapping clocksources like pm timer. + */ static cycle_t read_tsc(void) { cycle_t ret; rdtscll(ret); - return ret; + return ret >= clocksource_tsc.cycle_last ? + ret : clocksource_tsc.cycle_last; } static struct clocksource clocksource_tsc = { diff --git a/arch/x86/kernel/tsc_64.c b/arch/x86/kernel/tsc_64.c index 947554ddabb6..01fc9f0c39e2 100644 --- a/arch/x86/kernel/tsc_64.c +++ b/arch/x86/kernel/tsc_64.c @@ -11,6 +11,7 @@ #include #include #include +#include static int notsc __initdata = 0; @@ -290,18 +291,34 @@ int __init notsc_setup(char *s) __setup("notsc", notsc_setup); +static struct clocksource clocksource_tsc; -/* clock source code: */ +/* + * We compare the TSC to the cycle_last value in the clocksource + * structure to avoid a nasty time-warp. This can be observed in a + * very small window right after one CPU updated cycle_last under + * xtime/vsyscall_gtod lock and the other CPU reads a TSC value which + * is smaller than the cycle_last reference value due to a TSC which + * is slighty behind. This delta is nowhere else observable, but in + * that case it results in a forward time jump in the range of hours + * due to the unsigned delta calculation of the time keeping core + * code, which is necessary to support wrapping clocksources like pm + * timer. + */ static cycle_t read_tsc(void) { cycle_t ret = (cycle_t)get_cycles(); - return ret; + + return ret >= clocksource_tsc.cycle_last ? + ret : clocksource_tsc.cycle_last; } static cycle_t __vsyscall_fn vread_tsc(void) { cycle_t ret = (cycle_t)vget_cycles(); - return ret; + + return ret >= __vsyscall_gtod_data.clock.cycle_last ? + ret : __vsyscall_gtod_data.clock.cycle_last; } static struct clocksource clocksource_tsc = { -- cgit v1.2.3 From 5761d64b277c287a7520b868c32d656ef03374b4 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 4 Apr 2008 16:26:10 +0200 Subject: x86: revert assign IRQs to hpet timer The commits: commit 37a47db8d7f0f38dac5acf5a13abbc8f401707fa Author: Balaji Rao Date: Wed Jan 30 13:30:03 2008 +0100 x86: assign IRQs to HPET timers, fix and commit e3f37a54f690d3e64995ea7ecea08c5ab3070faf Author: Balaji Rao Date: Wed Jan 30 13:30:03 2008 +0100 x86: assign IRQs to HPET timers have been identified to cause a regression on some platforms due to the assignement of legacy IRQs which makes the legacy devices connected to those IRQs disfunctional. Revert them. This fixes http://bugzilla.kernel.org/show_bug.cgi?id=10382 Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/kernel/hpet.c | 9 ++++++--- drivers/char/hpet.c | 51 +++++++------------------------------------------- include/linux/hpet.h | 2 +- 3 files changed, 14 insertions(+), 48 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index 235fd6c77504..36652ea1a265 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -133,13 +133,16 @@ static void hpet_reserve_platform_timers(unsigned long id) #ifdef CONFIG_HPET_EMULATE_RTC hpet_reserve_timer(&hd, 1); #endif + hd.hd_irq[0] = HPET_LEGACY_8254; hd.hd_irq[1] = HPET_LEGACY_RTC; - for (i = 2; i < nrtimers; timer++, i++) - hd.hd_irq[i] = (timer->hpet_config & Tn_INT_ROUTE_CNF_MASK) >> - Tn_INT_ROUTE_CNF_SHIFT; + for (i = 2; i < nrtimers; timer++, i++) + hd.hd_irq[i] = (timer->hpet_config & Tn_INT_ROUTE_CNF_MASK) >> + Tn_INT_ROUTE_CNF_SHIFT; + hpet_alloc(&hd); + } #else static void hpet_reserve_platform_timers(unsigned long id) { } diff --git a/drivers/char/hpet.c b/drivers/char/hpet.c index 465ad35ed38f..1399971be689 100644 --- a/drivers/char/hpet.c +++ b/drivers/char/hpet.c @@ -731,14 +731,14 @@ static unsigned long hpet_calibrate(struct hpets *hpetp) int hpet_alloc(struct hpet_data *hdp) { - u64 cap, mcfg, hpet_config; + u64 cap, mcfg; struct hpet_dev *devp; - u32 i, ntimer, irq; + u32 i, ntimer; struct hpets *hpetp; size_t siz; struct hpet __iomem *hpet; static struct hpets *last = NULL; - unsigned long period, irq_bitmap; + unsigned long period; unsigned long long temp; /* @@ -765,47 +765,11 @@ int hpet_alloc(struct hpet_data *hdp) hpetp->hp_hpet_phys = hdp->hd_phys_address; hpetp->hp_ntimer = hdp->hd_nirqs; - hpet = hpetp->hp_hpet; - - /* Assign IRQs statically for legacy devices */ - hpetp->hp_dev[0].hd_hdwirq = hdp->hd_irq[0]; - hpetp->hp_dev[1].hd_hdwirq = hdp->hd_irq[1]; - - /* Assign IRQs dynamically for the others */ - for (i = 2, devp = &hpetp->hp_dev[2]; i < hdp->hd_nirqs; i++, devp++) { - struct hpet_timer __iomem *timer; - timer = &hpet->hpet_timers[devp - hpetp->hp_dev]; + for (i = 0; i < hdp->hd_nirqs; i++) + hpetp->hp_dev[i].hd_hdwirq = hdp->hd_irq[i]; - /* Check if there's already an IRQ assigned to the timer */ - if (hdp->hd_irq[i]) { - hpetp->hp_dev[i].hd_hdwirq = hdp->hd_irq[i]; - continue; - } - - hpet_config = readq(&timer->hpet_config); - irq_bitmap = (hpet_config & Tn_INT_ROUTE_CAP_MASK) - >> Tn_INT_ROUTE_CAP_SHIFT; - if (!irq_bitmap) - irq = 0; /* No valid IRQ Assignable */ - else { - irq = find_first_bit(&irq_bitmap, 32); - do { - hpet_config |= irq << Tn_INT_ROUTE_CNF_SHIFT; - writeq(hpet_config, &timer->hpet_config); - - /* - * Verify whether we have written a valid - * IRQ number by reading it back again - */ - hpet_config = readq(&timer->hpet_config); - if (irq == (hpet_config & Tn_INT_ROUTE_CNF_MASK) - >> Tn_INT_ROUTE_CNF_SHIFT) - break; /* Success */ - } while ((irq = (find_next_bit(&irq_bitmap, 32, irq)))); - } - hpetp->hp_dev[i].hd_hdwirq = irq; - } + hpet = hpetp->hp_hpet; cap = readq(&hpet->hpet_cap); @@ -836,8 +800,7 @@ int hpet_alloc(struct hpet_data *hdp) hpetp->hp_which, hdp->hd_phys_address, hpetp->hp_ntimer > 1 ? "s" : ""); for (i = 0; i < hpetp->hp_ntimer; i++) - printk("%s %d", i > 0 ? "," : "", - hpetp->hp_dev[i].hd_hdwirq); + printk("%s %d", i > 0 ? "," : "", hdp->hd_irq[i]); printk("\n"); printk(KERN_INFO "hpet%u: %u %d-bit timers, %Lu Hz\n", diff --git a/include/linux/hpet.h b/include/linux/hpet.h index 9cd94bfd07e5..2dc29ce6c8e4 100644 --- a/include/linux/hpet.h +++ b/include/linux/hpet.h @@ -64,7 +64,7 @@ struct hpet { */ #define Tn_INT_ROUTE_CAP_MASK (0xffffffff00000000ULL) -#define Tn_INT_ROUTE_CAP_SHIFT (32UL) +#define Tn_INI_ROUTE_CAP_SHIFT (32UL) #define Tn_FSB_INT_DELCAP_MASK (0x8000UL) #define Tn_FSB_INT_DELCAP_SHIFT (15) #define Tn_FSB_EN_CNF_MASK (0x4000UL) -- cgit v1.2.3 From 64ba4f230d30b089bc89db2e59d02c1efa5ac769 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Sun, 6 Apr 2008 17:23:38 +1000 Subject: Fix booting pentium+ with dodgy TSC We handle a broken tsc these days, so no need to panic. We clear the TSC bit when tsc_init decides it's unreliable (eg. under lguest w/ bad host TSC), leading to bogus panic. Signed-off-by: Rusty Russell Acked-by: Ingo Molnar Signed-off-by: Linus Torvalds --- arch/x86/kernel/cpu/bugs.c | 8 -------- 1 file changed, 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 027e5c003b16..170d2f5523b2 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -142,14 +142,6 @@ static void __init check_config(void) panic("Kernel requires i486+ for 'invlpg' and other features"); #endif -/* - * If we configured ourselves for a TSC, we'd better have one! - */ -#ifdef CONFIG_X86_TSC - if (!cpu_has_tsc) - panic("Kernel compiled for Pentium+, requires TSC feature!"); -#endif - /* * If we were told we had a good local APIC, check for buggy Pentia, * i.e. all B steppings and the C2 stepping of P54C when using their -- cgit v1.2.3 From 5b13d863573e746739ccfc24ac1a9473cfee8df1 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 7 Apr 2008 20:58:08 +0200 Subject: revert "x86: tsc prevent time going backwards" revert: | commit 47001d603375f857a7fab0e9c095d964a1ea0039 | Author: Thomas Gleixner | Date: Tue Apr 1 19:45:18 2008 +0200 | | x86: tsc prevent time going backwards it has been identified to cause suspend regression - and the commit fixes a longstanding bug that existed before 2.6.25 was opened - so it can wait some more until the effects are better understood. Signed-off-by: Ingo Molnar --- arch/x86/kernel/tsc_32.c | 15 +-------------- arch/x86/kernel/tsc_64.c | 23 +++-------------------- 2 files changed, 4 insertions(+), 34 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/tsc_32.c b/arch/x86/kernel/tsc_32.c index d7498b34c8e9..f14cfd9d1f94 100644 --- a/arch/x86/kernel/tsc_32.c +++ b/arch/x86/kernel/tsc_32.c @@ -287,27 +287,14 @@ core_initcall(cpufreq_tsc); /* clock source code */ static unsigned long current_tsc_khz = 0; -static struct clocksource clocksource_tsc; -/* - * We compare the TSC to the cycle_last value in the clocksource - * structure to avoid a nasty time-warp issue. This can be observed in - * a very small window right after one CPU updated cycle_last under - * xtime lock and the other CPU reads a TSC value which is smaller - * than the cycle_last reference value due to a TSC which is slighty - * behind. This delta is nowhere else observable, but in that case it - * results in a forward time jump in the range of hours due to the - * unsigned delta calculation of the time keeping core code, which is - * necessary to support wrapping clocksources like pm timer. - */ static cycle_t read_tsc(void) { cycle_t ret; rdtscll(ret); - return ret >= clocksource_tsc.cycle_last ? - ret : clocksource_tsc.cycle_last; + return ret; } static struct clocksource clocksource_tsc = { diff --git a/arch/x86/kernel/tsc_64.c b/arch/x86/kernel/tsc_64.c index 01fc9f0c39e2..947554ddabb6 100644 --- a/arch/x86/kernel/tsc_64.c +++ b/arch/x86/kernel/tsc_64.c @@ -11,7 +11,6 @@ #include #include #include -#include static int notsc __initdata = 0; @@ -291,34 +290,18 @@ int __init notsc_setup(char *s) __setup("notsc", notsc_setup); -static struct clocksource clocksource_tsc; -/* - * We compare the TSC to the cycle_last value in the clocksource - * structure to avoid a nasty time-warp. This can be observed in a - * very small window right after one CPU updated cycle_last under - * xtime/vsyscall_gtod lock and the other CPU reads a TSC value which - * is smaller than the cycle_last reference value due to a TSC which - * is slighty behind. This delta is nowhere else observable, but in - * that case it results in a forward time jump in the range of hours - * due to the unsigned delta calculation of the time keeping core - * code, which is necessary to support wrapping clocksources like pm - * timer. - */ +/* clock source code: */ static cycle_t read_tsc(void) { cycle_t ret = (cycle_t)get_cycles(); - - return ret >= clocksource_tsc.cycle_last ? - ret : clocksource_tsc.cycle_last; + return ret; } static cycle_t __vsyscall_fn vread_tsc(void) { cycle_t ret = (cycle_t)vget_cycles(); - - return ret >= __vsyscall_gtod_data.clock.cycle_last ? - ret : __vsyscall_gtod_data.clock.cycle_last; + return ret; } static struct clocksource clocksource_tsc = { -- cgit v1.2.3 From 4f41c94d5c24e3b3453e9df03c0a80ca1acf00d2 Mon Sep 17 00:00:00 2001 From: Karsten Wiese Date: Mon, 7 Apr 2008 12:14:45 +0200 Subject: x86: fix call to set_cyc2ns_scale() from time_cpufreq_notifier() In time_cpufreq_notifier() the cpu id to act upon is held in freq->cpu. Use it instead of smp_processor_id() in the call to set_cyc2ns_scale(). This makes the preempt_*able() unnecessary and lets set_cyc2ns_scale() update the intended cpu's cyc2ns. Related mail/thread: http://lkml.org/lkml/2007/12/7/130 Signed-off-by: Karsten Wiese Signed-off-by: Ingo Molnar --- arch/x86/kernel/tsc_32.c | 4 +--- arch/x86/kernel/tsc_64.c | 4 +--- 2 files changed, 2 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/tsc_32.c b/arch/x86/kernel/tsc_32.c index f14cfd9d1f94..c2241e04ea5f 100644 --- a/arch/x86/kernel/tsc_32.c +++ b/arch/x86/kernel/tsc_32.c @@ -256,9 +256,7 @@ time_cpufreq_notifier(struct notifier_block *nb, unsigned long val, void *data) ref_freq, freq->new); if (!(freq->flags & CPUFREQ_CONST_LOOPS)) { tsc_khz = cpu_khz; - preempt_disable(); - set_cyc2ns_scale(cpu_khz, smp_processor_id()); - preempt_enable(); + set_cyc2ns_scale(cpu_khz, freq->cpu); /* * TSC based sched_clock turns * to junk w/ cpufreq diff --git a/arch/x86/kernel/tsc_64.c b/arch/x86/kernel/tsc_64.c index 947554ddabb6..d3bebaaad842 100644 --- a/arch/x86/kernel/tsc_64.c +++ b/arch/x86/kernel/tsc_64.c @@ -148,9 +148,7 @@ static int time_cpufreq_notifier(struct notifier_block *nb, unsigned long val, mark_tsc_unstable("cpufreq changes"); } - preempt_disable(); - set_cyc2ns_scale(tsc_khz_ref, smp_processor_id()); - preempt_enable(); + set_cyc2ns_scale(tsc_khz_ref, freq->cpu); return 0; } -- cgit v1.2.3 From f4be31ec9690cfe6e94fcbed6ae60a6a38b3c3ed Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 9 Apr 2008 19:04:07 -0400 Subject: pop previous section in alternative.c gcc expects all toplevel assembly to return to the original section type. The code in alteranative.c does not do this. This caused some strange bugs in sched-devel where code would end up in the .rodata section and when the kernel sets the NX bit on all .rodata, the kernel would crash when executing this code. This patch adds a .previous marker to return the code back to the original section. Credit goes to Andrew Pinski for telling me it wasn't a gcc bug but a bug in the toplevel asm code in the kernel. ;-) Signed-off-by: Steven Rostedt Signed-off-by: Linus Torvalds --- arch/x86/kernel/alternative.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 45d79ea890ae..5fed98ca0e1f 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -65,7 +65,8 @@ __setup("noreplace-paravirt", setup_noreplace_paravirt); get them easily into strings. */ asm("\t.section .rodata, \"a\"\nintelnops: " GENERIC_NOP1 GENERIC_NOP2 GENERIC_NOP3 GENERIC_NOP4 GENERIC_NOP5 GENERIC_NOP6 - GENERIC_NOP7 GENERIC_NOP8); + GENERIC_NOP7 GENERIC_NOP8 + "\t.previous"); extern const unsigned char intelnops[]; static const unsigned char *const intel_nops[ASM_NOP_MAX+1] = { NULL, @@ -83,7 +84,8 @@ static const unsigned char *const intel_nops[ASM_NOP_MAX+1] = { #ifdef K8_NOP1 asm("\t.section .rodata, \"a\"\nk8nops: " K8_NOP1 K8_NOP2 K8_NOP3 K8_NOP4 K8_NOP5 K8_NOP6 - K8_NOP7 K8_NOP8); + K8_NOP7 K8_NOP8 + "\t.previous"); extern const unsigned char k8nops[]; static const unsigned char *const k8_nops[ASM_NOP_MAX+1] = { NULL, @@ -101,7 +103,8 @@ static const unsigned char *const k8_nops[ASM_NOP_MAX+1] = { #ifdef K7_NOP1 asm("\t.section .rodata, \"a\"\nk7nops: " K7_NOP1 K7_NOP2 K7_NOP3 K7_NOP4 K7_NOP5 K7_NOP6 - K7_NOP7 K7_NOP8); + K7_NOP7 K7_NOP8 + "\t.previous"); extern const unsigned char k7nops[]; static const unsigned char *const k7_nops[ASM_NOP_MAX+1] = { NULL, @@ -119,7 +122,8 @@ static const unsigned char *const k7_nops[ASM_NOP_MAX+1] = { #ifdef P6_NOP1 asm("\t.section .rodata, \"a\"\np6nops: " P6_NOP1 P6_NOP2 P6_NOP3 P6_NOP4 P6_NOP5 P6_NOP6 - P6_NOP7 P6_NOP8); + P6_NOP7 P6_NOP8 + "\t.previous"); extern const unsigned char p6nops[]; static const unsigned char *const p6_nops[ASM_NOP_MAX+1] = { NULL, -- cgit v1.2.3 From 783e391b7b5b273cd20856d8f6f4878da8ec31b3 Mon Sep 17 00:00:00 2001 From: Venki Pallipadi Date: Thu, 10 Apr 2008 09:49:58 -0700 Subject: x86: Simplify cpu_idle_wait This patch also resolves hangs on boot: http://lkml.org/lkml/2008/2/23/263 http://bugzilla.kernel.org/show_bug.cgi?id=10093 The bug was causing once-in-few-reboots 10-15 sec wait during boot on certain laptops. Earlier commit 40d6a146629b98d8e322b6f9332b182c7cbff3df added smp_call_function in cpu_idle_wait() to kick cpus that are in tickless idle. Looking at cpu_idle_wait code at that time, code seemed to be over-engineered for a case which is rarely used (while changing idle handler). Below is a simplified version of cpu_idle_wait, which just makes a dummy smp_call_function to all cpus, to make them come out of old idle handler and start using the new idle handler. It eliminates code in the idle loop to handle cpu_idle_wait. Signed-off-by: Venkatesh Pallipadi Signed-off-by: Linus Torvalds --- arch/x86/kernel/process_32.c | 47 +++++++++++--------------------------------- arch/x86/kernel/process_64.c | 47 +++++++++++--------------------------------- 2 files changed, 22 insertions(+), 72 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index be3c7a299f02..43930e73f657 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -82,7 +82,6 @@ unsigned long thread_saved_pc(struct task_struct *tsk) */ void (*pm_idle)(void); EXPORT_SYMBOL(pm_idle); -static DEFINE_PER_CPU(unsigned int, cpu_idle_state); void disable_hlt(void) { @@ -190,9 +189,6 @@ void cpu_idle(void) while (!need_resched()) { void (*idle)(void); - if (__get_cpu_var(cpu_idle_state)) - __get_cpu_var(cpu_idle_state) = 0; - check_pgt_cache(); rmb(); idle = pm_idle; @@ -220,40 +216,19 @@ static void do_nothing(void *unused) { } +/* + * cpu_idle_wait - Used to ensure that all the CPUs discard old value of + * pm_idle and update to new pm_idle value. Required while changing pm_idle + * handler on SMP systems. + * + * Caller must have changed pm_idle to the new value before the call. Old + * pm_idle value will not be used by any CPU after the return of this function. + */ void cpu_idle_wait(void) { - unsigned int cpu, this_cpu = get_cpu(); - cpumask_t map, tmp = current->cpus_allowed; - - set_cpus_allowed(current, cpumask_of_cpu(this_cpu)); - put_cpu(); - - cpus_clear(map); - for_each_online_cpu(cpu) { - per_cpu(cpu_idle_state, cpu) = 1; - cpu_set(cpu, map); - } - - __get_cpu_var(cpu_idle_state) = 0; - - wmb(); - do { - ssleep(1); - for_each_online_cpu(cpu) { - if (cpu_isset(cpu, map) && !per_cpu(cpu_idle_state, cpu)) - cpu_clear(cpu, map); - } - cpus_and(map, map, cpu_online_map); - /* - * We waited 1 sec, if a CPU still did not call idle - * it may be because it is in idle and not waking up - * because it has nothing to do. - * Give all the remaining CPUS a kick. - */ - smp_call_function_mask(map, do_nothing, NULL, 0); - } while (!cpus_empty(map)); - - set_cpus_allowed(current, tmp); + smp_mb(); + /* kick all the CPUs so that they exit out of pm_idle */ + smp_call_function(do_nothing, NULL, 0, 1); } EXPORT_SYMBOL_GPL(cpu_idle_wait); diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 3baf9b9f4c87..46c4c546b499 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -63,7 +63,6 @@ EXPORT_SYMBOL(boot_option_idle_override); */ void (*pm_idle)(void); EXPORT_SYMBOL(pm_idle); -static DEFINE_PER_CPU(unsigned int, cpu_idle_state); static ATOMIC_NOTIFIER_HEAD(idle_notifier); @@ -173,9 +172,6 @@ void cpu_idle(void) while (!need_resched()) { void (*idle)(void); - if (__get_cpu_var(cpu_idle_state)) - __get_cpu_var(cpu_idle_state) = 0; - rmb(); idle = pm_idle; if (!idle) @@ -207,40 +203,19 @@ static void do_nothing(void *unused) { } +/* + * cpu_idle_wait - Used to ensure that all the CPUs discard old value of + * pm_idle and update to new pm_idle value. Required while changing pm_idle + * handler on SMP systems. + * + * Caller must have changed pm_idle to the new value before the call. Old + * pm_idle value will not be used by any CPU after the return of this function. + */ void cpu_idle_wait(void) { - unsigned int cpu, this_cpu = get_cpu(); - cpumask_t map, tmp = current->cpus_allowed; - - set_cpus_allowed(current, cpumask_of_cpu(this_cpu)); - put_cpu(); - - cpus_clear(map); - for_each_online_cpu(cpu) { - per_cpu(cpu_idle_state, cpu) = 1; - cpu_set(cpu, map); - } - - __get_cpu_var(cpu_idle_state) = 0; - - wmb(); - do { - ssleep(1); - for_each_online_cpu(cpu) { - if (cpu_isset(cpu, map) && !per_cpu(cpu_idle_state, cpu)) - cpu_clear(cpu, map); - } - cpus_and(map, map, cpu_online_map); - /* - * We waited 1 sec, if a CPU still did not call idle - * it may be because it is in idle and not waking up - * because it has nothing to do. - * Give all the remaining CPUS a kick. - */ - smp_call_function_mask(map, do_nothing, 0, 0); - } while (!cpus_empty(map)); - - set_cpus_allowed(current, tmp); + smp_mb(); + /* kick all the CPUs so that they exit out of pm_idle */ + smp_call_function(do_nothing, NULL, 0, 1); } EXPORT_SYMBOL_GPL(cpu_idle_wait); -- cgit v1.2.3 From 54a015104136974262afa4b8ddd943ea70dec8a2 Mon Sep 17 00:00:00 2001 From: Roland McGrath Date: Thu, 10 Apr 2008 15:37:38 -0700 Subject: asmlinkage_protect replaces prevent_tail_call The prevent_tail_call() macro works around the problem of the compiler clobbering argument words on the stack, which for asmlinkage functions is the caller's (user's) struct pt_regs. The tail/sibling-call optimization is not the only way that the compiler can decide to use stack argument words as scratch space, which we have to prevent. Other optimizations can do it too. Until we have new compiler support to make "asmlinkage" binding on the compiler's own use of the stack argument frame, we have work around all the manifestations of this issue that crop up. More cases seem to be prevented by also keeping the incoming argument variables live at the end of the function. This makes their original stack slots attractive places to leave those variables, so the compiler tends not clobber them for something else. It's still no guarantee, but it handles some observed cases that prevent_tail_call() did not. Signed-off-by: Roland McGrath Signed-off-by: Linus Torvalds --- arch/x86/kernel/tls.c | 4 ++-- fs/open.c | 8 ++++---- include/asm-x86/linkage.h | 24 +++++++++++++++++++++++- include/linux/linkage.h | 4 ++-- kernel/exit.c | 4 ++-- kernel/uid16.c | 22 +++++++++++----------- 6 files changed, 44 insertions(+), 22 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/tls.c b/arch/x86/kernel/tls.c index 022bcaa3b42e..ab6bf375a307 100644 --- a/arch/x86/kernel/tls.c +++ b/arch/x86/kernel/tls.c @@ -92,7 +92,7 @@ int do_set_thread_area(struct task_struct *p, int idx, asmlinkage int sys_set_thread_area(struct user_desc __user *u_info) { int ret = do_set_thread_area(current, -1, u_info, 1); - prevent_tail_call(ret); + asmlinkage_protect(1, ret, u_info); return ret; } @@ -142,7 +142,7 @@ int do_get_thread_area(struct task_struct *p, int idx, asmlinkage int sys_get_thread_area(struct user_desc __user *u_info) { int ret = do_get_thread_area(current, -1, u_info); - prevent_tail_call(ret); + asmlinkage_protect(1, ret, u_info); return ret; } diff --git a/fs/open.c b/fs/open.c index a4b12022edaa..3fa4e4ffce4c 100644 --- a/fs/open.c +++ b/fs/open.c @@ -335,7 +335,7 @@ asmlinkage long sys_ftruncate(unsigned int fd, unsigned long length) { long ret = do_sys_ftruncate(fd, length, 1); /* avoid REGPARM breakage on x86: */ - prevent_tail_call(ret); + asmlinkage_protect(2, ret, fd, length); return ret; } @@ -350,7 +350,7 @@ asmlinkage long sys_ftruncate64(unsigned int fd, loff_t length) { long ret = do_sys_ftruncate(fd, length, 0); /* avoid REGPARM breakage on x86: */ - prevent_tail_call(ret); + asmlinkage_protect(2, ret, fd, length); return ret; } #endif @@ -1067,7 +1067,7 @@ asmlinkage long sys_open(const char __user *filename, int flags, int mode) ret = do_sys_open(AT_FDCWD, filename, flags, mode); /* avoid REGPARM breakage on x86: */ - prevent_tail_call(ret); + asmlinkage_protect(3, ret, filename, flags, mode); return ret; } @@ -1081,7 +1081,7 @@ asmlinkage long sys_openat(int dfd, const char __user *filename, int flags, ret = do_sys_open(dfd, filename, flags, mode); /* avoid REGPARM breakage on x86: */ - prevent_tail_call(ret); + asmlinkage_protect(4, ret, dfd, filename, flags, mode); return ret; } diff --git a/include/asm-x86/linkage.h b/include/asm-x86/linkage.h index 31739c7d66a9..d605eeba0f70 100644 --- a/include/asm-x86/linkage.h +++ b/include/asm-x86/linkage.h @@ -8,12 +8,34 @@ #ifdef CONFIG_X86_32 #define asmlinkage CPP_ASMLINKAGE __attribute__((regparm(0))) -#define prevent_tail_call(ret) __asm__ ("" : "=r" (ret) : "0" (ret)) /* * For 32-bit UML - mark functions implemented in assembly that use * regparm input parameters: */ #define asmregparm __attribute__((regparm(3))) + +#define asmlinkage_protect(n, ret, args...) \ + __asmlinkage_protect##n(ret, ##args) +#define __asmlinkage_protect_n(ret, args...) \ + __asm__ __volatile__ ("" : "=r" (ret) : "0" (ret), ##args) +#define __asmlinkage_protect0(ret) \ + __asmlinkage_protect_n(ret) +#define __asmlinkage_protect1(ret, arg1) \ + __asmlinkage_protect_n(ret, "g" (arg1)) +#define __asmlinkage_protect2(ret, arg1, arg2) \ + __asmlinkage_protect_n(ret, "g" (arg1), "g" (arg2)) +#define __asmlinkage_protect3(ret, arg1, arg2, arg3) \ + __asmlinkage_protect_n(ret, "g" (arg1), "g" (arg2), "g" (arg3)) +#define __asmlinkage_protect4(ret, arg1, arg2, arg3, arg4) \ + __asmlinkage_protect_n(ret, "g" (arg1), "g" (arg2), "g" (arg3), \ + "g" (arg4)) +#define __asmlinkage_protect5(ret, arg1, arg2, arg3, arg4, arg5) \ + __asmlinkage_protect_n(ret, "g" (arg1), "g" (arg2), "g" (arg3), \ + "g" (arg4), "g" (arg5)) +#define __asmlinkage_protect6(ret, arg1, arg2, arg3, arg4, arg5, arg6) \ + __asmlinkage_protect_n(ret, "g" (arg1), "g" (arg2), "g" (arg3), \ + "g" (arg4), "g" (arg5), "g" (arg6)) + #endif #ifdef CONFIG_X86_ALIGNMENT_16 diff --git a/include/linux/linkage.h b/include/linux/linkage.h index 0592936344c4..fe2a39c489b6 100644 --- a/include/linux/linkage.h +++ b/include/linux/linkage.h @@ -17,8 +17,8 @@ # define asmregparm #endif -#ifndef prevent_tail_call -# define prevent_tail_call(ret) do { } while (0) +#ifndef asmlinkage_protect +# define asmlinkage_protect(n, ret, args...) do { } while (0) #endif #ifndef __ALIGN diff --git a/kernel/exit.c b/kernel/exit.c index 53872bf993fa..073005b1cfb2 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -1608,7 +1608,7 @@ asmlinkage long sys_waitid(int which, pid_t upid, put_pid(pid); /* avoid REGPARM breakage on x86: */ - prevent_tail_call(ret); + asmlinkage_protect(5, ret, which, upid, infop, options, ru); return ret; } @@ -1640,7 +1640,7 @@ asmlinkage long sys_wait4(pid_t upid, int __user *stat_addr, put_pid(pid); /* avoid REGPARM breakage on x86: */ - prevent_tail_call(ret); + asmlinkage_protect(4, ret, upid, stat_addr, options, ru); return ret; } diff --git a/kernel/uid16.c b/kernel/uid16.c index dd308ba4e03b..3e41c1673e2f 100644 --- a/kernel/uid16.c +++ b/kernel/uid16.c @@ -21,7 +21,7 @@ asmlinkage long sys_chown16(const char __user * filename, old_uid_t user, old_gi { long ret = sys_chown(filename, low2highuid(user), low2highgid(group)); /* avoid REGPARM breakage on x86: */ - prevent_tail_call(ret); + asmlinkage_protect(3, ret, filename, user, group); return ret; } @@ -29,7 +29,7 @@ asmlinkage long sys_lchown16(const char __user * filename, old_uid_t user, old_g { long ret = sys_lchown(filename, low2highuid(user), low2highgid(group)); /* avoid REGPARM breakage on x86: */ - prevent_tail_call(ret); + asmlinkage_protect(3, ret, filename, user, group); return ret; } @@ -37,7 +37,7 @@ asmlinkage long sys_fchown16(unsigned int fd, old_uid_t user, old_gid_t group) { long ret = sys_fchown(fd, low2highuid(user), low2highgid(group)); /* avoid REGPARM breakage on x86: */ - prevent_tail_call(ret); + asmlinkage_protect(3, ret, fd, user, group); return ret; } @@ -45,7 +45,7 @@ asmlinkage long sys_setregid16(old_gid_t rgid, old_gid_t egid) { long ret = sys_setregid(low2highgid(rgid), low2highgid(egid)); /* avoid REGPARM breakage on x86: */ - prevent_tail_call(ret); + asmlinkage_protect(2, ret, rgid, egid); return ret; } @@ -53,7 +53,7 @@ asmlinkage long sys_setgid16(old_gid_t gid) { long ret = sys_setgid(low2highgid(gid)); /* avoid REGPARM breakage on x86: */ - prevent_tail_call(ret); + asmlinkage_protect(1, ret, gid); return ret; } @@ -61,7 +61,7 @@ asmlinkage long sys_setreuid16(old_uid_t ruid, old_uid_t euid) { long ret = sys_setreuid(low2highuid(ruid), low2highuid(euid)); /* avoid REGPARM breakage on x86: */ - prevent_tail_call(ret); + asmlinkage_protect(2, ret, ruid, euid); return ret; } @@ -69,7 +69,7 @@ asmlinkage long sys_setuid16(old_uid_t uid) { long ret = sys_setuid(low2highuid(uid)); /* avoid REGPARM breakage on x86: */ - prevent_tail_call(ret); + asmlinkage_protect(1, ret, uid); return ret; } @@ -78,7 +78,7 @@ asmlinkage long sys_setresuid16(old_uid_t ruid, old_uid_t euid, old_uid_t suid) long ret = sys_setresuid(low2highuid(ruid), low2highuid(euid), low2highuid(suid)); /* avoid REGPARM breakage on x86: */ - prevent_tail_call(ret); + asmlinkage_protect(3, ret, ruid, euid, suid); return ret; } @@ -98,7 +98,7 @@ asmlinkage long sys_setresgid16(old_gid_t rgid, old_gid_t egid, old_gid_t sgid) long ret = sys_setresgid(low2highgid(rgid), low2highgid(egid), low2highgid(sgid)); /* avoid REGPARM breakage on x86: */ - prevent_tail_call(ret); + asmlinkage_protect(3, ret, rgid, egid, sgid); return ret; } @@ -117,7 +117,7 @@ asmlinkage long sys_setfsuid16(old_uid_t uid) { long ret = sys_setfsuid(low2highuid(uid)); /* avoid REGPARM breakage on x86: */ - prevent_tail_call(ret); + asmlinkage_protect(1, ret, uid); return ret; } @@ -125,7 +125,7 @@ asmlinkage long sys_setfsgid16(old_gid_t gid) { long ret = sys_setfsgid(low2highgid(gid)); /* avoid REGPARM breakage on x86: */ - prevent_tail_call(ret); + asmlinkage_protect(1, ret, gid); return ret; } -- cgit v1.2.3