From bb6f1d9a99f1947d91693de62ed54ac3bf1e2dfe Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Thu, 16 Dec 2010 17:03:13 -0600 Subject: lguest: fix crash lguest_time_init fe25c7fc2e "x86: lguest: Convert to new irq chip functions" converted enable_lguest_irq() to take a struct irq_data *, but didn't fix the one internal caller. Signed-off-by: Rusty Russell To: x86@kernel.org --- arch/x86/lguest/boot.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/lguest') diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c index 73b1e1a1f489..45e64b37b237 100644 --- a/arch/x86/lguest/boot.c +++ b/arch/x86/lguest/boot.c @@ -1002,7 +1002,7 @@ static void lguest_time_init(void) clockevents_register_device(&lguest_clockevent); /* Finally, we unblock the timer interrupt. */ - enable_lguest_irq(0); + clear_bit(0, lguest_data.blocked_interrupts); } /* -- cgit v1.2.3 From bb4093deb259ea9c92415796a6a139e35272f8a8 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Thu, 16 Dec 2010 17:03:15 -0600 Subject: lguest: restore boot speed lguest is dumb and drops *all* the pagetables for set_pte (which is only used for kernel mapping manipulation, so it's OK without highmem). But it's used a lot in boot, too. As a guest optimization, we suppressed this flushing until the first page switch. Now we have initial_page_table, that happens much earlier, so extend the heuristic to wait until we switch to something other than the swapper_pg_dir or initial_page_table. As measured on my laptop under kvm, this dropped the time-to-mount-root from 48 seconds to 4.3 seconds. Signed-off-by: Rusty Russell --- arch/x86/lguest/boot.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'arch/x86/lguest') diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c index 45e64b37b237..24e49737df7a 100644 --- a/arch/x86/lguest/boot.c +++ b/arch/x86/lguest/boot.c @@ -531,7 +531,10 @@ static void lguest_write_cr3(unsigned long cr3) { lguest_data.pgdir = cr3; lazy_hcall1(LHCALL_NEW_PGTABLE, cr3); - cr3_changed = true; + + /* These two page tables are simple, linear, and used during boot */ + if (cr3 != __pa(swapper_pg_dir) && cr3 != __pa(initial_page_table)) + cr3_changed = true; } static unsigned long lguest_read_cr3(void) @@ -703,9 +706,9 @@ static void lguest_set_pmd(pmd_t *pmdp, pmd_t pmdval) * to forget all of them. Fortunately, this is very rare. * * ... except in early boot when the kernel sets up the initial pagetables, - * which makes booting astonishingly slow: 1.83 seconds! So we don't even tell - * the Host anything changed until we've done the first page table switch, - * which brings boot back to 0.25 seconds. + * which makes booting astonishingly slow: 48 seconds! So we don't even tell + * the Host anything changed until we've done the first real page table switch, + * which brings boot back to 4.3 seconds. */ static void lguest_set_pte(pte_t *ptep, pte_t pteval) { -- cgit v1.2.3 From da32dac101263fb5b155407507c548e3ac2a6a2a Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Thu, 16 Dec 2010 17:03:15 -0600 Subject: lguest: populate initial_page_table Two x86 patches broke lguest: 1) v2.6.35-492-g72d7c3b, which changed x86 to use the memblock allocator. In lguest, the host places linear page tables at the top of mem, which used to be enough to get us up to the swapper_pg_dir page tables. With the first patch, the direct mapping tables used that memory: Before: kernel direct mapping tables up to 4000000 @ 7000-1a000 After: kernel direct mapping tables up to 4000000 @ 3fed000-4000000 I initially fixed this by lying about the amount of memory we had, so the kernel wouldn't blatt the lguest boot pagetables (yuk!), but then... 2) v2.6.36-rc8-54-gb40827f, which made x86 boot use initial_page_table. This was initialized in a part of head_32.S which isn't executed by lguest; it is then copied into swapper_pg_dir. So we have to initialize it; and anyway we switch to it before we blatt the old tables, so that fixes the previous damage as well. For the moment, I cut & pasted the code into lguest's boot code, but next merge window I will merge them. Signed-off-by: Rusty Russell Cc: Jeremy Fitzhardinge Cc: Konrad Rzeszutek Wilk To: x86@kernel.org --- arch/x86/kernel/head_32.S | 4 +- arch/x86/lguest/boot.c | 3 -- arch/x86/lguest/i386_head.S | 105 ++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 107 insertions(+), 5 deletions(-) (limited to 'arch/x86/lguest') diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index bcece91dd311..f0bea76f6ea5 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -620,13 +620,13 @@ ENTRY(initial_code) __PAGE_ALIGNED_BSS .align PAGE_SIZE_asm #ifdef CONFIG_X86_PAE -initial_pg_pmd: +ENTRY(initial_pg_pmd) .fill 1024*KPMDS,4,0 #else ENTRY(initial_page_table) .fill 1024,4,0 #endif -initial_pg_fixmap: +ENTRY(initial_pg_fixmap) .fill 1024,4,0 ENTRY(empty_zero_page) .fill 4096,1,0 diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c index 24e49737df7a..4996cf5f73a0 100644 --- a/arch/x86/lguest/boot.c +++ b/arch/x86/lguest/boot.c @@ -1352,9 +1352,6 @@ __init void lguest_init(void) */ switch_to_new_gdt(0); - /* We actually boot with all memory mapped, but let's say 128MB. */ - max_pfn_mapped = (128*1024*1024) >> PAGE_SHIFT; - /* * The Host<->Guest Switcher lives at the top of our address space, and * the Host told us how big it is when we made LGUEST_INIT hypercall: diff --git a/arch/x86/lguest/i386_head.S b/arch/x86/lguest/i386_head.S index 4f420c2f2d55..e7d5382ef263 100644 --- a/arch/x86/lguest/i386_head.S +++ b/arch/x86/lguest/i386_head.S @@ -4,6 +4,7 @@ #include #include #include +#include /*G:020 * Our story starts with the kernel booting into startup_32 in @@ -37,9 +38,113 @@ ENTRY(lguest_entry) /* Set up the initial stack so we can run C code. */ movl $(init_thread_union+THREAD_SIZE),%esp + call init_pagetables + /* Jumps are relative: we're running __PAGE_OFFSET too low. */ jmp lguest_init+__PAGE_OFFSET +/* + * Initialize page tables. This creates a PDE and a set of page + * tables, which are located immediately beyond __brk_base. The variable + * _brk_end is set up to point to the first "safe" location. + * Mappings are created both at virtual address 0 (identity mapping) + * and PAGE_OFFSET for up to _end. + * + * FIXME: This code is taken verbatim from arch/x86/kernel/head_32.S: they + * don't have a stack at this point, so we can't just use call and ret. + */ +init_pagetables: +#if PTRS_PER_PMD > 1 +#define PAGE_TABLE_SIZE(pages) (((pages) / PTRS_PER_PMD) + PTRS_PER_PGD) +#else +#define PAGE_TABLE_SIZE(pages) ((pages) / PTRS_PER_PGD) +#endif +#define pa(X) ((X) - __PAGE_OFFSET) + +/* Enough space to fit pagetables for the low memory linear map */ +MAPPING_BEYOND_END = \ + PAGE_TABLE_SIZE(((1<<32) - __PAGE_OFFSET) >> PAGE_SHIFT) << PAGE_SHIFT +#ifdef CONFIG_X86_PAE + + /* + * In PAE mode initial_page_table is statically defined to contain + * enough entries to cover the VMSPLIT option (that is the top 1, 2 or 3 + * entries). The identity mapping is handled by pointing two PGD entries + * to the first kernel PMD. + * + * Note the upper half of each PMD or PTE are always zero at this stage. + */ + +#define KPMDS (((-__PAGE_OFFSET) >> 30) & 3) /* Number of kernel PMDs */ + + xorl %ebx,%ebx /* %ebx is kept at zero */ + + movl $pa(__brk_base), %edi + movl $pa(initial_pg_pmd), %edx + movl $PTE_IDENT_ATTR, %eax +10: + leal PDE_IDENT_ATTR(%edi),%ecx /* Create PMD entry */ + movl %ecx,(%edx) /* Store PMD entry */ + /* Upper half already zero */ + addl $8,%edx + movl $512,%ecx +11: + stosl + xchgl %eax,%ebx + stosl + xchgl %eax,%ebx + addl $0x1000,%eax + loop 11b + + /* + * End condition: we must map up to the end + MAPPING_BEYOND_END. + */ + movl $pa(_end) + MAPPING_BEYOND_END + PTE_IDENT_ATTR, %ebp + cmpl %ebp,%eax + jb 10b +1: + addl $__PAGE_OFFSET, %edi + movl %edi, pa(_brk_end) + shrl $12, %eax + movl %eax, pa(max_pfn_mapped) + + /* Do early initialization of the fixmap area */ + movl $pa(initial_pg_fixmap)+PDE_IDENT_ATTR,%eax + movl %eax,pa(initial_pg_pmd+0x1000*KPMDS-8) +#else /* Not PAE */ + +page_pde_offset = (__PAGE_OFFSET >> 20); + + movl $pa(__brk_base), %edi + movl $pa(initial_page_table), %edx + movl $PTE_IDENT_ATTR, %eax +10: + leal PDE_IDENT_ATTR(%edi),%ecx /* Create PDE entry */ + movl %ecx,(%edx) /* Store identity PDE entry */ + movl %ecx,page_pde_offset(%edx) /* Store kernel PDE entry */ + addl $4,%edx + movl $1024, %ecx +11: + stosl + addl $0x1000,%eax + loop 11b + /* + * End condition: we must map up to the end + MAPPING_BEYOND_END. + */ + movl $pa(_end) + MAPPING_BEYOND_END + PTE_IDENT_ATTR, %ebp + cmpl %ebp,%eax + jb 10b + addl $__PAGE_OFFSET, %edi + movl %edi, pa(_brk_end) + shrl $12, %eax + movl %eax, pa(max_pfn_mapped) + + /* Do early initialization of the fixmap area */ + movl $pa(initial_pg_fixmap)+PDE_IDENT_ATTR,%eax + movl %eax,pa(initial_page_table+0xffc) +#endif + ret + /*G:055 * We create a macro which puts the assembler code between lgstart_ and lgend_ * markers. These templates are put in the .text section: they can't be -- cgit v1.2.3 From d50d8fe192428090790e7178e9507e981e0b005b Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Tue, 4 Jan 2011 17:20:54 +1030 Subject: x86, mm: Initialize initial_page_table before paravirt jumps v2.6.36-rc8-54-gb40827f (x86-32, mm: Add an initial page table for core bootstrapping) made x86 boot using initial_page_table and broke lguest. For 2.6.37 we simply cut & paste the initialization code into lguest (da32dac10126 "lguest: populate initial_page_table"), now we fix it properly by doing that initialization before the paravirt jump. Signed-off-by: Rusty Russell Acked-by: Jeremy Fitzhardinge Cc: lguest Cc: Linus Torvalds Cc: Andrew Morton Cc: Peter Zijlstra LKML-Reference: <201101041720.54535.rusty@rustcorp.com.au> Signed-off-by: Ingo Molnar --- arch/x86/kernel/head_32.S | 77 ++++++++++++++++---------------- arch/x86/lguest/i386_head.S | 105 -------------------------------------------- 2 files changed, 39 insertions(+), 143 deletions(-) (limited to 'arch/x86/lguest') diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index c0dbd9ac24f0..e3a8bb91e168 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -139,39 +139,6 @@ ENTRY(startup_32) movl %eax, pa(olpc_ofw_pgd) #endif -#ifdef CONFIG_PARAVIRT - /* This is can only trip for a broken bootloader... */ - cmpw $0x207, pa(boot_params + BP_version) - jb default_entry - - /* Paravirt-compatible boot parameters. Look to see what architecture - we're booting under. */ - movl pa(boot_params + BP_hardware_subarch), %eax - cmpl $num_subarch_entries, %eax - jae bad_subarch - - movl pa(subarch_entries)(,%eax,4), %eax - subl $__PAGE_OFFSET, %eax - jmp *%eax - -bad_subarch: -WEAK(lguest_entry) -WEAK(xen_entry) - /* Unknown implementation; there's really - nothing we can do at this point. */ - ud2a - - __INITDATA - -subarch_entries: - .long default_entry /* normal x86/PC */ - .long lguest_entry /* lguest hypervisor */ - .long xen_entry /* Xen hypervisor */ - .long default_entry /* Moorestown MID */ -num_subarch_entries = (. - subarch_entries) / 4 -.previous -#endif /* CONFIG_PARAVIRT */ - /* * Initialize page tables. This creates a PDE and a set of page * tables, which are located immediately beyond __brk_base. The variable @@ -181,7 +148,6 @@ num_subarch_entries = (. - subarch_entries) / 4 * * Note that the stack is not yet set up! */ -default_entry: #ifdef CONFIG_X86_PAE /* @@ -261,7 +227,42 @@ page_pde_offset = (__PAGE_OFFSET >> 20); movl $pa(initial_pg_fixmap)+PDE_IDENT_ATTR,%eax movl %eax,pa(initial_page_table+0xffc) #endif - jmp 3f + +#ifdef CONFIG_PARAVIRT + /* This is can only trip for a broken bootloader... */ + cmpw $0x207, pa(boot_params + BP_version) + jb default_entry + + /* Paravirt-compatible boot parameters. Look to see what architecture + we're booting under. */ + movl pa(boot_params + BP_hardware_subarch), %eax + cmpl $num_subarch_entries, %eax + jae bad_subarch + + movl pa(subarch_entries)(,%eax,4), %eax + subl $__PAGE_OFFSET, %eax + jmp *%eax + +bad_subarch: +WEAK(lguest_entry) +WEAK(xen_entry) + /* Unknown implementation; there's really + nothing we can do at this point. */ + ud2a + + __INITDATA + +subarch_entries: + .long default_entry /* normal x86/PC */ + .long lguest_entry /* lguest hypervisor */ + .long xen_entry /* Xen hypervisor */ + .long default_entry /* Moorestown MID */ +num_subarch_entries = (. - subarch_entries) / 4 +.previous +#else + jmp default_entry +#endif /* CONFIG_PARAVIRT */ + /* * Non-boot CPU entry point; entered from trampoline.S * We can't lgdt here, because lgdt itself uses a data segment, but @@ -282,7 +283,7 @@ ENTRY(startup_32_smp) movl %eax,%fs movl %eax,%gs #endif /* CONFIG_SMP */ -3: +default_entry: /* * New page tables may be in 4Mbyte page mode and may @@ -622,13 +623,13 @@ ENTRY(initial_code) __PAGE_ALIGNED_BSS .align PAGE_SIZE_asm #ifdef CONFIG_X86_PAE -ENTRY(initial_pg_pmd) +initial_pg_pmd: .fill 1024*KPMDS,4,0 #else ENTRY(initial_page_table) .fill 1024,4,0 #endif -ENTRY(initial_pg_fixmap) +initial_pg_fixmap: .fill 1024,4,0 ENTRY(empty_zero_page) .fill 4096,1,0 diff --git a/arch/x86/lguest/i386_head.S b/arch/x86/lguest/i386_head.S index e7d5382ef263..4f420c2f2d55 100644 --- a/arch/x86/lguest/i386_head.S +++ b/arch/x86/lguest/i386_head.S @@ -4,7 +4,6 @@ #include #include #include -#include /*G:020 * Our story starts with the kernel booting into startup_32 in @@ -38,113 +37,9 @@ ENTRY(lguest_entry) /* Set up the initial stack so we can run C code. */ movl $(init_thread_union+THREAD_SIZE),%esp - call init_pagetables - /* Jumps are relative: we're running __PAGE_OFFSET too low. */ jmp lguest_init+__PAGE_OFFSET -/* - * Initialize page tables. This creates a PDE and a set of page - * tables, which are located immediately beyond __brk_base. The variable - * _brk_end is set up to point to the first "safe" location. - * Mappings are created both at virtual address 0 (identity mapping) - * and PAGE_OFFSET for up to _end. - * - * FIXME: This code is taken verbatim from arch/x86/kernel/head_32.S: they - * don't have a stack at this point, so we can't just use call and ret. - */ -init_pagetables: -#if PTRS_PER_PMD > 1 -#define PAGE_TABLE_SIZE(pages) (((pages) / PTRS_PER_PMD) + PTRS_PER_PGD) -#else -#define PAGE_TABLE_SIZE(pages) ((pages) / PTRS_PER_PGD) -#endif -#define pa(X) ((X) - __PAGE_OFFSET) - -/* Enough space to fit pagetables for the low memory linear map */ -MAPPING_BEYOND_END = \ - PAGE_TABLE_SIZE(((1<<32) - __PAGE_OFFSET) >> PAGE_SHIFT) << PAGE_SHIFT -#ifdef CONFIG_X86_PAE - - /* - * In PAE mode initial_page_table is statically defined to contain - * enough entries to cover the VMSPLIT option (that is the top 1, 2 or 3 - * entries). The identity mapping is handled by pointing two PGD entries - * to the first kernel PMD. - * - * Note the upper half of each PMD or PTE are always zero at this stage. - */ - -#define KPMDS (((-__PAGE_OFFSET) >> 30) & 3) /* Number of kernel PMDs */ - - xorl %ebx,%ebx /* %ebx is kept at zero */ - - movl $pa(__brk_base), %edi - movl $pa(initial_pg_pmd), %edx - movl $PTE_IDENT_ATTR, %eax -10: - leal PDE_IDENT_ATTR(%edi),%ecx /* Create PMD entry */ - movl %ecx,(%edx) /* Store PMD entry */ - /* Upper half already zero */ - addl $8,%edx - movl $512,%ecx -11: - stosl - xchgl %eax,%ebx - stosl - xchgl %eax,%ebx - addl $0x1000,%eax - loop 11b - - /* - * End condition: we must map up to the end + MAPPING_BEYOND_END. - */ - movl $pa(_end) + MAPPING_BEYOND_END + PTE_IDENT_ATTR, %ebp - cmpl %ebp,%eax - jb 10b -1: - addl $__PAGE_OFFSET, %edi - movl %edi, pa(_brk_end) - shrl $12, %eax - movl %eax, pa(max_pfn_mapped) - - /* Do early initialization of the fixmap area */ - movl $pa(initial_pg_fixmap)+PDE_IDENT_ATTR,%eax - movl %eax,pa(initial_pg_pmd+0x1000*KPMDS-8) -#else /* Not PAE */ - -page_pde_offset = (__PAGE_OFFSET >> 20); - - movl $pa(__brk_base), %edi - movl $pa(initial_page_table), %edx - movl $PTE_IDENT_ATTR, %eax -10: - leal PDE_IDENT_ATTR(%edi),%ecx /* Create PDE entry */ - movl %ecx,(%edx) /* Store identity PDE entry */ - movl %ecx,page_pde_offset(%edx) /* Store kernel PDE entry */ - addl $4,%edx - movl $1024, %ecx -11: - stosl - addl $0x1000,%eax - loop 11b - /* - * End condition: we must map up to the end + MAPPING_BEYOND_END. - */ - movl $pa(_end) + MAPPING_BEYOND_END + PTE_IDENT_ATTR, %ebp - cmpl %ebp,%eax - jb 10b - addl $__PAGE_OFFSET, %edi - movl %edi, pa(_brk_end) - shrl $12, %eax - movl %eax, pa(max_pfn_mapped) - - /* Do early initialization of the fixmap area */ - movl $pa(initial_pg_fixmap)+PDE_IDENT_ATTR,%eax - movl %eax,pa(initial_page_table+0xffc) -#endif - ret - /*G:055 * We create a macro which puts the assembler code between lgstart_ and lgend_ * markers. These templates are put in the .text section: they can't be -- cgit v1.2.3