diff options
Diffstat (limited to 'arch/x86_64')
151 files changed, 19 insertions, 38930 deletions
diff --git a/arch/x86_64/Kconfig b/arch/x86_64/Kconfig index b4d9089a6a06..b1b98e614f7c 100644 --- a/arch/x86_64/Kconfig +++ b/arch/x86_64/Kconfig @@ -704,7 +704,7 @@ source kernel/power/Kconfig source "drivers/acpi/Kconfig" -source "arch/x86_64/kernel/cpufreq/Kconfig" +source "arch/x86/kernel/cpufreq/Kconfig" endmenu @@ -778,7 +778,7 @@ source fs/Kconfig menu "Instrumentation Support" depends on EXPERIMENTAL -source "arch/x86_64/oprofile/Kconfig" +source "arch/x86/oprofile/Kconfig" config KPROBES bool "Kprobes" diff --git a/arch/x86_64/Makefile b/arch/x86_64/Makefile index b024e4a86895..8bffb94c71b5 100644 --- a/arch/x86_64/Makefile +++ b/arch/x86_64/Makefile @@ -21,6 +21,9 @@ # # $Id: Makefile,v 1.31 2002/03/22 15:56:07 ak Exp $ +# Fill in SRCARCH +SRCARCH := x86 + LDFLAGS := -m elf_x86_64 OBJCOPYFLAGS := -O binary -R .note -R .comment -S LDFLAGS_vmlinux := @@ -71,18 +74,18 @@ CFLAGS += $(cflags-y) CFLAGS_KERNEL += $(cflags-kernel-y) AFLAGS += -m64 -head-y := arch/x86_64/kernel/head.o arch/x86_64/kernel/head64.o arch/x86_64/kernel/init_task.o +head-y := arch/x86/kernel/head_64.o arch/x86/kernel/head64.o arch/x86/kernel/init_task_64.o -libs-y += arch/x86_64/lib/ -core-y += arch/x86_64/kernel/ \ - arch/x86_64/mm/ \ - arch/x86_64/crypto/ \ - arch/x86_64/vdso/ -core-$(CONFIG_IA32_EMULATION) += arch/x86_64/ia32/ -drivers-$(CONFIG_PCI) += arch/x86_64/pci/ -drivers-$(CONFIG_OPROFILE) += arch/x86_64/oprofile/ +libs-y += arch/x86/lib/ +core-y += arch/x86/kernel/ \ + arch/x86/mm/ \ + arch/x86/crypto/ \ + arch/x86/vdso/ +core-$(CONFIG_IA32_EMULATION) += arch/x86/ia32/ +drivers-$(CONFIG_PCI) += arch/x86/pci/ +drivers-$(CONFIG_OPROFILE) += arch/x86/oprofile/ -boot := arch/x86_64/boot +boot := arch/x86/boot PHONY += bzImage bzlilo install archmrproper \ fdimage fdimage144 fdimage288 isoimage archclean @@ -90,10 +93,12 @@ PHONY += bzImage bzlilo install archmrproper \ #Default target when executing "make" all: bzImage -BOOTIMAGE := arch/x86_64/boot/bzImage +BOOTIMAGE := arch/x86/boot/bzImage KBUILD_IMAGE := $(BOOTIMAGE) bzImage: vmlinux + $(Q)mkdir -p $(objtree)/arch/x86_64/boot + $(Q)ln -fsn $(objtree)/arch/x86/boot/bzImage $(objtree)/arch/x86_64/boot/bzImage $(Q)$(MAKE) $(build)=$(boot) $(BOOTIMAGE) bzlilo: vmlinux @@ -109,6 +114,7 @@ install: $(Q)$(MAKE) $(build)=$(boot) BOOTIMAGE=$(BOOTIMAGE) $@ archclean: + $(Q)rm -rf $(objtree)/arch/x86_64/boot $(Q)$(MAKE) $(clean)=$(boot) define archhelp diff --git a/arch/x86_64/boot/.gitignore b/arch/x86_64/boot/.gitignore deleted file mode 100644 index 18465143cfa2..000000000000 --- a/arch/x86_64/boot/.gitignore +++ /dev/null @@ -1,5 +0,0 @@ -bootsect -bzImage -setup -setup.bin -setup.elf diff --git a/arch/x86_64/boot/Makefile b/arch/x86_64/boot/Makefile deleted file mode 100644 index 67096389de1f..000000000000 --- a/arch/x86_64/boot/Makefile +++ /dev/null @@ -1,9 +0,0 @@ -# -# arch/x86_64/boot/Makefile -# -# The actual boot code is shared with i386 including the Makefile. -# So tell kbuild that we fetch the code from i386 and include the -# Makefile from i386 too. - -src := arch/i386/boot -include $(src)/Makefile diff --git a/arch/x86_64/boot/compressed/Makefile b/arch/x86_64/boot/compressed/Makefile deleted file mode 100644 index 877c0bdbbc67..000000000000 --- a/arch/x86_64/boot/compressed/Makefile +++ /dev/null @@ -1,30 +0,0 @@ -# -# linux/arch/x86_64/boot/compressed/Makefile -# -# create a compressed vmlinux image from the original vmlinux -# - -targets := vmlinux vmlinux.bin vmlinux.bin.gz head.o misc.o piggy.o - -CFLAGS := -m64 -D__KERNEL__ $(LINUXINCLUDE) -O2 \ - -fno-strict-aliasing -fPIC -mcmodel=small \ - $(call cc-option, -ffreestanding) \ - $(call cc-option, -fno-stack-protector) -AFLAGS := $(CFLAGS) -D__ASSEMBLY__ -LDFLAGS := -m elf_x86_64 - -LDFLAGS_vmlinux := -T -$(obj)/vmlinux: $(src)/vmlinux.lds $(obj)/head.o $(obj)/misc.o $(obj)/piggy.o FORCE - $(call if_changed,ld) - @: - -$(obj)/vmlinux.bin: vmlinux FORCE - $(call if_changed,objcopy) - -$(obj)/vmlinux.bin.gz: $(obj)/vmlinux.bin FORCE - $(call if_changed,gzip) - -LDFLAGS_piggy.o := -r --format binary --oformat elf64-x86-64 -T - -$(obj)/piggy.o: $(obj)/vmlinux.scr $(obj)/vmlinux.bin.gz FORCE - $(call if_changed,ld) diff --git a/arch/x86_64/boot/compressed/head.S b/arch/x86_64/boot/compressed/head.S deleted file mode 100644 index 9fd8030cc54f..000000000000 --- a/arch/x86_64/boot/compressed/head.S +++ /dev/null @@ -1,311 +0,0 @@ -/* - * linux/boot/head.S - * - * Copyright (C) 1991, 1992, 1993 Linus Torvalds - */ - -/* - * head.S contains the 32-bit startup code. - * - * NOTE!!! Startup happens at absolute address 0x00001000, which is also where - * the page directory will exist. The startup code will be overwritten by - * the page directory. [According to comments etc elsewhere on a compressed - * kernel it will end up at 0x1000 + 1Mb I hope so as I assume this. - AC] - * - * Page 0 is deliberately kept safe, since System Management Mode code in - * laptops may need to access the BIOS data stored there. This is also - * useful for future device drivers that either access the BIOS via VM86 - * mode. - */ - -/* - * High loaded stuff by Hans Lermen & Werner Almesberger, Feb. 1996 - */ -.code32 -.text - -#include <linux/linkage.h> -#include <asm/segment.h> -#include <asm/pgtable.h> -#include <asm/page.h> -#include <asm/msr.h> - -.section ".text.head" - .code32 - .globl startup_32 - -startup_32: - cld - cli - movl $(__KERNEL_DS), %eax - movl %eax, %ds - movl %eax, %es - movl %eax, %ss - -/* Calculate the delta between where we were compiled to run - * at and where we were actually loaded at. This can only be done - * with a short local call on x86. Nothing else will tell us what - * address we are running at. The reserved chunk of the real-mode - * data at 0x1e4 (defined as a scratch field) are used as the stack - * for this calculation. Only 4 bytes are needed. - */ - leal (0x1e4+4)(%esi), %esp - call 1f -1: popl %ebp - subl $1b, %ebp - -/* setup a stack and make sure cpu supports long mode. */ - movl $user_stack_end, %eax - addl %ebp, %eax - movl %eax, %esp - - call verify_cpu - testl %eax, %eax - jnz no_longmode - -/* Compute the delta between where we were compiled to run at - * and where the code will actually run at. - */ -/* %ebp contains the address we are loaded at by the boot loader and %ebx - * contains the address where we should move the kernel image temporarily - * for safe in-place decompression. - */ - -#ifdef CONFIG_RELOCATABLE - movl %ebp, %ebx - addl $(LARGE_PAGE_SIZE -1), %ebx - andl $LARGE_PAGE_MASK, %ebx -#else - movl $CONFIG_PHYSICAL_START, %ebx -#endif - - /* Replace the compressed data size with the uncompressed size */ - subl input_len(%ebp), %ebx - movl output_len(%ebp), %eax - addl %eax, %ebx - /* Add 8 bytes for every 32K input block */ - shrl $12, %eax - addl %eax, %ebx - /* Add 32K + 18 bytes of extra slack and align on a 4K boundary */ - addl $(32768 + 18 + 4095), %ebx - andl $~4095, %ebx - -/* - * Prepare for entering 64 bit mode - */ - - /* Load new GDT with the 64bit segments using 32bit descriptor */ - leal gdt(%ebp), %eax - movl %eax, gdt+2(%ebp) - lgdt gdt(%ebp) - - /* Enable PAE mode */ - xorl %eax, %eax - orl $(1 << 5), %eax - movl %eax, %cr4 - - /* - * Build early 4G boot pagetable - */ - /* Initialize Page tables to 0*/ - leal pgtable(%ebx), %edi - xorl %eax, %eax - movl $((4096*6)/4), %ecx - rep stosl - - /* Build Level 4 */ - leal pgtable + 0(%ebx), %edi - leal 0x1007 (%edi), %eax - movl %eax, 0(%edi) - - /* Build Level 3 */ - leal pgtable + 0x1000(%ebx), %edi - leal 0x1007(%edi), %eax - movl $4, %ecx -1: movl %eax, 0x00(%edi) - addl $0x00001000, %eax - addl $8, %edi - decl %ecx - jnz 1b - - /* Build Level 2 */ - leal pgtable + 0x2000(%ebx), %edi - movl $0x00000183, %eax - movl $2048, %ecx -1: movl %eax, 0(%edi) - addl $0x00200000, %eax - addl $8, %edi - decl %ecx - jnz 1b - - /* Enable the boot page tables */ - leal pgtable(%ebx), %eax - movl %eax, %cr3 - - /* Enable Long mode in EFER (Extended Feature Enable Register) */ - movl $MSR_EFER, %ecx - rdmsr - btsl $_EFER_LME, %eax - wrmsr - - /* Setup for the jump to 64bit mode - * - * When the jump is performend we will be in long mode but - * in 32bit compatibility mode with EFER.LME = 1, CS.L = 0, CS.D = 1 - * (and in turn EFER.LMA = 1). To jump into 64bit mode we use - * the new gdt/idt that has __KERNEL_CS with CS.L = 1. - * We place all of the values on our mini stack so lret can - * used to perform that far jump. - */ - pushl $__KERNEL_CS - leal startup_64(%ebp), %eax - pushl %eax - - /* Enter paged protected Mode, activating Long Mode */ - movl $0x80000001, %eax /* Enable Paging and Protected mode */ - movl %eax, %cr0 - - /* Jump from 32bit compatibility mode into 64bit mode. */ - lret - -no_longmode: - /* This isn't an x86-64 CPU so hang */ -1: - hlt - jmp 1b - -#include "../../kernel/verify_cpu.S" - - /* Be careful here startup_64 needs to be at a predictable - * address so I can export it in an ELF header. Bootloaders - * should look at the ELF header to find this address, as - * it may change in the future. - */ - .code64 - .org 0x200 -ENTRY(startup_64) - /* We come here either from startup_32 or directly from a - * 64bit bootloader. If we come here from a bootloader we depend on - * an identity mapped page table being provied that maps our - * entire text+data+bss and hopefully all of memory. - */ - - /* Setup data segments. */ - xorl %eax, %eax - movl %eax, %ds - movl %eax, %es - movl %eax, %ss - movl %eax, %fs - movl %eax, %gs - lldt %ax - movl $0x20, %eax - ltr %ax - - /* Compute the decompressed kernel start address. It is where - * we were loaded at aligned to a 2M boundary. %rbp contains the - * decompressed kernel start address. - * - * If it is a relocatable kernel then decompress and run the kernel - * from load address aligned to 2MB addr, otherwise decompress and - * run the kernel from CONFIG_PHYSICAL_START - */ - - /* Start with the delta to where the kernel will run at. */ -#ifdef CONFIG_RELOCATABLE - leaq startup_32(%rip) /* - $startup_32 */, %rbp - addq $(LARGE_PAGE_SIZE - 1), %rbp - andq $LARGE_PAGE_MASK, %rbp - movq %rbp, %rbx -#else - movq $CONFIG_PHYSICAL_START, %rbp - movq %rbp, %rbx -#endif - - /* Replace the compressed data size with the uncompressed size */ - movl input_len(%rip), %eax - subq %rax, %rbx - movl output_len(%rip), %eax - addq %rax, %rbx - /* Add 8 bytes for every 32K input block */ - shrq $12, %rax - addq %rax, %rbx - /* Add 32K + 18 bytes of extra slack and align on a 4K boundary */ - addq $(32768 + 18 + 4095), %rbx - andq $~4095, %rbx - -/* Copy the compressed kernel to the end of our buffer - * where decompression in place becomes safe. - */ - leaq _end(%rip), %r8 - leaq _end(%rbx), %r9 - movq $_end /* - $startup_32 */, %rcx -1: subq $8, %r8 - subq $8, %r9 - movq 0(%r8), %rax - movq %rax, 0(%r9) - subq $8, %rcx - jnz 1b - -/* - * Jump to the relocated address. - */ - leaq relocated(%rbx), %rax - jmp *%rax - -.section ".text" -relocated: - -/* - * Clear BSS - */ - xorq %rax, %rax - leaq _edata(%rbx), %rdi - leaq _end(%rbx), %rcx - subq %rdi, %rcx - cld - rep - stosb - - /* Setup the stack */ - leaq user_stack_end(%rip), %rsp - - /* zero EFLAGS after setting rsp */ - pushq $0 - popfq - -/* - * Do the decompression, and jump to the new kernel.. - */ - pushq %rsi # Save the real mode argument - movq %rsi, %rdi # real mode address - leaq _heap(%rip), %rsi # _heap - leaq input_data(%rip), %rdx # input_data - movl input_len(%rip), %eax - movq %rax, %rcx # input_len - movq %rbp, %r8 # output - call decompress_kernel - popq %rsi - - -/* - * Jump to the decompressed kernel. - */ - jmp *%rbp - - .data -gdt: - .word gdt_end - gdt - .long gdt - .word 0 - .quad 0x0000000000000000 /* NULL descriptor */ - .quad 0x00af9a000000ffff /* __KERNEL_CS */ - .quad 0x00cf92000000ffff /* __KERNEL_DS */ - .quad 0x0080890000000000 /* TS descriptor */ - .quad 0x0000000000000000 /* TS continued */ -gdt_end: - .bss -/* Stack for uncompression */ - .balign 4 -user_stack: - .fill 4096,4,0 -user_stack_end: diff --git a/arch/x86_64/boot/compressed/misc.c b/arch/x86_64/boot/compressed/misc.c deleted file mode 100644 index f932b0e89096..000000000000 --- a/arch/x86_64/boot/compressed/misc.c +++ /dev/null @@ -1,371 +0,0 @@ -/* - * misc.c - * - * This is a collection of several routines from gzip-1.0.3 - * adapted for Linux. - * - * malloc by Hannu Savolainen 1993 and Matthias Urlichs 1994 - * puts by Nick Holloway 1993, better puts by Martin Mares 1995 - * High loaded stuff by Hans Lermen & Werner Almesberger, Feb. 1996 - */ - -#define _LINUX_STRING_H_ 1 -#define __LINUX_BITMAP_H 1 - -#include <linux/linkage.h> -#include <linux/screen_info.h> -#include <asm/io.h> -#include <asm/page.h> - -/* WARNING!! - * This code is compiled with -fPIC and it is relocated dynamically - * at run time, but no relocation processing is performed. - * This means that it is not safe to place pointers in static structures. - */ - -/* - * Getting to provable safe in place decompression is hard. - * Worst case behaviours need to be analized. - * Background information: - * - * The file layout is: - * magic[2] - * method[1] - * flags[1] - * timestamp[4] - * extraflags[1] - * os[1] - * compressed data blocks[N] - * crc[4] orig_len[4] - * - * resulting in 18 bytes of non compressed data overhead. - * - * Files divided into blocks - * 1 bit (last block flag) - * 2 bits (block type) - * - * 1 block occurs every 32K -1 bytes or when there 50% compression has been achieved. - * The smallest block type encoding is always used. - * - * stored: - * 32 bits length in bytes. - * - * fixed: - * magic fixed tree. - * symbols. - * - * dynamic: - * dynamic tree encoding. - * symbols. - * - * - * The buffer for decompression in place is the length of the - * uncompressed data, plus a small amount extra to keep the algorithm safe. - * The compressed data is placed at the end of the buffer. The output - * pointer is placed at the start of the buffer and the input pointer - * is placed where the compressed data starts. Problems will occur - * when the output pointer overruns the input pointer. - * - * The output pointer can only overrun the input pointer if the input - * pointer is moving faster than the output pointer. A condition only - * triggered by data whose compressed form is larger than the uncompressed - * form. - * - * The worst case at the block level is a growth of the compressed data - * of 5 bytes per 32767 bytes. - * - * The worst case internal to a compressed block is very hard to figure. - * The worst case can at least be boundined by having one bit that represents - * 32764 bytes and then all of the rest of the bytes representing the very - * very last byte. - * - * All of which is enough to compute an amount of extra data that is required - * to be safe. To avoid problems at the block level allocating 5 extra bytes - * per 32767 bytes of data is sufficient. To avoind problems internal to a block - * adding an extra 32767 bytes (the worst case uncompressed block size) is - * sufficient, to ensure that in the worst case the decompressed data for - * block will stop the byte before the compressed data for a block begins. - * To avoid problems with the compressed data's meta information an extra 18 - * bytes are needed. Leading to the formula: - * - * extra_bytes = (uncompressed_size >> 12) + 32768 + 18 + decompressor_size. - * - * Adding 8 bytes per 32K is a bit excessive but much easier to calculate. - * Adding 32768 instead of 32767 just makes for round numbers. - * Adding the decompressor_size is necessary as it musht live after all - * of the data as well. Last I measured the decompressor is about 14K. - * 10K of actuall data and 4K of bss. - * - */ - -/* - * gzip declarations - */ - -#define OF(args) args -#define STATIC static - -#undef memset -#undef memcpy -#define memzero(s, n) memset ((s), 0, (n)) - -typedef unsigned char uch; -typedef unsigned short ush; -typedef unsigned long ulg; - -#define WSIZE 0x80000000 /* Window size must be at least 32k, - * and a power of two - * We don't actually have a window just - * a huge output buffer so I report - * a 2G windows size, as that should - * always be larger than our output buffer. - */ - -static uch *inbuf; /* input buffer */ -static uch *window; /* Sliding window buffer, (and final output buffer) */ - -static unsigned insize; /* valid bytes in inbuf */ -static unsigned inptr; /* index of next byte to be processed in inbuf */ -static unsigned outcnt; /* bytes in output buffer */ - -/* gzip flag byte */ -#define ASCII_FLAG 0x01 /* bit 0 set: file probably ASCII text */ -#define CONTINUATION 0x02 /* bit 1 set: continuation of multi-part gzip file */ -#define EXTRA_FIELD 0x04 /* bit 2 set: extra field present */ -#define ORIG_NAME 0x08 /* bit 3 set: original file name present */ -#define COMMENT 0x10 /* bit 4 set: file comment present */ -#define ENCRYPTED 0x20 /* bit 5 set: file is encrypted */ -#define RESERVED 0xC0 /* bit 6,7: reserved */ - -#define get_byte() (inptr < insize ? inbuf[inptr++] : fill_inbuf()) - -/* Diagnostic functions */ -#ifdef DEBUG -# define Assert(cond,msg) {if(!(cond)) error(msg);} -# define Trace(x) fprintf x -# define Tracev(x) {if (verbose) fprintf x ;} -# define Tracevv(x) {if (verbose>1) fprintf x ;} -# define Tracec(c,x) {if (verbose && (c)) fprintf x ;} -# define Tracecv(c,x) {if (verbose>1 && (c)) fprintf x ;} -#else -# define Assert(cond,msg) -# define Trace(x) -# define Tracev(x) -# define Tracevv(x) -# define Tracec(c,x) -# define Tracecv(c,x) -#endif - -static int fill_inbuf(void); -static void flush_window(void); -static void error(char *m); -static void gzip_mark(void **); -static void gzip_release(void **); - -/* - * This is set up by the setup-routine at boot-time - */ -static unsigned char *real_mode; /* Pointer to real-mode data */ - -#define RM_EXT_MEM_K (*(unsigned short *)(real_mode + 0x2)) -#ifndef STANDARD_MEMORY_BIOS_CALL -#define RM_ALT_MEM_K (*(unsigned long *)(real_mode + 0x1e0)) -#endif -#define RM_SCREEN_INFO (*(struct screen_info *)(real_mode+0)) - -extern unsigned char input_data[]; -extern int input_len; - -static long bytes_out = 0; - -static void *malloc(int size); -static void free(void *where); - -static void *memset(void *s, int c, unsigned n); -static void *memcpy(void *dest, const void *src, unsigned n); - -static void putstr(const char *); - -static long free_mem_ptr; -static long free_mem_end_ptr; - -#define HEAP_SIZE 0x7000 - -static char *vidmem = (char *)0xb8000; -static int vidport; -static int lines, cols; - -#include "../../../../lib/inflate.c" - -static void *malloc(int size) -{ - void *p; - - if (size <0) error("Malloc error"); - if (free_mem_ptr <= 0) error("Memory error"); - - free_mem_ptr = (free_mem_ptr + 3) & ~3; /* Align */ - - p = (void *)free_mem_ptr; - free_mem_ptr += size; - - if (free_mem_ptr >= free_mem_end_ptr) - error("Out of memory"); - - return p; -} - -static void free(void *where) -{ /* Don't care */ -} - -static void gzip_mark(void **ptr) -{ - *ptr = (void *) free_mem_ptr; -} - -static void gzip_release(void **ptr) -{ - free_mem_ptr = (long) *ptr; -} - -static void scroll(void) -{ - int i; - - memcpy ( vidmem, vidmem + cols * 2, ( lines - 1 ) * cols * 2 ); - for ( i = ( lines - 1 ) * cols * 2; i < lines * cols * 2; i += 2 ) - vidmem[i] = ' '; -} - -static void putstr(const char *s) -{ - int x,y,pos; - char c; - - x = RM_SCREEN_INFO.orig_x; - y = RM_SCREEN_INFO.orig_y; - - while ( ( c = *s++ ) != '\0' ) { - if ( c == '\n' ) { - x = 0; - if ( ++y >= lines ) { - scroll(); - y--; - } - } else { - vidmem [ ( x + cols * y ) * 2 ] = c; - if ( ++x >= cols ) { - x = 0; - if ( ++y >= lines ) { - scroll(); - y--; - } - } - } - } - - RM_SCREEN_INFO.orig_x = x; - RM_SCREEN_INFO.orig_y = y; - - pos = (x + cols * y) * 2; /* Update cursor position */ - outb_p(14, vidport); - outb_p(0xff & (pos >> 9), vidport+1); - outb_p(15, vidport); - outb_p(0xff & (pos >> 1), vidport+1); -} - -static void* memset(void* s, int c, unsigned n) -{ - int i; - char *ss = (char*)s; - - for (i=0;i<n;i++) ss[i] = c; - return s; -} - -static void* memcpy(void* dest, const void* src, unsigned n) -{ - int i; - char *d = (char *)dest, *s = (char *)src; - - for (i=0;i<n;i++) d[i] = s[i]; - return dest; -} - -/* =========================================================================== - * Fill the input buffer. This is called only when the buffer is empty - * and at least one byte is really needed. - */ -static int fill_inbuf(void) -{ - error("ran out of input data"); - return 0; -} - -/* =========================================================================== - * Write the output window window[0..outcnt-1] and update crc and bytes_out. - * (Used for the decompressed data only.) - */ -static void flush_window(void) -{ - /* With my window equal to my output buffer - * I only need to compute the crc here. - */ - ulg c = crc; /* temporary variable */ - unsigned n; - uch *in, ch; - - in = window; - for (n = 0; n < outcnt; n++) { - ch = *in++; - c = crc_32_tab[((int)c ^ ch) & 0xff] ^ (c >> 8); - } - crc = c; - bytes_out += (ulg)outcnt; - outcnt = 0; -} - -static void error(char *x) -{ - putstr("\n\n"); - putstr(x); - putstr("\n\n -- System halted"); - - while(1); /* Halt */ -} - -asmlinkage void decompress_kernel(void *rmode, unsigned long heap, - uch *input_data, unsigned long input_len, uch *output) -{ - real_mode = rmode; - - if (RM_SCREEN_INFO.orig_video_mode == 7) { - vidmem = (char *) 0xb0000; - vidport = 0x3b4; - } else { - vidmem = (char *) 0xb8000; - vidport = 0x3d4; - } - - lines = RM_SCREEN_INFO.orig_video_lines; - cols = RM_SCREEN_INFO.orig_video_cols; - - window = output; /* Output buffer (Normally at 1M) */ - free_mem_ptr = heap; /* Heap */ - free_mem_end_ptr = heap + HEAP_SIZE; - inbuf = input_data; /* Input buffer */ - insize = input_len; - inptr = 0; - - if ((ulg)output & (__KERNEL_ALIGN - 1)) - error("Destination address not 2M aligned"); - if ((ulg)output >= 0xffffffffffUL) - error("Destination address too large"); - - makecrc(); - putstr(".\nDecompressing Linux..."); - gunzip(); - putstr("done.\nBooting the kernel.\n"); - return; -} diff --git a/arch/x86_64/boot/compressed/vmlinux.lds b/arch/x86_64/boot/compressed/vmlinux.lds deleted file mode 100644 index 94c13e557fb4..000000000000 --- a/arch/x86_64/boot/compressed/vmlinux.lds +++ /dev/null @@ -1,44 +0,0 @@ -OUTPUT_FORMAT("elf64-x86-64", "elf64-x86-64", "elf64-x86-64") -OUTPUT_ARCH(i386:x86-64) -ENTRY(startup_64) -SECTIONS -{ - /* Be careful parts of head.S assume startup_32 is at - * address 0. - */ - . = 0; - .text : { - _head = . ; - *(.text.head) - _ehead = . ; - *(.text.compressed) - _text = .; /* Text */ - *(.text) - *(.text.*) - _etext = . ; - } - .rodata : { - _rodata = . ; - *(.rodata) /* read-only data */ - *(.rodata.*) - _erodata = . ; - } - .data : { - _data = . ; - *(.data) - *(.data.*) - _edata = . ; - } - .bss : { - _bss = . ; - *(.bss) - *(.bss.*) - *(COMMON) - . = ALIGN(8); - _end = . ; - . = ALIGN(4096); - pgtable = . ; - . = . + 4096 * 6; - _heap = .; - } -} diff --git a/arch/x86_64/boot/compressed/vmlinux.scr b/arch/x86_64/boot/compressed/vmlinux.scr deleted file mode 100644 index bd1429ce193e..000000000000 --- a/arch/x86_64/boot/compressed/vmlinux.scr +++ /dev/null @@ -1,10 +0,0 @@ -SECTIONS -{ - .text.compressed : { - input_len = .; - LONG(input_data_end - input_data) input_data = .; - *(.data) - output_len = . - 4; - input_data_end = .; - } -} diff --git a/arch/x86_64/boot/tools/.gitignore b/arch/x86_64/boot/tools/.gitignore deleted file mode 100644 index 378eac25d311..000000000000 --- a/arch/x86_64/boot/tools/.gitignore +++ /dev/null @@ -1 +0,0 @@ -build diff --git a/arch/x86_64/crypto/Makefile b/arch/x86_64/crypto/Makefile deleted file mode 100644 index 15b538a8b7f7..000000000000 --- a/arch/x86_64/crypto/Makefile +++ /dev/null @@ -1,12 +0,0 @@ -# -# x86_64/crypto/Makefile -# -# Arch-specific CryptoAPI modules. -# - -obj-$(CONFIG_CRYPTO_AES_X86_64) += aes-x86_64.o -obj-$(CONFIG_CRYPTO_TWOFISH_X86_64) += twofish-x86_64.o - -aes-x86_64-y := aes-x86_64-asm.o aes.o -twofish-x86_64-y := twofish-x86_64-asm.o twofish.o - diff --git a/arch/x86_64/crypto/aes-x86_64-asm.S b/arch/x86_64/crypto/aes-x86_64-asm.S deleted file mode 100644 index 26b40de4d0b0..000000000000 --- a/arch/x86_64/crypto/aes-x86_64-asm.S +++ /dev/null @@ -1,190 +0,0 @@ -/* AES (Rijndael) implementation (FIPS PUB 197) for x86_64 - * - * Copyright (C) 2005 Andreas Steinmetz, <ast@domdv.de> - * - * License: - * This code can be distributed under the terms of the GNU General Public - * License (GPL) Version 2 provided that the above header down to and - * including this sentence is retained in full. - */ - -.extern aes_ft_tab -.extern aes_it_tab -.extern aes_fl_tab -.extern aes_il_tab - -.text - -#include <asm/asm-offsets.h> - -#define BASE crypto_tfm_ctx_offset - -#define R1 %rax -#define R1E %eax -#define R1X %ax -#define R1H %ah -#define R1L %al -#define R2 %rbx -#define R2E %ebx -#define R2X %bx -#define R2H %bh -#define R2L %bl -#define R3 %rcx -#define R3E %ecx -#define R3X %cx -#define R3H %ch -#define R3L %cl -#define R4 %rdx -#define R4E %edx -#define R4X %dx -#define R4H %dh -#define R4L %dl -#define R5 %rsi -#define R5E %esi -#define R6 %rdi -#define R6E %edi -#define R7 %rbp -#define R7E %ebp -#define R8 %r8 -#define R9 %r9 -#define R10 %r10 -#define R11 %r11 - -#define prologue(FUNC,KEY,B128,B192,r1,r2,r3,r4,r5,r6,r7,r8,r9,r10,r11) \ - .global FUNC; \ - .type FUNC,@function; \ - .align 8; \ -FUNC: movq r1,r2; \ - movq r3,r4; \ - leaq BASE+KEY+52(r8),r9; \ - movq r10,r11; \ - movl (r7),r5 ## E; \ - movl 4(r7),r1 ## E; \ - movl 8(r7),r6 ## E; \ - movl 12(r7),r7 ## E; \ - movl BASE(r8),r10 ## E; \ - xorl -48(r9),r5 ## E; \ - xorl -44(r9),r1 ## E; \ - xorl -40(r9),r6 ## E; \ - xorl -36(r9),r7 ## E; \ - cmpl $24,r10 ## E; \ - jb B128; \ - leaq 32(r9),r9; \ - je B192; \ - leaq 32(r9),r9; - -#define epilogue(r1,r2,r3,r4,r5,r6,r7,r8,r9) \ - movq r1,r2; \ - movq r3,r4; \ - movl r5 ## E,(r9); \ - movl r6 ## E,4(r9); \ - movl r7 ## E,8(r9); \ - movl r8 ## E,12(r9); \ - ret; - -#define round(TAB,OFFSET,r1,r2,r3,r4,r5,r6,r7,r8,ra,rb,rc,rd) \ - movzbl r2 ## H,r5 ## E; \ - movzbl r2 ## L,r6 ## E; \ - movl TAB+1024(,r5,4),r5 ## E;\ - movw r4 ## X,r2 ## X; \ - movl TAB(,r6,4),r6 ## E; \ - roll $16,r2 ## E; \ - shrl $16,r4 ## E; \ - movzbl r4 ## H,r7 ## E; \ - movzbl r4 ## L,r4 ## E; \ - xorl OFFSET(r8),ra ## E; \ - xorl OFFSET+4(r8),rb ## E; \ - xorl TAB+3072(,r7,4),r5 ## E;\ - xorl TAB+2048(,r4,4),r6 ## E;\ - movzbl r1 ## L,r7 ## E; \ - movzbl r1 ## H,r4 ## E; \ - movl TAB+1024(,r4,4),r4 ## E;\ - movw r3 ## X,r1 ## X; \ - roll $16,r1 ## E; \ - shrl $16,r3 ## E; \ - xorl TAB(,r7,4),r5 ## E; \ - movzbl r3 ## H,r7 ## E; \ - movzbl r3 ## L,r3 ## E; \ - xorl TAB+3072(,r7,4),r4 ## E;\ - xorl TAB+2048(,r3,4),r5 ## E;\ - movzbl r1 ## H,r7 ## E; \ - movzbl r1 ## L,r3 ## E; \ - shrl $16,r1 ## E; \ - xorl TAB+3072(,r7,4),r6 ## E;\ - movl TAB+2048(,r3,4),r3 ## E;\ - movzbl r1 ## H,r7 ## E; \ - movzbl r1 ## L,r1 ## E; \ - xorl TAB+1024(,r7,4),r6 ## E;\ - xorl TAB(,r1,4),r3 ## E; \ - movzbl r2 ## H,r1 ## E; \ - movzbl r2 ## L,r7 ## E; \ - shrl $16,r2 ## E; \ - xorl TAB+3072(,r1,4),r3 ## E;\ - xorl TAB+2048(,r7,4),r4 ## E;\ - movzbl r2 ## H,r1 ## E; \ - movzbl r2 ## L,r2 ## E; \ - xorl OFFSET+8(r8),rc ## E; \ - xorl OFFSET+12(r8),rd ## E; \ - xorl TAB+1024(,r1,4),r3 ## E;\ - xorl TAB(,r2,4),r4 ## E; - -#define move_regs(r1,r2,r3,r4) \ - movl r3 ## E,r1 ## E; \ - movl r4 ## E,r2 ## E; - -#define entry(FUNC,KEY,B128,B192) \ - prologue(FUNC,KEY,B128,B192,R2,R8,R7,R9,R1,R3,R4,R6,R10,R5,R11) - -#define return epilogue(R8,R2,R9,R7,R5,R6,R3,R4,R11) - -#define encrypt_round(TAB,OFFSET) \ - round(TAB,OFFSET,R1,R2,R3,R4,R5,R6,R7,R10,R5,R6,R3,R4) \ - move_regs(R1,R2,R5,R6) - -#define encrypt_final(TAB,OFFSET) \ - round(TAB,OFFSET,R1,R2,R3,R4,R5,R6,R7,R10,R5,R6,R3,R4) - -#define decrypt_round(TAB,OFFSET) \ - round(TAB,OFFSET,R2,R1,R4,R3,R6,R5,R7,R10,R5,R6,R3,R4) \ - move_regs(R1,R2,R5,R6) - -#define decrypt_final(TAB,OFFSET) \ - round(TAB,OFFSET,R2,R1,R4,R3,R6,R5,R7,R10,R5,R6,R3,R4) - -/* void aes_enc_blk(stuct crypto_tfm *tfm, u8 *out, const u8 *in) */ - - entry(aes_enc_blk,0,enc128,enc192) - encrypt_round(aes_ft_tab,-96) - encrypt_round(aes_ft_tab,-80) -enc192: encrypt_round(aes_ft_tab,-64) - encrypt_round(aes_ft_tab,-48) -enc128: encrypt_round(aes_ft_tab,-32) - encrypt_round(aes_ft_tab,-16) - encrypt_round(aes_ft_tab, 0) - encrypt_round(aes_ft_tab, 16) - encrypt_round(aes_ft_tab, 32) - encrypt_round(aes_ft_tab, 48) - encrypt_round(aes_ft_tab, 64) - encrypt_round(aes_ft_tab, 80) - encrypt_round(aes_ft_tab, 96) - encrypt_final(aes_fl_tab,112) - return - -/* void aes_dec_blk(struct crypto_tfm *tfm, u8 *out, const u8 *in) */ - - entry(aes_dec_blk,240,dec128,dec192) - decrypt_round(aes_it_tab,-96) - decrypt_round(aes_it_tab,-80) -dec192: decrypt_round(aes_it_tab,-64) - decrypt_round(aes_it_tab,-48) -dec128: decrypt_round(aes_it_tab,-32) - decrypt_round(aes_it_tab,-16) - decrypt_round(aes_it_tab, 0) - decrypt_round(aes_it_tab, 16) - decrypt_round(aes_it_tab, 32) - decrypt_round(aes_it_tab, 48) - decrypt_round(aes_it_tab, 64) - decrypt_round(aes_it_tab, 80) - decrypt_round(aes_it_tab, 96) - decrypt_final(aes_il_tab,112) - return diff --git a/arch/x86_64/crypto/aes.c b/arch/x86_64/crypto/aes.c deleted file mode 100644 index 5cdb13ea5cc2..000000000000 --- a/arch/x86_64/crypto/aes.c +++ /dev/null @@ -1,336 +0,0 @@ -/* - * Cryptographic API. - * - * AES Cipher Algorithm. - * - * Based on Brian Gladman's code. - * - * Linux developers: - * Alexander Kjeldaas <astor@fast.no> - * Herbert Valerio Riedel <hvr@hvrlab.org> - * Kyle McMartin <kyle@debian.org> - * Adam J. Richter <adam@yggdrasil.com> (conversion to 2.5 API). - * Andreas Steinmetz <ast@domdv.de> (adapted to x86_64 assembler) - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * --------------------------------------------------------------------------- - * Copyright (c) 2002, Dr Brian Gladman <brg@gladman.me.uk>, Worcester, UK. - * All rights reserved. - * - * LICENSE TERMS - * - * The free distribution and use of this software in both source and binary - * form is allowed (with or without changes) provided that: - * - * 1. distributions of this source code include the above copyright - * notice, this list of conditions and the following disclaimer; - * - * 2. distributions in binary form include the above copyright - * notice, this list of conditions and the following disclaimer - * in the documentation and/or other associated materials; - * - * 3. the copyright holder's name is not used to endorse products - * built using this software without specific written permission. - * - * ALTERNATIVELY, provided that this notice is retained in full, this product - * may be distributed under the terms of the GNU General Public License (GPL), - * in which case the provisions of the GPL apply INSTEAD OF those given above. - * - * DISCLAIMER - * - * This software is provided 'as is' with no explicit or implied warranties - * in respect of its properties, including, but not limited to, correctness - * and/or fitness for purpose. - * --------------------------------------------------------------------------- - */ - -/* Some changes from the Gladman version: - s/RIJNDAEL(e_key)/E_KEY/g - s/RIJNDAEL(d_key)/D_KEY/g -*/ - -#include <asm/byteorder.h> -#include <linux/bitops.h> -#include <linux/crypto.h> -#include <linux/errno.h> -#include <linux/init.h> -#include <linux/module.h> -#include <linux/types.h> - -#define AES_MIN_KEY_SIZE 16 -#define AES_MAX_KEY_SIZE 32 - -#define AES_BLOCK_SIZE 16 - -/* - * #define byte(x, nr) ((unsigned char)((x) >> (nr*8))) - */ -static inline u8 byte(const u32 x, const unsigned n) -{ - return x >> (n << 3); -} - -struct aes_ctx -{ - u32 key_length; - u32 buf[120]; -}; - -#define E_KEY (&ctx->buf[0]) -#define D_KEY (&ctx->buf[60]) - -static u8 pow_tab[256] __initdata; -static u8 log_tab[256] __initdata; -static u8 sbx_tab[256] __initdata; -static u8 isb_tab[256] __initdata; -static u32 rco_tab[10]; -u32 aes_ft_tab[4][256]; -u32 aes_it_tab[4][256]; - -u32 aes_fl_tab[4][256]; -u32 aes_il_tab[4][256]; - -static inline u8 f_mult(u8 a, u8 b) -{ - u8 aa = log_tab[a], cc = aa + log_tab[b]; - - return pow_tab[cc + (cc < aa ? 1 : 0)]; -} - -#define ff_mult(a, b) (a && b ? f_mult(a, b) : 0) - -#define ls_box(x) \ - (aes_fl_tab[0][byte(x, 0)] ^ \ - aes_fl_tab[1][byte(x, 1)] ^ \ - aes_fl_tab[2][byte(x, 2)] ^ \ - aes_fl_tab[3][byte(x, 3)]) - -static void __init gen_tabs(void) -{ - u32 i, t; - u8 p, q; - - /* log and power tables for GF(2**8) finite field with - 0x011b as modular polynomial - the simplest primitive - root is 0x03, used here to generate the tables */ - - for (i = 0, p = 1; i < 256; ++i) { - pow_tab[i] = (u8)p; - log_tab[p] = (u8)i; - - p ^= (p << 1) ^ (p & 0x80 ? 0x01b : 0); - } - - log_tab[1] = 0; - - for (i = 0, p = 1; i < 10; ++i) { - rco_tab[i] = p; - - p = (p << 1) ^ (p & 0x80 ? 0x01b : 0); - } - - for (i = 0; i < 256; ++i) { - p = (i ? pow_tab[255 - log_tab[i]] : 0); - q = ((p >> 7) | (p << 1)) ^ ((p >> 6) | (p << 2)); - p ^= 0x63 ^ q ^ ((q >> 6) | (q << 2)); - sbx_tab[i] = p; - isb_tab[p] = (u8)i; - } - - for (i = 0; i < 256; ++i) { - p = sbx_tab[i]; - - t = p; - aes_fl_tab[0][i] = t; - aes_fl_tab[1][i] = rol32(t, 8); - aes_fl_tab[2][i] = rol32(t, 16); - aes_fl_tab[3][i] = rol32(t, 24); - - t = ((u32)ff_mult(2, p)) | - ((u32)p << 8) | - ((u32)p << 16) | ((u32)ff_mult(3, p) << 24); - - aes_ft_tab[0][i] = t; - aes_ft_tab[1][i] = rol32(t, 8); - aes_ft_tab[2][i] = rol32(t, 16); - aes_ft_tab[3][i] = rol32(t, 24); - - p = isb_tab[i]; - - t = p; - aes_il_tab[0][i] = t; - aes_il_tab[1][i] = rol32(t, 8); - aes_il_tab[2][i] = rol32(t, 16); - aes_il_tab[3][i] = rol32(t, 24); - - t = ((u32)ff_mult(14, p)) | - ((u32)ff_mult(9, p) << 8) | - ((u32)ff_mult(13, p) << 16) | - ((u32)ff_mult(11, p) << 24); - - aes_it_tab[0][i] = t; - aes_it_tab[1][i] = rol32(t, 8); - aes_it_tab[2][i] = rol32(t, 16); - aes_it_tab[3][i] = rol32(t, 24); - } -} - -#define star_x(x) (((x) & 0x7f7f7f7f) << 1) ^ ((((x) & 0x80808080) >> 7) * 0x1b) - -#define imix_col(y, x) \ - u = star_x(x); \ - v = star_x(u); \ - w = star_x(v); \ - t = w ^ (x); \ - (y) = u ^ v ^ w; \ - (y) ^= ror32(u ^ t, 8) ^ \ - ror32(v ^ t, 16) ^ \ - ror32(t, 24) - -/* initialise the key schedule from the user supplied key */ - -#define loop4(i) \ -{ \ - t = ror32(t, 8); t = ls_box(t) ^ rco_tab[i]; \ - t ^= E_KEY[4 * i]; E_KEY[4 * i + 4] = t; \ - t ^= E_KEY[4 * i + 1]; E_KEY[4 * i + 5] = t; \ - t ^= E_KEY[4 * i + 2]; E_KEY[4 * i + 6] = t; \ - t ^= E_KEY[4 * i + 3]; E_KEY[4 * i + 7] = t; \ -} - -#define loop6(i) \ -{ \ - t = ror32(t, 8); t = ls_box(t) ^ rco_tab[i]; \ - t ^= E_KEY[6 * i]; E_KEY[6 * i + 6] = t; \ - t ^= E_KEY[6 * i + 1]; E_KEY[6 * i + 7] = t; \ - t ^= E_KEY[6 * i + 2]; E_KEY[6 * i + 8] = t; \ - t ^= E_KEY[6 * i + 3]; E_KEY[6 * i + 9] = t; \ - t ^= E_KEY[6 * i + 4]; E_KEY[6 * i + 10] = t; \ - t ^= E_KEY[6 * i + 5]; E_KEY[6 * i + 11] = t; \ -} - -#define loop8(i) \ -{ \ - t = ror32(t, 8); ; t = ls_box(t) ^ rco_tab[i]; \ - t ^= E_KEY[8 * i]; E_KEY[8 * i + 8] = t; \ - t ^= E_KEY[8 * i + 1]; E_KEY[8 * i + 9] = t; \ - t ^= E_KEY[8 * i + 2]; E_KEY[8 * i + 10] = t; \ - t ^= E_KEY[8 * i + 3]; E_KEY[8 * i + 11] = t; \ - t = E_KEY[8 * i + 4] ^ ls_box(t); \ - E_KEY[8 * i + 12] = t; \ - t ^= E_KEY[8 * i + 5]; E_KEY[8 * i + 13] = t; \ - t ^= E_KEY[8 * i + 6]; E_KEY[8 * i + 14] = t; \ - t ^= E_KEY[8 * i + 7]; E_KEY[8 * i + 15] = t; \ -} - -static int aes_set_key(struct crypto_tfm *tfm, const u8 *in_key, - unsigned int key_len) -{ - struct aes_ctx *ctx = crypto_tfm_ctx(tfm); - const __le32 *key = (const __le32 *)in_key; - u32 *flags = &tfm->crt_flags; - u32 i, j, t, u, v, w; - - if (key_len % 8) { - *flags |= CRYPTO_TFM_RES_BAD_KEY_LEN; - return -EINVAL; - } - - ctx->key_length = key_len; - - D_KEY[key_len + 24] = E_KEY[0] = le32_to_cpu(key[0]); - D_KEY[key_len + 25] = E_KEY[1] = le32_to_cpu(key[1]); - D_KEY[key_len + 26] = E_KEY[2] = le32_to_cpu(key[2]); - D_KEY[key_len + 27] = E_KEY[3] = le32_to_cpu(key[3]); - - switch (key_len) { - case 16: - t = E_KEY[3]; - for (i = 0; i < 10; ++i) - loop4(i); - break; - - case 24: - E_KEY[4] = le32_to_cpu(key[4]); - t = E_KEY[5] = le32_to_cpu(key[5]); - for (i = 0; i < 8; ++i) - loop6 (i); - break; - - case 32: - E_KEY[4] = le32_to_cpu(key[4]); - E_KEY[5] = le32_to_cpu(key[5]); - E_KEY[6] = le32_to_cpu(key[6]); - t = E_KEY[7] = le32_to_cpu(key[7]); - for (i = 0; i < 7; ++i) - loop8(i); - break; - } - - D_KEY[0] = E_KEY[key_len + 24]; - D_KEY[1] = E_KEY[key_len + 25]; - D_KEY[2] = E_KEY[key_len + 26]; - D_KEY[3] = E_KEY[key_len + 27]; - - for (i = 4; i < key_len + 24; ++i) { - j = key_len + 24 - (i & ~3) + (i & 3); - imix_col(D_KEY[j], E_KEY[i]); - } - - return 0; -} - -asmlinkage void aes_enc_blk(struct crypto_tfm *tfm, u8 *out, const u8 *in); -asmlinkage void aes_dec_blk(struct crypto_tfm *tfm, u8 *out, const u8 *in); - -static void aes_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src) -{ - aes_enc_blk(tfm, dst, src); -} - -static void aes_decrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src) -{ - aes_dec_blk(tfm, dst, src); -} - -static struct crypto_alg aes_alg = { - .cra_name = "aes", - .cra_driver_name = "aes-x86_64", - .cra_priority = 200, - .cra_flags = CRYPTO_ALG_TYPE_CIPHER, - .cra_blocksize = AES_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct aes_ctx), - .cra_module = THIS_MODULE, - .cra_list = LIST_HEAD_INIT(aes_alg.cra_list), - .cra_u = { - .cipher = { - .cia_min_keysize = AES_MIN_KEY_SIZE, - .cia_max_keysize = AES_MAX_KEY_SIZE, - .cia_setkey = aes_set_key, - .cia_encrypt = aes_encrypt, - .cia_decrypt = aes_decrypt - } - } -}; - -static int __init aes_init(void) -{ - gen_tabs(); - return crypto_register_alg(&aes_alg); -} - -static void __exit aes_fini(void) -{ - crypto_unregister_alg(&aes_alg); -} - -module_init(aes_init); -module_exit(aes_fini); - -MODULE_DESCRIPTION("Rijndael (AES) Cipher Algorithm"); -MODULE_LICENSE("GPL"); -MODULE_ALIAS("aes"); diff --git a/arch/x86_64/crypto/twofish-x86_64-asm.S b/arch/x86_64/crypto/twofish-x86_64-asm.S deleted file mode 100644 index 35974a586615..000000000000 --- a/arch/x86_64/crypto/twofish-x86_64-asm.S +++ /dev/null @@ -1,324 +0,0 @@ -/*************************************************************************** -* Copyright (C) 2006 by Joachim Fritschi, <jfritschi@freenet.de> * -* * -* This program is free software; you can redistribute it and/or modify * -* it under the terms of the GNU General Public License as published by * -* the Free Software Foundation; either version 2 of the License, or * -* (at your option) any later version. * -* * -* This program is distributed in the hope that it will be useful, * -* but WITHOUT ANY WARRANTY; without even the implied warranty of * -* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * -* GNU General Public License for more details. * -* * -* You should have received a copy of the GNU General Public License * -* along with this program; if not, write to the * -* Free Software Foundation, Inc., * -* 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. * -***************************************************************************/ - -.file "twofish-x86_64-asm.S" -.text - -#include <asm/asm-offsets.h> - -#define a_offset 0 -#define b_offset 4 -#define c_offset 8 -#define d_offset 12 - -/* Structure of the crypto context struct*/ - -#define s0 0 /* S0 Array 256 Words each */ -#define s1 1024 /* S1 Array */ -#define s2 2048 /* S2 Array */ -#define s3 3072 /* S3 Array */ -#define w 4096 /* 8 whitening keys (word) */ -#define k 4128 /* key 1-32 ( word ) */ - -/* define a few register aliases to allow macro substitution */ - -#define R0 %rax -#define R0D %eax -#define R0B %al -#define R0H %ah - -#define R1 %rbx -#define R1D %ebx -#define R1B %bl -#define R1H %bh - -#define R2 %rcx -#define R2D %ecx -#define R2B %cl -#define R2H %ch - -#define R3 %rdx -#define R3D %edx -#define R3B %dl -#define R3H %dh - - -/* performs input whitening */ -#define input_whitening(src,context,offset)\ - xor w+offset(context), src; - -/* performs input whitening */ -#define output_whitening(src,context,offset)\ - xor w+16+offset(context), src; - - -/* - * a input register containing a (rotated 16) - * b input register containing b - * c input register containing c - * d input register containing d (already rol $1) - * operations on a and b are interleaved to increase performance - */ -#define encrypt_round(a,b,c,d,round)\ - movzx b ## B, %edi;\ - mov s1(%r11,%rdi,4),%r8d;\ - movzx a ## B, %edi;\ - mov s2(%r11,%rdi,4),%r9d;\ - movzx b ## H, %edi;\ - ror $16, b ## D;\ - xor s2(%r11,%rdi,4),%r8d;\ - movzx a ## H, %edi;\ - ror $16, a ## D;\ - xor s3(%r11,%rdi,4),%r9d;\ - movzx b ## B, %edi;\ - xor s3(%r11,%rdi,4),%r8d;\ - movzx a ## B, %edi;\ - xor (%r11,%rdi,4), %r9d;\ - movzx b ## H, %edi;\ - ror $15, b ## D;\ - xor (%r11,%rdi,4), %r8d;\ - movzx a ## H, %edi;\ - xor s1(%r11,%rdi,4),%r9d;\ - add %r8d, %r9d;\ - add %r9d, %r8d;\ - add k+round(%r11), %r9d;\ - xor %r9d, c ## D;\ - rol $15, c ## D;\ - add k+4+round(%r11),%r8d;\ - xor %r8d, d ## D; - -/* - * a input register containing a(rotated 16) - * b input register containing b - * c input register containing c - * d input register containing d (already rol $1) - * operations on a and b are interleaved to increase performance - * during the round a and b are prepared for the output whitening - */ -#define encrypt_last_round(a,b,c,d,round)\ - mov b ## D, %r10d;\ - shl $32, %r10;\ - movzx b ## B, %edi;\ - mov s1(%r11,%rdi,4),%r8d;\ - movzx a ## B, %edi;\ - mov s2(%r11,%rdi,4),%r9d;\ - movzx b ## H, %edi;\ - ror $16, b ## D;\ - xor s2(%r11,%rdi,4),%r8d;\ - movzx a ## H, %edi;\ - ror $16, a ## D;\ - xor s3(%r11,%rdi,4),%r9d;\ - movzx b ## B, %edi;\ - xor s3(%r11,%rdi,4),%r8d;\ - movzx a ## B, %edi;\ - xor (%r11,%rdi,4), %r9d;\ - xor a, %r10;\ - movzx b ## H, %edi;\ - xor (%r11,%rdi,4), %r8d;\ - movzx a ## H, %edi;\ - xor s1(%r11,%rdi,4),%r9d;\ - add %r8d, %r9d;\ - add %r9d, %r8d;\ - add k+round(%r11), %r9d;\ - xor %r9d, c ## D;\ - ror $1, c ## D;\ - add k+4+round(%r11),%r8d;\ - xor %r8d, d ## D - -/* - * a input register containing a - * b input register containing b (rotated 16) - * c input register containing c (already rol $1) - * d input register containing d - * operations on a and b are interleaved to increase performance - */ -#define decrypt_round(a,b,c,d,round)\ - movzx a ## B, %edi;\ - mov (%r11,%rdi,4), %r9d;\ - movzx b ## B, %edi;\ - mov s3(%r11,%rdi,4),%r8d;\ - movzx a ## H, %edi;\ - ror $16, a ## D;\ - xor s1(%r11,%rdi,4),%r9d;\ - movzx b ## H, %edi;\ - ror $16, b ## D;\ - xor (%r11,%rdi,4), %r8d;\ - movzx a ## B, %edi;\ - xor s2(%r11,%rdi,4),%r9d;\ - movzx b ## B, %edi;\ - xor s1(%r11,%rdi,4),%r8d;\ - movzx a ## H, %edi;\ - ror $15, a ## D;\ - xor s3(%r11,%rdi,4),%r9d;\ - movzx b ## H, %edi;\ - xor s2(%r11,%rdi,4),%r8d;\ - add %r8d, %r9d;\ - add %r9d, %r8d;\ - add k+round(%r11), %r9d;\ - xor %r9d, c ## D;\ - add k+4+round(%r11),%r8d;\ - xor %r8d, d ## D;\ - rol $15, d ## D; - -/* - * a input register containing a - * b input register containing b - * c input register containing c (already rol $1) - * d input register containing d - * operations on a and b are interleaved to increase performance - * during the round a and b are prepared for the output whitening - */ -#define decrypt_last_round(a,b,c,d,round)\ - movzx a ## B, %edi;\ - mov (%r11,%rdi,4), %r9d;\ - movzx b ## B, %edi;\ - mov s3(%r11,%rdi,4),%r8d;\ - movzx b ## H, %edi;\ - ror $16, b ## D;\ - xor (%r11,%rdi,4), %r8d;\ - movzx a ## H, %edi;\ - mov b ## D, %r10d;\ - shl $32, %r10;\ - xor a, %r10;\ - ror $16, a ## D;\ - xor s1(%r11,%rdi,4),%r9d;\ - movzx b ## B, %edi;\ - xor s1(%r11,%rdi,4),%r8d;\ - movzx a ## B, %edi;\ - xor s2(%r11,%rdi,4),%r9d;\ - movzx b ## H, %edi;\ - xor s2(%r11,%rdi,4),%r8d;\ - movzx a ## H, %edi;\ - xor s3(%r11,%rdi,4),%r9d;\ - add %r8d, %r9d;\ - add %r9d, %r8d;\ - add k+round(%r11), %r9d;\ - xor %r9d, c ## D;\ - add k+4+round(%r11),%r8d;\ - xor %r8d, d ## D;\ - ror $1, d ## D; - -.align 8 -.global twofish_enc_blk -.global twofish_dec_blk - -twofish_enc_blk: - pushq R1 - - /* %rdi contains the crypto tfm adress */ - /* %rsi contains the output adress */ - /* %rdx contains the input adress */ - add $crypto_tfm_ctx_offset, %rdi /* set ctx adress */ - /* ctx adress is moved to free one non-rex register - as target for the 8bit high operations */ - mov %rdi, %r11 - - movq (R3), R1 - movq 8(R3), R3 - input_whitening(R1,%r11,a_offset) - input_whitening(R3,%r11,c_offset) - mov R1D, R0D - rol $16, R0D - shr $32, R1 - mov R3D, R2D - shr $32, R3 - rol $1, R3D - - encrypt_round(R0,R1,R2,R3,0); - encrypt_round(R2,R3,R0,R1,8); - encrypt_round(R0,R1,R2,R3,2*8); - encrypt_round(R2,R3,R0,R1,3*8); - encrypt_round(R0,R1,R2,R3,4*8); - encrypt_round(R2,R3,R0,R1,5*8); - encrypt_round(R0,R1,R2,R3,6*8); - encrypt_round(R2,R3,R0,R1,7*8); - encrypt_round(R0,R1,R2,R3,8*8); - encrypt_round(R2,R3,R0,R1,9*8); - encrypt_round(R0,R1,R2,R3,10*8); - encrypt_round(R2,R3,R0,R1,11*8); - encrypt_round(R0,R1,R2,R3,12*8); - encrypt_round(R2,R3,R0,R1,13*8); - encrypt_round(R0,R1,R2,R3,14*8); - encrypt_last_round(R2,R3,R0,R1,15*8); - - - output_whitening(%r10,%r11,a_offset) - movq %r10, (%rsi) - - shl $32, R1 - xor R0, R1 - - output_whitening(R1,%r11,c_offset) - movq R1, 8(%rsi) - - popq R1 - movq $1,%rax - ret - -twofish_dec_blk: - pushq R1 - - /* %rdi contains the crypto tfm adress */ - /* %rsi contains the output adress */ - /* %rdx contains the input adress */ - add $crypto_tfm_ctx_offset, %rdi /* set ctx adress */ - /* ctx adress is moved to free one non-rex register - as target for the 8bit high operations */ - mov %rdi, %r11 - - movq (R3), R1 - movq 8(R3), R3 - output_whitening(R1,%r11,a_offset) - output_whitening(R3,%r11,c_offset) - mov R1D, R0D - shr $32, R1 - rol $16, R1D - mov R3D, R2D - shr $32, R3 - rol $1, R2D - - decrypt_round(R0,R1,R2,R3,15*8); - decrypt_round(R2,R3,R0,R1,14*8); - decrypt_round(R0,R1,R2,R3,13*8); - decrypt_round(R2,R3,R0,R1,12*8); - decrypt_round(R0,R1,R2,R3,11*8); - decrypt_round(R2,R3,R0,R1,10*8); - decrypt_round(R0,R1,R2,R3,9*8); - decrypt_round(R2,R3,R0,R1,8*8); - decrypt_round(R0,R1,R2,R3,7*8); - decrypt_round(R2,R3,R0,R1,6*8); - decrypt_round(R0,R1,R2,R3,5*8); - decrypt_round(R2,R3,R0,R1,4*8); - decrypt_round(R0,R1,R2,R3,3*8); - decrypt_round(R2,R3,R0,R1,2*8); - decrypt_round(R0,R1,R2,R3,1*8); - decrypt_last_round(R2,R3,R0,R1,0); - - input_whitening(%r10,%r11,a_offset) - movq %r10, (%rsi) - - shl $32, R1 - xor R0, R1 - - input_whitening(R1,%r11,c_offset) - movq R1, 8(%rsi) - - popq R1 - movq $1,%rax - ret diff --git a/arch/x86_64/crypto/twofish.c b/arch/x86_64/crypto/twofish.c deleted file mode 100644 index 182d91d5cfb9..000000000000 --- a/arch/x86_64/crypto/twofish.c +++ /dev/null @@ -1,97 +0,0 @@ -/* - * Glue Code for optimized x86_64 assembler version of TWOFISH - * - * Originally Twofish for GPG - * By Matthew Skala <mskala@ansuz.sooke.bc.ca>, July 26, 1998 - * 256-bit key length added March 20, 1999 - * Some modifications to reduce the text size by Werner Koch, April, 1998 - * Ported to the kerneli patch by Marc Mutz <Marc@Mutz.com> - * Ported to CryptoAPI by Colin Slater <hoho@tacomeat.net> - * - * The original author has disclaimed all copyright interest in this - * code and thus put it in the public domain. The subsequent authors - * have put this under the GNU General Public License. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 - * USA - * - * This code is a "clean room" implementation, written from the paper - * _Twofish: A 128-Bit Block Cipher_ by Bruce Schneier, John Kelsey, - * Doug Whiting, David Wagner, Chris Hall, and Niels Ferguson, available - * through http://www.counterpane.com/twofish.html - * - * For background information on multiplication in finite fields, used for - * the matrix operations in the key schedule, see the book _Contemporary - * Abstract Algebra_ by Joseph A. Gallian, especially chapter 22 in the - * Third Edition. - */ - -#include <crypto/twofish.h> -#include <linux/crypto.h> -#include <linux/init.h> -#include <linux/kernel.h> -#include <linux/module.h> -#include <linux/types.h> - -asmlinkage void twofish_enc_blk(struct crypto_tfm *tfm, u8 *dst, const u8 *src); -asmlinkage void twofish_dec_blk(struct crypto_tfm *tfm, u8 *dst, const u8 *src); - -static void twofish_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src) -{ - twofish_enc_blk(tfm, dst, src); -} - -static void twofish_decrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src) -{ - twofish_dec_blk(tfm, dst, src); -} - -static struct crypto_alg alg = { - .cra_name = "twofish", - .cra_driver_name = "twofish-x86_64", - .cra_priority = 200, - .cra_flags = CRYPTO_ALG_TYPE_CIPHER, - .cra_blocksize = TF_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct twofish_ctx), - .cra_alignmask = 3, - .cra_module = THIS_MODULE, - .cra_list = LIST_HEAD_INIT(alg.cra_list), - .cra_u = { - .cipher = { - .cia_min_keysize = TF_MIN_KEY_SIZE, - .cia_max_keysize = TF_MAX_KEY_SIZE, - .cia_setkey = twofish_setkey, - .cia_encrypt = twofish_encrypt, - .cia_decrypt = twofish_decrypt - } - } -}; - -static int __init init(void) -{ - return crypto_register_alg(&alg); -} - -static void __exit fini(void) -{ - crypto_unregister_alg(&alg); -} - -module_init(init); -module_exit(fini); - -MODULE_LICENSE("GPL"); -MODULE_DESCRIPTION ("Twofish Cipher Algorithm, x86_64 asm optimized"); -MODULE_ALIAS("twofish"); diff --git a/arch/x86_64/ia32/Makefile b/arch/x86_64/ia32/Makefile deleted file mode 100644 index cdae36435e21..000000000000 --- a/arch/x86_64/ia32/Makefile +++ /dev/null @@ -1,35 +0,0 @@ -# -# Makefile for the ia32 kernel emulation subsystem. -# - -obj-$(CONFIG_IA32_EMULATION) := ia32entry.o sys_ia32.o ia32_signal.o tls32.o \ - ia32_binfmt.o fpu32.o ptrace32.o syscall32.o syscall32_syscall.o \ - mmap32.o - -sysv-$(CONFIG_SYSVIPC) := ipc32.o -obj-$(CONFIG_IA32_EMULATION) += $(sysv-y) - -obj-$(CONFIG_IA32_AOUT) += ia32_aout.o - -audit-class-$(CONFIG_AUDIT) := audit.o -obj-$(CONFIG_IA32_EMULATION) += $(audit-class-y) - -$(obj)/syscall32_syscall.o: \ - $(foreach F,sysenter syscall,$(obj)/vsyscall-$F.so) - -# Teach kbuild about targets -targets := $(foreach F,sysenter syscall,vsyscall-$F.o vsyscall-$F.so) - -# The DSO images are built using a special linker script -quiet_cmd_syscall = SYSCALL $@ - cmd_syscall = $(CC) -m32 -nostdlib -shared -s \ - $(call ld-option, -Wl$(comma)--hash-style=sysv) \ - -Wl,-soname=linux-gate.so.1 -o $@ \ - -Wl,-T,$(filter-out FORCE,$^) - -$(obj)/vsyscall-sysenter.so $(obj)/vsyscall-syscall.so: \ -$(obj)/vsyscall-%.so: $(src)/vsyscall.lds $(obj)/vsyscall-%.o FORCE - $(call if_changed,syscall) - -AFLAGS_vsyscall-sysenter.o = -m32 -Wa,-32 -AFLAGS_vsyscall-syscall.o = -m32 -Wa,-32 diff --git a/arch/x86_64/ia32/audit.c b/arch/x86_64/ia32/audit.c deleted file mode 100644 index 8850fe40ea34..000000000000 --- a/arch/x86_64/ia32/audit.c +++ /dev/null @@ -1,42 +0,0 @@ -#include <asm-i386/unistd.h> - -unsigned ia32_dir_class[] = { -#include <asm-generic/audit_dir_write.h> -~0U -}; - -unsigned ia32_chattr_class[] = { -#include <asm-generic/audit_change_attr.h> -~0U -}; - -unsigned ia32_write_class[] = { -#include <asm-generic/audit_write.h> -~0U -}; - -unsigned ia32_read_class[] = { -#include <asm-generic/audit_read.h> -~0U -}; - -unsigned ia32_signal_class[] = { -#include <asm-generic/audit_signal.h> -~0U -}; - -int ia32_classify_syscall(unsigned syscall) -{ - switch(syscall) { - case __NR_open: - return 2; - case __NR_openat: - return 3; - case __NR_socketcall: - return 4; - case __NR_execve: - return 5; - default: - return 1; - } -} diff --git a/arch/x86_64/ia32/fpu32.c b/arch/x86_64/ia32/fpu32.c deleted file mode 100644 index 2c8209a3605a..000000000000 --- a/arch/x86_64/ia32/fpu32.c +++ /dev/null @@ -1,183 +0,0 @@ -/* - * Copyright 2002 Andi Kleen, SuSE Labs. - * FXSAVE<->i387 conversion support. Based on code by Gareth Hughes. - * This is used for ptrace, signals and coredumps in 32bit emulation. - */ - -#include <linux/sched.h> -#include <asm/sigcontext32.h> -#include <asm/processor.h> -#include <asm/uaccess.h> -#include <asm/i387.h> - -static inline unsigned short twd_i387_to_fxsr(unsigned short twd) -{ - unsigned int tmp; /* to avoid 16 bit prefixes in the code */ - - /* Transform each pair of bits into 01 (valid) or 00 (empty) */ - tmp = ~twd; - tmp = (tmp | (tmp>>1)) & 0x5555; /* 0V0V0V0V0V0V0V0V */ - /* and move the valid bits to the lower byte. */ - tmp = (tmp | (tmp >> 1)) & 0x3333; /* 00VV00VV00VV00VV */ - tmp = (tmp | (tmp >> 2)) & 0x0f0f; /* 0000VVVV0000VVVV */ - tmp = (tmp | (tmp >> 4)) & 0x00ff; /* 00000000VVVVVVVV */ - return tmp; -} - -static inline unsigned long twd_fxsr_to_i387(struct i387_fxsave_struct *fxsave) -{ - struct _fpxreg *st = NULL; - unsigned long tos = (fxsave->swd >> 11) & 7; - unsigned long twd = (unsigned long) fxsave->twd; - unsigned long tag; - unsigned long ret = 0xffff0000; - int i; - -#define FPREG_ADDR(f, n) ((void *)&(f)->st_space + (n) * 16); - - for (i = 0 ; i < 8 ; i++) { - if (twd & 0x1) { - st = FPREG_ADDR( fxsave, (i - tos) & 7 ); - - switch (st->exponent & 0x7fff) { - case 0x7fff: - tag = 2; /* Special */ - break; - case 0x0000: - if ( !st->significand[0] && - !st->significand[1] && - !st->significand[2] && - !st->significand[3] ) { - tag = 1; /* Zero */ - } else { - tag = 2; /* Special */ - } - break; - default: - if (st->significand[3] & 0x8000) { - tag = 0; /* Valid */ - } else { - tag = 2; /* Special */ - } - break; - } - } else { - tag = 3; /* Empty */ - } - ret |= (tag << (2 * i)); - twd = twd >> 1; - } - return ret; -} - - -static inline int convert_fxsr_from_user(struct i387_fxsave_struct *fxsave, - struct _fpstate_ia32 __user *buf) -{ - struct _fpxreg *to; - struct _fpreg __user *from; - int i; - u32 v; - int err = 0; - -#define G(num,val) err |= __get_user(val, num + (u32 __user *)buf) - G(0, fxsave->cwd); - G(1, fxsave->swd); - G(2, fxsave->twd); - fxsave->twd = twd_i387_to_fxsr(fxsave->twd); - G(3, fxsave->rip); - G(4, v); - fxsave->fop = v>>16; /* cs ignored */ - G(5, fxsave->rdp); - /* 6: ds ignored */ -#undef G - if (err) - return -1; - - to = (struct _fpxreg *)&fxsave->st_space[0]; - from = &buf->_st[0]; - for (i = 0 ; i < 8 ; i++, to++, from++) { - if (__copy_from_user(to, from, sizeof(*from))) - return -1; - } - return 0; -} - - -static inline int convert_fxsr_to_user(struct _fpstate_ia32 __user *buf, - struct i387_fxsave_struct *fxsave, - struct pt_regs *regs, - struct task_struct *tsk) -{ - struct _fpreg __user *to; - struct _fpxreg *from; - int i; - u16 cs,ds; - int err = 0; - - if (tsk == current) { - /* should be actually ds/cs at fpu exception time, - but that information is not available in 64bit mode. */ - asm("movw %%ds,%0 " : "=r" (ds)); - asm("movw %%cs,%0 " : "=r" (cs)); - } else { /* ptrace. task has stopped. */ - ds = tsk->thread.ds; - cs = regs->cs; - } - -#define P(num,val) err |= __put_user(val, num + (u32 __user *)buf) - P(0, (u32)fxsave->cwd | 0xffff0000); - P(1, (u32)fxsave->swd | 0xffff0000); - P(2, twd_fxsr_to_i387(fxsave)); - P(3, (u32)fxsave->rip); - P(4, cs | ((u32)fxsave->fop) << 16); - P(5, fxsave->rdp); - P(6, 0xffff0000 | ds); -#undef P - - if (err) - return -1; - - to = &buf->_st[0]; - from = (struct _fpxreg *) &fxsave->st_space[0]; - for ( i = 0 ; i < 8 ; i++, to++, from++ ) { - if (__copy_to_user(to, from, sizeof(*to))) - return -1; - } - return 0; -} - -int restore_i387_ia32(struct task_struct *tsk, struct _fpstate_ia32 __user *buf, int fsave) -{ - clear_fpu(tsk); - if (!fsave) { - if (__copy_from_user(&tsk->thread.i387.fxsave, - &buf->_fxsr_env[0], - sizeof(struct i387_fxsave_struct))) - return -1; - tsk->thread.i387.fxsave.mxcsr &= mxcsr_feature_mask; - set_stopped_child_used_math(tsk); - } - return convert_fxsr_from_user(&tsk->thread.i387.fxsave, buf); -} - -int save_i387_ia32(struct task_struct *tsk, - struct _fpstate_ia32 __user *buf, - struct pt_regs *regs, - int fsave) -{ - int err = 0; - - init_fpu(tsk); - if (convert_fxsr_to_user(buf, &tsk->thread.i387.fxsave, regs, tsk)) - return -1; - if (fsave) - return 0; - err |= __put_user(tsk->thread.i387.fxsave.swd, &buf->status); - if (fsave) - return err ? -1 : 1; - err |= __put_user(X86_FXSR_MAGIC, &buf->magic); - err |= __copy_to_user(&buf->_fxsr_env[0], &tsk->thread.i387.fxsave, - sizeof(struct i387_fxsave_struct)); - return err ? -1 : 1; -} diff --git a/arch/x86_64/ia32/ia32_aout.c b/arch/x86_64/ia32/ia32_aout.c deleted file mode 100644 index 08781370256d..000000000000 --- a/arch/x86_64/ia32/ia32_aout.c +++ /dev/null @@ -1,528 +0,0 @@ -/* - * a.out loader for x86-64 - * - * Copyright (C) 1991, 1992, 1996 Linus Torvalds - * Hacked together by Andi Kleen - */ - -#include <linux/module.h> - -#include <linux/time.h> -#include <linux/kernel.h> -#include <linux/mm.h> -#include <linux/mman.h> -#include <linux/a.out.h> -#include <linux/errno.h> -#include <linux/signal.h> -#include <linux/string.h> -#include <linux/fs.h> -#include <linux/file.h> -#include <linux/stat.h> -#include <linux/fcntl.h> -#include <linux/ptrace.h> -#include <linux/user.h> -#include <linux/slab.h> -#include <linux/binfmts.h> -#include <linux/personality.h> -#include <linux/init.h> - -#include <asm/system.h> -#include <asm/uaccess.h> -#include <asm/pgalloc.h> -#include <asm/cacheflush.h> -#include <asm/user32.h> -#include <asm/ia32.h> - -#undef WARN_OLD -#undef CORE_DUMP /* probably broken */ - -static int load_aout_binary(struct linux_binprm *, struct pt_regs * regs); -static int load_aout_library(struct file*); - -#ifdef CORE_DUMP -static int aout_core_dump(long signr, struct pt_regs * regs, struct file *file); - -/* - * fill in the user structure for a core dump.. - */ -static void dump_thread32(struct pt_regs * regs, struct user32 * dump) -{ - u32 fs,gs; - -/* changed the size calculations - should hopefully work better. lbt */ - dump->magic = CMAGIC; - dump->start_code = 0; - dump->start_stack = regs->rsp & ~(PAGE_SIZE - 1); - dump->u_tsize = ((unsigned long) current->mm->end_code) >> PAGE_SHIFT; - dump->u_dsize = ((unsigned long) (current->mm->brk + (PAGE_SIZE-1))) >> PAGE_SHIFT; - dump->u_dsize -= dump->u_tsize; - dump->u_ssize = 0; - dump->u_debugreg[0] = current->thread.debugreg0; - dump->u_debugreg[1] = current->thread.debugreg1; - dump->u_debugreg[2] = current->thread.debugreg2; - dump->u_debugreg[3] = current->thread.debugreg3; - dump->u_debugreg[4] = 0; - dump->u_debugreg[5] = 0; - dump->u_debugreg[6] = current->thread.debugreg6; - dump->u_debugreg[7] = current->thread.debugreg7; - - if (dump->start_stack < 0xc0000000) - dump->u_ssize = ((unsigned long) (0xc0000000 - dump->start_stack)) >> PAGE_SHIFT; - - dump->regs.ebx = regs->rbx; - dump->regs.ecx = regs->rcx; - dump->regs.edx = regs->rdx; - dump->regs.esi = regs->rsi; - dump->regs.edi = regs->rdi; - dump->regs.ebp = regs->rbp; - dump->regs.eax = regs->rax; - dump->regs.ds = current->thread.ds; - dump->regs.es = current->thread.es; - asm("movl %%fs,%0" : "=r" (fs)); dump->regs.fs = fs; - asm("movl %%gs,%0" : "=r" (gs)); dump->regs.gs = gs; - dump->regs.orig_eax = regs->orig_rax; - dump->regs.eip = regs->rip; - dump->regs.cs = regs->cs; - dump->regs.eflags = regs->eflags; - dump->regs.esp = regs->rsp; - dump->regs.ss = regs->ss; - -#if 1 /* FIXME */ - dump->u_fpvalid = 0; -#else - dump->u_fpvalid = dump_fpu (regs, &dump->i387); -#endif -} - -#endif - -static struct linux_binfmt aout_format = { - .module = THIS_MODULE, - .load_binary = load_aout_binary, - .load_shlib = load_aout_library, -#ifdef CORE_DUMP - .core_dump = aout_core_dump, -#endif - .min_coredump = PAGE_SIZE -}; - -static void set_brk(unsigned long start, unsigned long end) -{ - start = PAGE_ALIGN(start); - end = PAGE_ALIGN(end); - if (end <= start) - return; - down_write(¤t->mm->mmap_sem); - do_brk(start, end - start); - up_write(¤t->mm->mmap_sem); -} - -#ifdef CORE_DUMP -/* - * These are the only things you should do on a core-file: use only these - * macros to write out all the necessary info. - */ - -static int dump_write(struct file *file, const void *addr, int nr) -{ - return file->f_op->write(file, addr, nr, &file->f_pos) == nr; -} - -#define DUMP_WRITE(addr, nr) \ - if (!dump_write(file, (void *)(addr), (nr))) \ - goto end_coredump; - -#define DUMP_SEEK(offset) \ -if (file->f_op->llseek) { \ - if (file->f_op->llseek(file,(offset),0) != (offset)) \ - goto end_coredump; \ -} else file->f_pos = (offset) - -/* - * Routine writes a core dump image in the current directory. - * Currently only a stub-function. - * - * Note that setuid/setgid files won't make a core-dump if the uid/gid - * changed due to the set[u|g]id. It's enforced by the "current->mm->dumpable" - * field, which also makes sure the core-dumps won't be recursive if the - * dumping of the process results in another error.. - */ - -static int aout_core_dump(long signr, struct pt_regs * regs, struct file *file) -{ - mm_segment_t fs; - int has_dumped = 0; - unsigned long dump_start, dump_size; - struct user32 dump; -# define START_DATA(u) (u.u_tsize << PAGE_SHIFT) -# define START_STACK(u) (u.start_stack) - - fs = get_fs(); - set_fs(KERNEL_DS); - has_dumped = 1; - current->flags |= PF_DUMPCORE; - strncpy(dump.u_comm, current->comm, sizeof(current->comm)); - dump.u_ar0 = (u32)(((unsigned long)(&dump.regs)) - ((unsigned long)(&dump))); - dump.signal = signr; - dump_thread32(regs, &dump); - -/* If the size of the dump file exceeds the rlimit, then see what would happen - if we wrote the stack, but not the data area. */ - if ((dump.u_dsize+dump.u_ssize+1) * PAGE_SIZE > - current->signal->rlim[RLIMIT_CORE].rlim_cur) - dump.u_dsize = 0; - -/* Make sure we have enough room to write the stack and data areas. */ - if ((dump.u_ssize+1) * PAGE_SIZE > - current->signal->rlim[RLIMIT_CORE].rlim_cur) - dump.u_ssize = 0; - -/* make sure we actually have a data and stack area to dump */ - set_fs(USER_DS); - if (!access_ok(VERIFY_READ, (void *) (unsigned long)START_DATA(dump), dump.u_dsize << PAGE_SHIFT)) - dump.u_dsize = 0; - if (!access_ok(VERIFY_READ, (void *) (unsigned long)START_STACK(dump), dump.u_ssize << PAGE_SHIFT)) - dump.u_ssize = 0; - - set_fs(KERNEL_DS); -/* struct user */ - DUMP_WRITE(&dump,sizeof(dump)); -/* Now dump all of the user data. Include malloced stuff as well */ - DUMP_SEEK(PAGE_SIZE); -/* now we start writing out the user space info */ - set_fs(USER_DS); -/* Dump the data area */ - if (dump.u_dsize != 0) { - dump_start = START_DATA(dump); - dump_size = dump.u_dsize << PAGE_SHIFT; - DUMP_WRITE(dump_start,dump_size); - } -/* Now prepare to dump the stack area */ - if (dump.u_ssize != 0) { - dump_start = START_STACK(dump); - dump_size = dump.u_ssize << PAGE_SHIFT; - DUMP_WRITE(dump_start,dump_size); - } -/* Finally dump the task struct. Not be used by gdb, but could be useful */ - set_fs(KERNEL_DS); - DUMP_WRITE(current,sizeof(*current)); -end_coredump: - set_fs(fs); - return has_dumped; -} -#endif - -/* - * create_aout_tables() parses the env- and arg-strings in new user - * memory and creates the pointer tables from them, and puts their - * addresses on the "stack", returning the new stack pointer value. - */ -static u32 __user *create_aout_tables(char __user *p, struct linux_binprm *bprm) -{ - u32 __user *argv; - u32 __user *envp; - u32 __user *sp; - int argc = bprm->argc; - int envc = bprm->envc; - - sp = (u32 __user *) ((-(unsigned long)sizeof(u32)) & (unsigned long) p); - sp -= envc+1; - envp = sp; - sp -= argc+1; - argv = sp; - put_user((unsigned long) envp,--sp); - put_user((unsigned long) argv,--sp); - put_user(argc,--sp); - current->mm->arg_start = (unsigned long) p; - while (argc-->0) { - char c; - put_user((u32)(unsigned long)p,argv++); - do { - get_user(c,p++); - } while (c); - } - put_user(0, argv); - current->mm->arg_end = current->mm->env_start = (unsigned long) p; - while (envc-->0) { - char c; - put_user((u32)(unsigned long)p,envp++); - do { - get_user(c,p++); - } while (c); - } - put_user(0, envp); - current->mm->env_end = (unsigned long) p; - return sp; -} - -/* - * These are the functions used to load a.out style executables and shared - * libraries. There is no binary dependent code anywhere else. - */ - -static int load_aout_binary(struct linux_binprm * bprm, struct pt_regs * regs) -{ - struct exec ex; - unsigned long error; - unsigned long fd_offset; - unsigned long rlim; - int retval; - - ex = *((struct exec *) bprm->buf); /* exec-header */ - if ((N_MAGIC(ex) != ZMAGIC && N_MAGIC(ex) != OMAGIC && - N_MAGIC(ex) != QMAGIC && N_MAGIC(ex) != NMAGIC) || - N_TRSIZE(ex) || N_DRSIZE(ex) || - i_size_read(bprm->file->f_path.dentry->d_inode) < ex.a_text+ex.a_data+N_SYMSIZE(ex)+N_TXTOFF(ex)) { - return -ENOEXEC; - } - - fd_offset = N_TXTOFF(ex); - - /* Check initial limits. This avoids letting people circumvent - * size limits imposed on them by creating programs with large - * arrays in the data or bss. - */ - rlim = current->signal->rlim[RLIMIT_DATA].rlim_cur; - if (rlim >= RLIM_INFINITY) - rlim = ~0; - if (ex.a_data + ex.a_bss > rlim) - return -ENOMEM; - - /* Flush all traces of the currently running executable */ - retval = flush_old_exec(bprm); - if (retval) - return retval; - - regs->cs = __USER32_CS; - regs->r8 = regs->r9 = regs->r10 = regs->r11 = regs->r12 = - regs->r13 = regs->r14 = regs->r15 = 0; - - /* OK, This is the point of no return */ - set_personality(PER_LINUX); - set_thread_flag(TIF_IA32); - clear_thread_flag(TIF_ABI_PENDING); - - current->mm->end_code = ex.a_text + - (current->mm->start_code = N_TXTADDR(ex)); - current->mm->end_data = ex.a_data + - (current->mm->start_data = N_DATADDR(ex)); - current->mm->brk = ex.a_bss + - (current->mm->start_brk = N_BSSADDR(ex)); - current->mm->free_area_cache = TASK_UNMAPPED_BASE; - current->mm->cached_hole_size = 0; - - current->mm->mmap = NULL; - compute_creds(bprm); - current->flags &= ~PF_FORKNOEXEC; - - if (N_MAGIC(ex) == OMAGIC) { - unsigned long text_addr, map_size; - loff_t pos; - - text_addr = N_TXTADDR(ex); - - pos = 32; - map_size = ex.a_text+ex.a_data; - - down_write(¤t->mm->mmap_sem); - error = do_brk(text_addr & PAGE_MASK, map_size); - up_write(¤t->mm->mmap_sem); - - if (error != (text_addr & PAGE_MASK)) { - send_sig(SIGKILL, current, 0); - return error; - } - - error = bprm->file->f_op->read(bprm->file, - (char __user *)text_addr, - ex.a_text+ex.a_data, &pos); - if ((signed long)error < 0) { - send_sig(SIGKILL, current, 0); - return error; - } - - flush_icache_range(text_addr, text_addr+ex.a_text+ex.a_data); - } else { -#ifdef WARN_OLD - static unsigned long error_time, error_time2; - if ((ex.a_text & 0xfff || ex.a_data & 0xfff) && - (N_MAGIC(ex) != NMAGIC) && (jiffies-error_time2) > 5*HZ) - { - printk(KERN_NOTICE "executable not page aligned\n"); - error_time2 = jiffies; - } - - if ((fd_offset & ~PAGE_MASK) != 0 && - (jiffies-error_time) > 5*HZ) - { - printk(KERN_WARNING - "fd_offset is not page aligned. Please convert program: %s\n", - bprm->file->f_path.dentry->d_name.name); - error_time = jiffies; - } -#endif - - if (!bprm->file->f_op->mmap||((fd_offset & ~PAGE_MASK) != 0)) { - loff_t pos = fd_offset; - down_write(¤t->mm->mmap_sem); - do_brk(N_TXTADDR(ex), ex.a_text+ex.a_data); - up_write(¤t->mm->mmap_sem); - bprm->file->f_op->read(bprm->file, - (char __user *)N_TXTADDR(ex), - ex.a_text+ex.a_data, &pos); - flush_icache_range((unsigned long) N_TXTADDR(ex), - (unsigned long) N_TXTADDR(ex) + - ex.a_text+ex.a_data); - goto beyond_if; - } - - down_write(¤t->mm->mmap_sem); - error = do_mmap(bprm->file, N_TXTADDR(ex), ex.a_text, - PROT_READ | PROT_EXEC, - MAP_FIXED | MAP_PRIVATE | MAP_DENYWRITE | MAP_EXECUTABLE | MAP_32BIT, - fd_offset); - up_write(¤t->mm->mmap_sem); - - if (error != N_TXTADDR(ex)) { - send_sig(SIGKILL, current, 0); - return error; - } - - down_write(¤t->mm->mmap_sem); - error = do_mmap(bprm->file, N_DATADDR(ex), ex.a_data, - PROT_READ | PROT_WRITE | PROT_EXEC, - MAP_FIXED | MAP_PRIVATE | MAP_DENYWRITE | MAP_EXECUTABLE | MAP_32BIT, - fd_offset + ex.a_text); - up_write(¤t->mm->mmap_sem); - if (error != N_DATADDR(ex)) { - send_sig(SIGKILL, current, 0); - return error; - } - } -beyond_if: - set_binfmt(&aout_format); - - set_brk(current->mm->start_brk, current->mm->brk); - - retval = setup_arg_pages(bprm, IA32_STACK_TOP, EXSTACK_DEFAULT); - if (retval < 0) { - /* Someone check-me: is this error path enough? */ - send_sig(SIGKILL, current, 0); - return retval; - } - - current->mm->start_stack = - (unsigned long)create_aout_tables((char __user *)bprm->p, bprm); - /* start thread */ - asm volatile("movl %0,%%fs" :: "r" (0)); \ - asm volatile("movl %0,%%es; movl %0,%%ds": :"r" (__USER32_DS)); - load_gs_index(0); - (regs)->rip = ex.a_entry; - (regs)->rsp = current->mm->start_stack; - (regs)->eflags = 0x200; - (regs)->cs = __USER32_CS; - (regs)->ss = __USER32_DS; - set_fs(USER_DS); - if (unlikely(current->ptrace & PT_PTRACED)) { - if (current->ptrace & PT_TRACE_EXEC) - ptrace_notify ((PTRACE_EVENT_EXEC << 8) | SIGTRAP); - else - send_sig(SIGTRAP, current, 0); - } - return 0; -} - -static int load_aout_library(struct file *file) -{ - struct inode * inode; - unsigned long bss, start_addr, len; - unsigned long error; - int retval; - struct exec ex; - - inode = file->f_path.dentry->d_inode; - - retval = -ENOEXEC; - error = kernel_read(file, 0, (char *) &ex, sizeof(ex)); - if (error != sizeof(ex)) - goto out; - - /* We come in here for the regular a.out style of shared libraries */ - if ((N_MAGIC(ex) != ZMAGIC && N_MAGIC(ex) != QMAGIC) || N_TRSIZE(ex) || - N_DRSIZE(ex) || ((ex.a_entry & 0xfff) && N_MAGIC(ex) == ZMAGIC) || - i_size_read(inode) < ex.a_text+ex.a_data+N_SYMSIZE(ex)+N_TXTOFF(ex)) { - goto out; - } - - if (N_FLAGS(ex)) - goto out; - - /* For QMAGIC, the starting address is 0x20 into the page. We mask - this off to get the starting address for the page */ - - start_addr = ex.a_entry & 0xfffff000; - - if ((N_TXTOFF(ex) & ~PAGE_MASK) != 0) { - loff_t pos = N_TXTOFF(ex); - -#ifdef WARN_OLD - static unsigned long error_time; - if ((jiffies-error_time) > 5*HZ) - { - printk(KERN_WARNING - "N_TXTOFF is not page aligned. Please convert library: %s\n", - file->f_path.dentry->d_name.name); - error_time = jiffies; - } -#endif - down_write(¤t->mm->mmap_sem); - do_brk(start_addr, ex.a_text + ex.a_data + ex.a_bss); - up_write(¤t->mm->mmap_sem); - - file->f_op->read(file, (char __user *)start_addr, - ex.a_text + ex.a_data, &pos); - flush_icache_range((unsigned long) start_addr, - (unsigned long) start_addr + ex.a_text + ex.a_data); - - retval = 0; - goto out; - } - /* Now use mmap to map the library into memory. */ - down_write(¤t->mm->mmap_sem); - error = do_mmap(file, start_addr, ex.a_text + ex.a_data, - PROT_READ | PROT_WRITE | PROT_EXEC, - MAP_FIXED | MAP_PRIVATE | MAP_DENYWRITE | MAP_32BIT, - N_TXTOFF(ex)); - up_write(¤t->mm->mmap_sem); - retval = error; - if (error != start_addr) - goto out; - - len = PAGE_ALIGN(ex.a_text + ex.a_data); - bss = ex.a_text + ex.a_data + ex.a_bss; - if (bss > len) { - down_write(¤t->mm->mmap_sem); - error = do_brk(start_addr + len, bss - len); - up_write(¤t->mm->mmap_sem); - retval = error; - if (error != start_addr + len) - goto out; - } - retval = 0; -out: - return retval; -} - -static int __init init_aout_binfmt(void) -{ - return register_binfmt(&aout_format); -} - -static void __exit exit_aout_binfmt(void) -{ - unregister_binfmt(&aout_format); -} - -module_init(init_aout_binfmt); -module_exit(exit_aout_binfmt); -MODULE_LICENSE("GPL"); diff --git a/arch/x86_64/ia32/ia32_binfmt.c b/arch/x86_64/ia32/ia32_binfmt.c deleted file mode 100644 index dffd2ac72747..000000000000 --- a/arch/x86_64/ia32/ia32_binfmt.c +++ /dev/null @@ -1,320 +0,0 @@ -/* - * Written 2000,2002 by Andi Kleen. - * - * Loosely based on the sparc64 and IA64 32bit emulation loaders. - * This tricks binfmt_elf.c into loading 32bit binaries using lots - * of ugly preprocessor tricks. Talk about very very poor man's inheritance. - */ -#define __ASM_X86_64_ELF_H 1 - -#undef ELF_CLASS -#define ELF_CLASS ELFCLASS32 - -#include <linux/types.h> -#include <linux/stddef.h> -#include <linux/rwsem.h> -#include <linux/sched.h> -#include <linux/compat.h> -#include <linux/string.h> -#include <linux/binfmts.h> -#include <linux/mm.h> -#include <linux/security.h> - -#include <asm/segment.h> -#include <asm/ptrace.h> -#include <asm/processor.h> -#include <asm/user32.h> -#include <asm/sigcontext32.h> -#include <asm/fpu32.h> -#include <asm/i387.h> -#include <asm/uaccess.h> -#include <asm/ia32.h> -#include <asm/vsyscall32.h> - -#define ELF_NAME "elf/i386" - -#define AT_SYSINFO 32 -#define AT_SYSINFO_EHDR 33 - -int sysctl_vsyscall32 = 1; - -#undef ARCH_DLINFO -#define ARCH_DLINFO do { \ - if (sysctl_vsyscall32) { \ - current->mm->context.vdso = (void *)VSYSCALL32_BASE; \ - NEW_AUX_ENT(AT_SYSINFO, (u32)(u64)VSYSCALL32_VSYSCALL); \ - NEW_AUX_ENT(AT_SYSINFO_EHDR, VSYSCALL32_BASE); \ - } \ -} while(0) - -struct file; -struct elf_phdr; - -#define IA32_EMULATOR 1 - -#define ELF_ET_DYN_BASE (TASK_UNMAPPED_BASE + 0x1000000) - -#undef ELF_ARCH -#define ELF_ARCH EM_386 - -#define ELF_DATA ELFDATA2LSB - -#define USE_ELF_CORE_DUMP 1 - -/* Override elfcore.h */ -#define _LINUX_ELFCORE_H 1 -typedef unsigned int elf_greg_t; - -#define ELF_NGREG (sizeof (struct user_regs_struct32) / sizeof(elf_greg_t)) -typedef elf_greg_t elf_gregset_t[ELF_NGREG]; - -struct elf_siginfo -{ - int si_signo; /* signal number */ - int si_code; /* extra code */ - int si_errno; /* errno */ -}; - -#define jiffies_to_timeval(a,b) do { (b)->tv_usec = 0; (b)->tv_sec = (a)/HZ; }while(0) - -struct elf_prstatus -{ - struct elf_siginfo pr_info; /* Info associated with signal */ - short pr_cursig; /* Current signal */ - unsigned int pr_sigpend; /* Set of pending signals */ - unsigned int pr_sighold; /* Set of held signals */ - pid_t pr_pid; - pid_t pr_ppid; - pid_t pr_pgrp; - pid_t pr_sid; - struct compat_timeval pr_utime; /* User time */ - struct compat_timeval pr_stime; /* System time */ - struct compat_timeval pr_cutime; /* Cumulative user time */ - struct compat_timeval pr_cstime; /* Cumulative system time */ - elf_gregset_t pr_reg; /* GP registers */ - int pr_fpvalid; /* True if math co-processor being used. */ -}; - -#define ELF_PRARGSZ (80) /* Number of chars for args */ - -struct elf_prpsinfo -{ - char pr_state; /* numeric process state */ - char pr_sname; /* char for pr_state */ - char pr_zomb; /* zombie */ - char pr_nice; /* nice val */ - unsigned int pr_flag; /* flags */ - __u16 pr_uid; - __u16 pr_gid; - pid_t pr_pid, pr_ppid, pr_pgrp, pr_sid; - /* Lots missing */ - char pr_fname[16]; /* filename of executable */ - char pr_psargs[ELF_PRARGSZ]; /* initial part of arg list */ -}; - -#define __STR(x) #x -#define STR(x) __STR(x) - -#define _GET_SEG(x) \ - ({ __u32 seg; asm("movl %%" STR(x) ",%0" : "=r"(seg)); seg; }) - -/* Assumes current==process to be dumped */ -#define ELF_CORE_COPY_REGS(pr_reg, regs) \ - pr_reg[0] = regs->rbx; \ - pr_reg[1] = regs->rcx; \ - pr_reg[2] = regs->rdx; \ - pr_reg[3] = regs->rsi; \ - pr_reg[4] = regs->rdi; \ - pr_reg[5] = regs->rbp; \ - pr_reg[6] = regs->rax; \ - pr_reg[7] = _GET_SEG(ds); \ - pr_reg[8] = _GET_SEG(es); \ - pr_reg[9] = _GET_SEG(fs); \ - pr_reg[10] = _GET_SEG(gs); \ - pr_reg[11] = regs->orig_rax; \ - pr_reg[12] = regs->rip; \ - pr_reg[13] = regs->cs; \ - pr_reg[14] = regs->eflags; \ - pr_reg[15] = regs->rsp; \ - pr_reg[16] = regs->ss; - -#define user user32 - -#undef elf_read_implies_exec -#define elf_read_implies_exec(ex, executable_stack) (executable_stack != EXSTACK_DISABLE_X) -//#include <asm/ia32.h> -#include <linux/elf.h> - -typedef struct user_i387_ia32_struct elf_fpregset_t; -typedef struct user32_fxsr_struct elf_fpxregset_t; - - -static inline void elf_core_copy_regs(elf_gregset_t *elfregs, struct pt_regs *regs) -{ - ELF_CORE_COPY_REGS((*elfregs), regs) -} - -static inline int elf_core_copy_task_regs(struct task_struct *t, elf_gregset_t* elfregs) -{ - struct pt_regs *pp = task_pt_regs(t); - ELF_CORE_COPY_REGS((*elfregs), pp); - /* fix wrong segments */ - (*elfregs)[7] = t->thread.ds; - (*elfregs)[9] = t->thread.fsindex; - (*elfregs)[10] = t->thread.gsindex; - (*elfregs)[8] = t->thread.es; - return 1; -} - -static inline int -elf_core_copy_task_fpregs(struct task_struct *tsk, struct pt_regs *regs, elf_fpregset_t *fpu) -{ - struct _fpstate_ia32 *fpstate = (void*)fpu; - mm_segment_t oldfs = get_fs(); - - if (!tsk_used_math(tsk)) - return 0; - if (!regs) - regs = task_pt_regs(tsk); - if (tsk == current) - unlazy_fpu(tsk); - set_fs(KERNEL_DS); - save_i387_ia32(tsk, fpstate, regs, 1); - /* Correct for i386 bug. It puts the fop into the upper 16bits of - the tag word (like FXSAVE), not into the fcs*/ - fpstate->cssel |= fpstate->tag & 0xffff0000; - set_fs(oldfs); - return 1; -} - -#define ELF_CORE_COPY_XFPREGS 1 -static inline int -elf_core_copy_task_xfpregs(struct task_struct *t, elf_fpxregset_t *xfpu) -{ - struct pt_regs *regs = task_pt_regs(t); - if (!tsk_used_math(t)) - return 0; - if (t == current) - unlazy_fpu(t); - memcpy(xfpu, &t->thread.i387.fxsave, sizeof(elf_fpxregset_t)); - xfpu->fcs = regs->cs; - xfpu->fos = t->thread.ds; /* right? */ - return 1; -} - -#undef elf_check_arch -#define elf_check_arch(x) \ - ((x)->e_machine == EM_386) - -extern int force_personality32; - -#define ELF_EXEC_PAGESIZE PAGE_SIZE -#define ELF_HWCAP (boot_cpu_data.x86_capability[0]) -#define ELF_PLATFORM ("i686") -#define SET_PERSONALITY(ex, ibcs2) \ -do { \ - unsigned long new_flags = 0; \ - if ((ex).e_ident[EI_CLASS] == ELFCLASS32) \ - new_flags = _TIF_IA32; \ - if ((current_thread_info()->flags & _TIF_IA32) \ - != new_flags) \ - set_thread_flag(TIF_ABI_PENDING); \ - else \ - clear_thread_flag(TIF_ABI_PENDING); \ - /* XXX This overwrites the user set personality */ \ - current->personality |= force_personality32; \ -} while (0) - -/* Override some function names */ -#define elf_format elf32_format - -#define init_elf_binfmt init_elf32_binfmt -#define exit_elf_binfmt exit_elf32_binfmt - -#define load_elf_binary load_elf32_binary - -#define ELF_PLAT_INIT(r, load_addr) elf32_init(r) - -#undef start_thread -#define start_thread(regs,new_rip,new_rsp) do { \ - asm volatile("movl %0,%%fs" :: "r" (0)); \ - asm volatile("movl %0,%%es; movl %0,%%ds": :"r" (__USER32_DS)); \ - load_gs_index(0); \ - (regs)->rip = (new_rip); \ - (regs)->rsp = (new_rsp); \ - (regs)->eflags = 0x200; \ - (regs)->cs = __USER32_CS; \ - (regs)->ss = __USER32_DS; \ - set_fs(USER_DS); \ -} while(0) - - -#include <linux/module.h> - -MODULE_DESCRIPTION("Binary format loader for compatibility with IA32 ELF binaries."); -MODULE_AUTHOR("Eric Youngdale, Andi Kleen"); - -#undef MODULE_DESCRIPTION -#undef MODULE_AUTHOR - -static void elf32_init(struct pt_regs *); - -#define ARCH_HAS_SETUP_ADDITIONAL_PAGES 1 -#define arch_setup_additional_pages syscall32_setup_pages -extern int syscall32_setup_pages(struct linux_binprm *, int exstack); - -#include "../../../fs/binfmt_elf.c" - -static void elf32_init(struct pt_regs *regs) -{ - struct task_struct *me = current; - regs->rdi = 0; - regs->rsi = 0; - regs->rdx = 0; - regs->rcx = 0; - regs->rax = 0; - regs->rbx = 0; - regs->rbp = 0; - regs->r8 = regs->r9 = regs->r10 = regs->r11 = regs->r12 = - regs->r13 = regs->r14 = regs->r15 = 0; - me->thread.fs = 0; - me->thread.gs = 0; - me->thread.fsindex = 0; - me->thread.gsindex = 0; - me->thread.ds = __USER_DS; - me->thread.es = __USER_DS; -} - -#ifdef CONFIG_SYSCTL -/* Register vsyscall32 into the ABI table */ -#include <linux/sysctl.h> - -static ctl_table abi_table2[] = { - { - .ctl_name = 99, - .procname = "vsyscall32", - .data = &sysctl_vsyscall32, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec - }, - {} -}; - -static ctl_table abi_root_table2[] = { - { - .ctl_name = CTL_ABI, - .procname = "abi", - .mode = 0555, - .child = abi_table2 - }, - {} -}; - -static __init int ia32_binfmt_init(void) -{ - register_sysctl_table(abi_root_table2); - return 0; -} -__initcall(ia32_binfmt_init); -#endif diff --git a/arch/x86_64/ia32/ia32_signal.c b/arch/x86_64/ia32/ia32_signal.c deleted file mode 100644 index 6ea19c25f90d..000000000000 --- a/arch/x86_64/ia32/ia32_signal.c +++ /dev/null @@ -1,617 +0,0 @@ -/* - * linux/arch/x86_64/ia32/ia32_signal.c - * - * Copyright (C) 1991, 1992 Linus Torvalds - * - * 1997-11-28 Modified for POSIX.1b signals by Richard Henderson - * 2000-06-20 Pentium III FXSR, SSE support by Gareth Hughes - * 2000-12-* x86-64 compatibility mode signal handling by Andi Kleen - */ - -#include <linux/sched.h> -#include <linux/mm.h> -#include <linux/smp.h> -#include <linux/kernel.h> -#include <linux/signal.h> -#include <linux/errno.h> -#include <linux/wait.h> -#include <linux/ptrace.h> -#include <linux/unistd.h> -#include <linux/stddef.h> -#include <linux/personality.h> -#include <linux/compat.h> -#include <linux/binfmts.h> -#include <asm/ucontext.h> -#include <asm/uaccess.h> -#include <asm/i387.h> -#include <asm/ia32.h> -#include <asm/ptrace.h> -#include <asm/ia32_unistd.h> -#include <asm/user32.h> -#include <asm/sigcontext32.h> -#include <asm/fpu32.h> -#include <asm/proto.h> -#include <asm/vsyscall32.h> - -#define DEBUG_SIG 0 - -#define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP))) - -asmlinkage int do_signal(struct pt_regs *regs, sigset_t *oldset); -void signal_fault(struct pt_regs *regs, void __user *frame, char *where); - -int copy_siginfo_to_user32(compat_siginfo_t __user *to, siginfo_t *from) -{ - int err; - if (!access_ok (VERIFY_WRITE, to, sizeof(compat_siginfo_t))) - return -EFAULT; - - /* If you change siginfo_t structure, please make sure that - this code is fixed accordingly. - It should never copy any pad contained in the structure - to avoid security leaks, but must copy the generic - 3 ints plus the relevant union member. */ - err = __put_user(from->si_signo, &to->si_signo); - err |= __put_user(from->si_errno, &to->si_errno); - err |= __put_user((short)from->si_code, &to->si_code); - - if (from->si_code < 0) { - err |= __put_user(from->si_pid, &to->si_pid); - err |= __put_user(from->si_uid, &to->si_uid); - err |= __put_user(ptr_to_compat(from->si_ptr), &to->si_ptr); - } else { - /* First 32bits of unions are always present: - * si_pid === si_band === si_tid === si_addr(LS half) */ - err |= __put_user(from->_sifields._pad[0], &to->_sifields._pad[0]); - switch (from->si_code >> 16) { - case __SI_FAULT >> 16: - break; - case __SI_CHLD >> 16: - err |= __put_user(from->si_utime, &to->si_utime); - err |= __put_user(from->si_stime, &to->si_stime); - err |= __put_user(from->si_status, &to->si_status); - /* FALL THROUGH */ - default: - case __SI_KILL >> 16: - err |= __put_user(from->si_uid, &to->si_uid); - break; - case __SI_POLL >> 16: - err |= __put_user(from->si_fd, &to->si_fd); - break; - case __SI_TIMER >> 16: - err |= __put_user(from->si_overrun, &to->si_overrun); - err |= __put_user(ptr_to_compat(from->si_ptr), - &to->si_ptr); - break; - case __SI_RT >> 16: /* This is not generated by the kernel as of now. */ - case __SI_MESGQ >> 16: - err |= __put_user(from->si_uid, &to->si_uid); - err |= __put_user(from->si_int, &to->si_int); - break; - } - } - return err; -} - -int copy_siginfo_from_user32(siginfo_t *to, compat_siginfo_t __user *from) -{ - int err; - u32 ptr32; - if (!access_ok (VERIFY_READ, from, sizeof(compat_siginfo_t))) - return -EFAULT; - - err = __get_user(to->si_signo, &from->si_signo); - err |= __get_user(to->si_errno, &from->si_errno); - err |= __get_user(to->si_code, &from->si_code); - - err |= __get_user(to->si_pid, &from->si_pid); - err |= __get_user(to->si_uid, &from->si_uid); - err |= __get_user(ptr32, &from->si_ptr); - to->si_ptr = compat_ptr(ptr32); - - return err; -} - -asmlinkage long -sys32_sigsuspend(int history0, int history1, old_sigset_t mask) -{ - mask &= _BLOCKABLE; - spin_lock_irq(¤t->sighand->siglock); - current->saved_sigmask = current->blocked; - siginitset(¤t->blocked, mask); - recalc_sigpending(); - spin_unlock_irq(¤t->sighand->siglock); - - current->state = TASK_INTERRUPTIBLE; - schedule(); - set_thread_flag(TIF_RESTORE_SIGMASK); - return -ERESTARTNOHAND; -} - -asmlinkage long -sys32_sigaltstack(const stack_ia32_t __user *uss_ptr, - stack_ia32_t __user *uoss_ptr, - struct pt_regs *regs) -{ - stack_t uss,uoss; - int ret; - mm_segment_t seg; - if (uss_ptr) { - u32 ptr; - memset(&uss,0,sizeof(stack_t)); - if (!access_ok(VERIFY_READ,uss_ptr,sizeof(stack_ia32_t)) || - __get_user(ptr, &uss_ptr->ss_sp) || - __get_user(uss.ss_flags, &uss_ptr->ss_flags) || - __get_user(uss.ss_size, &uss_ptr->ss_size)) - return -EFAULT; - uss.ss_sp = compat_ptr(ptr); - } - seg = get_fs(); - set_fs(KERNEL_DS); - ret = do_sigaltstack(uss_ptr ? &uss : NULL, &uoss, regs->rsp); - set_fs(seg); - if (ret >= 0 && uoss_ptr) { - if (!access_ok(VERIFY_WRITE,uoss_ptr,sizeof(stack_ia32_t)) || - __put_user(ptr_to_compat(uoss.ss_sp), &uoss_ptr->ss_sp) || - __put_user(uoss.ss_flags, &uoss_ptr->ss_flags) || - __put_user(uoss.ss_size, &uoss_ptr->ss_size)) - ret = -EFAULT; - } - return ret; -} - -/* - * Do a signal return; undo the signal stack. - */ - -struct sigframe -{ - u32 pretcode; - int sig; - struct sigcontext_ia32 sc; - struct _fpstate_ia32 fpstate; - unsigned int extramask[_COMPAT_NSIG_WORDS-1]; - char retcode[8]; -}; - -struct rt_sigframe -{ - u32 pretcode; - int sig; - u32 pinfo; - u32 puc; - compat_siginfo_t info; - struct ucontext_ia32 uc; - struct _fpstate_ia32 fpstate; - char retcode[8]; -}; - -static int -ia32_restore_sigcontext(struct pt_regs *regs, struct sigcontext_ia32 __user *sc, unsigned int *peax) -{ - unsigned int err = 0; - - /* Always make any pending restarted system calls return -EINTR */ - current_thread_info()->restart_block.fn = do_no_restart_syscall; - -#if DEBUG_SIG - printk("SIG restore_sigcontext: sc=%p err(%x) eip(%x) cs(%x) flg(%x)\n", - sc, sc->err, sc->eip, sc->cs, sc->eflags); -#endif -#define COPY(x) { \ - unsigned int reg; \ - err |= __get_user(reg, &sc->e ##x); \ - regs->r ## x = reg; \ -} - -#define RELOAD_SEG(seg,mask) \ - { unsigned int cur; \ - unsigned short pre; \ - err |= __get_user(pre, &sc->seg); \ - asm volatile("movl %%" #seg ",%0" : "=r" (cur)); \ - pre |= mask; \ - if (pre != cur) loadsegment(seg,pre); } - - /* Reload fs and gs if they have changed in the signal handler. - This does not handle long fs/gs base changes in the handler, but - does not clobber them at least in the normal case. */ - - { - unsigned gs, oldgs; - err |= __get_user(gs, &sc->gs); - gs |= 3; - asm("movl %%gs,%0" : "=r" (oldgs)); - if (gs != oldgs) - load_gs_index(gs); - } - RELOAD_SEG(fs,3); - RELOAD_SEG(ds,3); - RELOAD_SEG(es,3); - - COPY(di); COPY(si); COPY(bp); COPY(sp); COPY(bx); - COPY(dx); COPY(cx); COPY(ip); - /* Don't touch extended registers */ - - err |= __get_user(regs->cs, &sc->cs); - regs->cs |= 3; - err |= __get_user(regs->ss, &sc->ss); - regs->ss |= 3; - - { - unsigned int tmpflags; - err |= __get_user(tmpflags, &sc->eflags); - regs->eflags = (regs->eflags & ~0x40DD5) | (tmpflags & 0x40DD5); - regs->orig_rax = -1; /* disable syscall checks */ - } - - { - u32 tmp; - struct _fpstate_ia32 __user * buf; - err |= __get_user(tmp, &sc->fpstate); - buf = compat_ptr(tmp); - if (buf) { - if (!access_ok(VERIFY_READ, buf, sizeof(*buf))) - goto badframe; - err |= restore_i387_ia32(current, buf, 0); - } else { - struct task_struct *me = current; - if (used_math()) { - clear_fpu(me); - clear_used_math(); - } - } - } - - { - u32 tmp; - err |= __get_user(tmp, &sc->eax); - *peax = tmp; - } - return err; - -badframe: - return 1; -} - -asmlinkage long sys32_sigreturn(struct pt_regs *regs) -{ - struct sigframe __user *frame = (struct sigframe __user *)(regs->rsp-8); - sigset_t set; - unsigned int eax; - - if (!access_ok(VERIFY_READ, frame, sizeof(*frame))) - goto badframe; - if (__get_user(set.sig[0], &frame->sc.oldmask) - || (_COMPAT_NSIG_WORDS > 1 - && __copy_from_user((((char *) &set.sig) + 4), &frame->extramask, - sizeof(frame->extramask)))) - goto badframe; - - sigdelsetmask(&set, ~_BLOCKABLE); - spin_lock_irq(¤t->sighand->siglock); - current->blocked = set; - recalc_sigpending(); - spin_unlock_irq(¤t->sighand->siglock); - - if (ia32_restore_sigcontext(regs, &frame->sc, &eax)) - goto badframe; - return eax; - -badframe: - signal_fault(regs, frame, "32bit sigreturn"); - return 0; -} - -asmlinkage long sys32_rt_sigreturn(struct pt_regs *regs) -{ - struct rt_sigframe __user *frame; - sigset_t set; - unsigned int eax; - struct pt_regs tregs; - - frame = (struct rt_sigframe __user *)(regs->rsp - 4); - - if (!access_ok(VERIFY_READ, frame, sizeof(*frame))) - goto badframe; - if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set))) - goto badframe; - - sigdelsetmask(&set, ~_BLOCKABLE); - spin_lock_irq(¤t->sighand->siglock); - current->blocked = set; - recalc_sigpending(); - spin_unlock_irq(¤t->sighand->siglock); - - if (ia32_restore_sigcontext(regs, &frame->uc.uc_mcontext, &eax)) - goto badframe; - - tregs = *regs; - if (sys32_sigaltstack(&frame->uc.uc_stack, NULL, &tregs) == -EFAULT) - goto badframe; - - return eax; - -badframe: - signal_fault(regs,frame,"32bit rt sigreturn"); - return 0; -} - -/* - * Set up a signal frame. - */ - -static int -ia32_setup_sigcontext(struct sigcontext_ia32 __user *sc, struct _fpstate_ia32 __user *fpstate, - struct pt_regs *regs, unsigned int mask) -{ - int tmp, err = 0; - - tmp = 0; - __asm__("movl %%gs,%0" : "=r"(tmp): "0"(tmp)); - err |= __put_user(tmp, (unsigned int __user *)&sc->gs); - __asm__("movl %%fs,%0" : "=r"(tmp): "0"(tmp)); - err |= __put_user(tmp, (unsigned int __user *)&sc->fs); - __asm__("movl %%ds,%0" : "=r"(tmp): "0"(tmp)); - err |= __put_user(tmp, (unsigned int __user *)&sc->ds); - __asm__("movl %%es,%0" : "=r"(tmp): "0"(tmp)); - err |= __put_user(tmp, (unsigned int __user *)&sc->es); - - err |= __put_user((u32)regs->rdi, &sc->edi); - err |= __put_user((u32)regs->rsi, &sc->esi); - err |= __put_user((u32)regs->rbp, &sc->ebp); - err |= __put_user((u32)regs->rsp, &sc->esp); - err |= __put_user((u32)regs->rbx, &sc->ebx); - err |= __put_user((u32)regs->rdx, &sc->edx); - err |= __put_user((u32)regs->rcx, &sc->ecx); - err |= __put_user((u32)regs->rax, &sc->eax); - err |= __put_user((u32)regs->cs, &sc->cs); - err |= __put_user((u32)regs->ss, &sc->ss); - err |= __put_user(current->thread.trap_no, &sc->trapno); - err |= __put_user(current->thread.error_code, &sc->err); - err |= __put_user((u32)regs->rip, &sc->eip); - err |= __put_user((u32)regs->eflags, &sc->eflags); - err |= __put_user((u32)regs->rsp, &sc->esp_at_signal); - - tmp = save_i387_ia32(current, fpstate, regs, 0); - if (tmp < 0) - err = -EFAULT; - else { - clear_used_math(); - stts(); - err |= __put_user(ptr_to_compat(tmp ? fpstate : NULL), - &sc->fpstate); - } - - /* non-iBCS2 extensions.. */ - err |= __put_user(mask, &sc->oldmask); - err |= __put_user(current->thread.cr2, &sc->cr2); - - return err; -} - -/* - * Determine which stack to use.. - */ -static void __user * -get_sigframe(struct k_sigaction *ka, struct pt_regs * regs, size_t frame_size) -{ - unsigned long rsp; - - /* Default to using normal stack */ - rsp = regs->rsp; - - /* This is the X/Open sanctioned signal stack switching. */ - if (ka->sa.sa_flags & SA_ONSTACK) { - if (sas_ss_flags(rsp) == 0) - rsp = current->sas_ss_sp + current->sas_ss_size; - } - - /* This is the legacy signal stack switching. */ - else if ((regs->ss & 0xffff) != __USER_DS && - !(ka->sa.sa_flags & SA_RESTORER) && - ka->sa.sa_restorer) { - rsp = (unsigned long) ka->sa.sa_restorer; - } - - rsp -= frame_size; - /* Align the stack pointer according to the i386 ABI, - * i.e. so that on function entry ((sp + 4) & 15) == 0. */ - rsp = ((rsp + 4) & -16ul) - 4; - return (void __user *) rsp; -} - -int ia32_setup_frame(int sig, struct k_sigaction *ka, - compat_sigset_t *set, struct pt_regs * regs) -{ - struct sigframe __user *frame; - int err = 0; - - frame = get_sigframe(ka, regs, sizeof(*frame)); - - if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame))) - goto give_sigsegv; - - err |= __put_user(sig, &frame->sig); - if (err) - goto give_sigsegv; - - err |= ia32_setup_sigcontext(&frame->sc, &frame->fpstate, regs, - set->sig[0]); - if (err) - goto give_sigsegv; - - if (_COMPAT_NSIG_WORDS > 1) { - err |= __copy_to_user(frame->extramask, &set->sig[1], - sizeof(frame->extramask)); - } - if (err) - goto give_sigsegv; - - /* Return stub is in 32bit vsyscall page */ - { - void __user *restorer; - if (current->binfmt->hasvdso) - restorer = VSYSCALL32_SIGRETURN; - else - restorer = (void *)&frame->retcode; - if (ka->sa.sa_flags & SA_RESTORER) - restorer = ka->sa.sa_restorer; - err |= __put_user(ptr_to_compat(restorer), &frame->pretcode); - } - /* These are actually not used anymore, but left because some - gdb versions depend on them as a marker. */ - { - /* copy_to_user optimizes that into a single 8 byte store */ - static const struct { - u16 poplmovl; - u32 val; - u16 int80; - u16 pad; - } __attribute__((packed)) code = { - 0xb858, /* popl %eax ; movl $...,%eax */ - __NR_ia32_sigreturn, - 0x80cd, /* int $0x80 */ - 0, - }; - err |= __copy_to_user(frame->retcode, &code, 8); - } - if (err) - goto give_sigsegv; - - /* Set up registers for signal handler */ - regs->rsp = (unsigned long) frame; - regs->rip = (unsigned long) ka->sa.sa_handler; - - /* Make -mregparm=3 work */ - regs->rax = sig; - regs->rdx = 0; - regs->rcx = 0; - - asm volatile("movl %0,%%ds" :: "r" (__USER32_DS)); - asm volatile("movl %0,%%es" :: "r" (__USER32_DS)); - - regs->cs = __USER32_CS; - regs->ss = __USER32_DS; - - set_fs(USER_DS); - regs->eflags &= ~TF_MASK; - if (test_thread_flag(TIF_SINGLESTEP)) - ptrace_notify(SIGTRAP); - -#if DEBUG_SIG - printk("SIG deliver (%s:%d): sp=%p pc=%lx ra=%u\n", - current->comm, current->pid, frame, regs->rip, frame->pretcode); -#endif - - return 0; - -give_sigsegv: - force_sigsegv(sig, current); - return -EFAULT; -} - -int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, - compat_sigset_t *set, struct pt_regs * regs) -{ - struct rt_sigframe __user *frame; - int err = 0; - - frame = get_sigframe(ka, regs, sizeof(*frame)); - - if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame))) - goto give_sigsegv; - - { - struct exec_domain *ed = current_thread_info()->exec_domain; - err |= __put_user((ed - && ed->signal_invmap - && sig < 32 - ? ed->signal_invmap[sig] - : sig), - &frame->sig); - } - err |= __put_user(ptr_to_compat(&frame->info), &frame->pinfo); - err |= __put_user(ptr_to_compat(&frame->uc), &frame->puc); - err |= copy_siginfo_to_user32(&frame->info, info); - if (err) - goto give_sigsegv; - - /* Create the ucontext. */ - err |= __put_user(0, &frame->uc.uc_flags); - err |= __put_user(0, &frame->uc.uc_link); - err |= __put_user(current->sas_ss_sp, &frame->uc.uc_stack.ss_sp); - err |= __put_user(sas_ss_flags(regs->rsp), - &frame->uc.uc_stack.ss_flags); - err |= __put_user(current->sas_ss_size, &frame->uc.uc_stack.ss_size); - err |= ia32_setup_sigcontext(&frame->uc.uc_mcontext, &frame->fpstate, - regs, set->sig[0]); - err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set)); - if (err) - goto give_sigsegv; - - - { - void __user *restorer = VSYSCALL32_RTSIGRETURN; - if (ka->sa.sa_flags & SA_RESTORER) - restorer = ka->sa.sa_restorer; - err |= __put_user(ptr_to_compat(restorer), &frame->pretcode); - } - - /* This is movl $,%eax ; int $0x80 */ - /* Not actually used anymore, but left because some gdb versions - need it. */ - { - /* __copy_to_user optimizes that into a single 8 byte store */ - static const struct { - u8 movl; - u32 val; - u16 int80; - u16 pad; - u8 pad2; - } __attribute__((packed)) code = { - 0xb8, - __NR_ia32_rt_sigreturn, - 0x80cd, - 0, - }; - err |= __copy_to_user(frame->retcode, &code, 8); - } - if (err) - goto give_sigsegv; - - /* Set up registers for signal handler */ - regs->rsp = (unsigned long) frame; - regs->rip = (unsigned long) ka->sa.sa_handler; - - /* Make -mregparm=3 work */ - regs->rax = sig; - regs->rdx = (unsigned long) &frame->info; - regs->rcx = (unsigned long) &frame->uc; - - /* Make -mregparm=3 work */ - regs->rax = sig; - regs->rdx = (unsigned long) &frame->info; - regs->rcx = (unsigned long) &frame->uc; - - asm volatile("movl %0,%%ds" :: "r" (__USER32_DS)); - asm volatile("movl %0,%%es" :: "r" (__USER32_DS)); - - regs->cs = __USER32_CS; - regs->ss = __USER32_DS; - - set_fs(USER_DS); - regs->eflags &= ~TF_MASK; - if (test_thread_flag(TIF_SINGLESTEP)) - ptrace_notify(SIGTRAP); - -#if DEBUG_SIG - printk("SIG deliver (%s:%d): sp=%p pc=%lx ra=%u\n", - current->comm, current->pid, frame, regs->rip, frame->pretcode); -#endif - - return 0; - -give_sigsegv: - force_sigsegv(sig, current); - return -EFAULT; -} diff --git a/arch/x86_64/ia32/ia32entry.S b/arch/x86_64/ia32/ia32entry.S deleted file mode 100644 index 18b231810908..000000000000 --- a/arch/x86_64/ia32/ia32entry.S +++ /dev/null @@ -1,736 +0,0 @@ -/* - * Compatibility mode system call entry point for x86-64. - * - * Copyright 2000-2002 Andi Kleen, SuSE Labs. - */ - -#include <asm/dwarf2.h> -#include <asm/calling.h> -#include <asm/asm-offsets.h> -#include <asm/current.h> -#include <asm/errno.h> -#include <asm/ia32_unistd.h> -#include <asm/thread_info.h> -#include <asm/segment.h> -#include <asm/vsyscall32.h> -#include <asm/irqflags.h> -#include <linux/linkage.h> - -#define IA32_NR_syscalls ((ia32_syscall_end - ia32_sys_call_table)/8) - - .macro IA32_ARG_FIXUP noebp=0 - movl %edi,%r8d - .if \noebp - .else - movl %ebp,%r9d - .endif - xchg %ecx,%esi - movl %ebx,%edi - movl %edx,%edx /* zero extension */ - .endm - - /* clobbers %eax */ - .macro CLEAR_RREGS - xorl %eax,%eax - movq %rax,R11(%rsp) - movq %rax,R10(%rsp) - movq %rax,R9(%rsp) - movq %rax,R8(%rsp) - .endm - - .macro LOAD_ARGS32 offset - movl \offset(%rsp),%r11d - movl \offset+8(%rsp),%r10d - movl \offset+16(%rsp),%r9d - movl \offset+24(%rsp),%r8d - movl \offset+40(%rsp),%ecx - movl \offset+48(%rsp),%edx - movl \offset+56(%rsp),%esi - movl \offset+64(%rsp),%edi - movl \offset+72(%rsp),%eax - .endm - - .macro CFI_STARTPROC32 simple - CFI_STARTPROC \simple - CFI_UNDEFINED r8 - CFI_UNDEFINED r9 - CFI_UNDEFINED r10 - CFI_UNDEFINED r11 - CFI_UNDEFINED r12 - CFI_UNDEFINED r13 - CFI_UNDEFINED r14 - CFI_UNDEFINED r15 - .endm - -/* - * 32bit SYSENTER instruction entry. - * - * Arguments: - * %eax System call number. - * %ebx Arg1 - * %ecx Arg2 - * %edx Arg3 - * %esi Arg4 - * %edi Arg5 - * %ebp user stack - * 0(%ebp) Arg6 - * - * Interrupts off. - * - * This is purely a fast path. For anything complicated we use the int 0x80 - * path below. Set up a complete hardware stack frame to share code - * with the int 0x80 path. - */ -ENTRY(ia32_sysenter_target) - CFI_STARTPROC32 simple - CFI_SIGNAL_FRAME - CFI_DEF_CFA rsp,0 - CFI_REGISTER rsp,rbp - swapgs - movq %gs:pda_kernelstack, %rsp - addq $(PDA_STACKOFFSET),%rsp - /* - * No need to follow this irqs on/off section: the syscall - * disabled irqs, here we enable it straight after entry: - */ - sti - movl %ebp,%ebp /* zero extension */ - pushq $__USER32_DS - CFI_ADJUST_CFA_OFFSET 8 - /*CFI_REL_OFFSET ss,0*/ - pushq %rbp - CFI_ADJUST_CFA_OFFSET 8 - CFI_REL_OFFSET rsp,0 - pushfq - CFI_ADJUST_CFA_OFFSET 8 - /*CFI_REL_OFFSET rflags,0*/ - movl $VSYSCALL32_SYSEXIT, %r10d - CFI_REGISTER rip,r10 - pushq $__USER32_CS - CFI_ADJUST_CFA_OFFSET 8 - /*CFI_REL_OFFSET cs,0*/ - movl %eax, %eax - pushq %r10 - CFI_ADJUST_CFA_OFFSET 8 - CFI_REL_OFFSET rip,0 - pushq %rax - CFI_ADJUST_CFA_OFFSET 8 - cld - SAVE_ARGS 0,0,1 - /* no need to do an access_ok check here because rbp has been - 32bit zero extended */ -1: movl (%rbp),%r9d - .section __ex_table,"a" - .quad 1b,ia32_badarg - .previous - GET_THREAD_INFO(%r10) - orl $TS_COMPAT,threadinfo_status(%r10) - testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),threadinfo_flags(%r10) - CFI_REMEMBER_STATE - jnz sysenter_tracesys -sysenter_do_call: - cmpl $(IA32_NR_syscalls-1),%eax - ja ia32_badsys - IA32_ARG_FIXUP 1 - call *ia32_sys_call_table(,%rax,8) - movq %rax,RAX-ARGOFFSET(%rsp) - GET_THREAD_INFO(%r10) - cli - TRACE_IRQS_OFF - testl $_TIF_ALLWORK_MASK,threadinfo_flags(%r10) - jnz int_ret_from_sys_call - andl $~TS_COMPAT,threadinfo_status(%r10) - /* clear IF, that popfq doesn't enable interrupts early */ - andl $~0x200,EFLAGS-R11(%rsp) - RESTORE_ARGS 1,24,1,1,1,1 - popfq - CFI_ADJUST_CFA_OFFSET -8 - /*CFI_RESTORE rflags*/ - popq %rcx /* User %esp */ - CFI_ADJUST_CFA_OFFSET -8 - CFI_REGISTER rsp,rcx - movl $VSYSCALL32_SYSEXIT,%edx /* User %eip */ - CFI_REGISTER rip,rdx - TRACE_IRQS_ON - swapgs - sti /* sti only takes effect after the next instruction */ - /* sysexit */ - .byte 0xf, 0x35 - -sysenter_tracesys: - CFI_RESTORE_STATE - SAVE_REST - CLEAR_RREGS - movq $-ENOSYS,RAX(%rsp) /* really needed? */ - movq %rsp,%rdi /* &pt_regs -> arg1 */ - call syscall_trace_enter - LOAD_ARGS32 ARGOFFSET /* reload args from stack in case ptrace changed it */ - RESTORE_REST - movl %ebp, %ebp - /* no need to do an access_ok check here because rbp has been - 32bit zero extended */ -1: movl (%rbp),%r9d - .section __ex_table,"a" - .quad 1b,ia32_badarg - .previous - jmp sysenter_do_call - CFI_ENDPROC -ENDPROC(ia32_sysenter_target) - -/* - * 32bit SYSCALL instruction entry. - * - * Arguments: - * %eax System call number. - * %ebx Arg1 - * %ecx return EIP - * %edx Arg3 - * %esi Arg4 - * %edi Arg5 - * %ebp Arg2 [note: not saved in the stack frame, should not be touched] - * %esp user stack - * 0(%esp) Arg6 - * - * Interrupts off. - * - * This is purely a fast path. For anything complicated we use the int 0x80 - * path below. Set up a complete hardware stack frame to share code - * with the int 0x80 path. - */ -ENTRY(ia32_cstar_target) - CFI_STARTPROC32 simple - CFI_SIGNAL_FRAME - CFI_DEF_CFA rsp,PDA_STACKOFFSET - CFI_REGISTER rip,rcx - /*CFI_REGISTER rflags,r11*/ - swapgs - movl %esp,%r8d - CFI_REGISTER rsp,r8 - movq %gs:pda_kernelstack,%rsp - /* - * No need to follow this irqs on/off section: the syscall - * disabled irqs and here we enable it straight after entry: - */ - sti - SAVE_ARGS 8,1,1 - movl %eax,%eax /* zero extension */ - movq %rax,ORIG_RAX-ARGOFFSET(%rsp) - movq %rcx,RIP-ARGOFFSET(%rsp) - CFI_REL_OFFSET rip,RIP-ARGOFFSET - movq %rbp,RCX-ARGOFFSET(%rsp) /* this lies slightly to ptrace */ - movl %ebp,%ecx - movq $__USER32_CS,CS-ARGOFFSET(%rsp) - movq $__USER32_DS,SS-ARGOFFSET(%rsp) - movq %r11,EFLAGS-ARGOFFSET(%rsp) - /*CFI_REL_OFFSET rflags,EFLAGS-ARGOFFSET*/ - movq %r8,RSP-ARGOFFSET(%rsp) - CFI_REL_OFFSET rsp,RSP-ARGOFFSET - /* no need to do an access_ok check here because r8 has been - 32bit zero extended */ - /* hardware stack frame is complete now */ -1: movl (%r8),%r9d - .section __ex_table,"a" - .quad 1b,ia32_badarg - .previous - GET_THREAD_INFO(%r10) - orl $TS_COMPAT,threadinfo_status(%r10) - testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),threadinfo_flags(%r10) - CFI_REMEMBER_STATE - jnz cstar_tracesys -cstar_do_call: - cmpl $IA32_NR_syscalls-1,%eax - ja ia32_badsys - IA32_ARG_FIXUP 1 - call *ia32_sys_call_table(,%rax,8) - movq %rax,RAX-ARGOFFSET(%rsp) - GET_THREAD_INFO(%r10) - cli - TRACE_IRQS_OFF - testl $_TIF_ALLWORK_MASK,threadinfo_flags(%r10) - jnz int_ret_from_sys_call - andl $~TS_COMPAT,threadinfo_status(%r10) - RESTORE_ARGS 1,-ARG_SKIP,1,1,1 - movl RIP-ARGOFFSET(%rsp),%ecx - CFI_REGISTER rip,rcx - movl EFLAGS-ARGOFFSET(%rsp),%r11d - /*CFI_REGISTER rflags,r11*/ - TRACE_IRQS_ON - movl RSP-ARGOFFSET(%rsp),%esp - CFI_RESTORE rsp - swapgs - sysretl - -cstar_tracesys: - CFI_RESTORE_STATE - SAVE_REST - CLEAR_RREGS - movq $-ENOSYS,RAX(%rsp) /* really needed? */ - movq %rsp,%rdi /* &pt_regs -> arg1 */ - call syscall_trace_enter - LOAD_ARGS32 ARGOFFSET /* reload args from stack in case ptrace changed it */ - RESTORE_REST - movl RSP-ARGOFFSET(%rsp), %r8d - /* no need to do an access_ok check here because r8 has been - 32bit zero extended */ -1: movl (%r8),%r9d - .section __ex_table,"a" - .quad 1b,ia32_badarg - .previous - jmp cstar_do_call -END(ia32_cstar_target) - -ia32_badarg: - movq $-EFAULT,%rax - jmp ia32_sysret - CFI_ENDPROC - -/* - * Emulated IA32 system calls via int 0x80. - * - * Arguments: - * %eax System call number. - * %ebx Arg1 - * %ecx Arg2 - * %edx Arg3 - * %esi Arg4 - * %edi Arg5 - * %ebp Arg6 [note: not saved in the stack frame, should not be touched] - * - * Notes: - * Uses the same stack frame as the x86-64 version. - * All registers except %eax must be saved (but ptrace may violate that) - * Arguments are zero extended. For system calls that want sign extension and - * take long arguments a wrapper is needed. Most calls can just be called - * directly. - * Assumes it is only called from user space and entered with interrupts off. - */ - -ENTRY(ia32_syscall) - CFI_STARTPROC32 simple - CFI_SIGNAL_FRAME - CFI_DEF_CFA rsp,SS+8-RIP - /*CFI_REL_OFFSET ss,SS-RIP*/ - CFI_REL_OFFSET rsp,RSP-RIP - /*CFI_REL_OFFSET rflags,EFLAGS-RIP*/ - /*CFI_REL_OFFSET cs,CS-RIP*/ - CFI_REL_OFFSET rip,RIP-RIP - swapgs - /* - * No need to follow this irqs on/off section: the syscall - * disabled irqs and here we enable it straight after entry: - */ - sti - movl %eax,%eax - pushq %rax - CFI_ADJUST_CFA_OFFSET 8 - cld - /* note the registers are not zero extended to the sf. - this could be a problem. */ - SAVE_ARGS 0,0,1 - GET_THREAD_INFO(%r10) - orl $TS_COMPAT,threadinfo_status(%r10) - testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),threadinfo_flags(%r10) - jnz ia32_tracesys -ia32_do_syscall: - cmpl $(IA32_NR_syscalls-1),%eax - ja ia32_badsys - IA32_ARG_FIXUP - call *ia32_sys_call_table(,%rax,8) # xxx: rip relative -ia32_sysret: - movq %rax,RAX-ARGOFFSET(%rsp) - jmp int_ret_from_sys_call - -ia32_tracesys: - SAVE_REST - CLEAR_RREGS - movq $-ENOSYS,RAX(%rsp) /* really needed? */ - movq %rsp,%rdi /* &pt_regs -> arg1 */ - call syscall_trace_enter - LOAD_ARGS32 ARGOFFSET /* reload args from stack in case ptrace changed it */ - RESTORE_REST - jmp ia32_do_syscall -END(ia32_syscall) - -ia32_badsys: - movq $0,ORIG_RAX-ARGOFFSET(%rsp) - movq $-ENOSYS,RAX-ARGOFFSET(%rsp) - jmp int_ret_from_sys_call - -quiet_ni_syscall: - movq $-ENOSYS,%rax - ret - CFI_ENDPROC - - .macro PTREGSCALL label, func, arg - .globl \label -\label: - leaq \func(%rip),%rax - leaq -ARGOFFSET+8(%rsp),\arg /* 8 for return address */ - jmp ia32_ptregs_common - .endm - - CFI_STARTPROC32 - - PTREGSCALL stub32_rt_sigreturn, sys32_rt_sigreturn, %rdi - PTREGSCALL stub32_sigreturn, sys32_sigreturn, %rdi - PTREGSCALL stub32_sigaltstack, sys32_sigaltstack, %rdx - PTREGSCALL stub32_sigsuspend, sys32_sigsuspend, %rcx - PTREGSCALL stub32_execve, sys32_execve, %rcx - PTREGSCALL stub32_fork, sys_fork, %rdi - PTREGSCALL stub32_clone, sys32_clone, %rdx - PTREGSCALL stub32_vfork, sys_vfork, %rdi - PTREGSCALL stub32_iopl, sys_iopl, %rsi - PTREGSCALL stub32_rt_sigsuspend, sys_rt_sigsuspend, %rdx - -ENTRY(ia32_ptregs_common) - popq %r11 - CFI_ENDPROC - CFI_STARTPROC32 simple - CFI_SIGNAL_FRAME - CFI_DEF_CFA rsp,SS+8-ARGOFFSET - CFI_REL_OFFSET rax,RAX-ARGOFFSET - CFI_REL_OFFSET rcx,RCX-ARGOFFSET - CFI_REL_OFFSET rdx,RDX-ARGOFFSET - CFI_REL_OFFSET rsi,RSI-ARGOFFSET - CFI_REL_OFFSET rdi,RDI-ARGOFFSET - CFI_REL_OFFSET rip,RIP-ARGOFFSET -/* CFI_REL_OFFSET cs,CS-ARGOFFSET*/ -/* CFI_REL_OFFSET rflags,EFLAGS-ARGOFFSET*/ - CFI_REL_OFFSET rsp,RSP-ARGOFFSET -/* CFI_REL_OFFSET ss,SS-ARGOFFSET*/ - SAVE_REST - call *%rax - RESTORE_REST - jmp ia32_sysret /* misbalances the return cache */ - CFI_ENDPROC -END(ia32_ptregs_common) - - .section .rodata,"a" - .align 8 -ia32_sys_call_table: - .quad sys_restart_syscall - .quad sys_exit - .quad stub32_fork - .quad sys_read - .quad sys_write - .quad compat_sys_open /* 5 */ - .quad sys_close - .quad sys32_waitpid - .quad sys_creat - .quad sys_link - .quad sys_unlink /* 10 */ - .quad stub32_execve - .quad sys_chdir - .quad compat_sys_time - .quad sys_mknod - .quad sys_chmod /* 15 */ - .quad sys_lchown16 - .quad quiet_ni_syscall /* old break syscall holder */ - .quad sys_stat - .quad sys32_lseek - .quad sys_getpid /* 20 */ - .quad compat_sys_mount /* mount */ - .quad sys_oldumount /* old_umount */ - .quad sys_setuid16 - .quad sys_getuid16 - .quad compat_sys_stime /* stime */ /* 25 */ - .quad sys32_ptrace /* ptrace */ - .quad sys_alarm - .quad sys_fstat /* (old)fstat */ - .quad sys_pause - .quad compat_sys_utime /* 30 */ - .quad quiet_ni_syscall /* old stty syscall holder */ - .quad quiet_ni_syscall /* old gtty syscall holder */ - .quad sys_access - .quad sys_nice - .quad quiet_ni_syscall /* 35 */ /* old ftime syscall holder */ - .quad sys_sync - .quad sys32_kill - .quad sys_rename - .quad sys_mkdir - .quad sys_rmdir /* 40 */ - .quad sys_dup - .quad sys32_pipe - .quad compat_sys_times - .quad quiet_ni_syscall /* old prof syscall holder */ - .quad sys_brk /* 45 */ - .quad sys_setgid16 - .quad sys_getgid16 - .quad sys_signal - .quad sys_geteuid16 - .quad sys_getegid16 /* 50 */ - .quad sys_acct - .quad sys_umount /* new_umount */ - .quad quiet_ni_syscall /* old lock syscall holder */ - .quad compat_sys_ioctl - .quad compat_sys_fcntl64 /* 55 */ - .quad quiet_ni_syscall /* old mpx syscall holder */ - .quad sys_setpgid - .quad quiet_ni_syscall /* old ulimit syscall holder */ - .quad sys32_olduname - .quad sys_umask /* 60 */ - .quad sys_chroot - .quad sys32_ustat - .quad sys_dup2 - .quad sys_getppid - .quad sys_getpgrp /* 65 */ - .quad sys_setsid - .quad sys32_sigaction - .quad sys_sgetmask - .quad sys_ssetmask - .quad sys_setreuid16 /* 70 */ - .quad sys_setregid16 - .quad stub32_sigsuspend - .quad compat_sys_sigpending - .quad sys_sethostname - .quad compat_sys_setrlimit /* 75 */ - .quad compat_sys_old_getrlimit /* old_getrlimit */ - .quad compat_sys_getrusage - .quad sys32_gettimeofday - .quad sys32_settimeofday - .quad sys_getgroups16 /* 80 */ - .quad sys_setgroups16 - .quad sys32_old_select - .quad sys_symlink - .quad sys_lstat - .quad sys_readlink /* 85 */ - .quad sys_uselib - .quad sys_swapon - .quad sys_reboot - .quad compat_sys_old_readdir - .quad sys32_mmap /* 90 */ - .quad sys_munmap - .quad sys_truncate - .quad sys_ftruncate - .quad sys_fchmod - .quad sys_fchown16 /* 95 */ - .quad sys_getpriority - .quad sys_setpriority - .quad quiet_ni_syscall /* old profil syscall holder */ - .quad compat_sys_statfs - .quad compat_sys_fstatfs /* 100 */ - .quad sys_ioperm - .quad compat_sys_socketcall - .quad sys_syslog - .quad compat_sys_setitimer - .quad compat_sys_getitimer /* 105 */ - .quad compat_sys_newstat - .quad compat_sys_newlstat - .quad compat_sys_newfstat - .quad sys32_uname - .quad stub32_iopl /* 110 */ - .quad sys_vhangup - .quad quiet_ni_syscall /* old "idle" system call */ - .quad sys32_vm86_warning /* vm86old */ - .quad compat_sys_wait4 - .quad sys_swapoff /* 115 */ - .quad compat_sys_sysinfo - .quad sys32_ipc - .quad sys_fsync - .quad stub32_sigreturn - .quad stub32_clone /* 120 */ - .quad sys_setdomainname - .quad sys_uname - .quad sys_modify_ldt - .quad compat_sys_adjtimex - .quad sys32_mprotect /* 125 */ - .quad compat_sys_sigprocmask - .quad quiet_ni_syscall /* create_module */ - .quad sys_init_module - .quad sys_delete_module - .quad quiet_ni_syscall /* 130 get_kernel_syms */ - .quad sys32_quotactl - .quad sys_getpgid - .quad sys_fchdir - .quad quiet_ni_syscall /* bdflush */ - .quad sys_sysfs /* 135 */ - .quad sys_personality - .quad quiet_ni_syscall /* for afs_syscall */ - .quad sys_setfsuid16 - .quad sys_setfsgid16 - .quad sys_llseek /* 140 */ - .quad compat_sys_getdents - .quad compat_sys_select - .quad sys_flock - .quad sys_msync - .quad compat_sys_readv /* 145 */ - .quad compat_sys_writev - .quad sys_getsid - .quad sys_fdatasync - .quad sys32_sysctl /* sysctl */ - .quad sys_mlock /* 150 */ - .quad sys_munlock - .quad sys_mlockall - .quad sys_munlockall - .quad sys_sched_setparam - .quad sys_sched_getparam /* 155 */ - .quad sys_sched_setscheduler - .quad sys_sched_getscheduler - .quad sys_sched_yield - .quad sys_sched_get_priority_max - .quad sys_sched_get_priority_min /* 160 */ - .quad sys32_sched_rr_get_interval - .quad compat_sys_nanosleep - .quad sys_mremap - .quad sys_setresuid16 - .quad sys_getresuid16 /* 165 */ - .quad sys32_vm86_warning /* vm86 */ - .quad quiet_ni_syscall /* query_module */ - .quad sys_poll - .quad compat_sys_nfsservctl - .quad sys_setresgid16 /* 170 */ - .quad sys_getresgid16 - .quad sys_prctl - .quad stub32_rt_sigreturn - .quad sys32_rt_sigaction - .quad sys32_rt_sigprocmask /* 175 */ - .quad sys32_rt_sigpending - .quad compat_sys_rt_sigtimedwait - .quad sys32_rt_sigqueueinfo - .quad stub32_rt_sigsuspend - .quad sys32_pread /* 180 */ - .quad sys32_pwrite - .quad sys_chown16 - .quad sys_getcwd - .quad sys_capget - .quad sys_capset - .quad stub32_sigaltstack - .quad sys32_sendfile - .quad quiet_ni_syscall /* streams1 */ - .quad quiet_ni_syscall /* streams2 */ - .quad stub32_vfork /* 190 */ - .quad compat_sys_getrlimit - .quad sys32_mmap2 - .quad sys32_truncate64 - .quad sys32_ftruncate64 - .quad sys32_stat64 /* 195 */ - .quad sys32_lstat64 - .quad sys32_fstat64 - .quad sys_lchown - .quad sys_getuid - .quad sys_getgid /* 200 */ - .quad sys_geteuid - .quad sys_getegid - .quad sys_setreuid - .quad sys_setregid - .quad sys_getgroups /* 205 */ - .quad sys_setgroups - .quad sys_fchown - .quad sys_setresuid - .quad sys_getresuid - .quad sys_setresgid /* 210 */ - .quad sys_getresgid - .quad sys_chown - .quad sys_setuid - .quad sys_setgid - .quad sys_setfsuid /* 215 */ - .quad sys_setfsgid - .quad sys_pivot_root - .quad sys_mincore - .quad sys_madvise - .quad compat_sys_getdents64 /* 220 getdents64 */ - .quad compat_sys_fcntl64 - .quad quiet_ni_syscall /* tux */ - .quad quiet_ni_syscall /* security */ - .quad sys_gettid - .quad sys32_readahead /* 225 */ - .quad sys_setxattr - .quad sys_lsetxattr - .quad sys_fsetxattr - .quad sys_getxattr - .quad sys_lgetxattr /* 230 */ - .quad sys_fgetxattr - .quad sys_listxattr - .quad sys_llistxattr - .quad sys_flistxattr - .quad sys_removexattr /* 235 */ - .quad sys_lremovexattr - .quad sys_fremovexattr - .quad sys_tkill - .quad sys_sendfile64 - .quad compat_sys_futex /* 240 */ - .quad compat_sys_sched_setaffinity - .quad compat_sys_sched_getaffinity - .quad sys32_set_thread_area - .quad sys32_get_thread_area - .quad compat_sys_io_setup /* 245 */ - .quad sys_io_destroy - .quad compat_sys_io_getevents - .quad compat_sys_io_submit - .quad sys_io_cancel - .quad sys32_fadvise64 /* 250 */ - .quad quiet_ni_syscall /* free_huge_pages */ - .quad sys_exit_group - .quad sys32_lookup_dcookie - .quad sys_epoll_create - .quad sys_epoll_ctl /* 255 */ - .quad sys_epoll_wait - .quad sys_remap_file_pages - .quad sys_set_tid_address - .quad compat_sys_timer_create - .quad compat_sys_timer_settime /* 260 */ - .quad compat_sys_timer_gettime - .quad sys_timer_getoverrun - .quad sys_timer_delete - .quad compat_sys_clock_settime - .quad compat_sys_clock_gettime /* 265 */ - .quad compat_sys_clock_getres - .quad compat_sys_clock_nanosleep - .quad compat_sys_statfs64 - .quad compat_sys_fstatfs64 - .quad sys_tgkill /* 270 */ - .quad compat_sys_utimes - .quad sys32_fadvise64_64 - .quad quiet_ni_syscall /* sys_vserver */ - .quad sys_mbind - .quad compat_sys_get_mempolicy /* 275 */ - .quad sys_set_mempolicy - .quad compat_sys_mq_open - .quad sys_mq_unlink - .quad compat_sys_mq_timedsend - .quad compat_sys_mq_timedreceive /* 280 */ - .quad compat_sys_mq_notify - .quad compat_sys_mq_getsetattr - .quad compat_sys_kexec_load /* reserved for kexec */ - .quad compat_sys_waitid - .quad quiet_ni_syscall /* 285: sys_altroot */ - .quad sys_add_key - .quad sys_request_key - .quad sys_keyctl - .quad sys_ioprio_set - .quad sys_ioprio_get /* 290 */ - .quad sys_inotify_init - .quad sys_inotify_add_watch - .quad sys_inotify_rm_watch - .quad sys_migrate_pages - .quad compat_sys_openat /* 295 */ - .quad sys_mkdirat - .quad sys_mknodat - .quad sys_fchownat - .quad compat_sys_futimesat - .quad sys32_fstatat /* 300 */ - .quad sys_unlinkat - .quad sys_renameat - .quad sys_linkat - .quad sys_symlinkat - .quad sys_readlinkat /* 305 */ - .quad sys_fchmodat - .quad sys_faccessat - .quad compat_sys_pselect6 - .quad compat_sys_ppoll - .quad sys_unshare /* 310 */ - .quad compat_sys_set_robust_list - .quad compat_sys_get_robust_list - .quad sys_splice - .quad sys32_sync_file_range - .quad sys_tee /* 315 */ - .quad compat_sys_vmsplice - .quad compat_sys_move_pages - .quad sys_getcpu - .quad sys_epoll_pwait - .quad compat_sys_utimensat /* 320 */ - .quad compat_sys_signalfd - .quad compat_sys_timerfd - .quad sys_eventfd - .quad sys32_fallocate -ia32_syscall_end: diff --git a/arch/x86_64/ia32/ipc32.c b/arch/x86_64/ia32/ipc32.c deleted file mode 100644 index 369151dc3213..000000000000 --- a/arch/x86_64/ia32/ipc32.c +++ /dev/null @@ -1,57 +0,0 @@ -#include <linux/kernel.h> -#include <linux/spinlock.h> -#include <linux/list.h> -#include <linux/syscalls.h> -#include <linux/time.h> -#include <linux/sem.h> -#include <linux/msg.h> -#include <linux/shm.h> -#include <linux/ipc.h> -#include <linux/compat.h> - -#include <asm-i386/ipc.h> - -asmlinkage long -sys32_ipc(u32 call, int first, int second, int third, - compat_uptr_t ptr, u32 fifth) -{ - int version; - - version = call >> 16; /* hack for backward compatibility */ - call &= 0xffff; - - switch (call) { - case SEMOP: - /* struct sembuf is the same on 32 and 64bit :)) */ - return sys_semtimedop(first, compat_ptr(ptr), second, NULL); - case SEMTIMEDOP: - return compat_sys_semtimedop(first, compat_ptr(ptr), second, - compat_ptr(fifth)); - case SEMGET: - return sys_semget(first, second, third); - case SEMCTL: - return compat_sys_semctl(first, second, third, compat_ptr(ptr)); - - case MSGSND: - return compat_sys_msgsnd(first, second, third, compat_ptr(ptr)); - case MSGRCV: - return compat_sys_msgrcv(first, second, fifth, third, - version, compat_ptr(ptr)); - case MSGGET: - return sys_msgget((key_t) first, second); - case MSGCTL: - return compat_sys_msgctl(first, second, compat_ptr(ptr)); - - case SHMAT: - return compat_sys_shmat(first, second, third, version, - compat_ptr(ptr)); - break; - case SHMDT: - return sys_shmdt(compat_ptr(ptr)); - case SHMGET: - return sys_shmget(first, (unsigned)second, third); - case SHMCTL: - return compat_sys_shmctl(first, second, compat_ptr(ptr)); - } - return -ENOSYS; -} diff --git a/arch/x86_64/ia32/mmap32.c b/arch/x86_64/ia32/mmap32.c deleted file mode 100644 index e4b84b4a417a..000000000000 --- a/arch/x86_64/ia32/mmap32.c +++ /dev/null @@ -1,79 +0,0 @@ -/* - * linux/arch/x86_64/ia32/mm/mmap.c - * - * flexible mmap layout support - * - * Based on the i386 version which was - * - * Copyright 2003-2004 Red Hat Inc., Durham, North Carolina. - * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - * - * - * Started by Ingo Molnar <mingo@elte.hu> - */ - -#include <linux/personality.h> -#include <linux/mm.h> -#include <linux/random.h> -#include <linux/sched.h> - -/* - * Top of mmap area (just below the process stack). - * - * Leave an at least ~128 MB hole. - */ -#define MIN_GAP (128*1024*1024) -#define MAX_GAP (TASK_SIZE/6*5) - -static inline unsigned long mmap_base(struct mm_struct *mm) -{ - unsigned long gap = current->signal->rlim[RLIMIT_STACK].rlim_cur; - unsigned long random_factor = 0; - - if (current->flags & PF_RANDOMIZE) - random_factor = get_random_int() % (1024*1024); - - if (gap < MIN_GAP) - gap = MIN_GAP; - else if (gap > MAX_GAP) - gap = MAX_GAP; - - return PAGE_ALIGN(TASK_SIZE - gap - random_factor); -} - -/* - * This function, called very early during the creation of a new - * process VM image, sets up which VM layout function to use: - */ -void ia32_pick_mmap_layout(struct mm_struct *mm) -{ - /* - * Fall back to the standard layout if the personality - * bit is set, or if the expected stack growth is unlimited: - */ - if (sysctl_legacy_va_layout || - (current->personality & ADDR_COMPAT_LAYOUT) || - current->signal->rlim[RLIMIT_STACK].rlim_cur == RLIM_INFINITY) { - mm->mmap_base = TASK_UNMAPPED_BASE; - mm->get_unmapped_area = arch_get_unmapped_area; - mm->unmap_area = arch_unmap_area; - } else { - mm->mmap_base = mmap_base(mm); - mm->get_unmapped_area = arch_get_unmapped_area_topdown; - mm->unmap_area = arch_unmap_area_topdown; - } -} diff --git a/arch/x86_64/ia32/ptrace32.c b/arch/x86_64/ia32/ptrace32.c deleted file mode 100644 index 4a233ad6269c..000000000000 --- a/arch/x86_64/ia32/ptrace32.c +++ /dev/null @@ -1,404 +0,0 @@ -/* - * 32bit ptrace for x86-64. - * - * Copyright 2001,2002 Andi Kleen, SuSE Labs. - * Some parts copied from arch/i386/kernel/ptrace.c. See that file for earlier - * copyright. - * - * This allows to access 64bit processes too; but there is no way to see the extended - * register contents. - */ - -#include <linux/kernel.h> -#include <linux/stddef.h> -#include <linux/sched.h> -#include <linux/syscalls.h> -#include <linux/unistd.h> -#include <linux/mm.h> -#include <linux/err.h> -#include <linux/ptrace.h> -#include <asm/ptrace.h> -#include <asm/compat.h> -#include <asm/uaccess.h> -#include <asm/user32.h> -#include <asm/user.h> -#include <asm/errno.h> -#include <asm/debugreg.h> -#include <asm/i387.h> -#include <asm/fpu32.h> -#include <asm/ia32.h> - -/* - * Determines which flags the user has access to [1 = access, 0 = no access]. - * Prohibits changing ID(21), VIP(20), VIF(19), VM(17), IOPL(12-13), IF(9). - * Also masks reserved bits (31-22, 15, 5, 3, 1). - */ -#define FLAG_MASK 0x54dd5UL - -#define R32(l,q) \ - case offsetof(struct user32, regs.l): stack[offsetof(struct pt_regs, q)/8] = val; break - -static int putreg32(struct task_struct *child, unsigned regno, u32 val) -{ - int i; - __u64 *stack = (__u64 *)task_pt_regs(child); - - switch (regno) { - case offsetof(struct user32, regs.fs): - if (val && (val & 3) != 3) return -EIO; - child->thread.fsindex = val & 0xffff; - break; - case offsetof(struct user32, regs.gs): - if (val && (val & 3) != 3) return -EIO; - child->thread.gsindex = val & 0xffff; - break; - case offsetof(struct user32, regs.ds): - if (val && (val & 3) != 3) return -EIO; - child->thread.ds = val & 0xffff; - break; - case offsetof(struct user32, regs.es): - child->thread.es = val & 0xffff; - break; - case offsetof(struct user32, regs.ss): - if ((val & 3) != 3) return -EIO; - stack[offsetof(struct pt_regs, ss)/8] = val & 0xffff; - break; - case offsetof(struct user32, regs.cs): - if ((val & 3) != 3) return -EIO; - stack[offsetof(struct pt_regs, cs)/8] = val & 0xffff; - break; - - R32(ebx, rbx); - R32(ecx, rcx); - R32(edx, rdx); - R32(edi, rdi); - R32(esi, rsi); - R32(ebp, rbp); - R32(eax, rax); - R32(orig_eax, orig_rax); - R32(eip, rip); - R32(esp, rsp); - - case offsetof(struct user32, regs.eflags): { - __u64 *flags = &stack[offsetof(struct pt_regs, eflags)/8]; - val &= FLAG_MASK; - *flags = val | (*flags & ~FLAG_MASK); - break; - } - - case offsetof(struct user32, u_debugreg[4]): - case offsetof(struct user32, u_debugreg[5]): - return -EIO; - - case offsetof(struct user32, u_debugreg[0]): - child->thread.debugreg0 = val; - break; - - case offsetof(struct user32, u_debugreg[1]): - child->thread.debugreg1 = val; - break; - - case offsetof(struct user32, u_debugreg[2]): - child->thread.debugreg2 = val; - break; - - case offsetof(struct user32, u_debugreg[3]): - child->thread.debugreg3 = val; - break; - - case offsetof(struct user32, u_debugreg[6]): - child->thread.debugreg6 = val; - break; - - case offsetof(struct user32, u_debugreg[7]): - val &= ~DR_CONTROL_RESERVED; - /* See arch/i386/kernel/ptrace.c for an explanation of - * this awkward check.*/ - for(i=0; i<4; i++) - if ((0x5454 >> ((val >> (16 + 4*i)) & 0xf)) & 1) - return -EIO; - child->thread.debugreg7 = val; - if (val) - set_tsk_thread_flag(child, TIF_DEBUG); - else - clear_tsk_thread_flag(child, TIF_DEBUG); - break; - - default: - if (regno > sizeof(struct user32) || (regno & 3)) - return -EIO; - - /* Other dummy fields in the virtual user structure are ignored */ - break; - } - return 0; -} - -#undef R32 - -#define R32(l,q) \ - case offsetof(struct user32, regs.l): *val = stack[offsetof(struct pt_regs, q)/8]; break - -static int getreg32(struct task_struct *child, unsigned regno, u32 *val) -{ - __u64 *stack = (__u64 *)task_pt_regs(child); - - switch (regno) { - case offsetof(struct user32, regs.fs): - *val = child->thread.fsindex; - break; - case offsetof(struct user32, regs.gs): - *val = child->thread.gsindex; - break; - case offsetof(struct user32, regs.ds): - *val = child->thread.ds; - break; - case offsetof(struct user32, regs.es): - *val = child->thread.es; - break; - - R32(cs, cs); - R32(ss, ss); - R32(ebx, rbx); - R32(ecx, rcx); - R32(edx, rdx); - R32(edi, rdi); - R32(esi, rsi); - R32(ebp, rbp); - R32(eax, rax); - R32(orig_eax, orig_rax); - R32(eip, rip); - R32(eflags, eflags); - R32(esp, rsp); - - case offsetof(struct user32, u_debugreg[0]): - *val = child->thread.debugreg0; - break; - case offsetof(struct user32, u_debugreg[1]): - *val = child->thread.debugreg1; - break; - case offsetof(struct user32, u_debugreg[2]): - *val = child->thread.debugreg2; - break; - case offsetof(struct user32, u_debugreg[3]): - *val = child->thread.debugreg3; - break; - case offsetof(struct user32, u_debugreg[6]): - *val = child->thread.debugreg6; - break; - case offsetof(struct user32, u_debugreg[7]): - *val = child->thread.debugreg7; - break; - - default: - if (regno > sizeof(struct user32) || (regno & 3)) - return -EIO; - - /* Other dummy fields in the virtual user structure are ignored */ - *val = 0; - break; - } - return 0; -} - -#undef R32 - -static long ptrace32_siginfo(unsigned request, u32 pid, u32 addr, u32 data) -{ - int ret; - compat_siginfo_t __user *si32 = compat_ptr(data); - siginfo_t ssi; - siginfo_t __user *si = compat_alloc_user_space(sizeof(siginfo_t)); - if (request == PTRACE_SETSIGINFO) { - memset(&ssi, 0, sizeof(siginfo_t)); - ret = copy_siginfo_from_user32(&ssi, si32); - if (ret) - return ret; - if (copy_to_user(si, &ssi, sizeof(siginfo_t))) - return -EFAULT; - } - ret = sys_ptrace(request, pid, addr, (unsigned long)si); - if (ret) - return ret; - if (request == PTRACE_GETSIGINFO) { - if (copy_from_user(&ssi, si, sizeof(siginfo_t))) - return -EFAULT; - ret = copy_siginfo_to_user32(si32, &ssi); - } - return ret; -} - -asmlinkage long sys32_ptrace(long request, u32 pid, u32 addr, u32 data) -{ - struct task_struct *child; - struct pt_regs *childregs; - void __user *datap = compat_ptr(data); - int ret; - __u32 val; - - switch (request) { - case PTRACE_TRACEME: - case PTRACE_ATTACH: - case PTRACE_KILL: - case PTRACE_CONT: - case PTRACE_SINGLESTEP: - case PTRACE_DETACH: - case PTRACE_SYSCALL: - case PTRACE_OLDSETOPTIONS: - case PTRACE_SETOPTIONS: - case PTRACE_SET_THREAD_AREA: - case PTRACE_GET_THREAD_AREA: - return sys_ptrace(request, pid, addr, data); - - default: - return -EINVAL; - - case PTRACE_PEEKTEXT: - case PTRACE_PEEKDATA: - case PTRACE_POKEDATA: - case PTRACE_POKETEXT: - case PTRACE_POKEUSR: - case PTRACE_PEEKUSR: - case PTRACE_GETREGS: - case PTRACE_SETREGS: - case PTRACE_SETFPREGS: - case PTRACE_GETFPREGS: - case PTRACE_SETFPXREGS: - case PTRACE_GETFPXREGS: - case PTRACE_GETEVENTMSG: - break; - - case PTRACE_SETSIGINFO: - case PTRACE_GETSIGINFO: - return ptrace32_siginfo(request, pid, addr, data); - } - - child = ptrace_get_task_struct(pid); - if (IS_ERR(child)) - return PTR_ERR(child); - - ret = ptrace_check_attach(child, request == PTRACE_KILL); - if (ret < 0) - goto out; - - childregs = task_pt_regs(child); - - switch (request) { - case PTRACE_PEEKDATA: - case PTRACE_PEEKTEXT: - ret = 0; - if (access_process_vm(child, addr, &val, sizeof(u32), 0)!=sizeof(u32)) - ret = -EIO; - else - ret = put_user(val, (unsigned int __user *)datap); - break; - - case PTRACE_POKEDATA: - case PTRACE_POKETEXT: - ret = 0; - if (access_process_vm(child, addr, &data, sizeof(u32), 1)!=sizeof(u32)) - ret = -EIO; - break; - - case PTRACE_PEEKUSR: - ret = getreg32(child, addr, &val); - if (ret == 0) - ret = put_user(val, (__u32 __user *)datap); - break; - - case PTRACE_POKEUSR: - ret = putreg32(child, addr, data); - break; - - case PTRACE_GETREGS: { /* Get all gp regs from the child. */ - int i; - if (!access_ok(VERIFY_WRITE, datap, 16*4)) { - ret = -EIO; - break; - } - ret = 0; - for ( i = 0; i <= 16*4 ; i += sizeof(__u32) ) { - getreg32(child, i, &val); - ret |= __put_user(val,(u32 __user *)datap); - datap += sizeof(u32); - } - break; - } - - case PTRACE_SETREGS: { /* Set all gp regs in the child. */ - unsigned long tmp; - int i; - if (!access_ok(VERIFY_READ, datap, 16*4)) { - ret = -EIO; - break; - } - ret = 0; - for ( i = 0; i <= 16*4; i += sizeof(u32) ) { - ret |= __get_user(tmp, (u32 __user *)datap); - putreg32(child, i, tmp); - datap += sizeof(u32); - } - break; - } - - case PTRACE_GETFPREGS: - ret = -EIO; - if (!access_ok(VERIFY_READ, compat_ptr(data), - sizeof(struct user_i387_struct))) - break; - save_i387_ia32(child, datap, childregs, 1); - ret = 0; - break; - - case PTRACE_SETFPREGS: - ret = -EIO; - if (!access_ok(VERIFY_WRITE, datap, - sizeof(struct user_i387_struct))) - break; - ret = 0; - /* don't check EFAULT to be bug-to-bug compatible to i386 */ - restore_i387_ia32(child, datap, 1); - break; - - case PTRACE_GETFPXREGS: { - struct user32_fxsr_struct __user *u = datap; - init_fpu(child); - ret = -EIO; - if (!access_ok(VERIFY_WRITE, u, sizeof(*u))) - break; - ret = -EFAULT; - if (__copy_to_user(u, &child->thread.i387.fxsave, sizeof(*u))) - break; - ret = __put_user(childregs->cs, &u->fcs); - ret |= __put_user(child->thread.ds, &u->fos); - break; - } - case PTRACE_SETFPXREGS: { - struct user32_fxsr_struct __user *u = datap; - unlazy_fpu(child); - ret = -EIO; - if (!access_ok(VERIFY_READ, u, sizeof(*u))) - break; - /* no checking to be bug-to-bug compatible with i386. */ - /* but silence warning */ - if (__copy_from_user(&child->thread.i387.fxsave, u, sizeof(*u))) - ; - set_stopped_child_used_math(child); - child->thread.i387.fxsave.mxcsr &= mxcsr_feature_mask; - ret = 0; - break; - } - - case PTRACE_GETEVENTMSG: - ret = put_user(child->ptrace_message,(unsigned int __user *)compat_ptr(data)); - break; - - default: - BUG(); - } - - out: - put_task_struct(child); - return ret; -} - diff --git a/arch/x86_64/ia32/sys_ia32.c b/arch/x86_64/ia32/sys_ia32.c deleted file mode 100644 index bee96d614432..000000000000 --- a/arch/x86_64/ia32/sys_ia32.c +++ /dev/null @@ -1,889 +0,0 @@ -/* - * sys_ia32.c: Conversion between 32bit and 64bit native syscalls. Based on - * sys_sparc32 - * - * Copyright (C) 2000 VA Linux Co - * Copyright (C) 2000 Don Dugger <n0ano@valinux.com> - * Copyright (C) 1999 Arun Sharma <arun.sharma@intel.com> - * Copyright (C) 1997,1998 Jakub Jelinek (jj@sunsite.mff.cuni.cz) - * Copyright (C) 1997 David S. Miller (davem@caip.rutgers.edu) - * Copyright (C) 2000 Hewlett-Packard Co. - * Copyright (C) 2000 David Mosberger-Tang <davidm@hpl.hp.com> - * Copyright (C) 2000,2001,2002 Andi Kleen, SuSE Labs (x86-64 port) - * - * These routines maintain argument size conversion between 32bit and 64bit - * environment. In 2.5 most of this should be moved to a generic directory. - * - * This file assumes that there is a hole at the end of user address space. - * - * Some of the functions are LE specific currently. These are hopefully all marked. - * This should be fixed. - */ - -#include <linux/kernel.h> -#include <linux/sched.h> -#include <linux/fs.h> -#include <linux/file.h> -#include <linux/signal.h> -#include <linux/syscalls.h> -#include <linux/resource.h> -#include <linux/times.h> -#include <linux/utsname.h> -#include <linux/smp.h> -#include <linux/smp_lock.h> -#include <linux/sem.h> -#include <linux/msg.h> -#include <linux/mm.h> -#include <linux/shm.h> -#include <linux/slab.h> -#include <linux/uio.h> -#include <linux/nfs_fs.h> -#include <linux/quota.h> -#include <linux/module.h> -#include <linux/sunrpc/svc.h> -#include <linux/nfsd/nfsd.h> -#include <linux/nfsd/cache.h> -#include <linux/nfsd/xdr.h> -#include <linux/nfsd/syscall.h> -#include <linux/poll.h> -#include <linux/personality.h> -#include <linux/stat.h> -#include <linux/ipc.h> -#include <linux/rwsem.h> -#include <linux/binfmts.h> -#include <linux/init.h> -#include <linux/aio_abi.h> -#include <linux/aio.h> -#include <linux/compat.h> -#include <linux/vfs.h> -#include <linux/ptrace.h> -#include <linux/highuid.h> -#include <linux/vmalloc.h> -#include <linux/fsnotify.h> -#include <linux/sysctl.h> -#include <asm/mman.h> -#include <asm/types.h> -#include <asm/uaccess.h> -#include <asm/semaphore.h> -#include <asm/atomic.h> -#include <asm/ldt.h> - -#include <net/scm.h> -#include <net/sock.h> -#include <asm/ia32.h> - -#define AA(__x) ((unsigned long)(__x)) - -int cp_compat_stat(struct kstat *kbuf, struct compat_stat __user *ubuf) -{ - compat_ino_t ino; - - typeof(ubuf->st_uid) uid = 0; - typeof(ubuf->st_gid) gid = 0; - SET_UID(uid, kbuf->uid); - SET_GID(gid, kbuf->gid); - if (!old_valid_dev(kbuf->dev) || !old_valid_dev(kbuf->rdev)) - return -EOVERFLOW; - if (kbuf->size >= 0x7fffffff) - return -EOVERFLOW; - ino = kbuf->ino; - if (sizeof(ino) < sizeof(kbuf->ino) && ino != kbuf->ino) - return -EOVERFLOW; - if (!access_ok(VERIFY_WRITE, ubuf, sizeof(struct compat_stat)) || - __put_user (old_encode_dev(kbuf->dev), &ubuf->st_dev) || - __put_user (ino, &ubuf->st_ino) || - __put_user (kbuf->mode, &ubuf->st_mode) || - __put_user (kbuf->nlink, &ubuf->st_nlink) || - __put_user (uid, &ubuf->st_uid) || - __put_user (gid, &ubuf->st_gid) || - __put_user (old_encode_dev(kbuf->rdev), &ubuf->st_rdev) || - __put_user (kbuf->size, &ubuf->st_size) || - __put_user (kbuf->atime.tv_sec, &ubuf->st_atime) || - __put_user (kbuf->atime.tv_nsec, &ubuf->st_atime_nsec) || - __put_user (kbuf->mtime.tv_sec, &ubuf->st_mtime) || - __put_user (kbuf->mtime.tv_nsec, &ubuf->st_mtime_nsec) || - __put_user (kbuf->ctime.tv_sec, &ubuf->st_ctime) || - __put_user (kbuf->ctime.tv_nsec, &ubuf->st_ctime_nsec) || - __put_user (kbuf->blksize, &ubuf->st_blksize) || - __put_user (kbuf->blocks, &ubuf->st_blocks)) - return -EFAULT; - return 0; -} - -asmlinkage long -sys32_truncate64(char __user * filename, unsigned long offset_low, unsigned long offset_high) -{ - return sys_truncate(filename, ((loff_t) offset_high << 32) | offset_low); -} - -asmlinkage long -sys32_ftruncate64(unsigned int fd, unsigned long offset_low, unsigned long offset_high) -{ - return sys_ftruncate(fd, ((loff_t) offset_high << 32) | offset_low); -} - -/* Another set for IA32/LFS -- x86_64 struct stat is different due to - support for 64bit inode numbers. */ - -static int -cp_stat64(struct stat64 __user *ubuf, struct kstat *stat) -{ - typeof(ubuf->st_uid) uid = 0; - typeof(ubuf->st_gid) gid = 0; - SET_UID(uid, stat->uid); - SET_GID(gid, stat->gid); - if (!access_ok(VERIFY_WRITE, ubuf, sizeof(struct stat64)) || - __put_user(huge_encode_dev(stat->dev), &ubuf->st_dev) || - __put_user (stat->ino, &ubuf->__st_ino) || - __put_user (stat->ino, &ubuf->st_ino) || - __put_user (stat->mode, &ubuf->st_mode) || - __put_user (stat->nlink, &ubuf->st_nlink) || - __put_user (uid, &ubuf->st_uid) || - __put_user (gid, &ubuf->st_gid) || - __put_user (huge_encode_dev(stat->rdev), &ubuf->st_rdev) || - __put_user (stat->size, &ubuf->st_size) || - __put_user (stat->atime.tv_sec, &ubuf->st_atime) || - __put_user (stat->atime.tv_nsec, &ubuf->st_atime_nsec) || - __put_user (stat->mtime.tv_sec, &ubuf->st_mtime) || - __put_user (stat->mtime.tv_nsec, &ubuf->st_mtime_nsec) || - __put_user (stat->ctime.tv_sec, &ubuf->st_ctime) || - __put_user (stat->ctime.tv_nsec, &ubuf->st_ctime_nsec) || - __put_user (stat->blksize, &ubuf->st_blksize) || - __put_user (stat->blocks, &ubuf->st_blocks)) - return -EFAULT; - return 0; -} - -asmlinkage long -sys32_stat64(char __user * filename, struct stat64 __user *statbuf) -{ - struct kstat stat; - int ret = vfs_stat(filename, &stat); - if (!ret) - ret = cp_stat64(statbuf, &stat); - return ret; -} - -asmlinkage long -sys32_lstat64(char __user * filename, struct stat64 __user *statbuf) -{ - struct kstat stat; - int ret = vfs_lstat(filename, &stat); - if (!ret) - ret = cp_stat64(statbuf, &stat); - return ret; -} - -asmlinkage long -sys32_fstat64(unsigned int fd, struct stat64 __user *statbuf) -{ - struct kstat stat; - int ret = vfs_fstat(fd, &stat); - if (!ret) - ret = cp_stat64(statbuf, &stat); - return ret; -} - -asmlinkage long -sys32_fstatat(unsigned int dfd, char __user *filename, - struct stat64 __user* statbuf, int flag) -{ - struct kstat stat; - int error = -EINVAL; - - if ((flag & ~AT_SYMLINK_NOFOLLOW) != 0) - goto out; - - if (flag & AT_SYMLINK_NOFOLLOW) - error = vfs_lstat_fd(dfd, filename, &stat); - else - error = vfs_stat_fd(dfd, filename, &stat); - - if (!error) - error = cp_stat64(statbuf, &stat); - -out: - return error; -} - -/* - * Linux/i386 didn't use to be able to handle more than - * 4 system call parameters, so these system calls used a memory - * block for parameter passing.. - */ - -struct mmap_arg_struct { - unsigned int addr; - unsigned int len; - unsigned int prot; - unsigned int flags; - unsigned int fd; - unsigned int offset; -}; - -asmlinkage long -sys32_mmap(struct mmap_arg_struct __user *arg) -{ - struct mmap_arg_struct a; - struct file *file = NULL; - unsigned long retval; - struct mm_struct *mm ; - - if (copy_from_user(&a, arg, sizeof(a))) - return -EFAULT; - - if (a.offset & ~PAGE_MASK) - return -EINVAL; - - if (!(a.flags & MAP_ANONYMOUS)) { - file = fget(a.fd); - if (!file) - return -EBADF; - } - - mm = current->mm; - down_write(&mm->mmap_sem); - retval = do_mmap_pgoff(file, a.addr, a.len, a.prot, a.flags, a.offset>>PAGE_SHIFT); - if (file) - fput(file); - - up_write(&mm->mmap_sem); - - return retval; -} - -asmlinkage long -sys32_mprotect(unsigned long start, size_t len, unsigned long prot) -{ - return sys_mprotect(start,len,prot); -} - -asmlinkage long -sys32_pipe(int __user *fd) -{ - int retval; - int fds[2]; - - retval = do_pipe(fds); - if (retval) - goto out; - if (copy_to_user(fd, fds, sizeof(fds))) - retval = -EFAULT; - out: - return retval; -} - -asmlinkage long -sys32_rt_sigaction(int sig, struct sigaction32 __user *act, - struct sigaction32 __user *oact, unsigned int sigsetsize) -{ - struct k_sigaction new_ka, old_ka; - int ret; - compat_sigset_t set32; - - /* XXX: Don't preclude handling different sized sigset_t's. */ - if (sigsetsize != sizeof(compat_sigset_t)) - return -EINVAL; - - if (act) { - compat_uptr_t handler, restorer; - - if (!access_ok(VERIFY_READ, act, sizeof(*act)) || - __get_user(handler, &act->sa_handler) || - __get_user(new_ka.sa.sa_flags, &act->sa_flags) || - __get_user(restorer, &act->sa_restorer)|| - __copy_from_user(&set32, &act->sa_mask, sizeof(compat_sigset_t))) - return -EFAULT; - new_ka.sa.sa_handler = compat_ptr(handler); - new_ka.sa.sa_restorer = compat_ptr(restorer); - /* FIXME: here we rely on _COMPAT_NSIG_WORS to be >= than _NSIG_WORDS << 1 */ - switch (_NSIG_WORDS) { - case 4: new_ka.sa.sa_mask.sig[3] = set32.sig[6] - | (((long)set32.sig[7]) << 32); - case 3: new_ka.sa.sa_mask.sig[2] = set32.sig[4] - | (((long)set32.sig[5]) << 32); - case 2: new_ka.sa.sa_mask.sig[1] = set32.sig[2] - | (((long)set32.sig[3]) << 32); - case 1: new_ka.sa.sa_mask.sig[0] = set32.sig[0] - | (((long)set32.sig[1]) << 32); - } - } - - ret = do_sigaction(sig, act ? &new_ka : NULL, oact ? &old_ka : NULL); - - if (!ret && oact) { - /* FIXME: here we rely on _COMPAT_NSIG_WORS to be >= than _NSIG_WORDS << 1 */ - switch (_NSIG_WORDS) { - case 4: - set32.sig[7] = (old_ka.sa.sa_mask.sig[3] >> 32); - set32.sig[6] = old_ka.sa.sa_mask.sig[3]; - case 3: - set32.sig[5] = (old_ka.sa.sa_mask.sig[2] >> 32); - set32.sig[4] = old_ka.sa.sa_mask.sig[2]; - case 2: - set32.sig[3] = (old_ka.sa.sa_mask.sig[1] >> 32); - set32.sig[2] = old_ka.sa.sa_mask.sig[1]; - case 1: - set32.sig[1] = (old_ka.sa.sa_mask.sig[0] >> 32); - set32.sig[0] = old_ka.sa.sa_mask.sig[0]; - } - if (!access_ok(VERIFY_WRITE, oact, sizeof(*oact)) || - __put_user(ptr_to_compat(old_ka.sa.sa_handler), &oact->sa_handler) || - __put_user(ptr_to_compat(old_ka.sa.sa_restorer), &oact->sa_restorer) || - __put_user(old_ka.sa.sa_flags, &oact->sa_flags) || - __copy_to_user(&oact->sa_mask, &set32, sizeof(compat_sigset_t))) - return -EFAULT; - } - - return ret; -} - -asmlinkage long -sys32_sigaction (int sig, struct old_sigaction32 __user *act, struct old_sigaction32 __user *oact) -{ - struct k_sigaction new_ka, old_ka; - int ret; - - if (act) { - compat_old_sigset_t mask; - compat_uptr_t handler, restorer; - - if (!access_ok(VERIFY_READ, act, sizeof(*act)) || - __get_user(handler, &act->sa_handler) || - __get_user(new_ka.sa.sa_flags, &act->sa_flags) || - __get_user(restorer, &act->sa_restorer) || - __get_user(mask, &act->sa_mask)) - return -EFAULT; - - new_ka.sa.sa_handler = compat_ptr(handler); - new_ka.sa.sa_restorer = compat_ptr(restorer); - - siginitset(&new_ka.sa.sa_mask, mask); - } - - ret = do_sigaction(sig, act ? &new_ka : NULL, oact ? &old_ka : NULL); - - if (!ret && oact) { - if (!access_ok(VERIFY_WRITE, oact, sizeof(*oact)) || - __put_user(ptr_to_compat(old_ka.sa.sa_handler), &oact->sa_handler) || - __put_user(ptr_to_compat(old_ka.sa.sa_restorer), &oact->sa_restorer) || - __put_user(old_ka.sa.sa_flags, &oact->sa_flags) || - __put_user(old_ka.sa.sa_mask.sig[0], &oact->sa_mask)) - return -EFAULT; - } - - return ret; -} - -asmlinkage long -sys32_rt_sigprocmask(int how, compat_sigset_t __user *set, - compat_sigset_t __user *oset, unsigned int sigsetsize) -{ - sigset_t s; - compat_sigset_t s32; - int ret; - mm_segment_t old_fs = get_fs(); - - if (set) { - if (copy_from_user (&s32, set, sizeof(compat_sigset_t))) - return -EFAULT; - switch (_NSIG_WORDS) { - case 4: s.sig[3] = s32.sig[6] | (((long)s32.sig[7]) << 32); - case 3: s.sig[2] = s32.sig[4] | (((long)s32.sig[5]) << 32); - case 2: s.sig[1] = s32.sig[2] | (((long)s32.sig[3]) << 32); - case 1: s.sig[0] = s32.sig[0] | (((long)s32.sig[1]) << 32); - } - } - set_fs (KERNEL_DS); - ret = sys_rt_sigprocmask(how, - set ? (sigset_t __user *)&s : NULL, - oset ? (sigset_t __user *)&s : NULL, - sigsetsize); - set_fs (old_fs); - if (ret) return ret; - if (oset) { - switch (_NSIG_WORDS) { - case 4: s32.sig[7] = (s.sig[3] >> 32); s32.sig[6] = s.sig[3]; - case 3: s32.sig[5] = (s.sig[2] >> 32); s32.sig[4] = s.sig[2]; - case 2: s32.sig[3] = (s.sig[1] >> 32); s32.sig[2] = s.sig[1]; - case 1: s32.sig[1] = (s.sig[0] >> 32); s32.sig[0] = s.sig[0]; - } - if (copy_to_user (oset, &s32, sizeof(compat_sigset_t))) - return -EFAULT; - } - return 0; -} - -static inline long -get_tv32(struct timeval *o, struct compat_timeval __user *i) -{ - int err = -EFAULT; - if (access_ok(VERIFY_READ, i, sizeof(*i))) { - err = __get_user(o->tv_sec, &i->tv_sec); - err |= __get_user(o->tv_usec, &i->tv_usec); - } - return err; -} - -static inline long -put_tv32(struct compat_timeval __user *o, struct timeval *i) -{ - int err = -EFAULT; - if (access_ok(VERIFY_WRITE, o, sizeof(*o))) { - err = __put_user(i->tv_sec, &o->tv_sec); - err |= __put_user(i->tv_usec, &o->tv_usec); - } - return err; -} - -extern unsigned int alarm_setitimer(unsigned int seconds); - -asmlinkage long -sys32_alarm(unsigned int seconds) -{ - return alarm_setitimer(seconds); -} - -/* Translations due to time_t size differences. Which affects all - sorts of things, like timeval and itimerval. */ - -extern struct timezone sys_tz; - -asmlinkage long -sys32_gettimeofday(struct compat_timeval __user *tv, struct timezone __user *tz) -{ - if (tv) { - struct timeval ktv; - do_gettimeofday(&ktv); - if (put_tv32(tv, &ktv)) - return -EFAULT; - } - if (tz) { - if (copy_to_user(tz, &sys_tz, sizeof(sys_tz))) - return -EFAULT; - } - return 0; -} - -asmlinkage long -sys32_settimeofday(struct compat_timeval __user *tv, struct timezone __user *tz) -{ - struct timeval ktv; - struct timespec kts; - struct timezone ktz; - - if (tv) { - if (get_tv32(&ktv, tv)) - return -EFAULT; - kts.tv_sec = ktv.tv_sec; - kts.tv_nsec = ktv.tv_usec * NSEC_PER_USEC; - } - if (tz) { - if (copy_from_user(&ktz, tz, sizeof(ktz))) - return -EFAULT; - } - - return do_sys_settimeofday(tv ? &kts : NULL, tz ? &ktz : NULL); -} - -struct sel_arg_struct { - unsigned int n; - unsigned int inp; - unsigned int outp; - unsigned int exp; - unsigned int tvp; -}; - -asmlinkage long -sys32_old_select(struct sel_arg_struct __user *arg) -{ - struct sel_arg_struct a; - - if (copy_from_user(&a, arg, sizeof(a))) - return -EFAULT; - return compat_sys_select(a.n, compat_ptr(a.inp), compat_ptr(a.outp), - compat_ptr(a.exp), compat_ptr(a.tvp)); -} - -extern asmlinkage long -compat_sys_wait4(compat_pid_t pid, compat_uint_t * stat_addr, int options, - struct compat_rusage *ru); - -asmlinkage long -sys32_waitpid(compat_pid_t pid, unsigned int *stat_addr, int options) -{ - return compat_sys_wait4(pid, stat_addr, options, NULL); -} - -/* 32-bit timeval and related flotsam. */ - -asmlinkage long -sys32_sysfs(int option, u32 arg1, u32 arg2) -{ - return sys_sysfs(option, arg1, arg2); -} - -asmlinkage long -sys32_sched_rr_get_interval(compat_pid_t pid, struct compat_timespec __user *interval) -{ - struct timespec t; - int ret; - mm_segment_t old_fs = get_fs (); - - set_fs (KERNEL_DS); - ret = sys_sched_rr_get_interval(pid, (struct timespec __user *)&t); - set_fs (old_fs); - if (put_compat_timespec(&t, interval)) - return -EFAULT; - return ret; -} - -asmlinkage long -sys32_rt_sigpending(compat_sigset_t __user *set, compat_size_t sigsetsize) -{ - sigset_t s; - compat_sigset_t s32; - int ret; - mm_segment_t old_fs = get_fs(); - - set_fs (KERNEL_DS); - ret = sys_rt_sigpending((sigset_t __user *)&s, sigsetsize); - set_fs (old_fs); - if (!ret) { - switch (_NSIG_WORDS) { - case 4: s32.sig[7] = (s.sig[3] >> 32); s32.sig[6] = s.sig[3]; - case 3: s32.sig[5] = (s.sig[2] >> 32); s32.sig[4] = s.sig[2]; - case 2: s32.sig[3] = (s.sig[1] >> 32); s32.sig[2] = s.sig[1]; - case 1: s32.sig[1] = (s.sig[0] >> 32); s32.sig[0] = s.sig[0]; - } - if (copy_to_user (set, &s32, sizeof(compat_sigset_t))) - return -EFAULT; - } - return ret; -} - -asmlinkage long -sys32_rt_sigqueueinfo(int pid, int sig, compat_siginfo_t __user *uinfo) -{ - siginfo_t info; - int ret; - mm_segment_t old_fs = get_fs(); - - if (copy_siginfo_from_user32(&info, uinfo)) - return -EFAULT; - set_fs (KERNEL_DS); - ret = sys_rt_sigqueueinfo(pid, sig, (siginfo_t __user *)&info); - set_fs (old_fs); - return ret; -} - -/* These are here just in case some old ia32 binary calls it. */ -asmlinkage long -sys32_pause(void) -{ - current->state = TASK_INTERRUPTIBLE; - schedule(); - return -ERESTARTNOHAND; -} - - -#ifdef CONFIG_SYSCTL_SYSCALL -struct sysctl_ia32 { - unsigned int name; - int nlen; - unsigned int oldval; - unsigned int oldlenp; - unsigned int newval; - unsigned int newlen; - unsigned int __unused[4]; -}; - - -asmlinkage long -sys32_sysctl(struct sysctl_ia32 __user *args32) -{ - struct sysctl_ia32 a32; - mm_segment_t old_fs = get_fs (); - void __user *oldvalp, *newvalp; - size_t oldlen; - int __user *namep; - long ret; - - if (copy_from_user(&a32, args32, sizeof (a32))) - return -EFAULT; - - /* - * We need to pre-validate these because we have to disable address checking - * before calling do_sysctl() because of OLDLEN but we can't run the risk of the - * user specifying bad addresses here. Well, since we're dealing with 32 bit - * addresses, we KNOW that access_ok() will always succeed, so this is an - * expensive NOP, but so what... - */ - namep = compat_ptr(a32.name); - oldvalp = compat_ptr(a32.oldval); - newvalp = compat_ptr(a32.newval); - - if ((oldvalp && get_user(oldlen, (int __user *)compat_ptr(a32.oldlenp))) - || !access_ok(VERIFY_WRITE, namep, 0) - || !access_ok(VERIFY_WRITE, oldvalp, 0) - || !access_ok(VERIFY_WRITE, newvalp, 0)) - return -EFAULT; - - set_fs(KERNEL_DS); - lock_kernel(); - ret = do_sysctl(namep, a32.nlen, oldvalp, (size_t __user *)&oldlen, - newvalp, (size_t) a32.newlen); - unlock_kernel(); - set_fs(old_fs); - - if (oldvalp && put_user (oldlen, (int __user *)compat_ptr(a32.oldlenp))) - return -EFAULT; - - return ret; -} -#endif - -/* warning: next two assume little endian */ -asmlinkage long -sys32_pread(unsigned int fd, char __user *ubuf, u32 count, u32 poslo, u32 poshi) -{ - return sys_pread64(fd, ubuf, count, - ((loff_t)AA(poshi) << 32) | AA(poslo)); -} - -asmlinkage long -sys32_pwrite(unsigned int fd, char __user *ubuf, u32 count, u32 poslo, u32 poshi) -{ - return sys_pwrite64(fd, ubuf, count, - ((loff_t)AA(poshi) << 32) | AA(poslo)); -} - - -asmlinkage long -sys32_personality(unsigned long personality) -{ - int ret; - if (personality(current->personality) == PER_LINUX32 && - personality == PER_LINUX) - personality = PER_LINUX32; - ret = sys_personality(personality); - if (ret == PER_LINUX32) - ret = PER_LINUX; - return ret; -} - -asmlinkage long -sys32_sendfile(int out_fd, int in_fd, compat_off_t __user *offset, s32 count) -{ - mm_segment_t old_fs = get_fs(); - int ret; - off_t of; - - if (offset && get_user(of, offset)) - return -EFAULT; - - set_fs(KERNEL_DS); - ret = sys_sendfile(out_fd, in_fd, offset ? (off_t __user *)&of : NULL, - count); - set_fs(old_fs); - - if (offset && put_user(of, offset)) - return -EFAULT; - - return ret; -} - -asmlinkage long sys32_mmap2(unsigned long addr, unsigned long len, - unsigned long prot, unsigned long flags, - unsigned long fd, unsigned long pgoff) -{ - struct mm_struct *mm = current->mm; - unsigned long error; - struct file * file = NULL; - - flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE); - if (!(flags & MAP_ANONYMOUS)) { - file = fget(fd); - if (!file) - return -EBADF; - } - - down_write(&mm->mmap_sem); - error = do_mmap_pgoff(file, addr, len, prot, flags, pgoff); - up_write(&mm->mmap_sem); - - if (file) - fput(file); - return error; -} - -asmlinkage long sys32_olduname(struct oldold_utsname __user * name) -{ - int err; - - if (!name) - return -EFAULT; - if (!access_ok(VERIFY_WRITE, name, sizeof(struct oldold_utsname))) - return -EFAULT; - - down_read(&uts_sem); - - err = __copy_to_user(&name->sysname,&utsname()->sysname, - __OLD_UTS_LEN); - err |= __put_user(0,name->sysname+__OLD_UTS_LEN); - err |= __copy_to_user(&name->nodename,&utsname()->nodename, - __OLD_UTS_LEN); - err |= __put_user(0,name->nodename+__OLD_UTS_LEN); - err |= __copy_to_user(&name->release,&utsname()->release, - __OLD_UTS_LEN); - err |= __put_user(0,name->release+__OLD_UTS_LEN); - err |= __copy_to_user(&name->version,&utsname()->version, - __OLD_UTS_LEN); - err |= __put_user(0,name->version+__OLD_UTS_LEN); - { - char *arch = "x86_64"; - if (personality(current->personality) == PER_LINUX32) - arch = "i686"; - - err |= __copy_to_user(&name->machine, arch, strlen(arch)+1); - } - - up_read(&uts_sem); - - err = err ? -EFAULT : 0; - - return err; -} - -long sys32_uname(struct old_utsname __user * name) -{ - int err; - if (!name) - return -EFAULT; - down_read(&uts_sem); - err = copy_to_user(name, utsname(), sizeof (*name)); - up_read(&uts_sem); - if (personality(current->personality) == PER_LINUX32) - err |= copy_to_user(&name->machine, "i686", 5); - return err?-EFAULT:0; -} - -long sys32_ustat(unsigned dev, struct ustat32 __user *u32p) -{ - struct ustat u; - mm_segment_t seg; - int ret; - - seg = get_fs(); - set_fs(KERNEL_DS); - ret = sys_ustat(dev, (struct ustat __user *)&u); - set_fs(seg); - if (ret >= 0) { - if (!access_ok(VERIFY_WRITE,u32p,sizeof(struct ustat32)) || - __put_user((__u32) u.f_tfree, &u32p->f_tfree) || - __put_user((__u32) u.f_tinode, &u32p->f_tfree) || - __copy_to_user(&u32p->f_fname, u.f_fname, sizeof(u.f_fname)) || - __copy_to_user(&u32p->f_fpack, u.f_fpack, sizeof(u.f_fpack))) - ret = -EFAULT; - } - return ret; -} - -asmlinkage long sys32_execve(char __user *name, compat_uptr_t __user *argv, - compat_uptr_t __user *envp, struct pt_regs *regs) -{ - long error; - char * filename; - - filename = getname(name); - error = PTR_ERR(filename); - if (IS_ERR(filename)) - return error; - error = compat_do_execve(filename, argv, envp, regs); - if (error == 0) { - task_lock(current); - current->ptrace &= ~PT_DTRACE; - task_unlock(current); - } - putname(filename); - return error; -} - -asmlinkage long sys32_clone(unsigned int clone_flags, unsigned int newsp, - struct pt_regs *regs) -{ - void __user *parent_tid = (void __user *)regs->rdx; - void __user *child_tid = (void __user *)regs->rdi; - if (!newsp) - newsp = regs->rsp; - return do_fork(clone_flags, newsp, regs, 0, parent_tid, child_tid); -} - -/* - * Some system calls that need sign extended arguments. This could be done by a generic wrapper. - */ - -long sys32_lseek (unsigned int fd, int offset, unsigned int whence) -{ - return sys_lseek(fd, offset, whence); -} - -long sys32_kill(int pid, int sig) -{ - return sys_kill(pid, sig); -} - -long sys32_fadvise64_64(int fd, __u32 offset_low, __u32 offset_high, - __u32 len_low, __u32 len_high, int advice) -{ - return sys_fadvise64_64(fd, - (((u64)offset_high)<<32) | offset_low, - (((u64)len_high)<<32) | len_low, - advice); -} - -long sys32_vm86_warning(void) -{ - struct task_struct *me = current; - static char lastcomm[sizeof(me->comm)]; - if (strncmp(lastcomm, me->comm, sizeof(lastcomm))) { - compat_printk(KERN_INFO "%s: vm86 mode not supported on 64 bit kernel\n", - me->comm); - strncpy(lastcomm, me->comm, sizeof(lastcomm)); - } - return -ENOSYS; -} - -long sys32_lookup_dcookie(u32 addr_low, u32 addr_high, - char __user * buf, size_t len) -{ - return sys_lookup_dcookie(((u64)addr_high << 32) | addr_low, buf, len); -} - -asmlinkage ssize_t sys32_readahead(int fd, unsigned off_lo, unsigned off_hi, size_t count) -{ - return sys_readahead(fd, ((u64)off_hi << 32) | off_lo, count); -} - -asmlinkage long sys32_sync_file_range(int fd, unsigned off_low, unsigned off_hi, - unsigned n_low, unsigned n_hi, int flags) -{ - return sys_sync_file_range(fd, - ((u64)off_hi << 32) | off_low, - ((u64)n_hi << 32) | n_low, flags); -} - -asmlinkage long sys32_fadvise64(int fd, unsigned offset_lo, unsigned offset_hi, size_t len, - int advice) -{ - return sys_fadvise64_64(fd, ((u64)offset_hi << 32) | offset_lo, - len, advice); -} - -asmlinkage long sys32_fallocate(int fd, int mode, unsigned offset_lo, - unsigned offset_hi, unsigned len_lo, - unsigned len_hi) -{ - return sys_fallocate(fd, mode, ((u64)offset_hi << 32) | offset_lo, - ((u64)len_hi << 32) | len_lo); -} diff --git a/arch/x86_64/ia32/syscall32.c b/arch/x86_64/ia32/syscall32.c deleted file mode 100644 index 15013bac181c..000000000000 --- a/arch/x86_64/ia32/syscall32.c +++ /dev/null @@ -1,83 +0,0 @@ -/* Copyright 2002,2003 Andi Kleen, SuSE Labs */ - -/* vsyscall handling for 32bit processes. Map a stub page into it - on demand because 32bit cannot reach the kernel's fixmaps */ - -#include <linux/mm.h> -#include <linux/string.h> -#include <linux/kernel.h> -#include <linux/gfp.h> -#include <linux/init.h> -#include <linux/stringify.h> -#include <linux/security.h> -#include <asm/proto.h> -#include <asm/tlbflush.h> -#include <asm/ia32_unistd.h> -#include <asm/vsyscall32.h> - -extern unsigned char syscall32_syscall[], syscall32_syscall_end[]; -extern unsigned char syscall32_sysenter[], syscall32_sysenter_end[]; -extern int sysctl_vsyscall32; - -static struct page *syscall32_pages[1]; -static int use_sysenter = -1; - -struct linux_binprm; - -/* Setup a VMA at program startup for the vsyscall page */ -int syscall32_setup_pages(struct linux_binprm *bprm, int exstack) -{ - struct mm_struct *mm = current->mm; - int ret; - - down_write(&mm->mmap_sem); - /* - * MAYWRITE to allow gdb to COW and set breakpoints - * - * Make sure the vDSO gets into every core dump. - * Dumping its contents makes post-mortem fully interpretable later - * without matching up the same kernel and hardware config to see - * what PC values meant. - */ - /* Could randomize here */ - ret = install_special_mapping(mm, VSYSCALL32_BASE, PAGE_SIZE, - VM_READ|VM_EXEC| - VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC| - VM_ALWAYSDUMP, - syscall32_pages); - up_write(&mm->mmap_sem); - return ret; -} - -static int __init init_syscall32(void) -{ - char *syscall32_page = (void *)get_zeroed_page(GFP_KERNEL); - if (!syscall32_page) - panic("Cannot allocate syscall32 page"); - syscall32_pages[0] = virt_to_page(syscall32_page); - if (use_sysenter > 0) { - memcpy(syscall32_page, syscall32_sysenter, - syscall32_sysenter_end - syscall32_sysenter); - } else { - memcpy(syscall32_page, syscall32_syscall, - syscall32_syscall_end - syscall32_syscall); - } - return 0; -} - -__initcall(init_syscall32); - -/* May not be __init: called during resume */ -void syscall32_cpu_init(void) -{ - if (use_sysenter < 0) - use_sysenter = (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL); - - /* Load these always in case some future AMD CPU supports - SYSENTER from compat mode too. */ - checking_wrmsrl(MSR_IA32_SYSENTER_CS, (u64)__KERNEL_CS); - checking_wrmsrl(MSR_IA32_SYSENTER_ESP, 0ULL); - checking_wrmsrl(MSR_IA32_SYSENTER_EIP, (u64)ia32_sysenter_target); - - wrmsrl(MSR_CSTAR, ia32_cstar_target); -} diff --git a/arch/x86_64/ia32/syscall32_syscall.S b/arch/x86_64/ia32/syscall32_syscall.S deleted file mode 100644 index 8f8271bdf135..000000000000 --- a/arch/x86_64/ia32/syscall32_syscall.S +++ /dev/null @@ -1,17 +0,0 @@ -/* 32bit VDSOs mapped into user space. */ - - .section ".init.data","aw" - - .globl syscall32_syscall - .globl syscall32_syscall_end - -syscall32_syscall: - .incbin "arch/x86_64/ia32/vsyscall-syscall.so" -syscall32_syscall_end: - - .globl syscall32_sysenter - .globl syscall32_sysenter_end - -syscall32_sysenter: - .incbin "arch/x86_64/ia32/vsyscall-sysenter.so" -syscall32_sysenter_end: diff --git a/arch/x86_64/ia32/tls32.c b/arch/x86_64/ia32/tls32.c deleted file mode 100644 index 1cc4340de3ca..000000000000 --- a/arch/x86_64/ia32/tls32.c +++ /dev/null @@ -1,163 +0,0 @@ -#include <linux/kernel.h> -#include <linux/errno.h> -#include <linux/sched.h> -#include <linux/user.h> - -#include <asm/uaccess.h> -#include <asm/desc.h> -#include <asm/system.h> -#include <asm/ldt.h> -#include <asm/processor.h> -#include <asm/proto.h> - -/* - * sys_alloc_thread_area: get a yet unused TLS descriptor index. - */ -static int get_free_idx(void) -{ - struct thread_struct *t = ¤t->thread; - int idx; - - for (idx = 0; idx < GDT_ENTRY_TLS_ENTRIES; idx++) - if (desc_empty((struct n_desc_struct *)(t->tls_array) + idx)) - return idx + GDT_ENTRY_TLS_MIN; - return -ESRCH; -} - -/* - * Set a given TLS descriptor: - * When you want addresses > 32bit use arch_prctl() - */ -int do_set_thread_area(struct thread_struct *t, struct user_desc __user *u_info) -{ - struct user_desc info; - struct n_desc_struct *desc; - int cpu, idx; - - if (copy_from_user(&info, u_info, sizeof(info))) - return -EFAULT; - - idx = info.entry_number; - - /* - * index -1 means the kernel should try to find and - * allocate an empty descriptor: - */ - if (idx == -1) { - idx = get_free_idx(); - if (idx < 0) - return idx; - if (put_user(idx, &u_info->entry_number)) - return -EFAULT; - } - - if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX) - return -EINVAL; - - desc = ((struct n_desc_struct *)t->tls_array) + idx - GDT_ENTRY_TLS_MIN; - - /* - * We must not get preempted while modifying the TLS. - */ - cpu = get_cpu(); - - if (LDT_empty(&info)) { - desc->a = 0; - desc->b = 0; - } else { - desc->a = LDT_entry_a(&info); - desc->b = LDT_entry_b(&info); - } - if (t == ¤t->thread) - load_TLS(t, cpu); - - put_cpu(); - return 0; -} - -asmlinkage long sys32_set_thread_area(struct user_desc __user *u_info) -{ - return do_set_thread_area(¤t->thread, u_info); -} - - -/* - * Get the current Thread-Local Storage area: - */ - -#define GET_BASE(desc) ( \ - (((desc)->a >> 16) & 0x0000ffff) | \ - (((desc)->b << 16) & 0x00ff0000) | \ - ( (desc)->b & 0xff000000) ) - -#define GET_LIMIT(desc) ( \ - ((desc)->a & 0x0ffff) | \ - ((desc)->b & 0xf0000) ) - -#define GET_32BIT(desc) (((desc)->b >> 22) & 1) -#define GET_CONTENTS(desc) (((desc)->b >> 10) & 3) -#define GET_WRITABLE(desc) (((desc)->b >> 9) & 1) -#define GET_LIMIT_PAGES(desc) (((desc)->b >> 23) & 1) -#define GET_PRESENT(desc) (((desc)->b >> 15) & 1) -#define GET_USEABLE(desc) (((desc)->b >> 20) & 1) -#define GET_LONGMODE(desc) (((desc)->b >> 21) & 1) - -int do_get_thread_area(struct thread_struct *t, struct user_desc __user *u_info) -{ - struct user_desc info; - struct n_desc_struct *desc; - int idx; - - if (get_user(idx, &u_info->entry_number)) - return -EFAULT; - if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX) - return -EINVAL; - - desc = ((struct n_desc_struct *)t->tls_array) + idx - GDT_ENTRY_TLS_MIN; - - memset(&info, 0, sizeof(struct user_desc)); - info.entry_number = idx; - info.base_addr = GET_BASE(desc); - info.limit = GET_LIMIT(desc); - info.seg_32bit = GET_32BIT(desc); - info.contents = GET_CONTENTS(desc); - info.read_exec_only = !GET_WRITABLE(desc); - info.limit_in_pages = GET_LIMIT_PAGES(desc); - info.seg_not_present = !GET_PRESENT(desc); - info.useable = GET_USEABLE(desc); - info.lm = GET_LONGMODE(desc); - - if (copy_to_user(u_info, &info, sizeof(info))) - return -EFAULT; - return 0; -} - -asmlinkage long sys32_get_thread_area(struct user_desc __user *u_info) -{ - return do_get_thread_area(¤t->thread, u_info); -} - - -int ia32_child_tls(struct task_struct *p, struct pt_regs *childregs) -{ - struct n_desc_struct *desc; - struct user_desc info; - struct user_desc __user *cp; - int idx; - - cp = (void __user *)childregs->rsi; - if (copy_from_user(&info, cp, sizeof(info))) - return -EFAULT; - if (LDT_empty(&info)) - return -EINVAL; - - idx = info.entry_number; - if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX) - return -EINVAL; - - desc = (struct n_desc_struct *)(p->thread.tls_array) + idx - GDT_ENTRY_TLS_MIN; - desc->a = LDT_entry_a(&info); - desc->b = LDT_entry_b(&info); - - return 0; -} diff --git a/arch/x86_64/ia32/vsyscall-sigreturn.S b/arch/x86_64/ia32/vsyscall-sigreturn.S deleted file mode 100644 index 1384367cdbe1..000000000000 --- a/arch/x86_64/ia32/vsyscall-sigreturn.S +++ /dev/null @@ -1,143 +0,0 @@ -/* - * Common code for the sigreturn entry points on the vsyscall page. - * This code uses SYSCALL_ENTER_KERNEL (either syscall or int $0x80) - * to enter the kernel. - * This file is #include'd by vsyscall-*.S to define them after the - * vsyscall entry point. The addresses we get for these entry points - * by doing ".balign 32" must match in both versions of the page. - */ - - .code32 - .section .text.sigreturn,"ax" - .balign 32 - .globl __kernel_sigreturn - .type __kernel_sigreturn,@function -__kernel_sigreturn: -.LSTART_sigreturn: - popl %eax - movl $__NR_ia32_sigreturn, %eax - SYSCALL_ENTER_KERNEL -.LEND_sigreturn: - .size __kernel_sigreturn,.-.LSTART_sigreturn - - .section .text.rtsigreturn,"ax" - .balign 32 - .globl __kernel_rt_sigreturn - .type __kernel_rt_sigreturn,@function -__kernel_rt_sigreturn: -.LSTART_rt_sigreturn: - movl $__NR_ia32_rt_sigreturn, %eax - SYSCALL_ENTER_KERNEL -.LEND_rt_sigreturn: - .size __kernel_rt_sigreturn,.-.LSTART_rt_sigreturn - - .section .eh_frame,"a",@progbits -.LSTARTFRAMES: - .long .LENDCIES-.LSTARTCIES -.LSTARTCIES: - .long 0 /* CIE ID */ - .byte 1 /* Version number */ - .string "zRS" /* NUL-terminated augmentation string */ - .uleb128 1 /* Code alignment factor */ - .sleb128 -4 /* Data alignment factor */ - .byte 8 /* Return address register column */ - .uleb128 1 /* Augmentation value length */ - .byte 0x1b /* DW_EH_PE_pcrel|DW_EH_PE_sdata4. */ - .byte 0x0c /* DW_CFA_def_cfa */ - .uleb128 4 - .uleb128 4 - .byte 0x88 /* DW_CFA_offset, column 0x8 */ - .uleb128 1 - .align 4 -.LENDCIES: - - .long .LENDFDE2-.LSTARTFDE2 /* Length FDE */ -.LSTARTFDE2: - .long .LSTARTFDE2-.LSTARTFRAMES /* CIE pointer */ - /* HACK: The dwarf2 unwind routines will subtract 1 from the - return address to get an address in the middle of the - presumed call instruction. Since we didn't get here via - a call, we need to include the nop before the real start - to make up for it. */ - .long .LSTART_sigreturn-1-. /* PC-relative start address */ - .long .LEND_sigreturn-.LSTART_sigreturn+1 - .uleb128 0 /* Augmentation length */ - /* What follows are the instructions for the table generation. - We record the locations of each register saved. This is - complicated by the fact that the "CFA" is always assumed to - be the value of the stack pointer in the caller. This means - that we must define the CFA of this body of code to be the - saved value of the stack pointer in the sigcontext. Which - also means that there is no fixed relation to the other - saved registers, which means that we must use DW_CFA_expression - to compute their addresses. It also means that when we - adjust the stack with the popl, we have to do it all over again. */ - -#define do_cfa_expr(offset) \ - .byte 0x0f; /* DW_CFA_def_cfa_expression */ \ - .uleb128 1f-0f; /* length */ \ -0: .byte 0x74; /* DW_OP_breg4 */ \ - .sleb128 offset; /* offset */ \ - .byte 0x06; /* DW_OP_deref */ \ -1: - -#define do_expr(regno, offset) \ - .byte 0x10; /* DW_CFA_expression */ \ - .uleb128 regno; /* regno */ \ - .uleb128 1f-0f; /* length */ \ -0: .byte 0x74; /* DW_OP_breg4 */ \ - .sleb128 offset; /* offset */ \ -1: - - do_cfa_expr(IA32_SIGCONTEXT_esp+4) - do_expr(0, IA32_SIGCONTEXT_eax+4) - do_expr(1, IA32_SIGCONTEXT_ecx+4) - do_expr(2, IA32_SIGCONTEXT_edx+4) - do_expr(3, IA32_SIGCONTEXT_ebx+4) - do_expr(5, IA32_SIGCONTEXT_ebp+4) - do_expr(6, IA32_SIGCONTEXT_esi+4) - do_expr(7, IA32_SIGCONTEXT_edi+4) - do_expr(8, IA32_SIGCONTEXT_eip+4) - - .byte 0x42 /* DW_CFA_advance_loc 2 -- nop; popl eax. */ - - do_cfa_expr(IA32_SIGCONTEXT_esp) - do_expr(0, IA32_SIGCONTEXT_eax) - do_expr(1, IA32_SIGCONTEXT_ecx) - do_expr(2, IA32_SIGCONTEXT_edx) - do_expr(3, IA32_SIGCONTEXT_ebx) - do_expr(5, IA32_SIGCONTEXT_ebp) - do_expr(6, IA32_SIGCONTEXT_esi) - do_expr(7, IA32_SIGCONTEXT_edi) - do_expr(8, IA32_SIGCONTEXT_eip) - - .align 4 -.LENDFDE2: - - .long .LENDFDE3-.LSTARTFDE3 /* Length FDE */ -.LSTARTFDE3: - .long .LSTARTFDE3-.LSTARTFRAMES /* CIE pointer */ - /* HACK: See above wrt unwind library assumptions. */ - .long .LSTART_rt_sigreturn-1-. /* PC-relative start address */ - .long .LEND_rt_sigreturn-.LSTART_rt_sigreturn+1 - .uleb128 0 /* Augmentation */ - /* What follows are the instructions for the table generation. - We record the locations of each register saved. This is - slightly less complicated than the above, since we don't - modify the stack pointer in the process. */ - - do_cfa_expr(IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_esp) - do_expr(0, IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_eax) - do_expr(1, IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_ecx) - do_expr(2, IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_edx) - do_expr(3, IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_ebx) - do_expr(5, IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_ebp) - do_expr(6, IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_esi) - do_expr(7, IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_edi) - do_expr(8, IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_eip) - - .align 4 -.LENDFDE3: - -#include "../../i386/kernel/vsyscall-note.S" - diff --git a/arch/x86_64/ia32/vsyscall-syscall.S b/arch/x86_64/ia32/vsyscall-syscall.S deleted file mode 100644 index cf9ef678de3e..000000000000 --- a/arch/x86_64/ia32/vsyscall-syscall.S +++ /dev/null @@ -1,69 +0,0 @@ -/* - * Code for the vsyscall page. This version uses the syscall instruction. - */ - -#include <asm/ia32_unistd.h> -#include <asm/asm-offsets.h> -#include <asm/segment.h> - - .code32 - .text - .section .text.vsyscall,"ax" - .globl __kernel_vsyscall - .type __kernel_vsyscall,@function -__kernel_vsyscall: -.LSTART_vsyscall: - push %ebp -.Lpush_ebp: - movl %ecx, %ebp - syscall - movl $__USER32_DS, %ecx - movl %ecx, %ss - movl %ebp, %ecx - popl %ebp -.Lpop_ebp: - ret -.LEND_vsyscall: - .size __kernel_vsyscall,.-.LSTART_vsyscall - - .section .eh_frame,"a",@progbits -.LSTARTFRAME: - .long .LENDCIE-.LSTARTCIE -.LSTARTCIE: - .long 0 /* CIE ID */ - .byte 1 /* Version number */ - .string "zR" /* NUL-terminated augmentation string */ - .uleb128 1 /* Code alignment factor */ - .sleb128 -4 /* Data alignment factor */ - .byte 8 /* Return address register column */ - .uleb128 1 /* Augmentation value length */ - .byte 0x1b /* DW_EH_PE_pcrel|DW_EH_PE_sdata4. */ - .byte 0x0c /* DW_CFA_def_cfa */ - .uleb128 4 - .uleb128 4 - .byte 0x88 /* DW_CFA_offset, column 0x8 */ - .uleb128 1 - .align 4 -.LENDCIE: - - .long .LENDFDE1-.LSTARTFDE1 /* Length FDE */ -.LSTARTFDE1: - .long .LSTARTFDE1-.LSTARTFRAME /* CIE pointer */ - .long .LSTART_vsyscall-. /* PC-relative start address */ - .long .LEND_vsyscall-.LSTART_vsyscall - .uleb128 0 /* Augmentation length */ - /* What follows are the instructions for the table generation. - We have to record all changes of the stack pointer. */ - .byte 0x40 + .Lpush_ebp-.LSTART_vsyscall /* DW_CFA_advance_loc */ - .byte 0x0e /* DW_CFA_def_cfa_offset */ - .uleb128 8 - .byte 0x85, 0x02 /* DW_CFA_offset %ebp -8 */ - .byte 0x40 + .Lpop_ebp-.Lpush_ebp /* DW_CFA_advance_loc */ - .byte 0xc5 /* DW_CFA_restore %ebp */ - .byte 0x0e /* DW_CFA_def_cfa_offset */ - .uleb128 4 - .align 4 -.LENDFDE1: - -#define SYSCALL_ENTER_KERNEL syscall -#include "vsyscall-sigreturn.S" diff --git a/arch/x86_64/ia32/vsyscall-sysenter.S b/arch/x86_64/ia32/vsyscall-sysenter.S deleted file mode 100644 index ae056e553d13..000000000000 --- a/arch/x86_64/ia32/vsyscall-sysenter.S +++ /dev/null @@ -1,95 +0,0 @@ -/* - * Code for the vsyscall page. This version uses the sysenter instruction. - */ - -#include <asm/ia32_unistd.h> -#include <asm/asm-offsets.h> - - .code32 - .text - .section .text.vsyscall,"ax" - .globl __kernel_vsyscall - .type __kernel_vsyscall,@function -__kernel_vsyscall: -.LSTART_vsyscall: - push %ecx -.Lpush_ecx: - push %edx -.Lpush_edx: - push %ebp -.Lenter_kernel: - movl %esp,%ebp - sysenter - .space 7,0x90 - jmp .Lenter_kernel - /* 16: System call normal return point is here! */ - pop %ebp -.Lpop_ebp: - pop %edx -.Lpop_edx: - pop %ecx -.Lpop_ecx: - ret -.LEND_vsyscall: - .size __kernel_vsyscall,.-.LSTART_vsyscall - - .section .eh_frame,"a",@progbits -.LSTARTFRAME: - .long .LENDCIE-.LSTARTCIE -.LSTARTCIE: - .long 0 /* CIE ID */ - .byte 1 /* Version number */ - .string "zR" /* NUL-terminated augmentation string */ - .uleb128 1 /* Code alignment factor */ - .sleb128 -4 /* Data alignment factor */ - .byte 8 /* Return address register column */ - .uleb128 1 /* Augmentation value length */ - .byte 0x1b /* DW_EH_PE_pcrel|DW_EH_PE_sdata4. */ - .byte 0x0c /* DW_CFA_def_cfa */ - .uleb128 4 - .uleb128 4 - .byte 0x88 /* DW_CFA_offset, column 0x8 */ - .uleb128 1 - .align 4 -.LENDCIE: - - .long .LENDFDE1-.LSTARTFDE1 /* Length FDE */ -.LSTARTFDE1: - .long .LSTARTFDE1-.LSTARTFRAME /* CIE pointer */ - .long .LSTART_vsyscall-. /* PC-relative start address */ - .long .LEND_vsyscall-.LSTART_vsyscall - .uleb128 0 /* Augmentation length */ - /* What follows are the instructions for the table generation. - We have to record all changes of the stack pointer. */ - .byte 0x04 /* DW_CFA_advance_loc4 */ - .long .Lpush_ecx-.LSTART_vsyscall - .byte 0x0e /* DW_CFA_def_cfa_offset */ - .byte 0x08 /* RA at offset 8 now */ - .byte 0x04 /* DW_CFA_advance_loc4 */ - .long .Lpush_edx-.Lpush_ecx - .byte 0x0e /* DW_CFA_def_cfa_offset */ - .byte 0x0c /* RA at offset 12 now */ - .byte 0x04 /* DW_CFA_advance_loc4 */ - .long .Lenter_kernel-.Lpush_edx - .byte 0x0e /* DW_CFA_def_cfa_offset */ - .byte 0x10 /* RA at offset 16 now */ - .byte 0x85, 0x04 /* DW_CFA_offset %ebp -16 */ - /* Finally the epilogue. */ - .byte 0x04 /* DW_CFA_advance_loc4 */ - .long .Lpop_ebp-.Lenter_kernel - .byte 0x0e /* DW_CFA_def_cfa_offset */ - .byte 0x12 /* RA at offset 12 now */ - .byte 0xc5 /* DW_CFA_restore %ebp */ - .byte 0x04 /* DW_CFA_advance_loc4 */ - .long .Lpop_edx-.Lpop_ebp - .byte 0x0e /* DW_CFA_def_cfa_offset */ - .byte 0x08 /* RA at offset 8 now */ - .byte 0x04 /* DW_CFA_advance_loc4 */ - .long .Lpop_ecx-.Lpop_edx - .byte 0x0e /* DW_CFA_def_cfa_offset */ - .byte 0x04 /* RA at offset 4 now */ - .align 4 -.LENDFDE1: - -#define SYSCALL_ENTER_KERNEL int $0x80 -#include "vsyscall-sigreturn.S" diff --git a/arch/x86_64/ia32/vsyscall.lds b/arch/x86_64/ia32/vsyscall.lds deleted file mode 100644 index 1dc86ff5bcb9..000000000000 --- a/arch/x86_64/ia32/vsyscall.lds +++ /dev/null @@ -1,80 +0,0 @@ -/* - * Linker script for vsyscall DSO. The vsyscall page is an ELF shared - * object prelinked to its virtual address. This script controls its layout. - */ - -/* This must match <asm/fixmap.h>. */ -VSYSCALL_BASE = 0xffffe000; - -SECTIONS -{ - . = VSYSCALL_BASE + SIZEOF_HEADERS; - - .hash : { *(.hash) } :text - .gnu.hash : { *(.gnu.hash) } - .dynsym : { *(.dynsym) } - .dynstr : { *(.dynstr) } - .gnu.version : { *(.gnu.version) } - .gnu.version_d : { *(.gnu.version_d) } - .gnu.version_r : { *(.gnu.version_r) } - - /* This linker script is used both with -r and with -shared. - For the layouts to match, we need to skip more than enough - space for the dynamic symbol table et al. If this amount - is insufficient, ld -shared will barf. Just increase it here. */ - . = VSYSCALL_BASE + 0x400; - - .text.vsyscall : { *(.text.vsyscall) } :text =0x90909090 - - /* This is an 32bit object and we cannot easily get the offsets - into the 64bit kernel. Just hardcode them here. This assumes - that all the stubs don't need more than 0x100 bytes. */ - . = VSYSCALL_BASE + 0x500; - - .text.sigreturn : { *(.text.sigreturn) } :text =0x90909090 - - . = VSYSCALL_BASE + 0x600; - - .text.rtsigreturn : { *(.text.rtsigreturn) } :text =0x90909090 - - .note : { *(.note.*) } :text :note - .eh_frame_hdr : { *(.eh_frame_hdr) } :text :eh_frame_hdr - .eh_frame : { KEEP (*(.eh_frame)) } :text - .dynamic : { *(.dynamic) } :text :dynamic - .useless : { - *(.got.plt) *(.got) - *(.data .data.* .gnu.linkonce.d.*) - *(.dynbss) - *(.bss .bss.* .gnu.linkonce.b.*) - } :text -} - -/* - * We must supply the ELF program headers explicitly to get just one - * PT_LOAD segment, and set the flags explicitly to make segments read-only. - */ -PHDRS -{ - text PT_LOAD FILEHDR PHDRS FLAGS(5); /* PF_R|PF_X */ - dynamic PT_DYNAMIC FLAGS(4); /* PF_R */ - note PT_NOTE FLAGS(4); /* PF_R */ - eh_frame_hdr 0x6474e550; /* PT_GNU_EH_FRAME, but ld doesn't match the name */ -} - -/* - * This controls what symbols we export from the DSO. - */ -VERSION -{ - LINUX_2.5 { - global: - __kernel_vsyscall; - __kernel_sigreturn; - __kernel_rt_sigreturn; - - local: *; - }; -} - -/* The ELF entry point can be used to set the AT_SYSINFO value. */ -ENTRY(__kernel_vsyscall); diff --git a/arch/x86_64/kernel/Makefile b/arch/x86_64/kernel/Makefile deleted file mode 100644 index ff5d8c9b96d9..000000000000 --- a/arch/x86_64/kernel/Makefile +++ /dev/null @@ -1,63 +0,0 @@ -# -# Makefile for the linux kernel. -# - -extra-y := head.o head64.o init_task.o vmlinux.lds -EXTRA_AFLAGS := -traditional -obj-y := process.o signal.o entry.o traps.o irq.o \ - ptrace.o time.o ioport.o ldt.o setup.o i8259.o sys_x86_64.o \ - x8664_ksyms.o i387.o syscall.o vsyscall.o \ - setup64.o bootflag.o e820.o reboot.o quirks.o i8237.o \ - pci-dma.o pci-nommu.o alternative.o hpet.o tsc.o bugs.o \ - perfctr-watchdog.o - -obj-$(CONFIG_STACKTRACE) += stacktrace.o -obj-$(CONFIG_X86_MCE) += mce.o therm_throt.o -obj-$(CONFIG_X86_MCE_INTEL) += mce_intel.o -obj-$(CONFIG_X86_MCE_AMD) += mce_amd.o -obj-$(CONFIG_MTRR) += ../../i386/kernel/cpu/mtrr/ -obj-$(CONFIG_ACPI) += acpi/ -obj-$(CONFIG_X86_MSR) += msr.o -obj-$(CONFIG_MICROCODE) += microcode.o -obj-$(CONFIG_X86_CPUID) += cpuid.o -obj-$(CONFIG_SMP) += smp.o smpboot.o trampoline.o tsc_sync.o -obj-y += apic.o nmi.o -obj-y += io_apic.o mpparse.o genapic.o genapic_flat.o -obj-$(CONFIG_KEXEC) += machine_kexec.o relocate_kernel.o crash.o -obj-$(CONFIG_CRASH_DUMP) += crash_dump.o -obj-$(CONFIG_PM) += suspend.o -obj-$(CONFIG_HIBERNATION) += suspend_asm.o -obj-$(CONFIG_CPU_FREQ) += cpufreq/ -obj-$(CONFIG_EARLY_PRINTK) += early_printk.o -obj-$(CONFIG_IOMMU) += pci-gart.o aperture.o -obj-$(CONFIG_CALGARY_IOMMU) += pci-calgary.o tce.o -obj-$(CONFIG_SWIOTLB) += pci-swiotlb.o -obj-$(CONFIG_KPROBES) += kprobes.o -obj-$(CONFIG_X86_PM_TIMER) += pmtimer.o -obj-$(CONFIG_X86_VSMP) += vsmp.o -obj-$(CONFIG_K8_NB) += k8.o -obj-$(CONFIG_AUDIT) += audit.o - -obj-$(CONFIG_MODULES) += module.o -obj-$(CONFIG_PCI) += early-quirks.o - -obj-y += topology.o -obj-y += intel_cacheinfo.o -obj-y += addon_cpuid_features.o -obj-y += pcspeaker.o - -CFLAGS_vsyscall.o := $(PROFILING) -g0 - -therm_throt-y += ../../i386/kernel/cpu/mcheck/therm_throt.o -bootflag-y += ../../i386/kernel/bootflag.o -cpuid-$(subst m,y,$(CONFIG_X86_CPUID)) += ../../i386/kernel/cpuid.o -topology-y += ../../i386/kernel/topology.o -microcode-$(subst m,y,$(CONFIG_MICROCODE)) += ../../i386/kernel/microcode.o -intel_cacheinfo-y += ../../i386/kernel/cpu/intel_cacheinfo.o -addon_cpuid_features-y += ../../i386/kernel/cpu/addon_cpuid_features.o -quirks-y += ../../i386/kernel/quirks.o -i8237-y += ../../i386/kernel/i8237.o -msr-$(subst m,y,$(CONFIG_X86_MSR)) += ../../i386/kernel/msr.o -alternative-y += ../../i386/kernel/alternative.o -pcspeaker-y += ../../i386/kernel/pcspeaker.o -perfctr-watchdog-y += ../../i386/kernel/cpu/perfctr-watchdog.o diff --git a/arch/x86_64/kernel/acpi/Makefile b/arch/x86_64/kernel/acpi/Makefile deleted file mode 100644 index 080b9963f1bc..000000000000 --- a/arch/x86_64/kernel/acpi/Makefile +++ /dev/null @@ -1,9 +0,0 @@ -obj-y := boot.o -boot-y := ../../../i386/kernel/acpi/boot.o -obj-$(CONFIG_ACPI_SLEEP) += sleep.o wakeup.o - -ifneq ($(CONFIG_ACPI_PROCESSOR),) -obj-y += processor.o -processor-y := ../../../i386/kernel/acpi/processor.o ../../../i386/kernel/acpi/cstate.o -endif - diff --git a/arch/x86_64/kernel/acpi/sleep.c b/arch/x86_64/kernel/acpi/sleep.c deleted file mode 100644 index 79475d237071..000000000000 --- a/arch/x86_64/kernel/acpi/sleep.c +++ /dev/null @@ -1,120 +0,0 @@ -/* - * acpi.c - Architecture-Specific Low-Level ACPI Support - * - * Copyright (C) 2001, 2002 Paul Diefenbaugh <paul.s.diefenbaugh@intel.com> - * Copyright (C) 2001 Jun Nakajima <jun.nakajima@intel.com> - * Copyright (C) 2001 Patrick Mochel <mochel@osdl.org> - * Copyright (C) 2002 Andi Kleen, SuSE Labs (x86-64 port) - * Copyright (C) 2003 Pavel Machek, SuSE Labs - * - * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - * - * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - */ - -#include <linux/kernel.h> -#include <linux/init.h> -#include <linux/types.h> -#include <linux/stddef.h> -#include <linux/slab.h> -#include <linux/pci.h> -#include <linux/bootmem.h> -#include <linux/acpi.h> -#include <linux/cpumask.h> - -#include <asm/mpspec.h> -#include <asm/io.h> -#include <asm/apic.h> -#include <asm/apicdef.h> -#include <asm/page.h> -#include <asm/pgtable.h> -#include <asm/pgalloc.h> -#include <asm/io_apic.h> -#include <asm/proto.h> -#include <asm/tlbflush.h> - -/* -------------------------------------------------------------------------- - Low-Level Sleep Support - -------------------------------------------------------------------------- */ - -/* address in low memory of the wakeup routine. */ -unsigned long acpi_wakeup_address = 0; -unsigned long acpi_realmode_flags; -extern char wakeup_start, wakeup_end; - -extern unsigned long acpi_copy_wakeup_routine(unsigned long); - -/** - * acpi_save_state_mem - save kernel state - * - * Create an identity mapped page table and copy the wakeup routine to - * low memory. - */ -int acpi_save_state_mem(void) -{ - memcpy((void *)acpi_wakeup_address, &wakeup_start, - &wakeup_end - &wakeup_start); - acpi_copy_wakeup_routine(acpi_wakeup_address); - - return 0; -} - -/* - * acpi_restore_state - */ -void acpi_restore_state_mem(void) -{ -} - -/** - * acpi_reserve_bootmem - do _very_ early ACPI initialisation - * - * We allocate a page in low memory for the wakeup - * routine for when we come back from a sleep state. The - * runtime allocator allows specification of <16M pages, but not - * <1M pages. - */ -void __init acpi_reserve_bootmem(void) -{ - acpi_wakeup_address = (unsigned long)alloc_bootmem_low(PAGE_SIZE*2); - if ((&wakeup_end - &wakeup_start) > (PAGE_SIZE*2)) - printk(KERN_CRIT - "ACPI: Wakeup code way too big, will crash on attempt" - " to suspend\n"); -} - -static int __init acpi_sleep_setup(char *str) -{ - while ((str != NULL) && (*str != '\0')) { - if (strncmp(str, "s3_bios", 7) == 0) - acpi_realmode_flags |= 1; - if (strncmp(str, "s3_mode", 7) == 0) - acpi_realmode_flags |= 2; - if (strncmp(str, "s3_beep", 7) == 0) - acpi_realmode_flags |= 4; - str = strchr(str, ','); - if (str != NULL) - str += strspn(str, ", \t"); - } - return 1; -} - -__setup("acpi_sleep=", acpi_sleep_setup); - -void acpi_pci_link_exit(void) -{ -} diff --git a/arch/x86_64/kernel/acpi/wakeup.S b/arch/x86_64/kernel/acpi/wakeup.S deleted file mode 100644 index a06f2bcabef9..000000000000 --- a/arch/x86_64/kernel/acpi/wakeup.S +++ /dev/null @@ -1,456 +0,0 @@ -.text -#include <linux/linkage.h> -#include <asm/segment.h> -#include <asm/pgtable.h> -#include <asm/page.h> -#include <asm/msr.h> - -# Copyright 2003 Pavel Machek <pavel@suse.cz>, distribute under GPLv2 -# -# wakeup_code runs in real mode, and at unknown address (determined at run-time). -# Therefore it must only use relative jumps/calls. -# -# Do we need to deal with A20? It is okay: ACPI specs says A20 must be enabled -# -# If physical address of wakeup_code is 0x12345, BIOS should call us with -# cs = 0x1234, eip = 0x05 -# - -#define BEEP \ - inb $97, %al; \ - outb %al, $0x80; \ - movb $3, %al; \ - outb %al, $97; \ - outb %al, $0x80; \ - movb $-74, %al; \ - outb %al, $67; \ - outb %al, $0x80; \ - movb $-119, %al; \ - outb %al, $66; \ - outb %al, $0x80; \ - movb $15, %al; \ - outb %al, $66; - - -ALIGN - .align 16 -ENTRY(wakeup_start) -wakeup_code: - wakeup_code_start = . - .code16 - -# Running in *copy* of this code, somewhere in low 1MB. - - movb $0xa1, %al ; outb %al, $0x80 - cli - cld - # setup data segment - movw %cs, %ax - movw %ax, %ds # Make ds:0 point to wakeup_start - movw %ax, %ss - - # Data segment must be set up before we can see whether to beep. - testl $4, realmode_flags - wakeup_code - jz 1f - BEEP -1: - - # Private stack is needed for ASUS board - mov $(wakeup_stack - wakeup_code), %sp - - pushl $0 # Kill any dangerous flags - popfl - - movl real_magic - wakeup_code, %eax - cmpl $0x12345678, %eax - jne bogus_real_magic - - call verify_cpu # Verify the cpu supports long - # mode - testl %eax, %eax - jnz no_longmode - - testl $1, realmode_flags - wakeup_code - jz 1f - lcall $0xc000,$3 - movw %cs, %ax - movw %ax, %ds # Bios might have played with that - movw %ax, %ss -1: - - testl $2, realmode_flags - wakeup_code - jz 1f - mov video_mode - wakeup_code, %ax - call mode_set -1: - - movw $0xb800, %ax - movw %ax,%fs - movw $0x0e00 + 'L', %fs:(0x10) - - movb $0xa2, %al ; outb %al, $0x80 - - mov %ds, %ax # Find 32bit wakeup_code addr - movzx %ax, %esi # (Convert %ds:gdt to a liner ptr) - shll $4, %esi - # Fix up the vectors - addl %esi, wakeup_32_vector - wakeup_code - addl %esi, wakeup_long64_vector - wakeup_code - addl %esi, gdt_48a + 2 - wakeup_code # Fixup the gdt pointer - - lidtl %ds:idt_48a - wakeup_code - lgdtl %ds:gdt_48a - wakeup_code # load gdt with whatever is - # appropriate - - movl $1, %eax # protected mode (PE) bit - lmsw %ax # This is it! - jmp 1f -1: - - ljmpl *(wakeup_32_vector - wakeup_code) - - .balign 4 -wakeup_32_vector: - .long wakeup_32 - wakeup_code - .word __KERNEL32_CS, 0 - - .code32 -wakeup_32: -# Running in this code, but at low address; paging is not yet turned on. - movb $0xa5, %al ; outb %al, $0x80 - - movl $__KERNEL_DS, %eax - movl %eax, %ds - - movw $0x0e00 + 'i', %ds:(0xb8012) - movb $0xa8, %al ; outb %al, $0x80; - - /* - * Prepare for entering 64bits mode - */ - - /* Enable PAE */ - xorl %eax, %eax - btsl $5, %eax - movl %eax, %cr4 - - /* Setup early boot stage 4 level pagetables */ - leal (wakeup_level4_pgt - wakeup_code)(%esi), %eax - movl %eax, %cr3 - - /* Check if nx is implemented */ - movl $0x80000001, %eax - cpuid - movl %edx,%edi - - /* Enable Long Mode */ - xorl %eax, %eax - btsl $_EFER_LME, %eax - - /* No Execute supported? */ - btl $20,%edi - jnc 1f - btsl $_EFER_NX, %eax - - /* Make changes effective */ -1: movl $MSR_EFER, %ecx - xorl %edx, %edx - wrmsr - - xorl %eax, %eax - btsl $31, %eax /* Enable paging and in turn activate Long Mode */ - btsl $0, %eax /* Enable protected mode */ - - /* Make changes effective */ - movl %eax, %cr0 - - /* At this point: - CR4.PAE must be 1 - CS.L must be 0 - CR3 must point to PML4 - Next instruction must be a branch - This must be on identity-mapped page - */ - /* - * At this point we're in long mode but in 32bit compatibility mode - * with EFER.LME = 1, CS.L = 0, CS.D = 1 (and in turn - * EFER.LMA = 1). Now we want to jump in 64bit mode, to do that we load - * the new gdt/idt that has __KERNEL_CS with CS.L = 1. - */ - - /* Finally jump in 64bit mode */ - ljmp *(wakeup_long64_vector - wakeup_code)(%esi) - - .balign 4 -wakeup_long64_vector: - .long wakeup_long64 - wakeup_code - .word __KERNEL_CS, 0 - -.code64 - - /* Hooray, we are in Long 64-bit mode (but still running in - * low memory) - */ -wakeup_long64: - /* - * We must switch to a new descriptor in kernel space for the GDT - * because soon the kernel won't have access anymore to the userspace - * addresses where we're currently running on. We have to do that here - * because in 32bit we couldn't load a 64bit linear address. - */ - lgdt cpu_gdt_descr - - movw $0x0e00 + 'n', %ds:(0xb8014) - movb $0xa9, %al ; outb %al, $0x80 - - movq saved_magic, %rax - movq $0x123456789abcdef0, %rdx - cmpq %rdx, %rax - jne bogus_64_magic - - movw $0x0e00 + 'u', %ds:(0xb8016) - - nop - nop - movw $__KERNEL_DS, %ax - movw %ax, %ss - movw %ax, %ds - movw %ax, %es - movw %ax, %fs - movw %ax, %gs - movq saved_rsp, %rsp - - movw $0x0e00 + 'x', %ds:(0xb8018) - movq saved_rbx, %rbx - movq saved_rdi, %rdi - movq saved_rsi, %rsi - movq saved_rbp, %rbp - - movw $0x0e00 + '!', %ds:(0xb801a) - movq saved_rip, %rax - jmp *%rax - -.code32 - - .align 64 -gdta: - /* Its good to keep gdt in sync with one in trampoline.S */ - .word 0, 0, 0, 0 # dummy - /* ??? Why I need the accessed bit set in order for this to work? */ - .quad 0x00cf9b000000ffff # __KERNEL32_CS - .quad 0x00af9b000000ffff # __KERNEL_CS - .quad 0x00cf93000000ffff # __KERNEL_DS - -idt_48a: - .word 0 # idt limit = 0 - .word 0, 0 # idt base = 0L - -gdt_48a: - .word 0x800 # gdt limit=2048, - # 256 GDT entries - .long gdta - wakeup_code # gdt base (relocated in later) - -real_magic: .quad 0 -video_mode: .quad 0 -realmode_flags: .quad 0 - -.code16 -bogus_real_magic: - movb $0xba,%al ; outb %al,$0x80 - jmp bogus_real_magic - -.code64 -bogus_64_magic: - movb $0xb3,%al ; outb %al,$0x80 - jmp bogus_64_magic - -.code16 -no_longmode: - movb $0xbc,%al ; outb %al,$0x80 - jmp no_longmode - -#include "../verify_cpu.S" - -/* This code uses an extended set of video mode numbers. These include: - * Aliases for standard modes - * NORMAL_VGA (-1) - * EXTENDED_VGA (-2) - * ASK_VGA (-3) - * Video modes numbered by menu position -- NOT RECOMMENDED because of lack - * of compatibility when extending the table. These are between 0x00 and 0xff. - */ -#define VIDEO_FIRST_MENU 0x0000 - -/* Standard BIOS video modes (BIOS number + 0x0100) */ -#define VIDEO_FIRST_BIOS 0x0100 - -/* VESA BIOS video modes (VESA number + 0x0200) */ -#define VIDEO_FIRST_VESA 0x0200 - -/* Video7 special modes (BIOS number + 0x0900) */ -#define VIDEO_FIRST_V7 0x0900 - -# Setting of user mode (AX=mode ID) => CF=success - -# For now, we only handle VESA modes (0x0200..0x03ff). To handle other -# modes, we should probably compile in the video code from the boot -# directory. -.code16 -mode_set: - movw %ax, %bx - subb $VIDEO_FIRST_VESA>>8, %bh - cmpb $2, %bh - jb check_vesa - -setbad: - clc - ret - -check_vesa: - orw $0x4000, %bx # Use linear frame buffer - movw $0x4f02, %ax # VESA BIOS mode set call - int $0x10 - cmpw $0x004f, %ax # AL=4f if implemented - jnz setbad # AH=0 if OK - - stc - ret - -wakeup_stack_begin: # Stack grows down - -.org 0xff0 -wakeup_stack: # Just below end of page - -.org 0x1000 -ENTRY(wakeup_level4_pgt) - .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE - .fill 510,8,0 - /* (2^48-(2*1024*1024*1024))/(2^39) = 511 */ - .quad level3_kernel_pgt - __START_KERNEL_map + _KERNPG_TABLE - -ENTRY(wakeup_end) - -## -# acpi_copy_wakeup_routine -# -# Copy the above routine to low memory. -# -# Parameters: -# %rdi: place to copy wakeup routine to -# -# Returned address is location of code in low memory (past data and stack) -# - .code64 -ENTRY(acpi_copy_wakeup_routine) - pushq %rax - pushq %rdx - - movl saved_video_mode, %edx - movl %edx, video_mode - wakeup_start (,%rdi) - movl acpi_realmode_flags, %edx - movl %edx, realmode_flags - wakeup_start (,%rdi) - movq $0x12345678, real_magic - wakeup_start (,%rdi) - movq $0x123456789abcdef0, %rdx - movq %rdx, saved_magic - - movq saved_magic, %rax - movq $0x123456789abcdef0, %rdx - cmpq %rdx, %rax - jne bogus_64_magic - - # restore the regs we used - popq %rdx - popq %rax -ENTRY(do_suspend_lowlevel_s4bios) - ret - - .align 2 - .p2align 4,,15 -.globl do_suspend_lowlevel - .type do_suspend_lowlevel,@function -do_suspend_lowlevel: -.LFB5: - subq $8, %rsp - xorl %eax, %eax - call save_processor_state - - movq %rsp, saved_context_esp(%rip) - movq %rax, saved_context_eax(%rip) - movq %rbx, saved_context_ebx(%rip) - movq %rcx, saved_context_ecx(%rip) - movq %rdx, saved_context_edx(%rip) - movq %rbp, saved_context_ebp(%rip) - movq %rsi, saved_context_esi(%rip) - movq %rdi, saved_context_edi(%rip) - movq %r8, saved_context_r08(%rip) - movq %r9, saved_context_r09(%rip) - movq %r10, saved_context_r10(%rip) - movq %r11, saved_context_r11(%rip) - movq %r12, saved_context_r12(%rip) - movq %r13, saved_context_r13(%rip) - movq %r14, saved_context_r14(%rip) - movq %r15, saved_context_r15(%rip) - pushfq ; popq saved_context_eflags(%rip) - - movq $.L97, saved_rip(%rip) - - movq %rsp,saved_rsp - movq %rbp,saved_rbp - movq %rbx,saved_rbx - movq %rdi,saved_rdi - movq %rsi,saved_rsi - - addq $8, %rsp - movl $3, %edi - xorl %eax, %eax - jmp acpi_enter_sleep_state -.L97: - .p2align 4,,7 -.L99: - .align 4 - movl $24, %eax - movw %ax, %ds - movq saved_context+58(%rip), %rax - movq %rax, %cr4 - movq saved_context+50(%rip), %rax - movq %rax, %cr3 - movq saved_context+42(%rip), %rax - movq %rax, %cr2 - movq saved_context+34(%rip), %rax - movq %rax, %cr0 - pushq saved_context_eflags(%rip) ; popfq - movq saved_context_esp(%rip), %rsp - movq saved_context_ebp(%rip), %rbp - movq saved_context_eax(%rip), %rax - movq saved_context_ebx(%rip), %rbx - movq saved_context_ecx(%rip), %rcx - movq saved_context_edx(%rip), %rdx - movq saved_context_esi(%rip), %rsi - movq saved_context_edi(%rip), %rdi - movq saved_context_r08(%rip), %r8 - movq saved_context_r09(%rip), %r9 - movq saved_context_r10(%rip), %r10 - movq saved_context_r11(%rip), %r11 - movq saved_context_r12(%rip), %r12 - movq saved_context_r13(%rip), %r13 - movq saved_context_r14(%rip), %r14 - movq saved_context_r15(%rip), %r15 - - xorl %eax, %eax - addq $8, %rsp - jmp restore_processor_state -.LFE5: -.Lfe5: - .size do_suspend_lowlevel,.Lfe5-do_suspend_lowlevel - -.data -ALIGN -ENTRY(saved_rbp) .quad 0 -ENTRY(saved_rsi) .quad 0 -ENTRY(saved_rdi) .quad 0 -ENTRY(saved_rbx) .quad 0 - -ENTRY(saved_rip) .quad 0 -ENTRY(saved_rsp) .quad 0 - -ENTRY(saved_magic) .quad 0 diff --git a/arch/x86_64/kernel/aperture.c b/arch/x86_64/kernel/aperture.c deleted file mode 100644 index 8f681cae7bf7..000000000000 --- a/arch/x86_64/kernel/aperture.c +++ /dev/null @@ -1,298 +0,0 @@ -/* - * Firmware replacement code. - * - * Work around broken BIOSes that don't set an aperture or only set the - * aperture in the AGP bridge. - * If all fails map the aperture over some low memory. This is cheaper than - * doing bounce buffering. The memory is lost. This is done at early boot - * because only the bootmem allocator can allocate 32+MB. - * - * Copyright 2002 Andi Kleen, SuSE Labs. - */ -#include <linux/kernel.h> -#include <linux/types.h> -#include <linux/init.h> -#include <linux/bootmem.h> -#include <linux/mmzone.h> -#include <linux/pci_ids.h> -#include <linux/pci.h> -#include <linux/bitops.h> -#include <linux/ioport.h> -#include <asm/e820.h> -#include <asm/io.h> -#include <asm/iommu.h> -#include <asm/pci-direct.h> -#include <asm/dma.h> -#include <asm/k8.h> - -int iommu_aperture; -int iommu_aperture_disabled __initdata = 0; -int iommu_aperture_allowed __initdata = 0; - -int fallback_aper_order __initdata = 1; /* 64MB */ -int fallback_aper_force __initdata = 0; - -int fix_aperture __initdata = 1; - -static struct resource gart_resource = { - .name = "GART", - .flags = IORESOURCE_MEM, -}; - -static void __init insert_aperture_resource(u32 aper_base, u32 aper_size) -{ - gart_resource.start = aper_base; - gart_resource.end = aper_base + aper_size - 1; - insert_resource(&iomem_resource, &gart_resource); -} - -/* This code runs before the PCI subsystem is initialized, so just - access the northbridge directly. */ - -static u32 __init allocate_aperture(void) -{ - u32 aper_size; - void *p; - - if (fallback_aper_order > 7) - fallback_aper_order = 7; - aper_size = (32 * 1024 * 1024) << fallback_aper_order; - - /* - * Aperture has to be naturally aligned. This means an 2GB aperture won't - * have much chance of finding a place in the lower 4GB of memory. - * Unfortunately we cannot move it up because that would make the - * IOMMU useless. - */ - p = __alloc_bootmem_nopanic(aper_size, aper_size, 0); - if (!p || __pa(p)+aper_size > 0xffffffff) { - printk("Cannot allocate aperture memory hole (%p,%uK)\n", - p, aper_size>>10); - if (p) - free_bootmem(__pa(p), aper_size); - return 0; - } - printk("Mapping aperture over %d KB of RAM @ %lx\n", - aper_size >> 10, __pa(p)); - insert_aperture_resource((u32)__pa(p), aper_size); - return (u32)__pa(p); -} - -static int __init aperture_valid(u64 aper_base, u32 aper_size) -{ - if (!aper_base) - return 0; - if (aper_size < 64*1024*1024) { - printk("Aperture too small (%d MB)\n", aper_size>>20); - return 0; - } - if (aper_base + aper_size > 0x100000000UL) { - printk("Aperture beyond 4GB. Ignoring.\n"); - return 0; - } - if (e820_any_mapped(aper_base, aper_base + aper_size, E820_RAM)) { - printk("Aperture pointing to e820 RAM. Ignoring.\n"); - return 0; - } - return 1; -} - -/* Find a PCI capability */ -static __u32 __init find_cap(int num, int slot, int func, int cap) -{ - u8 pos; - int bytes; - if (!(read_pci_config_16(num,slot,func,PCI_STATUS) & PCI_STATUS_CAP_LIST)) - return 0; - pos = read_pci_config_byte(num,slot,func,PCI_CAPABILITY_LIST); - for (bytes = 0; bytes < 48 && pos >= 0x40; bytes++) { - u8 id; - pos &= ~3; - id = read_pci_config_byte(num,slot,func,pos+PCI_CAP_LIST_ID); - if (id == 0xff) - break; - if (id == cap) - return pos; - pos = read_pci_config_byte(num,slot,func,pos+PCI_CAP_LIST_NEXT); - } - return 0; -} - -/* Read a standard AGPv3 bridge header */ -static __u32 __init read_agp(int num, int slot, int func, int cap, u32 *order) -{ - u32 apsize; - u32 apsizereg; - int nbits; - u32 aper_low, aper_hi; - u64 aper; - - printk("AGP bridge at %02x:%02x:%02x\n", num, slot, func); - apsizereg = read_pci_config_16(num,slot,func, cap + 0x14); - if (apsizereg == 0xffffffff) { - printk("APSIZE in AGP bridge unreadable\n"); - return 0; - } - - apsize = apsizereg & 0xfff; - /* Some BIOS use weird encodings not in the AGPv3 table. */ - if (apsize & 0xff) - apsize |= 0xf00; - nbits = hweight16(apsize); - *order = 7 - nbits; - if ((int)*order < 0) /* < 32MB */ - *order = 0; - - aper_low = read_pci_config(num,slot,func, 0x10); - aper_hi = read_pci_config(num,slot,func,0x14); - aper = (aper_low & ~((1<<22)-1)) | ((u64)aper_hi << 32); - - printk("Aperture from AGP @ %Lx size %u MB (APSIZE %x)\n", - aper, 32 << *order, apsizereg); - - if (!aperture_valid(aper, (32*1024*1024) << *order)) - return 0; - return (u32)aper; -} - -/* Look for an AGP bridge. Windows only expects the aperture in the - AGP bridge and some BIOS forget to initialize the Northbridge too. - Work around this here. - - Do an PCI bus scan by hand because we're running before the PCI - subsystem. - - All K8 AGP bridges are AGPv3 compliant, so we can do this scan - generically. It's probably overkill to always scan all slots because - the AGP bridges should be always an own bus on the HT hierarchy, - but do it here for future safety. */ -static __u32 __init search_agp_bridge(u32 *order, int *valid_agp) -{ - int num, slot, func; - - /* Poor man's PCI discovery */ - for (num = 0; num < 256; num++) { - for (slot = 0; slot < 32; slot++) { - for (func = 0; func < 8; func++) { - u32 class, cap; - u8 type; - class = read_pci_config(num,slot,func, - PCI_CLASS_REVISION); - if (class == 0xffffffff) - break; - - switch (class >> 16) { - case PCI_CLASS_BRIDGE_HOST: - case PCI_CLASS_BRIDGE_OTHER: /* needed? */ - /* AGP bridge? */ - cap = find_cap(num,slot,func,PCI_CAP_ID_AGP); - if (!cap) - break; - *valid_agp = 1; - return read_agp(num,slot,func,cap,order); - } - - /* No multi-function device? */ - type = read_pci_config_byte(num,slot,func, - PCI_HEADER_TYPE); - if (!(type & 0x80)) - break; - } - } - } - printk("No AGP bridge found\n"); - return 0; -} - -void __init iommu_hole_init(void) -{ - int fix, num; - u32 aper_size, aper_alloc = 0, aper_order = 0, last_aper_order = 0; - u64 aper_base, last_aper_base = 0; - int valid_agp = 0; - - if (iommu_aperture_disabled || !fix_aperture || !early_pci_allowed()) - return; - - printk(KERN_INFO "Checking aperture...\n"); - - fix = 0; - for (num = 24; num < 32; num++) { - if (!early_is_k8_nb(read_pci_config(0, num, 3, 0x00))) - continue; - - iommu_detected = 1; - iommu_aperture = 1; - - aper_order = (read_pci_config(0, num, 3, 0x90) >> 1) & 7; - aper_size = (32 * 1024 * 1024) << aper_order; - aper_base = read_pci_config(0, num, 3, 0x94) & 0x7fff; - aper_base <<= 25; - - printk("CPU %d: aperture @ %Lx size %u MB\n", num-24, - aper_base, aper_size>>20); - - if (!aperture_valid(aper_base, aper_size)) { - fix = 1; - break; - } - - if ((last_aper_order && aper_order != last_aper_order) || - (last_aper_base && aper_base != last_aper_base)) { - fix = 1; - break; - } - last_aper_order = aper_order; - last_aper_base = aper_base; - } - - if (!fix && !fallback_aper_force) { - if (last_aper_base) { - unsigned long n = (32 * 1024 * 1024) << last_aper_order; - insert_aperture_resource((u32)last_aper_base, n); - } - return; - } - - if (!fallback_aper_force) - aper_alloc = search_agp_bridge(&aper_order, &valid_agp); - - if (aper_alloc) { - /* Got the aperture from the AGP bridge */ - } else if (swiotlb && !valid_agp) { - /* Do nothing */ - } else if ((!no_iommu && end_pfn > MAX_DMA32_PFN) || - force_iommu || - valid_agp || - fallback_aper_force) { - printk("Your BIOS doesn't leave a aperture memory hole\n"); - printk("Please enable the IOMMU option in the BIOS setup\n"); - printk("This costs you %d MB of RAM\n", - 32 << fallback_aper_order); - - aper_order = fallback_aper_order; - aper_alloc = allocate_aperture(); - if (!aper_alloc) { - /* Could disable AGP and IOMMU here, but it's probably - not worth it. But the later users cannot deal with - bad apertures and turning on the aperture over memory - causes very strange problems, so it's better to - panic early. */ - panic("Not enough memory for aperture"); - } - } else { - return; - } - - /* Fix up the north bridges */ - for (num = 24; num < 32; num++) { - if (!early_is_k8_nb(read_pci_config(0, num, 3, 0x00))) - continue; - - /* Don't enable translation yet. That is done later. - Assume this BIOS didn't initialise the GART so - just overwrite all previous bits */ - write_pci_config(0, num, 3, 0x90, aper_order<<1); - write_pci_config(0, num, 3, 0x94, aper_alloc>>25); - } -} diff --git a/arch/x86_64/kernel/apic.c b/arch/x86_64/kernel/apic.c deleted file mode 100644 index 925758dbca0c..000000000000 --- a/arch/x86_64/kernel/apic.c +++ /dev/null @@ -1,1253 +0,0 @@ -/* - * Local APIC handling, local APIC timers - * - * (c) 1999, 2000 Ingo Molnar <mingo@redhat.com> - * - * Fixes - * Maciej W. Rozycki : Bits for genuine 82489DX APICs; - * thanks to Eric Gilmore - * and Rolf G. Tews - * for testing these extensively. - * Maciej W. Rozycki : Various updates and fixes. - * Mikael Pettersson : Power Management for UP-APIC. - * Pavel Machek and - * Mikael Pettersson : PM converted to driver model. - */ - -#include <linux/init.h> - -#include <linux/mm.h> -#include <linux/delay.h> -#include <linux/bootmem.h> -#include <linux/interrupt.h> -#include <linux/mc146818rtc.h> -#include <linux/kernel_stat.h> -#include <linux/sysdev.h> -#include <linux/module.h> -#include <linux/ioport.h> - -#include <asm/atomic.h> -#include <asm/smp.h> -#include <asm/mtrr.h> -#include <asm/mpspec.h> -#include <asm/pgalloc.h> -#include <asm/mach_apic.h> -#include <asm/nmi.h> -#include <asm/idle.h> -#include <asm/proto.h> -#include <asm/timex.h> -#include <asm/hpet.h> -#include <asm/apic.h> - -int apic_mapped; -int apic_verbosity; -int apic_runs_main_timer; -int apic_calibrate_pmtmr __initdata; - -int disable_apic_timer __initdata; - -/* Local APIC timer works in C2? */ -int local_apic_timer_c2_ok; -EXPORT_SYMBOL_GPL(local_apic_timer_c2_ok); - -static struct resource *ioapic_resources; -static struct resource lapic_resource = { - .name = "Local APIC", - .flags = IORESOURCE_MEM | IORESOURCE_BUSY, -}; - -/* - * cpu_mask that denotes the CPUs that needs timer interrupt coming in as - * IPIs in place of local APIC timers - */ -static cpumask_t timer_interrupt_broadcast_ipi_mask; - -/* Using APIC to generate smp_local_timer_interrupt? */ -int using_apic_timer __read_mostly = 0; - -static void apic_pm_activate(void); - -void apic_wait_icr_idle(void) -{ - while (apic_read(APIC_ICR) & APIC_ICR_BUSY) - cpu_relax(); -} - -unsigned int safe_apic_wait_icr_idle(void) -{ - unsigned int send_status; - int timeout; - - timeout = 0; - do { - send_status = apic_read(APIC_ICR) & APIC_ICR_BUSY; - if (!send_status) - break; - udelay(100); - } while (timeout++ < 1000); - - return send_status; -} - -void enable_NMI_through_LVT0 (void * dummy) -{ - unsigned int v; - - /* unmask and set to NMI */ - v = APIC_DM_NMI; - apic_write(APIC_LVT0, v); -} - -int get_maxlvt(void) -{ - unsigned int v, maxlvt; - - v = apic_read(APIC_LVR); - maxlvt = GET_APIC_MAXLVT(v); - return maxlvt; -} - -/* - * 'what should we do if we get a hw irq event on an illegal vector'. - * each architecture has to answer this themselves. - */ -void ack_bad_irq(unsigned int irq) -{ - printk("unexpected IRQ trap at vector %02x\n", irq); - /* - * Currently unexpected vectors happen only on SMP and APIC. - * We _must_ ack these because every local APIC has only N - * irq slots per priority level, and a 'hanging, unacked' IRQ - * holds up an irq slot - in excessive cases (when multiple - * unexpected vectors occur) that might lock up the APIC - * completely. - * But don't ack when the APIC is disabled. -AK - */ - if (!disable_apic) - ack_APIC_irq(); -} - -void clear_local_APIC(void) -{ - int maxlvt; - unsigned int v; - - maxlvt = get_maxlvt(); - - /* - * Masking an LVT entry can trigger a local APIC error - * if the vector is zero. Mask LVTERR first to prevent this. - */ - if (maxlvt >= 3) { - v = ERROR_APIC_VECTOR; /* any non-zero vector will do */ - apic_write(APIC_LVTERR, v | APIC_LVT_MASKED); - } - /* - * Careful: we have to set masks only first to deassert - * any level-triggered sources. - */ - v = apic_read(APIC_LVTT); - apic_write(APIC_LVTT, v | APIC_LVT_MASKED); - v = apic_read(APIC_LVT0); - apic_write(APIC_LVT0, v | APIC_LVT_MASKED); - v = apic_read(APIC_LVT1); - apic_write(APIC_LVT1, v | APIC_LVT_MASKED); - if (maxlvt >= 4) { - v = apic_read(APIC_LVTPC); - apic_write(APIC_LVTPC, v | APIC_LVT_MASKED); - } - - /* - * Clean APIC state for other OSs: - */ - apic_write(APIC_LVTT, APIC_LVT_MASKED); - apic_write(APIC_LVT0, APIC_LVT_MASKED); - apic_write(APIC_LVT1, APIC_LVT_MASKED); - if (maxlvt >= 3) - apic_write(APIC_LVTERR, APIC_LVT_MASKED); - if (maxlvt >= 4) - apic_write(APIC_LVTPC, APIC_LVT_MASKED); - apic_write(APIC_ESR, 0); - apic_read(APIC_ESR); -} - -void disconnect_bsp_APIC(int virt_wire_setup) -{ - /* Go back to Virtual Wire compatibility mode */ - unsigned long value; - - /* For the spurious interrupt use vector F, and enable it */ - value = apic_read(APIC_SPIV); - value &= ~APIC_VECTOR_MASK; - value |= APIC_SPIV_APIC_ENABLED; - value |= 0xf; - apic_write(APIC_SPIV, value); - - if (!virt_wire_setup) { - /* For LVT0 make it edge triggered, active high, external and enabled */ - value = apic_read(APIC_LVT0); - value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING | - APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR | - APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED ); - value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING; - value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_EXTINT); - apic_write(APIC_LVT0, value); - } else { - /* Disable LVT0 */ - apic_write(APIC_LVT0, APIC_LVT_MASKED); - } - - /* For LVT1 make it edge triggered, active high, nmi and enabled */ - value = apic_read(APIC_LVT1); - value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING | - APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR | - APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED); - value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING; - value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_NMI); - apic_write(APIC_LVT1, value); -} - -void disable_local_APIC(void) -{ - unsigned int value; - - clear_local_APIC(); - - /* - * Disable APIC (implies clearing of registers - * for 82489DX!). - */ - value = apic_read(APIC_SPIV); - value &= ~APIC_SPIV_APIC_ENABLED; - apic_write(APIC_SPIV, value); -} - -/* - * This is to verify that we're looking at a real local APIC. - * Check these against your board if the CPUs aren't getting - * started for no apparent reason. - */ -int __init verify_local_APIC(void) -{ - unsigned int reg0, reg1; - - /* - * The version register is read-only in a real APIC. - */ - reg0 = apic_read(APIC_LVR); - apic_printk(APIC_DEBUG, "Getting VERSION: %x\n", reg0); - apic_write(APIC_LVR, reg0 ^ APIC_LVR_MASK); - reg1 = apic_read(APIC_LVR); - apic_printk(APIC_DEBUG, "Getting VERSION: %x\n", reg1); - - /* - * The two version reads above should print the same - * numbers. If the second one is different, then we - * poke at a non-APIC. - */ - if (reg1 != reg0) - return 0; - - /* - * Check if the version looks reasonably. - */ - reg1 = GET_APIC_VERSION(reg0); - if (reg1 == 0x00 || reg1 == 0xff) - return 0; - reg1 = get_maxlvt(); - if (reg1 < 0x02 || reg1 == 0xff) - return 0; - - /* - * The ID register is read/write in a real APIC. - */ - reg0 = apic_read(APIC_ID); - apic_printk(APIC_DEBUG, "Getting ID: %x\n", reg0); - apic_write(APIC_ID, reg0 ^ APIC_ID_MASK); - reg1 = apic_read(APIC_ID); - apic_printk(APIC_DEBUG, "Getting ID: %x\n", reg1); - apic_write(APIC_ID, reg0); - if (reg1 != (reg0 ^ APIC_ID_MASK)) - return 0; - - /* - * The next two are just to see if we have sane values. - * They're only really relevant if we're in Virtual Wire - * compatibility mode, but most boxes are anymore. - */ - reg0 = apic_read(APIC_LVT0); - apic_printk(APIC_DEBUG,"Getting LVT0: %x\n", reg0); - reg1 = apic_read(APIC_LVT1); - apic_printk(APIC_DEBUG, "Getting LVT1: %x\n", reg1); - - return 1; -} - -void __init sync_Arb_IDs(void) -{ - /* Unsupported on P4 - see Intel Dev. Manual Vol. 3, Ch. 8.6.1 */ - unsigned int ver = GET_APIC_VERSION(apic_read(APIC_LVR)); - if (ver >= 0x14) /* P4 or higher */ - return; - - /* - * Wait for idle. - */ - apic_wait_icr_idle(); - - apic_printk(APIC_DEBUG, "Synchronizing Arb IDs.\n"); - apic_write(APIC_ICR, APIC_DEST_ALLINC | APIC_INT_LEVELTRIG - | APIC_DM_INIT); -} - -/* - * An initial setup of the virtual wire mode. - */ -void __init init_bsp_APIC(void) -{ - unsigned int value; - - /* - * Don't do the setup now if we have a SMP BIOS as the - * through-I/O-APIC virtual wire mode might be active. - */ - if (smp_found_config || !cpu_has_apic) - return; - - value = apic_read(APIC_LVR); - - /* - * Do not trust the local APIC being empty at bootup. - */ - clear_local_APIC(); - - /* - * Enable APIC. - */ - value = apic_read(APIC_SPIV); - value &= ~APIC_VECTOR_MASK; - value |= APIC_SPIV_APIC_ENABLED; - value |= APIC_SPIV_FOCUS_DISABLED; - value |= SPURIOUS_APIC_VECTOR; - apic_write(APIC_SPIV, value); - - /* - * Set up the virtual wire mode. - */ - apic_write(APIC_LVT0, APIC_DM_EXTINT); - value = APIC_DM_NMI; - apic_write(APIC_LVT1, value); -} - -void __cpuinit setup_local_APIC (void) -{ - unsigned int value, maxlvt; - int i, j; - - value = apic_read(APIC_LVR); - - BUILD_BUG_ON((SPURIOUS_APIC_VECTOR & 0x0f) != 0x0f); - - /* - * Double-check whether this APIC is really registered. - * This is meaningless in clustered apic mode, so we skip it. - */ - if (!apic_id_registered()) - BUG(); - - /* - * Intel recommends to set DFR, LDR and TPR before enabling - * an APIC. See e.g. "AP-388 82489DX User's Manual" (Intel - * document number 292116). So here it goes... - */ - init_apic_ldr(); - - /* - * Set Task Priority to 'accept all'. We never change this - * later on. - */ - value = apic_read(APIC_TASKPRI); - value &= ~APIC_TPRI_MASK; - apic_write(APIC_TASKPRI, value); - - /* - * After a crash, we no longer service the interrupts and a pending - * interrupt from previous kernel might still have ISR bit set. - * - * Most probably by now CPU has serviced that pending interrupt and - * it might not have done the ack_APIC_irq() because it thought, - * interrupt came from i8259 as ExtInt. LAPIC did not get EOI so it - * does not clear the ISR bit and cpu thinks it has already serivced - * the interrupt. Hence a vector might get locked. It was noticed - * for timer irq (vector 0x31). Issue an extra EOI to clear ISR. - */ - for (i = APIC_ISR_NR - 1; i >= 0; i--) { - value = apic_read(APIC_ISR + i*0x10); - for (j = 31; j >= 0; j--) { - if (value & (1<<j)) - ack_APIC_irq(); - } - } - - /* - * Now that we are all set up, enable the APIC - */ - value = apic_read(APIC_SPIV); - value &= ~APIC_VECTOR_MASK; - /* - * Enable APIC - */ - value |= APIC_SPIV_APIC_ENABLED; - - /* We always use processor focus */ - - /* - * Set spurious IRQ vector - */ - value |= SPURIOUS_APIC_VECTOR; - apic_write(APIC_SPIV, value); - - /* - * Set up LVT0, LVT1: - * - * set up through-local-APIC on the BP's LINT0. This is not - * strictly necessary in pure symmetric-IO mode, but sometimes - * we delegate interrupts to the 8259A. - */ - /* - * TODO: set up through-local-APIC from through-I/O-APIC? --macro - */ - value = apic_read(APIC_LVT0) & APIC_LVT_MASKED; - if (!smp_processor_id() && !value) { - value = APIC_DM_EXTINT; - apic_printk(APIC_VERBOSE, "enabled ExtINT on CPU#%d\n", smp_processor_id()); - } else { - value = APIC_DM_EXTINT | APIC_LVT_MASKED; - apic_printk(APIC_VERBOSE, "masked ExtINT on CPU#%d\n", smp_processor_id()); - } - apic_write(APIC_LVT0, value); - - /* - * only the BP should see the LINT1 NMI signal, obviously. - */ - if (!smp_processor_id()) - value = APIC_DM_NMI; - else - value = APIC_DM_NMI | APIC_LVT_MASKED; - apic_write(APIC_LVT1, value); - - { - unsigned oldvalue; - maxlvt = get_maxlvt(); - oldvalue = apic_read(APIC_ESR); - value = ERROR_APIC_VECTOR; // enables sending errors - apic_write(APIC_LVTERR, value); - /* - * spec says clear errors after enabling vector. - */ - if (maxlvt > 3) - apic_write(APIC_ESR, 0); - value = apic_read(APIC_ESR); - if (value != oldvalue) - apic_printk(APIC_VERBOSE, - "ESR value after enabling vector: %08x, after %08x\n", - oldvalue, value); - } - - nmi_watchdog_default(); - setup_apic_nmi_watchdog(NULL); - apic_pm_activate(); -} - -#ifdef CONFIG_PM - -static struct { - /* 'active' is true if the local APIC was enabled by us and - not the BIOS; this signifies that we are also responsible - for disabling it before entering apm/acpi suspend */ - int active; - /* r/w apic fields */ - unsigned int apic_id; - unsigned int apic_taskpri; - unsigned int apic_ldr; - unsigned int apic_dfr; - unsigned int apic_spiv; - unsigned int apic_lvtt; - unsigned int apic_lvtpc; - unsigned int apic_lvt0; - unsigned int apic_lvt1; - unsigned int apic_lvterr; - unsigned int apic_tmict; - unsigned int apic_tdcr; - unsigned int apic_thmr; -} apic_pm_state; - -static int lapic_suspend(struct sys_device *dev, pm_message_t state) -{ - unsigned long flags; - int maxlvt; - - if (!apic_pm_state.active) - return 0; - - maxlvt = get_maxlvt(); - - apic_pm_state.apic_id = apic_read(APIC_ID); - apic_pm_state.apic_taskpri = apic_read(APIC_TASKPRI); - apic_pm_state.apic_ldr = apic_read(APIC_LDR); - apic_pm_state.apic_dfr = apic_read(APIC_DFR); - apic_pm_state.apic_spiv = apic_read(APIC_SPIV); - apic_pm_state.apic_lvtt = apic_read(APIC_LVTT); - if (maxlvt >= 4) - apic_pm_state.apic_lvtpc = apic_read(APIC_LVTPC); - apic_pm_state.apic_lvt0 = apic_read(APIC_LVT0); - apic_pm_state.apic_lvt1 = apic_read(APIC_LVT1); - apic_pm_state.apic_lvterr = apic_read(APIC_LVTERR); - apic_pm_state.apic_tmict = apic_read(APIC_TMICT); - apic_pm_state.apic_tdcr = apic_read(APIC_TDCR); -#ifdef CONFIG_X86_MCE_INTEL - if (maxlvt >= 5) - apic_pm_state.apic_thmr = apic_read(APIC_LVTTHMR); -#endif - local_irq_save(flags); - disable_local_APIC(); - local_irq_restore(flags); - return 0; -} - -static int lapic_resume(struct sys_device *dev) -{ - unsigned int l, h; - unsigned long flags; - int maxlvt; - - if (!apic_pm_state.active) - return 0; - - maxlvt = get_maxlvt(); - - local_irq_save(flags); - rdmsr(MSR_IA32_APICBASE, l, h); - l &= ~MSR_IA32_APICBASE_BASE; - l |= MSR_IA32_APICBASE_ENABLE | mp_lapic_addr; - wrmsr(MSR_IA32_APICBASE, l, h); - apic_write(APIC_LVTERR, ERROR_APIC_VECTOR | APIC_LVT_MASKED); - apic_write(APIC_ID, apic_pm_state.apic_id); - apic_write(APIC_DFR, apic_pm_state.apic_dfr); - apic_write(APIC_LDR, apic_pm_state.apic_ldr); - apic_write(APIC_TASKPRI, apic_pm_state.apic_taskpri); - apic_write(APIC_SPIV, apic_pm_state.apic_spiv); - apic_write(APIC_LVT0, apic_pm_state.apic_lvt0); - apic_write(APIC_LVT1, apic_pm_state.apic_lvt1); -#ifdef CONFIG_X86_MCE_INTEL - if (maxlvt >= 5) - apic_write(APIC_LVTTHMR, apic_pm_state.apic_thmr); -#endif - if (maxlvt >= 4) - apic_write(APIC_LVTPC, apic_pm_state.apic_lvtpc); - apic_write(APIC_LVTT, apic_pm_state.apic_lvtt); - apic_write(APIC_TDCR, apic_pm_state.apic_tdcr); - apic_write(APIC_TMICT, apic_pm_state.apic_tmict); - apic_write(APIC_ESR, 0); - apic_read(APIC_ESR); - apic_write(APIC_LVTERR, apic_pm_state.apic_lvterr); - apic_write(APIC_ESR, 0); - apic_read(APIC_ESR); - local_irq_restore(flags); - return 0; -} - -static struct sysdev_class lapic_sysclass = { - set_kset_name("lapic"), - .resume = lapic_resume, - .suspend = lapic_suspend, -}; - -static struct sys_device device_lapic = { - .id = 0, - .cls = &lapic_sysclass, -}; - -static void __cpuinit apic_pm_activate(void) -{ - apic_pm_state.active = 1; -} - -static int __init init_lapic_sysfs(void) -{ - int error; - if (!cpu_has_apic) - return 0; - /* XXX: remove suspend/resume procs if !apic_pm_state.active? */ - error = sysdev_class_register(&lapic_sysclass); - if (!error) - error = sysdev_register(&device_lapic); - return error; -} -device_initcall(init_lapic_sysfs); - -#else /* CONFIG_PM */ - -static void apic_pm_activate(void) { } - -#endif /* CONFIG_PM */ - -static int __init apic_set_verbosity(char *str) -{ - if (str == NULL) { - skip_ioapic_setup = 0; - ioapic_force = 1; - return 0; - } - if (strcmp("debug", str) == 0) - apic_verbosity = APIC_DEBUG; - else if (strcmp("verbose", str) == 0) - apic_verbosity = APIC_VERBOSE; - else { - printk(KERN_WARNING "APIC Verbosity level %s not recognised" - " use apic=verbose or apic=debug\n", str); - return -EINVAL; - } - - return 0; -} -early_param("apic", apic_set_verbosity); - -/* - * Detect and enable local APICs on non-SMP boards. - * Original code written by Keir Fraser. - * On AMD64 we trust the BIOS - if it says no APIC it is likely - * not correctly set up (usually the APIC timer won't work etc.) - */ - -static int __init detect_init_APIC (void) -{ - if (!cpu_has_apic) { - printk(KERN_INFO "No local APIC present\n"); - return -1; - } - - mp_lapic_addr = APIC_DEFAULT_PHYS_BASE; - boot_cpu_id = 0; - return 0; -} - -#ifdef CONFIG_X86_IO_APIC -static struct resource * __init ioapic_setup_resources(void) -{ -#define IOAPIC_RESOURCE_NAME_SIZE 11 - unsigned long n; - struct resource *res; - char *mem; - int i; - - if (nr_ioapics <= 0) - return NULL; - - n = IOAPIC_RESOURCE_NAME_SIZE + sizeof(struct resource); - n *= nr_ioapics; - - mem = alloc_bootmem(n); - res = (void *)mem; - - if (mem != NULL) { - memset(mem, 0, n); - mem += sizeof(struct resource) * nr_ioapics; - - for (i = 0; i < nr_ioapics; i++) { - res[i].name = mem; - res[i].flags = IORESOURCE_MEM | IORESOURCE_BUSY; - sprintf(mem, "IOAPIC %u", i); - mem += IOAPIC_RESOURCE_NAME_SIZE; - } - } - - ioapic_resources = res; - - return res; -} - -static int __init ioapic_insert_resources(void) -{ - int i; - struct resource *r = ioapic_resources; - - if (!r) { - printk("IO APIC resources could be not be allocated.\n"); - return -1; - } - - for (i = 0; i < nr_ioapics; i++) { - insert_resource(&iomem_resource, r); - r++; - } - - return 0; -} - -/* Insert the IO APIC resources after PCI initialization has occured to handle - * IO APICS that are mapped in on a BAR in PCI space. */ -late_initcall(ioapic_insert_resources); -#endif - -void __init init_apic_mappings(void) -{ - unsigned long apic_phys; - - /* - * If no local APIC can be found then set up a fake all - * zeroes page to simulate the local APIC and another - * one for the IO-APIC. - */ - if (!smp_found_config && detect_init_APIC()) { - apic_phys = (unsigned long) alloc_bootmem_pages(PAGE_SIZE); - apic_phys = __pa(apic_phys); - } else - apic_phys = mp_lapic_addr; - - set_fixmap_nocache(FIX_APIC_BASE, apic_phys); - apic_mapped = 1; - apic_printk(APIC_VERBOSE,"mapped APIC to %16lx (%16lx)\n", APIC_BASE, apic_phys); - - /* Put local APIC into the resource map. */ - lapic_resource.start = apic_phys; - lapic_resource.end = lapic_resource.start + PAGE_SIZE - 1; - insert_resource(&iomem_resource, &lapic_resource); - - /* - * Fetch the APIC ID of the BSP in case we have a - * default configuration (or the MP table is broken). - */ - boot_cpu_id = GET_APIC_ID(apic_read(APIC_ID)); - - { - unsigned long ioapic_phys, idx = FIX_IO_APIC_BASE_0; - int i; - struct resource *ioapic_res; - - ioapic_res = ioapic_setup_resources(); - for (i = 0; i < nr_ioapics; i++) { - if (smp_found_config) { - ioapic_phys = mp_ioapics[i].mpc_apicaddr; - } else { - ioapic_phys = (unsigned long) alloc_bootmem_pages(PAGE_SIZE); - ioapic_phys = __pa(ioapic_phys); - } - set_fixmap_nocache(idx, ioapic_phys); - apic_printk(APIC_VERBOSE,"mapped IOAPIC to %016lx (%016lx)\n", - __fix_to_virt(idx), ioapic_phys); - idx++; - - if (ioapic_res != NULL) { - ioapic_res->start = ioapic_phys; - ioapic_res->end = ioapic_phys + (4 * 1024) - 1; - ioapic_res++; - } - } - } -} - -/* - * This function sets up the local APIC timer, with a timeout of - * 'clocks' APIC bus clock. During calibration we actually call - * this function twice on the boot CPU, once with a bogus timeout - * value, second time for real. The other (noncalibrating) CPUs - * call this function only once, with the real, calibrated value. - * - * We do reads before writes even if unnecessary, to get around the - * P5 APIC double write bug. - */ - -#define APIC_DIVISOR 16 - -static void __setup_APIC_LVTT(unsigned int clocks) -{ - unsigned int lvtt_value, tmp_value; - int cpu = smp_processor_id(); - - lvtt_value = APIC_LVT_TIMER_PERIODIC | LOCAL_TIMER_VECTOR; - - if (cpu_isset(cpu, timer_interrupt_broadcast_ipi_mask)) - lvtt_value |= APIC_LVT_MASKED; - - apic_write(APIC_LVTT, lvtt_value); - - /* - * Divide PICLK by 16 - */ - tmp_value = apic_read(APIC_TDCR); - apic_write(APIC_TDCR, (tmp_value - & ~(APIC_TDR_DIV_1 | APIC_TDR_DIV_TMBASE)) - | APIC_TDR_DIV_16); - - apic_write(APIC_TMICT, clocks/APIC_DIVISOR); -} - -static void setup_APIC_timer(unsigned int clocks) -{ - unsigned long flags; - - local_irq_save(flags); - - /* wait for irq slice */ - if (hpet_address && hpet_use_timer) { - u32 trigger = hpet_readl(HPET_T0_CMP); - while (hpet_readl(HPET_T0_CMP) == trigger) - /* do nothing */ ; - } else { - int c1, c2; - outb_p(0x00, 0x43); - c2 = inb_p(0x40); - c2 |= inb_p(0x40) << 8; - do { - c1 = c2; - outb_p(0x00, 0x43); - c2 = inb_p(0x40); - c2 |= inb_p(0x40) << 8; - } while (c2 - c1 < 300); - } - __setup_APIC_LVTT(clocks); - /* Turn off PIT interrupt if we use APIC timer as main timer. - Only works with the PM timer right now - TBD fix it for HPET too. */ - if ((pmtmr_ioport != 0) && - smp_processor_id() == boot_cpu_id && - apic_runs_main_timer == 1 && - !cpu_isset(boot_cpu_id, timer_interrupt_broadcast_ipi_mask)) { - stop_timer_interrupt(); - apic_runs_main_timer++; - } - local_irq_restore(flags); -} - -/* - * In this function we calibrate APIC bus clocks to the external - * timer. Unfortunately we cannot use jiffies and the timer irq - * to calibrate, since some later bootup code depends on getting - * the first irq? Ugh. - * - * We want to do the calibration only once since we - * want to have local timer irqs syncron. CPUs connected - * by the same APIC bus have the very same bus frequency. - * And we want to have irqs off anyways, no accidental - * APIC irq that way. - */ - -#define TICK_COUNT 100000000 - -static int __init calibrate_APIC_clock(void) -{ - unsigned apic, apic_start; - unsigned long tsc, tsc_start; - int result; - /* - * Put whatever arbitrary (but long enough) timeout - * value into the APIC clock, we just want to get the - * counter running for calibration. - */ - __setup_APIC_LVTT(4000000000); - - apic_start = apic_read(APIC_TMCCT); -#ifdef CONFIG_X86_PM_TIMER - if (apic_calibrate_pmtmr && pmtmr_ioport) { - pmtimer_wait(5000); /* 5ms wait */ - apic = apic_read(APIC_TMCCT); - result = (apic_start - apic) * 1000L / 5; - } else -#endif - { - rdtscll(tsc_start); - - do { - apic = apic_read(APIC_TMCCT); - rdtscll(tsc); - } while ((tsc - tsc_start) < TICK_COUNT && - (apic_start - apic) < TICK_COUNT); - - result = (apic_start - apic) * 1000L * tsc_khz / - (tsc - tsc_start); - } - printk("result %d\n", result); - - - printk(KERN_INFO "Detected %d.%03d MHz APIC timer.\n", - result / 1000 / 1000, result / 1000 % 1000); - - return result * APIC_DIVISOR / HZ; -} - -static unsigned int calibration_result; - -void __init setup_boot_APIC_clock (void) -{ - if (disable_apic_timer) { - printk(KERN_INFO "Disabling APIC timer\n"); - return; - } - - printk(KERN_INFO "Using local APIC timer interrupts.\n"); - using_apic_timer = 1; - - local_irq_disable(); - - calibration_result = calibrate_APIC_clock(); - /* - * Now set up the timer for real. - */ - setup_APIC_timer(calibration_result); - - local_irq_enable(); -} - -void __cpuinit setup_secondary_APIC_clock(void) -{ - local_irq_disable(); /* FIXME: Do we need this? --RR */ - setup_APIC_timer(calibration_result); - local_irq_enable(); -} - -void disable_APIC_timer(void) -{ - if (using_apic_timer) { - unsigned long v; - - v = apic_read(APIC_LVTT); - /* - * When an illegal vector value (0-15) is written to an LVT - * entry and delivery mode is Fixed, the APIC may signal an - * illegal vector error, with out regard to whether the mask - * bit is set or whether an interrupt is actually seen on input. - * - * Boot sequence might call this function when the LVTT has - * '0' vector value. So make sure vector field is set to - * valid value. - */ - v |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR); - apic_write(APIC_LVTT, v); - } -} - -void enable_APIC_timer(void) -{ - int cpu = smp_processor_id(); - - if (using_apic_timer && - !cpu_isset(cpu, timer_interrupt_broadcast_ipi_mask)) { - unsigned long v; - - v = apic_read(APIC_LVTT); - apic_write(APIC_LVTT, v & ~APIC_LVT_MASKED); - } -} - -void switch_APIC_timer_to_ipi(void *cpumask) -{ - cpumask_t mask = *(cpumask_t *)cpumask; - int cpu = smp_processor_id(); - - if (cpu_isset(cpu, mask) && - !cpu_isset(cpu, timer_interrupt_broadcast_ipi_mask)) { - disable_APIC_timer(); - cpu_set(cpu, timer_interrupt_broadcast_ipi_mask); - } -} -EXPORT_SYMBOL(switch_APIC_timer_to_ipi); - -void smp_send_timer_broadcast_ipi(void) -{ - int cpu = smp_processor_id(); - cpumask_t mask; - - cpus_and(mask, cpu_online_map, timer_interrupt_broadcast_ipi_mask); - - if (cpu_isset(cpu, mask)) { - cpu_clear(cpu, mask); - add_pda(apic_timer_irqs, 1); - smp_local_timer_interrupt(); - } - - if (!cpus_empty(mask)) { - send_IPI_mask(mask, LOCAL_TIMER_VECTOR); - } -} - -void switch_ipi_to_APIC_timer(void *cpumask) -{ - cpumask_t mask = *(cpumask_t *)cpumask; - int cpu = smp_processor_id(); - - if (cpu_isset(cpu, mask) && - cpu_isset(cpu, timer_interrupt_broadcast_ipi_mask)) { - cpu_clear(cpu, timer_interrupt_broadcast_ipi_mask); - enable_APIC_timer(); - } -} -EXPORT_SYMBOL(switch_ipi_to_APIC_timer); - -int setup_profiling_timer(unsigned int multiplier) -{ - return -EINVAL; -} - -void setup_APIC_extended_lvt(unsigned char lvt_off, unsigned char vector, - unsigned char msg_type, unsigned char mask) -{ - unsigned long reg = (lvt_off << 4) + K8_APIC_EXT_LVT_BASE; - unsigned int v = (mask << 16) | (msg_type << 8) | vector; - apic_write(reg, v); -} - -#undef APIC_DIVISOR - -/* - * Local timer interrupt handler. It does both profiling and - * process statistics/rescheduling. - * - * We do profiling in every local tick, statistics/rescheduling - * happen only every 'profiling multiplier' ticks. The default - * multiplier is 1 and it can be changed by writing the new multiplier - * value into /proc/profile. - */ - -void smp_local_timer_interrupt(void) -{ - profile_tick(CPU_PROFILING); -#ifdef CONFIG_SMP - update_process_times(user_mode(get_irq_regs())); -#endif - if (apic_runs_main_timer > 1 && smp_processor_id() == boot_cpu_id) - main_timer_handler(); - /* - * We take the 'long' return path, and there every subsystem - * grabs the appropriate locks (kernel lock/ irq lock). - * - * We might want to decouple profiling from the 'long path', - * and do the profiling totally in assembly. - * - * Currently this isn't too much of an issue (performance wise), - * we can take more than 100K local irqs per second on a 100 MHz P5. - */ -} - -/* - * Local APIC timer interrupt. This is the most natural way for doing - * local interrupts, but local timer interrupts can be emulated by - * broadcast interrupts too. [in case the hw doesn't support APIC timers] - * - * [ if a single-CPU system runs an SMP kernel then we call the local - * interrupt as well. Thus we cannot inline the local irq ... ] - */ -void smp_apic_timer_interrupt(struct pt_regs *regs) -{ - struct pt_regs *old_regs = set_irq_regs(regs); - - /* - * the NMI deadlock-detector uses this. - */ - add_pda(apic_timer_irqs, 1); - - /* - * NOTE! We'd better ACK the irq immediately, - * because timer handling can be slow. - */ - ack_APIC_irq(); - /* - * update_process_times() expects us to have done irq_enter(). - * Besides, if we don't timer interrupts ignore the global - * interrupt lock, which is the WrongThing (tm) to do. - */ - exit_idle(); - irq_enter(); - smp_local_timer_interrupt(); - irq_exit(); - set_irq_regs(old_regs); -} - -/* - * apic_is_clustered_box() -- Check if we can expect good TSC - * - * Thus far, the major user of this is IBM's Summit2 series: - * - * Clustered boxes may have unsynced TSC problems if they are - * multi-chassis. Use available data to take a good guess. - * If in doubt, go HPET. - */ -__cpuinit int apic_is_clustered_box(void) -{ - int i, clusters, zeros; - unsigned id; - DECLARE_BITMAP(clustermap, NUM_APIC_CLUSTERS); - - bitmap_zero(clustermap, NUM_APIC_CLUSTERS); - - for (i = 0; i < NR_CPUS; i++) { - id = bios_cpu_apicid[i]; - if (id != BAD_APICID) - __set_bit(APIC_CLUSTERID(id), clustermap); - } - - /* Problem: Partially populated chassis may not have CPUs in some of - * the APIC clusters they have been allocated. Only present CPUs have - * bios_cpu_apicid entries, thus causing zeroes in the bitmap. Since - * clusters are allocated sequentially, count zeros only if they are - * bounded by ones. - */ - clusters = 0; - zeros = 0; - for (i = 0; i < NUM_APIC_CLUSTERS; i++) { - if (test_bit(i, clustermap)) { - clusters += 1 + zeros; - zeros = 0; - } else - ++zeros; - } - - /* - * If clusters > 2, then should be multi-chassis. - * May have to revisit this when multi-core + hyperthreaded CPUs come - * out, but AFAIK this will work even for them. - */ - return (clusters > 2); -} - -/* - * This interrupt should _never_ happen with our APIC/SMP architecture - */ -asmlinkage void smp_spurious_interrupt(void) -{ - unsigned int v; - exit_idle(); - irq_enter(); - /* - * Check if this really is a spurious interrupt and ACK it - * if it is a vectored one. Just in case... - * Spurious interrupts should not be ACKed. - */ - v = apic_read(APIC_ISR + ((SPURIOUS_APIC_VECTOR & ~0x1f) >> 1)); - if (v & (1 << (SPURIOUS_APIC_VECTOR & 0x1f))) - ack_APIC_irq(); - - irq_exit(); -} - -/* - * This interrupt should never happen with our APIC/SMP architecture - */ - -asmlinkage void smp_error_interrupt(void) -{ - unsigned int v, v1; - - exit_idle(); - irq_enter(); - /* First tickle the hardware, only then report what went on. -- REW */ - v = apic_read(APIC_ESR); - apic_write(APIC_ESR, 0); - v1 = apic_read(APIC_ESR); - ack_APIC_irq(); - atomic_inc(&irq_err_count); - - /* Here is what the APIC error bits mean: - 0: Send CS error - 1: Receive CS error - 2: Send accept error - 3: Receive accept error - 4: Reserved - 5: Send illegal vector - 6: Received illegal vector - 7: Illegal register address - */ - printk (KERN_DEBUG "APIC error on CPU%d: %02x(%02x)\n", - smp_processor_id(), v , v1); - irq_exit(); -} - -int disable_apic; - -/* - * This initializes the IO-APIC and APIC hardware if this is - * a UP kernel. - */ -int __init APIC_init_uniprocessor (void) -{ - if (disable_apic) { - printk(KERN_INFO "Apic disabled\n"); - return -1; - } - if (!cpu_has_apic) { - disable_apic = 1; - printk(KERN_INFO "Apic disabled by BIOS\n"); - return -1; - } - - verify_local_APIC(); - - phys_cpu_present_map = physid_mask_of_physid(boot_cpu_id); - apic_write(APIC_ID, SET_APIC_ID(boot_cpu_id)); - - setup_local_APIC(); - - if (smp_found_config && !skip_ioapic_setup && nr_ioapics) - setup_IO_APIC(); - else - nr_ioapics = 0; - setup_boot_APIC_clock(); - check_nmi_watchdog(); - return 0; -} - -static __init int setup_disableapic(char *str) -{ - disable_apic = 1; - clear_bit(X86_FEATURE_APIC, boot_cpu_data.x86_capability); - return 0; -} -early_param("disableapic", setup_disableapic); - -/* same as disableapic, for compatibility */ -static __init int setup_nolapic(char *str) -{ - return setup_disableapic(str); -} -early_param("nolapic", setup_nolapic); - -static int __init parse_lapic_timer_c2_ok(char *arg) -{ - local_apic_timer_c2_ok = 1; - return 0; -} -early_param("lapic_timer_c2_ok", parse_lapic_timer_c2_ok); - -static __init int setup_noapictimer(char *str) -{ - if (str[0] != ' ' && str[0] != 0) - return 0; - disable_apic_timer = 1; - return 1; -} - -static __init int setup_apicmaintimer(char *str) -{ - apic_runs_main_timer = 1; - nohpet = 1; - return 1; -} -__setup("apicmaintimer", setup_apicmaintimer); - -static __init int setup_noapicmaintimer(char *str) -{ - apic_runs_main_timer = -1; - return 1; -} -__setup("noapicmaintimer", setup_noapicmaintimer); - -static __init int setup_apicpmtimer(char *s) -{ - apic_calibrate_pmtmr = 1; - notsc_setup(NULL); - return setup_apicmaintimer(NULL); -} -__setup("apicpmtimer", setup_apicpmtimer); - -__setup("noapictimer", setup_noapictimer); - diff --git a/arch/x86_64/kernel/asm-offsets.c b/arch/x86_64/kernel/asm-offsets.c deleted file mode 100644 index 778953bc636c..000000000000 --- a/arch/x86_64/kernel/asm-offsets.c +++ /dev/null @@ -1,85 +0,0 @@ -/* - * Generate definitions needed by assembly language modules. - * This code generates raw asm output which is post-processed to extract - * and format the required data. - */ - -#include <linux/crypto.h> -#include <linux/sched.h> -#include <linux/stddef.h> -#include <linux/errno.h> -#include <linux/hardirq.h> -#include <linux/suspend.h> -#include <asm/pda.h> -#include <asm/processor.h> -#include <asm/segment.h> -#include <asm/thread_info.h> -#include <asm/ia32.h> - -#define DEFINE(sym, val) \ - asm volatile("\n->" #sym " %0 " #val : : "i" (val)) - -#define BLANK() asm volatile("\n->" : : ) - -#define __NO_STUBS 1 -#undef __SYSCALL -#undef _ASM_X86_64_UNISTD_H_ -#define __SYSCALL(nr, sym) [nr] = 1, -static char syscalls[] = { -#include <asm/unistd.h> -}; - -int main(void) -{ -#define ENTRY(entry) DEFINE(tsk_ ## entry, offsetof(struct task_struct, entry)) - ENTRY(state); - ENTRY(flags); - ENTRY(thread); - ENTRY(pid); - BLANK(); -#undef ENTRY -#define ENTRY(entry) DEFINE(threadinfo_ ## entry, offsetof(struct thread_info, entry)) - ENTRY(flags); - ENTRY(addr_limit); - ENTRY(preempt_count); - ENTRY(status); - BLANK(); -#undef ENTRY -#define ENTRY(entry) DEFINE(pda_ ## entry, offsetof(struct x8664_pda, entry)) - ENTRY(kernelstack); - ENTRY(oldrsp); - ENTRY(pcurrent); - ENTRY(irqcount); - ENTRY(cpunumber); - ENTRY(irqstackptr); - ENTRY(data_offset); - BLANK(); -#undef ENTRY -#ifdef CONFIG_IA32_EMULATION -#define ENTRY(entry) DEFINE(IA32_SIGCONTEXT_ ## entry, offsetof(struct sigcontext_ia32, entry)) - ENTRY(eax); - ENTRY(ebx); - ENTRY(ecx); - ENTRY(edx); - ENTRY(esi); - ENTRY(edi); - ENTRY(ebp); - ENTRY(esp); - ENTRY(eip); - BLANK(); -#undef ENTRY - DEFINE(IA32_RT_SIGFRAME_sigcontext, - offsetof (struct rt_sigframe32, uc.uc_mcontext)); - BLANK(); -#endif - DEFINE(pbe_address, offsetof(struct pbe, address)); - DEFINE(pbe_orig_address, offsetof(struct pbe, orig_address)); - DEFINE(pbe_next, offsetof(struct pbe, next)); - BLANK(); - DEFINE(TSS_ist, offsetof(struct tss_struct, ist)); - BLANK(); - DEFINE(crypto_tfm_ctx_offset, offsetof(struct crypto_tfm, __crt_ctx)); - BLANK(); - DEFINE(__NR_syscall_max, sizeof(syscalls) - 1); - return 0; -} diff --git a/arch/x86_64/kernel/audit.c b/arch/x86_64/kernel/audit.c deleted file mode 100644 index 06d3e5a14d9d..000000000000 --- a/arch/x86_64/kernel/audit.c +++ /dev/null @@ -1,81 +0,0 @@ -#include <linux/init.h> -#include <linux/types.h> -#include <linux/audit.h> -#include <asm/unistd.h> - -static unsigned dir_class[] = { -#include <asm-generic/audit_dir_write.h> -~0U -}; - -static unsigned read_class[] = { -#include <asm-generic/audit_read.h> -~0U -}; - -static unsigned write_class[] = { -#include <asm-generic/audit_write.h> -~0U -}; - -static unsigned chattr_class[] = { -#include <asm-generic/audit_change_attr.h> -~0U -}; - -static unsigned signal_class[] = { -#include <asm-generic/audit_signal.h> -~0U -}; - -int audit_classify_arch(int arch) -{ -#ifdef CONFIG_IA32_EMULATION - if (arch == AUDIT_ARCH_I386) - return 1; -#endif - return 0; -} - -int audit_classify_syscall(int abi, unsigned syscall) -{ -#ifdef CONFIG_IA32_EMULATION - extern int ia32_classify_syscall(unsigned); - if (abi == AUDIT_ARCH_I386) - return ia32_classify_syscall(syscall); -#endif - switch(syscall) { - case __NR_open: - return 2; - case __NR_openat: - return 3; - case __NR_execve: - return 5; - default: - return 0; - } -} - -static int __init audit_classes_init(void) -{ -#ifdef CONFIG_IA32_EMULATION - extern __u32 ia32_dir_class[]; - extern __u32 ia32_write_class[]; - extern __u32 ia32_read_class[]; - extern __u32 ia32_chattr_class[]; - extern __u32 ia32_signal_class[]; - audit_register_class(AUDIT_CLASS_WRITE_32, ia32_write_class); - audit_register_class(AUDIT_CLASS_READ_32, ia32_read_class); - audit_register_class(AUDIT_CLASS_DIR_WRITE_32, ia32_dir_class); - audit_register_class(AUDIT_CLASS_CHATTR_32, ia32_chattr_class); - audit_register_class(AUDIT_CLASS_SIGNAL_32, ia32_signal_class); -#endif - audit_register_class(AUDIT_CLASS_WRITE, write_class); - audit_register_class(AUDIT_CLASS_READ, read_class); - audit_register_class(AUDIT_CLASS_DIR_WRITE, dir_class); - audit_register_class(AUDIT_CLASS_CHATTR, chattr_class); - audit_register_class(AUDIT_CLASS_SIGNAL, signal_class); - return 0; -} - -__initcall(audit_classes_init); diff --git a/arch/x86_64/kernel/bugs.c b/arch/x86_64/kernel/bugs.c deleted file mode 100644 index 4e5e9d364d63..000000000000 --- a/arch/x86_64/kernel/bugs.c +++ /dev/null @@ -1,24 +0,0 @@ -/* - * arch/x86_64/kernel/bugs.c - * - * Copyright (C) 1994 Linus Torvalds - * Copyright (C) 2000 SuSE - */ - -#include <linux/kernel.h> -#include <linux/init.h> -#include <asm/alternative.h> -#include <asm/bugs.h> -#include <asm/processor.h> -#include <asm/mtrr.h> - -void __init check_bugs(void) -{ - identify_cpu(&boot_cpu_data); - mtrr_bp_init(); -#if !defined(CONFIG_SMP) - printk("CPU: "); - print_cpu_info(&boot_cpu_data); -#endif - alternative_instructions(); -} diff --git a/arch/x86_64/kernel/cpufreq/Kconfig b/arch/x86_64/kernel/cpufreq/Kconfig deleted file mode 100644 index a3fd51926cbd..000000000000 --- a/arch/x86_64/kernel/cpufreq/Kconfig +++ /dev/null @@ -1,108 +0,0 @@ -# -# CPU Frequency scaling -# - -menu "CPU Frequency scaling" - -source "drivers/cpufreq/Kconfig" - -if CPU_FREQ - -comment "CPUFreq processor drivers" - -config X86_POWERNOW_K8 - tristate "AMD Opteron/Athlon64 PowerNow!" - select CPU_FREQ_TABLE - help - This adds the CPUFreq driver for mobile AMD Opteron/Athlon64 processors. - - To compile this driver as a module, choose M here: the - module will be called powernow-k8. - - For details, take a look at <file:Documentation/cpu-freq/>. - - If in doubt, say N. - -config X86_POWERNOW_K8_ACPI - bool - depends on X86_POWERNOW_K8 && ACPI_PROCESSOR - depends on !(X86_POWERNOW_K8 = y && ACPI_PROCESSOR = m) - default y - -config X86_SPEEDSTEP_CENTRINO - tristate "Intel Enhanced SpeedStep (deprecated)" - select CPU_FREQ_TABLE - depends on ACPI_PROCESSOR - help - This is deprecated and this functionality is now merged into - acpi_cpufreq (X86_ACPI_CPUFREQ). Use that driver instead of - speedstep_centrino. - This adds the CPUFreq driver for Enhanced SpeedStep enabled - mobile CPUs. This means Intel Pentium M (Centrino) CPUs - or 64bit enabled Intel Xeons. - - To compile this driver as a module, choose M here: the - module will be called speedstep-centrino. - - For details, take a look at <file:Documentation/cpu-freq/>. - - If in doubt, say N. - -config X86_ACPI_CPUFREQ - tristate "ACPI Processor P-States driver" - select CPU_FREQ_TABLE - depends on ACPI_PROCESSOR - help - This driver adds a CPUFreq driver which utilizes the ACPI - Processor Performance States. - This driver also supports Intel Enhanced Speedstep. - - To compile this driver as a module, choose M here: the - module will be called acpi-cpufreq. - - For details, take a look at <file:Documentation/cpu-freq/>. - - If in doubt, say N. - -comment "shared options" - -config X86_ACPI_CPUFREQ_PROC_INTF - bool "/proc/acpi/processor/../performance interface (deprecated)" - depends on PROC_FS - depends on X86_ACPI_CPUFREQ || X86_POWERNOW_K8_ACPI - help - This enables the deprecated /proc/acpi/processor/../performance - interface. While it is helpful for debugging, the generic, - cross-architecture cpufreq interfaces should be used. - - If in doubt, say N. - -config X86_P4_CLOCKMOD - tristate "Intel Pentium 4 clock modulation" - depends on EMBEDDED - select CPU_FREQ_TABLE - help - This adds the clock modulation driver for Intel Pentium 4 / XEON - processors. When enabled it will lower CPU temperature by skipping - clocks. - - This driver should be only used in exceptional - circumstances when very low power is needed because it causes severe - slowdowns and noticeable latencies. Normally Speedstep should be used - instead. - - To compile this driver as a module, choose M here: the - module will be called p4-clockmod. - - For details, take a look at <file:Documentation/cpu-freq/>. - - Unless you are absolutely sure say N. - - -config X86_SPEEDSTEP_LIB - tristate - default X86_P4_CLOCKMOD - -endif - -endmenu diff --git a/arch/x86_64/kernel/cpufreq/Makefile b/arch/x86_64/kernel/cpufreq/Makefile deleted file mode 100644 index 753ce1dd418e..000000000000 --- a/arch/x86_64/kernel/cpufreq/Makefile +++ /dev/null @@ -1,17 +0,0 @@ -# -# Reuse the i386 cpufreq drivers -# - -SRCDIR := ../../../i386/kernel/cpu/cpufreq - -obj-$(CONFIG_X86_POWERNOW_K8) += powernow-k8.o -obj-$(CONFIG_X86_ACPI_CPUFREQ) += acpi-cpufreq.o -obj-$(CONFIG_X86_SPEEDSTEP_CENTRINO) += speedstep-centrino.o -obj-$(CONFIG_X86_P4_CLOCKMOD) += p4-clockmod.o -obj-$(CONFIG_X86_SPEEDSTEP_LIB) += speedstep-lib.o - -powernow-k8-objs := ${SRCDIR}/powernow-k8.o -speedstep-centrino-objs := ${SRCDIR}/speedstep-centrino.o -acpi-cpufreq-objs := ${SRCDIR}/acpi-cpufreq.o -p4-clockmod-objs := ${SRCDIR}/p4-clockmod.o -speedstep-lib-objs := ${SRCDIR}/speedstep-lib.o diff --git a/arch/x86_64/kernel/crash.c b/arch/x86_64/kernel/crash.c deleted file mode 100644 index 13432a1ae904..000000000000 --- a/arch/x86_64/kernel/crash.c +++ /dev/null @@ -1,135 +0,0 @@ -/* - * Architecture specific (x86_64) functions for kexec based crash dumps. - * - * Created by: Hariprasad Nellitheertha (hari@in.ibm.com) - * - * Copyright (C) IBM Corporation, 2004. All rights reserved. - * - */ - -#include <linux/init.h> -#include <linux/types.h> -#include <linux/kernel.h> -#include <linux/smp.h> -#include <linux/irq.h> -#include <linux/reboot.h> -#include <linux/kexec.h> -#include <linux/delay.h> -#include <linux/elf.h> -#include <linux/elfcore.h> -#include <linux/kdebug.h> - -#include <asm/processor.h> -#include <asm/hardirq.h> -#include <asm/nmi.h> -#include <asm/hw_irq.h> -#include <asm/mach_apic.h> - -/* This keeps a track of which one is crashing cpu. */ -static int crashing_cpu; - -#ifdef CONFIG_SMP -static atomic_t waiting_for_crash_ipi; - -static int crash_nmi_callback(struct notifier_block *self, - unsigned long val, void *data) -{ - struct pt_regs *regs; - int cpu; - - if (val != DIE_NMI_IPI) - return NOTIFY_OK; - - regs = ((struct die_args *)data)->regs; - cpu = raw_smp_processor_id(); - - /* - * Don't do anything if this handler is invoked on crashing cpu. - * Otherwise, system will completely hang. Crashing cpu can get - * an NMI if system was initially booted with nmi_watchdog parameter. - */ - if (cpu == crashing_cpu) - return NOTIFY_STOP; - local_irq_disable(); - - crash_save_cpu(regs, cpu); - disable_local_APIC(); - atomic_dec(&waiting_for_crash_ipi); - /* Assume hlt works */ - for(;;) - halt(); - - return 1; -} - -static void smp_send_nmi_allbutself(void) -{ - send_IPI_allbutself(NMI_VECTOR); -} - -/* - * This code is a best effort heuristic to get the - * other cpus to stop executing. So races with - * cpu hotplug shouldn't matter. - */ - -static struct notifier_block crash_nmi_nb = { - .notifier_call = crash_nmi_callback, -}; - -static void nmi_shootdown_cpus(void) -{ - unsigned long msecs; - - atomic_set(&waiting_for_crash_ipi, num_online_cpus() - 1); - if (register_die_notifier(&crash_nmi_nb)) - return; /* return what? */ - - /* - * Ensure the new callback function is set before sending - * out the NMI - */ - wmb(); - - smp_send_nmi_allbutself(); - - msecs = 1000; /* Wait at most a second for the other cpus to stop */ - while ((atomic_read(&waiting_for_crash_ipi) > 0) && msecs) { - mdelay(1); - msecs--; - } - /* Leave the nmi callback set */ - disable_local_APIC(); -} -#else -static void nmi_shootdown_cpus(void) -{ - /* There are no cpus to shootdown */ -} -#endif - -void machine_crash_shutdown(struct pt_regs *regs) -{ - /* - * This function is only called after the system - * has panicked or is otherwise in a critical state. - * The minimum amount of code to allow a kexec'd kernel - * to run successfully needs to happen here. - * - * In practice this means shooting down the other cpus in - * an SMP system. - */ - /* The kernel is broken so disable interrupts */ - local_irq_disable(); - - /* Make a note of crashing cpu. Will be used in NMI callback.*/ - crashing_cpu = smp_processor_id(); - nmi_shootdown_cpus(); - - if(cpu_has_apic) - disable_local_APIC(); - - disable_IO_APIC(); - - crash_save_cpu(regs, smp_processor_id()); -} diff --git a/arch/x86_64/kernel/crash_dump.c b/arch/x86_64/kernel/crash_dump.c deleted file mode 100644 index 942deac4d43a..000000000000 --- a/arch/x86_64/kernel/crash_dump.c +++ /dev/null @@ -1,47 +0,0 @@ -/* - * kernel/crash_dump.c - Memory preserving reboot related code. - * - * Created by: Hariprasad Nellitheertha (hari@in.ibm.com) - * Copyright (C) IBM Corporation, 2004. All rights reserved - */ - -#include <linux/errno.h> -#include <linux/crash_dump.h> - -#include <asm/uaccess.h> -#include <asm/io.h> - -/** - * copy_oldmem_page - copy one page from "oldmem" - * @pfn: page frame number to be copied - * @buf: target memory address for the copy; this can be in kernel address - * space or user address space (see @userbuf) - * @csize: number of bytes to copy - * @offset: offset in bytes into the page (based on pfn) to begin the copy - * @userbuf: if set, @buf is in user address space, use copy_to_user(), - * otherwise @buf is in kernel address space, use memcpy(). - * - * Copy a page from "oldmem". For this page, there is no pte mapped - * in the current kernel. We stitch up a pte, similar to kmap_atomic. - */ -ssize_t copy_oldmem_page(unsigned long pfn, char *buf, - size_t csize, unsigned long offset, int userbuf) -{ - void *vaddr; - - if (!csize) - return 0; - - vaddr = ioremap(pfn << PAGE_SHIFT, PAGE_SIZE); - - if (userbuf) { - if (copy_to_user(buf, (vaddr + offset), csize)) { - iounmap(vaddr); - return -EFAULT; - } - } else - memcpy(buf, (vaddr + offset), csize); - - iounmap(vaddr); - return csize; -} diff --git a/arch/x86_64/kernel/e820.c b/arch/x86_64/kernel/e820.c deleted file mode 100644 index 0f4d5e209e9b..000000000000 --- a/arch/x86_64/kernel/e820.c +++ /dev/null @@ -1,725 +0,0 @@ -/* - * Handle the memory map. - * The functions here do the job until bootmem takes over. - * - * Getting sanitize_e820_map() in sync with i386 version by applying change: - * - Provisions for empty E820 memory regions (reported by certain BIOSes). - * Alex Achenbach <xela@slit.de>, December 2002. - * Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> - * - */ -#include <linux/kernel.h> -#include <linux/types.h> -#include <linux/init.h> -#include <linux/bootmem.h> -#include <linux/ioport.h> -#include <linux/string.h> -#include <linux/kexec.h> -#include <linux/module.h> -#include <linux/mm.h> -#include <linux/suspend.h> -#include <linux/pfn.h> - -#include <asm/pgtable.h> -#include <asm/page.h> -#include <asm/e820.h> -#include <asm/proto.h> -#include <asm/bootsetup.h> -#include <asm/sections.h> - -struct e820map e820; - -/* - * PFN of last memory page. - */ -unsigned long end_pfn; -EXPORT_SYMBOL(end_pfn); - -/* - * end_pfn only includes RAM, while end_pfn_map includes all e820 entries. - * The direct mapping extends to end_pfn_map, so that we can directly access - * apertures, ACPI and other tables without having to play with fixmaps. - */ -unsigned long end_pfn_map; - -/* - * Last pfn which the user wants to use. - */ -static unsigned long __initdata end_user_pfn = MAXMEM>>PAGE_SHIFT; - -extern struct resource code_resource, data_resource; - -/* Check for some hardcoded bad areas that early boot is not allowed to touch */ -static inline int bad_addr(unsigned long *addrp, unsigned long size) -{ - unsigned long addr = *addrp, last = addr + size; - - /* various gunk below that needed for SMP startup */ - if (addr < 0x8000) { - *addrp = PAGE_ALIGN(0x8000); - return 1; - } - - /* direct mapping tables of the kernel */ - if (last >= table_start<<PAGE_SHIFT && addr < table_end<<PAGE_SHIFT) { - *addrp = PAGE_ALIGN(table_end << PAGE_SHIFT); - return 1; - } - - /* initrd */ -#ifdef CONFIG_BLK_DEV_INITRD - if (LOADER_TYPE && INITRD_START && last >= INITRD_START && - addr < INITRD_START+INITRD_SIZE) { - *addrp = PAGE_ALIGN(INITRD_START + INITRD_SIZE); - return 1; - } -#endif - /* kernel code */ - if (last >= __pa_symbol(&_text) && addr < __pa_symbol(&_end)) { - *addrp = PAGE_ALIGN(__pa_symbol(&_end)); - return 1; - } - - if (last >= ebda_addr && addr < ebda_addr + ebda_size) { - *addrp = PAGE_ALIGN(ebda_addr + ebda_size); - return 1; - } - -#ifdef CONFIG_NUMA - /* NUMA memory to node map */ - if (last >= nodemap_addr && addr < nodemap_addr + nodemap_size) { - *addrp = nodemap_addr + nodemap_size; - return 1; - } -#endif - /* XXX ramdisk image here? */ - return 0; -} - -/* - * This function checks if any part of the range <start,end> is mapped - * with type. - */ -int -e820_any_mapped(unsigned long start, unsigned long end, unsigned type) -{ - int i; - for (i = 0; i < e820.nr_map; i++) { - struct e820entry *ei = &e820.map[i]; - if (type && ei->type != type) - continue; - if (ei->addr >= end || ei->addr + ei->size <= start) - continue; - return 1; - } - return 0; -} -EXPORT_SYMBOL_GPL(e820_any_mapped); - -/* - * This function checks if the entire range <start,end> is mapped with type. - * - * Note: this function only works correct if the e820 table is sorted and - * not-overlapping, which is the case - */ -int __init e820_all_mapped(unsigned long start, unsigned long end, unsigned type) -{ - int i; - for (i = 0; i < e820.nr_map; i++) { - struct e820entry *ei = &e820.map[i]; - if (type && ei->type != type) - continue; - /* is the region (part) in overlap with the current region ?*/ - if (ei->addr >= end || ei->addr + ei->size <= start) - continue; - - /* if the region is at the beginning of <start,end> we move - * start to the end of the region since it's ok until there - */ - if (ei->addr <= start) - start = ei->addr + ei->size; - /* if start is now at or beyond end, we're done, full coverage */ - if (start >= end) - return 1; /* we're done */ - } - return 0; -} - -/* - * Find a free area in a specific range. - */ -unsigned long __init find_e820_area(unsigned long start, unsigned long end, unsigned size) -{ - int i; - for (i = 0; i < e820.nr_map; i++) { - struct e820entry *ei = &e820.map[i]; - unsigned long addr = ei->addr, last; - if (ei->type != E820_RAM) - continue; - if (addr < start) - addr = start; - if (addr > ei->addr + ei->size) - continue; - while (bad_addr(&addr, size) && addr+size <= ei->addr+ei->size) - ; - last = PAGE_ALIGN(addr) + size; - if (last > ei->addr + ei->size) - continue; - if (last > end) - continue; - return addr; - } - return -1UL; -} - -/* - * Find the highest page frame number we have available - */ -unsigned long __init e820_end_of_ram(void) -{ - unsigned long end_pfn = 0; - end_pfn = find_max_pfn_with_active_regions(); - - if (end_pfn > end_pfn_map) - end_pfn_map = end_pfn; - if (end_pfn_map > MAXMEM>>PAGE_SHIFT) - end_pfn_map = MAXMEM>>PAGE_SHIFT; - if (end_pfn > end_user_pfn) - end_pfn = end_user_pfn; - if (end_pfn > end_pfn_map) - end_pfn = end_pfn_map; - - printk("end_pfn_map = %lu\n", end_pfn_map); - return end_pfn; -} - -/* - * Mark e820 reserved areas as busy for the resource manager. - */ -void __init e820_reserve_resources(void) -{ - int i; - for (i = 0; i < e820.nr_map; i++) { - struct resource *res; - res = alloc_bootmem_low(sizeof(struct resource)); - switch (e820.map[i].type) { - case E820_RAM: res->name = "System RAM"; break; - case E820_ACPI: res->name = "ACPI Tables"; break; - case E820_NVS: res->name = "ACPI Non-volatile Storage"; break; - default: res->name = "reserved"; - } - res->start = e820.map[i].addr; - res->end = res->start + e820.map[i].size - 1; - res->flags = IORESOURCE_MEM | IORESOURCE_BUSY; - request_resource(&iomem_resource, res); - if (e820.map[i].type == E820_RAM) { - /* - * We don't know which RAM region contains kernel data, - * so we try it repeatedly and let the resource manager - * test it. - */ - request_resource(res, &code_resource); - request_resource(res, &data_resource); -#ifdef CONFIG_KEXEC - request_resource(res, &crashk_res); -#endif - } - } -} - -/* - * Find the ranges of physical addresses that do not correspond to - * e820 RAM areas and mark the corresponding pages as nosave for software - * suspend and suspend to RAM. - * - * This function requires the e820 map to be sorted and without any - * overlapping entries and assumes the first e820 area to be RAM. - */ -void __init e820_mark_nosave_regions(void) -{ - int i; - unsigned long paddr; - - paddr = round_down(e820.map[0].addr + e820.map[0].size, PAGE_SIZE); - for (i = 1; i < e820.nr_map; i++) { - struct e820entry *ei = &e820.map[i]; - - if (paddr < ei->addr) - register_nosave_region(PFN_DOWN(paddr), - PFN_UP(ei->addr)); - - paddr = round_down(ei->addr + ei->size, PAGE_SIZE); - if (ei->type != E820_RAM) - register_nosave_region(PFN_UP(ei->addr), - PFN_DOWN(paddr)); - - if (paddr >= (end_pfn << PAGE_SHIFT)) - break; - } -} - -/* - * Finds an active region in the address range from start_pfn to end_pfn and - * returns its range in ei_startpfn and ei_endpfn for the e820 entry. - */ -static int __init e820_find_active_region(const struct e820entry *ei, - unsigned long start_pfn, - unsigned long end_pfn, - unsigned long *ei_startpfn, - unsigned long *ei_endpfn) -{ - *ei_startpfn = round_up(ei->addr, PAGE_SIZE) >> PAGE_SHIFT; - *ei_endpfn = round_down(ei->addr + ei->size, PAGE_SIZE) >> PAGE_SHIFT; - - /* Skip map entries smaller than a page */ - if (*ei_startpfn >= *ei_endpfn) - return 0; - - /* Check if end_pfn_map should be updated */ - if (ei->type != E820_RAM && *ei_endpfn > end_pfn_map) - end_pfn_map = *ei_endpfn; - - /* Skip if map is outside the node */ - if (ei->type != E820_RAM || *ei_endpfn <= start_pfn || - *ei_startpfn >= end_pfn) - return 0; - - /* Check for overlaps */ - if (*ei_startpfn < start_pfn) - *ei_startpfn = start_pfn; - if (*ei_endpfn > end_pfn) - *ei_endpfn = end_pfn; - - /* Obey end_user_pfn to save on memmap */ - if (*ei_startpfn >= end_user_pfn) - return 0; - if (*ei_endpfn > end_user_pfn) - *ei_endpfn = end_user_pfn; - - return 1; -} - -/* Walk the e820 map and register active regions within a node */ -void __init -e820_register_active_regions(int nid, unsigned long start_pfn, - unsigned long end_pfn) -{ - unsigned long ei_startpfn; - unsigned long ei_endpfn; - int i; - - for (i = 0; i < e820.nr_map; i++) - if (e820_find_active_region(&e820.map[i], - start_pfn, end_pfn, - &ei_startpfn, &ei_endpfn)) - add_active_range(nid, ei_startpfn, ei_endpfn); -} - -/* - * Add a memory region to the kernel e820 map. - */ -void __init add_memory_region(unsigned long start, unsigned long size, int type) -{ - int x = e820.nr_map; - - if (x == E820MAX) { - printk(KERN_ERR "Ooops! Too many entries in the memory map!\n"); - return; - } - - e820.map[x].addr = start; - e820.map[x].size = size; - e820.map[x].type = type; - e820.nr_map++; -} - -/* - * Find the hole size (in bytes) in the memory range. - * @start: starting address of the memory range to scan - * @end: ending address of the memory range to scan - */ -unsigned long __init e820_hole_size(unsigned long start, unsigned long end) -{ - unsigned long start_pfn = start >> PAGE_SHIFT; - unsigned long end_pfn = end >> PAGE_SHIFT; - unsigned long ei_startpfn; - unsigned long ei_endpfn; - unsigned long ram = 0; - int i; - - for (i = 0; i < e820.nr_map; i++) { - if (e820_find_active_region(&e820.map[i], - start_pfn, end_pfn, - &ei_startpfn, &ei_endpfn)) - ram += ei_endpfn - ei_startpfn; - } - return end - start - (ram << PAGE_SHIFT); -} - -void __init e820_print_map(char *who) -{ - int i; - - for (i = 0; i < e820.nr_map; i++) { - printk(KERN_INFO " %s: %016Lx - %016Lx ", who, - (unsigned long long) e820.map[i].addr, - (unsigned long long) (e820.map[i].addr + e820.map[i].size)); - switch (e820.map[i].type) { - case E820_RAM: printk("(usable)\n"); - break; - case E820_RESERVED: - printk("(reserved)\n"); - break; - case E820_ACPI: - printk("(ACPI data)\n"); - break; - case E820_NVS: - printk("(ACPI NVS)\n"); - break; - default: printk("type %u\n", e820.map[i].type); - break; - } - } -} - -/* - * Sanitize the BIOS e820 map. - * - * Some e820 responses include overlapping entries. The following - * replaces the original e820 map with a new one, removing overlaps. - * - */ -static int __init sanitize_e820_map(struct e820entry * biosmap, char * pnr_map) -{ - struct change_member { - struct e820entry *pbios; /* pointer to original bios entry */ - unsigned long long addr; /* address for this change point */ - }; - static struct change_member change_point_list[2*E820MAX] __initdata; - static struct change_member *change_point[2*E820MAX] __initdata; - static struct e820entry *overlap_list[E820MAX] __initdata; - static struct e820entry new_bios[E820MAX] __initdata; - struct change_member *change_tmp; - unsigned long current_type, last_type; - unsigned long long last_addr; - int chgidx, still_changing; - int overlap_entries; - int new_bios_entry; - int old_nr, new_nr, chg_nr; - int i; - - /* - Visually we're performing the following (1,2,3,4 = memory types)... - - Sample memory map (w/overlaps): - ____22__________________ - ______________________4_ - ____1111________________ - _44_____________________ - 11111111________________ - ____________________33__ - ___________44___________ - __________33333_________ - ______________22________ - ___________________2222_ - _________111111111______ - _____________________11_ - _________________4______ - - Sanitized equivalent (no overlap): - 1_______________________ - _44_____________________ - ___1____________________ - ____22__________________ - ______11________________ - _________1______________ - __________3_____________ - ___________44___________ - _____________33_________ - _______________2________ - ________________1_______ - _________________4______ - ___________________2____ - ____________________33__ - ______________________4_ - */ - - /* if there's only one memory region, don't bother */ - if (*pnr_map < 2) - return -1; - - old_nr = *pnr_map; - - /* bail out if we find any unreasonable addresses in bios map */ - for (i=0; i<old_nr; i++) - if (biosmap[i].addr + biosmap[i].size < biosmap[i].addr) - return -1; - - /* create pointers for initial change-point information (for sorting) */ - for (i=0; i < 2*old_nr; i++) - change_point[i] = &change_point_list[i]; - - /* record all known change-points (starting and ending addresses), - omitting those that are for empty memory regions */ - chgidx = 0; - for (i=0; i < old_nr; i++) { - if (biosmap[i].size != 0) { - change_point[chgidx]->addr = biosmap[i].addr; - change_point[chgidx++]->pbios = &biosmap[i]; - change_point[chgidx]->addr = biosmap[i].addr + biosmap[i].size; - change_point[chgidx++]->pbios = &biosmap[i]; - } - } - chg_nr = chgidx; - - /* sort change-point list by memory addresses (low -> high) */ - still_changing = 1; - while (still_changing) { - still_changing = 0; - for (i=1; i < chg_nr; i++) { - /* if <current_addr> > <last_addr>, swap */ - /* or, if current=<start_addr> & last=<end_addr>, swap */ - if ((change_point[i]->addr < change_point[i-1]->addr) || - ((change_point[i]->addr == change_point[i-1]->addr) && - (change_point[i]->addr == change_point[i]->pbios->addr) && - (change_point[i-1]->addr != change_point[i-1]->pbios->addr)) - ) - { - change_tmp = change_point[i]; - change_point[i] = change_point[i-1]; - change_point[i-1] = change_tmp; - still_changing=1; - } - } - } - - /* create a new bios memory map, removing overlaps */ - overlap_entries=0; /* number of entries in the overlap table */ - new_bios_entry=0; /* index for creating new bios map entries */ - last_type = 0; /* start with undefined memory type */ - last_addr = 0; /* start with 0 as last starting address */ - /* loop through change-points, determining affect on the new bios map */ - for (chgidx=0; chgidx < chg_nr; chgidx++) - { - /* keep track of all overlapping bios entries */ - if (change_point[chgidx]->addr == change_point[chgidx]->pbios->addr) - { - /* add map entry to overlap list (> 1 entry implies an overlap) */ - overlap_list[overlap_entries++]=change_point[chgidx]->pbios; - } - else - { - /* remove entry from list (order independent, so swap with last) */ - for (i=0; i<overlap_entries; i++) - { - if (overlap_list[i] == change_point[chgidx]->pbios) - overlap_list[i] = overlap_list[overlap_entries-1]; - } - overlap_entries--; - } - /* if there are overlapping entries, decide which "type" to use */ - /* (larger value takes precedence -- 1=usable, 2,3,4,4+=unusable) */ - current_type = 0; - for (i=0; i<overlap_entries; i++) - if (overlap_list[i]->type > current_type) - current_type = overlap_list[i]->type; - /* continue building up new bios map based on this information */ - if (current_type != last_type) { - if (last_type != 0) { - new_bios[new_bios_entry].size = - change_point[chgidx]->addr - last_addr; - /* move forward only if the new size was non-zero */ - if (new_bios[new_bios_entry].size != 0) - if (++new_bios_entry >= E820MAX) - break; /* no more space left for new bios entries */ - } - if (current_type != 0) { - new_bios[new_bios_entry].addr = change_point[chgidx]->addr; - new_bios[new_bios_entry].type = current_type; - last_addr=change_point[chgidx]->addr; - } - last_type = current_type; - } - } - new_nr = new_bios_entry; /* retain count for new bios entries */ - - /* copy new bios mapping into original location */ - memcpy(biosmap, new_bios, new_nr*sizeof(struct e820entry)); - *pnr_map = new_nr; - - return 0; -} - -/* - * Copy the BIOS e820 map into a safe place. - * - * Sanity-check it while we're at it.. - * - * If we're lucky and live on a modern system, the setup code - * will have given us a memory map that we can use to properly - * set up memory. If we aren't, we'll fake a memory map. - */ -static int __init copy_e820_map(struct e820entry * biosmap, int nr_map) -{ - /* Only one memory region (or negative)? Ignore it */ - if (nr_map < 2) - return -1; - - do { - unsigned long start = biosmap->addr; - unsigned long size = biosmap->size; - unsigned long end = start + size; - unsigned long type = biosmap->type; - - /* Overflow in 64 bits? Ignore the memory map. */ - if (start > end) - return -1; - - add_memory_region(start, size, type); - } while (biosmap++,--nr_map); - return 0; -} - -void early_panic(char *msg) -{ - early_printk(msg); - panic(msg); -} - -void __init setup_memory_region(void) -{ - /* - * Try to copy the BIOS-supplied E820-map. - * - * Otherwise fake a memory map; one section from 0k->640k, - * the next section from 1mb->appropriate_mem_k - */ - sanitize_e820_map(E820_MAP, &E820_MAP_NR); - if (copy_e820_map(E820_MAP, E820_MAP_NR) < 0) - early_panic("Cannot find a valid memory map"); - printk(KERN_INFO "BIOS-provided physical RAM map:\n"); - e820_print_map("BIOS-e820"); -} - -static int __init parse_memopt(char *p) -{ - if (!p) - return -EINVAL; - end_user_pfn = memparse(p, &p); - end_user_pfn >>= PAGE_SHIFT; - return 0; -} -early_param("mem", parse_memopt); - -static int userdef __initdata; - -static int __init parse_memmap_opt(char *p) -{ - char *oldp; - unsigned long long start_at, mem_size; - - if (!strcmp(p, "exactmap")) { -#ifdef CONFIG_CRASH_DUMP - /* If we are doing a crash dump, we - * still need to know the real mem - * size before original memory map is - * reset. - */ - e820_register_active_regions(0, 0, -1UL); - saved_max_pfn = e820_end_of_ram(); - remove_all_active_ranges(); -#endif - end_pfn_map = 0; - e820.nr_map = 0; - userdef = 1; - return 0; - } - - oldp = p; - mem_size = memparse(p, &p); - if (p == oldp) - return -EINVAL; - if (*p == '@') { - start_at = memparse(p+1, &p); - add_memory_region(start_at, mem_size, E820_RAM); - } else if (*p == '#') { - start_at = memparse(p+1, &p); - add_memory_region(start_at, mem_size, E820_ACPI); - } else if (*p == '$') { - start_at = memparse(p+1, &p); - add_memory_region(start_at, mem_size, E820_RESERVED); - } else { - end_user_pfn = (mem_size >> PAGE_SHIFT); - } - return *p == '\0' ? 0 : -EINVAL; -} -early_param("memmap", parse_memmap_opt); - -void __init finish_e820_parsing(void) -{ - if (userdef) { - printk(KERN_INFO "user-defined physical RAM map:\n"); - e820_print_map("user"); - } -} - -unsigned long pci_mem_start = 0xaeedbabe; -EXPORT_SYMBOL(pci_mem_start); - -/* - * Search for the biggest gap in the low 32 bits of the e820 - * memory space. We pass this space to PCI to assign MMIO resources - * for hotplug or unconfigured devices in. - * Hopefully the BIOS let enough space left. - */ -__init void e820_setup_gap(void) -{ - unsigned long gapstart, gapsize, round; - unsigned long last; - int i; - int found = 0; - - last = 0x100000000ull; - gapstart = 0x10000000; - gapsize = 0x400000; - i = e820.nr_map; - while (--i >= 0) { - unsigned long long start = e820.map[i].addr; - unsigned long long end = start + e820.map[i].size; - - /* - * Since "last" is at most 4GB, we know we'll - * fit in 32 bits if this condition is true - */ - if (last > end) { - unsigned long gap = last - end; - - if (gap > gapsize) { - gapsize = gap; - gapstart = end; - found = 1; - } - } - if (start < last) - last = start; - } - - if (!found) { - gapstart = (end_pfn << PAGE_SHIFT) + 1024*1024; - printk(KERN_ERR "PCI: Warning: Cannot find a gap in the 32bit address range\n" - KERN_ERR "PCI: Unassigned devices with 32bit resource registers may break!\n"); - } - - /* - * See how much we want to round up: start off with - * rounding to the next 1MB area. - */ - round = 0x100000; - while ((gapsize >> 4) > round) - round += round; - /* Fun with two's complement */ - pci_mem_start = (gapstart + round) & -round; - - printk(KERN_INFO "Allocating PCI resources starting at %lx (gap: %lx:%lx)\n", - pci_mem_start, gapstart, gapsize); -} diff --git a/arch/x86_64/kernel/early-quirks.c b/arch/x86_64/kernel/early-quirks.c deleted file mode 100644 index 13aa4fd728f3..000000000000 --- a/arch/x86_64/kernel/early-quirks.c +++ /dev/null @@ -1,127 +0,0 @@ -/* Various workarounds for chipset bugs. - This code runs very early and can't use the regular PCI subsystem - The entries are keyed to PCI bridges which usually identify chipsets - uniquely. - This is only for whole classes of chipsets with specific problems which - need early invasive action (e.g. before the timers are initialized). - Most PCI device specific workarounds can be done later and should be - in standard PCI quirks - Mainboard specific bugs should be handled by DMI entries. - CPU specific bugs in setup.c */ - -#include <linux/pci.h> -#include <linux/acpi.h> -#include <linux/pci_ids.h> -#include <asm/pci-direct.h> -#include <asm/proto.h> -#include <asm/iommu.h> -#include <asm/dma.h> - -static void __init via_bugs(void) -{ -#ifdef CONFIG_IOMMU - if ((end_pfn > MAX_DMA32_PFN || force_iommu) && - !iommu_aperture_allowed) { - printk(KERN_INFO - "Looks like a VIA chipset. Disabling IOMMU. Override with iommu=allowed\n"); - iommu_aperture_disabled = 1; - } -#endif -} - -#ifdef CONFIG_ACPI - -static int __init nvidia_hpet_check(struct acpi_table_header *header) -{ - return 0; -} -#endif - -static void __init nvidia_bugs(void) -{ -#ifdef CONFIG_ACPI - /* - * All timer overrides on Nvidia are - * wrong unless HPET is enabled. - * Unfortunately that's not true on many Asus boards. - * We don't know yet how to detect this automatically, but - * at least allow a command line override. - */ - if (acpi_use_timer_override) - return; - - if (acpi_table_parse(ACPI_SIG_HPET, nvidia_hpet_check)) { - acpi_skip_timer_override = 1; - printk(KERN_INFO "Nvidia board " - "detected. Ignoring ACPI " - "timer override.\n"); - printk(KERN_INFO "If you got timer trouble " - "try acpi_use_timer_override\n"); - } -#endif - /* RED-PEN skip them on mptables too? */ - -} - -static void __init ati_bugs(void) -{ - if (timer_over_8254 == 1) { - timer_over_8254 = 0; - printk(KERN_INFO - "ATI board detected. Disabling timer routing over 8254.\n"); - } -} - -struct chipset { - u16 vendor; - void (*f)(void); -}; - -static struct chipset early_qrk[] __initdata = { - { PCI_VENDOR_ID_NVIDIA, nvidia_bugs }, - { PCI_VENDOR_ID_VIA, via_bugs }, - { PCI_VENDOR_ID_ATI, ati_bugs }, - {} -}; - -void __init early_quirks(void) -{ - int num, slot, func; - - if (!early_pci_allowed()) - return; - - /* Poor man's PCI discovery */ - for (num = 0; num < 32; num++) { - for (slot = 0; slot < 32; slot++) { - for (func = 0; func < 8; func++) { - u32 class; - u32 vendor; - u8 type; - int i; - class = read_pci_config(num,slot,func, - PCI_CLASS_REVISION); - if (class == 0xffffffff) - break; - - if ((class >> 16) != PCI_CLASS_BRIDGE_PCI) - continue; - - vendor = read_pci_config(num, slot, func, - PCI_VENDOR_ID); - vendor &= 0xffff; - - for (i = 0; early_qrk[i].f; i++) - if (early_qrk[i].vendor == vendor) { - early_qrk[i].f(); - return; - } - - type = read_pci_config_byte(num, slot, func, - PCI_HEADER_TYPE); - if (!(type & 0x80)) - break; - } - } - } -} diff --git a/arch/x86_64/kernel/early_printk.c b/arch/x86_64/kernel/early_printk.c deleted file mode 100644 index fd9aff3f3890..000000000000 --- a/arch/x86_64/kernel/early_printk.c +++ /dev/null @@ -1,259 +0,0 @@ -#include <linux/console.h> -#include <linux/kernel.h> -#include <linux/init.h> -#include <linux/string.h> -#include <linux/screen_info.h> -#include <asm/io.h> -#include <asm/processor.h> -#include <asm/fcntl.h> -#include <xen/hvc-console.h> - -/* Simple VGA output */ - -#ifdef __i386__ -#include <asm/setup.h> -#else -#include <asm/bootsetup.h> -#endif -#define VGABASE (__ISA_IO_base + 0xb8000) - -static int max_ypos = 25, max_xpos = 80; -static int current_ypos = 25, current_xpos = 0; - -static void early_vga_write(struct console *con, const char *str, unsigned n) -{ - char c; - int i, k, j; - - while ((c = *str++) != '\0' && n-- > 0) { - if (current_ypos >= max_ypos) { - /* scroll 1 line up */ - for (k = 1, j = 0; k < max_ypos; k++, j++) { - for (i = 0; i < max_xpos; i++) { - writew(readw(VGABASE+2*(max_xpos*k+i)), - VGABASE + 2*(max_xpos*j + i)); - } - } - for (i = 0; i < max_xpos; i++) - writew(0x720, VGABASE + 2*(max_xpos*j + i)); - current_ypos = max_ypos-1; - } - if (c == '\n') { - current_xpos = 0; - current_ypos++; - } else if (c != '\r') { - writew(((0x7 << 8) | (unsigned short) c), - VGABASE + 2*(max_xpos*current_ypos + - current_xpos++)); - if (current_xpos >= max_xpos) { - current_xpos = 0; - current_ypos++; - } - } - } -} - -static struct console early_vga_console = { - .name = "earlyvga", - .write = early_vga_write, - .flags = CON_PRINTBUFFER, - .index = -1, -}; - -/* Serial functions loosely based on a similar package from Klaus P. Gerlicher */ - -static int early_serial_base = 0x3f8; /* ttyS0 */ - -#define XMTRDY 0x20 - -#define DLAB 0x80 - -#define TXR 0 /* Transmit register (WRITE) */ -#define RXR 0 /* Receive register (READ) */ -#define IER 1 /* Interrupt Enable */ -#define IIR 2 /* Interrupt ID */ -#define FCR 2 /* FIFO control */ -#define LCR 3 /* Line control */ -#define MCR 4 /* Modem control */ -#define LSR 5 /* Line Status */ -#define MSR 6 /* Modem Status */ -#define DLL 0 /* Divisor Latch Low */ -#define DLH 1 /* Divisor latch High */ - -static int early_serial_putc(unsigned char ch) -{ - unsigned timeout = 0xffff; - while ((inb(early_serial_base + LSR) & XMTRDY) == 0 && --timeout) - cpu_relax(); - outb(ch, early_serial_base + TXR); - return timeout ? 0 : -1; -} - -static void early_serial_write(struct console *con, const char *s, unsigned n) -{ - while (*s && n-- > 0) { - if (*s == '\n') - early_serial_putc('\r'); - early_serial_putc(*s); - s++; - } -} - -#define DEFAULT_BAUD 9600 - -static __init void early_serial_init(char *s) -{ - unsigned char c; - unsigned divisor; - unsigned baud = DEFAULT_BAUD; - char *e; - - if (*s == ',') - ++s; - - if (*s) { - unsigned port; - if (!strncmp(s,"0x",2)) { - early_serial_base = simple_strtoul(s, &e, 16); - } else { - static int bases[] = { 0x3f8, 0x2f8 }; - - if (!strncmp(s,"ttyS",4)) - s += 4; - port = simple_strtoul(s, &e, 10); - if (port > 1 || s == e) - port = 0; - early_serial_base = bases[port]; - } - s += strcspn(s, ","); - if (*s == ',') - s++; - } - - outb(0x3, early_serial_base + LCR); /* 8n1 */ - outb(0, early_serial_base + IER); /* no interrupt */ - outb(0, early_serial_base + FCR); /* no fifo */ - outb(0x3, early_serial_base + MCR); /* DTR + RTS */ - - if (*s) { - baud = simple_strtoul(s, &e, 0); - if (baud == 0 || s == e) - baud = DEFAULT_BAUD; - } - - divisor = 115200 / baud; - c = inb(early_serial_base + LCR); - outb(c | DLAB, early_serial_base + LCR); - outb(divisor & 0xff, early_serial_base + DLL); - outb((divisor >> 8) & 0xff, early_serial_base + DLH); - outb(c & ~DLAB, early_serial_base + LCR); -} - -static struct console early_serial_console = { - .name = "earlyser", - .write = early_serial_write, - .flags = CON_PRINTBUFFER, - .index = -1, -}; - -/* Console interface to a host file on AMD's SimNow! */ - -static int simnow_fd; - -enum { - MAGIC1 = 0xBACCD00A, - MAGIC2 = 0xCA110000, - XOPEN = 5, - XWRITE = 4, -}; - -static noinline long simnow(long cmd, long a, long b, long c) -{ - long ret; - asm volatile("cpuid" : - "=a" (ret) : - "b" (a), "c" (b), "d" (c), "0" (MAGIC1), "D" (cmd + MAGIC2)); - return ret; -} - -static void __init simnow_init(char *str) -{ - char *fn = "klog"; - if (*str == '=') - fn = ++str; - /* error ignored */ - simnow_fd = simnow(XOPEN, (unsigned long)fn, O_WRONLY|O_APPEND|O_CREAT, 0644); -} - -static void simnow_write(struct console *con, const char *s, unsigned n) -{ - simnow(XWRITE, simnow_fd, (unsigned long)s, n); -} - -static struct console simnow_console = { - .name = "simnow", - .write = simnow_write, - .flags = CON_PRINTBUFFER, - .index = -1, -}; - -/* Direct interface for emergencies */ -struct console *early_console = &early_vga_console; -static int early_console_initialized = 0; - -void early_printk(const char *fmt, ...) -{ - char buf[512]; - int n; - va_list ap; - - va_start(ap,fmt); - n = vscnprintf(buf,512,fmt,ap); - early_console->write(early_console,buf,n); - va_end(ap); -} - -static int __initdata keep_early; - -static int __init setup_early_printk(char *buf) -{ - if (!buf) - return 0; - - if (early_console_initialized) - return 0; - early_console_initialized = 1; - - if (strstr(buf, "keep")) - keep_early = 1; - - if (!strncmp(buf, "serial", 6)) { - early_serial_init(buf + 6); - early_console = &early_serial_console; - } else if (!strncmp(buf, "ttyS", 4)) { - early_serial_init(buf); - early_console = &early_serial_console; - } else if (!strncmp(buf, "vga", 3) - && SCREEN_INFO.orig_video_isVGA == 1) { - max_xpos = SCREEN_INFO.orig_video_cols; - max_ypos = SCREEN_INFO.orig_video_lines; - current_ypos = SCREEN_INFO.orig_y; - early_console = &early_vga_console; - } else if (!strncmp(buf, "simnow", 6)) { - simnow_init(buf + 6); - early_console = &simnow_console; - keep_early = 1; -#ifdef CONFIG_HVC_XEN - } else if (!strncmp(buf, "xen", 3)) { - early_console = &xenboot_console; -#endif - } - - if (keep_early) - early_console->flags &= ~CON_BOOT; - else - early_console->flags |= CON_BOOT; - register_console(early_console); - return 0; -} -early_param("earlyprintk", setup_early_printk); diff --git a/arch/x86_64/kernel/entry.S b/arch/x86_64/kernel/entry.S deleted file mode 100644 index 1d232e5f5658..000000000000 --- a/arch/x86_64/kernel/entry.S +++ /dev/null @@ -1,1172 +0,0 @@ -/* - * linux/arch/x86_64/entry.S - * - * Copyright (C) 1991, 1992 Linus Torvalds - * Copyright (C) 2000, 2001, 2002 Andi Kleen SuSE Labs - * Copyright (C) 2000 Pavel Machek <pavel@suse.cz> - */ - -/* - * entry.S contains the system-call and fault low-level handling routines. - * - * NOTE: This code handles signal-recognition, which happens every time - * after an interrupt and after each system call. - * - * Normal syscalls and interrupts don't save a full stack frame, this is - * only done for syscall tracing, signals or fork/exec et.al. - * - * A note on terminology: - * - top of stack: Architecture defined interrupt frame from SS to RIP - * at the top of the kernel process stack. - * - partial stack frame: partially saved registers upto R11. - * - full stack frame: Like partial stack frame, but all register saved. - * - * Some macro usage: - * - CFI macros are used to generate dwarf2 unwind information for better - * backtraces. They don't change any code. - * - SAVE_ALL/RESTORE_ALL - Save/restore all registers - * - SAVE_ARGS/RESTORE_ARGS - Save/restore registers that C functions modify. - * There are unfortunately lots of special cases where some registers - * not touched. The macro is a big mess that should be cleaned up. - * - SAVE_REST/RESTORE_REST - Handle the registers not saved by SAVE_ARGS. - * Gives a full stack frame. - * - ENTRY/END Define functions in the symbol table. - * - FIXUP_TOP_OF_STACK/RESTORE_TOP_OF_STACK - Fix up the hardware stack - * frame that is otherwise undefined after a SYSCALL - * - TRACE_IRQ_* - Trace hard interrupt state for lock debugging. - * - errorentry/paranoidentry/zeroentry - Define exception entry points. - */ - -#include <linux/linkage.h> -#include <asm/segment.h> -#include <asm/cache.h> -#include <asm/errno.h> -#include <asm/dwarf2.h> -#include <asm/calling.h> -#include <asm/asm-offsets.h> -#include <asm/msr.h> -#include <asm/unistd.h> -#include <asm/thread_info.h> -#include <asm/hw_irq.h> -#include <asm/page.h> -#include <asm/irqflags.h> - - .code64 - -#ifndef CONFIG_PREEMPT -#define retint_kernel retint_restore_args -#endif - - -.macro TRACE_IRQS_IRETQ offset=ARGOFFSET -#ifdef CONFIG_TRACE_IRQFLAGS - bt $9,EFLAGS-\offset(%rsp) /* interrupts off? */ - jnc 1f - TRACE_IRQS_ON -1: -#endif -.endm - -/* - * C code is not supposed to know about undefined top of stack. Every time - * a C function with an pt_regs argument is called from the SYSCALL based - * fast path FIXUP_TOP_OF_STACK is needed. - * RESTORE_TOP_OF_STACK syncs the syscall state after any possible ptregs - * manipulation. - */ - - /* %rsp:at FRAMEEND */ - .macro FIXUP_TOP_OF_STACK tmp - movq %gs:pda_oldrsp,\tmp - movq \tmp,RSP(%rsp) - movq $__USER_DS,SS(%rsp) - movq $__USER_CS,CS(%rsp) - movq $-1,RCX(%rsp) - movq R11(%rsp),\tmp /* get eflags */ - movq \tmp,EFLAGS(%rsp) - .endm - - .macro RESTORE_TOP_OF_STACK tmp,offset=0 - movq RSP-\offset(%rsp),\tmp - movq \tmp,%gs:pda_oldrsp - movq EFLAGS-\offset(%rsp),\tmp - movq \tmp,R11-\offset(%rsp) - .endm - - .macro FAKE_STACK_FRAME child_rip - /* push in order ss, rsp, eflags, cs, rip */ - xorl %eax, %eax - pushq %rax /* ss */ - CFI_ADJUST_CFA_OFFSET 8 - /*CFI_REL_OFFSET ss,0*/ - pushq %rax /* rsp */ - CFI_ADJUST_CFA_OFFSET 8 - CFI_REL_OFFSET rsp,0 - pushq $(1<<9) /* eflags - interrupts on */ - CFI_ADJUST_CFA_OFFSET 8 - /*CFI_REL_OFFSET rflags,0*/ - pushq $__KERNEL_CS /* cs */ - CFI_ADJUST_CFA_OFFSET 8 - /*CFI_REL_OFFSET cs,0*/ - pushq \child_rip /* rip */ - CFI_ADJUST_CFA_OFFSET 8 - CFI_REL_OFFSET rip,0 - pushq %rax /* orig rax */ - CFI_ADJUST_CFA_OFFSET 8 - .endm - - .macro UNFAKE_STACK_FRAME - addq $8*6, %rsp - CFI_ADJUST_CFA_OFFSET -(6*8) - .endm - - .macro CFI_DEFAULT_STACK start=1 - .if \start - CFI_STARTPROC simple - CFI_SIGNAL_FRAME - CFI_DEF_CFA rsp,SS+8 - .else - CFI_DEF_CFA_OFFSET SS+8 - .endif - CFI_REL_OFFSET r15,R15 - CFI_REL_OFFSET r14,R14 - CFI_REL_OFFSET r13,R13 - CFI_REL_OFFSET r12,R12 - CFI_REL_OFFSET rbp,RBP - CFI_REL_OFFSET rbx,RBX - CFI_REL_OFFSET r11,R11 - CFI_REL_OFFSET r10,R10 - CFI_REL_OFFSET r9,R9 - CFI_REL_OFFSET r8,R8 - CFI_REL_OFFSET rax,RAX - CFI_REL_OFFSET rcx,RCX - CFI_REL_OFFSET rdx,RDX - CFI_REL_OFFSET rsi,RSI - CFI_REL_OFFSET rdi,RDI - CFI_REL_OFFSET rip,RIP - /*CFI_REL_OFFSET cs,CS*/ - /*CFI_REL_OFFSET rflags,EFLAGS*/ - CFI_REL_OFFSET rsp,RSP - /*CFI_REL_OFFSET ss,SS*/ - .endm -/* - * A newly forked process directly context switches into this. - */ -/* rdi: prev */ -ENTRY(ret_from_fork) - CFI_DEFAULT_STACK - push kernel_eflags(%rip) - CFI_ADJUST_CFA_OFFSET 4 - popf # reset kernel eflags - CFI_ADJUST_CFA_OFFSET -4 - call schedule_tail - GET_THREAD_INFO(%rcx) - testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT),threadinfo_flags(%rcx) - jnz rff_trace -rff_action: - RESTORE_REST - testl $3,CS-ARGOFFSET(%rsp) # from kernel_thread? - je int_ret_from_sys_call - testl $_TIF_IA32,threadinfo_flags(%rcx) - jnz int_ret_from_sys_call - RESTORE_TOP_OF_STACK %rdi,ARGOFFSET - jmp ret_from_sys_call -rff_trace: - movq %rsp,%rdi - call syscall_trace_leave - GET_THREAD_INFO(%rcx) - jmp rff_action - CFI_ENDPROC -END(ret_from_fork) - -/* - * System call entry. Upto 6 arguments in registers are supported. - * - * SYSCALL does not save anything on the stack and does not change the - * stack pointer. - */ - -/* - * Register setup: - * rax system call number - * rdi arg0 - * rcx return address for syscall/sysret, C arg3 - * rsi arg1 - * rdx arg2 - * r10 arg3 (--> moved to rcx for C) - * r8 arg4 - * r9 arg5 - * r11 eflags for syscall/sysret, temporary for C - * r12-r15,rbp,rbx saved by C code, not touched. - * - * Interrupts are off on entry. - * Only called from user space. - * - * XXX if we had a free scratch register we could save the RSP into the stack frame - * and report it properly in ps. Unfortunately we haven't. - * - * When user can change the frames always force IRET. That is because - * it deals with uncanonical addresses better. SYSRET has trouble - * with them due to bugs in both AMD and Intel CPUs. - */ - -ENTRY(system_call) - CFI_STARTPROC simple - CFI_SIGNAL_FRAME - CFI_DEF_CFA rsp,PDA_STACKOFFSET - CFI_REGISTER rip,rcx - /*CFI_REGISTER rflags,r11*/ - swapgs - movq %rsp,%gs:pda_oldrsp - movq %gs:pda_kernelstack,%rsp - /* - * No need to follow this irqs off/on section - it's straight - * and short: - */ - sti - SAVE_ARGS 8,1 - movq %rax,ORIG_RAX-ARGOFFSET(%rsp) - movq %rcx,RIP-ARGOFFSET(%rsp) - CFI_REL_OFFSET rip,RIP-ARGOFFSET - GET_THREAD_INFO(%rcx) - testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),threadinfo_flags(%rcx) - jnz tracesys - cmpq $__NR_syscall_max,%rax - ja badsys - movq %r10,%rcx - call *sys_call_table(,%rax,8) # XXX: rip relative - movq %rax,RAX-ARGOFFSET(%rsp) -/* - * Syscall return path ending with SYSRET (fast path) - * Has incomplete stack frame and undefined top of stack. - */ -ret_from_sys_call: - movl $_TIF_ALLWORK_MASK,%edi - /* edi: flagmask */ -sysret_check: - GET_THREAD_INFO(%rcx) - cli - TRACE_IRQS_OFF - movl threadinfo_flags(%rcx),%edx - andl %edi,%edx - jnz sysret_careful - CFI_REMEMBER_STATE - /* - * sysretq will re-enable interrupts: - */ - TRACE_IRQS_ON - movq RIP-ARGOFFSET(%rsp),%rcx - CFI_REGISTER rip,rcx - RESTORE_ARGS 0,-ARG_SKIP,1 - /*CFI_REGISTER rflags,r11*/ - movq %gs:pda_oldrsp,%rsp - swapgs - sysretq - - CFI_RESTORE_STATE - /* Handle reschedules */ - /* edx: work, edi: workmask */ -sysret_careful: - bt $TIF_NEED_RESCHED,%edx - jnc sysret_signal - TRACE_IRQS_ON - sti - pushq %rdi - CFI_ADJUST_CFA_OFFSET 8 - call schedule - popq %rdi - CFI_ADJUST_CFA_OFFSET -8 - jmp sysret_check - - /* Handle a signal */ -sysret_signal: - TRACE_IRQS_ON - sti - testl $(_TIF_SIGPENDING|_TIF_SINGLESTEP|_TIF_MCE_NOTIFY),%edx - jz 1f - - /* Really a signal */ - /* edx: work flags (arg3) */ - leaq do_notify_resume(%rip),%rax - leaq -ARGOFFSET(%rsp),%rdi # &pt_regs -> arg1 - xorl %esi,%esi # oldset -> arg2 - call ptregscall_common -1: movl $_TIF_NEED_RESCHED,%edi - /* Use IRET because user could have changed frame. This - works because ptregscall_common has called FIXUP_TOP_OF_STACK. */ - cli - TRACE_IRQS_OFF - jmp int_with_check - -badsys: - movq $-ENOSYS,RAX-ARGOFFSET(%rsp) - jmp ret_from_sys_call - - /* Do syscall tracing */ -tracesys: - SAVE_REST - movq $-ENOSYS,RAX(%rsp) - FIXUP_TOP_OF_STACK %rdi - movq %rsp,%rdi - call syscall_trace_enter - LOAD_ARGS ARGOFFSET /* reload args from stack in case ptrace changed it */ - RESTORE_REST - cmpq $__NR_syscall_max,%rax - movq $-ENOSYS,%rcx - cmova %rcx,%rax - ja 1f - movq %r10,%rcx /* fixup for C */ - call *sys_call_table(,%rax,8) -1: movq %rax,RAX-ARGOFFSET(%rsp) - /* Use IRET because user could have changed frame */ - -/* - * Syscall return path ending with IRET. - * Has correct top of stack, but partial stack frame. - */ - .globl int_ret_from_sys_call -int_ret_from_sys_call: - cli - TRACE_IRQS_OFF - testl $3,CS-ARGOFFSET(%rsp) - je retint_restore_args - movl $_TIF_ALLWORK_MASK,%edi - /* edi: mask to check */ -int_with_check: - GET_THREAD_INFO(%rcx) - movl threadinfo_flags(%rcx),%edx - andl %edi,%edx - jnz int_careful - andl $~TS_COMPAT,threadinfo_status(%rcx) - jmp retint_swapgs - - /* Either reschedule or signal or syscall exit tracking needed. */ - /* First do a reschedule test. */ - /* edx: work, edi: workmask */ -int_careful: - bt $TIF_NEED_RESCHED,%edx - jnc int_very_careful - TRACE_IRQS_ON - sti - pushq %rdi - CFI_ADJUST_CFA_OFFSET 8 - call schedule - popq %rdi - CFI_ADJUST_CFA_OFFSET -8 - cli - TRACE_IRQS_OFF - jmp int_with_check - - /* handle signals and tracing -- both require a full stack frame */ -int_very_careful: - TRACE_IRQS_ON - sti - SAVE_REST - /* Check for syscall exit trace */ - testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP),%edx - jz int_signal - pushq %rdi - CFI_ADJUST_CFA_OFFSET 8 - leaq 8(%rsp),%rdi # &ptregs -> arg1 - call syscall_trace_leave - popq %rdi - CFI_ADJUST_CFA_OFFSET -8 - andl $~(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP),%edi - jmp int_restore_rest - -int_signal: - testl $(_TIF_SIGPENDING|_TIF_SINGLESTEP|_TIF_MCE_NOTIFY),%edx - jz 1f - movq %rsp,%rdi # &ptregs -> arg1 - xorl %esi,%esi # oldset -> arg2 - call do_notify_resume -1: movl $_TIF_NEED_RESCHED,%edi -int_restore_rest: - RESTORE_REST - cli - TRACE_IRQS_OFF - jmp int_with_check - CFI_ENDPROC -END(system_call) - -/* - * Certain special system calls that need to save a complete full stack frame. - */ - - .macro PTREGSCALL label,func,arg - .globl \label -\label: - leaq \func(%rip),%rax - leaq -ARGOFFSET+8(%rsp),\arg /* 8 for return address */ - jmp ptregscall_common -END(\label) - .endm - - CFI_STARTPROC - - PTREGSCALL stub_clone, sys_clone, %r8 - PTREGSCALL stub_fork, sys_fork, %rdi - PTREGSCALL stub_vfork, sys_vfork, %rdi - PTREGSCALL stub_rt_sigsuspend, sys_rt_sigsuspend, %rdx - PTREGSCALL stub_sigaltstack, sys_sigaltstack, %rdx - PTREGSCALL stub_iopl, sys_iopl, %rsi - -ENTRY(ptregscall_common) - popq %r11 - CFI_ADJUST_CFA_OFFSET -8 - CFI_REGISTER rip, r11 - SAVE_REST - movq %r11, %r15 - CFI_REGISTER rip, r15 - FIXUP_TOP_OF_STACK %r11 - call *%rax - RESTORE_TOP_OF_STACK %r11 - movq %r15, %r11 - CFI_REGISTER rip, r11 - RESTORE_REST - pushq %r11 - CFI_ADJUST_CFA_OFFSET 8 - CFI_REL_OFFSET rip, 0 - ret - CFI_ENDPROC -END(ptregscall_common) - -ENTRY(stub_execve) - CFI_STARTPROC - popq %r11 - CFI_ADJUST_CFA_OFFSET -8 - CFI_REGISTER rip, r11 - SAVE_REST - FIXUP_TOP_OF_STACK %r11 - call sys_execve - RESTORE_TOP_OF_STACK %r11 - movq %rax,RAX(%rsp) - RESTORE_REST - jmp int_ret_from_sys_call - CFI_ENDPROC -END(stub_execve) - -/* - * sigreturn is special because it needs to restore all registers on return. - * This cannot be done with SYSRET, so use the IRET return path instead. - */ -ENTRY(stub_rt_sigreturn) - CFI_STARTPROC - addq $8, %rsp - CFI_ADJUST_CFA_OFFSET -8 - SAVE_REST - movq %rsp,%rdi - FIXUP_TOP_OF_STACK %r11 - call sys_rt_sigreturn - movq %rax,RAX(%rsp) # fixme, this could be done at the higher layer - RESTORE_REST - jmp int_ret_from_sys_call - CFI_ENDPROC -END(stub_rt_sigreturn) - -/* - * initial frame state for interrupts and exceptions - */ - .macro _frame ref - CFI_STARTPROC simple - CFI_SIGNAL_FRAME - CFI_DEF_CFA rsp,SS+8-\ref - /*CFI_REL_OFFSET ss,SS-\ref*/ - CFI_REL_OFFSET rsp,RSP-\ref - /*CFI_REL_OFFSET rflags,EFLAGS-\ref*/ - /*CFI_REL_OFFSET cs,CS-\ref*/ - CFI_REL_OFFSET rip,RIP-\ref - .endm - -/* initial frame state for interrupts (and exceptions without error code) */ -#define INTR_FRAME _frame RIP -/* initial frame state for exceptions with error code (and interrupts with - vector already pushed) */ -#define XCPT_FRAME _frame ORIG_RAX - -/* - * Interrupt entry/exit. - * - * Interrupt entry points save only callee clobbered registers in fast path. - * - * Entry runs with interrupts off. - */ - -/* 0(%rsp): interrupt number */ - .macro interrupt func - cld - SAVE_ARGS - leaq -ARGOFFSET(%rsp),%rdi # arg1 for handler - pushq %rbp - CFI_ADJUST_CFA_OFFSET 8 - CFI_REL_OFFSET rbp, 0 - movq %rsp,%rbp - CFI_DEF_CFA_REGISTER rbp - testl $3,CS(%rdi) - je 1f - swapgs - /* irqcount is used to check if a CPU is already on an interrupt - stack or not. While this is essentially redundant with preempt_count - it is a little cheaper to use a separate counter in the PDA - (short of moving irq_enter into assembly, which would be too - much work) */ -1: incl %gs:pda_irqcount - cmoveq %gs:pda_irqstackptr,%rsp - push %rbp # backlink for old unwinder - /* - * We entered an interrupt context - irqs are off: - */ - TRACE_IRQS_OFF - call \func - .endm - -ENTRY(common_interrupt) - XCPT_FRAME - interrupt do_IRQ - /* 0(%rsp): oldrsp-ARGOFFSET */ -ret_from_intr: - cli - TRACE_IRQS_OFF - decl %gs:pda_irqcount - leaveq - CFI_DEF_CFA_REGISTER rsp - CFI_ADJUST_CFA_OFFSET -8 -exit_intr: - GET_THREAD_INFO(%rcx) - testl $3,CS-ARGOFFSET(%rsp) - je retint_kernel - - /* Interrupt came from user space */ - /* - * Has a correct top of stack, but a partial stack frame - * %rcx: thread info. Interrupts off. - */ -retint_with_reschedule: - movl $_TIF_WORK_MASK,%edi -retint_check: - movl threadinfo_flags(%rcx),%edx - andl %edi,%edx - CFI_REMEMBER_STATE - jnz retint_careful -retint_swapgs: - /* - * The iretq could re-enable interrupts: - */ - cli - TRACE_IRQS_IRETQ - swapgs - jmp restore_args - -retint_restore_args: - cli - /* - * The iretq could re-enable interrupts: - */ - TRACE_IRQS_IRETQ -restore_args: - RESTORE_ARGS 0,8,0 -iret_label: - iretq - - .section __ex_table,"a" - .quad iret_label,bad_iret - .previous - .section .fixup,"ax" - /* force a signal here? this matches i386 behaviour */ - /* running with kernel gs */ -bad_iret: - movq $11,%rdi /* SIGSEGV */ - TRACE_IRQS_ON - sti - jmp do_exit - .previous - - /* edi: workmask, edx: work */ -retint_careful: - CFI_RESTORE_STATE - bt $TIF_NEED_RESCHED,%edx - jnc retint_signal - TRACE_IRQS_ON - sti - pushq %rdi - CFI_ADJUST_CFA_OFFSET 8 - call schedule - popq %rdi - CFI_ADJUST_CFA_OFFSET -8 - GET_THREAD_INFO(%rcx) - cli - TRACE_IRQS_OFF - jmp retint_check - -retint_signal: - testl $(_TIF_SIGPENDING|_TIF_SINGLESTEP|_TIF_MCE_NOTIFY),%edx - jz retint_swapgs - TRACE_IRQS_ON - sti - SAVE_REST - movq $-1,ORIG_RAX(%rsp) - xorl %esi,%esi # oldset - movq %rsp,%rdi # &pt_regs - call do_notify_resume - RESTORE_REST - cli - TRACE_IRQS_OFF - movl $_TIF_NEED_RESCHED,%edi - GET_THREAD_INFO(%rcx) - jmp retint_check - -#ifdef CONFIG_PREEMPT - /* Returning to kernel space. Check if we need preemption */ - /* rcx: threadinfo. interrupts off. */ -ENTRY(retint_kernel) - cmpl $0,threadinfo_preempt_count(%rcx) - jnz retint_restore_args - bt $TIF_NEED_RESCHED,threadinfo_flags(%rcx) - jnc retint_restore_args - bt $9,EFLAGS-ARGOFFSET(%rsp) /* interrupts off? */ - jnc retint_restore_args - call preempt_schedule_irq - jmp exit_intr -#endif - - CFI_ENDPROC -END(common_interrupt) - -/* - * APIC interrupts. - */ - .macro apicinterrupt num,func - INTR_FRAME - pushq $~(\num) - CFI_ADJUST_CFA_OFFSET 8 - interrupt \func - jmp ret_from_intr - CFI_ENDPROC - .endm - -ENTRY(thermal_interrupt) - apicinterrupt THERMAL_APIC_VECTOR,smp_thermal_interrupt -END(thermal_interrupt) - -ENTRY(threshold_interrupt) - apicinterrupt THRESHOLD_APIC_VECTOR,mce_threshold_interrupt -END(threshold_interrupt) - -#ifdef CONFIG_SMP -ENTRY(reschedule_interrupt) - apicinterrupt RESCHEDULE_VECTOR,smp_reschedule_interrupt -END(reschedule_interrupt) - - .macro INVALIDATE_ENTRY num -ENTRY(invalidate_interrupt\num) - apicinterrupt INVALIDATE_TLB_VECTOR_START+\num,smp_invalidate_interrupt -END(invalidate_interrupt\num) - .endm - - INVALIDATE_ENTRY 0 - INVALIDATE_ENTRY 1 - INVALIDATE_ENTRY 2 - INVALIDATE_ENTRY 3 - INVALIDATE_ENTRY 4 - INVALIDATE_ENTRY 5 - INVALIDATE_ENTRY 6 - INVALIDATE_ENTRY 7 - -ENTRY(call_function_interrupt) - apicinterrupt CALL_FUNCTION_VECTOR,smp_call_function_interrupt -END(call_function_interrupt) -ENTRY(irq_move_cleanup_interrupt) - apicinterrupt IRQ_MOVE_CLEANUP_VECTOR,smp_irq_move_cleanup_interrupt -END(irq_move_cleanup_interrupt) -#endif - -ENTRY(apic_timer_interrupt) - apicinterrupt LOCAL_TIMER_VECTOR,smp_apic_timer_interrupt -END(apic_timer_interrupt) - -ENTRY(error_interrupt) - apicinterrupt ERROR_APIC_VECTOR,smp_error_interrupt -END(error_interrupt) - -ENTRY(spurious_interrupt) - apicinterrupt SPURIOUS_APIC_VECTOR,smp_spurious_interrupt -END(spurious_interrupt) - -/* - * Exception entry points. - */ - .macro zeroentry sym - INTR_FRAME - pushq $0 /* push error code/oldrax */ - CFI_ADJUST_CFA_OFFSET 8 - pushq %rax /* push real oldrax to the rdi slot */ - CFI_ADJUST_CFA_OFFSET 8 - CFI_REL_OFFSET rax,0 - leaq \sym(%rip),%rax - jmp error_entry - CFI_ENDPROC - .endm - - .macro errorentry sym - XCPT_FRAME - pushq %rax - CFI_ADJUST_CFA_OFFSET 8 - CFI_REL_OFFSET rax,0 - leaq \sym(%rip),%rax - jmp error_entry - CFI_ENDPROC - .endm - - /* error code is on the stack already */ - /* handle NMI like exceptions that can happen everywhere */ - .macro paranoidentry sym, ist=0, irqtrace=1 - SAVE_ALL - cld - movl $1,%ebx - movl $MSR_GS_BASE,%ecx - rdmsr - testl %edx,%edx - js 1f - swapgs - xorl %ebx,%ebx -1: - .if \ist - movq %gs:pda_data_offset, %rbp - .endif - movq %rsp,%rdi - movq ORIG_RAX(%rsp),%rsi - movq $-1,ORIG_RAX(%rsp) - .if \ist - subq $EXCEPTION_STKSZ, per_cpu__init_tss + TSS_ist + (\ist - 1) * 8(%rbp) - .endif - call \sym - .if \ist - addq $EXCEPTION_STKSZ, per_cpu__init_tss + TSS_ist + (\ist - 1) * 8(%rbp) - .endif - cli - .if \irqtrace - TRACE_IRQS_OFF - .endif - .endm - - /* - * "Paranoid" exit path from exception stack. - * Paranoid because this is used by NMIs and cannot take - * any kernel state for granted. - * We don't do kernel preemption checks here, because only - * NMI should be common and it does not enable IRQs and - * cannot get reschedule ticks. - * - * "trace" is 0 for the NMI handler only, because irq-tracing - * is fundamentally NMI-unsafe. (we cannot change the soft and - * hard flags at once, atomically) - */ - .macro paranoidexit trace=1 - /* ebx: no swapgs flag */ -paranoid_exit\trace: - testl %ebx,%ebx /* swapgs needed? */ - jnz paranoid_restore\trace - testl $3,CS(%rsp) - jnz paranoid_userspace\trace -paranoid_swapgs\trace: - .if \trace - TRACE_IRQS_IRETQ 0 - .endif - swapgs -paranoid_restore\trace: - RESTORE_ALL 8 - iretq -paranoid_userspace\trace: - GET_THREAD_INFO(%rcx) - movl threadinfo_flags(%rcx),%ebx - andl $_TIF_WORK_MASK,%ebx - jz paranoid_swapgs\trace - movq %rsp,%rdi /* &pt_regs */ - call sync_regs - movq %rax,%rsp /* switch stack for scheduling */ - testl $_TIF_NEED_RESCHED,%ebx - jnz paranoid_schedule\trace - movl %ebx,%edx /* arg3: thread flags */ - .if \trace - TRACE_IRQS_ON - .endif - sti - xorl %esi,%esi /* arg2: oldset */ - movq %rsp,%rdi /* arg1: &pt_regs */ - call do_notify_resume - cli - .if \trace - TRACE_IRQS_OFF - .endif - jmp paranoid_userspace\trace -paranoid_schedule\trace: - .if \trace - TRACE_IRQS_ON - .endif - sti - call schedule - cli - .if \trace - TRACE_IRQS_OFF - .endif - jmp paranoid_userspace\trace - CFI_ENDPROC - .endm - -/* - * Exception entry point. This expects an error code/orig_rax on the stack - * and the exception handler in %rax. - */ -KPROBE_ENTRY(error_entry) - _frame RDI - CFI_REL_OFFSET rax,0 - /* rdi slot contains rax, oldrax contains error code */ - cld - subq $14*8,%rsp - CFI_ADJUST_CFA_OFFSET (14*8) - movq %rsi,13*8(%rsp) - CFI_REL_OFFSET rsi,RSI - movq 14*8(%rsp),%rsi /* load rax from rdi slot */ - CFI_REGISTER rax,rsi - movq %rdx,12*8(%rsp) - CFI_REL_OFFSET rdx,RDX - movq %rcx,11*8(%rsp) - CFI_REL_OFFSET rcx,RCX - movq %rsi,10*8(%rsp) /* store rax */ - CFI_REL_OFFSET rax,RAX - movq %r8, 9*8(%rsp) - CFI_REL_OFFSET r8,R8 - movq %r9, 8*8(%rsp) - CFI_REL_OFFSET r9,R9 - movq %r10,7*8(%rsp) - CFI_REL_OFFSET r10,R10 - movq %r11,6*8(%rsp) - CFI_REL_OFFSET r11,R11 - movq %rbx,5*8(%rsp) - CFI_REL_OFFSET rbx,RBX - movq %rbp,4*8(%rsp) - CFI_REL_OFFSET rbp,RBP - movq %r12,3*8(%rsp) - CFI_REL_OFFSET r12,R12 - movq %r13,2*8(%rsp) - CFI_REL_OFFSET r13,R13 - movq %r14,1*8(%rsp) - CFI_REL_OFFSET r14,R14 - movq %r15,(%rsp) - CFI_REL_OFFSET r15,R15 - xorl %ebx,%ebx - testl $3,CS(%rsp) - je error_kernelspace -error_swapgs: - swapgs -error_sti: - movq %rdi,RDI(%rsp) - CFI_REL_OFFSET rdi,RDI - movq %rsp,%rdi - movq ORIG_RAX(%rsp),%rsi /* get error code */ - movq $-1,ORIG_RAX(%rsp) - call *%rax - /* ebx: no swapgs flag (1: don't need swapgs, 0: need it) */ -error_exit: - movl %ebx,%eax - RESTORE_REST - cli - TRACE_IRQS_OFF - GET_THREAD_INFO(%rcx) - testl %eax,%eax - jne retint_kernel - movl threadinfo_flags(%rcx),%edx - movl $_TIF_WORK_MASK,%edi - andl %edi,%edx - jnz retint_careful - /* - * The iret might restore flags: - */ - TRACE_IRQS_IRETQ - swapgs - RESTORE_ARGS 0,8,0 - jmp iret_label - CFI_ENDPROC - -error_kernelspace: - incl %ebx - /* There are two places in the kernel that can potentially fault with - usergs. Handle them here. The exception handlers after - iret run with kernel gs again, so don't set the user space flag. - B stepping K8s sometimes report an truncated RIP for IRET - exceptions returning to compat mode. Check for these here too. */ - leaq iret_label(%rip),%rbp - cmpq %rbp,RIP(%rsp) - je error_swapgs - movl %ebp,%ebp /* zero extend */ - cmpq %rbp,RIP(%rsp) - je error_swapgs - cmpq $gs_change,RIP(%rsp) - je error_swapgs - jmp error_sti -KPROBE_END(error_entry) - - /* Reload gs selector with exception handling */ - /* edi: new selector */ -ENTRY(load_gs_index) - CFI_STARTPROC - pushf - CFI_ADJUST_CFA_OFFSET 8 - cli - swapgs -gs_change: - movl %edi,%gs -2: mfence /* workaround */ - swapgs - popf - CFI_ADJUST_CFA_OFFSET -8 - ret - CFI_ENDPROC -ENDPROC(load_gs_index) - - .section __ex_table,"a" - .align 8 - .quad gs_change,bad_gs - .previous - .section .fixup,"ax" - /* running with kernelgs */ -bad_gs: - swapgs /* switch back to user gs */ - xorl %eax,%eax - movl %eax,%gs - jmp 2b - .previous - -/* - * Create a kernel thread. - * - * C extern interface: - * extern long kernel_thread(int (*fn)(void *), void * arg, unsigned long flags) - * - * asm input arguments: - * rdi: fn, rsi: arg, rdx: flags - */ -ENTRY(kernel_thread) - CFI_STARTPROC - FAKE_STACK_FRAME $child_rip - SAVE_ALL - - # rdi: flags, rsi: usp, rdx: will be &pt_regs - movq %rdx,%rdi - orq kernel_thread_flags(%rip),%rdi - movq $-1, %rsi - movq %rsp, %rdx - - xorl %r8d,%r8d - xorl %r9d,%r9d - - # clone now - call do_fork - movq %rax,RAX(%rsp) - xorl %edi,%edi - - /* - * It isn't worth to check for reschedule here, - * so internally to the x86_64 port you can rely on kernel_thread() - * not to reschedule the child before returning, this avoids the need - * of hacks for example to fork off the per-CPU idle tasks. - * [Hopefully no generic code relies on the reschedule -AK] - */ - RESTORE_ALL - UNFAKE_STACK_FRAME - ret - CFI_ENDPROC -ENDPROC(kernel_thread) - -child_rip: - pushq $0 # fake return address - CFI_STARTPROC - /* - * Here we are in the child and the registers are set as they were - * at kernel_thread() invocation in the parent. - */ - movq %rdi, %rax - movq %rsi, %rdi - call *%rax - # exit - xorl %edi, %edi - call do_exit - CFI_ENDPROC -ENDPROC(child_rip) - -/* - * execve(). This function needs to use IRET, not SYSRET, to set up all state properly. - * - * C extern interface: - * extern long execve(char *name, char **argv, char **envp) - * - * asm input arguments: - * rdi: name, rsi: argv, rdx: envp - * - * We want to fallback into: - * extern long sys_execve(char *name, char **argv,char **envp, struct pt_regs regs) - * - * do_sys_execve asm fallback arguments: - * rdi: name, rsi: argv, rdx: envp, fake frame on the stack - */ -ENTRY(kernel_execve) - CFI_STARTPROC - FAKE_STACK_FRAME $0 - SAVE_ALL - call sys_execve - movq %rax, RAX(%rsp) - RESTORE_REST - testq %rax,%rax - je int_ret_from_sys_call - RESTORE_ARGS - UNFAKE_STACK_FRAME - ret - CFI_ENDPROC -ENDPROC(kernel_execve) - -KPROBE_ENTRY(page_fault) - errorentry do_page_fault -KPROBE_END(page_fault) - -ENTRY(coprocessor_error) - zeroentry do_coprocessor_error -END(coprocessor_error) - -ENTRY(simd_coprocessor_error) - zeroentry do_simd_coprocessor_error -END(simd_coprocessor_error) - -ENTRY(device_not_available) - zeroentry math_state_restore -END(device_not_available) - - /* runs on exception stack */ -KPROBE_ENTRY(debug) - INTR_FRAME - pushq $0 - CFI_ADJUST_CFA_OFFSET 8 - paranoidentry do_debug, DEBUG_STACK - paranoidexit -KPROBE_END(debug) - - /* runs on exception stack */ -KPROBE_ENTRY(nmi) - INTR_FRAME - pushq $-1 - CFI_ADJUST_CFA_OFFSET 8 - paranoidentry do_nmi, 0, 0 -#ifdef CONFIG_TRACE_IRQFLAGS - paranoidexit 0 -#else - jmp paranoid_exit1 - CFI_ENDPROC -#endif -KPROBE_END(nmi) - -KPROBE_ENTRY(int3) - INTR_FRAME - pushq $0 - CFI_ADJUST_CFA_OFFSET 8 - paranoidentry do_int3, DEBUG_STACK - jmp paranoid_exit1 - CFI_ENDPROC -KPROBE_END(int3) - -ENTRY(overflow) - zeroentry do_overflow -END(overflow) - -ENTRY(bounds) - zeroentry do_bounds -END(bounds) - -ENTRY(invalid_op) - zeroentry do_invalid_op -END(invalid_op) - -ENTRY(coprocessor_segment_overrun) - zeroentry do_coprocessor_segment_overrun -END(coprocessor_segment_overrun) - -ENTRY(reserved) - zeroentry do_reserved -END(reserved) - - /* runs on exception stack */ -ENTRY(double_fault) - XCPT_FRAME - paranoidentry do_double_fault - jmp paranoid_exit1 - CFI_ENDPROC -END(double_fault) - -ENTRY(invalid_TSS) - errorentry do_invalid_TSS -END(invalid_TSS) - -ENTRY(segment_not_present) - errorentry do_segment_not_present -END(segment_not_present) - - /* runs on exception stack */ -ENTRY(stack_segment) - XCPT_FRAME - paranoidentry do_stack_segment - jmp paranoid_exit1 - CFI_ENDPROC -END(stack_segment) - -KPROBE_ENTRY(general_protection) - errorentry do_general_protection -KPROBE_END(general_protection) - -ENTRY(alignment_check) - errorentry do_alignment_check -END(alignment_check) - -ENTRY(divide_error) - zeroentry do_divide_error -END(divide_error) - -ENTRY(spurious_interrupt_bug) - zeroentry do_spurious_interrupt_bug -END(spurious_interrupt_bug) - -#ifdef CONFIG_X86_MCE - /* runs on exception stack */ -ENTRY(machine_check) - INTR_FRAME - pushq $0 - CFI_ADJUST_CFA_OFFSET 8 - paranoidentry do_machine_check - jmp paranoid_exit1 - CFI_ENDPROC -END(machine_check) -#endif - -/* Call softirq on interrupt stack. Interrupts are off. */ -ENTRY(call_softirq) - CFI_STARTPROC - push %rbp - CFI_ADJUST_CFA_OFFSET 8 - CFI_REL_OFFSET rbp,0 - mov %rsp,%rbp - CFI_DEF_CFA_REGISTER rbp - incl %gs:pda_irqcount - cmove %gs:pda_irqstackptr,%rsp - push %rbp # backlink for old unwinder - call __do_softirq - leaveq - CFI_DEF_CFA_REGISTER rsp - CFI_ADJUST_CFA_OFFSET -8 - decl %gs:pda_irqcount - ret - CFI_ENDPROC -ENDPROC(call_softirq) - -KPROBE_ENTRY(ignore_sysret) - CFI_STARTPROC - mov $-ENOSYS,%eax - sysret - CFI_ENDPROC -ENDPROC(ignore_sysret) diff --git a/arch/x86_64/kernel/genapic.c b/arch/x86_64/kernel/genapic.c deleted file mode 100644 index 47496a40e84f..000000000000 --- a/arch/x86_64/kernel/genapic.c +++ /dev/null @@ -1,66 +0,0 @@ -/* - * Copyright 2004 James Cleverdon, IBM. - * Subject to the GNU Public License, v.2 - * - * Generic APIC sub-arch probe layer. - * - * Hacked for x86-64 by James Cleverdon from i386 architecture code by - * Martin Bligh, Andi Kleen, James Bottomley, John Stultz, and - * James Cleverdon. - */ -#include <linux/threads.h> -#include <linux/cpumask.h> -#include <linux/string.h> -#include <linux/module.h> -#include <linux/kernel.h> -#include <linux/ctype.h> -#include <linux/init.h> - -#include <asm/smp.h> -#include <asm/ipi.h> -#include <asm/genapic.h> - -#ifdef CONFIG_ACPI -#include <acpi/acpi_bus.h> -#endif - -/* which logical CPU number maps to which CPU (physical APIC ID) */ -u8 x86_cpu_to_apicid[NR_CPUS] __read_mostly - = { [0 ... NR_CPUS-1] = BAD_APICID }; -EXPORT_SYMBOL(x86_cpu_to_apicid); - -u8 x86_cpu_to_log_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID }; - -struct genapic __read_mostly *genapic = &apic_flat; - -/* - * Check the APIC IDs in bios_cpu_apicid and choose the APIC mode. - */ -void __init setup_apic_routing(void) -{ -#ifdef CONFIG_ACPI - /* - * Quirk: some x86_64 machines can only use physical APIC mode - * regardless of how many processors are present (x86_64 ES7000 - * is an example). - */ - if (acpi_gbl_FADT.header.revision > FADT2_REVISION_ID && - (acpi_gbl_FADT.flags & ACPI_FADT_APIC_PHYSICAL)) - genapic = &apic_physflat; - else -#endif - - if (cpus_weight(cpu_possible_map) <= 8) - genapic = &apic_flat; - else - genapic = &apic_physflat; - - printk(KERN_INFO "Setting APIC routing to %s\n", genapic->name); -} - -/* Same for both flat and physical. */ - -void send_IPI_self(int vector) -{ - __send_IPI_shortcut(APIC_DEST_SELF, vector, APIC_DEST_PHYSICAL); -} diff --git a/arch/x86_64/kernel/genapic_flat.c b/arch/x86_64/kernel/genapic_flat.c deleted file mode 100644 index ecb01eefdd27..000000000000 --- a/arch/x86_64/kernel/genapic_flat.c +++ /dev/null @@ -1,194 +0,0 @@ -/* - * Copyright 2004 James Cleverdon, IBM. - * Subject to the GNU Public License, v.2 - * - * Flat APIC subarch code. - * - * Hacked for x86-64 by James Cleverdon from i386 architecture code by - * Martin Bligh, Andi Kleen, James Bottomley, John Stultz, and - * James Cleverdon. - */ -#include <linux/errno.h> -#include <linux/threads.h> -#include <linux/cpumask.h> -#include <linux/string.h> -#include <linux/kernel.h> -#include <linux/ctype.h> -#include <linux/init.h> -#include <asm/smp.h> -#include <asm/ipi.h> -#include <asm/genapic.h> - -static cpumask_t flat_target_cpus(void) -{ - return cpu_online_map; -} - -static cpumask_t flat_vector_allocation_domain(int cpu) -{ - /* Careful. Some cpus do not strictly honor the set of cpus - * specified in the interrupt destination when using lowest - * priority interrupt delivery mode. - * - * In particular there was a hyperthreading cpu observed to - * deliver interrupts to the wrong hyperthread when only one - * hyperthread was specified in the interrupt desitination. - */ - cpumask_t domain = { { [0] = APIC_ALL_CPUS, } }; - return domain; -} - -/* - * Set up the logical destination ID. - * - * Intel recommends to set DFR, LDR and TPR before enabling - * an APIC. See e.g. "AP-388 82489DX User's Manual" (Intel - * document number 292116). So here it goes... - */ -static void flat_init_apic_ldr(void) -{ - unsigned long val; - unsigned long num, id; - - num = smp_processor_id(); - id = 1UL << num; - x86_cpu_to_log_apicid[num] = id; - apic_write(APIC_DFR, APIC_DFR_FLAT); - val = apic_read(APIC_LDR) & ~APIC_LDR_MASK; - val |= SET_APIC_LOGICAL_ID(id); - apic_write(APIC_LDR, val); -} - -static void flat_send_IPI_mask(cpumask_t cpumask, int vector) -{ - unsigned long mask = cpus_addr(cpumask)[0]; - unsigned long flags; - - local_irq_save(flags); - __send_IPI_dest_field(mask, vector, APIC_DEST_LOGICAL); - local_irq_restore(flags); -} - -static void flat_send_IPI_allbutself(int vector) -{ -#ifdef CONFIG_HOTPLUG_CPU - int hotplug = 1; -#else - int hotplug = 0; -#endif - if (hotplug || vector == NMI_VECTOR) { - cpumask_t allbutme = cpu_online_map; - - cpu_clear(smp_processor_id(), allbutme); - - if (!cpus_empty(allbutme)) - flat_send_IPI_mask(allbutme, vector); - } else if (num_online_cpus() > 1) { - __send_IPI_shortcut(APIC_DEST_ALLBUT, vector,APIC_DEST_LOGICAL); - } -} - -static void flat_send_IPI_all(int vector) -{ - if (vector == NMI_VECTOR) - flat_send_IPI_mask(cpu_online_map, vector); - else - __send_IPI_shortcut(APIC_DEST_ALLINC, vector, APIC_DEST_LOGICAL); -} - -static int flat_apic_id_registered(void) -{ - return physid_isset(GET_APIC_ID(apic_read(APIC_ID)), phys_cpu_present_map); -} - -static unsigned int flat_cpu_mask_to_apicid(cpumask_t cpumask) -{ - return cpus_addr(cpumask)[0] & APIC_ALL_CPUS; -} - -static unsigned int phys_pkg_id(int index_msb) -{ - return hard_smp_processor_id() >> index_msb; -} - -struct genapic apic_flat = { - .name = "flat", - .int_delivery_mode = dest_LowestPrio, - .int_dest_mode = (APIC_DEST_LOGICAL != 0), - .target_cpus = flat_target_cpus, - .vector_allocation_domain = flat_vector_allocation_domain, - .apic_id_registered = flat_apic_id_registered, - .init_apic_ldr = flat_init_apic_ldr, - .send_IPI_all = flat_send_IPI_all, - .send_IPI_allbutself = flat_send_IPI_allbutself, - .send_IPI_mask = flat_send_IPI_mask, - .cpu_mask_to_apicid = flat_cpu_mask_to_apicid, - .phys_pkg_id = phys_pkg_id, -}; - -/* - * Physflat mode is used when there are more than 8 CPUs on a AMD system. - * We cannot use logical delivery in this case because the mask - * overflows, so use physical mode. - */ - -static cpumask_t physflat_target_cpus(void) -{ - return cpu_online_map; -} - -static cpumask_t physflat_vector_allocation_domain(int cpu) -{ - cpumask_t domain = CPU_MASK_NONE; - cpu_set(cpu, domain); - return domain; -} - - -static void physflat_send_IPI_mask(cpumask_t cpumask, int vector) -{ - send_IPI_mask_sequence(cpumask, vector); -} - -static void physflat_send_IPI_allbutself(int vector) -{ - cpumask_t allbutme = cpu_online_map; - - cpu_clear(smp_processor_id(), allbutme); - physflat_send_IPI_mask(allbutme, vector); -} - -static void physflat_send_IPI_all(int vector) -{ - physflat_send_IPI_mask(cpu_online_map, vector); -} - -static unsigned int physflat_cpu_mask_to_apicid(cpumask_t cpumask) -{ - int cpu; - - /* - * We're using fixed IRQ delivery, can only return one phys APIC ID. - * May as well be the first. - */ - cpu = first_cpu(cpumask); - if ((unsigned)cpu < NR_CPUS) - return x86_cpu_to_apicid[cpu]; - else - return BAD_APICID; -} - -struct genapic apic_physflat = { - .name = "physical flat", - .int_delivery_mode = dest_Fixed, - .int_dest_mode = (APIC_DEST_PHYSICAL != 0), - .target_cpus = physflat_target_cpus, - .vector_allocation_domain = physflat_vector_allocation_domain, - .apic_id_registered = flat_apic_id_registered, - .init_apic_ldr = flat_init_apic_ldr,/*not needed, but shouldn't hurt*/ - .send_IPI_all = physflat_send_IPI_all, - .send_IPI_allbutself = physflat_send_IPI_allbutself, - .send_IPI_mask = physflat_send_IPI_mask, - .cpu_mask_to_apicid = physflat_cpu_mask_to_apicid, - .phys_pkg_id = phys_pkg_id, -}; diff --git a/arch/x86_64/kernel/head.S b/arch/x86_64/kernel/head.S deleted file mode 100644 index b6167fe3330e..000000000000 --- a/arch/x86_64/kernel/head.S +++ /dev/null @@ -1,416 +0,0 @@ -/* - * linux/arch/x86_64/kernel/head.S -- start in 32bit and switch to 64bit - * - * Copyright (C) 2000 Andrea Arcangeli <andrea@suse.de> SuSE - * Copyright (C) 2000 Pavel Machek <pavel@suse.cz> - * Copyright (C) 2000 Karsten Keil <kkeil@suse.de> - * Copyright (C) 2001,2002 Andi Kleen <ak@suse.de> - * Copyright (C) 2005 Eric Biederman <ebiederm@xmission.com> - */ - - -#include <linux/linkage.h> -#include <linux/threads.h> -#include <linux/init.h> -#include <asm/desc.h> -#include <asm/segment.h> -#include <asm/pgtable.h> -#include <asm/page.h> -#include <asm/msr.h> -#include <asm/cache.h> - -/* we are not able to switch in one step to the final KERNEL ADRESS SPACE - * because we need identity-mapped pages. - * - */ - - .text - .section .text.head - .code64 - .globl startup_64 -startup_64: - - /* - * At this point the CPU runs in 64bit mode CS.L = 1 CS.D = 1, - * and someone has loaded an identity mapped page table - * for us. These identity mapped page tables map all of the - * kernel pages and possibly all of memory. - * - * %esi holds a physical pointer to real_mode_data. - * - * We come here either directly from a 64bit bootloader, or from - * arch/x86_64/boot/compressed/head.S. - * - * We only come here initially at boot nothing else comes here. - * - * Since we may be loaded at an address different from what we were - * compiled to run at we first fixup the physical addresses in our page - * tables and then reload them. - */ - - /* Compute the delta between the address I am compiled to run at and the - * address I am actually running at. - */ - leaq _text(%rip), %rbp - subq $_text - __START_KERNEL_map, %rbp - - /* Is the address not 2M aligned? */ - movq %rbp, %rax - andl $~LARGE_PAGE_MASK, %eax - testl %eax, %eax - jnz bad_address - - /* Is the address too large? */ - leaq _text(%rip), %rdx - movq $PGDIR_SIZE, %rax - cmpq %rax, %rdx - jae bad_address - - /* Fixup the physical addresses in the page table - */ - addq %rbp, init_level4_pgt + 0(%rip) - addq %rbp, init_level4_pgt + (258*8)(%rip) - addq %rbp, init_level4_pgt + (511*8)(%rip) - - addq %rbp, level3_ident_pgt + 0(%rip) - - addq %rbp, level3_kernel_pgt + (510*8)(%rip) - addq %rbp, level3_kernel_pgt + (511*8)(%rip) - - addq %rbp, level2_fixmap_pgt + (506*8)(%rip) - - /* Add an Identity mapping if I am above 1G */ - leaq _text(%rip), %rdi - andq $LARGE_PAGE_MASK, %rdi - - movq %rdi, %rax - shrq $PUD_SHIFT, %rax - andq $(PTRS_PER_PUD - 1), %rax - jz ident_complete - - leaq (level2_spare_pgt - __START_KERNEL_map + _KERNPG_TABLE)(%rbp), %rdx - leaq level3_ident_pgt(%rip), %rbx - movq %rdx, 0(%rbx, %rax, 8) - - movq %rdi, %rax - shrq $PMD_SHIFT, %rax - andq $(PTRS_PER_PMD - 1), %rax - leaq __PAGE_KERNEL_LARGE_EXEC(%rdi), %rdx - leaq level2_spare_pgt(%rip), %rbx - movq %rdx, 0(%rbx, %rax, 8) -ident_complete: - - /* Fixup the kernel text+data virtual addresses - */ - leaq level2_kernel_pgt(%rip), %rdi - leaq 4096(%rdi), %r8 - /* See if it is a valid page table entry */ -1: testq $1, 0(%rdi) - jz 2f - addq %rbp, 0(%rdi) - /* Go to the next page */ -2: addq $8, %rdi - cmp %r8, %rdi - jne 1b - - /* Fixup phys_base */ - addq %rbp, phys_base(%rip) - -#ifdef CONFIG_SMP - addq %rbp, trampoline_level4_pgt + 0(%rip) - addq %rbp, trampoline_level4_pgt + (511*8)(%rip) -#endif -#ifdef CONFIG_ACPI_SLEEP - addq %rbp, wakeup_level4_pgt + 0(%rip) - addq %rbp, wakeup_level4_pgt + (511*8)(%rip) -#endif - - /* Due to ENTRY(), sometimes the empty space gets filled with - * zeros. Better take a jmp than relying on empty space being - * filled with 0x90 (nop) - */ - jmp secondary_startup_64 -ENTRY(secondary_startup_64) - /* - * At this point the CPU runs in 64bit mode CS.L = 1 CS.D = 1, - * and someone has loaded a mapped page table. - * - * %esi holds a physical pointer to real_mode_data. - * - * We come here either from startup_64 (using physical addresses) - * or from trampoline.S (using virtual addresses). - * - * Using virtual addresses from trampoline.S removes the need - * to have any identity mapped pages in the kernel page table - * after the boot processor executes this code. - */ - - /* Enable PAE mode and PGE */ - xorq %rax, %rax - btsq $5, %rax - btsq $7, %rax - movq %rax, %cr4 - - /* Setup early boot stage 4 level pagetables. */ - movq $(init_level4_pgt - __START_KERNEL_map), %rax - addq phys_base(%rip), %rax - movq %rax, %cr3 - - /* Ensure I am executing from virtual addresses */ - movq $1f, %rax - jmp *%rax -1: - - /* Check if nx is implemented */ - movl $0x80000001, %eax - cpuid - movl %edx,%edi - - /* Setup EFER (Extended Feature Enable Register) */ - movl $MSR_EFER, %ecx - rdmsr - btsl $_EFER_SCE, %eax /* Enable System Call */ - btl $20,%edi /* No Execute supported? */ - jnc 1f - btsl $_EFER_NX, %eax -1: wrmsr /* Make changes effective */ - - /* Setup cr0 */ -#define CR0_PM 1 /* protected mode */ -#define CR0_MP (1<<1) -#define CR0_ET (1<<4) -#define CR0_NE (1<<5) -#define CR0_WP (1<<16) -#define CR0_AM (1<<18) -#define CR0_PAGING (1<<31) - movl $CR0_PM|CR0_MP|CR0_ET|CR0_NE|CR0_WP|CR0_AM|CR0_PAGING,%eax - /* Make changes effective */ - movq %rax, %cr0 - - /* Setup a boot time stack */ - movq init_rsp(%rip),%rsp - - /* zero EFLAGS after setting rsp */ - pushq $0 - popfq - - /* - * We must switch to a new descriptor in kernel space for the GDT - * because soon the kernel won't have access anymore to the userspace - * addresses where we're currently running on. We have to do that here - * because in 32bit we couldn't load a 64bit linear address. - */ - lgdt cpu_gdt_descr(%rip) - - /* set up data segments. actually 0 would do too */ - movl $__KERNEL_DS,%eax - movl %eax,%ds - movl %eax,%ss - movl %eax,%es - - /* - * We don't really need to load %fs or %gs, but load them anyway - * to kill any stale realmode selectors. This allows execution - * under VT hardware. - */ - movl %eax,%fs - movl %eax,%gs - - /* - * Setup up a dummy PDA. this is just for some early bootup code - * that does in_interrupt() - */ - movl $MSR_GS_BASE,%ecx - movq $empty_zero_page,%rax - movq %rax,%rdx - shrq $32,%rdx - wrmsr - - /* esi is pointer to real mode structure with interesting info. - pass it to C */ - movl %esi, %edi - - /* Finally jump to run C code and to be on real kernel address - * Since we are running on identity-mapped space we have to jump - * to the full 64bit address, this is only possible as indirect - * jump. In addition we need to ensure %cs is set so we make this - * a far return. - */ - movq initial_code(%rip),%rax - pushq $0 # fake return address to stop unwinder - pushq $__KERNEL_CS # set correct cs - pushq %rax # target address in negative space - lretq - - /* SMP bootup changes these two */ -#ifndef CONFIG_HOTPLUG_CPU - .pushsection .init.data -#endif - .align 8 - .globl initial_code -initial_code: - .quad x86_64_start_kernel -#ifndef CONFIG_HOTPLUG_CPU - .popsection -#endif - .globl init_rsp -init_rsp: - .quad init_thread_union+THREAD_SIZE-8 - -bad_address: - jmp bad_address - -ENTRY(early_idt_handler) - cmpl $2,early_recursion_flag(%rip) - jz 1f - incl early_recursion_flag(%rip) - xorl %eax,%eax - movq 8(%rsp),%rsi # get rip - movq (%rsp),%rdx - movq %cr2,%rcx - leaq early_idt_msg(%rip),%rdi - call early_printk - cmpl $2,early_recursion_flag(%rip) - jz 1f - call dump_stack -#ifdef CONFIG_KALLSYMS - leaq early_idt_ripmsg(%rip),%rdi - movq 8(%rsp),%rsi # get rip again - call __print_symbol -#endif -1: hlt - jmp 1b -early_recursion_flag: - .long 0 - -early_idt_msg: - .asciz "PANIC: early exception rip %lx error %lx cr2 %lx\n" -early_idt_ripmsg: - .asciz "RIP %s\n" - -.balign PAGE_SIZE - -#define NEXT_PAGE(name) \ - .balign PAGE_SIZE; \ -ENTRY(name) - -/* Automate the creation of 1 to 1 mapping pmd entries */ -#define PMDS(START, PERM, COUNT) \ - i = 0 ; \ - .rept (COUNT) ; \ - .quad (START) + (i << 21) + (PERM) ; \ - i = i + 1 ; \ - .endr - - /* - * This default setting generates an ident mapping at address 0x100000 - * and a mapping for the kernel that precisely maps virtual address - * 0xffffffff80000000 to physical address 0x000000. (always using - * 2Mbyte large pages provided by PAE mode) - */ -NEXT_PAGE(init_level4_pgt) - .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE - .fill 257,8,0 - .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE - .fill 252,8,0 - /* (2^48-(2*1024*1024*1024))/(2^39) = 511 */ - .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE - -NEXT_PAGE(level3_ident_pgt) - .quad level2_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE - .fill 511,8,0 - -NEXT_PAGE(level3_kernel_pgt) - .fill 510,8,0 - /* (2^48-(2*1024*1024*1024)-((2^39)*511))/(2^30) = 510 */ - .quad level2_kernel_pgt - __START_KERNEL_map + _KERNPG_TABLE - .quad level2_fixmap_pgt - __START_KERNEL_map + _PAGE_TABLE - -NEXT_PAGE(level2_fixmap_pgt) - .fill 506,8,0 - .quad level1_fixmap_pgt - __START_KERNEL_map + _PAGE_TABLE - /* 8MB reserved for vsyscalls + a 2MB hole = 4 + 1 entries */ - .fill 5,8,0 - -NEXT_PAGE(level1_fixmap_pgt) - .fill 512,8,0 - -NEXT_PAGE(level2_ident_pgt) - /* Since I easily can, map the first 1G. - * Don't set NX because code runs from these pages. - */ - PMDS(0x0000000000000000, __PAGE_KERNEL_LARGE_EXEC, PTRS_PER_PMD) - -NEXT_PAGE(level2_kernel_pgt) - /* 40MB kernel mapping. The kernel code cannot be bigger than that. - When you change this change KERNEL_TEXT_SIZE in page.h too. */ - /* (2^48-(2*1024*1024*1024)-((2^39)*511)-((2^30)*510)) = 0 */ - PMDS(0x0000000000000000, __PAGE_KERNEL_LARGE_EXEC|_PAGE_GLOBAL, KERNEL_TEXT_SIZE/PMD_SIZE) - /* Module mapping starts here */ - .fill (PTRS_PER_PMD - (KERNEL_TEXT_SIZE/PMD_SIZE)),8,0 - -NEXT_PAGE(level2_spare_pgt) - .fill 512,8,0 - -#undef PMDS -#undef NEXT_PAGE - - .data - .align 16 - .globl cpu_gdt_descr -cpu_gdt_descr: - .word gdt_end-cpu_gdt_table-1 -gdt: - .quad cpu_gdt_table -#ifdef CONFIG_SMP - .rept NR_CPUS-1 - .word 0 - .quad 0 - .endr -#endif - -ENTRY(phys_base) - /* This must match the first entry in level2_kernel_pgt */ - .quad 0x0000000000000000 - -/* We need valid kernel segments for data and code in long mode too - * IRET will check the segment types kkeil 2000/10/28 - * Also sysret mandates a special GDT layout - */ - - .section .data.page_aligned, "aw" - .align PAGE_SIZE - -/* The TLS descriptors are currently at a different place compared to i386. - Hopefully nobody expects them at a fixed place (Wine?) */ - -ENTRY(cpu_gdt_table) - .quad 0x0000000000000000 /* NULL descriptor */ - .quad 0x00cf9b000000ffff /* __KERNEL32_CS */ - .quad 0x00af9b000000ffff /* __KERNEL_CS */ - .quad 0x00cf93000000ffff /* __KERNEL_DS */ - .quad 0x00cffb000000ffff /* __USER32_CS */ - .quad 0x00cff3000000ffff /* __USER_DS, __USER32_DS */ - .quad 0x00affb000000ffff /* __USER_CS */ - .quad 0x0 /* unused */ - .quad 0,0 /* TSS */ - .quad 0,0 /* LDT */ - .quad 0,0,0 /* three TLS descriptors */ - .quad 0x0000f40000000000 /* node/CPU stored in limit */ -gdt_end: - /* asm/segment.h:GDT_ENTRIES must match this */ - /* This should be a multiple of the cache line size */ - /* GDTs of other CPUs are now dynamically allocated */ - - /* zero the remaining page */ - .fill PAGE_SIZE / 8 - GDT_ENTRIES,8,0 - - .section .bss, "aw", @nobits - .align L1_CACHE_BYTES -ENTRY(idt_table) - .skip 256 * 16 - - .section .bss.page_aligned, "aw", @nobits - .align PAGE_SIZE -ENTRY(empty_zero_page) - .skip PAGE_SIZE diff --git a/arch/x86_64/kernel/head64.c b/arch/x86_64/kernel/head64.c deleted file mode 100644 index 6c34bdd22e26..000000000000 --- a/arch/x86_64/kernel/head64.c +++ /dev/null @@ -1,86 +0,0 @@ -/* - * linux/arch/x86_64/kernel/head64.c -- prepare to run common code - * - * Copyright (C) 2000 Andrea Arcangeli <andrea@suse.de> SuSE - */ - -#include <linux/init.h> -#include <linux/linkage.h> -#include <linux/types.h> -#include <linux/kernel.h> -#include <linux/string.h> -#include <linux/percpu.h> - -#include <asm/processor.h> -#include <asm/proto.h> -#include <asm/smp.h> -#include <asm/bootsetup.h> -#include <asm/setup.h> -#include <asm/desc.h> -#include <asm/pgtable.h> -#include <asm/tlbflush.h> -#include <asm/sections.h> - -static void __init zap_identity_mappings(void) -{ - pgd_t *pgd = pgd_offset_k(0UL); - pgd_clear(pgd); - __flush_tlb(); -} - -/* Don't add a printk in there. printk relies on the PDA which is not initialized - yet. */ -static void __init clear_bss(void) -{ - memset(__bss_start, 0, - (unsigned long) __bss_stop - (unsigned long) __bss_start); -} - -#define NEW_CL_POINTER 0x228 /* Relative to real mode data */ -#define OLD_CL_MAGIC_ADDR 0x20 -#define OLD_CL_MAGIC 0xA33F -#define OLD_CL_OFFSET 0x22 - -static void __init copy_bootdata(char *real_mode_data) -{ - unsigned long new_data; - char * command_line; - - memcpy(x86_boot_params, real_mode_data, BOOT_PARAM_SIZE); - new_data = *(u32 *) (x86_boot_params + NEW_CL_POINTER); - if (!new_data) { - if (OLD_CL_MAGIC != *(u16 *)(real_mode_data + OLD_CL_MAGIC_ADDR)) { - return; - } - new_data = __pa(real_mode_data) + *(u16 *)(real_mode_data + OLD_CL_OFFSET); - } - command_line = __va(new_data); - memcpy(boot_command_line, command_line, COMMAND_LINE_SIZE); -} - -void __init x86_64_start_kernel(char * real_mode_data) -{ - int i; - - /* clear bss before set_intr_gate with early_idt_handler */ - clear_bss(); - - /* Make NULL pointers segfault */ - zap_identity_mappings(); - - for (i = 0; i < IDT_ENTRIES; i++) - set_intr_gate(i, early_idt_handler); - asm volatile("lidt %0" :: "m" (idt_descr)); - - early_printk("Kernel alive\n"); - - for (i = 0; i < NR_CPUS; i++) - cpu_pda(i) = &boot_cpu_pda[i]; - - pda_init(0); - copy_bootdata(__va(real_mode_data)); -#ifdef CONFIG_SMP - cpu_set(0, cpu_online_map); -#endif - start_kernel(); -} diff --git a/arch/x86_64/kernel/hpet.c b/arch/x86_64/kernel/hpet.c deleted file mode 100644 index e2d1b912e154..000000000000 --- a/arch/x86_64/kernel/hpet.c +++ /dev/null @@ -1,493 +0,0 @@ -#include <linux/kernel.h> -#include <linux/sched.h> -#include <linux/init.h> -#include <linux/mc146818rtc.h> -#include <linux/time.h> -#include <linux/clocksource.h> -#include <linux/ioport.h> -#include <linux/acpi.h> -#include <linux/hpet.h> -#include <asm/pgtable.h> -#include <asm/vsyscall.h> -#include <asm/timex.h> -#include <asm/hpet.h> - -#define HPET_MASK 0xFFFFFFFF -#define HPET_SHIFT 22 - -/* FSEC = 10^-15 NSEC = 10^-9 */ -#define FSEC_PER_NSEC 1000000 - -int nohpet __initdata; - -unsigned long hpet_address; -unsigned long hpet_period; /* fsecs / HPET clock */ -unsigned long hpet_tick; /* HPET clocks / interrupt */ - -int hpet_use_timer; /* Use counter of hpet for time keeping, - * otherwise PIT - */ - -#ifdef CONFIG_HPET -static __init int late_hpet_init(void) -{ - struct hpet_data hd; - unsigned int ntimer; - - if (!hpet_address) - return 0; - - memset(&hd, 0, sizeof(hd)); - - ntimer = hpet_readl(HPET_ID); - ntimer = (ntimer & HPET_ID_NUMBER) >> HPET_ID_NUMBER_SHIFT; - ntimer++; - - /* - * Register with driver. - * Timer0 and Timer1 is used by platform. - */ - hd.hd_phys_address = hpet_address; - hd.hd_address = (void __iomem *)fix_to_virt(FIX_HPET_BASE); - hd.hd_nirqs = ntimer; - hd.hd_flags = HPET_DATA_PLATFORM; - hpet_reserve_timer(&hd, 0); -#ifdef CONFIG_HPET_EMULATE_RTC - hpet_reserve_timer(&hd, 1); -#endif - hd.hd_irq[0] = HPET_LEGACY_8254; - hd.hd_irq[1] = HPET_LEGACY_RTC; - if (ntimer > 2) { - struct hpet *hpet; - struct hpet_timer *timer; - int i; - - hpet = (struct hpet *) fix_to_virt(FIX_HPET_BASE); - timer = &hpet->hpet_timers[2]; - for (i = 2; i < ntimer; timer++, i++) - hd.hd_irq[i] = (timer->hpet_config & - Tn_INT_ROUTE_CNF_MASK) >> - Tn_INT_ROUTE_CNF_SHIFT; - - } - - hpet_alloc(&hd); - return 0; -} -fs_initcall(late_hpet_init); -#endif - -int hpet_timer_stop_set_go(unsigned long tick) -{ - unsigned int cfg; - -/* - * Stop the timers and reset the main counter. - */ - - cfg = hpet_readl(HPET_CFG); - cfg &= ~(HPET_CFG_ENABLE | HPET_CFG_LEGACY); - hpet_writel(cfg, HPET_CFG); - hpet_writel(0, HPET_COUNTER); - hpet_writel(0, HPET_COUNTER + 4); - -/* - * Set up timer 0, as periodic with first interrupt to happen at hpet_tick, - * and period also hpet_tick. - */ - if (hpet_use_timer) { - hpet_writel(HPET_TN_ENABLE | HPET_TN_PERIODIC | HPET_TN_SETVAL | - HPET_TN_32BIT, HPET_T0_CFG); - hpet_writel(hpet_tick, HPET_T0_CMP); /* next interrupt */ - hpet_writel(hpet_tick, HPET_T0_CMP); /* period */ - cfg |= HPET_CFG_LEGACY; - } -/* - * Go! - */ - - cfg |= HPET_CFG_ENABLE; - hpet_writel(cfg, HPET_CFG); - - return 0; -} - -static cycle_t read_hpet(void) -{ - return (cycle_t)hpet_readl(HPET_COUNTER); -} - -static cycle_t __vsyscall_fn vread_hpet(void) -{ - return readl((void __iomem *)fix_to_virt(VSYSCALL_HPET) + 0xf0); -} - -struct clocksource clocksource_hpet = { - .name = "hpet", - .rating = 250, - .read = read_hpet, - .mask = (cycle_t)HPET_MASK, - .mult = 0, /* set below */ - .shift = HPET_SHIFT, - .flags = CLOCK_SOURCE_IS_CONTINUOUS, - .vread = vread_hpet, -}; - -int __init hpet_arch_init(void) -{ - unsigned int id; - u64 tmp; - - if (!hpet_address) - return -1; - set_fixmap_nocache(FIX_HPET_BASE, hpet_address); - __set_fixmap(VSYSCALL_HPET, hpet_address, PAGE_KERNEL_VSYSCALL_NOCACHE); - -/* - * Read the period, compute tick and quotient. - */ - - id = hpet_readl(HPET_ID); - - if (!(id & HPET_ID_VENDOR) || !(id & HPET_ID_NUMBER)) - return -1; - - hpet_period = hpet_readl(HPET_PERIOD); - if (hpet_period < 100000 || hpet_period > 100000000) - return -1; - - hpet_tick = (FSEC_PER_TICK + hpet_period / 2) / hpet_period; - - hpet_use_timer = (id & HPET_ID_LEGSUP); - - /* - * hpet period is in femto seconds per cycle - * so we need to convert this to ns/cyc units - * aproximated by mult/2^shift - * - * fsec/cyc * 1nsec/1000000fsec = nsec/cyc = mult/2^shift - * fsec/cyc * 1ns/1000000fsec * 2^shift = mult - * fsec/cyc * 2^shift * 1nsec/1000000fsec = mult - * (fsec/cyc << shift)/1000000 = mult - * (hpet_period << shift)/FSEC_PER_NSEC = mult - */ - tmp = (u64)hpet_period << HPET_SHIFT; - do_div(tmp, FSEC_PER_NSEC); - clocksource_hpet.mult = (u32)tmp; - clocksource_register(&clocksource_hpet); - - return hpet_timer_stop_set_go(hpet_tick); -} - -int hpet_reenable(void) -{ - return hpet_timer_stop_set_go(hpet_tick); -} - -/* - * calibrate_tsc() calibrates the processor TSC in a very simple way, comparing - * it to the HPET timer of known frequency. - */ - -#define TICK_COUNT 100000000 -#define SMI_THRESHOLD 50000 -#define MAX_TRIES 5 - -/* - * Some platforms take periodic SMI interrupts with 5ms duration. Make sure none - * occurs between the reads of the hpet & TSC. - */ -static void __init read_hpet_tsc(int *hpet, int *tsc) -{ - int tsc1, tsc2, hpet1, i; - - for (i = 0; i < MAX_TRIES; i++) { - tsc1 = get_cycles_sync(); - hpet1 = hpet_readl(HPET_COUNTER); - tsc2 = get_cycles_sync(); - if ((tsc2 - tsc1) < SMI_THRESHOLD) - break; - } - *hpet = hpet1; - *tsc = tsc2; -} - -unsigned int __init hpet_calibrate_tsc(void) -{ - int tsc_start, hpet_start; - int tsc_now, hpet_now; - unsigned long flags; - - local_irq_save(flags); - - read_hpet_tsc(&hpet_start, &tsc_start); - - do { - local_irq_disable(); - read_hpet_tsc(&hpet_now, &tsc_now); - local_irq_restore(flags); - } while ((tsc_now - tsc_start) < TICK_COUNT && - (hpet_now - hpet_start) < TICK_COUNT); - - return (tsc_now - tsc_start) * 1000000000L - / ((hpet_now - hpet_start) * hpet_period / 1000); -} - -#ifdef CONFIG_HPET_EMULATE_RTC -/* HPET in LegacyReplacement Mode eats up RTC interrupt line. When, HPET - * is enabled, we support RTC interrupt functionality in software. - * RTC has 3 kinds of interrupts: - * 1) Update Interrupt - generate an interrupt, every sec, when RTC clock - * is updated - * 2) Alarm Interrupt - generate an interrupt at a specific time of day - * 3) Periodic Interrupt - generate periodic interrupt, with frequencies - * 2Hz-8192Hz (2Hz-64Hz for non-root user) (all freqs in powers of 2) - * (1) and (2) above are implemented using polling at a frequency of - * 64 Hz. The exact frequency is a tradeoff between accuracy and interrupt - * overhead. (DEFAULT_RTC_INT_FREQ) - * For (3), we use interrupts at 64Hz or user specified periodic - * frequency, whichever is higher. - */ -#include <linux/rtc.h> - -#define DEFAULT_RTC_INT_FREQ 64 -#define RTC_NUM_INTS 1 - -static unsigned long UIE_on; -static unsigned long prev_update_sec; - -static unsigned long AIE_on; -static struct rtc_time alarm_time; - -static unsigned long PIE_on; -static unsigned long PIE_freq = DEFAULT_RTC_INT_FREQ; -static unsigned long PIE_count; - -static unsigned long hpet_rtc_int_freq; /* RTC interrupt frequency */ -static unsigned int hpet_t1_cmp; /* cached comparator register */ - -int is_hpet_enabled(void) -{ - return hpet_address != 0; -} - -/* - * Timer 1 for RTC, we do not use periodic interrupt feature, - * even if HPET supports periodic interrupts on Timer 1. - * The reason being, to set up a periodic interrupt in HPET, we need to - * stop the main counter. And if we do that everytime someone diables/enables - * RTC, we will have adverse effect on main kernel timer running on Timer 0. - * So, for the time being, simulate the periodic interrupt in software. - * - * hpet_rtc_timer_init() is called for the first time and during subsequent - * interuppts reinit happens through hpet_rtc_timer_reinit(). - */ -int hpet_rtc_timer_init(void) -{ - unsigned int cfg, cnt; - unsigned long flags; - - if (!is_hpet_enabled()) - return 0; - /* - * Set the counter 1 and enable the interrupts. - */ - if (PIE_on && (PIE_freq > DEFAULT_RTC_INT_FREQ)) - hpet_rtc_int_freq = PIE_freq; - else - hpet_rtc_int_freq = DEFAULT_RTC_INT_FREQ; - - local_irq_save(flags); - - cnt = hpet_readl(HPET_COUNTER); - cnt += ((hpet_tick*HZ)/hpet_rtc_int_freq); - hpet_writel(cnt, HPET_T1_CMP); - hpet_t1_cmp = cnt; - - cfg = hpet_readl(HPET_T1_CFG); - cfg &= ~HPET_TN_PERIODIC; - cfg |= HPET_TN_ENABLE | HPET_TN_32BIT; - hpet_writel(cfg, HPET_T1_CFG); - - local_irq_restore(flags); - - return 1; -} - -static void hpet_rtc_timer_reinit(void) -{ - unsigned int cfg, cnt, ticks_per_int, lost_ints; - - if (unlikely(!(PIE_on | AIE_on | UIE_on))) { - cfg = hpet_readl(HPET_T1_CFG); - cfg &= ~HPET_TN_ENABLE; - hpet_writel(cfg, HPET_T1_CFG); - return; - } - - if (PIE_on && (PIE_freq > DEFAULT_RTC_INT_FREQ)) - hpet_rtc_int_freq = PIE_freq; - else - hpet_rtc_int_freq = DEFAULT_RTC_INT_FREQ; - - /* It is more accurate to use the comparator value than current count.*/ - ticks_per_int = hpet_tick * HZ / hpet_rtc_int_freq; - hpet_t1_cmp += ticks_per_int; - hpet_writel(hpet_t1_cmp, HPET_T1_CMP); - - /* - * If the interrupt handler was delayed too long, the write above tries - * to schedule the next interrupt in the past and the hardware would - * not interrupt until the counter had wrapped around. - * So we have to check that the comparator wasn't set to a past time. - */ - cnt = hpet_readl(HPET_COUNTER); - if (unlikely((int)(cnt - hpet_t1_cmp) > 0)) { - lost_ints = (cnt - hpet_t1_cmp) / ticks_per_int + 1; - /* Make sure that, even with the time needed to execute - * this code, the next scheduled interrupt has been moved - * back to the future: */ - lost_ints++; - - hpet_t1_cmp += lost_ints * ticks_per_int; - hpet_writel(hpet_t1_cmp, HPET_T1_CMP); - - if (PIE_on) - PIE_count += lost_ints; - - if (printk_ratelimit()) - printk(KERN_WARNING "rtc: lost some interrupts at %ldHz.\n", - hpet_rtc_int_freq); - } -} - -/* - * The functions below are called from rtc driver. - * Return 0 if HPET is not being used. - * Otherwise do the necessary changes and return 1. - */ -int hpet_mask_rtc_irq_bit(unsigned long bit_mask) -{ - if (!is_hpet_enabled()) - return 0; - - if (bit_mask & RTC_UIE) - UIE_on = 0; - if (bit_mask & RTC_PIE) - PIE_on = 0; - if (bit_mask & RTC_AIE) - AIE_on = 0; - - return 1; -} - -int hpet_set_rtc_irq_bit(unsigned long bit_mask) -{ - int timer_init_reqd = 0; - - if (!is_hpet_enabled()) - return 0; - - if (!(PIE_on | AIE_on | UIE_on)) - timer_init_reqd = 1; - - if (bit_mask & RTC_UIE) { - UIE_on = 1; - } - if (bit_mask & RTC_PIE) { - PIE_on = 1; - PIE_count = 0; - } - if (bit_mask & RTC_AIE) { - AIE_on = 1; - } - - if (timer_init_reqd) - hpet_rtc_timer_init(); - - return 1; -} - -int hpet_set_alarm_time(unsigned char hrs, unsigned char min, unsigned char sec) -{ - if (!is_hpet_enabled()) - return 0; - - alarm_time.tm_hour = hrs; - alarm_time.tm_min = min; - alarm_time.tm_sec = sec; - - return 1; -} - -int hpet_set_periodic_freq(unsigned long freq) -{ - if (!is_hpet_enabled()) - return 0; - - PIE_freq = freq; - PIE_count = 0; - - return 1; -} - -int hpet_rtc_dropped_irq(void) -{ - if (!is_hpet_enabled()) - return 0; - - return 1; -} - -irqreturn_t hpet_rtc_interrupt(int irq, void *dev_id) -{ - struct rtc_time curr_time; - unsigned long rtc_int_flag = 0; - int call_rtc_interrupt = 0; - - hpet_rtc_timer_reinit(); - - if (UIE_on | AIE_on) { - rtc_get_rtc_time(&curr_time); - } - if (UIE_on) { - if (curr_time.tm_sec != prev_update_sec) { - /* Set update int info, call real rtc int routine */ - call_rtc_interrupt = 1; - rtc_int_flag = RTC_UF; - prev_update_sec = curr_time.tm_sec; - } - } - if (PIE_on) { - PIE_count++; - if (PIE_count >= hpet_rtc_int_freq/PIE_freq) { - /* Set periodic int info, call real rtc int routine */ - call_rtc_interrupt = 1; - rtc_int_flag |= RTC_PF; - PIE_count = 0; - } - } - if (AIE_on) { - if ((curr_time.tm_sec == alarm_time.tm_sec) && - (curr_time.tm_min == alarm_time.tm_min) && - (curr_time.tm_hour == alarm_time.tm_hour)) { - /* Set alarm int info, call real rtc int routine */ - call_rtc_interrupt = 1; - rtc_int_flag |= RTC_AF; - } - } - if (call_rtc_interrupt) { - rtc_int_flag |= (RTC_IRQF | (RTC_NUM_INTS << 8)); - rtc_interrupt(rtc_int_flag, dev_id); - } - return IRQ_HANDLED; -} -#endif - -static int __init nohpet_setup(char *s) -{ - nohpet = 1; - return 1; -} - -__setup("nohpet", nohpet_setup); diff --git a/arch/x86_64/kernel/i387.c b/arch/x86_64/kernel/i387.c deleted file mode 100644 index 1d58c13bc6bc..000000000000 --- a/arch/x86_64/kernel/i387.c +++ /dev/null @@ -1,151 +0,0 @@ -/* - * linux/arch/x86_64/kernel/i387.c - * - * Copyright (C) 1994 Linus Torvalds - * Copyright (C) 2002 Andi Kleen, SuSE Labs - * - * Pentium III FXSR, SSE support - * General FPU state handling cleanups - * Gareth Hughes <gareth@valinux.com>, May 2000 - * - * x86-64 rework 2002 Andi Kleen. - * Does direct fxsave in and out of user space now for signal handlers. - * All the FSAVE<->FXSAVE conversion code has been moved to the 32bit emulation, - * the 64bit user space sees a FXSAVE frame directly. - */ - -#include <linux/sched.h> -#include <linux/init.h> -#include <asm/processor.h> -#include <asm/i387.h> -#include <asm/sigcontext.h> -#include <asm/user.h> -#include <asm/ptrace.h> -#include <asm/uaccess.h> - -unsigned int mxcsr_feature_mask __read_mostly = 0xffffffff; - -void mxcsr_feature_mask_init(void) -{ - unsigned int mask; - clts(); - memset(¤t->thread.i387.fxsave, 0, sizeof(struct i387_fxsave_struct)); - asm volatile("fxsave %0" : : "m" (current->thread.i387.fxsave)); - mask = current->thread.i387.fxsave.mxcsr_mask; - if (mask == 0) mask = 0x0000ffbf; - mxcsr_feature_mask &= mask; - stts(); -} - -/* - * Called at bootup to set up the initial FPU state that is later cloned - * into all processes. - */ -void __cpuinit fpu_init(void) -{ - unsigned long oldcr0 = read_cr0(); - extern void __bad_fxsave_alignment(void); - - if (offsetof(struct task_struct, thread.i387.fxsave) & 15) - __bad_fxsave_alignment(); - set_in_cr4(X86_CR4_OSFXSR); - set_in_cr4(X86_CR4_OSXMMEXCPT); - - write_cr0(oldcr0 & ~((1UL<<3)|(1UL<<2))); /* clear TS and EM */ - - mxcsr_feature_mask_init(); - /* clean state in init */ - current_thread_info()->status = 0; - clear_used_math(); -} - -void init_fpu(struct task_struct *child) -{ - if (tsk_used_math(child)) { - if (child == current) - unlazy_fpu(child); - return; - } - memset(&child->thread.i387.fxsave, 0, sizeof(struct i387_fxsave_struct)); - child->thread.i387.fxsave.cwd = 0x37f; - child->thread.i387.fxsave.mxcsr = 0x1f80; - /* only the device not available exception or ptrace can call init_fpu */ - set_stopped_child_used_math(child); -} - -/* - * Signal frame handlers. - */ - -int save_i387(struct _fpstate __user *buf) -{ - struct task_struct *tsk = current; - int err = 0; - - BUILD_BUG_ON(sizeof(struct user_i387_struct) != - sizeof(tsk->thread.i387.fxsave)); - - if ((unsigned long)buf % 16) - printk("save_i387: bad fpstate %p\n",buf); - - if (!used_math()) - return 0; - clear_used_math(); /* trigger finit */ - if (task_thread_info(tsk)->status & TS_USEDFPU) { - err = save_i387_checking((struct i387_fxsave_struct __user *)buf); - if (err) return err; - stts(); - } else { - if (__copy_to_user(buf, &tsk->thread.i387.fxsave, - sizeof(struct i387_fxsave_struct))) - return -1; - } - return 1; -} - -/* - * ptrace request handlers. - */ - -int get_fpregs(struct user_i387_struct __user *buf, struct task_struct *tsk) -{ - init_fpu(tsk); - return __copy_to_user(buf, &tsk->thread.i387.fxsave, - sizeof(struct user_i387_struct)) ? -EFAULT : 0; -} - -int set_fpregs(struct task_struct *tsk, struct user_i387_struct __user *buf) -{ - if (__copy_from_user(&tsk->thread.i387.fxsave, buf, - sizeof(struct user_i387_struct))) - return -EFAULT; - return 0; -} - -/* - * FPU state for core dumps. - */ - -int dump_fpu( struct pt_regs *regs, struct user_i387_struct *fpu ) -{ - struct task_struct *tsk = current; - - if (!used_math()) - return 0; - - unlazy_fpu(tsk); - memcpy(fpu, &tsk->thread.i387.fxsave, sizeof(struct user_i387_struct)); - return 1; -} - -int dump_task_fpu(struct task_struct *tsk, struct user_i387_struct *fpu) -{ - int fpvalid = !!tsk_used_math(tsk); - - if (fpvalid) { - if (tsk == current) - unlazy_fpu(tsk); - memcpy(fpu, &tsk->thread.i387.fxsave, sizeof(struct user_i387_struct)); -} - return fpvalid; -} diff --git a/arch/x86_64/kernel/i8259.c b/arch/x86_64/kernel/i8259.c deleted file mode 100644 index 948cae646099..000000000000 --- a/arch/x86_64/kernel/i8259.c +++ /dev/null @@ -1,544 +0,0 @@ -#include <linux/linkage.h> -#include <linux/errno.h> -#include <linux/signal.h> -#include <linux/sched.h> -#include <linux/ioport.h> -#include <linux/interrupt.h> -#include <linux/timex.h> -#include <linux/slab.h> -#include <linux/random.h> -#include <linux/init.h> -#include <linux/kernel_stat.h> -#include <linux/sysdev.h> -#include <linux/bitops.h> - -#include <asm/acpi.h> -#include <asm/atomic.h> -#include <asm/system.h> -#include <asm/io.h> -#include <asm/hw_irq.h> -#include <asm/pgtable.h> -#include <asm/delay.h> -#include <asm/desc.h> -#include <asm/apic.h> - -/* - * Common place to define all x86 IRQ vectors - * - * This builds up the IRQ handler stubs using some ugly macros in irq.h - * - * These macros create the low-level assembly IRQ routines that save - * register context and call do_IRQ(). do_IRQ() then does all the - * operations that are needed to keep the AT (or SMP IOAPIC) - * interrupt-controller happy. - */ - -#define BI(x,y) \ - BUILD_IRQ(x##y) - -#define BUILD_16_IRQS(x) \ - BI(x,0) BI(x,1) BI(x,2) BI(x,3) \ - BI(x,4) BI(x,5) BI(x,6) BI(x,7) \ - BI(x,8) BI(x,9) BI(x,a) BI(x,b) \ - BI(x,c) BI(x,d) BI(x,e) BI(x,f) - -/* - * ISA PIC or low IO-APIC triggered (INTA-cycle or APIC) interrupts: - * (these are usually mapped to vectors 0x30-0x3f) - */ - -/* - * The IO-APIC gives us many more interrupt sources. Most of these - * are unused but an SMP system is supposed to have enough memory ... - * sometimes (mostly wrt. hw bugs) we get corrupted vectors all - * across the spectrum, so we really want to be prepared to get all - * of these. Plus, more powerful systems might have more than 64 - * IO-APIC registers. - * - * (these are usually mapped into the 0x30-0xff vector range) - */ - BUILD_16_IRQS(0x2) BUILD_16_IRQS(0x3) -BUILD_16_IRQS(0x4) BUILD_16_IRQS(0x5) BUILD_16_IRQS(0x6) BUILD_16_IRQS(0x7) -BUILD_16_IRQS(0x8) BUILD_16_IRQS(0x9) BUILD_16_IRQS(0xa) BUILD_16_IRQS(0xb) -BUILD_16_IRQS(0xc) BUILD_16_IRQS(0xd) BUILD_16_IRQS(0xe) BUILD_16_IRQS(0xf) - -#undef BUILD_16_IRQS -#undef BI - - -#define IRQ(x,y) \ - IRQ##x##y##_interrupt - -#define IRQLIST_16(x) \ - IRQ(x,0), IRQ(x,1), IRQ(x,2), IRQ(x,3), \ - IRQ(x,4), IRQ(x,5), IRQ(x,6), IRQ(x,7), \ - IRQ(x,8), IRQ(x,9), IRQ(x,a), IRQ(x,b), \ - IRQ(x,c), IRQ(x,d), IRQ(x,e), IRQ(x,f) - -/* for the irq vectors */ -static void (*interrupt[NR_VECTORS - FIRST_EXTERNAL_VECTOR])(void) = { - IRQLIST_16(0x2), IRQLIST_16(0x3), - IRQLIST_16(0x4), IRQLIST_16(0x5), IRQLIST_16(0x6), IRQLIST_16(0x7), - IRQLIST_16(0x8), IRQLIST_16(0x9), IRQLIST_16(0xa), IRQLIST_16(0xb), - IRQLIST_16(0xc), IRQLIST_16(0xd), IRQLIST_16(0xe), IRQLIST_16(0xf) -}; - -#undef IRQ -#undef IRQLIST_16 - -/* - * This is the 'legacy' 8259A Programmable Interrupt Controller, - * present in the majority of PC/AT boxes. - * plus some generic x86 specific things if generic specifics makes - * any sense at all. - * this file should become arch/i386/kernel/irq.c when the old irq.c - * moves to arch independent land - */ - -static int i8259A_auto_eoi; -DEFINE_SPINLOCK(i8259A_lock); -static void mask_and_ack_8259A(unsigned int); - -static struct irq_chip i8259A_chip = { - .name = "XT-PIC", - .mask = disable_8259A_irq, - .disable = disable_8259A_irq, - .unmask = enable_8259A_irq, - .mask_ack = mask_and_ack_8259A, -}; - -/* - * 8259A PIC functions to handle ISA devices: - */ - -/* - * This contains the irq mask for both 8259A irq controllers, - */ -static unsigned int cached_irq_mask = 0xffff; - -#define __byte(x,y) (((unsigned char *)&(y))[x]) -#define cached_21 (__byte(0,cached_irq_mask)) -#define cached_A1 (__byte(1,cached_irq_mask)) - -/* - * Not all IRQs can be routed through the IO-APIC, eg. on certain (older) - * boards the timer interrupt is not really connected to any IO-APIC pin, - * it's fed to the master 8259A's IR0 line only. - * - * Any '1' bit in this mask means the IRQ is routed through the IO-APIC. - * this 'mixed mode' IRQ handling costs nothing because it's only used - * at IRQ setup time. - */ -unsigned long io_apic_irqs; - -void disable_8259A_irq(unsigned int irq) -{ - unsigned int mask = 1 << irq; - unsigned long flags; - - spin_lock_irqsave(&i8259A_lock, flags); - cached_irq_mask |= mask; - if (irq & 8) - outb(cached_A1,0xA1); - else - outb(cached_21,0x21); - spin_unlock_irqrestore(&i8259A_lock, flags); -} - -void enable_8259A_irq(unsigned int irq) -{ - unsigned int mask = ~(1 << irq); - unsigned long flags; - - spin_lock_irqsave(&i8259A_lock, flags); - cached_irq_mask &= mask; - if (irq & 8) - outb(cached_A1,0xA1); - else - outb(cached_21,0x21); - spin_unlock_irqrestore(&i8259A_lock, flags); -} - -int i8259A_irq_pending(unsigned int irq) -{ - unsigned int mask = 1<<irq; - unsigned long flags; - int ret; - - spin_lock_irqsave(&i8259A_lock, flags); - if (irq < 8) - ret = inb(0x20) & mask; - else - ret = inb(0xA0) & (mask >> 8); - spin_unlock_irqrestore(&i8259A_lock, flags); - - return ret; -} - -void make_8259A_irq(unsigned int irq) -{ - disable_irq_nosync(irq); - io_apic_irqs &= ~(1<<irq); - set_irq_chip_and_handler_name(irq, &i8259A_chip, handle_level_irq, - "XT"); - enable_irq(irq); -} - -/* - * This function assumes to be called rarely. Switching between - * 8259A registers is slow. - * This has to be protected by the irq controller spinlock - * before being called. - */ -static inline int i8259A_irq_real(unsigned int irq) -{ - int value; - int irqmask = 1<<irq; - - if (irq < 8) { - outb(0x0B,0x20); /* ISR register */ - value = inb(0x20) & irqmask; - outb(0x0A,0x20); /* back to the IRR register */ - return value; - } - outb(0x0B,0xA0); /* ISR register */ - value = inb(0xA0) & (irqmask >> 8); - outb(0x0A,0xA0); /* back to the IRR register */ - return value; -} - -/* - * Careful! The 8259A is a fragile beast, it pretty - * much _has_ to be done exactly like this (mask it - * first, _then_ send the EOI, and the order of EOI - * to the two 8259s is important! - */ -static void mask_and_ack_8259A(unsigned int irq) -{ - unsigned int irqmask = 1 << irq; - unsigned long flags; - - spin_lock_irqsave(&i8259A_lock, flags); - /* - * Lightweight spurious IRQ detection. We do not want - * to overdo spurious IRQ handling - it's usually a sign - * of hardware problems, so we only do the checks we can - * do without slowing down good hardware unnecessarily. - * - * Note that IRQ7 and IRQ15 (the two spurious IRQs - * usually resulting from the 8259A-1|2 PICs) occur - * even if the IRQ is masked in the 8259A. Thus we - * can check spurious 8259A IRQs without doing the - * quite slow i8259A_irq_real() call for every IRQ. - * This does not cover 100% of spurious interrupts, - * but should be enough to warn the user that there - * is something bad going on ... - */ - if (cached_irq_mask & irqmask) - goto spurious_8259A_irq; - cached_irq_mask |= irqmask; - -handle_real_irq: - if (irq & 8) { - inb(0xA1); /* DUMMY - (do we need this?) */ - outb(cached_A1,0xA1); - outb(0x60+(irq&7),0xA0);/* 'Specific EOI' to slave */ - outb(0x62,0x20); /* 'Specific EOI' to master-IRQ2 */ - } else { - inb(0x21); /* DUMMY - (do we need this?) */ - outb(cached_21,0x21); - outb(0x60+irq,0x20); /* 'Specific EOI' to master */ - } - spin_unlock_irqrestore(&i8259A_lock, flags); - return; - -spurious_8259A_irq: - /* - * this is the slow path - should happen rarely. - */ - if (i8259A_irq_real(irq)) - /* - * oops, the IRQ _is_ in service according to the - * 8259A - not spurious, go handle it. - */ - goto handle_real_irq; - - { - static int spurious_irq_mask; - /* - * At this point we can be sure the IRQ is spurious, - * lets ACK and report it. [once per IRQ] - */ - if (!(spurious_irq_mask & irqmask)) { - printk(KERN_DEBUG "spurious 8259A interrupt: IRQ%d.\n", irq); - spurious_irq_mask |= irqmask; - } - atomic_inc(&irq_err_count); - /* - * Theoretically we do not have to handle this IRQ, - * but in Linux this does not cause problems and is - * simpler for us. - */ - goto handle_real_irq; - } -} - -void init_8259A(int auto_eoi) -{ - unsigned long flags; - - i8259A_auto_eoi = auto_eoi; - - spin_lock_irqsave(&i8259A_lock, flags); - - outb(0xff, 0x21); /* mask all of 8259A-1 */ - outb(0xff, 0xA1); /* mask all of 8259A-2 */ - - /* - * outb_p - this has to work on a wide range of PC hardware. - */ - outb_p(0x11, 0x20); /* ICW1: select 8259A-1 init */ - outb_p(IRQ0_VECTOR, 0x21); /* ICW2: 8259A-1 IR0-7 mapped to 0x30-0x37 */ - outb_p(0x04, 0x21); /* 8259A-1 (the master) has a slave on IR2 */ - if (auto_eoi) - outb_p(0x03, 0x21); /* master does Auto EOI */ - else - outb_p(0x01, 0x21); /* master expects normal EOI */ - - outb_p(0x11, 0xA0); /* ICW1: select 8259A-2 init */ - outb_p(IRQ8_VECTOR, 0xA1); /* ICW2: 8259A-2 IR0-7 mapped to 0x38-0x3f */ - outb_p(0x02, 0xA1); /* 8259A-2 is a slave on master's IR2 */ - outb_p(0x01, 0xA1); /* (slave's support for AEOI in flat mode - is to be investigated) */ - - if (auto_eoi) - /* - * in AEOI mode we just have to mask the interrupt - * when acking. - */ - i8259A_chip.mask_ack = disable_8259A_irq; - else - i8259A_chip.mask_ack = mask_and_ack_8259A; - - udelay(100); /* wait for 8259A to initialize */ - - outb(cached_21, 0x21); /* restore master IRQ mask */ - outb(cached_A1, 0xA1); /* restore slave IRQ mask */ - - spin_unlock_irqrestore(&i8259A_lock, flags); -} - -static char irq_trigger[2]; -/** - * ELCR registers (0x4d0, 0x4d1) control edge/level of IRQ - */ -static void restore_ELCR(char *trigger) -{ - outb(trigger[0], 0x4d0); - outb(trigger[1], 0x4d1); -} - -static void save_ELCR(char *trigger) -{ - /* IRQ 0,1,2,8,13 are marked as reserved */ - trigger[0] = inb(0x4d0) & 0xF8; - trigger[1] = inb(0x4d1) & 0xDE; -} - -static int i8259A_resume(struct sys_device *dev) -{ - init_8259A(i8259A_auto_eoi); - restore_ELCR(irq_trigger); - return 0; -} - -static int i8259A_suspend(struct sys_device *dev, pm_message_t state) -{ - save_ELCR(irq_trigger); - return 0; -} - -static int i8259A_shutdown(struct sys_device *dev) -{ - /* Put the i8259A into a quiescent state that - * the kernel initialization code can get it - * out of. - */ - outb(0xff, 0x21); /* mask all of 8259A-1 */ - outb(0xff, 0xA1); /* mask all of 8259A-1 */ - return 0; -} - -static struct sysdev_class i8259_sysdev_class = { - set_kset_name("i8259"), - .suspend = i8259A_suspend, - .resume = i8259A_resume, - .shutdown = i8259A_shutdown, -}; - -static struct sys_device device_i8259A = { - .id = 0, - .cls = &i8259_sysdev_class, -}; - -static int __init i8259A_init_sysfs(void) -{ - int error = sysdev_class_register(&i8259_sysdev_class); - if (!error) - error = sysdev_register(&device_i8259A); - return error; -} - -device_initcall(i8259A_init_sysfs); - -/* - * IRQ2 is cascade interrupt to second interrupt controller - */ - -static struct irqaction irq2 = { no_action, 0, CPU_MASK_NONE, "cascade", NULL, NULL}; -DEFINE_PER_CPU(vector_irq_t, vector_irq) = { - [0 ... IRQ0_VECTOR - 1] = -1, - [IRQ0_VECTOR] = 0, - [IRQ1_VECTOR] = 1, - [IRQ2_VECTOR] = 2, - [IRQ3_VECTOR] = 3, - [IRQ4_VECTOR] = 4, - [IRQ5_VECTOR] = 5, - [IRQ6_VECTOR] = 6, - [IRQ7_VECTOR] = 7, - [IRQ8_VECTOR] = 8, - [IRQ9_VECTOR] = 9, - [IRQ10_VECTOR] = 10, - [IRQ11_VECTOR] = 11, - [IRQ12_VECTOR] = 12, - [IRQ13_VECTOR] = 13, - [IRQ14_VECTOR] = 14, - [IRQ15_VECTOR] = 15, - [IRQ15_VECTOR + 1 ... NR_VECTORS - 1] = -1 -}; - -void __init init_ISA_irqs (void) -{ - int i; - - init_bsp_APIC(); - init_8259A(0); - - for (i = 0; i < NR_IRQS; i++) { - irq_desc[i].status = IRQ_DISABLED; - irq_desc[i].action = NULL; - irq_desc[i].depth = 1; - - if (i < 16) { - /* - * 16 old-style INTA-cycle interrupts: - */ - set_irq_chip_and_handler_name(i, &i8259A_chip, - handle_level_irq, "XT"); - } else { - /* - * 'high' PCI IRQs filled in on demand - */ - irq_desc[i].chip = &no_irq_chip; - } - } -} - -static void setup_timer_hardware(void) -{ - outb_p(0x34,0x43); /* binary, mode 2, LSB/MSB, ch 0 */ - udelay(10); - outb_p(LATCH & 0xff , 0x40); /* LSB */ - udelay(10); - outb(LATCH >> 8 , 0x40); /* MSB */ -} - -static int timer_resume(struct sys_device *dev) -{ - setup_timer_hardware(); - return 0; -} - -void i8254_timer_resume(void) -{ - setup_timer_hardware(); -} - -static struct sysdev_class timer_sysclass = { - set_kset_name("timer_pit"), - .resume = timer_resume, -}; - -static struct sys_device device_timer = { - .id = 0, - .cls = &timer_sysclass, -}; - -static int __init init_timer_sysfs(void) -{ - int error = sysdev_class_register(&timer_sysclass); - if (!error) - error = sysdev_register(&device_timer); - return error; -} - -device_initcall(init_timer_sysfs); - -void __init init_IRQ(void) -{ - int i; - - init_ISA_irqs(); - /* - * Cover the whole vector space, no vector can escape - * us. (some of these will be overridden and become - * 'special' SMP interrupts) - */ - for (i = 0; i < (NR_VECTORS - FIRST_EXTERNAL_VECTOR); i++) { - int vector = FIRST_EXTERNAL_VECTOR + i; - if (vector != IA32_SYSCALL_VECTOR) - set_intr_gate(vector, interrupt[i]); - } - -#ifdef CONFIG_SMP - /* - * The reschedule interrupt is a CPU-to-CPU reschedule-helper - * IPI, driven by wakeup. - */ - set_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt); - - /* IPIs for invalidation */ - set_intr_gate(INVALIDATE_TLB_VECTOR_START+0, invalidate_interrupt0); - set_intr_gate(INVALIDATE_TLB_VECTOR_START+1, invalidate_interrupt1); - set_intr_gate(INVALIDATE_TLB_VECTOR_START+2, invalidate_interrupt2); - set_intr_gate(INVALIDATE_TLB_VECTOR_START+3, invalidate_interrupt3); - set_intr_gate(INVALIDATE_TLB_VECTOR_START+4, invalidate_interrupt4); - set_intr_gate(INVALIDATE_TLB_VECTOR_START+5, invalidate_interrupt5); - set_intr_gate(INVALIDATE_TLB_VECTOR_START+6, invalidate_interrupt6); - set_intr_gate(INVALIDATE_TLB_VECTOR_START+7, invalidate_interrupt7); - - /* IPI for generic function call */ - set_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt); - - /* Low priority IPI to cleanup after moving an irq */ - set_intr_gate(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt); -#endif - set_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt); - set_intr_gate(THRESHOLD_APIC_VECTOR, threshold_interrupt); - - /* self generated IPI for local APIC timer */ - set_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt); - - /* IPI vectors for APIC spurious and error interrupts */ - set_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt); - set_intr_gate(ERROR_APIC_VECTOR, error_interrupt); - - /* - * Set the clock to HZ Hz, we already have a valid - * vector now: - */ - setup_timer_hardware(); - - if (!acpi_ioapic) - setup_irq(2, &irq2); -} diff --git a/arch/x86_64/kernel/init_task.c b/arch/x86_64/kernel/init_task.c deleted file mode 100644 index 4ff33d4f8551..000000000000 --- a/arch/x86_64/kernel/init_task.c +++ /dev/null @@ -1,54 +0,0 @@ -#include <linux/mm.h> -#include <linux/module.h> -#include <linux/sched.h> -#include <linux/init.h> -#include <linux/init_task.h> -#include <linux/fs.h> -#include <linux/mqueue.h> - -#include <asm/uaccess.h> -#include <asm/pgtable.h> -#include <asm/desc.h> - -static struct fs_struct init_fs = INIT_FS; -static struct files_struct init_files = INIT_FILES; -static struct signal_struct init_signals = INIT_SIGNALS(init_signals); -static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); -struct mm_struct init_mm = INIT_MM(init_mm); - -EXPORT_SYMBOL(init_mm); - -/* - * Initial task structure. - * - * We need to make sure that this is 8192-byte aligned due to the - * way process stacks are handled. This is done by having a special - * "init_task" linker map entry.. - */ -union thread_union init_thread_union - __attribute__((__section__(".data.init_task"))) = - { INIT_THREAD_INFO(init_task) }; - -/* - * Initial task structure. - * - * All other task structs will be allocated on slabs in fork.c - */ -struct task_struct init_task = INIT_TASK(init_task); - -EXPORT_SYMBOL(init_task); -/* - * per-CPU TSS segments. Threads are completely 'soft' on Linux, - * no more per-task TSS's. The TSS size is kept cacheline-aligned - * so they are allowed to end up in the .data.cacheline_aligned - * section. Since TSS's are completely CPU-local, we want them - * on exact cacheline boundaries, to eliminate cacheline ping-pong. - */ -DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, init_tss) = INIT_TSS; - -/* Copies of the original ist values from the tss are only accessed during - * debugging, no special alignment required. - */ -DEFINE_PER_CPU(struct orig_ist, orig_ist); - -#define ALIGN_TO_4K __attribute__((section(".data.init_task"))) diff --git a/arch/x86_64/kernel/io_apic.c b/arch/x86_64/kernel/io_apic.c deleted file mode 100644 index 966fa1062491..000000000000 --- a/arch/x86_64/kernel/io_apic.c +++ /dev/null @@ -1,2202 +0,0 @@ -/* - * Intel IO-APIC support for multi-Pentium hosts. - * - * Copyright (C) 1997, 1998, 1999, 2000 Ingo Molnar, Hajnalka Szabo - * - * Many thanks to Stig Venaas for trying out countless experimental - * patches and reporting/debugging problems patiently! - * - * (c) 1999, Multiple IO-APIC support, developed by - * Ken-ichi Yaku <yaku@css1.kbnes.nec.co.jp> and - * Hidemi Kishimoto <kisimoto@css1.kbnes.nec.co.jp>, - * further tested and cleaned up by Zach Brown <zab@redhat.com> - * and Ingo Molnar <mingo@redhat.com> - * - * Fixes - * Maciej W. Rozycki : Bits for genuine 82489DX APICs; - * thanks to Eric Gilmore - * and Rolf G. Tews - * for testing these extensively - * Paul Diefenbaugh : Added full ACPI support - */ - -#include <linux/mm.h> -#include <linux/interrupt.h> -#include <linux/init.h> -#include <linux/delay.h> -#include <linux/sched.h> -#include <linux/pci.h> -#include <linux/mc146818rtc.h> -#include <linux/acpi.h> -#include <linux/sysdev.h> -#include <linux/msi.h> -#include <linux/htirq.h> -#ifdef CONFIG_ACPI -#include <acpi/acpi_bus.h> -#endif - -#include <asm/idle.h> -#include <asm/io.h> -#include <asm/smp.h> -#include <asm/desc.h> -#include <asm/proto.h> -#include <asm/mach_apic.h> -#include <asm/acpi.h> -#include <asm/dma.h> -#include <asm/nmi.h> -#include <asm/msidef.h> -#include <asm/hypertransport.h> - -struct irq_cfg { - cpumask_t domain; - cpumask_t old_domain; - unsigned move_cleanup_count; - u8 vector; - u8 move_in_progress : 1; -}; - -/* irq_cfg is indexed by the sum of all RTEs in all I/O APICs. */ -struct irq_cfg irq_cfg[NR_IRQS] __read_mostly = { - [0] = { .domain = CPU_MASK_ALL, .vector = IRQ0_VECTOR, }, - [1] = { .domain = CPU_MASK_ALL, .vector = IRQ1_VECTOR, }, - [2] = { .domain = CPU_MASK_ALL, .vector = IRQ2_VECTOR, }, - [3] = { .domain = CPU_MASK_ALL, .vector = IRQ3_VECTOR, }, - [4] = { .domain = CPU_MASK_ALL, .vector = IRQ4_VECTOR, }, - [5] = { .domain = CPU_MASK_ALL, .vector = IRQ5_VECTOR, }, - [6] = { .domain = CPU_MASK_ALL, .vector = IRQ6_VECTOR, }, - [7] = { .domain = CPU_MASK_ALL, .vector = IRQ7_VECTOR, }, - [8] = { .domain = CPU_MASK_ALL, .vector = IRQ8_VECTOR, }, - [9] = { .domain = CPU_MASK_ALL, .vector = IRQ9_VECTOR, }, - [10] = { .domain = CPU_MASK_ALL, .vector = IRQ10_VECTOR, }, - [11] = { .domain = CPU_MASK_ALL, .vector = IRQ11_VECTOR, }, - [12] = { .domain = CPU_MASK_ALL, .vector = IRQ12_VECTOR, }, - [13] = { .domain = CPU_MASK_ALL, .vector = IRQ13_VECTOR, }, - [14] = { .domain = CPU_MASK_ALL, .vector = IRQ14_VECTOR, }, - [15] = { .domain = CPU_MASK_ALL, .vector = IRQ15_VECTOR, }, -}; - -static int assign_irq_vector(int irq, cpumask_t mask); - -#define __apicdebuginit __init - -int sis_apic_bug; /* not actually supported, dummy for compile */ - -static int no_timer_check; - -static int disable_timer_pin_1 __initdata; - -int timer_over_8254 __initdata = 1; - -/* Where if anywhere is the i8259 connect in external int mode */ -static struct { int pin, apic; } ioapic_i8259 = { -1, -1 }; - -static DEFINE_SPINLOCK(ioapic_lock); -DEFINE_SPINLOCK(vector_lock); - -/* - * # of IRQ routing registers - */ -int nr_ioapic_registers[MAX_IO_APICS]; - -/* - * Rough estimation of how many shared IRQs there are, can - * be changed anytime. - */ -#define MAX_PLUS_SHARED_IRQS NR_IRQS -#define PIN_MAP_SIZE (MAX_PLUS_SHARED_IRQS + NR_IRQS) - -/* - * This is performance-critical, we want to do it O(1) - * - * the indexing order of this array favors 1:1 mappings - * between pins and IRQs. - */ - -static struct irq_pin_list { - short apic, pin, next; -} irq_2_pin[PIN_MAP_SIZE]; - -struct io_apic { - unsigned int index; - unsigned int unused[3]; - unsigned int data; -}; - -static __attribute_const__ struct io_apic __iomem *io_apic_base(int idx) -{ - return (void __iomem *) __fix_to_virt(FIX_IO_APIC_BASE_0 + idx) - + (mp_ioapics[idx].mpc_apicaddr & ~PAGE_MASK); -} - -static inline unsigned int io_apic_read(unsigned int apic, unsigned int reg) -{ - struct io_apic __iomem *io_apic = io_apic_base(apic); - writel(reg, &io_apic->index); - return readl(&io_apic->data); -} - -static inline void io_apic_write(unsigned int apic, unsigned int reg, unsigned int value) -{ - struct io_apic __iomem *io_apic = io_apic_base(apic); - writel(reg, &io_apic->index); - writel(value, &io_apic->data); -} - -/* - * Re-write a value: to be used for read-modify-write - * cycles where the read already set up the index register. - */ -static inline void io_apic_modify(unsigned int apic, unsigned int value) -{ - struct io_apic __iomem *io_apic = io_apic_base(apic); - writel(value, &io_apic->data); -} - -static int io_apic_level_ack_pending(unsigned int irq) -{ - struct irq_pin_list *entry; - unsigned long flags; - int pending = 0; - - spin_lock_irqsave(&ioapic_lock, flags); - entry = irq_2_pin + irq; - for (;;) { - unsigned int reg; - int pin; - - pin = entry->pin; - if (pin == -1) - break; - reg = io_apic_read(entry->apic, 0x10 + pin*2); - /* Is the remote IRR bit set? */ - pending |= (reg >> 14) & 1; - if (!entry->next) - break; - entry = irq_2_pin + entry->next; - } - spin_unlock_irqrestore(&ioapic_lock, flags); - return pending; -} - -/* - * Synchronize the IO-APIC and the CPU by doing - * a dummy read from the IO-APIC - */ -static inline void io_apic_sync(unsigned int apic) -{ - struct io_apic __iomem *io_apic = io_apic_base(apic); - readl(&io_apic->data); -} - -#define __DO_ACTION(R, ACTION, FINAL) \ - \ -{ \ - int pin; \ - struct irq_pin_list *entry = irq_2_pin + irq; \ - \ - BUG_ON(irq >= NR_IRQS); \ - for (;;) { \ - unsigned int reg; \ - pin = entry->pin; \ - if (pin == -1) \ - break; \ - reg = io_apic_read(entry->apic, 0x10 + R + pin*2); \ - reg ACTION; \ - io_apic_modify(entry->apic, reg); \ - FINAL; \ - if (!entry->next) \ - break; \ - entry = irq_2_pin + entry->next; \ - } \ -} - -union entry_union { - struct { u32 w1, w2; }; - struct IO_APIC_route_entry entry; -}; - -static struct IO_APIC_route_entry ioapic_read_entry(int apic, int pin) -{ - union entry_union eu; - unsigned long flags; - spin_lock_irqsave(&ioapic_lock, flags); - eu.w1 = io_apic_read(apic, 0x10 + 2 * pin); - eu.w2 = io_apic_read(apic, 0x11 + 2 * pin); - spin_unlock_irqrestore(&ioapic_lock, flags); - return eu.entry; -} - -/* - * When we write a new IO APIC routing entry, we need to write the high - * word first! If the mask bit in the low word is clear, we will enable - * the interrupt, and we need to make sure the entry is fully populated - * before that happens. - */ -static void -__ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e) -{ - union entry_union eu; - eu.entry = e; - io_apic_write(apic, 0x11 + 2*pin, eu.w2); - io_apic_write(apic, 0x10 + 2*pin, eu.w1); -} - -static void ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e) -{ - unsigned long flags; - spin_lock_irqsave(&ioapic_lock, flags); - __ioapic_write_entry(apic, pin, e); - spin_unlock_irqrestore(&ioapic_lock, flags); -} - -/* - * When we mask an IO APIC routing entry, we need to write the low - * word first, in order to set the mask bit before we change the - * high bits! - */ -static void ioapic_mask_entry(int apic, int pin) -{ - unsigned long flags; - union entry_union eu = { .entry.mask = 1 }; - - spin_lock_irqsave(&ioapic_lock, flags); - io_apic_write(apic, 0x10 + 2*pin, eu.w1); - io_apic_write(apic, 0x11 + 2*pin, eu.w2); - spin_unlock_irqrestore(&ioapic_lock, flags); -} - -#ifdef CONFIG_SMP -static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, u8 vector) -{ - int apic, pin; - struct irq_pin_list *entry = irq_2_pin + irq; - - BUG_ON(irq >= NR_IRQS); - for (;;) { - unsigned int reg; - apic = entry->apic; - pin = entry->pin; - if (pin == -1) - break; - io_apic_write(apic, 0x11 + pin*2, dest); - reg = io_apic_read(apic, 0x10 + pin*2); - reg &= ~0x000000ff; - reg |= vector; - io_apic_modify(apic, reg); - if (!entry->next) - break; - entry = irq_2_pin + entry->next; - } -} - -static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t mask) -{ - struct irq_cfg *cfg = irq_cfg + irq; - unsigned long flags; - unsigned int dest; - cpumask_t tmp; - - cpus_and(tmp, mask, cpu_online_map); - if (cpus_empty(tmp)) - return; - - if (assign_irq_vector(irq, mask)) - return; - - cpus_and(tmp, cfg->domain, mask); - dest = cpu_mask_to_apicid(tmp); - - /* - * Only the high 8 bits are valid. - */ - dest = SET_APIC_LOGICAL_ID(dest); - - spin_lock_irqsave(&ioapic_lock, flags); - __target_IO_APIC_irq(irq, dest, cfg->vector); - irq_desc[irq].affinity = mask; - spin_unlock_irqrestore(&ioapic_lock, flags); -} -#endif - -/* - * The common case is 1:1 IRQ<->pin mappings. Sometimes there are - * shared ISA-space IRQs, so we have to support them. We are super - * fast in the common case, and fast for shared ISA-space IRQs. - */ -static void add_pin_to_irq(unsigned int irq, int apic, int pin) -{ - static int first_free_entry = NR_IRQS; - struct irq_pin_list *entry = irq_2_pin + irq; - - BUG_ON(irq >= NR_IRQS); - while (entry->next) - entry = irq_2_pin + entry->next; - - if (entry->pin != -1) { - entry->next = first_free_entry; - entry = irq_2_pin + entry->next; - if (++first_free_entry >= PIN_MAP_SIZE) - panic("io_apic.c: ran out of irq_2_pin entries!"); - } - entry->apic = apic; - entry->pin = pin; -} - - -#define DO_ACTION(name,R,ACTION, FINAL) \ - \ - static void name##_IO_APIC_irq (unsigned int irq) \ - __DO_ACTION(R, ACTION, FINAL) - -DO_ACTION( __mask, 0, |= 0x00010000, io_apic_sync(entry->apic) ) - /* mask = 1 */ -DO_ACTION( __unmask, 0, &= 0xfffeffff, ) - /* mask = 0 */ - -static void mask_IO_APIC_irq (unsigned int irq) -{ - unsigned long flags; - - spin_lock_irqsave(&ioapic_lock, flags); - __mask_IO_APIC_irq(irq); - spin_unlock_irqrestore(&ioapic_lock, flags); -} - -static void unmask_IO_APIC_irq (unsigned int irq) -{ - unsigned long flags; - - spin_lock_irqsave(&ioapic_lock, flags); - __unmask_IO_APIC_irq(irq); - spin_unlock_irqrestore(&ioapic_lock, flags); -} - -static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin) -{ - struct IO_APIC_route_entry entry; - - /* Check delivery_mode to be sure we're not clearing an SMI pin */ - entry = ioapic_read_entry(apic, pin); - if (entry.delivery_mode == dest_SMI) - return; - /* - * Disable it in the IO-APIC irq-routing table: - */ - ioapic_mask_entry(apic, pin); -} - -static void clear_IO_APIC (void) -{ - int apic, pin; - - for (apic = 0; apic < nr_ioapics; apic++) - for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) - clear_IO_APIC_pin(apic, pin); -} - -int skip_ioapic_setup; -int ioapic_force; - -static int __init parse_noapic(char *str) -{ - disable_ioapic_setup(); - return 0; -} -early_param("noapic", parse_noapic); - -/* Actually the next is obsolete, but keep it for paranoid reasons -AK */ -static int __init disable_timer_pin_setup(char *arg) -{ - disable_timer_pin_1 = 1; - return 1; -} -__setup("disable_timer_pin_1", disable_timer_pin_setup); - -static int __init setup_disable_8254_timer(char *s) -{ - timer_over_8254 = -1; - return 1; -} -static int __init setup_enable_8254_timer(char *s) -{ - timer_over_8254 = 2; - return 1; -} - -__setup("disable_8254_timer", setup_disable_8254_timer); -__setup("enable_8254_timer", setup_enable_8254_timer); - - -/* - * Find the IRQ entry number of a certain pin. - */ -static int find_irq_entry(int apic, int pin, int type) -{ - int i; - - for (i = 0; i < mp_irq_entries; i++) - if (mp_irqs[i].mpc_irqtype == type && - (mp_irqs[i].mpc_dstapic == mp_ioapics[apic].mpc_apicid || - mp_irqs[i].mpc_dstapic == MP_APIC_ALL) && - mp_irqs[i].mpc_dstirq == pin) - return i; - - return -1; -} - -/* - * Find the pin to which IRQ[irq] (ISA) is connected - */ -static int __init find_isa_irq_pin(int irq, int type) -{ - int i; - - for (i = 0; i < mp_irq_entries; i++) { - int lbus = mp_irqs[i].mpc_srcbus; - - if (test_bit(lbus, mp_bus_not_pci) && - (mp_irqs[i].mpc_irqtype == type) && - (mp_irqs[i].mpc_srcbusirq == irq)) - - return mp_irqs[i].mpc_dstirq; - } - return -1; -} - -static int __init find_isa_irq_apic(int irq, int type) -{ - int i; - - for (i = 0; i < mp_irq_entries; i++) { - int lbus = mp_irqs[i].mpc_srcbus; - - if (test_bit(lbus, mp_bus_not_pci) && - (mp_irqs[i].mpc_irqtype == type) && - (mp_irqs[i].mpc_srcbusirq == irq)) - break; - } - if (i < mp_irq_entries) { - int apic; - for(apic = 0; apic < nr_ioapics; apic++) { - if (mp_ioapics[apic].mpc_apicid == mp_irqs[i].mpc_dstapic) - return apic; - } - } - - return -1; -} - -/* - * Find a specific PCI IRQ entry. - * Not an __init, possibly needed by modules - */ -static int pin_2_irq(int idx, int apic, int pin); - -int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin) -{ - int apic, i, best_guess = -1; - - apic_printk(APIC_DEBUG, "querying PCI -> IRQ mapping bus:%d, slot:%d, pin:%d.\n", - bus, slot, pin); - if (mp_bus_id_to_pci_bus[bus] == -1) { - apic_printk(APIC_VERBOSE, "PCI BIOS passed nonexistent PCI bus %d!\n", bus); - return -1; - } - for (i = 0; i < mp_irq_entries; i++) { - int lbus = mp_irqs[i].mpc_srcbus; - - for (apic = 0; apic < nr_ioapics; apic++) - if (mp_ioapics[apic].mpc_apicid == mp_irqs[i].mpc_dstapic || - mp_irqs[i].mpc_dstapic == MP_APIC_ALL) - break; - - if (!test_bit(lbus, mp_bus_not_pci) && - !mp_irqs[i].mpc_irqtype && - (bus == lbus) && - (slot == ((mp_irqs[i].mpc_srcbusirq >> 2) & 0x1f))) { - int irq = pin_2_irq(i,apic,mp_irqs[i].mpc_dstirq); - - if (!(apic || IO_APIC_IRQ(irq))) - continue; - - if (pin == (mp_irqs[i].mpc_srcbusirq & 3)) - return irq; - /* - * Use the first all-but-pin matching entry as a - * best-guess fuzzy result for broken mptables. - */ - if (best_guess < 0) - best_guess = irq; - } - } - BUG_ON(best_guess >= NR_IRQS); - return best_guess; -} - -/* ISA interrupts are always polarity zero edge triggered, - * when listed as conforming in the MP table. */ - -#define default_ISA_trigger(idx) (0) -#define default_ISA_polarity(idx) (0) - -/* PCI interrupts are always polarity one level triggered, - * when listed as conforming in the MP table. */ - -#define default_PCI_trigger(idx) (1) -#define default_PCI_polarity(idx) (1) - -static int __init MPBIOS_polarity(int idx) -{ - int bus = mp_irqs[idx].mpc_srcbus; - int polarity; - - /* - * Determine IRQ line polarity (high active or low active): - */ - switch (mp_irqs[idx].mpc_irqflag & 3) - { - case 0: /* conforms, ie. bus-type dependent polarity */ - if (test_bit(bus, mp_bus_not_pci)) - polarity = default_ISA_polarity(idx); - else - polarity = default_PCI_polarity(idx); - break; - case 1: /* high active */ - { - polarity = 0; - break; - } - case 2: /* reserved */ - { - printk(KERN_WARNING "broken BIOS!!\n"); - polarity = 1; - break; - } - case 3: /* low active */ - { - polarity = 1; - break; - } - default: /* invalid */ - { - printk(KERN_WARNING "broken BIOS!!\n"); - polarity = 1; - break; - } - } - return polarity; -} - -static int MPBIOS_trigger(int idx) -{ - int bus = mp_irqs[idx].mpc_srcbus; - int trigger; - - /* - * Determine IRQ trigger mode (edge or level sensitive): - */ - switch ((mp_irqs[idx].mpc_irqflag>>2) & 3) - { - case 0: /* conforms, ie. bus-type dependent */ - if (test_bit(bus, mp_bus_not_pci)) - trigger = default_ISA_trigger(idx); - else - trigger = default_PCI_trigger(idx); - break; - case 1: /* edge */ - { - trigger = 0; - break; - } - case 2: /* reserved */ - { - printk(KERN_WARNING "broken BIOS!!\n"); - trigger = 1; - break; - } - case 3: /* level */ - { - trigger = 1; - break; - } - default: /* invalid */ - { - printk(KERN_WARNING "broken BIOS!!\n"); - trigger = 0; - break; - } - } - return trigger; -} - -static inline int irq_polarity(int idx) -{ - return MPBIOS_polarity(idx); -} - -static inline int irq_trigger(int idx) -{ - return MPBIOS_trigger(idx); -} - -static int pin_2_irq(int idx, int apic, int pin) -{ - int irq, i; - int bus = mp_irqs[idx].mpc_srcbus; - - /* - * Debugging check, we are in big trouble if this message pops up! - */ - if (mp_irqs[idx].mpc_dstirq != pin) - printk(KERN_ERR "broken BIOS or MPTABLE parser, ayiee!!\n"); - - if (test_bit(bus, mp_bus_not_pci)) { - irq = mp_irqs[idx].mpc_srcbusirq; - } else { - /* - * PCI IRQs are mapped in order - */ - i = irq = 0; - while (i < apic) - irq += nr_ioapic_registers[i++]; - irq += pin; - } - BUG_ON(irq >= NR_IRQS); - return irq; -} - -static int __assign_irq_vector(int irq, cpumask_t mask) -{ - /* - * NOTE! The local APIC isn't very good at handling - * multiple interrupts at the same interrupt level. - * As the interrupt level is determined by taking the - * vector number and shifting that right by 4, we - * want to spread these out a bit so that they don't - * all fall in the same interrupt level. - * - * Also, we've got to be careful not to trash gate - * 0x80, because int 0x80 is hm, kind of importantish. ;) - */ - static int current_vector = FIRST_DEVICE_VECTOR, current_offset = 0; - unsigned int old_vector; - int cpu; - struct irq_cfg *cfg; - - BUG_ON((unsigned)irq >= NR_IRQS); - cfg = &irq_cfg[irq]; - - /* Only try and allocate irqs on cpus that are present */ - cpus_and(mask, mask, cpu_online_map); - - if ((cfg->move_in_progress) || cfg->move_cleanup_count) - return -EBUSY; - - old_vector = cfg->vector; - if (old_vector) { - cpumask_t tmp; - cpus_and(tmp, cfg->domain, mask); - if (!cpus_empty(tmp)) - return 0; - } - - for_each_cpu_mask(cpu, mask) { - cpumask_t domain, new_mask; - int new_cpu; - int vector, offset; - - domain = vector_allocation_domain(cpu); - cpus_and(new_mask, domain, cpu_online_map); - - vector = current_vector; - offset = current_offset; -next: - vector += 8; - if (vector >= FIRST_SYSTEM_VECTOR) { - /* If we run out of vectors on large boxen, must share them. */ - offset = (offset + 1) % 8; - vector = FIRST_DEVICE_VECTOR + offset; - } - if (unlikely(current_vector == vector)) - continue; - if (vector == IA32_SYSCALL_VECTOR) - goto next; - for_each_cpu_mask(new_cpu, new_mask) - if (per_cpu(vector_irq, new_cpu)[vector] != -1) - goto next; - /* Found one! */ - current_vector = vector; - current_offset = offset; - if (old_vector) { - cfg->move_in_progress = 1; - cfg->old_domain = cfg->domain; - } - for_each_cpu_mask(new_cpu, new_mask) - per_cpu(vector_irq, new_cpu)[vector] = irq; - cfg->vector = vector; - cfg->domain = domain; - return 0; - } - return -ENOSPC; -} - -static int assign_irq_vector(int irq, cpumask_t mask) -{ - int err; - unsigned long flags; - - spin_lock_irqsave(&vector_lock, flags); - err = __assign_irq_vector(irq, mask); - spin_unlock_irqrestore(&vector_lock, flags); - return err; -} - -static void __clear_irq_vector(int irq) -{ - struct irq_cfg *cfg; - cpumask_t mask; - int cpu, vector; - - BUG_ON((unsigned)irq >= NR_IRQS); - cfg = &irq_cfg[irq]; - BUG_ON(!cfg->vector); - - vector = cfg->vector; - cpus_and(mask, cfg->domain, cpu_online_map); - for_each_cpu_mask(cpu, mask) - per_cpu(vector_irq, cpu)[vector] = -1; - - cfg->vector = 0; - cfg->domain = CPU_MASK_NONE; -} - -void __setup_vector_irq(int cpu) -{ - /* Initialize vector_irq on a new cpu */ - /* This function must be called with vector_lock held */ - int irq, vector; - - /* Mark the inuse vectors */ - for (irq = 0; irq < NR_IRQS; ++irq) { - if (!cpu_isset(cpu, irq_cfg[irq].domain)) - continue; - vector = irq_cfg[irq].vector; - per_cpu(vector_irq, cpu)[vector] = irq; - } - /* Mark the free vectors */ - for (vector = 0; vector < NR_VECTORS; ++vector) { - irq = per_cpu(vector_irq, cpu)[vector]; - if (irq < 0) - continue; - if (!cpu_isset(cpu, irq_cfg[irq].domain)) - per_cpu(vector_irq, cpu)[vector] = -1; - } -} - - -static struct irq_chip ioapic_chip; - -static void ioapic_register_intr(int irq, unsigned long trigger) -{ - if (trigger) { - irq_desc[irq].status |= IRQ_LEVEL; - set_irq_chip_and_handler_name(irq, &ioapic_chip, - handle_fasteoi_irq, "fasteoi"); - } else { - irq_desc[irq].status &= ~IRQ_LEVEL; - set_irq_chip_and_handler_name(irq, &ioapic_chip, - handle_edge_irq, "edge"); - } -} - -static void setup_IO_APIC_irq(int apic, int pin, unsigned int irq, - int trigger, int polarity) -{ - struct irq_cfg *cfg = irq_cfg + irq; - struct IO_APIC_route_entry entry; - cpumask_t mask; - - if (!IO_APIC_IRQ(irq)) - return; - - mask = TARGET_CPUS; - if (assign_irq_vector(irq, mask)) - return; - - cpus_and(mask, cfg->domain, mask); - - apic_printk(APIC_VERBOSE,KERN_DEBUG - "IOAPIC[%d]: Set routing entry (%d-%d -> 0x%x -> " - "IRQ %d Mode:%i Active:%i)\n", - apic, mp_ioapics[apic].mpc_apicid, pin, cfg->vector, - irq, trigger, polarity); - - /* - * add it to the IO-APIC irq-routing table: - */ - memset(&entry,0,sizeof(entry)); - - entry.delivery_mode = INT_DELIVERY_MODE; - entry.dest_mode = INT_DEST_MODE; - entry.dest = cpu_mask_to_apicid(mask); - entry.mask = 0; /* enable IRQ */ - entry.trigger = trigger; - entry.polarity = polarity; - entry.vector = cfg->vector; - - /* Mask level triggered irqs. - * Use IRQ_DELAYED_DISABLE for edge triggered irqs. - */ - if (trigger) - entry.mask = 1; - - ioapic_register_intr(irq, trigger); - if (irq < 16) - disable_8259A_irq(irq); - - ioapic_write_entry(apic, pin, entry); -} - -static void __init setup_IO_APIC_irqs(void) -{ - int apic, pin, idx, irq, first_notcon = 1; - - apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n"); - - for (apic = 0; apic < nr_ioapics; apic++) { - for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) { - - idx = find_irq_entry(apic,pin,mp_INT); - if (idx == -1) { - if (first_notcon) { - apic_printk(APIC_VERBOSE, KERN_DEBUG " IO-APIC (apicid-pin) %d-%d", mp_ioapics[apic].mpc_apicid, pin); - first_notcon = 0; - } else - apic_printk(APIC_VERBOSE, ", %d-%d", mp_ioapics[apic].mpc_apicid, pin); - continue; - } - - irq = pin_2_irq(idx, apic, pin); - add_pin_to_irq(irq, apic, pin); - - setup_IO_APIC_irq(apic, pin, irq, - irq_trigger(idx), irq_polarity(idx)); - } - } - - if (!first_notcon) - apic_printk(APIC_VERBOSE," not connected.\n"); -} - -/* - * Set up the 8259A-master output pin as broadcast to all - * CPUs. - */ -static void __init setup_ExtINT_IRQ0_pin(unsigned int apic, unsigned int pin, int vector) -{ - struct IO_APIC_route_entry entry; - unsigned long flags; - - memset(&entry,0,sizeof(entry)); - - disable_8259A_irq(0); - - /* mask LVT0 */ - apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT); - - /* - * We use logical delivery to get the timer IRQ - * to the first CPU. - */ - entry.dest_mode = INT_DEST_MODE; - entry.mask = 0; /* unmask IRQ now */ - entry.dest = cpu_mask_to_apicid(TARGET_CPUS); - entry.delivery_mode = INT_DELIVERY_MODE; - entry.polarity = 0; - entry.trigger = 0; - entry.vector = vector; - - /* - * The timer IRQ doesn't have to know that behind the - * scene we have a 8259A-master in AEOI mode ... - */ - set_irq_chip_and_handler_name(0, &ioapic_chip, handle_edge_irq, "edge"); - - /* - * Add it to the IO-APIC irq-routing table: - */ - spin_lock_irqsave(&ioapic_lock, flags); - io_apic_write(apic, 0x11+2*pin, *(((int *)&entry)+1)); - io_apic_write(apic, 0x10+2*pin, *(((int *)&entry)+0)); - spin_unlock_irqrestore(&ioapic_lock, flags); - - enable_8259A_irq(0); -} - -void __apicdebuginit print_IO_APIC(void) -{ - int apic, i; - union IO_APIC_reg_00 reg_00; - union IO_APIC_reg_01 reg_01; - union IO_APIC_reg_02 reg_02; - unsigned long flags; - - if (apic_verbosity == APIC_QUIET) - return; - - printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries); - for (i = 0; i < nr_ioapics; i++) - printk(KERN_DEBUG "number of IO-APIC #%d registers: %d.\n", - mp_ioapics[i].mpc_apicid, nr_ioapic_registers[i]); - - /* - * We are a bit conservative about what we expect. We have to - * know about every hardware change ASAP. - */ - printk(KERN_INFO "testing the IO APIC.......................\n"); - - for (apic = 0; apic < nr_ioapics; apic++) { - - spin_lock_irqsave(&ioapic_lock, flags); - reg_00.raw = io_apic_read(apic, 0); - reg_01.raw = io_apic_read(apic, 1); - if (reg_01.bits.version >= 0x10) - reg_02.raw = io_apic_read(apic, 2); - spin_unlock_irqrestore(&ioapic_lock, flags); - - printk("\n"); - printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].mpc_apicid); - printk(KERN_DEBUG ".... register #00: %08X\n", reg_00.raw); - printk(KERN_DEBUG "....... : physical APIC id: %02X\n", reg_00.bits.ID); - - printk(KERN_DEBUG ".... register #01: %08X\n", *(int *)®_01); - printk(KERN_DEBUG "....... : max redirection entries: %04X\n", reg_01.bits.entries); - - printk(KERN_DEBUG "....... : PRQ implemented: %X\n", reg_01.bits.PRQ); - printk(KERN_DEBUG "....... : IO APIC version: %04X\n", reg_01.bits.version); - - if (reg_01.bits.version >= 0x10) { - printk(KERN_DEBUG ".... register #02: %08X\n", reg_02.raw); - printk(KERN_DEBUG "....... : arbitration: %02X\n", reg_02.bits.arbitration); - } - - printk(KERN_DEBUG ".... IRQ redirection table:\n"); - - printk(KERN_DEBUG " NR Dst Mask Trig IRR Pol" - " Stat Dmod Deli Vect: \n"); - - for (i = 0; i <= reg_01.bits.entries; i++) { - struct IO_APIC_route_entry entry; - - entry = ioapic_read_entry(apic, i); - - printk(KERN_DEBUG " %02x %03X ", - i, - entry.dest - ); - - printk("%1d %1d %1d %1d %1d %1d %1d %02X\n", - entry.mask, - entry.trigger, - entry.irr, - entry.polarity, - entry.delivery_status, - entry.dest_mode, - entry.delivery_mode, - entry.vector - ); - } - } - printk(KERN_DEBUG "IRQ to pin mappings:\n"); - for (i = 0; i < NR_IRQS; i++) { - struct irq_pin_list *entry = irq_2_pin + i; - if (entry->pin < 0) - continue; - printk(KERN_DEBUG "IRQ%d ", i); - for (;;) { - printk("-> %d:%d", entry->apic, entry->pin); - if (!entry->next) - break; - entry = irq_2_pin + entry->next; - } - printk("\n"); - } - - printk(KERN_INFO ".................................... done.\n"); - - return; -} - -#if 0 - -static __apicdebuginit void print_APIC_bitfield (int base) -{ - unsigned int v; - int i, j; - - if (apic_verbosity == APIC_QUIET) - return; - - printk(KERN_DEBUG "0123456789abcdef0123456789abcdef\n" KERN_DEBUG); - for (i = 0; i < 8; i++) { - v = apic_read(base + i*0x10); - for (j = 0; j < 32; j++) { - if (v & (1<<j)) - printk("1"); - else - printk("0"); - } - printk("\n"); - } -} - -void __apicdebuginit print_local_APIC(void * dummy) -{ - unsigned int v, ver, maxlvt; - - if (apic_verbosity == APIC_QUIET) - return; - - printk("\n" KERN_DEBUG "printing local APIC contents on CPU#%d/%d:\n", - smp_processor_id(), hard_smp_processor_id()); - v = apic_read(APIC_ID); - printk(KERN_INFO "... APIC ID: %08x (%01x)\n", v, GET_APIC_ID(v)); - v = apic_read(APIC_LVR); - printk(KERN_INFO "... APIC VERSION: %08x\n", v); - ver = GET_APIC_VERSION(v); - maxlvt = get_maxlvt(); - - v = apic_read(APIC_TASKPRI); - printk(KERN_DEBUG "... APIC TASKPRI: %08x (%02x)\n", v, v & APIC_TPRI_MASK); - - v = apic_read(APIC_ARBPRI); - printk(KERN_DEBUG "... APIC ARBPRI: %08x (%02x)\n", v, - v & APIC_ARBPRI_MASK); - v = apic_read(APIC_PROCPRI); - printk(KERN_DEBUG "... APIC PROCPRI: %08x\n", v); - - v = apic_read(APIC_EOI); - printk(KERN_DEBUG "... APIC EOI: %08x\n", v); - v = apic_read(APIC_RRR); - printk(KERN_DEBUG "... APIC RRR: %08x\n", v); - v = apic_read(APIC_LDR); - printk(KERN_DEBUG "... APIC LDR: %08x\n", v); - v = apic_read(APIC_DFR); - printk(KERN_DEBUG "... APIC DFR: %08x\n", v); - v = apic_read(APIC_SPIV); - printk(KERN_DEBUG "... APIC SPIV: %08x\n", v); - - printk(KERN_DEBUG "... APIC ISR field:\n"); - print_APIC_bitfield(APIC_ISR); - printk(KERN_DEBUG "... APIC TMR field:\n"); - print_APIC_bitfield(APIC_TMR); - printk(KERN_DEBUG "... APIC IRR field:\n"); - print_APIC_bitfield(APIC_IRR); - - v = apic_read(APIC_ESR); - printk(KERN_DEBUG "... APIC ESR: %08x\n", v); - - v = apic_read(APIC_ICR); - printk(KERN_DEBUG "... APIC ICR: %08x\n", v); - v = apic_read(APIC_ICR2); - printk(KERN_DEBUG "... APIC ICR2: %08x\n", v); - - v = apic_read(APIC_LVTT); - printk(KERN_DEBUG "... APIC LVTT: %08x\n", v); - - if (maxlvt > 3) { /* PC is LVT#4. */ - v = apic_read(APIC_LVTPC); - printk(KERN_DEBUG "... APIC LVTPC: %08x\n", v); - } - v = apic_read(APIC_LVT0); - printk(KERN_DEBUG "... APIC LVT0: %08x\n", v); - v = apic_read(APIC_LVT1); - printk(KERN_DEBUG "... APIC LVT1: %08x\n", v); - - if (maxlvt > 2) { /* ERR is LVT#3. */ - v = apic_read(APIC_LVTERR); - printk(KERN_DEBUG "... APIC LVTERR: %08x\n", v); - } - - v = apic_read(APIC_TMICT); - printk(KERN_DEBUG "... APIC TMICT: %08x\n", v); - v = apic_read(APIC_TMCCT); - printk(KERN_DEBUG "... APIC TMCCT: %08x\n", v); - v = apic_read(APIC_TDCR); - printk(KERN_DEBUG "... APIC TDCR: %08x\n", v); - printk("\n"); -} - -void print_all_local_APICs (void) -{ - on_each_cpu(print_local_APIC, NULL, 1, 1); -} - -void __apicdebuginit print_PIC(void) -{ - unsigned int v; - unsigned long flags; - - if (apic_verbosity == APIC_QUIET) - return; - - printk(KERN_DEBUG "\nprinting PIC contents\n"); - - spin_lock_irqsave(&i8259A_lock, flags); - - v = inb(0xa1) << 8 | inb(0x21); - printk(KERN_DEBUG "... PIC IMR: %04x\n", v); - - v = inb(0xa0) << 8 | inb(0x20); - printk(KERN_DEBUG "... PIC IRR: %04x\n", v); - - outb(0x0b,0xa0); - outb(0x0b,0x20); - v = inb(0xa0) << 8 | inb(0x20); - outb(0x0a,0xa0); - outb(0x0a,0x20); - - spin_unlock_irqrestore(&i8259A_lock, flags); - - printk(KERN_DEBUG "... PIC ISR: %04x\n", v); - - v = inb(0x4d1) << 8 | inb(0x4d0); - printk(KERN_DEBUG "... PIC ELCR: %04x\n", v); -} - -#endif /* 0 */ - -static void __init enable_IO_APIC(void) -{ - union IO_APIC_reg_01 reg_01; - int i8259_apic, i8259_pin; - int i, apic; - unsigned long flags; - - for (i = 0; i < PIN_MAP_SIZE; i++) { - irq_2_pin[i].pin = -1; - irq_2_pin[i].next = 0; - } - - /* - * The number of IO-APIC IRQ registers (== #pins): - */ - for (apic = 0; apic < nr_ioapics; apic++) { - spin_lock_irqsave(&ioapic_lock, flags); - reg_01.raw = io_apic_read(apic, 1); - spin_unlock_irqrestore(&ioapic_lock, flags); - nr_ioapic_registers[apic] = reg_01.bits.entries+1; - } - for(apic = 0; apic < nr_ioapics; apic++) { - int pin; - /* See if any of the pins is in ExtINT mode */ - for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) { - struct IO_APIC_route_entry entry; - entry = ioapic_read_entry(apic, pin); - - /* If the interrupt line is enabled and in ExtInt mode - * I have found the pin where the i8259 is connected. - */ - if ((entry.mask == 0) && (entry.delivery_mode == dest_ExtINT)) { - ioapic_i8259.apic = apic; - ioapic_i8259.pin = pin; - goto found_i8259; - } - } - } - found_i8259: - /* Look to see what if the MP table has reported the ExtINT */ - i8259_pin = find_isa_irq_pin(0, mp_ExtINT); - i8259_apic = find_isa_irq_apic(0, mp_ExtINT); - /* Trust the MP table if nothing is setup in the hardware */ - if ((ioapic_i8259.pin == -1) && (i8259_pin >= 0)) { - printk(KERN_WARNING "ExtINT not setup in hardware but reported by MP table\n"); - ioapic_i8259.pin = i8259_pin; - ioapic_i8259.apic = i8259_apic; - } - /* Complain if the MP table and the hardware disagree */ - if (((ioapic_i8259.apic != i8259_apic) || (ioapic_i8259.pin != i8259_pin)) && - (i8259_pin >= 0) && (ioapic_i8259.pin >= 0)) - { - printk(KERN_WARNING "ExtINT in hardware and MP table differ\n"); - } - - /* - * Do not trust the IO-APIC being empty at bootup - */ - clear_IO_APIC(); -} - -/* - * Not an __init, needed by the reboot code - */ -void disable_IO_APIC(void) -{ - /* - * Clear the IO-APIC before rebooting: - */ - clear_IO_APIC(); - - /* - * If the i8259 is routed through an IOAPIC - * Put that IOAPIC in virtual wire mode - * so legacy interrupts can be delivered. - */ - if (ioapic_i8259.pin != -1) { - struct IO_APIC_route_entry entry; - - memset(&entry, 0, sizeof(entry)); - entry.mask = 0; /* Enabled */ - entry.trigger = 0; /* Edge */ - entry.irr = 0; - entry.polarity = 0; /* High */ - entry.delivery_status = 0; - entry.dest_mode = 0; /* Physical */ - entry.delivery_mode = dest_ExtINT; /* ExtInt */ - entry.vector = 0; - entry.dest = GET_APIC_ID(apic_read(APIC_ID)); - - /* - * Add it to the IO-APIC irq-routing table: - */ - ioapic_write_entry(ioapic_i8259.apic, ioapic_i8259.pin, entry); - } - - disconnect_bsp_APIC(ioapic_i8259.pin != -1); -} - -/* - * There is a nasty bug in some older SMP boards, their mptable lies - * about the timer IRQ. We do the following to work around the situation: - * - * - timer IRQ defaults to IO-APIC IRQ - * - if this function detects that timer IRQs are defunct, then we fall - * back to ISA timer IRQs - */ -static int __init timer_irq_works(void) -{ - unsigned long t1 = jiffies; - - local_irq_enable(); - /* Let ten ticks pass... */ - mdelay((10 * 1000) / HZ); - - /* - * Expect a few ticks at least, to be sure some possible - * glue logic does not lock up after one or two first - * ticks in a non-ExtINT mode. Also the local APIC - * might have cached one ExtINT interrupt. Finally, at - * least one tick may be lost due to delays. - */ - - /* jiffies wrap? */ - if (jiffies - t1 > 4) - return 1; - return 0; -} - -/* - * In the SMP+IOAPIC case it might happen that there are an unspecified - * number of pending IRQ events unhandled. These cases are very rare, - * so we 'resend' these IRQs via IPIs, to the same CPU. It's much - * better to do it this way as thus we do not have to be aware of - * 'pending' interrupts in the IRQ path, except at this point. - */ -/* - * Edge triggered needs to resend any interrupt - * that was delayed but this is now handled in the device - * independent code. - */ - -/* - * Starting up a edge-triggered IO-APIC interrupt is - * nasty - we need to make sure that we get the edge. - * If it is already asserted for some reason, we need - * return 1 to indicate that is was pending. - * - * This is not complete - we should be able to fake - * an edge even if it isn't on the 8259A... - */ - -static unsigned int startup_ioapic_irq(unsigned int irq) -{ - int was_pending = 0; - unsigned long flags; - - spin_lock_irqsave(&ioapic_lock, flags); - if (irq < 16) { - disable_8259A_irq(irq); - if (i8259A_irq_pending(irq)) - was_pending = 1; - } - __unmask_IO_APIC_irq(irq); - spin_unlock_irqrestore(&ioapic_lock, flags); - - return was_pending; -} - -static int ioapic_retrigger_irq(unsigned int irq) -{ - struct irq_cfg *cfg = &irq_cfg[irq]; - cpumask_t mask; - unsigned long flags; - - spin_lock_irqsave(&vector_lock, flags); - cpus_clear(mask); - cpu_set(first_cpu(cfg->domain), mask); - - send_IPI_mask(mask, cfg->vector); - spin_unlock_irqrestore(&vector_lock, flags); - - return 1; -} - -/* - * Level and edge triggered IO-APIC interrupts need different handling, - * so we use two separate IRQ descriptors. Edge triggered IRQs can be - * handled with the level-triggered descriptor, but that one has slightly - * more overhead. Level-triggered interrupts cannot be handled with the - * edge-triggered handler, without risking IRQ storms and other ugly - * races. - */ - -#ifdef CONFIG_SMP -asmlinkage void smp_irq_move_cleanup_interrupt(void) -{ - unsigned vector, me; - ack_APIC_irq(); - exit_idle(); - irq_enter(); - - me = smp_processor_id(); - for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) { - unsigned int irq; - struct irq_desc *desc; - struct irq_cfg *cfg; - irq = __get_cpu_var(vector_irq)[vector]; - if (irq >= NR_IRQS) - continue; - - desc = irq_desc + irq; - cfg = irq_cfg + irq; - spin_lock(&desc->lock); - if (!cfg->move_cleanup_count) - goto unlock; - - if ((vector == cfg->vector) && cpu_isset(me, cfg->domain)) - goto unlock; - - __get_cpu_var(vector_irq)[vector] = -1; - cfg->move_cleanup_count--; -unlock: - spin_unlock(&desc->lock); - } - - irq_exit(); -} - -static void irq_complete_move(unsigned int irq) -{ - struct irq_cfg *cfg = irq_cfg + irq; - unsigned vector, me; - - if (likely(!cfg->move_in_progress)) - return; - - vector = ~get_irq_regs()->orig_rax; - me = smp_processor_id(); - if ((vector == cfg->vector) && cpu_isset(me, cfg->domain)) { - cpumask_t cleanup_mask; - - cpus_and(cleanup_mask, cfg->old_domain, cpu_online_map); - cfg->move_cleanup_count = cpus_weight(cleanup_mask); - send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR); - cfg->move_in_progress = 0; - } -} -#else -static inline void irq_complete_move(unsigned int irq) {} -#endif - -static void ack_apic_edge(unsigned int irq) -{ - irq_complete_move(irq); - move_native_irq(irq); - ack_APIC_irq(); -} - -static void ack_apic_level(unsigned int irq) -{ - int do_unmask_irq = 0; - - irq_complete_move(irq); -#if defined(CONFIG_GENERIC_PENDING_IRQ) || defined(CONFIG_IRQBALANCE) - /* If we are moving the irq we need to mask it */ - if (unlikely(irq_desc[irq].status & IRQ_MOVE_PENDING)) { - do_unmask_irq = 1; - mask_IO_APIC_irq(irq); - } -#endif - - /* - * We must acknowledge the irq before we move it or the acknowledge will - * not propagate properly. - */ - ack_APIC_irq(); - - /* Now we can move and renable the irq */ - if (unlikely(do_unmask_irq)) { - /* Only migrate the irq if the ack has been received. - * - * On rare occasions the broadcast level triggered ack gets - * delayed going to ioapics, and if we reprogram the - * vector while Remote IRR is still set the irq will never - * fire again. - * - * To prevent this scenario we read the Remote IRR bit - * of the ioapic. This has two effects. - * - On any sane system the read of the ioapic will - * flush writes (and acks) going to the ioapic from - * this cpu. - * - We get to see if the ACK has actually been delivered. - * - * Based on failed experiments of reprogramming the - * ioapic entry from outside of irq context starting - * with masking the ioapic entry and then polling until - * Remote IRR was clear before reprogramming the - * ioapic I don't trust the Remote IRR bit to be - * completey accurate. - * - * However there appears to be no other way to plug - * this race, so if the Remote IRR bit is not - * accurate and is causing problems then it is a hardware bug - * and you can go talk to the chipset vendor about it. - */ - if (!io_apic_level_ack_pending(irq)) - move_masked_irq(irq); - unmask_IO_APIC_irq(irq); - } -} - -static struct irq_chip ioapic_chip __read_mostly = { - .name = "IO-APIC", - .startup = startup_ioapic_irq, - .mask = mask_IO_APIC_irq, - .unmask = unmask_IO_APIC_irq, - .ack = ack_apic_edge, - .eoi = ack_apic_level, -#ifdef CONFIG_SMP - .set_affinity = set_ioapic_affinity_irq, -#endif - .retrigger = ioapic_retrigger_irq, -}; - -static inline void init_IO_APIC_traps(void) -{ - int irq; - - /* - * NOTE! The local APIC isn't very good at handling - * multiple interrupts at the same interrupt level. - * As the interrupt level is determined by taking the - * vector number and shifting that right by 4, we - * want to spread these out a bit so that they don't - * all fall in the same interrupt level. - * - * Also, we've got to be careful not to trash gate - * 0x80, because int 0x80 is hm, kind of importantish. ;) - */ - for (irq = 0; irq < NR_IRQS ; irq++) { - int tmp = irq; - if (IO_APIC_IRQ(tmp) && !irq_cfg[tmp].vector) { - /* - * Hmm.. We don't have an entry for this, - * so default to an old-fashioned 8259 - * interrupt if we can.. - */ - if (irq < 16) - make_8259A_irq(irq); - else - /* Strange. Oh, well.. */ - irq_desc[irq].chip = &no_irq_chip; - } - } -} - -static void enable_lapic_irq (unsigned int irq) -{ - unsigned long v; - - v = apic_read(APIC_LVT0); - apic_write(APIC_LVT0, v & ~APIC_LVT_MASKED); -} - -static void disable_lapic_irq (unsigned int irq) -{ - unsigned long v; - - v = apic_read(APIC_LVT0); - apic_write(APIC_LVT0, v | APIC_LVT_MASKED); -} - -static void ack_lapic_irq (unsigned int irq) -{ - ack_APIC_irq(); -} - -static void end_lapic_irq (unsigned int i) { /* nothing */ } - -static struct hw_interrupt_type lapic_irq_type __read_mostly = { - .name = "local-APIC", - .typename = "local-APIC-edge", - .startup = NULL, /* startup_irq() not used for IRQ0 */ - .shutdown = NULL, /* shutdown_irq() not used for IRQ0 */ - .enable = enable_lapic_irq, - .disable = disable_lapic_irq, - .ack = ack_lapic_irq, - .end = end_lapic_irq, -}; - -static void setup_nmi (void) -{ - /* - * Dirty trick to enable the NMI watchdog ... - * We put the 8259A master into AEOI mode and - * unmask on all local APICs LVT0 as NMI. - * - * The idea to use the 8259A in AEOI mode ('8259A Virtual Wire') - * is from Maciej W. Rozycki - so we do not have to EOI from - * the NMI handler or the timer interrupt. - */ - printk(KERN_INFO "activating NMI Watchdog ..."); - - enable_NMI_through_LVT0(NULL); - - printk(" done.\n"); -} - -/* - * This looks a bit hackish but it's about the only one way of sending - * a few INTA cycles to 8259As and any associated glue logic. ICR does - * not support the ExtINT mode, unfortunately. We need to send these - * cycles as some i82489DX-based boards have glue logic that keeps the - * 8259A interrupt line asserted until INTA. --macro - */ -static inline void unlock_ExtINT_logic(void) -{ - int apic, pin, i; - struct IO_APIC_route_entry entry0, entry1; - unsigned char save_control, save_freq_select; - unsigned long flags; - - pin = find_isa_irq_pin(8, mp_INT); - apic = find_isa_irq_apic(8, mp_INT); - if (pin == -1) - return; - - spin_lock_irqsave(&ioapic_lock, flags); - *(((int *)&entry0) + 1) = io_apic_read(apic, 0x11 + 2 * pin); - *(((int *)&entry0) + 0) = io_apic_read(apic, 0x10 + 2 * pin); - spin_unlock_irqrestore(&ioapic_lock, flags); - clear_IO_APIC_pin(apic, pin); - - memset(&entry1, 0, sizeof(entry1)); - - entry1.dest_mode = 0; /* physical delivery */ - entry1.mask = 0; /* unmask IRQ now */ - entry1.dest = hard_smp_processor_id(); - entry1.delivery_mode = dest_ExtINT; - entry1.polarity = entry0.polarity; - entry1.trigger = 0; - entry1.vector = 0; - - spin_lock_irqsave(&ioapic_lock, flags); - io_apic_write(apic, 0x11 + 2 * pin, *(((int *)&entry1) + 1)); - io_apic_write(apic, 0x10 + 2 * pin, *(((int *)&entry1) + 0)); - spin_unlock_irqrestore(&ioapic_lock, flags); - - save_control = CMOS_READ(RTC_CONTROL); - save_freq_select = CMOS_READ(RTC_FREQ_SELECT); - CMOS_WRITE((save_freq_select & ~RTC_RATE_SELECT) | 0x6, - RTC_FREQ_SELECT); - CMOS_WRITE(save_control | RTC_PIE, RTC_CONTROL); - - i = 100; - while (i-- > 0) { - mdelay(10); - if ((CMOS_READ(RTC_INTR_FLAGS) & RTC_PF) == RTC_PF) - i -= 10; - } - - CMOS_WRITE(save_control, RTC_CONTROL); - CMOS_WRITE(save_freq_select, RTC_FREQ_SELECT); - clear_IO_APIC_pin(apic, pin); - - spin_lock_irqsave(&ioapic_lock, flags); - io_apic_write(apic, 0x11 + 2 * pin, *(((int *)&entry0) + 1)); - io_apic_write(apic, 0x10 + 2 * pin, *(((int *)&entry0) + 0)); - spin_unlock_irqrestore(&ioapic_lock, flags); -} - -/* - * This code may look a bit paranoid, but it's supposed to cooperate with - * a wide range of boards and BIOS bugs. Fortunately only the timer IRQ - * is so screwy. Thanks to Brian Perkins for testing/hacking this beast - * fanatically on his truly buggy board. - * - * FIXME: really need to revamp this for modern platforms only. - */ -static inline void check_timer(void) -{ - struct irq_cfg *cfg = irq_cfg + 0; - int apic1, pin1, apic2, pin2; - - /* - * get/set the timer IRQ vector: - */ - disable_8259A_irq(0); - assign_irq_vector(0, TARGET_CPUS); - - /* - * Subtle, code in do_timer_interrupt() expects an AEOI - * mode for the 8259A whenever interrupts are routed - * through I/O APICs. Also IRQ0 has to be enabled in - * the 8259A which implies the virtual wire has to be - * disabled in the local APIC. - */ - apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT); - init_8259A(1); - if (timer_over_8254 > 0) - enable_8259A_irq(0); - - pin1 = find_isa_irq_pin(0, mp_INT); - apic1 = find_isa_irq_apic(0, mp_INT); - pin2 = ioapic_i8259.pin; - apic2 = ioapic_i8259.apic; - - apic_printk(APIC_VERBOSE,KERN_INFO "..TIMER: vector=0x%02X apic1=%d pin1=%d apic2=%d pin2=%d\n", - cfg->vector, apic1, pin1, apic2, pin2); - - if (pin1 != -1) { - /* - * Ok, does IRQ0 through the IOAPIC work? - */ - unmask_IO_APIC_irq(0); - if (!no_timer_check && timer_irq_works()) { - nmi_watchdog_default(); - if (nmi_watchdog == NMI_IO_APIC) { - disable_8259A_irq(0); - setup_nmi(); - enable_8259A_irq(0); - } - if (disable_timer_pin_1 > 0) - clear_IO_APIC_pin(0, pin1); - return; - } - clear_IO_APIC_pin(apic1, pin1); - apic_printk(APIC_QUIET,KERN_ERR "..MP-BIOS bug: 8254 timer not " - "connected to IO-APIC\n"); - } - - apic_printk(APIC_VERBOSE,KERN_INFO "...trying to set up timer (IRQ0) " - "through the 8259A ... "); - if (pin2 != -1) { - apic_printk(APIC_VERBOSE,"\n..... (found apic %d pin %d) ...", - apic2, pin2); - /* - * legacy devices should be connected to IO APIC #0 - */ - setup_ExtINT_IRQ0_pin(apic2, pin2, cfg->vector); - if (timer_irq_works()) { - apic_printk(APIC_VERBOSE," works.\n"); - nmi_watchdog_default(); - if (nmi_watchdog == NMI_IO_APIC) { - setup_nmi(); - } - return; - } - /* - * Cleanup, just in case ... - */ - clear_IO_APIC_pin(apic2, pin2); - } - apic_printk(APIC_VERBOSE," failed.\n"); - - if (nmi_watchdog == NMI_IO_APIC) { - printk(KERN_WARNING "timer doesn't work through the IO-APIC - disabling NMI Watchdog!\n"); - nmi_watchdog = 0; - } - - apic_printk(APIC_VERBOSE, KERN_INFO "...trying to set up timer as Virtual Wire IRQ..."); - - disable_8259A_irq(0); - irq_desc[0].chip = &lapic_irq_type; - apic_write(APIC_LVT0, APIC_DM_FIXED | cfg->vector); /* Fixed mode */ - enable_8259A_irq(0); - - if (timer_irq_works()) { - apic_printk(APIC_VERBOSE," works.\n"); - return; - } - apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | cfg->vector); - apic_printk(APIC_VERBOSE," failed.\n"); - - apic_printk(APIC_VERBOSE, KERN_INFO "...trying to set up timer as ExtINT IRQ..."); - - init_8259A(0); - make_8259A_irq(0); - apic_write(APIC_LVT0, APIC_DM_EXTINT); - - unlock_ExtINT_logic(); - - if (timer_irq_works()) { - apic_printk(APIC_VERBOSE," works.\n"); - return; - } - apic_printk(APIC_VERBOSE," failed :(.\n"); - panic("IO-APIC + timer doesn't work! Try using the 'noapic' kernel parameter\n"); -} - -static int __init notimercheck(char *s) -{ - no_timer_check = 1; - return 1; -} -__setup("no_timer_check", notimercheck); - -/* - * - * IRQ's that are handled by the PIC in the MPS IOAPIC case. - * - IRQ2 is the cascade IRQ, and cannot be a io-apic IRQ. - * Linux doesn't really care, as it's not actually used - * for any interrupt handling anyway. - */ -#define PIC_IRQS (1<<2) - -void __init setup_IO_APIC(void) -{ - enable_IO_APIC(); - - if (acpi_ioapic) - io_apic_irqs = ~0; /* all IRQs go through IOAPIC */ - else - io_apic_irqs = ~PIC_IRQS; - - apic_printk(APIC_VERBOSE, "ENABLING IO-APIC IRQs\n"); - - sync_Arb_IDs(); - setup_IO_APIC_irqs(); - init_IO_APIC_traps(); - check_timer(); - if (!acpi_ioapic) - print_IO_APIC(); -} - -struct sysfs_ioapic_data { - struct sys_device dev; - struct IO_APIC_route_entry entry[0]; -}; -static struct sysfs_ioapic_data * mp_ioapic_data[MAX_IO_APICS]; - -static int ioapic_suspend(struct sys_device *dev, pm_message_t state) -{ - struct IO_APIC_route_entry *entry; - struct sysfs_ioapic_data *data; - int i; - - data = container_of(dev, struct sysfs_ioapic_data, dev); - entry = data->entry; - for (i = 0; i < nr_ioapic_registers[dev->id]; i ++, entry ++ ) - *entry = ioapic_read_entry(dev->id, i); - - return 0; -} - -static int ioapic_resume(struct sys_device *dev) -{ - struct IO_APIC_route_entry *entry; - struct sysfs_ioapic_data *data; - unsigned long flags; - union IO_APIC_reg_00 reg_00; - int i; - - data = container_of(dev, struct sysfs_ioapic_data, dev); - entry = data->entry; - - spin_lock_irqsave(&ioapic_lock, flags); - reg_00.raw = io_apic_read(dev->id, 0); - if (reg_00.bits.ID != mp_ioapics[dev->id].mpc_apicid) { - reg_00.bits.ID = mp_ioapics[dev->id].mpc_apicid; - io_apic_write(dev->id, 0, reg_00.raw); - } - spin_unlock_irqrestore(&ioapic_lock, flags); - for (i = 0; i < nr_ioapic_registers[dev->id]; i++) - ioapic_write_entry(dev->id, i, entry[i]); - - return 0; -} - -static struct sysdev_class ioapic_sysdev_class = { - set_kset_name("ioapic"), - .suspend = ioapic_suspend, - .resume = ioapic_resume, -}; - -static int __init ioapic_init_sysfs(void) -{ - struct sys_device * dev; - int i, size, error = 0; - - error = sysdev_class_register(&ioapic_sysdev_class); - if (error) - return error; - - for (i = 0; i < nr_ioapics; i++ ) { - size = sizeof(struct sys_device) + nr_ioapic_registers[i] - * sizeof(struct IO_APIC_route_entry); - mp_ioapic_data[i] = kmalloc(size, GFP_KERNEL); - if (!mp_ioapic_data[i]) { - printk(KERN_ERR "Can't suspend/resume IOAPIC %d\n", i); - continue; - } - memset(mp_ioapic_data[i], 0, size); - dev = &mp_ioapic_data[i]->dev; - dev->id = i; - dev->cls = &ioapic_sysdev_class; - error = sysdev_register(dev); - if (error) { - kfree(mp_ioapic_data[i]); - mp_ioapic_data[i] = NULL; - printk(KERN_ERR "Can't suspend/resume IOAPIC %d\n", i); - continue; - } - } - - return 0; -} - -device_initcall(ioapic_init_sysfs); - -/* - * Dynamic irq allocate and deallocation - */ -int create_irq(void) -{ - /* Allocate an unused irq */ - int irq; - int new; - unsigned long flags; - - irq = -ENOSPC; - spin_lock_irqsave(&vector_lock, flags); - for (new = (NR_IRQS - 1); new >= 0; new--) { - if (platform_legacy_irq(new)) - continue; - if (irq_cfg[new].vector != 0) - continue; - if (__assign_irq_vector(new, TARGET_CPUS) == 0) - irq = new; - break; - } - spin_unlock_irqrestore(&vector_lock, flags); - - if (irq >= 0) { - dynamic_irq_init(irq); - } - return irq; -} - -void destroy_irq(unsigned int irq) -{ - unsigned long flags; - - dynamic_irq_cleanup(irq); - - spin_lock_irqsave(&vector_lock, flags); - __clear_irq_vector(irq); - spin_unlock_irqrestore(&vector_lock, flags); -} - -/* - * MSI mesage composition - */ -#ifdef CONFIG_PCI_MSI -static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_msg *msg) -{ - struct irq_cfg *cfg = irq_cfg + irq; - int err; - unsigned dest; - cpumask_t tmp; - - tmp = TARGET_CPUS; - err = assign_irq_vector(irq, tmp); - if (!err) { - cpus_and(tmp, cfg->domain, tmp); - dest = cpu_mask_to_apicid(tmp); - - msg->address_hi = MSI_ADDR_BASE_HI; - msg->address_lo = - MSI_ADDR_BASE_LO | - ((INT_DEST_MODE == 0) ? - MSI_ADDR_DEST_MODE_PHYSICAL: - MSI_ADDR_DEST_MODE_LOGICAL) | - ((INT_DELIVERY_MODE != dest_LowestPrio) ? - MSI_ADDR_REDIRECTION_CPU: - MSI_ADDR_REDIRECTION_LOWPRI) | - MSI_ADDR_DEST_ID(dest); - - msg->data = - MSI_DATA_TRIGGER_EDGE | - MSI_DATA_LEVEL_ASSERT | - ((INT_DELIVERY_MODE != dest_LowestPrio) ? - MSI_DATA_DELIVERY_FIXED: - MSI_DATA_DELIVERY_LOWPRI) | - MSI_DATA_VECTOR(cfg->vector); - } - return err; -} - -#ifdef CONFIG_SMP -static void set_msi_irq_affinity(unsigned int irq, cpumask_t mask) -{ - struct irq_cfg *cfg = irq_cfg + irq; - struct msi_msg msg; - unsigned int dest; - cpumask_t tmp; - - cpus_and(tmp, mask, cpu_online_map); - if (cpus_empty(tmp)) - return; - - if (assign_irq_vector(irq, mask)) - return; - - cpus_and(tmp, cfg->domain, mask); - dest = cpu_mask_to_apicid(tmp); - - read_msi_msg(irq, &msg); - - msg.data &= ~MSI_DATA_VECTOR_MASK; - msg.data |= MSI_DATA_VECTOR(cfg->vector); - msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK; - msg.address_lo |= MSI_ADDR_DEST_ID(dest); - - write_msi_msg(irq, &msg); - irq_desc[irq].affinity = mask; -} -#endif /* CONFIG_SMP */ - -/* - * IRQ Chip for MSI PCI/PCI-X/PCI-Express Devices, - * which implement the MSI or MSI-X Capability Structure. - */ -static struct irq_chip msi_chip = { - .name = "PCI-MSI", - .unmask = unmask_msi_irq, - .mask = mask_msi_irq, - .ack = ack_apic_edge, -#ifdef CONFIG_SMP - .set_affinity = set_msi_irq_affinity, -#endif - .retrigger = ioapic_retrigger_irq, -}; - -int arch_setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc) -{ - struct msi_msg msg; - int irq, ret; - irq = create_irq(); - if (irq < 0) - return irq; - - ret = msi_compose_msg(dev, irq, &msg); - if (ret < 0) { - destroy_irq(irq); - return ret; - } - - set_irq_msi(irq, desc); - write_msi_msg(irq, &msg); - - set_irq_chip_and_handler_name(irq, &msi_chip, handle_edge_irq, "edge"); - - return 0; -} - -void arch_teardown_msi_irq(unsigned int irq) -{ - destroy_irq(irq); -} - -#endif /* CONFIG_PCI_MSI */ - -/* - * Hypertransport interrupt support - */ -#ifdef CONFIG_HT_IRQ - -#ifdef CONFIG_SMP - -static void target_ht_irq(unsigned int irq, unsigned int dest, u8 vector) -{ - struct ht_irq_msg msg; - fetch_ht_irq_msg(irq, &msg); - - msg.address_lo &= ~(HT_IRQ_LOW_VECTOR_MASK | HT_IRQ_LOW_DEST_ID_MASK); - msg.address_hi &= ~(HT_IRQ_HIGH_DEST_ID_MASK); - - msg.address_lo |= HT_IRQ_LOW_VECTOR(vector) | HT_IRQ_LOW_DEST_ID(dest); - msg.address_hi |= HT_IRQ_HIGH_DEST_ID(dest); - - write_ht_irq_msg(irq, &msg); -} - -static void set_ht_irq_affinity(unsigned int irq, cpumask_t mask) -{ - struct irq_cfg *cfg = irq_cfg + irq; - unsigned int dest; - cpumask_t tmp; - - cpus_and(tmp, mask, cpu_online_map); - if (cpus_empty(tmp)) - return; - - if (assign_irq_vector(irq, mask)) - return; - - cpus_and(tmp, cfg->domain, mask); - dest = cpu_mask_to_apicid(tmp); - - target_ht_irq(irq, dest, cfg->vector); - irq_desc[irq].affinity = mask; -} -#endif - -static struct irq_chip ht_irq_chip = { - .name = "PCI-HT", - .mask = mask_ht_irq, - .unmask = unmask_ht_irq, - .ack = ack_apic_edge, -#ifdef CONFIG_SMP - .set_affinity = set_ht_irq_affinity, -#endif - .retrigger = ioapic_retrigger_irq, -}; - -int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev) -{ - struct irq_cfg *cfg = irq_cfg + irq; - int err; - cpumask_t tmp; - - tmp = TARGET_CPUS; - err = assign_irq_vector(irq, tmp); - if (!err) { - struct ht_irq_msg msg; - unsigned dest; - - cpus_and(tmp, cfg->domain, tmp); - dest = cpu_mask_to_apicid(tmp); - - msg.address_hi = HT_IRQ_HIGH_DEST_ID(dest); - - msg.address_lo = - HT_IRQ_LOW_BASE | - HT_IRQ_LOW_DEST_ID(dest) | - HT_IRQ_LOW_VECTOR(cfg->vector) | - ((INT_DEST_MODE == 0) ? - HT_IRQ_LOW_DM_PHYSICAL : - HT_IRQ_LOW_DM_LOGICAL) | - HT_IRQ_LOW_RQEOI_EDGE | - ((INT_DELIVERY_MODE != dest_LowestPrio) ? - HT_IRQ_LOW_MT_FIXED : - HT_IRQ_LOW_MT_ARBITRATED) | - HT_IRQ_LOW_IRQ_MASKED; - - write_ht_irq_msg(irq, &msg); - - set_irq_chip_and_handler_name(irq, &ht_irq_chip, - handle_edge_irq, "edge"); - } - return err; -} -#endif /* CONFIG_HT_IRQ */ - -/* -------------------------------------------------------------------------- - ACPI-based IOAPIC Configuration - -------------------------------------------------------------------------- */ - -#ifdef CONFIG_ACPI - -#define IO_APIC_MAX_ID 0xFE - -int __init io_apic_get_redir_entries (int ioapic) -{ - union IO_APIC_reg_01 reg_01; - unsigned long flags; - - spin_lock_irqsave(&ioapic_lock, flags); - reg_01.raw = io_apic_read(ioapic, 1); - spin_unlock_irqrestore(&ioapic_lock, flags); - - return reg_01.bits.entries; -} - - -int io_apic_set_pci_routing (int ioapic, int pin, int irq, int triggering, int polarity) -{ - if (!IO_APIC_IRQ(irq)) { - apic_printk(APIC_QUIET,KERN_ERR "IOAPIC[%d]: Invalid reference to IRQ 0\n", - ioapic); - return -EINVAL; - } - - /* - * IRQs < 16 are already in the irq_2_pin[] map - */ - if (irq >= 16) - add_pin_to_irq(irq, ioapic, pin); - - setup_IO_APIC_irq(ioapic, pin, irq, triggering, polarity); - - return 0; -} - -#endif /* CONFIG_ACPI */ - - -/* - * This function currently is only a helper for the i386 smp boot process where - * we need to reprogram the ioredtbls to cater for the cpus which have come online - * so mask in all cases should simply be TARGET_CPUS - */ -#ifdef CONFIG_SMP -void __init setup_ioapic_dest(void) -{ - int pin, ioapic, irq, irq_entry; - - if (skip_ioapic_setup == 1) - return; - - for (ioapic = 0; ioapic < nr_ioapics; ioapic++) { - for (pin = 0; pin < nr_ioapic_registers[ioapic]; pin++) { - irq_entry = find_irq_entry(ioapic, pin, mp_INT); - if (irq_entry == -1) - continue; - irq = pin_2_irq(irq_entry, ioapic, pin); - - /* setup_IO_APIC_irqs could fail to get vector for some device - * when you have too many devices, because at that time only boot - * cpu is online. - */ - if (!irq_cfg[irq].vector) - setup_IO_APIC_irq(ioapic, pin, irq, - irq_trigger(irq_entry), - irq_polarity(irq_entry)); - else - set_ioapic_affinity_irq(irq, TARGET_CPUS); - } - - } -} -#endif diff --git a/arch/x86_64/kernel/ioport.c b/arch/x86_64/kernel/ioport.c deleted file mode 100644 index 653efa30b0f4..000000000000 --- a/arch/x86_64/kernel/ioport.c +++ /dev/null @@ -1,119 +0,0 @@ -/* - * linux/arch/x86_64/kernel/ioport.c - * - * This contains the io-permission bitmap code - written by obz, with changes - * by Linus. - */ - -#include <linux/sched.h> -#include <linux/kernel.h> -#include <linux/capability.h> -#include <linux/errno.h> -#include <linux/types.h> -#include <linux/ioport.h> -#include <linux/smp.h> -#include <linux/stddef.h> -#include <linux/slab.h> -#include <linux/thread_info.h> -#include <linux/syscalls.h> - -/* Set EXTENT bits starting at BASE in BITMAP to value TURN_ON. */ -static void set_bitmap(unsigned long *bitmap, unsigned int base, unsigned int extent, int new_value) -{ - int i; - if (new_value) - for (i = base; i < base + extent; i++) - __set_bit(i, bitmap); - else - for (i = base; i < base + extent; i++) - clear_bit(i, bitmap); -} - -/* - * this changes the io permissions bitmap in the current task. - */ -asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on) -{ - unsigned int i, max_long, bytes, bytes_updated; - struct thread_struct * t = ¤t->thread; - struct tss_struct * tss; - unsigned long *bitmap; - - if ((from + num <= from) || (from + num > IO_BITMAP_BITS)) - return -EINVAL; - if (turn_on && !capable(CAP_SYS_RAWIO)) - return -EPERM; - - /* - * If it's the first ioperm() call in this thread's lifetime, set the - * IO bitmap up. ioperm() is much less timing critical than clone(), - * this is why we delay this operation until now: - */ - if (!t->io_bitmap_ptr) { - bitmap = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL); - if (!bitmap) - return -ENOMEM; - - memset(bitmap, 0xff, IO_BITMAP_BYTES); - t->io_bitmap_ptr = bitmap; - set_thread_flag(TIF_IO_BITMAP); - } - - /* - * do it in the per-thread copy and in the TSS ... - * - * Disable preemption via get_cpu() - we must not switch away - * because the ->io_bitmap_max value must match the bitmap - * contents: - */ - tss = &per_cpu(init_tss, get_cpu()); - - set_bitmap(t->io_bitmap_ptr, from, num, !turn_on); - - /* - * Search for a (possibly new) maximum. This is simple and stupid, - * to keep it obviously correct: - */ - max_long = 0; - for (i = 0; i < IO_BITMAP_LONGS; i++) - if (t->io_bitmap_ptr[i] != ~0UL) - max_long = i; - - bytes = (max_long + 1) * sizeof(long); - bytes_updated = max(bytes, t->io_bitmap_max); - - t->io_bitmap_max = bytes; - - /* Update the TSS: */ - memcpy(tss->io_bitmap, t->io_bitmap_ptr, bytes_updated); - - put_cpu(); - - return 0; -} - -/* - * sys_iopl has to be used when you want to access the IO ports - * beyond the 0x3ff range: to get the full 65536 ports bitmapped - * you'd need 8kB of bitmaps/process, which is a bit excessive. - * - * Here we just change the eflags value on the stack: we allow - * only the super-user to do it. This depends on the stack-layout - * on system-call entry - see also fork() and the signal handling - * code. - */ - -asmlinkage long sys_iopl(unsigned int level, struct pt_regs *regs) -{ - unsigned int old = (regs->eflags >> 12) & 3; - - if (level > 3) - return -EINVAL; - /* Trying to gain more privileges? */ - if (level > old) { - if (!capable(CAP_SYS_RAWIO)) - return -EPERM; - } - regs->eflags = (regs->eflags &~ X86_EFLAGS_IOPL) | (level << 12); - return 0; -} diff --git a/arch/x86_64/kernel/irq.c b/arch/x86_64/kernel/irq.c deleted file mode 100644 index 39cb3fa83ebb..000000000000 --- a/arch/x86_64/kernel/irq.c +++ /dev/null @@ -1,213 +0,0 @@ -/* - * linux/arch/x86_64/kernel/irq.c - * - * Copyright (C) 1992, 1998 Linus Torvalds, Ingo Molnar - * - * This file contains the lowest level x86_64-specific interrupt - * entry and irq statistics code. All the remaining irq logic is - * done by the generic kernel/irq/ code and in the - * x86_64-specific irq controller code. (e.g. i8259.c and - * io_apic.c.) - */ - -#include <linux/kernel_stat.h> -#include <linux/interrupt.h> -#include <linux/seq_file.h> -#include <linux/module.h> -#include <linux/delay.h> -#include <asm/uaccess.h> -#include <asm/io_apic.h> -#include <asm/idle.h> -#include <asm/smp.h> - -atomic_t irq_err_count; - -#ifdef CONFIG_DEBUG_STACKOVERFLOW -/* - * Probabilistic stack overflow check: - * - * Only check the stack in process context, because everything else - * runs on the big interrupt stacks. Checking reliably is too expensive, - * so we just check from interrupts. - */ -static inline void stack_overflow_check(struct pt_regs *regs) -{ - u64 curbase = (u64)task_stack_page(current); - static unsigned long warned = -60*HZ; - - if (regs->rsp >= curbase && regs->rsp <= curbase + THREAD_SIZE && - regs->rsp < curbase + sizeof(struct thread_info) + 128 && - time_after(jiffies, warned + 60*HZ)) { - printk("do_IRQ: %s near stack overflow (cur:%Lx,rsp:%lx)\n", - current->comm, curbase, regs->rsp); - show_stack(NULL,NULL); - warned = jiffies; - } -} -#endif - -/* - * Generic, controller-independent functions: - */ - -int show_interrupts(struct seq_file *p, void *v) -{ - int i = *(loff_t *) v, j; - struct irqaction * action; - unsigned long flags; - - if (i == 0) { - seq_printf(p, " "); - for_each_online_cpu(j) - seq_printf(p, "CPU%-8d",j); - seq_putc(p, '\n'); - } - - if (i < NR_IRQS) { - spin_lock_irqsave(&irq_desc[i].lock, flags); - action = irq_desc[i].action; - if (!action) - goto skip; - seq_printf(p, "%3d: ",i); -#ifndef CONFIG_SMP - seq_printf(p, "%10u ", kstat_irqs(i)); -#else - for_each_online_cpu(j) - seq_printf(p, "%10u ", kstat_cpu(j).irqs[i]); -#endif - seq_printf(p, " %8s", irq_desc[i].chip->name); - seq_printf(p, "-%-8s", irq_desc[i].name); - - seq_printf(p, " %s", action->name); - for (action=action->next; action; action = action->next) - seq_printf(p, ", %s", action->name); - seq_putc(p, '\n'); -skip: - spin_unlock_irqrestore(&irq_desc[i].lock, flags); - } else if (i == NR_IRQS) { - seq_printf(p, "NMI: "); - for_each_online_cpu(j) - seq_printf(p, "%10u ", cpu_pda(j)->__nmi_count); - seq_putc(p, '\n'); - seq_printf(p, "LOC: "); - for_each_online_cpu(j) - seq_printf(p, "%10u ", cpu_pda(j)->apic_timer_irqs); - seq_putc(p, '\n'); - seq_printf(p, "ERR: %10u\n", atomic_read(&irq_err_count)); - } - return 0; -} - -/* - * do_IRQ handles all normal device IRQ's (the special - * SMP cross-CPU interrupts have their own specific - * handlers). - */ -asmlinkage unsigned int do_IRQ(struct pt_regs *regs) -{ - struct pt_regs *old_regs = set_irq_regs(regs); - - /* high bit used in ret_from_ code */ - unsigned vector = ~regs->orig_rax; - unsigned irq; - - exit_idle(); - irq_enter(); - irq = __get_cpu_var(vector_irq)[vector]; - -#ifdef CONFIG_DEBUG_STACKOVERFLOW - stack_overflow_check(regs); -#endif - - if (likely(irq < NR_IRQS)) - generic_handle_irq(irq); - else { - if (!disable_apic) - ack_APIC_irq(); - - if (printk_ratelimit()) - printk(KERN_EMERG "%s: %d.%d No irq handler for vector\n", - __func__, smp_processor_id(), vector); - } - - irq_exit(); - - set_irq_regs(old_regs); - return 1; -} - -#ifdef CONFIG_HOTPLUG_CPU -void fixup_irqs(cpumask_t map) -{ - unsigned int irq; - static int warned; - - for (irq = 0; irq < NR_IRQS; irq++) { - cpumask_t mask; - int break_affinity = 0; - int set_affinity = 1; - - if (irq == 2) - continue; - - /* interrupt's are disabled at this point */ - spin_lock(&irq_desc[irq].lock); - - if (!irq_has_action(irq) || - cpus_equal(irq_desc[irq].affinity, map)) { - spin_unlock(&irq_desc[irq].lock); - continue; - } - - cpus_and(mask, irq_desc[irq].affinity, map); - if (cpus_empty(mask)) { - break_affinity = 1; - mask = map; - } - - if (irq_desc[irq].chip->mask) - irq_desc[irq].chip->mask(irq); - - if (irq_desc[irq].chip->set_affinity) - irq_desc[irq].chip->set_affinity(irq, mask); - else if (!(warned++)) - set_affinity = 0; - - if (irq_desc[irq].chip->unmask) - irq_desc[irq].chip->unmask(irq); - - spin_unlock(&irq_desc[irq].lock); - - if (break_affinity && set_affinity) - printk("Broke affinity for irq %i\n", irq); - else if (!set_affinity) - printk("Cannot set affinity for irq %i\n", irq); - } - - /* That doesn't seem sufficient. Give it 1ms. */ - local_irq_enable(); - mdelay(1); - local_irq_disable(); -} -#endif - -extern void call_softirq(void); - -asmlinkage void do_softirq(void) -{ - __u32 pending; - unsigned long flags; - - if (in_interrupt()) - return; - - local_irq_save(flags); - pending = local_softirq_pending(); - /* Switch to interrupt stack */ - if (pending) { - call_softirq(); - WARN_ON_ONCE(softirq_count()); - } - local_irq_restore(flags); -} -EXPORT_SYMBOL(do_softirq); diff --git a/arch/x86_64/kernel/k8.c b/arch/x86_64/kernel/k8.c deleted file mode 100644 index 7377ccb21335..000000000000 --- a/arch/x86_64/kernel/k8.c +++ /dev/null @@ -1,123 +0,0 @@ -/* - * Shared support code for AMD K8 northbridges and derivates. - * Copyright 2006 Andi Kleen, SUSE Labs. Subject to GPLv2. - */ -#include <linux/gfp.h> -#include <linux/types.h> -#include <linux/init.h> -#include <linux/errno.h> -#include <linux/module.h> -#include <linux/spinlock.h> -#include <asm/k8.h> - -int num_k8_northbridges; -EXPORT_SYMBOL(num_k8_northbridges); - -static u32 *flush_words; - -struct pci_device_id k8_nb_ids[] = { - { PCI_DEVICE(PCI_VENDOR_ID_AMD, 0x1103) }, - { PCI_DEVICE(PCI_VENDOR_ID_AMD, 0x1203) }, - {} -}; -EXPORT_SYMBOL(k8_nb_ids); - -struct pci_dev **k8_northbridges; -EXPORT_SYMBOL(k8_northbridges); - -static struct pci_dev *next_k8_northbridge(struct pci_dev *dev) -{ - do { - dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev); - if (!dev) - break; - } while (!pci_match_id(&k8_nb_ids[0], dev)); - return dev; -} - -int cache_k8_northbridges(void) -{ - int i; - struct pci_dev *dev; - - if (num_k8_northbridges) - return 0; - - dev = NULL; - while ((dev = next_k8_northbridge(dev)) != NULL) - num_k8_northbridges++; - - k8_northbridges = kmalloc((num_k8_northbridges + 1) * sizeof(void *), - GFP_KERNEL); - if (!k8_northbridges) - return -ENOMEM; - - if (!num_k8_northbridges) { - k8_northbridges[0] = NULL; - return 0; - } - - flush_words = kmalloc(num_k8_northbridges * sizeof(u32), GFP_KERNEL); - if (!flush_words) { - kfree(k8_northbridges); - return -ENOMEM; - } - - dev = NULL; - i = 0; - while ((dev = next_k8_northbridge(dev)) != NULL) { - k8_northbridges[i] = dev; - pci_read_config_dword(dev, 0x9c, &flush_words[i++]); - } - k8_northbridges[i] = NULL; - return 0; -} -EXPORT_SYMBOL_GPL(cache_k8_northbridges); - -/* Ignores subdevice/subvendor but as far as I can figure out - they're useless anyways */ -int __init early_is_k8_nb(u32 device) -{ - struct pci_device_id *id; - u32 vendor = device & 0xffff; - device >>= 16; - for (id = k8_nb_ids; id->vendor; id++) - if (vendor == id->vendor && device == id->device) - return 1; - return 0; -} - -void k8_flush_garts(void) -{ - int flushed, i; - unsigned long flags; - static DEFINE_SPINLOCK(gart_lock); - - /* Avoid races between AGP and IOMMU. In theory it's not needed - but I'm not sure if the hardware won't lose flush requests - when another is pending. This whole thing is so expensive anyways - that it doesn't matter to serialize more. -AK */ - spin_lock_irqsave(&gart_lock, flags); - flushed = 0; - for (i = 0; i < num_k8_northbridges; i++) { - pci_write_config_dword(k8_northbridges[i], 0x9c, - flush_words[i]|1); - flushed++; - } - for (i = 0; i < num_k8_northbridges; i++) { - u32 w; - /* Make sure the hardware actually executed the flush*/ - for (;;) { - pci_read_config_dword(k8_northbridges[i], - 0x9c, &w); - if (!(w & 1)) - break; - cpu_relax(); - } - } - spin_unlock_irqrestore(&gart_lock, flags); - if (!flushed) - printk("nothing to flush?\n"); -} -EXPORT_SYMBOL_GPL(k8_flush_garts); - diff --git a/arch/x86_64/kernel/kprobes.c b/arch/x86_64/kernel/kprobes.c deleted file mode 100644 index a30e004682e2..000000000000 --- a/arch/x86_64/kernel/kprobes.c +++ /dev/null @@ -1,749 +0,0 @@ -/* - * Kernel Probes (KProbes) - * arch/x86_64/kernel/kprobes.c - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. - * - * Copyright (C) IBM Corporation, 2002, 2004 - * - * 2002-Oct Created by Vamsi Krishna S <vamsi_krishna@in.ibm.com> Kernel - * Probes initial implementation ( includes contributions from - * Rusty Russell). - * 2004-July Suparna Bhattacharya <suparna@in.ibm.com> added jumper probes - * interface to access function arguments. - * 2004-Oct Jim Keniston <kenistoj@us.ibm.com> and Prasanna S Panchamukhi - * <prasanna@in.ibm.com> adapted for x86_64 - * 2005-Mar Roland McGrath <roland@redhat.com> - * Fixed to handle %rip-relative addressing mode correctly. - * 2005-May Rusty Lynch <rusty.lynch@intel.com> - * Added function return probes functionality - */ - -#include <linux/kprobes.h> -#include <linux/ptrace.h> -#include <linux/string.h> -#include <linux/slab.h> -#include <linux/preempt.h> -#include <linux/module.h> -#include <linux/kdebug.h> - -#include <asm/pgtable.h> -#include <asm/uaccess.h> -#include <asm/alternative.h> - -void jprobe_return_end(void); -static void __kprobes arch_copy_kprobe(struct kprobe *p); - -DEFINE_PER_CPU(struct kprobe *, current_kprobe) = NULL; -DEFINE_PER_CPU(struct kprobe_ctlblk, kprobe_ctlblk); - -/* - * returns non-zero if opcode modifies the interrupt flag. - */ -static __always_inline int is_IF_modifier(kprobe_opcode_t *insn) -{ - switch (*insn) { - case 0xfa: /* cli */ - case 0xfb: /* sti */ - case 0xcf: /* iret/iretd */ - case 0x9d: /* popf/popfd */ - return 1; - } - - if (*insn >= 0x40 && *insn <= 0x4f && *++insn == 0xcf) - return 1; - return 0; -} - -int __kprobes arch_prepare_kprobe(struct kprobe *p) -{ - /* insn: must be on special executable page on x86_64. */ - p->ainsn.insn = get_insn_slot(); - if (!p->ainsn.insn) { - return -ENOMEM; - } - arch_copy_kprobe(p); - return 0; -} - -/* - * Determine if the instruction uses the %rip-relative addressing mode. - * If it does, return the address of the 32-bit displacement word. - * If not, return null. - */ -static s32 __kprobes *is_riprel(u8 *insn) -{ -#define W(row,b0,b1,b2,b3,b4,b5,b6,b7,b8,b9,ba,bb,bc,bd,be,bf) \ - (((b0##UL << 0x0)|(b1##UL << 0x1)|(b2##UL << 0x2)|(b3##UL << 0x3) | \ - (b4##UL << 0x4)|(b5##UL << 0x5)|(b6##UL << 0x6)|(b7##UL << 0x7) | \ - (b8##UL << 0x8)|(b9##UL << 0x9)|(ba##UL << 0xa)|(bb##UL << 0xb) | \ - (bc##UL << 0xc)|(bd##UL << 0xd)|(be##UL << 0xe)|(bf##UL << 0xf)) \ - << (row % 64)) - static const u64 onebyte_has_modrm[256 / 64] = { - /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ - /* ------------------------------- */ - W(0x00, 1,1,1,1,0,0,0,0,1,1,1,1,0,0,0,0)| /* 00 */ - W(0x10, 1,1,1,1,0,0,0,0,1,1,1,1,0,0,0,0)| /* 10 */ - W(0x20, 1,1,1,1,0,0,0,0,1,1,1,1,0,0,0,0)| /* 20 */ - W(0x30, 1,1,1,1,0,0,0,0,1,1,1,1,0,0,0,0), /* 30 */ - W(0x40, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)| /* 40 */ - W(0x50, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)| /* 50 */ - W(0x60, 0,0,1,1,0,0,0,0,0,1,0,1,0,0,0,0)| /* 60 */ - W(0x70, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0), /* 70 */ - W(0x80, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* 80 */ - W(0x90, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)| /* 90 */ - W(0xa0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)| /* a0 */ - W(0xb0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0), /* b0 */ - W(0xc0, 1,1,0,0,1,1,1,1,0,0,0,0,0,0,0,0)| /* c0 */ - W(0xd0, 1,1,1,1,0,0,0,0,1,1,1,1,1,1,1,1)| /* d0 */ - W(0xe0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)| /* e0 */ - W(0xf0, 0,0,0,0,0,0,1,1,0,0,0,0,0,0,1,1) /* f0 */ - /* ------------------------------- */ - /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ - }; - static const u64 twobyte_has_modrm[256 / 64] = { - /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ - /* ------------------------------- */ - W(0x00, 1,1,1,1,0,0,0,0,0,0,0,0,0,1,0,1)| /* 0f */ - W(0x10, 1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0)| /* 1f */ - W(0x20, 1,1,1,1,1,0,1,0,1,1,1,1,1,1,1,1)| /* 2f */ - W(0x30, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0), /* 3f */ - W(0x40, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* 4f */ - W(0x50, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* 5f */ - W(0x60, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* 6f */ - W(0x70, 1,1,1,1,1,1,1,0,0,0,0,0,1,1,1,1), /* 7f */ - W(0x80, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)| /* 8f */ - W(0x90, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* 9f */ - W(0xa0, 0,0,0,1,1,1,1,1,0,0,0,1,1,1,1,1)| /* af */ - W(0xb0, 1,1,1,1,1,1,1,1,0,0,1,1,1,1,1,1), /* bf */ - W(0xc0, 1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0)| /* cf */ - W(0xd0, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* df */ - W(0xe0, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* ef */ - W(0xf0, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0) /* ff */ - /* ------------------------------- */ - /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ - }; -#undef W - int need_modrm; - - /* Skip legacy instruction prefixes. */ - while (1) { - switch (*insn) { - case 0x66: - case 0x67: - case 0x2e: - case 0x3e: - case 0x26: - case 0x64: - case 0x65: - case 0x36: - case 0xf0: - case 0xf3: - case 0xf2: - ++insn; - continue; - } - break; - } - - /* Skip REX instruction prefix. */ - if ((*insn & 0xf0) == 0x40) - ++insn; - - if (*insn == 0x0f) { /* Two-byte opcode. */ - ++insn; - need_modrm = test_bit(*insn, twobyte_has_modrm); - } else { /* One-byte opcode. */ - need_modrm = test_bit(*insn, onebyte_has_modrm); - } - - if (need_modrm) { - u8 modrm = *++insn; - if ((modrm & 0xc7) == 0x05) { /* %rip+disp32 addressing mode */ - /* Displacement follows ModRM byte. */ - return (s32 *) ++insn; - } - } - - /* No %rip-relative addressing mode here. */ - return NULL; -} - -static void __kprobes arch_copy_kprobe(struct kprobe *p) -{ - s32 *ripdisp; - memcpy(p->ainsn.insn, p->addr, MAX_INSN_SIZE); - ripdisp = is_riprel(p->ainsn.insn); - if (ripdisp) { - /* - * The copied instruction uses the %rip-relative - * addressing mode. Adjust the displacement for the - * difference between the original location of this - * instruction and the location of the copy that will - * actually be run. The tricky bit here is making sure - * that the sign extension happens correctly in this - * calculation, since we need a signed 32-bit result to - * be sign-extended to 64 bits when it's added to the - * %rip value and yield the same 64-bit result that the - * sign-extension of the original signed 32-bit - * displacement would have given. - */ - s64 disp = (u8 *) p->addr + *ripdisp - (u8 *) p->ainsn.insn; - BUG_ON((s64) (s32) disp != disp); /* Sanity check. */ - *ripdisp = disp; - } - p->opcode = *p->addr; -} - -void __kprobes arch_arm_kprobe(struct kprobe *p) -{ - text_poke(p->addr, ((unsigned char []){BREAKPOINT_INSTRUCTION}), 1); -} - -void __kprobes arch_disarm_kprobe(struct kprobe *p) -{ - text_poke(p->addr, &p->opcode, 1); -} - -void __kprobes arch_remove_kprobe(struct kprobe *p) -{ - mutex_lock(&kprobe_mutex); - free_insn_slot(p->ainsn.insn, 0); - mutex_unlock(&kprobe_mutex); -} - -static void __kprobes save_previous_kprobe(struct kprobe_ctlblk *kcb) -{ - kcb->prev_kprobe.kp = kprobe_running(); - kcb->prev_kprobe.status = kcb->kprobe_status; - kcb->prev_kprobe.old_rflags = kcb->kprobe_old_rflags; - kcb->prev_kprobe.saved_rflags = kcb->kprobe_saved_rflags; -} - -static void __kprobes restore_previous_kprobe(struct kprobe_ctlblk *kcb) -{ - __get_cpu_var(current_kprobe) = kcb->prev_kprobe.kp; - kcb->kprobe_status = kcb->prev_kprobe.status; - kcb->kprobe_old_rflags = kcb->prev_kprobe.old_rflags; - kcb->kprobe_saved_rflags = kcb->prev_kprobe.saved_rflags; -} - -static void __kprobes set_current_kprobe(struct kprobe *p, struct pt_regs *regs, - struct kprobe_ctlblk *kcb) -{ - __get_cpu_var(current_kprobe) = p; - kcb->kprobe_saved_rflags = kcb->kprobe_old_rflags - = (regs->eflags & (TF_MASK | IF_MASK)); - if (is_IF_modifier(p->ainsn.insn)) - kcb->kprobe_saved_rflags &= ~IF_MASK; -} - -static void __kprobes prepare_singlestep(struct kprobe *p, struct pt_regs *regs) -{ - regs->eflags |= TF_MASK; - regs->eflags &= ~IF_MASK; - /*single step inline if the instruction is an int3*/ - if (p->opcode == BREAKPOINT_INSTRUCTION) - regs->rip = (unsigned long)p->addr; - else - regs->rip = (unsigned long)p->ainsn.insn; -} - -/* Called with kretprobe_lock held */ -void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri, - struct pt_regs *regs) -{ - unsigned long *sara = (unsigned long *)regs->rsp; - - ri->ret_addr = (kprobe_opcode_t *) *sara; - /* Replace the return addr with trampoline addr */ - *sara = (unsigned long) &kretprobe_trampoline; -} - -int __kprobes kprobe_handler(struct pt_regs *regs) -{ - struct kprobe *p; - int ret = 0; - kprobe_opcode_t *addr = (kprobe_opcode_t *)(regs->rip - sizeof(kprobe_opcode_t)); - struct kprobe_ctlblk *kcb; - - /* - * We don't want to be preempted for the entire - * duration of kprobe processing - */ - preempt_disable(); - kcb = get_kprobe_ctlblk(); - - /* Check we're not actually recursing */ - if (kprobe_running()) { - p = get_kprobe(addr); - if (p) { - if (kcb->kprobe_status == KPROBE_HIT_SS && - *p->ainsn.insn == BREAKPOINT_INSTRUCTION) { - regs->eflags &= ~TF_MASK; - regs->eflags |= kcb->kprobe_saved_rflags; - goto no_kprobe; - } else if (kcb->kprobe_status == KPROBE_HIT_SSDONE) { - /* TODO: Provide re-entrancy from - * post_kprobes_handler() and avoid exception - * stack corruption while single-stepping on - * the instruction of the new probe. - */ - arch_disarm_kprobe(p); - regs->rip = (unsigned long)p->addr; - reset_current_kprobe(); - ret = 1; - } else { - /* We have reentered the kprobe_handler(), since - * another probe was hit while within the - * handler. We here save the original kprobe - * variables and just single step on instruction - * of the new probe without calling any user - * handlers. - */ - save_previous_kprobe(kcb); - set_current_kprobe(p, regs, kcb); - kprobes_inc_nmissed_count(p); - prepare_singlestep(p, regs); - kcb->kprobe_status = KPROBE_REENTER; - return 1; - } - } else { - if (*addr != BREAKPOINT_INSTRUCTION) { - /* The breakpoint instruction was removed by - * another cpu right after we hit, no further - * handling of this interrupt is appropriate - */ - regs->rip = (unsigned long)addr; - ret = 1; - goto no_kprobe; - } - p = __get_cpu_var(current_kprobe); - if (p->break_handler && p->break_handler(p, regs)) { - goto ss_probe; - } - } - goto no_kprobe; - } - - p = get_kprobe(addr); - if (!p) { - if (*addr != BREAKPOINT_INSTRUCTION) { - /* - * The breakpoint instruction was removed right - * after we hit it. Another cpu has removed - * either a probepoint or a debugger breakpoint - * at this address. In either case, no further - * handling of this interrupt is appropriate. - * Back up over the (now missing) int3 and run - * the original instruction. - */ - regs->rip = (unsigned long)addr; - ret = 1; - } - /* Not one of ours: let kernel handle it */ - goto no_kprobe; - } - - set_current_kprobe(p, regs, kcb); - kcb->kprobe_status = KPROBE_HIT_ACTIVE; - - if (p->pre_handler && p->pre_handler(p, regs)) - /* handler has already set things up, so skip ss setup */ - return 1; - -ss_probe: - prepare_singlestep(p, regs); - kcb->kprobe_status = KPROBE_HIT_SS; - return 1; - -no_kprobe: - preempt_enable_no_resched(); - return ret; -} - -/* - * For function-return probes, init_kprobes() establishes a probepoint - * here. When a retprobed function returns, this probe is hit and - * trampoline_probe_handler() runs, calling the kretprobe's handler. - */ - void kretprobe_trampoline_holder(void) - { - asm volatile ( ".global kretprobe_trampoline\n" - "kretprobe_trampoline: \n" - "nop\n"); - } - -/* - * Called when we hit the probe point at kretprobe_trampoline - */ -int __kprobes trampoline_probe_handler(struct kprobe *p, struct pt_regs *regs) -{ - struct kretprobe_instance *ri = NULL; - struct hlist_head *head, empty_rp; - struct hlist_node *node, *tmp; - unsigned long flags, orig_ret_address = 0; - unsigned long trampoline_address =(unsigned long)&kretprobe_trampoline; - - INIT_HLIST_HEAD(&empty_rp); - spin_lock_irqsave(&kretprobe_lock, flags); - head = kretprobe_inst_table_head(current); - - /* - * It is possible to have multiple instances associated with a given - * task either because an multiple functions in the call path - * have a return probe installed on them, and/or more then one return - * return probe was registered for a target function. - * - * We can handle this because: - * - instances are always inserted at the head of the list - * - when multiple return probes are registered for the same - * function, the first instance's ret_addr will point to the - * real return address, and all the rest will point to - * kretprobe_trampoline - */ - hlist_for_each_entry_safe(ri, node, tmp, head, hlist) { - if (ri->task != current) - /* another task is sharing our hash bucket */ - continue; - - if (ri->rp && ri->rp->handler) - ri->rp->handler(ri, regs); - - orig_ret_address = (unsigned long)ri->ret_addr; - recycle_rp_inst(ri, &empty_rp); - - if (orig_ret_address != trampoline_address) - /* - * This is the real return address. Any other - * instances associated with this task are for - * other calls deeper on the call stack - */ - break; - } - - kretprobe_assert(ri, orig_ret_address, trampoline_address); - regs->rip = orig_ret_address; - - reset_current_kprobe(); - spin_unlock_irqrestore(&kretprobe_lock, flags); - preempt_enable_no_resched(); - - hlist_for_each_entry_safe(ri, node, tmp, &empty_rp, hlist) { - hlist_del(&ri->hlist); - kfree(ri); - } - /* - * By returning a non-zero value, we are telling - * kprobe_handler() that we don't want the post_handler - * to run (and have re-enabled preemption) - */ - return 1; -} - -/* - * Called after single-stepping. p->addr is the address of the - * instruction whose first byte has been replaced by the "int 3" - * instruction. To avoid the SMP problems that can occur when we - * temporarily put back the original opcode to single-step, we - * single-stepped a copy of the instruction. The address of this - * copy is p->ainsn.insn. - * - * This function prepares to return from the post-single-step - * interrupt. We have to fix up the stack as follows: - * - * 0) Except in the case of absolute or indirect jump or call instructions, - * the new rip is relative to the copied instruction. We need to make - * it relative to the original instruction. - * - * 1) If the single-stepped instruction was pushfl, then the TF and IF - * flags are set in the just-pushed eflags, and may need to be cleared. - * - * 2) If the single-stepped instruction was a call, the return address - * that is atop the stack is the address following the copied instruction. - * We need to make it the address following the original instruction. - */ -static void __kprobes resume_execution(struct kprobe *p, - struct pt_regs *regs, struct kprobe_ctlblk *kcb) -{ - unsigned long *tos = (unsigned long *)regs->rsp; - unsigned long next_rip = 0; - unsigned long copy_rip = (unsigned long)p->ainsn.insn; - unsigned long orig_rip = (unsigned long)p->addr; - kprobe_opcode_t *insn = p->ainsn.insn; - - /*skip the REX prefix*/ - if (*insn >= 0x40 && *insn <= 0x4f) - insn++; - - switch (*insn) { - case 0x9c: /* pushfl */ - *tos &= ~(TF_MASK | IF_MASK); - *tos |= kcb->kprobe_old_rflags; - break; - case 0xc3: /* ret/lret */ - case 0xcb: - case 0xc2: - case 0xca: - regs->eflags &= ~TF_MASK; - /* rip is already adjusted, no more changes required*/ - return; - case 0xe8: /* call relative - Fix return addr */ - *tos = orig_rip + (*tos - copy_rip); - break; - case 0xff: - if ((insn[1] & 0x30) == 0x10) { - /* call absolute, indirect */ - /* Fix return addr; rip is correct. */ - next_rip = regs->rip; - *tos = orig_rip + (*tos - copy_rip); - } else if (((insn[1] & 0x31) == 0x20) || /* jmp near, absolute indirect */ - ((insn[1] & 0x31) == 0x21)) { /* jmp far, absolute indirect */ - /* rip is correct. */ - next_rip = regs->rip; - } - break; - case 0xea: /* jmp absolute -- rip is correct */ - next_rip = regs->rip; - break; - default: - break; - } - - regs->eflags &= ~TF_MASK; - if (next_rip) { - regs->rip = next_rip; - } else { - regs->rip = orig_rip + (regs->rip - copy_rip); - } -} - -int __kprobes post_kprobe_handler(struct pt_regs *regs) -{ - struct kprobe *cur = kprobe_running(); - struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); - - if (!cur) - return 0; - - if ((kcb->kprobe_status != KPROBE_REENTER) && cur->post_handler) { - kcb->kprobe_status = KPROBE_HIT_SSDONE; - cur->post_handler(cur, regs, 0); - } - - resume_execution(cur, regs, kcb); - regs->eflags |= kcb->kprobe_saved_rflags; - - /* Restore the original saved kprobes variables and continue. */ - if (kcb->kprobe_status == KPROBE_REENTER) { - restore_previous_kprobe(kcb); - goto out; - } - reset_current_kprobe(); -out: - preempt_enable_no_resched(); - - /* - * if somebody else is singlestepping across a probe point, eflags - * will have TF set, in which case, continue the remaining processing - * of do_debug, as if this is not a probe hit. - */ - if (regs->eflags & TF_MASK) - return 0; - - return 1; -} - -int __kprobes kprobe_fault_handler(struct pt_regs *regs, int trapnr) -{ - struct kprobe *cur = kprobe_running(); - struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); - const struct exception_table_entry *fixup; - - switch(kcb->kprobe_status) { - case KPROBE_HIT_SS: - case KPROBE_REENTER: - /* - * We are here because the instruction being single - * stepped caused a page fault. We reset the current - * kprobe and the rip points back to the probe address - * and allow the page fault handler to continue as a - * normal page fault. - */ - regs->rip = (unsigned long)cur->addr; - regs->eflags |= kcb->kprobe_old_rflags; - if (kcb->kprobe_status == KPROBE_REENTER) - restore_previous_kprobe(kcb); - else - reset_current_kprobe(); - preempt_enable_no_resched(); - break; - case KPROBE_HIT_ACTIVE: - case KPROBE_HIT_SSDONE: - /* - * We increment the nmissed count for accounting, - * we can also use npre/npostfault count for accouting - * these specific fault cases. - */ - kprobes_inc_nmissed_count(cur); - - /* - * We come here because instructions in the pre/post - * handler caused the page_fault, this could happen - * if handler tries to access user space by - * copy_from_user(), get_user() etc. Let the - * user-specified handler try to fix it first. - */ - if (cur->fault_handler && cur->fault_handler(cur, regs, trapnr)) - return 1; - - /* - * In case the user-specified fault handler returned - * zero, try to fix up. - */ - fixup = search_exception_tables(regs->rip); - if (fixup) { - regs->rip = fixup->fixup; - return 1; - } - - /* - * fixup() could not handle it, - * Let do_page_fault() fix it. - */ - break; - default: - break; - } - return 0; -} - -/* - * Wrapper routine for handling exceptions. - */ -int __kprobes kprobe_exceptions_notify(struct notifier_block *self, - unsigned long val, void *data) -{ - struct die_args *args = (struct die_args *)data; - int ret = NOTIFY_DONE; - - if (args->regs && user_mode(args->regs)) - return ret; - - switch (val) { - case DIE_INT3: - if (kprobe_handler(args->regs)) - ret = NOTIFY_STOP; - break; - case DIE_DEBUG: - if (post_kprobe_handler(args->regs)) - ret = NOTIFY_STOP; - break; - case DIE_GPF: - case DIE_PAGE_FAULT: - /* kprobe_running() needs smp_processor_id() */ - preempt_disable(); - if (kprobe_running() && - kprobe_fault_handler(args->regs, args->trapnr)) - ret = NOTIFY_STOP; - preempt_enable(); - break; - default: - break; - } - return ret; -} - -int __kprobes setjmp_pre_handler(struct kprobe *p, struct pt_regs *regs) -{ - struct jprobe *jp = container_of(p, struct jprobe, kp); - unsigned long addr; - struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); - - kcb->jprobe_saved_regs = *regs; - kcb->jprobe_saved_rsp = (long *) regs->rsp; - addr = (unsigned long)(kcb->jprobe_saved_rsp); - /* - * As Linus pointed out, gcc assumes that the callee - * owns the argument space and could overwrite it, e.g. - * tailcall optimization. So, to be absolutely safe - * we also save and restore enough stack bytes to cover - * the argument area. - */ - memcpy(kcb->jprobes_stack, (kprobe_opcode_t *)addr, - MIN_STACK_SIZE(addr)); - regs->eflags &= ~IF_MASK; - regs->rip = (unsigned long)(jp->entry); - return 1; -} - -void __kprobes jprobe_return(void) -{ - struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); - - asm volatile (" xchg %%rbx,%%rsp \n" - " int3 \n" - " .globl jprobe_return_end \n" - " jprobe_return_end: \n" - " nop \n"::"b" - (kcb->jprobe_saved_rsp):"memory"); -} - -int __kprobes longjmp_break_handler(struct kprobe *p, struct pt_regs *regs) -{ - struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); - u8 *addr = (u8 *) (regs->rip - 1); - unsigned long stack_addr = (unsigned long)(kcb->jprobe_saved_rsp); - struct jprobe *jp = container_of(p, struct jprobe, kp); - - if ((addr > (u8 *) jprobe_return) && (addr < (u8 *) jprobe_return_end)) { - if ((long *)regs->rsp != kcb->jprobe_saved_rsp) { - struct pt_regs *saved_regs = - container_of(kcb->jprobe_saved_rsp, - struct pt_regs, rsp); - printk("current rsp %p does not match saved rsp %p\n", - (long *)regs->rsp, kcb->jprobe_saved_rsp); - printk("Saved registers for jprobe %p\n", jp); - show_registers(saved_regs); - printk("Current registers\n"); - show_registers(regs); - BUG(); - } - *regs = kcb->jprobe_saved_regs; - memcpy((kprobe_opcode_t *) stack_addr, kcb->jprobes_stack, - MIN_STACK_SIZE(stack_addr)); - preempt_enable_no_resched(); - return 1; - } - return 0; -} - -static struct kprobe trampoline_p = { - .addr = (kprobe_opcode_t *) &kretprobe_trampoline, - .pre_handler = trampoline_probe_handler -}; - -int __init arch_init_kprobes(void) -{ - return register_kprobe(&trampoline_p); -} - -int __kprobes arch_trampoline_kprobe(struct kprobe *p) -{ - if (p->addr == (kprobe_opcode_t *)&kretprobe_trampoline) - return 1; - - return 0; -} diff --git a/arch/x86_64/kernel/ldt.c b/arch/x86_64/kernel/ldt.c deleted file mode 100644 index bc9ffd5c19cc..000000000000 --- a/arch/x86_64/kernel/ldt.c +++ /dev/null @@ -1,252 +0,0 @@ -/* - * linux/arch/x86_64/kernel/ldt.c - * - * Copyright (C) 1992 Krishna Balasubramanian and Linus Torvalds - * Copyright (C) 1999 Ingo Molnar <mingo@redhat.com> - * Copyright (C) 2002 Andi Kleen - * - * This handles calls from both 32bit and 64bit mode. - */ - -#include <linux/errno.h> -#include <linux/sched.h> -#include <linux/string.h> -#include <linux/mm.h> -#include <linux/smp.h> -#include <linux/vmalloc.h> -#include <linux/slab.h> - -#include <asm/uaccess.h> -#include <asm/system.h> -#include <asm/ldt.h> -#include <asm/desc.h> -#include <asm/proto.h> - -#ifdef CONFIG_SMP /* avoids "defined but not used" warnig */ -static void flush_ldt(void *null) -{ - if (current->active_mm) - load_LDT(¤t->active_mm->context); -} -#endif - -static int alloc_ldt(mm_context_t *pc, unsigned mincount, int reload) -{ - void *oldldt; - void *newldt; - unsigned oldsize; - - if (mincount <= (unsigned)pc->size) - return 0; - oldsize = pc->size; - mincount = (mincount+511)&(~511); - if (mincount*LDT_ENTRY_SIZE > PAGE_SIZE) - newldt = vmalloc(mincount*LDT_ENTRY_SIZE); - else - newldt = kmalloc(mincount*LDT_ENTRY_SIZE, GFP_KERNEL); - - if (!newldt) - return -ENOMEM; - - if (oldsize) - memcpy(newldt, pc->ldt, oldsize*LDT_ENTRY_SIZE); - oldldt = pc->ldt; - memset(newldt+oldsize*LDT_ENTRY_SIZE, 0, (mincount-oldsize)*LDT_ENTRY_SIZE); - wmb(); - pc->ldt = newldt; - wmb(); - pc->size = mincount; - wmb(); - if (reload) { -#ifdef CONFIG_SMP - cpumask_t mask; - - preempt_disable(); - mask = cpumask_of_cpu(smp_processor_id()); - load_LDT(pc); - if (!cpus_equal(current->mm->cpu_vm_mask, mask)) - smp_call_function(flush_ldt, NULL, 1, 1); - preempt_enable(); -#else - load_LDT(pc); -#endif - } - if (oldsize) { - if (oldsize*LDT_ENTRY_SIZE > PAGE_SIZE) - vfree(oldldt); - else - kfree(oldldt); - } - return 0; -} - -static inline int copy_ldt(mm_context_t *new, mm_context_t *old) -{ - int err = alloc_ldt(new, old->size, 0); - if (err < 0) - return err; - memcpy(new->ldt, old->ldt, old->size*LDT_ENTRY_SIZE); - return 0; -} - -/* - * we do not have to muck with descriptors here, that is - * done in switch_mm() as needed. - */ -int init_new_context(struct task_struct *tsk, struct mm_struct *mm) -{ - struct mm_struct * old_mm; - int retval = 0; - - init_MUTEX(&mm->context.sem); - mm->context.size = 0; - old_mm = current->mm; - if (old_mm && old_mm->context.size > 0) { - down(&old_mm->context.sem); - retval = copy_ldt(&mm->context, &old_mm->context); - up(&old_mm->context.sem); - } - return retval; -} - -/* - * - * Don't touch the LDT register - we're already in the next thread. - */ -void destroy_context(struct mm_struct *mm) -{ - if (mm->context.size) { - if ((unsigned)mm->context.size*LDT_ENTRY_SIZE > PAGE_SIZE) - vfree(mm->context.ldt); - else - kfree(mm->context.ldt); - mm->context.size = 0; - } -} - -static int read_ldt(void __user * ptr, unsigned long bytecount) -{ - int err; - unsigned long size; - struct mm_struct * mm = current->mm; - - if (!mm->context.size) - return 0; - if (bytecount > LDT_ENTRY_SIZE*LDT_ENTRIES) - bytecount = LDT_ENTRY_SIZE*LDT_ENTRIES; - - down(&mm->context.sem); - size = mm->context.size*LDT_ENTRY_SIZE; - if (size > bytecount) - size = bytecount; - - err = 0; - if (copy_to_user(ptr, mm->context.ldt, size)) - err = -EFAULT; - up(&mm->context.sem); - if (err < 0) - goto error_return; - if (size != bytecount) { - /* zero-fill the rest */ - if (clear_user(ptr+size, bytecount-size) != 0) { - err = -EFAULT; - goto error_return; - } - } - return bytecount; -error_return: - return err; -} - -static int read_default_ldt(void __user * ptr, unsigned long bytecount) -{ - /* Arbitrary number */ - /* x86-64 default LDT is all zeros */ - if (bytecount > 128) - bytecount = 128; - if (clear_user(ptr, bytecount)) - return -EFAULT; - return bytecount; -} - -static int write_ldt(void __user * ptr, unsigned long bytecount, int oldmode) -{ - struct task_struct *me = current; - struct mm_struct * mm = me->mm; - __u32 entry_1, entry_2, *lp; - int error; - struct user_desc ldt_info; - - error = -EINVAL; - - if (bytecount != sizeof(ldt_info)) - goto out; - error = -EFAULT; - if (copy_from_user(&ldt_info, ptr, bytecount)) - goto out; - - error = -EINVAL; - if (ldt_info.entry_number >= LDT_ENTRIES) - goto out; - if (ldt_info.contents == 3) { - if (oldmode) - goto out; - if (ldt_info.seg_not_present == 0) - goto out; - } - - down(&mm->context.sem); - if (ldt_info.entry_number >= (unsigned)mm->context.size) { - error = alloc_ldt(¤t->mm->context, ldt_info.entry_number+1, 1); - if (error < 0) - goto out_unlock; - } - - lp = (__u32 *) ((ldt_info.entry_number << 3) + (char *) mm->context.ldt); - - /* Allow LDTs to be cleared by the user. */ - if (ldt_info.base_addr == 0 && ldt_info.limit == 0) { - if (oldmode || LDT_empty(&ldt_info)) { - entry_1 = 0; - entry_2 = 0; - goto install; - } - } - - entry_1 = LDT_entry_a(&ldt_info); - entry_2 = LDT_entry_b(&ldt_info); - if (oldmode) - entry_2 &= ~(1 << 20); - - /* Install the new entry ... */ -install: - *lp = entry_1; - *(lp+1) = entry_2; - error = 0; - -out_unlock: - up(&mm->context.sem); -out: - return error; -} - -asmlinkage int sys_modify_ldt(int func, void __user *ptr, unsigned long bytecount) -{ - int ret = -ENOSYS; - - switch (func) { - case 0: - ret = read_ldt(ptr, bytecount); - break; - case 1: - ret = write_ldt(ptr, bytecount, 1); - break; - case 2: - ret = read_default_ldt(ptr, bytecount); - break; - case 0x11: - ret = write_ldt(ptr, bytecount, 0); - break; - } - return ret; -} diff --git a/arch/x86_64/kernel/machine_kexec.c b/arch/x86_64/kernel/machine_kexec.c deleted file mode 100644 index c3a554703672..000000000000 --- a/arch/x86_64/kernel/machine_kexec.c +++ /dev/null @@ -1,259 +0,0 @@ -/* - * machine_kexec.c - handle transition of Linux booting another kernel - * Copyright (C) 2002-2005 Eric Biederman <ebiederm@xmission.com> - * - * This source code is licensed under the GNU General Public License, - * Version 2. See the file COPYING for more details. - */ - -#include <linux/mm.h> -#include <linux/kexec.h> -#include <linux/string.h> -#include <linux/reboot.h> -#include <asm/pgtable.h> -#include <asm/tlbflush.h> -#include <asm/mmu_context.h> -#include <asm/io.h> - -#define PAGE_ALIGNED __attribute__ ((__aligned__(PAGE_SIZE))) -static u64 kexec_pgd[512] PAGE_ALIGNED; -static u64 kexec_pud0[512] PAGE_ALIGNED; -static u64 kexec_pmd0[512] PAGE_ALIGNED; -static u64 kexec_pte0[512] PAGE_ALIGNED; -static u64 kexec_pud1[512] PAGE_ALIGNED; -static u64 kexec_pmd1[512] PAGE_ALIGNED; -static u64 kexec_pte1[512] PAGE_ALIGNED; - -static void init_level2_page(pmd_t *level2p, unsigned long addr) -{ - unsigned long end_addr; - - addr &= PAGE_MASK; - end_addr = addr + PUD_SIZE; - while (addr < end_addr) { - set_pmd(level2p++, __pmd(addr | __PAGE_KERNEL_LARGE_EXEC)); - addr += PMD_SIZE; - } -} - -static int init_level3_page(struct kimage *image, pud_t *level3p, - unsigned long addr, unsigned long last_addr) -{ - unsigned long end_addr; - int result; - - result = 0; - addr &= PAGE_MASK; - end_addr = addr + PGDIR_SIZE; - while ((addr < last_addr) && (addr < end_addr)) { - struct page *page; - pmd_t *level2p; - - page = kimage_alloc_control_pages(image, 0); - if (!page) { - result = -ENOMEM; - goto out; - } - level2p = (pmd_t *)page_address(page); - init_level2_page(level2p, addr); - set_pud(level3p++, __pud(__pa(level2p) | _KERNPG_TABLE)); - addr += PUD_SIZE; - } - /* clear the unused entries */ - while (addr < end_addr) { - pud_clear(level3p++); - addr += PUD_SIZE; - } -out: - return result; -} - - -static int init_level4_page(struct kimage *image, pgd_t *level4p, - unsigned long addr, unsigned long last_addr) -{ - unsigned long end_addr; - int result; - - result = 0; - addr &= PAGE_MASK; - end_addr = addr + (PTRS_PER_PGD * PGDIR_SIZE); - while ((addr < last_addr) && (addr < end_addr)) { - struct page *page; - pud_t *level3p; - - page = kimage_alloc_control_pages(image, 0); - if (!page) { - result = -ENOMEM; - goto out; - } - level3p = (pud_t *)page_address(page); - result = init_level3_page(image, level3p, addr, last_addr); - if (result) { - goto out; - } - set_pgd(level4p++, __pgd(__pa(level3p) | _KERNPG_TABLE)); - addr += PGDIR_SIZE; - } - /* clear the unused entries */ - while (addr < end_addr) { - pgd_clear(level4p++); - addr += PGDIR_SIZE; - } -out: - return result; -} - - -static int init_pgtable(struct kimage *image, unsigned long start_pgtable) -{ - pgd_t *level4p; - level4p = (pgd_t *)__va(start_pgtable); - return init_level4_page(image, level4p, 0, end_pfn << PAGE_SHIFT); -} - -static void set_idt(void *newidt, u16 limit) -{ - struct desc_ptr curidt; - - /* x86-64 supports unaliged loads & stores */ - curidt.size = limit; - curidt.address = (unsigned long)newidt; - - __asm__ __volatile__ ( - "lidtq %0\n" - : : "m" (curidt) - ); -}; - - -static void set_gdt(void *newgdt, u16 limit) -{ - struct desc_ptr curgdt; - - /* x86-64 supports unaligned loads & stores */ - curgdt.size = limit; - curgdt.address = (unsigned long)newgdt; - - __asm__ __volatile__ ( - "lgdtq %0\n" - : : "m" (curgdt) - ); -}; - -static void load_segments(void) -{ - __asm__ __volatile__ ( - "\tmovl %0,%%ds\n" - "\tmovl %0,%%es\n" - "\tmovl %0,%%ss\n" - "\tmovl %0,%%fs\n" - "\tmovl %0,%%gs\n" - : : "a" (__KERNEL_DS) : "memory" - ); -} - -int machine_kexec_prepare(struct kimage *image) -{ - unsigned long start_pgtable; - int result; - - /* Calculate the offsets */ - start_pgtable = page_to_pfn(image->control_code_page) << PAGE_SHIFT; - - /* Setup the identity mapped 64bit page table */ - result = init_pgtable(image, start_pgtable); - if (result) - return result; - - return 0; -} - -void machine_kexec_cleanup(struct kimage *image) -{ - return; -} - -/* - * Do not allocate memory (or fail in any way) in machine_kexec(). - * We are past the point of no return, committed to rebooting now. - */ -NORET_TYPE void machine_kexec(struct kimage *image) -{ - unsigned long page_list[PAGES_NR]; - void *control_page; - - /* Interrupts aren't acceptable while we reboot */ - local_irq_disable(); - - control_page = page_address(image->control_code_page) + PAGE_SIZE; - memcpy(control_page, relocate_kernel, PAGE_SIZE); - - page_list[PA_CONTROL_PAGE] = virt_to_phys(control_page); - page_list[VA_CONTROL_PAGE] = (unsigned long)relocate_kernel; - page_list[PA_PGD] = virt_to_phys(&kexec_pgd); - page_list[VA_PGD] = (unsigned long)kexec_pgd; - page_list[PA_PUD_0] = virt_to_phys(&kexec_pud0); - page_list[VA_PUD_0] = (unsigned long)kexec_pud0; - page_list[PA_PMD_0] = virt_to_phys(&kexec_pmd0); - page_list[VA_PMD_0] = (unsigned long)kexec_pmd0; - page_list[PA_PTE_0] = virt_to_phys(&kexec_pte0); - page_list[VA_PTE_0] = (unsigned long)kexec_pte0; - page_list[PA_PUD_1] = virt_to_phys(&kexec_pud1); - page_list[VA_PUD_1] = (unsigned long)kexec_pud1; - page_list[PA_PMD_1] = virt_to_phys(&kexec_pmd1); - page_list[VA_PMD_1] = (unsigned long)kexec_pmd1; - page_list[PA_PTE_1] = virt_to_phys(&kexec_pte1); - page_list[VA_PTE_1] = (unsigned long)kexec_pte1; - - page_list[PA_TABLE_PAGE] = - (unsigned long)__pa(page_address(image->control_code_page)); - - /* The segment registers are funny things, they have both a - * visible and an invisible part. Whenever the visible part is - * set to a specific selector, the invisible part is loaded - * with from a table in memory. At no other time is the - * descriptor table in memory accessed. - * - * I take advantage of this here by force loading the - * segments, before I zap the gdt with an invalid value. - */ - load_segments(); - /* The gdt & idt are now invalid. - * If you want to load them you must set up your own idt & gdt. - */ - set_gdt(phys_to_virt(0),0); - set_idt(phys_to_virt(0),0); - - /* now call it */ - relocate_kernel((unsigned long)image->head, (unsigned long)page_list, - image->start); -} - -/* crashkernel=size@addr specifies the location to reserve for - * a crash kernel. By reserving this memory we guarantee - * that linux never set's it up as a DMA target. - * Useful for holding code to do something appropriate - * after a kernel panic. - */ -static int __init setup_crashkernel(char *arg) -{ - unsigned long size, base; - char *p; - if (!arg) - return -EINVAL; - size = memparse(arg, &p); - if (arg == p) - return -EINVAL; - if (*p == '@') { - base = memparse(p+1, &p); - /* FIXME: Do I want a sanity check to validate the - * memory range? Yes you do, but it's too early for - * e820 -AK */ - crashk_res.start = base; - crashk_res.end = base + size - 1; - } - return 0; -} -early_param("crashkernel", setup_crashkernel); - diff --git a/arch/x86_64/kernel/mce.c b/arch/x86_64/kernel/mce.c deleted file mode 100644 index a66d607f5b92..000000000000 --- a/arch/x86_64/kernel/mce.c +++ /dev/null @@ -1,875 +0,0 @@ -/* - * Machine check handler. - * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs. - * Rest from unknown author(s). - * 2004 Andi Kleen. Rewrote most of it. - */ - -#include <linux/init.h> -#include <linux/types.h> -#include <linux/kernel.h> -#include <linux/sched.h> -#include <linux/string.h> -#include <linux/rcupdate.h> -#include <linux/kallsyms.h> -#include <linux/sysdev.h> -#include <linux/miscdevice.h> -#include <linux/fs.h> -#include <linux/capability.h> -#include <linux/cpu.h> -#include <linux/percpu.h> -#include <linux/poll.h> -#include <linux/thread_info.h> -#include <linux/ctype.h> -#include <linux/kmod.h> -#include <linux/kdebug.h> -#include <asm/processor.h> -#include <asm/msr.h> -#include <asm/mce.h> -#include <asm/uaccess.h> -#include <asm/smp.h> -#include <asm/idle.h> - -#define MISC_MCELOG_MINOR 227 -#define NR_BANKS 6 - -atomic_t mce_entry; - -static int mce_dont_init; - -/* - * Tolerant levels: - * 0: always panic on uncorrected errors, log corrected errors - * 1: panic or SIGBUS on uncorrected errors, log corrected errors - * 2: SIGBUS or log uncorrected errors (if possible), log corrected errors - * 3: never panic or SIGBUS, log all errors (for testing only) - */ -static int tolerant = 1; -static int banks; -static unsigned long bank[NR_BANKS] = { [0 ... NR_BANKS-1] = ~0UL }; -static unsigned long notify_user; -static int rip_msr; -static int mce_bootlog = 1; -static atomic_t mce_events; - -static char trigger[128]; -static char *trigger_argv[2] = { trigger, NULL }; - -static DECLARE_WAIT_QUEUE_HEAD(mce_wait); - -/* - * Lockless MCE logging infrastructure. - * This avoids deadlocks on printk locks without having to break locks. Also - * separate MCEs from kernel messages to avoid bogus bug reports. - */ - -struct mce_log mcelog = { - MCE_LOG_SIGNATURE, - MCE_LOG_LEN, -}; - -void mce_log(struct mce *mce) -{ - unsigned next, entry; - atomic_inc(&mce_events); - mce->finished = 0; - wmb(); - for (;;) { - entry = rcu_dereference(mcelog.next); - /* The rmb forces the compiler to reload next in each - iteration */ - rmb(); - for (;;) { - /* When the buffer fills up discard new entries. Assume - that the earlier errors are the more interesting. */ - if (entry >= MCE_LOG_LEN) { - set_bit(MCE_OVERFLOW, &mcelog.flags); - return; - } - /* Old left over entry. Skip. */ - if (mcelog.entry[entry].finished) { - entry++; - continue; - } - break; - } - smp_rmb(); - next = entry + 1; - if (cmpxchg(&mcelog.next, entry, next) == entry) - break; - } - memcpy(mcelog.entry + entry, mce, sizeof(struct mce)); - wmb(); - mcelog.entry[entry].finished = 1; - wmb(); - - set_bit(0, ¬ify_user); -} - -static void print_mce(struct mce *m) -{ - printk(KERN_EMERG "\n" - KERN_EMERG "HARDWARE ERROR\n" - KERN_EMERG - "CPU %d: Machine Check Exception: %16Lx Bank %d: %016Lx\n", - m->cpu, m->mcgstatus, m->bank, m->status); - if (m->rip) { - printk(KERN_EMERG - "RIP%s %02x:<%016Lx> ", - !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "", - m->cs, m->rip); - if (m->cs == __KERNEL_CS) - print_symbol("{%s}", m->rip); - printk("\n"); - } - printk(KERN_EMERG "TSC %Lx ", m->tsc); - if (m->addr) - printk("ADDR %Lx ", m->addr); - if (m->misc) - printk("MISC %Lx ", m->misc); - printk("\n"); - printk(KERN_EMERG "This is not a software problem!\n"); - printk(KERN_EMERG - "Run through mcelog --ascii to decode and contact your hardware vendor\n"); -} - -static void mce_panic(char *msg, struct mce *backup, unsigned long start) -{ - int i; - - oops_begin(); - for (i = 0; i < MCE_LOG_LEN; i++) { - unsigned long tsc = mcelog.entry[i].tsc; - if (time_before(tsc, start)) - continue; - print_mce(&mcelog.entry[i]); - if (backup && mcelog.entry[i].tsc == backup->tsc) - backup = NULL; - } - if (backup) - print_mce(backup); - panic(msg); -} - -static int mce_available(struct cpuinfo_x86 *c) -{ - return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA); -} - -static inline void mce_get_rip(struct mce *m, struct pt_regs *regs) -{ - if (regs && (m->mcgstatus & MCG_STATUS_RIPV)) { - m->rip = regs->rip; - m->cs = regs->cs; - } else { - m->rip = 0; - m->cs = 0; - } - if (rip_msr) { - /* Assume the RIP in the MSR is exact. Is this true? */ - m->mcgstatus |= MCG_STATUS_EIPV; - rdmsrl(rip_msr, m->rip); - m->cs = 0; - } -} - -/* - * The actual machine check handler - */ - -void do_machine_check(struct pt_regs * regs, long error_code) -{ - struct mce m, panicm; - u64 mcestart = 0; - int i; - int panicm_found = 0; - /* - * If no_way_out gets set, there is no safe way to recover from this - * MCE. If tolerant is cranked up, we'll try anyway. - */ - int no_way_out = 0; - /* - * If kill_it gets set, there might be a way to recover from this - * error. - */ - int kill_it = 0; - - atomic_inc(&mce_entry); - - if (regs) - notify_die(DIE_NMI, "machine check", regs, error_code, 18, SIGKILL); - if (!banks) - goto out2; - - memset(&m, 0, sizeof(struct mce)); - m.cpu = smp_processor_id(); - rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus); - /* if the restart IP is not valid, we're done for */ - if (!(m.mcgstatus & MCG_STATUS_RIPV)) - no_way_out = 1; - - rdtscll(mcestart); - barrier(); - - for (i = 0; i < banks; i++) { - if (!bank[i]) - continue; - - m.misc = 0; - m.addr = 0; - m.bank = i; - m.tsc = 0; - - rdmsrl(MSR_IA32_MC0_STATUS + i*4, m.status); - if ((m.status & MCI_STATUS_VAL) == 0) - continue; - - if (m.status & MCI_STATUS_EN) { - /* if PCC was set, there's no way out */ - no_way_out |= !!(m.status & MCI_STATUS_PCC); - /* - * If this error was uncorrectable and there was - * an overflow, we're in trouble. If no overflow, - * we might get away with just killing a task. - */ - if (m.status & MCI_STATUS_UC) { - if (tolerant < 1 || m.status & MCI_STATUS_OVER) - no_way_out = 1; - kill_it = 1; - } - } - - if (m.status & MCI_STATUS_MISCV) - rdmsrl(MSR_IA32_MC0_MISC + i*4, m.misc); - if (m.status & MCI_STATUS_ADDRV) - rdmsrl(MSR_IA32_MC0_ADDR + i*4, m.addr); - - mce_get_rip(&m, regs); - if (error_code >= 0) - rdtscll(m.tsc); - if (error_code != -2) - mce_log(&m); - - /* Did this bank cause the exception? */ - /* Assume that the bank with uncorrectable errors did it, - and that there is only a single one. */ - if ((m.status & MCI_STATUS_UC) && (m.status & MCI_STATUS_EN)) { - panicm = m; - panicm_found = 1; - } - - add_taint(TAINT_MACHINE_CHECK); - } - - /* Never do anything final in the polling timer */ - if (!regs) - goto out; - - /* If we didn't find an uncorrectable error, pick - the last one (shouldn't happen, just being safe). */ - if (!panicm_found) - panicm = m; - - /* - * If we have decided that we just CAN'T continue, and the user - * has not set tolerant to an insane level, give up and die. - */ - if (no_way_out && tolerant < 3) - mce_panic("Machine check", &panicm, mcestart); - - /* - * If the error seems to be unrecoverable, something should be - * done. Try to kill as little as possible. If we can kill just - * one task, do that. If the user has set the tolerance very - * high, don't try to do anything at all. - */ - if (kill_it && tolerant < 3) { - int user_space = 0; - - /* - * If the EIPV bit is set, it means the saved IP is the - * instruction which caused the MCE. - */ - if (m.mcgstatus & MCG_STATUS_EIPV) - user_space = panicm.rip && (panicm.cs & 3); - - /* - * If we know that the error was in user space, send a - * SIGBUS. Otherwise, panic if tolerance is low. - * - * do_exit() takes an awful lot of locks and has a slight - * risk of deadlocking. - */ - if (user_space) { - do_exit(SIGBUS); - } else if (panic_on_oops || tolerant < 2) { - mce_panic("Uncorrected machine check", - &panicm, mcestart); - } - } - - /* notify userspace ASAP */ - set_thread_flag(TIF_MCE_NOTIFY); - - out: - /* the last thing we do is clear state */ - for (i = 0; i < banks; i++) - wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0); - wrmsrl(MSR_IA32_MCG_STATUS, 0); - out2: - atomic_dec(&mce_entry); -} - -#ifdef CONFIG_X86_MCE_INTEL -/*** - * mce_log_therm_throt_event - Logs the thermal throttling event to mcelog - * @cpu: The CPU on which the event occured. - * @status: Event status information - * - * This function should be called by the thermal interrupt after the - * event has been processed and the decision was made to log the event - * further. - * - * The status parameter will be saved to the 'status' field of 'struct mce' - * and historically has been the register value of the - * MSR_IA32_THERMAL_STATUS (Intel) msr. - */ -void mce_log_therm_throt_event(unsigned int cpu, __u64 status) -{ - struct mce m; - - memset(&m, 0, sizeof(m)); - m.cpu = cpu; - m.bank = MCE_THERMAL_BANK; - m.status = status; - rdtscll(m.tsc); - mce_log(&m); -} -#endif /* CONFIG_X86_MCE_INTEL */ - -/* - * Periodic polling timer for "silent" machine check errors. If the - * poller finds an MCE, poll 2x faster. When the poller finds no more - * errors, poll 2x slower (up to check_interval seconds). - */ - -static int check_interval = 5 * 60; /* 5 minutes */ -static int next_interval; /* in jiffies */ -static void mcheck_timer(struct work_struct *work); -static DECLARE_DELAYED_WORK(mcheck_work, mcheck_timer); - -static void mcheck_check_cpu(void *info) -{ - if (mce_available(¤t_cpu_data)) - do_machine_check(NULL, 0); -} - -static void mcheck_timer(struct work_struct *work) -{ - on_each_cpu(mcheck_check_cpu, NULL, 1, 1); - - /* - * Alert userspace if needed. If we logged an MCE, reduce the - * polling interval, otherwise increase the polling interval. - */ - if (mce_notify_user()) { - next_interval = max(next_interval/2, HZ/100); - } else { - next_interval = min(next_interval*2, - (int)round_jiffies_relative(check_interval*HZ)); - } - - schedule_delayed_work(&mcheck_work, next_interval); -} - -/* - * This is only called from process context. This is where we do - * anything we need to alert userspace about new MCEs. This is called - * directly from the poller and also from entry.S and idle, thanks to - * TIF_MCE_NOTIFY. - */ -int mce_notify_user(void) -{ - clear_thread_flag(TIF_MCE_NOTIFY); - if (test_and_clear_bit(0, ¬ify_user)) { - static unsigned long last_print; - unsigned long now = jiffies; - - wake_up_interruptible(&mce_wait); - if (trigger[0]) - call_usermodehelper(trigger, trigger_argv, NULL, - UMH_NO_WAIT); - - if (time_after_eq(now, last_print + (check_interval*HZ))) { - last_print = now; - printk(KERN_INFO "Machine check events logged\n"); - } - - return 1; - } - return 0; -} - -/* see if the idle task needs to notify userspace */ -static int -mce_idle_callback(struct notifier_block *nfb, unsigned long action, void *junk) -{ - /* IDLE_END should be safe - interrupts are back on */ - if (action == IDLE_END && test_thread_flag(TIF_MCE_NOTIFY)) - mce_notify_user(); - - return NOTIFY_OK; -} - -static struct notifier_block mce_idle_notifier = { - .notifier_call = mce_idle_callback, -}; - -static __init int periodic_mcheck_init(void) -{ - next_interval = check_interval * HZ; - if (next_interval) - schedule_delayed_work(&mcheck_work, - round_jiffies_relative(next_interval)); - idle_notifier_register(&mce_idle_notifier); - return 0; -} -__initcall(periodic_mcheck_init); - - -/* - * Initialize Machine Checks for a CPU. - */ -static void mce_init(void *dummy) -{ - u64 cap; - int i; - - rdmsrl(MSR_IA32_MCG_CAP, cap); - banks = cap & 0xff; - if (banks > NR_BANKS) { - printk(KERN_INFO "MCE: warning: using only %d banks\n", banks); - banks = NR_BANKS; - } - /* Use accurate RIP reporting if available. */ - if ((cap & (1<<9)) && ((cap >> 16) & 0xff) >= 9) - rip_msr = MSR_IA32_MCG_EIP; - - /* Log the machine checks left over from the previous reset. - This also clears all registers */ - do_machine_check(NULL, mce_bootlog ? -1 : -2); - - set_in_cr4(X86_CR4_MCE); - - if (cap & MCG_CTL_P) - wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff); - - for (i = 0; i < banks; i++) { - wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]); - wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0); - } -} - -/* Add per CPU specific workarounds here */ -static void __cpuinit mce_cpu_quirks(struct cpuinfo_x86 *c) -{ - /* This should be disabled by the BIOS, but isn't always */ - if (c->x86_vendor == X86_VENDOR_AMD && c->x86 == 15) { - /* disable GART TBL walk error reporting, which trips off - incorrectly with the IOMMU & 3ware & Cerberus. */ - clear_bit(10, &bank[4]); - /* Lots of broken BIOS around that don't clear them - by default and leave crap in there. Don't log. */ - mce_bootlog = 0; - } - -} - -static void __cpuinit mce_cpu_features(struct cpuinfo_x86 *c) -{ - switch (c->x86_vendor) { - case X86_VENDOR_INTEL: - mce_intel_feature_init(c); - break; - case X86_VENDOR_AMD: - mce_amd_feature_init(c); - break; - default: - break; - } -} - -/* - * Called for each booted CPU to set up machine checks. - * Must be called with preempt off. - */ -void __cpuinit mcheck_init(struct cpuinfo_x86 *c) -{ - static cpumask_t mce_cpus = CPU_MASK_NONE; - - mce_cpu_quirks(c); - - if (mce_dont_init || - cpu_test_and_set(smp_processor_id(), mce_cpus) || - !mce_available(c)) - return; - - mce_init(NULL); - mce_cpu_features(c); -} - -/* - * Character device to read and clear the MCE log. - */ - -static DEFINE_SPINLOCK(mce_state_lock); -static int open_count; /* #times opened */ -static int open_exclu; /* already open exclusive? */ - -static int mce_open(struct inode *inode, struct file *file) -{ - spin_lock(&mce_state_lock); - - if (open_exclu || (open_count && (file->f_flags & O_EXCL))) { - spin_unlock(&mce_state_lock); - return -EBUSY; - } - - if (file->f_flags & O_EXCL) - open_exclu = 1; - open_count++; - - spin_unlock(&mce_state_lock); - - return nonseekable_open(inode, file); -} - -static int mce_release(struct inode *inode, struct file *file) -{ - spin_lock(&mce_state_lock); - - open_count--; - open_exclu = 0; - - spin_unlock(&mce_state_lock); - - return 0; -} - -static void collect_tscs(void *data) -{ - unsigned long *cpu_tsc = (unsigned long *)data; - rdtscll(cpu_tsc[smp_processor_id()]); -} - -static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize, loff_t *off) -{ - unsigned long *cpu_tsc; - static DECLARE_MUTEX(mce_read_sem); - unsigned next; - char __user *buf = ubuf; - int i, err; - - cpu_tsc = kmalloc(NR_CPUS * sizeof(long), GFP_KERNEL); - if (!cpu_tsc) - return -ENOMEM; - - down(&mce_read_sem); - next = rcu_dereference(mcelog.next); - - /* Only supports full reads right now */ - if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce)) { - up(&mce_read_sem); - kfree(cpu_tsc); - return -EINVAL; - } - - err = 0; - for (i = 0; i < next; i++) { - unsigned long start = jiffies; - while (!mcelog.entry[i].finished) { - if (time_after_eq(jiffies, start + 2)) { - memset(mcelog.entry + i,0, sizeof(struct mce)); - goto timeout; - } - cpu_relax(); - } - smp_rmb(); - err |= copy_to_user(buf, mcelog.entry + i, sizeof(struct mce)); - buf += sizeof(struct mce); - timeout: - ; - } - - memset(mcelog.entry, 0, next * sizeof(struct mce)); - mcelog.next = 0; - - synchronize_sched(); - - /* Collect entries that were still getting written before the synchronize. */ - - on_each_cpu(collect_tscs, cpu_tsc, 1, 1); - for (i = next; i < MCE_LOG_LEN; i++) { - if (mcelog.entry[i].finished && - mcelog.entry[i].tsc < cpu_tsc[mcelog.entry[i].cpu]) { - err |= copy_to_user(buf, mcelog.entry+i, sizeof(struct mce)); - smp_rmb(); - buf += sizeof(struct mce); - memset(&mcelog.entry[i], 0, sizeof(struct mce)); - } - } - up(&mce_read_sem); - kfree(cpu_tsc); - return err ? -EFAULT : buf - ubuf; -} - -static unsigned int mce_poll(struct file *file, poll_table *wait) -{ - poll_wait(file, &mce_wait, wait); - if (rcu_dereference(mcelog.next)) - return POLLIN | POLLRDNORM; - return 0; -} - -static int mce_ioctl(struct inode *i, struct file *f,unsigned int cmd, unsigned long arg) -{ - int __user *p = (int __user *)arg; - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - switch (cmd) { - case MCE_GET_RECORD_LEN: - return put_user(sizeof(struct mce), p); - case MCE_GET_LOG_LEN: - return put_user(MCE_LOG_LEN, p); - case MCE_GETCLEAR_FLAGS: { - unsigned flags; - do { - flags = mcelog.flags; - } while (cmpxchg(&mcelog.flags, flags, 0) != flags); - return put_user(flags, p); - } - default: - return -ENOTTY; - } -} - -static const struct file_operations mce_chrdev_ops = { - .open = mce_open, - .release = mce_release, - .read = mce_read, - .poll = mce_poll, - .ioctl = mce_ioctl, -}; - -static struct miscdevice mce_log_device = { - MISC_MCELOG_MINOR, - "mcelog", - &mce_chrdev_ops, -}; - -static unsigned long old_cr4 __initdata; - -void __init stop_mce(void) -{ - old_cr4 = read_cr4(); - clear_in_cr4(X86_CR4_MCE); -} - -void __init restart_mce(void) -{ - if (old_cr4 & X86_CR4_MCE) - set_in_cr4(X86_CR4_MCE); -} - -/* - * Old style boot options parsing. Only for compatibility. - */ - -static int __init mcheck_disable(char *str) -{ - mce_dont_init = 1; - return 1; -} - -/* mce=off disables machine check. Note you can reenable it later - using sysfs. - mce=TOLERANCELEVEL (number, see above) - mce=bootlog Log MCEs from before booting. Disabled by default on AMD. - mce=nobootlog Don't log MCEs from before booting. */ -static int __init mcheck_enable(char *str) -{ - if (*str == '=') - str++; - if (!strcmp(str, "off")) - mce_dont_init = 1; - else if (!strcmp(str, "bootlog") || !strcmp(str,"nobootlog")) - mce_bootlog = str[0] == 'b'; - else if (isdigit(str[0])) - get_option(&str, &tolerant); - else - printk("mce= argument %s ignored. Please use /sys", str); - return 1; -} - -__setup("nomce", mcheck_disable); -__setup("mce", mcheck_enable); - -/* - * Sysfs support - */ - -/* On resume clear all MCE state. Don't want to see leftovers from the BIOS. - Only one CPU is active at this time, the others get readded later using - CPU hotplug. */ -static int mce_resume(struct sys_device *dev) -{ - mce_init(NULL); - return 0; -} - -/* Reinit MCEs after user configuration changes */ -static void mce_restart(void) -{ - if (next_interval) - cancel_delayed_work(&mcheck_work); - /* Timer race is harmless here */ - on_each_cpu(mce_init, NULL, 1, 1); - next_interval = check_interval * HZ; - if (next_interval) - schedule_delayed_work(&mcheck_work, - round_jiffies_relative(next_interval)); -} - -static struct sysdev_class mce_sysclass = { - .resume = mce_resume, - set_kset_name("machinecheck"), -}; - -DEFINE_PER_CPU(struct sys_device, device_mce); - -/* Why are there no generic functions for this? */ -#define ACCESSOR(name, var, start) \ - static ssize_t show_ ## name(struct sys_device *s, char *buf) { \ - return sprintf(buf, "%lx\n", (unsigned long)var); \ - } \ - static ssize_t set_ ## name(struct sys_device *s,const char *buf,size_t siz) { \ - char *end; \ - unsigned long new = simple_strtoul(buf, &end, 0); \ - if (end == buf) return -EINVAL; \ - var = new; \ - start; \ - return end-buf; \ - } \ - static SYSDEV_ATTR(name, 0644, show_ ## name, set_ ## name); - -/* TBD should generate these dynamically based on number of available banks */ -ACCESSOR(bank0ctl,bank[0],mce_restart()) -ACCESSOR(bank1ctl,bank[1],mce_restart()) -ACCESSOR(bank2ctl,bank[2],mce_restart()) -ACCESSOR(bank3ctl,bank[3],mce_restart()) -ACCESSOR(bank4ctl,bank[4],mce_restart()) -ACCESSOR(bank5ctl,bank[5],mce_restart()) - -static ssize_t show_trigger(struct sys_device *s, char *buf) -{ - strcpy(buf, trigger); - strcat(buf, "\n"); - return strlen(trigger) + 1; -} - -static ssize_t set_trigger(struct sys_device *s,const char *buf,size_t siz) -{ - char *p; - int len; - strncpy(trigger, buf, sizeof(trigger)); - trigger[sizeof(trigger)-1] = 0; - len = strlen(trigger); - p = strchr(trigger, '\n'); - if (*p) *p = 0; - return len; -} - -static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger); -ACCESSOR(tolerant,tolerant,) -ACCESSOR(check_interval,check_interval,mce_restart()) -static struct sysdev_attribute *mce_attributes[] = { - &attr_bank0ctl, &attr_bank1ctl, &attr_bank2ctl, - &attr_bank3ctl, &attr_bank4ctl, &attr_bank5ctl, - &attr_tolerant, &attr_check_interval, &attr_trigger, - NULL -}; - -/* Per cpu sysdev init. All of the cpus still share the same ctl bank */ -static __cpuinit int mce_create_device(unsigned int cpu) -{ - int err; - int i; - if (!mce_available(&cpu_data[cpu])) - return -EIO; - - per_cpu(device_mce,cpu).id = cpu; - per_cpu(device_mce,cpu).cls = &mce_sysclass; - - err = sysdev_register(&per_cpu(device_mce,cpu)); - - if (!err) { - for (i = 0; mce_attributes[i]; i++) - sysdev_create_file(&per_cpu(device_mce,cpu), - mce_attributes[i]); - } - return err; -} - -static void mce_remove_device(unsigned int cpu) -{ - int i; - - for (i = 0; mce_attributes[i]; i++) - sysdev_remove_file(&per_cpu(device_mce,cpu), - mce_attributes[i]); - sysdev_unregister(&per_cpu(device_mce,cpu)); - memset(&per_cpu(device_mce, cpu).kobj, 0, sizeof(struct kobject)); -} - -/* Get notified when a cpu comes on/off. Be hotplug friendly. */ -static int -mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) -{ - unsigned int cpu = (unsigned long)hcpu; - - switch (action) { - case CPU_ONLINE: - case CPU_ONLINE_FROZEN: - mce_create_device(cpu); - break; - case CPU_DEAD: - case CPU_DEAD_FROZEN: - mce_remove_device(cpu); - break; - } - return NOTIFY_OK; -} - -static struct notifier_block mce_cpu_notifier = { - .notifier_call = mce_cpu_callback, -}; - -static __init int mce_init_device(void) -{ - int err; - int i = 0; - - if (!mce_available(&boot_cpu_data)) - return -EIO; - err = sysdev_class_register(&mce_sysclass); - - for_each_online_cpu(i) { - mce_create_device(i); - } - - register_hotcpu_notifier(&mce_cpu_notifier); - misc_register(&mce_log_device); - return err; -} - -device_initcall(mce_init_device); diff --git a/arch/x86_64/kernel/mce_amd.c b/arch/x86_64/kernel/mce_amd.c deleted file mode 100644 index 2f8a7f18b0fe..000000000000 --- a/arch/x86_64/kernel/mce_amd.c +++ /dev/null @@ -1,689 +0,0 @@ -/* - * (c) 2005, 2006 Advanced Micro Devices, Inc. - * Your use of this code is subject to the terms and conditions of the - * GNU general public license version 2. See "COPYING" or - * http://www.gnu.org/licenses/gpl.html - * - * Written by Jacob Shin - AMD, Inc. - * - * Support : jacob.shin@amd.com - * - * April 2006 - * - added support for AMD Family 0x10 processors - * - * All MC4_MISCi registers are shared between multi-cores - */ - -#include <linux/cpu.h> -#include <linux/errno.h> -#include <linux/init.h> -#include <linux/interrupt.h> -#include <linux/kobject.h> -#include <linux/notifier.h> -#include <linux/sched.h> -#include <linux/smp.h> -#include <linux/sysdev.h> -#include <linux/sysfs.h> -#include <asm/apic.h> -#include <asm/mce.h> -#include <asm/msr.h> -#include <asm/percpu.h> -#include <asm/idle.h> - -#define PFX "mce_threshold: " -#define VERSION "version 1.1.1" -#define NR_BANKS 6 -#define NR_BLOCKS 9 -#define THRESHOLD_MAX 0xFFF -#define INT_TYPE_APIC 0x00020000 -#define MASK_VALID_HI 0x80000000 -#define MASK_CNTP_HI 0x40000000 -#define MASK_LOCKED_HI 0x20000000 -#define MASK_LVTOFF_HI 0x00F00000 -#define MASK_COUNT_EN_HI 0x00080000 -#define MASK_INT_TYPE_HI 0x00060000 -#define MASK_OVERFLOW_HI 0x00010000 -#define MASK_ERR_COUNT_HI 0x00000FFF -#define MASK_BLKPTR_LO 0xFF000000 -#define MCG_XBLK_ADDR 0xC0000400 - -struct threshold_block { - unsigned int block; - unsigned int bank; - unsigned int cpu; - u32 address; - u16 interrupt_enable; - u16 threshold_limit; - struct kobject kobj; - struct list_head miscj; -}; - -/* defaults used early on boot */ -static struct threshold_block threshold_defaults = { - .interrupt_enable = 0, - .threshold_limit = THRESHOLD_MAX, -}; - -struct threshold_bank { - struct kobject kobj; - struct threshold_block *blocks; - cpumask_t cpus; -}; -static DEFINE_PER_CPU(struct threshold_bank *, threshold_banks[NR_BANKS]); - -#ifdef CONFIG_SMP -static unsigned char shared_bank[NR_BANKS] = { - 0, 0, 0, 0, 1 -}; -#endif - -static DEFINE_PER_CPU(unsigned char, bank_map); /* see which banks are on */ - -/* - * CPU Initialization - */ - -/* must be called with correct cpu affinity */ -static void threshold_restart_bank(struct threshold_block *b, - int reset, u16 old_limit) -{ - u32 mci_misc_hi, mci_misc_lo; - - rdmsr(b->address, mci_misc_lo, mci_misc_hi); - - if (b->threshold_limit < (mci_misc_hi & THRESHOLD_MAX)) - reset = 1; /* limit cannot be lower than err count */ - - if (reset) { /* reset err count and overflow bit */ - mci_misc_hi = - (mci_misc_hi & ~(MASK_ERR_COUNT_HI | MASK_OVERFLOW_HI)) | - (THRESHOLD_MAX - b->threshold_limit); - } else if (old_limit) { /* change limit w/o reset */ - int new_count = (mci_misc_hi & THRESHOLD_MAX) + - (old_limit - b->threshold_limit); - mci_misc_hi = (mci_misc_hi & ~MASK_ERR_COUNT_HI) | - (new_count & THRESHOLD_MAX); - } - - b->interrupt_enable ? - (mci_misc_hi = (mci_misc_hi & ~MASK_INT_TYPE_HI) | INT_TYPE_APIC) : - (mci_misc_hi &= ~MASK_INT_TYPE_HI); - - mci_misc_hi |= MASK_COUNT_EN_HI; - wrmsr(b->address, mci_misc_lo, mci_misc_hi); -} - -/* cpu init entry point, called from mce.c with preempt off */ -void __cpuinit mce_amd_feature_init(struct cpuinfo_x86 *c) -{ - unsigned int bank, block; - unsigned int cpu = smp_processor_id(); - u32 low = 0, high = 0, address = 0; - - for (bank = 0; bank < NR_BANKS; ++bank) { - for (block = 0; block < NR_BLOCKS; ++block) { - if (block == 0) - address = MSR_IA32_MC0_MISC + bank * 4; - else if (block == 1) { - address = (low & MASK_BLKPTR_LO) >> 21; - if (!address) - break; - address += MCG_XBLK_ADDR; - } - else - ++address; - - if (rdmsr_safe(address, &low, &high)) - break; - - if (!(high & MASK_VALID_HI)) { - if (block) - continue; - else - break; - } - - if (!(high & MASK_CNTP_HI) || - (high & MASK_LOCKED_HI)) - continue; - - if (!block) - per_cpu(bank_map, cpu) |= (1 << bank); -#ifdef CONFIG_SMP - if (shared_bank[bank] && c->cpu_core_id) - break; -#endif - high &= ~MASK_LVTOFF_HI; - high |= K8_APIC_EXT_LVT_ENTRY_THRESHOLD << 20; - wrmsr(address, low, high); - - setup_APIC_extended_lvt(K8_APIC_EXT_LVT_ENTRY_THRESHOLD, - THRESHOLD_APIC_VECTOR, - K8_APIC_EXT_INT_MSG_FIX, 0); - - threshold_defaults.address = address; - threshold_restart_bank(&threshold_defaults, 0, 0); - } - } -} - -/* - * APIC Interrupt Handler - */ - -/* - * threshold interrupt handler will service THRESHOLD_APIC_VECTOR. - * the interrupt goes off when error_count reaches threshold_limit. - * the handler will simply log mcelog w/ software defined bank number. - */ -asmlinkage void mce_threshold_interrupt(void) -{ - unsigned int bank, block; - struct mce m; - u32 low = 0, high = 0, address = 0; - - ack_APIC_irq(); - exit_idle(); - irq_enter(); - - memset(&m, 0, sizeof(m)); - rdtscll(m.tsc); - m.cpu = smp_processor_id(); - - /* assume first bank caused it */ - for (bank = 0; bank < NR_BANKS; ++bank) { - if (!(per_cpu(bank_map, m.cpu) & (1 << bank))) - continue; - for (block = 0; block < NR_BLOCKS; ++block) { - if (block == 0) - address = MSR_IA32_MC0_MISC + bank * 4; - else if (block == 1) { - address = (low & MASK_BLKPTR_LO) >> 21; - if (!address) - break; - address += MCG_XBLK_ADDR; - } - else - ++address; - - if (rdmsr_safe(address, &low, &high)) - break; - - if (!(high & MASK_VALID_HI)) { - if (block) - continue; - else - break; - } - - if (!(high & MASK_CNTP_HI) || - (high & MASK_LOCKED_HI)) - continue; - - /* Log the machine check that caused the threshold - event. */ - do_machine_check(NULL, 0); - - if (high & MASK_OVERFLOW_HI) { - rdmsrl(address, m.misc); - rdmsrl(MSR_IA32_MC0_STATUS + bank * 4, - m.status); - m.bank = K8_MCE_THRESHOLD_BASE - + bank * NR_BLOCKS - + block; - mce_log(&m); - goto out; - } - } - } -out: - irq_exit(); -} - -/* - * Sysfs Interface - */ - -struct threshold_attr { - struct attribute attr; - ssize_t(*show) (struct threshold_block *, char *); - ssize_t(*store) (struct threshold_block *, const char *, size_t count); -}; - -static cpumask_t affinity_set(unsigned int cpu) -{ - cpumask_t oldmask = current->cpus_allowed; - cpumask_t newmask = CPU_MASK_NONE; - cpu_set(cpu, newmask); - set_cpus_allowed(current, newmask); - return oldmask; -} - -static void affinity_restore(cpumask_t oldmask) -{ - set_cpus_allowed(current, oldmask); -} - -#define SHOW_FIELDS(name) \ -static ssize_t show_ ## name(struct threshold_block * b, char *buf) \ -{ \ - return sprintf(buf, "%lx\n", (unsigned long) b->name); \ -} -SHOW_FIELDS(interrupt_enable) -SHOW_FIELDS(threshold_limit) - -static ssize_t store_interrupt_enable(struct threshold_block *b, - const char *buf, size_t count) -{ - char *end; - cpumask_t oldmask; - unsigned long new = simple_strtoul(buf, &end, 0); - if (end == buf) - return -EINVAL; - b->interrupt_enable = !!new; - - oldmask = affinity_set(b->cpu); - threshold_restart_bank(b, 0, 0); - affinity_restore(oldmask); - - return end - buf; -} - -static ssize_t store_threshold_limit(struct threshold_block *b, - const char *buf, size_t count) -{ - char *end; - cpumask_t oldmask; - u16 old; - unsigned long new = simple_strtoul(buf, &end, 0); - if (end == buf) - return -EINVAL; - if (new > THRESHOLD_MAX) - new = THRESHOLD_MAX; - if (new < 1) - new = 1; - old = b->threshold_limit; - b->threshold_limit = new; - - oldmask = affinity_set(b->cpu); - threshold_restart_bank(b, 0, old); - affinity_restore(oldmask); - - return end - buf; -} - -static ssize_t show_error_count(struct threshold_block *b, char *buf) -{ - u32 high, low; - cpumask_t oldmask; - oldmask = affinity_set(b->cpu); - rdmsr(b->address, low, high); - affinity_restore(oldmask); - return sprintf(buf, "%x\n", - (high & 0xFFF) - (THRESHOLD_MAX - b->threshold_limit)); -} - -static ssize_t store_error_count(struct threshold_block *b, - const char *buf, size_t count) -{ - cpumask_t oldmask; - oldmask = affinity_set(b->cpu); - threshold_restart_bank(b, 1, 0); - affinity_restore(oldmask); - return 1; -} - -#define THRESHOLD_ATTR(_name,_mode,_show,_store) { \ - .attr = {.name = __stringify(_name), .mode = _mode }, \ - .show = _show, \ - .store = _store, \ -}; - -#define RW_ATTR(name) \ -static struct threshold_attr name = \ - THRESHOLD_ATTR(name, 0644, show_## name, store_## name) - -RW_ATTR(interrupt_enable); -RW_ATTR(threshold_limit); -RW_ATTR(error_count); - -static struct attribute *default_attrs[] = { - &interrupt_enable.attr, - &threshold_limit.attr, - &error_count.attr, - NULL -}; - -#define to_block(k) container_of(k, struct threshold_block, kobj) -#define to_attr(a) container_of(a, struct threshold_attr, attr) - -static ssize_t show(struct kobject *kobj, struct attribute *attr, char *buf) -{ - struct threshold_block *b = to_block(kobj); - struct threshold_attr *a = to_attr(attr); - ssize_t ret; - ret = a->show ? a->show(b, buf) : -EIO; - return ret; -} - -static ssize_t store(struct kobject *kobj, struct attribute *attr, - const char *buf, size_t count) -{ - struct threshold_block *b = to_block(kobj); - struct threshold_attr *a = to_attr(attr); - ssize_t ret; - ret = a->store ? a->store(b, buf, count) : -EIO; - return ret; -} - -static struct sysfs_ops threshold_ops = { - .show = show, - .store = store, -}; - -static struct kobj_type threshold_ktype = { - .sysfs_ops = &threshold_ops, - .default_attrs = default_attrs, -}; - -static __cpuinit int allocate_threshold_blocks(unsigned int cpu, - unsigned int bank, - unsigned int block, - u32 address) -{ - int err; - u32 low, high; - struct threshold_block *b = NULL; - - if ((bank >= NR_BANKS) || (block >= NR_BLOCKS)) - return 0; - - if (rdmsr_safe(address, &low, &high)) - return 0; - - if (!(high & MASK_VALID_HI)) { - if (block) - goto recurse; - else - return 0; - } - - if (!(high & MASK_CNTP_HI) || - (high & MASK_LOCKED_HI)) - goto recurse; - - b = kzalloc(sizeof(struct threshold_block), GFP_KERNEL); - if (!b) - return -ENOMEM; - - b->block = block; - b->bank = bank; - b->cpu = cpu; - b->address = address; - b->interrupt_enable = 0; - b->threshold_limit = THRESHOLD_MAX; - - INIT_LIST_HEAD(&b->miscj); - - if (per_cpu(threshold_banks, cpu)[bank]->blocks) - list_add(&b->miscj, - &per_cpu(threshold_banks, cpu)[bank]->blocks->miscj); - else - per_cpu(threshold_banks, cpu)[bank]->blocks = b; - - kobject_set_name(&b->kobj, "misc%i", block); - b->kobj.parent = &per_cpu(threshold_banks, cpu)[bank]->kobj; - b->kobj.ktype = &threshold_ktype; - err = kobject_register(&b->kobj); - if (err) - goto out_free; -recurse: - if (!block) { - address = (low & MASK_BLKPTR_LO) >> 21; - if (!address) - return 0; - address += MCG_XBLK_ADDR; - } else - ++address; - - err = allocate_threshold_blocks(cpu, bank, ++block, address); - if (err) - goto out_free; - - return err; - -out_free: - if (b) { - kobject_unregister(&b->kobj); - kfree(b); - } - return err; -} - -/* symlinks sibling shared banks to first core. first core owns dir/files. */ -static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank) -{ - int i, err = 0; - struct threshold_bank *b = NULL; - cpumask_t oldmask = CPU_MASK_NONE; - char name[32]; - - sprintf(name, "threshold_bank%i", bank); - -#ifdef CONFIG_SMP - if (cpu_data[cpu].cpu_core_id && shared_bank[bank]) { /* symlink */ - i = first_cpu(cpu_core_map[cpu]); - - /* first core not up yet */ - if (cpu_data[i].cpu_core_id) - goto out; - - /* already linked */ - if (per_cpu(threshold_banks, cpu)[bank]) - goto out; - - b = per_cpu(threshold_banks, i)[bank]; - - if (!b) - goto out; - - err = sysfs_create_link(&per_cpu(device_mce, cpu).kobj, - &b->kobj, name); - if (err) - goto out; - - b->cpus = cpu_core_map[cpu]; - per_cpu(threshold_banks, cpu)[bank] = b; - goto out; - } -#endif - - b = kzalloc(sizeof(struct threshold_bank), GFP_KERNEL); - if (!b) { - err = -ENOMEM; - goto out; - } - - kobject_set_name(&b->kobj, "threshold_bank%i", bank); - b->kobj.parent = &per_cpu(device_mce, cpu).kobj; -#ifndef CONFIG_SMP - b->cpus = CPU_MASK_ALL; -#else - b->cpus = cpu_core_map[cpu]; -#endif - err = kobject_register(&b->kobj); - if (err) - goto out_free; - - per_cpu(threshold_banks, cpu)[bank] = b; - - oldmask = affinity_set(cpu); - err = allocate_threshold_blocks(cpu, bank, 0, - MSR_IA32_MC0_MISC + bank * 4); - affinity_restore(oldmask); - - if (err) - goto out_free; - - for_each_cpu_mask(i, b->cpus) { - if (i == cpu) - continue; - - err = sysfs_create_link(&per_cpu(device_mce, i).kobj, - &b->kobj, name); - if (err) - goto out; - - per_cpu(threshold_banks, i)[bank] = b; - } - - goto out; - -out_free: - per_cpu(threshold_banks, cpu)[bank] = NULL; - kfree(b); -out: - return err; -} - -/* create dir/files for all valid threshold banks */ -static __cpuinit int threshold_create_device(unsigned int cpu) -{ - unsigned int bank; - int err = 0; - - for (bank = 0; bank < NR_BANKS; ++bank) { - if (!(per_cpu(bank_map, cpu) & 1 << bank)) - continue; - err = threshold_create_bank(cpu, bank); - if (err) - goto out; - } -out: - return err; -} - -/* - * let's be hotplug friendly. - * in case of multiple core processors, the first core always takes ownership - * of shared sysfs dir/files, and rest of the cores will be symlinked to it. - */ - -static void deallocate_threshold_block(unsigned int cpu, - unsigned int bank) -{ - struct threshold_block *pos = NULL; - struct threshold_block *tmp = NULL; - struct threshold_bank *head = per_cpu(threshold_banks, cpu)[bank]; - - if (!head) - return; - - list_for_each_entry_safe(pos, tmp, &head->blocks->miscj, miscj) { - kobject_unregister(&pos->kobj); - list_del(&pos->miscj); - kfree(pos); - } - - kfree(per_cpu(threshold_banks, cpu)[bank]->blocks); - per_cpu(threshold_banks, cpu)[bank]->blocks = NULL; -} - -static void threshold_remove_bank(unsigned int cpu, int bank) -{ - int i = 0; - struct threshold_bank *b; - char name[32]; - - b = per_cpu(threshold_banks, cpu)[bank]; - - if (!b) - return; - - if (!b->blocks) - goto free_out; - - sprintf(name, "threshold_bank%i", bank); - -#ifdef CONFIG_SMP - /* sibling symlink */ - if (shared_bank[bank] && b->blocks->cpu != cpu) { - sysfs_remove_link(&per_cpu(device_mce, cpu).kobj, name); - per_cpu(threshold_banks, cpu)[bank] = NULL; - return; - } -#endif - - /* remove all sibling symlinks before unregistering */ - for_each_cpu_mask(i, b->cpus) { - if (i == cpu) - continue; - - sysfs_remove_link(&per_cpu(device_mce, i).kobj, name); - per_cpu(threshold_banks, i)[bank] = NULL; - } - - deallocate_threshold_block(cpu, bank); - -free_out: - kobject_unregister(&b->kobj); - kfree(b); - per_cpu(threshold_banks, cpu)[bank] = NULL; -} - -static void threshold_remove_device(unsigned int cpu) -{ - unsigned int bank; - - for (bank = 0; bank < NR_BANKS; ++bank) { - if (!(per_cpu(bank_map, cpu) & 1 << bank)) - continue; - threshold_remove_bank(cpu, bank); - } -} - -/* get notified when a cpu comes on/off */ -static int threshold_cpu_callback(struct notifier_block *nfb, - unsigned long action, void *hcpu) -{ - /* cpu was unsigned int to begin with */ - unsigned int cpu = (unsigned long)hcpu; - - if (cpu >= NR_CPUS) - goto out; - - switch (action) { - case CPU_ONLINE: - case CPU_ONLINE_FROZEN: - threshold_create_device(cpu); - break; - case CPU_DEAD: - case CPU_DEAD_FROZEN: - threshold_remove_device(cpu); - break; - default: - break; - } - out: - return NOTIFY_OK; -} - -static struct notifier_block threshold_cpu_notifier = { - .notifier_call = threshold_cpu_callback, -}; - -static __init int threshold_init_device(void) -{ - unsigned lcpu = 0; - - /* to hit CPUs online before the notifier is up */ - for_each_online_cpu(lcpu) { - int err = threshold_create_device(lcpu); - if (err) - return err; - } - register_hotcpu_notifier(&threshold_cpu_notifier); - return 0; -} - -device_initcall(threshold_init_device); diff --git a/arch/x86_64/kernel/mce_intel.c b/arch/x86_64/kernel/mce_intel.c deleted file mode 100644 index 6551505d8a2c..000000000000 --- a/arch/x86_64/kernel/mce_intel.c +++ /dev/null @@ -1,89 +0,0 @@ -/* - * Intel specific MCE features. - * Copyright 2004 Zwane Mwaikambo <zwane@linuxpower.ca> - */ - -#include <linux/init.h> -#include <linux/interrupt.h> -#include <linux/percpu.h> -#include <asm/processor.h> -#include <asm/msr.h> -#include <asm/mce.h> -#include <asm/hw_irq.h> -#include <asm/idle.h> -#include <asm/therm_throt.h> - -asmlinkage void smp_thermal_interrupt(void) -{ - __u64 msr_val; - - ack_APIC_irq(); - - exit_idle(); - irq_enter(); - - rdmsrl(MSR_IA32_THERM_STATUS, msr_val); - if (therm_throt_process(msr_val & 1)) - mce_log_therm_throt_event(smp_processor_id(), msr_val); - - irq_exit(); -} - -static void __cpuinit intel_init_thermal(struct cpuinfo_x86 *c) -{ - u32 l, h; - int tm2 = 0; - unsigned int cpu = smp_processor_id(); - - if (!cpu_has(c, X86_FEATURE_ACPI)) - return; - - if (!cpu_has(c, X86_FEATURE_ACC)) - return; - - /* first check if TM1 is already enabled by the BIOS, in which - * case there might be some SMM goo which handles it, so we can't even - * put a handler since it might be delivered via SMI already. - */ - rdmsr(MSR_IA32_MISC_ENABLE, l, h); - h = apic_read(APIC_LVTTHMR); - if ((l & (1 << 3)) && (h & APIC_DM_SMI)) { - printk(KERN_DEBUG - "CPU%d: Thermal monitoring handled by SMI\n", cpu); - return; - } - - if (cpu_has(c, X86_FEATURE_TM2) && (l & (1 << 13))) - tm2 = 1; - - if (h & APIC_VECTOR_MASK) { - printk(KERN_DEBUG - "CPU%d: Thermal LVT vector (%#x) already " - "installed\n", cpu, (h & APIC_VECTOR_MASK)); - return; - } - - h = THERMAL_APIC_VECTOR; - h |= (APIC_DM_FIXED | APIC_LVT_MASKED); - apic_write(APIC_LVTTHMR, h); - - rdmsr(MSR_IA32_THERM_INTERRUPT, l, h); - wrmsr(MSR_IA32_THERM_INTERRUPT, l | 0x03, h); - - rdmsr(MSR_IA32_MISC_ENABLE, l, h); - wrmsr(MSR_IA32_MISC_ENABLE, l | (1 << 3), h); - - l = apic_read(APIC_LVTTHMR); - apic_write(APIC_LVTTHMR, l & ~APIC_LVT_MASKED); - printk(KERN_INFO "CPU%d: Thermal monitoring enabled (%s)\n", - cpu, tm2 ? "TM2" : "TM1"); - - /* enable thermal throttle processing */ - atomic_set(&therm_throt_en, 1); - return; -} - -void __cpuinit mce_intel_feature_init(struct cpuinfo_x86 *c) -{ - intel_init_thermal(c); -} diff --git a/arch/x86_64/kernel/module.c b/arch/x86_64/kernel/module.c deleted file mode 100644 index a888e67f5874..000000000000 --- a/arch/x86_64/kernel/module.c +++ /dev/null @@ -1,185 +0,0 @@ -/* Kernel module help for x86-64 - Copyright (C) 2001 Rusty Russell. - Copyright (C) 2002,2003 Andi Kleen, SuSE Labs. - - This program is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation; either version 2 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program; if not, write to the Free Software - Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA -*/ -#include <linux/moduleloader.h> -#include <linux/elf.h> -#include <linux/vmalloc.h> -#include <linux/fs.h> -#include <linux/string.h> -#include <linux/kernel.h> -#include <linux/slab.h> -#include <linux/bug.h> - -#include <asm/system.h> -#include <asm/page.h> -#include <asm/pgtable.h> - -#define DEBUGP(fmt...) - -#ifndef CONFIG_UML -void module_free(struct module *mod, void *module_region) -{ - vfree(module_region); - /* FIXME: If module_region == mod->init_region, trim exception - table entries. */ -} - -void *module_alloc(unsigned long size) -{ - struct vm_struct *area; - - if (!size) - return NULL; - size = PAGE_ALIGN(size); - if (size > MODULES_LEN) - return NULL; - - area = __get_vm_area(size, VM_ALLOC, MODULES_VADDR, MODULES_END); - if (!area) - return NULL; - - return __vmalloc_area(area, GFP_KERNEL, PAGE_KERNEL_EXEC); -} -#endif - -/* We don't need anything special. */ -int module_frob_arch_sections(Elf_Ehdr *hdr, - Elf_Shdr *sechdrs, - char *secstrings, - struct module *mod) -{ - return 0; -} - -int apply_relocate_add(Elf64_Shdr *sechdrs, - const char *strtab, - unsigned int symindex, - unsigned int relsec, - struct module *me) -{ - unsigned int i; - Elf64_Rela *rel = (void *)sechdrs[relsec].sh_addr; - Elf64_Sym *sym; - void *loc; - u64 val; - - DEBUGP("Applying relocate section %u to %u\n", relsec, - sechdrs[relsec].sh_info); - for (i = 0; i < sechdrs[relsec].sh_size / sizeof(*rel); i++) { - /* This is where to make the change */ - loc = (void *)sechdrs[sechdrs[relsec].sh_info].sh_addr - + rel[i].r_offset; - - /* This is the symbol it is referring to. Note that all - undefined symbols have been resolved. */ - sym = (Elf64_Sym *)sechdrs[symindex].sh_addr - + ELF64_R_SYM(rel[i].r_info); - - DEBUGP("type %d st_value %Lx r_addend %Lx loc %Lx\n", - (int)ELF64_R_TYPE(rel[i].r_info), - sym->st_value, rel[i].r_addend, (u64)loc); - - val = sym->st_value + rel[i].r_addend; - - switch (ELF64_R_TYPE(rel[i].r_info)) { - case R_X86_64_NONE: - break; - case R_X86_64_64: - *(u64 *)loc = val; - break; - case R_X86_64_32: - *(u32 *)loc = val; - if (val != *(u32 *)loc) - goto overflow; - break; - case R_X86_64_32S: - *(s32 *)loc = val; - if ((s64)val != *(s32 *)loc) - goto overflow; - break; - case R_X86_64_PC32: - val -= (u64)loc; - *(u32 *)loc = val; -#if 0 - if ((s64)val != *(s32 *)loc) - goto overflow; -#endif - break; - default: - printk(KERN_ERR "module %s: Unknown rela relocation: %Lu\n", - me->name, ELF64_R_TYPE(rel[i].r_info)); - return -ENOEXEC; - } - } - return 0; - -overflow: - printk(KERN_ERR "overflow in relocation type %d val %Lx\n", - (int)ELF64_R_TYPE(rel[i].r_info), val); - printk(KERN_ERR "`%s' likely not compiled with -mcmodel=kernel\n", - me->name); - return -ENOEXEC; -} - -int apply_relocate(Elf_Shdr *sechdrs, - const char *strtab, - unsigned int symindex, - unsigned int relsec, - struct module *me) -{ - printk("non add relocation not supported\n"); - return -ENOSYS; -} - -int module_finalize(const Elf_Ehdr *hdr, - const Elf_Shdr *sechdrs, - struct module *me) -{ - const Elf_Shdr *s, *text = NULL, *alt = NULL, *locks = NULL; - char *secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset; - - for (s = sechdrs; s < sechdrs + hdr->e_shnum; s++) { - if (!strcmp(".text", secstrings + s->sh_name)) - text = s; - if (!strcmp(".altinstructions", secstrings + s->sh_name)) - alt = s; - if (!strcmp(".smp_locks", secstrings + s->sh_name)) - locks= s; - } - - if (alt) { - /* patch .altinstructions */ - void *aseg = (void *)alt->sh_addr; - apply_alternatives(aseg, aseg + alt->sh_size); - } - if (locks && text) { - void *lseg = (void *)locks->sh_addr; - void *tseg = (void *)text->sh_addr; - alternatives_smp_module_add(me, me->name, - lseg, lseg + locks->sh_size, - tseg, tseg + text->sh_size); - } - - return module_bug_finalize(hdr, sechdrs, me); -} - -void module_arch_cleanup(struct module *mod) -{ - alternatives_smp_module_del(mod); - module_bug_cleanup(mod); -} diff --git a/arch/x86_64/kernel/mpparse.c b/arch/x86_64/kernel/mpparse.c deleted file mode 100644 index 8bf0ca03ac8e..000000000000 --- a/arch/x86_64/kernel/mpparse.c +++ /dev/null @@ -1,852 +0,0 @@ -/* - * Intel Multiprocessor Specification 1.1 and 1.4 - * compliant MP-table parsing routines. - * - * (c) 1995 Alan Cox, Building #3 <alan@redhat.com> - * (c) 1998, 1999, 2000 Ingo Molnar <mingo@redhat.com> - * - * Fixes - * Erich Boleyn : MP v1.4 and additional changes. - * Alan Cox : Added EBDA scanning - * Ingo Molnar : various cleanups and rewrites - * Maciej W. Rozycki: Bits for default MP configurations - * Paul Diefenbaugh: Added full ACPI support - */ - -#include <linux/mm.h> -#include <linux/init.h> -#include <linux/delay.h> -#include <linux/bootmem.h> -#include <linux/kernel_stat.h> -#include <linux/mc146818rtc.h> -#include <linux/acpi.h> -#include <linux/module.h> - -#include <asm/smp.h> -#include <asm/mtrr.h> -#include <asm/mpspec.h> -#include <asm/pgalloc.h> -#include <asm/io_apic.h> -#include <asm/proto.h> -#include <asm/acpi.h> - -/* Have we found an MP table */ -int smp_found_config; - -/* - * Various Linux-internal data structures created from the - * MP-table. - */ -DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES); -int mp_bus_id_to_pci_bus [MAX_MP_BUSSES] = { [0 ... MAX_MP_BUSSES-1] = -1 }; - -static int mp_current_pci_id = 0; -/* I/O APIC entries */ -struct mpc_config_ioapic mp_ioapics[MAX_IO_APICS]; - -/* # of MP IRQ source entries */ -struct mpc_config_intsrc mp_irqs[MAX_IRQ_SOURCES]; - -/* MP IRQ source entries */ -int mp_irq_entries; - -int nr_ioapics; -unsigned long mp_lapic_addr = 0; - - - -/* Processor that is doing the boot up */ -unsigned int boot_cpu_id = -1U; -/* Internal processor count */ -unsigned int num_processors __cpuinitdata = 0; - -unsigned disabled_cpus __cpuinitdata; - -/* Bitmask of physically existing CPUs */ -physid_mask_t phys_cpu_present_map = PHYSID_MASK_NONE; - -u8 bios_cpu_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID }; - - -/* - * Intel MP BIOS table parsing routines: - */ - -/* - * Checksum an MP configuration block. - */ - -static int __init mpf_checksum(unsigned char *mp, int len) -{ - int sum = 0; - - while (len--) - sum += *mp++; - - return sum & 0xFF; -} - -static void __cpuinit MP_processor_info (struct mpc_config_processor *m) -{ - int cpu; - cpumask_t tmp_map; - char *bootup_cpu = ""; - - if (!(m->mpc_cpuflag & CPU_ENABLED)) { - disabled_cpus++; - return; - } - if (m->mpc_cpuflag & CPU_BOOTPROCESSOR) { - bootup_cpu = " (Bootup-CPU)"; - boot_cpu_id = m->mpc_apicid; - } - - printk(KERN_INFO "Processor #%d%s\n", m->mpc_apicid, bootup_cpu); - - if (num_processors >= NR_CPUS) { - printk(KERN_WARNING "WARNING: NR_CPUS limit of %i reached." - " Processor ignored.\n", NR_CPUS); - return; - } - - num_processors++; - cpus_complement(tmp_map, cpu_present_map); - cpu = first_cpu(tmp_map); - - physid_set(m->mpc_apicid, phys_cpu_present_map); - if (m->mpc_cpuflag & CPU_BOOTPROCESSOR) { - /* - * bios_cpu_apicid is required to have processors listed - * in same order as logical cpu numbers. Hence the first - * entry is BSP, and so on. - */ - cpu = 0; - } - bios_cpu_apicid[cpu] = m->mpc_apicid; - x86_cpu_to_apicid[cpu] = m->mpc_apicid; - - cpu_set(cpu, cpu_possible_map); - cpu_set(cpu, cpu_present_map); -} - -static void __init MP_bus_info (struct mpc_config_bus *m) -{ - char str[7]; - - memcpy(str, m->mpc_bustype, 6); - str[6] = 0; - Dprintk("Bus #%d is %s\n", m->mpc_busid, str); - - if (strncmp(str, "ISA", 3) == 0) { - set_bit(m->mpc_busid, mp_bus_not_pci); - } else if (strncmp(str, "PCI", 3) == 0) { - clear_bit(m->mpc_busid, mp_bus_not_pci); - mp_bus_id_to_pci_bus[m->mpc_busid] = mp_current_pci_id; - mp_current_pci_id++; - } else { - printk(KERN_ERR "Unknown bustype %s\n", str); - } -} - -static int bad_ioapic(unsigned long address) -{ - if (nr_ioapics >= MAX_IO_APICS) { - printk(KERN_ERR "ERROR: Max # of I/O APICs (%d) exceeded " - "(found %d)\n", MAX_IO_APICS, nr_ioapics); - panic("Recompile kernel with bigger MAX_IO_APICS!\n"); - } - if (!address) { - printk(KERN_ERR "WARNING: Bogus (zero) I/O APIC address" - " found in table, skipping!\n"); - return 1; - } - return 0; -} - -static void __init MP_ioapic_info (struct mpc_config_ioapic *m) -{ - if (!(m->mpc_flags & MPC_APIC_USABLE)) - return; - - printk("I/O APIC #%d at 0x%X.\n", - m->mpc_apicid, m->mpc_apicaddr); - - if (bad_ioapic(m->mpc_apicaddr)) - return; - - mp_ioapics[nr_ioapics] = *m; - nr_ioapics++; -} - -static void __init MP_intsrc_info (struct mpc_config_intsrc *m) -{ - mp_irqs [mp_irq_entries] = *m; - Dprintk("Int: type %d, pol %d, trig %d, bus %d," - " IRQ %02x, APIC ID %x, APIC INT %02x\n", - m->mpc_irqtype, m->mpc_irqflag & 3, - (m->mpc_irqflag >> 2) & 3, m->mpc_srcbus, - m->mpc_srcbusirq, m->mpc_dstapic, m->mpc_dstirq); - if (++mp_irq_entries >= MAX_IRQ_SOURCES) - panic("Max # of irq sources exceeded!!\n"); -} - -static void __init MP_lintsrc_info (struct mpc_config_lintsrc *m) -{ - Dprintk("Lint: type %d, pol %d, trig %d, bus %d," - " IRQ %02x, APIC ID %x, APIC LINT %02x\n", - m->mpc_irqtype, m->mpc_irqflag & 3, - (m->mpc_irqflag >> 2) &3, m->mpc_srcbusid, - m->mpc_srcbusirq, m->mpc_destapic, m->mpc_destapiclint); -} - -/* - * Read/parse the MPC - */ - -static int __init smp_read_mpc(struct mp_config_table *mpc) -{ - char str[16]; - int count=sizeof(*mpc); - unsigned char *mpt=((unsigned char *)mpc)+count; - - if (memcmp(mpc->mpc_signature,MPC_SIGNATURE,4)) { - printk("MPTABLE: bad signature [%c%c%c%c]!\n", - mpc->mpc_signature[0], - mpc->mpc_signature[1], - mpc->mpc_signature[2], - mpc->mpc_signature[3]); - return 0; - } - if (mpf_checksum((unsigned char *)mpc,mpc->mpc_length)) { - printk("MPTABLE: checksum error!\n"); - return 0; - } - if (mpc->mpc_spec!=0x01 && mpc->mpc_spec!=0x04) { - printk(KERN_ERR "MPTABLE: bad table version (%d)!!\n", - mpc->mpc_spec); - return 0; - } - if (!mpc->mpc_lapic) { - printk(KERN_ERR "MPTABLE: null local APIC address!\n"); - return 0; - } - memcpy(str,mpc->mpc_oem,8); - str[8] = 0; - printk(KERN_INFO "MPTABLE: OEM ID: %s ",str); - - memcpy(str,mpc->mpc_productid,12); - str[12] = 0; - printk("MPTABLE: Product ID: %s ",str); - - printk("MPTABLE: APIC at: 0x%X\n",mpc->mpc_lapic); - - /* save the local APIC address, it might be non-default */ - if (!acpi_lapic) - mp_lapic_addr = mpc->mpc_lapic; - - /* - * Now process the configuration blocks. - */ - while (count < mpc->mpc_length) { - switch(*mpt) { - case MP_PROCESSOR: - { - struct mpc_config_processor *m= - (struct mpc_config_processor *)mpt; - if (!acpi_lapic) - MP_processor_info(m); - mpt += sizeof(*m); - count += sizeof(*m); - break; - } - case MP_BUS: - { - struct mpc_config_bus *m= - (struct mpc_config_bus *)mpt; - MP_bus_info(m); - mpt += sizeof(*m); - count += sizeof(*m); - break; - } - case MP_IOAPIC: - { - struct mpc_config_ioapic *m= - (struct mpc_config_ioapic *)mpt; - MP_ioapic_info(m); - mpt += sizeof(*m); - count += sizeof(*m); - break; - } - case MP_INTSRC: - { - struct mpc_config_intsrc *m= - (struct mpc_config_intsrc *)mpt; - - MP_intsrc_info(m); - mpt += sizeof(*m); - count += sizeof(*m); - break; - } - case MP_LINTSRC: - { - struct mpc_config_lintsrc *m= - (struct mpc_config_lintsrc *)mpt; - MP_lintsrc_info(m); - mpt += sizeof(*m); - count += sizeof(*m); - break; - } - } - } - setup_apic_routing(); - if (!num_processors) - printk(KERN_ERR "MPTABLE: no processors registered!\n"); - return num_processors; -} - -static int __init ELCR_trigger(unsigned int irq) -{ - unsigned int port; - - port = 0x4d0 + (irq >> 3); - return (inb(port) >> (irq & 7)) & 1; -} - -static void __init construct_default_ioirq_mptable(int mpc_default_type) -{ - struct mpc_config_intsrc intsrc; - int i; - int ELCR_fallback = 0; - - intsrc.mpc_type = MP_INTSRC; - intsrc.mpc_irqflag = 0; /* conforming */ - intsrc.mpc_srcbus = 0; - intsrc.mpc_dstapic = mp_ioapics[0].mpc_apicid; - - intsrc.mpc_irqtype = mp_INT; - - /* - * If true, we have an ISA/PCI system with no IRQ entries - * in the MP table. To prevent the PCI interrupts from being set up - * incorrectly, we try to use the ELCR. The sanity check to see if - * there is good ELCR data is very simple - IRQ0, 1, 2 and 13 can - * never be level sensitive, so we simply see if the ELCR agrees. - * If it does, we assume it's valid. - */ - if (mpc_default_type == 5) { - printk(KERN_INFO "ISA/PCI bus type with no IRQ information... falling back to ELCR\n"); - - if (ELCR_trigger(0) || ELCR_trigger(1) || ELCR_trigger(2) || ELCR_trigger(13)) - printk(KERN_ERR "ELCR contains invalid data... not using ELCR\n"); - else { - printk(KERN_INFO "Using ELCR to identify PCI interrupts\n"); - ELCR_fallback = 1; - } - } - - for (i = 0; i < 16; i++) { - switch (mpc_default_type) { - case 2: - if (i == 0 || i == 13) - continue; /* IRQ0 & IRQ13 not connected */ - /* fall through */ - default: - if (i == 2) - continue; /* IRQ2 is never connected */ - } - - if (ELCR_fallback) { - /* - * If the ELCR indicates a level-sensitive interrupt, we - * copy that information over to the MP table in the - * irqflag field (level sensitive, active high polarity). - */ - if (ELCR_trigger(i)) - intsrc.mpc_irqflag = 13; - else - intsrc.mpc_irqflag = 0; - } - - intsrc.mpc_srcbusirq = i; - intsrc.mpc_dstirq = i ? i : 2; /* IRQ0 to INTIN2 */ - MP_intsrc_info(&intsrc); - } - - intsrc.mpc_irqtype = mp_ExtINT; - intsrc.mpc_srcbusirq = 0; - intsrc.mpc_dstirq = 0; /* 8259A to INTIN0 */ - MP_intsrc_info(&intsrc); -} - -static inline void __init construct_default_ISA_mptable(int mpc_default_type) -{ - struct mpc_config_processor processor; - struct mpc_config_bus bus; - struct mpc_config_ioapic ioapic; - struct mpc_config_lintsrc lintsrc; - int linttypes[2] = { mp_ExtINT, mp_NMI }; - int i; - - /* - * local APIC has default address - */ - mp_lapic_addr = APIC_DEFAULT_PHYS_BASE; - - /* - * 2 CPUs, numbered 0 & 1. - */ - processor.mpc_type = MP_PROCESSOR; - processor.mpc_apicver = 0; - processor.mpc_cpuflag = CPU_ENABLED; - processor.mpc_cpufeature = 0; - processor.mpc_featureflag = 0; - processor.mpc_reserved[0] = 0; - processor.mpc_reserved[1] = 0; - for (i = 0; i < 2; i++) { - processor.mpc_apicid = i; - MP_processor_info(&processor); - } - - bus.mpc_type = MP_BUS; - bus.mpc_busid = 0; - switch (mpc_default_type) { - default: - printk(KERN_ERR "???\nUnknown standard configuration %d\n", - mpc_default_type); - /* fall through */ - case 1: - case 5: - memcpy(bus.mpc_bustype, "ISA ", 6); - break; - } - MP_bus_info(&bus); - if (mpc_default_type > 4) { - bus.mpc_busid = 1; - memcpy(bus.mpc_bustype, "PCI ", 6); - MP_bus_info(&bus); - } - - ioapic.mpc_type = MP_IOAPIC; - ioapic.mpc_apicid = 2; - ioapic.mpc_apicver = 0; - ioapic.mpc_flags = MPC_APIC_USABLE; - ioapic.mpc_apicaddr = 0xFEC00000; - MP_ioapic_info(&ioapic); - - /* - * We set up most of the low 16 IO-APIC pins according to MPS rules. - */ - construct_default_ioirq_mptable(mpc_default_type); - - lintsrc.mpc_type = MP_LINTSRC; - lintsrc.mpc_irqflag = 0; /* conforming */ - lintsrc.mpc_srcbusid = 0; - lintsrc.mpc_srcbusirq = 0; - lintsrc.mpc_destapic = MP_APIC_ALL; - for (i = 0; i < 2; i++) { - lintsrc.mpc_irqtype = linttypes[i]; - lintsrc.mpc_destapiclint = i; - MP_lintsrc_info(&lintsrc); - } -} - -static struct intel_mp_floating *mpf_found; - -/* - * Scan the memory blocks for an SMP configuration block. - */ -void __init get_smp_config (void) -{ - struct intel_mp_floating *mpf = mpf_found; - - /* - * ACPI supports both logical (e.g. Hyper-Threading) and physical - * processors, where MPS only supports physical. - */ - if (acpi_lapic && acpi_ioapic) { - printk(KERN_INFO "Using ACPI (MADT) for SMP configuration information\n"); - return; - } - else if (acpi_lapic) - printk(KERN_INFO "Using ACPI for processor (LAPIC) configuration information\n"); - - printk("Intel MultiProcessor Specification v1.%d\n", mpf->mpf_specification); - - /* - * Now see if we need to read further. - */ - if (mpf->mpf_feature1 != 0) { - - printk(KERN_INFO "Default MP configuration #%d\n", mpf->mpf_feature1); - construct_default_ISA_mptable(mpf->mpf_feature1); - - } else if (mpf->mpf_physptr) { - - /* - * Read the physical hardware table. Anything here will - * override the defaults. - */ - if (!smp_read_mpc(phys_to_virt(mpf->mpf_physptr))) { - smp_found_config = 0; - printk(KERN_ERR "BIOS bug, MP table errors detected!...\n"); - printk(KERN_ERR "... disabling SMP support. (tell your hw vendor)\n"); - return; - } - /* - * If there are no explicit MP IRQ entries, then we are - * broken. We set up most of the low 16 IO-APIC pins to - * ISA defaults and hope it will work. - */ - if (!mp_irq_entries) { - struct mpc_config_bus bus; - - printk(KERN_ERR "BIOS bug, no explicit IRQ entries, using default mptable. (tell your hw vendor)\n"); - - bus.mpc_type = MP_BUS; - bus.mpc_busid = 0; - memcpy(bus.mpc_bustype, "ISA ", 6); - MP_bus_info(&bus); - - construct_default_ioirq_mptable(0); - } - - } else - BUG(); - - printk(KERN_INFO "Processors: %d\n", num_processors); - /* - * Only use the first configuration found. - */ -} - -static int __init smp_scan_config (unsigned long base, unsigned long length) -{ - extern void __bad_mpf_size(void); - unsigned int *bp = phys_to_virt(base); - struct intel_mp_floating *mpf; - - Dprintk("Scan SMP from %p for %ld bytes.\n", bp,length); - if (sizeof(*mpf) != 16) - __bad_mpf_size(); - - while (length > 0) { - mpf = (struct intel_mp_floating *)bp; - if ((*bp == SMP_MAGIC_IDENT) && - (mpf->mpf_length == 1) && - !mpf_checksum((unsigned char *)bp, 16) && - ((mpf->mpf_specification == 1) - || (mpf->mpf_specification == 4)) ) { - - smp_found_config = 1; - reserve_bootmem_generic(virt_to_phys(mpf), PAGE_SIZE); - if (mpf->mpf_physptr) - reserve_bootmem_generic(mpf->mpf_physptr, PAGE_SIZE); - mpf_found = mpf; - return 1; - } - bp += 4; - length -= 16; - } - return 0; -} - -void __init find_smp_config(void) -{ - unsigned int address; - - /* - * FIXME: Linux assumes you have 640K of base ram.. - * this continues the error... - * - * 1) Scan the bottom 1K for a signature - * 2) Scan the top 1K of base RAM - * 3) Scan the 64K of bios - */ - if (smp_scan_config(0x0,0x400) || - smp_scan_config(639*0x400,0x400) || - smp_scan_config(0xF0000,0x10000)) - return; - /* - * If it is an SMP machine we should know now. - * - * there is a real-mode segmented pointer pointing to the - * 4K EBDA area at 0x40E, calculate and scan it here. - * - * NOTE! There are Linux loaders that will corrupt the EBDA - * area, and as such this kind of SMP config may be less - * trustworthy, simply because the SMP table may have been - * stomped on during early boot. These loaders are buggy and - * should be fixed. - */ - - address = *(unsigned short *)phys_to_virt(0x40E); - address <<= 4; - if (smp_scan_config(address, 0x1000)) - return; - - /* If we have come this far, we did not find an MP table */ - printk(KERN_INFO "No mptable found.\n"); -} - -/* -------------------------------------------------------------------------- - ACPI-based MP Configuration - -------------------------------------------------------------------------- */ - -#ifdef CONFIG_ACPI - -void __init mp_register_lapic_address(u64 address) -{ - mp_lapic_addr = (unsigned long) address; - set_fixmap_nocache(FIX_APIC_BASE, mp_lapic_addr); - if (boot_cpu_id == -1U) - boot_cpu_id = GET_APIC_ID(apic_read(APIC_ID)); -} - -void __cpuinit mp_register_lapic (u8 id, u8 enabled) -{ - struct mpc_config_processor processor; - int boot_cpu = 0; - - if (id == boot_cpu_id) - boot_cpu = 1; - - processor.mpc_type = MP_PROCESSOR; - processor.mpc_apicid = id; - processor.mpc_apicver = 0; - processor.mpc_cpuflag = (enabled ? CPU_ENABLED : 0); - processor.mpc_cpuflag |= (boot_cpu ? CPU_BOOTPROCESSOR : 0); - processor.mpc_cpufeature = 0; - processor.mpc_featureflag = 0; - processor.mpc_reserved[0] = 0; - processor.mpc_reserved[1] = 0; - - MP_processor_info(&processor); -} - -#define MP_ISA_BUS 0 -#define MP_MAX_IOAPIC_PIN 127 - -static struct mp_ioapic_routing { - int apic_id; - int gsi_start; - int gsi_end; - u32 pin_programmed[4]; -} mp_ioapic_routing[MAX_IO_APICS]; - -static int mp_find_ioapic(int gsi) -{ - int i = 0; - - /* Find the IOAPIC that manages this GSI. */ - for (i = 0; i < nr_ioapics; i++) { - if ((gsi >= mp_ioapic_routing[i].gsi_start) - && (gsi <= mp_ioapic_routing[i].gsi_end)) - return i; - } - - printk(KERN_ERR "ERROR: Unable to locate IOAPIC for GSI %d\n", gsi); - return -1; -} - -static u8 uniq_ioapic_id(u8 id) -{ - int i; - DECLARE_BITMAP(used, 256); - bitmap_zero(used, 256); - for (i = 0; i < nr_ioapics; i++) { - struct mpc_config_ioapic *ia = &mp_ioapics[i]; - __set_bit(ia->mpc_apicid, used); - } - if (!test_bit(id, used)) - return id; - return find_first_zero_bit(used, 256); -} - -void __init mp_register_ioapic(u8 id, u32 address, u32 gsi_base) -{ - int idx = 0; - - if (bad_ioapic(address)) - return; - - idx = nr_ioapics; - - mp_ioapics[idx].mpc_type = MP_IOAPIC; - mp_ioapics[idx].mpc_flags = MPC_APIC_USABLE; - mp_ioapics[idx].mpc_apicaddr = address; - - set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address); - mp_ioapics[idx].mpc_apicid = uniq_ioapic_id(id); - mp_ioapics[idx].mpc_apicver = 0; - - /* - * Build basic IRQ lookup table to facilitate gsi->io_apic lookups - * and to prevent reprogramming of IOAPIC pins (PCI IRQs). - */ - mp_ioapic_routing[idx].apic_id = mp_ioapics[idx].mpc_apicid; - mp_ioapic_routing[idx].gsi_start = gsi_base; - mp_ioapic_routing[idx].gsi_end = gsi_base + - io_apic_get_redir_entries(idx); - - printk(KERN_INFO "IOAPIC[%d]: apic_id %d, address 0x%x, " - "GSI %d-%d\n", idx, mp_ioapics[idx].mpc_apicid, - mp_ioapics[idx].mpc_apicaddr, - mp_ioapic_routing[idx].gsi_start, - mp_ioapic_routing[idx].gsi_end); - - nr_ioapics++; -} - -void __init -mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, u32 gsi) -{ - struct mpc_config_intsrc intsrc; - int ioapic = -1; - int pin = -1; - - /* - * Convert 'gsi' to 'ioapic.pin'. - */ - ioapic = mp_find_ioapic(gsi); - if (ioapic < 0) - return; - pin = gsi - mp_ioapic_routing[ioapic].gsi_start; - - /* - * TBD: This check is for faulty timer entries, where the override - * erroneously sets the trigger to level, resulting in a HUGE - * increase of timer interrupts! - */ - if ((bus_irq == 0) && (trigger == 3)) - trigger = 1; - - intsrc.mpc_type = MP_INTSRC; - intsrc.mpc_irqtype = mp_INT; - intsrc.mpc_irqflag = (trigger << 2) | polarity; - intsrc.mpc_srcbus = MP_ISA_BUS; - intsrc.mpc_srcbusirq = bus_irq; /* IRQ */ - intsrc.mpc_dstapic = mp_ioapics[ioapic].mpc_apicid; /* APIC ID */ - intsrc.mpc_dstirq = pin; /* INTIN# */ - - Dprintk("Int: type %d, pol %d, trig %d, bus %d, irq %d, %d-%d\n", - intsrc.mpc_irqtype, intsrc.mpc_irqflag & 3, - (intsrc.mpc_irqflag >> 2) & 3, intsrc.mpc_srcbus, - intsrc.mpc_srcbusirq, intsrc.mpc_dstapic, intsrc.mpc_dstirq); - - mp_irqs[mp_irq_entries] = intsrc; - if (++mp_irq_entries == MAX_IRQ_SOURCES) - panic("Max # of irq sources exceeded!\n"); -} - -void __init mp_config_acpi_legacy_irqs(void) -{ - struct mpc_config_intsrc intsrc; - int i = 0; - int ioapic = -1; - - /* - * Fabricate the legacy ISA bus (bus #31). - */ - set_bit(MP_ISA_BUS, mp_bus_not_pci); - - /* - * Locate the IOAPIC that manages the ISA IRQs (0-15). - */ - ioapic = mp_find_ioapic(0); - if (ioapic < 0) - return; - - intsrc.mpc_type = MP_INTSRC; - intsrc.mpc_irqflag = 0; /* Conforming */ - intsrc.mpc_srcbus = MP_ISA_BUS; - intsrc.mpc_dstapic = mp_ioapics[ioapic].mpc_apicid; - - /* - * Use the default configuration for the IRQs 0-15. Unless - * overridden by (MADT) interrupt source override entries. - */ - for (i = 0; i < 16; i++) { - int idx; - - for (idx = 0; idx < mp_irq_entries; idx++) { - struct mpc_config_intsrc *irq = mp_irqs + idx; - - /* Do we already have a mapping for this ISA IRQ? */ - if (irq->mpc_srcbus == MP_ISA_BUS && irq->mpc_srcbusirq == i) - break; - - /* Do we already have a mapping for this IOAPIC pin */ - if ((irq->mpc_dstapic == intsrc.mpc_dstapic) && - (irq->mpc_dstirq == i)) - break; - } - - if (idx != mp_irq_entries) { - printk(KERN_DEBUG "ACPI: IRQ%d used by override.\n", i); - continue; /* IRQ already used */ - } - - intsrc.mpc_irqtype = mp_INT; - intsrc.mpc_srcbusirq = i; /* Identity mapped */ - intsrc.mpc_dstirq = i; - - Dprintk("Int: type %d, pol %d, trig %d, bus %d, irq %d, " - "%d-%d\n", intsrc.mpc_irqtype, intsrc.mpc_irqflag & 3, - (intsrc.mpc_irqflag >> 2) & 3, intsrc.mpc_srcbus, - intsrc.mpc_srcbusirq, intsrc.mpc_dstapic, - intsrc.mpc_dstirq); - - mp_irqs[mp_irq_entries] = intsrc; - if (++mp_irq_entries == MAX_IRQ_SOURCES) - panic("Max # of irq sources exceeded!\n"); - } -} - -int mp_register_gsi(u32 gsi, int triggering, int polarity) -{ - int ioapic = -1; - int ioapic_pin = 0; - int idx, bit = 0; - - if (acpi_irq_model != ACPI_IRQ_MODEL_IOAPIC) - return gsi; - - /* Don't set up the ACPI SCI because it's already set up */ - if (acpi_gbl_FADT.sci_interrupt == gsi) - return gsi; - - ioapic = mp_find_ioapic(gsi); - if (ioapic < 0) { - printk(KERN_WARNING "No IOAPIC for GSI %u\n", gsi); - return gsi; - } - - ioapic_pin = gsi - mp_ioapic_routing[ioapic].gsi_start; - - /* - * Avoid pin reprogramming. PRTs typically include entries - * with redundant pin->gsi mappings (but unique PCI devices); - * we only program the IOAPIC on the first. - */ - bit = ioapic_pin % 32; - idx = (ioapic_pin < 32) ? 0 : (ioapic_pin / 32); - if (idx > 3) { - printk(KERN_ERR "Invalid reference to IOAPIC pin " - "%d-%d\n", mp_ioapic_routing[ioapic].apic_id, - ioapic_pin); - return gsi; - } - if ((1<<bit) & mp_ioapic_routing[ioapic].pin_programmed[idx]) { - Dprintk(KERN_DEBUG "Pin %d-%d already programmed\n", - mp_ioapic_routing[ioapic].apic_id, ioapic_pin); - return gsi; - } - - mp_ioapic_routing[ioapic].pin_programmed[idx] |= (1<<bit); - - io_apic_set_pci_routing(ioapic, ioapic_pin, gsi, - triggering == ACPI_EDGE_SENSITIVE ? 0 : 1, - polarity == ACPI_ACTIVE_HIGH ? 0 : 1); - return gsi; -} -#endif /*CONFIG_ACPI*/ diff --git a/arch/x86_64/kernel/nmi.c b/arch/x86_64/kernel/nmi.c deleted file mode 100644 index 0ec6d2ddb931..000000000000 --- a/arch/x86_64/kernel/nmi.c +++ /dev/null @@ -1,483 +0,0 @@ -/* - * linux/arch/x86_64/nmi.c - * - * NMI watchdog support on APIC systems - * - * Started by Ingo Molnar <mingo@redhat.com> - * - * Fixes: - * Mikael Pettersson : AMD K7 support for local APIC NMI watchdog. - * Mikael Pettersson : Power Management for local APIC NMI watchdog. - * Pavel Machek and - * Mikael Pettersson : PM converted to driver model. Disable/enable API. - */ - -#include <linux/nmi.h> -#include <linux/mm.h> -#include <linux/delay.h> -#include <linux/interrupt.h> -#include <linux/module.h> -#include <linux/sysdev.h> -#include <linux/sysctl.h> -#include <linux/kprobes.h> -#include <linux/cpumask.h> -#include <linux/kdebug.h> - -#include <asm/smp.h> -#include <asm/nmi.h> -#include <asm/proto.h> -#include <asm/mce.h> - -int unknown_nmi_panic; -int nmi_watchdog_enabled; -int panic_on_unrecovered_nmi; - -static cpumask_t backtrace_mask = CPU_MASK_NONE; - -/* nmi_active: - * >0: the lapic NMI watchdog is active, but can be disabled - * <0: the lapic NMI watchdog has not been set up, and cannot - * be enabled - * 0: the lapic NMI watchdog is disabled, but can be enabled - */ -atomic_t nmi_active = ATOMIC_INIT(0); /* oprofile uses this */ -int panic_on_timeout; - -unsigned int nmi_watchdog = NMI_DEFAULT; -static unsigned int nmi_hz = HZ; - -static DEFINE_PER_CPU(short, wd_enabled); - -/* local prototypes */ -static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu); - -/* Run after command line and cpu_init init, but before all other checks */ -void nmi_watchdog_default(void) -{ - if (nmi_watchdog != NMI_DEFAULT) - return; - nmi_watchdog = NMI_NONE; -} - -static int endflag __initdata = 0; - -#ifdef CONFIG_SMP -/* The performance counters used by NMI_LOCAL_APIC don't trigger when - * the CPU is idle. To make sure the NMI watchdog really ticks on all - * CPUs during the test make them busy. - */ -static __init void nmi_cpu_busy(void *data) -{ - local_irq_enable_in_hardirq(); - /* Intentionally don't use cpu_relax here. This is - to make sure that the performance counter really ticks, - even if there is a simulator or similar that catches the - pause instruction. On a real HT machine this is fine because - all other CPUs are busy with "useless" delay loops and don't - care if they get somewhat less cycles. */ - while (endflag == 0) - mb(); -} -#endif - -int __init check_nmi_watchdog (void) -{ - int *counts; - int cpu; - - if ((nmi_watchdog == NMI_NONE) || (nmi_watchdog == NMI_DISABLED)) - return 0; - - if (!atomic_read(&nmi_active)) - return 0; - - counts = kmalloc(NR_CPUS * sizeof(int), GFP_KERNEL); - if (!counts) - return -1; - - printk(KERN_INFO "testing NMI watchdog ... "); - -#ifdef CONFIG_SMP - if (nmi_watchdog == NMI_LOCAL_APIC) - smp_call_function(nmi_cpu_busy, (void *)&endflag, 0, 0); -#endif - - for (cpu = 0; cpu < NR_CPUS; cpu++) - counts[cpu] = cpu_pda(cpu)->__nmi_count; - local_irq_enable(); - mdelay((20*1000)/nmi_hz); // wait 20 ticks - - for_each_online_cpu(cpu) { - if (!per_cpu(wd_enabled, cpu)) - continue; - if (cpu_pda(cpu)->__nmi_count - counts[cpu] <= 5) { - printk("CPU#%d: NMI appears to be stuck (%d->%d)!\n", - cpu, - counts[cpu], - cpu_pda(cpu)->__nmi_count); - per_cpu(wd_enabled, cpu) = 0; - atomic_dec(&nmi_active); - } - } - if (!atomic_read(&nmi_active)) { - kfree(counts); - atomic_set(&nmi_active, -1); - endflag = 1; - return -1; - } - endflag = 1; - printk("OK.\n"); - - /* now that we know it works we can reduce NMI frequency to - something more reasonable; makes a difference in some configs */ - if (nmi_watchdog == NMI_LOCAL_APIC) - nmi_hz = lapic_adjust_nmi_hz(1); - - kfree(counts); - return 0; -} - -int __init setup_nmi_watchdog(char *str) -{ - int nmi; - - if (!strncmp(str,"panic",5)) { - panic_on_timeout = 1; - str = strchr(str, ','); - if (!str) - return 1; - ++str; - } - - get_option(&str, &nmi); - - if ((nmi >= NMI_INVALID) || (nmi < NMI_NONE)) - return 0; - - nmi_watchdog = nmi; - return 1; -} - -__setup("nmi_watchdog=", setup_nmi_watchdog); - - -static void __acpi_nmi_disable(void *__unused) -{ - apic_write(APIC_LVT0, APIC_DM_NMI | APIC_LVT_MASKED); -} - -/* - * Disable timer based NMIs on all CPUs: - */ -void acpi_nmi_disable(void) -{ - if (atomic_read(&nmi_active) && nmi_watchdog == NMI_IO_APIC) - on_each_cpu(__acpi_nmi_disable, NULL, 0, 1); -} - -static void __acpi_nmi_enable(void *__unused) -{ - apic_write(APIC_LVT0, APIC_DM_NMI); -} - -/* - * Enable timer based NMIs on all CPUs: - */ -void acpi_nmi_enable(void) -{ - if (atomic_read(&nmi_active) && nmi_watchdog == NMI_IO_APIC) - on_each_cpu(__acpi_nmi_enable, NULL, 0, 1); -} -#ifdef CONFIG_PM - -static int nmi_pm_active; /* nmi_active before suspend */ - -static int lapic_nmi_suspend(struct sys_device *dev, pm_message_t state) -{ - /* only CPU0 goes here, other CPUs should be offline */ - nmi_pm_active = atomic_read(&nmi_active); - stop_apic_nmi_watchdog(NULL); - BUG_ON(atomic_read(&nmi_active) != 0); - return 0; -} - -static int lapic_nmi_resume(struct sys_device *dev) -{ - /* only CPU0 goes here, other CPUs should be offline */ - if (nmi_pm_active > 0) { - setup_apic_nmi_watchdog(NULL); - touch_nmi_watchdog(); - } - return 0; -} - -static struct sysdev_class nmi_sysclass = { - set_kset_name("lapic_nmi"), - .resume = lapic_nmi_resume, - .suspend = lapic_nmi_suspend, -}; - -static struct sys_device device_lapic_nmi = { - .id = 0, - .cls = &nmi_sysclass, -}; - -static int __init init_lapic_nmi_sysfs(void) -{ - int error; - - /* should really be a BUG_ON but b/c this is an - * init call, it just doesn't work. -dcz - */ - if (nmi_watchdog != NMI_LOCAL_APIC) - return 0; - - if ( atomic_read(&nmi_active) < 0 ) - return 0; - - error = sysdev_class_register(&nmi_sysclass); - if (!error) - error = sysdev_register(&device_lapic_nmi); - return error; -} -/* must come after the local APIC's device_initcall() */ -late_initcall(init_lapic_nmi_sysfs); - -#endif /* CONFIG_PM */ - -void setup_apic_nmi_watchdog(void *unused) -{ - if (__get_cpu_var(wd_enabled) == 1) - return; - - /* cheap hack to support suspend/resume */ - /* if cpu0 is not active neither should the other cpus */ - if ((smp_processor_id() != 0) && (atomic_read(&nmi_active) <= 0)) - return; - - switch (nmi_watchdog) { - case NMI_LOCAL_APIC: - __get_cpu_var(wd_enabled) = 1; - if (lapic_watchdog_init(nmi_hz) < 0) { - __get_cpu_var(wd_enabled) = 0; - return; - } - /* FALL THROUGH */ - case NMI_IO_APIC: - __get_cpu_var(wd_enabled) = 1; - atomic_inc(&nmi_active); - } -} - -void stop_apic_nmi_watchdog(void *unused) -{ - /* only support LOCAL and IO APICs for now */ - if ((nmi_watchdog != NMI_LOCAL_APIC) && - (nmi_watchdog != NMI_IO_APIC)) - return; - if (__get_cpu_var(wd_enabled) == 0) - return; - if (nmi_watchdog == NMI_LOCAL_APIC) - lapic_watchdog_stop(); - __get_cpu_var(wd_enabled) = 0; - atomic_dec(&nmi_active); -} - -/* - * the best way to detect whether a CPU has a 'hard lockup' problem - * is to check it's local APIC timer IRQ counts. If they are not - * changing then that CPU has some problem. - * - * as these watchdog NMI IRQs are generated on every CPU, we only - * have to check the current processor. - */ - -static DEFINE_PER_CPU(unsigned, last_irq_sum); -static DEFINE_PER_CPU(local_t, alert_counter); -static DEFINE_PER_CPU(int, nmi_touch); - -void touch_nmi_watchdog(void) -{ - if (nmi_watchdog > 0) { - unsigned cpu; - - /* - * Tell other CPUs to reset their alert counters. We cannot - * do it ourselves because the alert count increase is not - * atomic. - */ - for_each_present_cpu(cpu) { - if (per_cpu(nmi_touch, cpu) != 1) - per_cpu(nmi_touch, cpu) = 1; - } - } - - touch_softlockup_watchdog(); -} - -int __kprobes nmi_watchdog_tick(struct pt_regs * regs, unsigned reason) -{ - int sum; - int touched = 0; - int cpu = smp_processor_id(); - int rc = 0; - - /* check for other users first */ - if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) - == NOTIFY_STOP) { - rc = 1; - touched = 1; - } - - sum = read_pda(apic_timer_irqs); - if (__get_cpu_var(nmi_touch)) { - __get_cpu_var(nmi_touch) = 0; - touched = 1; - } - - if (cpu_isset(cpu, backtrace_mask)) { - static DEFINE_SPINLOCK(lock); /* Serialise the printks */ - - spin_lock(&lock); - printk("NMI backtrace for cpu %d\n", cpu); - dump_stack(); - spin_unlock(&lock); - cpu_clear(cpu, backtrace_mask); - } - -#ifdef CONFIG_X86_MCE - /* Could check oops_in_progress here too, but it's safer - not too */ - if (atomic_read(&mce_entry) > 0) - touched = 1; -#endif - /* if the apic timer isn't firing, this cpu isn't doing much */ - if (!touched && __get_cpu_var(last_irq_sum) == sum) { - /* - * Ayiee, looks like this CPU is stuck ... - * wait a few IRQs (5 seconds) before doing the oops ... - */ - local_inc(&__get_cpu_var(alert_counter)); - if (local_read(&__get_cpu_var(alert_counter)) == 5*nmi_hz) - die_nmi("NMI Watchdog detected LOCKUP on CPU %d\n", regs, - panic_on_timeout); - } else { - __get_cpu_var(last_irq_sum) = sum; - local_set(&__get_cpu_var(alert_counter), 0); - } - - /* see if the nmi watchdog went off */ - if (!__get_cpu_var(wd_enabled)) - return rc; - switch (nmi_watchdog) { - case NMI_LOCAL_APIC: - rc |= lapic_wd_event(nmi_hz); - break; - case NMI_IO_APIC: - /* don't know how to accurately check for this. - * just assume it was a watchdog timer interrupt - * This matches the old behaviour. - */ - rc = 1; - break; - } - return rc; -} - -static unsigned ignore_nmis; - -asmlinkage __kprobes void do_nmi(struct pt_regs * regs, long error_code) -{ - nmi_enter(); - add_pda(__nmi_count,1); - if (!ignore_nmis) - default_do_nmi(regs); - nmi_exit(); -} - -int do_nmi_callback(struct pt_regs * regs, int cpu) -{ -#ifdef CONFIG_SYSCTL - if (unknown_nmi_panic) - return unknown_nmi_panic_callback(regs, cpu); -#endif - return 0; -} - -void stop_nmi(void) -{ - acpi_nmi_disable(); - ignore_nmis++; -} - -void restart_nmi(void) -{ - ignore_nmis--; - acpi_nmi_enable(); -} - -#ifdef CONFIG_SYSCTL - -static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu) -{ - unsigned char reason = get_nmi_reason(); - char buf[64]; - - sprintf(buf, "NMI received for unknown reason %02x\n", reason); - die_nmi(buf, regs, 1); /* Always panic here */ - return 0; -} - -/* - * proc handler for /proc/sys/kernel/nmi - */ -int proc_nmi_enabled(struct ctl_table *table, int write, struct file *file, - void __user *buffer, size_t *length, loff_t *ppos) -{ - int old_state; - - nmi_watchdog_enabled = (atomic_read(&nmi_active) > 0) ? 1 : 0; - old_state = nmi_watchdog_enabled; - proc_dointvec(table, write, file, buffer, length, ppos); - if (!!old_state == !!nmi_watchdog_enabled) - return 0; - - if (atomic_read(&nmi_active) < 0 || nmi_watchdog == NMI_DISABLED) { - printk( KERN_WARNING "NMI watchdog is permanently disabled\n"); - return -EIO; - } - - /* if nmi_watchdog is not set yet, then set it */ - nmi_watchdog_default(); - - if (nmi_watchdog == NMI_LOCAL_APIC) { - if (nmi_watchdog_enabled) - enable_lapic_nmi_watchdog(); - else - disable_lapic_nmi_watchdog(); - } else { - printk( KERN_WARNING - "NMI watchdog doesn't know what hardware to touch\n"); - return -EIO; - } - return 0; -} - -#endif - -void __trigger_all_cpu_backtrace(void) -{ - int i; - - backtrace_mask = cpu_online_map; - /* Wait for up to 10 seconds for all CPUs to do the backtrace */ - for (i = 0; i < 10 * 1000; i++) { - if (cpus_empty(backtrace_mask)) - break; - mdelay(1); - } -} - -EXPORT_SYMBOL(nmi_active); -EXPORT_SYMBOL(nmi_watchdog); -EXPORT_SYMBOL(touch_nmi_watchdog); diff --git a/arch/x86_64/kernel/pci-calgary.c b/arch/x86_64/kernel/pci-calgary.c deleted file mode 100644 index 71da01e73f03..000000000000 --- a/arch/x86_64/kernel/pci-calgary.c +++ /dev/null @@ -1,1578 +0,0 @@ -/* - * Derived from arch/powerpc/kernel/iommu.c - * - * Copyright IBM Corporation, 2006-2007 - * Copyright (C) 2006 Jon Mason <jdmason@kudzu.us> - * - * Author: Jon Mason <jdmason@kudzu.us> - * Author: Muli Ben-Yehuda <muli@il.ibm.com> - - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - */ - -#include <linux/kernel.h> -#include <linux/init.h> -#include <linux/types.h> -#include <linux/slab.h> -#include <linux/mm.h> -#include <linux/spinlock.h> -#include <linux/string.h> -#include <linux/dma-mapping.h> -#include <linux/init.h> -#include <linux/bitops.h> -#include <linux/pci_ids.h> -#include <linux/pci.h> -#include <linux/delay.h> -#include <asm/iommu.h> -#include <asm/calgary.h> -#include <asm/tce.h> -#include <asm/pci-direct.h> -#include <asm/system.h> -#include <asm/dma.h> -#include <asm/rio.h> - -#ifdef CONFIG_CALGARY_IOMMU_ENABLED_BY_DEFAULT -int use_calgary __read_mostly = 1; -#else -int use_calgary __read_mostly = 0; -#endif /* CONFIG_CALGARY_DEFAULT_ENABLED */ - -#define PCI_DEVICE_ID_IBM_CALGARY 0x02a1 -#define PCI_DEVICE_ID_IBM_CALIOC2 0x0308 - -/* register offsets inside the host bridge space */ -#define CALGARY_CONFIG_REG 0x0108 -#define PHB_CSR_OFFSET 0x0110 /* Channel Status */ -#define PHB_PLSSR_OFFSET 0x0120 -#define PHB_CONFIG_RW_OFFSET 0x0160 -#define PHB_IOBASE_BAR_LOW 0x0170 -#define PHB_IOBASE_BAR_HIGH 0x0180 -#define PHB_MEM_1_LOW 0x0190 -#define PHB_MEM_1_HIGH 0x01A0 -#define PHB_IO_ADDR_SIZE 0x01B0 -#define PHB_MEM_1_SIZE 0x01C0 -#define PHB_MEM_ST_OFFSET 0x01D0 -#define PHB_AER_OFFSET 0x0200 -#define PHB_CONFIG_0_HIGH 0x0220 -#define PHB_CONFIG_0_LOW 0x0230 -#define PHB_CONFIG_0_END 0x0240 -#define PHB_MEM_2_LOW 0x02B0 -#define PHB_MEM_2_HIGH 0x02C0 -#define PHB_MEM_2_SIZE_HIGH 0x02D0 -#define PHB_MEM_2_SIZE_LOW 0x02E0 -#define PHB_DOSHOLE_OFFSET 0x08E0 - -/* CalIOC2 specific */ -#define PHB_SAVIOR_L2 0x0DB0 -#define PHB_PAGE_MIG_CTRL 0x0DA8 -#define PHB_PAGE_MIG_DEBUG 0x0DA0 -#define PHB_ROOT_COMPLEX_STATUS 0x0CB0 - -/* PHB_CONFIG_RW */ -#define PHB_TCE_ENABLE 0x20000000 -#define PHB_SLOT_DISABLE 0x1C000000 -#define PHB_DAC_DISABLE 0x01000000 -#define PHB_MEM2_ENABLE 0x00400000 -#define PHB_MCSR_ENABLE 0x00100000 -/* TAR (Table Address Register) */ -#define TAR_SW_BITS 0x0000ffffffff800fUL -#define TAR_VALID 0x0000000000000008UL -/* CSR (Channel/DMA Status Register) */ -#define CSR_AGENT_MASK 0xffe0ffff -/* CCR (Calgary Configuration Register) */ -#define CCR_2SEC_TIMEOUT 0x000000000000000EUL -/* PMCR/PMDR (Page Migration Control/Debug Registers */ -#define PMR_SOFTSTOP 0x80000000 -#define PMR_SOFTSTOPFAULT 0x40000000 -#define PMR_HARDSTOP 0x20000000 - -#define MAX_NUM_OF_PHBS 8 /* how many PHBs in total? */ -#define MAX_NUM_CHASSIS 8 /* max number of chassis */ -/* MAX_PHB_BUS_NUM is the maximal possible dev->bus->number */ -#define MAX_PHB_BUS_NUM (MAX_NUM_OF_PHBS * MAX_NUM_CHASSIS * 2) -#define PHBS_PER_CALGARY 4 - -/* register offsets in Calgary's internal register space */ -static const unsigned long tar_offsets[] = { - 0x0580 /* TAR0 */, - 0x0588 /* TAR1 */, - 0x0590 /* TAR2 */, - 0x0598 /* TAR3 */ -}; - -static const unsigned long split_queue_offsets[] = { - 0x4870 /* SPLIT QUEUE 0 */, - 0x5870 /* SPLIT QUEUE 1 */, - 0x6870 /* SPLIT QUEUE 2 */, - 0x7870 /* SPLIT QUEUE 3 */ -}; - -static const unsigned long phb_offsets[] = { - 0x8000 /* PHB0 */, - 0x9000 /* PHB1 */, - 0xA000 /* PHB2 */, - 0xB000 /* PHB3 */ -}; - -/* PHB debug registers */ - -static const unsigned long phb_debug_offsets[] = { - 0x4000 /* PHB 0 DEBUG */, - 0x5000 /* PHB 1 DEBUG */, - 0x6000 /* PHB 2 DEBUG */, - 0x7000 /* PHB 3 DEBUG */ -}; - -/* - * STUFF register for each debug PHB, - * byte 1 = start bus number, byte 2 = end bus number - */ - -#define PHB_DEBUG_STUFF_OFFSET 0x0020 - -#define EMERGENCY_PAGES 32 /* = 128KB */ - -unsigned int specified_table_size = TCE_TABLE_SIZE_UNSPECIFIED; -static int translate_empty_slots __read_mostly = 0; -static int calgary_detected __read_mostly = 0; - -static struct rio_table_hdr *rio_table_hdr __initdata; -static struct scal_detail *scal_devs[MAX_NUMNODES] __initdata; -static struct rio_detail *rio_devs[MAX_NUMNODES * 4] __initdata; - -struct calgary_bus_info { - void *tce_space; - unsigned char translation_disabled; - signed char phbid; - void __iomem *bbar; -}; - -static void calgary_handle_quirks(struct iommu_table *tbl, struct pci_dev *dev); -static void calgary_tce_cache_blast(struct iommu_table *tbl); -static void calgary_dump_error_regs(struct iommu_table *tbl); -static void calioc2_handle_quirks(struct iommu_table *tbl, struct pci_dev *dev); -static void calioc2_tce_cache_blast(struct iommu_table *tbl); -static void calioc2_dump_error_regs(struct iommu_table *tbl); - -static struct cal_chipset_ops calgary_chip_ops = { - .handle_quirks = calgary_handle_quirks, - .tce_cache_blast = calgary_tce_cache_blast, - .dump_error_regs = calgary_dump_error_regs -}; - -static struct cal_chipset_ops calioc2_chip_ops = { - .handle_quirks = calioc2_handle_quirks, - .tce_cache_blast = calioc2_tce_cache_blast, - .dump_error_regs = calioc2_dump_error_regs -}; - -static struct calgary_bus_info bus_info[MAX_PHB_BUS_NUM] = { { NULL, 0, 0 }, }; - -/* enable this to stress test the chip's TCE cache */ -#ifdef CONFIG_IOMMU_DEBUG -int debugging __read_mostly = 1; - -static inline unsigned long verify_bit_range(unsigned long* bitmap, - int expected, unsigned long start, unsigned long end) -{ - unsigned long idx = start; - - BUG_ON(start >= end); - - while (idx < end) { - if (!!test_bit(idx, bitmap) != expected) - return idx; - ++idx; - } - - /* all bits have the expected value */ - return ~0UL; -} -#else /* debugging is disabled */ -int debugging __read_mostly = 0; - -static inline unsigned long verify_bit_range(unsigned long* bitmap, - int expected, unsigned long start, unsigned long end) -{ - return ~0UL; -} - -#endif /* CONFIG_IOMMU_DEBUG */ - -static inline unsigned int num_dma_pages(unsigned long dma, unsigned int dmalen) -{ - unsigned int npages; - - npages = PAGE_ALIGN(dma + dmalen) - (dma & PAGE_MASK); - npages >>= PAGE_SHIFT; - - return npages; -} - -static inline int translate_phb(struct pci_dev* dev) -{ - int disabled = bus_info[dev->bus->number].translation_disabled; - return !disabled; -} - -static void iommu_range_reserve(struct iommu_table *tbl, - unsigned long start_addr, unsigned int npages) -{ - unsigned long index; - unsigned long end; - unsigned long badbit; - unsigned long flags; - - index = start_addr >> PAGE_SHIFT; - - /* bail out if we're asked to reserve a region we don't cover */ - if (index >= tbl->it_size) - return; - - end = index + npages; - if (end > tbl->it_size) /* don't go off the table */ - end = tbl->it_size; - - spin_lock_irqsave(&tbl->it_lock, flags); - - badbit = verify_bit_range(tbl->it_map, 0, index, end); - if (badbit != ~0UL) { - if (printk_ratelimit()) - printk(KERN_ERR "Calgary: entry already allocated at " - "0x%lx tbl %p dma 0x%lx npages %u\n", - badbit, tbl, start_addr, npages); - } - - set_bit_string(tbl->it_map, index, npages); - - spin_unlock_irqrestore(&tbl->it_lock, flags); -} - -static unsigned long iommu_range_alloc(struct iommu_table *tbl, - unsigned int npages) -{ - unsigned long flags; - unsigned long offset; - - BUG_ON(npages == 0); - - spin_lock_irqsave(&tbl->it_lock, flags); - - offset = find_next_zero_string(tbl->it_map, tbl->it_hint, - tbl->it_size, npages); - if (offset == ~0UL) { - tbl->chip_ops->tce_cache_blast(tbl); - offset = find_next_zero_string(tbl->it_map, 0, - tbl->it_size, npages); - if (offset == ~0UL) { - printk(KERN_WARNING "Calgary: IOMMU full.\n"); - spin_unlock_irqrestore(&tbl->it_lock, flags); - if (panic_on_overflow) - panic("Calgary: fix the allocator.\n"); - else - return bad_dma_address; - } - } - - set_bit_string(tbl->it_map, offset, npages); - tbl->it_hint = offset + npages; - BUG_ON(tbl->it_hint > tbl->it_size); - - spin_unlock_irqrestore(&tbl->it_lock, flags); - - return offset; -} - -static dma_addr_t iommu_alloc(struct iommu_table *tbl, void *vaddr, - unsigned int npages, int direction) -{ - unsigned long entry; - dma_addr_t ret = bad_dma_address; - - entry = iommu_range_alloc(tbl, npages); - - if (unlikely(entry == bad_dma_address)) - goto error; - - /* set the return dma address */ - ret = (entry << PAGE_SHIFT) | ((unsigned long)vaddr & ~PAGE_MASK); - - /* put the TCEs in the HW table */ - tce_build(tbl, entry, npages, (unsigned long)vaddr & PAGE_MASK, - direction); - - return ret; - -error: - printk(KERN_WARNING "Calgary: failed to allocate %u pages in " - "iommu %p\n", npages, tbl); - return bad_dma_address; -} - -static void iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr, - unsigned int npages) -{ - unsigned long entry; - unsigned long badbit; - unsigned long badend; - unsigned long flags; - - /* were we called with bad_dma_address? */ - badend = bad_dma_address + (EMERGENCY_PAGES * PAGE_SIZE); - if (unlikely((dma_addr >= bad_dma_address) && (dma_addr < badend))) { - printk(KERN_ERR "Calgary: driver tried unmapping bad DMA " - "address 0x%Lx\n", dma_addr); - WARN_ON(1); - return; - } - - entry = dma_addr >> PAGE_SHIFT; - - BUG_ON(entry + npages > tbl->it_size); - - tce_free(tbl, entry, npages); - - spin_lock_irqsave(&tbl->it_lock, flags); - - badbit = verify_bit_range(tbl->it_map, 1, entry, entry + npages); - if (badbit != ~0UL) { - if (printk_ratelimit()) - printk(KERN_ERR "Calgary: bit is off at 0x%lx " - "tbl %p dma 0x%Lx entry 0x%lx npages %u\n", - badbit, tbl, dma_addr, entry, npages); - } - - __clear_bit_string(tbl->it_map, entry, npages); - - spin_unlock_irqrestore(&tbl->it_lock, flags); -} - -static inline struct iommu_table *find_iommu_table(struct device *dev) -{ - struct pci_dev *pdev; - struct pci_bus *pbus; - struct iommu_table *tbl; - - pdev = to_pci_dev(dev); - - pbus = pdev->bus; - - /* is the device behind a bridge? Look for the root bus */ - while (pbus->parent) - pbus = pbus->parent; - - tbl = pci_iommu(pbus); - - BUG_ON(tbl && (tbl->it_busno != pbus->number)); - - return tbl; -} - -static void calgary_unmap_sg(struct device *dev, - struct scatterlist *sglist, int nelems, int direction) -{ - struct iommu_table *tbl = find_iommu_table(dev); - - if (!translate_phb(to_pci_dev(dev))) - return; - - while (nelems--) { - unsigned int npages; - dma_addr_t dma = sglist->dma_address; - unsigned int dmalen = sglist->dma_length; - - if (dmalen == 0) - break; - - npages = num_dma_pages(dma, dmalen); - iommu_free(tbl, dma, npages); - sglist++; - } -} - -static int calgary_nontranslate_map_sg(struct device* dev, - struct scatterlist *sg, int nelems, int direction) -{ - int i; - - for (i = 0; i < nelems; i++ ) { - struct scatterlist *s = &sg[i]; - BUG_ON(!s->page); - s->dma_address = virt_to_bus(page_address(s->page) +s->offset); - s->dma_length = s->length; - } - return nelems; -} - -static int calgary_map_sg(struct device *dev, struct scatterlist *sg, - int nelems, int direction) -{ - struct iommu_table *tbl = find_iommu_table(dev); - unsigned long vaddr; - unsigned int npages; - unsigned long entry; - int i; - - if (!translate_phb(to_pci_dev(dev))) - return calgary_nontranslate_map_sg(dev, sg, nelems, direction); - - for (i = 0; i < nelems; i++ ) { - struct scatterlist *s = &sg[i]; - BUG_ON(!s->page); - - vaddr = (unsigned long)page_address(s->page) + s->offset; - npages = num_dma_pages(vaddr, s->length); - - entry = iommu_range_alloc(tbl, npages); - if (entry == bad_dma_address) { - /* makes sure unmap knows to stop */ - s->dma_length = 0; - goto error; - } - - s->dma_address = (entry << PAGE_SHIFT) | s->offset; - - /* insert into HW table */ - tce_build(tbl, entry, npages, vaddr & PAGE_MASK, - direction); - - s->dma_length = s->length; - } - - return nelems; -error: - calgary_unmap_sg(dev, sg, nelems, direction); - for (i = 0; i < nelems; i++) { - sg[i].dma_address = bad_dma_address; - sg[i].dma_length = 0; - } - return 0; -} - -static dma_addr_t calgary_map_single(struct device *dev, void *vaddr, - size_t size, int direction) -{ - dma_addr_t dma_handle = bad_dma_address; - unsigned long uaddr; - unsigned int npages; - struct iommu_table *tbl = find_iommu_table(dev); - - uaddr = (unsigned long)vaddr; - npages = num_dma_pages(uaddr, size); - - if (translate_phb(to_pci_dev(dev))) - dma_handle = iommu_alloc(tbl, vaddr, npages, direction); - else - dma_handle = virt_to_bus(vaddr); - - return dma_handle; -} - -static void calgary_unmap_single(struct device *dev, dma_addr_t dma_handle, - size_t size, int direction) -{ - struct iommu_table *tbl = find_iommu_table(dev); - unsigned int npages; - - if (!translate_phb(to_pci_dev(dev))) - return; - - npages = num_dma_pages(dma_handle, size); - iommu_free(tbl, dma_handle, npages); -} - -static void* calgary_alloc_coherent(struct device *dev, size_t size, - dma_addr_t *dma_handle, gfp_t flag) -{ - void *ret = NULL; - dma_addr_t mapping; - unsigned int npages, order; - struct iommu_table *tbl = find_iommu_table(dev); - - size = PAGE_ALIGN(size); /* size rounded up to full pages */ - npages = size >> PAGE_SHIFT; - order = get_order(size); - - /* alloc enough pages (and possibly more) */ - ret = (void *)__get_free_pages(flag, order); - if (!ret) - goto error; - memset(ret, 0, size); - - if (translate_phb(to_pci_dev(dev))) { - /* set up tces to cover the allocated range */ - mapping = iommu_alloc(tbl, ret, npages, DMA_BIDIRECTIONAL); - if (mapping == bad_dma_address) - goto free; - - *dma_handle = mapping; - } else /* non translated slot */ - *dma_handle = virt_to_bus(ret); - - return ret; - -free: - free_pages((unsigned long)ret, get_order(size)); - ret = NULL; -error: - return ret; -} - -static const struct dma_mapping_ops calgary_dma_ops = { - .alloc_coherent = calgary_alloc_coherent, - .map_single = calgary_map_single, - .unmap_single = calgary_unmap_single, - .map_sg = calgary_map_sg, - .unmap_sg = calgary_unmap_sg, -}; - -static inline void __iomem * busno_to_bbar(unsigned char num) -{ - return bus_info[num].bbar; -} - -static inline int busno_to_phbid(unsigned char num) -{ - return bus_info[num].phbid; -} - -static inline unsigned long split_queue_offset(unsigned char num) -{ - size_t idx = busno_to_phbid(num); - - return split_queue_offsets[idx]; -} - -static inline unsigned long tar_offset(unsigned char num) -{ - size_t idx = busno_to_phbid(num); - - return tar_offsets[idx]; -} - -static inline unsigned long phb_offset(unsigned char num) -{ - size_t idx = busno_to_phbid(num); - - return phb_offsets[idx]; -} - -static inline void __iomem* calgary_reg(void __iomem *bar, unsigned long offset) -{ - unsigned long target = ((unsigned long)bar) | offset; - return (void __iomem*)target; -} - -static inline int is_calioc2(unsigned short device) -{ - return (device == PCI_DEVICE_ID_IBM_CALIOC2); -} - -static inline int is_calgary(unsigned short device) -{ - return (device == PCI_DEVICE_ID_IBM_CALGARY); -} - -static inline int is_cal_pci_dev(unsigned short device) -{ - return (is_calgary(device) || is_calioc2(device)); -} - -static void calgary_tce_cache_blast(struct iommu_table *tbl) -{ - u64 val; - u32 aer; - int i = 0; - void __iomem *bbar = tbl->bbar; - void __iomem *target; - - /* disable arbitration on the bus */ - target = calgary_reg(bbar, phb_offset(tbl->it_busno) | PHB_AER_OFFSET); - aer = readl(target); - writel(0, target); - - /* read plssr to ensure it got there */ - target = calgary_reg(bbar, phb_offset(tbl->it_busno) | PHB_PLSSR_OFFSET); - val = readl(target); - - /* poll split queues until all DMA activity is done */ - target = calgary_reg(bbar, split_queue_offset(tbl->it_busno)); - do { - val = readq(target); - i++; - } while ((val & 0xff) != 0xff && i < 100); - if (i == 100) - printk(KERN_WARNING "Calgary: PCI bus not quiesced, " - "continuing anyway\n"); - - /* invalidate TCE cache */ - target = calgary_reg(bbar, tar_offset(tbl->it_busno)); - writeq(tbl->tar_val, target); - - /* enable arbitration */ - target = calgary_reg(bbar, phb_offset(tbl->it_busno) | PHB_AER_OFFSET); - writel(aer, target); - (void)readl(target); /* flush */ -} - -static void calioc2_tce_cache_blast(struct iommu_table *tbl) -{ - void __iomem *bbar = tbl->bbar; - void __iomem *target; - u64 val64; - u32 val; - int i = 0; - int count = 1; - unsigned char bus = tbl->it_busno; - -begin: - printk(KERN_DEBUG "Calgary: CalIOC2 bus 0x%x entering tce cache blast " - "sequence - count %d\n", bus, count); - - /* 1. using the Page Migration Control reg set SoftStop */ - target = calgary_reg(bbar, phb_offset(bus) | PHB_PAGE_MIG_CTRL); - val = be32_to_cpu(readl(target)); - printk(KERN_DEBUG "1a. read 0x%x [LE] from %p\n", val, target); - val |= PMR_SOFTSTOP; - printk(KERN_DEBUG "1b. writing 0x%x [LE] to %p\n", val, target); - writel(cpu_to_be32(val), target); - - /* 2. poll split queues until all DMA activity is done */ - printk(KERN_DEBUG "2a. starting to poll split queues\n"); - target = calgary_reg(bbar, split_queue_offset(bus)); - do { - val64 = readq(target); - i++; - } while ((val64 & 0xff) != 0xff && i < 100); - if (i == 100) - printk(KERN_WARNING "CalIOC2: PCI bus not quiesced, " - "continuing anyway\n"); - - /* 3. poll Page Migration DEBUG for SoftStopFault */ - target = calgary_reg(bbar, phb_offset(bus) | PHB_PAGE_MIG_DEBUG); - val = be32_to_cpu(readl(target)); - printk(KERN_DEBUG "3. read 0x%x [LE] from %p\n", val, target); - - /* 4. if SoftStopFault - goto (1) */ - if (val & PMR_SOFTSTOPFAULT) { - if (++count < 100) - goto begin; - else { - printk(KERN_WARNING "CalIOC2: too many SoftStopFaults, " - "aborting TCE cache flush sequence!\n"); - return; /* pray for the best */ - } - } - - /* 5. Slam into HardStop by reading PHB_PAGE_MIG_CTRL */ - target = calgary_reg(bbar, phb_offset(bus) | PHB_PAGE_MIG_CTRL); - printk(KERN_DEBUG "5a. slamming into HardStop by reading %p\n", target); - val = be32_to_cpu(readl(target)); - printk(KERN_DEBUG "5b. read 0x%x [LE] from %p\n", val, target); - target = calgary_reg(bbar, phb_offset(bus) | PHB_PAGE_MIG_DEBUG); - val = be32_to_cpu(readl(target)); - printk(KERN_DEBUG "5c. read 0x%x [LE] from %p (debug)\n", val, target); - - /* 6. invalidate TCE cache */ - printk(KERN_DEBUG "6. invalidating TCE cache\n"); - target = calgary_reg(bbar, tar_offset(bus)); - writeq(tbl->tar_val, target); - - /* 7. Re-read PMCR */ - printk(KERN_DEBUG "7a. Re-reading PMCR\n"); - target = calgary_reg(bbar, phb_offset(bus) | PHB_PAGE_MIG_CTRL); - val = be32_to_cpu(readl(target)); - printk(KERN_DEBUG "7b. read 0x%x [LE] from %p\n", val, target); - - /* 8. Remove HardStop */ - printk(KERN_DEBUG "8a. removing HardStop from PMCR\n"); - target = calgary_reg(bbar, phb_offset(bus) | PHB_PAGE_MIG_CTRL); - val = 0; - printk(KERN_DEBUG "8b. writing 0x%x [LE] to %p\n", val, target); - writel(cpu_to_be32(val), target); - val = be32_to_cpu(readl(target)); - printk(KERN_DEBUG "8c. read 0x%x [LE] from %p\n", val, target); -} - -static void __init calgary_reserve_mem_region(struct pci_dev *dev, u64 start, - u64 limit) -{ - unsigned int numpages; - - limit = limit | 0xfffff; - limit++; - - numpages = ((limit - start) >> PAGE_SHIFT); - iommu_range_reserve(pci_iommu(dev->bus), start, numpages); -} - -static void __init calgary_reserve_peripheral_mem_1(struct pci_dev *dev) -{ - void __iomem *target; - u64 low, high, sizelow; - u64 start, limit; - struct iommu_table *tbl = pci_iommu(dev->bus); - unsigned char busnum = dev->bus->number; - void __iomem *bbar = tbl->bbar; - - /* peripheral MEM_1 region */ - target = calgary_reg(bbar, phb_offset(busnum) | PHB_MEM_1_LOW); - low = be32_to_cpu(readl(target)); - target = calgary_reg(bbar, phb_offset(busnum) | PHB_MEM_1_HIGH); - high = be32_to_cpu(readl(target)); - target = calgary_reg(bbar, phb_offset(busnum) | PHB_MEM_1_SIZE); - sizelow = be32_to_cpu(readl(target)); - - start = (high << 32) | low; - limit = sizelow; - - calgary_reserve_mem_region(dev, start, limit); -} - -static void __init calgary_reserve_peripheral_mem_2(struct pci_dev *dev) -{ - void __iomem *target; - u32 val32; - u64 low, high, sizelow, sizehigh; - u64 start, limit; - struct iommu_table *tbl = pci_iommu(dev->bus); - unsigned char busnum = dev->bus->number; - void __iomem *bbar = tbl->bbar; - - /* is it enabled? */ - target = calgary_reg(bbar, phb_offset(busnum) | PHB_CONFIG_RW_OFFSET); - val32 = be32_to_cpu(readl(target)); - if (!(val32 & PHB_MEM2_ENABLE)) - return; - - target = calgary_reg(bbar, phb_offset(busnum) | PHB_MEM_2_LOW); - low = be32_to_cpu(readl(target)); - target = calgary_reg(bbar, phb_offset(busnum) | PHB_MEM_2_HIGH); - high = be32_to_cpu(readl(target)); - target = calgary_reg(bbar, phb_offset(busnum) | PHB_MEM_2_SIZE_LOW); - sizelow = be32_to_cpu(readl(target)); - target = calgary_reg(bbar, phb_offset(busnum) | PHB_MEM_2_SIZE_HIGH); - sizehigh = be32_to_cpu(readl(target)); - - start = (high << 32) | low; - limit = (sizehigh << 32) | sizelow; - - calgary_reserve_mem_region(dev, start, limit); -} - -/* - * some regions of the IO address space do not get translated, so we - * must not give devices IO addresses in those regions. The regions - * are the 640KB-1MB region and the two PCI peripheral memory holes. - * Reserve all of them in the IOMMU bitmap to avoid giving them out - * later. - */ -static void __init calgary_reserve_regions(struct pci_dev *dev) -{ - unsigned int npages; - u64 start; - struct iommu_table *tbl = pci_iommu(dev->bus); - - /* reserve EMERGENCY_PAGES from bad_dma_address and up */ - iommu_range_reserve(tbl, bad_dma_address, EMERGENCY_PAGES); - - /* avoid the BIOS/VGA first 640KB-1MB region */ - /* for CalIOC2 - avoid the entire first MB */ - if (is_calgary(dev->device)) { - start = (640 * 1024); - npages = ((1024 - 640) * 1024) >> PAGE_SHIFT; - } else { /* calioc2 */ - start = 0; - npages = (1 * 1024 * 1024) >> PAGE_SHIFT; - } - iommu_range_reserve(tbl, start, npages); - - /* reserve the two PCI peripheral memory regions in IO space */ - calgary_reserve_peripheral_mem_1(dev); - calgary_reserve_peripheral_mem_2(dev); -} - -static int __init calgary_setup_tar(struct pci_dev *dev, void __iomem *bbar) -{ - u64 val64; - u64 table_phys; - void __iomem *target; - int ret; - struct iommu_table *tbl; - - /* build TCE tables for each PHB */ - ret = build_tce_table(dev, bbar); - if (ret) - return ret; - - tbl = pci_iommu(dev->bus); - tbl->it_base = (unsigned long)bus_info[dev->bus->number].tce_space; - tce_free(tbl, 0, tbl->it_size); - - if (is_calgary(dev->device)) - tbl->chip_ops = &calgary_chip_ops; - else if (is_calioc2(dev->device)) - tbl->chip_ops = &calioc2_chip_ops; - else - BUG(); - - calgary_reserve_regions(dev); - - /* set TARs for each PHB */ - target = calgary_reg(bbar, tar_offset(dev->bus->number)); - val64 = be64_to_cpu(readq(target)); - - /* zero out all TAR bits under sw control */ - val64 &= ~TAR_SW_BITS; - table_phys = (u64)__pa(tbl->it_base); - - val64 |= table_phys; - - BUG_ON(specified_table_size > TCE_TABLE_SIZE_8M); - val64 |= (u64) specified_table_size; - - tbl->tar_val = cpu_to_be64(val64); - - writeq(tbl->tar_val, target); - readq(target); /* flush */ - - return 0; -} - -static void __init calgary_free_bus(struct pci_dev *dev) -{ - u64 val64; - struct iommu_table *tbl = pci_iommu(dev->bus); - void __iomem *target; - unsigned int bitmapsz; - - target = calgary_reg(tbl->bbar, tar_offset(dev->bus->number)); - val64 = be64_to_cpu(readq(target)); - val64 &= ~TAR_SW_BITS; - writeq(cpu_to_be64(val64), target); - readq(target); /* flush */ - - bitmapsz = tbl->it_size / BITS_PER_BYTE; - free_pages((unsigned long)tbl->it_map, get_order(bitmapsz)); - tbl->it_map = NULL; - - kfree(tbl); - - set_pci_iommu(dev->bus, NULL); - - /* Can't free bootmem allocated memory after system is up :-( */ - bus_info[dev->bus->number].tce_space = NULL; -} - -static void calgary_dump_error_regs(struct iommu_table *tbl) -{ - void __iomem *bbar = tbl->bbar; - void __iomem *target; - u32 csr, plssr; - - target = calgary_reg(bbar, phb_offset(tbl->it_busno) | PHB_CSR_OFFSET); - csr = be32_to_cpu(readl(target)); - - target = calgary_reg(bbar, phb_offset(tbl->it_busno) | PHB_PLSSR_OFFSET); - plssr = be32_to_cpu(readl(target)); - - /* If no error, the agent ID in the CSR is not valid */ - printk(KERN_EMERG "Calgary: DMA error on Calgary PHB 0x%x, " - "0x%08x@CSR 0x%08x@PLSSR\n", tbl->it_busno, csr, plssr); -} - -static void calioc2_dump_error_regs(struct iommu_table *tbl) -{ - void __iomem *bbar = tbl->bbar; - u32 csr, csmr, plssr, mck, rcstat; - void __iomem *target; - unsigned long phboff = phb_offset(tbl->it_busno); - unsigned long erroff; - u32 errregs[7]; - int i; - - /* dump CSR */ - target = calgary_reg(bbar, phboff | PHB_CSR_OFFSET); - csr = be32_to_cpu(readl(target)); - /* dump PLSSR */ - target = calgary_reg(bbar, phboff | PHB_PLSSR_OFFSET); - plssr = be32_to_cpu(readl(target)); - /* dump CSMR */ - target = calgary_reg(bbar, phboff | 0x290); - csmr = be32_to_cpu(readl(target)); - /* dump mck */ - target = calgary_reg(bbar, phboff | 0x800); - mck = be32_to_cpu(readl(target)); - - printk(KERN_EMERG "Calgary: DMA error on CalIOC2 PHB 0x%x\n", - tbl->it_busno); - - printk(KERN_EMERG "Calgary: 0x%08x@CSR 0x%08x@PLSSR 0x%08x@CSMR 0x%08x@MCK\n", - csr, plssr, csmr, mck); - - /* dump rest of error regs */ - printk(KERN_EMERG "Calgary: "); - for (i = 0; i < ARRAY_SIZE(errregs); i++) { - /* err regs are at 0x810 - 0x870 */ - erroff = (0x810 + (i * 0x10)); - target = calgary_reg(bbar, phboff | erroff); - errregs[i] = be32_to_cpu(readl(target)); - printk("0x%08x@0x%lx ", errregs[i], erroff); - } - printk("\n"); - - /* root complex status */ - target = calgary_reg(bbar, phboff | PHB_ROOT_COMPLEX_STATUS); - rcstat = be32_to_cpu(readl(target)); - printk(KERN_EMERG "Calgary: 0x%08x@0x%x\n", rcstat, - PHB_ROOT_COMPLEX_STATUS); -} - -static void calgary_watchdog(unsigned long data) -{ - struct pci_dev *dev = (struct pci_dev *)data; - struct iommu_table *tbl = pci_iommu(dev->bus); - void __iomem *bbar = tbl->bbar; - u32 val32; - void __iomem *target; - - target = calgary_reg(bbar, phb_offset(tbl->it_busno) | PHB_CSR_OFFSET); - val32 = be32_to_cpu(readl(target)); - - /* If no error, the agent ID in the CSR is not valid */ - if (val32 & CSR_AGENT_MASK) { - tbl->chip_ops->dump_error_regs(tbl); - - /* reset error */ - writel(0, target); - - /* Disable bus that caused the error */ - target = calgary_reg(bbar, phb_offset(tbl->it_busno) | - PHB_CONFIG_RW_OFFSET); - val32 = be32_to_cpu(readl(target)); - val32 |= PHB_SLOT_DISABLE; - writel(cpu_to_be32(val32), target); - readl(target); /* flush */ - } else { - /* Reset the timer */ - mod_timer(&tbl->watchdog_timer, jiffies + 2 * HZ); - } -} - -static void __init calgary_set_split_completion_timeout(void __iomem *bbar, - unsigned char busnum, unsigned long timeout) -{ - u64 val64; - void __iomem *target; - unsigned int phb_shift = ~0; /* silence gcc */ - u64 mask; - - switch (busno_to_phbid(busnum)) { - case 0: phb_shift = (63 - 19); - break; - case 1: phb_shift = (63 - 23); - break; - case 2: phb_shift = (63 - 27); - break; - case 3: phb_shift = (63 - 35); - break; - default: - BUG_ON(busno_to_phbid(busnum)); - } - - target = calgary_reg(bbar, CALGARY_CONFIG_REG); - val64 = be64_to_cpu(readq(target)); - - /* zero out this PHB's timer bits */ - mask = ~(0xFUL << phb_shift); - val64 &= mask; - val64 |= (timeout << phb_shift); - writeq(cpu_to_be64(val64), target); - readq(target); /* flush */ -} - -static void calioc2_handle_quirks(struct iommu_table *tbl, struct pci_dev *dev) -{ - unsigned char busnum = dev->bus->number; - void __iomem *bbar = tbl->bbar; - void __iomem *target; - u32 val; - - /* - * CalIOC2 designers recommend setting bit 8 in 0xnDB0 to 1 - */ - target = calgary_reg(bbar, phb_offset(busnum) | PHB_SAVIOR_L2); - val = cpu_to_be32(readl(target)); - val |= 0x00800000; - writel(cpu_to_be32(val), target); -} - -static void calgary_handle_quirks(struct iommu_table *tbl, struct pci_dev *dev) -{ - unsigned char busnum = dev->bus->number; - - /* - * Give split completion a longer timeout on bus 1 for aic94xx - * http://bugzilla.kernel.org/show_bug.cgi?id=7180 - */ - if (is_calgary(dev->device) && (busnum == 1)) - calgary_set_split_completion_timeout(tbl->bbar, busnum, - CCR_2SEC_TIMEOUT); -} - -static void __init calgary_enable_translation(struct pci_dev *dev) -{ - u32 val32; - unsigned char busnum; - void __iomem *target; - void __iomem *bbar; - struct iommu_table *tbl; - - busnum = dev->bus->number; - tbl = pci_iommu(dev->bus); - bbar = tbl->bbar; - - /* enable TCE in PHB Config Register */ - target = calgary_reg(bbar, phb_offset(busnum) | PHB_CONFIG_RW_OFFSET); - val32 = be32_to_cpu(readl(target)); - val32 |= PHB_TCE_ENABLE | PHB_DAC_DISABLE | PHB_MCSR_ENABLE; - - printk(KERN_INFO "Calgary: enabling translation on %s PHB %#x\n", - (dev->device == PCI_DEVICE_ID_IBM_CALGARY) ? - "Calgary" : "CalIOC2", busnum); - printk(KERN_INFO "Calgary: errant DMAs will now be prevented on this " - "bus.\n"); - - writel(cpu_to_be32(val32), target); - readl(target); /* flush */ - - init_timer(&tbl->watchdog_timer); - tbl->watchdog_timer.function = &calgary_watchdog; - tbl->watchdog_timer.data = (unsigned long)dev; - mod_timer(&tbl->watchdog_timer, jiffies); -} - -static void __init calgary_disable_translation(struct pci_dev *dev) -{ - u32 val32; - unsigned char busnum; - void __iomem *target; - void __iomem *bbar; - struct iommu_table *tbl; - - busnum = dev->bus->number; - tbl = pci_iommu(dev->bus); - bbar = tbl->bbar; - - /* disable TCE in PHB Config Register */ - target = calgary_reg(bbar, phb_offset(busnum) | PHB_CONFIG_RW_OFFSET); - val32 = be32_to_cpu(readl(target)); - val32 &= ~(PHB_TCE_ENABLE | PHB_DAC_DISABLE | PHB_MCSR_ENABLE); - - printk(KERN_INFO "Calgary: disabling translation on PHB %#x!\n", busnum); - writel(cpu_to_be32(val32), target); - readl(target); /* flush */ - - del_timer_sync(&tbl->watchdog_timer); -} - -static void __init calgary_init_one_nontraslated(struct pci_dev *dev) -{ - pci_dev_get(dev); - set_pci_iommu(dev->bus, NULL); - - /* is the device behind a bridge? */ - if (dev->bus->parent) - dev->bus->parent->self = dev; - else - dev->bus->self = dev; -} - -static int __init calgary_init_one(struct pci_dev *dev) -{ - void __iomem *bbar; - struct iommu_table *tbl; - int ret; - - BUG_ON(dev->bus->number >= MAX_PHB_BUS_NUM); - - bbar = busno_to_bbar(dev->bus->number); - ret = calgary_setup_tar(dev, bbar); - if (ret) - goto done; - - pci_dev_get(dev); - - if (dev->bus->parent) { - if (dev->bus->parent->self) - printk(KERN_WARNING "Calgary: IEEEE, dev %p has " - "bus->parent->self!\n", dev); - dev->bus->parent->self = dev; - } else - dev->bus->self = dev; - - tbl = pci_iommu(dev->bus); - tbl->chip_ops->handle_quirks(tbl, dev); - - calgary_enable_translation(dev); - - return 0; - -done: - return ret; -} - -static int __init calgary_locate_bbars(void) -{ - int ret; - int rioidx, phb, bus; - void __iomem *bbar; - void __iomem *target; - unsigned long offset; - u8 start_bus, end_bus; - u32 val; - - ret = -ENODATA; - for (rioidx = 0; rioidx < rio_table_hdr->num_rio_dev; rioidx++) { - struct rio_detail *rio = rio_devs[rioidx]; - - if ((rio->type != COMPAT_CALGARY) && (rio->type != ALT_CALGARY)) - continue; - - /* map entire 1MB of Calgary config space */ - bbar = ioremap_nocache(rio->BBAR, 1024 * 1024); - if (!bbar) - goto error; - - for (phb = 0; phb < PHBS_PER_CALGARY; phb++) { - offset = phb_debug_offsets[phb] | PHB_DEBUG_STUFF_OFFSET; - target = calgary_reg(bbar, offset); - - val = be32_to_cpu(readl(target)); - - start_bus = (u8)((val & 0x00FF0000) >> 16); - end_bus = (u8)((val & 0x0000FF00) >> 8); - - if (end_bus) { - for (bus = start_bus; bus <= end_bus; bus++) { - bus_info[bus].bbar = bbar; - bus_info[bus].phbid = phb; - } - } else { - bus_info[start_bus].bbar = bbar; - bus_info[start_bus].phbid = phb; - } - } - } - - return 0; - -error: - /* scan bus_info and iounmap any bbars we previously ioremap'd */ - for (bus = 0; bus < ARRAY_SIZE(bus_info); bus++) - if (bus_info[bus].bbar) - iounmap(bus_info[bus].bbar); - - return ret; -} - -static int __init calgary_init(void) -{ - int ret; - struct pci_dev *dev = NULL; - void *tce_space; - - ret = calgary_locate_bbars(); - if (ret) - return ret; - - do { - dev = pci_get_device(PCI_VENDOR_ID_IBM, PCI_ANY_ID, dev); - if (!dev) - break; - if (!is_cal_pci_dev(dev->device)) - continue; - if (!translate_phb(dev)) { - calgary_init_one_nontraslated(dev); - continue; - } - tce_space = bus_info[dev->bus->number].tce_space; - if (!tce_space && !translate_empty_slots) - continue; - - ret = calgary_init_one(dev); - if (ret) - goto error; - } while (1); - - return ret; - -error: - do { - dev = pci_get_device_reverse(PCI_VENDOR_ID_IBM, - PCI_ANY_ID, dev); - if (!dev) - break; - if (!is_cal_pci_dev(dev->device)) - continue; - if (!translate_phb(dev)) { - pci_dev_put(dev); - continue; - } - if (!bus_info[dev->bus->number].tce_space && !translate_empty_slots) - continue; - - calgary_disable_translation(dev); - calgary_free_bus(dev); - pci_dev_put(dev); /* Undo calgary_init_one()'s pci_dev_get() */ - } while (1); - - return ret; -} - -static inline int __init determine_tce_table_size(u64 ram) -{ - int ret; - - if (specified_table_size != TCE_TABLE_SIZE_UNSPECIFIED) - return specified_table_size; - - /* - * Table sizes are from 0 to 7 (TCE_TABLE_SIZE_64K to - * TCE_TABLE_SIZE_8M). Table size 0 has 8K entries and each - * larger table size has twice as many entries, so shift the - * max ram address by 13 to divide by 8K and then look at the - * order of the result to choose between 0-7. - */ - ret = get_order(ram >> 13); - if (ret > TCE_TABLE_SIZE_8M) - ret = TCE_TABLE_SIZE_8M; - - return ret; -} - -static int __init build_detail_arrays(void) -{ - unsigned long ptr; - int i, scal_detail_size, rio_detail_size; - - if (rio_table_hdr->num_scal_dev > MAX_NUMNODES){ - printk(KERN_WARNING - "Calgary: MAX_NUMNODES too low! Defined as %d, " - "but system has %d nodes.\n", - MAX_NUMNODES, rio_table_hdr->num_scal_dev); - return -ENODEV; - } - - switch (rio_table_hdr->version){ - case 2: - scal_detail_size = 11; - rio_detail_size = 13; - break; - case 3: - scal_detail_size = 12; - rio_detail_size = 15; - break; - default: - printk(KERN_WARNING - "Calgary: Invalid Rio Grande Table Version: %d\n", - rio_table_hdr->version); - return -EPROTO; - } - - ptr = ((unsigned long)rio_table_hdr) + 3; - for (i = 0; i < rio_table_hdr->num_scal_dev; - i++, ptr += scal_detail_size) - scal_devs[i] = (struct scal_detail *)ptr; - - for (i = 0; i < rio_table_hdr->num_rio_dev; - i++, ptr += rio_detail_size) - rio_devs[i] = (struct rio_detail *)ptr; - - return 0; -} - -static int __init calgary_bus_has_devices(int bus, unsigned short pci_dev) -{ - int dev; - u32 val; - - if (pci_dev == PCI_DEVICE_ID_IBM_CALIOC2) { - /* - * FIXME: properly scan for devices accross the - * PCI-to-PCI bridge on every CalIOC2 port. - */ - return 1; - } - - for (dev = 1; dev < 8; dev++) { - val = read_pci_config(bus, dev, 0, 0); - if (val != 0xffffffff) - break; - } - return (val != 0xffffffff); -} - -void __init detect_calgary(void) -{ - int bus; - void *tbl; - int calgary_found = 0; - unsigned long ptr; - unsigned int offset, prev_offset; - int ret; - - /* - * if the user specified iommu=off or iommu=soft or we found - * another HW IOMMU already, bail out. - */ - if (swiotlb || no_iommu || iommu_detected) - return; - - if (!use_calgary) - return; - - if (!early_pci_allowed()) - return; - - printk(KERN_DEBUG "Calgary: detecting Calgary via BIOS EBDA area\n"); - - ptr = (unsigned long)phys_to_virt(get_bios_ebda()); - - rio_table_hdr = NULL; - prev_offset = 0; - offset = 0x180; - /* - * The next offset is stored in the 1st word. - * Only parse up until the offset increases: - */ - while (offset > prev_offset) { - /* The block id is stored in the 2nd word */ - if (*((unsigned short *)(ptr + offset + 2)) == 0x4752){ - /* set the pointer past the offset & block id */ - rio_table_hdr = (struct rio_table_hdr *)(ptr + offset + 4); - break; - } - prev_offset = offset; - offset = *((unsigned short *)(ptr + offset)); - } - if (!rio_table_hdr) { - printk(KERN_DEBUG "Calgary: Unable to locate Rio Grande table " - "in EBDA - bailing!\n"); - return; - } - - ret = build_detail_arrays(); - if (ret) { - printk(KERN_DEBUG "Calgary: build_detail_arrays ret %d\n", ret); - return; - } - - specified_table_size = determine_tce_table_size(end_pfn * PAGE_SIZE); - - for (bus = 0; bus < MAX_PHB_BUS_NUM; bus++) { - struct calgary_bus_info *info = &bus_info[bus]; - unsigned short pci_device; - u32 val; - - val = read_pci_config(bus, 0, 0, 0); - pci_device = (val & 0xFFFF0000) >> 16; - - if (!is_cal_pci_dev(pci_device)) - continue; - - if (info->translation_disabled) - continue; - - if (calgary_bus_has_devices(bus, pci_device) || - translate_empty_slots) { - tbl = alloc_tce_table(); - if (!tbl) - goto cleanup; - info->tce_space = tbl; - calgary_found = 1; - } - } - - printk(KERN_DEBUG "Calgary: finished detection, Calgary %s\n", - calgary_found ? "found" : "not found"); - - if (calgary_found) { - iommu_detected = 1; - calgary_detected = 1; - printk(KERN_INFO "PCI-DMA: Calgary IOMMU detected.\n"); - printk(KERN_INFO "PCI-DMA: Calgary TCE table spec is %d, " - "CONFIG_IOMMU_DEBUG is %s.\n", specified_table_size, - debugging ? "enabled" : "disabled"); - } - return; - -cleanup: - for (--bus; bus >= 0; --bus) { - struct calgary_bus_info *info = &bus_info[bus]; - - if (info->tce_space) - free_tce_table(info->tce_space); - } -} - -int __init calgary_iommu_init(void) -{ - int ret; - - if (no_iommu || swiotlb) - return -ENODEV; - - if (!calgary_detected) - return -ENODEV; - - /* ok, we're trying to use Calgary - let's roll */ - printk(KERN_INFO "PCI-DMA: Using Calgary IOMMU\n"); - - ret = calgary_init(); - if (ret) { - printk(KERN_ERR "PCI-DMA: Calgary init failed %d, " - "falling back to no_iommu\n", ret); - if (end_pfn > MAX_DMA32_PFN) - printk(KERN_ERR "WARNING more than 4GB of memory, " - "32bit PCI may malfunction.\n"); - return ret; - } - - force_iommu = 1; - bad_dma_address = 0x0; - dma_ops = &calgary_dma_ops; - - return 0; -} - -static int __init calgary_parse_options(char *p) -{ - unsigned int bridge; - size_t len; - char* endp; - - while (*p) { - if (!strncmp(p, "64k", 3)) - specified_table_size = TCE_TABLE_SIZE_64K; - else if (!strncmp(p, "128k", 4)) - specified_table_size = TCE_TABLE_SIZE_128K; - else if (!strncmp(p, "256k", 4)) - specified_table_size = TCE_TABLE_SIZE_256K; - else if (!strncmp(p, "512k", 4)) - specified_table_size = TCE_TABLE_SIZE_512K; - else if (!strncmp(p, "1M", 2)) - specified_table_size = TCE_TABLE_SIZE_1M; - else if (!strncmp(p, "2M", 2)) - specified_table_size = TCE_TABLE_SIZE_2M; - else if (!strncmp(p, "4M", 2)) - specified_table_size = TCE_TABLE_SIZE_4M; - else if (!strncmp(p, "8M", 2)) - specified_table_size = TCE_TABLE_SIZE_8M; - - len = strlen("translate_empty_slots"); - if (!strncmp(p, "translate_empty_slots", len)) - translate_empty_slots = 1; - - len = strlen("disable"); - if (!strncmp(p, "disable", len)) { - p += len; - if (*p == '=') - ++p; - if (*p == '\0') - break; - bridge = simple_strtol(p, &endp, 0); - if (p == endp) - break; - - if (bridge < MAX_PHB_BUS_NUM) { - printk(KERN_INFO "Calgary: disabling " - "translation for PHB %#x\n", bridge); - bus_info[bridge].translation_disabled = 1; - } - } - - p = strpbrk(p, ","); - if (!p) - break; - - p++; /* skip ',' */ - } - return 1; -} -__setup("calgary=", calgary_parse_options); - -static void __init calgary_fixup_one_tce_space(struct pci_dev *dev) -{ - struct iommu_table *tbl; - unsigned int npages; - int i; - - tbl = pci_iommu(dev->bus); - - for (i = 0; i < 4; i++) { - struct resource *r = &dev->resource[PCI_BRIDGE_RESOURCES + i]; - - /* Don't give out TCEs that map MEM resources */ - if (!(r->flags & IORESOURCE_MEM)) - continue; - - /* 0-based? we reserve the whole 1st MB anyway */ - if (!r->start) - continue; - - /* cover the whole region */ - npages = (r->end - r->start) >> PAGE_SHIFT; - npages++; - - iommu_range_reserve(tbl, r->start, npages); - } -} - -static int __init calgary_fixup_tce_spaces(void) -{ - struct pci_dev *dev = NULL; - void *tce_space; - - if (no_iommu || swiotlb || !calgary_detected) - return -ENODEV; - - printk(KERN_DEBUG "Calgary: fixing up tce spaces\n"); - - do { - dev = pci_get_device(PCI_VENDOR_ID_IBM, PCI_ANY_ID, dev); - if (!dev) - break; - if (!is_cal_pci_dev(dev->device)) - continue; - if (!translate_phb(dev)) - continue; - - tce_space = bus_info[dev->bus->number].tce_space; - if (!tce_space) - continue; - - calgary_fixup_one_tce_space(dev); - - } while (1); - - return 0; -} - -/* - * We need to be call after pcibios_assign_resources (fs_initcall level) - * and before device_initcall. - */ -rootfs_initcall(calgary_fixup_tce_spaces); diff --git a/arch/x86_64/kernel/pci-dma.c b/arch/x86_64/kernel/pci-dma.c deleted file mode 100644 index 29711445c818..000000000000 --- a/arch/x86_64/kernel/pci-dma.c +++ /dev/null @@ -1,346 +0,0 @@ -/* - * Dynamic DMA mapping support. - */ - -#include <linux/types.h> -#include <linux/mm.h> -#include <linux/string.h> -#include <linux/pci.h> -#include <linux/module.h> -#include <asm/io.h> -#include <asm/iommu.h> -#include <asm/calgary.h> - -int iommu_merge __read_mostly = 0; -EXPORT_SYMBOL(iommu_merge); - -dma_addr_t bad_dma_address __read_mostly; -EXPORT_SYMBOL(bad_dma_address); - -/* This tells the BIO block layer to assume merging. Default to off - because we cannot guarantee merging later. */ -int iommu_bio_merge __read_mostly = 0; -EXPORT_SYMBOL(iommu_bio_merge); - -static int iommu_sac_force __read_mostly = 0; - -int no_iommu __read_mostly; -#ifdef CONFIG_IOMMU_DEBUG -int panic_on_overflow __read_mostly = 1; -int force_iommu __read_mostly = 1; -#else -int panic_on_overflow __read_mostly = 0; -int force_iommu __read_mostly= 0; -#endif - -/* Set this to 1 if there is a HW IOMMU in the system */ -int iommu_detected __read_mostly = 0; - -/* Dummy device used for NULL arguments (normally ISA). Better would - be probably a smaller DMA mask, but this is bug-to-bug compatible - to i386. */ -struct device fallback_dev = { - .bus_id = "fallback device", - .coherent_dma_mask = DMA_32BIT_MASK, - .dma_mask = &fallback_dev.coherent_dma_mask, -}; - -/* Allocate DMA memory on node near device */ -noinline static void * -dma_alloc_pages(struct device *dev, gfp_t gfp, unsigned order) -{ - struct page *page; - int node; -#ifdef CONFIG_PCI - if (dev->bus == &pci_bus_type) - node = pcibus_to_node(to_pci_dev(dev)->bus); - else -#endif - node = numa_node_id(); - - if (node < first_node(node_online_map)) - node = first_node(node_online_map); - - page = alloc_pages_node(node, gfp, order); - return page ? page_address(page) : NULL; -} - -/* - * Allocate memory for a coherent mapping. - */ -void * -dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle, - gfp_t gfp) -{ - void *memory; - unsigned long dma_mask = 0; - u64 bus; - - if (!dev) - dev = &fallback_dev; - dma_mask = dev->coherent_dma_mask; - if (dma_mask == 0) - dma_mask = DMA_32BIT_MASK; - - /* Device not DMA able */ - if (dev->dma_mask == NULL) - return NULL; - - /* Don't invoke OOM killer */ - gfp |= __GFP_NORETRY; - - /* Kludge to make it bug-to-bug compatible with i386. i386 - uses the normal dma_mask for alloc_coherent. */ - dma_mask &= *dev->dma_mask; - - /* Why <=? Even when the mask is smaller than 4GB it is often - larger than 16MB and in this case we have a chance of - finding fitting memory in the next higher zone first. If - not retry with true GFP_DMA. -AK */ - if (dma_mask <= DMA_32BIT_MASK) - gfp |= GFP_DMA32; - - again: - memory = dma_alloc_pages(dev, gfp, get_order(size)); - if (memory == NULL) - return NULL; - - { - int high, mmu; - bus = virt_to_bus(memory); - high = (bus + size) >= dma_mask; - mmu = high; - if (force_iommu && !(gfp & GFP_DMA)) - mmu = 1; - else if (high) { - free_pages((unsigned long)memory, - get_order(size)); - - /* Don't use the 16MB ZONE_DMA unless absolutely - needed. It's better to use remapping first. */ - if (dma_mask < DMA_32BIT_MASK && !(gfp & GFP_DMA)) { - gfp = (gfp & ~GFP_DMA32) | GFP_DMA; - goto again; - } - - /* Let low level make its own zone decisions */ - gfp &= ~(GFP_DMA32|GFP_DMA); - - if (dma_ops->alloc_coherent) - return dma_ops->alloc_coherent(dev, size, - dma_handle, gfp); - return NULL; - } - - memset(memory, 0, size); - if (!mmu) { - *dma_handle = virt_to_bus(memory); - return memory; - } - } - - if (dma_ops->alloc_coherent) { - free_pages((unsigned long)memory, get_order(size)); - gfp &= ~(GFP_DMA|GFP_DMA32); - return dma_ops->alloc_coherent(dev, size, dma_handle, gfp); - } - - if (dma_ops->map_simple) { - *dma_handle = dma_ops->map_simple(dev, memory, - size, - PCI_DMA_BIDIRECTIONAL); - if (*dma_handle != bad_dma_address) - return memory; - } - - if (panic_on_overflow) - panic("dma_alloc_coherent: IOMMU overflow by %lu bytes\n",size); - free_pages((unsigned long)memory, get_order(size)); - return NULL; -} -EXPORT_SYMBOL(dma_alloc_coherent); - -/* - * Unmap coherent memory. - * The caller must ensure that the device has finished accessing the mapping. - */ -void dma_free_coherent(struct device *dev, size_t size, - void *vaddr, dma_addr_t bus) -{ - if (dma_ops->unmap_single) - dma_ops->unmap_single(dev, bus, size, 0); - free_pages((unsigned long)vaddr, get_order(size)); -} -EXPORT_SYMBOL(dma_free_coherent); - -static int forbid_dac __read_mostly; - -int dma_supported(struct device *dev, u64 mask) -{ -#ifdef CONFIG_PCI - if (mask > 0xffffffff && forbid_dac > 0) { - - - - printk(KERN_INFO "PCI: Disallowing DAC for device %s\n", dev->bus_id); - return 0; - } -#endif - - if (dma_ops->dma_supported) - return dma_ops->dma_supported(dev, mask); - - /* Copied from i386. Doesn't make much sense, because it will - only work for pci_alloc_coherent. - The caller just has to use GFP_DMA in this case. */ - if (mask < DMA_24BIT_MASK) - return 0; - - /* Tell the device to use SAC when IOMMU force is on. This - allows the driver to use cheaper accesses in some cases. - - Problem with this is that if we overflow the IOMMU area and - return DAC as fallback address the device may not handle it - correctly. - - As a special case some controllers have a 39bit address - mode that is as efficient as 32bit (aic79xx). Don't force - SAC for these. Assume all masks <= 40 bits are of this - type. Normally this doesn't make any difference, but gives - more gentle handling of IOMMU overflow. */ - if (iommu_sac_force && (mask >= DMA_40BIT_MASK)) { - printk(KERN_INFO "%s: Force SAC with mask %Lx\n", dev->bus_id,mask); - return 0; - } - - return 1; -} -EXPORT_SYMBOL(dma_supported); - -int dma_set_mask(struct device *dev, u64 mask) -{ - if (!dev->dma_mask || !dma_supported(dev, mask)) - return -EIO; - *dev->dma_mask = mask; - return 0; -} -EXPORT_SYMBOL(dma_set_mask); - -/* - * See <Documentation/x86_64/boot-options.txt> for the iommu kernel parameter - * documentation. - */ -__init int iommu_setup(char *p) -{ - iommu_merge = 1; - - if (!p) - return -EINVAL; - - while (*p) { - if (!strncmp(p,"off",3)) - no_iommu = 1; - /* gart_parse_options has more force support */ - if (!strncmp(p,"force",5)) - force_iommu = 1; - if (!strncmp(p,"noforce",7)) { - iommu_merge = 0; - force_iommu = 0; - } - - if (!strncmp(p, "biomerge",8)) { - iommu_bio_merge = 4096; - iommu_merge = 1; - force_iommu = 1; - } - if (!strncmp(p, "panic",5)) - panic_on_overflow = 1; - if (!strncmp(p, "nopanic",7)) - panic_on_overflow = 0; - if (!strncmp(p, "merge",5)) { - iommu_merge = 1; - force_iommu = 1; - } - if (!strncmp(p, "nomerge",7)) - iommu_merge = 0; - if (!strncmp(p, "forcesac",8)) - iommu_sac_force = 1; - if (!strncmp(p, "allowdac", 8)) - forbid_dac = 0; - if (!strncmp(p, "nodac", 5)) - forbid_dac = -1; - -#ifdef CONFIG_SWIOTLB - if (!strncmp(p, "soft",4)) - swiotlb = 1; -#endif - -#ifdef CONFIG_IOMMU - gart_parse_options(p); -#endif - -#ifdef CONFIG_CALGARY_IOMMU - if (!strncmp(p, "calgary", 7)) - use_calgary = 1; -#endif /* CONFIG_CALGARY_IOMMU */ - - p += strcspn(p, ","); - if (*p == ',') - ++p; - } - return 0; -} -early_param("iommu", iommu_setup); - -void __init pci_iommu_alloc(void) -{ - /* - * The order of these functions is important for - * fall-back/fail-over reasons - */ -#ifdef CONFIG_IOMMU - iommu_hole_init(); -#endif - -#ifdef CONFIG_CALGARY_IOMMU - detect_calgary(); -#endif - -#ifdef CONFIG_SWIOTLB - pci_swiotlb_init(); -#endif -} - -static int __init pci_iommu_init(void) -{ -#ifdef CONFIG_CALGARY_IOMMU - calgary_iommu_init(); -#endif - -#ifdef CONFIG_IOMMU - gart_iommu_init(); -#endif - - no_iommu_init(); - return 0; -} - -void pci_iommu_shutdown(void) -{ - gart_iommu_shutdown(); -} - -#ifdef CONFIG_PCI -/* Many VIA bridges seem to corrupt data for DAC. Disable it here */ - -static __devinit void via_no_dac(struct pci_dev *dev) -{ - if ((dev->class >> 8) == PCI_CLASS_BRIDGE_PCI && forbid_dac == 0) { - printk(KERN_INFO "PCI: VIA PCI bridge detected. Disabling DAC.\n"); - forbid_dac = 1; - } -} -DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_VIA, PCI_ANY_ID, via_no_dac); -#endif -/* Must execute after PCI subsystem */ -fs_initcall(pci_iommu_init); diff --git a/arch/x86_64/kernel/pci-gart.c b/arch/x86_64/kernel/pci-gart.c deleted file mode 100644 index 4918c575d582..000000000000 --- a/arch/x86_64/kernel/pci-gart.c +++ /dev/null @@ -1,740 +0,0 @@ -/* - * Dynamic DMA mapping support for AMD Hammer. - * - * Use the integrated AGP GART in the Hammer northbridge as an IOMMU for PCI. - * This allows to use PCI devices that only support 32bit addresses on systems - * with more than 4GB. - * - * See Documentation/DMA-mapping.txt for the interface specification. - * - * Copyright 2002 Andi Kleen, SuSE Labs. - */ - -#include <linux/types.h> -#include <linux/ctype.h> -#include <linux/agp_backend.h> -#include <linux/init.h> -#include <linux/mm.h> -#include <linux/string.h> -#include <linux/spinlock.h> -#include <linux/pci.h> -#include <linux/module.h> -#include <linux/topology.h> -#include <linux/interrupt.h> -#include <linux/bitops.h> -#include <linux/kdebug.h> -#include <asm/atomic.h> -#include <asm/io.h> -#include <asm/mtrr.h> -#include <asm/pgtable.h> -#include <asm/proto.h> -#include <asm/iommu.h> -#include <asm/cacheflush.h> -#include <asm/swiotlb.h> -#include <asm/dma.h> -#include <asm/k8.h> - -unsigned long iommu_bus_base; /* GART remapping area (physical) */ -static unsigned long iommu_size; /* size of remapping area bytes */ -static unsigned long iommu_pages; /* .. and in pages */ - -u32 *iommu_gatt_base; /* Remapping table */ - -/* If this is disabled the IOMMU will use an optimized flushing strategy - of only flushing when an mapping is reused. With it true the GART is flushed - for every mapping. Problem is that doing the lazy flush seems to trigger - bugs with some popular PCI cards, in particular 3ware (but has been also - also seen with Qlogic at least). */ -int iommu_fullflush = 1; - -/* Allocation bitmap for the remapping area */ -static DEFINE_SPINLOCK(iommu_bitmap_lock); -static unsigned long *iommu_gart_bitmap; /* guarded by iommu_bitmap_lock */ - -static u32 gart_unmapped_entry; - -#define GPTE_VALID 1 -#define GPTE_COHERENT 2 -#define GPTE_ENCODE(x) \ - (((x) & 0xfffff000) | (((x) >> 32) << 4) | GPTE_VALID | GPTE_COHERENT) -#define GPTE_DECODE(x) (((x) & 0xfffff000) | (((u64)(x) & 0xff0) << 28)) - -#define to_pages(addr,size) \ - (round_up(((addr) & ~PAGE_MASK) + (size), PAGE_SIZE) >> PAGE_SHIFT) - -#define EMERGENCY_PAGES 32 /* = 128KB */ - -#ifdef CONFIG_AGP -#define AGPEXTERN extern -#else -#define AGPEXTERN -#endif - -/* backdoor interface to AGP driver */ -AGPEXTERN int agp_memory_reserved; -AGPEXTERN __u32 *agp_gatt_table; - -static unsigned long next_bit; /* protected by iommu_bitmap_lock */ -static int need_flush; /* global flush state. set for each gart wrap */ - -static unsigned long alloc_iommu(int size) -{ - unsigned long offset, flags; - - spin_lock_irqsave(&iommu_bitmap_lock, flags); - offset = find_next_zero_string(iommu_gart_bitmap,next_bit,iommu_pages,size); - if (offset == -1) { - need_flush = 1; - offset = find_next_zero_string(iommu_gart_bitmap,0,iommu_pages,size); - } - if (offset != -1) { - set_bit_string(iommu_gart_bitmap, offset, size); - next_bit = offset+size; - if (next_bit >= iommu_pages) { - next_bit = 0; - need_flush = 1; - } - } - if (iommu_fullflush) - need_flush = 1; - spin_unlock_irqrestore(&iommu_bitmap_lock, flags); - return offset; -} - -static void free_iommu(unsigned long offset, int size) -{ - unsigned long flags; - spin_lock_irqsave(&iommu_bitmap_lock, flags); - __clear_bit_string(iommu_gart_bitmap, offset, size); - spin_unlock_irqrestore(&iommu_bitmap_lock, flags); -} - -/* - * Use global flush state to avoid races with multiple flushers. - */ -static void flush_gart(void) -{ - unsigned long flags; - spin_lock_irqsave(&iommu_bitmap_lock, flags); - if (need_flush) { - k8_flush_garts(); - need_flush = 0; - } - spin_unlock_irqrestore(&iommu_bitmap_lock, flags); -} - -#ifdef CONFIG_IOMMU_LEAK - -#define SET_LEAK(x) if (iommu_leak_tab) \ - iommu_leak_tab[x] = __builtin_return_address(0); -#define CLEAR_LEAK(x) if (iommu_leak_tab) \ - iommu_leak_tab[x] = NULL; - -/* Debugging aid for drivers that don't free their IOMMU tables */ -static void **iommu_leak_tab; -static int leak_trace; -int iommu_leak_pages = 20; -void dump_leak(void) -{ - int i; - static int dump; - if (dump || !iommu_leak_tab) return; - dump = 1; - show_stack(NULL,NULL); - /* Very crude. dump some from the end of the table too */ - printk("Dumping %d pages from end of IOMMU:\n", iommu_leak_pages); - for (i = 0; i < iommu_leak_pages; i+=2) { - printk("%lu: ", iommu_pages-i); - printk_address((unsigned long) iommu_leak_tab[iommu_pages-i]); - printk("%c", (i+1)%2 == 0 ? '\n' : ' '); - } - printk("\n"); -} -#else -#define SET_LEAK(x) -#define CLEAR_LEAK(x) -#endif - -static void iommu_full(struct device *dev, size_t size, int dir) -{ - /* - * Ran out of IOMMU space for this operation. This is very bad. - * Unfortunately the drivers cannot handle this operation properly. - * Return some non mapped prereserved space in the aperture and - * let the Northbridge deal with it. This will result in garbage - * in the IO operation. When the size exceeds the prereserved space - * memory corruption will occur or random memory will be DMAed - * out. Hopefully no network devices use single mappings that big. - */ - - printk(KERN_ERR - "PCI-DMA: Out of IOMMU space for %lu bytes at device %s\n", - size, dev->bus_id); - - if (size > PAGE_SIZE*EMERGENCY_PAGES) { - if (dir == PCI_DMA_FROMDEVICE || dir == PCI_DMA_BIDIRECTIONAL) - panic("PCI-DMA: Memory would be corrupted\n"); - if (dir == PCI_DMA_TODEVICE || dir == PCI_DMA_BIDIRECTIONAL) - panic(KERN_ERR "PCI-DMA: Random memory would be DMAed\n"); - } - -#ifdef CONFIG_IOMMU_LEAK - dump_leak(); -#endif -} - -static inline int need_iommu(struct device *dev, unsigned long addr, size_t size) -{ - u64 mask = *dev->dma_mask; - int high = addr + size > mask; - int mmu = high; - if (force_iommu) - mmu = 1; - return mmu; -} - -static inline int nonforced_iommu(struct device *dev, unsigned long addr, size_t size) -{ - u64 mask = *dev->dma_mask; - int high = addr + size > mask; - int mmu = high; - return mmu; -} - -/* Map a single continuous physical area into the IOMMU. - * Caller needs to check if the iommu is needed and flush. - */ -static dma_addr_t dma_map_area(struct device *dev, dma_addr_t phys_mem, - size_t size, int dir) -{ - unsigned long npages = to_pages(phys_mem, size); - unsigned long iommu_page = alloc_iommu(npages); - int i; - if (iommu_page == -1) { - if (!nonforced_iommu(dev, phys_mem, size)) - return phys_mem; - if (panic_on_overflow) - panic("dma_map_area overflow %lu bytes\n", size); - iommu_full(dev, size, dir); - return bad_dma_address; - } - - for (i = 0; i < npages; i++) { - iommu_gatt_base[iommu_page + i] = GPTE_ENCODE(phys_mem); - SET_LEAK(iommu_page + i); - phys_mem += PAGE_SIZE; - } - return iommu_bus_base + iommu_page*PAGE_SIZE + (phys_mem & ~PAGE_MASK); -} - -static dma_addr_t gart_map_simple(struct device *dev, char *buf, - size_t size, int dir) -{ - dma_addr_t map = dma_map_area(dev, virt_to_bus(buf), size, dir); - flush_gart(); - return map; -} - -/* Map a single area into the IOMMU */ -static dma_addr_t gart_map_single(struct device *dev, void *addr, size_t size, int dir) -{ - unsigned long phys_mem, bus; - - if (!dev) - dev = &fallback_dev; - - phys_mem = virt_to_phys(addr); - if (!need_iommu(dev, phys_mem, size)) - return phys_mem; - - bus = gart_map_simple(dev, addr, size, dir); - return bus; -} - -/* - * Free a DMA mapping. - */ -static void gart_unmap_single(struct device *dev, dma_addr_t dma_addr, - size_t size, int direction) -{ - unsigned long iommu_page; - int npages; - int i; - - if (dma_addr < iommu_bus_base + EMERGENCY_PAGES*PAGE_SIZE || - dma_addr >= iommu_bus_base + iommu_size) - return; - iommu_page = (dma_addr - iommu_bus_base)>>PAGE_SHIFT; - npages = to_pages(dma_addr, size); - for (i = 0; i < npages; i++) { - iommu_gatt_base[iommu_page + i] = gart_unmapped_entry; - CLEAR_LEAK(iommu_page + i); - } - free_iommu(iommu_page, npages); -} - -/* - * Wrapper for pci_unmap_single working with scatterlists. - */ -static void gart_unmap_sg(struct device *dev, struct scatterlist *sg, int nents, int dir) -{ - int i; - - for (i = 0; i < nents; i++) { - struct scatterlist *s = &sg[i]; - if (!s->dma_length || !s->length) - break; - gart_unmap_single(dev, s->dma_address, s->dma_length, dir); - } -} - -/* Fallback for dma_map_sg in case of overflow */ -static int dma_map_sg_nonforce(struct device *dev, struct scatterlist *sg, - int nents, int dir) -{ - int i; - -#ifdef CONFIG_IOMMU_DEBUG - printk(KERN_DEBUG "dma_map_sg overflow\n"); -#endif - - for (i = 0; i < nents; i++ ) { - struct scatterlist *s = &sg[i]; - unsigned long addr = page_to_phys(s->page) + s->offset; - if (nonforced_iommu(dev, addr, s->length)) { - addr = dma_map_area(dev, addr, s->length, dir); - if (addr == bad_dma_address) { - if (i > 0) - gart_unmap_sg(dev, sg, i, dir); - nents = 0; - sg[0].dma_length = 0; - break; - } - } - s->dma_address = addr; - s->dma_length = s->length; - } - flush_gart(); - return nents; -} - -/* Map multiple scatterlist entries continuous into the first. */ -static int __dma_map_cont(struct scatterlist *sg, int start, int stopat, - struct scatterlist *sout, unsigned long pages) -{ - unsigned long iommu_start = alloc_iommu(pages); - unsigned long iommu_page = iommu_start; - int i; - - if (iommu_start == -1) - return -1; - - for (i = start; i < stopat; i++) { - struct scatterlist *s = &sg[i]; - unsigned long pages, addr; - unsigned long phys_addr = s->dma_address; - - BUG_ON(i > start && s->offset); - if (i == start) { - *sout = *s; - sout->dma_address = iommu_bus_base; - sout->dma_address += iommu_page*PAGE_SIZE + s->offset; - sout->dma_length = s->length; - } else { - sout->dma_length += s->length; - } - - addr = phys_addr; - pages = to_pages(s->offset, s->length); - while (pages--) { - iommu_gatt_base[iommu_page] = GPTE_ENCODE(addr); - SET_LEAK(iommu_page); - addr += PAGE_SIZE; - iommu_page++; - } - } - BUG_ON(iommu_page - iommu_start != pages); - return 0; -} - -static inline int dma_map_cont(struct scatterlist *sg, int start, int stopat, - struct scatterlist *sout, - unsigned long pages, int need) -{ - if (!need) { - BUG_ON(stopat - start != 1); - *sout = sg[start]; - sout->dma_length = sg[start].length; - return 0; - } - return __dma_map_cont(sg, start, stopat, sout, pages); -} - -/* - * DMA map all entries in a scatterlist. - * Merge chunks that have page aligned sizes into a continuous mapping. - */ -int gart_map_sg(struct device *dev, struct scatterlist *sg, int nents, int dir) -{ - int i; - int out; - int start; - unsigned long pages = 0; - int need = 0, nextneed; - - if (nents == 0) - return 0; - - if (!dev) - dev = &fallback_dev; - - out = 0; - start = 0; - for (i = 0; i < nents; i++) { - struct scatterlist *s = &sg[i]; - dma_addr_t addr = page_to_phys(s->page) + s->offset; - s->dma_address = addr; - BUG_ON(s->length == 0); - - nextneed = need_iommu(dev, addr, s->length); - - /* Handle the previous not yet processed entries */ - if (i > start) { - struct scatterlist *ps = &sg[i-1]; - /* Can only merge when the last chunk ends on a page - boundary and the new one doesn't have an offset. */ - if (!iommu_merge || !nextneed || !need || s->offset || - (ps->offset + ps->length) % PAGE_SIZE) { - if (dma_map_cont(sg, start, i, sg+out, pages, - need) < 0) - goto error; - out++; - pages = 0; - start = i; - } - } - - need = nextneed; - pages += to_pages(s->offset, s->length); - } - if (dma_map_cont(sg, start, i, sg+out, pages, need) < 0) - goto error; - out++; - flush_gart(); - if (out < nents) - sg[out].dma_length = 0; - return out; - -error: - flush_gart(); - gart_unmap_sg(dev, sg, nents, dir); - /* When it was forced or merged try again in a dumb way */ - if (force_iommu || iommu_merge) { - out = dma_map_sg_nonforce(dev, sg, nents, dir); - if (out > 0) - return out; - } - if (panic_on_overflow) - panic("dma_map_sg: overflow on %lu pages\n", pages); - iommu_full(dev, pages << PAGE_SHIFT, dir); - for (i = 0; i < nents; i++) - sg[i].dma_address = bad_dma_address; - return 0; -} - -static int no_agp; - -static __init unsigned long check_iommu_size(unsigned long aper, u64 aper_size) -{ - unsigned long a; - if (!iommu_size) { - iommu_size = aper_size; - if (!no_agp) - iommu_size /= 2; - } - - a = aper + iommu_size; - iommu_size -= round_up(a, LARGE_PAGE_SIZE) - a; - - if (iommu_size < 64*1024*1024) - printk(KERN_WARNING - "PCI-DMA: Warning: Small IOMMU %luMB. Consider increasing the AGP aperture in BIOS\n",iommu_size>>20); - - return iommu_size; -} - -static __init unsigned read_aperture(struct pci_dev *dev, u32 *size) -{ - unsigned aper_size = 0, aper_base_32; - u64 aper_base; - unsigned aper_order; - - pci_read_config_dword(dev, 0x94, &aper_base_32); - pci_read_config_dword(dev, 0x90, &aper_order); - aper_order = (aper_order >> 1) & 7; - - aper_base = aper_base_32 & 0x7fff; - aper_base <<= 25; - - aper_size = (32 * 1024 * 1024) << aper_order; - if (aper_base + aper_size > 0x100000000UL || !aper_size) - aper_base = 0; - - *size = aper_size; - return aper_base; -} - -/* - * Private Northbridge GATT initialization in case we cannot use the - * AGP driver for some reason. - */ -static __init int init_k8_gatt(struct agp_kern_info *info) -{ - struct pci_dev *dev; - void *gatt; - unsigned aper_base, new_aper_base; - unsigned aper_size, gatt_size, new_aper_size; - int i; - - printk(KERN_INFO "PCI-DMA: Disabling AGP.\n"); - aper_size = aper_base = info->aper_size = 0; - dev = NULL; - for (i = 0; i < num_k8_northbridges; i++) { - dev = k8_northbridges[i]; - new_aper_base = read_aperture(dev, &new_aper_size); - if (!new_aper_base) - goto nommu; - - if (!aper_base) { - aper_size = new_aper_size; - aper_base = new_aper_base; - } - if (aper_size != new_aper_size || aper_base != new_aper_base) - goto nommu; - } - if (!aper_base) - goto nommu; - info->aper_base = aper_base; - info->aper_size = aper_size>>20; - - gatt_size = (aper_size >> PAGE_SHIFT) * sizeof(u32); - gatt = (void *)__get_free_pages(GFP_KERNEL, get_order(gatt_size)); - if (!gatt) - panic("Cannot allocate GATT table"); - if (change_page_attr_addr((unsigned long)gatt, gatt_size >> PAGE_SHIFT, PAGE_KERNEL_NOCACHE)) - panic("Could not set GART PTEs to uncacheable pages"); - global_flush_tlb(); - - memset(gatt, 0, gatt_size); - agp_gatt_table = gatt; - - for (i = 0; i < num_k8_northbridges; i++) { - u32 ctl; - u32 gatt_reg; - - dev = k8_northbridges[i]; - gatt_reg = __pa(gatt) >> 12; - gatt_reg <<= 4; - pci_write_config_dword(dev, 0x98, gatt_reg); - pci_read_config_dword(dev, 0x90, &ctl); - - ctl |= 1; - ctl &= ~((1<<4) | (1<<5)); - - pci_write_config_dword(dev, 0x90, ctl); - } - flush_gart(); - - printk("PCI-DMA: aperture base @ %x size %u KB\n",aper_base, aper_size>>10); - return 0; - - nommu: - /* Should not happen anymore */ - printk(KERN_ERR "PCI-DMA: More than 4GB of RAM and no IOMMU\n" - KERN_ERR "PCI-DMA: 32bit PCI IO may malfunction.\n"); - return -1; -} - -extern int agp_amd64_init(void); - -static const struct dma_mapping_ops gart_dma_ops = { - .mapping_error = NULL, - .map_single = gart_map_single, - .map_simple = gart_map_simple, - .unmap_single = gart_unmap_single, - .sync_single_for_cpu = NULL, - .sync_single_for_device = NULL, - .sync_single_range_for_cpu = NULL, - .sync_single_range_for_device = NULL, - .sync_sg_for_cpu = NULL, - .sync_sg_for_device = NULL, - .map_sg = gart_map_sg, - .unmap_sg = gart_unmap_sg, -}; - -void gart_iommu_shutdown(void) -{ - struct pci_dev *dev; - int i; - - if (no_agp && (dma_ops != &gart_dma_ops)) - return; - - for (i = 0; i < num_k8_northbridges; i++) { - u32 ctl; - - dev = k8_northbridges[i]; - pci_read_config_dword(dev, 0x90, &ctl); - - ctl &= ~1; - - pci_write_config_dword(dev, 0x90, ctl); - } -} - -void __init gart_iommu_init(void) -{ - struct agp_kern_info info; - unsigned long aper_size; - unsigned long iommu_start; - unsigned long scratch; - long i; - - if (cache_k8_northbridges() < 0 || num_k8_northbridges == 0) { - printk(KERN_INFO "PCI-GART: No AMD northbridge found.\n"); - return; - } - -#ifndef CONFIG_AGP_AMD64 - no_agp = 1; -#else - /* Makefile puts PCI initialization via subsys_initcall first. */ - /* Add other K8 AGP bridge drivers here */ - no_agp = no_agp || - (agp_amd64_init() < 0) || - (agp_copy_info(agp_bridge, &info) < 0); -#endif - - if (swiotlb) - return; - - /* Did we detect a different HW IOMMU? */ - if (iommu_detected && !iommu_aperture) - return; - - if (no_iommu || - (!force_iommu && end_pfn <= MAX_DMA32_PFN) || - !iommu_aperture || - (no_agp && init_k8_gatt(&info) < 0)) { - if (end_pfn > MAX_DMA32_PFN) { - printk(KERN_ERR "WARNING more than 4GB of memory " - "but GART IOMMU not available.\n" - KERN_ERR "WARNING 32bit PCI may malfunction.\n"); - } - return; - } - - printk(KERN_INFO "PCI-DMA: using GART IOMMU.\n"); - aper_size = info.aper_size * 1024 * 1024; - iommu_size = check_iommu_size(info.aper_base, aper_size); - iommu_pages = iommu_size >> PAGE_SHIFT; - - iommu_gart_bitmap = (void*)__get_free_pages(GFP_KERNEL, - get_order(iommu_pages/8)); - if (!iommu_gart_bitmap) - panic("Cannot allocate iommu bitmap\n"); - memset(iommu_gart_bitmap, 0, iommu_pages/8); - -#ifdef CONFIG_IOMMU_LEAK - if (leak_trace) { - iommu_leak_tab = (void *)__get_free_pages(GFP_KERNEL, - get_order(iommu_pages*sizeof(void *))); - if (iommu_leak_tab) - memset(iommu_leak_tab, 0, iommu_pages * 8); - else - printk("PCI-DMA: Cannot allocate leak trace area\n"); - } -#endif - - /* - * Out of IOMMU space handling. - * Reserve some invalid pages at the beginning of the GART. - */ - set_bit_string(iommu_gart_bitmap, 0, EMERGENCY_PAGES); - - agp_memory_reserved = iommu_size; - printk(KERN_INFO - "PCI-DMA: Reserving %luMB of IOMMU area in the AGP aperture\n", - iommu_size>>20); - - iommu_start = aper_size - iommu_size; - iommu_bus_base = info.aper_base + iommu_start; - bad_dma_address = iommu_bus_base; - iommu_gatt_base = agp_gatt_table + (iommu_start>>PAGE_SHIFT); - - /* - * Unmap the IOMMU part of the GART. The alias of the page is - * always mapped with cache enabled and there is no full cache - * coherency across the GART remapping. The unmapping avoids - * automatic prefetches from the CPU allocating cache lines in - * there. All CPU accesses are done via the direct mapping to - * the backing memory. The GART address is only used by PCI - * devices. - */ - clear_kernel_mapping((unsigned long)__va(iommu_bus_base), iommu_size); - - /* - * Try to workaround a bug (thanks to BenH) - * Set unmapped entries to a scratch page instead of 0. - * Any prefetches that hit unmapped entries won't get an bus abort - * then. - */ - scratch = get_zeroed_page(GFP_KERNEL); - if (!scratch) - panic("Cannot allocate iommu scratch page"); - gart_unmapped_entry = GPTE_ENCODE(__pa(scratch)); - for (i = EMERGENCY_PAGES; i < iommu_pages; i++) - iommu_gatt_base[i] = gart_unmapped_entry; - - flush_gart(); - dma_ops = &gart_dma_ops; -} - -void __init gart_parse_options(char *p) -{ - int arg; - -#ifdef CONFIG_IOMMU_LEAK - if (!strncmp(p,"leak",4)) { - leak_trace = 1; - p += 4; - if (*p == '=') ++p; - if (isdigit(*p) && get_option(&p, &arg)) - iommu_leak_pages = arg; - } -#endif - if (isdigit(*p) && get_option(&p, &arg)) - iommu_size = arg; - if (!strncmp(p, "fullflush",8)) - iommu_fullflush = 1; - if (!strncmp(p, "nofullflush",11)) - iommu_fullflush = 0; - if (!strncmp(p,"noagp",5)) - no_agp = 1; - if (!strncmp(p, "noaperture",10)) - fix_aperture = 0; - /* duplicated from pci-dma.c */ - if (!strncmp(p,"force",5)) - iommu_aperture_allowed = 1; - if (!strncmp(p,"allowed",7)) - iommu_aperture_allowed = 1; - if (!strncmp(p, "memaper", 7)) { - fallback_aper_force = 1; - p += 7; - if (*p == '=') { - ++p; - if (get_option(&p, &arg)) - fallback_aper_order = arg; - } - } -} diff --git a/arch/x86_64/kernel/pci-nommu.c b/arch/x86_64/kernel/pci-nommu.c deleted file mode 100644 index 2a34c6c025a9..000000000000 --- a/arch/x86_64/kernel/pci-nommu.c +++ /dev/null @@ -1,97 +0,0 @@ -/* Fallback functions when the main IOMMU code is not compiled in. This - code is roughly equivalent to i386. */ -#include <linux/mm.h> -#include <linux/init.h> -#include <linux/pci.h> -#include <linux/string.h> -#include <linux/dma-mapping.h> - -#include <asm/iommu.h> -#include <asm/processor.h> -#include <asm/dma.h> - -static int -check_addr(char *name, struct device *hwdev, dma_addr_t bus, size_t size) -{ - if (hwdev && bus + size > *hwdev->dma_mask) { - if (*hwdev->dma_mask >= DMA_32BIT_MASK) - printk(KERN_ERR - "nommu_%s: overflow %Lx+%zu of device mask %Lx\n", - name, (long long)bus, size, - (long long)*hwdev->dma_mask); - return 0; - } - return 1; -} - -static dma_addr_t -nommu_map_single(struct device *hwdev, void *ptr, size_t size, - int direction) -{ - dma_addr_t bus = virt_to_bus(ptr); - if (!check_addr("map_single", hwdev, bus, size)) - return bad_dma_address; - return bus; -} - -static void nommu_unmap_single(struct device *dev, dma_addr_t addr,size_t size, - int direction) -{ -} - -/* Map a set of buffers described by scatterlist in streaming - * mode for DMA. This is the scatter-gather version of the - * above pci_map_single interface. Here the scatter gather list - * elements are each tagged with the appropriate dma address - * and length. They are obtained via sg_dma_{address,length}(SG). - * - * NOTE: An implementation may be able to use a smaller number of - * DMA address/length pairs than there are SG table elements. - * (for example via virtual mapping capabilities) - * The routine returns the number of addr/length pairs actually - * used, at most nents. - * - * Device ownership issues as mentioned above for pci_map_single are - * the same here. - */ -static int nommu_map_sg(struct device *hwdev, struct scatterlist *sg, - int nents, int direction) -{ - int i; - - for (i = 0; i < nents; i++ ) { - struct scatterlist *s = &sg[i]; - BUG_ON(!s->page); - s->dma_address = virt_to_bus(page_address(s->page) +s->offset); - if (!check_addr("map_sg", hwdev, s->dma_address, s->length)) - return 0; - s->dma_length = s->length; - } - return nents; -} - -/* Unmap a set of streaming mode DMA translations. - * Again, cpu read rules concerning calls here are the same as for - * pci_unmap_single() above. - */ -static void nommu_unmap_sg(struct device *dev, struct scatterlist *sg, - int nents, int dir) -{ -} - -const struct dma_mapping_ops nommu_dma_ops = { - .map_single = nommu_map_single, - .unmap_single = nommu_unmap_single, - .map_sg = nommu_map_sg, - .unmap_sg = nommu_unmap_sg, - .is_phys = 1, -}; - -void __init no_iommu_init(void) -{ - if (dma_ops) - return; - - force_iommu = 0; /* no HW IOMMU */ - dma_ops = &nommu_dma_ops; -} diff --git a/arch/x86_64/kernel/pci-swiotlb.c b/arch/x86_64/kernel/pci-swiotlb.c deleted file mode 100644 index b2f405ea7c85..000000000000 --- a/arch/x86_64/kernel/pci-swiotlb.c +++ /dev/null @@ -1,44 +0,0 @@ -/* Glue code to lib/swiotlb.c */ - -#include <linux/pci.h> -#include <linux/cache.h> -#include <linux/module.h> -#include <linux/dma-mapping.h> - -#include <asm/iommu.h> -#include <asm/swiotlb.h> -#include <asm/dma.h> - -int swiotlb __read_mostly; -EXPORT_SYMBOL(swiotlb); - -const struct dma_mapping_ops swiotlb_dma_ops = { - .mapping_error = swiotlb_dma_mapping_error, - .alloc_coherent = swiotlb_alloc_coherent, - .free_coherent = swiotlb_free_coherent, - .map_single = swiotlb_map_single, - .unmap_single = swiotlb_unmap_single, - .sync_single_for_cpu = swiotlb_sync_single_for_cpu, - .sync_single_for_device = swiotlb_sync_single_for_device, - .sync_single_range_for_cpu = swiotlb_sync_single_range_for_cpu, - .sync_single_range_for_device = swiotlb_sync_single_range_for_device, - .sync_sg_for_cpu = swiotlb_sync_sg_for_cpu, - .sync_sg_for_device = swiotlb_sync_sg_for_device, - .map_sg = swiotlb_map_sg, - .unmap_sg = swiotlb_unmap_sg, - .dma_supported = NULL, -}; - -void __init pci_swiotlb_init(void) -{ - /* don't initialize swiotlb if iommu=off (no_iommu=1) */ - if (!iommu_detected && !no_iommu && end_pfn > MAX_DMA32_PFN) - swiotlb = 1; - if (swiotlb_force) - swiotlb = 1; - if (swiotlb) { - printk(KERN_INFO "PCI-DMA: Using software bounce buffering for IO (SWIOTLB)\n"); - swiotlb_init(); - dma_ops = &swiotlb_dma_ops; - } -} diff --git a/arch/x86_64/kernel/pmtimer.c b/arch/x86_64/kernel/pmtimer.c deleted file mode 100644 index ae8f91214f15..000000000000 --- a/arch/x86_64/kernel/pmtimer.c +++ /dev/null @@ -1,69 +0,0 @@ -/* Ported over from i386 by AK, original copyright was: - * - * (C) Dominik Brodowski <linux@brodo.de> 2003 - * - * Driver to use the Power Management Timer (PMTMR) available in some - * southbridges as primary timing source for the Linux kernel. - * - * Based on parts of linux/drivers/acpi/hardware/hwtimer.c, timer_pit.c, - * timer_hpet.c, and on Arjan van de Ven's implementation for 2.4. - * - * This file is licensed under the GPL v2. - * - * Dropped all the hardware bug workarounds for now. Hopefully they - * are not needed on 64bit chipsets. - */ - -#include <linux/jiffies.h> -#include <linux/kernel.h> -#include <linux/time.h> -#include <linux/init.h> -#include <linux/cpumask.h> -#include <asm/io.h> -#include <asm/proto.h> -#include <asm/msr.h> -#include <asm/vsyscall.h> - -#define ACPI_PM_MASK 0xFFFFFF /* limit it to 24 bits */ - -static inline u32 cyc2us(u32 cycles) -{ - /* The Power Management Timer ticks at 3.579545 ticks per microsecond. - * 1 / PM_TIMER_FREQUENCY == 0.27936511 =~ 286/1024 [error: 0.024%] - * - * Even with HZ = 100, delta is at maximum 35796 ticks, so it can - * easily be multiplied with 286 (=0x11E) without having to fear - * u32 overflows. - */ - cycles *= 286; - return (cycles >> 10); -} - -static unsigned pmtimer_wait_tick(void) -{ - u32 a, b; - for (a = b = inl(pmtmr_ioport) & ACPI_PM_MASK; - a == b; - b = inl(pmtmr_ioport) & ACPI_PM_MASK) - cpu_relax(); - return b; -} - -/* note: wait time is rounded up to one tick */ -void pmtimer_wait(unsigned us) -{ - u32 a, b; - a = pmtimer_wait_tick(); - do { - b = inl(pmtmr_ioport); - cpu_relax(); - } while (cyc2us(b - a) < us); -} - -static int __init nopmtimer_setup(char *s) -{ - pmtmr_ioport = 0; - return 1; -} - -__setup("nopmtimer", nopmtimer_setup); diff --git a/arch/x86_64/kernel/process.c b/arch/x86_64/kernel/process.c deleted file mode 100644 index 98956555450b..000000000000 --- a/arch/x86_64/kernel/process.c +++ /dev/null @@ -1,903 +0,0 @@ -/* - * linux/arch/x86-64/kernel/process.c - * - * Copyright (C) 1995 Linus Torvalds - * - * Pentium III FXSR, SSE support - * Gareth Hughes <gareth@valinux.com>, May 2000 - * - * X86-64 port - * Andi Kleen. - * - * CPU hotplug support - ashok.raj@intel.com - */ - -/* - * This file handles the architecture-dependent parts of process handling.. - */ - -#include <stdarg.h> - -#include <linux/cpu.h> -#include <linux/errno.h> -#include <linux/sched.h> -#include <linux/kernel.h> -#include <linux/mm.h> -#include <linux/fs.h> -#include <linux/elfcore.h> -#include <linux/smp.h> -#include <linux/slab.h> -#include <linux/user.h> -#include <linux/module.h> -#include <linux/a.out.h> -#include <linux/interrupt.h> -#include <linux/delay.h> -#include <linux/ptrace.h> -#include <linux/utsname.h> -#include <linux/random.h> -#include <linux/notifier.h> -#include <linux/kprobes.h> -#include <linux/kdebug.h> - -#include <asm/uaccess.h> -#include <asm/pgtable.h> -#include <asm/system.h> -#include <asm/io.h> -#include <asm/processor.h> -#include <asm/i387.h> -#include <asm/mmu_context.h> -#include <asm/pda.h> -#include <asm/prctl.h> -#include <asm/desc.h> -#include <asm/proto.h> -#include <asm/ia32.h> -#include <asm/idle.h> - -asmlinkage extern void ret_from_fork(void); - -unsigned long kernel_thread_flags = CLONE_VM | CLONE_UNTRACED; - -unsigned long boot_option_idle_override = 0; -EXPORT_SYMBOL(boot_option_idle_override); - -/* - * Powermanagement idle function, if any.. - */ -void (*pm_idle)(void); -EXPORT_SYMBOL(pm_idle); -static DEFINE_PER_CPU(unsigned int, cpu_idle_state); - -static ATOMIC_NOTIFIER_HEAD(idle_notifier); - -void idle_notifier_register(struct notifier_block *n) -{ - atomic_notifier_chain_register(&idle_notifier, n); -} -EXPORT_SYMBOL_GPL(idle_notifier_register); - -void idle_notifier_unregister(struct notifier_block *n) -{ - atomic_notifier_chain_unregister(&idle_notifier, n); -} -EXPORT_SYMBOL(idle_notifier_unregister); - -void enter_idle(void) -{ - write_pda(isidle, 1); - atomic_notifier_call_chain(&idle_notifier, IDLE_START, NULL); -} - -static void __exit_idle(void) -{ - if (test_and_clear_bit_pda(0, isidle) == 0) - return; - atomic_notifier_call_chain(&idle_notifier, IDLE_END, NULL); -} - -/* Called from interrupts to signify idle end */ -void exit_idle(void) -{ - /* idle loop has pid 0 */ - if (current->pid) - return; - __exit_idle(); -} - -/* - * We use this if we don't have any better - * idle routine.. - */ -static void default_idle(void) -{ - current_thread_info()->status &= ~TS_POLLING; - /* - * TS_POLLING-cleared state must be visible before we - * test NEED_RESCHED: - */ - smp_mb(); - local_irq_disable(); - if (!need_resched()) { - /* Enables interrupts one instruction before HLT. - x86 special cases this so there is no race. */ - safe_halt(); - } else - local_irq_enable(); - current_thread_info()->status |= TS_POLLING; -} - -/* - * On SMP it's slightly faster (but much more power-consuming!) - * to poll the ->need_resched flag instead of waiting for the - * cross-CPU IPI to arrive. Use this option with caution. - */ -static void poll_idle (void) -{ - local_irq_enable(); - cpu_relax(); -} - -void cpu_idle_wait(void) -{ - unsigned int cpu, this_cpu = get_cpu(); - cpumask_t map, tmp = current->cpus_allowed; - - set_cpus_allowed(current, cpumask_of_cpu(this_cpu)); - put_cpu(); - - cpus_clear(map); - for_each_online_cpu(cpu) { - per_cpu(cpu_idle_state, cpu) = 1; - cpu_set(cpu, map); - } - - __get_cpu_var(cpu_idle_state) = 0; - - wmb(); - do { - ssleep(1); - for_each_online_cpu(cpu) { - if (cpu_isset(cpu, map) && - !per_cpu(cpu_idle_state, cpu)) - cpu_clear(cpu, map); - } - cpus_and(map, map, cpu_online_map); - } while (!cpus_empty(map)); - - set_cpus_allowed(current, tmp); -} -EXPORT_SYMBOL_GPL(cpu_idle_wait); - -#ifdef CONFIG_HOTPLUG_CPU -DECLARE_PER_CPU(int, cpu_state); - -#include <asm/nmi.h> -/* We halt the CPU with physical CPU hotplug */ -static inline void play_dead(void) -{ - idle_task_exit(); - wbinvd(); - mb(); - /* Ack it */ - __get_cpu_var(cpu_state) = CPU_DEAD; - - local_irq_disable(); - while (1) - halt(); -} -#else -static inline void play_dead(void) -{ - BUG(); -} -#endif /* CONFIG_HOTPLUG_CPU */ - -/* - * The idle thread. There's no useful work to be - * done, so just try to conserve power and have a - * low exit latency (ie sit in a loop waiting for - * somebody to say that they'd like to reschedule) - */ -void cpu_idle (void) -{ - current_thread_info()->status |= TS_POLLING; - /* endless idle loop with no priority at all */ - while (1) { - while (!need_resched()) { - void (*idle)(void); - - if (__get_cpu_var(cpu_idle_state)) - __get_cpu_var(cpu_idle_state) = 0; - - rmb(); - idle = pm_idle; - if (!idle) - idle = default_idle; - if (cpu_is_offline(smp_processor_id())) - play_dead(); - /* - * Idle routines should keep interrupts disabled - * from here on, until they go to idle. - * Otherwise, idle callbacks can misfire. - */ - local_irq_disable(); - enter_idle(); - idle(); - /* In many cases the interrupt that ended idle - has already called exit_idle. But some idle - loops can be woken up without interrupt. */ - __exit_idle(); - } - - preempt_enable_no_resched(); - schedule(); - preempt_disable(); - } -} - -/* - * This uses new MONITOR/MWAIT instructions on P4 processors with PNI, - * which can obviate IPI to trigger checking of need_resched. - * We execute MONITOR against need_resched and enter optimized wait state - * through MWAIT. Whenever someone changes need_resched, we would be woken - * up from MWAIT (without an IPI). - * - * New with Core Duo processors, MWAIT can take some hints based on CPU - * capability. - */ -void mwait_idle_with_hints(unsigned long eax, unsigned long ecx) -{ - if (!need_resched()) { - __monitor((void *)¤t_thread_info()->flags, 0, 0); - smp_mb(); - if (!need_resched()) - __mwait(eax, ecx); - } -} - -/* Default MONITOR/MWAIT with no hints, used for default C1 state */ -static void mwait_idle(void) -{ - if (!need_resched()) { - __monitor((void *)¤t_thread_info()->flags, 0, 0); - smp_mb(); - if (!need_resched()) - __sti_mwait(0, 0); - else - local_irq_enable(); - } else { - local_irq_enable(); - } -} - -void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c) -{ - static int printed; - if (cpu_has(c, X86_FEATURE_MWAIT)) { - /* - * Skip, if setup has overridden idle. - * One CPU supports mwait => All CPUs supports mwait - */ - if (!pm_idle) { - if (!printed) { - printk(KERN_INFO "using mwait in idle threads.\n"); - printed = 1; - } - pm_idle = mwait_idle; - } - } -} - -static int __init idle_setup (char *str) -{ - if (!strcmp(str, "poll")) { - printk("using polling idle threads.\n"); - pm_idle = poll_idle; - } else if (!strcmp(str, "mwait")) - force_mwait = 1; - else - return -1; - - boot_option_idle_override = 1; - return 0; -} -early_param("idle", idle_setup); - -/* Prints also some state that isn't saved in the pt_regs */ -void __show_regs(struct pt_regs * regs) -{ - unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs; - unsigned long d0, d1, d2, d3, d6, d7; - unsigned int fsindex,gsindex; - unsigned int ds,cs,es; - - printk("\n"); - print_modules(); - printk("Pid: %d, comm: %.20s %s %s %.*s\n", - current->pid, current->comm, print_tainted(), - init_utsname()->release, - (int)strcspn(init_utsname()->version, " "), - init_utsname()->version); - printk("RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->rip); - printk_address(regs->rip); - printk("RSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss, regs->rsp, - regs->eflags); - printk("RAX: %016lx RBX: %016lx RCX: %016lx\n", - regs->rax, regs->rbx, regs->rcx); - printk("RDX: %016lx RSI: %016lx RDI: %016lx\n", - regs->rdx, regs->rsi, regs->rdi); - printk("RBP: %016lx R08: %016lx R09: %016lx\n", - regs->rbp, regs->r8, regs->r9); - printk("R10: %016lx R11: %016lx R12: %016lx\n", - regs->r10, regs->r11, regs->r12); - printk("R13: %016lx R14: %016lx R15: %016lx\n", - regs->r13, regs->r14, regs->r15); - - asm("movl %%ds,%0" : "=r" (ds)); - asm("movl %%cs,%0" : "=r" (cs)); - asm("movl %%es,%0" : "=r" (es)); - asm("movl %%fs,%0" : "=r" (fsindex)); - asm("movl %%gs,%0" : "=r" (gsindex)); - - rdmsrl(MSR_FS_BASE, fs); - rdmsrl(MSR_GS_BASE, gs); - rdmsrl(MSR_KERNEL_GS_BASE, shadowgs); - - cr0 = read_cr0(); - cr2 = read_cr2(); - cr3 = read_cr3(); - cr4 = read_cr4(); - - printk("FS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n", - fs,fsindex,gs,gsindex,shadowgs); - printk("CS: %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds, es, cr0); - printk("CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3, cr4); - - get_debugreg(d0, 0); - get_debugreg(d1, 1); - get_debugreg(d2, 2); - printk("DR0: %016lx DR1: %016lx DR2: %016lx\n", d0, d1, d2); - get_debugreg(d3, 3); - get_debugreg(d6, 6); - get_debugreg(d7, 7); - printk("DR3: %016lx DR6: %016lx DR7: %016lx\n", d3, d6, d7); -} - -void show_regs(struct pt_regs *regs) -{ - printk("CPU %d:", smp_processor_id()); - __show_regs(regs); - show_trace(NULL, regs, (void *)(regs + 1)); -} - -/* - * Free current thread data structures etc.. - */ -void exit_thread(void) -{ - struct task_struct *me = current; - struct thread_struct *t = &me->thread; - - if (me->thread.io_bitmap_ptr) { - struct tss_struct *tss = &per_cpu(init_tss, get_cpu()); - - kfree(t->io_bitmap_ptr); - t->io_bitmap_ptr = NULL; - clear_thread_flag(TIF_IO_BITMAP); - /* - * Careful, clear this in the TSS too: - */ - memset(tss->io_bitmap, 0xff, t->io_bitmap_max); - t->io_bitmap_max = 0; - put_cpu(); - } -} - -void flush_thread(void) -{ - struct task_struct *tsk = current; - - if (test_tsk_thread_flag(tsk, TIF_ABI_PENDING)) { - clear_tsk_thread_flag(tsk, TIF_ABI_PENDING); - if (test_tsk_thread_flag(tsk, TIF_IA32)) { - clear_tsk_thread_flag(tsk, TIF_IA32); - } else { - set_tsk_thread_flag(tsk, TIF_IA32); - current_thread_info()->status |= TS_COMPAT; - } - } - clear_tsk_thread_flag(tsk, TIF_DEBUG); - - tsk->thread.debugreg0 = 0; - tsk->thread.debugreg1 = 0; - tsk->thread.debugreg2 = 0; - tsk->thread.debugreg3 = 0; - tsk->thread.debugreg6 = 0; - tsk->thread.debugreg7 = 0; - memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array)); - /* - * Forget coprocessor state.. - */ - clear_fpu(tsk); - clear_used_math(); -} - -void release_thread(struct task_struct *dead_task) -{ - if (dead_task->mm) { - if (dead_task->mm->context.size) { - printk("WARNING: dead process %8s still has LDT? <%p/%d>\n", - dead_task->comm, - dead_task->mm->context.ldt, - dead_task->mm->context.size); - BUG(); - } - } -} - -static inline void set_32bit_tls(struct task_struct *t, int tls, u32 addr) -{ - struct user_desc ud = { - .base_addr = addr, - .limit = 0xfffff, - .seg_32bit = 1, - .limit_in_pages = 1, - .useable = 1, - }; - struct n_desc_struct *desc = (void *)t->thread.tls_array; - desc += tls; - desc->a = LDT_entry_a(&ud); - desc->b = LDT_entry_b(&ud); -} - -static inline u32 read_32bit_tls(struct task_struct *t, int tls) -{ - struct desc_struct *desc = (void *)t->thread.tls_array; - desc += tls; - return desc->base0 | - (((u32)desc->base1) << 16) | - (((u32)desc->base2) << 24); -} - -/* - * This gets called before we allocate a new thread and copy - * the current task into it. - */ -void prepare_to_copy(struct task_struct *tsk) -{ - unlazy_fpu(tsk); -} - -int copy_thread(int nr, unsigned long clone_flags, unsigned long rsp, - unsigned long unused, - struct task_struct * p, struct pt_regs * regs) -{ - int err; - struct pt_regs * childregs; - struct task_struct *me = current; - - childregs = ((struct pt_regs *) - (THREAD_SIZE + task_stack_page(p))) - 1; - *childregs = *regs; - - childregs->rax = 0; - childregs->rsp = rsp; - if (rsp == ~0UL) - childregs->rsp = (unsigned long)childregs; - - p->thread.rsp = (unsigned long) childregs; - p->thread.rsp0 = (unsigned long) (childregs+1); - p->thread.userrsp = me->thread.userrsp; - - set_tsk_thread_flag(p, TIF_FORK); - - p->thread.fs = me->thread.fs; - p->thread.gs = me->thread.gs; - - asm("mov %%gs,%0" : "=m" (p->thread.gsindex)); - asm("mov %%fs,%0" : "=m" (p->thread.fsindex)); - asm("mov %%es,%0" : "=m" (p->thread.es)); - asm("mov %%ds,%0" : "=m" (p->thread.ds)); - - if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) { - p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL); - if (!p->thread.io_bitmap_ptr) { - p->thread.io_bitmap_max = 0; - return -ENOMEM; - } - memcpy(p->thread.io_bitmap_ptr, me->thread.io_bitmap_ptr, - IO_BITMAP_BYTES); - set_tsk_thread_flag(p, TIF_IO_BITMAP); - } - - /* - * Set a new TLS for the child thread? - */ - if (clone_flags & CLONE_SETTLS) { -#ifdef CONFIG_IA32_EMULATION - if (test_thread_flag(TIF_IA32)) - err = ia32_child_tls(p, childregs); - else -#endif - err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8); - if (err) - goto out; - } - err = 0; -out: - if (err && p->thread.io_bitmap_ptr) { - kfree(p->thread.io_bitmap_ptr); - p->thread.io_bitmap_max = 0; - } - return err; -} - -/* - * This special macro can be used to load a debugging register - */ -#define loaddebug(thread,r) set_debugreg(thread->debugreg ## r, r) - -static inline void __switch_to_xtra(struct task_struct *prev_p, - struct task_struct *next_p, - struct tss_struct *tss) -{ - struct thread_struct *prev, *next; - - prev = &prev_p->thread, - next = &next_p->thread; - - if (test_tsk_thread_flag(next_p, TIF_DEBUG)) { - loaddebug(next, 0); - loaddebug(next, 1); - loaddebug(next, 2); - loaddebug(next, 3); - /* no 4 and 5 */ - loaddebug(next, 6); - loaddebug(next, 7); - } - - if (test_tsk_thread_flag(next_p, TIF_IO_BITMAP)) { - /* - * Copy the relevant range of the IO bitmap. - * Normally this is 128 bytes or less: - */ - memcpy(tss->io_bitmap, next->io_bitmap_ptr, - max(prev->io_bitmap_max, next->io_bitmap_max)); - } else if (test_tsk_thread_flag(prev_p, TIF_IO_BITMAP)) { - /* - * Clear any possible leftover bits: - */ - memset(tss->io_bitmap, 0xff, prev->io_bitmap_max); - } -} - -/* - * switch_to(x,y) should switch tasks from x to y. - * - * This could still be optimized: - * - fold all the options into a flag word and test it with a single test. - * - could test fs/gs bitsliced - * - * Kprobes not supported here. Set the probe on schedule instead. - */ -__kprobes struct task_struct * -__switch_to(struct task_struct *prev_p, struct task_struct *next_p) -{ - struct thread_struct *prev = &prev_p->thread, - *next = &next_p->thread; - int cpu = smp_processor_id(); - struct tss_struct *tss = &per_cpu(init_tss, cpu); - - /* we're going to use this soon, after a few expensive things */ - if (next_p->fpu_counter>5) - prefetch(&next->i387.fxsave); - - /* - * Reload esp0, LDT and the page table pointer: - */ - tss->rsp0 = next->rsp0; - - /* - * Switch DS and ES. - * This won't pick up thread selector changes, but I guess that is ok. - */ - asm volatile("mov %%es,%0" : "=m" (prev->es)); - if (unlikely(next->es | prev->es)) - loadsegment(es, next->es); - - asm volatile ("mov %%ds,%0" : "=m" (prev->ds)); - if (unlikely(next->ds | prev->ds)) - loadsegment(ds, next->ds); - - load_TLS(next, cpu); - - /* - * Switch FS and GS. - */ - { - unsigned fsindex; - asm volatile("movl %%fs,%0" : "=r" (fsindex)); - /* segment register != 0 always requires a reload. - also reload when it has changed. - when prev process used 64bit base always reload - to avoid an information leak. */ - if (unlikely(fsindex | next->fsindex | prev->fs)) { - loadsegment(fs, next->fsindex); - /* check if the user used a selector != 0 - * if yes clear 64bit base, since overloaded base - * is always mapped to the Null selector - */ - if (fsindex) - prev->fs = 0; - } - /* when next process has a 64bit base use it */ - if (next->fs) - wrmsrl(MSR_FS_BASE, next->fs); - prev->fsindex = fsindex; - } - { - unsigned gsindex; - asm volatile("movl %%gs,%0" : "=r" (gsindex)); - if (unlikely(gsindex | next->gsindex | prev->gs)) { - load_gs_index(next->gsindex); - if (gsindex) - prev->gs = 0; - } - if (next->gs) - wrmsrl(MSR_KERNEL_GS_BASE, next->gs); - prev->gsindex = gsindex; - } - - /* Must be after DS reload */ - unlazy_fpu(prev_p); - - /* - * Switch the PDA and FPU contexts. - */ - prev->userrsp = read_pda(oldrsp); - write_pda(oldrsp, next->userrsp); - write_pda(pcurrent, next_p); - - write_pda(kernelstack, - (unsigned long)task_stack_page(next_p) + THREAD_SIZE - PDA_STACKOFFSET); -#ifdef CONFIG_CC_STACKPROTECTOR - write_pda(stack_canary, next_p->stack_canary); - /* - * Build time only check to make sure the stack_canary is at - * offset 40 in the pda; this is a gcc ABI requirement - */ - BUILD_BUG_ON(offsetof(struct x8664_pda, stack_canary) != 40); -#endif - - /* - * Now maybe reload the debug registers and handle I/O bitmaps - */ - if (unlikely((task_thread_info(next_p)->flags & _TIF_WORK_CTXSW)) - || test_tsk_thread_flag(prev_p, TIF_IO_BITMAP)) - __switch_to_xtra(prev_p, next_p, tss); - - /* If the task has used fpu the last 5 timeslices, just do a full - * restore of the math state immediately to avoid the trap; the - * chances of needing FPU soon are obviously high now - */ - if (next_p->fpu_counter>5) - math_state_restore(); - return prev_p; -} - -/* - * sys_execve() executes a new program. - */ -asmlinkage -long sys_execve(char __user *name, char __user * __user *argv, - char __user * __user *envp, struct pt_regs regs) -{ - long error; - char * filename; - - filename = getname(name); - error = PTR_ERR(filename); - if (IS_ERR(filename)) - return error; - error = do_execve(filename, argv, envp, ®s); - if (error == 0) { - task_lock(current); - current->ptrace &= ~PT_DTRACE; - task_unlock(current); - } - putname(filename); - return error; -} - -void set_personality_64bit(void) -{ - /* inherit personality from parent */ - - /* Make sure to be in 64bit mode */ - clear_thread_flag(TIF_IA32); - - /* TBD: overwrites user setup. Should have two bits. - But 64bit processes have always behaved this way, - so it's not too bad. The main problem is just that - 32bit childs are affected again. */ - current->personality &= ~READ_IMPLIES_EXEC; -} - -asmlinkage long sys_fork(struct pt_regs *regs) -{ - return do_fork(SIGCHLD, regs->rsp, regs, 0, NULL, NULL); -} - -asmlinkage long -sys_clone(unsigned long clone_flags, unsigned long newsp, - void __user *parent_tid, void __user *child_tid, struct pt_regs *regs) -{ - if (!newsp) - newsp = regs->rsp; - return do_fork(clone_flags, newsp, regs, 0, parent_tid, child_tid); -} - -/* - * This is trivial, and on the face of it looks like it - * could equally well be done in user mode. - * - * Not so, for quite unobvious reasons - register pressure. - * In user mode vfork() cannot have a stack frame, and if - * done by calling the "clone()" system call directly, you - * do not have enough call-clobbered registers to hold all - * the information you need. - */ -asmlinkage long sys_vfork(struct pt_regs *regs) -{ - return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs->rsp, regs, 0, - NULL, NULL); -} - -unsigned long get_wchan(struct task_struct *p) -{ - unsigned long stack; - u64 fp,rip; - int count = 0; - - if (!p || p == current || p->state==TASK_RUNNING) - return 0; - stack = (unsigned long)task_stack_page(p); - if (p->thread.rsp < stack || p->thread.rsp > stack+THREAD_SIZE) - return 0; - fp = *(u64 *)(p->thread.rsp); - do { - if (fp < (unsigned long)stack || - fp > (unsigned long)stack+THREAD_SIZE) - return 0; - rip = *(u64 *)(fp+8); - if (!in_sched_functions(rip)) - return rip; - fp = *(u64 *)fp; - } while (count++ < 16); - return 0; -} - -long do_arch_prctl(struct task_struct *task, int code, unsigned long addr) -{ - int ret = 0; - int doit = task == current; - int cpu; - - switch (code) { - case ARCH_SET_GS: - if (addr >= TASK_SIZE_OF(task)) - return -EPERM; - cpu = get_cpu(); - /* handle small bases via the GDT because that's faster to - switch. */ - if (addr <= 0xffffffff) { - set_32bit_tls(task, GS_TLS, addr); - if (doit) { - load_TLS(&task->thread, cpu); - load_gs_index(GS_TLS_SEL); - } - task->thread.gsindex = GS_TLS_SEL; - task->thread.gs = 0; - } else { - task->thread.gsindex = 0; - task->thread.gs = addr; - if (doit) { - load_gs_index(0); - ret = checking_wrmsrl(MSR_KERNEL_GS_BASE, addr); - } - } - put_cpu(); - break; - case ARCH_SET_FS: - /* Not strictly needed for fs, but do it for symmetry - with gs */ - if (addr >= TASK_SIZE_OF(task)) - return -EPERM; - cpu = get_cpu(); - /* handle small bases via the GDT because that's faster to - switch. */ - if (addr <= 0xffffffff) { - set_32bit_tls(task, FS_TLS, addr); - if (doit) { - load_TLS(&task->thread, cpu); - asm volatile("movl %0,%%fs" :: "r"(FS_TLS_SEL)); - } - task->thread.fsindex = FS_TLS_SEL; - task->thread.fs = 0; - } else { - task->thread.fsindex = 0; - task->thread.fs = addr; - if (doit) { - /* set the selector to 0 to not confuse - __switch_to */ - asm volatile("movl %0,%%fs" :: "r" (0)); - ret = checking_wrmsrl(MSR_FS_BASE, addr); - } - } - put_cpu(); - break; - case ARCH_GET_FS: { - unsigned long base; - if (task->thread.fsindex == FS_TLS_SEL) - base = read_32bit_tls(task, FS_TLS); - else if (doit) - rdmsrl(MSR_FS_BASE, base); - else - base = task->thread.fs; - ret = put_user(base, (unsigned long __user *)addr); - break; - } - case ARCH_GET_GS: { - unsigned long base; - unsigned gsindex; - if (task->thread.gsindex == GS_TLS_SEL) - base = read_32bit_tls(task, GS_TLS); - else if (doit) { - asm("movl %%gs,%0" : "=r" (gsindex)); - if (gsindex) - rdmsrl(MSR_KERNEL_GS_BASE, base); - else - base = task->thread.gs; - } - else - base = task->thread.gs; - ret = put_user(base, (unsigned long __user *)addr); - break; - } - - default: - ret = -EINVAL; - break; - } - - return ret; -} - -long sys_arch_prctl(int code, unsigned long addr) -{ - return do_arch_prctl(current, code, addr); -} - -/* - * Capture the user space registers if the task is not running (in user space) - */ -int dump_task_regs(struct task_struct *tsk, elf_gregset_t *regs) -{ - struct pt_regs *pp, ptregs; - - pp = task_pt_regs(tsk); - - ptregs = *pp; - ptregs.cs &= 0xffff; - ptregs.ss &= 0xffff; - - elf_core_copy_regs(regs, &ptregs); - - return 1; -} - -unsigned long arch_align_stack(unsigned long sp) -{ - if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space) - sp -= get_random_int() % 8192; - return sp & ~0xf; -} diff --git a/arch/x86_64/kernel/ptrace.c b/arch/x86_64/kernel/ptrace.c deleted file mode 100644 index eea3702427b4..000000000000 --- a/arch/x86_64/kernel/ptrace.c +++ /dev/null @@ -1,627 +0,0 @@ -/* ptrace.c */ -/* By Ross Biro 1/23/92 */ -/* - * Pentium III FXSR, SSE support - * Gareth Hughes <gareth@valinux.com>, May 2000 - * - * x86-64 port 2000-2002 Andi Kleen - */ - -#include <linux/kernel.h> -#include <linux/sched.h> -#include <linux/mm.h> -#include <linux/smp.h> -#include <linux/errno.h> -#include <linux/ptrace.h> -#include <linux/user.h> -#include <linux/security.h> -#include <linux/audit.h> -#include <linux/seccomp.h> -#include <linux/signal.h> - -#include <asm/uaccess.h> -#include <asm/pgtable.h> -#include <asm/system.h> -#include <asm/processor.h> -#include <asm/i387.h> -#include <asm/debugreg.h> -#include <asm/ldt.h> -#include <asm/desc.h> -#include <asm/proto.h> -#include <asm/ia32.h> - -/* - * does not yet catch signals sent when the child dies. - * in exit.c or in signal.c. - */ - -/* - * Determines which flags the user has access to [1 = access, 0 = no access]. - * Prohibits changing ID(21), VIP(20), VIF(19), VM(17), IOPL(12-13), IF(9). - * Also masks reserved bits (63-22, 15, 5, 3, 1). - */ -#define FLAG_MASK 0x54dd5UL - -/* set's the trap flag. */ -#define TRAP_FLAG 0x100UL - -/* - * eflags and offset of eflags on child stack.. - */ -#define EFLAGS offsetof(struct pt_regs, eflags) -#define EFL_OFFSET ((int)(EFLAGS-sizeof(struct pt_regs))) - -/* - * this routine will get a word off of the processes privileged stack. - * the offset is how far from the base addr as stored in the TSS. - * this routine assumes that all the privileged stacks are in our - * data space. - */ -static inline unsigned long get_stack_long(struct task_struct *task, int offset) -{ - unsigned char *stack; - - stack = (unsigned char *)task->thread.rsp0; - stack += offset; - return (*((unsigned long *)stack)); -} - -/* - * this routine will put a word on the processes privileged stack. - * the offset is how far from the base addr as stored in the TSS. - * this routine assumes that all the privileged stacks are in our - * data space. - */ -static inline long put_stack_long(struct task_struct *task, int offset, - unsigned long data) -{ - unsigned char * stack; - - stack = (unsigned char *) task->thread.rsp0; - stack += offset; - *(unsigned long *) stack = data; - return 0; -} - -#define LDT_SEGMENT 4 - -unsigned long convert_rip_to_linear(struct task_struct *child, struct pt_regs *regs) -{ - unsigned long addr, seg; - - addr = regs->rip; - seg = regs->cs & 0xffff; - - /* - * We'll assume that the code segments in the GDT - * are all zero-based. That is largely true: the - * TLS segments are used for data, and the PNPBIOS - * and APM bios ones we just ignore here. - */ - if (seg & LDT_SEGMENT) { - u32 *desc; - unsigned long base; - - seg &= ~7UL; - - down(&child->mm->context.sem); - if (unlikely((seg >> 3) >= child->mm->context.size)) - addr = -1L; /* bogus selector, access would fault */ - else { - desc = child->mm->context.ldt + seg; - base = ((desc[0] >> 16) | - ((desc[1] & 0xff) << 16) | - (desc[1] & 0xff000000)); - - /* 16-bit code segment? */ - if (!((desc[1] >> 22) & 1)) - addr &= 0xffff; - addr += base; - } - up(&child->mm->context.sem); - } - - return addr; -} - -static int is_setting_trap_flag(struct task_struct *child, struct pt_regs *regs) -{ - int i, copied; - unsigned char opcode[15]; - unsigned long addr = convert_rip_to_linear(child, regs); - - copied = access_process_vm(child, addr, opcode, sizeof(opcode), 0); - for (i = 0; i < copied; i++) { - switch (opcode[i]) { - /* popf and iret */ - case 0x9d: case 0xcf: - return 1; - - /* CHECKME: 64 65 */ - - /* opcode and address size prefixes */ - case 0x66: case 0x67: - continue; - /* irrelevant prefixes (segment overrides and repeats) */ - case 0x26: case 0x2e: - case 0x36: case 0x3e: - case 0x64: case 0x65: - case 0xf2: case 0xf3: - continue; - - case 0x40 ... 0x4f: - if (regs->cs != __USER_CS) - /* 32-bit mode: register increment */ - return 0; - /* 64-bit mode: REX prefix */ - continue; - - /* CHECKME: f2, f3 */ - - /* - * pushf: NOTE! We should probably not let - * the user see the TF bit being set. But - * it's more pain than it's worth to avoid - * it, and a debugger could emulate this - * all in user space if it _really_ cares. - */ - case 0x9c: - default: - return 0; - } - } - return 0; -} - -static void set_singlestep(struct task_struct *child) -{ - struct pt_regs *regs = task_pt_regs(child); - - /* - * Always set TIF_SINGLESTEP - this guarantees that - * we single-step system calls etc.. This will also - * cause us to set TF when returning to user mode. - */ - set_tsk_thread_flag(child, TIF_SINGLESTEP); - - /* - * If TF was already set, don't do anything else - */ - if (regs->eflags & TRAP_FLAG) - return; - - /* Set TF on the kernel stack.. */ - regs->eflags |= TRAP_FLAG; - - /* - * ..but if TF is changed by the instruction we will trace, - * don't mark it as being "us" that set it, so that we - * won't clear it by hand later. - */ - if (is_setting_trap_flag(child, regs)) - return; - - child->ptrace |= PT_DTRACE; -} - -static void clear_singlestep(struct task_struct *child) -{ - /* Always clear TIF_SINGLESTEP... */ - clear_tsk_thread_flag(child, TIF_SINGLESTEP); - - /* But touch TF only if it was set by us.. */ - if (child->ptrace & PT_DTRACE) { - struct pt_regs *regs = task_pt_regs(child); - regs->eflags &= ~TRAP_FLAG; - child->ptrace &= ~PT_DTRACE; - } -} - -/* - * Called by kernel/ptrace.c when detaching.. - * - * Make sure the single step bit is not set. - */ -void ptrace_disable(struct task_struct *child) -{ - clear_singlestep(child); -} - -static int putreg(struct task_struct *child, - unsigned long regno, unsigned long value) -{ - unsigned long tmp; - - switch (regno) { - case offsetof(struct user_regs_struct,fs): - if (value && (value & 3) != 3) - return -EIO; - child->thread.fsindex = value & 0xffff; - return 0; - case offsetof(struct user_regs_struct,gs): - if (value && (value & 3) != 3) - return -EIO; - child->thread.gsindex = value & 0xffff; - return 0; - case offsetof(struct user_regs_struct,ds): - if (value && (value & 3) != 3) - return -EIO; - child->thread.ds = value & 0xffff; - return 0; - case offsetof(struct user_regs_struct,es): - if (value && (value & 3) != 3) - return -EIO; - child->thread.es = value & 0xffff; - return 0; - case offsetof(struct user_regs_struct,ss): - if ((value & 3) != 3) - return -EIO; - value &= 0xffff; - return 0; - case offsetof(struct user_regs_struct,fs_base): - if (value >= TASK_SIZE_OF(child)) - return -EIO; - child->thread.fs = value; - return 0; - case offsetof(struct user_regs_struct,gs_base): - if (value >= TASK_SIZE_OF(child)) - return -EIO; - child->thread.gs = value; - return 0; - case offsetof(struct user_regs_struct, eflags): - value &= FLAG_MASK; - tmp = get_stack_long(child, EFL_OFFSET); - tmp &= ~FLAG_MASK; - value |= tmp; - break; - case offsetof(struct user_regs_struct,cs): - if ((value & 3) != 3) - return -EIO; - value &= 0xffff; - break; - } - put_stack_long(child, regno - sizeof(struct pt_regs), value); - return 0; -} - -static unsigned long getreg(struct task_struct *child, unsigned long regno) -{ - unsigned long val; - switch (regno) { - case offsetof(struct user_regs_struct, fs): - return child->thread.fsindex; - case offsetof(struct user_regs_struct, gs): - return child->thread.gsindex; - case offsetof(struct user_regs_struct, ds): - return child->thread.ds; - case offsetof(struct user_regs_struct, es): - return child->thread.es; - case offsetof(struct user_regs_struct, fs_base): - return child->thread.fs; - case offsetof(struct user_regs_struct, gs_base): - return child->thread.gs; - default: - regno = regno - sizeof(struct pt_regs); - val = get_stack_long(child, regno); - if (test_tsk_thread_flag(child, TIF_IA32)) - val &= 0xffffffff; - return val; - } - -} - -long arch_ptrace(struct task_struct *child, long request, long addr, long data) -{ - long i, ret; - unsigned ui; - - switch (request) { - /* when I and D space are separate, these will need to be fixed. */ - case PTRACE_PEEKTEXT: /* read word at location addr. */ - case PTRACE_PEEKDATA: - ret = generic_ptrace_peekdata(child, addr, data); - break; - - /* read the word at location addr in the USER area. */ - case PTRACE_PEEKUSR: { - unsigned long tmp; - - ret = -EIO; - if ((addr & 7) || - addr > sizeof(struct user) - 7) - break; - - switch (addr) { - case 0 ... sizeof(struct user_regs_struct) - sizeof(long): - tmp = getreg(child, addr); - break; - case offsetof(struct user, u_debugreg[0]): - tmp = child->thread.debugreg0; - break; - case offsetof(struct user, u_debugreg[1]): - tmp = child->thread.debugreg1; - break; - case offsetof(struct user, u_debugreg[2]): - tmp = child->thread.debugreg2; - break; - case offsetof(struct user, u_debugreg[3]): - tmp = child->thread.debugreg3; - break; - case offsetof(struct user, u_debugreg[6]): - tmp = child->thread.debugreg6; - break; - case offsetof(struct user, u_debugreg[7]): - tmp = child->thread.debugreg7; - break; - default: - tmp = 0; - break; - } - ret = put_user(tmp,(unsigned long __user *) data); - break; - } - - /* when I and D space are separate, this will have to be fixed. */ - case PTRACE_POKETEXT: /* write the word at location addr. */ - case PTRACE_POKEDATA: - ret = generic_ptrace_pokedata(child, addr, data); - break; - - case PTRACE_POKEUSR: /* write the word at location addr in the USER area */ - { - int dsize = test_tsk_thread_flag(child, TIF_IA32) ? 3 : 7; - ret = -EIO; - if ((addr & 7) || - addr > sizeof(struct user) - 7) - break; - - switch (addr) { - case 0 ... sizeof(struct user_regs_struct) - sizeof(long): - ret = putreg(child, addr, data); - break; - /* Disallows to set a breakpoint into the vsyscall */ - case offsetof(struct user, u_debugreg[0]): - if (data >= TASK_SIZE_OF(child) - dsize) break; - child->thread.debugreg0 = data; - ret = 0; - break; - case offsetof(struct user, u_debugreg[1]): - if (data >= TASK_SIZE_OF(child) - dsize) break; - child->thread.debugreg1 = data; - ret = 0; - break; - case offsetof(struct user, u_debugreg[2]): - if (data >= TASK_SIZE_OF(child) - dsize) break; - child->thread.debugreg2 = data; - ret = 0; - break; - case offsetof(struct user, u_debugreg[3]): - if (data >= TASK_SIZE_OF(child) - dsize) break; - child->thread.debugreg3 = data; - ret = 0; - break; - case offsetof(struct user, u_debugreg[6]): - if (data >> 32) - break; - child->thread.debugreg6 = data; - ret = 0; - break; - case offsetof(struct user, u_debugreg[7]): - /* See arch/i386/kernel/ptrace.c for an explanation of - * this awkward check.*/ - data &= ~DR_CONTROL_RESERVED; - for(i=0; i<4; i++) - if ((0x5554 >> ((data >> (16 + 4*i)) & 0xf)) & 1) - break; - if (i == 4) { - child->thread.debugreg7 = data; - if (data) - set_tsk_thread_flag(child, TIF_DEBUG); - else - clear_tsk_thread_flag(child, TIF_DEBUG); - ret = 0; - } - break; - } - break; - } - case PTRACE_SYSCALL: /* continue and stop at next (return from) syscall */ - case PTRACE_CONT: /* restart after signal. */ - - ret = -EIO; - if (!valid_signal(data)) - break; - if (request == PTRACE_SYSCALL) - set_tsk_thread_flag(child,TIF_SYSCALL_TRACE); - else - clear_tsk_thread_flag(child,TIF_SYSCALL_TRACE); - clear_tsk_thread_flag(child, TIF_SINGLESTEP); - child->exit_code = data; - /* make sure the single step bit is not set. */ - clear_singlestep(child); - wake_up_process(child); - ret = 0; - break; - -#ifdef CONFIG_IA32_EMULATION - /* This makes only sense with 32bit programs. Allow a - 64bit debugger to fully examine them too. Better - don't use it against 64bit processes, use - PTRACE_ARCH_PRCTL instead. */ - case PTRACE_SET_THREAD_AREA: { - struct user_desc __user *p; - int old; - p = (struct user_desc __user *)data; - get_user(old, &p->entry_number); - put_user(addr, &p->entry_number); - ret = do_set_thread_area(&child->thread, p); - put_user(old, &p->entry_number); - break; - case PTRACE_GET_THREAD_AREA: - p = (struct user_desc __user *)data; - get_user(old, &p->entry_number); - put_user(addr, &p->entry_number); - ret = do_get_thread_area(&child->thread, p); - put_user(old, &p->entry_number); - break; - } -#endif - /* normal 64bit interface to access TLS data. - Works just like arch_prctl, except that the arguments - are reversed. */ - case PTRACE_ARCH_PRCTL: - ret = do_arch_prctl(child, data, addr); - break; - -/* - * make the child exit. Best I can do is send it a sigkill. - * perhaps it should be put in the status that it wants to - * exit. - */ - case PTRACE_KILL: - ret = 0; - if (child->exit_state == EXIT_ZOMBIE) /* already dead */ - break; - clear_tsk_thread_flag(child, TIF_SINGLESTEP); - child->exit_code = SIGKILL; - /* make sure the single step bit is not set. */ - clear_singlestep(child); - wake_up_process(child); - break; - - case PTRACE_SINGLESTEP: /* set the trap flag. */ - ret = -EIO; - if (!valid_signal(data)) - break; - clear_tsk_thread_flag(child,TIF_SYSCALL_TRACE); - set_singlestep(child); - child->exit_code = data; - /* give it a chance to run. */ - wake_up_process(child); - ret = 0; - break; - - case PTRACE_DETACH: - /* detach a process that was attached. */ - ret = ptrace_detach(child, data); - break; - - case PTRACE_GETREGS: { /* Get all gp regs from the child. */ - if (!access_ok(VERIFY_WRITE, (unsigned __user *)data, - sizeof(struct user_regs_struct))) { - ret = -EIO; - break; - } - ret = 0; - for (ui = 0; ui < sizeof(struct user_regs_struct); ui += sizeof(long)) { - ret |= __put_user(getreg(child, ui),(unsigned long __user *) data); - data += sizeof(long); - } - break; - } - - case PTRACE_SETREGS: { /* Set all gp regs in the child. */ - unsigned long tmp; - if (!access_ok(VERIFY_READ, (unsigned __user *)data, - sizeof(struct user_regs_struct))) { - ret = -EIO; - break; - } - ret = 0; - for (ui = 0; ui < sizeof(struct user_regs_struct); ui += sizeof(long)) { - ret = __get_user(tmp, (unsigned long __user *) data); - if (ret) - break; - ret = putreg(child, ui, tmp); - if (ret) - break; - data += sizeof(long); - } - break; - } - - case PTRACE_GETFPREGS: { /* Get the child extended FPU state. */ - if (!access_ok(VERIFY_WRITE, (unsigned __user *)data, - sizeof(struct user_i387_struct))) { - ret = -EIO; - break; - } - ret = get_fpregs((struct user_i387_struct __user *)data, child); - break; - } - - case PTRACE_SETFPREGS: { /* Set the child extended FPU state. */ - if (!access_ok(VERIFY_READ, (unsigned __user *)data, - sizeof(struct user_i387_struct))) { - ret = -EIO; - break; - } - set_stopped_child_used_math(child); - ret = set_fpregs(child, (struct user_i387_struct __user *)data); - break; - } - - default: - ret = ptrace_request(child, request, addr, data); - break; - } - return ret; -} - -static void syscall_trace(struct pt_regs *regs) -{ - -#if 0 - printk("trace %s rip %lx rsp %lx rax %d origrax %d caller %lx tiflags %x ptrace %x\n", - current->comm, - regs->rip, regs->rsp, regs->rax, regs->orig_rax, __builtin_return_address(0), - current_thread_info()->flags, current->ptrace); -#endif - - ptrace_notify(SIGTRAP | ((current->ptrace & PT_TRACESYSGOOD) - ? 0x80 : 0)); - /* - * this isn't the same as continuing with a signal, but it will do - * for normal use. strace only continues with a signal if the - * stopping signal is not SIGTRAP. -brl - */ - if (current->exit_code) { - send_sig(current->exit_code, current, 1); - current->exit_code = 0; - } -} - -asmlinkage void syscall_trace_enter(struct pt_regs *regs) -{ - /* do the secure computing check first */ - secure_computing(regs->orig_rax); - - if (test_thread_flag(TIF_SYSCALL_TRACE) - && (current->ptrace & PT_PTRACED)) - syscall_trace(regs); - - if (unlikely(current->audit_context)) { - if (test_thread_flag(TIF_IA32)) { - audit_syscall_entry(AUDIT_ARCH_I386, - regs->orig_rax, - regs->rbx, regs->rcx, - regs->rdx, regs->rsi); - } else { - audit_syscall_entry(AUDIT_ARCH_X86_64, - regs->orig_rax, - regs->rdi, regs->rsi, - regs->rdx, regs->r10); - } - } -} - -asmlinkage void syscall_trace_leave(struct pt_regs *regs) -{ - if (unlikely(current->audit_context)) - audit_syscall_exit(AUDITSC_RESULT(regs->rax), regs->rax); - - if ((test_thread_flag(TIF_SYSCALL_TRACE) - || test_thread_flag(TIF_SINGLESTEP)) - && (current->ptrace & PT_PTRACED)) - syscall_trace(regs); -} diff --git a/arch/x86_64/kernel/reboot.c b/arch/x86_64/kernel/reboot.c deleted file mode 100644 index 368db2b9c5ac..000000000000 --- a/arch/x86_64/kernel/reboot.c +++ /dev/null @@ -1,171 +0,0 @@ -/* Various gunk just to reboot the machine. */ -#include <linux/module.h> -#include <linux/reboot.h> -#include <linux/init.h> -#include <linux/smp.h> -#include <linux/kernel.h> -#include <linux/ctype.h> -#include <linux/string.h> -#include <linux/pm.h> -#include <linux/kdebug.h> -#include <linux/sched.h> -#include <asm/io.h> -#include <asm/delay.h> -#include <asm/hw_irq.h> -#include <asm/system.h> -#include <asm/pgtable.h> -#include <asm/tlbflush.h> -#include <asm/apic.h> -#include <asm/iommu.h> - -/* - * Power off function, if any - */ -void (*pm_power_off)(void); -EXPORT_SYMBOL(pm_power_off); - -static long no_idt[3]; -static enum { - BOOT_TRIPLE = 't', - BOOT_KBD = 'k' -} reboot_type = BOOT_KBD; -static int reboot_mode = 0; -int reboot_force; - -/* reboot=t[riple] | k[bd] [, [w]arm | [c]old] - warm Don't set the cold reboot flag - cold Set the cold reboot flag - triple Force a triple fault (init) - kbd Use the keyboard controller. cold reset (default) - force Avoid anything that could hang. - */ -static int __init reboot_setup(char *str) -{ - for (;;) { - switch (*str) { - case 'w': - reboot_mode = 0x1234; - break; - - case 'c': - reboot_mode = 0; - break; - - case 't': - case 'b': - case 'k': - reboot_type = *str; - break; - case 'f': - reboot_force = 1; - break; - } - if((str = strchr(str,',')) != NULL) - str++; - else - break; - } - return 1; -} - -__setup("reboot=", reboot_setup); - -static inline void kb_wait(void) -{ - int i; - - for (i=0; i<0x10000; i++) - if ((inb_p(0x64) & 0x02) == 0) - break; -} - -void machine_shutdown(void) -{ - unsigned long flags; - - /* Stop the cpus and apics */ -#ifdef CONFIG_SMP - int reboot_cpu_id; - - /* The boot cpu is always logical cpu 0 */ - reboot_cpu_id = 0; - - /* Make certain the cpu I'm about to reboot on is online */ - if (!cpu_isset(reboot_cpu_id, cpu_online_map)) { - reboot_cpu_id = smp_processor_id(); - } - - /* Make certain I only run on the appropriate processor */ - set_cpus_allowed(current, cpumask_of_cpu(reboot_cpu_id)); - - /* O.K Now that I'm on the appropriate processor, - * stop all of the others. - */ - smp_send_stop(); -#endif - - local_irq_save(flags); - -#ifndef CONFIG_SMP - disable_local_APIC(); -#endif - - disable_IO_APIC(); - - local_irq_restore(flags); - - pci_iommu_shutdown(); -} - -void machine_emergency_restart(void) -{ - int i; - - /* Tell the BIOS if we want cold or warm reboot */ - *((unsigned short *)__va(0x472)) = reboot_mode; - - for (;;) { - /* Could also try the reset bit in the Hammer NB */ - switch (reboot_type) { - case BOOT_KBD: - for (i=0; i<10; i++) { - kb_wait(); - udelay(50); - outb(0xfe,0x64); /* pulse reset low */ - udelay(50); - } - - case BOOT_TRIPLE: - __asm__ __volatile__("lidt (%0)": :"r" (&no_idt)); - __asm__ __volatile__("int3"); - - reboot_type = BOOT_KBD; - break; - } - } -} - -void machine_restart(char * __unused) -{ - printk("machine restart\n"); - - if (!reboot_force) { - machine_shutdown(); - } - machine_emergency_restart(); -} - -void machine_halt(void) -{ -} - -void machine_power_off(void) -{ - if (pm_power_off) { - if (!reboot_force) { - machine_shutdown(); - } - pm_power_off(); - } -} - diff --git a/arch/x86_64/kernel/relocate_kernel.S b/arch/x86_64/kernel/relocate_kernel.S deleted file mode 100644 index 14e95872c6a3..000000000000 --- a/arch/x86_64/kernel/relocate_kernel.S +++ /dev/null @@ -1,276 +0,0 @@ -/* - * relocate_kernel.S - put the kernel image in place to boot - * Copyright (C) 2002-2005 Eric Biederman <ebiederm@xmission.com> - * - * This source code is licensed under the GNU General Public License, - * Version 2. See the file COPYING for more details. - */ - -#include <linux/linkage.h> -#include <asm/page.h> -#include <asm/kexec.h> - -/* - * Must be relocatable PIC code callable as a C function - */ - -#define PTR(x) (x << 3) -#define PAGE_ALIGNED (1 << PAGE_SHIFT) -#define PAGE_ATTR 0x63 /* _PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_DIRTY */ - - .text - .align PAGE_ALIGNED - .code64 - .globl relocate_kernel -relocate_kernel: - /* %rdi indirection_page - * %rsi page_list - * %rdx start address - */ - - /* map the control page at its virtual address */ - - movq $0x0000ff8000000000, %r10 /* mask */ - mov $(39 - 3), %cl /* bits to shift */ - movq PTR(VA_CONTROL_PAGE)(%rsi), %r11 /* address to map */ - - movq %r11, %r9 - andq %r10, %r9 - shrq %cl, %r9 - - movq PTR(VA_PGD)(%rsi), %r8 - addq %r8, %r9 - movq PTR(PA_PUD_0)(%rsi), %r8 - orq $PAGE_ATTR, %r8 - movq %r8, (%r9) - - shrq $9, %r10 - sub $9, %cl - - movq %r11, %r9 - andq %r10, %r9 - shrq %cl, %r9 - - movq PTR(VA_PUD_0)(%rsi), %r8 - addq %r8, %r9 - movq PTR(PA_PMD_0)(%rsi), %r8 - orq $PAGE_ATTR, %r8 - movq %r8, (%r9) - - shrq $9, %r10 - sub $9, %cl - - movq %r11, %r9 - andq %r10, %r9 - shrq %cl, %r9 - - movq PTR(VA_PMD_0)(%rsi), %r8 - addq %r8, %r9 - movq PTR(PA_PTE_0)(%rsi), %r8 - orq $PAGE_ATTR, %r8 - movq %r8, (%r9) - - shrq $9, %r10 - sub $9, %cl - - movq %r11, %r9 - andq %r10, %r9 - shrq %cl, %r9 - - movq PTR(VA_PTE_0)(%rsi), %r8 - addq %r8, %r9 - movq PTR(PA_CONTROL_PAGE)(%rsi), %r8 - orq $PAGE_ATTR, %r8 - movq %r8, (%r9) - - /* identity map the control page at its physical address */ - - movq $0x0000ff8000000000, %r10 /* mask */ - mov $(39 - 3), %cl /* bits to shift */ - movq PTR(PA_CONTROL_PAGE)(%rsi), %r11 /* address to map */ - - movq %r11, %r9 - andq %r10, %r9 - shrq %cl, %r9 - - movq PTR(VA_PGD)(%rsi), %r8 - addq %r8, %r9 - movq PTR(PA_PUD_1)(%rsi), %r8 - orq $PAGE_ATTR, %r8 - movq %r8, (%r9) - - shrq $9, %r10 - sub $9, %cl - - movq %r11, %r9 - andq %r10, %r9 - shrq %cl, %r9 - - movq PTR(VA_PUD_1)(%rsi), %r8 - addq %r8, %r9 - movq PTR(PA_PMD_1)(%rsi), %r8 - orq $PAGE_ATTR, %r8 - movq %r8, (%r9) - - shrq $9, %r10 - sub $9, %cl - - movq %r11, %r9 - andq %r10, %r9 - shrq %cl, %r9 - - movq PTR(VA_PMD_1)(%rsi), %r8 - addq %r8, %r9 - movq PTR(PA_PTE_1)(%rsi), %r8 - orq $PAGE_ATTR, %r8 - movq %r8, (%r9) - - shrq $9, %r10 - sub $9, %cl - - movq %r11, %r9 - andq %r10, %r9 - shrq %cl, %r9 - - movq PTR(VA_PTE_1)(%rsi), %r8 - addq %r8, %r9 - movq PTR(PA_CONTROL_PAGE)(%rsi), %r8 - orq $PAGE_ATTR, %r8 - movq %r8, (%r9) - -relocate_new_kernel: - /* %rdi indirection_page - * %rsi page_list - * %rdx start address - */ - - /* zero out flags, and disable interrupts */ - pushq $0 - popfq - - /* get physical address of control page now */ - /* this is impossible after page table switch */ - movq PTR(PA_CONTROL_PAGE)(%rsi), %r8 - - /* get physical address of page table now too */ - movq PTR(PA_TABLE_PAGE)(%rsi), %rcx - - /* switch to new set of page tables */ - movq PTR(PA_PGD)(%rsi), %r9 - movq %r9, %cr3 - - /* setup a new stack at the end of the physical control page */ - lea 4096(%r8), %rsp - - /* jump to identity mapped page */ - addq $(identity_mapped - relocate_kernel), %r8 - pushq %r8 - ret - -identity_mapped: - /* store the start address on the stack */ - pushq %rdx - - /* Set cr0 to a known state: - * 31 1 == Paging enabled - * 18 0 == Alignment check disabled - * 16 0 == Write protect disabled - * 3 0 == No task switch - * 2 0 == Don't do FP software emulation. - * 0 1 == Proctected mode enabled - */ - movq %cr0, %rax - andq $~((1<<18)|(1<<16)|(1<<3)|(1<<2)), %rax - orl $((1<<31)|(1<<0)), %eax - movq %rax, %cr0 - - /* Set cr4 to a known state: - * 10 0 == xmm exceptions disabled - * 9 0 == xmm registers instructions disabled - * 8 0 == performance monitoring counter disabled - * 7 0 == page global disabled - * 6 0 == machine check exceptions disabled - * 5 1 == physical address extension enabled - * 4 0 == page size extensions disabled - * 3 0 == Debug extensions disabled - * 2 0 == Time stamp disable (disabled) - * 1 0 == Protected mode virtual interrupts disabled - * 0 0 == VME disabled - */ - - movq $((1<<5)), %rax - movq %rax, %cr4 - - jmp 1f -1: - - /* Switch to the identity mapped page tables, - * and flush the TLB. - */ - movq %rcx, %cr3 - - /* Do the copies */ - movq %rdi, %rcx /* Put the page_list in %rcx */ - xorq %rdi, %rdi - xorq %rsi, %rsi - jmp 1f - -0: /* top, read another word for the indirection page */ - - movq (%rbx), %rcx - addq $8, %rbx -1: - testq $0x1, %rcx /* is it a destination page? */ - jz 2f - movq %rcx, %rdi - andq $0xfffffffffffff000, %rdi - jmp 0b -2: - testq $0x2, %rcx /* is it an indirection page? */ - jz 2f - movq %rcx, %rbx - andq $0xfffffffffffff000, %rbx - jmp 0b -2: - testq $0x4, %rcx /* is it the done indicator? */ - jz 2f - jmp 3f -2: - testq $0x8, %rcx /* is it the source indicator? */ - jz 0b /* Ignore it otherwise */ - movq %rcx, %rsi /* For ever source page do a copy */ - andq $0xfffffffffffff000, %rsi - - movq $512, %rcx - rep ; movsq - jmp 0b -3: - - /* To be certain of avoiding problems with self-modifying code - * I need to execute a serializing instruction here. - * So I flush the TLB by reloading %cr3 here, it's handy, - * and not processor dependent. - */ - movq %cr3, %rax - movq %rax, %cr3 - - /* set all of the registers to known values */ - /* leave %rsp alone */ - - xorq %rax, %rax - xorq %rbx, %rbx - xorq %rcx, %rcx - xorq %rdx, %rdx - xorq %rsi, %rsi - xorq %rdi, %rdi - xorq %rbp, %rbp - xorq %r8, %r8 - xorq %r9, %r9 - xorq %r10, %r9 - xorq %r11, %r11 - xorq %r12, %r12 - xorq %r13, %r13 - xorq %r14, %r14 - xorq %r15, %r15 - - ret diff --git a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c deleted file mode 100644 index af838f6b0b7f..000000000000 --- a/arch/x86_64/kernel/setup.c +++ /dev/null @@ -1,1117 +0,0 @@ -/* - * linux/arch/x86-64/kernel/setup.c - * - * Copyright (C) 1995 Linus Torvalds - * - * Nov 2001 Dave Jones <davej@suse.de> - * Forked from i386 setup code. - */ - -/* - * This file handles the architecture-dependent parts of initialization - */ - -#include <linux/errno.h> -#include <linux/sched.h> -#include <linux/kernel.h> -#include <linux/mm.h> -#include <linux/stddef.h> -#include <linux/unistd.h> -#include <linux/ptrace.h> -#include <linux/slab.h> -#include <linux/user.h> -#include <linux/a.out.h> -#include <linux/screen_info.h> -#include <linux/ioport.h> -#include <linux/delay.h> -#include <linux/init.h> -#include <linux/initrd.h> -#include <linux/highmem.h> -#include <linux/bootmem.h> -#include <linux/module.h> -#include <asm/processor.h> -#include <linux/console.h> -#include <linux/seq_file.h> -#include <linux/crash_dump.h> -#include <linux/root_dev.h> -#include <linux/pci.h> -#include <linux/acpi.h> -#include <linux/kallsyms.h> -#include <linux/edd.h> -#include <linux/mmzone.h> -#include <linux/kexec.h> -#include <linux/cpufreq.h> -#include <linux/dmi.h> -#include <linux/dma-mapping.h> -#include <linux/ctype.h> - -#include <asm/mtrr.h> -#include <asm/uaccess.h> -#include <asm/system.h> -#include <asm/io.h> -#include <asm/smp.h> -#include <asm/msr.h> -#include <asm/desc.h> -#include <video/edid.h> -#include <asm/e820.h> -#include <asm/dma.h> -#include <asm/mpspec.h> -#include <asm/mmu_context.h> -#include <asm/bootsetup.h> -#include <asm/proto.h> -#include <asm/setup.h> -#include <asm/mach_apic.h> -#include <asm/numa.h> -#include <asm/sections.h> -#include <asm/dmi.h> - -/* - * Machine setup.. - */ - -struct cpuinfo_x86 boot_cpu_data __read_mostly; -EXPORT_SYMBOL(boot_cpu_data); - -unsigned long mmu_cr4_features; - -/* Boot loader ID as an integer, for the benefit of proc_dointvec */ -int bootloader_type; - -unsigned long saved_video_mode; - -int force_mwait __cpuinitdata; - -/* - * Early DMI memory - */ -int dmi_alloc_index; -char dmi_alloc_data[DMI_MAX_DATA]; - -/* - * Setup options - */ -struct screen_info screen_info; -EXPORT_SYMBOL(screen_info); -struct sys_desc_table_struct { - unsigned short length; - unsigned char table[0]; -}; - -struct edid_info edid_info; -EXPORT_SYMBOL_GPL(edid_info); - -extern int root_mountflags; - -char __initdata command_line[COMMAND_LINE_SIZE]; - -struct resource standard_io_resources[] = { - { .name = "dma1", .start = 0x00, .end = 0x1f, - .flags = IORESOURCE_BUSY | IORESOURCE_IO }, - { .name = "pic1", .start = 0x20, .end = 0x21, - .flags = IORESOURCE_BUSY | IORESOURCE_IO }, - { .name = "timer0", .start = 0x40, .end = 0x43, - .flags = IORESOURCE_BUSY | IORESOURCE_IO }, - { .name = "timer1", .start = 0x50, .end = 0x53, - .flags = IORESOURCE_BUSY | IORESOURCE_IO }, - { .name = "keyboard", .start = 0x60, .end = 0x6f, - .flags = IORESOURCE_BUSY | IORESOURCE_IO }, - { .name = "dma page reg", .start = 0x80, .end = 0x8f, - .flags = IORESOURCE_BUSY | IORESOURCE_IO }, - { .name = "pic2", .start = 0xa0, .end = 0xa1, - .flags = IORESOURCE_BUSY | IORESOURCE_IO }, - { .name = "dma2", .start = 0xc0, .end = 0xdf, - .flags = IORESOURCE_BUSY | IORESOURCE_IO }, - { .name = "fpu", .start = 0xf0, .end = 0xff, - .flags = IORESOURCE_BUSY | IORESOURCE_IO } -}; - -#define IORESOURCE_RAM (IORESOURCE_BUSY | IORESOURCE_MEM) - -struct resource data_resource = { - .name = "Kernel data", - .start = 0, - .end = 0, - .flags = IORESOURCE_RAM, -}; -struct resource code_resource = { - .name = "Kernel code", - .start = 0, - .end = 0, - .flags = IORESOURCE_RAM, -}; - -#ifdef CONFIG_PROC_VMCORE -/* elfcorehdr= specifies the location of elf core header - * stored by the crashed kernel. This option will be passed - * by kexec loader to the capture kernel. - */ -static int __init setup_elfcorehdr(char *arg) -{ - char *end; - if (!arg) - return -EINVAL; - elfcorehdr_addr = memparse(arg, &end); - return end > arg ? 0 : -EINVAL; -} -early_param("elfcorehdr", setup_elfcorehdr); -#endif - -#ifndef CONFIG_NUMA -static void __init -contig_initmem_init(unsigned long start_pfn, unsigned long end_pfn) -{ - unsigned long bootmap_size, bootmap; - - bootmap_size = bootmem_bootmap_pages(end_pfn)<<PAGE_SHIFT; - bootmap = find_e820_area(0, end_pfn<<PAGE_SHIFT, bootmap_size); - if (bootmap == -1L) - panic("Cannot find bootmem map of size %ld\n",bootmap_size); - bootmap_size = init_bootmem(bootmap >> PAGE_SHIFT, end_pfn); - e820_register_active_regions(0, start_pfn, end_pfn); - free_bootmem_with_active_regions(0, end_pfn); - reserve_bootmem(bootmap, bootmap_size); -} -#endif - -#if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE) -struct edd edd; -#ifdef CONFIG_EDD_MODULE -EXPORT_SYMBOL(edd); -#endif -/** - * copy_edd() - Copy the BIOS EDD information - * from boot_params into a safe place. - * - */ -static inline void copy_edd(void) -{ - memcpy(edd.mbr_signature, EDD_MBR_SIGNATURE, sizeof(edd.mbr_signature)); - memcpy(edd.edd_info, EDD_BUF, sizeof(edd.edd_info)); - edd.mbr_signature_nr = EDD_MBR_SIG_NR; - edd.edd_info_nr = EDD_NR; -} -#else -static inline void copy_edd(void) -{ -} -#endif - -#define EBDA_ADDR_POINTER 0x40E - -unsigned __initdata ebda_addr; -unsigned __initdata ebda_size; - -static void discover_ebda(void) -{ - /* - * there is a real-mode segmented pointer pointing to the - * 4K EBDA area at 0x40E - */ - ebda_addr = *(unsigned short *)__va(EBDA_ADDR_POINTER); - ebda_addr <<= 4; - - ebda_size = *(unsigned short *)__va(ebda_addr); - - /* Round EBDA up to pages */ - if (ebda_size == 0) - ebda_size = 1; - ebda_size <<= 10; - ebda_size = round_up(ebda_size + (ebda_addr & ~PAGE_MASK), PAGE_SIZE); - if (ebda_size > 64*1024) - ebda_size = 64*1024; -} - -void __init setup_arch(char **cmdline_p) -{ - printk(KERN_INFO "Command line: %s\n", boot_command_line); - - ROOT_DEV = old_decode_dev(ORIG_ROOT_DEV); - screen_info = SCREEN_INFO; - edid_info = EDID_INFO; - saved_video_mode = SAVED_VIDEO_MODE; - bootloader_type = LOADER_TYPE; - -#ifdef CONFIG_BLK_DEV_RAM - rd_image_start = RAMDISK_FLAGS & RAMDISK_IMAGE_START_MASK; - rd_prompt = ((RAMDISK_FLAGS & RAMDISK_PROMPT_FLAG) != 0); - rd_doload = ((RAMDISK_FLAGS & RAMDISK_LOAD_FLAG) != 0); -#endif - setup_memory_region(); - copy_edd(); - - if (!MOUNT_ROOT_RDONLY) - root_mountflags &= ~MS_RDONLY; - init_mm.start_code = (unsigned long) &_text; - init_mm.end_code = (unsigned long) &_etext; - init_mm.end_data = (unsigned long) &_edata; - init_mm.brk = (unsigned long) &_end; - - code_resource.start = virt_to_phys(&_text); - code_resource.end = virt_to_phys(&_etext)-1; - data_resource.start = virt_to_phys(&_etext); - data_resource.end = virt_to_phys(&_edata)-1; - - early_identify_cpu(&boot_cpu_data); - - strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE); - *cmdline_p = command_line; - - parse_early_param(); - - finish_e820_parsing(); - - e820_register_active_regions(0, 0, -1UL); - /* - * partially used pages are not usable - thus - * we are rounding upwards: - */ - end_pfn = e820_end_of_ram(); - num_physpages = end_pfn; - - check_efer(); - - discover_ebda(); - - init_memory_mapping(0, (end_pfn_map << PAGE_SHIFT)); - - dmi_scan_machine(); - -#ifdef CONFIG_ACPI - /* - * Initialize the ACPI boot-time table parser (gets the RSDP and SDT). - * Call this early for SRAT node setup. - */ - acpi_boot_table_init(); -#endif - - /* How many end-of-memory variables you have, grandma! */ - max_low_pfn = end_pfn; - max_pfn = end_pfn; - high_memory = (void *)__va(end_pfn * PAGE_SIZE - 1) + 1; - - /* Remove active ranges so rediscovery with NUMA-awareness happens */ - remove_all_active_ranges(); - -#ifdef CONFIG_ACPI_NUMA - /* - * Parse SRAT to discover nodes. - */ - acpi_numa_init(); -#endif - -#ifdef CONFIG_NUMA - numa_initmem_init(0, end_pfn); -#else - contig_initmem_init(0, end_pfn); -#endif - - /* Reserve direct mapping */ - reserve_bootmem_generic(table_start << PAGE_SHIFT, - (table_end - table_start) << PAGE_SHIFT); - - /* reserve kernel */ - reserve_bootmem_generic(__pa_symbol(&_text), - __pa_symbol(&_end) - __pa_symbol(&_text)); - - /* - * reserve physical page 0 - it's a special BIOS page on many boxes, - * enabling clean reboots, SMP operation, laptop functions. - */ - reserve_bootmem_generic(0, PAGE_SIZE); - - /* reserve ebda region */ - if (ebda_addr) - reserve_bootmem_generic(ebda_addr, ebda_size); -#ifdef CONFIG_NUMA - /* reserve nodemap region */ - if (nodemap_addr) - reserve_bootmem_generic(nodemap_addr, nodemap_size); -#endif - -#ifdef CONFIG_SMP - /* Reserve SMP trampoline */ - reserve_bootmem_generic(SMP_TRAMPOLINE_BASE, 2*PAGE_SIZE); -#endif - -#ifdef CONFIG_ACPI_SLEEP - /* - * Reserve low memory region for sleep support. - */ - acpi_reserve_bootmem(); -#endif - /* - * Find and reserve possible boot-time SMP configuration: - */ - find_smp_config(); -#ifdef CONFIG_BLK_DEV_INITRD - if (LOADER_TYPE && INITRD_START) { - if (INITRD_START + INITRD_SIZE <= (end_pfn << PAGE_SHIFT)) { - reserve_bootmem_generic(INITRD_START, INITRD_SIZE); - initrd_start = INITRD_START + PAGE_OFFSET; - initrd_end = initrd_start+INITRD_SIZE; - } - else { - printk(KERN_ERR "initrd extends beyond end of memory " - "(0x%08lx > 0x%08lx)\ndisabling initrd\n", - (unsigned long)(INITRD_START + INITRD_SIZE), - (unsigned long)(end_pfn << PAGE_SHIFT)); - initrd_start = 0; - } - } -#endif -#ifdef CONFIG_KEXEC - if (crashk_res.start != crashk_res.end) { - reserve_bootmem_generic(crashk_res.start, - crashk_res.end - crashk_res.start + 1); - } -#endif - - paging_init(); - -#ifdef CONFIG_PCI - early_quirks(); -#endif - - /* - * set this early, so we dont allocate cpu0 - * if MADT list doesnt list BSP first - * mpparse.c/MP_processor_info() allocates logical cpu numbers. - */ - cpu_set(0, cpu_present_map); -#ifdef CONFIG_ACPI - /* - * Read APIC and some other early information from ACPI tables. - */ - acpi_boot_init(); -#endif - - init_cpu_to_node(); - - /* - * get boot-time SMP configuration: - */ - if (smp_found_config) - get_smp_config(); - init_apic_mappings(); - - /* - * We trust e820 completely. No explicit ROM probing in memory. - */ - e820_reserve_resources(); - e820_mark_nosave_regions(); - - { - unsigned i; - /* request I/O space for devices used on all i[345]86 PCs */ - for (i = 0; i < ARRAY_SIZE(standard_io_resources); i++) - request_resource(&ioport_resource, &standard_io_resources[i]); - } - - e820_setup_gap(); - -#ifdef CONFIG_VT -#if defined(CONFIG_VGA_CONSOLE) - conswitchp = &vga_con; -#elif defined(CONFIG_DUMMY_CONSOLE) - conswitchp = &dummy_con; -#endif -#endif -} - -static int __cpuinit get_model_name(struct cpuinfo_x86 *c) -{ - unsigned int *v; - - if (c->extended_cpuid_level < 0x80000004) - return 0; - - v = (unsigned int *) c->x86_model_id; - cpuid(0x80000002, &v[0], &v[1], &v[2], &v[3]); - cpuid(0x80000003, &v[4], &v[5], &v[6], &v[7]); - cpuid(0x80000004, &v[8], &v[9], &v[10], &v[11]); - c->x86_model_id[48] = 0; - return 1; -} - - -static void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c) -{ - unsigned int n, dummy, eax, ebx, ecx, edx; - - n = c->extended_cpuid_level; - - if (n >= 0x80000005) { - cpuid(0x80000005, &dummy, &ebx, &ecx, &edx); - printk(KERN_INFO "CPU: L1 I Cache: %dK (%d bytes/line), D cache %dK (%d bytes/line)\n", - edx>>24, edx&0xFF, ecx>>24, ecx&0xFF); - c->x86_cache_size=(ecx>>24)+(edx>>24); - /* On K8 L1 TLB is inclusive, so don't count it */ - c->x86_tlbsize = 0; - } - - if (n >= 0x80000006) { - cpuid(0x80000006, &dummy, &ebx, &ecx, &edx); - ecx = cpuid_ecx(0x80000006); - c->x86_cache_size = ecx >> 16; - c->x86_tlbsize += ((ebx >> 16) & 0xfff) + (ebx & 0xfff); - - printk(KERN_INFO "CPU: L2 Cache: %dK (%d bytes/line)\n", - c->x86_cache_size, ecx & 0xFF); - } - - if (n >= 0x80000007) - cpuid(0x80000007, &dummy, &dummy, &dummy, &c->x86_power); - if (n >= 0x80000008) { - cpuid(0x80000008, &eax, &dummy, &dummy, &dummy); - c->x86_virt_bits = (eax >> 8) & 0xff; - c->x86_phys_bits = eax & 0xff; - } -} - -#ifdef CONFIG_NUMA -static int nearby_node(int apicid) -{ - int i; - for (i = apicid - 1; i >= 0; i--) { - int node = apicid_to_node[i]; - if (node != NUMA_NO_NODE && node_online(node)) - return node; - } - for (i = apicid + 1; i < MAX_LOCAL_APIC; i++) { - int node = apicid_to_node[i]; - if (node != NUMA_NO_NODE && node_online(node)) - return node; - } - return first_node(node_online_map); /* Shouldn't happen */ -} -#endif - -/* - * On a AMD dual core setup the lower bits of the APIC id distingush the cores. - * Assumes number of cores is a power of two. - */ -static void __init amd_detect_cmp(struct cpuinfo_x86 *c) -{ -#ifdef CONFIG_SMP - unsigned bits; -#ifdef CONFIG_NUMA - int cpu = smp_processor_id(); - int node = 0; - unsigned apicid = hard_smp_processor_id(); -#endif - unsigned ecx = cpuid_ecx(0x80000008); - - c->x86_max_cores = (ecx & 0xff) + 1; - - /* CPU telling us the core id bits shift? */ - bits = (ecx >> 12) & 0xF; - - /* Otherwise recompute */ - if (bits == 0) { - while ((1 << bits) < c->x86_max_cores) - bits++; - } - - /* Low order bits define the core id (index of core in socket) */ - c->cpu_core_id = c->phys_proc_id & ((1 << bits)-1); - /* Convert the APIC ID into the socket ID */ - c->phys_proc_id = phys_pkg_id(bits); - -#ifdef CONFIG_NUMA - node = c->phys_proc_id; - if (apicid_to_node[apicid] != NUMA_NO_NODE) - node = apicid_to_node[apicid]; - if (!node_online(node)) { - /* Two possibilities here: - - The CPU is missing memory and no node was created. - In that case try picking one from a nearby CPU - - The APIC IDs differ from the HyperTransport node IDs - which the K8 northbridge parsing fills in. - Assume they are all increased by a constant offset, - but in the same order as the HT nodeids. - If that doesn't result in a usable node fall back to the - path for the previous case. */ - int ht_nodeid = apicid - (cpu_data[0].phys_proc_id << bits); - if (ht_nodeid >= 0 && - apicid_to_node[ht_nodeid] != NUMA_NO_NODE) - node = apicid_to_node[ht_nodeid]; - /* Pick a nearby node */ - if (!node_online(node)) - node = nearby_node(apicid); - } - numa_set_node(cpu, node); - - printk(KERN_INFO "CPU %d/%x -> Node %d\n", cpu, apicid, node); -#endif -#endif -} - -static void __cpuinit init_amd(struct cpuinfo_x86 *c) -{ - unsigned level; - -#ifdef CONFIG_SMP - unsigned long value; - - /* - * Disable TLB flush filter by setting HWCR.FFDIS on K8 - * bit 6 of msr C001_0015 - * - * Errata 63 for SH-B3 steppings - * Errata 122 for all steppings (F+ have it disabled by default) - */ - if (c->x86 == 15) { - rdmsrl(MSR_K8_HWCR, value); - value |= 1 << 6; - wrmsrl(MSR_K8_HWCR, value); - } -#endif - - /* Bit 31 in normal CPUID used for nonstandard 3DNow ID; - 3DNow is IDd by bit 31 in extended CPUID (1*32+31) anyway */ - clear_bit(0*32+31, &c->x86_capability); - - /* On C+ stepping K8 rep microcode works well for copy/memset */ - level = cpuid_eax(1); - if (c->x86 == 15 && ((level >= 0x0f48 && level < 0x0f50) || level >= 0x0f58)) - set_bit(X86_FEATURE_REP_GOOD, &c->x86_capability); - if (c->x86 == 0x10) - set_bit(X86_FEATURE_REP_GOOD, &c->x86_capability); - - /* Enable workaround for FXSAVE leak */ - if (c->x86 >= 6) - set_bit(X86_FEATURE_FXSAVE_LEAK, &c->x86_capability); - - level = get_model_name(c); - if (!level) { - switch (c->x86) { - case 15: - /* Should distinguish Models here, but this is only - a fallback anyways. */ - strcpy(c->x86_model_id, "Hammer"); - break; - } - } - display_cacheinfo(c); - - /* c->x86_power is 8000_0007 edx. Bit 8 is constant TSC */ - if (c->x86_power & (1<<8)) - set_bit(X86_FEATURE_CONSTANT_TSC, &c->x86_capability); - - /* Multi core CPU? */ - if (c->extended_cpuid_level >= 0x80000008) - amd_detect_cmp(c); - - if (c->extended_cpuid_level >= 0x80000006 && - (cpuid_edx(0x80000006) & 0xf000)) - num_cache_leaves = 4; - else - num_cache_leaves = 3; - - if (c->x86 == 0xf || c->x86 == 0x10 || c->x86 == 0x11) - set_bit(X86_FEATURE_K8, &c->x86_capability); - - /* RDTSC can be speculated around */ - clear_bit(X86_FEATURE_SYNC_RDTSC, &c->x86_capability); - - /* Family 10 doesn't support C states in MWAIT so don't use it */ - if (c->x86 == 0x10 && !force_mwait) - clear_bit(X86_FEATURE_MWAIT, &c->x86_capability); -} - -static void __cpuinit detect_ht(struct cpuinfo_x86 *c) -{ -#ifdef CONFIG_SMP - u32 eax, ebx, ecx, edx; - int index_msb, core_bits; - - cpuid(1, &eax, &ebx, &ecx, &edx); - - - if (!cpu_has(c, X86_FEATURE_HT)) - return; - if (cpu_has(c, X86_FEATURE_CMP_LEGACY)) - goto out; - - smp_num_siblings = (ebx & 0xff0000) >> 16; - - if (smp_num_siblings == 1) { - printk(KERN_INFO "CPU: Hyper-Threading is disabled\n"); - } else if (smp_num_siblings > 1 ) { - - if (smp_num_siblings > NR_CPUS) { - printk(KERN_WARNING "CPU: Unsupported number of the siblings %d", smp_num_siblings); - smp_num_siblings = 1; - return; - } - - index_msb = get_count_order(smp_num_siblings); - c->phys_proc_id = phys_pkg_id(index_msb); - - smp_num_siblings = smp_num_siblings / c->x86_max_cores; - - index_msb = get_count_order(smp_num_siblings) ; - - core_bits = get_count_order(c->x86_max_cores); - - c->cpu_core_id = phys_pkg_id(index_msb) & - ((1 << core_bits) - 1); - } -out: - if ((c->x86_max_cores * smp_num_siblings) > 1) { - printk(KERN_INFO "CPU: Physical Processor ID: %d\n", c->phys_proc_id); - printk(KERN_INFO "CPU: Processor Core ID: %d\n", c->cpu_core_id); - } - -#endif -} - -/* - * find out the number of processor cores on the die - */ -static int __cpuinit intel_num_cpu_cores(struct cpuinfo_x86 *c) -{ - unsigned int eax, t; - - if (c->cpuid_level < 4) - return 1; - - cpuid_count(4, 0, &eax, &t, &t, &t); - - if (eax & 0x1f) - return ((eax >> 26) + 1); - else - return 1; -} - -static void srat_detect_node(void) -{ -#ifdef CONFIG_NUMA - unsigned node; - int cpu = smp_processor_id(); - int apicid = hard_smp_processor_id(); - - /* Don't do the funky fallback heuristics the AMD version employs - for now. */ - node = apicid_to_node[apicid]; - if (node == NUMA_NO_NODE) - node = first_node(node_online_map); - numa_set_node(cpu, node); - - printk(KERN_INFO "CPU %d/%x -> Node %d\n", cpu, apicid, node); -#endif -} - -static void __cpuinit init_intel(struct cpuinfo_x86 *c) -{ - /* Cache sizes */ - unsigned n; - - init_intel_cacheinfo(c); - if (c->cpuid_level > 9 ) { - unsigned eax = cpuid_eax(10); - /* Check for version and the number of counters */ - if ((eax & 0xff) && (((eax>>8) & 0xff) > 1)) - set_bit(X86_FEATURE_ARCH_PERFMON, &c->x86_capability); - } - - if (cpu_has_ds) { - unsigned int l1, l2; - rdmsr(MSR_IA32_MISC_ENABLE, l1, l2); - if (!(l1 & (1<<11))) - set_bit(X86_FEATURE_BTS, c->x86_capability); - if (!(l1 & (1<<12))) - set_bit(X86_FEATURE_PEBS, c->x86_capability); - } - - n = c->extended_cpuid_level; - if (n >= 0x80000008) { - unsigned eax = cpuid_eax(0x80000008); - c->x86_virt_bits = (eax >> 8) & 0xff; - c->x86_phys_bits = eax & 0xff; - /* CPUID workaround for Intel 0F34 CPU */ - if (c->x86_vendor == X86_VENDOR_INTEL && - c->x86 == 0xF && c->x86_model == 0x3 && - c->x86_mask == 0x4) - c->x86_phys_bits = 36; - } - - if (c->x86 == 15) - c->x86_cache_alignment = c->x86_clflush_size * 2; - if ((c->x86 == 0xf && c->x86_model >= 0x03) || - (c->x86 == 0x6 && c->x86_model >= 0x0e)) - set_bit(X86_FEATURE_CONSTANT_TSC, &c->x86_capability); - if (c->x86 == 6) - set_bit(X86_FEATURE_REP_GOOD, &c->x86_capability); - if (c->x86 == 15) - set_bit(X86_FEATURE_SYNC_RDTSC, &c->x86_capability); - else - clear_bit(X86_FEATURE_SYNC_RDTSC, &c->x86_capability); - c->x86_max_cores = intel_num_cpu_cores(c); - - srat_detect_node(); -} - -static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c) -{ - char *v = c->x86_vendor_id; - - if (!strcmp(v, "AuthenticAMD")) - c->x86_vendor = X86_VENDOR_AMD; - else if (!strcmp(v, "GenuineIntel")) - c->x86_vendor = X86_VENDOR_INTEL; - else - c->x86_vendor = X86_VENDOR_UNKNOWN; -} - -struct cpu_model_info { - int vendor; - int family; - char *model_names[16]; -}; - -/* Do some early cpuid on the boot CPU to get some parameter that are - needed before check_bugs. Everything advanced is in identify_cpu - below. */ -void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c) -{ - u32 tfms; - - c->loops_per_jiffy = loops_per_jiffy; - c->x86_cache_size = -1; - c->x86_vendor = X86_VENDOR_UNKNOWN; - c->x86_model = c->x86_mask = 0; /* So far unknown... */ - c->x86_vendor_id[0] = '\0'; /* Unset */ - c->x86_model_id[0] = '\0'; /* Unset */ - c->x86_clflush_size = 64; - c->x86_cache_alignment = c->x86_clflush_size; - c->x86_max_cores = 1; - c->extended_cpuid_level = 0; - memset(&c->x86_capability, 0, sizeof c->x86_capability); - - /* Get vendor name */ - cpuid(0x00000000, (unsigned int *)&c->cpuid_level, - (unsigned int *)&c->x86_vendor_id[0], - (unsigned int *)&c->x86_vendor_id[8], - (unsigned int *)&c->x86_vendor_id[4]); - - get_cpu_vendor(c); - - /* Initialize the standard set of capabilities */ - /* Note that the vendor-specific code below might override */ - - /* Intel-defined flags: level 0x00000001 */ - if (c->cpuid_level >= 0x00000001) { - __u32 misc; - cpuid(0x00000001, &tfms, &misc, &c->x86_capability[4], - &c->x86_capability[0]); - c->x86 = (tfms >> 8) & 0xf; - c->x86_model = (tfms >> 4) & 0xf; - c->x86_mask = tfms & 0xf; - if (c->x86 == 0xf) - c->x86 += (tfms >> 20) & 0xff; - if (c->x86 >= 0x6) - c->x86_model += ((tfms >> 16) & 0xF) << 4; - if (c->x86_capability[0] & (1<<19)) - c->x86_clflush_size = ((misc >> 8) & 0xff) * 8; - } else { - /* Have CPUID level 0 only - unheard of */ - c->x86 = 4; - } - -#ifdef CONFIG_SMP - c->phys_proc_id = (cpuid_ebx(1) >> 24) & 0xff; -#endif -} - -/* - * This does the hard work of actually picking apart the CPU stuff... - */ -void __cpuinit identify_cpu(struct cpuinfo_x86 *c) -{ - int i; - u32 xlvl; - - early_identify_cpu(c); - - /* AMD-defined flags: level 0x80000001 */ - xlvl = cpuid_eax(0x80000000); - c->extended_cpuid_level = xlvl; - if ((xlvl & 0xffff0000) == 0x80000000) { - if (xlvl >= 0x80000001) { - c->x86_capability[1] = cpuid_edx(0x80000001); - c->x86_capability[6] = cpuid_ecx(0x80000001); - } - if (xlvl >= 0x80000004) - get_model_name(c); /* Default name */ - } - - /* Transmeta-defined flags: level 0x80860001 */ - xlvl = cpuid_eax(0x80860000); - if ((xlvl & 0xffff0000) == 0x80860000) { - /* Don't set x86_cpuid_level here for now to not confuse. */ - if (xlvl >= 0x80860001) - c->x86_capability[2] = cpuid_edx(0x80860001); - } - - init_scattered_cpuid_features(c); - - c->apicid = phys_pkg_id(0); - - /* - * Vendor-specific initialization. In this section we - * canonicalize the feature flags, meaning if there are - * features a certain CPU supports which CPUID doesn't - * tell us, CPUID claiming incorrect flags, or other bugs, - * we handle them here. - * - * At the end of this section, c->x86_capability better - * indicate the features this CPU genuinely supports! - */ - switch (c->x86_vendor) { - case X86_VENDOR_AMD: - init_amd(c); - break; - - case X86_VENDOR_INTEL: - init_intel(c); - break; - - case X86_VENDOR_UNKNOWN: - default: - display_cacheinfo(c); - break; - } - - select_idle_routine(c); - detect_ht(c); - - /* - * On SMP, boot_cpu_data holds the common feature set between - * all CPUs; so make sure that we indicate which features are - * common between the CPUs. The first time this routine gets - * executed, c == &boot_cpu_data. - */ - if (c != &boot_cpu_data) { - /* AND the already accumulated flags with these */ - for (i = 0 ; i < NCAPINTS ; i++) - boot_cpu_data.x86_capability[i] &= c->x86_capability[i]; - } - -#ifdef CONFIG_X86_MCE - mcheck_init(c); -#endif - if (c != &boot_cpu_data) - mtrr_ap_init(); -#ifdef CONFIG_NUMA - numa_add_cpu(smp_processor_id()); -#endif -} - - -void __cpuinit print_cpu_info(struct cpuinfo_x86 *c) -{ - if (c->x86_model_id[0]) - printk("%s", c->x86_model_id); - - if (c->x86_mask || c->cpuid_level >= 0) - printk(" stepping %02x\n", c->x86_mask); - else - printk("\n"); -} - -/* - * Get CPU information for use by the procfs. - */ - -static int show_cpuinfo(struct seq_file *m, void *v) -{ - struct cpuinfo_x86 *c = v; - - /* - * These flag bits must match the definitions in <asm/cpufeature.h>. - * NULL means this bit is undefined or reserved; either way it doesn't - * have meaning as far as Linux is concerned. Note that it's important - * to realize there is a difference between this table and CPUID -- if - * applications want to get the raw CPUID data, they should access - * /dev/cpu/<cpu_nr>/cpuid instead. - */ - static char *x86_cap_flags[] = { - /* Intel-defined */ - "fpu", "vme", "de", "pse", "tsc", "msr", "pae", "mce", - "cx8", "apic", NULL, "sep", "mtrr", "pge", "mca", "cmov", - "pat", "pse36", "pn", "clflush", NULL, "dts", "acpi", "mmx", - "fxsr", "sse", "sse2", "ss", "ht", "tm", "ia64", "pbe", - - /* AMD-defined */ - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, "syscall", NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, "nx", NULL, "mmxext", NULL, - NULL, "fxsr_opt", "pdpe1gb", "rdtscp", NULL, "lm", - "3dnowext", "3dnow", - - /* Transmeta-defined */ - "recovery", "longrun", NULL, "lrti", NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - - /* Other (Linux-defined) */ - "cxmmx", "k6_mtrr", "cyrix_arr", "centaur_mcr", - NULL, NULL, NULL, NULL, - "constant_tsc", "up", NULL, "arch_perfmon", - "pebs", "bts", NULL, "sync_rdtsc", - "rep_good", NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - - /* Intel-defined (#2) */ - "pni", NULL, NULL, "monitor", "ds_cpl", "vmx", "smx", "est", - "tm2", "ssse3", "cid", NULL, NULL, "cx16", "xtpr", NULL, - NULL, NULL, "dca", NULL, NULL, NULL, NULL, "popcnt", - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - - /* VIA/Cyrix/Centaur-defined */ - NULL, NULL, "rng", "rng_en", NULL, NULL, "ace", "ace_en", - "ace2", "ace2_en", "phe", "phe_en", "pmm", "pmm_en", NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - - /* AMD-defined (#2) */ - "lahf_lm", "cmp_legacy", "svm", "extapic", "cr8_legacy", - "altmovcr8", "abm", "sse4a", - "misalignsse", "3dnowprefetch", - "osvw", "ibs", NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - - /* Auxiliary (Linux-defined) */ - "ida", NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - }; - static char *x86_power_flags[] = { - "ts", /* temperature sensor */ - "fid", /* frequency id control */ - "vid", /* voltage id control */ - "ttp", /* thermal trip */ - "tm", - "stc", - "100mhzsteps", - "hwpstate", - "", /* tsc invariant mapped to constant_tsc */ - /* nothing */ - }; - - -#ifdef CONFIG_SMP - if (!cpu_online(c-cpu_data)) - return 0; -#endif - - seq_printf(m,"processor\t: %u\n" - "vendor_id\t: %s\n" - "cpu family\t: %d\n" - "model\t\t: %d\n" - "model name\t: %s\n", - (unsigned)(c-cpu_data), - c->x86_vendor_id[0] ? c->x86_vendor_id : "unknown", - c->x86, - (int)c->x86_model, - c->x86_model_id[0] ? c->x86_model_id : "unknown"); - - if (c->x86_mask || c->cpuid_level >= 0) - seq_printf(m, "stepping\t: %d\n", c->x86_mask); - else - seq_printf(m, "stepping\t: unknown\n"); - - if (cpu_has(c,X86_FEATURE_TSC)) { - unsigned int freq = cpufreq_quick_get((unsigned)(c-cpu_data)); - if (!freq) - freq = cpu_khz; - seq_printf(m, "cpu MHz\t\t: %u.%03u\n", - freq / 1000, (freq % 1000)); - } - - /* Cache size */ - if (c->x86_cache_size >= 0) - seq_printf(m, "cache size\t: %d KB\n", c->x86_cache_size); - -#ifdef CONFIG_SMP - if (smp_num_siblings * c->x86_max_cores > 1) { - int cpu = c - cpu_data; - seq_printf(m, "physical id\t: %d\n", c->phys_proc_id); - seq_printf(m, "siblings\t: %d\n", cpus_weight(cpu_core_map[cpu])); - seq_printf(m, "core id\t\t: %d\n", c->cpu_core_id); - seq_printf(m, "cpu cores\t: %d\n", c->booted_cores); - } -#endif - - seq_printf(m, - "fpu\t\t: yes\n" - "fpu_exception\t: yes\n" - "cpuid level\t: %d\n" - "wp\t\t: yes\n" - "flags\t\t:", - c->cpuid_level); - - { - int i; - for ( i = 0 ; i < 32*NCAPINTS ; i++ ) - if (cpu_has(c, i) && x86_cap_flags[i] != NULL) - seq_printf(m, " %s", x86_cap_flags[i]); - } - - seq_printf(m, "\nbogomips\t: %lu.%02lu\n", - c->loops_per_jiffy/(500000/HZ), - (c->loops_per_jiffy/(5000/HZ)) % 100); - - if (c->x86_tlbsize > 0) - seq_printf(m, "TLB size\t: %d 4K pages\n", c->x86_tlbsize); - seq_printf(m, "clflush size\t: %d\n", c->x86_clflush_size); - seq_printf(m, "cache_alignment\t: %d\n", c->x86_cache_alignment); - - seq_printf(m, "address sizes\t: %u bits physical, %u bits virtual\n", - c->x86_phys_bits, c->x86_virt_bits); - - seq_printf(m, "power management:"); - { - unsigned i; - for (i = 0; i < 32; i++) - if (c->x86_power & (1 << i)) { - if (i < ARRAY_SIZE(x86_power_flags) && - x86_power_flags[i]) - seq_printf(m, "%s%s", - x86_power_flags[i][0]?" ":"", - x86_power_flags[i]); - else - seq_printf(m, " [%d]", i); - } - } - - seq_printf(m, "\n\n"); - - return 0; -} - -static void *c_start(struct seq_file *m, loff_t *pos) -{ - return *pos < NR_CPUS ? cpu_data + *pos : NULL; -} - -static void *c_next(struct seq_file *m, void *v, loff_t *pos) -{ - ++*pos; - return c_start(m, pos); -} - -static void c_stop(struct seq_file *m, void *v) -{ -} - -struct seq_operations cpuinfo_op = { - .start =c_start, - .next = c_next, - .stop = c_stop, - .show = show_cpuinfo, -}; diff --git a/arch/x86_64/kernel/setup64.c b/arch/x86_64/kernel/setup64.c deleted file mode 100644 index 1200aaac403e..000000000000 --- a/arch/x86_64/kernel/setup64.c +++ /dev/null @@ -1,289 +0,0 @@ -/* - * X86-64 specific CPU setup. - * Copyright (C) 1995 Linus Torvalds - * Copyright 2001, 2002, 2003 SuSE Labs / Andi Kleen. - * See setup.c for older changelog. - */ -#include <linux/init.h> -#include <linux/kernel.h> -#include <linux/sched.h> -#include <linux/string.h> -#include <linux/bootmem.h> -#include <linux/bitops.h> -#include <linux/module.h> -#include <asm/bootsetup.h> -#include <asm/pda.h> -#include <asm/pgtable.h> -#include <asm/processor.h> -#include <asm/desc.h> -#include <asm/atomic.h> -#include <asm/mmu_context.h> -#include <asm/smp.h> -#include <asm/i387.h> -#include <asm/percpu.h> -#include <asm/proto.h> -#include <asm/sections.h> - -char x86_boot_params[BOOT_PARAM_SIZE] __initdata; - -cpumask_t cpu_initialized __cpuinitdata = CPU_MASK_NONE; - -struct x8664_pda *_cpu_pda[NR_CPUS] __read_mostly; -EXPORT_SYMBOL(_cpu_pda); -struct x8664_pda boot_cpu_pda[NR_CPUS] __cacheline_aligned; - -struct desc_ptr idt_descr = { 256 * 16 - 1, (unsigned long) idt_table }; - -char boot_cpu_stack[IRQSTACKSIZE] __attribute__((section(".bss.page_aligned"))); - -unsigned long __supported_pte_mask __read_mostly = ~0UL; -static int do_not_nx __cpuinitdata = 0; - -/* noexec=on|off -Control non executable mappings for 64bit processes. - -on Enable(default) -off Disable -*/ -static int __init nonx_setup(char *str) -{ - if (!str) - return -EINVAL; - if (!strncmp(str, "on", 2)) { - __supported_pte_mask |= _PAGE_NX; - do_not_nx = 0; - } else if (!strncmp(str, "off", 3)) { - do_not_nx = 1; - __supported_pte_mask &= ~_PAGE_NX; - } - return 0; -} -early_param("noexec", nonx_setup); - -int force_personality32 = 0; - -/* noexec32=on|off -Control non executable heap for 32bit processes. -To control the stack too use noexec=off - -on PROT_READ does not imply PROT_EXEC for 32bit processes -off PROT_READ implies PROT_EXEC (default) -*/ -static int __init nonx32_setup(char *str) -{ - if (!strcmp(str, "on")) - force_personality32 &= ~READ_IMPLIES_EXEC; - else if (!strcmp(str, "off")) - force_personality32 |= READ_IMPLIES_EXEC; - return 1; -} -__setup("noexec32=", nonx32_setup); - -/* - * Great future plan: - * Declare PDA itself and support (irqstack,tss,pgd) as per cpu data. - * Always point %gs to its beginning - */ -void __init setup_per_cpu_areas(void) -{ - int i; - unsigned long size; - -#ifdef CONFIG_HOTPLUG_CPU - prefill_possible_map(); -#endif - - /* Copy section for each CPU (we discard the original) */ - size = PERCPU_ENOUGH_ROOM; - - printk(KERN_INFO "PERCPU: Allocating %lu bytes of per cpu data\n", size); - for_each_cpu_mask (i, cpu_possible_map) { - char *ptr; - - if (!NODE_DATA(cpu_to_node(i))) { - printk("cpu with no node %d, num_online_nodes %d\n", - i, num_online_nodes()); - ptr = alloc_bootmem_pages(size); - } else { - ptr = alloc_bootmem_pages_node(NODE_DATA(cpu_to_node(i)), size); - } - if (!ptr) - panic("Cannot allocate cpu data for CPU %d\n", i); - cpu_pda(i)->data_offset = ptr - __per_cpu_start; - memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start); - } -} - -void pda_init(int cpu) -{ - struct x8664_pda *pda = cpu_pda(cpu); - - /* Setup up data that may be needed in __get_free_pages early */ - asm volatile("movl %0,%%fs ; movl %0,%%gs" :: "r" (0)); - /* Memory clobbers used to order PDA accessed */ - mb(); - wrmsrl(MSR_GS_BASE, pda); - mb(); - - pda->cpunumber = cpu; - pda->irqcount = -1; - pda->kernelstack = - (unsigned long)stack_thread_info() - PDA_STACKOFFSET + THREAD_SIZE; - pda->active_mm = &init_mm; - pda->mmu_state = 0; - - if (cpu == 0) { - /* others are initialized in smpboot.c */ - pda->pcurrent = &init_task; - pda->irqstackptr = boot_cpu_stack; - } else { - pda->irqstackptr = (char *) - __get_free_pages(GFP_ATOMIC, IRQSTACK_ORDER); - if (!pda->irqstackptr) - panic("cannot allocate irqstack for cpu %d", cpu); - } - - - pda->irqstackptr += IRQSTACKSIZE-64; -} - -char boot_exception_stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ] -__attribute__((section(".bss.page_aligned"))); - -extern asmlinkage void ignore_sysret(void); - -/* May not be marked __init: used by software suspend */ -void syscall_init(void) -{ - /* - * LSTAR and STAR live in a bit strange symbiosis. - * They both write to the same internal register. STAR allows to set CS/DS - * but only a 32bit target. LSTAR sets the 64bit rip. - */ - wrmsrl(MSR_STAR, ((u64)__USER32_CS)<<48 | ((u64)__KERNEL_CS)<<32); - wrmsrl(MSR_LSTAR, system_call); - wrmsrl(MSR_CSTAR, ignore_sysret); - -#ifdef CONFIG_IA32_EMULATION - syscall32_cpu_init (); -#endif - - /* Flags to clear on syscall */ - wrmsrl(MSR_SYSCALL_MASK, EF_TF|EF_DF|EF_IE|0x3000); -} - -void __cpuinit check_efer(void) -{ - unsigned long efer; - - rdmsrl(MSR_EFER, efer); - if (!(efer & EFER_NX) || do_not_nx) { - __supported_pte_mask &= ~_PAGE_NX; - } -} - -unsigned long kernel_eflags; - -/* - * cpu_init() initializes state that is per-CPU. Some data is already - * initialized (naturally) in the bootstrap process, such as the GDT - * and IDT. We reload them nevertheless, this function acts as a - * 'CPU state barrier', nothing should get across. - * A lot of state is already set up in PDA init. - */ -void __cpuinit cpu_init (void) -{ - int cpu = stack_smp_processor_id(); - struct tss_struct *t = &per_cpu(init_tss, cpu); - struct orig_ist *orig_ist = &per_cpu(orig_ist, cpu); - unsigned long v; - char *estacks = NULL; - struct task_struct *me; - int i; - - /* CPU 0 is initialised in head64.c */ - if (cpu != 0) { - pda_init(cpu); - } else - estacks = boot_exception_stacks; - - me = current; - - if (cpu_test_and_set(cpu, cpu_initialized)) - panic("CPU#%d already initialized!\n", cpu); - - printk("Initializing CPU#%d\n", cpu); - - clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE); - - /* - * Initialize the per-CPU GDT with the boot GDT, - * and set up the GDT descriptor: - */ - if (cpu) - memcpy(cpu_gdt(cpu), cpu_gdt_table, GDT_SIZE); - - cpu_gdt_descr[cpu].size = GDT_SIZE; - asm volatile("lgdt %0" :: "m" (cpu_gdt_descr[cpu])); - asm volatile("lidt %0" :: "m" (idt_descr)); - - memset(me->thread.tls_array, 0, GDT_ENTRY_TLS_ENTRIES * 8); - syscall_init(); - - wrmsrl(MSR_FS_BASE, 0); - wrmsrl(MSR_KERNEL_GS_BASE, 0); - barrier(); - - check_efer(); - - /* - * set up and load the per-CPU TSS - */ - for (v = 0; v < N_EXCEPTION_STACKS; v++) { - static const unsigned int order[N_EXCEPTION_STACKS] = { - [0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STACK_ORDER, - [DEBUG_STACK - 1] = DEBUG_STACK_ORDER - }; - if (cpu) { - estacks = (char *)__get_free_pages(GFP_ATOMIC, order[v]); - if (!estacks) - panic("Cannot allocate exception stack %ld %d\n", - v, cpu); - } - estacks += PAGE_SIZE << order[v]; - orig_ist->ist[v] = t->ist[v] = (unsigned long)estacks; - } - - t->io_bitmap_base = offsetof(struct tss_struct, io_bitmap); - /* - * <= is required because the CPU will access up to - * 8 bits beyond the end of the IO permission bitmap. - */ - for (i = 0; i <= IO_BITMAP_LONGS; i++) - t->io_bitmap[i] = ~0UL; - - atomic_inc(&init_mm.mm_count); - me->active_mm = &init_mm; - if (me->mm) - BUG(); - enter_lazy_tlb(&init_mm, me); - - set_tss_desc(cpu, t); - load_TR_desc(); - load_LDT(&init_mm.context); - - /* - * Clear all 6 debug registers: - */ - - set_debugreg(0UL, 0); - set_debugreg(0UL, 1); - set_debugreg(0UL, 2); - set_debugreg(0UL, 3); - set_debugreg(0UL, 6); - set_debugreg(0UL, 7); - - fpu_init(); - - raw_local_save_flags(kernel_eflags); -} diff --git a/arch/x86_64/kernel/signal.c b/arch/x86_64/kernel/signal.c deleted file mode 100644 index 739175b01e06..000000000000 --- a/arch/x86_64/kernel/signal.c +++ /dev/null @@ -1,495 +0,0 @@ -/* - * linux/arch/x86_64/kernel/signal.c - * - * Copyright (C) 1991, 1992 Linus Torvalds - * Copyright (C) 2000, 2001, 2002 Andi Kleen SuSE Labs - * - * 1997-11-28 Modified for POSIX.1b signals by Richard Henderson - * 2000-06-20 Pentium III FXSR, SSE support by Gareth Hughes - * 2000-2002 x86-64 support by Andi Kleen - */ - -#include <linux/sched.h> -#include <linux/mm.h> -#include <linux/smp.h> -#include <linux/kernel.h> -#include <linux/signal.h> -#include <linux/errno.h> -#include <linux/wait.h> -#include <linux/ptrace.h> -#include <linux/unistd.h> -#include <linux/stddef.h> -#include <linux/personality.h> -#include <linux/compiler.h> -#include <asm/ucontext.h> -#include <asm/uaccess.h> -#include <asm/i387.h> -#include <asm/proto.h> -#include <asm/ia32_unistd.h> -#include <asm/mce.h> - -/* #define DEBUG_SIG 1 */ - -#define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP))) - -int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, - sigset_t *set, struct pt_regs * regs); -int ia32_setup_frame(int sig, struct k_sigaction *ka, - sigset_t *set, struct pt_regs * regs); - -asmlinkage long -sys_sigaltstack(const stack_t __user *uss, stack_t __user *uoss, - struct pt_regs *regs) -{ - return do_sigaltstack(uss, uoss, regs->rsp); -} - - -/* - * Do a signal return; undo the signal stack. - */ - -struct rt_sigframe -{ - char __user *pretcode; - struct ucontext uc; - struct siginfo info; -}; - -static int -restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, unsigned long *prax) -{ - unsigned int err = 0; - - /* Always make any pending restarted system calls return -EINTR */ - current_thread_info()->restart_block.fn = do_no_restart_syscall; - -#define COPY(x) err |= __get_user(regs->x, &sc->x) - - COPY(rdi); COPY(rsi); COPY(rbp); COPY(rsp); COPY(rbx); - COPY(rdx); COPY(rcx); COPY(rip); - COPY(r8); - COPY(r9); - COPY(r10); - COPY(r11); - COPY(r12); - COPY(r13); - COPY(r14); - COPY(r15); - - /* Kernel saves and restores only the CS segment register on signals, - * which is the bare minimum needed to allow mixed 32/64-bit code. - * App's signal handler can save/restore other segments if needed. */ - { - unsigned cs; - err |= __get_user(cs, &sc->cs); - regs->cs = cs | 3; /* Force into user mode */ - } - - { - unsigned int tmpflags; - err |= __get_user(tmpflags, &sc->eflags); - regs->eflags = (regs->eflags & ~0x40DD5) | (tmpflags & 0x40DD5); - regs->orig_rax = -1; /* disable syscall checks */ - } - - { - struct _fpstate __user * buf; - err |= __get_user(buf, &sc->fpstate); - - if (buf) { - if (!access_ok(VERIFY_READ, buf, sizeof(*buf))) - goto badframe; - err |= restore_i387(buf); - } else { - struct task_struct *me = current; - if (used_math()) { - clear_fpu(me); - clear_used_math(); - } - } - } - - err |= __get_user(*prax, &sc->rax); - return err; - -badframe: - return 1; -} - -asmlinkage long sys_rt_sigreturn(struct pt_regs *regs) -{ - struct rt_sigframe __user *frame; - sigset_t set; - unsigned long eax; - - frame = (struct rt_sigframe __user *)(regs->rsp - 8); - if (!access_ok(VERIFY_READ, frame, sizeof(*frame))) { - goto badframe; - } - if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set))) { - goto badframe; - } - - sigdelsetmask(&set, ~_BLOCKABLE); - spin_lock_irq(¤t->sighand->siglock); - current->blocked = set; - recalc_sigpending(); - spin_unlock_irq(¤t->sighand->siglock); - - if (restore_sigcontext(regs, &frame->uc.uc_mcontext, &eax)) - goto badframe; - -#ifdef DEBUG_SIG - printk("%d sigreturn rip:%lx rsp:%lx frame:%p rax:%lx\n",current->pid,regs->rip,regs->rsp,frame,eax); -#endif - - if (do_sigaltstack(&frame->uc.uc_stack, NULL, regs->rsp) == -EFAULT) - goto badframe; - - return eax; - -badframe: - signal_fault(regs,frame,"sigreturn"); - return 0; -} - -/* - * Set up a signal frame. - */ - -static inline int -setup_sigcontext(struct sigcontext __user *sc, struct pt_regs *regs, unsigned long mask, struct task_struct *me) -{ - int err = 0; - - err |= __put_user(regs->cs, &sc->cs); - err |= __put_user(0, &sc->gs); - err |= __put_user(0, &sc->fs); - - err |= __put_user(regs->rdi, &sc->rdi); - err |= __put_user(regs->rsi, &sc->rsi); - err |= __put_user(regs->rbp, &sc->rbp); - err |= __put_user(regs->rsp, &sc->rsp); - err |= __put_user(regs->rbx, &sc->rbx); - err |= __put_user(regs->rdx, &sc->rdx); - err |= __put_user(regs->rcx, &sc->rcx); - err |= __put_user(regs->rax, &sc->rax); - err |= __put_user(regs->r8, &sc->r8); - err |= __put_user(regs->r9, &sc->r9); - err |= __put_user(regs->r10, &sc->r10); - err |= __put_user(regs->r11, &sc->r11); - err |= __put_user(regs->r12, &sc->r12); - err |= __put_user(regs->r13, &sc->r13); - err |= __put_user(regs->r14, &sc->r14); - err |= __put_user(regs->r15, &sc->r15); - err |= __put_user(me->thread.trap_no, &sc->trapno); - err |= __put_user(me->thread.error_code, &sc->err); - err |= __put_user(regs->rip, &sc->rip); - err |= __put_user(regs->eflags, &sc->eflags); - err |= __put_user(mask, &sc->oldmask); - err |= __put_user(me->thread.cr2, &sc->cr2); - - return err; -} - -/* - * Determine which stack to use.. - */ - -static void __user * -get_stack(struct k_sigaction *ka, struct pt_regs *regs, unsigned long size) -{ - unsigned long rsp; - - /* Default to using normal stack - redzone*/ - rsp = regs->rsp - 128; - - /* This is the X/Open sanctioned signal stack switching. */ - if (ka->sa.sa_flags & SA_ONSTACK) { - if (sas_ss_flags(rsp) == 0) - rsp = current->sas_ss_sp + current->sas_ss_size; - } - - return (void __user *)round_down(rsp - size, 16); -} - -static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, - sigset_t *set, struct pt_regs * regs) -{ - struct rt_sigframe __user *frame; - struct _fpstate __user *fp = NULL; - int err = 0; - struct task_struct *me = current; - - if (used_math()) { - fp = get_stack(ka, regs, sizeof(struct _fpstate)); - frame = (void __user *)round_down( - (unsigned long)fp - sizeof(struct rt_sigframe), 16) - 8; - - if (!access_ok(VERIFY_WRITE, fp, sizeof(struct _fpstate))) - goto give_sigsegv; - - if (save_i387(fp) < 0) - err |= -1; - } else - frame = get_stack(ka, regs, sizeof(struct rt_sigframe)) - 8; - - if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame))) - goto give_sigsegv; - - if (ka->sa.sa_flags & SA_SIGINFO) { - err |= copy_siginfo_to_user(&frame->info, info); - if (err) - goto give_sigsegv; - } - - /* Create the ucontext. */ - err |= __put_user(0, &frame->uc.uc_flags); - err |= __put_user(0, &frame->uc.uc_link); - err |= __put_user(me->sas_ss_sp, &frame->uc.uc_stack.ss_sp); - err |= __put_user(sas_ss_flags(regs->rsp), - &frame->uc.uc_stack.ss_flags); - err |= __put_user(me->sas_ss_size, &frame->uc.uc_stack.ss_size); - err |= setup_sigcontext(&frame->uc.uc_mcontext, regs, set->sig[0], me); - err |= __put_user(fp, &frame->uc.uc_mcontext.fpstate); - if (sizeof(*set) == 16) { - __put_user(set->sig[0], &frame->uc.uc_sigmask.sig[0]); - __put_user(set->sig[1], &frame->uc.uc_sigmask.sig[1]); - } else - err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set)); - - /* Set up to return from userspace. If provided, use a stub - already in userspace. */ - /* x86-64 should always use SA_RESTORER. */ - if (ka->sa.sa_flags & SA_RESTORER) { - err |= __put_user(ka->sa.sa_restorer, &frame->pretcode); - } else { - /* could use a vstub here */ - goto give_sigsegv; - } - - if (err) - goto give_sigsegv; - -#ifdef DEBUG_SIG - printk("%d old rip %lx old rsp %lx old rax %lx\n", current->pid,regs->rip,regs->rsp,regs->rax); -#endif - - /* Set up registers for signal handler */ - regs->rdi = sig; - /* In case the signal handler was declared without prototypes */ - regs->rax = 0; - - /* This also works for non SA_SIGINFO handlers because they expect the - next argument after the signal number on the stack. */ - regs->rsi = (unsigned long)&frame->info; - regs->rdx = (unsigned long)&frame->uc; - regs->rip = (unsigned long) ka->sa.sa_handler; - - regs->rsp = (unsigned long)frame; - - /* Set up the CS register to run signal handlers in 64-bit mode, - even if the handler happens to be interrupting 32-bit code. */ - regs->cs = __USER_CS; - - /* This, by contrast, has nothing to do with segment registers - - see include/asm-x86_64/uaccess.h for details. */ - set_fs(USER_DS); - - regs->eflags &= ~TF_MASK; - if (test_thread_flag(TIF_SINGLESTEP)) - ptrace_notify(SIGTRAP); -#ifdef DEBUG_SIG - printk("SIG deliver (%s:%d): sp=%p pc=%lx ra=%p\n", - current->comm, current->pid, frame, regs->rip, frame->pretcode); -#endif - - return 0; - -give_sigsegv: - force_sigsegv(sig, current); - return -EFAULT; -} - -/* - * OK, we're invoking a handler - */ - -static int -handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka, - sigset_t *oldset, struct pt_regs *regs) -{ - int ret; - -#ifdef DEBUG_SIG - printk("handle_signal pid:%d sig:%lu rip:%lx rsp:%lx regs=%p\n", - current->pid, sig, - regs->rip, regs->rsp, regs); -#endif - - /* Are we from a system call? */ - if ((long)regs->orig_rax >= 0) { - /* If so, check system call restarting.. */ - switch (regs->rax) { - case -ERESTART_RESTARTBLOCK: - case -ERESTARTNOHAND: - regs->rax = -EINTR; - break; - - case -ERESTARTSYS: - if (!(ka->sa.sa_flags & SA_RESTART)) { - regs->rax = -EINTR; - break; - } - /* fallthrough */ - case -ERESTARTNOINTR: - regs->rax = regs->orig_rax; - regs->rip -= 2; - break; - } - } - - /* - * If TF is set due to a debugger (PT_DTRACE), clear the TF - * flag so that register information in the sigcontext is - * correct. - */ - if (unlikely(regs->eflags & TF_MASK)) { - if (likely(current->ptrace & PT_DTRACE)) { - current->ptrace &= ~PT_DTRACE; - regs->eflags &= ~TF_MASK; - } - } - -#ifdef CONFIG_IA32_EMULATION - if (test_thread_flag(TIF_IA32)) { - if (ka->sa.sa_flags & SA_SIGINFO) - ret = ia32_setup_rt_frame(sig, ka, info, oldset, regs); - else - ret = ia32_setup_frame(sig, ka, oldset, regs); - } else -#endif - ret = setup_rt_frame(sig, ka, info, oldset, regs); - - if (ret == 0) { - spin_lock_irq(¤t->sighand->siglock); - sigorsets(¤t->blocked,¤t->blocked,&ka->sa.sa_mask); - if (!(ka->sa.sa_flags & SA_NODEFER)) - sigaddset(¤t->blocked,sig); - recalc_sigpending(); - spin_unlock_irq(¤t->sighand->siglock); - } - - return ret; -} - -/* - * Note that 'init' is a special process: it doesn't get signals it doesn't - * want to handle. Thus you cannot kill init even with a SIGKILL even by - * mistake. - */ -static void do_signal(struct pt_regs *regs) -{ - struct k_sigaction ka; - siginfo_t info; - int signr; - sigset_t *oldset; - - /* - * We want the common case to go fast, which - * is why we may in certain cases get here from - * kernel mode. Just return without doing anything - * if so. - */ - if (!user_mode(regs)) - return; - - if (test_thread_flag(TIF_RESTORE_SIGMASK)) - oldset = ¤t->saved_sigmask; - else - oldset = ¤t->blocked; - - signr = get_signal_to_deliver(&info, &ka, regs, NULL); - if (signr > 0) { - /* Reenable any watchpoints before delivering the - * signal to user space. The processor register will - * have been cleared if the watchpoint triggered - * inside the kernel. - */ - if (current->thread.debugreg7) - set_debugreg(current->thread.debugreg7, 7); - - /* Whee! Actually deliver the signal. */ - if (handle_signal(signr, &info, &ka, oldset, regs) == 0) { - /* a signal was successfully delivered; the saved - * sigmask will have been stored in the signal frame, - * and will be restored by sigreturn, so we can simply - * clear the TIF_RESTORE_SIGMASK flag */ - clear_thread_flag(TIF_RESTORE_SIGMASK); - } - return; - } - - /* Did we come from a system call? */ - if ((long)regs->orig_rax >= 0) { - /* Restart the system call - no handlers present */ - long res = regs->rax; - switch (res) { - case -ERESTARTNOHAND: - case -ERESTARTSYS: - case -ERESTARTNOINTR: - regs->rax = regs->orig_rax; - regs->rip -= 2; - break; - case -ERESTART_RESTARTBLOCK: - regs->rax = test_thread_flag(TIF_IA32) ? - __NR_ia32_restart_syscall : - __NR_restart_syscall; - regs->rip -= 2; - break; - } - } - - /* if there's no signal to deliver, we just put the saved sigmask - back. */ - if (test_thread_flag(TIF_RESTORE_SIGMASK)) { - clear_thread_flag(TIF_RESTORE_SIGMASK); - sigprocmask(SIG_SETMASK, ¤t->saved_sigmask, NULL); - } -} - -void -do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags) -{ -#ifdef DEBUG_SIG - printk("do_notify_resume flags:%x rip:%lx rsp:%lx caller:%p pending:%x\n", - thread_info_flags, regs->rip, regs->rsp, __builtin_return_address(0),signal_pending(current)); -#endif - - /* Pending single-step? */ - if (thread_info_flags & _TIF_SINGLESTEP) { - regs->eflags |= TF_MASK; - clear_thread_flag(TIF_SINGLESTEP); - } - -#ifdef CONFIG_X86_MCE - /* notify userspace of pending MCEs */ - if (thread_info_flags & _TIF_MCE_NOTIFY) - mce_notify_user(); -#endif /* CONFIG_X86_MCE */ - - /* deal with pending signal delivery */ - if (thread_info_flags & (_TIF_SIGPENDING|_TIF_RESTORE_SIGMASK)) - do_signal(regs); -} - -void signal_fault(struct pt_regs *regs, void __user *frame, char *where) -{ - struct task_struct *me = current; - if (show_unhandled_signals && printk_ratelimit()) - printk("%s[%d] bad frame in %s frame:%p rip:%lx rsp:%lx orax:%lx\n", - me->comm,me->pid,where,frame,regs->rip,regs->rsp,regs->orig_rax); - - force_sig(SIGSEGV, me); -} diff --git a/arch/x86_64/kernel/smp.c b/arch/x86_64/kernel/smp.c deleted file mode 100644 index df4a82812adb..000000000000 --- a/arch/x86_64/kernel/smp.c +++ /dev/null @@ -1,523 +0,0 @@ -/* - * Intel SMP support routines. - * - * (c) 1995 Alan Cox, Building #3 <alan@redhat.com> - * (c) 1998-99, 2000 Ingo Molnar <mingo@redhat.com> - * (c) 2002,2003 Andi Kleen, SuSE Labs. - * - * This code is released under the GNU General Public License version 2 or - * later. - */ - -#include <linux/init.h> - -#include <linux/mm.h> -#include <linux/delay.h> -#include <linux/spinlock.h> -#include <linux/smp.h> -#include <linux/kernel_stat.h> -#include <linux/mc146818rtc.h> -#include <linux/interrupt.h> - -#include <asm/mtrr.h> -#include <asm/pgalloc.h> -#include <asm/tlbflush.h> -#include <asm/mach_apic.h> -#include <asm/mmu_context.h> -#include <asm/proto.h> -#include <asm/apicdef.h> -#include <asm/idle.h> - -/* - * Smarter SMP flushing macros. - * c/o Linus Torvalds. - * - * These mean you can really definitely utterly forget about - * writing to user space from interrupts. (Its not allowed anyway). - * - * Optimizations Manfred Spraul <manfred@colorfullife.com> - * - * More scalable flush, from Andi Kleen - * - * To avoid global state use 8 different call vectors. - * Each CPU uses a specific vector to trigger flushes on other - * CPUs. Depending on the received vector the target CPUs look into - * the right per cpu variable for the flush data. - * - * With more than 8 CPUs they are hashed to the 8 available - * vectors. The limited global vector space forces us to this right now. - * In future when interrupts are split into per CPU domains this could be - * fixed, at the cost of triggering multiple IPIs in some cases. - */ - -union smp_flush_state { - struct { - cpumask_t flush_cpumask; - struct mm_struct *flush_mm; - unsigned long flush_va; -#define FLUSH_ALL -1ULL - spinlock_t tlbstate_lock; - }; - char pad[SMP_CACHE_BYTES]; -} ____cacheline_aligned; - -/* State is put into the per CPU data section, but padded - to a full cache line because other CPUs can access it and we don't - want false sharing in the per cpu data segment. */ -static DEFINE_PER_CPU(union smp_flush_state, flush_state); - -/* - * We cannot call mmdrop() because we are in interrupt context, - * instead update mm->cpu_vm_mask. - */ -static inline void leave_mm(int cpu) -{ - if (read_pda(mmu_state) == TLBSTATE_OK) - BUG(); - cpu_clear(cpu, read_pda(active_mm)->cpu_vm_mask); - load_cr3(swapper_pg_dir); -} - -/* - * - * The flush IPI assumes that a thread switch happens in this order: - * [cpu0: the cpu that switches] - * 1) switch_mm() either 1a) or 1b) - * 1a) thread switch to a different mm - * 1a1) cpu_clear(cpu, old_mm->cpu_vm_mask); - * Stop ipi delivery for the old mm. This is not synchronized with - * the other cpus, but smp_invalidate_interrupt ignore flush ipis - * for the wrong mm, and in the worst case we perform a superfluous - * tlb flush. - * 1a2) set cpu mmu_state to TLBSTATE_OK - * Now the smp_invalidate_interrupt won't call leave_mm if cpu0 - * was in lazy tlb mode. - * 1a3) update cpu active_mm - * Now cpu0 accepts tlb flushes for the new mm. - * 1a4) cpu_set(cpu, new_mm->cpu_vm_mask); - * Now the other cpus will send tlb flush ipis. - * 1a4) change cr3. - * 1b) thread switch without mm change - * cpu active_mm is correct, cpu0 already handles - * flush ipis. - * 1b1) set cpu mmu_state to TLBSTATE_OK - * 1b2) test_and_set the cpu bit in cpu_vm_mask. - * Atomically set the bit [other cpus will start sending flush ipis], - * and test the bit. - * 1b3) if the bit was 0: leave_mm was called, flush the tlb. - * 2) switch %%esp, ie current - * - * The interrupt must handle 2 special cases: - * - cr3 is changed before %%esp, ie. it cannot use current->{active_,}mm. - * - the cpu performs speculative tlb reads, i.e. even if the cpu only - * runs in kernel space, the cpu could load tlb entries for user space - * pages. - * - * The good news is that cpu mmu_state is local to each cpu, no - * write/read ordering problems. - */ - -/* - * TLB flush IPI: - * - * 1) Flush the tlb entries if the cpu uses the mm that's being flushed. - * 2) Leave the mm if we are in the lazy tlb mode. - * - * Interrupts are disabled. - */ - -asmlinkage void smp_invalidate_interrupt(struct pt_regs *regs) -{ - int cpu; - int sender; - union smp_flush_state *f; - - cpu = smp_processor_id(); - /* - * orig_rax contains the negated interrupt vector. - * Use that to determine where the sender put the data. - */ - sender = ~regs->orig_rax - INVALIDATE_TLB_VECTOR_START; - f = &per_cpu(flush_state, sender); - - if (!cpu_isset(cpu, f->flush_cpumask)) - goto out; - /* - * This was a BUG() but until someone can quote me the - * line from the intel manual that guarantees an IPI to - * multiple CPUs is retried _only_ on the erroring CPUs - * its staying as a return - * - * BUG(); - */ - - if (f->flush_mm == read_pda(active_mm)) { - if (read_pda(mmu_state) == TLBSTATE_OK) { - if (f->flush_va == FLUSH_ALL) - local_flush_tlb(); - else - __flush_tlb_one(f->flush_va); - } else - leave_mm(cpu); - } -out: - ack_APIC_irq(); - cpu_clear(cpu, f->flush_cpumask); -} - -static void flush_tlb_others(cpumask_t cpumask, struct mm_struct *mm, - unsigned long va) -{ - int sender; - union smp_flush_state *f; - - /* Caller has disabled preemption */ - sender = smp_processor_id() % NUM_INVALIDATE_TLB_VECTORS; - f = &per_cpu(flush_state, sender); - - /* Could avoid this lock when - num_online_cpus() <= NUM_INVALIDATE_TLB_VECTORS, but it is - probably not worth checking this for a cache-hot lock. */ - spin_lock(&f->tlbstate_lock); - - f->flush_mm = mm; - f->flush_va = va; - cpus_or(f->flush_cpumask, cpumask, f->flush_cpumask); - - /* - * We have to send the IPI only to - * CPUs affected. - */ - send_IPI_mask(cpumask, INVALIDATE_TLB_VECTOR_START + sender); - - while (!cpus_empty(f->flush_cpumask)) - cpu_relax(); - - f->flush_mm = NULL; - f->flush_va = 0; - spin_unlock(&f->tlbstate_lock); -} - -int __cpuinit init_smp_flush(void) -{ - int i; - for_each_cpu_mask(i, cpu_possible_map) { - spin_lock_init(&per_cpu(flush_state, i).tlbstate_lock); - } - return 0; -} - -core_initcall(init_smp_flush); - -void flush_tlb_current_task(void) -{ - struct mm_struct *mm = current->mm; - cpumask_t cpu_mask; - - preempt_disable(); - cpu_mask = mm->cpu_vm_mask; - cpu_clear(smp_processor_id(), cpu_mask); - - local_flush_tlb(); - if (!cpus_empty(cpu_mask)) - flush_tlb_others(cpu_mask, mm, FLUSH_ALL); - preempt_enable(); -} -EXPORT_SYMBOL(flush_tlb_current_task); - -void flush_tlb_mm (struct mm_struct * mm) -{ - cpumask_t cpu_mask; - - preempt_disable(); - cpu_mask = mm->cpu_vm_mask; - cpu_clear(smp_processor_id(), cpu_mask); - - if (current->active_mm == mm) { - if (current->mm) - local_flush_tlb(); - else - leave_mm(smp_processor_id()); - } - if (!cpus_empty(cpu_mask)) - flush_tlb_others(cpu_mask, mm, FLUSH_ALL); - - preempt_enable(); -} -EXPORT_SYMBOL(flush_tlb_mm); - -void flush_tlb_page(struct vm_area_struct * vma, unsigned long va) -{ - struct mm_struct *mm = vma->vm_mm; - cpumask_t cpu_mask; - - preempt_disable(); - cpu_mask = mm->cpu_vm_mask; - cpu_clear(smp_processor_id(), cpu_mask); - - if (current->active_mm == mm) { - if(current->mm) - __flush_tlb_one(va); - else - leave_mm(smp_processor_id()); - } - - if (!cpus_empty(cpu_mask)) - flush_tlb_others(cpu_mask, mm, va); - - preempt_enable(); -} -EXPORT_SYMBOL(flush_tlb_page); - -static void do_flush_tlb_all(void* info) -{ - unsigned long cpu = smp_processor_id(); - - __flush_tlb_all(); - if (read_pda(mmu_state) == TLBSTATE_LAZY) - leave_mm(cpu); -} - -void flush_tlb_all(void) -{ - on_each_cpu(do_flush_tlb_all, NULL, 1, 1); -} - -/* - * this function sends a 'reschedule' IPI to another CPU. - * it goes straight through and wastes no time serializing - * anything. Worst case is that we lose a reschedule ... - */ - -void smp_send_reschedule(int cpu) -{ - send_IPI_mask(cpumask_of_cpu(cpu), RESCHEDULE_VECTOR); -} - -/* - * Structure and data for smp_call_function(). This is designed to minimise - * static memory requirements. It also looks cleaner. - */ -static DEFINE_SPINLOCK(call_lock); - -struct call_data_struct { - void (*func) (void *info); - void *info; - atomic_t started; - atomic_t finished; - int wait; -}; - -static struct call_data_struct * call_data; - -void lock_ipi_call_lock(void) -{ - spin_lock_irq(&call_lock); -} - -void unlock_ipi_call_lock(void) -{ - spin_unlock_irq(&call_lock); -} - -/* - * this function sends a 'generic call function' IPI to one other CPU - * in the system. - * - * cpu is a standard Linux logical CPU number. - */ -static void -__smp_call_function_single(int cpu, void (*func) (void *info), void *info, - int nonatomic, int wait) -{ - struct call_data_struct data; - int cpus = 1; - - data.func = func; - data.info = info; - atomic_set(&data.started, 0); - data.wait = wait; - if (wait) - atomic_set(&data.finished, 0); - - call_data = &data; - wmb(); - /* Send a message to all other CPUs and wait for them to respond */ - send_IPI_mask(cpumask_of_cpu(cpu), CALL_FUNCTION_VECTOR); - - /* Wait for response */ - while (atomic_read(&data.started) != cpus) - cpu_relax(); - - if (!wait) - return; - - while (atomic_read(&data.finished) != cpus) - cpu_relax(); -} - -/* - * smp_call_function_single - Run a function on a specific CPU - * @func: The function to run. This must be fast and non-blocking. - * @info: An arbitrary pointer to pass to the function. - * @nonatomic: Currently unused. - * @wait: If true, wait until function has completed on other CPUs. - * - * Retrurns 0 on success, else a negative status code. - * - * Does not return until the remote CPU is nearly ready to execute <func> - * or is or has executed. - */ - -int smp_call_function_single (int cpu, void (*func) (void *info), void *info, - int nonatomic, int wait) -{ - /* prevent preemption and reschedule on another processor */ - int me = get_cpu(); - - /* Can deadlock when called with interrupts disabled */ - WARN_ON(irqs_disabled()); - - if (cpu == me) { - local_irq_disable(); - func(info); - local_irq_enable(); - put_cpu(); - return 0; - } - - spin_lock(&call_lock); - __smp_call_function_single(cpu, func, info, nonatomic, wait); - spin_unlock(&call_lock); - put_cpu(); - return 0; -} -EXPORT_SYMBOL(smp_call_function_single); - -/* - * this function sends a 'generic call function' IPI to all other CPUs - * in the system. - */ -static void __smp_call_function (void (*func) (void *info), void *info, - int nonatomic, int wait) -{ - struct call_data_struct data; - int cpus = num_online_cpus()-1; - - if (!cpus) - return; - - data.func = func; - data.info = info; - atomic_set(&data.started, 0); - data.wait = wait; - if (wait) - atomic_set(&data.finished, 0); - - call_data = &data; - wmb(); - /* Send a message to all other CPUs and wait for them to respond */ - send_IPI_allbutself(CALL_FUNCTION_VECTOR); - - /* Wait for response */ - while (atomic_read(&data.started) != cpus) - cpu_relax(); - - if (!wait) - return; - - while (atomic_read(&data.finished) != cpus) - cpu_relax(); -} - -/* - * smp_call_function - run a function on all other CPUs. - * @func: The function to run. This must be fast and non-blocking. - * @info: An arbitrary pointer to pass to the function. - * @nonatomic: currently unused. - * @wait: If true, wait (atomically) until function has completed on other - * CPUs. - * - * Returns 0 on success, else a negative status code. Does not return until - * remote CPUs are nearly ready to execute func or are or have executed. - * - * You must not call this function with disabled interrupts or from a - * hardware interrupt handler or from a bottom half handler. - * Actually there are a few legal cases, like panic. - */ -int smp_call_function (void (*func) (void *info), void *info, int nonatomic, - int wait) -{ - spin_lock(&call_lock); - __smp_call_function(func,info,nonatomic,wait); - spin_unlock(&call_lock); - return 0; -} -EXPORT_SYMBOL(smp_call_function); - -static void stop_this_cpu(void *dummy) -{ - local_irq_disable(); - /* - * Remove this CPU: - */ - cpu_clear(smp_processor_id(), cpu_online_map); - disable_local_APIC(); - for (;;) - halt(); -} - -void smp_send_stop(void) -{ - int nolock; - unsigned long flags; - - if (reboot_force) - return; - - /* Don't deadlock on the call lock in panic */ - nolock = !spin_trylock(&call_lock); - local_irq_save(flags); - __smp_call_function(stop_this_cpu, NULL, 0, 0); - if (!nolock) - spin_unlock(&call_lock); - disable_local_APIC(); - local_irq_restore(flags); -} - -/* - * Reschedule call back. Nothing to do, - * all the work is done automatically when - * we return from the interrupt. - */ -asmlinkage void smp_reschedule_interrupt(void) -{ - ack_APIC_irq(); -} - -asmlinkage void smp_call_function_interrupt(void) -{ - void (*func) (void *info) = call_data->func; - void *info = call_data->info; - int wait = call_data->wait; - - ack_APIC_irq(); - /* - * Notify initiating CPU that I've grabbed the data and am - * about to execute the function - */ - mb(); - atomic_inc(&call_data->started); - /* - * At this point the info structure may be out of scope unless wait==1 - */ - exit_idle(); - irq_enter(); - (*func)(info); - irq_exit(); - if (wait) { - mb(); - atomic_inc(&call_data->finished); - } -} - diff --git a/arch/x86_64/kernel/smpboot.c b/arch/x86_64/kernel/smpboot.c deleted file mode 100644 index 32f50783edc8..000000000000 --- a/arch/x86_64/kernel/smpboot.c +++ /dev/null @@ -1,1085 +0,0 @@ -/* - * x86 SMP booting functions - * - * (c) 1995 Alan Cox, Building #3 <alan@redhat.com> - * (c) 1998, 1999, 2000 Ingo Molnar <mingo@redhat.com> - * Copyright 2001 Andi Kleen, SuSE Labs. - * - * Much of the core SMP work is based on previous work by Thomas Radke, to - * whom a great many thanks are extended. - * - * Thanks to Intel for making available several different Pentium, - * Pentium Pro and Pentium-II/Xeon MP machines. - * Original development of Linux SMP code supported by Caldera. - * - * This code is released under the GNU General Public License version 2 - * - * Fixes - * Felix Koop : NR_CPUS used properly - * Jose Renau : Handle single CPU case. - * Alan Cox : By repeated request 8) - Total BogoMIP report. - * Greg Wright : Fix for kernel stacks panic. - * Erich Boleyn : MP v1.4 and additional changes. - * Matthias Sattler : Changes for 2.1 kernel map. - * Michel Lespinasse : Changes for 2.1 kernel map. - * Michael Chastain : Change trampoline.S to gnu as. - * Alan Cox : Dumb bug: 'B' step PPro's are fine - * Ingo Molnar : Added APIC timers, based on code - * from Jose Renau - * Ingo Molnar : various cleanups and rewrites - * Tigran Aivazian : fixed "0.00 in /proc/uptime on SMP" bug. - * Maciej W. Rozycki : Bits for genuine 82489DX APICs - * Andi Kleen : Changed for SMP boot into long mode. - * Rusty Russell : Hacked into shape for new "hotplug" boot process. - * Andi Kleen : Converted to new state machine. - * Various cleanups. - * Probably mostly hotplug CPU ready now. - * Ashok Raj : CPU hotplug support - */ - - -#include <linux/init.h> - -#include <linux/mm.h> -#include <linux/kernel_stat.h> -#include <linux/bootmem.h> -#include <linux/thread_info.h> -#include <linux/module.h> -#include <linux/delay.h> -#include <linux/mc146818rtc.h> -#include <linux/smp.h> -#include <linux/kdebug.h> - -#include <asm/mtrr.h> -#include <asm/pgalloc.h> -#include <asm/desc.h> -#include <asm/tlbflush.h> -#include <asm/proto.h> -#include <asm/nmi.h> -#include <asm/irq.h> -#include <asm/hw_irq.h> -#include <asm/numa.h> - -/* Number of siblings per CPU package */ -int smp_num_siblings = 1; -EXPORT_SYMBOL(smp_num_siblings); - -/* Last level cache ID of each logical CPU */ -u8 cpu_llc_id[NR_CPUS] __cpuinitdata = {[0 ... NR_CPUS-1] = BAD_APICID}; - -/* Bitmask of currently online CPUs */ -cpumask_t cpu_online_map __read_mostly; - -EXPORT_SYMBOL(cpu_online_map); - -/* - * Private maps to synchronize booting between AP and BP. - * Probably not needed anymore, but it makes for easier debugging. -AK - */ -cpumask_t cpu_callin_map; -cpumask_t cpu_callout_map; -EXPORT_SYMBOL(cpu_callout_map); - -cpumask_t cpu_possible_map; -EXPORT_SYMBOL(cpu_possible_map); - -/* Per CPU bogomips and other parameters */ -struct cpuinfo_x86 cpu_data[NR_CPUS] __cacheline_aligned; -EXPORT_SYMBOL(cpu_data); - -/* Set when the idlers are all forked */ -int smp_threads_ready; - -/* representing HT siblings of each logical CPU */ -cpumask_t cpu_sibling_map[NR_CPUS] __read_mostly; -EXPORT_SYMBOL(cpu_sibling_map); - -/* representing HT and core siblings of each logical CPU */ -cpumask_t cpu_core_map[NR_CPUS] __read_mostly; -EXPORT_SYMBOL(cpu_core_map); - -/* - * Trampoline 80x86 program as an array. - */ - -extern unsigned char trampoline_data[]; -extern unsigned char trampoline_end[]; - -/* State of each CPU */ -DEFINE_PER_CPU(int, cpu_state) = { 0 }; - -/* - * Store all idle threads, this can be reused instead of creating - * a new thread. Also avoids complicated thread destroy functionality - * for idle threads. - */ -struct task_struct *idle_thread_array[NR_CPUS] __cpuinitdata ; - -#define get_idle_for_cpu(x) (idle_thread_array[(x)]) -#define set_idle_for_cpu(x,p) (idle_thread_array[(x)] = (p)) - -/* - * Currently trivial. Write the real->protected mode - * bootstrap into the page concerned. The caller - * has made sure it's suitably aligned. - */ - -static unsigned long __cpuinit setup_trampoline(void) -{ - void *tramp = __va(SMP_TRAMPOLINE_BASE); - memcpy(tramp, trampoline_data, trampoline_end - trampoline_data); - return virt_to_phys(tramp); -} - -/* - * The bootstrap kernel entry code has set these up. Save them for - * a given CPU - */ - -static void __cpuinit smp_store_cpu_info(int id) -{ - struct cpuinfo_x86 *c = cpu_data + id; - - *c = boot_cpu_data; - identify_cpu(c); - print_cpu_info(c); -} - -static atomic_t init_deasserted __cpuinitdata; - -/* - * Report back to the Boot Processor. - * Running on AP. - */ -void __cpuinit smp_callin(void) -{ - int cpuid, phys_id; - unsigned long timeout; - - /* - * If waken up by an INIT in an 82489DX configuration - * we may get here before an INIT-deassert IPI reaches - * our local APIC. We have to wait for the IPI or we'll - * lock up on an APIC access. - */ - while (!atomic_read(&init_deasserted)) - cpu_relax(); - - /* - * (This works even if the APIC is not enabled.) - */ - phys_id = GET_APIC_ID(apic_read(APIC_ID)); - cpuid = smp_processor_id(); - if (cpu_isset(cpuid, cpu_callin_map)) { - panic("smp_callin: phys CPU#%d, CPU#%d already present??\n", - phys_id, cpuid); - } - Dprintk("CPU#%d (phys ID: %d) waiting for CALLOUT\n", cpuid, phys_id); - - /* - * STARTUP IPIs are fragile beasts as they might sometimes - * trigger some glue motherboard logic. Complete APIC bus - * silence for 1 second, this overestimates the time the - * boot CPU is spending to send the up to 2 STARTUP IPIs - * by a factor of two. This should be enough. - */ - - /* - * Waiting 2s total for startup (udelay is not yet working) - */ - timeout = jiffies + 2*HZ; - while (time_before(jiffies, timeout)) { - /* - * Has the boot CPU finished it's STARTUP sequence? - */ - if (cpu_isset(cpuid, cpu_callout_map)) - break; - cpu_relax(); - } - - if (!time_before(jiffies, timeout)) { - panic("smp_callin: CPU%d started up but did not get a callout!\n", - cpuid); - } - - /* - * the boot CPU has finished the init stage and is spinning - * on callin_map until we finish. We are free to set up this - * CPU, first the APIC. (this is probably redundant on most - * boards) - */ - - Dprintk("CALLIN, before setup_local_APIC().\n"); - setup_local_APIC(); - - /* - * Get our bogomips. - * - * Need to enable IRQs because it can take longer and then - * the NMI watchdog might kill us. - */ - local_irq_enable(); - calibrate_delay(); - local_irq_disable(); - Dprintk("Stack at about %p\n",&cpuid); - - disable_APIC_timer(); - - /* - * Save our processor parameters - */ - smp_store_cpu_info(cpuid); - - /* - * Allow the master to continue. - */ - cpu_set(cpuid, cpu_callin_map); -} - -/* maps the cpu to the sched domain representing multi-core */ -cpumask_t cpu_coregroup_map(int cpu) -{ - struct cpuinfo_x86 *c = cpu_data + cpu; - /* - * For perf, we return last level cache shared map. - * And for power savings, we return cpu_core_map - */ - if (sched_mc_power_savings || sched_smt_power_savings) - return cpu_core_map[cpu]; - else - return c->llc_shared_map; -} - -/* representing cpus for which sibling maps can be computed */ -static cpumask_t cpu_sibling_setup_map; - -static inline void set_cpu_sibling_map(int cpu) -{ - int i; - struct cpuinfo_x86 *c = cpu_data; - - cpu_set(cpu, cpu_sibling_setup_map); - - if (smp_num_siblings > 1) { - for_each_cpu_mask(i, cpu_sibling_setup_map) { - if (c[cpu].phys_proc_id == c[i].phys_proc_id && - c[cpu].cpu_core_id == c[i].cpu_core_id) { - cpu_set(i, cpu_sibling_map[cpu]); - cpu_set(cpu, cpu_sibling_map[i]); - cpu_set(i, cpu_core_map[cpu]); - cpu_set(cpu, cpu_core_map[i]); - cpu_set(i, c[cpu].llc_shared_map); - cpu_set(cpu, c[i].llc_shared_map); - } - } - } else { - cpu_set(cpu, cpu_sibling_map[cpu]); - } - - cpu_set(cpu, c[cpu].llc_shared_map); - - if (current_cpu_data.x86_max_cores == 1) { - cpu_core_map[cpu] = cpu_sibling_map[cpu]; - c[cpu].booted_cores = 1; - return; - } - - for_each_cpu_mask(i, cpu_sibling_setup_map) { - if (cpu_llc_id[cpu] != BAD_APICID && - cpu_llc_id[cpu] == cpu_llc_id[i]) { - cpu_set(i, c[cpu].llc_shared_map); - cpu_set(cpu, c[i].llc_shared_map); - } - if (c[cpu].phys_proc_id == c[i].phys_proc_id) { - cpu_set(i, cpu_core_map[cpu]); - cpu_set(cpu, cpu_core_map[i]); - /* - * Does this new cpu bringup a new core? - */ - if (cpus_weight(cpu_sibling_map[cpu]) == 1) { - /* - * for each core in package, increment - * the booted_cores for this new cpu - */ - if (first_cpu(cpu_sibling_map[i]) == i) - c[cpu].booted_cores++; - /* - * increment the core count for all - * the other cpus in this package - */ - if (i != cpu) - c[i].booted_cores++; - } else if (i != cpu && !c[cpu].booted_cores) - c[cpu].booted_cores = c[i].booted_cores; - } - } -} - -/* - * Setup code on secondary processor (after comming out of the trampoline) - */ -void __cpuinit start_secondary(void) -{ - /* - * Dont put anything before smp_callin(), SMP - * booting is too fragile that we want to limit the - * things done here to the most necessary things. - */ - cpu_init(); - preempt_disable(); - smp_callin(); - - /* otherwise gcc will move up the smp_processor_id before the cpu_init */ - barrier(); - - /* - * Check TSC sync first: - */ - check_tsc_sync_target(); - - Dprintk("cpu %d: setting up apic clock\n", smp_processor_id()); - setup_secondary_APIC_clock(); - - Dprintk("cpu %d: enabling apic timer\n", smp_processor_id()); - - if (nmi_watchdog == NMI_IO_APIC) { - disable_8259A_irq(0); - enable_NMI_through_LVT0(NULL); - enable_8259A_irq(0); - } - - enable_APIC_timer(); - - /* - * The sibling maps must be set before turing the online map on for - * this cpu - */ - set_cpu_sibling_map(smp_processor_id()); - - /* - * We need to hold call_lock, so there is no inconsistency - * between the time smp_call_function() determines number of - * IPI receipients, and the time when the determination is made - * for which cpus receive the IPI in genapic_flat.c. Holding this - * lock helps us to not include this cpu in a currently in progress - * smp_call_function(). - */ - lock_ipi_call_lock(); - spin_lock(&vector_lock); - - /* Setup the per cpu irq handling data structures */ - __setup_vector_irq(smp_processor_id()); - /* - * Allow the master to continue. - */ - cpu_set(smp_processor_id(), cpu_online_map); - per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE; - spin_unlock(&vector_lock); - - unlock_ipi_call_lock(); - - cpu_idle(); -} - -extern volatile unsigned long init_rsp; -extern void (*initial_code)(void); - -#ifdef APIC_DEBUG -static void inquire_remote_apic(int apicid) -{ - unsigned i, regs[] = { APIC_ID >> 4, APIC_LVR >> 4, APIC_SPIV >> 4 }; - char *names[] = { "ID", "VERSION", "SPIV" }; - int timeout; - unsigned int status; - - printk(KERN_INFO "Inquiring remote APIC #%d...\n", apicid); - - for (i = 0; i < sizeof(regs) / sizeof(*regs); i++) { - printk("... APIC #%d %s: ", apicid, names[i]); - - /* - * Wait for idle. - */ - status = safe_apic_wait_icr_idle(); - if (status) - printk("a previous APIC delivery may have failed\n"); - - apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(apicid)); - apic_write(APIC_ICR, APIC_DM_REMRD | regs[i]); - - timeout = 0; - do { - udelay(100); - status = apic_read(APIC_ICR) & APIC_ICR_RR_MASK; - } while (status == APIC_ICR_RR_INPROG && timeout++ < 1000); - - switch (status) { - case APIC_ICR_RR_VALID: - status = apic_read(APIC_RRR); - printk("%08x\n", status); - break; - default: - printk("failed\n"); - } - } -} -#endif - -/* - * Kick the secondary to wake up. - */ -static int __cpuinit wakeup_secondary_via_INIT(int phys_apicid, unsigned int start_rip) -{ - unsigned long send_status, accept_status = 0; - int maxlvt, num_starts, j; - - Dprintk("Asserting INIT.\n"); - - /* - * Turn INIT on target chip - */ - apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid)); - - /* - * Send IPI - */ - apic_write(APIC_ICR, APIC_INT_LEVELTRIG | APIC_INT_ASSERT - | APIC_DM_INIT); - - Dprintk("Waiting for send to finish...\n"); - send_status = safe_apic_wait_icr_idle(); - - mdelay(10); - - Dprintk("Deasserting INIT.\n"); - - /* Target chip */ - apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid)); - - /* Send IPI */ - apic_write(APIC_ICR, APIC_INT_LEVELTRIG | APIC_DM_INIT); - - Dprintk("Waiting for send to finish...\n"); - send_status = safe_apic_wait_icr_idle(); - - mb(); - atomic_set(&init_deasserted, 1); - - num_starts = 2; - - /* - * Run STARTUP IPI loop. - */ - Dprintk("#startup loops: %d.\n", num_starts); - - maxlvt = get_maxlvt(); - - for (j = 1; j <= num_starts; j++) { - Dprintk("Sending STARTUP #%d.\n",j); - apic_write(APIC_ESR, 0); - apic_read(APIC_ESR); - Dprintk("After apic_write.\n"); - - /* - * STARTUP IPI - */ - - /* Target chip */ - apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid)); - - /* Boot on the stack */ - /* Kick the second */ - apic_write(APIC_ICR, APIC_DM_STARTUP | (start_rip >> 12)); - - /* - * Give the other CPU some time to accept the IPI. - */ - udelay(300); - - Dprintk("Startup point 1.\n"); - - Dprintk("Waiting for send to finish...\n"); - send_status = safe_apic_wait_icr_idle(); - - /* - * Give the other CPU some time to accept the IPI. - */ - udelay(200); - /* - * Due to the Pentium erratum 3AP. - */ - if (maxlvt > 3) { - apic_write(APIC_ESR, 0); - } - accept_status = (apic_read(APIC_ESR) & 0xEF); - if (send_status || accept_status) - break; - } - Dprintk("After Startup.\n"); - - if (send_status) - printk(KERN_ERR "APIC never delivered???\n"); - if (accept_status) - printk(KERN_ERR "APIC delivery error (%lx).\n", accept_status); - - return (send_status | accept_status); -} - -struct create_idle { - struct work_struct work; - struct task_struct *idle; - struct completion done; - int cpu; -}; - -void do_fork_idle(struct work_struct *work) -{ - struct create_idle *c_idle = - container_of(work, struct create_idle, work); - - c_idle->idle = fork_idle(c_idle->cpu); - complete(&c_idle->done); -} - -/* - * Boot one CPU. - */ -static int __cpuinit do_boot_cpu(int cpu, int apicid) -{ - unsigned long boot_error; - int timeout; - unsigned long start_rip; - struct create_idle c_idle = { - .work = __WORK_INITIALIZER(c_idle.work, do_fork_idle), - .cpu = cpu, - .done = COMPLETION_INITIALIZER_ONSTACK(c_idle.done), - }; - - /* allocate memory for gdts of secondary cpus. Hotplug is considered */ - if (!cpu_gdt_descr[cpu].address && - !(cpu_gdt_descr[cpu].address = get_zeroed_page(GFP_KERNEL))) { - printk(KERN_ERR "Failed to allocate GDT for CPU %d\n", cpu); - return -1; - } - - /* Allocate node local memory for AP pdas */ - if (cpu_pda(cpu) == &boot_cpu_pda[cpu]) { - struct x8664_pda *newpda, *pda; - int node = cpu_to_node(cpu); - pda = cpu_pda(cpu); - newpda = kmalloc_node(sizeof (struct x8664_pda), GFP_ATOMIC, - node); - if (newpda) { - memcpy(newpda, pda, sizeof (struct x8664_pda)); - cpu_pda(cpu) = newpda; - } else - printk(KERN_ERR - "Could not allocate node local PDA for CPU %d on node %d\n", - cpu, node); - } - - alternatives_smp_switch(1); - - c_idle.idle = get_idle_for_cpu(cpu); - - if (c_idle.idle) { - c_idle.idle->thread.rsp = (unsigned long) (((struct pt_regs *) - (THREAD_SIZE + task_stack_page(c_idle.idle))) - 1); - init_idle(c_idle.idle, cpu); - goto do_rest; - } - - /* - * During cold boot process, keventd thread is not spun up yet. - * When we do cpu hot-add, we create idle threads on the fly, we should - * not acquire any attributes from the calling context. Hence the clean - * way to create kernel_threads() is to do that from keventd(). - * We do the current_is_keventd() due to the fact that ACPI notifier - * was also queuing to keventd() and when the caller is already running - * in context of keventd(), we would end up with locking up the keventd - * thread. - */ - if (!keventd_up() || current_is_keventd()) - c_idle.work.func(&c_idle.work); - else { - schedule_work(&c_idle.work); - wait_for_completion(&c_idle.done); - } - - if (IS_ERR(c_idle.idle)) { - printk("failed fork for CPU %d\n", cpu); - return PTR_ERR(c_idle.idle); - } - - set_idle_for_cpu(cpu, c_idle.idle); - -do_rest: - - cpu_pda(cpu)->pcurrent = c_idle.idle; - - start_rip = setup_trampoline(); - - init_rsp = c_idle.idle->thread.rsp; - per_cpu(init_tss,cpu).rsp0 = init_rsp; - initial_code = start_secondary; - clear_tsk_thread_flag(c_idle.idle, TIF_FORK); - - printk(KERN_INFO "Booting processor %d/%d APIC 0x%x\n", cpu, - cpus_weight(cpu_present_map), - apicid); - - /* - * This grunge runs the startup process for - * the targeted processor. - */ - - atomic_set(&init_deasserted, 0); - - Dprintk("Setting warm reset code and vector.\n"); - - CMOS_WRITE(0xa, 0xf); - local_flush_tlb(); - Dprintk("1.\n"); - *((volatile unsigned short *) phys_to_virt(0x469)) = start_rip >> 4; - Dprintk("2.\n"); - *((volatile unsigned short *) phys_to_virt(0x467)) = start_rip & 0xf; - Dprintk("3.\n"); - - /* - * Be paranoid about clearing APIC errors. - */ - apic_write(APIC_ESR, 0); - apic_read(APIC_ESR); - - /* - * Status is now clean - */ - boot_error = 0; - - /* - * Starting actual IPI sequence... - */ - boot_error = wakeup_secondary_via_INIT(apicid, start_rip); - - if (!boot_error) { - /* - * allow APs to start initializing. - */ - Dprintk("Before Callout %d.\n", cpu); - cpu_set(cpu, cpu_callout_map); - Dprintk("After Callout %d.\n", cpu); - - /* - * Wait 5s total for a response - */ - for (timeout = 0; timeout < 50000; timeout++) { - if (cpu_isset(cpu, cpu_callin_map)) - break; /* It has booted */ - udelay(100); - } - - if (cpu_isset(cpu, cpu_callin_map)) { - /* number CPUs logically, starting from 1 (BSP is 0) */ - Dprintk("CPU has booted.\n"); - } else { - boot_error = 1; - if (*((volatile unsigned char *)phys_to_virt(SMP_TRAMPOLINE_BASE)) - == 0xA5) - /* trampoline started but...? */ - printk("Stuck ??\n"); - else - /* trampoline code not run */ - printk("Not responding.\n"); -#ifdef APIC_DEBUG - inquire_remote_apic(apicid); -#endif - } - } - if (boot_error) { - cpu_clear(cpu, cpu_callout_map); /* was set here (do_boot_cpu()) */ - clear_bit(cpu, &cpu_initialized); /* was set by cpu_init() */ - clear_node_cpumask(cpu); /* was set by numa_add_cpu */ - cpu_clear(cpu, cpu_present_map); - cpu_clear(cpu, cpu_possible_map); - x86_cpu_to_apicid[cpu] = BAD_APICID; - x86_cpu_to_log_apicid[cpu] = BAD_APICID; - return -EIO; - } - - return 0; -} - -cycles_t cacheflush_time; -unsigned long cache_decay_ticks; - -/* - * Cleanup possible dangling ends... - */ -static __cpuinit void smp_cleanup_boot(void) -{ - /* - * Paranoid: Set warm reset code and vector here back - * to default values. - */ - CMOS_WRITE(0, 0xf); - - /* - * Reset trampoline flag - */ - *((volatile int *) phys_to_virt(0x467)) = 0; -} - -/* - * Fall back to non SMP mode after errors. - * - * RED-PEN audit/test this more. I bet there is more state messed up here. - */ -static __init void disable_smp(void) -{ - cpu_present_map = cpumask_of_cpu(0); - cpu_possible_map = cpumask_of_cpu(0); - if (smp_found_config) - phys_cpu_present_map = physid_mask_of_physid(boot_cpu_id); - else - phys_cpu_present_map = physid_mask_of_physid(0); - cpu_set(0, cpu_sibling_map[0]); - cpu_set(0, cpu_core_map[0]); -} - -#ifdef CONFIG_HOTPLUG_CPU - -int additional_cpus __initdata = -1; - -/* - * cpu_possible_map should be static, it cannot change as cpu's - * are onlined, or offlined. The reason is per-cpu data-structures - * are allocated by some modules at init time, and dont expect to - * do this dynamically on cpu arrival/departure. - * cpu_present_map on the other hand can change dynamically. - * In case when cpu_hotplug is not compiled, then we resort to current - * behaviour, which is cpu_possible == cpu_present. - * - Ashok Raj - * - * Three ways to find out the number of additional hotplug CPUs: - * - If the BIOS specified disabled CPUs in ACPI/mptables use that. - * - The user can overwrite it with additional_cpus=NUM - * - Otherwise don't reserve additional CPUs. - * We do this because additional CPUs waste a lot of memory. - * -AK - */ -__init void prefill_possible_map(void) -{ - int i; - int possible; - - if (additional_cpus == -1) { - if (disabled_cpus > 0) - additional_cpus = disabled_cpus; - else - additional_cpus = 0; - } - possible = num_processors + additional_cpus; - if (possible > NR_CPUS) - possible = NR_CPUS; - - printk(KERN_INFO "SMP: Allowing %d CPUs, %d hotplug CPUs\n", - possible, - max_t(int, possible - num_processors, 0)); - - for (i = 0; i < possible; i++) - cpu_set(i, cpu_possible_map); -} -#endif - -/* - * Various sanity checks. - */ -static int __init smp_sanity_check(unsigned max_cpus) -{ - if (!physid_isset(hard_smp_processor_id(), phys_cpu_present_map)) { - printk("weird, boot CPU (#%d) not listed by the BIOS.\n", - hard_smp_processor_id()); - physid_set(hard_smp_processor_id(), phys_cpu_present_map); - } - - /* - * If we couldn't find an SMP configuration at boot time, - * get out of here now! - */ - if (!smp_found_config) { - printk(KERN_NOTICE "SMP motherboard not detected.\n"); - disable_smp(); - if (APIC_init_uniprocessor()) - printk(KERN_NOTICE "Local APIC not detected." - " Using dummy APIC emulation.\n"); - return -1; - } - - /* - * Should not be necessary because the MP table should list the boot - * CPU too, but we do it for the sake of robustness anyway. - */ - if (!physid_isset(boot_cpu_id, phys_cpu_present_map)) { - printk(KERN_NOTICE "weird, boot CPU (#%d) not listed by the BIOS.\n", - boot_cpu_id); - physid_set(hard_smp_processor_id(), phys_cpu_present_map); - } - - /* - * If we couldn't find a local APIC, then get out of here now! - */ - if (!cpu_has_apic) { - printk(KERN_ERR "BIOS bug, local APIC #%d not detected!...\n", - boot_cpu_id); - printk(KERN_ERR "... forcing use of dummy APIC emulation. (tell your hw vendor)\n"); - nr_ioapics = 0; - return -1; - } - - /* - * If SMP should be disabled, then really disable it! - */ - if (!max_cpus) { - printk(KERN_INFO "SMP mode deactivated, forcing use of dummy APIC emulation.\n"); - nr_ioapics = 0; - return -1; - } - - return 0; -} - -/* - * Prepare for SMP bootup. The MP table or ACPI has been read - * earlier. Just do some sanity checking here and enable APIC mode. - */ -void __init smp_prepare_cpus(unsigned int max_cpus) -{ - nmi_watchdog_default(); - current_cpu_data = boot_cpu_data; - current_thread_info()->cpu = 0; /* needed? */ - set_cpu_sibling_map(0); - - if (smp_sanity_check(max_cpus) < 0) { - printk(KERN_INFO "SMP disabled\n"); - disable_smp(); - return; - } - - - /* - * Switch from PIC to APIC mode. - */ - setup_local_APIC(); - - if (GET_APIC_ID(apic_read(APIC_ID)) != boot_cpu_id) { - panic("Boot APIC ID in local APIC unexpected (%d vs %d)", - GET_APIC_ID(apic_read(APIC_ID)), boot_cpu_id); - /* Or can we switch back to PIC here? */ - } - - /* - * Now start the IO-APICs - */ - if (!skip_ioapic_setup && nr_ioapics) - setup_IO_APIC(); - else - nr_ioapics = 0; - - /* - * Set up local APIC timer on boot CPU. - */ - - setup_boot_APIC_clock(); -} - -/* - * Early setup to make printk work. - */ -void __init smp_prepare_boot_cpu(void) -{ - int me = smp_processor_id(); - cpu_set(me, cpu_online_map); - cpu_set(me, cpu_callout_map); - per_cpu(cpu_state, me) = CPU_ONLINE; -} - -/* - * Entry point to boot a CPU. - */ -int __cpuinit __cpu_up(unsigned int cpu) -{ - int apicid = cpu_present_to_apicid(cpu); - unsigned long flags; - int err; - - WARN_ON(irqs_disabled()); - - Dprintk("++++++++++++++++++++=_---CPU UP %u\n", cpu); - - if (apicid == BAD_APICID || apicid == boot_cpu_id || - !physid_isset(apicid, phys_cpu_present_map)) { - printk("__cpu_up: bad cpu %d\n", cpu); - return -EINVAL; - } - - /* - * Already booted CPU? - */ - if (cpu_isset(cpu, cpu_callin_map)) { - Dprintk("do_boot_cpu %d Already started\n", cpu); - return -ENOSYS; - } - - /* - * Save current MTRR state in case it was changed since early boot - * (e.g. by the ACPI SMI) to initialize new CPUs with MTRRs in sync: - */ - mtrr_save_state(); - - per_cpu(cpu_state, cpu) = CPU_UP_PREPARE; - /* Boot it! */ - err = do_boot_cpu(cpu, apicid); - if (err < 0) { - Dprintk("do_boot_cpu failed %d\n", err); - return err; - } - - /* Unleash the CPU! */ - Dprintk("waiting for cpu %d\n", cpu); - - /* - * Make sure and check TSC sync: - */ - local_irq_save(flags); - check_tsc_sync_source(cpu); - local_irq_restore(flags); - - while (!cpu_isset(cpu, cpu_online_map)) - cpu_relax(); - err = 0; - - return err; -} - -/* - * Finish the SMP boot. - */ -void __init smp_cpus_done(unsigned int max_cpus) -{ - smp_cleanup_boot(); - setup_ioapic_dest(); - check_nmi_watchdog(); -} - -#ifdef CONFIG_HOTPLUG_CPU - -static void remove_siblinginfo(int cpu) -{ - int sibling; - struct cpuinfo_x86 *c = cpu_data; - - for_each_cpu_mask(sibling, cpu_core_map[cpu]) { - cpu_clear(cpu, cpu_core_map[sibling]); - /* - * last thread sibling in this cpu core going down - */ - if (cpus_weight(cpu_sibling_map[cpu]) == 1) - c[sibling].booted_cores--; - } - - for_each_cpu_mask(sibling, cpu_sibling_map[cpu]) - cpu_clear(cpu, cpu_sibling_map[sibling]); - cpus_clear(cpu_sibling_map[cpu]); - cpus_clear(cpu_core_map[cpu]); - c[cpu].phys_proc_id = 0; - c[cpu].cpu_core_id = 0; - cpu_clear(cpu, cpu_sibling_setup_map); -} - -void remove_cpu_from_maps(void) -{ - int cpu = smp_processor_id(); - - cpu_clear(cpu, cpu_callout_map); - cpu_clear(cpu, cpu_callin_map); - clear_bit(cpu, &cpu_initialized); /* was set by cpu_init() */ - clear_node_cpumask(cpu); -} - -int __cpu_disable(void) -{ - int cpu = smp_processor_id(); - - /* - * Perhaps use cpufreq to drop frequency, but that could go - * into generic code. - * - * We won't take down the boot processor on i386 due to some - * interrupts only being able to be serviced by the BSP. - * Especially so if we're not using an IOAPIC -zwane - */ - if (cpu == 0) - return -EBUSY; - - if (nmi_watchdog == NMI_LOCAL_APIC) - stop_apic_nmi_watchdog(NULL); - clear_local_APIC(); - - /* - * HACK: - * Allow any queued timer interrupts to get serviced - * This is only a temporary solution until we cleanup - * fixup_irqs as we do for IA64. - */ - local_irq_enable(); - mdelay(1); - - local_irq_disable(); - remove_siblinginfo(cpu); - - spin_lock(&vector_lock); - /* It's now safe to remove this processor from the online map */ - cpu_clear(cpu, cpu_online_map); - spin_unlock(&vector_lock); - remove_cpu_from_maps(); - fixup_irqs(cpu_online_map); - return 0; -} - -void __cpu_die(unsigned int cpu) -{ - /* We don't do anything here: idle task is faking death itself. */ - unsigned int i; - - for (i = 0; i < 10; i++) { - /* They ack this in play_dead by setting CPU_DEAD */ - if (per_cpu(cpu_state, cpu) == CPU_DEAD) { - printk ("CPU %d is now offline\n", cpu); - if (1 == num_online_cpus()) - alternatives_smp_switch(0); - return; - } - msleep(100); - } - printk(KERN_ERR "CPU %u didn't die...\n", cpu); -} - -static __init int setup_additional_cpus(char *s) -{ - return s && get_option(&s, &additional_cpus) ? 0 : -EINVAL; -} -early_param("additional_cpus", setup_additional_cpus); - -#else /* ... !CONFIG_HOTPLUG_CPU */ - -int __cpu_disable(void) -{ - return -ENOSYS; -} - -void __cpu_die(unsigned int cpu) -{ - /* We said "no" in __cpu_disable */ - BUG(); -} -#endif /* CONFIG_HOTPLUG_CPU */ diff --git a/arch/x86_64/kernel/stacktrace.c b/arch/x86_64/kernel/stacktrace.c deleted file mode 100644 index cb9109113584..000000000000 --- a/arch/x86_64/kernel/stacktrace.c +++ /dev/null @@ -1,54 +0,0 @@ -/* - * arch/x86_64/kernel/stacktrace.c - * - * Stack trace management functions - * - * Copyright (C) 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> - */ -#include <linux/sched.h> -#include <linux/stacktrace.h> -#include <linux/module.h> -#include <asm/stacktrace.h> - -static void save_stack_warning(void *data, char *msg) -{ -} - -static void -save_stack_warning_symbol(void *data, char *msg, unsigned long symbol) -{ -} - -static int save_stack_stack(void *data, char *name) -{ - return -1; -} - -static void save_stack_address(void *data, unsigned long addr) -{ - struct stack_trace *trace = (struct stack_trace *)data; - if (trace->skip > 0) { - trace->skip--; - return; - } - if (trace->nr_entries < trace->max_entries) - trace->entries[trace->nr_entries++] = addr; -} - -static struct stacktrace_ops save_stack_ops = { - .warning = save_stack_warning, - .warning_symbol = save_stack_warning_symbol, - .stack = save_stack_stack, - .address = save_stack_address, -}; - -/* - * Save stack-backtrace addresses into a stack_trace buffer. - */ -void save_stack_trace(struct stack_trace *trace) -{ - dump_trace(current, NULL, NULL, &save_stack_ops, trace); - if (trace->nr_entries < trace->max_entries) - trace->entries[trace->nr_entries++] = ULONG_MAX; -} -EXPORT_SYMBOL(save_stack_trace); diff --git a/arch/x86_64/kernel/suspend.c b/arch/x86_64/kernel/suspend.c deleted file mode 100644 index 573c0a6e0ac6..000000000000 --- a/arch/x86_64/kernel/suspend.c +++ /dev/null @@ -1,239 +0,0 @@ -/* - * Suspend support specific for i386. - * - * Distribute under GPLv2 - * - * Copyright (c) 2002 Pavel Machek <pavel@suse.cz> - * Copyright (c) 2001 Patrick Mochel <mochel@osdl.org> - */ - -#include <linux/smp.h> -#include <linux/suspend.h> -#include <asm/proto.h> -#include <asm/page.h> -#include <asm/pgtable.h> -#include <asm/mtrr.h> - -/* References to section boundaries */ -extern const void __nosave_begin, __nosave_end; - -struct saved_context saved_context; - -unsigned long saved_context_eax, saved_context_ebx, saved_context_ecx, saved_context_edx; -unsigned long saved_context_esp, saved_context_ebp, saved_context_esi, saved_context_edi; -unsigned long saved_context_r08, saved_context_r09, saved_context_r10, saved_context_r11; -unsigned long saved_context_r12, saved_context_r13, saved_context_r14, saved_context_r15; -unsigned long saved_context_eflags; - -void __save_processor_state(struct saved_context *ctxt) -{ - kernel_fpu_begin(); - - /* - * descriptor tables - */ - asm volatile ("sgdt %0" : "=m" (ctxt->gdt_limit)); - asm volatile ("sidt %0" : "=m" (ctxt->idt_limit)); - asm volatile ("str %0" : "=m" (ctxt->tr)); - - /* XMM0..XMM15 should be handled by kernel_fpu_begin(). */ - /* - * segment registers - */ - asm volatile ("movw %%ds, %0" : "=m" (ctxt->ds)); - asm volatile ("movw %%es, %0" : "=m" (ctxt->es)); - asm volatile ("movw %%fs, %0" : "=m" (ctxt->fs)); - asm volatile ("movw %%gs, %0" : "=m" (ctxt->gs)); - asm volatile ("movw %%ss, %0" : "=m" (ctxt->ss)); - - rdmsrl(MSR_FS_BASE, ctxt->fs_base); - rdmsrl(MSR_GS_BASE, ctxt->gs_base); - rdmsrl(MSR_KERNEL_GS_BASE, ctxt->gs_kernel_base); - mtrr_save_fixed_ranges(NULL); - - /* - * control registers - */ - rdmsrl(MSR_EFER, ctxt->efer); - ctxt->cr0 = read_cr0(); - ctxt->cr2 = read_cr2(); - ctxt->cr3 = read_cr3(); - ctxt->cr4 = read_cr4(); - ctxt->cr8 = read_cr8(); -} - -void save_processor_state(void) -{ - __save_processor_state(&saved_context); -} - -static void do_fpu_end(void) -{ - /* - * Restore FPU regs if necessary - */ - kernel_fpu_end(); -} - -void __restore_processor_state(struct saved_context *ctxt) -{ - /* - * control registers - */ - wrmsrl(MSR_EFER, ctxt->efer); - write_cr8(ctxt->cr8); - write_cr4(ctxt->cr4); - write_cr3(ctxt->cr3); - write_cr2(ctxt->cr2); - write_cr0(ctxt->cr0); - - /* - * now restore the descriptor tables to their proper values - * ltr is done i fix_processor_context(). - */ - asm volatile ("lgdt %0" :: "m" (ctxt->gdt_limit)); - asm volatile ("lidt %0" :: "m" (ctxt->idt_limit)); - - /* - * segment registers - */ - asm volatile ("movw %0, %%ds" :: "r" (ctxt->ds)); - asm volatile ("movw %0, %%es" :: "r" (ctxt->es)); - asm volatile ("movw %0, %%fs" :: "r" (ctxt->fs)); - load_gs_index(ctxt->gs); - asm volatile ("movw %0, %%ss" :: "r" (ctxt->ss)); - - wrmsrl(MSR_FS_BASE, ctxt->fs_base); - wrmsrl(MSR_GS_BASE, ctxt->gs_base); - wrmsrl(MSR_KERNEL_GS_BASE, ctxt->gs_kernel_base); - - fix_processor_context(); - - do_fpu_end(); - mtrr_ap_init(); -} - -void restore_processor_state(void) -{ - __restore_processor_state(&saved_context); -} - -void fix_processor_context(void) -{ - int cpu = smp_processor_id(); - struct tss_struct *t = &per_cpu(init_tss, cpu); - - set_tss_desc(cpu,t); /* This just modifies memory; should not be neccessary. But... This is neccessary, because 386 hardware has concept of busy TSS or some similar stupidity. */ - - cpu_gdt(cpu)[GDT_ENTRY_TSS].type = 9; - - syscall_init(); /* This sets MSR_*STAR and related */ - load_TR_desc(); /* This does ltr */ - load_LDT(¤t->active_mm->context); /* This does lldt */ - - /* - * Now maybe reload the debug registers - */ - if (current->thread.debugreg7){ - loaddebug(¤t->thread, 0); - loaddebug(¤t->thread, 1); - loaddebug(¤t->thread, 2); - loaddebug(¤t->thread, 3); - /* no 4 and 5 */ - loaddebug(¤t->thread, 6); - loaddebug(¤t->thread, 7); - } - -} - -#ifdef CONFIG_HIBERNATION -/* Defined in arch/x86_64/kernel/suspend_asm.S */ -extern int restore_image(void); - -pgd_t *temp_level4_pgt; - -static int res_phys_pud_init(pud_t *pud, unsigned long address, unsigned long end) -{ - long i, j; - - i = pud_index(address); - pud = pud + i; - for (; i < PTRS_PER_PUD; pud++, i++) { - unsigned long paddr; - pmd_t *pmd; - - paddr = address + i*PUD_SIZE; - if (paddr >= end) - break; - - pmd = (pmd_t *)get_safe_page(GFP_ATOMIC); - if (!pmd) - return -ENOMEM; - set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE)); - for (j = 0; j < PTRS_PER_PMD; pmd++, j++, paddr += PMD_SIZE) { - unsigned long pe; - - if (paddr >= end) - break; - pe = _PAGE_NX | _PAGE_PSE | _KERNPG_TABLE | paddr; - pe &= __supported_pte_mask; - set_pmd(pmd, __pmd(pe)); - } - } - return 0; -} - -static int set_up_temporary_mappings(void) -{ - unsigned long start, end, next; - int error; - - temp_level4_pgt = (pgd_t *)get_safe_page(GFP_ATOMIC); - if (!temp_level4_pgt) - return -ENOMEM; - - /* It is safe to reuse the original kernel mapping */ - set_pgd(temp_level4_pgt + pgd_index(__START_KERNEL_map), - init_level4_pgt[pgd_index(__START_KERNEL_map)]); - - /* Set up the direct mapping from scratch */ - start = (unsigned long)pfn_to_kaddr(0); - end = (unsigned long)pfn_to_kaddr(end_pfn); - - for (; start < end; start = next) { - pud_t *pud = (pud_t *)get_safe_page(GFP_ATOMIC); - if (!pud) - return -ENOMEM; - next = start + PGDIR_SIZE; - if (next > end) - next = end; - if ((error = res_phys_pud_init(pud, __pa(start), __pa(next)))) - return error; - set_pgd(temp_level4_pgt + pgd_index(start), - mk_kernel_pgd(__pa(pud))); - } - return 0; -} - -int swsusp_arch_resume(void) -{ - int error; - - /* We have got enough memory and from now on we cannot recover */ - if ((error = set_up_temporary_mappings())) - return error; - restore_image(); - return 0; -} - -/* - * pfn_is_nosave - check if given pfn is in the 'nosave' section - */ - -int pfn_is_nosave(unsigned long pfn) -{ - unsigned long nosave_begin_pfn = __pa_symbol(&__nosave_begin) >> PAGE_SHIFT; - unsigned long nosave_end_pfn = PAGE_ALIGN(__pa_symbol(&__nosave_end)) >> PAGE_SHIFT; - return (pfn >= nosave_begin_pfn) && (pfn < nosave_end_pfn); -} -#endif /* CONFIG_HIBERNATION */ diff --git a/arch/x86_64/kernel/suspend_asm.S b/arch/x86_64/kernel/suspend_asm.S deleted file mode 100644 index 16d183f67bc1..000000000000 --- a/arch/x86_64/kernel/suspend_asm.S +++ /dev/null @@ -1,110 +0,0 @@ -/* Copyright 2004,2005 Pavel Machek <pavel@suse.cz>, Andi Kleen <ak@suse.de>, Rafael J. Wysocki <rjw@sisk.pl> - * - * Distribute under GPLv2. - * - * swsusp_arch_resume may not use any stack, nor any variable that is - * not "NoSave" during copying pages: - * - * Its rewriting one kernel image with another. What is stack in "old" - * image could very well be data page in "new" image, and overwriting - * your own stack under you is bad idea. - */ - - .text -#include <linux/linkage.h> -#include <asm/segment.h> -#include <asm/page.h> -#include <asm/asm-offsets.h> - -ENTRY(swsusp_arch_suspend) - - movq %rsp, saved_context_esp(%rip) - movq %rax, saved_context_eax(%rip) - movq %rbx, saved_context_ebx(%rip) - movq %rcx, saved_context_ecx(%rip) - movq %rdx, saved_context_edx(%rip) - movq %rbp, saved_context_ebp(%rip) - movq %rsi, saved_context_esi(%rip) - movq %rdi, saved_context_edi(%rip) - movq %r8, saved_context_r08(%rip) - movq %r9, saved_context_r09(%rip) - movq %r10, saved_context_r10(%rip) - movq %r11, saved_context_r11(%rip) - movq %r12, saved_context_r12(%rip) - movq %r13, saved_context_r13(%rip) - movq %r14, saved_context_r14(%rip) - movq %r15, saved_context_r15(%rip) - pushfq ; popq saved_context_eflags(%rip) - - call swsusp_save - ret - -ENTRY(restore_image) - /* switch to temporary page tables */ - movq $__PAGE_OFFSET, %rdx - movq temp_level4_pgt(%rip), %rax - subq %rdx, %rax - movq %rax, %cr3 - /* Flush TLB */ - movq mmu_cr4_features(%rip), %rax - movq %rax, %rdx - andq $~(1<<7), %rdx # PGE - movq %rdx, %cr4; # turn off PGE - movq %cr3, %rcx; # flush TLB - movq %rcx, %cr3; - movq %rax, %cr4; # turn PGE back on - - movq restore_pblist(%rip), %rdx -loop: - testq %rdx, %rdx - jz done - - /* get addresses from the pbe and copy the page */ - movq pbe_address(%rdx), %rsi - movq pbe_orig_address(%rdx), %rdi - movq $512, %rcx - rep - movsq - - /* progress to the next pbe */ - movq pbe_next(%rdx), %rdx - jmp loop -done: - /* go back to the original page tables */ - movq $(init_level4_pgt - __START_KERNEL_map), %rax - addq phys_base(%rip), %rax - movq %rax, %cr3 - - /* Flush TLB, including "global" things (vmalloc) */ - movq mmu_cr4_features(%rip), %rax - movq %rax, %rdx - andq $~(1<<7), %rdx; # PGE - movq %rdx, %cr4; # turn off PGE - movq %cr3, %rcx; # flush TLB - movq %rcx, %cr3 - movq %rax, %cr4; # turn PGE back on - - movl $24, %eax - movl %eax, %ds - - movq saved_context_esp(%rip), %rsp - movq saved_context_ebp(%rip), %rbp - /* Don't restore %rax, it must be 0 anyway */ - movq saved_context_ebx(%rip), %rbx - movq saved_context_ecx(%rip), %rcx - movq saved_context_edx(%rip), %rdx - movq saved_context_esi(%rip), %rsi - movq saved_context_edi(%rip), %rdi - movq saved_context_r08(%rip), %r8 - movq saved_context_r09(%rip), %r9 - movq saved_context_r10(%rip), %r10 - movq saved_context_r11(%rip), %r11 - movq saved_context_r12(%rip), %r12 - movq saved_context_r13(%rip), %r13 - movq saved_context_r14(%rip), %r14 - movq saved_context_r15(%rip), %r15 - pushq saved_context_eflags(%rip) ; popfq - - xorq %rax, %rax - - ret diff --git a/arch/x86_64/kernel/sys_x86_64.c b/arch/x86_64/kernel/sys_x86_64.c deleted file mode 100644 index 4770b7a2052c..000000000000 --- a/arch/x86_64/kernel/sys_x86_64.c +++ /dev/null @@ -1,159 +0,0 @@ -/* - * linux/arch/x86_64/kernel/sys_x86_64.c - */ - -#include <linux/errno.h> -#include <linux/sched.h> -#include <linux/syscalls.h> -#include <linux/mm.h> -#include <linux/fs.h> -#include <linux/smp.h> -#include <linux/sem.h> -#include <linux/msg.h> -#include <linux/shm.h> -#include <linux/stat.h> -#include <linux/mman.h> -#include <linux/file.h> -#include <linux/utsname.h> -#include <linux/personality.h> - -#include <asm/uaccess.h> -#include <asm/ia32.h> - -/* - * sys_pipe() is the normal C calling standard for creating - * a pipe. It's not the way Unix traditionally does this, though. - */ -asmlinkage long sys_pipe(int __user *fildes) -{ - int fd[2]; - int error; - - error = do_pipe(fd); - if (!error) { - if (copy_to_user(fildes, fd, 2*sizeof(int))) - error = -EFAULT; - } - return error; -} - -asmlinkage long sys_mmap(unsigned long addr, unsigned long len, unsigned long prot, unsigned long flags, - unsigned long fd, unsigned long off) -{ - long error; - struct file * file; - - error = -EINVAL; - if (off & ~PAGE_MASK) - goto out; - - error = -EBADF; - file = NULL; - flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE); - if (!(flags & MAP_ANONYMOUS)) { - file = fget(fd); - if (!file) - goto out; - } - down_write(¤t->mm->mmap_sem); - error = do_mmap_pgoff(file, addr, len, prot, flags, off >> PAGE_SHIFT); - up_write(¤t->mm->mmap_sem); - - if (file) - fput(file); -out: - return error; -} - -static void find_start_end(unsigned long flags, unsigned long *begin, - unsigned long *end) -{ - if (!test_thread_flag(TIF_IA32) && (flags & MAP_32BIT)) { - /* This is usually used needed to map code in small - model, so it needs to be in the first 31bit. Limit - it to that. This means we need to move the - unmapped base down for this case. This can give - conflicts with the heap, but we assume that glibc - malloc knows how to fall back to mmap. Give it 1GB - of playground for now. -AK */ - *begin = 0x40000000; - *end = 0x80000000; - } else { - *begin = TASK_UNMAPPED_BASE; - *end = TASK_SIZE; - } -} - -unsigned long -arch_get_unmapped_area(struct file *filp, unsigned long addr, - unsigned long len, unsigned long pgoff, unsigned long flags) -{ - struct mm_struct *mm = current->mm; - struct vm_area_struct *vma; - unsigned long start_addr; - unsigned long begin, end; - - if (flags & MAP_FIXED) - return addr; - - find_start_end(flags, &begin, &end); - - if (len > end) - return -ENOMEM; - - if (addr) { - addr = PAGE_ALIGN(addr); - vma = find_vma(mm, addr); - if (end - len >= addr && - (!vma || addr + len <= vma->vm_start)) - return addr; - } - if (((flags & MAP_32BIT) || test_thread_flag(TIF_IA32)) - && len <= mm->cached_hole_size) { - mm->cached_hole_size = 0; - mm->free_area_cache = begin; - } - addr = mm->free_area_cache; - if (addr < begin) - addr = begin; - start_addr = addr; - -full_search: - for (vma = find_vma(mm, addr); ; vma = vma->vm_next) { - /* At this point: (!vma || addr < vma->vm_end). */ - if (end - len < addr) { - /* - * Start a new search - just in case we missed - * some holes. - */ - if (start_addr != begin) { - start_addr = addr = begin; - mm->cached_hole_size = 0; - goto full_search; - } - return -ENOMEM; - } - if (!vma || addr + len <= vma->vm_start) { - /* - * Remember the place where we stopped the search: - */ - mm->free_area_cache = addr + len; - return addr; - } - if (addr + mm->cached_hole_size < vma->vm_start) - mm->cached_hole_size = vma->vm_start - addr; - - addr = vma->vm_end; - } -} - -asmlinkage long sys_uname(struct new_utsname __user * name) -{ - int err; - down_read(&uts_sem); - err = copy_to_user(name, utsname(), sizeof (*name)); - up_read(&uts_sem); - if (personality(current->personality) == PER_LINUX32) - err |= copy_to_user(&name->machine, "i686", 5); - return err ? -EFAULT : 0; -} diff --git a/arch/x86_64/kernel/syscall.c b/arch/x86_64/kernel/syscall.c deleted file mode 100644 index 63d592c276cc..000000000000 --- a/arch/x86_64/kernel/syscall.c +++ /dev/null @@ -1,26 +0,0 @@ -/* System call table for x86-64. */ - -#include <linux/linkage.h> -#include <linux/sys.h> -#include <linux/cache.h> -#include <asm/asm-offsets.h> - -#define __NO_STUBS - -#define __SYSCALL(nr, sym) extern asmlinkage void sym(void) ; -#undef _ASM_X86_64_UNISTD_H_ -#include <asm-x86_64/unistd.h> - -#undef __SYSCALL -#define __SYSCALL(nr, sym) [ nr ] = sym, -#undef _ASM_X86_64_UNISTD_H_ - -typedef void (*sys_call_ptr_t)(void); - -extern void sys_ni_syscall(void); - -const sys_call_ptr_t sys_call_table[__NR_syscall_max+1] = { - /* Smells like a like a compiler bug -- it doesn't work when the & below is removed. */ - [0 ... __NR_syscall_max] = &sys_ni_syscall, -#include <asm-x86_64/unistd.h> -}; diff --git a/arch/x86_64/kernel/tce.c b/arch/x86_64/kernel/tce.c deleted file mode 100644 index e3f2569b2c44..000000000000 --- a/arch/x86_64/kernel/tce.c +++ /dev/null @@ -1,189 +0,0 @@ -/* - * This file manages the translation entries for the IBM Calgary IOMMU. - * - * Derived from arch/powerpc/platforms/pseries/iommu.c - * - * Copyright (C) IBM Corporation, 2006 - * - * Author: Jon Mason <jdmason@us.ibm.com> - * Author: Muli Ben-Yehuda <muli@il.ibm.com> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - */ - -#include <linux/types.h> -#include <linux/slab.h> -#include <linux/mm.h> -#include <linux/spinlock.h> -#include <linux/string.h> -#include <linux/pci.h> -#include <linux/dma-mapping.h> -#include <linux/bootmem.h> -#include <asm/tce.h> -#include <asm/calgary.h> -#include <asm/proto.h> - -/* flush a tce at 'tceaddr' to main memory */ -static inline void flush_tce(void* tceaddr) -{ - /* a single tce can't cross a cache line */ - if (cpu_has_clflush) - asm volatile("clflush (%0)" :: "r" (tceaddr)); - else - asm volatile("wbinvd":::"memory"); -} - -void tce_build(struct iommu_table *tbl, unsigned long index, - unsigned int npages, unsigned long uaddr, int direction) -{ - u64* tp; - u64 t; - u64 rpn; - - t = (1 << TCE_READ_SHIFT); - if (direction != DMA_TO_DEVICE) - t |= (1 << TCE_WRITE_SHIFT); - - tp = ((u64*)tbl->it_base) + index; - - while (npages--) { - rpn = (virt_to_bus((void*)uaddr)) >> PAGE_SHIFT; - t &= ~TCE_RPN_MASK; - t |= (rpn << TCE_RPN_SHIFT); - - *tp = cpu_to_be64(t); - flush_tce(tp); - - uaddr += PAGE_SIZE; - tp++; - } -} - -void tce_free(struct iommu_table *tbl, long index, unsigned int npages) -{ - u64* tp; - - tp = ((u64*)tbl->it_base) + index; - - while (npages--) { - *tp = cpu_to_be64(0); - flush_tce(tp); - tp++; - } -} - -static inline unsigned int table_size_to_number_of_entries(unsigned char size) -{ - /* - * size is the order of the table, 0-7 - * smallest table is 8K entries, so shift result by 13 to - * multiply by 8K - */ - return (1 << size) << 13; -} - -static int tce_table_setparms(struct pci_dev *dev, struct iommu_table *tbl) -{ - unsigned int bitmapsz; - unsigned long bmppages; - int ret; - - tbl->it_busno = dev->bus->number; - - /* set the tce table size - measured in entries */ - tbl->it_size = table_size_to_number_of_entries(specified_table_size); - - /* - * number of bytes needed for the bitmap size in number of - * entries; we need one bit per entry - */ - bitmapsz = tbl->it_size / BITS_PER_BYTE; - bmppages = __get_free_pages(GFP_KERNEL, get_order(bitmapsz)); - if (!bmppages) { - printk(KERN_ERR "Calgary: cannot allocate bitmap\n"); - ret = -ENOMEM; - goto done; - } - - tbl->it_map = (unsigned long*)bmppages; - - memset(tbl->it_map, 0, bitmapsz); - - tbl->it_hint = 0; - - spin_lock_init(&tbl->it_lock); - - return 0; - -done: - return ret; -} - -int __init build_tce_table(struct pci_dev *dev, void __iomem *bbar) -{ - struct iommu_table *tbl; - int ret; - - if (pci_iommu(dev->bus)) { - printk(KERN_ERR "Calgary: dev %p has sysdata->iommu %p\n", - dev, pci_iommu(dev->bus)); - BUG(); - } - - tbl = kzalloc(sizeof(struct iommu_table), GFP_KERNEL); - if (!tbl) { - printk(KERN_ERR "Calgary: error allocating iommu_table\n"); - ret = -ENOMEM; - goto done; - } - - ret = tce_table_setparms(dev, tbl); - if (ret) - goto free_tbl; - - tbl->bbar = bbar; - - set_pci_iommu(dev->bus, tbl); - - return 0; - -free_tbl: - kfree(tbl); -done: - return ret; -} - -void * __init alloc_tce_table(void) -{ - unsigned int size; - - size = table_size_to_number_of_entries(specified_table_size); - size *= TCE_ENTRY_SIZE; - - return __alloc_bootmem_low(size, size, 0); -} - -void __init free_tce_table(void *tbl) -{ - unsigned int size; - - if (!tbl) - return; - - size = table_size_to_number_of_entries(specified_table_size); - size *= TCE_ENTRY_SIZE; - - free_bootmem(__pa(tbl), size); -} diff --git a/arch/x86_64/kernel/time.c b/arch/x86_64/kernel/time.c deleted file mode 100644 index 6d48a4e826d9..000000000000 --- a/arch/x86_64/kernel/time.c +++ /dev/null @@ -1,447 +0,0 @@ -/* - * linux/arch/x86-64/kernel/time.c - * - * "High Precision Event Timer" based timekeeping. - * - * Copyright (c) 1991,1992,1995 Linus Torvalds - * Copyright (c) 1994 Alan Modra - * Copyright (c) 1995 Markus Kuhn - * Copyright (c) 1996 Ingo Molnar - * Copyright (c) 1998 Andrea Arcangeli - * Copyright (c) 2002,2006 Vojtech Pavlik - * Copyright (c) 2003 Andi Kleen - * RTC support code taken from arch/i386/kernel/timers/time_hpet.c - */ - -#include <linux/kernel.h> -#include <linux/sched.h> -#include <linux/interrupt.h> -#include <linux/init.h> -#include <linux/mc146818rtc.h> -#include <linux/time.h> -#include <linux/ioport.h> -#include <linux/module.h> -#include <linux/device.h> -#include <linux/sysdev.h> -#include <linux/bcd.h> -#include <linux/notifier.h> -#include <linux/cpu.h> -#include <linux/kallsyms.h> -#include <linux/acpi.h> -#ifdef CONFIG_ACPI -#include <acpi/achware.h> /* for PM timer frequency */ -#include <acpi/acpi_bus.h> -#endif -#include <asm/8253pit.h> -#include <asm/i8253.h> -#include <asm/pgtable.h> -#include <asm/vsyscall.h> -#include <asm/timex.h> -#include <asm/proto.h> -#include <asm/hpet.h> -#include <asm/sections.h> -#include <linux/hpet.h> -#include <asm/apic.h> -#include <asm/hpet.h> -#include <asm/mpspec.h> -#include <asm/nmi.h> -#include <asm/vgtod.h> - -static char *timename = NULL; - -DEFINE_SPINLOCK(rtc_lock); -EXPORT_SYMBOL(rtc_lock); -DEFINE_SPINLOCK(i8253_lock); -EXPORT_SYMBOL(i8253_lock); - -volatile unsigned long __jiffies __section_jiffies = INITIAL_JIFFIES; - -unsigned long profile_pc(struct pt_regs *regs) -{ - unsigned long pc = instruction_pointer(regs); - - /* Assume the lock function has either no stack frame or a copy - of eflags from PUSHF - Eflags always has bits 22 and up cleared unlike kernel addresses. */ - if (!user_mode(regs) && in_lock_functions(pc)) { - unsigned long *sp = (unsigned long *)regs->rsp; - if (sp[0] >> 22) - return sp[0]; - if (sp[1] >> 22) - return sp[1]; - } - return pc; -} -EXPORT_SYMBOL(profile_pc); - -/* - * In order to set the CMOS clock precisely, set_rtc_mmss has to be called 500 - * ms after the second nowtime has started, because when nowtime is written - * into the registers of the CMOS clock, it will jump to the next second - * precisely 500 ms later. Check the Motorola MC146818A or Dallas DS12887 data - * sheet for details. - */ - -static int set_rtc_mmss(unsigned long nowtime) -{ - int retval = 0; - int real_seconds, real_minutes, cmos_minutes; - unsigned char control, freq_select; - -/* - * IRQs are disabled when we're called from the timer interrupt, - * no need for spin_lock_irqsave() - */ - - spin_lock(&rtc_lock); - -/* - * Tell the clock it's being set and stop it. - */ - - control = CMOS_READ(RTC_CONTROL); - CMOS_WRITE(control | RTC_SET, RTC_CONTROL); - - freq_select = CMOS_READ(RTC_FREQ_SELECT); - CMOS_WRITE(freq_select | RTC_DIV_RESET2, RTC_FREQ_SELECT); - - cmos_minutes = CMOS_READ(RTC_MINUTES); - BCD_TO_BIN(cmos_minutes); - -/* - * since we're only adjusting minutes and seconds, don't interfere with hour - * overflow. This avoids messing with unknown time zones but requires your RTC - * not to be off by more than 15 minutes. Since we're calling it only when - * our clock is externally synchronized using NTP, this shouldn't be a problem. - */ - - real_seconds = nowtime % 60; - real_minutes = nowtime / 60; - if (((abs(real_minutes - cmos_minutes) + 15) / 30) & 1) - real_minutes += 30; /* correct for half hour time zone */ - real_minutes %= 60; - - if (abs(real_minutes - cmos_minutes) >= 30) { - printk(KERN_WARNING "time.c: can't update CMOS clock " - "from %d to %d\n", cmos_minutes, real_minutes); - retval = -1; - } else { - BIN_TO_BCD(real_seconds); - BIN_TO_BCD(real_minutes); - CMOS_WRITE(real_seconds, RTC_SECONDS); - CMOS_WRITE(real_minutes, RTC_MINUTES); - } - -/* - * The following flags have to be released exactly in this order, otherwise the - * DS12887 (popular MC146818A clone with integrated battery and quartz) will - * not reset the oscillator and will not update precisely 500 ms later. You - * won't find this mentioned in the Dallas Semiconductor data sheets, but who - * believes data sheets anyway ... -- Markus Kuhn - */ - - CMOS_WRITE(control, RTC_CONTROL); - CMOS_WRITE(freq_select, RTC_FREQ_SELECT); - - spin_unlock(&rtc_lock); - - return retval; -} - -int update_persistent_clock(struct timespec now) -{ - return set_rtc_mmss(now.tv_sec); -} - -void main_timer_handler(void) -{ -/* - * Here we are in the timer irq handler. We have irqs locally disabled (so we - * don't need spin_lock_irqsave()) but we don't know if the timer_bh is running - * on the other CPU, so we need a lock. We also need to lock the vsyscall - * variables, because both do_timer() and us change them -arca+vojtech - */ - - write_seqlock(&xtime_lock); - -/* - * Do the timer stuff. - */ - - do_timer(1); -#ifndef CONFIG_SMP - update_process_times(user_mode(get_irq_regs())); -#endif - -/* - * In the SMP case we use the local APIC timer interrupt to do the profiling, - * except when we simulate SMP mode on a uniprocessor system, in that case we - * have to call the local interrupt handler. - */ - - if (!using_apic_timer) - smp_local_timer_interrupt(); - - write_sequnlock(&xtime_lock); -} - -static irqreturn_t timer_interrupt(int irq, void *dev_id) -{ - if (apic_runs_main_timer > 1) - return IRQ_HANDLED; - main_timer_handler(); - if (using_apic_timer) - smp_send_timer_broadcast_ipi(); - return IRQ_HANDLED; -} - -unsigned long read_persistent_clock(void) -{ - unsigned int year, mon, day, hour, min, sec; - unsigned long flags; - unsigned century = 0; - - spin_lock_irqsave(&rtc_lock, flags); - - do { - sec = CMOS_READ(RTC_SECONDS); - min = CMOS_READ(RTC_MINUTES); - hour = CMOS_READ(RTC_HOURS); - day = CMOS_READ(RTC_DAY_OF_MONTH); - mon = CMOS_READ(RTC_MONTH); - year = CMOS_READ(RTC_YEAR); -#ifdef CONFIG_ACPI - if (acpi_gbl_FADT.header.revision >= FADT2_REVISION_ID && - acpi_gbl_FADT.century) - century = CMOS_READ(acpi_gbl_FADT.century); -#endif - } while (sec != CMOS_READ(RTC_SECONDS)); - - spin_unlock_irqrestore(&rtc_lock, flags); - - /* - * We know that x86-64 always uses BCD format, no need to check the - * config register. - */ - - BCD_TO_BIN(sec); - BCD_TO_BIN(min); - BCD_TO_BIN(hour); - BCD_TO_BIN(day); - BCD_TO_BIN(mon); - BCD_TO_BIN(year); - - if (century) { - BCD_TO_BIN(century); - year += century * 100; - printk(KERN_INFO "Extended CMOS year: %d\n", century * 100); - } else { - /* - * x86-64 systems only exists since 2002. - * This will work up to Dec 31, 2100 - */ - year += 2000; - } - - return mktime(year, mon, day, hour, min, sec); -} - -/* calibrate_cpu is used on systems with fixed rate TSCs to determine - * processor frequency */ -#define TICK_COUNT 100000000 -static unsigned int __init tsc_calibrate_cpu_khz(void) -{ - int tsc_start, tsc_now; - int i, no_ctr_free; - unsigned long evntsel3 = 0, pmc3 = 0, pmc_now = 0; - unsigned long flags; - - for (i = 0; i < 4; i++) - if (avail_to_resrv_perfctr_nmi_bit(i)) - break; - no_ctr_free = (i == 4); - if (no_ctr_free) { - i = 3; - rdmsrl(MSR_K7_EVNTSEL3, evntsel3); - wrmsrl(MSR_K7_EVNTSEL3, 0); - rdmsrl(MSR_K7_PERFCTR3, pmc3); - } else { - reserve_perfctr_nmi(MSR_K7_PERFCTR0 + i); - reserve_evntsel_nmi(MSR_K7_EVNTSEL0 + i); - } - local_irq_save(flags); - /* start meauring cycles, incrementing from 0 */ - wrmsrl(MSR_K7_PERFCTR0 + i, 0); - wrmsrl(MSR_K7_EVNTSEL0 + i, 1 << 22 | 3 << 16 | 0x76); - rdtscl(tsc_start); - do { - rdmsrl(MSR_K7_PERFCTR0 + i, pmc_now); - tsc_now = get_cycles_sync(); - } while ((tsc_now - tsc_start) < TICK_COUNT); - - local_irq_restore(flags); - if (no_ctr_free) { - wrmsrl(MSR_K7_EVNTSEL3, 0); - wrmsrl(MSR_K7_PERFCTR3, pmc3); - wrmsrl(MSR_K7_EVNTSEL3, evntsel3); - } else { - release_perfctr_nmi(MSR_K7_PERFCTR0 + i); - release_evntsel_nmi(MSR_K7_EVNTSEL0 + i); - } - - return pmc_now * tsc_khz / (tsc_now - tsc_start); -} - -/* - * pit_calibrate_tsc() uses the speaker output (channel 2) of - * the PIT. This is better than using the timer interrupt output, - * because we can read the value of the speaker with just one inb(), - * where we need three i/o operations for the interrupt channel. - * We count how many ticks the TSC does in 50 ms. - */ - -static unsigned int __init pit_calibrate_tsc(void) -{ - unsigned long start, end; - unsigned long flags; - - spin_lock_irqsave(&i8253_lock, flags); - - outb((inb(0x61) & ~0x02) | 0x01, 0x61); - - outb(0xb0, 0x43); - outb((PIT_TICK_RATE / (1000 / 50)) & 0xff, 0x42); - outb((PIT_TICK_RATE / (1000 / 50)) >> 8, 0x42); - start = get_cycles_sync(); - while ((inb(0x61) & 0x20) == 0); - end = get_cycles_sync(); - - spin_unlock_irqrestore(&i8253_lock, flags); - - return (end - start) / 50; -} - -#define PIT_MODE 0x43 -#define PIT_CH0 0x40 - -static void __pit_init(int val, u8 mode) -{ - unsigned long flags; - - spin_lock_irqsave(&i8253_lock, flags); - outb_p(mode, PIT_MODE); - outb_p(val & 0xff, PIT_CH0); /* LSB */ - outb_p(val >> 8, PIT_CH0); /* MSB */ - spin_unlock_irqrestore(&i8253_lock, flags); -} - -void __init pit_init(void) -{ - __pit_init(LATCH, 0x34); /* binary, mode 2, LSB/MSB, ch 0 */ -} - -void pit_stop_interrupt(void) -{ - __pit_init(0, 0x30); /* mode 0 */ -} - -void stop_timer_interrupt(void) -{ - char *name; - if (hpet_address) { - name = "HPET"; - hpet_timer_stop_set_go(0); - } else { - name = "PIT"; - pit_stop_interrupt(); - } - printk(KERN_INFO "timer: %s interrupt stopped.\n", name); -} - -static struct irqaction irq0 = { - .handler = timer_interrupt, - .flags = IRQF_DISABLED | IRQF_IRQPOLL, - .mask = CPU_MASK_NONE, - .name = "timer" -}; - -void __init time_init(void) -{ - if (nohpet) - hpet_address = 0; - - if (hpet_arch_init()) - hpet_address = 0; - - if (hpet_use_timer) { - /* set tick_nsec to use the proper rate for HPET */ - tick_nsec = TICK_NSEC_HPET; - tsc_khz = hpet_calibrate_tsc(); - timename = "HPET"; - } else { - pit_init(); - tsc_khz = pit_calibrate_tsc(); - timename = "PIT"; - } - - cpu_khz = tsc_khz; - if (cpu_has(&boot_cpu_data, X86_FEATURE_CONSTANT_TSC) && - boot_cpu_data.x86_vendor == X86_VENDOR_AMD && - boot_cpu_data.x86 == 16) - cpu_khz = tsc_calibrate_cpu_khz(); - - if (unsynchronized_tsc()) - mark_tsc_unstable("TSCs unsynchronized"); - - if (cpu_has(&boot_cpu_data, X86_FEATURE_RDTSCP)) - vgetcpu_mode = VGETCPU_RDTSCP; - else - vgetcpu_mode = VGETCPU_LSL; - - set_cyc2ns_scale(tsc_khz); - printk(KERN_INFO "time.c: Detected %d.%03d MHz processor.\n", - cpu_khz / 1000, cpu_khz % 1000); - init_tsc_clocksource(); - - setup_irq(0, &irq0); -} - -/* - * sysfs support for the timer. - */ - -static int timer_suspend(struct sys_device *dev, pm_message_t state) -{ - return 0; -} - -static int timer_resume(struct sys_device *dev) -{ - if (hpet_address) - hpet_reenable(); - else - i8254_timer_resume(); - return 0; -} - -static struct sysdev_class timer_sysclass = { - .resume = timer_resume, - .suspend = timer_suspend, - set_kset_name("timer"), -}; - -/* XXX this sysfs stuff should probably go elsewhere later -john */ -static struct sys_device device_timer = { - .id = 0, - .cls = &timer_sysclass, -}; - -static int time_init_device(void) -{ - int error = sysdev_class_register(&timer_sysclass); - if (!error) - error = sysdev_register(&device_timer); - return error; -} - -device_initcall(time_init_device); diff --git a/arch/x86_64/kernel/trampoline.S b/arch/x86_64/kernel/trampoline.S deleted file mode 100644 index e7e2764c461b..000000000000 --- a/arch/x86_64/kernel/trampoline.S +++ /dev/null @@ -1,166 +0,0 @@ -/* - * - * Trampoline.S Derived from Setup.S by Linus Torvalds - * - * 4 Jan 1997 Michael Chastain: changed to gnu as. - * 15 Sept 2005 Eric Biederman: 64bit PIC support - * - * Entry: CS:IP point to the start of our code, we are - * in real mode with no stack, but the rest of the - * trampoline page to make our stack and everything else - * is a mystery. - * - * In fact we don't actually need a stack so we don't - * set one up. - * - * On entry to trampoline_data, the processor is in real mode - * with 16-bit addressing and 16-bit data. CS has some value - * and IP is zero. Thus, data addresses need to be absolute - * (no relocation) and are taken with regard to r_base. - * - * With the addition of trampoline_level4_pgt this code can - * now enter a 64bit kernel that lives at arbitrary 64bit - * physical addresses. - * - * If you work on this file, check the object module with objdump - * --full-contents --reloc to make sure there are no relocation - * entries. - */ - -#include <linux/linkage.h> -#include <asm/pgtable.h> -#include <asm/page.h> -#include <asm/msr.h> -#include <asm/segment.h> - -.data - -.code16 - -ENTRY(trampoline_data) -r_base = . - cli # We should be safe anyway - wbinvd - mov %cs, %ax # Code and data in the same place - mov %ax, %ds - mov %ax, %es - mov %ax, %ss - - - movl $0xA5A5A5A5, trampoline_data - r_base - # write marker for master knows we're running - - # Setup stack - movw $(trampoline_stack_end - r_base), %sp - - call verify_cpu # Verify the cpu supports long mode - testl %eax, %eax # Check for return code - jnz no_longmode - - mov %cs, %ax - movzx %ax, %esi # Find the 32bit trampoline location - shll $4, %esi - - # Fixup the vectors - addl %esi, startup_32_vector - r_base - addl %esi, startup_64_vector - r_base - addl %esi, tgdt + 2 - r_base # Fixup the gdt pointer - - /* - * GDT tables in non default location kernel can be beyond 16MB and - * lgdt will not be able to load the address as in real mode default - * operand size is 16bit. Use lgdtl instead to force operand size - * to 32 bit. - */ - - lidtl tidt - r_base # load idt with 0, 0 - lgdtl tgdt - r_base # load gdt with whatever is appropriate - - xor %ax, %ax - inc %ax # protected mode (PE) bit - lmsw %ax # into protected mode - - # flush prefetch and jump to startup_32 - ljmpl *(startup_32_vector - r_base) - - .code32 - .balign 4 -startup_32: - movl $__KERNEL_DS, %eax # Initialize the %ds segment register - movl %eax, %ds - - xorl %eax, %eax - btsl $5, %eax # Enable PAE mode - movl %eax, %cr4 - - # Setup trampoline 4 level pagetables - leal (trampoline_level4_pgt - r_base)(%esi), %eax - movl %eax, %cr3 - - movl $MSR_EFER, %ecx - movl $(1 << _EFER_LME), %eax # Enable Long Mode - xorl %edx, %edx - wrmsr - - xorl %eax, %eax - btsl $31, %eax # Enable paging and in turn activate Long Mode - btsl $0, %eax # Enable protected mode - movl %eax, %cr0 - - /* - * At this point we're in long mode but in 32bit compatibility mode - * with EFER.LME = 1, CS.L = 0, CS.D = 1 (and in turn - * EFER.LMA = 1). Now we want to jump in 64bit mode, to do that we use - * the new gdt/idt that has __KERNEL_CS with CS.L = 1. - */ - ljmp *(startup_64_vector - r_base)(%esi) - - .code64 - .balign 4 -startup_64: - # Now jump into the kernel using virtual addresses - movq $secondary_startup_64, %rax - jmp *%rax - - .code16 -no_longmode: - hlt - jmp no_longmode -#include "verify_cpu.S" - - # Careful these need to be in the same 64K segment as the above; -tidt: - .word 0 # idt limit = 0 - .word 0, 0 # idt base = 0L - - # Duplicate the global descriptor table - # so the kernel can live anywhere - .balign 4 -tgdt: - .short tgdt_end - tgdt # gdt limit - .long tgdt - r_base - .short 0 - .quad 0x00cf9b000000ffff # __KERNEL32_CS - .quad 0x00af9b000000ffff # __KERNEL_CS - .quad 0x00cf93000000ffff # __KERNEL_DS -tgdt_end: - - .balign 4 -startup_32_vector: - .long startup_32 - r_base - .word __KERNEL32_CS, 0 - - .balign 4 -startup_64_vector: - .long startup_64 - r_base - .word __KERNEL_CS, 0 - -trampoline_stack: - .org 0x1000 -trampoline_stack_end: -ENTRY(trampoline_level4_pgt) - .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE - .fill 510,8,0 - .quad level3_kernel_pgt - __START_KERNEL_map + _KERNPG_TABLE - -ENTRY(trampoline_end) diff --git a/arch/x86_64/kernel/traps.c b/arch/x86_64/kernel/traps.c deleted file mode 100644 index 03888420775d..000000000000 --- a/arch/x86_64/kernel/traps.c +++ /dev/null @@ -1,1138 +0,0 @@ -/* - * linux/arch/x86-64/traps.c - * - * Copyright (C) 1991, 1992 Linus Torvalds - * Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs - * - * Pentium III FXSR, SSE support - * Gareth Hughes <gareth@valinux.com>, May 2000 - */ - -/* - * 'Traps.c' handles hardware traps and faults after we have saved some - * state in 'entry.S'. - */ -#include <linux/sched.h> -#include <linux/kernel.h> -#include <linux/string.h> -#include <linux/errno.h> -#include <linux/ptrace.h> -#include <linux/timer.h> -#include <linux/mm.h> -#include <linux/init.h> -#include <linux/delay.h> -#include <linux/spinlock.h> -#include <linux/interrupt.h> -#include <linux/kallsyms.h> -#include <linux/module.h> -#include <linux/moduleparam.h> -#include <linux/nmi.h> -#include <linux/kprobes.h> -#include <linux/kexec.h> -#include <linux/unwind.h> -#include <linux/uaccess.h> -#include <linux/bug.h> -#include <linux/kdebug.h> - -#if defined(CONFIG_EDAC) -#include <linux/edac.h> -#endif - -#include <asm/system.h> -#include <asm/io.h> -#include <asm/atomic.h> -#include <asm/debugreg.h> -#include <asm/desc.h> -#include <asm/i387.h> -#include <asm/processor.h> -#include <asm/unwind.h> -#include <asm/smp.h> -#include <asm/pgalloc.h> -#include <asm/pda.h> -#include <asm/proto.h> -#include <asm/nmi.h> -#include <asm/stacktrace.h> - -asmlinkage void divide_error(void); -asmlinkage void debug(void); -asmlinkage void nmi(void); -asmlinkage void int3(void); -asmlinkage void overflow(void); -asmlinkage void bounds(void); -asmlinkage void invalid_op(void); -asmlinkage void device_not_available(void); -asmlinkage void double_fault(void); -asmlinkage void coprocessor_segment_overrun(void); -asmlinkage void invalid_TSS(void); -asmlinkage void segment_not_present(void); -asmlinkage void stack_segment(void); -asmlinkage void general_protection(void); -asmlinkage void page_fault(void); -asmlinkage void coprocessor_error(void); -asmlinkage void simd_coprocessor_error(void); -asmlinkage void reserved(void); -asmlinkage void alignment_check(void); -asmlinkage void machine_check(void); -asmlinkage void spurious_interrupt_bug(void); - -static inline void conditional_sti(struct pt_regs *regs) -{ - if (regs->eflags & X86_EFLAGS_IF) - local_irq_enable(); -} - -static inline void preempt_conditional_sti(struct pt_regs *regs) -{ - preempt_disable(); - if (regs->eflags & X86_EFLAGS_IF) - local_irq_enable(); -} - -static inline void preempt_conditional_cli(struct pt_regs *regs) -{ - if (regs->eflags & X86_EFLAGS_IF) - local_irq_disable(); - /* Make sure to not schedule here because we could be running - on an exception stack. */ - preempt_enable_no_resched(); -} - -int kstack_depth_to_print = 12; - -#ifdef CONFIG_KALLSYMS -void printk_address(unsigned long address) -{ - unsigned long offset = 0, symsize; - const char *symname; - char *modname; - char *delim = ":"; - char namebuf[128]; - - symname = kallsyms_lookup(address, &symsize, &offset, - &modname, namebuf); - if (!symname) { - printk(" [<%016lx>]\n", address); - return; - } - if (!modname) - modname = delim = ""; - printk(" [<%016lx>] %s%s%s%s+0x%lx/0x%lx\n", - address, delim, modname, delim, symname, offset, symsize); -} -#else -void printk_address(unsigned long address) -{ - printk(" [<%016lx>]\n", address); -} -#endif - -static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack, - unsigned *usedp, char **idp) -{ - static char ids[][8] = { - [DEBUG_STACK - 1] = "#DB", - [NMI_STACK - 1] = "NMI", - [DOUBLEFAULT_STACK - 1] = "#DF", - [STACKFAULT_STACK - 1] = "#SS", - [MCE_STACK - 1] = "#MC", -#if DEBUG_STKSZ > EXCEPTION_STKSZ - [N_EXCEPTION_STACKS ... N_EXCEPTION_STACKS + DEBUG_STKSZ / EXCEPTION_STKSZ - 2] = "#DB[?]" -#endif - }; - unsigned k; - - /* - * Iterate over all exception stacks, and figure out whether - * 'stack' is in one of them: - */ - for (k = 0; k < N_EXCEPTION_STACKS; k++) { - unsigned long end = per_cpu(orig_ist, cpu).ist[k]; - /* - * Is 'stack' above this exception frame's end? - * If yes then skip to the next frame. - */ - if (stack >= end) - continue; - /* - * Is 'stack' above this exception frame's start address? - * If yes then we found the right frame. - */ - if (stack >= end - EXCEPTION_STKSZ) { - /* - * Make sure we only iterate through an exception - * stack once. If it comes up for the second time - * then there's something wrong going on - just - * break out and return NULL: - */ - if (*usedp & (1U << k)) - break; - *usedp |= 1U << k; - *idp = ids[k]; - return (unsigned long *)end; - } - /* - * If this is a debug stack, and if it has a larger size than - * the usual exception stacks, then 'stack' might still - * be within the lower portion of the debug stack: - */ -#if DEBUG_STKSZ > EXCEPTION_STKSZ - if (k == DEBUG_STACK - 1 && stack >= end - DEBUG_STKSZ) { - unsigned j = N_EXCEPTION_STACKS - 1; - - /* - * Black magic. A large debug stack is composed of - * multiple exception stack entries, which we - * iterate through now. Dont look: - */ - do { - ++j; - end -= EXCEPTION_STKSZ; - ids[j][4] = '1' + (j - N_EXCEPTION_STACKS); - } while (stack < end - EXCEPTION_STKSZ); - if (*usedp & (1U << j)) - break; - *usedp |= 1U << j; - *idp = ids[j]; - return (unsigned long *)end; - } -#endif - } - return NULL; -} - -#define MSG(txt) ops->warning(data, txt) - -/* - * x86-64 can have upto three kernel stacks: - * process stack - * interrupt stack - * severe exception (double fault, nmi, stack fault, debug, mce) hardware stack - */ - -static inline int valid_stack_ptr(struct thread_info *tinfo, void *p) -{ - void *t = (void *)tinfo; - return p > t && p < t + THREAD_SIZE - 3; -} - -void dump_trace(struct task_struct *tsk, struct pt_regs *regs, - unsigned long *stack, - struct stacktrace_ops *ops, void *data) -{ - const unsigned cpu = get_cpu(); - unsigned long *irqstack_end = (unsigned long*)cpu_pda(cpu)->irqstackptr; - unsigned used = 0; - struct thread_info *tinfo; - - if (!tsk) - tsk = current; - - if (!stack) { - unsigned long dummy; - stack = &dummy; - if (tsk && tsk != current) - stack = (unsigned long *)tsk->thread.rsp; - } - - /* - * Print function call entries within a stack. 'cond' is the - * "end of stackframe" condition, that the 'stack++' - * iteration will eventually trigger. - */ -#define HANDLE_STACK(cond) \ - do while (cond) { \ - unsigned long addr = *stack++; \ - /* Use unlocked access here because except for NMIs \ - we should be already protected against module unloads */ \ - if (__kernel_text_address(addr)) { \ - /* \ - * If the address is either in the text segment of the \ - * kernel, or in the region which contains vmalloc'ed \ - * memory, it *may* be the address of a calling \ - * routine; if so, print it so that someone tracing \ - * down the cause of the crash will be able to figure \ - * out the call path that was taken. \ - */ \ - ops->address(data, addr); \ - } \ - } while (0) - - /* - * Print function call entries in all stacks, starting at the - * current stack address. If the stacks consist of nested - * exceptions - */ - for (;;) { - char *id; - unsigned long *estack_end; - estack_end = in_exception_stack(cpu, (unsigned long)stack, - &used, &id); - - if (estack_end) { - if (ops->stack(data, id) < 0) - break; - HANDLE_STACK (stack < estack_end); - ops->stack(data, "<EOE>"); - /* - * We link to the next stack via the - * second-to-last pointer (index -2 to end) in the - * exception stack: - */ - stack = (unsigned long *) estack_end[-2]; - continue; - } - if (irqstack_end) { - unsigned long *irqstack; - irqstack = irqstack_end - - (IRQSTACKSIZE - 64) / sizeof(*irqstack); - - if (stack >= irqstack && stack < irqstack_end) { - if (ops->stack(data, "IRQ") < 0) - break; - HANDLE_STACK (stack < irqstack_end); - /* - * We link to the next stack (which would be - * the process stack normally) the last - * pointer (index -1 to end) in the IRQ stack: - */ - stack = (unsigned long *) (irqstack_end[-1]); - irqstack_end = NULL; - ops->stack(data, "EOI"); - continue; - } - } - break; - } - - /* - * This handles the process stack: - */ - tinfo = task_thread_info(tsk); - HANDLE_STACK (valid_stack_ptr(tinfo, stack)); -#undef HANDLE_STACK - put_cpu(); -} -EXPORT_SYMBOL(dump_trace); - -static void -print_trace_warning_symbol(void *data, char *msg, unsigned long symbol) -{ - print_symbol(msg, symbol); - printk("\n"); -} - -static void print_trace_warning(void *data, char *msg) -{ - printk("%s\n", msg); -} - -static int print_trace_stack(void *data, char *name) -{ - printk(" <%s> ", name); - return 0; -} - -static void print_trace_address(void *data, unsigned long addr) -{ - touch_nmi_watchdog(); - printk_address(addr); -} - -static struct stacktrace_ops print_trace_ops = { - .warning = print_trace_warning, - .warning_symbol = print_trace_warning_symbol, - .stack = print_trace_stack, - .address = print_trace_address, -}; - -void -show_trace(struct task_struct *tsk, struct pt_regs *regs, unsigned long *stack) -{ - printk("\nCall Trace:\n"); - dump_trace(tsk, regs, stack, &print_trace_ops, NULL); - printk("\n"); -} - -static void -_show_stack(struct task_struct *tsk, struct pt_regs *regs, unsigned long *rsp) -{ - unsigned long *stack; - int i; - const int cpu = smp_processor_id(); - unsigned long *irqstack_end = (unsigned long *) (cpu_pda(cpu)->irqstackptr); - unsigned long *irqstack = (unsigned long *) (cpu_pda(cpu)->irqstackptr - IRQSTACKSIZE); - - // debugging aid: "show_stack(NULL, NULL);" prints the - // back trace for this cpu. - - if (rsp == NULL) { - if (tsk) - rsp = (unsigned long *)tsk->thread.rsp; - else - rsp = (unsigned long *)&rsp; - } - - stack = rsp; - for(i=0; i < kstack_depth_to_print; i++) { - if (stack >= irqstack && stack <= irqstack_end) { - if (stack == irqstack_end) { - stack = (unsigned long *) (irqstack_end[-1]); - printk(" <EOI> "); - } - } else { - if (((long) stack & (THREAD_SIZE-1)) == 0) - break; - } - if (i && ((i % 4) == 0)) - printk("\n"); - printk(" %016lx", *stack++); - touch_nmi_watchdog(); - } - show_trace(tsk, regs, rsp); -} - -void show_stack(struct task_struct *tsk, unsigned long * rsp) -{ - _show_stack(tsk, NULL, rsp); -} - -/* - * The architecture-independent dump_stack generator - */ -void dump_stack(void) -{ - unsigned long dummy; - show_trace(NULL, NULL, &dummy); -} - -EXPORT_SYMBOL(dump_stack); - -void show_registers(struct pt_regs *regs) -{ - int i; - int in_kernel = !user_mode(regs); - unsigned long rsp; - const int cpu = smp_processor_id(); - struct task_struct *cur = cpu_pda(cpu)->pcurrent; - - rsp = regs->rsp; - printk("CPU %d ", cpu); - __show_regs(regs); - printk("Process %s (pid: %d, threadinfo %p, task %p)\n", - cur->comm, cur->pid, task_thread_info(cur), cur); - - /* - * When in-kernel, we also print out the stack and code at the - * time of the fault.. - */ - if (in_kernel) { - printk("Stack: "); - _show_stack(NULL, regs, (unsigned long*)rsp); - - printk("\nCode: "); - if (regs->rip < PAGE_OFFSET) - goto bad; - - for (i=0; i<20; i++) { - unsigned char c; - if (__get_user(c, &((unsigned char*)regs->rip)[i])) { -bad: - printk(" Bad RIP value."); - break; - } - printk("%02x ", c); - } - } - printk("\n"); -} - -int is_valid_bugaddr(unsigned long rip) -{ - unsigned short ud2; - - if (__copy_from_user(&ud2, (const void __user *) rip, sizeof(ud2))) - return 0; - - return ud2 == 0x0b0f; -} - -#ifdef CONFIG_BUG -void out_of_line_bug(void) -{ - BUG(); -} -EXPORT_SYMBOL(out_of_line_bug); -#endif - -static DEFINE_SPINLOCK(die_lock); -static int die_owner = -1; -static unsigned int die_nest_count; - -unsigned __kprobes long oops_begin(void) -{ - int cpu; - unsigned long flags; - - oops_enter(); - - /* racy, but better than risking deadlock. */ - local_irq_save(flags); - cpu = smp_processor_id(); - if (!spin_trylock(&die_lock)) { - if (cpu == die_owner) - /* nested oops. should stop eventually */; - else - spin_lock(&die_lock); - } - die_nest_count++; - die_owner = cpu; - console_verbose(); - bust_spinlocks(1); - return flags; -} - -void __kprobes oops_end(unsigned long flags) -{ - die_owner = -1; - bust_spinlocks(0); - die_nest_count--; - if (die_nest_count) - /* We still own the lock */ - local_irq_restore(flags); - else - /* Nest count reaches zero, release the lock. */ - spin_unlock_irqrestore(&die_lock, flags); - if (panic_on_oops) - panic("Fatal exception"); - oops_exit(); -} - -void __kprobes __die(const char * str, struct pt_regs * regs, long err) -{ - static int die_counter; - printk(KERN_EMERG "%s: %04lx [%u] ", str, err & 0xffff,++die_counter); -#ifdef CONFIG_PREEMPT - printk("PREEMPT "); -#endif -#ifdef CONFIG_SMP - printk("SMP "); -#endif -#ifdef CONFIG_DEBUG_PAGEALLOC - printk("DEBUG_PAGEALLOC"); -#endif - printk("\n"); - notify_die(DIE_OOPS, str, regs, err, current->thread.trap_no, SIGSEGV); - show_registers(regs); - add_taint(TAINT_DIE); - /* Executive summary in case the oops scrolled away */ - printk(KERN_ALERT "RIP "); - printk_address(regs->rip); - printk(" RSP <%016lx>\n", regs->rsp); - if (kexec_should_crash(current)) - crash_kexec(regs); -} - -void die(const char * str, struct pt_regs * regs, long err) -{ - unsigned long flags = oops_begin(); - - if (!user_mode(regs)) - report_bug(regs->rip, regs); - - __die(str, regs, err); - oops_end(flags); - do_exit(SIGSEGV); -} - -void __kprobes die_nmi(char *str, struct pt_regs *regs, int do_panic) -{ - unsigned long flags = oops_begin(); - - /* - * We are in trouble anyway, lets at least try - * to get a message out. - */ - printk(str, smp_processor_id()); - show_registers(regs); - if (kexec_should_crash(current)) - crash_kexec(regs); - if (do_panic || panic_on_oops) - panic("Non maskable interrupt"); - oops_end(flags); - nmi_exit(); - local_irq_enable(); - do_exit(SIGSEGV); -} - -static void __kprobes do_trap(int trapnr, int signr, char *str, - struct pt_regs * regs, long error_code, - siginfo_t *info) -{ - struct task_struct *tsk = current; - - if (user_mode(regs)) { - /* - * We want error_code and trap_no set for userspace - * faults and kernelspace faults which result in - * die(), but not kernelspace faults which are fixed - * up. die() gives the process no chance to handle - * the signal and notice the kernel fault information, - * so that won't result in polluting the information - * about previously queued, but not yet delivered, - * faults. See also do_general_protection below. - */ - tsk->thread.error_code = error_code; - tsk->thread.trap_no = trapnr; - - if (show_unhandled_signals && unhandled_signal(tsk, signr) && - printk_ratelimit()) - printk(KERN_INFO - "%s[%d] trap %s rip:%lx rsp:%lx error:%lx\n", - tsk->comm, tsk->pid, str, - regs->rip, regs->rsp, error_code); - - if (info) - force_sig_info(signr, info, tsk); - else - force_sig(signr, tsk); - return; - } - - - /* kernel trap */ - { - const struct exception_table_entry *fixup; - fixup = search_exception_tables(regs->rip); - if (fixup) - regs->rip = fixup->fixup; - else { - tsk->thread.error_code = error_code; - tsk->thread.trap_no = trapnr; - die(str, regs, error_code); - } - return; - } -} - -#define DO_ERROR(trapnr, signr, str, name) \ -asmlinkage void do_##name(struct pt_regs * regs, long error_code) \ -{ \ - if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ - == NOTIFY_STOP) \ - return; \ - conditional_sti(regs); \ - do_trap(trapnr, signr, str, regs, error_code, NULL); \ -} - -#define DO_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \ -asmlinkage void do_##name(struct pt_regs * regs, long error_code) \ -{ \ - siginfo_t info; \ - info.si_signo = signr; \ - info.si_errno = 0; \ - info.si_code = sicode; \ - info.si_addr = (void __user *)siaddr; \ - if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ - == NOTIFY_STOP) \ - return; \ - conditional_sti(regs); \ - do_trap(trapnr, signr, str, regs, error_code, &info); \ -} - -DO_ERROR_INFO( 0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->rip) -DO_ERROR( 4, SIGSEGV, "overflow", overflow) -DO_ERROR( 5, SIGSEGV, "bounds", bounds) -DO_ERROR_INFO( 6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->rip) -DO_ERROR( 7, SIGSEGV, "device not available", device_not_available) -DO_ERROR( 9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun) -DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS) -DO_ERROR(11, SIGBUS, "segment not present", segment_not_present) -DO_ERROR_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0) -DO_ERROR(18, SIGSEGV, "reserved", reserved) - -/* Runs on IST stack */ -asmlinkage void do_stack_segment(struct pt_regs *regs, long error_code) -{ - if (notify_die(DIE_TRAP, "stack segment", regs, error_code, - 12, SIGBUS) == NOTIFY_STOP) - return; - preempt_conditional_sti(regs); - do_trap(12, SIGBUS, "stack segment", regs, error_code, NULL); - preempt_conditional_cli(regs); -} - -asmlinkage void do_double_fault(struct pt_regs * regs, long error_code) -{ - static const char str[] = "double fault"; - struct task_struct *tsk = current; - - /* Return not checked because double check cannot be ignored */ - notify_die(DIE_TRAP, str, regs, error_code, 8, SIGSEGV); - - tsk->thread.error_code = error_code; - tsk->thread.trap_no = 8; - - /* This is always a kernel trap and never fixable (and thus must - never return). */ - for (;;) - die(str, regs, error_code); -} - -asmlinkage void __kprobes do_general_protection(struct pt_regs * regs, - long error_code) -{ - struct task_struct *tsk = current; - - conditional_sti(regs); - - if (user_mode(regs)) { - tsk->thread.error_code = error_code; - tsk->thread.trap_no = 13; - - if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) && - printk_ratelimit()) - printk(KERN_INFO - "%s[%d] general protection rip:%lx rsp:%lx error:%lx\n", - tsk->comm, tsk->pid, - regs->rip, regs->rsp, error_code); - - force_sig(SIGSEGV, tsk); - return; - } - - /* kernel gp */ - { - const struct exception_table_entry *fixup; - fixup = search_exception_tables(regs->rip); - if (fixup) { - regs->rip = fixup->fixup; - return; - } - - tsk->thread.error_code = error_code; - tsk->thread.trap_no = 13; - if (notify_die(DIE_GPF, "general protection fault", regs, - error_code, 13, SIGSEGV) == NOTIFY_STOP) - return; - die("general protection fault", regs, error_code); - } -} - -static __kprobes void -mem_parity_error(unsigned char reason, struct pt_regs * regs) -{ - printk(KERN_EMERG "Uhhuh. NMI received for unknown reason %02x.\n", - reason); - printk(KERN_EMERG "You have some hardware problem, likely on the PCI bus.\n"); - -#if defined(CONFIG_EDAC) - if(edac_handler_set()) { - edac_atomic_assert_error(); - return; - } -#endif - - if (panic_on_unrecovered_nmi) - panic("NMI: Not continuing"); - - printk(KERN_EMERG "Dazed and confused, but trying to continue\n"); - - /* Clear and disable the memory parity error line. */ - reason = (reason & 0xf) | 4; - outb(reason, 0x61); -} - -static __kprobes void -io_check_error(unsigned char reason, struct pt_regs * regs) -{ - printk("NMI: IOCK error (debug interrupt?)\n"); - show_registers(regs); - - /* Re-enable the IOCK line, wait for a few seconds */ - reason = (reason & 0xf) | 8; - outb(reason, 0x61); - mdelay(2000); - reason &= ~8; - outb(reason, 0x61); -} - -static __kprobes void -unknown_nmi_error(unsigned char reason, struct pt_regs * regs) -{ - printk(KERN_EMERG "Uhhuh. NMI received for unknown reason %02x.\n", - reason); - printk(KERN_EMERG "Do you have a strange power saving mode enabled?\n"); - - if (panic_on_unrecovered_nmi) - panic("NMI: Not continuing"); - - printk(KERN_EMERG "Dazed and confused, but trying to continue\n"); -} - -/* Runs on IST stack. This code must keep interrupts off all the time. - Nested NMIs are prevented by the CPU. */ -asmlinkage __kprobes void default_do_nmi(struct pt_regs *regs) -{ - unsigned char reason = 0; - int cpu; - - cpu = smp_processor_id(); - - /* Only the BSP gets external NMIs from the system. */ - if (!cpu) - reason = get_nmi_reason(); - - if (!(reason & 0xc0)) { - if (notify_die(DIE_NMI_IPI, "nmi_ipi", regs, reason, 2, SIGINT) - == NOTIFY_STOP) - return; - /* - * Ok, so this is none of the documented NMI sources, - * so it must be the NMI watchdog. - */ - if (nmi_watchdog_tick(regs,reason)) - return; - if (!do_nmi_callback(regs,cpu)) - unknown_nmi_error(reason, regs); - - return; - } - if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) == NOTIFY_STOP) - return; - - /* AK: following checks seem to be broken on modern chipsets. FIXME */ - - if (reason & 0x80) - mem_parity_error(reason, regs); - if (reason & 0x40) - io_check_error(reason, regs); -} - -/* runs on IST stack. */ -asmlinkage void __kprobes do_int3(struct pt_regs * regs, long error_code) -{ - if (notify_die(DIE_INT3, "int3", regs, error_code, 3, SIGTRAP) == NOTIFY_STOP) { - return; - } - preempt_conditional_sti(regs); - do_trap(3, SIGTRAP, "int3", regs, error_code, NULL); - preempt_conditional_cli(regs); -} - -/* Help handler running on IST stack to switch back to user stack - for scheduling or signal handling. The actual stack switch is done in - entry.S */ -asmlinkage __kprobes struct pt_regs *sync_regs(struct pt_regs *eregs) -{ - struct pt_regs *regs = eregs; - /* Did already sync */ - if (eregs == (struct pt_regs *)eregs->rsp) - ; - /* Exception from user space */ - else if (user_mode(eregs)) - regs = task_pt_regs(current); - /* Exception from kernel and interrupts are enabled. Move to - kernel process stack. */ - else if (eregs->eflags & X86_EFLAGS_IF) - regs = (struct pt_regs *)(eregs->rsp -= sizeof(struct pt_regs)); - if (eregs != regs) - *regs = *eregs; - return regs; -} - -/* runs on IST stack. */ -asmlinkage void __kprobes do_debug(struct pt_regs * regs, - unsigned long error_code) -{ - unsigned long condition; - struct task_struct *tsk = current; - siginfo_t info; - - get_debugreg(condition, 6); - - if (notify_die(DIE_DEBUG, "debug", regs, condition, error_code, - SIGTRAP) == NOTIFY_STOP) - return; - - preempt_conditional_sti(regs); - - /* Mask out spurious debug traps due to lazy DR7 setting */ - if (condition & (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)) { - if (!tsk->thread.debugreg7) { - goto clear_dr7; - } - } - - tsk->thread.debugreg6 = condition; - - /* Mask out spurious TF errors due to lazy TF clearing */ - if (condition & DR_STEP) { - /* - * The TF error should be masked out only if the current - * process is not traced and if the TRAP flag has been set - * previously by a tracing process (condition detected by - * the PT_DTRACE flag); remember that the i386 TRAP flag - * can be modified by the process itself in user mode, - * allowing programs to debug themselves without the ptrace() - * interface. - */ - if (!user_mode(regs)) - goto clear_TF_reenable; - /* - * Was the TF flag set by a debugger? If so, clear it now, - * so that register information is correct. - */ - if (tsk->ptrace & PT_DTRACE) { - regs->eflags &= ~TF_MASK; - tsk->ptrace &= ~PT_DTRACE; - } - } - - /* Ok, finally something we can handle */ - tsk->thread.trap_no = 1; - tsk->thread.error_code = error_code; - info.si_signo = SIGTRAP; - info.si_errno = 0; - info.si_code = TRAP_BRKPT; - info.si_addr = user_mode(regs) ? (void __user *)regs->rip : NULL; - force_sig_info(SIGTRAP, &info, tsk); - -clear_dr7: - set_debugreg(0UL, 7); - preempt_conditional_cli(regs); - return; - -clear_TF_reenable: - set_tsk_thread_flag(tsk, TIF_SINGLESTEP); - regs->eflags &= ~TF_MASK; - preempt_conditional_cli(regs); -} - -static int kernel_math_error(struct pt_regs *regs, const char *str, int trapnr) -{ - const struct exception_table_entry *fixup; - fixup = search_exception_tables(regs->rip); - if (fixup) { - regs->rip = fixup->fixup; - return 1; - } - notify_die(DIE_GPF, str, regs, 0, trapnr, SIGFPE); - /* Illegal floating point operation in the kernel */ - current->thread.trap_no = trapnr; - die(str, regs, 0); - return 0; -} - -/* - * Note that we play around with the 'TS' bit in an attempt to get - * the correct behaviour even in the presence of the asynchronous - * IRQ13 behaviour - */ -asmlinkage void do_coprocessor_error(struct pt_regs *regs) -{ - void __user *rip = (void __user *)(regs->rip); - struct task_struct * task; - siginfo_t info; - unsigned short cwd, swd; - - conditional_sti(regs); - if (!user_mode(regs) && - kernel_math_error(regs, "kernel x87 math error", 16)) - return; - - /* - * Save the info for the exception handler and clear the error. - */ - task = current; - save_init_fpu(task); - task->thread.trap_no = 16; - task->thread.error_code = 0; - info.si_signo = SIGFPE; - info.si_errno = 0; - info.si_code = __SI_FAULT; - info.si_addr = rip; - /* - * (~cwd & swd) will mask out exceptions that are not set to unmasked - * status. 0x3f is the exception bits in these regs, 0x200 is the - * C1 reg you need in case of a stack fault, 0x040 is the stack - * fault bit. We should only be taking one exception at a time, - * so if this combination doesn't produce any single exception, - * then we have a bad program that isn't synchronizing its FPU usage - * and it will suffer the consequences since we won't be able to - * fully reproduce the context of the exception - */ - cwd = get_fpu_cwd(task); - swd = get_fpu_swd(task); - switch (swd & ~cwd & 0x3f) { - case 0x000: - default: - break; - case 0x001: /* Invalid Op */ - /* - * swd & 0x240 == 0x040: Stack Underflow - * swd & 0x240 == 0x240: Stack Overflow - * User must clear the SF bit (0x40) if set - */ - info.si_code = FPE_FLTINV; - break; - case 0x002: /* Denormalize */ - case 0x010: /* Underflow */ - info.si_code = FPE_FLTUND; - break; - case 0x004: /* Zero Divide */ - info.si_code = FPE_FLTDIV; - break; - case 0x008: /* Overflow */ - info.si_code = FPE_FLTOVF; - break; - case 0x020: /* Precision */ - info.si_code = FPE_FLTRES; - break; - } - force_sig_info(SIGFPE, &info, task); -} - -asmlinkage void bad_intr(void) -{ - printk("bad interrupt"); -} - -asmlinkage void do_simd_coprocessor_error(struct pt_regs *regs) -{ - void __user *rip = (void __user *)(regs->rip); - struct task_struct * task; - siginfo_t info; - unsigned short mxcsr; - - conditional_sti(regs); - if (!user_mode(regs) && - kernel_math_error(regs, "kernel simd math error", 19)) - return; - - /* - * Save the info for the exception handler and clear the error. - */ - task = current; - save_init_fpu(task); - task->thread.trap_no = 19; - task->thread.error_code = 0; - info.si_signo = SIGFPE; - info.si_errno = 0; - info.si_code = __SI_FAULT; - info.si_addr = rip; - /* - * The SIMD FPU exceptions are handled a little differently, as there - * is only a single status/control register. Thus, to determine which - * unmasked exception was caught we must mask the exception mask bits - * at 0x1f80, and then use these to mask the exception bits at 0x3f. - */ - mxcsr = get_fpu_mxcsr(task); - switch (~((mxcsr & 0x1f80) >> 7) & (mxcsr & 0x3f)) { - case 0x000: - default: - break; - case 0x001: /* Invalid Op */ - info.si_code = FPE_FLTINV; - break; - case 0x002: /* Denormalize */ - case 0x010: /* Underflow */ - info.si_code = FPE_FLTUND; - break; - case 0x004: /* Zero Divide */ - info.si_code = FPE_FLTDIV; - break; - case 0x008: /* Overflow */ - info.si_code = FPE_FLTOVF; - break; - case 0x020: /* Precision */ - info.si_code = FPE_FLTRES; - break; - } - force_sig_info(SIGFPE, &info, task); -} - -asmlinkage void do_spurious_interrupt_bug(struct pt_regs * regs) -{ -} - -asmlinkage void __attribute__((weak)) smp_thermal_interrupt(void) -{ -} - -asmlinkage void __attribute__((weak)) mce_threshold_interrupt(void) -{ -} - -/* - * 'math_state_restore()' saves the current math information in the - * old math state array, and gets the new ones from the current task - * - * Careful.. There are problems with IBM-designed IRQ13 behaviour. - * Don't touch unless you *really* know how it works. - */ -asmlinkage void math_state_restore(void) -{ - struct task_struct *me = current; - clts(); /* Allow maths ops (or we recurse) */ - - if (!used_math()) - init_fpu(me); - restore_fpu_checking(&me->thread.i387.fxsave); - task_thread_info(me)->status |= TS_USEDFPU; - me->fpu_counter++; -} - -void __init trap_init(void) -{ - set_intr_gate(0,÷_error); - set_intr_gate_ist(1,&debug,DEBUG_STACK); - set_intr_gate_ist(2,&nmi,NMI_STACK); - set_system_gate_ist(3,&int3,DEBUG_STACK); /* int3 can be called from all */ - set_system_gate(4,&overflow); /* int4 can be called from all */ - set_intr_gate(5,&bounds); - set_intr_gate(6,&invalid_op); - set_intr_gate(7,&device_not_available); - set_intr_gate_ist(8,&double_fault, DOUBLEFAULT_STACK); - set_intr_gate(9,&coprocessor_segment_overrun); - set_intr_gate(10,&invalid_TSS); - set_intr_gate(11,&segment_not_present); - set_intr_gate_ist(12,&stack_segment,STACKFAULT_STACK); - set_intr_gate(13,&general_protection); - set_intr_gate(14,&page_fault); - set_intr_gate(15,&spurious_interrupt_bug); - set_intr_gate(16,&coprocessor_error); - set_intr_gate(17,&alignment_check); -#ifdef CONFIG_X86_MCE - set_intr_gate_ist(18,&machine_check, MCE_STACK); -#endif - set_intr_gate(19,&simd_coprocessor_error); - -#ifdef CONFIG_IA32_EMULATION - set_system_gate(IA32_SYSCALL_VECTOR, ia32_syscall); -#endif - - /* - * Should be a barrier for any external CPU state. - */ - cpu_init(); -} - - -static int __init oops_setup(char *s) -{ - if (!s) - return -EINVAL; - if (!strcmp(s, "panic")) - panic_on_oops = 1; - return 0; -} -early_param("oops", oops_setup); - -static int __init kstack_setup(char *s) -{ - if (!s) - return -EINVAL; - kstack_depth_to_print = simple_strtoul(s,NULL,0); - return 0; -} -early_param("kstack", kstack_setup); diff --git a/arch/x86_64/kernel/tsc.c b/arch/x86_64/kernel/tsc.c deleted file mode 100644 index 2a59bde663f2..000000000000 --- a/arch/x86_64/kernel/tsc.c +++ /dev/null @@ -1,207 +0,0 @@ -#include <linux/kernel.h> -#include <linux/sched.h> -#include <linux/interrupt.h> -#include <linux/init.h> -#include <linux/clocksource.h> -#include <linux/time.h> -#include <linux/acpi.h> -#include <linux/cpufreq.h> - -#include <asm/timex.h> - -static int notsc __initdata = 0; - -unsigned int cpu_khz; /* TSC clocks / usec, not used here */ -EXPORT_SYMBOL(cpu_khz); -unsigned int tsc_khz; -EXPORT_SYMBOL(tsc_khz); - -static unsigned int cyc2ns_scale __read_mostly; - -void set_cyc2ns_scale(unsigned long khz) -{ - cyc2ns_scale = (NSEC_PER_MSEC << NS_SCALE) / khz; -} - -static unsigned long long cycles_2_ns(unsigned long long cyc) -{ - return (cyc * cyc2ns_scale) >> NS_SCALE; -} - -unsigned long long sched_clock(void) -{ - unsigned long a = 0; - - /* Could do CPU core sync here. Opteron can execute rdtsc speculatively, - * which means it is not completely exact and may not be monotonous - * between CPUs. But the errors should be too small to matter for - * scheduling purposes. - */ - - rdtscll(a); - return cycles_2_ns(a); -} - -static int tsc_unstable; - -inline int check_tsc_unstable(void) -{ - return tsc_unstable; -} -#ifdef CONFIG_CPU_FREQ - -/* Frequency scaling support. Adjust the TSC based timer when the cpu frequency - * changes. - * - * RED-PEN: On SMP we assume all CPUs run with the same frequency. It's - * not that important because current Opteron setups do not support - * scaling on SMP anyroads. - * - * Should fix up last_tsc too. Currently gettimeofday in the - * first tick after the change will be slightly wrong. - */ - -static unsigned int ref_freq; -static unsigned long loops_per_jiffy_ref; -static unsigned long tsc_khz_ref; - -static int time_cpufreq_notifier(struct notifier_block *nb, unsigned long val, - void *data) -{ - struct cpufreq_freqs *freq = data; - unsigned long *lpj, dummy; - - if (cpu_has(&cpu_data[freq->cpu], X86_FEATURE_CONSTANT_TSC)) - return 0; - - lpj = &dummy; - if (!(freq->flags & CPUFREQ_CONST_LOOPS)) -#ifdef CONFIG_SMP - lpj = &cpu_data[freq->cpu].loops_per_jiffy; -#else - lpj = &boot_cpu_data.loops_per_jiffy; -#endif - - if (!ref_freq) { - ref_freq = freq->old; - loops_per_jiffy_ref = *lpj; - tsc_khz_ref = tsc_khz; - } - if ((val == CPUFREQ_PRECHANGE && freq->old < freq->new) || - (val == CPUFREQ_POSTCHANGE && freq->old > freq->new) || - (val == CPUFREQ_RESUMECHANGE)) { - *lpj = - cpufreq_scale(loops_per_jiffy_ref, ref_freq, freq->new); - - tsc_khz = cpufreq_scale(tsc_khz_ref, ref_freq, freq->new); - if (!(freq->flags & CPUFREQ_CONST_LOOPS)) - mark_tsc_unstable("cpufreq changes"); - } - - set_cyc2ns_scale(tsc_khz_ref); - - return 0; -} - -static struct notifier_block time_cpufreq_notifier_block = { - .notifier_call = time_cpufreq_notifier -}; - -static int __init cpufreq_tsc(void) -{ - cpufreq_register_notifier(&time_cpufreq_notifier_block, - CPUFREQ_TRANSITION_NOTIFIER); - return 0; -} - -core_initcall(cpufreq_tsc); - -#endif - -/* - * Make an educated guess if the TSC is trustworthy and synchronized - * over all CPUs. - */ -__cpuinit int unsynchronized_tsc(void) -{ - if (tsc_unstable) - return 1; - -#ifdef CONFIG_SMP - if (apic_is_clustered_box()) - return 1; -#endif - /* Most intel systems have synchronized TSCs except for - multi node systems */ - if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) { -#ifdef CONFIG_ACPI - /* But TSC doesn't tick in C3 so don't use it there */ - if (acpi_gbl_FADT.header.length > 0 && - acpi_gbl_FADT.C3latency < 1000) - return 1; -#endif - return 0; - } - - /* Assume multi socket systems are not synchronized */ - return num_present_cpus() > 1; -} - -int __init notsc_setup(char *s) -{ - notsc = 1; - return 1; -} - -__setup("notsc", notsc_setup); - - -/* clock source code: */ -static cycle_t read_tsc(void) -{ - cycle_t ret = (cycle_t)get_cycles_sync(); - return ret; -} - -static cycle_t __vsyscall_fn vread_tsc(void) -{ - cycle_t ret = (cycle_t)get_cycles_sync(); - return ret; -} - -static struct clocksource clocksource_tsc = { - .name = "tsc", - .rating = 300, - .read = read_tsc, - .mask = CLOCKSOURCE_MASK(64), - .shift = 22, - .flags = CLOCK_SOURCE_IS_CONTINUOUS | - CLOCK_SOURCE_MUST_VERIFY, - .vread = vread_tsc, -}; - -void mark_tsc_unstable(char *reason) -{ - if (!tsc_unstable) { - tsc_unstable = 1; - printk("Marking TSC unstable due to %s\n", reason); - /* Change only the rating, when not registered */ - if (clocksource_tsc.mult) - clocksource_change_rating(&clocksource_tsc, 0); - else - clocksource_tsc.rating = 0; - } -} -EXPORT_SYMBOL_GPL(mark_tsc_unstable); - -void __init init_tsc_clocksource(void) -{ - if (!notsc) { - clocksource_tsc.mult = clocksource_khz2mult(tsc_khz, - clocksource_tsc.shift); - if (check_tsc_unstable()) - clocksource_tsc.rating = 0; - - clocksource_register(&clocksource_tsc); - } -} diff --git a/arch/x86_64/kernel/tsc_sync.c b/arch/x86_64/kernel/tsc_sync.c deleted file mode 100644 index 355f5f506c81..000000000000 --- a/arch/x86_64/kernel/tsc_sync.c +++ /dev/null @@ -1,187 +0,0 @@ -/* - * arch/x86_64/kernel/tsc_sync.c: check TSC synchronization. - * - * Copyright (C) 2006, Red Hat, Inc., Ingo Molnar - * - * We check whether all boot CPUs have their TSC's synchronized, - * print a warning if not and turn off the TSC clock-source. - * - * The warp-check is point-to-point between two CPUs, the CPU - * initiating the bootup is the 'source CPU', the freshly booting - * CPU is the 'target CPU'. - * - * Only two CPUs may participate - they can enter in any order. - * ( The serial nature of the boot logic and the CPU hotplug lock - * protects against more than 2 CPUs entering this code. ) - */ -#include <linux/spinlock.h> -#include <linux/kernel.h> -#include <linux/init.h> -#include <linux/smp.h> -#include <linux/nmi.h> -#include <asm/tsc.h> - -/* - * Entry/exit counters that make sure that both CPUs - * run the measurement code at once: - */ -static __cpuinitdata atomic_t start_count; -static __cpuinitdata atomic_t stop_count; - -/* - * We use a raw spinlock in this exceptional case, because - * we want to have the fastest, inlined, non-debug version - * of a critical section, to be able to prove TSC time-warps: - */ -static __cpuinitdata raw_spinlock_t sync_lock = __RAW_SPIN_LOCK_UNLOCKED; -static __cpuinitdata cycles_t last_tsc; -static __cpuinitdata cycles_t max_warp; -static __cpuinitdata int nr_warps; - -/* - * TSC-warp measurement loop running on both CPUs: - */ -static __cpuinit void check_tsc_warp(void) -{ - cycles_t start, now, prev, end; - int i; - - start = get_cycles_sync(); - /* - * The measurement runs for 20 msecs: - */ - end = start + tsc_khz * 20ULL; - now = start; - - for (i = 0; ; i++) { - /* - * We take the global lock, measure TSC, save the - * previous TSC that was measured (possibly on - * another CPU) and update the previous TSC timestamp. - */ - __raw_spin_lock(&sync_lock); - prev = last_tsc; - now = get_cycles_sync(); - last_tsc = now; - __raw_spin_unlock(&sync_lock); - - /* - * Be nice every now and then (and also check whether - * measurement is done [we also insert a 100 million - * loops safety exit, so we dont lock up in case the - * TSC readout is totally broken]): - */ - if (unlikely(!(i & 7))) { - if (now > end || i > 100000000) - break; - cpu_relax(); - touch_nmi_watchdog(); - } - /* - * Outside the critical section we can now see whether - * we saw a time-warp of the TSC going backwards: - */ - if (unlikely(prev > now)) { - __raw_spin_lock(&sync_lock); - max_warp = max(max_warp, prev - now); - nr_warps++; - __raw_spin_unlock(&sync_lock); - } - - } -} - -/* - * Source CPU calls into this - it waits for the freshly booted - * target CPU to arrive and then starts the measurement: - */ -void __cpuinit check_tsc_sync_source(int cpu) -{ - int cpus = 2; - - /* - * No need to check if we already know that the TSC is not - * synchronized: - */ - if (unsynchronized_tsc()) - return; - - printk(KERN_INFO "checking TSC synchronization [CPU#%d -> CPU#%d]:", - smp_processor_id(), cpu); - - /* - * Reset it - in case this is a second bootup: - */ - atomic_set(&stop_count, 0); - - /* - * Wait for the target to arrive: - */ - while (atomic_read(&start_count) != cpus-1) - cpu_relax(); - /* - * Trigger the target to continue into the measurement too: - */ - atomic_inc(&start_count); - - check_tsc_warp(); - - while (atomic_read(&stop_count) != cpus-1) - cpu_relax(); - - /* - * Reset it - just in case we boot another CPU later: - */ - atomic_set(&start_count, 0); - - if (nr_warps) { - printk("\n"); - printk(KERN_WARNING "Measured %Ld cycles TSC warp between CPUs," - " turning off TSC clock.\n", max_warp); - mark_tsc_unstable("check_tsc_sync_source failed"); - nr_warps = 0; - max_warp = 0; - last_tsc = 0; - } else { - printk(" passed.\n"); - } - - /* - * Let the target continue with the bootup: - */ - atomic_inc(&stop_count); -} - -/* - * Freshly booted CPUs call into this: - */ -void __cpuinit check_tsc_sync_target(void) -{ - int cpus = 2; - - if (unsynchronized_tsc()) - return; - - /* - * Register this CPU's participation and wait for the - * source CPU to start the measurement: - */ - atomic_inc(&start_count); - while (atomic_read(&start_count) != cpus) - cpu_relax(); - - check_tsc_warp(); - - /* - * Ok, we are done: - */ - atomic_inc(&stop_count); - - /* - * Wait for the source CPU to print stuff: - */ - while (atomic_read(&stop_count) != cpus) - cpu_relax(); -} -#undef NR_LOOPS - diff --git a/arch/x86_64/kernel/verify_cpu.S b/arch/x86_64/kernel/verify_cpu.S deleted file mode 100644 index 45b6f8a975a1..000000000000 --- a/arch/x86_64/kernel/verify_cpu.S +++ /dev/null @@ -1,105 +0,0 @@ -/* - * - * verify_cpu.S - Code for cpu long mode and SSE verification. This - * code has been borrowed from boot/setup.S and was introduced by - * Andi Kleen. - * - * Copyright (c) 2007 Andi Kleen (ak@suse.de) - * Copyright (c) 2007 Eric Biederman (ebiederm@xmission.com) - * Copyright (c) 2007 Vivek Goyal (vgoyal@in.ibm.com) - * - * This source code is licensed under the GNU General Public License, - * Version 2. See the file COPYING for more details. - * - * This is a common code for verification whether CPU supports - * long mode and SSE or not. It is not called directly instead this - * file is included at various places and compiled in that context. - * Following are the current usage. - * - * This file is included by both 16bit and 32bit code. - * - * arch/x86_64/boot/setup.S : Boot cpu verification (16bit) - * arch/x86_64/boot/compressed/head.S: Boot cpu verification (32bit) - * arch/x86_64/kernel/trampoline.S: secondary processor verfication (16bit) - * arch/x86_64/kernel/acpi/wakeup.S:Verfication at resume (16bit) - * - * verify_cpu, returns the status of cpu check in register %eax. - * 0: Success 1: Failure - * - * The caller needs to check for the error code and take the action - * appropriately. Either display a message or halt. - */ - -#include <asm/cpufeature.h> - -verify_cpu: - pushfl # Save caller passed flags - pushl $0 # Kill any dangerous flags - popfl - - pushfl # standard way to check for cpuid - popl %eax - movl %eax,%ebx - xorl $0x200000,%eax - pushl %eax - popfl - pushfl - popl %eax - cmpl %eax,%ebx - jz verify_cpu_no_longmode # cpu has no cpuid - - movl $0x0,%eax # See if cpuid 1 is implemented - cpuid - cmpl $0x1,%eax - jb verify_cpu_no_longmode # no cpuid 1 - - xor %di,%di - cmpl $0x68747541,%ebx # AuthenticAMD - jnz verify_cpu_noamd - cmpl $0x69746e65,%edx - jnz verify_cpu_noamd - cmpl $0x444d4163,%ecx - jnz verify_cpu_noamd - mov $1,%di # cpu is from AMD - -verify_cpu_noamd: - movl $0x1,%eax # Does the cpu have what it takes - cpuid - andl $REQUIRED_MASK0,%edx - xorl $REQUIRED_MASK0,%edx - jnz verify_cpu_no_longmode - - movl $0x80000000,%eax # See if extended cpuid is implemented - cpuid - cmpl $0x80000001,%eax - jb verify_cpu_no_longmode # no extended cpuid - - movl $0x80000001,%eax # Does the cpu have what it takes - cpuid - andl $REQUIRED_MASK1,%edx - xorl $REQUIRED_MASK1,%edx - jnz verify_cpu_no_longmode - -verify_cpu_sse_test: - movl $1,%eax - cpuid - andl $SSE_MASK,%edx - cmpl $SSE_MASK,%edx - je verify_cpu_sse_ok - test %di,%di - jz verify_cpu_no_longmode # only try to force SSE on AMD - movl $0xc0010015,%ecx # HWCR - rdmsr - btr $15,%eax # enable SSE - wrmsr - xor %di,%di # don't loop - jmp verify_cpu_sse_test # try again - -verify_cpu_no_longmode: - popfl # Restore caller passed flags - movl $1,%eax - ret -verify_cpu_sse_ok: - popfl # Restore caller passed flags - xorl %eax, %eax - ret diff --git a/arch/x86_64/kernel/vmlinux.lds.S b/arch/x86_64/kernel/vmlinux.lds.S deleted file mode 100644 index ba8ea97abd21..000000000000 --- a/arch/x86_64/kernel/vmlinux.lds.S +++ /dev/null @@ -1,235 +0,0 @@ -/* ld script to make x86-64 Linux kernel - * Written by Martin Mares <mj@atrey.karlin.mff.cuni.cz>; - */ - -#define LOAD_OFFSET __START_KERNEL_map - -#include <asm-generic/vmlinux.lds.h> -#include <asm/page.h> - -#undef i386 /* in case the preprocessor is a 32bit one */ - -OUTPUT_FORMAT("elf64-x86-64", "elf64-x86-64", "elf64-x86-64") -OUTPUT_ARCH(i386:x86-64) -ENTRY(phys_startup_64) -jiffies_64 = jiffies; -_proxy_pda = 1; -PHDRS { - text PT_LOAD FLAGS(5); /* R_E */ - data PT_LOAD FLAGS(7); /* RWE */ - user PT_LOAD FLAGS(7); /* RWE */ - data.init PT_LOAD FLAGS(7); /* RWE */ - note PT_NOTE FLAGS(4); /* R__ */ -} -SECTIONS -{ - . = __START_KERNEL; - phys_startup_64 = startup_64 - LOAD_OFFSET; - _text = .; /* Text and read-only data */ - .text : AT(ADDR(.text) - LOAD_OFFSET) { - /* First the code that has to be first for bootstrapping */ - *(.text.head) - _stext = .; - /* Then the rest */ - TEXT_TEXT - SCHED_TEXT - LOCK_TEXT - KPROBES_TEXT - *(.fixup) - *(.gnu.warning) - } :text = 0x9090 - /* out-of-line lock text */ - .text.lock : AT(ADDR(.text.lock) - LOAD_OFFSET) { *(.text.lock) } - - _etext = .; /* End of text section */ - - . = ALIGN(16); /* Exception table */ - __start___ex_table = .; - __ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) { *(__ex_table) } - __stop___ex_table = .; - - NOTES :text :note - - BUG_TABLE :text - - RODATA - - . = ALIGN(4); - .tracedata : AT(ADDR(.tracedata) - LOAD_OFFSET) { - __tracedata_start = .; - *(.tracedata) - __tracedata_end = .; - } - - . = ALIGN(PAGE_SIZE); /* Align data segment to page size boundary */ - /* Data */ - .data : AT(ADDR(.data) - LOAD_OFFSET) { - DATA_DATA - CONSTRUCTORS - } :data - - _edata = .; /* End of data section */ - - . = ALIGN(PAGE_SIZE); - . = ALIGN(CONFIG_X86_L1_CACHE_BYTES); - .data.cacheline_aligned : AT(ADDR(.data.cacheline_aligned) - LOAD_OFFSET) { - *(.data.cacheline_aligned) - } - . = ALIGN(CONFIG_X86_INTERNODE_CACHE_BYTES); - .data.read_mostly : AT(ADDR(.data.read_mostly) - LOAD_OFFSET) { - *(.data.read_mostly) - } - -#define VSYSCALL_ADDR (-10*1024*1024) -#define VSYSCALL_PHYS_ADDR ((LOADADDR(.data.read_mostly) + SIZEOF(.data.read_mostly) + 4095) & ~(4095)) -#define VSYSCALL_VIRT_ADDR ((ADDR(.data.read_mostly) + SIZEOF(.data.read_mostly) + 4095) & ~(4095)) - -#define VLOAD_OFFSET (VSYSCALL_ADDR - VSYSCALL_PHYS_ADDR) -#define VLOAD(x) (ADDR(x) - VLOAD_OFFSET) - -#define VVIRT_OFFSET (VSYSCALL_ADDR - VSYSCALL_VIRT_ADDR) -#define VVIRT(x) (ADDR(x) - VVIRT_OFFSET) - - . = VSYSCALL_ADDR; - .vsyscall_0 : AT(VSYSCALL_PHYS_ADDR) { *(.vsyscall_0) } :user - __vsyscall_0 = VSYSCALL_VIRT_ADDR; - - . = ALIGN(CONFIG_X86_L1_CACHE_BYTES); - .vsyscall_fn : AT(VLOAD(.vsyscall_fn)) { *(.vsyscall_fn) } - . = ALIGN(CONFIG_X86_L1_CACHE_BYTES); - .vsyscall_gtod_data : AT(VLOAD(.vsyscall_gtod_data)) - { *(.vsyscall_gtod_data) } - vsyscall_gtod_data = VVIRT(.vsyscall_gtod_data); - .vsyscall_clock : AT(VLOAD(.vsyscall_clock)) - { *(.vsyscall_clock) } - vsyscall_clock = VVIRT(.vsyscall_clock); - - - .vsyscall_1 ADDR(.vsyscall_0) + 1024: AT(VLOAD(.vsyscall_1)) - { *(.vsyscall_1) } - .vsyscall_2 ADDR(.vsyscall_0) + 2048: AT(VLOAD(.vsyscall_2)) - { *(.vsyscall_2) } - - .vgetcpu_mode : AT(VLOAD(.vgetcpu_mode)) { *(.vgetcpu_mode) } - vgetcpu_mode = VVIRT(.vgetcpu_mode); - - . = ALIGN(CONFIG_X86_L1_CACHE_BYTES); - .jiffies : AT(VLOAD(.jiffies)) { *(.jiffies) } - jiffies = VVIRT(.jiffies); - - .vsyscall_3 ADDR(.vsyscall_0) + 3072: AT(VLOAD(.vsyscall_3)) - { *(.vsyscall_3) } - - . = VSYSCALL_VIRT_ADDR + 4096; - -#undef VSYSCALL_ADDR -#undef VSYSCALL_PHYS_ADDR -#undef VSYSCALL_VIRT_ADDR -#undef VLOAD_OFFSET -#undef VLOAD -#undef VVIRT_OFFSET -#undef VVIRT - - . = ALIGN(8192); /* init_task */ - .data.init_task : AT(ADDR(.data.init_task) - LOAD_OFFSET) { - *(.data.init_task) - }:data.init - - . = ALIGN(4096); - .data.page_aligned : AT(ADDR(.data.page_aligned) - LOAD_OFFSET) { - *(.data.page_aligned) - } - - /* might get freed after init */ - . = ALIGN(4096); - __smp_alt_begin = .; - __smp_locks = .; - .smp_locks : AT(ADDR(.smp_locks) - LOAD_OFFSET) { - *(.smp_locks) - } - __smp_locks_end = .; - . = ALIGN(4096); - __smp_alt_end = .; - - . = ALIGN(4096); /* Init code and data */ - __init_begin = .; - .init.text : AT(ADDR(.init.text) - LOAD_OFFSET) { - _sinittext = .; - *(.init.text) - _einittext = .; - } - __initdata_begin = .; - .init.data : AT(ADDR(.init.data) - LOAD_OFFSET) { *(.init.data) } - __initdata_end = .; - . = ALIGN(16); - __setup_start = .; - .init.setup : AT(ADDR(.init.setup) - LOAD_OFFSET) { *(.init.setup) } - __setup_end = .; - __initcall_start = .; - .initcall.init : AT(ADDR(.initcall.init) - LOAD_OFFSET) { - INITCALLS - } - __initcall_end = .; - __con_initcall_start = .; - .con_initcall.init : AT(ADDR(.con_initcall.init) - LOAD_OFFSET) { - *(.con_initcall.init) - } - __con_initcall_end = .; - SECURITY_INIT - . = ALIGN(8); - __alt_instructions = .; - .altinstructions : AT(ADDR(.altinstructions) - LOAD_OFFSET) { - *(.altinstructions) - } - __alt_instructions_end = .; - .altinstr_replacement : AT(ADDR(.altinstr_replacement) - LOAD_OFFSET) { - *(.altinstr_replacement) - } - /* .exit.text is discard at runtime, not link time, to deal with references - from .altinstructions and .eh_frame */ - .exit.text : AT(ADDR(.exit.text) - LOAD_OFFSET) { *(.exit.text) } - .exit.data : AT(ADDR(.exit.data) - LOAD_OFFSET) { *(.exit.data) } - -/* vdso blob that is mapped into user space */ - vdso_start = . ; - .vdso : AT(ADDR(.vdso) - LOAD_OFFSET) { *(.vdso) } - . = ALIGN(4096); - vdso_end = .; - -#ifdef CONFIG_BLK_DEV_INITRD - . = ALIGN(4096); - __initramfs_start = .; - .init.ramfs : AT(ADDR(.init.ramfs) - LOAD_OFFSET) { *(.init.ramfs) } - __initramfs_end = .; -#endif - - PERCPU(4096) - - . = ALIGN(4096); - __init_end = .; - - . = ALIGN(4096); - __nosave_begin = .; - .data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) { *(.data.nosave) } - . = ALIGN(4096); - __nosave_end = .; - - __bss_start = .; /* BSS */ - .bss : AT(ADDR(.bss) - LOAD_OFFSET) { - *(.bss.page_aligned) - *(.bss) - } - __bss_stop = .; - - _end = . ; - - /* Sections to be discarded */ - /DISCARD/ : { - *(.exitcall.exit) - *(.eh_frame) - } - - STABS_DEBUG - - DWARF_DEBUG -} diff --git a/arch/x86_64/kernel/vsmp.c b/arch/x86_64/kernel/vsmp.c deleted file mode 100644 index 414caf0c5f9a..000000000000 --- a/arch/x86_64/kernel/vsmp.c +++ /dev/null @@ -1,49 +0,0 @@ -/* - * vSMPowered(tm) systems specific initialization - * Copyright (C) 2005 ScaleMP Inc. - * - * Use of this code is subject to the terms and conditions of the - * GNU general public license version 2. See "COPYING" or - * http://www.gnu.org/licenses/gpl.html - * - * Ravikiran Thirumalai <kiran@scalemp.com>, - * Shai Fultheim <shai@scalemp.com> - */ - -#include <linux/init.h> -#include <linux/pci_ids.h> -#include <linux/pci_regs.h> -#include <asm/pci-direct.h> -#include <asm/io.h> - -static int __init vsmp_init(void) -{ - void *address; - unsigned int cap, ctl; - - if (!early_pci_allowed()) - return 0; - - /* Check if we are running on a ScaleMP vSMP box */ - if ((read_pci_config_16(0, 0x1f, 0, PCI_VENDOR_ID) != PCI_VENDOR_ID_SCALEMP) || - (read_pci_config_16(0, 0x1f, 0, PCI_DEVICE_ID) != PCI_DEVICE_ID_SCALEMP_VSMP_CTL)) - return 0; - - /* set vSMP magic bits to indicate vSMP capable kernel */ - address = ioremap(read_pci_config(0, 0x1f, 0, PCI_BASE_ADDRESS_0), 8); - cap = readl(address); - ctl = readl(address + 4); - printk("vSMP CTL: capabilities:0x%08x control:0x%08x\n", cap, ctl); - if (cap & ctl & (1 << 4)) { - /* Turn on vSMP IRQ fastpath handling (see system.h) */ - ctl &= ~(1 << 4); - writel(ctl, address + 4); - ctl = readl(address + 4); - printk("vSMP CTL: control set to:0x%08x\n", ctl); - } - - iounmap(address); - return 0; -} - -core_initcall(vsmp_init); diff --git a/arch/x86_64/kernel/vsyscall.c b/arch/x86_64/kernel/vsyscall.c deleted file mode 100644 index 06c34949bfdc..000000000000 --- a/arch/x86_64/kernel/vsyscall.c +++ /dev/null @@ -1,349 +0,0 @@ -/* - * linux/arch/x86_64/kernel/vsyscall.c - * - * Copyright (C) 2001 Andrea Arcangeli <andrea@suse.de> SuSE - * Copyright 2003 Andi Kleen, SuSE Labs. - * - * Thanks to hpa@transmeta.com for some useful hint. - * Special thanks to Ingo Molnar for his early experience with - * a different vsyscall implementation for Linux/IA32 and for the name. - * - * vsyscall 1 is located at -10Mbyte, vsyscall 2 is located - * at virtual address -10Mbyte+1024bytes etc... There are at max 4 - * vsyscalls. One vsyscall can reserve more than 1 slot to avoid - * jumping out of line if necessary. We cannot add more with this - * mechanism because older kernels won't return -ENOSYS. - * If we want more than four we need a vDSO. - * - * Note: the concept clashes with user mode linux. If you use UML and - * want per guest time just set the kernel.vsyscall64 sysctl to 0. - */ - -#include <linux/time.h> -#include <linux/init.h> -#include <linux/kernel.h> -#include <linux/timer.h> -#include <linux/seqlock.h> -#include <linux/jiffies.h> -#include <linux/sysctl.h> -#include <linux/clocksource.h> -#include <linux/getcpu.h> -#include <linux/cpu.h> -#include <linux/smp.h> -#include <linux/notifier.h> - -#include <asm/vsyscall.h> -#include <asm/pgtable.h> -#include <asm/page.h> -#include <asm/unistd.h> -#include <asm/fixmap.h> -#include <asm/errno.h> -#include <asm/io.h> -#include <asm/segment.h> -#include <asm/desc.h> -#include <asm/topology.h> -#include <asm/vgtod.h> - -#define __vsyscall(nr) __attribute__ ((unused,__section__(".vsyscall_" #nr))) -#define __syscall_clobber "r11","rcx","memory" -#define __pa_vsymbol(x) \ - ({unsigned long v; \ - extern char __vsyscall_0; \ - asm("" : "=r" (v) : "0" (x)); \ - ((v - VSYSCALL_FIRST_PAGE) + __pa_symbol(&__vsyscall_0)); }) - -/* - * vsyscall_gtod_data contains data that is : - * - readonly from vsyscalls - * - writen by timer interrupt or systcl (/proc/sys/kernel/vsyscall64) - * Try to keep this structure as small as possible to avoid cache line ping pongs - */ -int __vgetcpu_mode __section_vgetcpu_mode; - -struct vsyscall_gtod_data __vsyscall_gtod_data __section_vsyscall_gtod_data = -{ - .lock = SEQLOCK_UNLOCKED, - .sysctl_enabled = 1, -}; - -void update_vsyscall(struct timespec *wall_time, struct clocksource *clock) -{ - unsigned long flags; - - write_seqlock_irqsave(&vsyscall_gtod_data.lock, flags); - /* copy vsyscall data */ - vsyscall_gtod_data.clock.vread = clock->vread; - vsyscall_gtod_data.clock.cycle_last = clock->cycle_last; - vsyscall_gtod_data.clock.mask = clock->mask; - vsyscall_gtod_data.clock.mult = clock->mult; - vsyscall_gtod_data.clock.shift = clock->shift; - vsyscall_gtod_data.wall_time_sec = wall_time->tv_sec; - vsyscall_gtod_data.wall_time_nsec = wall_time->tv_nsec; - vsyscall_gtod_data.sys_tz = sys_tz; - vsyscall_gtod_data.wall_time_nsec = wall_time->tv_nsec; - vsyscall_gtod_data.wall_to_monotonic = wall_to_monotonic; - write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags); -} - -/* RED-PEN may want to readd seq locking, but then the variable should be - * write-once. - */ -static __always_inline void do_get_tz(struct timezone * tz) -{ - *tz = __vsyscall_gtod_data.sys_tz; -} - -static __always_inline int gettimeofday(struct timeval *tv, struct timezone *tz) -{ - int ret; - asm volatile("vsysc2: syscall" - : "=a" (ret) - : "0" (__NR_gettimeofday),"D" (tv),"S" (tz) - : __syscall_clobber ); - return ret; -} - -static __always_inline long time_syscall(long *t) -{ - long secs; - asm volatile("vsysc1: syscall" - : "=a" (secs) - : "0" (__NR_time),"D" (t) : __syscall_clobber); - return secs; -} - -static __always_inline void do_vgettimeofday(struct timeval * tv) -{ - cycle_t now, base, mask, cycle_delta; - unsigned seq; - unsigned long mult, shift, nsec; - cycle_t (*vread)(void); - do { - seq = read_seqbegin(&__vsyscall_gtod_data.lock); - - vread = __vsyscall_gtod_data.clock.vread; - if (unlikely(!__vsyscall_gtod_data.sysctl_enabled || !vread)) { - gettimeofday(tv,NULL); - return; - } - now = vread(); - base = __vsyscall_gtod_data.clock.cycle_last; - mask = __vsyscall_gtod_data.clock.mask; - mult = __vsyscall_gtod_data.clock.mult; - shift = __vsyscall_gtod_data.clock.shift; - - tv->tv_sec = __vsyscall_gtod_data.wall_time_sec; - nsec = __vsyscall_gtod_data.wall_time_nsec; - } while (read_seqretry(&__vsyscall_gtod_data.lock, seq)); - - /* calculate interval: */ - cycle_delta = (now - base) & mask; - /* convert to nsecs: */ - nsec += (cycle_delta * mult) >> shift; - - while (nsec >= NSEC_PER_SEC) { - tv->tv_sec += 1; - nsec -= NSEC_PER_SEC; - } - tv->tv_usec = nsec / NSEC_PER_USEC; -} - -int __vsyscall(0) vgettimeofday(struct timeval * tv, struct timezone * tz) -{ - if (tv) - do_vgettimeofday(tv); - if (tz) - do_get_tz(tz); - return 0; -} - -/* This will break when the xtime seconds get inaccurate, but that is - * unlikely */ -time_t __vsyscall(1) vtime(time_t *t) -{ - struct timeval tv; - time_t result; - if (unlikely(!__vsyscall_gtod_data.sysctl_enabled)) - return time_syscall(t); - - vgettimeofday(&tv, 0); - result = tv.tv_sec; - if (t) - *t = result; - return result; -} - -/* Fast way to get current CPU and node. - This helps to do per node and per CPU caches in user space. - The result is not guaranteed without CPU affinity, but usually - works out because the scheduler tries to keep a thread on the same - CPU. - - tcache must point to a two element sized long array. - All arguments can be NULL. */ -long __vsyscall(2) -vgetcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *tcache) -{ - unsigned int dummy, p; - unsigned long j = 0; - - /* Fast cache - only recompute value once per jiffies and avoid - relatively costly rdtscp/cpuid otherwise. - This works because the scheduler usually keeps the process - on the same CPU and this syscall doesn't guarantee its - results anyways. - We do this here because otherwise user space would do it on - its own in a likely inferior way (no access to jiffies). - If you don't like it pass NULL. */ - if (tcache && tcache->blob[0] == (j = __jiffies)) { - p = tcache->blob[1]; - } else if (__vgetcpu_mode == VGETCPU_RDTSCP) { - /* Load per CPU data from RDTSCP */ - rdtscp(dummy, dummy, p); - } else { - /* Load per CPU data from GDT */ - asm("lsl %1,%0" : "=r" (p) : "r" (__PER_CPU_SEG)); - } - if (tcache) { - tcache->blob[0] = j; - tcache->blob[1] = p; - } - if (cpu) - *cpu = p & 0xfff; - if (node) - *node = p >> 12; - return 0; -} - -long __vsyscall(3) venosys_1(void) -{ - return -ENOSYS; -} - -#ifdef CONFIG_SYSCTL - -#define SYSCALL 0x050f -#define NOP2 0x9090 - -/* - * NOP out syscall in vsyscall page when not needed. - */ -static int vsyscall_sysctl_change(ctl_table *ctl, int write, struct file * filp, - void __user *buffer, size_t *lenp, loff_t *ppos) -{ - extern u16 vsysc1, vsysc2; - u16 __iomem *map1; - u16 __iomem *map2; - int ret = proc_dointvec(ctl, write, filp, buffer, lenp, ppos); - if (!write) - return ret; - /* gcc has some trouble with __va(__pa()), so just do it this - way. */ - map1 = ioremap(__pa_vsymbol(&vsysc1), 2); - if (!map1) - return -ENOMEM; - map2 = ioremap(__pa_vsymbol(&vsysc2), 2); - if (!map2) { - ret = -ENOMEM; - goto out; - } - if (!vsyscall_gtod_data.sysctl_enabled) { - writew(SYSCALL, map1); - writew(SYSCALL, map2); - } else { - writew(NOP2, map1); - writew(NOP2, map2); - } - iounmap(map2); -out: - iounmap(map1); - return ret; -} - -static int vsyscall_sysctl_nostrat(ctl_table *t, int __user *name, int nlen, - void __user *oldval, size_t __user *oldlenp, - void __user *newval, size_t newlen) -{ - return -ENOSYS; -} - -static ctl_table kernel_table2[] = { - { .ctl_name = 99, .procname = "vsyscall64", - .data = &vsyscall_gtod_data.sysctl_enabled, .maxlen = sizeof(int), - .mode = 0644, - .strategy = vsyscall_sysctl_nostrat, - .proc_handler = vsyscall_sysctl_change }, - {} -}; - -static ctl_table kernel_root_table2[] = { - { .ctl_name = CTL_KERN, .procname = "kernel", .mode = 0555, - .child = kernel_table2 }, - {} -}; - -#endif - -/* Assume __initcall executes before all user space. Hopefully kmod - doesn't violate that. We'll find out if it does. */ -static void __cpuinit vsyscall_set_cpu(int cpu) -{ - unsigned long *d; - unsigned long node = 0; -#ifdef CONFIG_NUMA - node = cpu_to_node[cpu]; -#endif - if (cpu_has(&cpu_data[cpu], X86_FEATURE_RDTSCP)) - write_rdtscp_aux((node << 12) | cpu); - - /* Store cpu number in limit so that it can be loaded quickly - in user space in vgetcpu. - 12 bits for the CPU and 8 bits for the node. */ - d = (unsigned long *)(cpu_gdt(cpu) + GDT_ENTRY_PER_CPU); - *d = 0x0f40000000000ULL; - *d |= cpu; - *d |= (node & 0xf) << 12; - *d |= (node >> 4) << 48; -} - -static void __cpuinit cpu_vsyscall_init(void *arg) -{ - /* preemption should be already off */ - vsyscall_set_cpu(raw_smp_processor_id()); -} - -static int __cpuinit -cpu_vsyscall_notifier(struct notifier_block *n, unsigned long action, void *arg) -{ - long cpu = (long)arg; - if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN) - smp_call_function_single(cpu, cpu_vsyscall_init, NULL, 0, 1); - return NOTIFY_DONE; -} - -static void __init map_vsyscall(void) -{ - extern char __vsyscall_0; - unsigned long physaddr_page0 = __pa_symbol(&__vsyscall_0); - - /* Note that VSYSCALL_MAPPED_PAGES must agree with the code below. */ - __set_fixmap(VSYSCALL_FIRST_PAGE, physaddr_page0, PAGE_KERNEL_VSYSCALL); -} - -static int __init vsyscall_init(void) -{ - BUG_ON(((unsigned long) &vgettimeofday != - VSYSCALL_ADDR(__NR_vgettimeofday))); - BUG_ON((unsigned long) &vtime != VSYSCALL_ADDR(__NR_vtime)); - BUG_ON((VSYSCALL_ADDR(0) != __fix_to_virt(VSYSCALL_FIRST_PAGE))); - BUG_ON((unsigned long) &vgetcpu != VSYSCALL_ADDR(__NR_vgetcpu)); - map_vsyscall(); -#ifdef CONFIG_SYSCTL - register_sysctl_table(kernel_root_table2); -#endif - on_each_cpu(cpu_vsyscall_init, NULL, 0, 1); - hotcpu_notifier(cpu_vsyscall_notifier, 0); - return 0; -} - -__initcall(vsyscall_init); diff --git a/arch/x86_64/kernel/x8664_ksyms.c b/arch/x86_64/kernel/x8664_ksyms.c deleted file mode 100644 index 77c25b307635..000000000000 --- a/arch/x86_64/kernel/x8664_ksyms.c +++ /dev/null @@ -1,62 +0,0 @@ -/* Exports for assembly files. - All C exports should go in the respective C files. */ - -#include <linux/module.h> -#include <linux/smp.h> - -#include <asm/semaphore.h> -#include <asm/processor.h> -#include <asm/uaccess.h> -#include <asm/pgtable.h> - -EXPORT_SYMBOL(kernel_thread); - -EXPORT_SYMBOL(__down_failed); -EXPORT_SYMBOL(__down_failed_interruptible); -EXPORT_SYMBOL(__down_failed_trylock); -EXPORT_SYMBOL(__up_wakeup); - -EXPORT_SYMBOL(__get_user_1); -EXPORT_SYMBOL(__get_user_2); -EXPORT_SYMBOL(__get_user_4); -EXPORT_SYMBOL(__get_user_8); -EXPORT_SYMBOL(__put_user_1); -EXPORT_SYMBOL(__put_user_2); -EXPORT_SYMBOL(__put_user_4); -EXPORT_SYMBOL(__put_user_8); - -EXPORT_SYMBOL(copy_user_generic); -EXPORT_SYMBOL(__copy_user_nocache); -EXPORT_SYMBOL(copy_from_user); -EXPORT_SYMBOL(copy_to_user); -EXPORT_SYMBOL(__copy_from_user_inatomic); - -EXPORT_SYMBOL(copy_page); -EXPORT_SYMBOL(clear_page); - -#ifdef CONFIG_SMP -extern void __write_lock_failed(rwlock_t *rw); -extern void __read_lock_failed(rwlock_t *rw); -EXPORT_SYMBOL(__write_lock_failed); -EXPORT_SYMBOL(__read_lock_failed); -#endif - -/* Export string functions. We normally rely on gcc builtin for most of these, - but gcc sometimes decides not to inline them. */ -#undef memcpy -#undef memset -#undef memmove - -extern void * memset(void *,int,__kernel_size_t); -extern void * memcpy(void *,const void *,__kernel_size_t); -extern void * __memcpy(void *,const void *,__kernel_size_t); - -EXPORT_SYMBOL(memset); -EXPORT_SYMBOL(memcpy); -EXPORT_SYMBOL(__memcpy); - -EXPORT_SYMBOL(empty_zero_page); -EXPORT_SYMBOL(init_level4_pgt); -EXPORT_SYMBOL(load_gs_index); - -EXPORT_SYMBOL(_proxy_pda); diff --git a/arch/x86_64/lib/Makefile b/arch/x86_64/lib/Makefile deleted file mode 100644 index c94327178398..000000000000 --- a/arch/x86_64/lib/Makefile +++ /dev/null @@ -1,13 +0,0 @@ -# -# Makefile for x86_64-specific library files. -# - -CFLAGS_csum-partial.o := -funroll-loops - -obj-y := io.o iomap_copy.o -obj-$(CONFIG_SMP) += msr-on-cpu.o - -lib-y := csum-partial.o csum-copy.o csum-wrappers.o delay.o \ - usercopy.o getuser.o putuser.o \ - thunk.o clear_page.o copy_page.o bitstr.o bitops.o -lib-y += memcpy.o memmove.o memset.o copy_user.o rwlock.o copy_user_nocache.o diff --git a/arch/x86_64/lib/bitops.c b/arch/x86_64/lib/bitops.c deleted file mode 100644 index 95b6d9639fba..000000000000 --- a/arch/x86_64/lib/bitops.c +++ /dev/null @@ -1,175 +0,0 @@ -#include <linux/bitops.h> - -#undef find_first_zero_bit -#undef find_next_zero_bit -#undef find_first_bit -#undef find_next_bit - -static inline long -__find_first_zero_bit(const unsigned long * addr, unsigned long size) -{ - long d0, d1, d2; - long res; - - /* - * We must test the size in words, not in bits, because - * otherwise incoming sizes in the range -63..-1 will not run - * any scasq instructions, and then the flags used by the je - * instruction will have whatever random value was in place - * before. Nobody should call us like that, but - * find_next_zero_bit() does when offset and size are at the - * same word and it fails to find a zero itself. - */ - size += 63; - size >>= 6; - if (!size) - return 0; - asm volatile( - " repe; scasq\n" - " je 1f\n" - " xorq -8(%%rdi),%%rax\n" - " subq $8,%%rdi\n" - " bsfq %%rax,%%rdx\n" - "1: subq %[addr],%%rdi\n" - " shlq $3,%%rdi\n" - " addq %%rdi,%%rdx" - :"=d" (res), "=&c" (d0), "=&D" (d1), "=&a" (d2) - :"0" (0ULL), "1" (size), "2" (addr), "3" (-1ULL), - [addr] "S" (addr) : "memory"); - /* - * Any register would do for [addr] above, but GCC tends to - * prefer rbx over rsi, even though rsi is readily available - * and doesn't have to be saved. - */ - return res; -} - -/** - * find_first_zero_bit - find the first zero bit in a memory region - * @addr: The address to start the search at - * @size: The maximum size to search - * - * Returns the bit-number of the first zero bit, not the number of the byte - * containing a bit. - */ -long find_first_zero_bit(const unsigned long * addr, unsigned long size) -{ - return __find_first_zero_bit (addr, size); -} - -/** - * find_next_zero_bit - find the first zero bit in a memory region - * @addr: The address to base the search on - * @offset: The bitnumber to start searching at - * @size: The maximum size to search - */ -long find_next_zero_bit (const unsigned long * addr, long size, long offset) -{ - const unsigned long * p = addr + (offset >> 6); - unsigned long set = 0; - unsigned long res, bit = offset&63; - - if (bit) { - /* - * Look for zero in first word - */ - asm("bsfq %1,%0\n\t" - "cmoveq %2,%0" - : "=r" (set) - : "r" (~(*p >> bit)), "r"(64L)); - if (set < (64 - bit)) - return set + offset; - set = 64 - bit; - p++; - } - /* - * No zero yet, search remaining full words for a zero - */ - res = __find_first_zero_bit (p, size - 64 * (p - addr)); - - return (offset + set + res); -} - -static inline long -__find_first_bit(const unsigned long * addr, unsigned long size) -{ - long d0, d1; - long res; - - /* - * We must test the size in words, not in bits, because - * otherwise incoming sizes in the range -63..-1 will not run - * any scasq instructions, and then the flags used by the jz - * instruction will have whatever random value was in place - * before. Nobody should call us like that, but - * find_next_bit() does when offset and size are at the same - * word and it fails to find a one itself. - */ - size += 63; - size >>= 6; - if (!size) - return 0; - asm volatile( - " repe; scasq\n" - " jz 1f\n" - " subq $8,%%rdi\n" - " bsfq (%%rdi),%%rax\n" - "1: subq %[addr],%%rdi\n" - " shlq $3,%%rdi\n" - " addq %%rdi,%%rax" - :"=a" (res), "=&c" (d0), "=&D" (d1) - :"0" (0ULL), "1" (size), "2" (addr), - [addr] "r" (addr) : "memory"); - return res; -} - -/** - * find_first_bit - find the first set bit in a memory region - * @addr: The address to start the search at - * @size: The maximum size to search - * - * Returns the bit-number of the first set bit, not the number of the byte - * containing a bit. - */ -long find_first_bit(const unsigned long * addr, unsigned long size) -{ - return __find_first_bit(addr,size); -} - -/** - * find_next_bit - find the first set bit in a memory region - * @addr: The address to base the search on - * @offset: The bitnumber to start searching at - * @size: The maximum size to search - */ -long find_next_bit(const unsigned long * addr, long size, long offset) -{ - const unsigned long * p = addr + (offset >> 6); - unsigned long set = 0, bit = offset & 63, res; - - if (bit) { - /* - * Look for nonzero in the first 64 bits: - */ - asm("bsfq %1,%0\n\t" - "cmoveq %2,%0\n\t" - : "=r" (set) - : "r" (*p >> bit), "r" (64L)); - if (set < (64 - bit)) - return set + offset; - set = 64 - bit; - p++; - } - /* - * No set bit yet, search remaining full words for a bit - */ - res = __find_first_bit (p, size - 64 * (p - addr)); - return (offset + set + res); -} - -#include <linux/module.h> - -EXPORT_SYMBOL(find_next_bit); -EXPORT_SYMBOL(find_first_bit); -EXPORT_SYMBOL(find_first_zero_bit); -EXPORT_SYMBOL(find_next_zero_bit); diff --git a/arch/x86_64/lib/bitstr.c b/arch/x86_64/lib/bitstr.c deleted file mode 100644 index 24676609a6ac..000000000000 --- a/arch/x86_64/lib/bitstr.c +++ /dev/null @@ -1,28 +0,0 @@ -#include <linux/module.h> -#include <linux/bitops.h> - -/* Find string of zero bits in a bitmap */ -unsigned long -find_next_zero_string(unsigned long *bitmap, long start, long nbits, int len) -{ - unsigned long n, end, i; - - again: - n = find_next_zero_bit(bitmap, nbits, start); - if (n == -1) - return -1; - - /* could test bitsliced, but it's hardly worth it */ - end = n+len; - if (end >= nbits) - return -1; - for (i = n+1; i < end; i++) { - if (test_bit(i, bitmap)) { - start = i+1; - goto again; - } - } - return n; -} - -EXPORT_SYMBOL(find_next_zero_string); diff --git a/arch/x86_64/lib/clear_page.S b/arch/x86_64/lib/clear_page.S deleted file mode 100644 index 9a10a78bb4a4..000000000000 --- a/arch/x86_64/lib/clear_page.S +++ /dev/null @@ -1,59 +0,0 @@ -#include <linux/linkage.h> -#include <asm/dwarf2.h> - -/* - * Zero a page. - * rdi page - */ - ALIGN -clear_page_c: - CFI_STARTPROC - movl $4096/8,%ecx - xorl %eax,%eax - rep stosq - ret - CFI_ENDPROC -ENDPROC(clear_page) - -ENTRY(clear_page) - CFI_STARTPROC - xorl %eax,%eax - movl $4096/64,%ecx - .p2align 4 -.Lloop: - decl %ecx -#define PUT(x) movq %rax,x*8(%rdi) - movq %rax,(%rdi) - PUT(1) - PUT(2) - PUT(3) - PUT(4) - PUT(5) - PUT(6) - PUT(7) - leaq 64(%rdi),%rdi - jnz .Lloop - nop - ret - CFI_ENDPROC -.Lclear_page_end: -ENDPROC(clear_page) - - /* Some CPUs run faster using the string instructions. - It is also a lot simpler. Use this when possible */ - -#include <asm/cpufeature.h> - - .section .altinstr_replacement,"ax" -1: .byte 0xeb /* jmp <disp8> */ - .byte (clear_page_c - clear_page) - (2f - 1b) /* offset */ -2: - .previous - .section .altinstructions,"a" - .align 8 - .quad clear_page - .quad 1b - .byte X86_FEATURE_REP_GOOD - .byte .Lclear_page_end - clear_page - .byte 2b - 1b - .previous diff --git a/arch/x86_64/lib/copy_page.S b/arch/x86_64/lib/copy_page.S deleted file mode 100644 index 727a5d46d2fc..000000000000 --- a/arch/x86_64/lib/copy_page.S +++ /dev/null @@ -1,119 +0,0 @@ -/* Written 2003 by Andi Kleen, based on a kernel by Evandro Menezes */ - -#include <linux/linkage.h> -#include <asm/dwarf2.h> - - ALIGN -copy_page_c: - CFI_STARTPROC - movl $4096/8,%ecx - rep movsq - ret - CFI_ENDPROC -ENDPROC(copy_page_c) - -/* Don't use streaming store because it's better when the target - ends up in cache. */ - -/* Could vary the prefetch distance based on SMP/UP */ - -ENTRY(copy_page) - CFI_STARTPROC - subq $3*8,%rsp - CFI_ADJUST_CFA_OFFSET 3*8 - movq %rbx,(%rsp) - CFI_REL_OFFSET rbx, 0 - movq %r12,1*8(%rsp) - CFI_REL_OFFSET r12, 1*8 - movq %r13,2*8(%rsp) - CFI_REL_OFFSET r13, 2*8 - - movl $(4096/64)-5,%ecx - .p2align 4 -.Loop64: - dec %rcx - - movq (%rsi), %rax - movq 8 (%rsi), %rbx - movq 16 (%rsi), %rdx - movq 24 (%rsi), %r8 - movq 32 (%rsi), %r9 - movq 40 (%rsi), %r10 - movq 48 (%rsi), %r11 - movq 56 (%rsi), %r12 - - prefetcht0 5*64(%rsi) - - movq %rax, (%rdi) - movq %rbx, 8 (%rdi) - movq %rdx, 16 (%rdi) - movq %r8, 24 (%rdi) - movq %r9, 32 (%rdi) - movq %r10, 40 (%rdi) - movq %r11, 48 (%rdi) - movq %r12, 56 (%rdi) - - leaq 64 (%rsi), %rsi - leaq 64 (%rdi), %rdi - - jnz .Loop64 - - movl $5,%ecx - .p2align 4 -.Loop2: - decl %ecx - - movq (%rsi), %rax - movq 8 (%rsi), %rbx - movq 16 (%rsi), %rdx - movq 24 (%rsi), %r8 - movq 32 (%rsi), %r9 - movq 40 (%rsi), %r10 - movq 48 (%rsi), %r11 - movq 56 (%rsi), %r12 - - movq %rax, (%rdi) - movq %rbx, 8 (%rdi) - movq %rdx, 16 (%rdi) - movq %r8, 24 (%rdi) - movq %r9, 32 (%rdi) - movq %r10, 40 (%rdi) - movq %r11, 48 (%rdi) - movq %r12, 56 (%rdi) - - leaq 64(%rdi),%rdi - leaq 64(%rsi),%rsi - - jnz .Loop2 - - movq (%rsp),%rbx - CFI_RESTORE rbx - movq 1*8(%rsp),%r12 - CFI_RESTORE r12 - movq 2*8(%rsp),%r13 - CFI_RESTORE r13 - addq $3*8,%rsp - CFI_ADJUST_CFA_OFFSET -3*8 - ret -.Lcopy_page_end: - CFI_ENDPROC -ENDPROC(copy_page) - - /* Some CPUs run faster using the string copy instructions. - It is also a lot simpler. Use this when possible */ - -#include <asm/cpufeature.h> - - .section .altinstr_replacement,"ax" -1: .byte 0xeb /* jmp <disp8> */ - .byte (copy_page_c - copy_page) - (2f - 1b) /* offset */ -2: - .previous - .section .altinstructions,"a" - .align 8 - .quad copy_page - .quad 1b - .byte X86_FEATURE_REP_GOOD - .byte .Lcopy_page_end - copy_page - .byte 2b - 1b - .previous diff --git a/arch/x86_64/lib/copy_user.S b/arch/x86_64/lib/copy_user.S deleted file mode 100644 index 70bebd310408..000000000000 --- a/arch/x86_64/lib/copy_user.S +++ /dev/null @@ -1,354 +0,0 @@ -/* Copyright 2002 Andi Kleen, SuSE Labs. - * Subject to the GNU Public License v2. - * - * Functions to copy from and to user space. - */ - -#include <linux/linkage.h> -#include <asm/dwarf2.h> - -#define FIX_ALIGNMENT 1 - -#include <asm/current.h> -#include <asm/asm-offsets.h> -#include <asm/thread_info.h> -#include <asm/cpufeature.h> - - .macro ALTERNATIVE_JUMP feature,orig,alt -0: - .byte 0xe9 /* 32bit jump */ - .long \orig-1f /* by default jump to orig */ -1: - .section .altinstr_replacement,"ax" -2: .byte 0xe9 /* near jump with 32bit immediate */ - .long \alt-1b /* offset */ /* or alternatively to alt */ - .previous - .section .altinstructions,"a" - .align 8 - .quad 0b - .quad 2b - .byte \feature /* when feature is set */ - .byte 5 - .byte 5 - .previous - .endm - -/* Standard copy_to_user with segment limit checking */ -ENTRY(copy_to_user) - CFI_STARTPROC - GET_THREAD_INFO(%rax) - movq %rdi,%rcx - addq %rdx,%rcx - jc bad_to_user - cmpq threadinfo_addr_limit(%rax),%rcx - jae bad_to_user - xorl %eax,%eax /* clear zero flag */ - ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string - CFI_ENDPROC - -ENTRY(copy_user_generic) - CFI_STARTPROC - movl $1,%ecx /* set zero flag */ - ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string - CFI_ENDPROC - -ENTRY(__copy_from_user_inatomic) - CFI_STARTPROC - xorl %ecx,%ecx /* clear zero flag */ - ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string - CFI_ENDPROC - -/* Standard copy_from_user with segment limit checking */ -ENTRY(copy_from_user) - CFI_STARTPROC - GET_THREAD_INFO(%rax) - movq %rsi,%rcx - addq %rdx,%rcx - jc bad_from_user - cmpq threadinfo_addr_limit(%rax),%rcx - jae bad_from_user - movl $1,%ecx /* set zero flag */ - ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string - CFI_ENDPROC -ENDPROC(copy_from_user) - - .section .fixup,"ax" - /* must zero dest */ -bad_from_user: - CFI_STARTPROC - movl %edx,%ecx - xorl %eax,%eax - rep - stosb -bad_to_user: - movl %edx,%eax - ret - CFI_ENDPROC -END(bad_from_user) - .previous - - -/* - * copy_user_generic_unrolled - memory copy with exception handling. - * This version is for CPUs like P4 that don't have efficient micro code for rep movsq - * - * Input: - * rdi destination - * rsi source - * rdx count - * ecx zero flag -- if true zero destination on error - * - * Output: - * eax uncopied bytes or 0 if successful. - */ -ENTRY(copy_user_generic_unrolled) - CFI_STARTPROC - pushq %rbx - CFI_ADJUST_CFA_OFFSET 8 - CFI_REL_OFFSET rbx, 0 - pushq %rcx - CFI_ADJUST_CFA_OFFSET 8 - CFI_REL_OFFSET rcx, 0 - xorl %eax,%eax /*zero for the exception handler */ - -#ifdef FIX_ALIGNMENT - /* check for bad alignment of destination */ - movl %edi,%ecx - andl $7,%ecx - jnz .Lbad_alignment -.Lafter_bad_alignment: -#endif - - movq %rdx,%rcx - - movl $64,%ebx - shrq $6,%rdx - decq %rdx - js .Lhandle_tail - - .p2align 4 -.Lloop: -.Ls1: movq (%rsi),%r11 -.Ls2: movq 1*8(%rsi),%r8 -.Ls3: movq 2*8(%rsi),%r9 -.Ls4: movq 3*8(%rsi),%r10 -.Ld1: movq %r11,(%rdi) -.Ld2: movq %r8,1*8(%rdi) -.Ld3: movq %r9,2*8(%rdi) -.Ld4: movq %r10,3*8(%rdi) - -.Ls5: movq 4*8(%rsi),%r11 -.Ls6: movq 5*8(%rsi),%r8 -.Ls7: movq 6*8(%rsi),%r9 -.Ls8: movq 7*8(%rsi),%r10 -.Ld5: movq %r11,4*8(%rdi) -.Ld6: movq %r8,5*8(%rdi) -.Ld7: movq %r9,6*8(%rdi) -.Ld8: movq %r10,7*8(%rdi) - - decq %rdx - - leaq 64(%rsi),%rsi - leaq 64(%rdi),%rdi - - jns .Lloop - - .p2align 4 -.Lhandle_tail: - movl %ecx,%edx - andl $63,%ecx - shrl $3,%ecx - jz .Lhandle_7 - movl $8,%ebx - .p2align 4 -.Lloop_8: -.Ls9: movq (%rsi),%r8 -.Ld9: movq %r8,(%rdi) - decl %ecx - leaq 8(%rdi),%rdi - leaq 8(%rsi),%rsi - jnz .Lloop_8 - -.Lhandle_7: - movl %edx,%ecx - andl $7,%ecx - jz .Lende - .p2align 4 -.Lloop_1: -.Ls10: movb (%rsi),%bl -.Ld10: movb %bl,(%rdi) - incq %rdi - incq %rsi - decl %ecx - jnz .Lloop_1 - - CFI_REMEMBER_STATE -.Lende: - popq %rcx - CFI_ADJUST_CFA_OFFSET -8 - CFI_RESTORE rcx - popq %rbx - CFI_ADJUST_CFA_OFFSET -8 - CFI_RESTORE rbx - ret - CFI_RESTORE_STATE - -#ifdef FIX_ALIGNMENT - /* align destination */ - .p2align 4 -.Lbad_alignment: - movl $8,%r9d - subl %ecx,%r9d - movl %r9d,%ecx - cmpq %r9,%rdx - jz .Lhandle_7 - js .Lhandle_7 -.Lalign_1: -.Ls11: movb (%rsi),%bl -.Ld11: movb %bl,(%rdi) - incq %rsi - incq %rdi - decl %ecx - jnz .Lalign_1 - subq %r9,%rdx - jmp .Lafter_bad_alignment -#endif - - /* table sorted by exception address */ - .section __ex_table,"a" - .align 8 - .quad .Ls1,.Ls1e - .quad .Ls2,.Ls2e - .quad .Ls3,.Ls3e - .quad .Ls4,.Ls4e - .quad .Ld1,.Ls1e - .quad .Ld2,.Ls2e - .quad .Ld3,.Ls3e - .quad .Ld4,.Ls4e - .quad .Ls5,.Ls5e - .quad .Ls6,.Ls6e - .quad .Ls7,.Ls7e - .quad .Ls8,.Ls8e - .quad .Ld5,.Ls5e - .quad .Ld6,.Ls6e - .quad .Ld7,.Ls7e - .quad .Ld8,.Ls8e - .quad .Ls9,.Le_quad - .quad .Ld9,.Le_quad - .quad .Ls10,.Le_byte - .quad .Ld10,.Le_byte -#ifdef FIX_ALIGNMENT - .quad .Ls11,.Lzero_rest - .quad .Ld11,.Lzero_rest -#endif - .quad .Le5,.Le_zero - .previous - - /* compute 64-offset for main loop. 8 bytes accuracy with error on the - pessimistic side. this is gross. it would be better to fix the - interface. */ - /* eax: zero, ebx: 64 */ -.Ls1e: addl $8,%eax -.Ls2e: addl $8,%eax -.Ls3e: addl $8,%eax -.Ls4e: addl $8,%eax -.Ls5e: addl $8,%eax -.Ls6e: addl $8,%eax -.Ls7e: addl $8,%eax -.Ls8e: addl $8,%eax - addq %rbx,%rdi /* +64 */ - subq %rax,%rdi /* correct destination with computed offset */ - - shlq $6,%rdx /* loop counter * 64 (stride length) */ - addq %rax,%rdx /* add offset to loopcnt */ - andl $63,%ecx /* remaining bytes */ - addq %rcx,%rdx /* add them */ - jmp .Lzero_rest - - /* exception on quad word loop in tail handling */ - /* ecx: loopcnt/8, %edx: length, rdi: correct */ -.Le_quad: - shll $3,%ecx - andl $7,%edx - addl %ecx,%edx - /* edx: bytes to zero, rdi: dest, eax:zero */ -.Lzero_rest: - cmpl $0,(%rsp) - jz .Le_zero - movq %rdx,%rcx -.Le_byte: - xorl %eax,%eax -.Le5: rep - stosb - /* when there is another exception while zeroing the rest just return */ -.Le_zero: - movq %rdx,%rax - jmp .Lende - CFI_ENDPROC -ENDPROC(copy_user_generic) - - - /* Some CPUs run faster using the string copy instructions. - This is also a lot simpler. Use them when possible. - Patch in jmps to this code instead of copying it fully - to avoid unwanted aliasing in the exception tables. */ - - /* rdi destination - * rsi source - * rdx count - * ecx zero flag - * - * Output: - * eax uncopied bytes or 0 if successfull. - * - * Only 4GB of copy is supported. This shouldn't be a problem - * because the kernel normally only writes from/to page sized chunks - * even if user space passed a longer buffer. - * And more would be dangerous because both Intel and AMD have - * errata with rep movsq > 4GB. If someone feels the need to fix - * this please consider this. - */ -ENTRY(copy_user_generic_string) - CFI_STARTPROC - movl %ecx,%r8d /* save zero flag */ - movl %edx,%ecx - shrl $3,%ecx - andl $7,%edx - jz 10f -1: rep - movsq - movl %edx,%ecx -2: rep - movsb -9: movl %ecx,%eax - ret - - /* multiple of 8 byte */ -10: rep - movsq - xor %eax,%eax - ret - - /* exception handling */ -3: lea (%rdx,%rcx,8),%rax /* exception on quad loop */ - jmp 6f -5: movl %ecx,%eax /* exception on byte loop */ - /* eax: left over bytes */ -6: testl %r8d,%r8d /* zero flag set? */ - jz 7f - movl %eax,%ecx /* initialize x86 loop counter */ - push %rax - xorl %eax,%eax -8: rep - stosb /* zero the rest */ -11: pop %rax -7: ret - CFI_ENDPROC -END(copy_user_generic_c) - - .section __ex_table,"a" - .quad 1b,3b - .quad 2b,5b - .quad 8b,11b - .quad 10b,3b - .previous diff --git a/arch/x86_64/lib/copy_user_nocache.S b/arch/x86_64/lib/copy_user_nocache.S deleted file mode 100644 index 4620efb12f13..000000000000 --- a/arch/x86_64/lib/copy_user_nocache.S +++ /dev/null @@ -1,217 +0,0 @@ -/* Copyright 2002 Andi Kleen, SuSE Labs. - * Subject to the GNU Public License v2. - * - * Functions to copy from and to user space. - */ - -#include <linux/linkage.h> -#include <asm/dwarf2.h> - -#define FIX_ALIGNMENT 1 - -#include <asm/current.h> -#include <asm/asm-offsets.h> -#include <asm/thread_info.h> -#include <asm/cpufeature.h> - -/* - * copy_user_nocache - Uncached memory copy with exception handling - * This will force destination/source out of cache for more performance. - * - * Input: - * rdi destination - * rsi source - * rdx count - * rcx zero flag when 1 zero on exception - * - * Output: - * eax uncopied bytes or 0 if successful. - */ -ENTRY(__copy_user_nocache) - CFI_STARTPROC - pushq %rbx - CFI_ADJUST_CFA_OFFSET 8 - CFI_REL_OFFSET rbx, 0 - pushq %rcx /* save zero flag */ - CFI_ADJUST_CFA_OFFSET 8 - CFI_REL_OFFSET rcx, 0 - - xorl %eax,%eax /* zero for the exception handler */ - -#ifdef FIX_ALIGNMENT - /* check for bad alignment of destination */ - movl %edi,%ecx - andl $7,%ecx - jnz .Lbad_alignment -.Lafter_bad_alignment: -#endif - - movq %rdx,%rcx - - movl $64,%ebx - shrq $6,%rdx - decq %rdx - js .Lhandle_tail - - .p2align 4 -.Lloop: -.Ls1: movq (%rsi),%r11 -.Ls2: movq 1*8(%rsi),%r8 -.Ls3: movq 2*8(%rsi),%r9 -.Ls4: movq 3*8(%rsi),%r10 -.Ld1: movnti %r11,(%rdi) -.Ld2: movnti %r8,1*8(%rdi) -.Ld3: movnti %r9,2*8(%rdi) -.Ld4: movnti %r10,3*8(%rdi) - -.Ls5: movq 4*8(%rsi),%r11 -.Ls6: movq 5*8(%rsi),%r8 -.Ls7: movq 6*8(%rsi),%r9 -.Ls8: movq 7*8(%rsi),%r10 -.Ld5: movnti %r11,4*8(%rdi) -.Ld6: movnti %r8,5*8(%rdi) -.Ld7: movnti %r9,6*8(%rdi) -.Ld8: movnti %r10,7*8(%rdi) - - dec %rdx - - leaq 64(%rsi),%rsi - leaq 64(%rdi),%rdi - - jns .Lloop - - .p2align 4 -.Lhandle_tail: - movl %ecx,%edx - andl $63,%ecx - shrl $3,%ecx - jz .Lhandle_7 - movl $8,%ebx - .p2align 4 -.Lloop_8: -.Ls9: movq (%rsi),%r8 -.Ld9: movnti %r8,(%rdi) - decl %ecx - leaq 8(%rdi),%rdi - leaq 8(%rsi),%rsi - jnz .Lloop_8 - -.Lhandle_7: - movl %edx,%ecx - andl $7,%ecx - jz .Lende - .p2align 4 -.Lloop_1: -.Ls10: movb (%rsi),%bl -.Ld10: movb %bl,(%rdi) - incq %rdi - incq %rsi - decl %ecx - jnz .Lloop_1 - - CFI_REMEMBER_STATE -.Lende: - popq %rcx - CFI_ADJUST_CFA_OFFSET -8 - CFI_RESTORE %rcx - popq %rbx - CFI_ADJUST_CFA_OFFSET -8 - CFI_RESTORE rbx - ret - CFI_RESTORE_STATE - -#ifdef FIX_ALIGNMENT - /* align destination */ - .p2align 4 -.Lbad_alignment: - movl $8,%r9d - subl %ecx,%r9d - movl %r9d,%ecx - cmpq %r9,%rdx - jz .Lhandle_7 - js .Lhandle_7 -.Lalign_1: -.Ls11: movb (%rsi),%bl -.Ld11: movb %bl,(%rdi) - incq %rsi - incq %rdi - decl %ecx - jnz .Lalign_1 - subq %r9,%rdx - jmp .Lafter_bad_alignment -#endif - - /* table sorted by exception address */ - .section __ex_table,"a" - .align 8 - .quad .Ls1,.Ls1e - .quad .Ls2,.Ls2e - .quad .Ls3,.Ls3e - .quad .Ls4,.Ls4e - .quad .Ld1,.Ls1e - .quad .Ld2,.Ls2e - .quad .Ld3,.Ls3e - .quad .Ld4,.Ls4e - .quad .Ls5,.Ls5e - .quad .Ls6,.Ls6e - .quad .Ls7,.Ls7e - .quad .Ls8,.Ls8e - .quad .Ld5,.Ls5e - .quad .Ld6,.Ls6e - .quad .Ld7,.Ls7e - .quad .Ld8,.Ls8e - .quad .Ls9,.Le_quad - .quad .Ld9,.Le_quad - .quad .Ls10,.Le_byte - .quad .Ld10,.Le_byte -#ifdef FIX_ALIGNMENT - .quad .Ls11,.Lzero_rest - .quad .Ld11,.Lzero_rest -#endif - .quad .Le5,.Le_zero - .previous - - /* compute 64-offset for main loop. 8 bytes accuracy with error on the - pessimistic side. this is gross. it would be better to fix the - interface. */ - /* eax: zero, ebx: 64 */ -.Ls1e: addl $8,%eax -.Ls2e: addl $8,%eax -.Ls3e: addl $8,%eax -.Ls4e: addl $8,%eax -.Ls5e: addl $8,%eax -.Ls6e: addl $8,%eax -.Ls7e: addl $8,%eax -.Ls8e: addl $8,%eax - addq %rbx,%rdi /* +64 */ - subq %rax,%rdi /* correct destination with computed offset */ - - shlq $6,%rdx /* loop counter * 64 (stride length) */ - addq %rax,%rdx /* add offset to loopcnt */ - andl $63,%ecx /* remaining bytes */ - addq %rcx,%rdx /* add them */ - jmp .Lzero_rest - - /* exception on quad word loop in tail handling */ - /* ecx: loopcnt/8, %edx: length, rdi: correct */ -.Le_quad: - shll $3,%ecx - andl $7,%edx - addl %ecx,%edx - /* edx: bytes to zero, rdi: dest, eax:zero */ -.Lzero_rest: - cmpl $0,(%rsp) /* zero flag set? */ - jz .Le_zero - movq %rdx,%rcx -.Le_byte: - xorl %eax,%eax -.Le5: rep - stosb - /* when there is another exception while zeroing the rest just return */ -.Le_zero: - movq %rdx,%rax - jmp .Lende - CFI_ENDPROC -ENDPROC(__copy_user_nocache) - - diff --git a/arch/x86_64/lib/csum-copy.S b/arch/x86_64/lib/csum-copy.S deleted file mode 100644 index f0dba36578ea..000000000000 --- a/arch/x86_64/lib/csum-copy.S +++ /dev/null @@ -1,249 +0,0 @@ -/* - * Copyright 2002,2003 Andi Kleen, SuSE Labs. - * - * This file is subject to the terms and conditions of the GNU General Public - * License. See the file COPYING in the main directory of this archive - * for more details. No warranty for anything given at all. - */ -#include <linux/linkage.h> -#include <asm/dwarf2.h> -#include <asm/errno.h> - -/* - * Checksum copy with exception handling. - * On exceptions src_err_ptr or dst_err_ptr is set to -EFAULT and the - * destination is zeroed. - * - * Input - * rdi source - * rsi destination - * edx len (32bit) - * ecx sum (32bit) - * r8 src_err_ptr (int) - * r9 dst_err_ptr (int) - * - * Output - * eax 64bit sum. undefined in case of exception. - * - * Wrappers need to take care of valid exception sum and zeroing. - * They also should align source or destination to 8 bytes. - */ - - .macro source -10: - .section __ex_table,"a" - .align 8 - .quad 10b,.Lbad_source - .previous - .endm - - .macro dest -20: - .section __ex_table,"a" - .align 8 - .quad 20b,.Lbad_dest - .previous - .endm - - .macro ignore L=.Lignore -30: - .section __ex_table,"a" - .align 8 - .quad 30b,\L - .previous - .endm - - -ENTRY(csum_partial_copy_generic) - CFI_STARTPROC - cmpl $3*64,%edx - jle .Lignore - -.Lignore: - subq $7*8,%rsp - CFI_ADJUST_CFA_OFFSET 7*8 - movq %rbx,2*8(%rsp) - CFI_REL_OFFSET rbx, 2*8 - movq %r12,3*8(%rsp) - CFI_REL_OFFSET r12, 3*8 - movq %r14,4*8(%rsp) - CFI_REL_OFFSET r14, 4*8 - movq %r13,5*8(%rsp) - CFI_REL_OFFSET r13, 5*8 - movq %rbp,6*8(%rsp) - CFI_REL_OFFSET rbp, 6*8 - - movq %r8,(%rsp) - movq %r9,1*8(%rsp) - - movl %ecx,%eax - movl %edx,%ecx - - xorl %r9d,%r9d - movq %rcx,%r12 - - shrq $6,%r12 - jz .Lhandle_tail /* < 64 */ - - clc - - /* main loop. clear in 64 byte blocks */ - /* r9: zero, r8: temp2, rbx: temp1, rax: sum, rcx: saved length */ - /* r11: temp3, rdx: temp4, r12 loopcnt */ - /* r10: temp5, rbp: temp6, r14 temp7, r13 temp8 */ - .p2align 4 -.Lloop: - source - movq (%rdi),%rbx - source - movq 8(%rdi),%r8 - source - movq 16(%rdi),%r11 - source - movq 24(%rdi),%rdx - - source - movq 32(%rdi),%r10 - source - movq 40(%rdi),%rbp - source - movq 48(%rdi),%r14 - source - movq 56(%rdi),%r13 - - ignore 2f - prefetcht0 5*64(%rdi) -2: - adcq %rbx,%rax - adcq %r8,%rax - adcq %r11,%rax - adcq %rdx,%rax - adcq %r10,%rax - adcq %rbp,%rax - adcq %r14,%rax - adcq %r13,%rax - - decl %r12d - - dest - movq %rbx,(%rsi) - dest - movq %r8,8(%rsi) - dest - movq %r11,16(%rsi) - dest - movq %rdx,24(%rsi) - - dest - movq %r10,32(%rsi) - dest - movq %rbp,40(%rsi) - dest - movq %r14,48(%rsi) - dest - movq %r13,56(%rsi) - -3: - - leaq 64(%rdi),%rdi - leaq 64(%rsi),%rsi - - jnz .Lloop - - adcq %r9,%rax - - /* do last upto 56 bytes */ -.Lhandle_tail: - /* ecx: count */ - movl %ecx,%r10d - andl $63,%ecx - shrl $3,%ecx - jz .Lfold - clc - .p2align 4 -.Lloop_8: - source - movq (%rdi),%rbx - adcq %rbx,%rax - decl %ecx - dest - movq %rbx,(%rsi) - leaq 8(%rsi),%rsi /* preserve carry */ - leaq 8(%rdi),%rdi - jnz .Lloop_8 - adcq %r9,%rax /* add in carry */ - -.Lfold: - /* reduce checksum to 32bits */ - movl %eax,%ebx - shrq $32,%rax - addl %ebx,%eax - adcl %r9d,%eax - - /* do last upto 6 bytes */ -.Lhandle_7: - movl %r10d,%ecx - andl $7,%ecx - shrl $1,%ecx - jz .Lhandle_1 - movl $2,%edx - xorl %ebx,%ebx - clc - .p2align 4 -.Lloop_1: - source - movw (%rdi),%bx - adcl %ebx,%eax - decl %ecx - dest - movw %bx,(%rsi) - leaq 2(%rdi),%rdi - leaq 2(%rsi),%rsi - jnz .Lloop_1 - adcl %r9d,%eax /* add in carry */ - - /* handle last odd byte */ -.Lhandle_1: - testl $1,%r10d - jz .Lende - xorl %ebx,%ebx - source - movb (%rdi),%bl - dest - movb %bl,(%rsi) - addl %ebx,%eax - adcl %r9d,%eax /* carry */ - - CFI_REMEMBER_STATE -.Lende: - movq 2*8(%rsp),%rbx - CFI_RESTORE rbx - movq 3*8(%rsp),%r12 - CFI_RESTORE r12 - movq 4*8(%rsp),%r14 - CFI_RESTORE r14 - movq 5*8(%rsp),%r13 - CFI_RESTORE r13 - movq 6*8(%rsp),%rbp - CFI_RESTORE rbp - addq $7*8,%rsp - CFI_ADJUST_CFA_OFFSET -7*8 - ret - CFI_RESTORE_STATE - - /* Exception handlers. Very simple, zeroing is done in the wrappers */ -.Lbad_source: - movq (%rsp),%rax - testq %rax,%rax - jz .Lende - movl $-EFAULT,(%rax) - jmp .Lende - -.Lbad_dest: - movq 8(%rsp),%rax - testq %rax,%rax - jz .Lende - movl $-EFAULT,(%rax) - jmp .Lende - CFI_ENDPROC -ENDPROC(csum_partial_copy_generic) diff --git a/arch/x86_64/lib/csum-partial.c b/arch/x86_64/lib/csum-partial.c deleted file mode 100644 index bc503f506903..000000000000 --- a/arch/x86_64/lib/csum-partial.c +++ /dev/null @@ -1,150 +0,0 @@ -/* - * arch/x86_64/lib/csum-partial.c - * - * This file contains network checksum routines that are better done - * in an architecture-specific manner due to speed. - */ - -#include <linux/compiler.h> -#include <linux/module.h> -#include <asm/checksum.h> - -static inline unsigned short from32to16(unsigned a) -{ - unsigned short b = a >> 16; - asm("addw %w2,%w0\n\t" - "adcw $0,%w0\n" - : "=r" (b) - : "0" (b), "r" (a)); - return b; -} - -/* - * Do a 64-bit checksum on an arbitrary memory area. - * Returns a 32bit checksum. - * - * This isn't as time critical as it used to be because many NICs - * do hardware checksumming these days. - * - * Things tried and found to not make it faster: - * Manual Prefetching - * Unrolling to an 128 bytes inner loop. - * Using interleaving with more registers to break the carry chains. - */ -static unsigned do_csum(const unsigned char *buff, unsigned len) -{ - unsigned odd, count; - unsigned long result = 0; - - if (unlikely(len == 0)) - return result; - odd = 1 & (unsigned long) buff; - if (unlikely(odd)) { - result = *buff << 8; - len--; - buff++; - } - count = len >> 1; /* nr of 16-bit words.. */ - if (count) { - if (2 & (unsigned long) buff) { - result += *(unsigned short *)buff; - count--; - len -= 2; - buff += 2; - } - count >>= 1; /* nr of 32-bit words.. */ - if (count) { - unsigned long zero; - unsigned count64; - if (4 & (unsigned long) buff) { - result += *(unsigned int *) buff; - count--; - len -= 4; - buff += 4; - } - count >>= 1; /* nr of 64-bit words.. */ - - /* main loop using 64byte blocks */ - zero = 0; - count64 = count >> 3; - while (count64) { - asm("addq 0*8(%[src]),%[res]\n\t" - "adcq 1*8(%[src]),%[res]\n\t" - "adcq 2*8(%[src]),%[res]\n\t" - "adcq 3*8(%[src]),%[res]\n\t" - "adcq 4*8(%[src]),%[res]\n\t" - "adcq 5*8(%[src]),%[res]\n\t" - "adcq 6*8(%[src]),%[res]\n\t" - "adcq 7*8(%[src]),%[res]\n\t" - "adcq %[zero],%[res]" - : [res] "=r" (result) - : [src] "r" (buff), [zero] "r" (zero), - "[res]" (result)); - buff += 64; - count64--; - } - - /* last upto 7 8byte blocks */ - count %= 8; - while (count) { - asm("addq %1,%0\n\t" - "adcq %2,%0\n" - : "=r" (result) - : "m" (*(unsigned long *)buff), - "r" (zero), "0" (result)); - --count; - buff += 8; - } - result = add32_with_carry(result>>32, - result&0xffffffff); - - if (len & 4) { - result += *(unsigned int *) buff; - buff += 4; - } - } - if (len & 2) { - result += *(unsigned short *) buff; - buff += 2; - } - } - if (len & 1) - result += *buff; - result = add32_with_carry(result>>32, result & 0xffffffff); - if (unlikely(odd)) { - result = from32to16(result); - result = ((result >> 8) & 0xff) | ((result & 0xff) << 8); - } - return result; -} - -/* - * computes the checksum of a memory block at buff, length len, - * and adds in "sum" (32-bit) - * - * returns a 32-bit number suitable for feeding into itself - * or csum_tcpudp_magic - * - * this function must be called with even lengths, except - * for the last fragment, which may be odd - * - * it's best to have buff aligned on a 64-bit boundary - */ -__wsum csum_partial(const void *buff, int len, __wsum sum) -{ - return (__force __wsum)add32_with_carry(do_csum(buff, len), - (__force u32)sum); -} - -EXPORT_SYMBOL(csum_partial); - -/* - * this routine is used for miscellaneous IP-like checksums, mainly - * in icmp.c - */ -__sum16 ip_compute_csum(const void *buff, int len) -{ - return csum_fold(csum_partial(buff,len,0)); -} -EXPORT_SYMBOL(ip_compute_csum); - diff --git a/arch/x86_64/lib/csum-wrappers.c b/arch/x86_64/lib/csum-wrappers.c deleted file mode 100644 index fd42a4a095fc..000000000000 --- a/arch/x86_64/lib/csum-wrappers.c +++ /dev/null @@ -1,135 +0,0 @@ -/* Copyright 2002,2003 Andi Kleen, SuSE Labs. - * Subject to the GNU Public License v.2 - * - * Wrappers of assembly checksum functions for x86-64. - */ - -#include <asm/checksum.h> -#include <linux/module.h> - -/** - * csum_partial_copy_from_user - Copy and checksum from user space. - * @src: source address (user space) - * @dst: destination address - * @len: number of bytes to be copied. - * @isum: initial sum that is added into the result (32bit unfolded) - * @errp: set to -EFAULT for an bad source address. - * - * Returns an 32bit unfolded checksum of the buffer. - * src and dst are best aligned to 64bits. - */ -__wsum -csum_partial_copy_from_user(const void __user *src, void *dst, - int len, __wsum isum, int *errp) -{ - might_sleep(); - *errp = 0; - if (likely(access_ok(VERIFY_READ,src, len))) { - /* Why 6, not 7? To handle odd addresses aligned we - would need to do considerable complications to fix the - checksum which is defined as an 16bit accumulator. The - fix alignment code is primarily for performance - compatibility with 32bit and that will handle odd - addresses slowly too. */ - if (unlikely((unsigned long)src & 6)) { - while (((unsigned long)src & 6) && len >= 2) { - __u16 val16; - *errp = __get_user(val16, (const __u16 __user *)src); - if (*errp) - return isum; - *(__u16 *)dst = val16; - isum = (__force __wsum)add32_with_carry( - (__force unsigned)isum, val16); - src += 2; - dst += 2; - len -= 2; - } - } - isum = csum_partial_copy_generic((__force const void *)src, - dst, len, isum, errp, NULL); - if (likely(*errp == 0)) - return isum; - } - *errp = -EFAULT; - memset(dst,0,len); - return isum; -} - -EXPORT_SYMBOL(csum_partial_copy_from_user); - -/** - * csum_partial_copy_to_user - Copy and checksum to user space. - * @src: source address - * @dst: destination address (user space) - * @len: number of bytes to be copied. - * @isum: initial sum that is added into the result (32bit unfolded) - * @errp: set to -EFAULT for an bad destination address. - * - * Returns an 32bit unfolded checksum of the buffer. - * src and dst are best aligned to 64bits. - */ -__wsum -csum_partial_copy_to_user(const void *src, void __user *dst, - int len, __wsum isum, int *errp) -{ - might_sleep(); - if (unlikely(!access_ok(VERIFY_WRITE, dst, len))) { - *errp = -EFAULT; - return 0; - } - - if (unlikely((unsigned long)dst & 6)) { - while (((unsigned long)dst & 6) && len >= 2) { - __u16 val16 = *(__u16 *)src; - isum = (__force __wsum)add32_with_carry( - (__force unsigned)isum, val16); - *errp = __put_user(val16, (__u16 __user *)dst); - if (*errp) - return isum; - src += 2; - dst += 2; - len -= 2; - } - } - - *errp = 0; - return csum_partial_copy_generic(src, (void __force *)dst,len,isum,NULL,errp); -} - -EXPORT_SYMBOL(csum_partial_copy_to_user); - -/** - * csum_partial_copy_nocheck - Copy and checksum. - * @src: source address - * @dst: destination address - * @len: number of bytes to be copied. - * @isum: initial sum that is added into the result (32bit unfolded) - * - * Returns an 32bit unfolded checksum of the buffer. - */ -__wsum -csum_partial_copy_nocheck(const void *src, void *dst, int len, __wsum sum) -{ - return csum_partial_copy_generic(src,dst,len,sum,NULL,NULL); -} -EXPORT_SYMBOL(csum_partial_copy_nocheck); - -__sum16 csum_ipv6_magic(const struct in6_addr *saddr, - const struct in6_addr *daddr, - __u32 len, unsigned short proto, __wsum sum) -{ - __u64 rest, sum64; - - rest = (__force __u64)htonl(len) + (__force __u64)htons(proto) + - (__force __u64)sum; - asm(" addq (%[saddr]),%[sum]\n" - " adcq 8(%[saddr]),%[sum]\n" - " adcq (%[daddr]),%[sum]\n" - " adcq 8(%[daddr]),%[sum]\n" - " adcq $0,%[sum]\n" - : [sum] "=r" (sum64) - : "[sum]" (rest),[saddr] "r" (saddr), [daddr] "r" (daddr)); - return csum_fold((__force __wsum)add32_with_carry(sum64 & 0xffffffff, sum64>>32)); -} - -EXPORT_SYMBOL(csum_ipv6_magic); diff --git a/arch/x86_64/lib/delay.c b/arch/x86_64/lib/delay.c deleted file mode 100644 index 2dbebd308347..000000000000 --- a/arch/x86_64/lib/delay.c +++ /dev/null @@ -1,57 +0,0 @@ -/* - * Precise Delay Loops for x86-64 - * - * Copyright (C) 1993 Linus Torvalds - * Copyright (C) 1997 Martin Mares <mj@atrey.karlin.mff.cuni.cz> - * - * The __delay function must _NOT_ be inlined as its execution time - * depends wildly on alignment on many x86 processors. - */ - -#include <linux/module.h> -#include <linux/sched.h> -#include <linux/delay.h> -#include <asm/delay.h> -#include <asm/msr.h> - -#ifdef CONFIG_SMP -#include <asm/smp.h> -#endif - -int read_current_timer(unsigned long *timer_value) -{ - rdtscll(*timer_value); - return 0; -} - -void __delay(unsigned long loops) -{ - unsigned bclock, now; - - rdtscl(bclock); - do - { - rep_nop(); - rdtscl(now); - } - while((now-bclock) < loops); -} -EXPORT_SYMBOL(__delay); - -inline void __const_udelay(unsigned long xloops) -{ - __delay(((xloops * HZ * cpu_data[raw_smp_processor_id()].loops_per_jiffy) >> 32) + 1); -} -EXPORT_SYMBOL(__const_udelay); - -void __udelay(unsigned long usecs) -{ - __const_udelay(usecs * 0x000010c7); /* 2**32 / 1000000 (rounded up) */ -} -EXPORT_SYMBOL(__udelay); - -void __ndelay(unsigned long nsecs) -{ - __const_udelay(nsecs * 0x00005); /* 2**32 / 1000000000 (rounded up) */ -} -EXPORT_SYMBOL(__ndelay); diff --git a/arch/x86_64/lib/getuser.S b/arch/x86_64/lib/getuser.S deleted file mode 100644 index 5448876261f8..000000000000 --- a/arch/x86_64/lib/getuser.S +++ /dev/null @@ -1,109 +0,0 @@ -/* - * __get_user functions. - * - * (C) Copyright 1998 Linus Torvalds - * (C) Copyright 2005 Andi Kleen - * - * These functions have a non-standard call interface - * to make them more efficient, especially as they - * return an error value in addition to the "real" - * return value. - */ - -/* - * __get_user_X - * - * Inputs: %rcx contains the address. - * The register is modified, but all changes are undone - * before returning because the C code doesn't know about it. - * - * Outputs: %rax is error code (0 or -EFAULT) - * %rdx contains zero-extended value - * - * %r8 is destroyed. - * - * These functions should not modify any other registers, - * as they get called from within inline assembly. - */ - -#include <linux/linkage.h> -#include <asm/dwarf2.h> -#include <asm/page.h> -#include <asm/errno.h> -#include <asm/asm-offsets.h> -#include <asm/thread_info.h> - - .text -ENTRY(__get_user_1) - CFI_STARTPROC - GET_THREAD_INFO(%r8) - cmpq threadinfo_addr_limit(%r8),%rcx - jae bad_get_user -1: movzb (%rcx),%edx - xorl %eax,%eax - ret - CFI_ENDPROC -ENDPROC(__get_user_1) - -ENTRY(__get_user_2) - CFI_STARTPROC - GET_THREAD_INFO(%r8) - addq $1,%rcx - jc 20f - cmpq threadinfo_addr_limit(%r8),%rcx - jae 20f - decq %rcx -2: movzwl (%rcx),%edx - xorl %eax,%eax - ret -20: decq %rcx - jmp bad_get_user - CFI_ENDPROC -ENDPROC(__get_user_2) - -ENTRY(__get_user_4) - CFI_STARTPROC - GET_THREAD_INFO(%r8) - addq $3,%rcx - jc 30f - cmpq threadinfo_addr_limit(%r8),%rcx - jae 30f - subq $3,%rcx -3: movl (%rcx),%edx - xorl %eax,%eax - ret -30: subq $3,%rcx - jmp bad_get_user - CFI_ENDPROC -ENDPROC(__get_user_4) - -ENTRY(__get_user_8) - CFI_STARTPROC - GET_THREAD_INFO(%r8) - addq $7,%rcx - jc 40f - cmpq threadinfo_addr_limit(%r8),%rcx - jae 40f - subq $7,%rcx -4: movq (%rcx),%rdx - xorl %eax,%eax - ret -40: subq $7,%rcx - jmp bad_get_user - CFI_ENDPROC -ENDPROC(__get_user_8) - -bad_get_user: - CFI_STARTPROC - xorl %edx,%edx - movq $(-EFAULT),%rax - ret - CFI_ENDPROC -END(bad_get_user) - -.section __ex_table,"a" - .quad 1b,bad_get_user - .quad 2b,bad_get_user - .quad 3b,bad_get_user - .quad 4b,bad_get_user -.previous diff --git a/arch/x86_64/lib/io.c b/arch/x86_64/lib/io.c deleted file mode 100644 index 87b4a4e18039..000000000000 --- a/arch/x86_64/lib/io.c +++ /dev/null @@ -1,23 +0,0 @@ -#include <linux/string.h> -#include <asm/io.h> -#include <linux/module.h> - -void __memcpy_toio(unsigned long dst,const void*src,unsigned len) -{ - __inline_memcpy((void *) dst,src,len); -} -EXPORT_SYMBOL(__memcpy_toio); - -void __memcpy_fromio(void *dst,unsigned long src,unsigned len) -{ - __inline_memcpy(dst,(const void *) src,len); -} -EXPORT_SYMBOL(__memcpy_fromio); - -void memset_io(volatile void __iomem *a, int b, size_t c) -{ - /* XXX: memset can mangle the IO patterns quite a bit. - perhaps it would be better to use a dumb one */ - memset((void *)a,b,c); -} -EXPORT_SYMBOL(memset_io); diff --git a/arch/x86_64/lib/iomap_copy.S b/arch/x86_64/lib/iomap_copy.S deleted file mode 100644 index 05a95e713da8..000000000000 --- a/arch/x86_64/lib/iomap_copy.S +++ /dev/null @@ -1,30 +0,0 @@ -/* - * Copyright 2006 PathScale, Inc. All Rights Reserved. - * - * This file is free software; you can redistribute it and/or modify - * it under the terms of version 2 of the GNU General Public License - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. - */ - -#include <linux/linkage.h> -#include <asm/dwarf2.h> - -/* - * override generic version in lib/iomap_copy.c - */ -ENTRY(__iowrite32_copy) - CFI_STARTPROC - movl %edx,%ecx - rep movsd - ret - CFI_ENDPROC -ENDPROC(__iowrite32_copy) diff --git a/arch/x86_64/lib/memcpy.S b/arch/x86_64/lib/memcpy.S deleted file mode 100644 index c22981fa2f3a..000000000000 --- a/arch/x86_64/lib/memcpy.S +++ /dev/null @@ -1,131 +0,0 @@ -/* Copyright 2002 Andi Kleen */ - -#include <linux/linkage.h> -#include <asm/dwarf2.h> -#include <asm/cpufeature.h> - -/* - * memcpy - Copy a memory block. - * - * Input: - * rdi destination - * rsi source - * rdx count - * - * Output: - * rax original destination - */ - - ALIGN -memcpy_c: - CFI_STARTPROC - movq %rdi,%rax - movl %edx,%ecx - shrl $3,%ecx - andl $7,%edx - rep movsq - movl %edx,%ecx - rep movsb - ret - CFI_ENDPROC -ENDPROC(memcpy_c) - -ENTRY(__memcpy) -ENTRY(memcpy) - CFI_STARTPROC - pushq %rbx - CFI_ADJUST_CFA_OFFSET 8 - CFI_REL_OFFSET rbx, 0 - movq %rdi,%rax - - movl %edx,%ecx - shrl $6,%ecx - jz .Lhandle_tail - - .p2align 4 -.Lloop_64: - decl %ecx - - movq (%rsi),%r11 - movq 8(%rsi),%r8 - - movq %r11,(%rdi) - movq %r8,1*8(%rdi) - - movq 2*8(%rsi),%r9 - movq 3*8(%rsi),%r10 - - movq %r9,2*8(%rdi) - movq %r10,3*8(%rdi) - - movq 4*8(%rsi),%r11 - movq 5*8(%rsi),%r8 - - movq %r11,4*8(%rdi) - movq %r8,5*8(%rdi) - - movq 6*8(%rsi),%r9 - movq 7*8(%rsi),%r10 - - movq %r9,6*8(%rdi) - movq %r10,7*8(%rdi) - - leaq 64(%rsi),%rsi - leaq 64(%rdi),%rdi - jnz .Lloop_64 - -.Lhandle_tail: - movl %edx,%ecx - andl $63,%ecx - shrl $3,%ecx - jz .Lhandle_7 - .p2align 4 -.Lloop_8: - decl %ecx - movq (%rsi),%r8 - movq %r8,(%rdi) - leaq 8(%rdi),%rdi - leaq 8(%rsi),%rsi - jnz .Lloop_8 - -.Lhandle_7: - movl %edx,%ecx - andl $7,%ecx - jz .Lende - .p2align 4 -.Lloop_1: - movb (%rsi),%r8b - movb %r8b,(%rdi) - incq %rdi - incq %rsi - decl %ecx - jnz .Lloop_1 - -.Lende: - popq %rbx - CFI_ADJUST_CFA_OFFSET -8 - CFI_RESTORE rbx - ret -.Lfinal: - CFI_ENDPROC -ENDPROC(memcpy) -ENDPROC(__memcpy) - - /* Some CPUs run faster using the string copy instructions. - It is also a lot simpler. Use this when possible */ - - .section .altinstr_replacement,"ax" -1: .byte 0xeb /* jmp <disp8> */ - .byte (memcpy_c - memcpy) - (2f - 1b) /* offset */ -2: - .previous - .section .altinstructions,"a" - .align 8 - .quad memcpy - .quad 1b - .byte X86_FEATURE_REP_GOOD - /* Replace only beginning, memcpy is used to apply alternatives, so it - * is silly to overwrite itself with nops - reboot is only outcome... */ - .byte 2b - 1b - .byte 2b - 1b - .previous diff --git a/arch/x86_64/lib/memmove.c b/arch/x86_64/lib/memmove.c deleted file mode 100644 index 751ebae8ec42..000000000000 --- a/arch/x86_64/lib/memmove.c +++ /dev/null @@ -1,21 +0,0 @@ -/* Normally compiler builtins are used, but sometimes the compiler calls out - of line code. Based on asm-i386/string.h. - */ -#define _STRING_C -#include <linux/string.h> -#include <linux/module.h> - -#undef memmove -void *memmove(void * dest,const void *src,size_t count) -{ - if (dest < src) { - return memcpy(dest,src,count); - } else { - char *p = (char *) dest + count; - char *s = (char *) src + count; - while (count--) - *--p = *--s; - } - return dest; -} -EXPORT_SYMBOL(memmove); diff --git a/arch/x86_64/lib/memset.S b/arch/x86_64/lib/memset.S deleted file mode 100644 index 2c5948116bd2..000000000000 --- a/arch/x86_64/lib/memset.S +++ /dev/null @@ -1,133 +0,0 @@ -/* Copyright 2002 Andi Kleen, SuSE Labs */ - -#include <linux/linkage.h> -#include <asm/dwarf2.h> - -/* - * ISO C memset - set a memory block to a byte value. - * - * rdi destination - * rsi value (char) - * rdx count (bytes) - * - * rax original destination - */ - ALIGN -memset_c: - CFI_STARTPROC - movq %rdi,%r9 - movl %edx,%r8d - andl $7,%r8d - movl %edx,%ecx - shrl $3,%ecx - /* expand byte value */ - movzbl %sil,%esi - movabs $0x0101010101010101,%rax - mulq %rsi /* with rax, clobbers rdx */ - rep stosq - movl %r8d,%ecx - rep stosb - movq %r9,%rax - ret - CFI_ENDPROC -ENDPROC(memset_c) - -ENTRY(memset) -ENTRY(__memset) - CFI_STARTPROC - movq %rdi,%r10 - movq %rdx,%r11 - - /* expand byte value */ - movzbl %sil,%ecx - movabs $0x0101010101010101,%rax - mul %rcx /* with rax, clobbers rdx */ - - /* align dst */ - movl %edi,%r9d - andl $7,%r9d - jnz .Lbad_alignment - CFI_REMEMBER_STATE -.Lafter_bad_alignment: - - movl %r11d,%ecx - shrl $6,%ecx - jz .Lhandle_tail - - .p2align 4 -.Lloop_64: - decl %ecx - movq %rax,(%rdi) - movq %rax,8(%rdi) - movq %rax,16(%rdi) - movq %rax,24(%rdi) - movq %rax,32(%rdi) - movq %rax,40(%rdi) - movq %rax,48(%rdi) - movq %rax,56(%rdi) - leaq 64(%rdi),%rdi - jnz .Lloop_64 - - /* Handle tail in loops. The loops should be faster than hard - to predict jump tables. */ - .p2align 4 -.Lhandle_tail: - movl %r11d,%ecx - andl $63&(~7),%ecx - jz .Lhandle_7 - shrl $3,%ecx - .p2align 4 -.Lloop_8: - decl %ecx - movq %rax,(%rdi) - leaq 8(%rdi),%rdi - jnz .Lloop_8 - -.Lhandle_7: - movl %r11d,%ecx - andl $7,%ecx - jz .Lende - .p2align 4 -.Lloop_1: - decl %ecx - movb %al,(%rdi) - leaq 1(%rdi),%rdi - jnz .Lloop_1 - -.Lende: - movq %r10,%rax - ret - - CFI_RESTORE_STATE -.Lbad_alignment: - cmpq $7,%r11 - jbe .Lhandle_7 - movq %rax,(%rdi) /* unaligned store */ - movq $8,%r8 - subq %r9,%r8 - addq %r8,%rdi - subq %r8,%r11 - jmp .Lafter_bad_alignment -.Lfinal: - CFI_ENDPROC -ENDPROC(memset) -ENDPROC(__memset) - - /* Some CPUs run faster using the string instructions. - It is also a lot simpler. Use this when possible */ - -#include <asm/cpufeature.h> - - .section .altinstr_replacement,"ax" -1: .byte 0xeb /* jmp <disp8> */ - .byte (memset_c - memset) - (2f - 1b) /* offset */ -2: - .previous - .section .altinstructions,"a" - .align 8 - .quad memset - .quad 1b - .byte X86_FEATURE_REP_GOOD - .byte .Lfinal - memset - .byte 2b - 1b - .previous diff --git a/arch/x86_64/lib/msr-on-cpu.c b/arch/x86_64/lib/msr-on-cpu.c deleted file mode 100644 index 47e0ec47c376..000000000000 --- a/arch/x86_64/lib/msr-on-cpu.c +++ /dev/null @@ -1 +0,0 @@ -#include "../../i386/lib/msr-on-cpu.c" diff --git a/arch/x86_64/lib/putuser.S b/arch/x86_64/lib/putuser.S deleted file mode 100644 index 4989f5a8fa9b..000000000000 --- a/arch/x86_64/lib/putuser.S +++ /dev/null @@ -1,106 +0,0 @@ -/* - * __put_user functions. - * - * (C) Copyright 1998 Linus Torvalds - * (C) Copyright 2005 Andi Kleen - * - * These functions have a non-standard call interface - * to make them more efficient, especially as they - * return an error value in addition to the "real" - * return value. - */ - -/* - * __put_user_X - * - * Inputs: %rcx contains the address - * %rdx contains new value - * - * Outputs: %rax is error code (0 or -EFAULT) - * - * %r8 is destroyed. - * - * These functions should not modify any other registers, - * as they get called from within inline assembly. - */ - -#include <linux/linkage.h> -#include <asm/dwarf2.h> -#include <asm/page.h> -#include <asm/errno.h> -#include <asm/asm-offsets.h> -#include <asm/thread_info.h> - - .text -ENTRY(__put_user_1) - CFI_STARTPROC - GET_THREAD_INFO(%r8) - cmpq threadinfo_addr_limit(%r8),%rcx - jae bad_put_user -1: movb %dl,(%rcx) - xorl %eax,%eax - ret - CFI_ENDPROC -ENDPROC(__put_user_1) - -ENTRY(__put_user_2) - CFI_STARTPROC - GET_THREAD_INFO(%r8) - addq $1,%rcx - jc 20f - cmpq threadinfo_addr_limit(%r8),%rcx - jae 20f - decq %rcx -2: movw %dx,(%rcx) - xorl %eax,%eax - ret -20: decq %rcx - jmp bad_put_user - CFI_ENDPROC -ENDPROC(__put_user_2) - -ENTRY(__put_user_4) - CFI_STARTPROC - GET_THREAD_INFO(%r8) - addq $3,%rcx - jc 30f - cmpq threadinfo_addr_limit(%r8),%rcx - jae 30f - subq $3,%rcx -3: movl %edx,(%rcx) - xorl %eax,%eax - ret -30: subq $3,%rcx - jmp bad_put_user - CFI_ENDPROC -ENDPROC(__put_user_4) - -ENTRY(__put_user_8) - CFI_STARTPROC - GET_THREAD_INFO(%r8) - addq $7,%rcx - jc 40f - cmpq threadinfo_addr_limit(%r8),%rcx - jae 40f - subq $7,%rcx -4: movq %rdx,(%rcx) - xorl %eax,%eax - ret -40: subq $7,%rcx - jmp bad_put_user - CFI_ENDPROC -ENDPROC(__put_user_8) - -bad_put_user: - CFI_STARTPROC - movq $(-EFAULT),%rax - ret - CFI_ENDPROC -END(bad_put_user) - -.section __ex_table,"a" - .quad 1b,bad_put_user - .quad 2b,bad_put_user - .quad 3b,bad_put_user - .quad 4b,bad_put_user -.previous diff --git a/arch/x86_64/lib/rwlock.S b/arch/x86_64/lib/rwlock.S deleted file mode 100644 index 0cde1f807314..000000000000 --- a/arch/x86_64/lib/rwlock.S +++ /dev/null @@ -1,38 +0,0 @@ -/* Slow paths of read/write spinlocks. */ - -#include <linux/linkage.h> -#include <asm/rwlock.h> -#include <asm/alternative-asm.i> -#include <asm/dwarf2.h> - -/* rdi: pointer to rwlock_t */ -ENTRY(__write_lock_failed) - CFI_STARTPROC - LOCK_PREFIX - addl $RW_LOCK_BIAS,(%rdi) -1: rep - nop - cmpl $RW_LOCK_BIAS,(%rdi) - jne 1b - LOCK_PREFIX - subl $RW_LOCK_BIAS,(%rdi) - jnz __write_lock_failed - ret - CFI_ENDPROC -END(__write_lock_failed) - -/* rdi: pointer to rwlock_t */ -ENTRY(__read_lock_failed) - CFI_STARTPROC - LOCK_PREFIX - incl (%rdi) -1: rep - nop - cmpl $1,(%rdi) - js 1b - LOCK_PREFIX - decl (%rdi) - js __read_lock_failed - ret - CFI_ENDPROC -END(__read_lock_failed) diff --git a/arch/x86_64/lib/thunk.S b/arch/x86_64/lib/thunk.S deleted file mode 100644 index 55e586d352d3..000000000000 --- a/arch/x86_64/lib/thunk.S +++ /dev/null @@ -1,67 +0,0 @@ -/* - * Save registers before calling assembly functions. This avoids - * disturbance of register allocation in some inline assembly constructs. - * Copyright 2001,2002 by Andi Kleen, SuSE Labs. - * Subject to the GNU public license, v.2. No warranty of any kind. - */ - - #include <linux/linkage.h> - #include <asm/dwarf2.h> - #include <asm/calling.h> - #include <asm/rwlock.h> - - /* rdi: arg1 ... normal C conventions. rax is saved/restored. */ - .macro thunk name,func - .globl \name -\name: - CFI_STARTPROC - SAVE_ARGS - call \func - jmp restore - CFI_ENDPROC - .endm - - /* rdi: arg1 ... normal C conventions. rax is passed from C. */ - .macro thunk_retrax name,func - .globl \name -\name: - CFI_STARTPROC - SAVE_ARGS - call \func - jmp restore_norax - CFI_ENDPROC - .endm - - - .section .sched.text -#ifdef CONFIG_RWSEM_XCHGADD_ALGORITHM - thunk rwsem_down_read_failed_thunk,rwsem_down_read_failed - thunk rwsem_down_write_failed_thunk,rwsem_down_write_failed - thunk rwsem_wake_thunk,rwsem_wake - thunk rwsem_downgrade_thunk,rwsem_downgrade_wake -#endif - - thunk __down_failed,__down - thunk_retrax __down_failed_interruptible,__down_interruptible - thunk_retrax __down_failed_trylock,__down_trylock - thunk __up_wakeup,__up - -#ifdef CONFIG_TRACE_IRQFLAGS - thunk trace_hardirqs_on_thunk,trace_hardirqs_on - thunk trace_hardirqs_off_thunk,trace_hardirqs_off -#endif - - /* SAVE_ARGS below is used only for the .cfi directives it contains. */ - CFI_STARTPROC - SAVE_ARGS -restore: - RESTORE_ARGS - ret - CFI_ENDPROC - - CFI_STARTPROC - SAVE_ARGS -restore_norax: - RESTORE_ARGS 1 - ret - CFI_ENDPROC diff --git a/arch/x86_64/lib/usercopy.c b/arch/x86_64/lib/usercopy.c deleted file mode 100644 index 893d43f838cc..000000000000 --- a/arch/x86_64/lib/usercopy.c +++ /dev/null @@ -1,166 +0,0 @@ -/* - * User address space access functions. - * - * Copyright 1997 Andi Kleen <ak@muc.de> - * Copyright 1997 Linus Torvalds - * Copyright 2002 Andi Kleen <ak@suse.de> - */ -#include <linux/module.h> -#include <asm/uaccess.h> - -/* - * Copy a null terminated string from userspace. - */ - -#define __do_strncpy_from_user(dst,src,count,res) \ -do { \ - long __d0, __d1, __d2; \ - might_sleep(); \ - __asm__ __volatile__( \ - " testq %1,%1\n" \ - " jz 2f\n" \ - "0: lodsb\n" \ - " stosb\n" \ - " testb %%al,%%al\n" \ - " jz 1f\n" \ - " decq %1\n" \ - " jnz 0b\n" \ - "1: subq %1,%0\n" \ - "2:\n" \ - ".section .fixup,\"ax\"\n" \ - "3: movq %5,%0\n" \ - " jmp 2b\n" \ - ".previous\n" \ - ".section __ex_table,\"a\"\n" \ - " .align 8\n" \ - " .quad 0b,3b\n" \ - ".previous" \ - : "=r"(res), "=c"(count), "=&a" (__d0), "=&S" (__d1), \ - "=&D" (__d2) \ - : "i"(-EFAULT), "0"(count), "1"(count), "3"(src), "4"(dst) \ - : "memory"); \ -} while (0) - -long -__strncpy_from_user(char *dst, const char __user *src, long count) -{ - long res; - __do_strncpy_from_user(dst, src, count, res); - return res; -} -EXPORT_SYMBOL(__strncpy_from_user); - -long -strncpy_from_user(char *dst, const char __user *src, long count) -{ - long res = -EFAULT; - if (access_ok(VERIFY_READ, src, 1)) - return __strncpy_from_user(dst, src, count); - return res; -} -EXPORT_SYMBOL(strncpy_from_user); - -/* - * Zero Userspace - */ - -unsigned long __clear_user(void __user *addr, unsigned long size) -{ - long __d0; - might_sleep(); - /* no memory constraint because it doesn't change any memory gcc knows - about */ - asm volatile( - " testq %[size8],%[size8]\n" - " jz 4f\n" - "0: movq %[zero],(%[dst])\n" - " addq %[eight],%[dst]\n" - " decl %%ecx ; jnz 0b\n" - "4: movq %[size1],%%rcx\n" - " testl %%ecx,%%ecx\n" - " jz 2f\n" - "1: movb %b[zero],(%[dst])\n" - " incq %[dst]\n" - " decl %%ecx ; jnz 1b\n" - "2:\n" - ".section .fixup,\"ax\"\n" - "3: lea 0(%[size1],%[size8],8),%[size8]\n" - " jmp 2b\n" - ".previous\n" - ".section __ex_table,\"a\"\n" - " .align 8\n" - " .quad 0b,3b\n" - " .quad 1b,2b\n" - ".previous" - : [size8] "=c"(size), [dst] "=&D" (__d0) - : [size1] "r"(size & 7), "[size8]" (size / 8), "[dst]"(addr), - [zero] "r" (0UL), [eight] "r" (8UL)); - return size; -} -EXPORT_SYMBOL(__clear_user); - -unsigned long clear_user(void __user *to, unsigned long n) -{ - if (access_ok(VERIFY_WRITE, to, n)) - return __clear_user(to, n); - return n; -} -EXPORT_SYMBOL(clear_user); - -/* - * Return the size of a string (including the ending 0) - * - * Return 0 on exception, a value greater than N if too long - */ - -long __strnlen_user(const char __user *s, long n) -{ - long res = 0; - char c; - - while (1) { - if (res>n) - return n+1; - if (__get_user(c, s)) - return 0; - if (!c) - return res+1; - res++; - s++; - } -} -EXPORT_SYMBOL(__strnlen_user); - -long strnlen_user(const char __user *s, long n) -{ - if (!access_ok(VERIFY_READ, s, n)) - return 0; - return __strnlen_user(s, n); -} -EXPORT_SYMBOL(strnlen_user); - -long strlen_user(const char __user *s) -{ - long res = 0; - char c; - - for (;;) { - if (get_user(c, s)) - return 0; - if (!c) - return res+1; - res++; - s++; - } -} -EXPORT_SYMBOL(strlen_user); - -unsigned long copy_in_user(void __user *to, const void __user *from, unsigned len) -{ - if (access_ok(VERIFY_WRITE, to, len) && access_ok(VERIFY_READ, from, len)) { - return copy_user_generic((__force void *)to, (__force void *)from, len); - } - return len; -} -EXPORT_SYMBOL(copy_in_user); - diff --git a/arch/x86_64/mm/Makefile b/arch/x86_64/mm/Makefile deleted file mode 100644 index d25ac86fe27a..000000000000 --- a/arch/x86_64/mm/Makefile +++ /dev/null @@ -1,11 +0,0 @@ -# -# Makefile for the linux x86_64-specific parts of the memory manager. -# - -obj-y := init.o fault.o ioremap.o extable.o pageattr.o mmap.o -obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o -obj-$(CONFIG_NUMA) += numa.o -obj-$(CONFIG_K8_NUMA) += k8topology.o -obj-$(CONFIG_ACPI_NUMA) += srat.o - -hugetlbpage-y = ../../i386/mm/hugetlbpage.o diff --git a/arch/x86_64/mm/extable.c b/arch/x86_64/mm/extable.c deleted file mode 100644 index 79ac6e7100af..000000000000 --- a/arch/x86_64/mm/extable.c +++ /dev/null @@ -1,34 +0,0 @@ -/* - * linux/arch/x86_64/mm/extable.c - */ - -#include <linux/module.h> -#include <linux/spinlock.h> -#include <linux/init.h> -#include <asm/uaccess.h> - -/* Simple binary search */ -const struct exception_table_entry * -search_extable(const struct exception_table_entry *first, - const struct exception_table_entry *last, - unsigned long value) -{ - /* Work around a B stepping K8 bug */ - if ((value >> 32) == 0) - value |= 0xffffffffUL << 32; - - while (first <= last) { - const struct exception_table_entry *mid; - long diff; - - mid = (last - first) / 2 + first; - diff = mid->insn - value; - if (diff == 0) - return mid; - else if (diff < 0) - first = mid+1; - else - last = mid-1; - } - return NULL; -} diff --git a/arch/x86_64/mm/fault.c b/arch/x86_64/mm/fault.c deleted file mode 100644 index 54816adb8e93..000000000000 --- a/arch/x86_64/mm/fault.c +++ /dev/null @@ -1,636 +0,0 @@ -/* - * linux/arch/x86-64/mm/fault.c - * - * Copyright (C) 1995 Linus Torvalds - * Copyright (C) 2001,2002 Andi Kleen, SuSE Labs. - */ - -#include <linux/signal.h> -#include <linux/sched.h> -#include <linux/kernel.h> -#include <linux/errno.h> -#include <linux/string.h> -#include <linux/types.h> -#include <linux/ptrace.h> -#include <linux/mman.h> -#include <linux/mm.h> -#include <linux/smp.h> -#include <linux/interrupt.h> -#include <linux/init.h> -#include <linux/tty.h> -#include <linux/vt_kern.h> /* For unblank_screen() */ -#include <linux/compiler.h> -#include <linux/vmalloc.h> -#include <linux/module.h> -#include <linux/kprobes.h> -#include <linux/uaccess.h> -#include <linux/kdebug.h> - -#include <asm/system.h> -#include <asm/pgalloc.h> -#include <asm/smp.h> -#include <asm/tlbflush.h> -#include <asm/proto.h> -#include <asm-generic/sections.h> - -/* Page fault error code bits */ -#define PF_PROT (1<<0) /* or no page found */ -#define PF_WRITE (1<<1) -#define PF_USER (1<<2) -#define PF_RSVD (1<<3) -#define PF_INSTR (1<<4) - -static ATOMIC_NOTIFIER_HEAD(notify_page_fault_chain); - -/* Hook to register for page fault notifications */ -int register_page_fault_notifier(struct notifier_block *nb) -{ - vmalloc_sync_all(); - return atomic_notifier_chain_register(¬ify_page_fault_chain, nb); -} -EXPORT_SYMBOL_GPL(register_page_fault_notifier); - -int unregister_page_fault_notifier(struct notifier_block *nb) -{ - return atomic_notifier_chain_unregister(¬ify_page_fault_chain, nb); -} -EXPORT_SYMBOL_GPL(unregister_page_fault_notifier); - -static inline int notify_page_fault(struct pt_regs *regs, long err) -{ - struct die_args args = { - .regs = regs, - .str = "page fault", - .err = err, - .trapnr = 14, - .signr = SIGSEGV - }; - return atomic_notifier_call_chain(¬ify_page_fault_chain, - DIE_PAGE_FAULT, &args); -} - -/* Sometimes the CPU reports invalid exceptions on prefetch. - Check that here and ignore. - Opcode checker based on code by Richard Brunner */ -static noinline int is_prefetch(struct pt_regs *regs, unsigned long addr, - unsigned long error_code) -{ - unsigned char *instr; - int scan_more = 1; - int prefetch = 0; - unsigned char *max_instr; - - /* If it was a exec fault ignore */ - if (error_code & PF_INSTR) - return 0; - - instr = (unsigned char __user *)convert_rip_to_linear(current, regs); - max_instr = instr + 15; - - if (user_mode(regs) && instr >= (unsigned char *)TASK_SIZE) - return 0; - - while (scan_more && instr < max_instr) { - unsigned char opcode; - unsigned char instr_hi; - unsigned char instr_lo; - - if (probe_kernel_address(instr, opcode)) - break; - - instr_hi = opcode & 0xf0; - instr_lo = opcode & 0x0f; - instr++; - - switch (instr_hi) { - case 0x20: - case 0x30: - /* Values 0x26,0x2E,0x36,0x3E are valid x86 - prefixes. In long mode, the CPU will signal - invalid opcode if some of these prefixes are - present so we will never get here anyway */ - scan_more = ((instr_lo & 7) == 0x6); - break; - - case 0x40: - /* In AMD64 long mode, 0x40 to 0x4F are valid REX prefixes - Need to figure out under what instruction mode the - instruction was issued ... */ - /* Could check the LDT for lm, but for now it's good - enough to assume that long mode only uses well known - segments or kernel. */ - scan_more = (!user_mode(regs)) || (regs->cs == __USER_CS); - break; - - case 0x60: - /* 0x64 thru 0x67 are valid prefixes in all modes. */ - scan_more = (instr_lo & 0xC) == 0x4; - break; - case 0xF0: - /* 0xF0, 0xF2, and 0xF3 are valid prefixes in all modes. */ - scan_more = !instr_lo || (instr_lo>>1) == 1; - break; - case 0x00: - /* Prefetch instruction is 0x0F0D or 0x0F18 */ - scan_more = 0; - if (probe_kernel_address(instr, opcode)) - break; - prefetch = (instr_lo == 0xF) && - (opcode == 0x0D || opcode == 0x18); - break; - default: - scan_more = 0; - break; - } - } - return prefetch; -} - -static int bad_address(void *p) -{ - unsigned long dummy; - return probe_kernel_address((unsigned long *)p, dummy); -} - -void dump_pagetable(unsigned long address) -{ - pgd_t *pgd; - pud_t *pud; - pmd_t *pmd; - pte_t *pte; - - pgd = (pgd_t *)read_cr3(); - - pgd = __va((unsigned long)pgd & PHYSICAL_PAGE_MASK); - pgd += pgd_index(address); - if (bad_address(pgd)) goto bad; - printk("PGD %lx ", pgd_val(*pgd)); - if (!pgd_present(*pgd)) goto ret; - - pud = pud_offset(pgd, address); - if (bad_address(pud)) goto bad; - printk("PUD %lx ", pud_val(*pud)); - if (!pud_present(*pud)) goto ret; - - pmd = pmd_offset(pud, address); - if (bad_address(pmd)) goto bad; - printk("PMD %lx ", pmd_val(*pmd)); - if (!pmd_present(*pmd)) goto ret; - - pte = pte_offset_kernel(pmd, address); - if (bad_address(pte)) goto bad; - printk("PTE %lx", pte_val(*pte)); -ret: - printk("\n"); - return; -bad: - printk("BAD\n"); -} - -static const char errata93_warning[] = -KERN_ERR "******* Your BIOS seems to not contain a fix for K8 errata #93\n" -KERN_ERR "******* Working around it, but it may cause SEGVs or burn power.\n" -KERN_ERR "******* Please consider a BIOS update.\n" -KERN_ERR "******* Disabling USB legacy in the BIOS may also help.\n"; - -/* Workaround for K8 erratum #93 & buggy BIOS. - BIOS SMM functions are required to use a specific workaround - to avoid corruption of the 64bit RIP register on C stepping K8. - A lot of BIOS that didn't get tested properly miss this. - The OS sees this as a page fault with the upper 32bits of RIP cleared. - Try to work around it here. - Note we only handle faults in kernel here. */ - -static int is_errata93(struct pt_regs *regs, unsigned long address) -{ - static int warned; - if (address != regs->rip) - return 0; - if ((address >> 32) != 0) - return 0; - address |= 0xffffffffUL << 32; - if ((address >= (u64)_stext && address <= (u64)_etext) || - (address >= MODULES_VADDR && address <= MODULES_END)) { - if (!warned) { - printk(errata93_warning); - warned = 1; - } - regs->rip = address; - return 1; - } - return 0; -} - -static noinline void pgtable_bad(unsigned long address, struct pt_regs *regs, - unsigned long error_code) -{ - unsigned long flags = oops_begin(); - struct task_struct *tsk; - - printk(KERN_ALERT "%s: Corrupted page table at address %lx\n", - current->comm, address); - dump_pagetable(address); - tsk = current; - tsk->thread.cr2 = address; - tsk->thread.trap_no = 14; - tsk->thread.error_code = error_code; - __die("Bad pagetable", regs, error_code); - oops_end(flags); - do_exit(SIGKILL); -} - -/* - * Handle a fault on the vmalloc area - * - * This assumes no large pages in there. - */ -static int vmalloc_fault(unsigned long address) -{ - pgd_t *pgd, *pgd_ref; - pud_t *pud, *pud_ref; - pmd_t *pmd, *pmd_ref; - pte_t *pte, *pte_ref; - - /* Copy kernel mappings over when needed. This can also - happen within a race in page table update. In the later - case just flush. */ - - pgd = pgd_offset(current->mm ?: &init_mm, address); - pgd_ref = pgd_offset_k(address); - if (pgd_none(*pgd_ref)) - return -1; - if (pgd_none(*pgd)) - set_pgd(pgd, *pgd_ref); - else - BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref)); - - /* Below here mismatches are bugs because these lower tables - are shared */ - - pud = pud_offset(pgd, address); - pud_ref = pud_offset(pgd_ref, address); - if (pud_none(*pud_ref)) - return -1; - if (pud_none(*pud) || pud_page_vaddr(*pud) != pud_page_vaddr(*pud_ref)) - BUG(); - pmd = pmd_offset(pud, address); - pmd_ref = pmd_offset(pud_ref, address); - if (pmd_none(*pmd_ref)) - return -1; - if (pmd_none(*pmd) || pmd_page(*pmd) != pmd_page(*pmd_ref)) - BUG(); - pte_ref = pte_offset_kernel(pmd_ref, address); - if (!pte_present(*pte_ref)) - return -1; - pte = pte_offset_kernel(pmd, address); - /* Don't use pte_page here, because the mappings can point - outside mem_map, and the NUMA hash lookup cannot handle - that. */ - if (!pte_present(*pte) || pte_pfn(*pte) != pte_pfn(*pte_ref)) - BUG(); - return 0; -} - -static int page_fault_trace; -int show_unhandled_signals = 1; - -/* - * This routine handles page faults. It determines the address, - * and the problem, and then passes it off to one of the appropriate - * routines. - */ -asmlinkage void __kprobes do_page_fault(struct pt_regs *regs, - unsigned long error_code) -{ - struct task_struct *tsk; - struct mm_struct *mm; - struct vm_area_struct * vma; - unsigned long address; - const struct exception_table_entry *fixup; - int write, fault; - unsigned long flags; - siginfo_t info; - - tsk = current; - mm = tsk->mm; - prefetchw(&mm->mmap_sem); - - /* get the address */ - address = read_cr2(); - - info.si_code = SEGV_MAPERR; - - - /* - * We fault-in kernel-space virtual memory on-demand. The - * 'reference' page table is init_mm.pgd. - * - * NOTE! We MUST NOT take any locks for this case. We may - * be in an interrupt or a critical region, and should - * only copy the information from the master page table, - * nothing more. - * - * This verifies that the fault happens in kernel space - * (error_code & 4) == 0, and that the fault was not a - * protection error (error_code & 9) == 0. - */ - if (unlikely(address >= TASK_SIZE64)) { - /* - * Don't check for the module range here: its PML4 - * is always initialized because it's shared with the main - * kernel text. Only vmalloc may need PML4 syncups. - */ - if (!(error_code & (PF_RSVD|PF_USER|PF_PROT)) && - ((address >= VMALLOC_START && address < VMALLOC_END))) { - if (vmalloc_fault(address) >= 0) - return; - } - if (notify_page_fault(regs, error_code) == NOTIFY_STOP) - return; - /* - * Don't take the mm semaphore here. If we fixup a prefetch - * fault we could otherwise deadlock. - */ - goto bad_area_nosemaphore; - } - - if (notify_page_fault(regs, error_code) == NOTIFY_STOP) - return; - - if (likely(regs->eflags & X86_EFLAGS_IF)) - local_irq_enable(); - - if (unlikely(page_fault_trace)) - printk("pagefault rip:%lx rsp:%lx cs:%lu ss:%lu address %lx error %lx\n", - regs->rip,regs->rsp,regs->cs,regs->ss,address,error_code); - - if (unlikely(error_code & PF_RSVD)) - pgtable_bad(address, regs, error_code); - - /* - * If we're in an interrupt or have no user - * context, we must not take the fault.. - */ - if (unlikely(in_atomic() || !mm)) - goto bad_area_nosemaphore; - - /* - * User-mode registers count as a user access even for any - * potential system fault or CPU buglet. - */ - if (user_mode_vm(regs)) - error_code |= PF_USER; - - again: - /* When running in the kernel we expect faults to occur only to - * addresses in user space. All other faults represent errors in the - * kernel and should generate an OOPS. Unfortunatly, in the case of an - * erroneous fault occurring in a code path which already holds mmap_sem - * we will deadlock attempting to validate the fault against the - * address space. Luckily the kernel only validly references user - * space from well defined areas of code, which are listed in the - * exceptions table. - * - * As the vast majority of faults will be valid we will only perform - * the source reference check when there is a possibilty of a deadlock. - * Attempt to lock the address space, if we cannot we then validate the - * source. If this is invalid we can skip the address space check, - * thus avoiding the deadlock. - */ - if (!down_read_trylock(&mm->mmap_sem)) { - if ((error_code & PF_USER) == 0 && - !search_exception_tables(regs->rip)) - goto bad_area_nosemaphore; - down_read(&mm->mmap_sem); - } - - vma = find_vma(mm, address); - if (!vma) - goto bad_area; - if (likely(vma->vm_start <= address)) - goto good_area; - if (!(vma->vm_flags & VM_GROWSDOWN)) - goto bad_area; - if (error_code & 4) { - /* Allow userspace just enough access below the stack pointer - * to let the 'enter' instruction work. - */ - if (address + 65536 + 32 * sizeof(unsigned long) < regs->rsp) - goto bad_area; - } - if (expand_stack(vma, address)) - goto bad_area; -/* - * Ok, we have a good vm_area for this memory access, so - * we can handle it.. - */ -good_area: - info.si_code = SEGV_ACCERR; - write = 0; - switch (error_code & (PF_PROT|PF_WRITE)) { - default: /* 3: write, present */ - /* fall through */ - case PF_WRITE: /* write, not present */ - if (!(vma->vm_flags & VM_WRITE)) - goto bad_area; - write++; - break; - case PF_PROT: /* read, present */ - goto bad_area; - case 0: /* read, not present */ - if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE))) - goto bad_area; - } - - /* - * If for any reason at all we couldn't handle the fault, - * make sure we exit gracefully rather than endlessly redo - * the fault. - */ - fault = handle_mm_fault(mm, vma, address, write); - if (unlikely(fault & VM_FAULT_ERROR)) { - if (fault & VM_FAULT_OOM) - goto out_of_memory; - else if (fault & VM_FAULT_SIGBUS) - goto do_sigbus; - BUG(); - } - if (fault & VM_FAULT_MAJOR) - tsk->maj_flt++; - else - tsk->min_flt++; - up_read(&mm->mmap_sem); - return; - -/* - * Something tried to access memory that isn't in our memory map.. - * Fix it, but check if it's kernel or user first.. - */ -bad_area: - up_read(&mm->mmap_sem); - -bad_area_nosemaphore: - /* User mode accesses just cause a SIGSEGV */ - if (error_code & PF_USER) { - - /* - * It's possible to have interrupts off here. - */ - local_irq_enable(); - - if (is_prefetch(regs, address, error_code)) - return; - - /* Work around K8 erratum #100 K8 in compat mode - occasionally jumps to illegal addresses >4GB. We - catch this here in the page fault handler because - these addresses are not reachable. Just detect this - case and return. Any code segment in LDT is - compatibility mode. */ - if ((regs->cs == __USER32_CS || (regs->cs & (1<<2))) && - (address >> 32)) - return; - - if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) && - printk_ratelimit()) { - printk( - "%s%s[%d]: segfault at %016lx rip %016lx rsp %016lx error %lx\n", - tsk->pid > 1 ? KERN_INFO : KERN_EMERG, - tsk->comm, tsk->pid, address, regs->rip, - regs->rsp, error_code); - } - - tsk->thread.cr2 = address; - /* Kernel addresses are always protection faults */ - tsk->thread.error_code = error_code | (address >= TASK_SIZE); - tsk->thread.trap_no = 14; - info.si_signo = SIGSEGV; - info.si_errno = 0; - /* info.si_code has been set above */ - info.si_addr = (void __user *)address; - force_sig_info(SIGSEGV, &info, tsk); - return; - } - -no_context: - - /* Are we prepared to handle this kernel fault? */ - fixup = search_exception_tables(regs->rip); - if (fixup) { - regs->rip = fixup->fixup; - return; - } - - /* - * Hall of shame of CPU/BIOS bugs. - */ - - if (is_prefetch(regs, address, error_code)) - return; - - if (is_errata93(regs, address)) - return; - -/* - * Oops. The kernel tried to access some bad page. We'll have to - * terminate things with extreme prejudice. - */ - - flags = oops_begin(); - - if (address < PAGE_SIZE) - printk(KERN_ALERT "Unable to handle kernel NULL pointer dereference"); - else - printk(KERN_ALERT "Unable to handle kernel paging request"); - printk(" at %016lx RIP: \n" KERN_ALERT,address); - printk_address(regs->rip); - dump_pagetable(address); - tsk->thread.cr2 = address; - tsk->thread.trap_no = 14; - tsk->thread.error_code = error_code; - __die("Oops", regs, error_code); - /* Executive summary in case the body of the oops scrolled away */ - printk(KERN_EMERG "CR2: %016lx\n", address); - oops_end(flags); - do_exit(SIGKILL); - -/* - * We ran out of memory, or some other thing happened to us that made - * us unable to handle the page fault gracefully. - */ -out_of_memory: - up_read(&mm->mmap_sem); - if (is_init(current)) { - yield(); - goto again; - } - printk("VM: killing process %s\n", tsk->comm); - if (error_code & 4) - do_group_exit(SIGKILL); - goto no_context; - -do_sigbus: - up_read(&mm->mmap_sem); - - /* Kernel mode? Handle exceptions or die */ - if (!(error_code & PF_USER)) - goto no_context; - - tsk->thread.cr2 = address; - tsk->thread.error_code = error_code; - tsk->thread.trap_no = 14; - info.si_signo = SIGBUS; - info.si_errno = 0; - info.si_code = BUS_ADRERR; - info.si_addr = (void __user *)address; - force_sig_info(SIGBUS, &info, tsk); - return; -} - -DEFINE_SPINLOCK(pgd_lock); -LIST_HEAD(pgd_list); - -void vmalloc_sync_all(void) -{ - /* Note that races in the updates of insync and start aren't - problematic: - insync can only get set bits added, and updates to start are only - improving performance (without affecting correctness if undone). */ - static DECLARE_BITMAP(insync, PTRS_PER_PGD); - static unsigned long start = VMALLOC_START & PGDIR_MASK; - unsigned long address; - - for (address = start; address <= VMALLOC_END; address += PGDIR_SIZE) { - if (!test_bit(pgd_index(address), insync)) { - const pgd_t *pgd_ref = pgd_offset_k(address); - struct page *page; - - if (pgd_none(*pgd_ref)) - continue; - spin_lock(&pgd_lock); - list_for_each_entry(page, &pgd_list, lru) { - pgd_t *pgd; - pgd = (pgd_t *)page_address(page) + pgd_index(address); - if (pgd_none(*pgd)) - set_pgd(pgd, *pgd_ref); - else - BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref)); - } - spin_unlock(&pgd_lock); - set_bit(pgd_index(address), insync); - } - if (address == start) - start = address + PGDIR_SIZE; - } - /* Check that there is no need to do the same for the modules area. */ - BUILD_BUG_ON(!(MODULES_VADDR > __START_KERNEL)); - BUILD_BUG_ON(!(((MODULES_END - 1) & PGDIR_MASK) == - (__START_KERNEL & PGDIR_MASK))); -} - -static int __init enable_pagefaulttrace(char *str) -{ - page_fault_trace = 1; - return 1; -} -__setup("pagefaulttrace", enable_pagefaulttrace); diff --git a/arch/x86_64/mm/init.c b/arch/x86_64/mm/init.c deleted file mode 100644 index 458893b376f8..000000000000 --- a/arch/x86_64/mm/init.c +++ /dev/null @@ -1,750 +0,0 @@ -/* - * linux/arch/x86_64/mm/init.c - * - * Copyright (C) 1995 Linus Torvalds - * Copyright (C) 2000 Pavel Machek <pavel@suse.cz> - * Copyright (C) 2002,2003 Andi Kleen <ak@suse.de> - */ - -#include <linux/signal.h> -#include <linux/sched.h> -#include <linux/kernel.h> -#include <linux/errno.h> -#include <linux/string.h> -#include <linux/types.h> -#include <linux/ptrace.h> -#include <linux/mman.h> -#include <linux/mm.h> -#include <linux/swap.h> -#include <linux/smp.h> -#include <linux/init.h> -#include <linux/pagemap.h> -#include <linux/bootmem.h> -#include <linux/proc_fs.h> -#include <linux/pci.h> -#include <linux/pfn.h> -#include <linux/poison.h> -#include <linux/dma-mapping.h> -#include <linux/module.h> -#include <linux/memory_hotplug.h> -#include <linux/nmi.h> - -#include <asm/processor.h> -#include <asm/system.h> -#include <asm/uaccess.h> -#include <asm/pgtable.h> -#include <asm/pgalloc.h> -#include <asm/dma.h> -#include <asm/fixmap.h> -#include <asm/e820.h> -#include <asm/apic.h> -#include <asm/tlb.h> -#include <asm/mmu_context.h> -#include <asm/proto.h> -#include <asm/smp.h> -#include <asm/sections.h> - -#ifndef Dprintk -#define Dprintk(x...) -#endif - -const struct dma_mapping_ops* dma_ops; -EXPORT_SYMBOL(dma_ops); - -static unsigned long dma_reserve __initdata; - -DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); - -/* - * NOTE: pagetable_init alloc all the fixmap pagetables contiguous on the - * physical space so we can cache the place of the first one and move - * around without checking the pgd every time. - */ - -void show_mem(void) -{ - long i, total = 0, reserved = 0; - long shared = 0, cached = 0; - pg_data_t *pgdat; - struct page *page; - - printk(KERN_INFO "Mem-info:\n"); - show_free_areas(); - printk(KERN_INFO "Free swap: %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10)); - - for_each_online_pgdat(pgdat) { - for (i = 0; i < pgdat->node_spanned_pages; ++i) { - /* this loop can take a while with 256 GB and 4k pages - so update the NMI watchdog */ - if (unlikely(i % MAX_ORDER_NR_PAGES == 0)) { - touch_nmi_watchdog(); - } - if (!pfn_valid(pgdat->node_start_pfn + i)) - continue; - page = pfn_to_page(pgdat->node_start_pfn + i); - total++; - if (PageReserved(page)) - reserved++; - else if (PageSwapCache(page)) - cached++; - else if (page_count(page)) - shared += page_count(page) - 1; - } - } - printk(KERN_INFO "%lu pages of RAM\n", total); - printk(KERN_INFO "%lu reserved pages\n",reserved); - printk(KERN_INFO "%lu pages shared\n",shared); - printk(KERN_INFO "%lu pages swap cached\n",cached); -} - -int after_bootmem; - -static __init void *spp_getpage(void) -{ - void *ptr; - if (after_bootmem) - ptr = (void *) get_zeroed_page(GFP_ATOMIC); - else - ptr = alloc_bootmem_pages(PAGE_SIZE); - if (!ptr || ((unsigned long)ptr & ~PAGE_MASK)) - panic("set_pte_phys: cannot allocate page data %s\n", after_bootmem?"after bootmem":""); - - Dprintk("spp_getpage %p\n", ptr); - return ptr; -} - -static __init void set_pte_phys(unsigned long vaddr, - unsigned long phys, pgprot_t prot) -{ - pgd_t *pgd; - pud_t *pud; - pmd_t *pmd; - pte_t *pte, new_pte; - - Dprintk("set_pte_phys %lx to %lx\n", vaddr, phys); - - pgd = pgd_offset_k(vaddr); - if (pgd_none(*pgd)) { - printk("PGD FIXMAP MISSING, it should be setup in head.S!\n"); - return; - } - pud = pud_offset(pgd, vaddr); - if (pud_none(*pud)) { - pmd = (pmd_t *) spp_getpage(); - set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE | _PAGE_USER)); - if (pmd != pmd_offset(pud, 0)) { - printk("PAGETABLE BUG #01! %p <-> %p\n", pmd, pmd_offset(pud,0)); - return; - } - } - pmd = pmd_offset(pud, vaddr); - if (pmd_none(*pmd)) { - pte = (pte_t *) spp_getpage(); - set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE | _PAGE_USER)); - if (pte != pte_offset_kernel(pmd, 0)) { - printk("PAGETABLE BUG #02!\n"); - return; - } - } - new_pte = pfn_pte(phys >> PAGE_SHIFT, prot); - - pte = pte_offset_kernel(pmd, vaddr); - if (!pte_none(*pte) && - pte_val(*pte) != (pte_val(new_pte) & __supported_pte_mask)) - pte_ERROR(*pte); - set_pte(pte, new_pte); - - /* - * It's enough to flush this one mapping. - * (PGE mappings get flushed as well) - */ - __flush_tlb_one(vaddr); -} - -/* NOTE: this is meant to be run only at boot */ -void __init -__set_fixmap (enum fixed_addresses idx, unsigned long phys, pgprot_t prot) -{ - unsigned long address = __fix_to_virt(idx); - - if (idx >= __end_of_fixed_addresses) { - printk("Invalid __set_fixmap\n"); - return; - } - set_pte_phys(address, phys, prot); -} - -unsigned long __meminitdata table_start, table_end; - -static __meminit void *alloc_low_page(unsigned long *phys) -{ - unsigned long pfn = table_end++; - void *adr; - - if (after_bootmem) { - adr = (void *)get_zeroed_page(GFP_ATOMIC); - *phys = __pa(adr); - return adr; - } - - if (pfn >= end_pfn) - panic("alloc_low_page: ran out of memory"); - - adr = early_ioremap(pfn * PAGE_SIZE, PAGE_SIZE); - memset(adr, 0, PAGE_SIZE); - *phys = pfn * PAGE_SIZE; - return adr; -} - -static __meminit void unmap_low_page(void *adr) -{ - - if (after_bootmem) - return; - - early_iounmap(adr, PAGE_SIZE); -} - -/* Must run before zap_low_mappings */ -__meminit void *early_ioremap(unsigned long addr, unsigned long size) -{ - unsigned long vaddr; - pmd_t *pmd, *last_pmd; - int i, pmds; - - pmds = ((addr & ~PMD_MASK) + size + ~PMD_MASK) / PMD_SIZE; - vaddr = __START_KERNEL_map; - pmd = level2_kernel_pgt; - last_pmd = level2_kernel_pgt + PTRS_PER_PMD - 1; - for (; pmd <= last_pmd; pmd++, vaddr += PMD_SIZE) { - for (i = 0; i < pmds; i++) { - if (pmd_present(pmd[i])) - goto next; - } - vaddr += addr & ~PMD_MASK; - addr &= PMD_MASK; - for (i = 0; i < pmds; i++, addr += PMD_SIZE) - set_pmd(pmd + i,__pmd(addr | _KERNPG_TABLE | _PAGE_PSE)); - __flush_tlb(); - return (void *)vaddr; - next: - ; - } - printk("early_ioremap(0x%lx, %lu) failed\n", addr, size); - return NULL; -} - -/* To avoid virtual aliases later */ -__meminit void early_iounmap(void *addr, unsigned long size) -{ - unsigned long vaddr; - pmd_t *pmd; - int i, pmds; - - vaddr = (unsigned long)addr; - pmds = ((vaddr & ~PMD_MASK) + size + ~PMD_MASK) / PMD_SIZE; - pmd = level2_kernel_pgt + pmd_index(vaddr); - for (i = 0; i < pmds; i++) - pmd_clear(pmd + i); - __flush_tlb(); -} - -static void __meminit -phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end) -{ - int i = pmd_index(address); - - for (; i < PTRS_PER_PMD; i++, address += PMD_SIZE) { - unsigned long entry; - pmd_t *pmd = pmd_page + pmd_index(address); - - if (address >= end) { - if (!after_bootmem) - for (; i < PTRS_PER_PMD; i++, pmd++) - set_pmd(pmd, __pmd(0)); - break; - } - - if (pmd_val(*pmd)) - continue; - - entry = _PAGE_NX|_PAGE_PSE|_KERNPG_TABLE|_PAGE_GLOBAL|address; - entry &= __supported_pte_mask; - set_pmd(pmd, __pmd(entry)); - } -} - -static void __meminit -phys_pmd_update(pud_t *pud, unsigned long address, unsigned long end) -{ - pmd_t *pmd = pmd_offset(pud,0); - spin_lock(&init_mm.page_table_lock); - phys_pmd_init(pmd, address, end); - spin_unlock(&init_mm.page_table_lock); - __flush_tlb_all(); -} - -static void __meminit phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end) -{ - int i = pud_index(addr); - - - for (; i < PTRS_PER_PUD; i++, addr = (addr & PUD_MASK) + PUD_SIZE ) { - unsigned long pmd_phys; - pud_t *pud = pud_page + pud_index(addr); - pmd_t *pmd; - - if (addr >= end) - break; - - if (!after_bootmem && !e820_any_mapped(addr,addr+PUD_SIZE,0)) { - set_pud(pud, __pud(0)); - continue; - } - - if (pud_val(*pud)) { - phys_pmd_update(pud, addr, end); - continue; - } - - pmd = alloc_low_page(&pmd_phys); - spin_lock(&init_mm.page_table_lock); - set_pud(pud, __pud(pmd_phys | _KERNPG_TABLE)); - phys_pmd_init(pmd, addr, end); - spin_unlock(&init_mm.page_table_lock); - unmap_low_page(pmd); - } - __flush_tlb(); -} - -static void __init find_early_table_space(unsigned long end) -{ - unsigned long puds, pmds, tables, start; - - puds = (end + PUD_SIZE - 1) >> PUD_SHIFT; - pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT; - tables = round_up(puds * sizeof(pud_t), PAGE_SIZE) + - round_up(pmds * sizeof(pmd_t), PAGE_SIZE); - - /* RED-PEN putting page tables only on node 0 could - cause a hotspot and fill up ZONE_DMA. The page tables - need roughly 0.5KB per GB. */ - start = 0x8000; - table_start = find_e820_area(start, end, tables); - if (table_start == -1UL) - panic("Cannot find space for the kernel page tables"); - - table_start >>= PAGE_SHIFT; - table_end = table_start; - - early_printk("kernel direct mapping tables up to %lx @ %lx-%lx\n", - end, table_start << PAGE_SHIFT, - (table_start << PAGE_SHIFT) + tables); -} - -/* Setup the direct mapping of the physical memory at PAGE_OFFSET. - This runs before bootmem is initialized and gets pages directly from the - physical memory. To access them they are temporarily mapped. */ -void __meminit init_memory_mapping(unsigned long start, unsigned long end) -{ - unsigned long next; - - Dprintk("init_memory_mapping\n"); - - /* - * Find space for the kernel direct mapping tables. - * Later we should allocate these tables in the local node of the memory - * mapped. Unfortunately this is done currently before the nodes are - * discovered. - */ - if (!after_bootmem) - find_early_table_space(end); - - start = (unsigned long)__va(start); - end = (unsigned long)__va(end); - - for (; start < end; start = next) { - unsigned long pud_phys; - pgd_t *pgd = pgd_offset_k(start); - pud_t *pud; - - if (after_bootmem) - pud = pud_offset(pgd, start & PGDIR_MASK); - else - pud = alloc_low_page(&pud_phys); - - next = start + PGDIR_SIZE; - if (next > end) - next = end; - phys_pud_init(pud, __pa(start), __pa(next)); - if (!after_bootmem) - set_pgd(pgd_offset_k(start), mk_kernel_pgd(pud_phys)); - unmap_low_page(pud); - } - - if (!after_bootmem) - mmu_cr4_features = read_cr4(); - __flush_tlb_all(); -} - -#ifndef CONFIG_NUMA -void __init paging_init(void) -{ - unsigned long max_zone_pfns[MAX_NR_ZONES]; - memset(max_zone_pfns, 0, sizeof(max_zone_pfns)); - max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN; - max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN; - max_zone_pfns[ZONE_NORMAL] = end_pfn; - - memory_present(0, 0, end_pfn); - sparse_init(); - free_area_init_nodes(max_zone_pfns); -} -#endif - -/* Unmap a kernel mapping if it exists. This is useful to avoid prefetches - from the CPU leading to inconsistent cache lines. address and size - must be aligned to 2MB boundaries. - Does nothing when the mapping doesn't exist. */ -void __init clear_kernel_mapping(unsigned long address, unsigned long size) -{ - unsigned long end = address + size; - - BUG_ON(address & ~LARGE_PAGE_MASK); - BUG_ON(size & ~LARGE_PAGE_MASK); - - for (; address < end; address += LARGE_PAGE_SIZE) { - pgd_t *pgd = pgd_offset_k(address); - pud_t *pud; - pmd_t *pmd; - if (pgd_none(*pgd)) - continue; - pud = pud_offset(pgd, address); - if (pud_none(*pud)) - continue; - pmd = pmd_offset(pud, address); - if (!pmd || pmd_none(*pmd)) - continue; - if (0 == (pmd_val(*pmd) & _PAGE_PSE)) { - /* Could handle this, but it should not happen currently. */ - printk(KERN_ERR - "clear_kernel_mapping: mapping has been split. will leak memory\n"); - pmd_ERROR(*pmd); - } - set_pmd(pmd, __pmd(0)); - } - __flush_tlb_all(); -} - -/* - * Memory hotplug specific functions - */ -void online_page(struct page *page) -{ - ClearPageReserved(page); - init_page_count(page); - __free_page(page); - totalram_pages++; - num_physpages++; -} - -#ifdef CONFIG_MEMORY_HOTPLUG -/* - * Memory is added always to NORMAL zone. This means you will never get - * additional DMA/DMA32 memory. - */ -int arch_add_memory(int nid, u64 start, u64 size) -{ - struct pglist_data *pgdat = NODE_DATA(nid); - struct zone *zone = pgdat->node_zones + ZONE_NORMAL; - unsigned long start_pfn = start >> PAGE_SHIFT; - unsigned long nr_pages = size >> PAGE_SHIFT; - int ret; - - init_memory_mapping(start, (start + size -1)); - - ret = __add_pages(zone, start_pfn, nr_pages); - if (ret) - goto error; - - return ret; -error: - printk("%s: Problem encountered in __add_pages!\n", __func__); - return ret; -} -EXPORT_SYMBOL_GPL(arch_add_memory); - -int remove_memory(u64 start, u64 size) -{ - return -EINVAL; -} -EXPORT_SYMBOL_GPL(remove_memory); - -#if !defined(CONFIG_ACPI_NUMA) && defined(CONFIG_NUMA) -int memory_add_physaddr_to_nid(u64 start) -{ - return 0; -} -EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid); -#endif - -#endif /* CONFIG_MEMORY_HOTPLUG */ - -#ifdef CONFIG_MEMORY_HOTPLUG_RESERVE -/* - * Memory Hotadd without sparsemem. The mem_maps have been allocated in advance, - * just online the pages. - */ -int __add_pages(struct zone *z, unsigned long start_pfn, unsigned long nr_pages) -{ - int err = -EIO; - unsigned long pfn; - unsigned long total = 0, mem = 0; - for (pfn = start_pfn; pfn < start_pfn + nr_pages; pfn++) { - if (pfn_valid(pfn)) { - online_page(pfn_to_page(pfn)); - err = 0; - mem++; - } - total++; - } - if (!err) { - z->spanned_pages += total; - z->present_pages += mem; - z->zone_pgdat->node_spanned_pages += total; - z->zone_pgdat->node_present_pages += mem; - } - return err; -} -#endif - -static struct kcore_list kcore_mem, kcore_vmalloc, kcore_kernel, kcore_modules, - kcore_vsyscall; - -void __init mem_init(void) -{ - long codesize, reservedpages, datasize, initsize; - - pci_iommu_alloc(); - - /* clear the zero-page */ - memset(empty_zero_page, 0, PAGE_SIZE); - - reservedpages = 0; - - /* this will put all low memory onto the freelists */ -#ifdef CONFIG_NUMA - totalram_pages = numa_free_all_bootmem(); -#else - totalram_pages = free_all_bootmem(); -#endif - reservedpages = end_pfn - totalram_pages - - absent_pages_in_range(0, end_pfn); - - after_bootmem = 1; - - codesize = (unsigned long) &_etext - (unsigned long) &_text; - datasize = (unsigned long) &_edata - (unsigned long) &_etext; - initsize = (unsigned long) &__init_end - (unsigned long) &__init_begin; - - /* Register memory areas for /proc/kcore */ - kclist_add(&kcore_mem, __va(0), max_low_pfn << PAGE_SHIFT); - kclist_add(&kcore_vmalloc, (void *)VMALLOC_START, - VMALLOC_END-VMALLOC_START); - kclist_add(&kcore_kernel, &_stext, _end - _stext); - kclist_add(&kcore_modules, (void *)MODULES_VADDR, MODULES_LEN); - kclist_add(&kcore_vsyscall, (void *)VSYSCALL_START, - VSYSCALL_END - VSYSCALL_START); - - printk("Memory: %luk/%luk available (%ldk kernel code, %ldk reserved, %ldk data, %ldk init)\n", - (unsigned long) nr_free_pages() << (PAGE_SHIFT-10), - end_pfn << (PAGE_SHIFT-10), - codesize >> 10, - reservedpages << (PAGE_SHIFT-10), - datasize >> 10, - initsize >> 10); -} - -void free_init_pages(char *what, unsigned long begin, unsigned long end) -{ - unsigned long addr; - - if (begin >= end) - return; - - printk(KERN_INFO "Freeing %s: %luk freed\n", what, (end - begin) >> 10); - for (addr = begin; addr < end; addr += PAGE_SIZE) { - ClearPageReserved(virt_to_page(addr)); - init_page_count(virt_to_page(addr)); - memset((void *)(addr & ~(PAGE_SIZE-1)), - POISON_FREE_INITMEM, PAGE_SIZE); - if (addr >= __START_KERNEL_map) - change_page_attr_addr(addr, 1, __pgprot(0)); - free_page(addr); - totalram_pages++; - } - if (addr > __START_KERNEL_map) - global_flush_tlb(); -} - -void free_initmem(void) -{ - free_init_pages("unused kernel memory", - (unsigned long)(&__init_begin), - (unsigned long)(&__init_end)); -} - -#ifdef CONFIG_DEBUG_RODATA - -void mark_rodata_ro(void) -{ - unsigned long start = (unsigned long)_stext, end; - -#ifdef CONFIG_HOTPLUG_CPU - /* It must still be possible to apply SMP alternatives. */ - if (num_possible_cpus() > 1) - start = (unsigned long)_etext; -#endif - -#ifdef CONFIG_KPROBES - start = (unsigned long)__start_rodata; -#endif - - end = (unsigned long)__end_rodata; - start = (start + PAGE_SIZE - 1) & PAGE_MASK; - end &= PAGE_MASK; - if (end <= start) - return; - - change_page_attr_addr(start, (end - start) >> PAGE_SHIFT, PAGE_KERNEL_RO); - - printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n", - (end - start) >> 10); - - /* - * change_page_attr_addr() requires a global_flush_tlb() call after it. - * We do this after the printk so that if something went wrong in the - * change, the printk gets out at least to give a better debug hint - * of who is the culprit. - */ - global_flush_tlb(); -} -#endif - -#ifdef CONFIG_BLK_DEV_INITRD -void free_initrd_mem(unsigned long start, unsigned long end) -{ - free_init_pages("initrd memory", start, end); -} -#endif - -void __init reserve_bootmem_generic(unsigned long phys, unsigned len) -{ -#ifdef CONFIG_NUMA - int nid = phys_to_nid(phys); -#endif - unsigned long pfn = phys >> PAGE_SHIFT; - if (pfn >= end_pfn) { - /* This can happen with kdump kernels when accessing firmware - tables. */ - if (pfn < end_pfn_map) - return; - printk(KERN_ERR "reserve_bootmem: illegal reserve %lx %u\n", - phys, len); - return; - } - - /* Should check here against the e820 map to avoid double free */ -#ifdef CONFIG_NUMA - reserve_bootmem_node(NODE_DATA(nid), phys, len); -#else - reserve_bootmem(phys, len); -#endif - if (phys+len <= MAX_DMA_PFN*PAGE_SIZE) { - dma_reserve += len / PAGE_SIZE; - set_dma_reserve(dma_reserve); - } -} - -int kern_addr_valid(unsigned long addr) -{ - unsigned long above = ((long)addr) >> __VIRTUAL_MASK_SHIFT; - pgd_t *pgd; - pud_t *pud; - pmd_t *pmd; - pte_t *pte; - - if (above != 0 && above != -1UL) - return 0; - - pgd = pgd_offset_k(addr); - if (pgd_none(*pgd)) - return 0; - - pud = pud_offset(pgd, addr); - if (pud_none(*pud)) - return 0; - - pmd = pmd_offset(pud, addr); - if (pmd_none(*pmd)) - return 0; - if (pmd_large(*pmd)) - return pfn_valid(pmd_pfn(*pmd)); - - pte = pte_offset_kernel(pmd, addr); - if (pte_none(*pte)) - return 0; - return pfn_valid(pte_pfn(*pte)); -} - -/* A pseudo VMA to allow ptrace access for the vsyscall page. This only - covers the 64bit vsyscall page now. 32bit has a real VMA now and does - not need special handling anymore. */ - -static struct vm_area_struct gate_vma = { - .vm_start = VSYSCALL_START, - .vm_end = VSYSCALL_START + (VSYSCALL_MAPPED_PAGES << PAGE_SHIFT), - .vm_page_prot = PAGE_READONLY_EXEC, - .vm_flags = VM_READ | VM_EXEC -}; - -struct vm_area_struct *get_gate_vma(struct task_struct *tsk) -{ -#ifdef CONFIG_IA32_EMULATION - if (test_tsk_thread_flag(tsk, TIF_IA32)) - return NULL; -#endif - return &gate_vma; -} - -int in_gate_area(struct task_struct *task, unsigned long addr) -{ - struct vm_area_struct *vma = get_gate_vma(task); - if (!vma) - return 0; - return (addr >= vma->vm_start) && (addr < vma->vm_end); -} - -/* Use this when you have no reliable task/vma, typically from interrupt - * context. It is less reliable than using the task's vma and may give - * false positives. - */ -int in_gate_area_no_task(unsigned long addr) -{ - return (addr >= VSYSCALL_START) && (addr < VSYSCALL_END); -} - -void * __init alloc_bootmem_high_node(pg_data_t *pgdat, unsigned long size) -{ - return __alloc_bootmem_core(pgdat->bdata, size, - SMP_CACHE_BYTES, (4UL*1024*1024*1024), 0); -} - -const char *arch_vma_name(struct vm_area_struct *vma) -{ - if (vma->vm_mm && vma->vm_start == (long)vma->vm_mm->context.vdso) - return "[vdso]"; - if (vma == &gate_vma) - return "[vsyscall]"; - return NULL; -} diff --git a/arch/x86_64/mm/ioremap.c b/arch/x86_64/mm/ioremap.c deleted file mode 100644 index 6cac90aa5032..000000000000 --- a/arch/x86_64/mm/ioremap.c +++ /dev/null @@ -1,210 +0,0 @@ -/* - * arch/x86_64/mm/ioremap.c - * - * Re-map IO memory to kernel address space so that we can access it. - * This is needed for high PCI addresses that aren't mapped in the - * 640k-1MB IO memory area on PC's - * - * (C) Copyright 1995 1996 Linus Torvalds - */ - -#include <linux/vmalloc.h> -#include <linux/init.h> -#include <linux/slab.h> -#include <linux/module.h> -#include <linux/io.h> - -#include <asm/pgalloc.h> -#include <asm/fixmap.h> -#include <asm/tlbflush.h> -#include <asm/cacheflush.h> -#include <asm/proto.h> - -unsigned long __phys_addr(unsigned long x) -{ - if (x >= __START_KERNEL_map) - return x - __START_KERNEL_map + phys_base; - return x - PAGE_OFFSET; -} -EXPORT_SYMBOL(__phys_addr); - -#define ISA_START_ADDRESS 0xa0000 -#define ISA_END_ADDRESS 0x100000 - -/* - * Fix up the linear direct mapping of the kernel to avoid cache attribute - * conflicts. - */ -static int -ioremap_change_attr(unsigned long phys_addr, unsigned long size, - unsigned long flags) -{ - int err = 0; - if (phys_addr + size - 1 < (end_pfn_map << PAGE_SHIFT)) { - unsigned long npages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT; - unsigned long vaddr = (unsigned long) __va(phys_addr); - - /* - * Must use a address here and not struct page because the phys addr - * can be a in hole between nodes and not have an memmap entry. - */ - err = change_page_attr_addr(vaddr,npages,__pgprot(__PAGE_KERNEL|flags)); - if (!err) - global_flush_tlb(); - } - return err; -} - -/* - * Generic mapping function - */ - -/* - * Remap an arbitrary physical address space into the kernel virtual - * address space. Needed when the kernel wants to access high addresses - * directly. - * - * NOTE! We need to allow non-page-aligned mappings too: we will obviously - * have to convert them into an offset in a page-aligned mapping, but the - * caller shouldn't need to know that small detail. - */ -void __iomem * __ioremap(unsigned long phys_addr, unsigned long size, unsigned long flags) -{ - void * addr; - struct vm_struct * area; - unsigned long offset, last_addr; - pgprot_t pgprot; - - /* Don't allow wraparound or zero size */ - last_addr = phys_addr + size - 1; - if (!size || last_addr < phys_addr) - return NULL; - - /* - * Don't remap the low PCI/ISA area, it's always mapped.. - */ - if (phys_addr >= ISA_START_ADDRESS && last_addr < ISA_END_ADDRESS) - return (__force void __iomem *)phys_to_virt(phys_addr); - -#ifdef CONFIG_FLATMEM - /* - * Don't allow anybody to remap normal RAM that we're using.. - */ - if (last_addr < virt_to_phys(high_memory)) { - char *t_addr, *t_end; - struct page *page; - - t_addr = __va(phys_addr); - t_end = t_addr + (size - 1); - - for(page = virt_to_page(t_addr); page <= virt_to_page(t_end); page++) - if(!PageReserved(page)) - return NULL; - } -#endif - - pgprot = __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_GLOBAL - | _PAGE_DIRTY | _PAGE_ACCESSED | flags); - /* - * Mappings have to be page-aligned - */ - offset = phys_addr & ~PAGE_MASK; - phys_addr &= PAGE_MASK; - size = PAGE_ALIGN(last_addr+1) - phys_addr; - - /* - * Ok, go for it.. - */ - area = get_vm_area(size, VM_IOREMAP | (flags << 20)); - if (!area) - return NULL; - area->phys_addr = phys_addr; - addr = area->addr; - if (ioremap_page_range((unsigned long)addr, (unsigned long)addr + size, - phys_addr, pgprot)) { - remove_vm_area((void *)(PAGE_MASK & (unsigned long) addr)); - return NULL; - } - if (flags && ioremap_change_attr(phys_addr, size, flags) < 0) { - area->flags &= 0xffffff; - vunmap(addr); - return NULL; - } - return (__force void __iomem *) (offset + (char *)addr); -} -EXPORT_SYMBOL(__ioremap); - -/** - * ioremap_nocache - map bus memory into CPU space - * @offset: bus address of the memory - * @size: size of the resource to map - * - * ioremap_nocache performs a platform specific sequence of operations to - * make bus memory CPU accessible via the readb/readw/readl/writeb/ - * writew/writel functions and the other mmio helpers. The returned - * address is not guaranteed to be usable directly as a virtual - * address. - * - * This version of ioremap ensures that the memory is marked uncachable - * on the CPU as well as honouring existing caching rules from things like - * the PCI bus. Note that there are other caches and buffers on many - * busses. In particular driver authors should read up on PCI writes - * - * It's useful if some control registers are in such an area and - * write combining or read caching is not desirable: - * - * Must be freed with iounmap. - */ - -void __iomem *ioremap_nocache (unsigned long phys_addr, unsigned long size) -{ - return __ioremap(phys_addr, size, _PAGE_PCD); -} -EXPORT_SYMBOL(ioremap_nocache); - -/** - * iounmap - Free a IO remapping - * @addr: virtual address from ioremap_* - * - * Caller must ensure there is only one unmapping for the same pointer. - */ -void iounmap(volatile void __iomem *addr) -{ - struct vm_struct *p, *o; - - if (addr <= high_memory) - return; - if (addr >= phys_to_virt(ISA_START_ADDRESS) && - addr < phys_to_virt(ISA_END_ADDRESS)) - return; - - addr = (volatile void __iomem *)(PAGE_MASK & (unsigned long __force)addr); - /* Use the vm area unlocked, assuming the caller - ensures there isn't another iounmap for the same address - in parallel. Reuse of the virtual address is prevented by - leaving it in the global lists until we're done with it. - cpa takes care of the direct mappings. */ - read_lock(&vmlist_lock); - for (p = vmlist; p; p = p->next) { - if (p->addr == addr) - break; - } - read_unlock(&vmlist_lock); - - if (!p) { - printk("iounmap: bad address %p\n", addr); - dump_stack(); - return; - } - - /* Reset the direct mapping. Can block */ - if (p->flags >> 20) - ioremap_change_attr(p->phys_addr, p->size, 0); - - /* Finally remove it */ - o = remove_vm_area((void *)addr); - BUG_ON(p != o || o == NULL); - kfree(p); -} -EXPORT_SYMBOL(iounmap); - diff --git a/arch/x86_64/mm/k8topology.c b/arch/x86_64/mm/k8topology.c deleted file mode 100644 index a96006f7ae0c..000000000000 --- a/arch/x86_64/mm/k8topology.c +++ /dev/null @@ -1,182 +0,0 @@ -/* - * AMD K8 NUMA support. - * Discover the memory map and associated nodes. - * - * This version reads it directly from the K8 northbridge. - * - * Copyright 2002,2003 Andi Kleen, SuSE Labs. - */ -#include <linux/kernel.h> -#include <linux/init.h> -#include <linux/string.h> -#include <linux/module.h> -#include <linux/nodemask.h> -#include <asm/io.h> -#include <linux/pci_ids.h> -#include <asm/types.h> -#include <asm/mmzone.h> -#include <asm/proto.h> -#include <asm/e820.h> -#include <asm/pci-direct.h> -#include <asm/numa.h> - -static __init int find_northbridge(void) -{ - int num; - - for (num = 0; num < 32; num++) { - u32 header; - - header = read_pci_config(0, num, 0, 0x00); - if (header != (PCI_VENDOR_ID_AMD | (0x1100<<16))) - continue; - - header = read_pci_config(0, num, 1, 0x00); - if (header != (PCI_VENDOR_ID_AMD | (0x1101<<16))) - continue; - return num; - } - - return -1; -} - -int __init k8_scan_nodes(unsigned long start, unsigned long end) -{ - unsigned long prevbase; - struct bootnode nodes[8]; - int nodeid, i, j, nb; - unsigned char nodeids[8]; - int found = 0; - u32 reg; - unsigned numnodes; - unsigned num_cores; - - if (!early_pci_allowed()) - return -1; - - nb = find_northbridge(); - if (nb < 0) - return nb; - - printk(KERN_INFO "Scanning NUMA topology in Northbridge %d\n", nb); - - num_cores = (cpuid_ecx(0x80000008) & 0xff) + 1; - printk(KERN_INFO "CPU has %d num_cores\n", num_cores); - - reg = read_pci_config(0, nb, 0, 0x60); - numnodes = ((reg >> 4) & 0xF) + 1; - if (numnodes <= 1) - return -1; - - printk(KERN_INFO "Number of nodes %d\n", numnodes); - - memset(&nodes,0,sizeof(nodes)); - prevbase = 0; - for (i = 0; i < 8; i++) { - unsigned long base,limit; - u32 nodeid; - - base = read_pci_config(0, nb, 1, 0x40 + i*8); - limit = read_pci_config(0, nb, 1, 0x44 + i*8); - - nodeid = limit & 7; - nodeids[i] = nodeid; - if ((base & 3) == 0) { - if (i < numnodes) - printk("Skipping disabled node %d\n", i); - continue; - } - if (nodeid >= numnodes) { - printk("Ignoring excess node %d (%lx:%lx)\n", nodeid, - base, limit); - continue; - } - - if (!limit) { - printk(KERN_INFO "Skipping node entry %d (base %lx)\n", i, - base); - continue; - } - if ((base >> 8) & 3 || (limit >> 8) & 3) { - printk(KERN_ERR "Node %d using interleaving mode %lx/%lx\n", - nodeid, (base>>8)&3, (limit>>8) & 3); - return -1; - } - if (node_isset(nodeid, node_possible_map)) { - printk(KERN_INFO "Node %d already present. Skipping\n", - nodeid); - continue; - } - - limit >>= 16; - limit <<= 24; - limit |= (1<<24)-1; - limit++; - - if (limit > end_pfn << PAGE_SHIFT) - limit = end_pfn << PAGE_SHIFT; - if (limit <= base) - continue; - - base >>= 16; - base <<= 24; - - if (base < start) - base = start; - if (limit > end) - limit = end; - if (limit == base) { - printk(KERN_ERR "Empty node %d\n", nodeid); - continue; - } - if (limit < base) { - printk(KERN_ERR "Node %d bogus settings %lx-%lx.\n", - nodeid, base, limit); - continue; - } - - /* Could sort here, but pun for now. Should not happen anyroads. */ - if (prevbase > base) { - printk(KERN_ERR "Node map not sorted %lx,%lx\n", - prevbase,base); - return -1; - } - - printk(KERN_INFO "Node %d MemBase %016lx Limit %016lx\n", - nodeid, base, limit); - - found++; - - nodes[nodeid].start = base; - nodes[nodeid].end = limit; - e820_register_active_regions(nodeid, - nodes[nodeid].start >> PAGE_SHIFT, - nodes[nodeid].end >> PAGE_SHIFT); - - prevbase = base; - - node_set(nodeid, node_possible_map); - } - - if (!found) - return -1; - - memnode_shift = compute_hash_shift(nodes, 8); - if (memnode_shift < 0) { - printk(KERN_ERR "No NUMA node hash function found. Contact maintainer\n"); - return -1; - } - printk(KERN_INFO "Using node hash shift of %d\n", memnode_shift); - - for (i = 0; i < 8; i++) { - if (nodes[i].start != nodes[i].end) { - nodeid = nodeids[i]; - for (j = 0; j < num_cores; j++) - apicid_to_node[(nodeid * num_cores) + j] = i; - setup_node_bootmem(i, nodes[i].start, nodes[i].end); - } - } - - numa_init_array(); - return 0; -} diff --git a/arch/x86_64/mm/mmap.c b/arch/x86_64/mm/mmap.c deleted file mode 100644 index 80bba0dc000e..000000000000 --- a/arch/x86_64/mm/mmap.c +++ /dev/null @@ -1,29 +0,0 @@ -/* Copyright 2005 Andi Kleen, SuSE Labs. - * Licensed under GPL, v.2 - */ -#include <linux/mm.h> -#include <linux/sched.h> -#include <linux/random.h> -#include <asm/ia32.h> - -/* Notebook: move the mmap code from sys_x86_64.c over here. */ - -void arch_pick_mmap_layout(struct mm_struct *mm) -{ -#ifdef CONFIG_IA32_EMULATION - if (current_thread_info()->flags & _TIF_IA32) - return ia32_pick_mmap_layout(mm); -#endif - mm->mmap_base = TASK_UNMAPPED_BASE; - if (current->flags & PF_RANDOMIZE) { - /* Add 28bit randomness which is about 40bits of address space - because mmap base has to be page aligned. - or ~1/128 of the total user VM - (total user address space is 47bits) */ - unsigned rnd = get_random_int() & 0xfffffff; - mm->mmap_base += ((unsigned long)rnd) << PAGE_SHIFT; - } - mm->get_unmapped_area = arch_get_unmapped_area; - mm->unmap_area = arch_unmap_area; -} - diff --git a/arch/x86_64/mm/numa.c b/arch/x86_64/mm/numa.c deleted file mode 100644 index 6da235522269..000000000000 --- a/arch/x86_64/mm/numa.c +++ /dev/null @@ -1,648 +0,0 @@ -/* - * Generic VM initialization for x86-64 NUMA setups. - * Copyright 2002,2003 Andi Kleen, SuSE Labs. - */ -#include <linux/kernel.h> -#include <linux/mm.h> -#include <linux/string.h> -#include <linux/init.h> -#include <linux/bootmem.h> -#include <linux/mmzone.h> -#include <linux/ctype.h> -#include <linux/module.h> -#include <linux/nodemask.h> - -#include <asm/e820.h> -#include <asm/proto.h> -#include <asm/dma.h> -#include <asm/numa.h> -#include <asm/acpi.h> - -#ifndef Dprintk -#define Dprintk(x...) -#endif - -struct pglist_data *node_data[MAX_NUMNODES] __read_mostly; -bootmem_data_t plat_node_bdata[MAX_NUMNODES]; - -struct memnode memnode; - -unsigned char cpu_to_node[NR_CPUS] __read_mostly = { - [0 ... NR_CPUS-1] = NUMA_NO_NODE -}; -unsigned char apicid_to_node[MAX_LOCAL_APIC] __cpuinitdata = { - [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE -}; -cpumask_t node_to_cpumask[MAX_NUMNODES] __read_mostly; - -int numa_off __initdata; -unsigned long __initdata nodemap_addr; -unsigned long __initdata nodemap_size; - - -/* - * Given a shift value, try to populate memnodemap[] - * Returns : - * 1 if OK - * 0 if memnodmap[] too small (of shift too small) - * -1 if node overlap or lost ram (shift too big) - */ -static int __init -populate_memnodemap(const struct bootnode *nodes, int numnodes, int shift) -{ - int i; - int res = -1; - unsigned long addr, end; - - memset(memnodemap, 0xff, memnodemapsize); - for (i = 0; i < numnodes; i++) { - addr = nodes[i].start; - end = nodes[i].end; - if (addr >= end) - continue; - if ((end >> shift) >= memnodemapsize) - return 0; - do { - if (memnodemap[addr >> shift] != 0xff) - return -1; - memnodemap[addr >> shift] = i; - addr += (1UL << shift); - } while (addr < end); - res = 1; - } - return res; -} - -static int __init allocate_cachealigned_memnodemap(void) -{ - unsigned long pad, pad_addr; - - memnodemap = memnode.embedded_map; - if (memnodemapsize <= 48) - return 0; - - pad = L1_CACHE_BYTES - 1; - pad_addr = 0x8000; - nodemap_size = pad + memnodemapsize; - nodemap_addr = find_e820_area(pad_addr, end_pfn<<PAGE_SHIFT, - nodemap_size); - if (nodemap_addr == -1UL) { - printk(KERN_ERR - "NUMA: Unable to allocate Memory to Node hash map\n"); - nodemap_addr = nodemap_size = 0; - return -1; - } - pad_addr = (nodemap_addr + pad) & ~pad; - memnodemap = phys_to_virt(pad_addr); - - printk(KERN_DEBUG "NUMA: Allocated memnodemap from %lx - %lx\n", - nodemap_addr, nodemap_addr + nodemap_size); - return 0; -} - -/* - * The LSB of all start and end addresses in the node map is the value of the - * maximum possible shift. - */ -static int __init -extract_lsb_from_nodes (const struct bootnode *nodes, int numnodes) -{ - int i, nodes_used = 0; - unsigned long start, end; - unsigned long bitfield = 0, memtop = 0; - - for (i = 0; i < numnodes; i++) { - start = nodes[i].start; - end = nodes[i].end; - if (start >= end) - continue; - bitfield |= start; - nodes_used++; - if (end > memtop) - memtop = end; - } - if (nodes_used <= 1) - i = 63; - else - i = find_first_bit(&bitfield, sizeof(unsigned long)*8); - memnodemapsize = (memtop >> i)+1; - return i; -} - -int __init compute_hash_shift(struct bootnode *nodes, int numnodes) -{ - int shift; - - shift = extract_lsb_from_nodes(nodes, numnodes); - if (allocate_cachealigned_memnodemap()) - return -1; - printk(KERN_DEBUG "NUMA: Using %d for the hash shift.\n", - shift); - - if (populate_memnodemap(nodes, numnodes, shift) != 1) { - printk(KERN_INFO - "Your memory is not aligned you need to rebuild your kernel " - "with a bigger NODEMAPSIZE shift=%d\n", - shift); - return -1; - } - return shift; -} - -#ifdef CONFIG_SPARSEMEM -int early_pfn_to_nid(unsigned long pfn) -{ - return phys_to_nid(pfn << PAGE_SHIFT); -} -#endif - -static void * __init -early_node_mem(int nodeid, unsigned long start, unsigned long end, - unsigned long size) -{ - unsigned long mem = find_e820_area(start, end, size); - void *ptr; - if (mem != -1L) - return __va(mem); - ptr = __alloc_bootmem_nopanic(size, - SMP_CACHE_BYTES, __pa(MAX_DMA_ADDRESS)); - if (ptr == 0) { - printk(KERN_ERR "Cannot find %lu bytes in node %d\n", - size, nodeid); - return NULL; - } - return ptr; -} - -/* Initialize bootmem allocator for a node */ -void __init setup_node_bootmem(int nodeid, unsigned long start, unsigned long end) -{ - unsigned long start_pfn, end_pfn, bootmap_pages, bootmap_size, bootmap_start; - unsigned long nodedata_phys; - void *bootmap; - const int pgdat_size = round_up(sizeof(pg_data_t), PAGE_SIZE); - - start = round_up(start, ZONE_ALIGN); - - printk(KERN_INFO "Bootmem setup node %d %016lx-%016lx\n", nodeid, start, end); - - start_pfn = start >> PAGE_SHIFT; - end_pfn = end >> PAGE_SHIFT; - - node_data[nodeid] = early_node_mem(nodeid, start, end, pgdat_size); - if (node_data[nodeid] == NULL) - return; - nodedata_phys = __pa(node_data[nodeid]); - - memset(NODE_DATA(nodeid), 0, sizeof(pg_data_t)); - NODE_DATA(nodeid)->bdata = &plat_node_bdata[nodeid]; - NODE_DATA(nodeid)->node_start_pfn = start_pfn; - NODE_DATA(nodeid)->node_spanned_pages = end_pfn - start_pfn; - - /* Find a place for the bootmem map */ - bootmap_pages = bootmem_bootmap_pages(end_pfn - start_pfn); - bootmap_start = round_up(nodedata_phys + pgdat_size, PAGE_SIZE); - bootmap = early_node_mem(nodeid, bootmap_start, end, - bootmap_pages<<PAGE_SHIFT); - if (bootmap == NULL) { - if (nodedata_phys < start || nodedata_phys >= end) - free_bootmem((unsigned long)node_data[nodeid],pgdat_size); - node_data[nodeid] = NULL; - return; - } - bootmap_start = __pa(bootmap); - Dprintk("bootmap start %lu pages %lu\n", bootmap_start, bootmap_pages); - - bootmap_size = init_bootmem_node(NODE_DATA(nodeid), - bootmap_start >> PAGE_SHIFT, - start_pfn, end_pfn); - - free_bootmem_with_active_regions(nodeid, end); - - reserve_bootmem_node(NODE_DATA(nodeid), nodedata_phys, pgdat_size); - reserve_bootmem_node(NODE_DATA(nodeid), bootmap_start, bootmap_pages<<PAGE_SHIFT); -#ifdef CONFIG_ACPI_NUMA - srat_reserve_add_area(nodeid); -#endif - node_set_online(nodeid); -} - -/* Initialize final allocator for a zone */ -void __init setup_node_zones(int nodeid) -{ - unsigned long start_pfn, end_pfn, memmapsize, limit; - - start_pfn = node_start_pfn(nodeid); - end_pfn = node_end_pfn(nodeid); - - Dprintk(KERN_INFO "Setting up memmap for node %d %lx-%lx\n", - nodeid, start_pfn, end_pfn); - - /* Try to allocate mem_map at end to not fill up precious <4GB - memory. */ - memmapsize = sizeof(struct page) * (end_pfn-start_pfn); - limit = end_pfn << PAGE_SHIFT; -#ifdef CONFIG_FLAT_NODE_MEM_MAP - NODE_DATA(nodeid)->node_mem_map = - __alloc_bootmem_core(NODE_DATA(nodeid)->bdata, - memmapsize, SMP_CACHE_BYTES, - round_down(limit - memmapsize, PAGE_SIZE), - limit); -#endif -} - -void __init numa_init_array(void) -{ - int rr, i; - /* There are unfortunately some poorly designed mainboards around - that only connect memory to a single CPU. This breaks the 1:1 cpu->node - mapping. To avoid this fill in the mapping for all possible - CPUs, as the number of CPUs is not known yet. - We round robin the existing nodes. */ - rr = first_node(node_online_map); - for (i = 0; i < NR_CPUS; i++) { - if (cpu_to_node[i] != NUMA_NO_NODE) - continue; - numa_set_node(i, rr); - rr = next_node(rr, node_online_map); - if (rr == MAX_NUMNODES) - rr = first_node(node_online_map); - } - -} - -#ifdef CONFIG_NUMA_EMU -/* Numa emulation */ -char *cmdline __initdata; - -/* - * Setups up nid to range from addr to addr + size. If the end boundary is - * greater than max_addr, then max_addr is used instead. The return value is 0 - * if there is additional memory left for allocation past addr and -1 otherwise. - * addr is adjusted to be at the end of the node. - */ -static int __init setup_node_range(int nid, struct bootnode *nodes, u64 *addr, - u64 size, u64 max_addr) -{ - int ret = 0; - nodes[nid].start = *addr; - *addr += size; - if (*addr >= max_addr) { - *addr = max_addr; - ret = -1; - } - nodes[nid].end = *addr; - node_set(nid, node_possible_map); - printk(KERN_INFO "Faking node %d at %016Lx-%016Lx (%LuMB)\n", nid, - nodes[nid].start, nodes[nid].end, - (nodes[nid].end - nodes[nid].start) >> 20); - return ret; -} - -/* - * Splits num_nodes nodes up equally starting at node_start. The return value - * is the number of nodes split up and addr is adjusted to be at the end of the - * last node allocated. - */ -static int __init split_nodes_equally(struct bootnode *nodes, u64 *addr, - u64 max_addr, int node_start, - int num_nodes) -{ - unsigned int big; - u64 size; - int i; - - if (num_nodes <= 0) - return -1; - if (num_nodes > MAX_NUMNODES) - num_nodes = MAX_NUMNODES; - size = (max_addr - *addr - e820_hole_size(*addr, max_addr)) / - num_nodes; - /* - * Calculate the number of big nodes that can be allocated as a result - * of consolidating the leftovers. - */ - big = ((size & ~FAKE_NODE_MIN_HASH_MASK) * num_nodes) / - FAKE_NODE_MIN_SIZE; - - /* Round down to nearest FAKE_NODE_MIN_SIZE. */ - size &= FAKE_NODE_MIN_HASH_MASK; - if (!size) { - printk(KERN_ERR "Not enough memory for each node. " - "NUMA emulation disabled.\n"); - return -1; - } - - for (i = node_start; i < num_nodes + node_start; i++) { - u64 end = *addr + size; - if (i < big) - end += FAKE_NODE_MIN_SIZE; - /* - * The final node can have the remaining system RAM. Other - * nodes receive roughly the same amount of available pages. - */ - if (i == num_nodes + node_start - 1) - end = max_addr; - else - while (end - *addr - e820_hole_size(*addr, end) < - size) { - end += FAKE_NODE_MIN_SIZE; - if (end > max_addr) { - end = max_addr; - break; - } - } - if (setup_node_range(i, nodes, addr, end - *addr, max_addr) < 0) - break; - } - return i - node_start + 1; -} - -/* - * Splits the remaining system RAM into chunks of size. The remaining memory is - * always assigned to a final node and can be asymmetric. Returns the number of - * nodes split. - */ -static int __init split_nodes_by_size(struct bootnode *nodes, u64 *addr, - u64 max_addr, int node_start, u64 size) -{ - int i = node_start; - size = (size << 20) & FAKE_NODE_MIN_HASH_MASK; - while (!setup_node_range(i++, nodes, addr, size, max_addr)) - ; - return i - node_start; -} - -/* - * Sets up the system RAM area from start_pfn to end_pfn according to the - * numa=fake command-line option. - */ -static int __init numa_emulation(unsigned long start_pfn, unsigned long end_pfn) -{ - struct bootnode nodes[MAX_NUMNODES]; - u64 addr = start_pfn << PAGE_SHIFT; - u64 max_addr = end_pfn << PAGE_SHIFT; - int num_nodes = 0; - int coeff_flag; - int coeff = -1; - int num = 0; - u64 size; - int i; - - memset(&nodes, 0, sizeof(nodes)); - /* - * If the numa=fake command-line is just a single number N, split the - * system RAM into N fake nodes. - */ - if (!strchr(cmdline, '*') && !strchr(cmdline, ',')) { - num_nodes = split_nodes_equally(nodes, &addr, max_addr, 0, - simple_strtol(cmdline, NULL, 0)); - if (num_nodes < 0) - return num_nodes; - goto out; - } - - /* Parse the command line. */ - for (coeff_flag = 0; ; cmdline++) { - if (*cmdline && isdigit(*cmdline)) { - num = num * 10 + *cmdline - '0'; - continue; - } - if (*cmdline == '*') { - if (num > 0) - coeff = num; - coeff_flag = 1; - } - if (!*cmdline || *cmdline == ',') { - if (!coeff_flag) - coeff = 1; - /* - * Round down to the nearest FAKE_NODE_MIN_SIZE. - * Command-line coefficients are in megabytes. - */ - size = ((u64)num << 20) & FAKE_NODE_MIN_HASH_MASK; - if (size) - for (i = 0; i < coeff; i++, num_nodes++) - if (setup_node_range(num_nodes, nodes, - &addr, size, max_addr) < 0) - goto done; - if (!*cmdline) - break; - coeff_flag = 0; - coeff = -1; - } - num = 0; - } -done: - if (!num_nodes) - return -1; - /* Fill remainder of system RAM, if appropriate. */ - if (addr < max_addr) { - if (coeff_flag && coeff < 0) { - /* Split remaining nodes into num-sized chunks */ - num_nodes += split_nodes_by_size(nodes, &addr, max_addr, - num_nodes, num); - goto out; - } - switch (*(cmdline - 1)) { - case '*': - /* Split remaining nodes into coeff chunks */ - if (coeff <= 0) - break; - num_nodes += split_nodes_equally(nodes, &addr, max_addr, - num_nodes, coeff); - break; - case ',': - /* Do not allocate remaining system RAM */ - break; - default: - /* Give one final node */ - setup_node_range(num_nodes, nodes, &addr, - max_addr - addr, max_addr); - num_nodes++; - } - } -out: - memnode_shift = compute_hash_shift(nodes, num_nodes); - if (memnode_shift < 0) { - memnode_shift = 0; - printk(KERN_ERR "No NUMA hash function found. NUMA emulation " - "disabled.\n"); - return -1; - } - - /* - * We need to vacate all active ranges that may have been registered by - * SRAT and set acpi_numa to -1 so that srat_disabled() always returns - * true. NUMA emulation has succeeded so we will not scan ACPI nodes. - */ - remove_all_active_ranges(); -#ifdef CONFIG_ACPI_NUMA - acpi_numa = -1; -#endif - for_each_node_mask(i, node_possible_map) { - e820_register_active_regions(i, nodes[i].start >> PAGE_SHIFT, - nodes[i].end >> PAGE_SHIFT); - setup_node_bootmem(i, nodes[i].start, nodes[i].end); - } - acpi_fake_nodes(nodes, num_nodes); - numa_init_array(); - return 0; -} -#endif /* CONFIG_NUMA_EMU */ - -void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn) -{ - int i; - - nodes_clear(node_possible_map); - -#ifdef CONFIG_NUMA_EMU - if (cmdline && !numa_emulation(start_pfn, end_pfn)) - return; - nodes_clear(node_possible_map); -#endif - -#ifdef CONFIG_ACPI_NUMA - if (!numa_off && !acpi_scan_nodes(start_pfn << PAGE_SHIFT, - end_pfn << PAGE_SHIFT)) - return; - nodes_clear(node_possible_map); -#endif - -#ifdef CONFIG_K8_NUMA - if (!numa_off && !k8_scan_nodes(start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT)) - return; - nodes_clear(node_possible_map); -#endif - printk(KERN_INFO "%s\n", - numa_off ? "NUMA turned off" : "No NUMA configuration found"); - - printk(KERN_INFO "Faking a node at %016lx-%016lx\n", - start_pfn << PAGE_SHIFT, - end_pfn << PAGE_SHIFT); - /* setup dummy node covering all memory */ - memnode_shift = 63; - memnodemap = memnode.embedded_map; - memnodemap[0] = 0; - nodes_clear(node_online_map); - node_set_online(0); - node_set(0, node_possible_map); - for (i = 0; i < NR_CPUS; i++) - numa_set_node(i, 0); - node_to_cpumask[0] = cpumask_of_cpu(0); - e820_register_active_regions(0, start_pfn, end_pfn); - setup_node_bootmem(0, start_pfn << PAGE_SHIFT, end_pfn << PAGE_SHIFT); -} - -__cpuinit void numa_add_cpu(int cpu) -{ - set_bit(cpu, &node_to_cpumask[cpu_to_node(cpu)]); -} - -void __cpuinit numa_set_node(int cpu, int node) -{ - cpu_pda(cpu)->nodenumber = node; - cpu_to_node[cpu] = node; -} - -unsigned long __init numa_free_all_bootmem(void) -{ - int i; - unsigned long pages = 0; - for_each_online_node(i) { - pages += free_all_bootmem_node(NODE_DATA(i)); - } - return pages; -} - -void __init paging_init(void) -{ - int i; - unsigned long max_zone_pfns[MAX_NR_ZONES]; - memset(max_zone_pfns, 0, sizeof(max_zone_pfns)); - max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN; - max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN; - max_zone_pfns[ZONE_NORMAL] = end_pfn; - - sparse_memory_present_with_active_regions(MAX_NUMNODES); - sparse_init(); - - for_each_online_node(i) { - setup_node_zones(i); - } - - free_area_init_nodes(max_zone_pfns); -} - -static __init int numa_setup(char *opt) -{ - if (!opt) - return -EINVAL; - if (!strncmp(opt,"off",3)) - numa_off = 1; -#ifdef CONFIG_NUMA_EMU - if (!strncmp(opt, "fake=", 5)) - cmdline = opt + 5; -#endif -#ifdef CONFIG_ACPI_NUMA - if (!strncmp(opt,"noacpi",6)) - acpi_numa = -1; - if (!strncmp(opt,"hotadd=", 7)) - hotadd_percent = simple_strtoul(opt+7, NULL, 10); -#endif - return 0; -} - -early_param("numa", numa_setup); - -/* - * Setup early cpu_to_node. - * - * Populate cpu_to_node[] only if x86_cpu_to_apicid[], - * and apicid_to_node[] tables have valid entries for a CPU. - * This means we skip cpu_to_node[] initialisation for NUMA - * emulation and faking node case (when running a kernel compiled - * for NUMA on a non NUMA box), which is OK as cpu_to_node[] - * is already initialized in a round robin manner at numa_init_array, - * prior to this call, and this initialization is good enough - * for the fake NUMA cases. - */ -void __init init_cpu_to_node(void) -{ - int i; - for (i = 0; i < NR_CPUS; i++) { - u8 apicid = x86_cpu_to_apicid[i]; - if (apicid == BAD_APICID) - continue; - if (apicid_to_node[apicid] == NUMA_NO_NODE) - continue; - numa_set_node(i,apicid_to_node[apicid]); - } -} - -EXPORT_SYMBOL(cpu_to_node); -EXPORT_SYMBOL(node_to_cpumask); -EXPORT_SYMBOL(memnode); -EXPORT_SYMBOL(node_data); - -#ifdef CONFIG_DISCONTIGMEM -/* - * Functions to convert PFNs from/to per node page addresses. - * These are out of line because they are quite big. - * They could be all tuned by pre caching more state. - * Should do that. - */ - -int pfn_valid(unsigned long pfn) -{ - unsigned nid; - if (pfn >= num_physpages) - return 0; - nid = pfn_to_nid(pfn); - if (nid == 0xff) - return 0; - return pfn >= node_start_pfn(nid) && (pfn) < node_end_pfn(nid); -} -EXPORT_SYMBOL(pfn_valid); -#endif diff --git a/arch/x86_64/mm/pageattr.c b/arch/x86_64/mm/pageattr.c deleted file mode 100644 index 10b9809ce821..000000000000 --- a/arch/x86_64/mm/pageattr.c +++ /dev/null @@ -1,249 +0,0 @@ -/* - * Copyright 2002 Andi Kleen, SuSE Labs. - * Thanks to Ben LaHaise for precious feedback. - */ - -#include <linux/mm.h> -#include <linux/sched.h> -#include <linux/highmem.h> -#include <linux/module.h> -#include <linux/slab.h> -#include <asm/uaccess.h> -#include <asm/processor.h> -#include <asm/tlbflush.h> -#include <asm/io.h> - -pte_t *lookup_address(unsigned long address) -{ - pgd_t *pgd = pgd_offset_k(address); - pud_t *pud; - pmd_t *pmd; - pte_t *pte; - if (pgd_none(*pgd)) - return NULL; - pud = pud_offset(pgd, address); - if (!pud_present(*pud)) - return NULL; - pmd = pmd_offset(pud, address); - if (!pmd_present(*pmd)) - return NULL; - if (pmd_large(*pmd)) - return (pte_t *)pmd; - pte = pte_offset_kernel(pmd, address); - if (pte && !pte_present(*pte)) - pte = NULL; - return pte; -} - -static struct page *split_large_page(unsigned long address, pgprot_t prot, - pgprot_t ref_prot) -{ - int i; - unsigned long addr; - struct page *base = alloc_pages(GFP_KERNEL, 0); - pte_t *pbase; - if (!base) - return NULL; - /* - * page_private is used to track the number of entries in - * the page table page have non standard attributes. - */ - SetPagePrivate(base); - page_private(base) = 0; - - address = __pa(address); - addr = address & LARGE_PAGE_MASK; - pbase = (pte_t *)page_address(base); - for (i = 0; i < PTRS_PER_PTE; i++, addr += PAGE_SIZE) { - pbase[i] = pfn_pte(addr >> PAGE_SHIFT, - addr == address ? prot : ref_prot); - } - return base; -} - -static void cache_flush_page(void *adr) -{ - int i; - for (i = 0; i < PAGE_SIZE; i += boot_cpu_data.x86_clflush_size) - asm volatile("clflush (%0)" :: "r" (adr + i)); -} - -static void flush_kernel_map(void *arg) -{ - struct list_head *l = (struct list_head *)arg; - struct page *pg; - - /* When clflush is available always use it because it is - much cheaper than WBINVD. */ - /* clflush is still broken. Disable for now. */ - if (1 || !cpu_has_clflush) - asm volatile("wbinvd" ::: "memory"); - else list_for_each_entry(pg, l, lru) { - void *adr = page_address(pg); - cache_flush_page(adr); - } - __flush_tlb_all(); -} - -static inline void flush_map(struct list_head *l) -{ - on_each_cpu(flush_kernel_map, l, 1, 1); -} - -static LIST_HEAD(deferred_pages); /* protected by init_mm.mmap_sem */ - -static inline void save_page(struct page *fpage) -{ - if (!test_and_set_bit(PG_arch_1, &fpage->flags)) - list_add(&fpage->lru, &deferred_pages); -} - -/* - * No more special protections in this 2/4MB area - revert to a - * large page again. - */ -static void revert_page(unsigned long address, pgprot_t ref_prot) -{ - pgd_t *pgd; - pud_t *pud; - pmd_t *pmd; - pte_t large_pte; - unsigned long pfn; - - pgd = pgd_offset_k(address); - BUG_ON(pgd_none(*pgd)); - pud = pud_offset(pgd,address); - BUG_ON(pud_none(*pud)); - pmd = pmd_offset(pud, address); - BUG_ON(pmd_val(*pmd) & _PAGE_PSE); - pfn = (__pa(address) & LARGE_PAGE_MASK) >> PAGE_SHIFT; - large_pte = pfn_pte(pfn, ref_prot); - large_pte = pte_mkhuge(large_pte); - set_pte((pte_t *)pmd, large_pte); -} - -static int -__change_page_attr(unsigned long address, unsigned long pfn, pgprot_t prot, - pgprot_t ref_prot) -{ - pte_t *kpte; - struct page *kpte_page; - pgprot_t ref_prot2; - - kpte = lookup_address(address); - if (!kpte) return 0; - kpte_page = virt_to_page(((unsigned long)kpte) & PAGE_MASK); - BUG_ON(PageLRU(kpte_page)); - BUG_ON(PageCompound(kpte_page)); - if (pgprot_val(prot) != pgprot_val(ref_prot)) { - if (!pte_huge(*kpte)) { - set_pte(kpte, pfn_pte(pfn, prot)); - } else { - /* - * split_large_page will take the reference for this - * change_page_attr on the split page. - */ - struct page *split; - ref_prot2 = pte_pgprot(pte_clrhuge(*kpte)); - split = split_large_page(address, prot, ref_prot2); - if (!split) - return -ENOMEM; - set_pte(kpte, mk_pte(split, ref_prot2)); - kpte_page = split; - } - page_private(kpte_page)++; - } else if (!pte_huge(*kpte)) { - set_pte(kpte, pfn_pte(pfn, ref_prot)); - BUG_ON(page_private(kpte_page) == 0); - page_private(kpte_page)--; - } else - BUG(); - - /* on x86-64 the direct mapping set at boot is not using 4k pages */ - BUG_ON(PageReserved(kpte_page)); - - save_page(kpte_page); - if (page_private(kpte_page) == 0) - revert_page(address, ref_prot); - return 0; -} - -/* - * Change the page attributes of an page in the linear mapping. - * - * This should be used when a page is mapped with a different caching policy - * than write-back somewhere - some CPUs do not like it when mappings with - * different caching policies exist. This changes the page attributes of the - * in kernel linear mapping too. - * - * The caller needs to ensure that there are no conflicting mappings elsewhere. - * This function only deals with the kernel linear map. - * - * Caller must call global_flush_tlb() after this. - */ -int change_page_attr_addr(unsigned long address, int numpages, pgprot_t prot) -{ - int err = 0, kernel_map = 0; - int i; - - if (address >= __START_KERNEL_map - && address < __START_KERNEL_map + KERNEL_TEXT_SIZE) { - address = (unsigned long)__va(__pa(address)); - kernel_map = 1; - } - - down_write(&init_mm.mmap_sem); - for (i = 0; i < numpages; i++, address += PAGE_SIZE) { - unsigned long pfn = __pa(address) >> PAGE_SHIFT; - - if (!kernel_map || pte_present(pfn_pte(0, prot))) { - err = __change_page_attr(address, pfn, prot, PAGE_KERNEL); - if (err) - break; - } - /* Handle kernel mapping too which aliases part of the - * lowmem */ - if (__pa(address) < KERNEL_TEXT_SIZE) { - unsigned long addr2; - pgprot_t prot2; - addr2 = __START_KERNEL_map + __pa(address); - /* Make sure the kernel mappings stay executable */ - prot2 = pte_pgprot(pte_mkexec(pfn_pte(0, prot))); - err = __change_page_attr(addr2, pfn, prot2, - PAGE_KERNEL_EXEC); - } - } - up_write(&init_mm.mmap_sem); - return err; -} - -/* Don't call this for MMIO areas that may not have a mem_map entry */ -int change_page_attr(struct page *page, int numpages, pgprot_t prot) -{ - unsigned long addr = (unsigned long)page_address(page); - return change_page_attr_addr(addr, numpages, prot); -} - -void global_flush_tlb(void) -{ - struct page *pg, *next; - struct list_head l; - - down_read(&init_mm.mmap_sem); - list_replace_init(&deferred_pages, &l); - up_read(&init_mm.mmap_sem); - - flush_map(&l); - - list_for_each_entry_safe(pg, next, &l, lru) { - list_del(&pg->lru); - clear_bit(PG_arch_1, &pg->flags); - if (page_private(pg) != 0) - continue; - ClearPagePrivate(pg); - __free_page(pg); - } -} - -EXPORT_SYMBOL(change_page_attr); -EXPORT_SYMBOL(global_flush_tlb); diff --git a/arch/x86_64/mm/srat.c b/arch/x86_64/mm/srat.c deleted file mode 100644 index acdf03e19146..000000000000 --- a/arch/x86_64/mm/srat.c +++ /dev/null @@ -1,566 +0,0 @@ -/* - * ACPI 3.0 based NUMA setup - * Copyright 2004 Andi Kleen, SuSE Labs. - * - * Reads the ACPI SRAT table to figure out what memory belongs to which CPUs. - * - * Called from acpi_numa_init while reading the SRAT and SLIT tables. - * Assumes all memory regions belonging to a single proximity domain - * are in one chunk. Holes between them will be included in the node. - */ - -#include <linux/kernel.h> -#include <linux/acpi.h> -#include <linux/mmzone.h> -#include <linux/bitmap.h> -#include <linux/module.h> -#include <linux/topology.h> -#include <linux/bootmem.h> -#include <linux/mm.h> -#include <asm/proto.h> -#include <asm/numa.h> -#include <asm/e820.h> - -int acpi_numa __initdata; - -static struct acpi_table_slit *acpi_slit; - -static nodemask_t nodes_parsed __initdata; -static struct bootnode nodes[MAX_NUMNODES] __initdata; -static struct bootnode nodes_add[MAX_NUMNODES]; -static int found_add_area __initdata; -int hotadd_percent __initdata = 0; - -/* Too small nodes confuse the VM badly. Usually they result - from BIOS bugs. */ -#define NODE_MIN_SIZE (4*1024*1024) - -static __init int setup_node(int pxm) -{ - return acpi_map_pxm_to_node(pxm); -} - -static __init int conflicting_nodes(unsigned long start, unsigned long end) -{ - int i; - for_each_node_mask(i, nodes_parsed) { - struct bootnode *nd = &nodes[i]; - if (nd->start == nd->end) - continue; - if (nd->end > start && nd->start < end) - return i; - if (nd->end == end && nd->start == start) - return i; - } - return -1; -} - -static __init void cutoff_node(int i, unsigned long start, unsigned long end) -{ - struct bootnode *nd = &nodes[i]; - - if (found_add_area) - return; - - if (nd->start < start) { - nd->start = start; - if (nd->end < nd->start) - nd->start = nd->end; - } - if (nd->end > end) { - nd->end = end; - if (nd->start > nd->end) - nd->start = nd->end; - } -} - -static __init void bad_srat(void) -{ - int i; - printk(KERN_ERR "SRAT: SRAT not used.\n"); - acpi_numa = -1; - found_add_area = 0; - for (i = 0; i < MAX_LOCAL_APIC; i++) - apicid_to_node[i] = NUMA_NO_NODE; - for (i = 0; i < MAX_NUMNODES; i++) - nodes_add[i].start = nodes[i].end = 0; - remove_all_active_ranges(); -} - -static __init inline int srat_disabled(void) -{ - return numa_off || acpi_numa < 0; -} - -/* - * A lot of BIOS fill in 10 (= no distance) everywhere. This messes - * up the NUMA heuristics which wants the local node to have a smaller - * distance than the others. - * Do some quick checks here and only use the SLIT if it passes. - */ -static __init int slit_valid(struct acpi_table_slit *slit) -{ - int i, j; - int d = slit->locality_count; - for (i = 0; i < d; i++) { - for (j = 0; j < d; j++) { - u8 val = slit->entry[d*i + j]; - if (i == j) { - if (val != LOCAL_DISTANCE) - return 0; - } else if (val <= LOCAL_DISTANCE) - return 0; - } - } - return 1; -} - -/* Callback for SLIT parsing */ -void __init acpi_numa_slit_init(struct acpi_table_slit *slit) -{ - if (!slit_valid(slit)) { - printk(KERN_INFO "ACPI: SLIT table looks invalid. Not used.\n"); - return; - } - acpi_slit = slit; -} - -/* Callback for Proximity Domain -> LAPIC mapping */ -void __init -acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *pa) -{ - int pxm, node; - if (srat_disabled()) - return; - if (pa->header.length != sizeof(struct acpi_srat_cpu_affinity)) { - bad_srat(); - return; - } - if ((pa->flags & ACPI_SRAT_CPU_ENABLED) == 0) - return; - pxm = pa->proximity_domain_lo; - node = setup_node(pxm); - if (node < 0) { - printk(KERN_ERR "SRAT: Too many proximity domains %x\n", pxm); - bad_srat(); - return; - } - apicid_to_node[pa->apic_id] = node; - acpi_numa = 1; - printk(KERN_INFO "SRAT: PXM %u -> APIC %u -> Node %u\n", - pxm, pa->apic_id, node); -} - -#ifdef CONFIG_MEMORY_HOTPLUG_RESERVE -/* - * Protect against too large hotadd areas that would fill up memory. - */ -static int hotadd_enough_memory(struct bootnode *nd) -{ - static unsigned long allocated; - static unsigned long last_area_end; - unsigned long pages = (nd->end - nd->start) >> PAGE_SHIFT; - long mem = pages * sizeof(struct page); - unsigned long addr; - unsigned long allowed; - unsigned long oldpages = pages; - - if (mem < 0) - return 0; - allowed = (end_pfn - absent_pages_in_range(0, end_pfn)) * PAGE_SIZE; - allowed = (allowed / 100) * hotadd_percent; - if (allocated + mem > allowed) { - unsigned long range; - /* Give them at least part of their hotadd memory upto hotadd_percent - It would be better to spread the limit out - over multiple hotplug areas, but that is too complicated - right now */ - if (allocated >= allowed) - return 0; - range = allowed - allocated; - pages = (range / PAGE_SIZE); - mem = pages * sizeof(struct page); - nd->end = nd->start + range; - } - /* Not completely fool proof, but a good sanity check */ - addr = find_e820_area(last_area_end, end_pfn<<PAGE_SHIFT, mem); - if (addr == -1UL) - return 0; - if (pages != oldpages) - printk(KERN_NOTICE "SRAT: Hotadd area limited to %lu bytes\n", - pages << PAGE_SHIFT); - last_area_end = addr + mem; - allocated += mem; - return 1; -} - -static int update_end_of_memory(unsigned long end) -{ - found_add_area = 1; - if ((end >> PAGE_SHIFT) > end_pfn) - end_pfn = end >> PAGE_SHIFT; - return 1; -} - -static inline int save_add_info(void) -{ - return hotadd_percent > 0; -} -#else -int update_end_of_memory(unsigned long end) {return -1;} -static int hotadd_enough_memory(struct bootnode *nd) {return 1;} -#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE -static inline int save_add_info(void) {return 1;} -#else -static inline int save_add_info(void) {return 0;} -#endif -#endif -/* - * Update nodes_add and decide if to include add are in the zone. - * Both SPARSE and RESERVE need nodes_add infomation. - * This code supports one contigious hot add area per node. - */ -static int reserve_hotadd(int node, unsigned long start, unsigned long end) -{ - unsigned long s_pfn = start >> PAGE_SHIFT; - unsigned long e_pfn = end >> PAGE_SHIFT; - int ret = 0, changed = 0; - struct bootnode *nd = &nodes_add[node]; - - /* I had some trouble with strange memory hotadd regions breaking - the boot. Be very strict here and reject anything unexpected. - If you want working memory hotadd write correct SRATs. - - The node size check is a basic sanity check to guard against - mistakes */ - if ((signed long)(end - start) < NODE_MIN_SIZE) { - printk(KERN_ERR "SRAT: Hotplug area too small\n"); - return -1; - } - - /* This check might be a bit too strict, but I'm keeping it for now. */ - if (absent_pages_in_range(s_pfn, e_pfn) != e_pfn - s_pfn) { - printk(KERN_ERR - "SRAT: Hotplug area %lu -> %lu has existing memory\n", - s_pfn, e_pfn); - return -1; - } - - if (!hotadd_enough_memory(&nodes_add[node])) { - printk(KERN_ERR "SRAT: Hotplug area too large\n"); - return -1; - } - - /* Looks good */ - - if (nd->start == nd->end) { - nd->start = start; - nd->end = end; - changed = 1; - } else { - if (nd->start == end) { - nd->start = start; - changed = 1; - } - if (nd->end == start) { - nd->end = end; - changed = 1; - } - if (!changed) - printk(KERN_ERR "SRAT: Hotplug zone not continuous. Partly ignored\n"); - } - - ret = update_end_of_memory(nd->end); - - if (changed) - printk(KERN_INFO "SRAT: hot plug zone found %Lx - %Lx\n", nd->start, nd->end); - return ret; -} - -/* Callback for parsing of the Proximity Domain <-> Memory Area mappings */ -void __init -acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma) -{ - struct bootnode *nd, oldnode; - unsigned long start, end; - int node, pxm; - int i; - - if (srat_disabled()) - return; - if (ma->header.length != sizeof(struct acpi_srat_mem_affinity)) { - bad_srat(); - return; - } - if ((ma->flags & ACPI_SRAT_MEM_ENABLED) == 0) - return; - - if ((ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) && !save_add_info()) - return; - start = ma->base_address; - end = start + ma->length; - pxm = ma->proximity_domain; - node = setup_node(pxm); - if (node < 0) { - printk(KERN_ERR "SRAT: Too many proximity domains.\n"); - bad_srat(); - return; - } - i = conflicting_nodes(start, end); - if (i == node) { - printk(KERN_WARNING - "SRAT: Warning: PXM %d (%lx-%lx) overlaps with itself (%Lx-%Lx)\n", - pxm, start, end, nodes[i].start, nodes[i].end); - } else if (i >= 0) { - printk(KERN_ERR - "SRAT: PXM %d (%lx-%lx) overlaps with PXM %d (%Lx-%Lx)\n", - pxm, start, end, node_to_pxm(i), - nodes[i].start, nodes[i].end); - bad_srat(); - return; - } - nd = &nodes[node]; - oldnode = *nd; - if (!node_test_and_set(node, nodes_parsed)) { - nd->start = start; - nd->end = end; - } else { - if (start < nd->start) - nd->start = start; - if (nd->end < end) - nd->end = end; - } - - printk(KERN_INFO "SRAT: Node %u PXM %u %Lx-%Lx\n", node, pxm, - nd->start, nd->end); - e820_register_active_regions(node, nd->start >> PAGE_SHIFT, - nd->end >> PAGE_SHIFT); - push_node_boundaries(node, nd->start >> PAGE_SHIFT, - nd->end >> PAGE_SHIFT); - - if ((ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) && - (reserve_hotadd(node, start, end) < 0)) { - /* Ignore hotadd region. Undo damage */ - printk(KERN_NOTICE "SRAT: Hotplug region ignored\n"); - *nd = oldnode; - if ((nd->start | nd->end) == 0) - node_clear(node, nodes_parsed); - } -} - -/* Sanity check to catch more bad SRATs (they are amazingly common). - Make sure the PXMs cover all memory. */ -static int __init nodes_cover_memory(const struct bootnode *nodes) -{ - int i; - unsigned long pxmram, e820ram; - - pxmram = 0; - for_each_node_mask(i, nodes_parsed) { - unsigned long s = nodes[i].start >> PAGE_SHIFT; - unsigned long e = nodes[i].end >> PAGE_SHIFT; - pxmram += e - s; - pxmram -= absent_pages_in_range(s, e); - if ((long)pxmram < 0) - pxmram = 0; - } - - e820ram = end_pfn - absent_pages_in_range(0, end_pfn); - /* We seem to lose 3 pages somewhere. Allow a bit of slack. */ - if ((long)(e820ram - pxmram) >= 1*1024*1024) { - printk(KERN_ERR - "SRAT: PXMs only cover %luMB of your %luMB e820 RAM. Not used.\n", - (pxmram << PAGE_SHIFT) >> 20, - (e820ram << PAGE_SHIFT) >> 20); - return 0; - } - return 1; -} - -static void unparse_node(int node) -{ - int i; - node_clear(node, nodes_parsed); - for (i = 0; i < MAX_LOCAL_APIC; i++) { - if (apicid_to_node[i] == node) - apicid_to_node[i] = NUMA_NO_NODE; - } -} - -void __init acpi_numa_arch_fixup(void) {} - -/* Use the information discovered above to actually set up the nodes. */ -int __init acpi_scan_nodes(unsigned long start, unsigned long end) -{ - int i; - - if (acpi_numa <= 0) - return -1; - - /* First clean up the node list */ - for (i = 0; i < MAX_NUMNODES; i++) { - cutoff_node(i, start, end); - if ((nodes[i].end - nodes[i].start) < NODE_MIN_SIZE) { - unparse_node(i); - node_set_offline(i); - } - } - - if (!nodes_cover_memory(nodes)) { - bad_srat(); - return -1; - } - - memnode_shift = compute_hash_shift(nodes, MAX_NUMNODES); - if (memnode_shift < 0) { - printk(KERN_ERR - "SRAT: No NUMA node hash function found. Contact maintainer\n"); - bad_srat(); - return -1; - } - - node_possible_map = nodes_parsed; - - /* Finally register nodes */ - for_each_node_mask(i, node_possible_map) - setup_node_bootmem(i, nodes[i].start, nodes[i].end); - /* Try again in case setup_node_bootmem missed one due - to missing bootmem */ - for_each_node_mask(i, node_possible_map) - if (!node_online(i)) - setup_node_bootmem(i, nodes[i].start, nodes[i].end); - - for (i = 0; i < NR_CPUS; i++) { - if (cpu_to_node[i] == NUMA_NO_NODE) - continue; - if (!node_isset(cpu_to_node[i], node_possible_map)) - numa_set_node(i, NUMA_NO_NODE); - } - numa_init_array(); - return 0; -} - -#ifdef CONFIG_NUMA_EMU -static int __init find_node_by_addr(unsigned long addr) -{ - int ret = NUMA_NO_NODE; - int i; - - for_each_node_mask(i, nodes_parsed) { - /* - * Find the real node that this emulated node appears on. For - * the sake of simplicity, we only use a real node's starting - * address to determine which emulated node it appears on. - */ - if (addr >= nodes[i].start && addr < nodes[i].end) { - ret = i; - break; - } - } - return i; -} - -/* - * In NUMA emulation, we need to setup proximity domain (_PXM) to node ID - * mappings that respect the real ACPI topology but reflect our emulated - * environment. For each emulated node, we find which real node it appears on - * and create PXM to NID mappings for those fake nodes which mirror that - * locality. SLIT will now represent the correct distances between emulated - * nodes as a result of the real topology. - */ -void __init acpi_fake_nodes(const struct bootnode *fake_nodes, int num_nodes) -{ - int i, j; - int fake_node_to_pxm_map[MAX_NUMNODES] = { - [0 ... MAX_NUMNODES-1] = PXM_INVAL - }; - unsigned char fake_apicid_to_node[MAX_LOCAL_APIC] = { - [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE - }; - - printk(KERN_INFO "Faking PXM affinity for fake nodes on real " - "topology.\n"); - for (i = 0; i < num_nodes; i++) { - int nid, pxm; - - nid = find_node_by_addr(fake_nodes[i].start); - if (nid == NUMA_NO_NODE) - continue; - pxm = node_to_pxm(nid); - if (pxm == PXM_INVAL) - continue; - fake_node_to_pxm_map[i] = pxm; - /* - * For each apicid_to_node mapping that exists for this real - * node, it must now point to the fake node ID. - */ - for (j = 0; j < MAX_LOCAL_APIC; j++) - if (apicid_to_node[j] == nid) - fake_apicid_to_node[j] = i; - } - for (i = 0; i < num_nodes; i++) - __acpi_map_pxm_to_node(fake_node_to_pxm_map[i], i); - memcpy(apicid_to_node, fake_apicid_to_node, sizeof(apicid_to_node)); - - nodes_clear(nodes_parsed); - for (i = 0; i < num_nodes; i++) - if (fake_nodes[i].start != fake_nodes[i].end) - node_set(i, nodes_parsed); - WARN_ON(!nodes_cover_memory(fake_nodes)); -} - -static int null_slit_node_compare(int a, int b) -{ - return node_to_pxm(a) == node_to_pxm(b); -} -#else -static int null_slit_node_compare(int a, int b) -{ - return a == b; -} -#endif /* CONFIG_NUMA_EMU */ - -void __init srat_reserve_add_area(int nodeid) -{ - if (found_add_area && nodes_add[nodeid].end) { - u64 total_mb; - - printk(KERN_INFO "SRAT: Reserving hot-add memory space " - "for node %d at %Lx-%Lx\n", - nodeid, nodes_add[nodeid].start, nodes_add[nodeid].end); - total_mb = (nodes_add[nodeid].end - nodes_add[nodeid].start) - >> PAGE_SHIFT; - total_mb *= sizeof(struct page); - total_mb >>= 20; - printk(KERN_INFO "SRAT: This will cost you %Lu MB of " - "pre-allocated memory.\n", (unsigned long long)total_mb); - reserve_bootmem_node(NODE_DATA(nodeid), nodes_add[nodeid].start, - nodes_add[nodeid].end - nodes_add[nodeid].start); - } -} - -int __node_distance(int a, int b) -{ - int index; - - if (!acpi_slit) - return null_slit_node_compare(a, b) ? LOCAL_DISTANCE : - REMOTE_DISTANCE; - index = acpi_slit->locality_count * node_to_pxm(a); - return acpi_slit->entry[index + node_to_pxm(b)]; -} - -EXPORT_SYMBOL(__node_distance); - -int memory_add_physaddr_to_nid(u64 start) -{ - int i, ret = 0; - - for_each_node(i) - if (nodes_add[i].start <= start && nodes_add[i].end > start) - ret = i; - - return ret; -} -EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid); - diff --git a/arch/x86_64/oprofile/Kconfig b/arch/x86_64/oprofile/Kconfig deleted file mode 100644 index d8a84088471a..000000000000 --- a/arch/x86_64/oprofile/Kconfig +++ /dev/null @@ -1,17 +0,0 @@ -config PROFILING - bool "Profiling support (EXPERIMENTAL)" - help - Say Y here to enable the extended profiling support mechanisms used - by profilers such as OProfile. - - -config OPROFILE - tristate "OProfile system profiling (EXPERIMENTAL)" - depends on PROFILING - help - OProfile is a profiling system capable of profiling the - whole system, include the kernel, kernel modules, libraries, - and applications. - - If unsure, say N. - diff --git a/arch/x86_64/oprofile/Makefile b/arch/x86_64/oprofile/Makefile deleted file mode 100644 index 6be32683e1bc..000000000000 --- a/arch/x86_64/oprofile/Makefile +++ /dev/null @@ -1,19 +0,0 @@ -# -# oprofile for x86-64. -# Just reuse the one from i386. -# - -obj-$(CONFIG_OPROFILE) += oprofile.o - -DRIVER_OBJS = $(addprefix ../../../drivers/oprofile/, \ - oprof.o cpu_buffer.o buffer_sync.o \ - event_buffer.o oprofile_files.o \ - oprofilefs.o oprofile_stats.o \ - timer_int.o ) - -OPROFILE-y := init.o backtrace.o -OPROFILE-$(CONFIG_X86_LOCAL_APIC) += nmi_int.o op_model_athlon.o op_model_p4.o \ - op_model_ppro.o -OPROFILE-$(CONFIG_X86_IO_APIC) += nmi_timer_int.o - -oprofile-y = $(DRIVER_OBJS) $(addprefix ../../i386/oprofile/, $(OPROFILE-y)) diff --git a/arch/x86_64/pci/Makefile b/arch/x86_64/pci/Makefile deleted file mode 100644 index c9eddc8859c0..000000000000 --- a/arch/x86_64/pci/Makefile +++ /dev/null @@ -1,27 +0,0 @@ -# -# Makefile for X86_64 specific PCI routines -# -# Reuse the i386 PCI subsystem -# -EXTRA_CFLAGS += -Iarch/i386/pci - -obj-y := i386.o -obj-$(CONFIG_PCI_DIRECT)+= direct.o -obj-y += fixup.o init.o -obj-$(CONFIG_ACPI) += acpi.o -obj-y += legacy.o irq.o common.o early.o -# mmconfig has a 64bit special -obj-$(CONFIG_PCI_MMCONFIG) += mmconfig.o direct.o mmconfig-shared.o - -obj-$(CONFIG_NUMA) += k8-bus.o - -direct-y += ../../i386/pci/direct.o -acpi-y += ../../i386/pci/acpi.o -legacy-y += ../../i386/pci/legacy.o -irq-y += ../../i386/pci/irq.o -common-y += ../../i386/pci/common.o -fixup-y += ../../i386/pci/fixup.o -i386-y += ../../i386/pci/i386.o -init-y += ../../i386/pci/init.o -early-y += ../../i386/pci/early.o -mmconfig-shared-y += ../../i386/pci/mmconfig-shared.o diff --git a/arch/x86_64/pci/k8-bus.c b/arch/x86_64/pci/k8-bus.c deleted file mode 100644 index 9cc813e29706..000000000000 --- a/arch/x86_64/pci/k8-bus.c +++ /dev/null @@ -1,83 +0,0 @@ -#include <linux/init.h> -#include <linux/pci.h> -#include <asm/mpspec.h> -#include <linux/cpumask.h> - -/* - * This discovers the pcibus <-> node mapping on AMD K8. - * - * RED-PEN need to call this again on PCI hotplug - * RED-PEN empty cpus get reported wrong - */ - -#define NODE_ID_REGISTER 0x60 -#define NODE_ID(dword) (dword & 0x07) -#define LDT_BUS_NUMBER_REGISTER_0 0x94 -#define LDT_BUS_NUMBER_REGISTER_1 0xB4 -#define LDT_BUS_NUMBER_REGISTER_2 0xD4 -#define NR_LDT_BUS_NUMBER_REGISTERS 3 -#define SECONDARY_LDT_BUS_NUMBER(dword) ((dword >> 8) & 0xFF) -#define SUBORDINATE_LDT_BUS_NUMBER(dword) ((dword >> 16) & 0xFF) -#define PCI_DEVICE_ID_K8HTCONFIG 0x1100 - -/** - * fill_mp_bus_to_cpumask() - * fills the mp_bus_to_cpumask array based according to the LDT Bus Number - * Registers found in the K8 northbridge - */ -__init static int -fill_mp_bus_to_cpumask(void) -{ - struct pci_dev *nb_dev = NULL; - int i, j; - u32 ldtbus, nid; - static int lbnr[3] = { - LDT_BUS_NUMBER_REGISTER_0, - LDT_BUS_NUMBER_REGISTER_1, - LDT_BUS_NUMBER_REGISTER_2 - }; - - while ((nb_dev = pci_get_device(PCI_VENDOR_ID_AMD, - PCI_DEVICE_ID_K8HTCONFIG, nb_dev))) { - pci_read_config_dword(nb_dev, NODE_ID_REGISTER, &nid); - - for (i = 0; i < NR_LDT_BUS_NUMBER_REGISTERS; i++) { - pci_read_config_dword(nb_dev, lbnr[i], &ldtbus); - /* - * if there are no busses hanging off of the current - * ldt link then both the secondary and subordinate - * bus number fields are set to 0. - * - * RED-PEN - * This is slightly broken because it assumes - * HT node IDs == Linux node ids, which is not always - * true. However it is probably mostly true. - */ - if (!(SECONDARY_LDT_BUS_NUMBER(ldtbus) == 0 - && SUBORDINATE_LDT_BUS_NUMBER(ldtbus) == 0)) { - for (j = SECONDARY_LDT_BUS_NUMBER(ldtbus); - j <= SUBORDINATE_LDT_BUS_NUMBER(ldtbus); - j++) { - struct pci_bus *bus; - struct pci_sysdata *sd; - - long node = NODE_ID(nid); - /* Algorithm a bit dumb, but - it shouldn't matter here */ - bus = pci_find_bus(0, j); - if (!bus) - continue; - if (!node_online(node)) - node = 0; - - sd = bus->sysdata; - sd->node = node; - } - } - } - } - - return 0; -} - -fs_initcall(fill_mp_bus_to_cpumask); diff --git a/arch/x86_64/pci/mmconfig.c b/arch/x86_64/pci/mmconfig.c deleted file mode 100644 index 4095e4d66a1d..000000000000 --- a/arch/x86_64/pci/mmconfig.c +++ /dev/null @@ -1,157 +0,0 @@ -/* - * mmconfig.c - Low-level direct PCI config space access via MMCONFIG - * - * This is an 64bit optimized version that always keeps the full mmconfig - * space mapped. This allows lockless config space operation. - */ - -#include <linux/pci.h> -#include <linux/init.h> -#include <linux/acpi.h> -#include <linux/bitmap.h> -#include <asm/e820.h> - -#include "pci.h" - -/* Static virtual mapping of the MMCONFIG aperture */ -struct mmcfg_virt { - struct acpi_mcfg_allocation *cfg; - char __iomem *virt; -}; -static struct mmcfg_virt *pci_mmcfg_virt; - -static char __iomem *get_virt(unsigned int seg, unsigned bus) -{ - struct acpi_mcfg_allocation *cfg; - int cfg_num; - - for (cfg_num = 0; cfg_num < pci_mmcfg_config_num; cfg_num++) { - cfg = pci_mmcfg_virt[cfg_num].cfg; - if (cfg->pci_segment == seg && - (cfg->start_bus_number <= bus) && - (cfg->end_bus_number >= bus)) - return pci_mmcfg_virt[cfg_num].virt; - } - - /* Fall back to type 0 */ - return NULL; -} - -static char __iomem *pci_dev_base(unsigned int seg, unsigned int bus, unsigned int devfn) -{ - char __iomem *addr; - if (seg == 0 && bus < PCI_MMCFG_MAX_CHECK_BUS && - test_bit(32*bus + PCI_SLOT(devfn), pci_mmcfg_fallback_slots)) - return NULL; - addr = get_virt(seg, bus); - if (!addr) - return NULL; - return addr + ((bus << 20) | (devfn << 12)); -} - -static int pci_mmcfg_read(unsigned int seg, unsigned int bus, - unsigned int devfn, int reg, int len, u32 *value) -{ - char __iomem *addr; - - /* Why do we have this when nobody checks it. How about a BUG()!? -AK */ - if (unlikely((bus > 255) || (devfn > 255) || (reg > 4095))) { - *value = -1; - return -EINVAL; - } - - addr = pci_dev_base(seg, bus, devfn); - if (!addr) - return pci_conf1_read(seg,bus,devfn,reg,len,value); - - switch (len) { - case 1: - *value = mmio_config_readb(addr + reg); - break; - case 2: - *value = mmio_config_readw(addr + reg); - break; - case 4: - *value = mmio_config_readl(addr + reg); - break; - } - - return 0; -} - -static int pci_mmcfg_write(unsigned int seg, unsigned int bus, - unsigned int devfn, int reg, int len, u32 value) -{ - char __iomem *addr; - - /* Why do we have this when nobody checks it. How about a BUG()!? -AK */ - if (unlikely((bus > 255) || (devfn > 255) || (reg > 4095))) - return -EINVAL; - - addr = pci_dev_base(seg, bus, devfn); - if (!addr) - return pci_conf1_write(seg,bus,devfn,reg,len,value); - - switch (len) { - case 1: - mmio_config_writeb(addr + reg, value); - break; - case 2: - mmio_config_writew(addr + reg, value); - break; - case 4: - mmio_config_writel(addr + reg, value); - break; - } - - return 0; -} - -static struct pci_raw_ops pci_mmcfg = { - .read = pci_mmcfg_read, - .write = pci_mmcfg_write, -}; - -static void __iomem * __init mcfg_ioremap(struct acpi_mcfg_allocation *cfg) -{ - void __iomem *addr; - u32 size; - - size = (cfg->end_bus_number + 1) << 20; - addr = ioremap_nocache(cfg->address, size); - if (addr) { - printk(KERN_INFO "PCI: Using MMCONFIG at %Lx - %Lx\n", - cfg->address, cfg->address + size - 1); - } - return addr; -} - -int __init pci_mmcfg_arch_reachable(unsigned int seg, unsigned int bus, - unsigned int devfn) -{ - return pci_dev_base(seg, bus, devfn) != NULL; -} - -int __init pci_mmcfg_arch_init(void) -{ - int i; - pci_mmcfg_virt = kmalloc(sizeof(*pci_mmcfg_virt) * - pci_mmcfg_config_num, GFP_KERNEL); - if (pci_mmcfg_virt == NULL) { - printk(KERN_ERR "PCI: Can not allocate memory for mmconfig structures\n"); - return 0; - } - - for (i = 0; i < pci_mmcfg_config_num; ++i) { - pci_mmcfg_virt[i].cfg = &pci_mmcfg_config[i]; - pci_mmcfg_virt[i].virt = mcfg_ioremap(&pci_mmcfg_config[i]); - if (!pci_mmcfg_virt[i].virt) { - printk(KERN_ERR "PCI: Cannot map mmconfig aperture for " - "segment %d\n", - pci_mmcfg_config[i].pci_segment); - return 0; - } - } - raw_pci_ops = &pci_mmcfg; - return 1; -} diff --git a/arch/x86_64/vdso/.gitignore b/arch/x86_64/vdso/.gitignore deleted file mode 100644 index f8b69d84238e..000000000000 --- a/arch/x86_64/vdso/.gitignore +++ /dev/null @@ -1 +0,0 @@ -vdso.lds diff --git a/arch/x86_64/vdso/Makefile b/arch/x86_64/vdso/Makefile deleted file mode 100644 index 8d03de029d9b..000000000000 --- a/arch/x86_64/vdso/Makefile +++ /dev/null @@ -1,49 +0,0 @@ -# -# x86-64 vDSO. -# - -# files to link into the vdso -# vdso-start.o has to be first -vobjs-y := vdso-start.o vdso-note.o vclock_gettime.o vgetcpu.o vvar.o - -# files to link into kernel -obj-y := vma.o vdso.o vdso-syms.o - -vobjs := $(foreach F,$(vobjs-y),$(obj)/$F) - -$(obj)/vdso.o: $(obj)/vdso.so - -targets += vdso.so vdso.lds $(vobjs-y) vdso-syms.o - -# The DSO images are built using a special linker script. -quiet_cmd_syscall = SYSCALL $@ - cmd_syscall = $(CC) -m elf_x86_64 -nostdlib $(SYSCFLAGS_$(@F)) \ - -Wl,-T,$(filter-out FORCE,$^) -o $@ - -export CPPFLAGS_vdso.lds += -P -C -U$(ARCH) - -vdso-flags = -fPIC -shared -Wl,-soname=linux-vdso.so.1 \ - $(call ld-option, -Wl$(comma)--hash-style=sysv) \ - -Wl,-z,max-page-size=4096 -Wl,-z,common-page-size=4096 -SYSCFLAGS_vdso.so = $(vdso-flags) - -$(obj)/vdso.o: $(src)/vdso.S $(obj)/vdso.so - -$(obj)/vdso.so: $(src)/vdso.lds $(vobjs) FORCE - $(call if_changed,syscall) - -CFL := $(PROFILING) -mcmodel=small -fPIC -g0 -O2 -fasynchronous-unwind-tables -m64 - -$(obj)/vclock_gettime.o: CFLAGS = $(CFL) -$(obj)/vgetcpu.o: CFLAGS = $(CFL) - -# We also create a special relocatable object that should mirror the symbol -# table and layout of the linked DSO. With ld -R we can then refer to -# these symbols in the kernel code rather than hand-coded addresses. -extra-y += vdso-syms.o -$(obj)/built-in.o: $(obj)/vdso-syms.o -$(obj)/built-in.o: ld_flags += -R $(obj)/vdso-syms.o - -SYSCFLAGS_vdso-syms.o = -r -d -$(obj)/vdso-syms.o: $(src)/vdso.lds $(vobjs) FORCE - $(call if_changed,syscall) diff --git a/arch/x86_64/vdso/vclock_gettime.c b/arch/x86_64/vdso/vclock_gettime.c deleted file mode 100644 index 5b54cdfb2b07..000000000000 --- a/arch/x86_64/vdso/vclock_gettime.c +++ /dev/null @@ -1,121 +0,0 @@ -/* - * Copyright 2006 Andi Kleen, SUSE Labs. - * Subject to the GNU Public License, v.2 - * - * Fast user context implementation of clock_gettime and gettimeofday. - * - * The code should have no internal unresolved relocations. - * Check with readelf after changing. - * Also alternative() doesn't work. - */ - -#include <linux/kernel.h> -#include <linux/posix-timers.h> -#include <linux/time.h> -#include <linux/string.h> -#include <asm/vsyscall.h> -#include <asm/vgtod.h> -#include <asm/timex.h> -#include <asm/hpet.h> -#include <asm/unistd.h> -#include <asm/io.h> -#include <asm/vgtod.h> -#include "vextern.h" - -#define gtod vdso_vsyscall_gtod_data - -static long vdso_fallback_gettime(long clock, struct timespec *ts) -{ - long ret; - asm("syscall" : "=a" (ret) : - "0" (__NR_clock_gettime),"D" (clock), "S" (ts) : "memory"); - return ret; -} - -static inline long vgetns(void) -{ - long v; - cycles_t (*vread)(void); - vread = gtod->clock.vread; - v = (vread() - gtod->clock.cycle_last) & gtod->clock.mask; - return (v * gtod->clock.mult) >> gtod->clock.shift; -} - -static noinline int do_realtime(struct timespec *ts) -{ - unsigned long seq, ns; - do { - seq = read_seqbegin(>od->lock); - ts->tv_sec = gtod->wall_time_sec; - ts->tv_nsec = gtod->wall_time_nsec; - ns = vgetns(); - } while (unlikely(read_seqretry(>od->lock, seq))); - timespec_add_ns(ts, ns); - return 0; -} - -/* Copy of the version in kernel/time.c which we cannot directly access */ -static void vset_normalized_timespec(struct timespec *ts, long sec, long nsec) -{ - while (nsec >= NSEC_PER_SEC) { - nsec -= NSEC_PER_SEC; - ++sec; - } - while (nsec < 0) { - nsec += NSEC_PER_SEC; - --sec; - } - ts->tv_sec = sec; - ts->tv_nsec = nsec; -} - -static noinline int do_monotonic(struct timespec *ts) -{ - unsigned long seq, ns, secs; - do { - seq = read_seqbegin(>od->lock); - secs = gtod->wall_time_sec; - ns = gtod->wall_time_nsec + vgetns(); - secs += gtod->wall_to_monotonic.tv_sec; - ns += gtod->wall_to_monotonic.tv_nsec; - } while (unlikely(read_seqretry(>od->lock, seq))); - vset_normalized_timespec(ts, secs, ns); - return 0; -} - -int __vdso_clock_gettime(clockid_t clock, struct timespec *ts) -{ - if (likely(gtod->sysctl_enabled && gtod->clock.vread)) - switch (clock) { - case CLOCK_REALTIME: - return do_realtime(ts); - case CLOCK_MONOTONIC: - return do_monotonic(ts); - } - return vdso_fallback_gettime(clock, ts); -} -int clock_gettime(clockid_t, struct timespec *) - __attribute__((weak, alias("__vdso_clock_gettime"))); - -int __vdso_gettimeofday(struct timeval *tv, struct timezone *tz) -{ - long ret; - if (likely(gtod->sysctl_enabled && gtod->clock.vread)) { - BUILD_BUG_ON(offsetof(struct timeval, tv_usec) != - offsetof(struct timespec, tv_nsec) || - sizeof(*tv) != sizeof(struct timespec)); - do_realtime((struct timespec *)tv); - tv->tv_usec /= 1000; - if (unlikely(tz != NULL)) { - /* This relies on gcc inlining the memcpy. We'll notice - if it ever fails to do so. */ - memcpy(tz, >od->sys_tz, sizeof(struct timezone)); - } - return 0; - } - asm("syscall" : "=a" (ret) : - "0" (__NR_gettimeofday), "D" (tv), "S" (tz) : "memory"); - return ret; -} -int gettimeofday(struct timeval *, struct timezone *) - __attribute__((weak, alias("__vdso_gettimeofday"))); diff --git a/arch/x86_64/vdso/vdso-note.S b/arch/x86_64/vdso/vdso-note.S deleted file mode 100644 index 79a071e4357e..000000000000 --- a/arch/x86_64/vdso/vdso-note.S +++ /dev/null @@ -1,12 +0,0 @@ -/* - * This supplies .note.* sections to go into the PT_NOTE inside the vDSO text. - * Here we can supply some information useful to userland. - */ - -#include <linux/uts.h> -#include <linux/version.h> -#include <linux/elfnote.h> - -ELFNOTE_START(Linux, 0, "a") - .long LINUX_VERSION_CODE -ELFNOTE_END diff --git a/arch/x86_64/vdso/vdso-start.S b/arch/x86_64/vdso/vdso-start.S deleted file mode 100644 index 2dc2cdb84d67..000000000000 --- a/arch/x86_64/vdso/vdso-start.S +++ /dev/null @@ -1,2 +0,0 @@ - .globl vdso_kernel_start -vdso_kernel_start: diff --git a/arch/x86_64/vdso/vdso.S b/arch/x86_64/vdso/vdso.S deleted file mode 100644 index 92e80c1972a7..000000000000 --- a/arch/x86_64/vdso/vdso.S +++ /dev/null @@ -1,2 +0,0 @@ - .section ".vdso","a" - .incbin "arch/x86_64/vdso/vdso.so" diff --git a/arch/x86_64/vdso/vdso.lds.S b/arch/x86_64/vdso/vdso.lds.S deleted file mode 100644 index b9a60e665d08..000000000000 --- a/arch/x86_64/vdso/vdso.lds.S +++ /dev/null @@ -1,77 +0,0 @@ -/* - * Linker script for vsyscall DSO. The vsyscall page is an ELF shared - * object prelinked to its virtual address, and with only one read-only - * segment (that fits in one page). This script controls its layout. - */ -#include <asm/asm-offsets.h> -#include "voffset.h" - -#define VDSO_PRELINK 0xffffffffff700000 - -SECTIONS -{ - . = VDSO_PRELINK + SIZEOF_HEADERS; - - .hash : { *(.hash) } :text - .gnu.hash : { *(.gnu.hash) } - .dynsym : { *(.dynsym) } - .dynstr : { *(.dynstr) } - .gnu.version : { *(.gnu.version) } - .gnu.version_d : { *(.gnu.version_d) } - .gnu.version_r : { *(.gnu.version_r) } - - /* This linker script is used both with -r and with -shared. - For the layouts to match, we need to skip more than enough - space for the dynamic symbol table et al. If this amount - is insufficient, ld -shared will barf. Just increase it here. */ - . = VDSO_PRELINK + VDSO_TEXT_OFFSET; - - .text : { *(.text) } :text - .text.ptr : { *(.text.ptr) } :text - . = VDSO_PRELINK + 0x900; - .data : { *(.data) } :text - .bss : { *(.bss) } :text - - .altinstructions : { *(.altinstructions) } :text - .altinstr_replacement : { *(.altinstr_replacement) } :text - - .note : { *(.note.*) } :text :note - .eh_frame_hdr : { *(.eh_frame_hdr) } :text :eh_frame_hdr - .eh_frame : { KEEP (*(.eh_frame)) } :text - .dynamic : { *(.dynamic) } :text :dynamic - .useless : { - *(.got.plt) *(.got) - *(.gnu.linkonce.d.*) - *(.dynbss) - *(.gnu.linkonce.b.*) - } :text -} - -/* - * We must supply the ELF program headers explicitly to get just one - * PT_LOAD segment, and set the flags explicitly to make segments read-only. - */ -PHDRS -{ - text PT_LOAD FILEHDR PHDRS FLAGS(5); /* PF_R|PF_X */ - dynamic PT_DYNAMIC FLAGS(4); /* PF_R */ - note PT_NOTE FLAGS(4); /* PF_R */ - eh_frame_hdr 0x6474e550; /* PT_GNU_EH_FRAME, but ld doesn't match the name */ -} - -/* - * This controls what symbols we export from the DSO. - */ -VERSION -{ - LINUX_2.6 { - global: - clock_gettime; - __vdso_clock_gettime; - gettimeofday; - __vdso_gettimeofday; - getcpu; - __vdso_getcpu; - local: *; - }; -} diff --git a/arch/x86_64/vdso/vextern.h b/arch/x86_64/vdso/vextern.h deleted file mode 100644 index 1683ba2ae3e8..000000000000 --- a/arch/x86_64/vdso/vextern.h +++ /dev/null @@ -1,16 +0,0 @@ -#ifndef VEXTERN -#include <asm/vsyscall.h> -#define VEXTERN(x) \ - extern typeof(x) *vdso_ ## x __attribute__((visibility("hidden"))); -#endif - -#define VMAGIC 0xfeedbabeabcdefabUL - -/* Any kernel variables used in the vDSO must be exported in the main - kernel's vmlinux.lds.S/vsyscall.h/proper __section and - put into vextern.h and be referenced as a pointer with vdso prefix. - The main kernel later fills in the values. */ - -VEXTERN(jiffies) -VEXTERN(vgetcpu_mode) -VEXTERN(vsyscall_gtod_data) diff --git a/arch/x86_64/vdso/vgetcpu.c b/arch/x86_64/vdso/vgetcpu.c deleted file mode 100644 index 91f6e85d0fc2..000000000000 --- a/arch/x86_64/vdso/vgetcpu.c +++ /dev/null @@ -1,50 +0,0 @@ -/* - * Copyright 2006 Andi Kleen, SUSE Labs. - * Subject to the GNU Public License, v.2 - * - * Fast user context implementation of getcpu() - */ - -#include <linux/kernel.h> -#include <linux/getcpu.h> -#include <linux/jiffies.h> -#include <linux/time.h> -#include <asm/vsyscall.h> -#include <asm/vgtod.h> -#include "vextern.h" - -long __vdso_getcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *tcache) -{ - unsigned int dummy, p; - unsigned long j = 0; - - /* Fast cache - only recompute value once per jiffies and avoid - relatively costly rdtscp/cpuid otherwise. - This works because the scheduler usually keeps the process - on the same CPU and this syscall doesn't guarantee its - results anyways. - We do this here because otherwise user space would do it on - its own in a likely inferior way (no access to jiffies). - If you don't like it pass NULL. */ - if (tcache && tcache->blob[0] == (j = *vdso_jiffies)) { - p = tcache->blob[1]; - } else if (*vdso_vgetcpu_mode == VGETCPU_RDTSCP) { - /* Load per CPU data from RDTSCP */ - rdtscp(dummy, dummy, p); - } else { - /* Load per CPU data from GDT */ - asm("lsl %1,%0" : "=r" (p) : "r" (__PER_CPU_SEG)); - } - if (tcache) { - tcache->blob[0] = j; - tcache->blob[1] = p; - } - if (cpu) - *cpu = p & 0xfff; - if (node) - *node = p >> 12; - return 0; -} - -long getcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *tcache) - __attribute__((weak, alias("__vdso_getcpu"))); diff --git a/arch/x86_64/vdso/vma.c b/arch/x86_64/vdso/vma.c deleted file mode 100644 index ff9333e5fb08..000000000000 --- a/arch/x86_64/vdso/vma.c +++ /dev/null @@ -1,140 +0,0 @@ -/* - * Set up the VMAs to tell the VM about the vDSO. - * Copyright 2007 Andi Kleen, SUSE Labs. - * Subject to the GPL, v.2 - */ -#include <linux/mm.h> -#include <linux/err.h> -#include <linux/sched.h> -#include <linux/init.h> -#include <linux/random.h> -#include <asm/vsyscall.h> -#include <asm/vgtod.h> -#include <asm/proto.h> -#include "voffset.h" - -int vdso_enabled = 1; - -#define VEXTERN(x) extern typeof(__ ## x) *vdso_ ## x; -#include "vextern.h" -#undef VEXTERN - -extern char vdso_kernel_start[], vdso_start[], vdso_end[]; -extern unsigned short vdso_sync_cpuid; - -struct page **vdso_pages; - -static inline void *var_ref(void *vbase, char *var, char *name) -{ - unsigned offset = var - &vdso_kernel_start[0] + VDSO_TEXT_OFFSET; - void *p = vbase + offset; - if (*(void **)p != (void *)VMAGIC) { - printk("VDSO: variable %s broken\n", name); - vdso_enabled = 0; - } - return p; -} - -static int __init init_vdso_vars(void) -{ - int npages = (vdso_end - vdso_start + PAGE_SIZE - 1) / PAGE_SIZE; - int i; - char *vbase; - - vdso_pages = kmalloc(sizeof(struct page *) * npages, GFP_KERNEL); - if (!vdso_pages) - goto oom; - for (i = 0; i < npages; i++) { - struct page *p; - p = alloc_page(GFP_KERNEL); - if (!p) - goto oom; - vdso_pages[i] = p; - copy_page(page_address(p), vdso_start + i*PAGE_SIZE); - } - - vbase = vmap(vdso_pages, npages, 0, PAGE_KERNEL); - if (!vbase) - goto oom; - - if (memcmp(vbase, "\177ELF", 4)) { - printk("VDSO: I'm broken; not ELF\n"); - vdso_enabled = 0; - } - -#define V(x) *(typeof(x) *) var_ref(vbase, (char *)RELOC_HIDE(&x, 0), #x) -#define VEXTERN(x) \ - V(vdso_ ## x) = &__ ## x; -#include "vextern.h" -#undef VEXTERN - return 0; - - oom: - printk("Cannot allocate vdso\n"); - vdso_enabled = 0; - return -ENOMEM; -} -__initcall(init_vdso_vars); - -struct linux_binprm; - -/* Put the vdso above the (randomized) stack with another randomized offset. - This way there is no hole in the middle of address space. - To save memory make sure it is still in the same PTE as the stack top. - This doesn't give that many random bits */ -static unsigned long vdso_addr(unsigned long start, unsigned len) -{ - unsigned long addr, end; - unsigned offset; - end = (start + PMD_SIZE - 1) & PMD_MASK; - if (end >= TASK_SIZE64) - end = TASK_SIZE64; - end -= len; - /* This loses some more bits than a modulo, but is cheaper */ - offset = get_random_int() & (PTRS_PER_PTE - 1); - addr = start + (offset << PAGE_SHIFT); - if (addr >= end) - addr = end; - return addr; -} - -/* Setup a VMA at program startup for the vsyscall page. - Not called for compat tasks */ -int arch_setup_additional_pages(struct linux_binprm *bprm, int exstack) -{ - struct mm_struct *mm = current->mm; - unsigned long addr; - int ret; - unsigned len = round_up(vdso_end - vdso_start, PAGE_SIZE); - - if (!vdso_enabled) - return 0; - - down_write(&mm->mmap_sem); - addr = vdso_addr(mm->start_stack, len); - addr = get_unmapped_area(NULL, addr, len, 0, 0); - if (IS_ERR_VALUE(addr)) { - ret = addr; - goto up_fail; - } - - ret = install_special_mapping(mm, addr, len, - VM_READ|VM_EXEC| - VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC| - VM_ALWAYSDUMP, - vdso_pages); - if (ret) - goto up_fail; - - current->mm->context.vdso = (void *)addr; -up_fail: - up_write(&mm->mmap_sem); - return ret; -} - -static __init int vdso_setup(char *s) -{ - vdso_enabled = simple_strtoul(s, NULL, 0); - return 0; -} -__setup("vdso=", vdso_setup); diff --git a/arch/x86_64/vdso/voffset.h b/arch/x86_64/vdso/voffset.h deleted file mode 100644 index 4af67c79085f..000000000000 --- a/arch/x86_64/vdso/voffset.h +++ /dev/null @@ -1 +0,0 @@ -#define VDSO_TEXT_OFFSET 0x600 diff --git a/arch/x86_64/vdso/vvar.c b/arch/x86_64/vdso/vvar.c deleted file mode 100644 index 6fc22219a472..000000000000 --- a/arch/x86_64/vdso/vvar.c +++ /dev/null @@ -1,12 +0,0 @@ -/* Define pointer to external vDSO variables. - These are part of the vDSO. The kernel fills in the real addresses - at boot time. This is done because when the vdso is linked the - kernel isn't yet and we don't know the final addresses. */ -#include <linux/kernel.h> -#include <linux/time.h> -#include <asm/vsyscall.h> -#include <asm/timex.h> -#include <asm/vgtod.h> - -#define VEXTERN(x) typeof (__ ## x) *vdso_ ## x = (void *)VMAGIC; -#include "vextern.h" |