From cc1a9d86ce989083703c4bdc11b75a87e1cc404a Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Sun, 8 Jun 2008 19:39:16 -0700 Subject: mm, x86: shrink_active_range() should check all Now we are using register_e820_active_regions() instead of add_active_range() directly. So end_pfn could be different between the value in early_node_map to node_end_pfn. So we need to make shrink_active_range() smarter. shrink_active_range() is a generic MM function in mm/page_alloc.c but it is only used on 32-bit x86. Should we move it back to some file in arch/x86? Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- include/linux/mm.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include/linux/mm.h') diff --git a/include/linux/mm.h b/include/linux/mm.h index c31a9cd2a30e..7cbd949f2516 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -997,8 +997,7 @@ extern void free_area_init_node(int nid, pg_data_t *pgdat, extern void free_area_init_nodes(unsigned long *max_zone_pfn); extern void add_active_range(unsigned int nid, unsigned long start_pfn, unsigned long end_pfn); -extern void shrink_active_range(unsigned int nid, unsigned long old_end_pfn, - unsigned long new_end_pfn); +extern void shrink_active_range(unsigned int nid, unsigned long new_end_pfn); extern void push_node_boundaries(unsigned int nid, unsigned long start_pfn, unsigned long end_pfn); extern void remove_all_active_ranges(void); -- cgit v1.2.3 From cc1050bafebfb1d7935331282e948b5016318192 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Fri, 13 Jun 2008 19:08:52 -0700 Subject: x86: replace shrink_active_range() with remove_active_range() in case we have kva before ramdisk on a node, we still need to use those ranges. v2: reserve_early kva ram area, in case there are holes in highmem, to avoid those area could be treat as free high pages. 
Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/mm/discontig_32.c | 45 ++++++++++++++++++++++++--------------------- include/linux/mm.h | 3 ++- mm/page_alloc.c | 29 +++++++++++++++++++++++------ 3 files changed, 49 insertions(+), 28 deletions(-) (limited to 'include/linux/mm.h') diff --git a/arch/x86/mm/discontig_32.c b/arch/x86/mm/discontig_32.c index accc7c6c57fc..c3f119e99e0d 100644 --- a/arch/x86/mm/discontig_32.c +++ b/arch/x86/mm/discontig_32.c @@ -230,8 +230,8 @@ static unsigned long calculate_numa_remap_pages(void) unsigned long size, reserve_pages = 0; for_each_online_node(nid) { - u64 node_end_target; - u64 node_end_final; + u64 node_kva_target; + u64 node_kva_final; /* * The acpi/srat node info can show hot-add memroy zones @@ -254,42 +254,45 @@ static unsigned long calculate_numa_remap_pages(void) /* now the roundup is correct, convert to PAGE_SIZE pages */ size = size * PTRS_PER_PTE; - node_end_target = round_down(node_end_pfn[nid] - size, + node_kva_target = round_down(node_end_pfn[nid] - size, PTRS_PER_PTE); - node_end_target <<= PAGE_SHIFT; + node_kva_target <<= PAGE_SHIFT; do { - node_end_final = find_e820_area(node_end_target, + node_kva_final = find_e820_area(node_kva_target, ((u64)node_end_pfn[nid])<<PAGE_SHIFT, ((u64)size)<<PAGE_SHIFT, LARGE_PAGE_BYTES); - node_end_target -= LARGE_PAGE_BYTES; - } while (node_end_final == -1ULL && - (node_end_target>>PAGE_SHIFT) > (node_start_pfn[nid])); + node_kva_target -= LARGE_PAGE_BYTES; + } while (node_kva_final == -1ULL && + (node_kva_target>>PAGE_SHIFT) > (node_start_pfn[nid])); - if (node_end_final == -1ULL) + if (node_kva_final == -1ULL) panic("Can not get kva ram\n"); - printk("Reserving %ld pages of KVA for lmem_map of node %d\n", - size, nid); node_remap_size[nid] = size; node_remap_offset[nid] = reserve_pages; reserve_pages += size; - printk("Shrinking node %d from %ld pages to %lld pages\n", - nid, node_end_pfn[nid], node_end_final>>PAGE_SHIFT); + printk("Reserving %ld pages of KVA for lmem_map of node %d at %llx\n", + size, nid, node_kva_final>>PAGE_SHIFT); /* * prevent kva address below max_low_pfn want it on system * with less 
memory later. * layout will be: KVA address , KVA RAM + * + * we are supposed to only record the one less then max_low_pfn + * but we could have some hole in high memory, and it will only + * check page_is_ram(pfn) && !page_is_reserved_early(pfn) to decide + * to use it as free. + * So reserve_early here, hope we don't run out of that array */ - if ((node_end_final>>PAGE_SHIFT) < max_low_pfn) - reserve_early(node_end_final, - node_end_final+(((u64)size)<<PAGE_SHIFT), - "KVA RAM"); - - node_end_pfn[nid] = node_end_final>>PAGE_SHIFT; - node_remap_start_pfn[nid] = node_end_pfn[nid]; - shrink_active_range(nid, node_end_pfn[nid]); + reserve_early(node_kva_final, + node_kva_final+(((u64)size)<<PAGE_SHIFT), + "KVA RAM"); + + node_remap_start_pfn[nid] = node_kva_final>>PAGE_SHIFT; + remove_active_range(nid, node_remap_start_pfn[nid], + node_remap_start_pfn[nid] + size); } printk("Reserving total of %ld pages for numa KVA remap\n", reserve_pages); diff --git a/include/linux/mm.h b/include/linux/mm.h index ce8e397a61f6..034a3156d2f0 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -998,7 +998,8 @@ extern void free_area_init_node(int nid, pg_data_t *pgdat, extern void free_area_init_nodes(unsigned long *max_zone_pfn); extern void add_active_range(unsigned int nid, unsigned long start_pfn, unsigned long end_pfn); -extern void shrink_active_range(unsigned int nid, unsigned long new_end_pfn); +extern void remove_active_range(unsigned int nid, unsigned long start_pfn, + unsigned long end_pfn); extern void push_node_boundaries(unsigned int nid, unsigned long start_pfn, unsigned long end_pfn); extern void remove_all_active_ranges(void); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index eee5ba7509c1..d80e1868e570 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3552,30 +3552,47 @@ void __init add_active_range(unsigned int nid, unsigned long start_pfn, } /** - * shrink_active_range - Shrink an existing registered range of PFNs + * remove_active_range - Shrink an existing registered range of PFNs * @nid: The node id the range is on that should be shrunk - * @new_end_pfn: The new PFN of the range + * 
@start_pfn: The new PFN of the range + * @end_pfn: The new PFN of the range * * i386 with NUMA use alloc_remap() to store a node_mem_map on a local node. * The map is kept near the end physical page range that has already been * registered. This function allows an arch to shrink an existing registered * range. */ -void __init shrink_active_range(unsigned int nid, unsigned long new_end_pfn) +void __init remove_active_range(unsigned int nid, unsigned long start_pfn, + unsigned long end_pfn) { int i, j; int removed = 0; + printk(KERN_DEBUG "remove_active_range (%d, %lu, %lu)\n", + nid, start_pfn, end_pfn); + /* Find the old active region end and shrink */ for_each_active_range_index_in_nid(i, nid) { - if (early_node_map[i].start_pfn >= new_end_pfn) { + if (early_node_map[i].start_pfn >= start_pfn && + early_node_map[i].end_pfn <= end_pfn) { /* clear it */ + early_node_map[i].start_pfn = 0; early_node_map[i].end_pfn = 0; removed = 1; continue; } - if (early_node_map[i].end_pfn > new_end_pfn) { - early_node_map[i].end_pfn = new_end_pfn; + if (early_node_map[i].start_pfn < start_pfn && + early_node_map[i].end_pfn > start_pfn) { + unsigned long temp_end_pfn = early_node_map[i].end_pfn; + early_node_map[i].end_pfn = start_pfn; + if (temp_end_pfn > end_pfn) + add_active_range(nid, end_pfn, temp_end_pfn); + continue; + } + if (early_node_map[i].start_pfn >= start_pfn && + early_node_map[i].end_pfn > end_pfn && + early_node_map[i].start_pfn < end_pfn) { + early_node_map[i].start_pfn = end_pfn; continue; } } -- cgit v1.2.3 From b5bc6c0e55000dab86b73f838f5ad02908b23755 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Sat, 14 Jun 2008 18:32:52 -0700 Subject: x86, mm: use add_highpages_with_active_regions() for high pages init v2 use early_node_map to init high pages, so we can remove page_is_ram() and page_is_reserved_early() in the big loop with add_one_highpage also remove page_is_reserved_early(), it is not needed anymore. 
v2: fix the build of other platforms Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/e820.c | 11 -------- arch/x86/mm/discontig_32.c | 19 ++++++-------- arch/x86/mm/init_32.c | 62 ++++++++++++++++++++++++++++++++++++++-------- include/asm-x86/e820.h | 1 - include/asm-x86/highmem.h | 3 +++ include/linux/mm.h | 2 ++ mm/page_alloc.c | 8 ++++++ 7 files changed, 71 insertions(+), 35 deletions(-) (limited to 'include/linux/mm.h') diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 5051ce744b4e..ed46b7a6bc13 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -612,17 +612,6 @@ void __init free_early(u64 start, u64 end) early_res[j - 1].end = 0; } -int __init page_is_reserved_early(unsigned long pagenr) -{ - u64 start = (u64)pagenr << PAGE_SHIFT; - int i; - struct early_res *r; - - i = find_overlapped_early(start, start + PAGE_SIZE); - r = &early_res[i]; - return (i < MAX_EARLY_RES && r->end); -} - void __init early_res_to_bootmem(u64 start, u64 end) { int i; diff --git a/arch/x86/mm/discontig_32.c b/arch/x86/mm/discontig_32.c index c3f119e99e0d..7c4d0255f8d8 100644 --- a/arch/x86/mm/discontig_32.c +++ b/arch/x86/mm/discontig_32.c @@ -100,7 +100,6 @@ unsigned long node_memmap_size_bytes(int nid, unsigned long start_pfn, #endif extern unsigned long find_max_low_pfn(void); -extern void add_one_highpage_init(struct page *, int, int); extern unsigned long highend_pfn, highstart_pfn; #define LARGE_PAGE_BYTES (PTRS_PER_PTE * PAGE_SIZE) @@ -432,10 +431,10 @@ void __init set_highmem_pages_init(int bad_ppro) { #ifdef CONFIG_HIGHMEM struct zone *zone; - struct page *page; + int nid; for_each_zone(zone) { - unsigned long node_pfn, zone_start_pfn, zone_end_pfn; + unsigned long zone_start_pfn, zone_end_pfn; if (!is_highmem(zone)) continue; @@ -443,16 +442,12 @@ void __init set_highmem_pages_init(int bad_ppro) zone_start_pfn = zone->zone_start_pfn; zone_end_pfn = zone_start_pfn + zone->spanned_pages; + nid = zone_to_nid(zone); 
printk("Initializing %s for node %d (%08lx:%08lx)\n", - zone->name, zone_to_nid(zone), - zone_start_pfn, zone_end_pfn); - - for (node_pfn = zone_start_pfn; node_pfn < zone_end_pfn; node_pfn++) { - if (!pfn_valid(node_pfn)) - continue; - page = pfn_to_page(node_pfn); - add_one_highpage_init(page, node_pfn, bad_ppro); - } + zone->name, nid, zone_start_pfn, zone_end_pfn); + + add_highpages_with_active_regions(nid, zone_start_pfn, + zone_end_pfn, bad_ppro); } totalram_pages += totalhigh_pages; #endif diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index abadb1da70df..ba07a489230e 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -287,10 +287,10 @@ static void __init permanent_kmaps_init(pgd_t *pgd_base) pkmap_page_table = pte; } -void __init add_one_highpage_init(struct page *page, int pfn, int bad_ppro) +static void __init +add_one_highpage_init(struct page *page, int pfn, int bad_ppro) { - if (page_is_ram(pfn) && !(bad_ppro && page_kills_ppro(pfn)) && - !page_is_reserved_early(pfn)) { + if (!(bad_ppro && page_kills_ppro(pfn))) { ClearPageReserved(page); init_page_count(page); __free_page(page); @@ -299,18 +299,58 @@ void __init add_one_highpage_init(struct page *page, int pfn, int bad_ppro) SetPageReserved(page); } +struct add_highpages_data { + unsigned long start_pfn; + unsigned long end_pfn; + int bad_ppro; +}; + +static void __init add_highpages_work_fn(unsigned long start_pfn, + unsigned long end_pfn, void *datax) +{ + int node_pfn; + struct page *page; + unsigned long final_start_pfn, final_end_pfn; + struct add_highpages_data *data; + int bad_ppro; + + data = (struct add_highpages_data *)datax; + bad_ppro = data->bad_ppro; + + final_start_pfn = max(start_pfn, data->start_pfn); + final_end_pfn = min(end_pfn, data->end_pfn); + if (final_start_pfn >= final_end_pfn) + return; + + for (node_pfn = final_start_pfn; node_pfn < final_end_pfn; + node_pfn++) { + if (!pfn_valid(node_pfn)) + continue; + page = pfn_to_page(node_pfn); + 
add_one_highpage_init(page, node_pfn, bad_ppro); + } + +} + +void __init add_highpages_with_active_regions(int nid, unsigned long start_pfn, + unsigned long end_pfn, + int bad_ppro) +{ + struct add_highpages_data data; + + data.start_pfn = start_pfn; + data.end_pfn = end_pfn; + data.bad_ppro = bad_ppro; + + work_with_active_regions(nid, add_highpages_work_fn, &data); +} + #ifndef CONFIG_NUMA static void __init set_highmem_pages_init(int bad_ppro) { - int pfn; + add_highpages_with_active_regions(0, highstart_pfn, highend_pfn, + bad_ppro); - for (pfn = highstart_pfn; pfn < highend_pfn; pfn++) { - /* - * Holes under sparsemem might not have no mem_map[]: - */ - if (pfn_valid(pfn)) - add_one_highpage_init(pfn_to_page(pfn), pfn, bad_ppro); - } totalram_pages += totalhigh_pages; } #endif /* !CONFIG_NUMA */ diff --git a/include/asm-x86/e820.h b/include/asm-x86/e820.h index 6b0ce745a60c..55d310596907 100644 --- a/include/asm-x86/e820.h +++ b/include/asm-x86/e820.h @@ -86,7 +86,6 @@ extern u64 find_e820_area_size(u64 start, u64 *sizep, u64 align); extern void reserve_early(u64 start, u64 end, char *name); extern void free_early(u64 start, u64 end); extern void early_res_to_bootmem(u64 start, u64 end); -extern int page_is_reserved_early(unsigned long pagenr); extern u64 early_reserve_e820(u64 startt, u64 sizet, u64 align); extern unsigned long e820_end_of_ram(void); diff --git a/include/asm-x86/highmem.h b/include/asm-x86/highmem.h index e153f3b44774..85c4fea41ff6 100644 --- a/include/asm-x86/highmem.h +++ b/include/asm-x86/highmem.h @@ -74,6 +74,9 @@ struct page *kmap_atomic_to_page(void *ptr); #define flush_cache_kmaps() do { } while (0) +extern void add_highpages_with_active_regions(int nid, unsigned long start_pfn, + unsigned long end_pfn, int bad_ppro); + #endif /* __KERNEL__ */ #endif /* _ASM_HIGHMEM_H */ diff --git a/include/linux/mm.h b/include/linux/mm.h index 034a3156d2f0..e4de460907c1 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1011,6 +1011,8 @@ 
extern unsigned long find_min_pfn_with_active_regions(void); extern unsigned long find_max_pfn_with_active_regions(void); extern void free_bootmem_with_active_regions(int nid, unsigned long max_low_pfn); +typedef void (*work_fn_t)(unsigned long, unsigned long, void *); +extern void work_with_active_regions(int nid, work_fn_t work_fn, void *data); extern void sparse_memory_present_with_active_regions(int nid); #ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID extern int early_pfn_to_nid(unsigned long pfn); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index d80e1868e570..41c6e3aa059f 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2929,6 +2929,14 @@ void __init free_bootmem_with_active_regions(int nid, } } +void __init work_with_active_regions(int nid, work_fn_t work_fn, void *data) +{ + int i; + + for_each_active_range_index_in_nid(i, nid) + work_fn(early_node_map[i].start_pfn, early_node_map[i].end_pfn, + data); +} /** * sparse_memory_present_with_active_regions - Call memory_present for each active range * @nid: The node to call memory_present for. If MAX_NUMNODES, all nodes will be used. -- cgit v1.2.3 From 3461b0af025251bbc6b3d56c821c6ac2de6f7209 Mon Sep 17 00:00:00 2001 From: Mike Travis Date: Mon, 12 May 2008 21:21:13 +0200 Subject: x86: remove static boot_cpu_pda array v2 * Remove the boot_cpu_pda array and pointer table from the data section. Allocate the pointer table and array during init. do_boot_cpu() will reallocate the pda in node local memory and if the cpu is being brought up before the bootmem array is released (after_bootmem = 0), then it will free the initial pda. This will happen for all cpus present at system startup. This removes 512k + 32k bytes from the data section. For inclusion into sched-devel/latest tree. 
Based on: git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux-2.6.git + sched-devel/latest .../mingo/linux-2.6-sched-devel.git Signed-off-by: Mike Travis Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/kernel/head64.c | 26 +++++++++++++++-- arch/x86/kernel/setup.c | 73 ++++++++++++++++++++++++++++++++++++----------- arch/x86/kernel/setup64.c | 8 ++++-- arch/x86/kernel/smpboot.c | 59 +++++++++++++++++++++++++++++--------- include/asm-x86/pda.h | 6 ++-- include/linux/mm.h | 1 + 6 files changed, 135 insertions(+), 38 deletions(-) (limited to 'include/linux/mm.h') diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index e25c57b8aa84..0ab59edd7067 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -25,6 +25,24 @@ #include #include +/* boot cpu pda */ +static struct x8664_pda _boot_cpu_pda __read_mostly; + +#ifdef CONFIG_SMP +#ifdef CONFIG_DEBUG_PER_CPU_MAPS +/* + * We install an empty cpu_pda pointer table to trap references before + * the actual cpu_pda pointer table is created in setup_cpu_pda_map(). 
+ */ +static struct x8664_pda *__cpu_pda[NR_CPUS] __initdata; +#else +static struct x8664_pda *__cpu_pda[1] __read_mostly; +#endif + +#else /* !CONFIG_SMP (NR_CPUS will be 1) */ +static struct x8664_pda *__cpu_pda[NR_CPUS] __read_mostly; +#endif + static void __init zap_identity_mappings(void) { pgd_t *pgd = pgd_offset_k(0UL); @@ -156,10 +174,12 @@ void __init x86_64_start_kernel(char * real_mode_data) early_printk("Kernel alive\n"); - for (i = 0; i < NR_CPUS; i++) - cpu_pda(i) = &boot_cpu_pda[i]; - + _cpu_pda = __cpu_pda; + cpu_pda(0) = &_boot_cpu_pda; pda_init(0); + + early_printk("Kernel really alive\n"); + copy_bootdata(__va(real_mode_data)); reserve_early(__pa_symbol(&_text), __pa_symbol(&_end), "TEXT DATA BSS"); diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 913af838c3c5..dd12c1c84a8f 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -101,6 +101,50 @@ static inline void setup_cpumask_of_cpu(void) { } */ unsigned long __per_cpu_offset[NR_CPUS] __read_mostly; EXPORT_SYMBOL(__per_cpu_offset); +static inline void setup_cpu_pda_map(void) { } + +#elif !defined(CONFIG_SMP) +static inline void setup_cpu_pda_map(void) { } + +#else /* CONFIG_SMP && CONFIG_X86_64 */ + +/* + * Allocate cpu_pda pointer table and array via alloc_bootmem. 
+ */ +static void __init setup_cpu_pda_map(void) +{ + char *pda; + struct x8664_pda **new_cpu_pda; + unsigned long size; + int cpu; + + size = roundup(sizeof(struct x8664_pda), cache_line_size()); + + /* allocate cpu_pda array and pointer table */ + { + unsigned long tsize = nr_cpu_ids * sizeof(void *); + unsigned long asize = size * (nr_cpu_ids - 1); + + tsize = roundup(tsize, cache_line_size()); + new_cpu_pda = alloc_bootmem(tsize + asize); + pda = (char *)new_cpu_pda + tsize; + } + + /* initialize pointer table to static pda's */ + for_each_possible_cpu(cpu) { + if (cpu == 0) { + /* leave boot cpu pda in place */ + new_cpu_pda[0] = cpu_pda(0); + continue; + } + new_cpu_pda[cpu] = (struct x8664_pda *)pda; + new_cpu_pda[cpu]->in_bootmem = 1; + pda += size; + } + + /* point to new pointer table */ + _cpu_pda = new_cpu_pda; +} #endif /* @@ -110,46 +154,43 @@ EXPORT_SYMBOL(__per_cpu_offset); */ void __init setup_per_cpu_areas(void) { - int i, highest_cpu = 0; - unsigned long size; + ssize_t size = PERCPU_ENOUGH_ROOM; + char *ptr; + int cpu; #ifdef CONFIG_HOTPLUG_CPU prefill_possible_map(); +#else + nr_cpu_ids = num_processors; #endif + /* Setup cpu_pda map */ + setup_cpu_pda_map(); + /* Copy section for each CPU (we discard the original) */ size = PERCPU_ENOUGH_ROOM; printk(KERN_INFO "PERCPU: Allocating %lu bytes of per cpu data\n", size); - for_each_possible_cpu(i) { - char *ptr; + for_each_possible_cpu(cpu) { #ifndef CONFIG_NEED_MULTIPLE_NODES ptr = alloc_bootmem_pages(size); #else - int node = early_cpu_to_node(i); + int node = early_cpu_to_node(cpu); if (!node_online(node) || !NODE_DATA(node)) { ptr = alloc_bootmem_pages(size); printk(KERN_INFO "cpu %d has no node %d or node-local memory\n", - i, node); + cpu, node); } else ptr = alloc_bootmem_pages_node(NODE_DATA(node), size); #endif - if (!ptr) - panic("Cannot allocate cpu data for CPU %d\n", i); -#ifdef CONFIG_X86_64 - cpu_pda(i)->data_offset = ptr - __per_cpu_start; -#else - __per_cpu_offset[i] = ptr - 
__per_cpu_start; -#endif + per_cpu_offset(cpu) = ptr - __per_cpu_start; memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start); - highest_cpu = i; } - nr_cpu_ids = highest_cpu + 1; printk(KERN_DEBUG "NR_CPUS: %d, nr_cpu_ids: %d, nr_node_ids %d\n", NR_CPUS, nr_cpu_ids, nr_node_ids); @@ -199,7 +240,7 @@ void __cpuinit numa_set_node(int cpu, int node) { int *cpu_to_node_map = early_per_cpu_ptr(x86_cpu_to_node_map); - if (node != NUMA_NO_NODE) + if (cpu_pda(cpu) && node != NUMA_NO_NODE) cpu_pda(cpu)->nodenumber = node; if (cpu_to_node_map) diff --git a/arch/x86/kernel/setup64.c b/arch/x86/kernel/setup64.c index aee0e8200777..631ea6cc01d8 100644 --- a/arch/x86/kernel/setup64.c +++ b/arch/x86/kernel/setup64.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include @@ -34,9 +35,8 @@ struct boot_params boot_params; cpumask_t cpu_initialized __cpuinitdata = CPU_MASK_NONE; -struct x8664_pda *_cpu_pda[NR_CPUS] __read_mostly; +struct x8664_pda **_cpu_pda __read_mostly; EXPORT_SYMBOL(_cpu_pda); -struct x8664_pda boot_cpu_pda[NR_CPUS] __cacheline_aligned; struct desc_ptr idt_descr = { 256 * 16 - 1, (unsigned long) idt_table }; @@ -114,8 +114,10 @@ void pda_init(int cpu) __get_free_pages(GFP_ATOMIC, IRQSTACK_ORDER); if (!pda->irqstackptr) panic("cannot allocate irqstack for cpu %d", cpu); - } + if (pda->nodenumber == 0 && cpu_to_node(cpu) != NUMA_NO_NODE) + pda->nodenumber = cpu_to_node(cpu); + } pda->irqstackptr += IRQSTACKSIZE-64; } diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 036604d3daed..bf0833487455 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -816,6 +816,43 @@ static void __cpuinit do_fork_idle(struct work_struct *work) complete(&c_idle->done); } +/* + * Allocate node local memory for the AP pda. + * + * Must be called after the _cpu_pda pointer table is initialized. 
+ */ +static int __cpuinit get_local_pda(int cpu) +{ + struct x8664_pda *oldpda, *newpda; + unsigned long size = sizeof(struct x8664_pda); + int node = cpu_to_node(cpu); + + if (cpu_pda(cpu) && !cpu_pda(cpu)->in_bootmem) + return 0; + + oldpda = cpu_pda(cpu); + newpda = kmalloc_node(size, GFP_ATOMIC, node); + if (!newpda) { + printk(KERN_ERR "Could not allocate node local PDA " + "for CPU %d on node %d\n", cpu, node); + + if (oldpda) + return 0; /* have a usable pda */ + else + return -1; + } + + if (oldpda) { + memcpy(newpda, oldpda, size); + if (!after_bootmem) + free_bootmem((unsigned long)oldpda, size); + } + + newpda->in_bootmem = 0; + cpu_pda(cpu) = newpda; + return 0; +} + static int __cpuinit do_boot_cpu(int apicid, int cpu) /* * NOTE - on most systems this is a PHYSICAL apic ID, but on multiquad @@ -841,19 +878,11 @@ static int __cpuinit do_boot_cpu(int apicid, int cpu) } /* Allocate node local memory for AP pdas */ - if (cpu_pda(cpu) == &boot_cpu_pda[cpu]) { - struct x8664_pda *newpda, *pda; - int node = cpu_to_node(cpu); - pda = cpu_pda(cpu); - newpda = kmalloc_node(sizeof(struct x8664_pda), GFP_ATOMIC, - node); - if (newpda) { - memcpy(newpda, pda, sizeof(struct x8664_pda)); - cpu_pda(cpu) = newpda; - } else - printk(KERN_ERR - "Could not allocate node local PDA for CPU %d on node %d\n", - cpu, node); + if (cpu > 0) { + boot_error = get_local_pda(cpu); + if (boot_error) + goto restore_state; + /* if can't get pda memory, can't start cpu */ } #endif @@ -972,6 +1001,8 @@ do_rest: } } +restore_state: + if (boot_error) { /* Try to put things back the way they were before ... 
*/ unmap_cpu_to_logical_apicid(cpu); @@ -1347,6 +1378,8 @@ __init void prefill_possible_map(void) for (i = 0; i < possible; i++) cpu_set(i, cpu_possible_map); + + nr_cpu_ids = possible; } static void __ref remove_cpu_from_maps(int cpu) diff --git a/include/asm-x86/pda.h b/include/asm-x86/pda.h index de2ad9ac35a9..b34e9a7cc80b 100644 --- a/include/asm-x86/pda.h +++ b/include/asm-x86/pda.h @@ -22,7 +22,8 @@ struct x8664_pda { offset 40!!! */ #endif char *irqstackptr; - int nodenumber; /* number of current node */ + short nodenumber; /* number of current node (32k max) */ + short in_bootmem; /* pda lives in bootmem */ unsigned int __softirq_pending; unsigned int __nmi_count; /* number of NMI on this CPUs */ short mmu_state; @@ -38,8 +39,7 @@ struct x8664_pda { unsigned irq_spurious_count; } ____cacheline_aligned_in_smp; -extern struct x8664_pda *_cpu_pda[]; -extern struct x8664_pda boot_cpu_pda[]; +extern struct x8664_pda **_cpu_pda; extern void pda_init(int); #define cpu_pda(i) (_cpu_pda[i]) diff --git a/include/linux/mm.h b/include/linux/mm.h index 586a943cab01..0ea48a5af823 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1024,6 +1024,7 @@ extern void mem_init(void); extern void show_mem(void); extern void si_meminfo(struct sysinfo * val); extern void si_meminfo_node(struct sysinfo *val, int nid); +extern int after_bootmem; #ifdef CONFIG_NUMA extern void setup_per_cpu_pageset(void); -- cgit v1.2.3 From d52d53b8a5b258bfaab9223a5e7284fcfdd48577 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Mon, 16 Jun 2008 20:10:55 -0700 Subject: RFC x86: try to remove arch_get_ram_range want to remove arch_get_ram_range, and use early_node_map instead. 
Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/mm/init_32.c | 6 ++++-- drivers/pci/intel-iommu.c | 51 ++++++++++++++++++++++++++++++++++------------- include/linux/mm.h | 2 +- mm/page_alloc.c | 10 +++++++--- 4 files changed, 49 insertions(+), 20 deletions(-) (limited to 'include/linux/mm.h') diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 65d55056b6e7..a0484adbf59d 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -298,7 +298,7 @@ struct add_highpages_data { unsigned long end_pfn; }; -static void __init add_highpages_work_fn(unsigned long start_pfn, +static int __init add_highpages_work_fn(unsigned long start_pfn, unsigned long end_pfn, void *datax) { int node_pfn; @@ -311,7 +311,7 @@ static void __init add_highpages_work_fn(unsigned long start_pfn, final_start_pfn = max(start_pfn, data->start_pfn); final_end_pfn = min(end_pfn, data->end_pfn); if (final_start_pfn >= final_end_pfn) - return; + return 0; for (node_pfn = final_start_pfn; node_pfn < final_end_pfn; node_pfn++) { @@ -321,6 +321,8 @@ static void __init add_highpages_work_fn(unsigned long start_pfn, add_one_highpage_init(page, node_pfn); } + return 0; + } void __init add_highpages_with_active_regions(int nid, unsigned long start_pfn, diff --git a/drivers/pci/intel-iommu.c b/drivers/pci/intel-iommu.c index 66c0fd21894b..bb0642318a95 100644 --- a/drivers/pci/intel-iommu.c +++ b/drivers/pci/intel-iommu.c @@ -1637,12 +1637,43 @@ static inline int iommu_prepare_rmrr_dev(struct dmar_rmrr_unit *rmrr, } #ifdef CONFIG_DMAR_GFX_WA -extern int arch_get_ram_range(int slot, u64 *addr, u64 *size); +struct iommu_prepare_data { + struct pci_dev *pdev; + int ret; +}; + +static int __init iommu_prepare_work_fn(unsigned long start_pfn, + unsigned long end_pfn, void *datax) +{ + struct iommu_prepare_data *data; + + data = (struct iommu_prepare_data *)datax; + + data->ret = iommu_prepare_identity_map(data->pdev, + start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT); + return data->ret; + +} + +static int __init 
iommu_prepare_with_active_regions(struct pci_dev *pdev) +{ + int nid; + struct iommu_prepare_data data; + + data.pdev = pdev; + data.ret = 0; + + for_each_online_node(nid) { + work_with_active_regions(nid, iommu_prepare_work_fn, &data); + if (data.ret) + return data.ret; + } + return data.ret; +} + static void __init iommu_prepare_gfx_mapping(void) { struct pci_dev *pdev = NULL; - u64 base, size; - int slot; int ret; for_each_pci_dev(pdev) { @@ -1651,17 +1682,9 @@ static void __init iommu_prepare_gfx_mapping(void) continue; printk(KERN_INFO "IOMMU: gfx device %s 1-1 mapping\n", pci_name(pdev)); - slot = arch_get_ram_range(0, &base, &size); - while (slot >= 0) { - ret = iommu_prepare_identity_map(pdev, - base, base + size); - if (ret) - goto error; - slot = arch_get_ram_range(slot, &base, &size); - } - continue; -error: - printk(KERN_ERR "IOMMU: mapping reserved region failed\n"); + ret = iommu_prepare_with_active_regions(pdev); + if (ret) + printk(KERN_ERR "IOMMU: mapping reserved region failed\n"); } } #endif diff --git a/include/linux/mm.h b/include/linux/mm.h index 3d647b24041f..cf1cd3a2ed78 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1011,7 +1011,7 @@ extern unsigned long find_min_pfn_with_active_regions(void); extern unsigned long find_max_pfn_with_active_regions(void); extern void free_bootmem_with_active_regions(int nid, unsigned long max_low_pfn); -typedef void (*work_fn_t)(unsigned long, unsigned long, void *); +typedef int (*work_fn_t)(unsigned long, unsigned long, void *); extern void work_with_active_regions(int nid, work_fn_t work_fn, void *data); extern void sparse_memory_present_with_active_regions(int nid); #ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 41c6e3aa059f..e25b6b24f844 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2932,10 +2932,14 @@ void __init free_bootmem_with_active_regions(int nid, void __init work_with_active_regions(int nid, work_fn_t work_fn, void *data) { 
int i; + int ret; - for_each_active_range_index_in_nid(i, nid) - work_fn(early_node_map[i].start_pfn, early_node_map[i].end_pfn, - data); + for_each_active_range_index_in_nid(i, nid) { + ret = work_fn(early_node_map[i].start_pfn, + early_node_map[i].end_pfn, data); + if (ret) + break; + } } /** * sparse_memory_present_with_active_regions - Call memory_present for each active range -- cgit v1.2.3 From aba46c5027cb59d98052231b36efcbbde9c77a1d Mon Sep 17 00:00:00 2001 From: Dave Kleikamp Date: Tue, 8 Jul 2008 00:28:52 +1000 Subject: powerpc/mm: Define flags for Strong Access Ordering This patch defines: - PROT_SAO, which is passed into mmap() and mprotect() in the prot field - VM_SAO in vma->vm_flags, and - _PAGE_SAO, the combination of WIMG bits in the pte that enables strong access ordering for the page. Signed-off-by: Dave Kleikamp Signed-off-by: Benjamin Herrenschmidt --- include/asm-powerpc/mman.h | 2 ++ include/asm-powerpc/pgtable-ppc64.h | 3 +++ include/linux/mm.h | 1 + 3 files changed, 6 insertions(+) (limited to 'include/linux/mm.h') diff --git a/include/asm-powerpc/mman.h b/include/asm-powerpc/mman.h index 24cf664a8295..0c46bf2c7d5f 100644 --- a/include/asm-powerpc/mman.h +++ b/include/asm-powerpc/mman.h @@ -10,6 +10,8 @@ * 2 of the License, or (at your option) any later version. 
*/ +#define PROT_SAO 0x10 /* Strong Access Ordering */ + #define MAP_RENAME MAP_ANONYMOUS /* In SunOS terminology */ #define MAP_NORESERVE 0x40 /* don't reserve swap pages */ #define MAP_LOCKED 0x80 diff --git a/include/asm-powerpc/pgtable-ppc64.h b/include/asm-powerpc/pgtable-ppc64.h index b2754d46be44..d09599cccb35 100644 --- a/include/asm-powerpc/pgtable-ppc64.h +++ b/include/asm-powerpc/pgtable-ppc64.h @@ -93,6 +93,9 @@ #define _PAGE_RW 0x0200 /* software: user write access allowed */ #define _PAGE_BUSY 0x0800 /* software: PTE & hash are busy */ +/* Strong Access Ordering */ +#define _PAGE_SAO (_PAGE_WRITETHRU | _PAGE_NO_CACHE | _PAGE_COHERENT) + #define _PAGE_BASE (_PAGE_PRESENT | _PAGE_ACCESSED | _PAGE_COHERENT) #define _PAGE_WRENABLE (_PAGE_RW | _PAGE_DIRTY) diff --git a/include/linux/mm.h b/include/linux/mm.h index 586a943cab01..689184446fc6 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -108,6 +108,7 @@ extern unsigned int kobjsize(const void *objp); #define VM_CAN_NONLINEAR 0x08000000 /* Has ->fault & does nonlinear pages */ #define VM_MIXEDMAP 0x10000000 /* Can contain "struct page" and pure PFN pages */ +#define VM_SAO 0x20000000 /* Strong Access Ordering (powerpc) */ #ifndef VM_STACK_DEFAULT_FLAGS /* arch can override this */ #define VM_STACK_DEFAULT_FLAGS VM_DATA_DEFAULT_FLAGS -- cgit v1.2.3 From 0d71d10a4252a3938e6b70189bc776171c02e076 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Wed, 23 Jul 2008 21:27:05 -0700 Subject: mm: remove nopfn There are no users of nopfn in the tree. Remove it. 
[hugh@veritas.com: fix build error] Signed-off-by: Nick Piggin Signed-off-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 9 -------- mm/memory.c | 67 ++++++------------------------------------------------ 2 files changed, 7 insertions(+), 69 deletions(-) (limited to 'include/linux/mm.h') diff --git a/include/linux/mm.h b/include/linux/mm.h index 2128ef7780c6..eb815cfc1b35 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -166,8 +166,6 @@ struct vm_operations_struct { void (*open)(struct vm_area_struct * area); void (*close)(struct vm_area_struct * area); int (*fault)(struct vm_area_struct *vma, struct vm_fault *vmf); - unsigned long (*nopfn)(struct vm_area_struct *area, - unsigned long address); /* notification that a previously read-only page is about to become * writable, if an error is returned it will cause a SIGBUS */ @@ -674,13 +672,6 @@ static inline int page_mapped(struct page *page) return atomic_read(&(page)->_mapcount) >= 0; } -/* - * Error return values for the *_nopfn functions - */ -#define NOPFN_SIGBUS ((unsigned long) -1) -#define NOPFN_OOM ((unsigned long) -2) -#define NOPFN_REFAULT ((unsigned long) -3) - /* * Different kinds of faults, as returned by handle_mm_fault(). * Used to decide whether a process gets delivered SIGBUS or diff --git a/mm/memory.c b/mm/memory.c index 2302d228fe04..46dbed4b7446 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1058,11 +1058,9 @@ static inline int use_zero_page(struct vm_area_struct *vma) if (vma->vm_flags & (VM_LOCKED | VM_SHARED)) return 0; /* - * And if we have a fault or a nopfn routine, it's not an - * anonymous region. + * And if we have a fault routine, it's not an anonymous region. 
*/ - return !vma->vm_ops || - (!vma->vm_ops->fault && !vma->vm_ops->nopfn); + return !vma->vm_ops || !vma->vm_ops->fault; } int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, @@ -1338,6 +1336,11 @@ out: * * This function should only be called from a vm_ops->fault handler, and * in that case the handler should return NULL. + * + * vma cannot be a COW mapping. + * + * As this is called only for pages that do not currently exist, we + * do not need to flush old virtual caches or the TLB. */ int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr, unsigned long pfn) @@ -2501,59 +2504,6 @@ static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma, return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte); } - -/* - * do_no_pfn() tries to create a new page mapping for a page without - * a struct_page backing it - * - * As this is called only for pages that do not currently exist, we - * do not need to flush old virtual caches or the TLB. - * - * We enter with non-exclusive mmap_sem (to exclude vma changes, - * but allow concurrent faults), and pte mapped but not yet locked. - * We return with mmap_sem still held, but pte unmapped and unlocked. - * - * It is expected that the ->nopfn handler always returns the same pfn - * for a given virtual mapping. - * - * Mark this `noinline' to prevent it from bloating the main pagefault code. 
- */ -static noinline int do_no_pfn(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long address, pte_t *page_table, pmd_t *pmd, - int write_access) -{ - spinlock_t *ptl; - pte_t entry; - unsigned long pfn; - - pte_unmap(page_table); - BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))); - BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags)); - - pfn = vma->vm_ops->nopfn(vma, address & PAGE_MASK); - - BUG_ON((vma->vm_flags & VM_MIXEDMAP) && pfn_valid(pfn)); - - if (unlikely(pfn == NOPFN_OOM)) - return VM_FAULT_OOM; - else if (unlikely(pfn == NOPFN_SIGBUS)) - return VM_FAULT_SIGBUS; - else if (unlikely(pfn == NOPFN_REFAULT)) - return 0; - - page_table = pte_offset_map_lock(mm, pmd, address, &ptl); - - /* Only go through if we didn't race with anybody else... */ - if (pte_none(*page_table)) { - entry = pfn_pte(pfn, vma->vm_page_prot); - if (write_access) - entry = maybe_mkwrite(pte_mkdirty(entry), vma); - set_pte_at(mm, address, page_table, entry); - } - pte_unmap_unlock(page_table, ptl); - return 0; -} - /* * Fault of a previously existing named mapping. Repopulate the pte * from the encoded file_pte if possible. This enables swappable @@ -2614,9 +2564,6 @@ static inline int handle_pte_fault(struct mm_struct *mm, if (likely(vma->vm_ops->fault)) return do_linear_fault(mm, vma, address, pte, pmd, write_access, entry); - if (unlikely(vma->vm_ops->nopfn)) - return do_no_pfn(mm, vma, address, pte, - pmd, write_access); } return do_anonymous_page(mm, vma, address, pte, pmd, write_access); -- cgit v1.2.3 From 28b2ee20c7cba812b6f2ccf6d722cf86d00a84dc Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Wed, 23 Jul 2008 21:27:05 -0700 Subject: access_process_vm device memory infrastructure In order to be able to debug things like the X server and programs using the PPC Cell SPUs, the debugger needs to be able to access device memory through ptrace and /proc/pid/mem. 
This patch: Add the generic_access_phys access function and put the hooks in place to allow access_process_vm to access device or PPC Cell SPU memory. [riel@redhat.com: Add documentation for the vm_ops->access function] Signed-off-by: Rik van Riel Signed-off-by: Benjamin Herrensmidt Cc: Dave Airlie Cc: Hugh Dickins Cc: Paul Mackerras Cc: Arnd Bergmann Acked-by: Peter Zijlstra Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/filesystems/Locking | 7 ++ arch/Kconfig | 3 + arch/x86/Kconfig | 1 + arch/x86/mm/ioremap.c | 8 +++ include/asm-x86/io_32.h | 2 + include/asm-x86/io_64.h | 2 + include/linux/mm.h | 8 +++ mm/memory.c | 131 ++++++++++++++++++++++++++++++++------ 8 files changed, 144 insertions(+), 18 deletions(-) (limited to 'include/linux/mm.h') diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking index 8b22d7d8b991..680fb566b928 100644 --- a/Documentation/filesystems/Locking +++ b/Documentation/filesystems/Locking @@ -510,6 +510,7 @@ prototypes: void (*close)(struct vm_area_struct*); int (*fault)(struct vm_area_struct*, struct vm_fault *); int (*page_mkwrite)(struct vm_area_struct *, struct page *); + int (*access)(struct vm_area_struct *, unsigned long, void*, int, int); locking rules: BKL mmap_sem PageLocked(page) @@ -517,6 +518,7 @@ open: no yes close: no yes fault: no yes page_mkwrite: no yes no +access: no yes ->page_mkwrite() is called when a previously read-only page is about to become writeable. The file system is responsible for @@ -525,6 +527,11 @@ taking to lock out truncate, the page range should be verified to be within i_size. The page mapping should also be checked that it is not NULL. + ->access() is called when get_user_pages() fails in +acces_process_vm(), typically used to debug a process through +/proc/pid/mem or ptrace. This function is needed only for +VM_IO | VM_PFNMAP VMAs. 
+ ================================================================================ Dubious stuff diff --git a/arch/Kconfig b/arch/Kconfig index 4d5ebbc1e72b..6093c0be58b0 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -31,6 +31,9 @@ config KRETPROBES def_bool y depends on KPROBES && HAVE_KRETPROBES +config HAVE_IOREMAP_PROT + def_bool n + config HAVE_KPROBES def_bool n diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 03980cb04291..b2ddfcf01728 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -21,6 +21,7 @@ config X86 select HAVE_UNSTABLE_SCHED_CLOCK select HAVE_IDE select HAVE_OPROFILE + select HAVE_IOREMAP_PROT select HAVE_KPROBES select HAVE_KRETPROBES select HAVE_DYNAMIC_FTRACE diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index 24c1d3c30186..016f335bbeea 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -330,6 +330,14 @@ static void __iomem *ioremap_default(resource_size_t phys_addr, return (void __iomem *)ret; } +void __iomem *ioremap_prot(resource_size_t phys_addr, unsigned long size, + unsigned long prot_val) +{ + return __ioremap_caller(phys_addr, size, (prot_val & _PAGE_CACHE_MASK), + __builtin_return_address(0)); +} +EXPORT_SYMBOL(ioremap_prot); + /** * iounmap - Free a IO remapping * @addr: virtual address from ioremap_* diff --git a/include/asm-x86/io_32.h b/include/asm-x86/io_32.h index 4df44ed54077..e876d89ac156 100644 --- a/include/asm-x86/io_32.h +++ b/include/asm-x86/io_32.h @@ -110,6 +110,8 @@ static inline void *phys_to_virt(unsigned long address) */ extern void __iomem *ioremap_nocache(resource_size_t offset, unsigned long size); extern void __iomem *ioremap_cache(resource_size_t offset, unsigned long size); +extern void __iomem *ioremap_prot(resource_size_t offset, unsigned long size, + unsigned long prot_val); /* * The default ioremap() behavior is non-cached: diff --git a/include/asm-x86/io_64.h b/include/asm-x86/io_64.h index ddd8058a5026..22995c5c5adc 100644 --- a/include/asm-x86/io_64.h +++ 
b/include/asm-x86/io_64.h @@ -175,6 +175,8 @@ extern void early_iounmap(void *addr, unsigned long size); */ extern void __iomem *ioremap_nocache(resource_size_t offset, unsigned long size); extern void __iomem *ioremap_cache(resource_size_t offset, unsigned long size); +extern void __iomem *ioremap_prot(resource_size_t offset, unsigned long size, + unsigned long prot_val); /* * The default ioremap() behavior is non-cached: diff --git a/include/linux/mm.h b/include/linux/mm.h index eb815cfc1b35..5c7f8f64f70e 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -170,6 +170,12 @@ struct vm_operations_struct { /* notification that a previously read-only page is about to become * writable, if an error is returned it will cause a SIGBUS */ int (*page_mkwrite)(struct vm_area_struct *vma, struct page *page); + + /* called by access_process_vm when get_user_pages() fails, typically + * for use by special VMAs that can switch between memory and hardware + */ + int (*access)(struct vm_area_struct *vma, unsigned long addr, + void *buf, int len, int write); #ifdef CONFIG_NUMA /* * set_policy() op must add a reference to any non-NULL @new mempolicy @@ -771,6 +777,8 @@ int copy_page_range(struct mm_struct *dst, struct mm_struct *src, struct vm_area_struct *vma); void unmap_mapping_range(struct address_space *mapping, loff_t const holebegin, loff_t const holelen, int even_cows); +int generic_access_phys(struct vm_area_struct *vma, unsigned long addr, + void *buf, int len, int write); static inline void unmap_shared_mapping_range(struct address_space *mapping, loff_t const holebegin, loff_t const holelen) diff --git a/mm/memory.c b/mm/memory.c index 46dbed4b7446..87350321e66f 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2751,6 +2751,86 @@ int in_gate_area_no_task(unsigned long addr) #endif /* __HAVE_ARCH_GATE_AREA */ +#ifdef CONFIG_HAVE_IOREMAP_PROT +static resource_size_t follow_phys(struct vm_area_struct *vma, + unsigned long address, unsigned int flags, + unsigned long 
*prot) +{ + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + pte_t *ptep, pte; + spinlock_t *ptl; + resource_size_t phys_addr = 0; + struct mm_struct *mm = vma->vm_mm; + + VM_BUG_ON(!(vma->vm_flags & (VM_IO | VM_PFNMAP))); + + pgd = pgd_offset(mm, address); + if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd))) + goto no_page_table; + + pud = pud_offset(pgd, address); + if (pud_none(*pud) || unlikely(pud_bad(*pud))) + goto no_page_table; + + pmd = pmd_offset(pud, address); + if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd))) + goto no_page_table; + + /* We cannot handle huge page PFN maps. Luckily they don't exist. */ + if (pmd_huge(*pmd)) + goto no_page_table; + + ptep = pte_offset_map_lock(mm, pmd, address, &ptl); + if (!ptep) + goto out; + + pte = *ptep; + if (!pte_present(pte)) + goto unlock; + if ((flags & FOLL_WRITE) && !pte_write(pte)) + goto unlock; + phys_addr = pte_pfn(pte); + phys_addr <<= PAGE_SHIFT; /* Shift here to avoid overflow on PAE */ + + *prot = pgprot_val(pte_pgprot(pte)); + +unlock: + pte_unmap_unlock(ptep, ptl); +out: + return phys_addr; +no_page_table: + return 0; +} + +int generic_access_phys(struct vm_area_struct *vma, unsigned long addr, + void *buf, int len, int write) +{ + resource_size_t phys_addr; + unsigned long prot = 0; + void *maddr; + int offset = addr & (PAGE_SIZE-1); + + if (!(vma->vm_flags & (VM_IO | VM_PFNMAP))) + return -EINVAL; + + phys_addr = follow_phys(vma, addr, write, &prot); + + if (!phys_addr) + return -EINVAL; + + maddr = ioremap_prot(phys_addr, PAGE_SIZE, prot); + if (write) + memcpy_toio(maddr + offset, buf, len); + else + memcpy_fromio(buf, maddr + offset, len); + iounmap(maddr); + + return len; +} +#endif + /* * Access another process' address space. 
* Source/target buffer must be kernel space, @@ -2760,7 +2840,6 @@ int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, in { struct mm_struct *mm; struct vm_area_struct *vma; - struct page *page; void *old_buf = buf; mm = get_task_mm(tsk); @@ -2772,28 +2851,44 @@ int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, in while (len) { int bytes, ret, offset; void *maddr; + struct page *page = NULL; ret = get_user_pages(tsk, mm, addr, 1, write, 1, &page, &vma); - if (ret <= 0) - break; - - bytes = len; - offset = addr & (PAGE_SIZE-1); - if (bytes > PAGE_SIZE-offset) - bytes = PAGE_SIZE-offset; - - maddr = kmap(page); - if (write) { - copy_to_user_page(vma, page, addr, - maddr + offset, buf, bytes); - set_page_dirty_lock(page); + if (ret <= 0) { + /* + * Check if this is a VM_IO | VM_PFNMAP VMA, which + * we can access using slightly different code. + */ +#ifdef CONFIG_HAVE_IOREMAP_PROT + vma = find_vma(mm, addr); + if (!vma) + break; + if (vma->vm_ops && vma->vm_ops->access) + ret = vma->vm_ops->access(vma, addr, buf, + len, write); + if (ret <= 0) +#endif + break; + bytes = ret; } else { - copy_from_user_page(vma, page, addr, - buf, maddr + offset, bytes); + bytes = len; + offset = addr & (PAGE_SIZE-1); + if (bytes > PAGE_SIZE-offset) + bytes = PAGE_SIZE-offset; + + maddr = kmap(page); + if (write) { + copy_to_user_page(vma, page, addr, + maddr + offset, buf, bytes); + set_page_dirty_lock(page); + } else { + copy_from_user_page(vma, page, addr, + buf, maddr + offset, bytes); + } + kunmap(page); + page_cache_release(page); } - kunmap(page); - page_cache_release(page); len -= bytes; buf += bytes; addr += bytes; -- cgit v1.2.3 From 42b7772812d15b86543a23b82bd6070eef9a08b1 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Wed, 23 Jul 2008 21:27:10 -0700 Subject: mm: remove double indirection on tlb parameter to free_pgd_range() & Co The double indirection here is not needed anywhere and hence (at least) confusing. 
Signed-off-by: Jan Beulich Cc: Hugh Dickins Cc: Nick Piggin Cc: Christoph Lameter Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: "Luck, Tony" Cc: Paul Mundt Cc: "David S. Miller" Acked-by: Jeremy Fitzhardinge Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/ia64/mm/hugetlbpage.c | 2 +- arch/powerpc/mm/hugetlbpage.c | 8 ++++---- fs/exec.c | 4 ++-- include/asm-ia64/hugetlb.h | 2 +- include/asm-powerpc/hugetlb.h | 2 +- include/asm-sh/hugetlb.h | 2 +- include/asm-sparc/hugetlb.h | 2 +- include/asm-x86/hugetlb.h | 2 +- include/linux/mm.h | 4 +--- mm/internal.h | 3 +++ mm/memory.c | 10 ++++++---- mm/mmap.c | 6 ++++-- 12 files changed, 26 insertions(+), 21 deletions(-) (limited to 'include/linux/mm.h') diff --git a/arch/ia64/mm/hugetlbpage.c b/arch/ia64/mm/hugetlbpage.c index d3ce8f3bcaa6..cd49e2860eef 100644 --- a/arch/ia64/mm/hugetlbpage.c +++ b/arch/ia64/mm/hugetlbpage.c @@ -112,7 +112,7 @@ follow_huge_pmd(struct mm_struct *mm, unsigned long address, pmd_t *pmd, int wri return NULL; } -void hugetlb_free_pgd_range(struct mmu_gather **tlb, +void hugetlb_free_pgd_range(struct mmu_gather *tlb, unsigned long addr, unsigned long end, unsigned long floor, unsigned long ceiling) { diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c index 0d12fba31bc5..1a96cc891cf5 100644 --- a/arch/powerpc/mm/hugetlbpage.c +++ b/arch/powerpc/mm/hugetlbpage.c @@ -255,7 +255,7 @@ static void hugetlb_free_pud_range(struct mmu_gather *tlb, pgd_t *pgd, * * Must be called with pagetable lock held. 
*/ -void hugetlb_free_pgd_range(struct mmu_gather **tlb, +void hugetlb_free_pgd_range(struct mmu_gather *tlb, unsigned long addr, unsigned long end, unsigned long floor, unsigned long ceiling) { @@ -315,13 +315,13 @@ void hugetlb_free_pgd_range(struct mmu_gather **tlb, return; start = addr; - pgd = pgd_offset((*tlb)->mm, addr); + pgd = pgd_offset(tlb->mm, addr); do { - BUG_ON(get_slice_psize((*tlb)->mm, addr) != mmu_huge_psize); + BUG_ON(get_slice_psize(tlb->mm, addr) != mmu_huge_psize); next = pgd_addr_end(addr, end); if (pgd_none_or_clear_bad(pgd)) continue; - hugetlb_free_pud_range(*tlb, pgd, addr, next, floor, ceiling); + hugetlb_free_pud_range(tlb, pgd, addr, next, floor, ceiling); } while (pgd++, addr = next, addr != end); } diff --git a/fs/exec.c b/fs/exec.c index fd9234379e8d..190ed1f92774 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -541,7 +541,7 @@ static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift) /* * when the old and new regions overlap clear from new_end. */ - free_pgd_range(&tlb, new_end, old_end, new_end, + free_pgd_range(tlb, new_end, old_end, new_end, vma->vm_next ? vma->vm_next->vm_start : 0); } else { /* @@ -550,7 +550,7 @@ static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift) * have constraints on va-space that make this illegal (IA64) - * for the others its just a little faster. */ - free_pgd_range(&tlb, old_start, old_end, new_end, + free_pgd_range(tlb, old_start, old_end, new_end, vma->vm_next ? 
vma->vm_next->vm_start : 0); } tlb_finish_mmu(tlb, new_end, old_end); diff --git a/include/asm-ia64/hugetlb.h b/include/asm-ia64/hugetlb.h index f28a9701f1cf..e9d1e5e2382d 100644 --- a/include/asm-ia64/hugetlb.h +++ b/include/asm-ia64/hugetlb.h @@ -4,7 +4,7 @@ #include -void hugetlb_free_pgd_range(struct mmu_gather **tlb, unsigned long addr, +void hugetlb_free_pgd_range(struct mmu_gather *tlb, unsigned long addr, unsigned long end, unsigned long floor, unsigned long ceiling); diff --git a/include/asm-powerpc/hugetlb.h b/include/asm-powerpc/hugetlb.h index be32ff02f4a0..0a37aa5ecaa5 100644 --- a/include/asm-powerpc/hugetlb.h +++ b/include/asm-powerpc/hugetlb.h @@ -7,7 +7,7 @@ int is_hugepage_only_range(struct mm_struct *mm, unsigned long addr, unsigned long len); -void hugetlb_free_pgd_range(struct mmu_gather **tlb, unsigned long addr, +void hugetlb_free_pgd_range(struct mmu_gather *tlb, unsigned long addr, unsigned long end, unsigned long floor, unsigned long ceiling); diff --git a/include/asm-sh/hugetlb.h b/include/asm-sh/hugetlb.h index 02402303d89b..fb30018938c7 100644 --- a/include/asm-sh/hugetlb.h +++ b/include/asm-sh/hugetlb.h @@ -26,7 +26,7 @@ static inline int prepare_hugepage_range(unsigned long addr, unsigned long len) static inline void hugetlb_prefault_arch_hook(struct mm_struct *mm) { } -static inline void hugetlb_free_pgd_range(struct mmu_gather **tlb, +static inline void hugetlb_free_pgd_range(struct mmu_gather *tlb, unsigned long addr, unsigned long end, unsigned long floor, unsigned long ceiling) diff --git a/include/asm-sparc/hugetlb.h b/include/asm-sparc/hugetlb.h index 412af58926a0..aeb92374ca3d 100644 --- a/include/asm-sparc/hugetlb.h +++ b/include/asm-sparc/hugetlb.h @@ -31,7 +31,7 @@ static inline int prepare_hugepage_range(unsigned long addr, unsigned long len) return 0; } -static inline void hugetlb_free_pgd_range(struct mmu_gather **tlb, +static inline void hugetlb_free_pgd_range(struct mmu_gather *tlb, unsigned long addr, unsigned long 
end, unsigned long floor, unsigned long ceiling) diff --git a/include/asm-x86/hugetlb.h b/include/asm-x86/hugetlb.h index 14171a4924f6..7eed6e0883bf 100644 --- a/include/asm-x86/hugetlb.h +++ b/include/asm-x86/hugetlb.h @@ -26,7 +26,7 @@ static inline int prepare_hugepage_range(unsigned long addr, unsigned long len) static inline void hugetlb_prefault_arch_hook(struct mm_struct *mm) { } -static inline void hugetlb_free_pgd_range(struct mmu_gather **tlb, +static inline void hugetlb_free_pgd_range(struct mmu_gather *tlb, unsigned long addr, unsigned long end, unsigned long floor, unsigned long ceiling) diff --git a/include/linux/mm.h b/include/linux/mm.h index 5c7f8f64f70e..f8071097302a 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -769,10 +769,8 @@ struct mm_walk { int walk_page_range(unsigned long addr, unsigned long end, struct mm_walk *walk); -void free_pgd_range(struct mmu_gather **tlb, unsigned long addr, +void free_pgd_range(struct mmu_gather *tlb, unsigned long addr, unsigned long end, unsigned long floor, unsigned long ceiling); -void free_pgtables(struct mmu_gather **tlb, struct vm_area_struct *start_vma, - unsigned long floor, unsigned long ceiling); int copy_page_range(struct mm_struct *dst, struct mm_struct *src, struct vm_area_struct *vma); void unmap_mapping_range(struct address_space *mapping, diff --git a/mm/internal.h b/mm/internal.h index 50807e12490e..858ad01864dc 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -13,6 +13,9 @@ #include +void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma, + unsigned long floor, unsigned long ceiling); + static inline void set_page_count(struct page *page, int v) { atomic_set(&page->_count, v); diff --git a/mm/memory.c b/mm/memory.c index 87350321e66f..82f3f1c5cf17 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -61,6 +61,8 @@ #include #include +#include "internal.h" + #ifndef CONFIG_NEED_MULTIPLE_NODES /* use the per-pgdat data instead for discontigmem - mbligh */ unsigned long 
max_mapnr; @@ -211,7 +213,7 @@ static inline void free_pud_range(struct mmu_gather *tlb, pgd_t *pgd, * * Must be called with pagetable lock held. */ -void free_pgd_range(struct mmu_gather **tlb, +void free_pgd_range(struct mmu_gather *tlb, unsigned long addr, unsigned long end, unsigned long floor, unsigned long ceiling) { @@ -262,16 +264,16 @@ void free_pgd_range(struct mmu_gather **tlb, return; start = addr; - pgd = pgd_offset((*tlb)->mm, addr); + pgd = pgd_offset(tlb->mm, addr); do { next = pgd_addr_end(addr, end); if (pgd_none_or_clear_bad(pgd)) continue; - free_pud_range(*tlb, pgd, addr, next, floor, ceiling); + free_pud_range(tlb, pgd, addr, next, floor, ceiling); } while (pgd++, addr = next, addr != end); } -void free_pgtables(struct mmu_gather **tlb, struct vm_area_struct *vma, +void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma, unsigned long floor, unsigned long ceiling) { while (vma) { diff --git a/mm/mmap.c b/mm/mmap.c index 1d102b956fd8..75e0d0673d78 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -32,6 +32,8 @@ #include #include +#include "internal.h" + #ifndef arch_mmap_check #define arch_mmap_check(addr, len, flags) (0) #endif @@ -1763,7 +1765,7 @@ static void unmap_region(struct mm_struct *mm, update_hiwater_rss(mm); unmap_vmas(&tlb, vma, start, end, &nr_accounted, NULL); vm_unacct_memory(nr_accounted); - free_pgtables(&tlb, vma, prev? prev->vm_end: FIRST_USER_ADDRESS, + free_pgtables(tlb, vma, prev? prev->vm_end: FIRST_USER_ADDRESS, next? 
next->vm_start: 0); tlb_finish_mmu(tlb, start, end); } @@ -2063,7 +2065,7 @@ void exit_mmap(struct mm_struct *mm) /* Use -1 here to ensure all VMAs in the mm are unmapped */ end = unmap_vmas(&tlb, vma, 0, -1, &nr_accounted, NULL); vm_unacct_memory(nr_accounted); - free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, 0); + free_pgtables(tlb, vma, FIRST_USER_ADDRESS, 0); tlb_finish_mmu(tlb, 0, end); /* -- cgit v1.2.3 From 9109fb7b3520de187ebc3646c209d66a233f7169 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 23 Jul 2008 21:27:20 -0700 Subject: mm: drop unneeded pgdat argument from free_area_init_node() free_area_init_node() gets passed in the node id as well as the node descriptor. This is redundant as the function can trivially get the node descriptor itself by means of NODE_DATA() and the node's id. I checked all the users and NODE_DATA() seems to be usable everywhere from where this function is called. Signed-off-by: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/alpha/mm/numa.c | 2 +- arch/arm/mm/init.c | 2 +- arch/avr32/mm/init.c | 2 +- arch/cris/arch-v10/mm/init.c | 2 +- arch/cris/arch-v32/mm/init.c | 2 +- arch/m32r/mm/discontig.c | 3 +-- arch/m32r/mm/init.c | 2 +- arch/m68k/mm/motorola.c | 2 +- arch/m68k/mm/sun3mmu.c | 2 +- arch/parisc/mm/init.c | 2 +- arch/sparc/mm/srmmu.c | 3 +-- arch/sparc/mm/sun4c.c | 3 +-- arch/v850/kernel/setup.c | 3 +-- include/linux/mm.h | 5 ++--- mm/memory_hotplug.c | 2 +- mm/page_alloc.c | 11 ++++++----- 16 files changed, 22 insertions(+), 26 deletions(-) (limited to 'include/linux/mm.h') diff --git a/arch/alpha/mm/numa.c b/arch/alpha/mm/numa.c index a53fda0481ca..def0c74a78a8 100644 --- a/arch/alpha/mm/numa.c +++ b/arch/alpha/mm/numa.c @@ -313,7 +313,7 @@ void __init paging_init(void) zones_size[ZONE_DMA] = dma_local_pfn; zones_size[ZONE_NORMAL] = (end_pfn - start_pfn) - dma_local_pfn; } - free_area_init_node(nid, NODE_DATA(nid), zones_size, start_pfn, NULL); + free_area_init_node(nid, 
zones_size, start_pfn, NULL); } /* Initialize the kernel's ZERO_PGE. */ diff --git a/arch/arm/mm/init.c b/arch/arm/mm/init.c index b657f1719af0..e6352946dde0 100644 --- a/arch/arm/mm/init.c +++ b/arch/arm/mm/init.c @@ -284,7 +284,7 @@ bootmem_init_node(int node, int initrd_node, struct meminfo *mi) */ arch_adjust_zones(node, zone_size, zhole_size); - free_area_init_node(node, pgdat, zone_size, start_pfn, zhole_size); + free_area_init_node(node, zone_size, start_pfn, zhole_size); return end_pfn; } diff --git a/arch/avr32/mm/init.c b/arch/avr32/mm/init.c index 3f90a87527bb..786de88a82a7 100644 --- a/arch/avr32/mm/init.c +++ b/arch/avr32/mm/init.c @@ -129,7 +129,7 @@ void __init paging_init(void) printk("Node %u: start_pfn = 0x%lx, low = 0x%lx\n", nid, start_pfn, low); - free_area_init_node(nid, pgdat, zones_size, start_pfn, NULL); + free_area_init_node(nid, zones_size, start_pfn, NULL); printk("Node %u: mem_map starts at %p\n", pgdat->node_id, pgdat->node_mem_map); diff --git a/arch/cris/arch-v10/mm/init.c b/arch/cris/arch-v10/mm/init.c index e0fcd1a9bfd5..742fd1974c2e 100644 --- a/arch/cris/arch-v10/mm/init.c +++ b/arch/cris/arch-v10/mm/init.c @@ -182,7 +182,7 @@ paging_init(void) * mem_map page array. */ - free_area_init_node(0, &contig_page_data, zones_size, PAGE_OFFSET >> PAGE_SHIFT, 0); + free_area_init_node(0, zones_size, PAGE_OFFSET >> PAGE_SHIFT, 0); } /* Initialize remaps of some I/O-ports. It is important that this diff --git a/arch/cris/arch-v32/mm/init.c b/arch/cris/arch-v32/mm/init.c index 5a9ac5834647..8a34b8b74293 100644 --- a/arch/cris/arch-v32/mm/init.c +++ b/arch/cris/arch-v32/mm/init.c @@ -162,7 +162,7 @@ paging_init(void) * substantially higher than 0, like us (we start at PAGE_OFFSET). This * saves space in the mem_map page array. 
*/ - free_area_init_node(0, &contig_page_data, zones_size, PAGE_OFFSET >> PAGE_SHIFT, 0); + free_area_init_node(0, zones_size, PAGE_OFFSET >> PAGE_SHIFT, 0); mem_map = contig_page_data.node_mem_map; } diff --git a/arch/m32r/mm/discontig.c b/arch/m32r/mm/discontig.c index aa9145ef6cca..cc23934bc41e 100644 --- a/arch/m32r/mm/discontig.c +++ b/arch/m32r/mm/discontig.c @@ -147,8 +147,7 @@ unsigned long __init zone_sizes_init(void) zholes_size[ZONE_DMA] = mp->holes; holes += zholes_size[ZONE_DMA]; - free_area_init_node(nid, NODE_DATA(nid), zones_size, - start_pfn, zholes_size); + free_area_init_node(nid, zones_size, start_pfn, zholes_size); } /* diff --git a/arch/m32r/mm/init.c b/arch/m32r/mm/init.c index bbd97c85bc5d..28799af15e95 100644 --- a/arch/m32r/mm/init.c +++ b/arch/m32r/mm/init.c @@ -123,7 +123,7 @@ unsigned long __init zone_sizes_init(void) start_pfn = __MEMORY_START >> PAGE_SHIFT; #endif /* CONFIG_MMU */ - free_area_init_node(0, NODE_DATA(0), zones_size, start_pfn, 0); + free_area_init_node(0, zones_size, start_pfn, 0); return 0; } diff --git a/arch/m68k/mm/motorola.c b/arch/m68k/mm/motorola.c index 226795bdf355..c5dbb9bdb322 100644 --- a/arch/m68k/mm/motorola.c +++ b/arch/m68k/mm/motorola.c @@ -296,7 +296,7 @@ void __init paging_init(void) #endif for (i = 0; i < m68k_num_memory; i++) { zones_size[ZONE_DMA] = m68k_memory[i].size >> PAGE_SHIFT; - free_area_init_node(i, pg_data_map + i, zones_size, + free_area_init_node(i, zones_size, m68k_memory[i].addr >> PAGE_SHIFT, NULL); } } diff --git a/arch/m68k/mm/sun3mmu.c b/arch/m68k/mm/sun3mmu.c index edceefc18870..1b902dbd4376 100644 --- a/arch/m68k/mm/sun3mmu.c +++ b/arch/m68k/mm/sun3mmu.c @@ -94,7 +94,7 @@ void __init paging_init(void) /* I really wish I knew why the following change made things better... 
-- Sam */ /* free_area_init(zones_size); */ - free_area_init_node(0, NODE_DATA(0), zones_size, + free_area_init_node(0, zones_size, (__pa(PAGE_OFFSET) >> PAGE_SHIFT) + 1, NULL); diff --git a/arch/parisc/mm/init.c b/arch/parisc/mm/init.c index 0ddf4904640a..7c155c254e72 100644 --- a/arch/parisc/mm/init.c +++ b/arch/parisc/mm/init.c @@ -887,7 +887,7 @@ void __init paging_init(void) } #endif - free_area_init_node(i, NODE_DATA(i), zones_size, + free_area_init_node(i, zones_size, pmem_ranges[i].start_pfn, NULL); } } diff --git a/arch/sparc/mm/srmmu.c b/arch/sparc/mm/srmmu.c index c624e04ff03e..ee30462598fc 100644 --- a/arch/sparc/mm/srmmu.c +++ b/arch/sparc/mm/srmmu.c @@ -1352,8 +1352,7 @@ void __init srmmu_paging_init(void) zones_size[ZONE_HIGHMEM] = npages; zholes_size[ZONE_HIGHMEM] = npages - calc_highpages(); - free_area_init_node(0, &contig_page_data, zones_size, - pfn_base, zholes_size); + free_area_init_node(0, zones_size, pfn_base, zholes_size); } } diff --git a/arch/sparc/mm/sun4c.c b/arch/sparc/mm/sun4c.c index 2375fe9dc312..d1782f6368be 100644 --- a/arch/sparc/mm/sun4c.c +++ b/arch/sparc/mm/sun4c.c @@ -2123,8 +2123,7 @@ void __init sun4c_paging_init(void) zones_size[ZONE_HIGHMEM] = npages; zholes_size[ZONE_HIGHMEM] = npages - calc_highpages(); - free_area_init_node(0, &contig_page_data, zones_size, - pfn_base, zholes_size); + free_area_init_node(0, zones_size, pfn_base, zholes_size); } cnt = 0; diff --git a/arch/v850/kernel/setup.c b/arch/v850/kernel/setup.c index a0a8456a8430..10335cecf7bd 100644 --- a/arch/v850/kernel/setup.c +++ b/arch/v850/kernel/setup.c @@ -295,8 +295,7 @@ init_mem_alloc (unsigned long ram_start, unsigned long ram_len) #error MAX_ORDER is too large for given PAGE_OFFSET (use CONFIG_FORCE_MAX_ZONEORDER to change it) #endif NODE_DATA(0)->node_mem_map = NULL; - free_area_init_node (0, NODE_DATA(0), zones_size, - ADDR_TO_PAGE (PAGE_OFFSET), 0); + free_area_init_node(0, zones_size, ADDR_TO_PAGE (PAGE_OFFSET), 0); } diff --git 
a/include/linux/mm.h b/include/linux/mm.h index f8071097302a..196924b657bc 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -962,9 +962,8 @@ static inline void pgtable_page_dtor(struct page *page) NULL: pte_offset_kernel(pmd, address)) extern void free_area_init(unsigned long * zones_size); -extern void free_area_init_node(int nid, pg_data_t *pgdat, - unsigned long * zones_size, unsigned long zone_start_pfn, - unsigned long *zholes_size); +extern void free_area_init_node(int nid, unsigned long * zones_size, + unsigned long zone_start_pfn, unsigned long *zholes_size); #ifdef CONFIG_ARCH_POPULATES_NODE_MAP /* * With CONFIG_ARCH_POPULATES_NODE_MAP set, an architecture may initialise its diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 833f854eabe5..6e26adc08f14 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -455,7 +455,7 @@ static pg_data_t *hotadd_new_pgdat(int nid, u64 start) /* we can use NODE_DATA(nid) from here */ /* init node's zones as empty zones, we don't have any present pages.*/ - free_area_init_node(nid, pgdat, zones_size, start_pfn, zholes_size); + free_area_init_node(nid, zones_size, start_pfn, zholes_size); return pgdat; } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 24aa3d1b9d96..e43aae135b38 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3461,10 +3461,11 @@ static void __init_refok alloc_node_mem_map(struct pglist_data *pgdat) #endif /* CONFIG_FLAT_NODE_MEM_MAP */ } -void __paginginit free_area_init_node(int nid, struct pglist_data *pgdat, - unsigned long *zones_size, unsigned long node_start_pfn, - unsigned long *zholes_size) +void __paginginit free_area_init_node(int nid, unsigned long *zones_size, + unsigned long node_start_pfn, unsigned long *zholes_size) { + pg_data_t *pgdat = NODE_DATA(nid); + pgdat->node_id = nid; pgdat->node_start_pfn = node_start_pfn; calculate_node_totalpages(pgdat, zones_size, zholes_size); @@ -3961,7 +3962,7 @@ void __init free_area_init_nodes(unsigned long 
*max_zone_pfn) setup_nr_node_ids(); for_each_online_node(nid) { pg_data_t *pgdat = NODE_DATA(nid); - free_area_init_node(nid, pgdat, NULL, + free_area_init_node(nid, NULL, find_min_pfn_for_node(nid), NULL); /* Any memory on that node */ @@ -4032,7 +4033,7 @@ EXPORT_SYMBOL(contig_page_data); void __init free_area_init(unsigned long *zones_size) { - free_area_init_node(0, NODE_DATA(0), zones_size, + free_area_init_node(0, zones_size, __pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL); } -- cgit v1.2.3 From cdfd4325c0d878679bd6a3ba8285b71d9980e3c0 Mon Sep 17 00:00:00 2001 From: Andy Whitcroft Date: Wed, 23 Jul 2008 21:27:28 -0700 Subject: mm: record MAP_NORESERVE status on vmas and fix small page mprotect reservations With Mel's hugetlb private reservation support patches applied, strict overcommit semantics are applied to both shared and private huge page mappings. This can be a problem if an application relied on unlimited overcommit semantics for private mappings. An example of this would be an application which maps a huge area with the intention of using it very sparsely. These applications would benefit from being able to opt-out of the strict overcommit. It should be noted that prior to hugetlb supporting demand faulting all mappings were fully populated and so applications of this type should be rare. This patch stack implements the MAP_NORESERVE mmap() flag for huge page mappings. This flag has the same meaning as for small page mappings, suppressing reservations for that mapping. Thanks to Mel Gorman for reviewing a number of early versions of these patches. This patch: When a small page mapping is created with mmap() reservations are created by default for any memory pages required. When the region is read/write the reservation is increased for every page, no reservation is needed for read-only regions (as they implicitly share the zero page). Reservations are tracked via the VM_ACCOUNT vma flag which is present when the region has reservation backing it.
When we convert a region from read-only to read-write new reservations are acquired and VM_ACCOUNT is set. However, when a read-only map is created with MAP_NORESERVE it is indistinguishable from a normal mapping. When we then convert that to read/write we are forced to incorrectly create reservations for it as we have no record of the original MAP_NORESERVE. This patch introduces a new vma flag VM_NORESERVE which records the presence of the original MAP_NORESERVE flag. This allows us to distinguish these two circumstances and correctly account the reserve. As well as fixing this FIXME in the code, this makes it much easier to introduce MAP_NORESERVE support for huge pages as this flag is available consistently for the life of the mapping. VM_ACCOUNT on the other hand is heavily used at the generic level in association with small pages. Signed-off-by: Andy Whitcroft Cc: Mel Gorman Cc: Adam Litke Cc: Johannes Weiner Cc: Andy Whitcroft Cc: William Lee Irwin III Cc: Hugh Dickins Cc: Michael Kerrisk Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 1 + mm/mmap.c | 3 +++ mm/mprotect.c | 6 ++---- 3 files changed, 6 insertions(+), 4 deletions(-) (limited to 'include/linux/mm.h') diff --git a/include/linux/mm.h b/include/linux/mm.h index 196924b657bc..df322fb4df31 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -100,6 +100,7 @@ extern unsigned int kobjsize(const void *objp); #define VM_DONTEXPAND 0x00040000 /* Cannot expand with mremap() */ #define VM_RESERVED 0x00080000 /* Count as reserved_vm like IO */ #define VM_ACCOUNT 0x00100000 /* Is a VM accounted object */ +#define VM_NORESERVE 0x00200000 /* should the VM suppress accounting */ #define VM_HUGETLB 0x00400000 /* Huge TLB Page VM */ #define VM_NONLINEAR 0x00800000 /* Is non-linear (remap_file_pages) */ #define VM_MAPPED_COPY 0x01000000 /* T if mapped copy of data (nommu mmap) */ diff --git a/mm/mmap.c b/mm/mmap.c index 75e0d0673d78..57d3b6097deb 100644 --- a/mm/mmap.c +++
b/mm/mmap.c @@ -1110,6 +1110,9 @@ munmap_back: if (!may_expand_vm(mm, len >> PAGE_SHIFT)) return -ENOMEM; + if (flags & MAP_NORESERVE) + vm_flags |= VM_NORESERVE; + if (accountable && (!(flags & MAP_NORESERVE) || sysctl_overcommit_memory == OVERCOMMIT_NEVER)) { if (vm_flags & VM_SHARED) { diff --git a/mm/mprotect.c b/mm/mprotect.c index 360d9cc8b38c..abd645a3b0a0 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -153,12 +153,10 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev, * If we make a private mapping writable we increase our commit; * but (without finer accounting) cannot reduce our commit if we * make it unwritable again. - * - * FIXME? We haven't defined a VM_NORESERVE flag, so mprotecting - * a MAP_NORESERVE private mapping to writable will now reserve. */ if (newflags & VM_WRITE) { - if (!(oldflags & (VM_ACCOUNT|VM_WRITE|VM_SHARED))) { + if (!(oldflags & (VM_ACCOUNT|VM_WRITE| + VM_SHARED|VM_NORESERVE))) { charged = nrpages; if (security_vm_enough_memory(charged)) return -ENOMEM; -- cgit v1.2.3 From 27ac792ca0b0a1e7e65f20342260650516c95864 Mon Sep 17 00:00:00 2001 From: Andrea Righi Date: Wed, 23 Jul 2008 21:28:13 -0700 Subject: PAGE_ALIGN(): correctly handle 64-bit values on 32-bit architectures On 32-bit architectures PAGE_ALIGN() truncates 64-bit values to the 32-bit boundary. For example: u64 val = PAGE_ALIGN(size); always returns a value < 4GB even if size is greater than 4GB. The problem resides in PAGE_MASK definition (from include/asm-x86/page.h for example): #define PAGE_SHIFT 12 #define PAGE_SIZE (_AC(1,UL) << PAGE_SHIFT) #define PAGE_MASK (~(PAGE_SIZE-1)) ... #define PAGE_ALIGN(addr) (((addr)+PAGE_SIZE-1)&PAGE_MASK) The "~" is performed on a 32-bit value, so everything in "and" with PAGE_MASK greater than 4GB will be truncated to the 32-bit boundary. Using the ALIGN() macro seems to be the right way, because it uses typeof(addr) for the mask. 
Also move the PAGE_ALIGN() definitions out of include/asm-*/page.h in include/linux/mm.h. See also lkml discussion: http://lkml.org/lkml/2008/6/11/237 [akpm@linux-foundation.org: fix drivers/media/video/uvc/uvc_queue.c] [akpm@linux-foundation.org: fix v850] [akpm@linux-foundation.org: fix powerpc] [akpm@linux-foundation.org: fix arm] [akpm@linux-foundation.org: fix mips] [akpm@linux-foundation.org: fix drivers/media/video/pvrusb2/pvrusb2-dvb.c] [akpm@linux-foundation.org: fix drivers/mtd/maps/uclinux.c] [akpm@linux-foundation.org: fix powerpc] Signed-off-by: Andrea Righi Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm/kernel/module.c | 1 + arch/arm/plat-omap/fb.c | 1 + arch/avr32/mm/ioremap.c | 1 + arch/h8300/kernel/setup.c | 1 + arch/m68k/amiga/chipram.c | 1 + arch/m68knommu/kernel/setup.c | 1 + arch/mips/kernel/module.c | 1 + arch/mips/sgi-ip27/ip27-klnuma.c | 1 + arch/powerpc/kernel/suspend.c | 1 + arch/powerpc/lib/code-patching.c | 1 + arch/sparc64/kernel/iommu_common.h | 2 +- arch/x86/kernel/module_64.c | 1 + arch/xtensa/kernel/setup.c | 1 + drivers/char/random.c | 1 + drivers/ieee1394/iso.c | 1 + drivers/media/video/pvrusb2/pvrusb2-dvb.c | 1 + drivers/media/video/pvrusb2/pvrusb2-ioread.c | 1 + drivers/media/video/uvc/uvc_queue.c | 1 + drivers/media/video/videobuf-core.c | 1 + drivers/mtd/maps/uclinux.c | 1 + drivers/net/mlx4/eq.c | 1 + drivers/pcmcia/electra_cf.c | 1 + drivers/scsi/sun_esp.c | 1 + drivers/video/acornfb.c | 1 + drivers/video/imxfb.c | 1 + drivers/video/omap/dispc.c | 1 + drivers/video/omap/omapfb_main.c | 1 + drivers/video/pxafb.c | 1 + drivers/video/sa1100fb.c | 1 + include/asm-alpha/page.h | 3 --- include/asm-arm/page-nommu.h | 4 +--- include/asm-arm/page.h | 3 --- include/asm-avr32/page.h | 3 --- include/asm-blackfin/page.h | 3 --- include/asm-cris/page.h | 3 --- include/asm-frv/page.h | 3 --- include/asm-h8300/page.h | 3 --- include/asm-ia64/page.h | 1 - include/asm-m32r/page.h | 3 --- include/asm-m68k/dvma.h | 
2 +- include/asm-m68k/page.h | 3 --- include/asm-m68knommu/page.h | 3 --- include/asm-mips/page.h | 3 --- include/asm-mips/processor.h | 2 +- include/asm-mn10300/page.h | 3 --- include/asm-parisc/page.h | 4 ---- include/asm-powerpc/page.h | 3 --- include/asm-s390/page.h | 3 --- include/asm-sh/page.h | 3 --- include/asm-sparc/page_32.h | 3 --- include/asm-sparc/page_64.h | 3 --- include/asm-um/page.h | 3 --- include/asm-v850/page.h | 4 ---- include/asm-x86/page.h | 3 --- include/asm-xtensa/page.h | 2 -- include/linux/mm.h | 3 +++ sound/core/info.c | 1 + 57 files changed, 36 insertions(+), 74 deletions(-) (limited to 'include/linux/mm.h') diff --git a/arch/arm/kernel/module.c b/arch/arm/kernel/module.c index 79b7e5cf5416..a68259a0cccd 100644 --- a/arch/arm/kernel/module.c +++ b/arch/arm/kernel/module.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/arm/plat-omap/fb.c b/arch/arm/plat-omap/fb.c index 96d6f0619733..5d107520e6b9 100644 --- a/arch/arm/plat-omap/fb.c +++ b/arch/arm/plat-omap/fb.c @@ -23,6 +23,7 @@ #include #include +#include #include #include #include diff --git a/arch/avr32/mm/ioremap.c b/arch/avr32/mm/ioremap.c index 3437c82434ac..f03b79f0e0ab 100644 --- a/arch/avr32/mm/ioremap.c +++ b/arch/avr32/mm/ioremap.c @@ -6,6 +6,7 @@ * published by the Free Software Foundation. 
*/ #include +#include #include #include diff --git a/arch/h8300/kernel/setup.c b/arch/h8300/kernel/setup.c index b1f25c20a5db..7fda657110eb 100644 --- a/arch/h8300/kernel/setup.c +++ b/arch/h8300/kernel/setup.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/m68k/amiga/chipram.c b/arch/m68k/amiga/chipram.c index cbe36538af47..61df1d33c050 100644 --- a/arch/m68k/amiga/chipram.c +++ b/arch/m68k/amiga/chipram.c @@ -9,6 +9,7 @@ #include #include +#include #include #include #include diff --git a/arch/m68knommu/kernel/setup.c b/arch/m68knommu/kernel/setup.c index 03f4fe6a2fc0..5985f1989021 100644 --- a/arch/m68knommu/kernel/setup.c +++ b/arch/m68knommu/kernel/setup.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/mips/kernel/module.c b/arch/mips/kernel/module.c index e7ed0ac48537..1f60e27523d9 100644 --- a/arch/mips/kernel/module.c +++ b/arch/mips/kernel/module.c @@ -22,6 +22,7 @@ #include #include +#include #include #include #include diff --git a/arch/mips/sgi-ip27/ip27-klnuma.c b/arch/mips/sgi-ip27/ip27-klnuma.c index 48932ce1d730..d9c79d8be81d 100644 --- a/arch/mips/sgi-ip27/ip27-klnuma.c +++ b/arch/mips/sgi-ip27/ip27-klnuma.c @@ -4,6 +4,7 @@ * Copyright 2000 - 2001 Kanoj Sarcar (kanoj@sgi.com) */ #include +#include #include #include #include diff --git a/arch/powerpc/kernel/suspend.c b/arch/powerpc/kernel/suspend.c index 8cee57107541..6fc6328dc626 100644 --- a/arch/powerpc/kernel/suspend.c +++ b/arch/powerpc/kernel/suspend.c @@ -7,6 +7,7 @@ * Copyright (c) 2001 Patrick Mochel */ +#include #include /* References to section boundaries */ diff --git a/arch/powerpc/lib/code-patching.c b/arch/powerpc/lib/code-patching.c index 0559fe086eb4..7c975d43e3f3 100644 --- a/arch/powerpc/lib/code-patching.c +++ b/arch/powerpc/lib/code-patching.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include diff --git a/arch/sparc64/kernel/iommu_common.h 
b/arch/sparc64/kernel/iommu_common.h index f3575a614fa2..53b19c8231a9 100644 --- a/arch/sparc64/kernel/iommu_common.h +++ b/arch/sparc64/kernel/iommu_common.h @@ -23,7 +23,7 @@ #define IO_PAGE_SHIFT 13 #define IO_PAGE_SIZE (1UL << IO_PAGE_SHIFT) #define IO_PAGE_MASK (~(IO_PAGE_SIZE-1)) -#define IO_PAGE_ALIGN(addr) (((addr)+IO_PAGE_SIZE-1)&IO_PAGE_MASK) +#define IO_PAGE_ALIGN(addr) ALIGN(addr, IO_PAGE_SIZE) #define IO_TSB_ENTRIES (128*1024) #define IO_TSB_SIZE (IO_TSB_ENTRIES * 8) diff --git a/arch/x86/kernel/module_64.c b/arch/x86/kernel/module_64.c index 0e867676b5a5..6ba87830d4b1 100644 --- a/arch/x86/kernel/module_64.c +++ b/arch/x86/kernel/module_64.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include diff --git a/arch/xtensa/kernel/setup.c b/arch/xtensa/kernel/setup.c index 5e6d75c9f92b..a00359e8f7a8 100644 --- a/arch/xtensa/kernel/setup.c +++ b/arch/xtensa/kernel/setup.c @@ -16,6 +16,7 @@ #include #include +#include #include #include #include diff --git a/drivers/char/random.c b/drivers/char/random.c index 0cf98bd4f2d2..e0d0e371909c 100644 --- a/drivers/char/random.c +++ b/drivers/char/random.c @@ -236,6 +236,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/ieee1394/iso.c b/drivers/ieee1394/iso.c index 07ca35c98f96..1cf6487b65ba 100644 --- a/drivers/ieee1394/iso.c +++ b/drivers/ieee1394/iso.c @@ -11,6 +11,7 @@ #include #include +#include #include #include "hosts.h" diff --git a/drivers/media/video/pvrusb2/pvrusb2-dvb.c b/drivers/media/video/pvrusb2/pvrusb2-dvb.c index 6ec4bf81fc7f..77b3c3385066 100644 --- a/drivers/media/video/pvrusb2/pvrusb2-dvb.c +++ b/drivers/media/video/pvrusb2/pvrusb2-dvb.c @@ -20,6 +20,7 @@ #include #include +#include #include "dvbdev.h" #include "pvrusb2-debug.h" #include "pvrusb2-hdw-internal.h" diff --git a/drivers/media/video/pvrusb2/pvrusb2-ioread.c b/drivers/media/video/pvrusb2/pvrusb2-ioread.c index 05a1376405e7..b4824782d858 100644 --- 
a/drivers/media/video/pvrusb2/pvrusb2-ioread.c +++ b/drivers/media/video/pvrusb2/pvrusb2-ioread.c @@ -22,6 +22,7 @@ #include "pvrusb2-debug.h" #include #include +#include #include #include #include diff --git a/drivers/media/video/uvc/uvc_queue.c b/drivers/media/video/uvc/uvc_queue.c index 7388d0cee3d4..5646a6a32939 100644 --- a/drivers/media/video/uvc/uvc_queue.c +++ b/drivers/media/video/uvc/uvc_queue.c @@ -13,6 +13,7 @@ #include #include +#include #include #include #include diff --git a/drivers/media/video/videobuf-core.c b/drivers/media/video/videobuf-core.c index 0a88c44ace00..b7b05842cf28 100644 --- a/drivers/media/video/videobuf-core.c +++ b/drivers/media/video/videobuf-core.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include diff --git a/drivers/mtd/maps/uclinux.c b/drivers/mtd/maps/uclinux.c index c42f4b83f686..3fcf92130aa4 100644 --- a/drivers/mtd/maps/uclinux.c +++ b/drivers/mtd/maps/uclinux.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/net/mlx4/eq.c b/drivers/net/mlx4/eq.c index e141a1513f07..ea3a09aaa844 100644 --- a/drivers/net/mlx4/eq.c +++ b/drivers/net/mlx4/eq.c @@ -33,6 +33,7 @@ #include #include +#include #include #include diff --git a/drivers/pcmcia/electra_cf.c b/drivers/pcmcia/electra_cf.c index c21f9a9c3e3f..a34284b1482a 100644 --- a/drivers/pcmcia/electra_cf.c +++ b/drivers/pcmcia/electra_cf.c @@ -28,6 +28,7 @@ #include #include #include +#include #include #include diff --git a/drivers/scsi/sun_esp.c b/drivers/scsi/sun_esp.c index 2c87db98cdfb..f9cf70151366 100644 --- a/drivers/scsi/sun_esp.c +++ b/drivers/scsi/sun_esp.c @@ -7,6 +7,7 @@ #include #include #include +#include #include #include diff --git a/drivers/video/acornfb.c b/drivers/video/acornfb.c index eedb8285e32f..017233d0c481 100644 --- a/drivers/video/acornfb.c +++ b/drivers/video/acornfb.c @@ -23,6 +23,7 @@ #include #include #include +#include #include #include #include diff --git 
a/drivers/video/imxfb.c b/drivers/video/imxfb.c index 94e4d3ac1a05..0c5a475c1cae 100644 --- a/drivers/video/imxfb.c +++ b/drivers/video/imxfb.c @@ -24,6 +24,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/video/omap/dispc.c b/drivers/video/omap/dispc.c index ab32ceb06178..ab77c51fe9d6 100644 --- a/drivers/video/omap/dispc.c +++ b/drivers/video/omap/dispc.c @@ -20,6 +20,7 @@ */ #include #include +#include #include #include #include diff --git a/drivers/video/omap/omapfb_main.c b/drivers/video/omap/omapfb_main.c index 14d0f7a11145..f85af5c4fa68 100644 --- a/drivers/video/omap/omapfb_main.c +++ b/drivers/video/omap/omapfb_main.c @@ -25,6 +25,7 @@ * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ #include +#include #include #include diff --git a/drivers/video/pxafb.c b/drivers/video/pxafb.c index bb2514369507..5e8a140399fc 100644 --- a/drivers/video/pxafb.c +++ b/drivers/video/pxafb.c @@ -30,6 +30,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/video/sa1100fb.c b/drivers/video/sa1100fb.c index ab2b2110478b..4a9f7e121807 100644 --- a/drivers/video/sa1100fb.c +++ b/drivers/video/sa1100fb.c @@ -167,6 +167,7 @@ #include #include #include +#include #include #include #include diff --git a/include/asm-alpha/page.h b/include/asm-alpha/page.h index 22ff9762d17b..0995f9d13417 100644 --- a/include/asm-alpha/page.h +++ b/include/asm-alpha/page.h @@ -80,9 +80,6 @@ typedef struct page *pgtable_t; #endif /* !__ASSEMBLY__ */ -/* to align the pointer to the (next) page boundary */ -#define PAGE_ALIGN(addr) (((addr)+PAGE_SIZE-1)&PAGE_MASK) - #define __pa(x) ((unsigned long) (x) - PAGE_OFFSET) #define __va(x) ((void *)((unsigned long) (x) + PAGE_OFFSET)) #ifndef CONFIG_DISCONTIGMEM diff --git a/include/asm-arm/page-nommu.h b/include/asm-arm/page-nommu.h index a1bcad060480..ea1cde84f500 100644 --- a/include/asm-arm/page-nommu.h +++ b/include/asm-arm/page-nommu.h @@ -7,6 +7,7 @@ * it under the 
terms of the GNU General Public License version 2 as * published by the Free Software Foundation. */ + #ifndef _ASMARM_PAGE_NOMMU_H #define _ASMARM_PAGE_NOMMU_H @@ -42,9 +43,6 @@ typedef unsigned long pgprot_t; #define __pmd(x) (x) #define __pgprot(x) (x) -/* to align the pointer to the (next) page boundary */ -#define PAGE_ALIGN(addr) (((addr)+PAGE_SIZE-1)&PAGE_MASK) - extern unsigned long memory_start; extern unsigned long memory_end; diff --git a/include/asm-arm/page.h b/include/asm-arm/page.h index 8e05bdb5f12f..7c5fc5582e5d 100644 --- a/include/asm-arm/page.h +++ b/include/asm-arm/page.h @@ -15,9 +15,6 @@ #define PAGE_SIZE (1UL << PAGE_SHIFT) #define PAGE_MASK (~(PAGE_SIZE-1)) -/* to align the pointer to the (next) page boundary */ -#define PAGE_ALIGN(addr) (((addr)+PAGE_SIZE-1)&PAGE_MASK) - #ifndef __ASSEMBLY__ #ifndef CONFIG_MMU diff --git a/include/asm-avr32/page.h b/include/asm-avr32/page.h index cbbc5ca9728b..f805d1cb11bc 100644 --- a/include/asm-avr32/page.h +++ b/include/asm-avr32/page.h @@ -57,9 +57,6 @@ static inline int get_order(unsigned long size) #endif /* !__ASSEMBLY__ */ -/* Align the pointer to the (next) page boundary */ -#define PAGE_ALIGN(addr) (((addr) + PAGE_SIZE - 1) & PAGE_MASK) - /* * The hardware maps the virtual addresses 0x80000000 -> 0x9fffffff * permanently to the physical addresses 0x00000000 -> 0x1fffffff when diff --git a/include/asm-blackfin/page.h b/include/asm-blackfin/page.h index c7db0220fbd6..344f6a8c1f22 100644 --- a/include/asm-blackfin/page.h +++ b/include/asm-blackfin/page.h @@ -51,9 +51,6 @@ typedef struct page *pgtable_t; #define __pgd(x) ((pgd_t) { (x) } ) #define __pgprot(x) ((pgprot_t) { (x) } ) -/* to align the pointer to the (next) page boundary */ -#define PAGE_ALIGN(addr) (((addr)+PAGE_SIZE-1)&PAGE_MASK) - extern unsigned long memory_start; extern unsigned long memory_end; diff --git a/include/asm-cris/page.h b/include/asm-cris/page.h index c45bb1ef397c..d19272ba6b69 100644 --- a/include/asm-cris/page.h +++ 
b/include/asm-cris/page.h @@ -60,9 +60,6 @@ typedef struct page *pgtable_t; #define page_to_phys(page) __pa((((page) - mem_map) << PAGE_SHIFT) + PAGE_OFFSET) -/* to align the pointer to the (next) page boundary */ -#define PAGE_ALIGN(addr) (((addr)+PAGE_SIZE-1)&PAGE_MASK) - #ifndef __ASSEMBLY__ #endif /* __ASSEMBLY__ */ diff --git a/include/asm-frv/page.h b/include/asm-frv/page.h index c2c1e89e747d..bd9c220094c7 100644 --- a/include/asm-frv/page.h +++ b/include/asm-frv/page.h @@ -40,9 +40,6 @@ typedef struct page *pgtable_t; #define __pgprot(x) ((pgprot_t) { (x) } ) #define PTE_MASK PAGE_MASK -/* to align the pointer to the (next) page boundary */ -#define PAGE_ALIGN(addr) (((addr) + PAGE_SIZE - 1) & PAGE_MASK) - #define devmem_is_allowed(pfn) 1 #define __pa(vaddr) virt_to_phys((void *) (unsigned long) (vaddr)) diff --git a/include/asm-h8300/page.h b/include/asm-h8300/page.h index d6a3eaf3b27e..0b6acf0b03aa 100644 --- a/include/asm-h8300/page.h +++ b/include/asm-h8300/page.h @@ -43,9 +43,6 @@ typedef struct page *pgtable_t; #define __pgd(x) ((pgd_t) { (x) } ) #define __pgprot(x) ((pgprot_t) { (x) } ) -/* to align the pointer to the (next) page boundary */ -#define PAGE_ALIGN(addr) (((addr)+PAGE_SIZE-1)&PAGE_MASK) - extern unsigned long memory_start; extern unsigned long memory_end; diff --git a/include/asm-ia64/page.h b/include/asm-ia64/page.h index 36f39321b768..5f271bc712ee 100644 --- a/include/asm-ia64/page.h +++ b/include/asm-ia64/page.h @@ -40,7 +40,6 @@ #define PAGE_SIZE (__IA64_UL_CONST(1) << PAGE_SHIFT) #define PAGE_MASK (~(PAGE_SIZE - 1)) -#define PAGE_ALIGN(addr) (((addr) + PAGE_SIZE - 1) & PAGE_MASK) #define PERCPU_PAGE_SHIFT 16 /* log2() of max. 
size of per-CPU area */ #define PERCPU_PAGE_SIZE (__IA64_UL_CONST(1) << PERCPU_PAGE_SHIFT) diff --git a/include/asm-m32r/page.h b/include/asm-m32r/page.h index 8a677f3fca68..c9333089fe11 100644 --- a/include/asm-m32r/page.h +++ b/include/asm-m32r/page.h @@ -41,9 +41,6 @@ typedef struct page *pgtable_t; #endif /* !__ASSEMBLY__ */ -/* to align the pointer to the (next) page boundary */ -#define PAGE_ALIGN(addr) (((addr) + PAGE_SIZE - 1) & PAGE_MASK) - /* * This handles the memory map.. We could make this a config * option, but too many people screw it up, and too few need diff --git a/include/asm-m68k/dvma.h b/include/asm-m68k/dvma.h index 4fff408d0150..890bbf7e7758 100644 --- a/include/asm-m68k/dvma.h +++ b/include/asm-m68k/dvma.h @@ -13,7 +13,7 @@ #define DVMA_PAGE_SHIFT 13 #define DVMA_PAGE_SIZE (1UL << DVMA_PAGE_SHIFT) #define DVMA_PAGE_MASK (~(DVMA_PAGE_SIZE-1)) -#define DVMA_PAGE_ALIGN(addr) (((addr)+DVMA_PAGE_SIZE-1)&DVMA_PAGE_MASK) +#define DVMA_PAGE_ALIGN(addr) ALIGN(addr, DVMA_PAGE_SIZE) extern void dvma_init(void); extern int dvma_map_iommu(unsigned long kaddr, unsigned long baddr, diff --git a/include/asm-m68k/page.h b/include/asm-m68k/page.h index 880c2cbff8a6..a34b8bad7847 100644 --- a/include/asm-m68k/page.h +++ b/include/asm-m68k/page.h @@ -103,9 +103,6 @@ typedef struct page *pgtable_t; #define __pgd(x) ((pgd_t) { (x) } ) #define __pgprot(x) ((pgprot_t) { (x) } ) -/* to align the pointer to the (next) page boundary */ -#define PAGE_ALIGN(addr) (((addr)+PAGE_SIZE-1)&PAGE_MASK) - #endif /* !__ASSEMBLY__ */ #include diff --git a/include/asm-m68knommu/page.h b/include/asm-m68knommu/page.h index 1e82ebb7d644..3a1ede4544cb 100644 --- a/include/asm-m68knommu/page.h +++ b/include/asm-m68knommu/page.h @@ -43,9 +43,6 @@ typedef struct page *pgtable_t; #define __pgd(x) ((pgd_t) { (x) } ) #define __pgprot(x) ((pgprot_t) { (x) } ) -/* to align the pointer to the (next) page boundary */ -#define PAGE_ALIGN(addr) (((addr)+PAGE_SIZE-1)&PAGE_MASK) - extern unsigned 
long memory_start; extern unsigned long memory_end; diff --git a/include/asm-mips/page.h b/include/asm-mips/page.h index 494f00ba9541..fe7a88ea066e 100644 --- a/include/asm-mips/page.h +++ b/include/asm-mips/page.h @@ -137,9 +137,6 @@ typedef struct { unsigned long pgprot; } pgprot_t; #endif /* !__ASSEMBLY__ */ -/* to align the pointer to the (next) page boundary */ -#define PAGE_ALIGN(addr) (((addr) + PAGE_SIZE - 1) & PAGE_MASK) - /* * __pa()/__va() should be used only during mem init. */ diff --git a/include/asm-mips/processor.h b/include/asm-mips/processor.h index 58cbac5a64e4..a1e4453469f9 100644 --- a/include/asm-mips/processor.h +++ b/include/asm-mips/processor.h @@ -45,7 +45,7 @@ extern unsigned int vced_count, vcei_count; * This decides where the kernel will search for a free chunk of vm * space during mmap's. */ -#define TASK_UNMAPPED_BASE (PAGE_ALIGN(TASK_SIZE / 3)) +#define TASK_UNMAPPED_BASE ((TASK_SIZE / 3) & ~(PAGE_SIZE)) #endif #ifdef CONFIG_64BIT diff --git a/include/asm-mn10300/page.h b/include/asm-mn10300/page.h index 124971b9fb9b..8288e124165b 100644 --- a/include/asm-mn10300/page.h +++ b/include/asm-mn10300/page.h @@ -61,9 +61,6 @@ typedef struct page *pgtable_t; #endif /* !__ASSEMBLY__ */ -/* to align the pointer to the (next) page boundary */ -#define PAGE_ALIGN(addr) (((addr) + PAGE_SIZE - 1) & PAGE_MASK) - /* * This handles the memory map.. 
We could make this a config * option, but too many people screw it up, and too few need diff --git a/include/asm-parisc/page.h b/include/asm-parisc/page.h index 27d50b859541..c3941f09a878 100644 --- a/include/asm-parisc/page.h +++ b/include/asm-parisc/page.h @@ -119,10 +119,6 @@ extern int npmem_ranges; #define PMD_ENTRY_SIZE (1UL << BITS_PER_PMD_ENTRY) #define PTE_ENTRY_SIZE (1UL << BITS_PER_PTE_ENTRY) -/* to align the pointer to the (next) page boundary */ -#define PAGE_ALIGN(addr) (((addr)+PAGE_SIZE-1)&PAGE_MASK) - - #define LINUX_GATEWAY_SPACE 0 /* This governs the relationship between virtual and physical addresses. diff --git a/include/asm-powerpc/page.h b/include/asm-powerpc/page.h index cffdf0eb0df6..e088545cb3f5 100644 --- a/include/asm-powerpc/page.h +++ b/include/asm-powerpc/page.h @@ -119,9 +119,6 @@ extern phys_addr_t kernstart_addr; /* align addr on a size boundary - adjust address up if needed */ #define _ALIGN(addr,size) _ALIGN_UP(addr,size) -/* to align the pointer to the (next) page boundary */ -#define PAGE_ALIGN(addr) _ALIGN(addr, PAGE_SIZE) - /* * Don't compare things with KERNELBASE or PAGE_OFFSET to test for * "kernelness", use is_kernel_addr() - it should do what you want. 
diff --git a/include/asm-s390/page.h b/include/asm-s390/page.h index 12fd9c4f0f15..991ba939408c 100644 --- a/include/asm-s390/page.h +++ b/include/asm-s390/page.h @@ -138,9 +138,6 @@ void arch_alloc_page(struct page *page, int order); #endif /* !__ASSEMBLY__ */ -/* to align the pointer to the (next) page boundary */ -#define PAGE_ALIGN(addr) (((addr)+PAGE_SIZE-1)&PAGE_MASK) - #define __PAGE_OFFSET 0x0UL #define PAGE_OFFSET 0x0UL #define __pa(x) (unsigned long)(x) diff --git a/include/asm-sh/page.h b/include/asm-sh/page.h index 304c30b5d947..5dc01d2fcc4c 100644 --- a/include/asm-sh/page.h +++ b/include/asm-sh/page.h @@ -22,9 +22,6 @@ #define PAGE_MASK (~(PAGE_SIZE-1)) #define PTE_MASK PAGE_MASK -/* to align the pointer to the (next) page boundary */ -#define PAGE_ALIGN(addr) (((addr)+PAGE_SIZE-1)&PAGE_MASK) - #if defined(CONFIG_HUGETLB_PAGE_SIZE_64K) #define HPAGE_SHIFT 16 #elif defined(CONFIG_HUGETLB_PAGE_SIZE_256K) diff --git a/include/asm-sparc/page_32.h b/include/asm-sparc/page_32.h index 14de518cc38f..cf5fb70ca1c1 100644 --- a/include/asm-sparc/page_32.h +++ b/include/asm-sparc/page_32.h @@ -134,9 +134,6 @@ BTFIXUPDEF_SETHI(sparc_unmapped_base) #endif /* !(__ASSEMBLY__) */ -/* to align the pointer to the (next) page boundary */ -#define PAGE_ALIGN(addr) (((addr)+PAGE_SIZE-1)&PAGE_MASK) - #define PAGE_OFFSET 0xf0000000 #ifndef __ASSEMBLY__ extern unsigned long phys_base; diff --git a/include/asm-sparc/page_64.h b/include/asm-sparc/page_64.h index a8a2bba032c1..b579b910ef51 100644 --- a/include/asm-sparc/page_64.h +++ b/include/asm-sparc/page_64.h @@ -106,9 +106,6 @@ typedef struct page *pgtable_t; #endif /* !(__ASSEMBLY__) */ -/* to align the pointer to the (next) page boundary */ -#define PAGE_ALIGN(addr) (((addr)+PAGE_SIZE-1)&PAGE_MASK) - /* We used to stick this into a hard-coded global register (%g4) * but that does not make sense anymore. 
*/ diff --git a/include/asm-um/page.h b/include/asm-um/page.h index 916e1a61999f..335c57383c02 100644 --- a/include/asm-um/page.h +++ b/include/asm-um/page.h @@ -92,9 +92,6 @@ typedef struct page *pgtable_t; #define __pgd(x) ((pgd_t) { (x) } ) #define __pgprot(x) ((pgprot_t) { (x) } ) -/* to align the pointer to the (next) page boundary */ -#define PAGE_ALIGN(addr) (((addr)+PAGE_SIZE-1)&PAGE_MASK) - extern unsigned long uml_physmem; #define PAGE_OFFSET (uml_physmem) diff --git a/include/asm-v850/page.h b/include/asm-v850/page.h index 74a539a9bd59..f9de35d873fa 100644 --- a/include/asm-v850/page.h +++ b/include/asm-v850/page.h @@ -94,10 +94,6 @@ typedef unsigned long pgprot_t; #endif /* !__ASSEMBLY__ */ -/* to align the pointer to the (next) page boundary */ -#define PAGE_ALIGN(addr) (((addr) + PAGE_SIZE - 1) & PAGE_MASK) - - /* No current v850 processor has virtual memory. */ #define __virt_to_phys(addr) (addr) #define __phys_to_virt(addr) (addr) diff --git a/include/asm-x86/page.h b/include/asm-x86/page.h index 6e02098b1605..49982110e4d9 100644 --- a/include/asm-x86/page.h +++ b/include/asm-x86/page.h @@ -34,9 +34,6 @@ #define HUGE_MAX_HSTATE 2 -/* to align the pointer to the (next) page boundary */ -#define PAGE_ALIGN(addr) (((addr)+PAGE_SIZE-1)&PAGE_MASK) - #ifndef __ASSEMBLY__ #include #endif diff --git a/include/asm-xtensa/page.h b/include/asm-xtensa/page.h index 80a6ae0dd259..11f7dc2dbec7 100644 --- a/include/asm-xtensa/page.h +++ b/include/asm-xtensa/page.h @@ -26,13 +26,11 @@ /* * PAGE_SHIFT determines the page size - * PAGE_ALIGN(x) aligns the pointer to the (next) page boundary */ #define PAGE_SHIFT 12 #define PAGE_SIZE (__XTENSA_UL_CONST(1) << PAGE_SHIFT) #define PAGE_MASK (~(PAGE_SIZE-1)) -#define PAGE_ALIGN(addr) (((addr)+PAGE_SIZE - 1) & PAGE_MASK) #define PAGE_OFFSET XCHAL_KSEG_CACHED_VADDR #define MAX_MEM_PFN XCHAL_KSEG_SIZE diff --git a/include/linux/mm.h b/include/linux/mm.h index df322fb4df31..d87a5a5fe87d 100644 --- a/include/linux/mm.h +++ 
b/include/linux/mm.h @@ -41,6 +41,9 @@ extern unsigned long mmap_min_addr; #define nth_page(page,n) pfn_to_page(page_to_pfn((page)) + (n)) +/* to align the pointer to the (next) page boundary */ +#define PAGE_ALIGN(addr) ALIGN(addr, PAGE_SIZE) + /* * Linux kernel virtual memory manager primitives. * The idea being to have a "virtual" mm in the same way diff --git a/sound/core/info.c b/sound/core/info.c index cb5ead3e202d..c67773ad9298 100644 --- a/sound/core/info.c +++ b/sound/core/info.c @@ -21,6 +21,7 @@ #include #include +#include #include #include #include -- cgit v1.2.3 From 21cc199baa815d7b3f1ace4be20b9558cbddc00f Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Fri, 25 Jul 2008 19:45:22 -0700 Subject: mm: introduce get_user_pages_fast Introduce a new get_user_pages_fast mm API, which is basically a get_user_pages with a less general API (but still tends to be suited to the common case): - task and mm are always current and current->mm - force is always 0 - pages is always non-NULL - don't pass back vmas This restricted API can be implemented in a much more scalable way on many architectures when the ptes are present, by walking the page tables locklessly (no mmap_sem or page table locks). When the ptes are not populated, get_user_pages_fast() could be slower. This is implemented locklessly on x86, and used in some key direct IO call sites, in later patches, which provides nearly 10% performance improvement on a threaded database workload. Lots of other code could use this too, depending on use cases (eg. grep drivers/). And it might inspire some new and clever ways to use it. 
[akpm@linux-foundation.org: build fix] [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Nick Piggin Cc: Dave Kleikamp Cc: Andy Whitcroft Cc: Ingo Molnar Cc: Thomas Gleixner Cc: Andi Kleen Cc: Dave Kleikamp Cc: Badari Pulavarty Cc: Zach Brown Cc: Jens Axboe Reviewed-by: Peter Zijlstra Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) (limited to 'include/linux/mm.h') diff --git a/include/linux/mm.h b/include/linux/mm.h index d87a5a5fe87d..f3fd70d6029f 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -833,6 +833,39 @@ extern int mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev, unsigned long start, unsigned long end, unsigned long newflags); +#ifdef CONFIG_HAVE_GET_USER_PAGES_FAST +/* + * get_user_pages_fast provides equivalent functionality to get_user_pages, + * operating on current and current->mm (force=0 and doesn't return any vmas). + * + * get_user_pages_fast may take mmap_sem and page tables, so no assumptions + * can be made about locking. get_user_pages_fast is to be implemented in a + * way that is advantageous (vs get_user_pages()) when the user memory area is + * already faulted in and present in ptes. However if the pages have to be + * faulted in, it may turn out to be slightly slower). + */ +int get_user_pages_fast(unsigned long start, int nr_pages, int write, + struct page **pages); + +#else +/* + * Should probably be moved to asm-generic, and architectures can include it if + * they don't implement their own get_user_pages_fast. + */ +#define get_user_pages_fast(start, nr_pages, write, pages) \ +({ \ + struct mm_struct *mm = current->mm; \ + int ret; \ + \ + down_read(&mm->mmap_sem); \ + ret = get_user_pages(current, mm, start, nr_pages, \ + write, 0, pages, NULL); \ + up_read(&mm->mmap_sem); \ + \ + ret; \ +}) +#endif + /* * A callback you can register to apply pressure to ageable caches. 
* -- cgit v1.2.3 From 15f59adae001766a2c7f7fe4f196387bb04bcff5 Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Fri, 25 Jul 2008 19:46:23 -0700 Subject: make mm/memory.c:print_bad_pte() static This patch makes the needlessly global print_bad_pte() static. Signed-off-by: Adrian Bunk Reviewed-by: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 1 - mm/memory.c | 3 ++- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux/mm.h') diff --git a/include/linux/mm.h b/include/linux/mm.h index f3fd70d6029f..6e695eaab4ce 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -810,7 +810,6 @@ extern int access_process_vm(struct task_struct *tsk, unsigned long addr, void * int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, unsigned long start, int len, int write, int force, struct page **pages, struct vm_area_struct **vmas); -void print_bad_pte(struct vm_area_struct *, pte_t, unsigned long); extern int try_to_release_page(struct page * page, gfp_t gfp_mask); extern void do_invalidatepage(struct page *page, unsigned long offset); diff --git a/mm/memory.c b/mm/memory.c index 262e3eb6601a..a8ca04faaea6 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -374,7 +374,8 @@ static inline void add_mm_rss(struct mm_struct *mm, int file_rss, int anon_rss) * * The calling function must still handle the error. */ -void print_bad_pte(struct vm_area_struct *vma, pte_t pte, unsigned long vaddr) +static void print_bad_pte(struct vm_area_struct *vma, pte_t pte, + unsigned long vaddr) { printk(KERN_ERR "Bad pte = %08llx, process = %s, " "vm_flags = %lx, vaddr = %lx\n", -- cgit v1.2.3 From 7906d00cd1f687268f0a3599442d113767795ae6 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Mon, 28 Jul 2008 15:46:26 -0700 Subject: mmu-notifiers: add mm_take_all_locks() operation mm_take_all_locks holds off reclaim from an entire mm_struct. 
This allows mmu notifiers to register into the mm at any time with the guarantee that no mmu operation is in progress on the mm. This operation locks against the VM for all pte/vma/mm related operations that could ever happen on a certain mm. This includes vmtruncate, try_to_unmap, and all page faults. The caller must take the mmap_sem in write mode before calling mm_take_all_locks(). The caller isn't allowed to release the mmap_sem until mm_drop_all_locks() returns. mmap_sem in write mode is required in order to block all operations that could modify pagetables and free pages without needing to alter the vma layout (for example populate_range() with nonlinear vmas). It's also needed in write mode to prevent new anon_vmas from being associated with existing vmas. A single task can't take more than one mm_take_all_locks() in a row or it would deadlock. mm_take_all_locks() and mm_drop_all_locks() are expensive operations that may have to take thousands of locks. mm_take_all_locks() can fail if it's interrupted by signals. When mmu_notifier_register returns, we must be sure that the driver is notified if some task is in the middle of a vmtruncate for the 'mm' where the mmu notifier was registered (mmu_notifier_invalidate_range_start/end is run around the vmtruncation but mmu_notifier_register can run after mmu_notifier_invalidate_range_start and before mmu_notifier_invalidate_range_end). Same problem for rmap paths. We also have to remove page pinning to avoid replicating the tlb_gather logic inside KVM (and GRU doesn't work well with page pinning regardless of needing tlb_gather), so without mm_take_all_locks when vmtruncate frees the page, kvm would have no way to notice that it mapped into sptes a page that is going into the freelist without a chance of any further mmu_notifier notification.
[akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Andrea Arcangeli Acked-by: Linus Torvalds Cc: Christoph Lameter Cc: Jack Steiner Cc: Robin Holt Cc: Nick Piggin Cc: Peter Zijlstra Cc: Kanoj Sarcar Cc: Roland Dreier Cc: Steve Wise Cc: Avi Kivity Cc: Hugh Dickins Cc: Rusty Russell Cc: Anthony Liguori Cc: Chris Wright Cc: Marcelo Tosatti Cc: Eric Dumazet Cc: "Paul E. McKenney" Cc: Izik Eidus Cc: Anthony Liguori Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 3 + include/linux/pagemap.h | 1 + include/linux/rmap.h | 8 +++ mm/mmap.c | 158 ++++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 170 insertions(+) (limited to 'include/linux/mm.h') diff --git a/include/linux/mm.h b/include/linux/mm.h index 6e695eaab4ce..866a3dbe5c75 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1104,6 +1104,9 @@ extern struct vm_area_struct *copy_vma(struct vm_area_struct **, unsigned long addr, unsigned long len, pgoff_t pgoff); extern void exit_mmap(struct mm_struct *); +extern int mm_take_all_locks(struct mm_struct *mm); +extern void mm_drop_all_locks(struct mm_struct *mm); + #ifdef CONFIG_PROC_FS /* From fs/proc/base.c. 
callers must _not_ hold the mm's exe_file_lock */ extern void added_exe_file_vma(struct mm_struct *mm); diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index a81d81890422..a39b38ccdc97 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -20,6 +20,7 @@ */ #define AS_EIO (__GFP_BITS_SHIFT + 0) /* IO error on async write */ #define AS_ENOSPC (__GFP_BITS_SHIFT + 1) /* ENOSPC on async write */ +#define AS_MM_ALL_LOCKS (__GFP_BITS_SHIFT + 2) /* under mm_take_all_locks() */ static inline void mapping_set_error(struct address_space *mapping, int error) { diff --git a/include/linux/rmap.h b/include/linux/rmap.h index 1383692ac5bd..69407f85e10b 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -26,6 +26,14 @@ */ struct anon_vma { spinlock_t lock; /* Serialize access to vma list */ + /* + * NOTE: the LSB of the head.next is set by + * mm_take_all_locks() _after_ taking the above lock. So the + * head must only be read/written after taking the above lock + * to be sure to see a valid next pointer. The LSB bit itself + * is serialized by a system wide lock only visible to + * mm_take_all_locks() (mm_all_locks_mutex). + */ struct list_head head; /* List of private "related" vmas */ }; diff --git a/mm/mmap.c b/mm/mmap.c index 5e0cc99e9cd5..e5f9cb83d6d4 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2268,3 +2268,161 @@ int install_special_mapping(struct mm_struct *mm, return 0; } + +static DEFINE_MUTEX(mm_all_locks_mutex); + +static void vm_lock_anon_vma(struct anon_vma *anon_vma) +{ + if (!test_bit(0, (unsigned long *) &anon_vma->head.next)) { + /* + * The LSB of head.next can't change from under us + * because we hold the mm_all_locks_mutex. + */ + spin_lock(&anon_vma->lock); + /* + * We can safely modify head.next after taking the + * anon_vma->lock. If some other vma in this mm shares + * the same anon_vma we won't take it again. 
+ * + * No need of atomic instructions here, head.next + * can't change from under us thanks to the + * anon_vma->lock. + */ + if (__test_and_set_bit(0, (unsigned long *) + &anon_vma->head.next)) + BUG(); + } +} + +static void vm_lock_mapping(struct address_space *mapping) +{ + if (!test_bit(AS_MM_ALL_LOCKS, &mapping->flags)) { + /* + * AS_MM_ALL_LOCKS can't change from under us because + * we hold the mm_all_locks_mutex. + * + * Operations on ->flags have to be atomic because + * even if AS_MM_ALL_LOCKS is stable thanks to the + * mm_all_locks_mutex, there may be other cpus + * changing other bitflags in parallel to us. + */ + if (test_and_set_bit(AS_MM_ALL_LOCKS, &mapping->flags)) + BUG(); + spin_lock(&mapping->i_mmap_lock); + } +} + +/* + * This operation locks against the VM for all pte/vma/mm related + * operations that could ever happen on a certain mm. This includes + * vmtruncate, try_to_unmap, and all page faults. + * + * The caller must take the mmap_sem in write mode before calling + * mm_take_all_locks(). The caller isn't allowed to release the + * mmap_sem until mm_drop_all_locks() returns. + * + * mmap_sem in write mode is required in order to block all operations + * that could modify pagetables and free pages without need of + * altering the vma layout (for example populate_range() with + * nonlinear vmas). It's also needed in write mode to avoid new + * anon_vmas to be associated with existing vmas. + * + * A single task can't take more than one mm_take_all_locks() in a row + * or it would deadlock. + * + * The LSB in anon_vma->head.next and the AS_MM_ALL_LOCKS bitflag in + * mapping->flags avoid to take the same lock twice, if more than one + * vma in this mm is backed by the same anon_vma or address_space. + * + * We can take all the locks in random order because the VM code + * taking i_mmap_lock or anon_vma->lock outside the mmap_sem never + * takes more than one of them in a row. 
Secondly we're protected + * against a concurrent mm_take_all_locks() by the mm_all_locks_mutex. + * + * mm_take_all_locks() and mm_drop_all_locks are expensive operations + * that may have to take thousand of locks. + * + * mm_take_all_locks() can fail if it's interrupted by signals. + */ +int mm_take_all_locks(struct mm_struct *mm) +{ + struct vm_area_struct *vma; + int ret = -EINTR; + + BUG_ON(down_read_trylock(&mm->mmap_sem)); + + mutex_lock(&mm_all_locks_mutex); + + for (vma = mm->mmap; vma; vma = vma->vm_next) { + if (signal_pending(current)) + goto out_unlock; + if (vma->anon_vma) + vm_lock_anon_vma(vma->anon_vma); + if (vma->vm_file && vma->vm_file->f_mapping) + vm_lock_mapping(vma->vm_file->f_mapping); + } + ret = 0; + +out_unlock: + if (ret) + mm_drop_all_locks(mm); + + return ret; +} + +static void vm_unlock_anon_vma(struct anon_vma *anon_vma) +{ + if (test_bit(0, (unsigned long *) &anon_vma->head.next)) { + /* + * The LSB of head.next can't change to 0 from under + * us because we hold the mm_all_locks_mutex. + * + * We must however clear the bitflag before unlocking + * the vma so the users using the anon_vma->head will + * never see our bitflag. + * + * No need of atomic instructions here, head.next + * can't change from under us until we release the + * anon_vma->lock. + */ + if (!__test_and_clear_bit(0, (unsigned long *) + &anon_vma->head.next)) + BUG(); + spin_unlock(&anon_vma->lock); + } +} + +static void vm_unlock_mapping(struct address_space *mapping) +{ + if (test_bit(AS_MM_ALL_LOCKS, &mapping->flags)) { + /* + * AS_MM_ALL_LOCKS can't change to 0 from under us + * because we hold the mm_all_locks_mutex. + */ + spin_unlock(&mapping->i_mmap_lock); + if (!test_and_clear_bit(AS_MM_ALL_LOCKS, + &mapping->flags)) + BUG(); + } +} + +/* + * The mmap_sem cannot be released by the caller until + * mm_drop_all_locks() returns. 
+ */ +void mm_drop_all_locks(struct mm_struct *mm) +{ + struct vm_area_struct *vma; + + BUG_ON(down_read_trylock(&mm->mmap_sem)); + BUG_ON(!mutex_is_locked(&mm_all_locks_mutex)); + + for (vma = mm->mmap; vma; vma = vma->vm_next) { + if (vma->anon_vma) + vm_unlock_anon_vma(vma->anon_vma); + if (vma->vm_file && vma->vm_file->f_mapping) + vm_unlock_mapping(vma->vm_file->f_mapping); + } + + mutex_unlock(&mm_all_locks_mutex); +} -- cgit v1.2.3