From af1839eb4bd4fe079a125eb199205fceb6ae19e6 Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Mon, 8 Oct 2012 16:28:08 -0700 Subject: Kconfig: clean up the long arch list for the UID16 config option Introduce HAVE_UID16 config option and select it in corresponding architecture Kconfig files. UID16 now only depends on HAVE_UID16. Signed-off-by: Catalin Marinas Acked-by: Geert Uytterhoeven Cc: Russell King Cc: Mike Frysinger Cc: Mikael Starvik Cc: Jesper Nilsson Cc: David Howells Cc: Yoshinori Sato Cc: Martin Schwidefsky Cc: Heiko Carstens Cc: Paul Mundt Cc: "David S. Miller" Cc: Jeff Dike Cc: Richard Weinberger Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. Peter Anvin" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/Kconfig | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index b72777ff32a9..fd5d7c2c2daa 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -10,6 +10,7 @@ config X86_32 def_bool y depends on !64BIT select CLKSRC_I8253 + select HAVE_UID16 config X86_64 def_bool y @@ -2168,6 +2169,7 @@ config IA32_EMULATION bool "IA32 Emulation" depends on X86_64 select COMPAT_BINFMT_ELF + select HAVE_UID16 ---help--- Include code to run legacy 32-bit programs under a 64-bit kernel. You should likely turn this on, unless you're -- cgit v1.2.3 From b69ec42b1b194cc88f04b3fbcda8d3f93182d6c3 Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Mon, 8 Oct 2012 16:28:11 -0700 Subject: Kconfig: clean up the long arch list for the DEBUG_KMEMLEAK config option Introduce HAVE_DEBUG_KMEMLEAK config option and select it in corresponding architecture Kconfig files. DEBUG_KMEMLEAK now only depends on HAVE_DEBUG_KMEMLEAK. Signed-off-by: Catalin Marinas Cc: Russell King Cc: Michal Simek Cc: Ralf Baechle Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Martin Schwidefsky Cc: Heiko Carstens Cc: Paul Mundt Cc: "David S. Miller" Cc: Chris Metcalf Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. 
Peter Anvin" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm/Kconfig | 1 + arch/arm64/Kconfig | 1 + arch/microblaze/Kconfig | 1 + arch/mips/Kconfig | 1 + arch/powerpc/Kconfig | 1 + arch/s390/Kconfig | 1 + arch/sh/Kconfig | 1 + arch/sparc/Kconfig | 1 + arch/tile/Kconfig | 1 + arch/x86/Kconfig | 1 + lib/Kconfig.debug | 8 ++++---- 11 files changed, 14 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index 5f5439672932..2867a7742306 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -25,6 +25,7 @@ config ARM select HAVE_FUNCTION_GRAPH_TRACER if (!THUMB2_KERNEL) select ARCH_BINFMT_ELF_RANDOMIZE_PIE select HAVE_GENERIC_DMA_COHERENT + select HAVE_DEBUG_KMEMLEAK select HAVE_KERNEL_GZIP select HAVE_KERNEL_LZO select HAVE_KERNEL_LZMA diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index e61acae0d891..5dc9273781d6 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -10,6 +10,7 @@ config ARM64 select GENERIC_TIME_VSYSCALL select HARDIRQS_SW_RESEND select HAVE_ARCH_TRACEHOOK + select HAVE_DEBUG_KMEMLEAK select HAVE_DMA_API_DEBUG select HAVE_DMA_ATTRS select HAVE_GENERIC_DMA_COHERENT diff --git a/arch/microblaze/Kconfig b/arch/microblaze/Kconfig index 6133bed2b855..53fd94ab60f0 100644 --- a/arch/microblaze/Kconfig +++ b/arch/microblaze/Kconfig @@ -16,6 +16,7 @@ config MICROBLAZE select OF select OF_EARLY_FLATTREE select ARCH_WANT_IPC_PARSE_VERSION + select HAVE_DEBUG_KMEMLEAK select IRQ_DOMAIN select HAVE_GENERIC_HARDIRQS select GENERIC_IRQ_PROBE diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig index faf65286574e..335115e5bdd9 100644 --- a/arch/mips/Kconfig +++ b/arch/mips/Kconfig @@ -17,6 +17,7 @@ config MIPS select HAVE_FUNCTION_GRAPH_TRACER select HAVE_KPROBES select HAVE_KRETPROBES + select HAVE_DEBUG_KMEMLEAK select ARCH_BINFMT_ELF_RANDOMIZE_PIE select RTC_LIB if !MACH_LOONGSON select GENERIC_ATOMIC64 if !64BIT diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 
4ce0be32d153..6a798a70a6d1 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -113,6 +113,7 @@ config PPC select HAVE_DMA_API_DEBUG select USE_GENERIC_SMP_HELPERS if SMP select HAVE_OPROFILE + select HAVE_DEBUG_KMEMLEAK select HAVE_SYSCALL_WRAPPERS if PPC64 select GENERIC_ATOMIC64 if PPC32 select ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index baba37cfcf84..8c6d7986f6d2 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -80,6 +80,7 @@ config S390 select HAVE_IRQ_WORK select HAVE_PERF_EVENTS select ARCH_HAVE_NMI_SAFE_CMPXCHG + select HAVE_DEBUG_KMEMLEAK select HAVE_KERNEL_GZIP select HAVE_KERNEL_BZIP2 select HAVE_KERNEL_LZMA diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig index f0c85e424777..cfbf3e3c982b 100644 --- a/arch/sh/Kconfig +++ b/arch/sh/Kconfig @@ -16,6 +16,7 @@ config SUPERH select ARCH_HAVE_CUSTOM_GPIO_H select ARCH_HAVE_NMI_SAFE_CMPXCHG if (GUSA_RB || CPU_SH4A) select PERF_USE_VMALLOC + select HAVE_DEBUG_KMEMLEAK select HAVE_KERNEL_GZIP select HAVE_KERNEL_BZIP2 select HAVE_KERNEL_LZMA diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig index e66481015d3b..274d6cf0ada2 100644 --- a/arch/sparc/Kconfig +++ b/arch/sparc/Kconfig @@ -60,6 +60,7 @@ config SPARC64 select HAVE_DYNAMIC_FTRACE select HAVE_FTRACE_MCOUNT_RECORD select HAVE_SYSCALL_TRACEPOINTS + select HAVE_DEBUG_KMEMLEAK select RTC_DRV_CMOS select RTC_DRV_BQ4802 select RTC_DRV_SUN4V diff --git a/arch/tile/Kconfig b/arch/tile/Kconfig index c9a3c1fe7297..9a0d77d3ba14 100644 --- a/arch/tile/Kconfig +++ b/arch/tile/Kconfig @@ -9,6 +9,7 @@ config TILE select GENERIC_FIND_FIRST_BIT select USE_GENERIC_SMP_HELPERS select CC_OPTIMIZE_FOR_SIZE + select HAVE_DEBUG_KMEMLEAK select HAVE_GENERIC_HARDIRQS select GENERIC_IRQ_PROBE select GENERIC_PENDING_IRQ if SMP diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index fd5d7c2c2daa..3fea1848d955 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -66,6 +66,7 @@ config X86 select HAVE_PERF_EVENTS_NMI 
select HAVE_PERF_REGS select HAVE_PERF_USER_STACK_DUMP + select HAVE_DEBUG_KMEMLEAK select ANON_INODES select HAVE_ALIGNED_STRUCT_PAGE if SLUB && !M386 select HAVE_CMPXCHG_LOCAL if !M386 diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 7fba3a98967f..736db3990506 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -450,12 +450,12 @@ config SLUB_STATS out which slabs are relevant to a particular load. Try running: slabinfo -DA +config HAVE_DEBUG_KMEMLEAK + bool + config DEBUG_KMEMLEAK bool "Kernel memory leak detector" - depends on DEBUG_KERNEL && EXPERIMENTAL && \ - (X86 || ARM || PPC || MIPS || S390 || SPARC64 || SUPERH || \ - MICROBLAZE || TILE || ARM64) - + depends on DEBUG_KERNEL && EXPERIMENTAL && HAVE_DEBUG_KMEMLEAK select DEBUG_FS select STACKTRACE if STACKTRACE_SUPPORT select KALLSYMS -- cgit v1.2.3 From 7ac57a89de958fbb5271dc504d0c25e34dbeec32 Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Mon, 8 Oct 2012 16:28:16 -0700 Subject: Kconfig: clean up the "#if defined(arch)" list for exception-trace sysctl entry Introduce SYSCTL_EXCEPTION_TRACE config option and select it in the architectures requiring support for the "exception-trace" debug_table entry in kernel/sysctl.c. Signed-off-by: Catalin Marinas Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Martin Schwidefsky Cc: Heiko Carstens Cc: "David S. Miller" Cc: Chris Metcalf Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. 
Peter Anvin" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm64/Kconfig | 1 + arch/powerpc/Kconfig | 1 + arch/s390/Kconfig | 1 + arch/sparc/Kconfig | 1 + arch/tile/Kconfig | 1 + arch/x86/Kconfig | 1 + init/Kconfig | 5 +++++ kernel/sysctl.c | 3 +-- 8 files changed, 12 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index a30856058742..7ff68c946073 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -28,6 +28,7 @@ config ARM64 select PERF_USE_VMALLOC select RTC_LIB select SPARSE_IRQ + select SYSCTL_EXCEPTION_TRACE help ARM 64-bit (AArch64) Linux support. diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 6a798a70a6d1..df7edb887a04 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -99,6 +99,7 @@ config PPC select HAVE_DYNAMIC_FTRACE select HAVE_FUNCTION_TRACER select HAVE_FUNCTION_GRAPH_TRACER + select SYSCTL_EXCEPTION_TRACE select ARCH_WANT_OPTIONAL_GPIOLIB select HAVE_IDE select HAVE_IOREMAP_PROT diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index 8c6d7986f6d2..ceff7aef2477 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -68,6 +68,7 @@ config S390 select HAVE_FTRACE_MCOUNT_RECORD select HAVE_C_RECORDMCOUNT select HAVE_SYSCALL_TRACEPOINTS + select SYSCTL_EXCEPTION_TRACE select HAVE_DYNAMIC_FTRACE select HAVE_FUNCTION_GRAPH_TRACER select HAVE_REGS_AND_STACK_ACCESS_API diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig index 700a01adec3a..e184075877d7 100644 --- a/arch/sparc/Kconfig +++ b/arch/sparc/Kconfig @@ -18,6 +18,7 @@ config SPARC select HAVE_OPROFILE select HAVE_ARCH_KGDB if !SMP || SPARC64 select HAVE_ARCH_TRACEHOOK + select SYSCTL_EXCEPTION_TRACE select ARCH_WANT_OPTIONAL_GPIOLIB select RTC_CLASS select RTC_DRV_M48T59 diff --git a/arch/tile/Kconfig b/arch/tile/Kconfig index df69d4296b4b..dc46490adca0 100644 --- a/arch/tile/Kconfig +++ b/arch/tile/Kconfig @@ -7,6 +7,7 @@ config TILE select HAVE_DMA_API_DEBUG select HAVE_KVM if 
!TILEGX select GENERIC_FIND_FIRST_BIT + select SYSCTL_EXCEPTION_TRACE select USE_GENERIC_SMP_HELPERS select CC_OPTIMIZE_FOR_SIZE select HAVE_DEBUG_KMEMLEAK diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 3fea1848d955..6119d6c7002e 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -47,6 +47,7 @@ config X86 select HAVE_FUNCTION_GRAPH_FP_TEST select HAVE_FUNCTION_TRACE_MCOUNT_TEST select HAVE_SYSCALL_TRACEPOINTS + select SYSCTL_EXCEPTION_TRACE select HAVE_KVM select HAVE_ARCH_KGDB select HAVE_ARCH_TRACEHOOK diff --git a/init/Kconfig b/init/Kconfig index 38bab420bd9b..4c93533da42c 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1152,6 +1152,11 @@ config SYSCTL_SYSCALL If unsure say N here. +config SYSCTL_EXCEPTION_TRACE + bool + help + Enable support for /proc/sys/debug/exception-trace. + config KALLSYMS bool "Load all symbols for debugging/ksymoops" if EXPERT default y diff --git a/kernel/sysctl.c b/kernel/sysctl.c index c2a2f8084bad..26f65eaa01f9 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1549,8 +1549,7 @@ static struct ctl_table fs_table[] = { }; static struct ctl_table debug_table[] = { -#if defined(CONFIG_X86) || defined(CONFIG_PPC) || defined(CONFIG_SPARC) || \ - defined(CONFIG_S390) || defined(CONFIG_TILE) || defined(CONFIG_ARM64) +#ifdef CONFIG_SYSCTL_EXCEPTION_TRACE { .procname = "exception-trace", .data = &show_unhandled_signals, -- cgit v1.2.3 From b1a86e15dc0304366f50ba1720834bc419c801b1 Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Mon, 8 Oct 2012 16:28:23 -0700 Subject: x86, pat: remove the dependency on 'vm_pgoff' in track/untrack pfn vma routines 'pfn' argument for track_pfn_vma_new() can be used for reserving the attribute for the pfn range. No need to depend on 'vm_pgoff' Similarly, untrack_pfn_vma() can depend on the 'pfn' argument if it is non-zero or can use follow_phys() to get the starting value of the pfn range. Also the non zero 'size' argument can be used instead of recomputing it from vma. 
This cleanup also prepares the ground for the track/untrack pfn vma routines to take over the ownership of setting PAT specific vm_flag in the 'vma'. [khlebnikov@openvz.org: Clear pfn to paddr conversion] Signed-off-by: Suresh Siddha Signed-off-by: Konstantin Khlebnikov Cc: Venkatesh Pallipadi Cc: H. Peter Anvin Cc: Nick Piggin Cc: Ingo Molnar Cc: Alexander Viro Cc: Carsten Otte Cc: Chris Metcalf Cc: Cyrill Gorcunov Cc: Eric Paris Cc: H. Peter Anvin Cc: Hugh Dickins Cc: Ingo Molnar Cc: James Morris Cc: Jason Baron Cc: Kentaro Takeda Cc: Matt Helsley Cc: Nick Piggin Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Robert Richter Cc: Tetsuo Handa Acked-by: Linus Torvalds Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/mm/pat.c | 33 +++++++++++++++++++-------------- 1 file changed, 19 insertions(+), 14 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index 3d68ef6d2266..de36c886cd38 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -704,21 +704,18 @@ int track_pfn_vma_copy(struct vm_area_struct *vma) int track_pfn_vma_new(struct vm_area_struct *vma, pgprot_t *prot, unsigned long pfn, unsigned long size) { + resource_size_t paddr = (resource_size_t)pfn << PAGE_SHIFT; unsigned long flags; - resource_size_t paddr; - unsigned long vma_size = vma->vm_end - vma->vm_start; - if (is_linear_pfn_mapping(vma)) { - /* reserve the whole chunk starting from vm_pgoff */ - paddr = (resource_size_t)vma->vm_pgoff << PAGE_SHIFT; - return reserve_pfn_range(paddr, vma_size, prot, 0); - } + /* reserve the whole chunk starting from paddr */ + if (is_linear_pfn_mapping(vma)) + return reserve_pfn_range(paddr, size, prot, 0); if (!pat_enabled) return 0; /* for vm_insert_pfn and friends, we set prot based on lookup */ - flags = lookup_memtype(pfn << PAGE_SHIFT); + flags = lookup_memtype(paddr); *prot = __pgprot((pgprot_val(vma->vm_page_prot) & (~_PAGE_CACHE_MASK)) | flags); @@ -728,20 +725,28 @@ int track_pfn_vma_new(struct 
vm_area_struct *vma, pgprot_t *prot, /* * untrack_pfn_vma is called while unmapping a pfnmap for a region. * untrack can be called for a specific region indicated by pfn and size or - * can be for the entire vma (in which case size can be zero). + * can be for the entire vma (in which case pfn, size are zero). */ void untrack_pfn_vma(struct vm_area_struct *vma, unsigned long pfn, unsigned long size) { resource_size_t paddr; - unsigned long vma_size = vma->vm_end - vma->vm_start; + unsigned long prot; - if (is_linear_pfn_mapping(vma)) { - /* free the whole chunk starting from vm_pgoff */ - paddr = (resource_size_t)vma->vm_pgoff << PAGE_SHIFT; - free_pfn_range(paddr, vma_size); + if (!is_linear_pfn_mapping(vma)) return; + + /* free the chunk starting from pfn or the whole chunk */ + paddr = (resource_size_t)pfn << PAGE_SHIFT; + if (!paddr && !size) { + if (follow_phys(vma, vma->vm_start, 0, &prot, &paddr)) { + WARN_ON_ONCE(1); + return; + } + + size = vma->vm_end - vma->vm_start; } + free_pfn_range(paddr, size); } pgprot_t pgprot_writecombine(pgprot_t prot) -- cgit v1.2.3 From 5180da410db6369d1f95c9014da1c9bc33fb043e Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Mon, 8 Oct 2012 16:28:29 -0700 Subject: x86, pat: separate the pfn attribute tracking for remap_pfn_range and vm_insert_pfn With PAT enabled, vm_insert_pfn() looks up the existing pfn memory attribute and uses it. Expectation is that the driver reserves the memory attributes for the pfn before calling vm_insert_pfn(). remap_pfn_range() (when called for the whole vma) will setup a new attribute (based on the prot argument) for the specified pfn range. This addresses the legacy usage which typically calls remap_pfn_range() with a desired memory attribute. For ranges smaller than the vma size (which is typically not the case), remap_pfn_range() will use the existing memory attribute for the pfn range. Expose two different API's for these different behaviors. 
track_pfn_insert() for tracking the pfn attribute set by vm_insert_pfn() and track_pfn_remap() for the remap_pfn_range(). This cleanup also prepares the ground for the track/untrack pfn vma routines to take over the ownership of setting PAT specific vm_flag in the 'vma'. [khlebnikov@openvz.org: Clear checks in track_pfn_remap()] [akpm@linux-foundation.org: tweak a few comments] Signed-off-by: Suresh Siddha Signed-off-by: Konstantin Khlebnikov Cc: Venkatesh Pallipadi Cc: H. Peter Anvin Cc: Nick Piggin Cc: Ingo Molnar Cc: Alexander Viro Cc: Carsten Otte Cc: Chris Metcalf Cc: Cyrill Gorcunov Cc: Eric Paris Cc: Hugh Dickins Cc: James Morris Cc: Jason Baron Cc: Kentaro Takeda Cc: Konstantin Khlebnikov Cc: Matt Helsley Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Robert Richter Cc: Suresh Siddha Cc: Tetsuo Handa Acked-by: Linus Torvalds Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/mm/pat.c | 47 +++++++++++++++++++++++++++--------- include/asm-generic/pgtable.h | 55 +++++++++++++++++++++++++------------------ mm/memory.c | 13 ++++------ 3 files changed, 73 insertions(+), 42 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index de36c886cd38..74a702674e86 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -664,13 +664,13 @@ static void free_pfn_range(u64 paddr, unsigned long size) } /* - * track_pfn_vma_copy is called when vma that is covering the pfnmap gets + * track_pfn_copy is called when vma that is covering the pfnmap gets * copied through copy_page_range(). * * If the vma has a linear pfn mapping for the entire range, we get the prot * from pte and reserve the entire vma range with single reserve_pfn_range call. 
*/ -int track_pfn_vma_copy(struct vm_area_struct *vma) +int track_pfn_copy(struct vm_area_struct *vma) { resource_size_t paddr; unsigned long prot; @@ -694,15 +694,12 @@ int track_pfn_vma_copy(struct vm_area_struct *vma) } /* - * track_pfn_vma_new is called when a _new_ pfn mapping is being established - * for physical range indicated by pfn and size. - * * prot is passed in as a parameter for the new mapping. If the vma has a * linear pfn mapping for the entire range reserve the entire vma range with * single reserve_pfn_range call. */ -int track_pfn_vma_new(struct vm_area_struct *vma, pgprot_t *prot, - unsigned long pfn, unsigned long size) +int track_pfn_remap(struct vm_area_struct *vma, pgprot_t *prot, + unsigned long pfn, unsigned long size) { resource_size_t paddr = (resource_size_t)pfn << PAGE_SHIFT; unsigned long flags; @@ -714,8 +711,36 @@ int track_pfn_vma_new(struct vm_area_struct *vma, pgprot_t *prot, if (!pat_enabled) return 0; - /* for vm_insert_pfn and friends, we set prot based on lookup */ + /* + * For anything smaller than the vma size we set prot based on the + * lookup. + */ flags = lookup_memtype(paddr); + + /* Check memtype for the remaining pages */ + while (size > PAGE_SIZE) { + size -= PAGE_SIZE; + paddr += PAGE_SIZE; + if (flags != lookup_memtype(paddr)) + return -EINVAL; + } + + *prot = __pgprot((pgprot_val(vma->vm_page_prot) & (~_PAGE_CACHE_MASK)) | + flags); + + return 0; +} + +int track_pfn_insert(struct vm_area_struct *vma, pgprot_t *prot, + unsigned long pfn) +{ + unsigned long flags; + + if (!pat_enabled) + return 0; + + /* Set prot based on lookup */ + flags = lookup_memtype((resource_size_t)pfn << PAGE_SHIFT); *prot = __pgprot((pgprot_val(vma->vm_page_prot) & (~_PAGE_CACHE_MASK)) | flags); @@ -723,12 +748,12 @@ int track_pfn_vma_new(struct vm_area_struct *vma, pgprot_t *prot, } /* - * untrack_pfn_vma is called while unmapping a pfnmap for a region. + * untrack_pfn is called while unmapping a pfnmap for a region. 
* untrack can be called for a specific region indicated by pfn and size or * can be for the entire vma (in which case pfn, size are zero). */ -void untrack_pfn_vma(struct vm_area_struct *vma, unsigned long pfn, - unsigned long size) +void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn, + unsigned long size) { resource_size_t paddr; unsigned long prot; diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h index ff4947b7a976..d4d4592c97fc 100644 --- a/include/asm-generic/pgtable.h +++ b/include/asm-generic/pgtable.h @@ -381,48 +381,57 @@ static inline void ptep_modify_prot_commit(struct mm_struct *mm, #ifndef __HAVE_PFNMAP_TRACKING /* - * Interface that can be used by architecture code to keep track of - * memory type of pfn mappings (remap_pfn_range, vm_insert_pfn) - * - * track_pfn_vma_new is called when a _new_ pfn mapping is being established - * for physical range indicated by pfn and size. + * Interfaces that can be used by architecture code to keep track of + * memory type of pfn mappings specified by the remap_pfn_range, + * vm_insert_pfn. + */ + +/* + * track_pfn_remap is called when a _new_ pfn mapping is being established + * by remap_pfn_range() for physical range indicated by pfn and size. */ -static inline int track_pfn_vma_new(struct vm_area_struct *vma, pgprot_t *prot, - unsigned long pfn, unsigned long size) +static inline int track_pfn_remap(struct vm_area_struct *vma, pgprot_t *prot, + unsigned long pfn, unsigned long size) { return 0; } /* - * Interface that can be used by architecture code to keep track of - * memory type of pfn mappings (remap_pfn_range, vm_insert_pfn) - * - * track_pfn_vma_copy is called when vma that is covering the pfnmap gets + * track_pfn_insert is called when a _new_ single pfn is established + * by vm_insert_pfn(). 
+ */ +static inline int track_pfn_insert(struct vm_area_struct *vma, pgprot_t *prot, + unsigned long pfn) +{ + return 0; +} + +/* + * track_pfn_copy is called when vma that is covering the pfnmap gets * copied through copy_page_range(). */ -static inline int track_pfn_vma_copy(struct vm_area_struct *vma) +static inline int track_pfn_copy(struct vm_area_struct *vma) { return 0; } /* - * Interface that can be used by architecture code to keep track of - * memory type of pfn mappings (remap_pfn_range, vm_insert_pfn) - * * untrack_pfn_vma is called while unmapping a pfnmap for a region. * untrack can be called for a specific region indicated by pfn and size or - * can be for the entire vma (in which case size can be zero). + * can be for the entire vma (in which case pfn, size are zero). */ -static inline void untrack_pfn_vma(struct vm_area_struct *vma, - unsigned long pfn, unsigned long size) +static inline void untrack_pfn(struct vm_area_struct *vma, + unsigned long pfn, unsigned long size) { } #else -extern int track_pfn_vma_new(struct vm_area_struct *vma, pgprot_t *prot, - unsigned long pfn, unsigned long size); -extern int track_pfn_vma_copy(struct vm_area_struct *vma); -extern void untrack_pfn_vma(struct vm_area_struct *vma, unsigned long pfn, - unsigned long size); +extern int track_pfn_remap(struct vm_area_struct *vma, pgprot_t *prot, + unsigned long pfn, unsigned long size); +extern int track_pfn_insert(struct vm_area_struct *vma, pgprot_t *prot, + unsigned long pfn); +extern int track_pfn_copy(struct vm_area_struct *vma); +extern void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn, + unsigned long size); #endif #ifdef CONFIG_MMU diff --git a/mm/memory.c b/mm/memory.c index 57361708d1a5..6bef278ad303 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1060,7 +1060,7 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, * We do not free on error cases below as remove_vma * gets called on error from higher level routine */ - ret = 
track_pfn_vma_copy(vma); + ret = track_pfn_copy(vma); if (ret) return ret; } @@ -1328,7 +1328,7 @@ static void unmap_single_vma(struct mmu_gather *tlb, uprobe_munmap(vma, start, end); if (unlikely(is_pfn_mapping(vma))) - untrack_pfn_vma(vma, 0, 0); + untrack_pfn(vma, 0, 0); if (start != end) { if (unlikely(is_vm_hugetlb_page(vma))) { @@ -2162,14 +2162,11 @@ int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr, if (addr < vma->vm_start || addr >= vma->vm_end) return -EFAULT; - if (track_pfn_vma_new(vma, &pgprot, pfn, PAGE_SIZE)) + if (track_pfn_insert(vma, &pgprot, pfn)) return -EINVAL; ret = insert_pfn(vma, addr, pfn, pgprot); - if (ret) - untrack_pfn_vma(vma, pfn, PAGE_SIZE); - return ret; } EXPORT_SYMBOL(vm_insert_pfn); @@ -2311,7 +2308,7 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, vma->vm_flags |= VM_IO | VM_RESERVED | VM_PFNMAP; - err = track_pfn_vma_new(vma, &prot, pfn, PAGE_ALIGN(size)); + err = track_pfn_remap(vma, &prot, pfn, PAGE_ALIGN(size)); if (err) { /* * To indicate that track_pfn related cleanup is not @@ -2335,7 +2332,7 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, } while (pgd++, addr = next, addr != end); if (err) - untrack_pfn_vma(vma, pfn, PAGE_ALIGN(size)); + untrack_pfn(vma, pfn, PAGE_ALIGN(size)); return err; } -- cgit v1.2.3 From b3b9c2932c32e0692018ed5f12f3fd8c70eea8ce Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Mon, 8 Oct 2012 16:28:34 -0700 Subject: mm, x86, pat: rework linear pfn-mmap tracking Replace the generic vma-flag VM_PFN_AT_MMAP with x86-only VM_PAT. We can toss mapping address from remap_pfn_range() into track_pfn_vma_new(), and collect all PAT-related logic together in arch/x86/. 
This patch also restores the original frustration-free is_cow_mapping() check in remap_pfn_range(), as it was before commit v2.6.28-rc8-88-g3c8bb73 ("x86: PAT: store vm_pgoff for all linear_over_vma_region mappings - v3") is_linear_pfn_mapping() checks can be removed from mm/huge_memory.c, because it is already handled by VM_PFNMAP in VM_NO_THP bit-mask. [suresh.b.siddha@intel.com: Reset the VM_PAT flag as part of untrack_pfn_vma()] Signed-off-by: Konstantin Khlebnikov Signed-off-by: Suresh Siddha Cc: Venkatesh Pallipadi Cc: H. Peter Anvin Cc: Nick Piggin Cc: Ingo Molnar Cc: Alexander Viro Cc: Carsten Otte Cc: Chris Metcalf Cc: Cyrill Gorcunov Cc: Eric Paris Cc: Hugh Dickins Cc: James Morris Cc: Jason Baron Cc: Kentaro Takeda Cc: Matt Helsley Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Robert Richter Cc: Tetsuo Handa Cc: Venkatesh Pallipadi Acked-by: Linus Torvalds Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/mm/pat.c | 17 ++++++++++++----- include/asm-generic/pgtable.h | 6 ++++-- include/linux/mm.h | 20 +------------------- mm/huge_memory.c | 19 +++---------------- mm/memory.c | 26 ++++++++++---------------- 5 files changed, 30 insertions(+), 58 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index 74a702674e86..0eb572eda406 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -677,7 +677,7 @@ int track_pfn_copy(struct vm_area_struct *vma) unsigned long vma_size = vma->vm_end - vma->vm_start; pgprot_t pgprot; - if (is_linear_pfn_mapping(vma)) { + if (vma->vm_flags & VM_PAT) { /* * reserve the whole chunk covered by vma. We need the * starting address and protection from pte. @@ -699,14 +699,20 @@ int track_pfn_copy(struct vm_area_struct *vma) * single reserve_pfn_range call. 
*/ int track_pfn_remap(struct vm_area_struct *vma, pgprot_t *prot, - unsigned long pfn, unsigned long size) + unsigned long pfn, unsigned long addr, unsigned long size) { resource_size_t paddr = (resource_size_t)pfn << PAGE_SHIFT; unsigned long flags; /* reserve the whole chunk starting from paddr */ - if (is_linear_pfn_mapping(vma)) - return reserve_pfn_range(paddr, size, prot, 0); + if (addr == vma->vm_start && size == (vma->vm_end - vma->vm_start)) { + int ret; + + ret = reserve_pfn_range(paddr, size, prot, 0); + if (!ret) + vma->vm_flags |= VM_PAT; + return ret; + } if (!pat_enabled) return 0; @@ -758,7 +764,7 @@ void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn, resource_size_t paddr; unsigned long prot; - if (!is_linear_pfn_mapping(vma)) + if (!(vma->vm_flags & VM_PAT)) return; /* free the chunk starting from pfn or the whole chunk */ @@ -772,6 +778,7 @@ void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn, size = vma->vm_end - vma->vm_start; } free_pfn_range(paddr, size); + vma->vm_flags &= ~VM_PAT; } pgprot_t pgprot_writecombine(pgprot_t prot) diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h index d4d4592c97fc..c9a612069c8e 100644 --- a/include/asm-generic/pgtable.h +++ b/include/asm-generic/pgtable.h @@ -391,7 +391,8 @@ static inline void ptep_modify_prot_commit(struct mm_struct *mm, * by remap_pfn_range() for physical range indicated by pfn and size. 
*/ static inline int track_pfn_remap(struct vm_area_struct *vma, pgprot_t *prot, - unsigned long pfn, unsigned long size) + unsigned long pfn, unsigned long addr, + unsigned long size) { return 0; } @@ -426,7 +427,8 @@ static inline void untrack_pfn(struct vm_area_struct *vma, } #else extern int track_pfn_remap(struct vm_area_struct *vma, pgprot_t *prot, - unsigned long pfn, unsigned long size); + unsigned long pfn, unsigned long addr, + unsigned long size); extern int track_pfn_insert(struct vm_area_struct *vma, pgprot_t *prot, unsigned long pfn); extern int track_pfn_copy(struct vm_area_struct *vma); diff --git a/include/linux/mm.h b/include/linux/mm.h index 311be906b57d..75d1632d3477 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -117,7 +117,7 @@ extern unsigned int kobjsize(const void *objp); #define VM_CAN_NONLINEAR 0x08000000 /* Has ->fault & does nonlinear pages */ #define VM_MIXEDMAP 0x10000000 /* Can contain "struct page" and pure PFN pages */ #define VM_SAO 0x20000000 /* Strong Access Ordering (powerpc) */ -#define VM_PFN_AT_MMAP 0x40000000 /* PFNMAP vma that is fully mapped at mmap time */ +#define VM_PAT 0x40000000 /* PAT reserves whole VMA at once (x86) */ #define VM_MERGEABLE 0x80000000 /* KSM may merge identical pages */ /* Bits set in the VMA until the stack is in its final location */ @@ -158,24 +158,6 @@ extern pgprot_t protection_map[16]; #define FAULT_FLAG_RETRY_NOWAIT 0x10 /* Don't drop mmap_sem and wait when retrying */ #define FAULT_FLAG_KILLABLE 0x20 /* The fault task is in SIGKILL killable region */ -/* - * This interface is used by x86 PAT code to identify a pfn mapping that is - * linear over entire vma. This is to optimize PAT code that deals with - * marking the physical region with a particular prot. This is not for generic - * mm use. Note also that this check will not work if the pfn mapping is - * linear for a vma starting at physical address 0. 
In which case PAT code - * falls back to slow path of reserving physical range page by page. - */ -static inline int is_linear_pfn_mapping(struct vm_area_struct *vma) -{ - return !!(vma->vm_flags & VM_PFN_AT_MMAP); -} - -static inline int is_pfn_mapping(struct vm_area_struct *vma) -{ - return !!(vma->vm_flags & VM_PFNMAP); -} - /* * vm_fault is filled by the the pagefault handler and passed to the vma's * ->fault function. The vma's ->fault is responsible for returning a bitmask diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 141dbb695097..73cb22ee9665 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1655,11 +1655,7 @@ int khugepaged_enter_vma_merge(struct vm_area_struct *vma) if (vma->vm_ops) /* khugepaged not yet working on file or special mappings */ return 0; - /* - * If is_pfn_mapping() is true is_learn_pfn_mapping() must be - * true too, verify it here. - */ - VM_BUG_ON(is_linear_pfn_mapping(vma) || vma->vm_flags & VM_NO_THP); + VM_BUG_ON(vma->vm_flags & VM_NO_THP); hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK; hend = vma->vm_end & HPAGE_PMD_MASK; if (hstart < hend) @@ -1912,11 +1908,7 @@ static void collapse_huge_page(struct mm_struct *mm, goto out; if (is_vma_temporary_stack(vma)) goto out; - /* - * If is_pfn_mapping() is true is_learn_pfn_mapping() must be - * true too, verify it here. - */ - VM_BUG_ON(is_linear_pfn_mapping(vma) || vma->vm_flags & VM_NO_THP); + VM_BUG_ON(vma->vm_flags & VM_NO_THP); pgd = pgd_offset(mm, address); if (!pgd_present(*pgd)) @@ -2154,12 +2146,7 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, goto skip; if (is_vma_temporary_stack(vma)) goto skip; - /* - * If is_pfn_mapping() is true is_learn_pfn_mapping() - * must be true too, verify it here. 
- */ - VM_BUG_ON(is_linear_pfn_mapping(vma) || - vma->vm_flags & VM_NO_THP); + VM_BUG_ON(vma->vm_flags & VM_NO_THP); hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK; hend = vma->vm_end & HPAGE_PMD_MASK; diff --git a/mm/memory.c b/mm/memory.c index 6bef278ad303..655e1429388a 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1055,7 +1055,7 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, if (is_vm_hugetlb_page(vma)) return copy_hugetlb_page_range(dst_mm, src_mm, vma); - if (unlikely(is_pfn_mapping(vma))) { + if (unlikely(vma->vm_flags & VM_PFNMAP)) { /* * We do not free on error cases below as remove_vma * gets called on error from higher level routine @@ -1327,7 +1327,7 @@ static void unmap_single_vma(struct mmu_gather *tlb, if (vma->vm_file) uprobe_munmap(vma, start, end); - if (unlikely(is_pfn_mapping(vma))) + if (unlikely(vma->vm_flags & VM_PFNMAP)) untrack_pfn(vma, 0, 0); if (start != end) { @@ -2299,26 +2299,20 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, * There's a horrible special case to handle copy-on-write * behaviour that some programs depend on. We mark the "original" * un-COW'ed pages by matching them up with "vma->vm_pgoff". + * See vm_normal_page() for details. 
*/ - if (addr == vma->vm_start && end == vma->vm_end) { + if (is_cow_mapping(vma->vm_flags)) { + if (addr != vma->vm_start || end != vma->vm_end) + return -EINVAL; vma->vm_pgoff = pfn; - vma->vm_flags |= VM_PFN_AT_MMAP; - } else if (is_cow_mapping(vma->vm_flags)) + } + + err = track_pfn_remap(vma, &prot, pfn, addr, PAGE_ALIGN(size)); + if (err) return -EINVAL; vma->vm_flags |= VM_IO | VM_RESERVED | VM_PFNMAP; - err = track_pfn_remap(vma, &prot, pfn, PAGE_ALIGN(size)); - if (err) { - /* - * To indicate that track_pfn related cleanup is not - * needed from higher level routine calling unmap_vmas - */ - vma->vm_flags &= ~(VM_IO | VM_RESERVED | VM_PFNMAP); - vma->vm_flags &= ~VM_PFN_AT_MMAP; - return -EINVAL; - } - BUG_ON(addr >= end); pfn -= addr >> PAGE_SHIFT; pgd = pgd_offset(mm, addr); -- cgit v1.2.3 From 314e51b9851b4f4e8ab302243ff5a6fc6147f379 Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Mon, 8 Oct 2012 16:29:02 -0700 Subject: mm: kill vma flag VM_RESERVED and mm->reserved_vm counter A long time ago, in v2.4, VM_RESERVED kept swapout process off VMA, currently it lost original meaning but still has some effects: | effect | alternative flags -+------------------------+--------------------------------------------- 1| account as reserved_vm | VM_IO 2| skip in core dump | VM_IO, VM_DONTDUMP 3| do not merge or expand | VM_IO, VM_DONTEXPAND, VM_HUGETLB, VM_PFNMAP 4| do not mlock | VM_IO, VM_DONTEXPAND, VM_HUGETLB, VM_PFNMAP This patch removes reserved_vm counter from mm_struct. Seems like nobody cares about it, it does not exported into userspace directly, it only reduces total_vm showed in proc. Thus VM_RESERVED can be replaced with VM_IO or pair VM_DONTEXPAND | VM_DONTDUMP. remap_pfn_range() and io_remap_pfn_range() set VM_IO|VM_DONTEXPAND|VM_DONTDUMP. remap_vmalloc_range() set VM_DONTEXPAND | VM_DONTDUMP. 
[akpm@linux-foundation.org: drivers/vfio/pci/vfio_pci.c fixup] Signed-off-by: Konstantin Khlebnikov Cc: Alexander Viro Cc: Carsten Otte Cc: Chris Metcalf Cc: Cyrill Gorcunov Cc: Eric Paris Cc: H. Peter Anvin Cc: Hugh Dickins Cc: Ingo Molnar Cc: James Morris Cc: Jason Baron Cc: Kentaro Takeda Cc: Matt Helsley Cc: Nick Piggin Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Robert Richter Cc: Suresh Siddha Cc: Tetsuo Handa Cc: Venkatesh Pallipadi Acked-by: Linus Torvalds Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/vm/unevictable-lru.txt | 4 ++-- arch/alpha/kernel/pci-sysfs.c | 2 +- arch/ia64/kernel/perfmon.c | 2 +- arch/ia64/mm/init.c | 3 ++- arch/powerpc/kvm/book3s_hv.c | 2 +- arch/sparc/kernel/pci.c | 2 +- arch/unicore32/kernel/process.c | 2 +- arch/x86/xen/mmu.c | 3 +-- drivers/char/mbcs.c | 2 +- drivers/char/mem.c | 2 +- drivers/char/mspec.c | 2 +- drivers/gpu/drm/drm_gem.c | 2 +- drivers/gpu/drm/drm_vm.c | 10 ++-------- drivers/gpu/drm/exynos/exynos_drm_gem.c | 2 +- drivers/gpu/drm/gma500/framebuffer.c | 3 +-- drivers/gpu/drm/ttm/ttm_bo_vm.c | 4 ++-- drivers/gpu/drm/udl/udl_fb.c | 2 +- drivers/infiniband/hw/ehca/ehca_uverbs.c | 4 ++-- drivers/infiniband/hw/ipath/ipath_file_ops.c | 2 +- drivers/infiniband/hw/qib/qib_file_ops.c | 2 +- drivers/media/pci/meye/meye.c | 2 +- drivers/media/platform/omap/omap_vout.c | 2 +- drivers/media/platform/vino.c | 2 +- drivers/media/usb/sn9c102/sn9c102_core.c | 3 +-- drivers/media/usb/usbvision/usbvision-video.c | 3 +-- drivers/media/v4l2-core/videobuf-dma-sg.c | 2 +- drivers/media/v4l2-core/videobuf-vmalloc.c | 2 +- drivers/media/v4l2-core/videobuf2-memops.c | 2 +- drivers/misc/carma/carma-fpga.c | 2 -- drivers/misc/sgi-gru/grufile.c | 5 ++--- drivers/mtd/mtdchar.c | 2 +- drivers/scsi/sg.c | 2 +- drivers/staging/omapdrm/omap_gem_dmabuf.c | 2 +- drivers/staging/tidspbridge/rmgr/drv_interface.c | 2 +- drivers/uio/uio.c | 4 +--- drivers/usb/mon/mon_bin.c | 2 +- drivers/video/68328fb.c | 2 +- 
drivers/video/aty/atyfb_base.c | 3 +-- drivers/video/fb-puv3.c | 3 +-- drivers/video/fb_defio.c | 2 +- drivers/video/fbmem.c | 3 +-- drivers/video/gbefb.c | 2 +- drivers/video/omap2/omapfb/omapfb-main.c | 2 +- drivers/video/sbuslib.c | 5 ++--- drivers/video/smscufx.c | 1 - drivers/video/udlfb.c | 1 - drivers/video/vermilion/vermilion.c | 1 - drivers/video/vfb.c | 1 - drivers/xen/gntalloc.c | 2 +- drivers/xen/gntdev.c | 2 +- drivers/xen/privcmd.c | 3 ++- fs/binfmt_elf.c | 2 +- fs/binfmt_elf_fdpic.c | 2 +- fs/hugetlbfs/inode.c | 2 +- fs/proc/task_mmu.c | 2 +- include/linux/mempolicy.h | 2 +- include/linux/mm.h | 3 +-- include/linux/mm_types.h | 1 - kernel/events/core.c | 2 +- mm/ksm.c | 3 +-- mm/memory.c | 11 +++++------ mm/mlock.c | 2 +- mm/mmap.c | 2 -- mm/nommu.c | 2 +- mm/vmalloc.c | 3 +-- security/selinux/selinuxfs.c | 2 +- sound/core/pcm_native.c | 6 +++--- sound/usb/usx2y/us122l.c | 2 +- sound/usb/usx2y/usX2Yhwdep.c | 2 +- sound/usb/usx2y/usx2yhwdeppcm.c | 2 +- 70 files changed, 77 insertions(+), 105 deletions(-) (limited to 'arch/x86') diff --git a/Documentation/vm/unevictable-lru.txt b/Documentation/vm/unevictable-lru.txt index fa206cccf89f..323ff5dba1cc 100644 --- a/Documentation/vm/unevictable-lru.txt +++ b/Documentation/vm/unevictable-lru.txt @@ -371,8 +371,8 @@ mlock_fixup() filters several classes of "special" VMAs: mlock_fixup() will call make_pages_present() in the hugetlbfs VMA range to allocate the huge pages and populate the ptes. -3) VMAs with VM_DONTEXPAND or VM_RESERVED are generally userspace mappings of - kernel pages, such as the VDSO page, relay channel pages, etc. These pages +3) VMAs with VM_DONTEXPAND are generally userspace mappings of kernel pages, + such as the VDSO page, relay channel pages, etc. These pages are inherently unevictable and are not managed on the LRU lists. mlock_fixup() treats these VMAs the same as hugetlbfs VMAs. It calls make_pages_present() to populate the ptes. 
diff --git a/arch/alpha/kernel/pci-sysfs.c b/arch/alpha/kernel/pci-sysfs.c index 53649c7d0068..b51f7b4818cd 100644 --- a/arch/alpha/kernel/pci-sysfs.c +++ b/arch/alpha/kernel/pci-sysfs.c @@ -26,7 +26,7 @@ static int hose_mmap_page_range(struct pci_controller *hose, base = sparse ? hose->sparse_io_base : hose->dense_io_base; vma->vm_pgoff += base >> PAGE_SHIFT; - vma->vm_flags |= (VM_IO | VM_RESERVED); + vma->vm_flags |= VM_IO | VM_DONTEXPAND | VM_DONTDUMP; return io_remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff, vma->vm_end - vma->vm_start, diff --git a/arch/ia64/kernel/perfmon.c b/arch/ia64/kernel/perfmon.c index f388b4e18a37..ea39eba61ef5 100644 --- a/arch/ia64/kernel/perfmon.c +++ b/arch/ia64/kernel/perfmon.c @@ -2307,7 +2307,7 @@ pfm_smpl_buffer_alloc(struct task_struct *task, struct file *filp, pfm_context_t */ vma->vm_mm = mm; vma->vm_file = get_file(filp); - vma->vm_flags = VM_READ| VM_MAYREAD |VM_RESERVED; + vma->vm_flags = VM_READ|VM_MAYREAD|VM_DONTEXPAND|VM_DONTDUMP; vma->vm_page_prot = PAGE_READONLY; /* XXX may need to change */ /* diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c index 0eab454867a2..082e383c1b6f 100644 --- a/arch/ia64/mm/init.c +++ b/arch/ia64/mm/init.c @@ -138,7 +138,8 @@ ia64_init_addr_space (void) vma->vm_mm = current->mm; vma->vm_end = PAGE_SIZE; vma->vm_page_prot = __pgprot(pgprot_val(PAGE_READONLY) | _PAGE_MA_NAT); - vma->vm_flags = VM_READ | VM_MAYREAD | VM_IO | VM_RESERVED; + vma->vm_flags = VM_READ | VM_MAYREAD | VM_IO | + VM_DONTEXPAND | VM_DONTDUMP; down_write(&current->mm->mmap_sem); if (insert_vm_struct(current->mm, vma)) { up_write(&current->mm->mmap_sem); diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 83e929e66f9d..721d4603a235 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -1183,7 +1183,7 @@ static const struct vm_operations_struct kvm_rma_vm_ops = { static int kvm_rma_mmap(struct file *file, struct vm_area_struct *vma) { - vma->vm_flags |= VM_RESERVED; +
vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; vma->vm_ops = &kvm_rma_vm_ops; return 0; } diff --git a/arch/sparc/kernel/pci.c b/arch/sparc/kernel/pci.c index acc8c838ff72..75b31bcdeadf 100644 --- a/arch/sparc/kernel/pci.c +++ b/arch/sparc/kernel/pci.c @@ -779,7 +779,7 @@ static int __pci_mmap_make_offset(struct pci_dev *pdev, static void __pci_mmap_set_flags(struct pci_dev *dev, struct vm_area_struct *vma, enum pci_mmap_state mmap_state) { - vma->vm_flags |= (VM_IO | VM_RESERVED); + vma->vm_flags |= VM_IO | VM_DONTEXPAND | VM_DONTDUMP; } /* Set vm_page_prot of VMA, as appropriate for this architecture, for a pci diff --git a/arch/unicore32/kernel/process.c b/arch/unicore32/kernel/process.c index b6f0458c3143..b008586dad75 100644 --- a/arch/unicore32/kernel/process.c +++ b/arch/unicore32/kernel/process.c @@ -380,7 +380,7 @@ int vectors_user_mapping(void) return install_special_mapping(mm, 0xffff0000, PAGE_SIZE, VM_READ | VM_EXEC | VM_MAYREAD | VM_MAYEXEC | - VM_RESERVED, + VM_DONTEXPAND | VM_DONTDUMP, NULL); } diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 5a16824cc2b3..fd28d86fe3d2 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -2451,8 +2451,7 @@ int xen_remap_domain_mfn_range(struct vm_area_struct *vma, prot = __pgprot(pgprot_val(prot) | _PAGE_IOMAP); - BUG_ON(!((vma->vm_flags & (VM_PFNMAP | VM_RESERVED | VM_IO)) == - (VM_PFNMAP | VM_RESERVED | VM_IO))); + BUG_ON(!((vma->vm_flags & (VM_PFNMAP | VM_IO)) == (VM_PFNMAP | VM_IO))); rmd.mfn = mfn; rmd.prot = prot; diff --git a/drivers/char/mbcs.c b/drivers/char/mbcs.c index 0c7d340b9ab9..f74e892711dd 100644 --- a/drivers/char/mbcs.c +++ b/drivers/char/mbcs.c @@ -507,7 +507,7 @@ static int mbcs_gscr_mmap(struct file *fp, struct vm_area_struct *vma) vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); - /* Remap-pfn-range will mark the range VM_IO and VM_RESERVED */ + /* Remap-pfn-range will mark the range VM_IO */ if (remap_pfn_range(vma, vma->vm_start, __pa(soft->gscr_addr) >> PAGE_SHIFT, 
diff --git a/drivers/char/mem.c b/drivers/char/mem.c index e5eedfa24c91..0537903c985b 100644 --- a/drivers/char/mem.c +++ b/drivers/char/mem.c @@ -322,7 +322,7 @@ static int mmap_mem(struct file *file, struct vm_area_struct *vma) vma->vm_ops = &mmap_mem_ops; - /* Remap-pfn-range will mark the range VM_IO and VM_RESERVED */ + /* Remap-pfn-range will mark the range VM_IO */ if (remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff, diff --git a/drivers/char/mspec.c b/drivers/char/mspec.c index 845f97fd1832..e1f60f968fdd 100644 --- a/drivers/char/mspec.c +++ b/drivers/char/mspec.c @@ -286,7 +286,7 @@ mspec_mmap(struct file *file, struct vm_area_struct *vma, atomic_set(&vdata->refcnt, 1); vma->vm_private_data = vdata; - vma->vm_flags |= (VM_IO | VM_RESERVED | VM_PFNMAP | VM_DONTEXPAND); + vma->vm_flags |= VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP; if (vdata->type == MSPEC_FETCHOP || vdata->type == MSPEC_UNCACHED) vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); vma->vm_ops = &mspec_vm_ops; diff --git a/drivers/gpu/drm/drm_gem.c b/drivers/gpu/drm/drm_gem.c index 92177d5aedee..24efae464e2c 100644 --- a/drivers/gpu/drm/drm_gem.c +++ b/drivers/gpu/drm/drm_gem.c @@ -706,7 +706,7 @@ int drm_gem_mmap(struct file *filp, struct vm_area_struct *vma) goto out_unlock; } - vma->vm_flags |= VM_RESERVED | VM_IO | VM_PFNMAP | VM_DONTEXPAND; + vma->vm_flags |= VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP; vma->vm_ops = obj->dev->driver->gem_vm_ops; vma->vm_private_data = map->handle; vma->vm_page_prot = pgprot_writecombine(vm_get_page_prot(vma->vm_flags)); diff --git a/drivers/gpu/drm/drm_vm.c b/drivers/gpu/drm/drm_vm.c index 23a824e6a22a..db7bd292410b 100644 --- a/drivers/gpu/drm/drm_vm.c +++ b/drivers/gpu/drm/drm_vm.c @@ -514,8 +514,7 @@ static int drm_mmap_dma(struct file *filp, struct vm_area_struct *vma) vma->vm_ops = &drm_vm_dma_ops; - vma->vm_flags |= VM_RESERVED; /* Don't swap */ - vma->vm_flags |= VM_DONTEXPAND; + vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; 
drm_vm_open_locked(dev, vma); return 0; @@ -643,21 +642,16 @@ int drm_mmap_locked(struct file *filp, struct vm_area_struct *vma) case _DRM_SHM: vma->vm_ops = &drm_vm_shm_ops; vma->vm_private_data = (void *)map; - /* Don't let this area swap. Change when - DRM_KERNEL advisory is supported. */ - vma->vm_flags |= VM_RESERVED; break; case _DRM_SCATTER_GATHER: vma->vm_ops = &drm_vm_sg_ops; vma->vm_private_data = (void *)map; - vma->vm_flags |= VM_RESERVED; vma->vm_page_prot = drm_dma_prot(map->type, vma); break; default: return -EINVAL; /* This should never happen. */ } - vma->vm_flags |= VM_RESERVED; /* Don't swap */ - vma->vm_flags |= VM_DONTEXPAND; + vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; drm_vm_open_locked(dev, vma); return 0; diff --git a/drivers/gpu/drm/exynos/exynos_drm_gem.c b/drivers/gpu/drm/exynos/exynos_drm_gem.c index fcdbe46914f7..d2545560664f 100644 --- a/drivers/gpu/drm/exynos/exynos_drm_gem.c +++ b/drivers/gpu/drm/exynos/exynos_drm_gem.c @@ -500,7 +500,7 @@ static int exynos_drm_gem_mmap_buffer(struct file *filp, DRM_DEBUG_KMS("%s\n", __FILE__); - vma->vm_flags |= (VM_IO | VM_RESERVED); + vma->vm_flags |= VM_IO | VM_DONTEXPAND | VM_DONTDUMP; update_vm_cache_attr(exynos_gem_obj, vma); diff --git a/drivers/gpu/drm/gma500/framebuffer.c b/drivers/gpu/drm/gma500/framebuffer.c index 884ba73ac6ce..afded54dbb10 100644 --- a/drivers/gpu/drm/gma500/framebuffer.c +++ b/drivers/gpu/drm/gma500/framebuffer.c @@ -178,8 +178,7 @@ static int psbfb_mmap(struct fb_info *info, struct vm_area_struct *vma) */ vma->vm_ops = &psbfb_vm_ops; vma->vm_private_data = (void *)psbfb; - vma->vm_flags |= VM_RESERVED | VM_IO | - VM_MIXEDMAP | VM_DONTEXPAND; + vma->vm_flags |= VM_IO | VM_MIXEDMAP | VM_DONTEXPAND | VM_DONTDUMP; return 0; } diff --git a/drivers/gpu/drm/ttm/ttm_bo_vm.c b/drivers/gpu/drm/ttm/ttm_bo_vm.c index a877813571a4..3ba72dbdc4bd 100644 --- a/drivers/gpu/drm/ttm/ttm_bo_vm.c +++ b/drivers/gpu/drm/ttm/ttm_bo_vm.c @@ -285,7 +285,7 @@ int ttm_bo_mmap(struct file 
*filp, struct vm_area_struct *vma, */ vma->vm_private_data = bo; - vma->vm_flags |= VM_RESERVED | VM_IO | VM_MIXEDMAP | VM_DONTEXPAND; + vma->vm_flags |= VM_IO | VM_MIXEDMAP | VM_DONTEXPAND | VM_DONTDUMP; return 0; out_unref: ttm_bo_unref(&bo); @@ -300,7 +300,7 @@ int ttm_fbdev_mmap(struct vm_area_struct *vma, struct ttm_buffer_object *bo) vma->vm_ops = &ttm_bo_vm_ops; vma->vm_private_data = ttm_bo_reference(bo); - vma->vm_flags |= VM_RESERVED | VM_IO | VM_MIXEDMAP | VM_DONTEXPAND; + vma->vm_flags |= VM_IO | VM_MIXEDMAP | VM_DONTEXPAND; return 0; } EXPORT_SYMBOL(ttm_fbdev_mmap); diff --git a/drivers/gpu/drm/udl/udl_fb.c b/drivers/gpu/drm/udl/udl_fb.c index 67df842fbb33..69a2b16f42a6 100644 --- a/drivers/gpu/drm/udl/udl_fb.c +++ b/drivers/gpu/drm/udl/udl_fb.c @@ -243,7 +243,7 @@ static int udl_fb_mmap(struct fb_info *info, struct vm_area_struct *vma) size = 0; } - vma->vm_flags |= VM_RESERVED; /* avoid to swap out this VMA */ + /* VM_IO | VM_DONTEXPAND | VM_DONTDUMP are set by remap_pfn_range() */ return 0; } diff --git a/drivers/infiniband/hw/ehca/ehca_uverbs.c b/drivers/infiniband/hw/ehca/ehca_uverbs.c index 45ee89b65c23..1a1d5d99fcf9 100644 --- a/drivers/infiniband/hw/ehca/ehca_uverbs.c +++ b/drivers/infiniband/hw/ehca/ehca_uverbs.c @@ -117,7 +117,7 @@ static int ehca_mmap_fw(struct vm_area_struct *vma, struct h_galpas *galpas, physical = galpas->user.fw_handle; vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); ehca_gen_dbg("vsize=%llx physical=%llx", vsize, physical); - /* VM_IO | VM_RESERVED are set by remap_pfn_range() */ + /* VM_IO | VM_DONTEXPAND | VM_DONTDUMP are set by remap_pfn_range() */ ret = remap_4k_pfn(vma, vma->vm_start, physical >> EHCA_PAGESHIFT, vma->vm_page_prot); if (unlikely(ret)) { @@ -139,7 +139,7 @@ static int ehca_mmap_queue(struct vm_area_struct *vma, struct ipz_queue *queue, u64 start, ofs; struct page *page; - vma->vm_flags |= VM_RESERVED; + vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; start = vma->vm_start; for (ofs = 0; ofs < 
queue->queue_length; ofs += PAGE_SIZE) { u64 virt_addr = (u64)ipz_qeit_calc(queue, ofs); diff --git a/drivers/infiniband/hw/ipath/ipath_file_ops.c b/drivers/infiniband/hw/ipath/ipath_file_ops.c index 736d9edbdbe7..3eb7e454849b 100644 --- a/drivers/infiniband/hw/ipath/ipath_file_ops.c +++ b/drivers/infiniband/hw/ipath/ipath_file_ops.c @@ -1225,7 +1225,7 @@ static int mmap_kvaddr(struct vm_area_struct *vma, u64 pgaddr, vma->vm_pgoff = (unsigned long) addr >> PAGE_SHIFT; vma->vm_ops = &ipath_file_vm_ops; - vma->vm_flags |= VM_RESERVED | VM_DONTEXPAND; + vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; ret = 1; bail: diff --git a/drivers/infiniband/hw/qib/qib_file_ops.c b/drivers/infiniband/hw/qib/qib_file_ops.c index faa44cb08071..959a5c4ff812 100644 --- a/drivers/infiniband/hw/qib/qib_file_ops.c +++ b/drivers/infiniband/hw/qib/qib_file_ops.c @@ -971,7 +971,7 @@ static int mmap_kvaddr(struct vm_area_struct *vma, u64 pgaddr, vma->vm_pgoff = (unsigned long) addr >> PAGE_SHIFT; vma->vm_ops = &qib_file_vm_ops; - vma->vm_flags |= VM_RESERVED | VM_DONTEXPAND; + vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; ret = 1; bail: diff --git a/drivers/media/pci/meye/meye.c b/drivers/media/pci/meye/meye.c index 7bc775219f97..e5a76da86081 100644 --- a/drivers/media/pci/meye/meye.c +++ b/drivers/media/pci/meye/meye.c @@ -1647,7 +1647,7 @@ static int meye_mmap(struct file *file, struct vm_area_struct *vma) vma->vm_ops = &meye_vm_ops; vma->vm_flags &= ~VM_IO; /* not I/O memory */ - vma->vm_flags |= VM_RESERVED; /* avoid to swap out this VMA */ + vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; vma->vm_private_data = (void *) (offset / gbufsize); meye_vm_open(vma); diff --git a/drivers/media/platform/omap/omap_vout.c b/drivers/media/platform/omap/omap_vout.c index 66ac21d466af..134016f0e660 100644 --- a/drivers/media/platform/omap/omap_vout.c +++ b/drivers/media/platform/omap/omap_vout.c @@ -911,7 +911,7 @@ static int omap_vout_mmap(struct file *file, struct vm_area_struct *vma) 
q->bufs[i]->baddr = vma->vm_start; - vma->vm_flags |= VM_RESERVED; + vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; vma->vm_page_prot = pgprot_writecombine(vma->vm_page_prot); vma->vm_ops = &omap_vout_vm_ops; vma->vm_private_data = (void *) vout; diff --git a/drivers/media/platform/vino.c b/drivers/media/platform/vino.c index 790d96cffeea..70b0bf4b2900 100644 --- a/drivers/media/platform/vino.c +++ b/drivers/media/platform/vino.c @@ -3950,7 +3950,7 @@ found: fb->map_count = 1; - vma->vm_flags |= VM_DONTEXPAND | VM_RESERVED; + vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; vma->vm_flags &= ~VM_IO; vma->vm_private_data = fb; vma->vm_file = file; diff --git a/drivers/media/usb/sn9c102/sn9c102_core.c b/drivers/media/usb/sn9c102/sn9c102_core.c index 19ea780b16ff..5bfc8e2f018f 100644 --- a/drivers/media/usb/sn9c102/sn9c102_core.c +++ b/drivers/media/usb/sn9c102/sn9c102_core.c @@ -2126,8 +2126,7 @@ static int sn9c102_mmap(struct file* filp, struct vm_area_struct *vma) return -EINVAL; } - vma->vm_flags |= VM_IO; - vma->vm_flags |= VM_RESERVED; + vma->vm_flags |= VM_IO | VM_DONTEXPAND | VM_DONTDUMP; pos = cam->frame[i].bufmem; while (size > 0) { /* size is page-aligned */ diff --git a/drivers/media/usb/usbvision/usbvision-video.c b/drivers/media/usb/usbvision/usbvision-video.c index f67018ed3795..5c36a57e6590 100644 --- a/drivers/media/usb/usbvision/usbvision-video.c +++ b/drivers/media/usb/usbvision/usbvision-video.c @@ -1108,8 +1108,7 @@ static int usbvision_mmap(struct file *file, struct vm_area_struct *vma) } /* VM_IO is eventually going to replace PageReserved altogether */ - vma->vm_flags |= VM_IO; - vma->vm_flags |= VM_RESERVED; /* avoid to swap out this VMA */ + vma->vm_flags |= VM_IO | VM_DONTEXPAND | VM_DONTDUMP; pos = usbvision->frame[i].data; while (size > 0) { diff --git a/drivers/media/v4l2-core/videobuf-dma-sg.c b/drivers/media/v4l2-core/videobuf-dma-sg.c index f300deafd268..828e7c10bd70 100644 --- a/drivers/media/v4l2-core/videobuf-dma-sg.c +++ 
b/drivers/media/v4l2-core/videobuf-dma-sg.c @@ -582,7 +582,7 @@ static int __videobuf_mmap_mapper(struct videobuf_queue *q, map->count = 1; map->q = q; vma->vm_ops = &videobuf_vm_ops; - vma->vm_flags |= VM_DONTEXPAND | VM_RESERVED; + vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; vma->vm_flags &= ~VM_IO; /* using shared anonymous pages */ vma->vm_private_data = map; dprintk(1, "mmap %p: q=%p %08lx-%08lx pgoff %08lx bufs %d-%d\n", diff --git a/drivers/media/v4l2-core/videobuf-vmalloc.c b/drivers/media/v4l2-core/videobuf-vmalloc.c index df142580e44c..2ff7fcc77b11 100644 --- a/drivers/media/v4l2-core/videobuf-vmalloc.c +++ b/drivers/media/v4l2-core/videobuf-vmalloc.c @@ -270,7 +270,7 @@ static int __videobuf_mmap_mapper(struct videobuf_queue *q, } vma->vm_ops = &videobuf_vm_ops; - vma->vm_flags |= VM_DONTEXPAND | VM_RESERVED; + vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; vma->vm_private_data = map; dprintk(1, "mmap %p: q=%p %08lx-%08lx (%lx) pgoff %08lx buf %d\n", diff --git a/drivers/media/v4l2-core/videobuf2-memops.c b/drivers/media/v4l2-core/videobuf2-memops.c index 504cd4cbe29e..051ea3571b20 100644 --- a/drivers/media/v4l2-core/videobuf2-memops.c +++ b/drivers/media/v4l2-core/videobuf2-memops.c @@ -163,7 +163,7 @@ int vb2_mmap_pfn_range(struct vm_area_struct *vma, unsigned long paddr, return ret; } - vma->vm_flags |= VM_DONTEXPAND | VM_RESERVED; + vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; vma->vm_private_data = priv; vma->vm_ops = vm_ops; diff --git a/drivers/misc/carma/carma-fpga.c b/drivers/misc/carma/carma-fpga.c index 0c43297ed9ac..8835eabb3b87 100644 --- a/drivers/misc/carma/carma-fpga.c +++ b/drivers/misc/carma/carma-fpga.c @@ -1243,8 +1243,6 @@ static int data_mmap(struct file *filp, struct vm_area_struct *vma) return -EINVAL; } - /* IO memory (stop cacheing) */ - vma->vm_flags |= VM_IO | VM_RESERVED; vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); return io_remap_pfn_range(vma, vma->vm_start, addr, vsize, diff --git 
a/drivers/misc/sgi-gru/grufile.c b/drivers/misc/sgi-gru/grufile.c index ecafa4ba238b..492c8cac69ac 100644 --- a/drivers/misc/sgi-gru/grufile.c +++ b/drivers/misc/sgi-gru/grufile.c @@ -108,9 +108,8 @@ static int gru_file_mmap(struct file *file, struct vm_area_struct *vma) vma->vm_end & (GRU_GSEG_PAGESIZE - 1)) return -EINVAL; - vma->vm_flags |= - (VM_IO | VM_DONTCOPY | VM_LOCKED | VM_DONTEXPAND | VM_PFNMAP | - VM_RESERVED); + vma->vm_flags |= VM_IO | VM_PFNMAP | VM_LOCKED | + VM_DONTCOPY | VM_DONTEXPAND | VM_DONTDUMP; vma->vm_page_prot = PAGE_SHARED; vma->vm_ops = &gru_vm_ops; diff --git a/drivers/mtd/mtdchar.c b/drivers/mtd/mtdchar.c index a6e74514e662..73ae81a629f2 100644 --- a/drivers/mtd/mtdchar.c +++ b/drivers/mtd/mtdchar.c @@ -1182,7 +1182,7 @@ static int mtdchar_mmap(struct file *file, struct vm_area_struct *vma) return -EINVAL; if (set_vm_offset(vma, off) < 0) return -EINVAL; - vma->vm_flags |= VM_IO | VM_RESERVED; + vma->vm_flags |= VM_IO | VM_DONTEXPAND | VM_DONTDUMP; #ifdef pgprot_noncached if (file->f_flags & O_DSYNC || off >= __pa(high_memory)) diff --git a/drivers/scsi/sg.c b/drivers/scsi/sg.c index 9c5c5f2b3962..be2c9a6561ff 100644 --- a/drivers/scsi/sg.c +++ b/drivers/scsi/sg.c @@ -1257,7 +1257,7 @@ sg_mmap(struct file *filp, struct vm_area_struct *vma) } sfp->mmap_called = 1; - vma->vm_flags |= VM_RESERVED; + vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; vma->vm_private_data = sfp; vma->vm_ops = &sg_mmap_vm_ops; return 0; diff --git a/drivers/staging/omapdrm/omap_gem_dmabuf.c b/drivers/staging/omapdrm/omap_gem_dmabuf.c index 42728e0cc194..c6f3ef6f57b9 100644 --- a/drivers/staging/omapdrm/omap_gem_dmabuf.c +++ b/drivers/staging/omapdrm/omap_gem_dmabuf.c @@ -160,7 +160,7 @@ static int omap_gem_dmabuf_mmap(struct dma_buf *buffer, goto out_unlock; } - vma->vm_flags |= VM_RESERVED | VM_IO | VM_PFNMAP | VM_DONTEXPAND; + vma->vm_flags |= VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP; vma->vm_ops = obj->dev->driver->gem_vm_ops; vma->vm_private_data = obj; 
vma->vm_page_prot = pgprot_writecombine(vm_get_page_prot(vma->vm_flags)); diff --git a/drivers/staging/tidspbridge/rmgr/drv_interface.c b/drivers/staging/tidspbridge/rmgr/drv_interface.c index bddea1d3b2c3..701a11ac676d 100644 --- a/drivers/staging/tidspbridge/rmgr/drv_interface.c +++ b/drivers/staging/tidspbridge/rmgr/drv_interface.c @@ -261,7 +261,7 @@ static int bridge_mmap(struct file *filp, struct vm_area_struct *vma) { u32 status; - vma->vm_flags |= VM_RESERVED | VM_IO; + /* VM_IO | VM_DONTEXPAND | VM_DONTDUMP are set by remap_pfn_range() */ vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); dev_dbg(bridge, "%s: vm filp %p start %lx end %lx page_prot %ulx " diff --git a/drivers/uio/uio.c b/drivers/uio/uio.c index a783d533a1a6..5110f367f1f1 100644 --- a/drivers/uio/uio.c +++ b/drivers/uio/uio.c @@ -653,8 +653,6 @@ static int uio_mmap_physical(struct vm_area_struct *vma) if (mi < 0) return -EINVAL; - vma->vm_flags |= VM_IO | VM_RESERVED; - vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); return remap_pfn_range(vma, @@ -666,7 +664,7 @@ static int uio_mmap_physical(struct vm_area_struct *vma) static int uio_mmap_logical(struct vm_area_struct *vma) { - vma->vm_flags |= VM_RESERVED; + vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; vma->vm_ops = &uio_vm_ops; uio_vma_open(vma); return 0; diff --git a/drivers/usb/mon/mon_bin.c b/drivers/usb/mon/mon_bin.c index 91cd85076a44..9a62e89d6dc0 100644 --- a/drivers/usb/mon/mon_bin.c +++ b/drivers/usb/mon/mon_bin.c @@ -1247,7 +1247,7 @@ static int mon_bin_mmap(struct file *filp, struct vm_area_struct *vma) { /* don't do anything here: "fault" will set up page table entries */ vma->vm_ops = &mon_bin_vm_ops; - vma->vm_flags |= VM_RESERVED; + vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; vma->vm_private_data = filp->private_data; mon_bin_vma_open(vma); return 0; diff --git a/drivers/video/68328fb.c b/drivers/video/68328fb.c index a425d65d5ba2..fa44fbed397d 100644 --- a/drivers/video/68328fb.c +++ 
b/drivers/video/68328fb.c @@ -400,7 +400,7 @@ static int mc68x328fb_mmap(struct fb_info *info, struct vm_area_struct *vma) #ifndef MMU /* this is uClinux (no MMU) specific code */ - vma->vm_flags |= VM_RESERVED; + vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; vma->vm_start = videomemory; return 0; diff --git a/drivers/video/aty/atyfb_base.c b/drivers/video/aty/atyfb_base.c index 3f2e8c13f1ca..868932f904ef 100644 --- a/drivers/video/aty/atyfb_base.c +++ b/drivers/video/aty/atyfb_base.c @@ -1942,8 +1942,7 @@ static int atyfb_mmap(struct fb_info *info, struct vm_area_struct *vma) off = vma->vm_pgoff << PAGE_SHIFT; size = vma->vm_end - vma->vm_start; - /* To stop the swapper from even considering these pages. */ - vma->vm_flags |= (VM_IO | VM_RESERVED); + /* VM_IO | VM_DONTEXPAND | VM_DONTDUMP are set by remap_pfn_range() */ if (((vma->vm_pgoff == 0) && (size == info->fix.smem_len)) || ((off == info->fix.smem_len) && (size == PAGE_SIZE))) diff --git a/drivers/video/fb-puv3.c b/drivers/video/fb-puv3.c index 60a787fa32cf..7d106f1f4906 100644 --- a/drivers/video/fb-puv3.c +++ b/drivers/video/fb-puv3.c @@ -653,9 +653,8 @@ int unifb_mmap(struct fb_info *info, vma->vm_page_prot)) return -EAGAIN; - vma->vm_flags |= VM_RESERVED; /* avoid to swap out this VMA */ + /* VM_IO | VM_DONTEXPAND | VM_DONTDUMP are set by remap_pfn_range() */ return 0; - } static struct fb_ops unifb_ops = { diff --git a/drivers/video/fb_defio.c b/drivers/video/fb_defio.c index 64cda560c488..88cad6b8b479 100644 --- a/drivers/video/fb_defio.c +++ b/drivers/video/fb_defio.c @@ -166,7 +166,7 @@ static const struct address_space_operations fb_deferred_io_aops = { static int fb_deferred_io_mmap(struct fb_info *info, struct vm_area_struct *vma) { vma->vm_ops = &fb_deferred_io_vm_ops; - vma->vm_flags |= ( VM_RESERVED | VM_DONTEXPAND ); + vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; if (!(info->flags & FBINFO_VIRTFB)) vma->vm_flags |= VM_IO; vma->vm_private_data = info; diff --git a/drivers/video/fbmem.c 
b/drivers/video/fbmem.c index 0dff12a1daef..3ff0105a496a 100644 --- a/drivers/video/fbmem.c +++ b/drivers/video/fbmem.c @@ -1410,8 +1410,7 @@ fb_mmap(struct file *file, struct vm_area_struct * vma) return -EINVAL; off += start; vma->vm_pgoff = off >> PAGE_SHIFT; - /* This is an IO map - tell maydump to skip this VMA */ - vma->vm_flags |= VM_IO | VM_RESERVED; + /* VM_IO | VM_DONTEXPAND | VM_DONTDUMP are set by io_remap_pfn_range()*/ vma->vm_page_prot = vm_get_page_prot(vma->vm_flags); fb_pgprotect(file, vma, off); if (io_remap_pfn_range(vma, vma->vm_start, off >> PAGE_SHIFT, diff --git a/drivers/video/gbefb.c b/drivers/video/gbefb.c index 7e7b7a9ba274..05e2a8a99d8f 100644 --- a/drivers/video/gbefb.c +++ b/drivers/video/gbefb.c @@ -1024,7 +1024,7 @@ static int gbefb_mmap(struct fb_info *info, pgprot_val(vma->vm_page_prot) = pgprot_fb(pgprot_val(vma->vm_page_prot)); - vma->vm_flags |= VM_IO | VM_RESERVED; + /* VM_IO | VM_DONTEXPAND | VM_DONTDUMP are set by remap_pfn_range() */ /* look for the starting tile */ tile = &gbe_tiles.cpu[offset >> TILE_SHIFT]; diff --git a/drivers/video/omap2/omapfb/omapfb-main.c b/drivers/video/omap2/omapfb/omapfb-main.c index 3c39aa8de928..15373f4aee19 100644 --- a/drivers/video/omap2/omapfb/omapfb-main.c +++ b/drivers/video/omap2/omapfb/omapfb-main.c @@ -1128,7 +1128,7 @@ static int omapfb_mmap(struct fb_info *fbi, struct vm_area_struct *vma) DBG("user mmap region start %lx, len %d, off %lx\n", start, len, off); vma->vm_pgoff = off >> PAGE_SHIFT; - vma->vm_flags |= VM_IO | VM_RESERVED; + /* VM_IO | VM_DONTEXPAND | VM_DONTDUMP are set by remap_pfn_range() */ vma->vm_page_prot = pgprot_writecombine(vma->vm_page_prot); vma->vm_ops = &mmap_user_ops; vma->vm_private_data = rg; diff --git a/drivers/video/sbuslib.c b/drivers/video/sbuslib.c index 3c1de981a18c..296afae442f4 100644 --- a/drivers/video/sbuslib.c +++ b/drivers/video/sbuslib.c @@ -57,9 +57,8 @@ int sbusfb_mmap_helper(struct sbus_mmap_map *map, off = vma->vm_pgoff << PAGE_SHIFT; - /* 
To stop the swapper from even considering these pages */ - vma->vm_flags |= (VM_IO | VM_RESERVED); - + /* VM_IO | VM_DONTEXPAND | VM_DONTDUMP are set by remap_pfn_range() */ + vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); /* Each page, see which map applies */ diff --git a/drivers/video/smscufx.c b/drivers/video/smscufx.c index 5533a32c6ca1..97bd6620c364 100644 --- a/drivers/video/smscufx.c +++ b/drivers/video/smscufx.c @@ -803,7 +803,6 @@ static int ufx_ops_mmap(struct fb_info *info, struct vm_area_struct *vma) size = 0; } - vma->vm_flags |= VM_RESERVED; /* avoid to swap out this VMA */ return 0; } diff --git a/drivers/video/udlfb.c b/drivers/video/udlfb.c index 8af64148294b..f45eba3d6150 100644 --- a/drivers/video/udlfb.c +++ b/drivers/video/udlfb.c @@ -345,7 +345,6 @@ static int dlfb_ops_mmap(struct fb_info *info, struct vm_area_struct *vma) size = 0; } - vma->vm_flags |= VM_RESERVED; /* avoid to swap out this VMA */ return 0; } diff --git a/drivers/video/vermilion/vermilion.c b/drivers/video/vermilion/vermilion.c index 970e43d13f52..89aef343e295 100644 --- a/drivers/video/vermilion/vermilion.c +++ b/drivers/video/vermilion/vermilion.c @@ -1018,7 +1018,6 @@ static int vmlfb_mmap(struct fb_info *info, struct vm_area_struct *vma) offset += vinfo->vram_start; pgprot_val(vma->vm_page_prot) |= _PAGE_PCD; pgprot_val(vma->vm_page_prot) &= ~_PAGE_PWT; - vma->vm_flags |= VM_RESERVED | VM_IO; if (remap_pfn_range(vma, vma->vm_start, offset >> PAGE_SHIFT, size, vma->vm_page_prot)) return -EAGAIN; diff --git a/drivers/video/vfb.c b/drivers/video/vfb.c index 501a922aa9dc..c7f692525b88 100644 --- a/drivers/video/vfb.c +++ b/drivers/video/vfb.c @@ -439,7 +439,6 @@ static int vfb_mmap(struct fb_info *info, size = 0; } - vma->vm_flags |= VM_RESERVED; /* avoid to swap out this VMA */ return 0; } diff --git a/drivers/xen/gntalloc.c b/drivers/xen/gntalloc.c index 934985d14c24..4097987b330e 100644 --- a/drivers/xen/gntalloc.c +++ b/drivers/xen/gntalloc.c @@ -535,7 +535,7 
@@ static int gntalloc_mmap(struct file *filp, struct vm_area_struct *vma) vma->vm_private_data = vm_priv; - vma->vm_flags |= VM_RESERVED | VM_DONTEXPAND; + vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; vma->vm_ops = &gntalloc_vmops; diff --git a/drivers/xen/gntdev.c b/drivers/xen/gntdev.c index 5df9fd847b2e..610bfc6be177 100644 --- a/drivers/xen/gntdev.c +++ b/drivers/xen/gntdev.c @@ -720,7 +720,7 @@ static int gntdev_mmap(struct file *flip, struct vm_area_struct *vma) vma->vm_ops = &gntdev_vmops; - vma->vm_flags |= VM_RESERVED|VM_DONTEXPAND; + vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; if (use_ptemod) vma->vm_flags |= VM_DONTCOPY; diff --git a/drivers/xen/privcmd.c b/drivers/xen/privcmd.c index ef6389580b8c..8adb9cc267f9 100644 --- a/drivers/xen/privcmd.c +++ b/drivers/xen/privcmd.c @@ -455,7 +455,8 @@ static int privcmd_mmap(struct file *file, struct vm_area_struct *vma) { /* DONTCOPY is essential for Xen because copy_page_range doesn't know * how to recreate these mappings */ - vma->vm_flags |= VM_RESERVED | VM_IO | VM_DONTCOPY | VM_PFNMAP; + vma->vm_flags |= VM_IO | VM_PFNMAP | VM_DONTCOPY | + VM_DONTEXPAND | VM_DONTDUMP; vma->vm_ops = &privcmd_vm_ops; vma->vm_private_data = NULL; diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 2b72d26e2e4b..e800dec958c3 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -1135,7 +1135,7 @@ static unsigned long vma_dump_size(struct vm_area_struct *vma, } /* Do not dump I/O mapped devices or special mappings */ - if (vma->vm_flags & (VM_IO | VM_RESERVED)) + if (vma->vm_flags & VM_IO) return 0; /* By default, dump shared memory if mapped from an anonymous file. 
*/ diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c index 08d812b32282..262db114ff01 100644 --- a/fs/binfmt_elf_fdpic.c +++ b/fs/binfmt_elf_fdpic.c @@ -1205,7 +1205,7 @@ static int maydump(struct vm_area_struct *vma, unsigned long mm_flags) int dump_ok; /* Do not dump I/O mapped devices or special mappings */ - if (vma->vm_flags & (VM_IO | VM_RESERVED)) { + if (vma->vm_flags & VM_IO) { kdcore("%08lx: %08lx: no (IO)", vma->vm_start, vma->vm_flags); return 0; } diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 9460120a5170..0a0ab8e21b19 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -110,7 +110,7 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma) * way when do_mmap_pgoff unwinds (may be important on powerpc * and ia64). */ - vma->vm_flags |= VM_HUGETLB | VM_RESERVED; + vma->vm_flags |= VM_HUGETLB | VM_DONTEXPAND | VM_DONTDUMP; vma->vm_ops = &hugetlb_vm_ops; if (vma->vm_pgoff & (~huge_page_mask(h) >> PAGE_SHIFT)) diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 4540b8f76f16..79827ce03e3b 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -54,7 +54,7 @@ void task_mem(struct seq_file *m, struct mm_struct *mm) "VmPTE:\t%8lu kB\n" "VmSwap:\t%8lu kB\n", hiwater_vm << (PAGE_SHIFT-10), - (total_vm - mm->reserved_vm) << (PAGE_SHIFT-10), + total_vm << (PAGE_SHIFT-10), mm->locked_vm << (PAGE_SHIFT-10), mm->pinned_vm << (PAGE_SHIFT-10), hiwater_rss << (PAGE_SHIFT-10), diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h index 95b738c7abff..ba7a0ff19d39 100644 --- a/include/linux/mempolicy.h +++ b/include/linux/mempolicy.h @@ -239,7 +239,7 @@ extern int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol, /* Check if a vma is migratable */ static inline int vma_migratable(struct vm_area_struct *vma) { - if (vma->vm_flags & (VM_IO|VM_HUGETLB|VM_PFNMAP|VM_RESERVED)) + if (vma->vm_flags & (VM_IO | VM_HUGETLB | VM_PFNMAP)) return 0; /* * Migration allocates pages in the 
highest zone. If we cannot diff --git a/include/linux/mm.h b/include/linux/mm.h index dc08d558e058..0514fe9d3c84 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -96,7 +96,6 @@ extern unsigned int kobjsize(const void *objp); #define VM_DONTCOPY 0x00020000 /* Do not copy this vma on fork */ #define VM_DONTEXPAND 0x00040000 /* Cannot expand with mremap() */ -#define VM_RESERVED 0x00080000 /* Count as reserved_vm like IO */ #define VM_ACCOUNT 0x00100000 /* Is a VM accounted object */ #define VM_NORESERVE 0x00200000 /* should the VM suppress accounting */ #define VM_HUGETLB 0x00400000 /* Huge TLB Page VM */ @@ -148,7 +147,7 @@ extern unsigned int kobjsize(const void *objp); * Special vmas that are non-mergable, non-mlock()able. * Note: mm/huge_memory.c VM_NO_THP depends on this definition. */ -#define VM_SPECIAL (VM_IO | VM_DONTEXPAND | VM_RESERVED | VM_PFNMAP) +#define VM_SPECIAL (VM_IO | VM_DONTEXPAND | VM_PFNMAP) /* * mapping from the currently active vm_flags protection bits (the diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 58d3173eb365..a57a43f5ca7c 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -349,7 +349,6 @@ struct mm_struct { unsigned long shared_vm; /* Shared pages (files) */ unsigned long exec_vm; /* VM_EXEC & ~VM_WRITE */ unsigned long stack_vm; /* VM_GROWSUP/DOWN */ - unsigned long reserved_vm; /* VM_RESERVED|VM_IO pages */ unsigned long def_flags; unsigned long nr_ptes; /* Page table pages */ unsigned long start_code, end_code, start_data, end_data; diff --git a/kernel/events/core.c b/kernel/events/core.c index f16f3c58f11a..cda3ebd49e86 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -3671,7 +3671,7 @@ unlock: atomic_inc(&event->mmap_count); mutex_unlock(&event->mmap_mutex); - vma->vm_flags |= VM_RESERVED; + vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; vma->vm_ops = &perf_mmap_vmops; return ret; diff --git a/mm/ksm.c b/mm/ksm.c index f9ccb16559ee..9638620a7530 100644 --- 
a/mm/ksm.c +++ b/mm/ksm.c @@ -1469,8 +1469,7 @@ int ksm_madvise(struct vm_area_struct *vma, unsigned long start, */ if (*vm_flags & (VM_MERGEABLE | VM_SHARED | VM_MAYSHARE | VM_PFNMAP | VM_IO | VM_DONTEXPAND | - VM_RESERVED | VM_HUGETLB | - VM_NONLINEAR | VM_MIXEDMAP)) + VM_HUGETLB | VM_NONLINEAR | VM_MIXEDMAP)) return 0; /* just ignore the advice */ #ifdef VM_SAO diff --git a/mm/memory.c b/mm/memory.c index 7b1e4feaec06..e09c04813186 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2297,14 +2297,13 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, * rest of the world about it: * VM_IO tells people not to look at these pages * (accesses can have side effects). - * VM_RESERVED is specified all over the place, because - * in 2.4 it kept swapout's vma scan off this vma; but - * in 2.6 the LRU scan won't even find its pages, so this - * flag means no more than count its pages in reserved_vm, - * and omit it from core dump, even when VM_IO turned off. * VM_PFNMAP tells the core MM that the base pages are just * raw PFN mappings, and do not have a "struct page" associated * with them. + * VM_DONTEXPAND + * Disable vma merging and expanding with mremap(). + * VM_DONTDUMP + * Omit vma from core dump, even when VM_IO turned off. * * There's a horrible special case to handle copy-on-write * behaviour that some programs depend on. 
We mark the "original" @@ -2321,7 +2320,7 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, if (err) return -EINVAL; - vma->vm_flags |= VM_IO | VM_RESERVED | VM_PFNMAP; + vma->vm_flags |= VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP; BUG_ON(addr >= end); pfn -= addr >> PAGE_SHIFT; diff --git a/mm/mlock.c b/mm/mlock.c index ef726e8aa8e9..a948be4b7ba7 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -227,7 +227,7 @@ long mlock_vma_pages_range(struct vm_area_struct *vma, if (vma->vm_flags & (VM_IO | VM_PFNMAP)) goto no_mlock; - if (!((vma->vm_flags & (VM_DONTEXPAND | VM_RESERVED)) || + if (!((vma->vm_flags & VM_DONTEXPAND) || is_vm_hugetlb_page(vma) || vma == get_gate_vma(current->mm))) { diff --git a/mm/mmap.c b/mm/mmap.c index c1ad2e78ea58..a76042dc806d 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -945,8 +945,6 @@ void vm_stat_account(struct mm_struct *mm, unsigned long flags, mm->exec_vm += pages; } else if (flags & stack_flags) mm->stack_vm += pages; - if (flags & (VM_RESERVED|VM_IO)) - mm->reserved_vm += pages; } #endif /* CONFIG_PROC_FS */ diff --git a/mm/nommu.c b/mm/nommu.c index 9c4a7b63a4df..12e84e69dd06 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -1811,7 +1811,7 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, if (addr != (pfn << PAGE_SHIFT)) return -EINVAL; - vma->vm_flags |= VM_IO | VM_RESERVED | VM_PFNMAP; + vma->vm_flags |= VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP; return 0; } EXPORT_SYMBOL(remap_pfn_range); diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 2bb90b1d241c..8de704679bfc 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -2163,8 +2163,7 @@ int remap_vmalloc_range(struct vm_area_struct *vma, void *addr, usize -= PAGE_SIZE; } while (usize > 0); - /* Prevent "things" like memory migration? VM_flags need a cleanup... 
*/ - vma->vm_flags |= VM_RESERVED; + vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; return 0; } diff --git a/security/selinux/selinuxfs.c b/security/selinux/selinuxfs.c index 55af8c5b57e6..3a6e8731646c 100644 --- a/security/selinux/selinuxfs.c +++ b/security/selinux/selinuxfs.c @@ -485,7 +485,7 @@ static int sel_mmap_policy(struct file *filp, struct vm_area_struct *vma) return -EACCES; } - vma->vm_flags |= VM_RESERVED; + vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; vma->vm_ops = &sel_mmap_policy_ops; return 0; diff --git a/sound/core/pcm_native.c b/sound/core/pcm_native.c index 20554eff5a21..5e12e5bacbba 100644 --- a/sound/core/pcm_native.c +++ b/sound/core/pcm_native.c @@ -3039,7 +3039,7 @@ static int snd_pcm_mmap_status(struct snd_pcm_substream *substream, struct file return -EINVAL; area->vm_ops = &snd_pcm_vm_ops_status; area->vm_private_data = substream; - area->vm_flags |= VM_RESERVED; + area->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; return 0; } @@ -3076,7 +3076,7 @@ static int snd_pcm_mmap_control(struct snd_pcm_substream *substream, struct file return -EINVAL; area->vm_ops = &snd_pcm_vm_ops_control; area->vm_private_data = substream; - area->vm_flags |= VM_RESERVED; + area->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; return 0; } #else /* ! 
coherent mmap */ @@ -3170,7 +3170,7 @@ static const struct vm_operations_struct snd_pcm_vm_ops_data_fault = { int snd_pcm_lib_default_mmap(struct snd_pcm_substream *substream, struct vm_area_struct *area) { - area->vm_flags |= VM_RESERVED; + area->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; #ifdef ARCH_HAS_DMA_MMAP_COHERENT if (!substream->ops->page && substream->dma_buffer.dev.type == SNDRV_DMA_TYPE_DEV) diff --git a/sound/usb/usx2y/us122l.c b/sound/usb/usx2y/us122l.c index c4fd3b1d9592..d0323a693ba2 100644 --- a/sound/usb/usx2y/us122l.c +++ b/sound/usb/usx2y/us122l.c @@ -262,7 +262,7 @@ static int usb_stream_hwdep_mmap(struct snd_hwdep *hw, } area->vm_ops = &usb_stream_hwdep_vm_ops; - area->vm_flags |= VM_RESERVED; + area->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; area->vm_private_data = us122l; atomic_inc(&us122l->mmap_count); out: diff --git a/sound/usb/usx2y/usX2Yhwdep.c b/sound/usb/usx2y/usX2Yhwdep.c index 04aafb43a13c..0b34dbc8f302 100644 --- a/sound/usb/usx2y/usX2Yhwdep.c +++ b/sound/usb/usx2y/usX2Yhwdep.c @@ -82,7 +82,7 @@ static int snd_us428ctls_mmap(struct snd_hwdep * hw, struct file *filp, struct v us428->us428ctls_sharedmem->CtlSnapShotLast = -2; } area->vm_ops = &us428ctls_vm_ops; - area->vm_flags |= VM_RESERVED | VM_DONTEXPAND; + area->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; area->vm_private_data = hw->private_data; return 0; } diff --git a/sound/usb/usx2y/usx2yhwdeppcm.c b/sound/usb/usx2y/usx2yhwdeppcm.c index 8e40b6e67e9e..cc56007791e0 100644 --- a/sound/usb/usx2y/usx2yhwdeppcm.c +++ b/sound/usb/usx2y/usx2yhwdeppcm.c @@ -723,7 +723,7 @@ static int snd_usX2Y_hwdep_pcm_mmap(struct snd_hwdep * hw, struct file *filp, st return -ENODEV; } area->vm_ops = &snd_usX2Y_hwdep_pcm_vm_ops; - area->vm_flags |= VM_RESERVED | VM_DONTEXPAND; + area->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; area->vm_private_data = hw->private_data; return 0; } -- cgit v1.2.3 From 5d3a551c28c6669dc43be40d8fafafbc2ec8f42b Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Mon, 8 Oct 2012 
16:29:32 -0700 Subject: mm: hugetlb: add arch hook for clearing page flags before entering pool The core page allocator ensures that page flags are zeroed when freeing pages via free_pages_check. A number of architectures (ARM, PPC, MIPS) rely on this property to treat new pages as dirty with respect to the data cache and perform the appropriate flushing before mapping the pages into userspace. This can lead to cache synchronisation problems when using hugepages, since the allocator keeps its own pool of pages above the usual page allocator and does not reset the page flags when freeing a page into the pool. This patch adds a new architecture hook, arch_clear_hugepage_flags, so that architectures which rely on the page flags being in a particular state for fresh allocations can adjust the flags accordingly when a page is freed into the pool. Signed-off-by: Will Deacon Cc: Michal Hocko Reviewed-by: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/ia64/include/asm/hugetlb.h | 4 ++++ arch/mips/include/asm/hugetlb.h | 4 ++++ arch/powerpc/include/asm/hugetlb.h | 4 ++++ arch/s390/include/asm/hugetlb.h | 1 + arch/sh/include/asm/hugetlb.h | 6 ++++++ arch/sparc/include/asm/hugetlb.h | 4 ++++ arch/tile/include/asm/hugetlb.h | 4 ++++ arch/x86/include/asm/hugetlb.h | 4 ++++ mm/hugetlb.c | 1 + 9 files changed, 32 insertions(+) (limited to 'arch/x86') diff --git a/arch/ia64/include/asm/hugetlb.h b/arch/ia64/include/asm/hugetlb.h index da55c63728e0..94eaa5bd5d0c 100644 --- a/arch/ia64/include/asm/hugetlb.h +++ b/arch/ia64/include/asm/hugetlb.h @@ -77,4 +77,8 @@ static inline void arch_release_hugepage(struct page *page) { } +static inline void arch_clear_hugepage_flags(struct page *page) +{ +} + #endif /* _ASM_IA64_HUGETLB_H */ diff --git a/arch/mips/include/asm/hugetlb.h b/arch/mips/include/asm/hugetlb.h index 58d36889f09b..bd94946a18f3 100644 --- a/arch/mips/include/asm/hugetlb.h +++ b/arch/mips/include/asm/hugetlb.h @@ -112,4 +112,8 @@ static 
inline void arch_release_hugepage(struct page *page) { } +static inline void arch_clear_hugepage_flags(struct page *page) +{ +} + #endif /* __ASM_HUGETLB_H */ diff --git a/arch/powerpc/include/asm/hugetlb.h b/arch/powerpc/include/asm/hugetlb.h index dfdb95bc59a5..62e11a32c4c2 100644 --- a/arch/powerpc/include/asm/hugetlb.h +++ b/arch/powerpc/include/asm/hugetlb.h @@ -151,6 +151,10 @@ static inline void arch_release_hugepage(struct page *page) { } +static inline void arch_clear_hugepage_flags(struct page *page) +{ +} + #else /* ! CONFIG_HUGETLB_PAGE */ static inline void flush_hugetlb_page(struct vm_area_struct *vma, unsigned long vmaddr) diff --git a/arch/s390/include/asm/hugetlb.h b/arch/s390/include/asm/hugetlb.h index 2d6e6e380564..fc322421b1cc 100644 --- a/arch/s390/include/asm/hugetlb.h +++ b/arch/s390/include/asm/hugetlb.h @@ -33,6 +33,7 @@ static inline int prepare_hugepage_range(struct file *file, } #define hugetlb_prefault_arch_hook(mm) do { } while (0) +#define arch_clear_hugepage_flags(page) do { } while (0) int arch_prepare_hugepage(struct page *page); void arch_release_hugepage(struct page *page); diff --git a/arch/sh/include/asm/hugetlb.h b/arch/sh/include/asm/hugetlb.h index 967068fb79ac..b3808c7d67b2 100644 --- a/arch/sh/include/asm/hugetlb.h +++ b/arch/sh/include/asm/hugetlb.h @@ -1,6 +1,7 @@ #ifndef _ASM_SH_HUGETLB_H #define _ASM_SH_HUGETLB_H +#include #include @@ -89,4 +90,9 @@ static inline void arch_release_hugepage(struct page *page) { } +static inline void arch_clear_hugepage_flags(struct page *page) +{ + clear_bit(PG_dcache_clean, &page->flags); +} + #endif /* _ASM_SH_HUGETLB_H */ diff --git a/arch/sparc/include/asm/hugetlb.h b/arch/sparc/include/asm/hugetlb.h index 177061064ee6..e7927c9758a1 100644 --- a/arch/sparc/include/asm/hugetlb.h +++ b/arch/sparc/include/asm/hugetlb.h @@ -82,4 +82,8 @@ static inline void arch_release_hugepage(struct page *page) { } +static inline void arch_clear_hugepage_flags(struct page *page) +{ +} + #endif /* 
_ASM_SPARC64_HUGETLB_H */ diff --git a/arch/tile/include/asm/hugetlb.h b/arch/tile/include/asm/hugetlb.h index b2042380a5aa..0f885af2b621 100644 --- a/arch/tile/include/asm/hugetlb.h +++ b/arch/tile/include/asm/hugetlb.h @@ -106,6 +106,10 @@ static inline void arch_release_hugepage(struct page *page) { } +static inline void arch_clear_hugepage_flags(struct page *page) +{ +} + #ifdef CONFIG_HUGETLB_SUPER_PAGES static inline pte_t arch_make_huge_pte(pte_t entry, struct vm_area_struct *vma, struct page *page, int writable) diff --git a/arch/x86/include/asm/hugetlb.h b/arch/x86/include/asm/hugetlb.h index 439a9acc132d..bdd35dbd0605 100644 --- a/arch/x86/include/asm/hugetlb.h +++ b/arch/x86/include/asm/hugetlb.h @@ -90,4 +90,8 @@ static inline void arch_release_hugepage(struct page *page) { } +static inline void arch_clear_hugepage_flags(struct page *page) +{ +} + #endif /* _ASM_X86_HUGETLB_H */ diff --git a/mm/hugetlb.c b/mm/hugetlb.c index bc727122dd44..f1bb534254f6 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -637,6 +637,7 @@ static void free_huge_page(struct page *page) h->surplus_huge_pages--; h->surplus_huge_pages_node[nid]--; } else { + arch_clear_hugepage_flags(page); enqueue_huge_page(h, page); } spin_unlock(&hugetlb_lock); -- cgit v1.2.3 From 15626062f4a98279c59a2a5208c496cf65cbf8c0 Mon Sep 17 00:00:00 2001 From: Gerald Schaefer Date: Mon, 8 Oct 2012 16:30:04 -0700 Subject: thp, x86: introduce HAVE_ARCH_TRANSPARENT_HUGEPAGE Cleanup patch in preparation for transparent hugepage support on s390. Adding new architectures to the TRANSPARENT_HUGEPAGE config option can make the "depends" line rather ugly, like "depends on (X86 || (S390 && 64BIT)) && MMU". This patch adds a HAVE_ARCH_TRANSPARENT_HUGEPAGE instead. x86 already has MMU "def_bool y", so the MMU check is superfluous there and HAVE_ARCH_TRANSPARENT_HUGEPAGE can be selected in arch/x86/Kconfig. Signed-off-by: Gerald Schaefer Cc: Ingo Molnar Cc: Thomas Gleixner Cc: "H. 
Peter Anvin" Cc: Andrea Arcangeli Cc: Andi Kleen Cc: Hugh Dickins Cc: Hillf Danton Cc: Martin Schwidefsky Cc: Heiko Carstens Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/Kconfig | 3 +++ arch/x86/Kconfig | 1 + mm/Kconfig | 2 +- 3 files changed, 5 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/Kconfig b/arch/Kconfig index a62965d057f6..550cce4dd648 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -313,4 +313,7 @@ config HAVE_IRQ_TIME_ACCOUNTING Archs need to ensure they use a high enough resolution clock to support irq time accounting and then call enable_sched_clock_irqtime(). +config HAVE_ARCH_TRANSPARENT_HUGEPAGE + bool + source "kernel/gcov/Kconfig" diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 6119d6c7002e..1ae94bcae5d9 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -88,6 +88,7 @@ config X86 select IRQ_FORCED_THREADING select USE_GENERIC_SMP_HELPERS if SMP select HAVE_BPF_JIT if X86_64 + select HAVE_ARCH_TRANSPARENT_HUGEPAGE select CLKEVT_I8253 select ARCH_HAVE_NMI_SAFE_CMPXCHG select GENERIC_IOMAP diff --git a/mm/Kconfig b/mm/Kconfig index d5c8019c6627..3322342a1ffb 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -318,7 +318,7 @@ config NOMMU_INITIAL_TRIM_EXCESS config TRANSPARENT_HUGEPAGE bool "Transparent Hugepage Support" - depends on X86 && MMU + depends on HAVE_ARCH_TRANSPARENT_HUGEPAGE select COMPACTION help Transparent Hugepages allows the kernel to use huge pages and -- cgit v1.2.3 From 9d9e6f9703bbd642f3f2f807e6aaa642a4cbcec9 Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Mon, 8 Oct 2012 16:31:20 -0700 Subject: rbtree: remove prior augmented rbtree implementation convert arch/x86/mm/pat_rbtree.c to the proposed augmented rbtree api and remove the old augmented rbtree implementation. 
Signed-off-by: Michel Lespinasse Acked-by: Rik van Riel Cc: Peter Zijlstra Cc: Andrea Arcangeli Cc: David Woodhouse Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/mm/pat_rbtree.c | 65 +++++++++++++++++++++++++++++++------------- include/linux/rbtree.h | 8 ------ lib/rbtree.c | 71 ------------------------------------------------ 3 files changed, 46 insertions(+), 98 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/pat_rbtree.c b/arch/x86/mm/pat_rbtree.c index 8acaddd0fb21..7e1515bd4770 100644 --- a/arch/x86/mm/pat_rbtree.c +++ b/arch/x86/mm/pat_rbtree.c @@ -54,29 +54,57 @@ static u64 get_subtree_max_end(struct rb_node *node) return ret; } -/* Update 'subtree_max_end' for a node, based on node and its children */ -static void memtype_rb_augment_cb(struct rb_node *node, void *__unused) +static u64 compute_subtree_max_end(struct memtype *data) { - struct memtype *data; - u64 max_end, child_max_end; - - if (!node) - return; - - data = container_of(node, struct memtype, rb); - max_end = data->end; + u64 max_end = data->end, child_max_end; - child_max_end = get_subtree_max_end(node->rb_right); + child_max_end = get_subtree_max_end(data->rb.rb_right); if (child_max_end > max_end) max_end = child_max_end; - child_max_end = get_subtree_max_end(node->rb_left); + child_max_end = get_subtree_max_end(data->rb.rb_left); if (child_max_end > max_end) max_end = child_max_end; - data->subtree_max_end = max_end; + return max_end; +} + +/* Update 'subtree_max_end' for node and its parents */ +static void memtype_rb_propagate_cb(struct rb_node *node, struct rb_node *stop) +{ + while (node != stop) { + struct memtype *data = container_of(node, struct memtype, rb); + u64 subtree_max_end = compute_subtree_max_end(data); + if (data->subtree_max_end == subtree_max_end) + break; + data->subtree_max_end = subtree_max_end; + node = rb_parent(&data->rb); + } +} + +static void memtype_rb_copy_cb(struct rb_node *old, struct rb_node *new) +{ + struct memtype 
*old_data = container_of(old, struct memtype, rb); + struct memtype *new_data = container_of(new, struct memtype, rb); + + new_data->subtree_max_end = old_data->subtree_max_end; } +/* Update 'subtree_max_end' after tree rotation. old and new are the + * former and current subtree roots */ +static void memtype_rb_rotate_cb(struct rb_node *old, struct rb_node *new) +{ + struct memtype *old_data = container_of(old, struct memtype, rb); + struct memtype *new_data = container_of(new, struct memtype, rb); + + new_data->subtree_max_end = old_data->subtree_max_end; + old_data->subtree_max_end = compute_subtree_max_end(old_data); +} + +static const struct rb_augment_callbacks memtype_rb_augment_cb = { + memtype_rb_propagate_cb, memtype_rb_copy_cb, memtype_rb_rotate_cb +}; + /* Find the first (lowest start addr) overlapping range from rb tree */ static struct memtype *memtype_rb_lowest_match(struct rb_root *root, u64 start, u64 end) @@ -179,15 +207,17 @@ static void memtype_rb_insert(struct rb_root *root, struct memtype *newdata) struct memtype *data = container_of(*node, struct memtype, rb); parent = *node; + if (data->subtree_max_end < newdata->end) + data->subtree_max_end = newdata->end; if (newdata->start <= data->start) node = &((*node)->rb_left); else if (newdata->start > data->start) node = &((*node)->rb_right); } + newdata->subtree_max_end = newdata->end; rb_link_node(&newdata->rb, parent, node); - rb_insert_color(&newdata->rb, root); - rb_augment_insert(&newdata->rb, memtype_rb_augment_cb, NULL); + rb_insert_augmented(&newdata->rb, root, &memtype_rb_augment_cb); } int rbt_memtype_check_insert(struct memtype *new, unsigned long *ret_type) @@ -209,16 +239,13 @@ int rbt_memtype_check_insert(struct memtype *new, unsigned long *ret_type) struct memtype *rbt_memtype_erase(u64 start, u64 end) { - struct rb_node *deepest; struct memtype *data; data = memtype_rb_exact_match(&memtype_rbroot, start, end); if (!data) goto out; - deepest = rb_augment_erase_begin(&data->rb); - 
rb_erase(&data->rb, &memtype_rbroot); - rb_augment_erase_end(deepest, memtype_rb_augment_cb, NULL); + rb_erase_augmented(&data->rb, &memtype_rbroot, &memtype_rb_augment_cb); out: return data; } diff --git a/include/linux/rbtree.h b/include/linux/rbtree.h index c902eb9d6506..4ace31b33380 100644 --- a/include/linux/rbtree.h +++ b/include/linux/rbtree.h @@ -80,14 +80,6 @@ rb_insert_augmented(struct rb_node *node, struct rb_root *root, } -typedef void (*rb_augment_f)(struct rb_node *node, void *data); - -extern void rb_augment_insert(struct rb_node *node, - rb_augment_f func, void *data); -extern struct rb_node *rb_augment_erase_begin(struct rb_node *node); -extern void rb_augment_erase_end(struct rb_node *node, - rb_augment_f func, void *data); - /* Find logical next and previous nodes in a tree */ extern struct rb_node *rb_next(const struct rb_node *); extern struct rb_node *rb_prev(const struct rb_node *); diff --git a/lib/rbtree.c b/lib/rbtree.c index a37ee7954b8f..c0088ca345f9 100644 --- a/lib/rbtree.c +++ b/lib/rbtree.c @@ -538,77 +538,6 @@ void rb_erase_augmented(struct rb_node *node, struct rb_root *root, } EXPORT_SYMBOL(rb_erase_augmented); -static void rb_augment_path(struct rb_node *node, rb_augment_f func, void *data) -{ - struct rb_node *parent; - -up: - func(node, data); - parent = rb_parent(node); - if (!parent) - return; - - if (node == parent->rb_left && parent->rb_right) - func(parent->rb_right, data); - else if (parent->rb_left) - func(parent->rb_left, data); - - node = parent; - goto up; -} - -/* - * after inserting @node into the tree, update the tree to account for - * both the new entry and any damage done by rebalance - */ -void rb_augment_insert(struct rb_node *node, rb_augment_f func, void *data) -{ - if (node->rb_left) - node = node->rb_left; - else if (node->rb_right) - node = node->rb_right; - - rb_augment_path(node, func, data); -} -EXPORT_SYMBOL(rb_augment_insert); - -/* - * before removing the node, find the deepest node on the rebalance 
path - * that will still be there after @node gets removed - */ -struct rb_node *rb_augment_erase_begin(struct rb_node *node) -{ - struct rb_node *deepest; - - if (!node->rb_right && !node->rb_left) - deepest = rb_parent(node); - else if (!node->rb_right) - deepest = node->rb_left; - else if (!node->rb_left) - deepest = node->rb_right; - else { - deepest = rb_next(node); - if (deepest->rb_right) - deepest = deepest->rb_right; - else if (rb_parent(deepest) != node) - deepest = rb_parent(deepest); - } - - return deepest; -} -EXPORT_SYMBOL(rb_augment_erase_begin); - -/* - * after removal, update the tree to account for the removed entry - * and any rebalance damage. - */ -void rb_augment_erase_end(struct rb_node *node, rb_augment_f func, void *data) -{ - if (node) - rb_augment_path(node, func, data); -} -EXPORT_SYMBOL(rb_augment_erase_end); - /* * This function returns the first node (in sort order) of the tree. */ -- cgit v1.2.3 From 3908836aa77e3621aaf2101f2920e01d7c8460d6 Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Mon, 8 Oct 2012 16:31:21 -0700 Subject: rbtree: add RB_DECLARE_CALLBACKS() macro As proposed by Peter Zijlstra, this makes it easier to define the augmented rbtree callbacks. 
Signed-off-by: Michel Lespinasse Cc: Rik van Riel Cc: Peter Zijlstra Cc: Andrea Arcangeli Cc: David Woodhouse Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/mm/pat_rbtree.c | 37 ++----------------------------------- include/linux/rbtree.h | 30 ++++++++++++++++++++++++++++++ lib/rbtree_test.c | 34 ++-------------------------------- 3 files changed, 34 insertions(+), 67 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/pat_rbtree.c b/arch/x86/mm/pat_rbtree.c index 7e1515bd4770..4d116959075d 100644 --- a/arch/x86/mm/pat_rbtree.c +++ b/arch/x86/mm/pat_rbtree.c @@ -69,41 +69,8 @@ static u64 compute_subtree_max_end(struct memtype *data) return max_end; } -/* Update 'subtree_max_end' for node and its parents */ -static void memtype_rb_propagate_cb(struct rb_node *node, struct rb_node *stop) -{ - while (node != stop) { - struct memtype *data = container_of(node, struct memtype, rb); - u64 subtree_max_end = compute_subtree_max_end(data); - if (data->subtree_max_end == subtree_max_end) - break; - data->subtree_max_end = subtree_max_end; - node = rb_parent(&data->rb); - } -} - -static void memtype_rb_copy_cb(struct rb_node *old, struct rb_node *new) -{ - struct memtype *old_data = container_of(old, struct memtype, rb); - struct memtype *new_data = container_of(new, struct memtype, rb); - - new_data->subtree_max_end = old_data->subtree_max_end; -} - -/* Update 'subtree_max_end' after tree rotation. 
old and new are the - * former and current subtree roots */ -static void memtype_rb_rotate_cb(struct rb_node *old, struct rb_node *new) -{ - struct memtype *old_data = container_of(old, struct memtype, rb); - struct memtype *new_data = container_of(new, struct memtype, rb); - - new_data->subtree_max_end = old_data->subtree_max_end; - old_data->subtree_max_end = compute_subtree_max_end(old_data); -} - -static const struct rb_augment_callbacks memtype_rb_augment_cb = { - memtype_rb_propagate_cb, memtype_rb_copy_cb, memtype_rb_rotate_cb -}; +RB_DECLARE_CALLBACKS(static, memtype_rb_augment_cb, struct memtype, rb, + u64, subtree_max_end, compute_subtree_max_end) /* Find the first (lowest start addr) overlapping range from rb tree */ static struct memtype *memtype_rb_lowest_match(struct rb_root *root, diff --git a/include/linux/rbtree.h b/include/linux/rbtree.h index 4ace31b33380..8d1e83b1c87b 100644 --- a/include/linux/rbtree.h +++ b/include/linux/rbtree.h @@ -79,6 +79,36 @@ rb_insert_augmented(struct rb_node *node, struct rb_root *root, __rb_insert_augmented(node, root, augment->rotate); } +#define RB_DECLARE_CALLBACKS(rbstatic, rbname, rbstruct, rbfield, \ + rbtype, rbaugmented, rbcompute) \ +static void rbname ## _propagate(struct rb_node *rb, struct rb_node *stop) \ +{ \ + while (rb != stop) { \ + rbstruct *node = rb_entry(rb, rbstruct, rbfield); \ + rbtype augmented = rbcompute(node); \ + if (node->rbaugmented == augmented) \ + break; \ + node->rbaugmented = augmented; \ + rb = rb_parent(&node->rbfield); \ + } \ +} \ +static void rbname ## _copy(struct rb_node *rb_old, struct rb_node *rb_new) \ +{ \ + rbstruct *old = rb_entry(rb_old, rbstruct, rbfield); \ + rbstruct *new = rb_entry(rb_new, rbstruct, rbfield); \ + new->rbaugmented = old->rbaugmented; \ +} \ +static void rbname ## _rotate(struct rb_node *rb_old, struct rb_node *rb_new) \ +{ \ + rbstruct *old = rb_entry(rb_old, rbstruct, rbfield); \ + rbstruct *new = rb_entry(rb_new, rbstruct, rbfield); \ + 
new->rbaugmented = old->rbaugmented; \ + old->rbaugmented = rbcompute(old); \ +} \ +rbstatic const struct rb_augment_callbacks rbname = { \ + rbname ## _propagate, rbname ## _copy, rbname ## _rotate \ +}; + /* Find logical next and previous nodes in a tree */ extern struct rb_node *rb_next(const struct rb_node *); diff --git a/lib/rbtree_test.c b/lib/rbtree_test.c index e28345df09bf..b20e99969b0f 100644 --- a/lib/rbtree_test.c +++ b/lib/rbtree_test.c @@ -61,38 +61,8 @@ static inline u32 augment_recompute(struct test_node *node) return max; } -static void augment_propagate(struct rb_node *rb, struct rb_node *stop) -{ - while (rb != stop) { - struct test_node *node = rb_entry(rb, struct test_node, rb); - u32 augmented = augment_recompute(node); - if (node->augmented == augmented) - break; - node->augmented = augmented; - rb = rb_parent(&node->rb); - } -} - -static void augment_copy(struct rb_node *rb_old, struct rb_node *rb_new) -{ - struct test_node *old = rb_entry(rb_old, struct test_node, rb); - struct test_node *new = rb_entry(rb_new, struct test_node, rb); - new->augmented = old->augmented; -} - -static void augment_rotate(struct rb_node *rb_old, struct rb_node *rb_new) -{ - struct test_node *old = rb_entry(rb_old, struct test_node, rb); - struct test_node *new = rb_entry(rb_new, struct test_node, rb); - - /* Rotation doesn't change subtree's augmented value */ - new->augmented = old->augmented; - old->augmented = augment_recompute(old); -} - -static const struct rb_augment_callbacks augment_callbacks = { - augment_propagate, augment_copy, augment_rotate -}; +RB_DECLARE_CALLBACKS(static, augment_callbacks, struct test_node, rb, + u32, augmented, augment_recompute) static void insert_augmented(struct test_node *node, struct rb_root *root) { -- cgit v1.2.3 From 6b2dbba8b6ac4df26f72eda1e5ea7bab9f950e08 Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Mon, 8 Oct 2012 16:31:25 -0700 Subject: mm: replace vma prio_tree with an interval tree Implement an interval 
tree as a replacement for the VMA prio_tree. The algorithms are similar to lib/interval_tree.c; however that code can't be directly reused as the interval endpoints are not explicitly stored in the VMA. So instead, the common algorithm is moved into a template and the details (node type, how to get interval endpoints from the node, etc) are filled in using the C preprocessor. Once the interval tree functions are available, using them as a replacement to the VMA prio tree is a relatively simple, mechanical job. Signed-off-by: Michel Lespinasse Cc: Rik van Riel Cc: Hillf Danton Cc: Peter Zijlstra Cc: Catalin Marinas Cc: Andrea Arcangeli Cc: David Woodhouse Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm/mm/fault-armv.c | 3 +- arch/arm/mm/flush.c | 3 +- arch/parisc/kernel/cache.c | 3 +- arch/x86/mm/hugetlbpage.c | 3 +- fs/hugetlbfs/inode.c | 9 +- fs/inode.c | 2 +- include/linux/fs.h | 6 +- include/linux/interval_tree_tmpl.h | 215 +++++++++++++++++++++++++++++++++++++ include/linux/mm.h | 30 +++--- include/linux/mm_types.h | 14 +-- kernel/events/uprobes.c | 3 +- kernel/fork.c | 2 +- lib/interval_tree.c | 166 ++-------------------------- lib/prio_tree.c | 19 +--- mm/Makefile | 4 +- mm/filemap_xip.c | 3 +- mm/fremap.c | 2 +- mm/hugetlb.c | 3 +- mm/interval_tree.c | 61 +++++++++++ mm/memory-failure.c | 3 +- mm/memory.c | 9 +- mm/mmap.c | 22 ++-- mm/nommu.c | 12 +-- mm/prio_tree.c | 208 ----------------------------------- mm/rmap.c | 18 ++-- 25 files changed, 357 insertions(+), 466 deletions(-) create mode 100644 include/linux/interval_tree_tmpl.h create mode 100644 mm/interval_tree.c delete mode 100644 mm/prio_tree.c (limited to 'arch/x86') diff --git a/arch/arm/mm/fault-armv.c b/arch/arm/mm/fault-armv.c index 7599e2625c7d..2a5907b5c8d2 100644 --- a/arch/arm/mm/fault-armv.c +++ b/arch/arm/mm/fault-armv.c @@ -134,7 +134,6 @@ make_coherent(struct address_space *mapping, struct vm_area_struct *vma, { struct mm_struct *mm = vma->vm_mm; struct 
vm_area_struct *mpnt; - struct prio_tree_iter iter; unsigned long offset; pgoff_t pgoff; int aliases = 0; @@ -147,7 +146,7 @@ make_coherent(struct address_space *mapping, struct vm_area_struct *vma, * cache coherency. */ flush_dcache_mmap_lock(mapping); - vma_prio_tree_foreach(mpnt, &iter, &mapping->i_mmap, pgoff, pgoff) { + vma_interval_tree_foreach(mpnt, &mapping->i_mmap, pgoff, pgoff) { /* * If this VMA is not in our MM, we can ignore it. * Note that we intentionally mask out the VMA diff --git a/arch/arm/mm/flush.c b/arch/arm/mm/flush.c index 40ca11ed6e5f..1c8f7f564175 100644 --- a/arch/arm/mm/flush.c +++ b/arch/arm/mm/flush.c @@ -196,7 +196,6 @@ static void __flush_dcache_aliases(struct address_space *mapping, struct page *p { struct mm_struct *mm = current->active_mm; struct vm_area_struct *mpnt; - struct prio_tree_iter iter; pgoff_t pgoff; /* @@ -208,7 +207,7 @@ static void __flush_dcache_aliases(struct address_space *mapping, struct page *p pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); flush_dcache_mmap_lock(mapping); - vma_prio_tree_foreach(mpnt, &iter, &mapping->i_mmap, pgoff, pgoff) { + vma_interval_tree_foreach(mpnt, &mapping->i_mmap, pgoff, pgoff) { unsigned long offset; /* diff --git a/arch/parisc/kernel/cache.c b/arch/parisc/kernel/cache.c index 9d181890a7e3..48e16dc20102 100644 --- a/arch/parisc/kernel/cache.c +++ b/arch/parisc/kernel/cache.c @@ -276,7 +276,6 @@ void flush_dcache_page(struct page *page) { struct address_space *mapping = page_mapping(page); struct vm_area_struct *mpnt; - struct prio_tree_iter iter; unsigned long offset; unsigned long addr, old_addr = 0; pgoff_t pgoff; @@ -299,7 +298,7 @@ void flush_dcache_page(struct page *page) * to flush one address here for them all to become coherent */ flush_dcache_mmap_lock(mapping); - vma_prio_tree_foreach(mpnt, &iter, &mapping->i_mmap, pgoff, pgoff) { + vma_interval_tree_foreach(mpnt, &mapping->i_mmap, pgoff, pgoff) { offset = (pgoff - mpnt->vm_pgoff) << PAGE_SHIFT; addr = 
mpnt->vm_start + offset; diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c index b91e48512425..937bff5cdaa7 100644 --- a/arch/x86/mm/hugetlbpage.c +++ b/arch/x86/mm/hugetlbpage.c @@ -71,7 +71,6 @@ huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud) struct address_space *mapping = vma->vm_file->f_mapping; pgoff_t idx = ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; - struct prio_tree_iter iter; struct vm_area_struct *svma; unsigned long saddr; pte_t *spte = NULL; @@ -81,7 +80,7 @@ huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud) return (pte_t *)pmd_alloc(mm, pud, addr); mutex_lock(&mapping->i_mmap_mutex); - vma_prio_tree_foreach(svma, &iter, &mapping->i_mmap, idx, idx) { + vma_interval_tree_foreach(svma, &mapping->i_mmap, idx, idx) { if (svma == vma) continue; diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 0a0ab8e21b19..c5bc355d8243 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -397,17 +397,16 @@ static void hugetlbfs_evict_inode(struct inode *inode) } static inline void -hugetlb_vmtruncate_list(struct prio_tree_root *root, pgoff_t pgoff) +hugetlb_vmtruncate_list(struct rb_root *root, pgoff_t pgoff) { struct vm_area_struct *vma; - struct prio_tree_iter iter; - vma_prio_tree_foreach(vma, &iter, root, pgoff, ULONG_MAX) { + vma_interval_tree_foreach(vma, root, pgoff, ULONG_MAX) { unsigned long v_offset; /* * Can the expression below overflow on 32-bit arches? - * No, because the prio_tree returns us only those vmas + * No, because the interval tree returns us only those vmas * which overlap the truncated area starting at pgoff, * and no vma on a 32-bit arch can span beyond the 4GB. 
*/ @@ -432,7 +431,7 @@ static int hugetlb_vmtruncate(struct inode *inode, loff_t offset) i_size_write(inode, offset); mutex_lock(&mapping->i_mmap_mutex); - if (!prio_tree_empty(&mapping->i_mmap)) + if (!RB_EMPTY_ROOT(&mapping->i_mmap)) hugetlb_vmtruncate_list(&mapping->i_mmap, pgoff); mutex_unlock(&mapping->i_mmap_mutex); truncate_hugepages(inode, offset); diff --git a/fs/inode.c b/fs/inode.c index ac8d904b3f16..b03c71957246 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -348,7 +348,7 @@ void address_space_init_once(struct address_space *mapping) mutex_init(&mapping->i_mmap_mutex); INIT_LIST_HEAD(&mapping->private_list); spin_lock_init(&mapping->private_lock); - INIT_RAW_PRIO_TREE_ROOT(&mapping->i_mmap); + mapping->i_mmap = RB_ROOT; INIT_LIST_HEAD(&mapping->i_mmap_nonlinear); } EXPORT_SYMBOL(address_space_init_once); diff --git a/include/linux/fs.h b/include/linux/fs.h index 5a8a273d5b2f..c617ed024df8 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -401,7 +401,7 @@ struct inodes_stat_t { #include #include #include -#include +#include #include #include #include @@ -669,7 +669,7 @@ struct address_space { struct radix_tree_root page_tree; /* radix tree of all pages */ spinlock_t tree_lock; /* and lock protecting it */ unsigned int i_mmap_writable;/* count VM_SHARED mappings */ - struct prio_tree_root i_mmap; /* tree of private and shared mappings */ + struct rb_root i_mmap; /* tree of private and shared mappings */ struct list_head i_mmap_nonlinear;/*list VM_NONLINEAR mappings */ struct mutex i_mmap_mutex; /* protect tree, count, list */ /* Protected by tree_lock together with the radix tree */ @@ -741,7 +741,7 @@ int mapping_tagged(struct address_space *mapping, int tag); */ static inline int mapping_mapped(struct address_space *mapping) { - return !prio_tree_empty(&mapping->i_mmap) || + return !RB_EMPTY_ROOT(&mapping->i_mmap) || !list_empty(&mapping->i_mmap_nonlinear); } diff --git a/include/linux/interval_tree_tmpl.h b/include/linux/interval_tree_tmpl.h 
new file mode 100644 index 000000000000..c65deda31413 --- /dev/null +++ b/include/linux/interval_tree_tmpl.h @@ -0,0 +1,215 @@ +/* + Interval Trees + (C) 2012 Michel Lespinasse + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + + include/linux/interval_tree_tmpl.h +*/ + +/* + * Template for implementing interval trees + * + * ITSTRUCT: struct type of the interval tree nodes + * ITRB: name of struct rb_node field within ITSTRUCT + * ITTYPE: type of the interval endpoints + * ITSUBTREE: name of ITTYPE field within ITSTRUCT holding last-in-subtree + * ITSTART(n): start endpoint of ITSTRUCT node n + * ITLAST(n): last endpoint of ITSTRUCT node n + * ITSTATIC: 'static' or empty + * ITPREFIX: prefix to use for the inline tree definitions + */ + +/* IT(name) -> ITPREFIX_name */ +#define _ITNAME(prefix, name) prefix ## _ ## name +#define ITNAME(prefix, name) _ITNAME(prefix, name) +#define IT(name) ITNAME(ITPREFIX, name) + +/* Callbacks for augmented rbtree insert and remove */ + +static inline ITTYPE IT(compute_subtree_last)(ITSTRUCT *node) +{ + ITTYPE max = ITLAST(node), subtree_last; + if (node->ITRB.rb_left) { + subtree_last = rb_entry(node->ITRB.rb_left, + ITSTRUCT, ITRB)->ITSUBTREE; + if (max < subtree_last) + max = subtree_last; + } + if (node->ITRB.rb_right) { + subtree_last = rb_entry(node->ITRB.rb_right, + ITSTRUCT,
ITRB)->ITSUBTREE; + if (max < subtree_last) + max = subtree_last; + } + return max; +} + +static void IT(augment_propagate)(struct rb_node *rb, struct rb_node *stop) +{ + while (rb != stop) { + ITSTRUCT *node = rb_entry(rb, ITSTRUCT, ITRB); + ITTYPE subtree_last = IT(compute_subtree_last)(node); + if (node->ITSUBTREE == subtree_last) + break; + node->ITSUBTREE = subtree_last; + rb = rb_parent(&node->ITRB); + } +} + +static void IT(augment_copy)(struct rb_node *rb_old, struct rb_node *rb_new) +{ + ITSTRUCT *old = rb_entry(rb_old, ITSTRUCT, ITRB); + ITSTRUCT *new = rb_entry(rb_new, ITSTRUCT, ITRB); + + new->ITSUBTREE = old->ITSUBTREE; +} + +static void IT(augment_rotate)(struct rb_node *rb_old, struct rb_node *rb_new) +{ + ITSTRUCT *old = rb_entry(rb_old, ITSTRUCT, ITRB); + ITSTRUCT *new = rb_entry(rb_new, ITSTRUCT, ITRB); + + new->ITSUBTREE = old->ITSUBTREE; + old->ITSUBTREE = IT(compute_subtree_last)(old); +} + +static const struct rb_augment_callbacks IT(augment_callbacks) = { + IT(augment_propagate), IT(augment_copy), IT(augment_rotate) +}; + +/* Insert / remove interval nodes from the tree */ + +ITSTATIC void IT(insert)(ITSTRUCT *node, struct rb_root *root) +{ + struct rb_node **link = &root->rb_node, *rb_parent = NULL; + ITTYPE start = ITSTART(node), last = ITLAST(node); + ITSTRUCT *parent; + + while (*link) { + rb_parent = *link; + parent = rb_entry(rb_parent, ITSTRUCT, ITRB); + if (parent->ITSUBTREE < last) + parent->ITSUBTREE = last; + if (start < ITSTART(parent)) + link = &parent->ITRB.rb_left; + else + link = &parent->ITRB.rb_right; + } + + node->ITSUBTREE = last; + rb_link_node(&node->ITRB, rb_parent, link); + rb_insert_augmented(&node->ITRB, root, &IT(augment_callbacks)); +} + +ITSTATIC void IT(remove)(ITSTRUCT *node, struct rb_root *root) +{ + rb_erase_augmented(&node->ITRB, root, &IT(augment_callbacks)); +} + +/* + * Iterate over intervals intersecting [start;last] + * + * Note that a node's interval intersects [start;last] iff: + * Cond1: 
ITSTART(node) <= last + * and + * Cond2: start <= ITLAST(node) + */ + +static ITSTRUCT *IT(subtree_search)(ITSTRUCT *node, ITTYPE start, ITTYPE last) +{ + while (true) { + /* + * Loop invariant: start <= node->ITSUBTREE + * (Cond2 is satisfied by one of the subtree nodes) + */ + if (node->ITRB.rb_left) { + ITSTRUCT *left = rb_entry(node->ITRB.rb_left, + ITSTRUCT, ITRB); + if (start <= left->ITSUBTREE) { + /* + * Some nodes in left subtree satisfy Cond2. + * Iterate to find the leftmost such node N. + * If it also satisfies Cond1, that's the match + * we are looking for. Otherwise, there is no + * matching interval as nodes to the right of N + * can't satisfy Cond1 either. + */ + node = left; + continue; + } + } + if (ITSTART(node) <= last) { /* Cond1 */ + if (start <= ITLAST(node)) /* Cond2 */ + return node; /* node is leftmost match */ + if (node->ITRB.rb_right) { + node = rb_entry(node->ITRB.rb_right, + ITSTRUCT, ITRB); + if (start <= node->ITSUBTREE) + continue; + } + } + return NULL; /* No match */ + } +} + +ITSTATIC ITSTRUCT *IT(iter_first)(struct rb_root *root, + ITTYPE start, ITTYPE last) +{ + ITSTRUCT *node; + + if (!root->rb_node) + return NULL; + node = rb_entry(root->rb_node, ITSTRUCT, ITRB); + if (node->ITSUBTREE < start) + return NULL; + return IT(subtree_search)(node, start, last); +} + +ITSTATIC ITSTRUCT *IT(iter_next)(ITSTRUCT *node, ITTYPE start, ITTYPE last) +{ + struct rb_node *rb = node->ITRB.rb_right, *prev; + + while (true) { + /* + * Loop invariants: + * Cond1: ITSTART(node) <= last + * rb == node->ITRB.rb_right + * + * First, search right subtree if suitable + */ + if (rb) { + ITSTRUCT *right = rb_entry(rb, ITSTRUCT, ITRB); + if (start <= right->ITSUBTREE) + return IT(subtree_search)(right, start, last); + } + + /* Move up the tree until we come from a node's left child */ + do { + rb = rb_parent(&node->ITRB); + if (!rb) + return NULL; + prev = &node->ITRB; + node = rb_entry(rb, ITSTRUCT, ITRB); + rb = node->ITRB.rb_right; + } while (prev == 
rb); + + /* Check if the node intersects [start;last] */ + if (last < ITSTART(node)) /* !Cond1 */ + return NULL; + else if (start <= ITLAST(node)) /* Cond2 */ + return node; + } +} diff --git a/include/linux/mm.h b/include/linux/mm.h index 5ddb11b2b4bb..0f671ef09eba 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -10,7 +10,6 @@ #include #include #include -#include #include #include #include @@ -1355,22 +1354,27 @@ extern void zone_pcp_reset(struct zone *zone); extern atomic_long_t mmap_pages_allocated; extern int nommu_shrink_inode_mappings(struct inode *, size_t, size_t); -/* prio_tree.c */ -void vma_prio_tree_add(struct vm_area_struct *, struct vm_area_struct *old); -void vma_prio_tree_insert(struct vm_area_struct *, struct prio_tree_root *); -void vma_prio_tree_remove(struct vm_area_struct *, struct prio_tree_root *); -struct vm_area_struct *vma_prio_tree_next(struct vm_area_struct *vma, - struct prio_tree_iter *iter); - -#define vma_prio_tree_foreach(vma, iter, root, begin, end) \ - for (prio_tree_iter_init(iter, root, begin, end), vma = NULL; \ - (vma = vma_prio_tree_next(vma, iter)); ) +/* interval_tree.c */ +void vma_interval_tree_add(struct vm_area_struct *vma, + struct vm_area_struct *old, + struct address_space *mapping); +void vma_interval_tree_insert(struct vm_area_struct *node, + struct rb_root *root); +void vma_interval_tree_remove(struct vm_area_struct *node, + struct rb_root *root); +struct vm_area_struct *vma_interval_tree_iter_first(struct rb_root *root, + unsigned long start, unsigned long last); +struct vm_area_struct *vma_interval_tree_iter_next(struct vm_area_struct *node, + unsigned long start, unsigned long last); + +#define vma_interval_tree_foreach(vma, root, start, last) \ + for (vma = vma_interval_tree_iter_first(root, start, last); \ + vma; vma = vma_interval_tree_iter_next(vma, start, last)) static inline void vma_nonlinear_insert(struct vm_area_struct *vma, struct list_head *list) { - vma->shared.vm_set.parent = NULL; - 
list_add_tail(&vma->shared.vm_set.list, list); + list_add_tail(&vma->shared.nonlinear, list); } /* mmap.c */ diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index a57a43f5ca7c..31f8a3af7d94 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -6,7 +6,6 @@ #include #include #include -#include #include #include #include @@ -240,18 +239,15 @@ struct vm_area_struct { /* * For areas with an address space and backing store, - * linkage into the address_space->i_mmap prio tree, or - * linkage to the list of like vmas hanging off its node, or + * linkage into the address_space->i_mmap interval tree, or * linkage of vma in the address_space->i_mmap_nonlinear list. */ union { struct { - struct list_head list; - void *parent; /* aligns with prio_tree_node parent */ - struct vm_area_struct *head; - } vm_set; - - struct raw_prio_tree_node prio_tree_node; + struct rb_node rb; + unsigned long rb_subtree_last; + } linear; + struct list_head nonlinear; } shared; /* diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 912ef48d28ab..1d9c0a985960 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -735,7 +735,6 @@ static struct map_info * build_map_info(struct address_space *mapping, loff_t offset, bool is_register) { unsigned long pgoff = offset >> PAGE_SHIFT; - struct prio_tree_iter iter; struct vm_area_struct *vma; struct map_info *curr = NULL; struct map_info *prev = NULL; @@ -744,7 +743,7 @@ build_map_info(struct address_space *mapping, loff_t offset, bool is_register) again: mutex_lock(&mapping->i_mmap_mutex); - vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { + vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { if (!valid_vma(vma, is_register)) continue; diff --git a/kernel/fork.c b/kernel/fork.c index 972762e01024..90dace52715e 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -423,7 +423,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) 
mapping->i_mmap_writable++; flush_dcache_mmap_lock(mapping); /* insert tmp into the share list, just after mpnt */ - vma_prio_tree_add(tmp, mpnt); + vma_interval_tree_add(tmp, mpnt, mapping); flush_dcache_mmap_unlock(mapping); mutex_unlock(&mapping->i_mmap_mutex); } diff --git a/lib/interval_tree.c b/lib/interval_tree.c index 6fd540b1e499..77a793e0644b 100644 --- a/lib/interval_tree.c +++ b/lib/interval_tree.c @@ -1,159 +1,13 @@ #include #include -/* Callbacks for augmented rbtree insert and remove */ - -static inline unsigned long -compute_subtree_last(struct interval_tree_node *node) -{ - unsigned long max = node->last, subtree_last; - if (node->rb.rb_left) { - subtree_last = rb_entry(node->rb.rb_left, - struct interval_tree_node, rb)->__subtree_last; - if (max < subtree_last) - max = subtree_last; - } - if (node->rb.rb_right) { - subtree_last = rb_entry(node->rb.rb_right, - struct interval_tree_node, rb)->__subtree_last; - if (max < subtree_last) - max = subtree_last; - } - return max; -} - -RB_DECLARE_CALLBACKS(static, augment_callbacks, struct interval_tree_node, rb, - unsigned long, __subtree_last, compute_subtree_last) - -/* Insert / remove interval nodes from the tree */ - -void interval_tree_insert(struct interval_tree_node *node, - struct rb_root *root) -{ - struct rb_node **link = &root->rb_node, *rb_parent = NULL; - unsigned long start = node->start, last = node->last; - struct interval_tree_node *parent; - - while (*link) { - rb_parent = *link; - parent = rb_entry(rb_parent, struct interval_tree_node, rb); - if (parent->__subtree_last < last) - parent->__subtree_last = last; - if (start < parent->start) - link = &parent->rb.rb_left; - else - link = &parent->rb.rb_right; - } - - node->__subtree_last = last; - rb_link_node(&node->rb, rb_parent, link); - rb_insert_augmented(&node->rb, root, &augment_callbacks); -} - -void interval_tree_remove(struct interval_tree_node *node, - struct rb_root *root) -{ - rb_erase_augmented(&node->rb, root, 
&augment_callbacks); -} - -/* - * Iterate over intervals intersecting [start;last] - * - * Note that a node's interval intersects [start;last] iff: - * Cond1: node->start <= last - * and - * Cond2: start <= node->last - */ - -static struct interval_tree_node * -subtree_search(struct interval_tree_node *node, - unsigned long start, unsigned long last) -{ - while (true) { - /* - * Loop invariant: start <= node->__subtree_last - * (Cond2 is satisfied by one of the subtree nodes) - */ - if (node->rb.rb_left) { - struct interval_tree_node *left = - rb_entry(node->rb.rb_left, - struct interval_tree_node, rb); - if (start <= left->__subtree_last) { - /* - * Some nodes in left subtree satisfy Cond2. - * Iterate to find the leftmost such node N. - * If it also satisfies Cond1, that's the match - * we are looking for. Otherwise, there is no - * matching interval as nodes to the right of N - * can't satisfy Cond1 either. - */ - node = left; - continue; - } - } - if (node->start <= last) { /* Cond1 */ - if (start <= node->last) /* Cond2 */ - return node; /* node is leftmost match */ - if (node->rb.rb_right) { - node = rb_entry(node->rb.rb_right, - struct interval_tree_node, rb); - if (start <= node->__subtree_last) - continue; - } - } - return NULL; /* No match */ - } -} - -struct interval_tree_node * -interval_tree_iter_first(struct rb_root *root, - unsigned long start, unsigned long last) -{ - struct interval_tree_node *node; - - if (!root->rb_node) - return NULL; - node = rb_entry(root->rb_node, struct interval_tree_node, rb); - if (node->__subtree_last < start) - return NULL; - return subtree_search(node, start, last); -} - -struct interval_tree_node * -interval_tree_iter_next(struct interval_tree_node *node, - unsigned long start, unsigned long last) -{ - struct rb_node *rb = node->rb.rb_right, *prev; - - while (true) { - /* - * Loop invariants: - * Cond1: node->start <= last - * rb == node->rb.rb_right - * - * First, search right subtree if suitable - */ - if (rb) { - 
struct interval_tree_node *right = - rb_entry(rb, struct interval_tree_node, rb); - if (start <= right->__subtree_last) - return subtree_search(right, start, last); - } - - /* Move up the tree until we come from a node's left child */ - do { - rb = rb_parent(&node->rb); - if (!rb) - return NULL; - prev = &node->rb; - node = rb_entry(rb, struct interval_tree_node, rb); - rb = node->rb.rb_right; - } while (prev == rb); - - /* Check if the node intersects [start;last] */ - if (last < node->start) /* !Cond1 */ - return NULL; - else if (start <= node->last) /* Cond2 */ - return node; - } -} +#define ITSTRUCT struct interval_tree_node +#define ITRB rb +#define ITTYPE unsigned long +#define ITSUBTREE __subtree_last +#define ITSTART(n) ((n)->start) +#define ITLAST(n) ((n)->last) +#define ITSTATIC +#define ITPREFIX interval_tree + +#include diff --git a/lib/prio_tree.c b/lib/prio_tree.c index 4e0d2edff2b4..bba37148c15e 100644 --- a/lib/prio_tree.c +++ b/lib/prio_tree.c @@ -44,27 +44,12 @@ * The following macros are used for implementing prio_tree for i_mmap */ -#define RADIX_INDEX(vma) ((vma)->vm_pgoff) -#define VMA_SIZE(vma) (((vma)->vm_end - (vma)->vm_start) >> PAGE_SHIFT) -/* avoid overflow */ -#define HEAP_INDEX(vma) ((vma)->vm_pgoff + (VMA_SIZE(vma) - 1)) - - static void get_index(const struct prio_tree_root *root, const struct prio_tree_node *node, unsigned long *radix, unsigned long *heap) { - if (root->raw) { - struct vm_area_struct *vma = prio_tree_entry( - node, struct vm_area_struct, shared.prio_tree_node); - - *radix = RADIX_INDEX(vma); - *heap = HEAP_INDEX(vma); - } - else { - *radix = node->start; - *heap = node->last; - } + *radix = node->start; + *heap = node->last; } static unsigned long index_bits_to_maxindex[BITS_PER_LONG]; diff --git a/mm/Makefile b/mm/Makefile index 92753e2d82da..6b025f80af34 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -14,9 +14,9 @@ endif obj-y := filemap.o mempool.o oom_kill.o fadvise.o \ maccess.o page_alloc.o page-writeback.o \ 
readahead.o swap.o truncate.o vmscan.o shmem.o \ - prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \ + util.o mmzone.o vmstat.o backing-dev.o \ mm_init.o mmu_context.o percpu.o slab_common.o \ - compaction.o $(mmu-y) + compaction.o interval_tree.o $(mmu-y) obj-y += init-mm.o diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c index 91750227a191..a52daee11d3f 100644 --- a/mm/filemap_xip.c +++ b/mm/filemap_xip.c @@ -167,7 +167,6 @@ __xip_unmap (struct address_space * mapping, { struct vm_area_struct *vma; struct mm_struct *mm; - struct prio_tree_iter iter; unsigned long address; pte_t *pte; pte_t pteval; @@ -184,7 +183,7 @@ __xip_unmap (struct address_space * mapping, retry: mutex_lock(&mapping->i_mmap_mutex); - vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { + vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { mm = vma->vm_mm; address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); diff --git a/mm/fremap.c b/mm/fremap.c index 3d731a498788..3899a86851ce 100644 --- a/mm/fremap.c +++ b/mm/fremap.c @@ -214,7 +214,7 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size, mutex_lock(&mapping->i_mmap_mutex); flush_dcache_mmap_lock(mapping); vma->vm_flags |= VM_NONLINEAR; - vma_prio_tree_remove(vma, &mapping->i_mmap); + vma_interval_tree_remove(vma, &mapping->i_mmap); vma_nonlinear_insert(vma, &mapping->i_mmap_nonlinear); flush_dcache_mmap_unlock(mapping); mutex_unlock(&mapping->i_mmap_mutex); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index f1bb534254f6..c9b40e3a9936 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2474,7 +2474,6 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma, struct hstate *h = hstate_vma(vma); struct vm_area_struct *iter_vma; struct address_space *mapping; - struct prio_tree_iter iter; pgoff_t pgoff; /* @@ -2491,7 +2490,7 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma, * __unmap_hugepage_range() is called as the lock is 
already held */ mutex_lock(&mapping->i_mmap_mutex); - vma_prio_tree_foreach(iter_vma, &iter, &mapping->i_mmap, pgoff, pgoff) { + vma_interval_tree_foreach(iter_vma, &mapping->i_mmap, pgoff, pgoff) { /* Do not unmap the current VMA */ if (iter_vma == vma) continue; diff --git a/mm/interval_tree.c b/mm/interval_tree.c new file mode 100644 index 000000000000..7dc565660e56 --- /dev/null +++ b/mm/interval_tree.c @@ -0,0 +1,61 @@ +/* + * mm/interval_tree.c - interval tree for mapping->i_mmap + * + * Copyright (C) 2012, Michel Lespinasse + * + * This file is released under the GPL v2. + */ + +#include +#include + +#define ITSTRUCT struct vm_area_struct +#define ITRB shared.linear.rb +#define ITTYPE unsigned long +#define ITSUBTREE shared.linear.rb_subtree_last +#define ITSTART(n) ((n)->vm_pgoff) +#define ITLAST(n) ((n)->vm_pgoff + \ + (((n)->vm_end - (n)->vm_start) >> PAGE_SHIFT) - 1) +#define ITSTATIC +#define ITPREFIX vma_interval_tree + +#include + +/* Insert vma immediately after old in the interval tree */ +void vma_interval_tree_add(struct vm_area_struct *vma, + struct vm_area_struct *old, + struct address_space *mapping) +{ + struct rb_node **link; + struct vm_area_struct *parent; + unsigned long last; + + if (unlikely(vma->vm_flags & VM_NONLINEAR)) { + list_add(&vma->shared.nonlinear, &old->shared.nonlinear); + return; + } + + last = ITLAST(vma); + + if (!old->shared.linear.rb.rb_right) { + parent = old; + link = &old->shared.linear.rb.rb_right; + } else { + parent = rb_entry(old->shared.linear.rb.rb_right, + struct vm_area_struct, shared.linear.rb); + if (parent->shared.linear.rb_subtree_last < last) + parent->shared.linear.rb_subtree_last = last; + while (parent->shared.linear.rb.rb_left) { + parent = rb_entry(parent->shared.linear.rb.rb_left, + struct vm_area_struct, shared.linear.rb); + if (parent->shared.linear.rb_subtree_last < last) + parent->shared.linear.rb_subtree_last = last; + } + link = &parent->shared.linear.rb.rb_left; + } + +
vma->shared.linear.rb_subtree_last = last; + rb_link_node(&vma->shared.linear.rb, &parent->shared.linear.rb, link); + rb_insert_augmented(&vma->shared.linear.rb, &mapping->i_mmap, + &vma_interval_tree_augment_callbacks); +} diff --git a/mm/memory-failure.c b/mm/memory-failure.c index a6e2141a6610..c38a6257d082 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -431,7 +431,6 @@ static void collect_procs_file(struct page *page, struct list_head *to_kill, { struct vm_area_struct *vma; struct task_struct *tsk; - struct prio_tree_iter iter; struct address_space *mapping = page->mapping; mutex_lock(&mapping->i_mmap_mutex); @@ -442,7 +441,7 @@ static void collect_procs_file(struct page *page, struct list_head *to_kill, if (!task_early_kill(tsk)) continue; - vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, + vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { /* * Send early kill signal to tasks where a vma covers diff --git a/mm/memory.c b/mm/memory.c index e09c04813186..d205e4381a34 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2801,14 +2801,13 @@ static void unmap_mapping_range_vma(struct vm_area_struct *vma, zap_page_range_single(vma, start_addr, end_addr - start_addr, details); } -static inline void unmap_mapping_range_tree(struct prio_tree_root *root, +static inline void unmap_mapping_range_tree(struct rb_root *root, struct zap_details *details) { struct vm_area_struct *vma; - struct prio_tree_iter iter; pgoff_t vba, vea, zba, zea; - vma_prio_tree_foreach(vma, &iter, root, + vma_interval_tree_foreach(vma, root, details->first_index, details->last_index) { vba = vma->vm_pgoff; @@ -2839,7 +2838,7 @@ static inline void unmap_mapping_range_list(struct list_head *head, * across *all* the pages in each nonlinear VMA, not just the pages * whose virtual address lies outside the file truncation point. 
*/ - list_for_each_entry(vma, head, shared.vm_set.list) { + list_for_each_entry(vma, head, shared.nonlinear) { details->nonlinear_vma = vma; unmap_mapping_range_vma(vma, vma->vm_start, vma->vm_end, details); } @@ -2883,7 +2882,7 @@ void unmap_mapping_range(struct address_space *mapping, mutex_lock(&mapping->i_mmap_mutex); - if (unlikely(!prio_tree_empty(&mapping->i_mmap))) + if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap))) unmap_mapping_range_tree(&mapping->i_mmap, &details); if (unlikely(!list_empty(&mapping->i_mmap_nonlinear))) unmap_mapping_range_list(&mapping->i_mmap_nonlinear, &details); diff --git a/mm/mmap.c b/mm/mmap.c index e3c365ff1b6a..5ac533f88e99 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -199,14 +199,14 @@ static void __remove_shared_vm_struct(struct vm_area_struct *vma, flush_dcache_mmap_lock(mapping); if (unlikely(vma->vm_flags & VM_NONLINEAR)) - list_del_init(&vma->shared.vm_set.list); + list_del_init(&vma->shared.nonlinear); else - vma_prio_tree_remove(vma, &mapping->i_mmap); + vma_interval_tree_remove(vma, &mapping->i_mmap); flush_dcache_mmap_unlock(mapping); } /* - * Unlink a file-based vm structure from its prio_tree, to hide + * Unlink a file-based vm structure from its interval tree, to hide * vma from rmap and vmtruncate before freeing its page tables. */ void unlink_file_vma(struct vm_area_struct *vma) @@ -411,7 +411,7 @@ static void __vma_link_file(struct vm_area_struct *vma) if (unlikely(vma->vm_flags & VM_NONLINEAR)) vma_nonlinear_insert(vma, &mapping->i_mmap_nonlinear); else - vma_prio_tree_insert(vma, &mapping->i_mmap); + vma_interval_tree_insert(vma, &mapping->i_mmap); flush_dcache_mmap_unlock(mapping); } } @@ -449,7 +449,7 @@ static void vma_link(struct mm_struct *mm, struct vm_area_struct *vma, /* * Helper for vma_adjust() in the split_vma insert case: insert a vma into the - * mm's list and rbtree. It has already been inserted into the prio_tree. + * mm's list and rbtree. It has already been inserted into the interval tree. 
*/ static void __insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma) { @@ -491,7 +491,7 @@ int vma_adjust(struct vm_area_struct *vma, unsigned long start, struct vm_area_struct *next = vma->vm_next; struct vm_area_struct *importer = NULL; struct address_space *mapping = NULL; - struct prio_tree_root *root = NULL; + struct rb_root *root = NULL; struct anon_vma *anon_vma = NULL; struct file *file = vma->vm_file; long adjust_next = 0; @@ -554,7 +554,7 @@ again: remove_next = 1 + (end > next->vm_end); mutex_lock(&mapping->i_mmap_mutex); if (insert) { /* - * Put into prio_tree now, so instantiated pages + * Put into interval tree now, so instantiated pages * are visible to arm/parisc __flush_dcache_page * throughout; but we cannot insert into address * space until vma start or end is updated. @@ -582,9 +582,9 @@ again: remove_next = 1 + (end > next->vm_end); if (root) { flush_dcache_mmap_lock(mapping); - vma_prio_tree_remove(vma, root); + vma_interval_tree_remove(vma, root); if (adjust_next) - vma_prio_tree_remove(next, root); + vma_interval_tree_remove(next, root); } vma->vm_start = start; @@ -597,8 +597,8 @@ again: remove_next = 1 + (end > next->vm_end); if (root) { if (adjust_next) - vma_prio_tree_insert(next, root); - vma_prio_tree_insert(vma, root); + vma_interval_tree_insert(next, root); + vma_interval_tree_insert(vma, root); flush_dcache_mmap_unlock(mapping); } diff --git a/mm/nommu.c b/mm/nommu.c index 12e84e69dd06..45131b41bcdb 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -698,7 +698,7 @@ static void add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma) mutex_lock(&mapping->i_mmap_mutex); flush_dcache_mmap_lock(mapping); - vma_prio_tree_insert(vma, &mapping->i_mmap); + vma_interval_tree_insert(vma, &mapping->i_mmap); flush_dcache_mmap_unlock(mapping); mutex_unlock(&mapping->i_mmap_mutex); } @@ -764,7 +764,7 @@ static void delete_vma_from_mm(struct vm_area_struct *vma) mutex_lock(&mapping->i_mmap_mutex); flush_dcache_mmap_lock(mapping); - 
vma_prio_tree_remove(vma, &mapping->i_mmap); + vma_interval_tree_remove(vma, &mapping->i_mmap); flush_dcache_mmap_unlock(mapping); mutex_unlock(&mapping->i_mmap_mutex); } @@ -2044,7 +2044,6 @@ int nommu_shrink_inode_mappings(struct inode *inode, size_t size, size_t newsize) { struct vm_area_struct *vma; - struct prio_tree_iter iter; struct vm_region *region; pgoff_t low, high; size_t r_size, r_top; @@ -2056,8 +2055,7 @@ int nommu_shrink_inode_mappings(struct inode *inode, size_t size, mutex_lock(&inode->i_mapping->i_mmap_mutex); /* search for VMAs that fall within the dead zone */ - vma_prio_tree_foreach(vma, &iter, &inode->i_mapping->i_mmap, - low, high) { + vma_interval_tree_foreach(vma, &inode->i_mapping->i_mmap, low, high) { /* found one - only interested if it's shared out of the page * cache */ if (vma->vm_flags & VM_SHARED) { @@ -2073,8 +2071,8 @@ int nommu_shrink_inode_mappings(struct inode *inode, size_t size, * we don't check for any regions that start beyond the EOF as there * shouldn't be any */ - vma_prio_tree_foreach(vma, &iter, &inode->i_mapping->i_mmap, - 0, ULONG_MAX) { + vma_interval_tree_foreach(vma, &inode->i_mapping->i_mmap, + 0, ULONG_MAX) { if (!(vma->vm_flags & VM_SHARED)) continue; diff --git a/mm/prio_tree.c b/mm/prio_tree.c deleted file mode 100644 index 799dcfd7cd8c..000000000000 --- a/mm/prio_tree.c +++ /dev/null @@ -1,208 +0,0 @@ -/* - * mm/prio_tree.c - priority search tree for mapping->i_mmap - * - * Copyright (C) 2004, Rajesh Venkatasubramanian - * - * This file is released under the GPL v2. - * - * Based on the radix priority search tree proposed by Edward M. McCreight - * SIAM Journal of Computing, vol. 14, no.2, pages 257-276, May 1985 - * - * 02Feb2004 Initial version - */ - -#include -#include -#include - -/* - * See lib/prio_tree.c for details on the general radix priority search tree - * code. - */ - -/* - * The following #defines are mirrored from lib/prio_tree.c. 
They're only used - * for debugging, and should be removed (along with the debugging code using - * them) when switching also VMAs to the regular prio_tree code. - */ - -#define RADIX_INDEX(vma) ((vma)->vm_pgoff) -#define VMA_SIZE(vma) (((vma)->vm_end - (vma)->vm_start) >> PAGE_SHIFT) -/* avoid overflow */ -#define HEAP_INDEX(vma) ((vma)->vm_pgoff + (VMA_SIZE(vma) - 1)) - -/* - * Radix priority search tree for address_space->i_mmap - * - * For each vma that map a unique set of file pages i.e., unique [radix_index, - * heap_index] value, we have a corresponding priority search tree node. If - * multiple vmas have identical [radix_index, heap_index] value, then one of - * them is used as a tree node and others are stored in a vm_set list. The tree - * node points to the first vma (head) of the list using vm_set.head. - * - * prio_tree_root - * | - * A vm_set.head - * / \ / - * L R -> H-I-J-K-M-N-O-P-Q-S - * ^ ^ <-- vm_set.list --> - * tree nodes - * - * We need some way to identify whether a vma is a tree node, head of a vm_set - * list, or just a member of a vm_set list. We cannot use vm_flags to store - * such information. The reason is, in the above figure, it is possible that - * vm_flags' of R and H are covered by the different mmap_sems. When R is - * removed under R->mmap_sem, H replaces R as a tree node. Since we do not hold - * H->mmap_sem, we cannot use H->vm_flags for marking that H is a tree node now. - * That's why some trick involving shared.vm_set.parent is used for identifying - * tree nodes and list head nodes. 
- * - * vma radix priority search tree node rules: - * - * vma->shared.vm_set.parent != NULL ==> a tree node - * vma->shared.vm_set.head != NULL ==> list of others mapping same range - * vma->shared.vm_set.head == NULL ==> no others map the same range - * - * vma->shared.vm_set.parent == NULL - * vma->shared.vm_set.head != NULL ==> list head of vmas mapping same range - * vma->shared.vm_set.head == NULL ==> a list node - */ - -/* - * Add a new vma known to map the same set of pages as the old vma: - * useful for fork's dup_mmap as well as vma_prio_tree_insert below. - * Note that it just happens to work correctly on i_mmap_nonlinear too. - */ -void vma_prio_tree_add(struct vm_area_struct *vma, struct vm_area_struct *old) -{ - /* Leave these BUG_ONs till prio_tree patch stabilizes */ - BUG_ON(RADIX_INDEX(vma) != RADIX_INDEX(old)); - BUG_ON(HEAP_INDEX(vma) != HEAP_INDEX(old)); - - vma->shared.vm_set.head = NULL; - vma->shared.vm_set.parent = NULL; - - if (!old->shared.vm_set.parent) - list_add(&vma->shared.vm_set.list, - &old->shared.vm_set.list); - else if (old->shared.vm_set.head) - list_add_tail(&vma->shared.vm_set.list, - &old->shared.vm_set.head->shared.vm_set.list); - else { - INIT_LIST_HEAD(&vma->shared.vm_set.list); - vma->shared.vm_set.head = old; - old->shared.vm_set.head = vma; - } -} - -void vma_prio_tree_insert(struct vm_area_struct *vma, - struct prio_tree_root *root) -{ - struct prio_tree_node *ptr; - struct vm_area_struct *old; - - vma->shared.vm_set.head = NULL; - - ptr = raw_prio_tree_insert(root, &vma->shared.prio_tree_node); - if (ptr != (struct prio_tree_node *) &vma->shared.prio_tree_node) { - old = prio_tree_entry(ptr, struct vm_area_struct, - shared.prio_tree_node); - vma_prio_tree_add(vma, old); - } -} - -void vma_prio_tree_remove(struct vm_area_struct *vma, - struct prio_tree_root *root) -{ - struct vm_area_struct *node, *head, *new_head; - - if (!vma->shared.vm_set.head) { - if (!vma->shared.vm_set.parent) - 
list_del_init(&vma->shared.vm_set.list); - else - raw_prio_tree_remove(root, &vma->shared.prio_tree_node); - } else { - /* Leave this BUG_ON till prio_tree patch stabilizes */ - BUG_ON(vma->shared.vm_set.head->shared.vm_set.head != vma); - if (vma->shared.vm_set.parent) { - head = vma->shared.vm_set.head; - if (!list_empty(&head->shared.vm_set.list)) { - new_head = list_entry( - head->shared.vm_set.list.next, - struct vm_area_struct, - shared.vm_set.list); - list_del_init(&head->shared.vm_set.list); - } else - new_head = NULL; - - raw_prio_tree_replace(root, &vma->shared.prio_tree_node, - &head->shared.prio_tree_node); - head->shared.vm_set.head = new_head; - if (new_head) - new_head->shared.vm_set.head = head; - - } else { - node = vma->shared.vm_set.head; - if (!list_empty(&vma->shared.vm_set.list)) { - new_head = list_entry( - vma->shared.vm_set.list.next, - struct vm_area_struct, - shared.vm_set.list); - list_del_init(&vma->shared.vm_set.list); - node->shared.vm_set.head = new_head; - new_head->shared.vm_set.head = node; - } else - node->shared.vm_set.head = NULL; - } - } -} - -/* - * Helper function to enumerate vmas that map a given file page or a set of - * contiguous file pages. The function returns vmas that at least map a single - * page in the given range of contiguous file pages. 
- */ -struct vm_area_struct *vma_prio_tree_next(struct vm_area_struct *vma, - struct prio_tree_iter *iter) -{ - struct prio_tree_node *ptr; - struct vm_area_struct *next; - - if (!vma) { - /* - * First call is with NULL vma - */ - ptr = prio_tree_next(iter); - if (ptr) { - next = prio_tree_entry(ptr, struct vm_area_struct, - shared.prio_tree_node); - prefetch(next->shared.vm_set.head); - return next; - } else - return NULL; - } - - if (vma->shared.vm_set.parent) { - if (vma->shared.vm_set.head) { - next = vma->shared.vm_set.head; - prefetch(next->shared.vm_set.list.next); - return next; - } - } else { - next = list_entry(vma->shared.vm_set.list.next, - struct vm_area_struct, shared.vm_set.list); - if (!next->shared.vm_set.head) { - prefetch(next->shared.vm_set.list.next); - return next; - } - } - - ptr = prio_tree_next(iter); - if (ptr) { - next = prio_tree_entry(ptr, struct vm_area_struct, - shared.prio_tree_node); - prefetch(next->shared.vm_set.head); - return next; - } else - return NULL; -} diff --git a/mm/rmap.c b/mm/rmap.c index 0f3b7cda2a24..7b5b51d25fc5 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -820,7 +820,6 @@ static int page_referenced_file(struct page *page, struct address_space *mapping = page->mapping; pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); struct vm_area_struct *vma; - struct prio_tree_iter iter; int referenced = 0; /* @@ -846,7 +845,7 @@ static int page_referenced_file(struct page *page, */ mapcount = page_mapcount(page); - vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { + vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { unsigned long address = vma_address(page, vma); if (address == -EFAULT) continue; @@ -945,13 +944,12 @@ static int page_mkclean_file(struct address_space *mapping, struct page *page) { pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); struct vm_area_struct *vma; - struct prio_tree_iter iter; int ret = 0; BUG_ON(PageAnon(page)); 
mutex_lock(&mapping->i_mmap_mutex); - vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { + vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { if (vma->vm_flags & VM_SHARED) { unsigned long address = vma_address(page, vma); if (address == -EFAULT) @@ -1547,7 +1545,6 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags) struct address_space *mapping = page->mapping; pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); struct vm_area_struct *vma; - struct prio_tree_iter iter; int ret = SWAP_AGAIN; unsigned long cursor; unsigned long max_nl_cursor = 0; @@ -1555,7 +1552,7 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags) unsigned int mapcount; mutex_lock(&mapping->i_mmap_mutex); - vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { + vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { unsigned long address = vma_address(page, vma); if (address == -EFAULT) continue; @@ -1576,7 +1573,7 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags) goto out; list_for_each_entry(vma, &mapping->i_mmap_nonlinear, - shared.vm_set.list) { + shared.nonlinear) { cursor = (unsigned long) vma->vm_private_data; if (cursor > max_nl_cursor) max_nl_cursor = cursor; @@ -1608,7 +1605,7 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags) do { list_for_each_entry(vma, &mapping->i_mmap_nonlinear, - shared.vm_set.list) { + shared.nonlinear) { cursor = (unsigned long) vma->vm_private_data; while ( cursor < max_nl_cursor && cursor < vma->vm_end - vma->vm_start) { @@ -1631,7 +1628,7 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags) * in locked vmas). Reset cursor on all unreserved nonlinear * vmas, now forgetting on which ones it had fallen behind. 
*/ - list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.vm_set.list) + list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.nonlinear) vma->vm_private_data = NULL; out: mutex_unlock(&mapping->i_mmap_mutex); @@ -1748,13 +1745,12 @@ static int rmap_walk_file(struct page *page, int (*rmap_one)(struct page *, struct address_space *mapping = page->mapping; pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); struct vm_area_struct *vma; - struct prio_tree_iter iter; int ret = SWAP_AGAIN; if (!mapping) return ret; mutex_lock(&mapping->i_mmap_mutex); - vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { + vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { unsigned long address = vma_address(page, vma); if (address == -EFAULT) continue; -- cgit v1.2.3 From 9c079add0d0f45220f4bb37febf0621137ec2d38 Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Mon, 8 Oct 2012 16:31:33 -0700 Subject: rbtree: move augmented rbtree functionality to rbtree_augmented.h Provide rb_insert_augmented() and rb_erase_augmented() through a new rbtree_augmented.h include file. rb_erase_augmented() is defined there as an __always_inline function, in order to allow inlining of augmented rbtree callbacks into it. Since this generates a relatively large function, each augmented rbtree user should make sure to have a single call site. 
Signed-off-by: Michel Lespinasse Cc: Rik van Riel Cc: Hillf Danton Cc: Peter Zijlstra Cc: Catalin Marinas Cc: Andrea Arcangeli Cc: David Woodhouse Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/rbtree.txt | 13 +++ arch/x86/mm/pat_rbtree.c | 2 +- include/linux/interval_tree_tmpl.h | 8 +- include/linux/rbtree.h | 48 -------- include/linux/rbtree_augmented.h | 223 +++++++++++++++++++++++++++++++++++++ lib/rbtree.c | 162 ++------------------------- lib/rbtree_test.c | 2 +- 7 files changed, 255 insertions(+), 203 deletions(-) create mode 100644 include/linux/rbtree_augmented.h (limited to 'arch/x86') diff --git a/Documentation/rbtree.txt b/Documentation/rbtree.txt index 0a0b6dce3e08..61b6c48871a0 100644 --- a/Documentation/rbtree.txt +++ b/Documentation/rbtree.txt @@ -202,6 +202,14 @@ An rbtree user who wants this feature will have to call the augmentation functions with the user provided augmentation callback when inserting and erasing nodes. +C files implementing augmented rbtree manipulation must include +<linux/rbtree_augmented.h> instead of <linux/rbtree.h>. Note that +linux/rbtree_augmented.h exposes some rbtree implementations details +you are not expected to rely on; please stick to the documented APIs +there and do not include <linux/rbtree_augmented.h> from header files +either so as to minimize chances of your users accidentally relying on +such implementation details. + On insertion, the user must update the augmented information on the path leading to the inserted node, then call rb_link_node() as usual and rb_augment_inserted() instead of the usual rb_insert_color() call. @@ -227,6 +235,11 @@ In both cases, the callbacks are provided through struct rb_augment_callbacks. subtree to a newly assigned subtree root AND recomputes the augmented information for the former subtree root.
+The compiled code for rb_erase_augmented() may inline the propagation and +copy callbacks, which results in a large function, so each augmented rbtree +user should have a single rb_erase_augmented() call site in order to limit +compiled code size. + Sample usage: diff --git a/arch/x86/mm/pat_rbtree.c b/arch/x86/mm/pat_rbtree.c index 4d116959075d..415f6c4ced36 100644 --- a/arch/x86/mm/pat_rbtree.c +++ b/arch/x86/mm/pat_rbtree.c @@ -12,7 +12,7 @@ #include #include #include -#include +#include #include #include diff --git a/include/linux/interval_tree_tmpl.h b/include/linux/interval_tree_tmpl.h index c65deda31413..c1aeb922d65f 100644 --- a/include/linux/interval_tree_tmpl.h +++ b/include/linux/interval_tree_tmpl.h @@ -19,6 +19,8 @@ include/linux/interval_tree_tmpl.h */ +#include + /* * Template for implementing interval trees * @@ -57,7 +59,8 @@ static inline ITTYPE IT(compute_subtree_last)(ITSTRUCT *node) return max; } -static void IT(augment_propagate)(struct rb_node *rb, struct rb_node *stop) +static inline void +IT(augment_propagate)(struct rb_node *rb, struct rb_node *stop) { while (rb != stop) { ITSTRUCT *node = rb_entry(rb, ITSTRUCT, ITRB); @@ -69,7 +72,8 @@ static void IT(augment_propagate)(struct rb_node *rb, struct rb_node *stop) } } -static void IT(augment_copy)(struct rb_node *rb_old, struct rb_node *rb_new) +static inline void +IT(augment_copy)(struct rb_node *rb_old, struct rb_node *rb_new) { ITSTRUCT *old = rb_entry(rb_old, ITSTRUCT, ITRB); ITSTRUCT *new = rb_entry(rb_new, ITSTRUCT, ITRB); diff --git a/include/linux/rbtree.h b/include/linux/rbtree.h index 8d1e83b1c87b..0022c1bb1e26 100644 --- a/include/linux/rbtree.h +++ b/include/linux/rbtree.h @@ -62,54 +62,6 @@ extern void rb_insert_color(struct rb_node *, struct rb_root *); extern void rb_erase(struct rb_node *, struct rb_root *); -struct rb_augment_callbacks { - void (*propagate)(struct rb_node *node, struct rb_node *stop); - void (*copy)(struct rb_node *old, struct rb_node *new); - void 
(*rotate)(struct rb_node *old, struct rb_node *new); -}; - -extern void __rb_insert_augmented(struct rb_node *node, struct rb_root *root, - void (*augment_rotate)(struct rb_node *old, struct rb_node *new)); -extern void rb_erase_augmented(struct rb_node *node, struct rb_root *root, - const struct rb_augment_callbacks *augment); -static inline void -rb_insert_augmented(struct rb_node *node, struct rb_root *root, - const struct rb_augment_callbacks *augment) -{ - __rb_insert_augmented(node, root, augment->rotate); -} - -#define RB_DECLARE_CALLBACKS(rbstatic, rbname, rbstruct, rbfield, \ - rbtype, rbaugmented, rbcompute) \ -static void rbname ## _propagate(struct rb_node *rb, struct rb_node *stop) \ -{ \ - while (rb != stop) { \ - rbstruct *node = rb_entry(rb, rbstruct, rbfield); \ - rbtype augmented = rbcompute(node); \ - if (node->rbaugmented == augmented) \ - break; \ - node->rbaugmented = augmented; \ - rb = rb_parent(&node->rbfield); \ - } \ -} \ -static void rbname ## _copy(struct rb_node *rb_old, struct rb_node *rb_new) \ -{ \ - rbstruct *old = rb_entry(rb_old, rbstruct, rbfield); \ - rbstruct *new = rb_entry(rb_new, rbstruct, rbfield); \ - new->rbaugmented = old->rbaugmented; \ -} \ -static void rbname ## _rotate(struct rb_node *rb_old, struct rb_node *rb_new) \ -{ \ - rbstruct *old = rb_entry(rb_old, rbstruct, rbfield); \ - rbstruct *new = rb_entry(rb_new, rbstruct, rbfield); \ - new->rbaugmented = old->rbaugmented; \ - old->rbaugmented = rbcompute(old); \ -} \ -rbstatic const struct rb_augment_callbacks rbname = { \ - rbname ## _propagate, rbname ## _copy, rbname ## _rotate \ -}; - - /* Find logical next and previous nodes in a tree */ extern struct rb_node *rb_next(const struct rb_node *); extern struct rb_node *rb_prev(const struct rb_node *); diff --git a/include/linux/rbtree_augmented.h b/include/linux/rbtree_augmented.h new file mode 100644 index 000000000000..214caa33433b --- /dev/null +++ b/include/linux/rbtree_augmented.h @@ -0,0 +1,223 @@ +/* + Red 
Black Trees + (C) 1999 Andrea Arcangeli + (C) 2002 David Woodhouse + (C) 2012 Michel Lespinasse + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + + linux/include/linux/rbtree_augmented.h +*/ + +#ifndef _LINUX_RBTREE_AUGMENTED_H +#define _LINUX_RBTREE_AUGMENTED_H + +#include + +/* + * Please note - only struct rb_augment_callbacks and the prototypes for + * rb_insert_augmented() and rb_erase_augmented() are intended to be public. + * The rest are implementation details you are not expected to depend on. + * + * See Documentation/rbtree.txt for documentation and samples. 
+ */ + +struct rb_augment_callbacks { + void (*propagate)(struct rb_node *node, struct rb_node *stop); + void (*copy)(struct rb_node *old, struct rb_node *new); + void (*rotate)(struct rb_node *old, struct rb_node *new); +}; + +extern void __rb_insert_augmented(struct rb_node *node, struct rb_root *root, + void (*augment_rotate)(struct rb_node *old, struct rb_node *new)); +static inline void +rb_insert_augmented(struct rb_node *node, struct rb_root *root, + const struct rb_augment_callbacks *augment) +{ + __rb_insert_augmented(node, root, augment->rotate); +} + +#define RB_DECLARE_CALLBACKS(rbstatic, rbname, rbstruct, rbfield, \ + rbtype, rbaugmented, rbcompute) \ +static inline void \ +rbname ## _propagate(struct rb_node *rb, struct rb_node *stop) \ +{ \ + while (rb != stop) { \ + rbstruct *node = rb_entry(rb, rbstruct, rbfield); \ + rbtype augmented = rbcompute(node); \ + if (node->rbaugmented == augmented) \ + break; \ + node->rbaugmented = augmented; \ + rb = rb_parent(&node->rbfield); \ + } \ +} \ +static inline void \ +rbname ## _copy(struct rb_node *rb_old, struct rb_node *rb_new) \ +{ \ + rbstruct *old = rb_entry(rb_old, rbstruct, rbfield); \ + rbstruct *new = rb_entry(rb_new, rbstruct, rbfield); \ + new->rbaugmented = old->rbaugmented; \ +} \ +static void \ +rbname ## _rotate(struct rb_node *rb_old, struct rb_node *rb_new) \ +{ \ + rbstruct *old = rb_entry(rb_old, rbstruct, rbfield); \ + rbstruct *new = rb_entry(rb_new, rbstruct, rbfield); \ + new->rbaugmented = old->rbaugmented; \ + old->rbaugmented = rbcompute(old); \ +} \ +rbstatic const struct rb_augment_callbacks rbname = { \ + rbname ## _propagate, rbname ## _copy, rbname ## _rotate \ +}; + + +#define RB_RED 0 +#define RB_BLACK 1 + +#define __rb_parent(pc) ((struct rb_node *)(pc & ~3)) + +#define __rb_color(pc) ((pc) & 1) +#define __rb_is_black(pc) __rb_color(pc) +#define __rb_is_red(pc) (!__rb_color(pc)) +#define rb_color(rb) __rb_color((rb)->__rb_parent_color) +#define rb_is_red(rb) 
__rb_is_red((rb)->__rb_parent_color) +#define rb_is_black(rb) __rb_is_black((rb)->__rb_parent_color) + +static inline void rb_set_parent(struct rb_node *rb, struct rb_node *p) +{ + rb->__rb_parent_color = rb_color(rb) | (unsigned long)p; +} + +static inline void rb_set_parent_color(struct rb_node *rb, + struct rb_node *p, int color) +{ + rb->__rb_parent_color = (unsigned long)p | color; +} + +static inline void +__rb_change_child(struct rb_node *old, struct rb_node *new, + struct rb_node *parent, struct rb_root *root) +{ + if (parent) { + if (parent->rb_left == old) + parent->rb_left = new; + else + parent->rb_right = new; + } else + root->rb_node = new; +} + +extern void __rb_erase_color(struct rb_node *parent, struct rb_root *root, + void (*augment_rotate)(struct rb_node *old, struct rb_node *new)); + +static __always_inline void +rb_erase_augmented(struct rb_node *node, struct rb_root *root, + const struct rb_augment_callbacks *augment) +{ + struct rb_node *child = node->rb_right, *tmp = node->rb_left; + struct rb_node *parent, *rebalance; + unsigned long pc; + + if (!tmp) { + /* + * Case 1: node to erase has no more than 1 child (easy!) + * + * Note that if there is one child it must be red due to 5) + * and node must be black due to 4). We adjust colors locally + * so as to bypass __rb_erase_color() later on. + */ + pc = node->__rb_parent_color; + parent = __rb_parent(pc); + __rb_change_child(node, child, parent, root); + if (child) { + child->__rb_parent_color = pc; + rebalance = NULL; + } else + rebalance = __rb_is_black(pc) ? 
parent : NULL; + tmp = parent; + } else if (!child) { + /* Still case 1, but this time the child is node->rb_left */ + tmp->__rb_parent_color = pc = node->__rb_parent_color; + parent = __rb_parent(pc); + __rb_change_child(node, tmp, parent, root); + rebalance = NULL; + tmp = parent; + } else { + struct rb_node *successor = child, *child2; + tmp = child->rb_left; + if (!tmp) { + /* + * Case 2: node's successor is its right child + * + * (n) (s) + * / \ / \ + * (x) (s) -> (x) (c) + * \ + * (c) + */ + parent = successor; + child2 = successor->rb_right; + augment->copy(node, successor); + } else { + /* + * Case 3: node's successor is leftmost under + * node's right child subtree + * + * (n) (s) + * / \ / \ + * (x) (y) -> (x) (y) + * / / + * (p) (p) + * / / + * (s) (c) + * \ + * (c) + */ + do { + parent = successor; + successor = tmp; + tmp = tmp->rb_left; + } while (tmp); + parent->rb_left = child2 = successor->rb_right; + successor->rb_right = child; + rb_set_parent(child, successor); + augment->copy(node, successor); + augment->propagate(parent, successor); + } + + successor->rb_left = tmp = node->rb_left; + rb_set_parent(tmp, successor); + + pc = node->__rb_parent_color; + tmp = __rb_parent(pc); + __rb_change_child(node, successor, tmp, root); + if (child2) { + successor->__rb_parent_color = pc; + rb_set_parent_color(child2, parent, RB_BLACK); + rebalance = NULL; + } else { + unsigned long pc2 = successor->__rb_parent_color; + successor->__rb_parent_color = pc; + rebalance = __rb_is_black(pc2) ? parent : NULL; + } + tmp = successor; + } + + augment->propagate(tmp, NULL); + if (rebalance) + __rb_erase_color(rebalance, root, augment->rotate); +} + +#endif /* _LINUX_RBTREE_AUGMENTED_H */ diff --git a/lib/rbtree.c b/lib/rbtree.c index c0088ca345f9..4f56a11d67fa 100644 --- a/lib/rbtree.c +++ b/lib/rbtree.c @@ -21,7 +21,7 @@ linux/lib/rbtree.c */ -#include +#include #include /* @@ -44,52 +44,16 @@ * parentheses and have some accompanying text comment. 
*/ -#define RB_RED 0 -#define RB_BLACK 1 - -#define __rb_parent(pc) ((struct rb_node *)(pc & ~3)) - -#define __rb_color(pc) ((pc) & 1) -#define __rb_is_black(pc) __rb_color(pc) -#define __rb_is_red(pc) (!__rb_color(pc)) -#define rb_color(rb) __rb_color((rb)->__rb_parent_color) -#define rb_is_red(rb) __rb_is_red((rb)->__rb_parent_color) -#define rb_is_black(rb) __rb_is_black((rb)->__rb_parent_color) - static inline void rb_set_black(struct rb_node *rb) { rb->__rb_parent_color |= RB_BLACK; } -static inline void rb_set_parent(struct rb_node *rb, struct rb_node *p) -{ - rb->__rb_parent_color = rb_color(rb) | (unsigned long)p; -} - -static inline void rb_set_parent_color(struct rb_node *rb, - struct rb_node *p, int color) -{ - rb->__rb_parent_color = (unsigned long)p | color; -} - static inline struct rb_node *rb_red_parent(struct rb_node *red) { return (struct rb_node *)red->__rb_parent_color; } -static inline void -__rb_change_child(struct rb_node *old, struct rb_node *new, - struct rb_node *parent, struct rb_root *root) -{ - if (parent) { - if (parent->rb_left == old) - parent->rb_left = new; - else - parent->rb_right = new; - } else - root->rb_node = new; -} - /* * Helper function for rotations: * - old's parent and color get assigned to new @@ -230,9 +194,9 @@ __rb_insert(struct rb_node *node, struct rb_root *root, } } -static __always_inline void +__always_inline void __rb_erase_color(struct rb_node *parent, struct rb_root *root, - const struct rb_augment_callbacks *augment) + void (*augment_rotate)(struct rb_node *old, struct rb_node *new)) { struct rb_node *node = NULL, *sibling, *tmp1, *tmp2; @@ -261,7 +225,7 @@ __rb_erase_color(struct rb_node *parent, struct rb_root *root, rb_set_parent_color(tmp1, parent, RB_BLACK); __rb_rotate_set_parents(parent, sibling, root, RB_RED); - augment->rotate(parent, sibling); + augment_rotate(parent, sibling); sibling = tmp1; } tmp1 = sibling->rb_right; @@ -313,7 +277,7 @@ __rb_erase_color(struct rb_node *parent, struct rb_root 
*root, if (tmp1) rb_set_parent_color(tmp1, sibling, RB_BLACK); - augment->rotate(sibling, tmp2); + augment_rotate(sibling, tmp2); tmp1 = sibling; sibling = tmp2; } @@ -336,7 +300,7 @@ __rb_erase_color(struct rb_node *parent, struct rb_root *root, rb_set_parent(tmp2, parent); __rb_rotate_set_parents(parent, sibling, root, RB_BLACK); - augment->rotate(parent, sibling); + augment_rotate(parent, sibling); break; } else { sibling = parent->rb_left; @@ -347,7 +311,7 @@ __rb_erase_color(struct rb_node *parent, struct rb_root *root, rb_set_parent_color(tmp1, parent, RB_BLACK); __rb_rotate_set_parents(parent, sibling, root, RB_RED); - augment->rotate(parent, sibling); + augment_rotate(parent, sibling); sibling = tmp1; } tmp1 = sibling->rb_left; @@ -374,7 +338,7 @@ __rb_erase_color(struct rb_node *parent, struct rb_root *root, if (tmp1) rb_set_parent_color(tmp1, sibling, RB_BLACK); - augment->rotate(sibling, tmp2); + augment_rotate(sibling, tmp2); tmp1 = sibling; sibling = tmp2; } @@ -386,109 +350,12 @@ __rb_erase_color(struct rb_node *parent, struct rb_root *root, rb_set_parent(tmp2, parent); __rb_rotate_set_parents(parent, sibling, root, RB_BLACK); - augment->rotate(parent, sibling); + augment_rotate(parent, sibling); break; } } } - -static __always_inline void -__rb_erase(struct rb_node *node, struct rb_root *root, - const struct rb_augment_callbacks *augment) -{ - struct rb_node *child = node->rb_right, *tmp = node->rb_left; - struct rb_node *parent, *rebalance; - unsigned long pc; - - if (!tmp) { - /* - * Case 1: node to erase has no more than 1 child (easy!) - * - * Note that if there is one child it must be red due to 5) - * and node must be black due to 4). We adjust colors locally - * so as to bypass __rb_erase_color() later on. - */ - pc = node->__rb_parent_color; - parent = __rb_parent(pc); - __rb_change_child(node, child, parent, root); - if (child) { - child->__rb_parent_color = pc; - rebalance = NULL; - } else - rebalance = __rb_is_black(pc) ? 
parent : NULL; - tmp = parent; - } else if (!child) { - /* Still case 1, but this time the child is node->rb_left */ - tmp->__rb_parent_color = pc = node->__rb_parent_color; - parent = __rb_parent(pc); - __rb_change_child(node, tmp, parent, root); - rebalance = NULL; - tmp = parent; - } else { - struct rb_node *successor = child, *child2; - tmp = child->rb_left; - if (!tmp) { - /* - * Case 2: node's successor is its right child - * - * (n) (s) - * / \ / \ - * (x) (s) -> (x) (c) - * \ - * (c) - */ - parent = successor; - child2 = successor->rb_right; - augment->copy(node, successor); - } else { - /* - * Case 3: node's successor is leftmost under - * node's right child subtree - * - * (n) (s) - * / \ / \ - * (x) (y) -> (x) (y) - * / / - * (p) (p) - * / / - * (s) (c) - * \ - * (c) - */ - do { - parent = successor; - successor = tmp; - tmp = tmp->rb_left; - } while (tmp); - parent->rb_left = child2 = successor->rb_right; - successor->rb_right = child; - rb_set_parent(child, successor); - augment->copy(node, successor); - augment->propagate(parent, successor); - } - - successor->rb_left = tmp = node->rb_left; - rb_set_parent(tmp, successor); - - pc = node->__rb_parent_color; - tmp = __rb_parent(pc); - __rb_change_child(node, successor, tmp, root); - if (child2) { - successor->__rb_parent_color = pc; - rb_set_parent_color(child2, parent, RB_BLACK); - rebalance = NULL; - } else { - unsigned long pc2 = successor->__rb_parent_color; - successor->__rb_parent_color = pc; - rebalance = __rb_is_black(pc2) ? parent : NULL; - } - tmp = successor; - } - - augment->propagate(tmp, NULL); - if (rebalance) - __rb_erase_color(rebalance, root, augment); -} +EXPORT_SYMBOL(__rb_erase_color); /* * Non-augmented rbtree manipulation functions. 
@@ -513,7 +380,7 @@ EXPORT_SYMBOL(rb_insert_color); void rb_erase(struct rb_node *node, struct rb_root *root) { - __rb_erase(node, root, &dummy_callbacks); + rb_erase_augmented(node, root, &dummy_callbacks); } EXPORT_SYMBOL(rb_erase); @@ -531,13 +398,6 @@ void __rb_insert_augmented(struct rb_node *node, struct rb_root *root, } EXPORT_SYMBOL(__rb_insert_augmented); -void rb_erase_augmented(struct rb_node *node, struct rb_root *root, - const struct rb_augment_callbacks *augment) -{ - __rb_erase(node, root, augment); -} -EXPORT_SYMBOL(rb_erase_augmented); - /* * This function returns the first node (in sort order) of the tree. */ diff --git a/lib/rbtree_test.c b/lib/rbtree_test.c index b20e99969b0f..268b23951fec 100644 --- a/lib/rbtree_test.c +++ b/lib/rbtree_test.c @@ -1,5 +1,5 @@ #include -#include +#include #include #include -- cgit v1.2.3 From e79bee24fd6134f90af4228cfebd010136d67631 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Mon, 8 Oct 2012 16:32:18 -0700 Subject: atomic: implement generic atomic_dec_if_positive() The x86 implementation of atomic_dec_if_positive is quite generic, so make it available to all architectures. This is needed for "swap: add a simple detector for inappropriate swapin readahead". [akpm@linux-foundation.org: do the "#define foo foo" trick in the conventional manner] Signed-off-by: Shaohua Li Cc: Stephen Rothwell Cc: "David S. Miller" Cc: Rik van Riel Cc: Ingo Molnar Cc: Thomas Gleixner Cc: "H. 
Peter Anvin" Cc: Benjamin Herrenschmidt Cc: Michal Simek Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/microblaze/include/asm/atomic.h | 1 + arch/powerpc/include/asm/atomic.h | 1 + arch/x86/include/asm/atomic.h | 24 ------------------------ include/linux/atomic.h | 25 +++++++++++++++++++++++++ 4 files changed, 27 insertions(+), 24 deletions(-) (limited to 'arch/x86') diff --git a/arch/microblaze/include/asm/atomic.h b/arch/microblaze/include/asm/atomic.h index 472d8bf726df..42ac382a09da 100644 --- a/arch/microblaze/include/asm/atomic.h +++ b/arch/microblaze/include/asm/atomic.h @@ -22,5 +22,6 @@ static inline int atomic_dec_if_positive(atomic_t *v) return res; } +#define atomic_dec_if_positive atomic_dec_if_positive #endif /* _ASM_MICROBLAZE_ATOMIC_H */ diff --git a/arch/powerpc/include/asm/atomic.h b/arch/powerpc/include/asm/atomic.h index da29032ae38f..e3b1d41c89be 100644 --- a/arch/powerpc/include/asm/atomic.h +++ b/arch/powerpc/include/asm/atomic.h @@ -268,6 +268,7 @@ static __inline__ int atomic_dec_if_positive(atomic_t *v) return t; } +#define atomic_dec_if_positive atomic_dec_if_positive #define smp_mb__before_atomic_dec() smp_mb() #define smp_mb__after_atomic_dec() smp_mb() diff --git a/arch/x86/include/asm/atomic.h b/arch/x86/include/asm/atomic.h index 250b8774c158..b6c3b821acf6 100644 --- a/arch/x86/include/asm/atomic.h +++ b/arch/x86/include/asm/atomic.h @@ -240,30 +240,6 @@ static inline int __atomic_add_unless(atomic_t *v, int a, int u) return c; } - -/* - * atomic_dec_if_positive - decrement by 1 if old value positive - * @v: pointer of type atomic_t - * - * The function returns the old value of *v minus 1, even if - * the atomic variable, v, was not decremented. 
- */ -static inline int atomic_dec_if_positive(atomic_t *v) -{ - int c, old, dec; - c = atomic_read(v); - for (;;) { - dec = c - 1; - if (unlikely(dec < 0)) - break; - old = atomic_cmpxchg((v), c, dec); - if (likely(old == c)) - break; - c = old; - } - return dec; -} - /** * atomic_inc_short - increment of a short integer * @v: pointer to type int diff --git a/include/linux/atomic.h b/include/linux/atomic.h index 70cfcb2d63c4..5b08a8540ecf 100644 --- a/include/linux/atomic.h +++ b/include/linux/atomic.h @@ -86,6 +86,31 @@ static inline int atomic_dec_unless_positive(atomic_t *p) } #endif +/* + * atomic_dec_if_positive - decrement by 1 if old value positive + * @v: pointer of type atomic_t + * + * The function returns the old value of *v minus 1, even if + * the atomic variable, v, was not decremented. + */ +#ifndef atomic_dec_if_positive +static inline int atomic_dec_if_positive(atomic_t *v) +{ + int c, old, dec; + c = atomic_read(v); + for (;;) { + dec = c - 1; + if (unlikely(dec < 0)) + break; + old = atomic_cmpxchg((v), c, dec); + if (likely(old == c)) + break; + c = old; + } + return dec; +} +#endif + #ifndef CONFIG_ARCH_HAS_ATOMIC_OR static inline void atomic_or(int i, atomic_t *v) { -- cgit v1.2.3 From 45cac65b0fcd287ebb877b141d40ba9bbe8e5da7 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Mon, 8 Oct 2012 16:32:19 -0700 Subject: readahead: fault retry breaks mmap file read random detection .fault now can retry. The retry can break state machine of .fault. In filemap_fault, if page is miss, ra->mmap_miss is increased. In the second try, since the page is in page cache now, ra->mmap_miss is decreased. And these are done in one fault, so we can't detect random mmap file access. Add a new flag to indicate .fault is tried once. In the second try, skip ra->mmap_miss decreasing. The filemap_fault state machine is ok with it. 
I only tested x86 and didn't test other archs; the change for the other archs looks obvious, but who knows :) Signed-off-by: Shaohua Li Cc: Rik van Riel Cc: Wu Fengguang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm/mm/fault.c | 1 + arch/avr32/mm/fault.c | 1 + arch/cris/mm/fault.c | 1 + arch/hexagon/mm/vm_fault.c | 1 + arch/ia64/mm/fault.c | 1 + arch/m68k/mm/fault.c | 1 + arch/microblaze/mm/fault.c | 1 + arch/mips/mm/fault.c | 1 + arch/openrisc/mm/fault.c | 1 + arch/powerpc/mm/fault.c | 1 + arch/s390/mm/fault.c | 1 + arch/sh/mm/fault.c | 1 + arch/sparc/mm/fault_32.c | 1 + arch/sparc/mm/fault_64.c | 1 + arch/tile/mm/fault.c | 1 + arch/um/kernel/trap.c | 1 + arch/x86/mm/fault.c | 1 + arch/xtensa/mm/fault.c | 1 + include/linux/mm.h | 1 + mm/filemap.c | 4 ++-- 20 files changed, 21 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/arm/mm/fault.c b/arch/arm/mm/fault.c index c3bd83450227..5dbf13f954f6 100644 --- a/arch/arm/mm/fault.c +++ b/arch/arm/mm/fault.c @@ -336,6 +336,7 @@ retry: /* Clear FAULT_FLAG_ALLOW_RETRY to avoid any risk * of starvation. 
*/ flags &= ~FAULT_FLAG_ALLOW_RETRY; + flags |= FAULT_FLAG_TRIED; goto retry; } } diff --git a/arch/avr32/mm/fault.c b/arch/avr32/mm/fault.c index b92e60958617..b2f2d2d66849 100644 --- a/arch/avr32/mm/fault.c +++ b/arch/avr32/mm/fault.c @@ -152,6 +152,7 @@ good_area: tsk->min_flt++; if (fault & VM_FAULT_RETRY) { flags &= ~FAULT_FLAG_ALLOW_RETRY; + flags |= FAULT_FLAG_TRIED; /* * No need to up_read(&mm->mmap_sem) as we would have diff --git a/arch/cris/mm/fault.c b/arch/cris/mm/fault.c index 45fd542cf173..73312ab6c696 100644 --- a/arch/cris/mm/fault.c +++ b/arch/cris/mm/fault.c @@ -186,6 +186,7 @@ retry: tsk->min_flt++; if (fault & VM_FAULT_RETRY) { flags &= ~FAULT_FLAG_ALLOW_RETRY; + flags |= FAULT_FLAG_TRIED; /* * No need to up_read(&mm->mmap_sem) as we would diff --git a/arch/hexagon/mm/vm_fault.c b/arch/hexagon/mm/vm_fault.c index 06695cc4fe58..513b74cb397e 100644 --- a/arch/hexagon/mm/vm_fault.c +++ b/arch/hexagon/mm/vm_fault.c @@ -113,6 +113,7 @@ good_area: current->min_flt++; if (fault & VM_FAULT_RETRY) { flags &= ~FAULT_FLAG_ALLOW_RETRY; + flags |= FAULT_FLAG_TRIED; goto retry; } } diff --git a/arch/ia64/mm/fault.c b/arch/ia64/mm/fault.c index 8443daf4f515..6cf0341f978e 100644 --- a/arch/ia64/mm/fault.c +++ b/arch/ia64/mm/fault.c @@ -184,6 +184,7 @@ retry: current->min_flt++; if (fault & VM_FAULT_RETRY) { flags &= ~FAULT_FLAG_ALLOW_RETRY; + flags |= FAULT_FLAG_TRIED; /* No need to up_read(&mm->mmap_sem) as we would * have already released it in __lock_page_or_retry diff --git a/arch/m68k/mm/fault.c b/arch/m68k/mm/fault.c index aeebbb7b30f0..a563727806bf 100644 --- a/arch/m68k/mm/fault.c +++ b/arch/m68k/mm/fault.c @@ -170,6 +170,7 @@ good_area: /* Clear FAULT_FLAG_ALLOW_RETRY to avoid any risk * of starvation. 
*/ flags &= ~FAULT_FLAG_ALLOW_RETRY; + flags |= FAULT_FLAG_TRIED; /* * No need to up_read(&mm->mmap_sem) as we would diff --git a/arch/microblaze/mm/fault.c b/arch/microblaze/mm/fault.c index eb365d6795fa..714b35a9c4f7 100644 --- a/arch/microblaze/mm/fault.c +++ b/arch/microblaze/mm/fault.c @@ -233,6 +233,7 @@ good_area: current->min_flt++; if (fault & VM_FAULT_RETRY) { flags &= ~FAULT_FLAG_ALLOW_RETRY; + flags |= FAULT_FLAG_TRIED; /* * No need to up_read(&mm->mmap_sem) as we would diff --git a/arch/mips/mm/fault.c b/arch/mips/mm/fault.c index c14f6dfed995..9f513486af10 100644 --- a/arch/mips/mm/fault.c +++ b/arch/mips/mm/fault.c @@ -171,6 +171,7 @@ good_area: } if (fault & VM_FAULT_RETRY) { flags &= ~FAULT_FLAG_ALLOW_RETRY; + flags |= FAULT_FLAG_TRIED; /* * No need to up_read(&mm->mmap_sem) as we would diff --git a/arch/openrisc/mm/fault.c b/arch/openrisc/mm/fault.c index 40f850e9766c..e2bfafce66c5 100644 --- a/arch/openrisc/mm/fault.c +++ b/arch/openrisc/mm/fault.c @@ -183,6 +183,7 @@ good_area: tsk->min_flt++; if (fault & VM_FAULT_RETRY) { flags &= ~FAULT_FLAG_ALLOW_RETRY; + flags |= FAULT_FLAG_TRIED; /* No need to up_read(&mm->mmap_sem) as we would * have already released it in __lock_page_or_retry diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c index 5495ebe983a2..0a6b28336eb0 100644 --- a/arch/powerpc/mm/fault.c +++ b/arch/powerpc/mm/fault.c @@ -451,6 +451,7 @@ good_area: /* Clear FAULT_FLAG_ALLOW_RETRY to avoid any risk * of starvation. */ flags &= ~FAULT_FLAG_ALLOW_RETRY; + flags |= FAULT_FLAG_TRIED; goto retry; } } diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c index ac9122ca1152..04ad4001a289 100644 --- a/arch/s390/mm/fault.c +++ b/arch/s390/mm/fault.c @@ -367,6 +367,7 @@ retry: /* Clear FAULT_FLAG_ALLOW_RETRY to avoid any risk * of starvation. 
*/ flags &= ~FAULT_FLAG_ALLOW_RETRY; + flags |= FAULT_FLAG_TRIED; down_read(&mm->mmap_sem); goto retry; } diff --git a/arch/sh/mm/fault.c b/arch/sh/mm/fault.c index 3bdc1ad9a341..cbbdcad8fcb3 100644 --- a/arch/sh/mm/fault.c +++ b/arch/sh/mm/fault.c @@ -504,6 +504,7 @@ good_area: } if (fault & VM_FAULT_RETRY) { flags &= ~FAULT_FLAG_ALLOW_RETRY; + flags |= FAULT_FLAG_TRIED; /* * No need to up_read(&mm->mmap_sem) as we would diff --git a/arch/sparc/mm/fault_32.c b/arch/sparc/mm/fault_32.c index 77ac917be152..e98bfda205a2 100644 --- a/arch/sparc/mm/fault_32.c +++ b/arch/sparc/mm/fault_32.c @@ -265,6 +265,7 @@ good_area: } if (fault & VM_FAULT_RETRY) { flags &= ~FAULT_FLAG_ALLOW_RETRY; + flags |= FAULT_FLAG_TRIED; /* No need to up_read(&mm->mmap_sem) as we would * have already released it in __lock_page_or_retry diff --git a/arch/sparc/mm/fault_64.c b/arch/sparc/mm/fault_64.c index 1fe0429b6314..413d29263304 100644 --- a/arch/sparc/mm/fault_64.c +++ b/arch/sparc/mm/fault_64.c @@ -452,6 +452,7 @@ good_area: } if (fault & VM_FAULT_RETRY) { flags &= ~FAULT_FLAG_ALLOW_RETRY; + flags |= FAULT_FLAG_TRIED; /* No need to up_read(&mm->mmap_sem) as we would * have already released it in __lock_page_or_retry diff --git a/arch/tile/mm/fault.c b/arch/tile/mm/fault.c index 84ce7abbf5af..fe811fa5f1b9 100644 --- a/arch/tile/mm/fault.c +++ b/arch/tile/mm/fault.c @@ -454,6 +454,7 @@ good_area: tsk->min_flt++; if (fault & VM_FAULT_RETRY) { flags &= ~FAULT_FLAG_ALLOW_RETRY; + flags |= FAULT_FLAG_TRIED; /* * No need to up_read(&mm->mmap_sem) as we would diff --git a/arch/um/kernel/trap.c b/arch/um/kernel/trap.c index 0353b98ae35a..0f00e9c82080 100644 --- a/arch/um/kernel/trap.c +++ b/arch/um/kernel/trap.c @@ -89,6 +89,7 @@ good_area: current->min_flt++; if (fault & VM_FAULT_RETRY) { flags &= ~FAULT_FLAG_ALLOW_RETRY; + flags |= FAULT_FLAG_TRIED; goto retry; } diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index a530b230e7d7..8e13ecb41bee 100644 --- a/arch/x86/mm/fault.c +++ 
b/arch/x86/mm/fault.c @@ -1220,6 +1220,7 @@ good_area: /* Clear FAULT_FLAG_ALLOW_RETRY to avoid any risk * of starvation. */ flags &= ~FAULT_FLAG_ALLOW_RETRY; + flags |= FAULT_FLAG_TRIED; goto retry; } } diff --git a/arch/xtensa/mm/fault.c b/arch/xtensa/mm/fault.c index 5a74c53bc69c..2c2f710ed1dc 100644 --- a/arch/xtensa/mm/fault.c +++ b/arch/xtensa/mm/fault.c @@ -126,6 +126,7 @@ good_area: current->min_flt++; if (fault & VM_FAULT_RETRY) { flags &= ~FAULT_FLAG_ALLOW_RETRY; + flags |= FAULT_FLAG_TRIED; /* No need to up_read(&mm->mmap_sem) as we would * have already released it in __lock_page_or_retry diff --git a/include/linux/mm.h b/include/linux/mm.h index b01e585ab4b5..bcaab4e6fe91 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -161,6 +161,7 @@ extern pgprot_t protection_map[16]; #define FAULT_FLAG_ALLOW_RETRY 0x08 /* Retry fault if blocking */ #define FAULT_FLAG_RETRY_NOWAIT 0x10 /* Don't drop mmap_sem and wait when retrying */ #define FAULT_FLAG_KILLABLE 0x20 /* The fault task is in SIGKILL killable region */ +#define FAULT_FLAG_TRIED 0x40 /* second try */ /* * vm_fault is filled by the the pagefault handler and passed to the vma's diff --git a/mm/filemap.c b/mm/filemap.c index a9827b42556e..83efee76a5c0 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1607,13 +1607,13 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) * Do we have something in the page cache already? */ page = find_get_page(mapping, offset); - if (likely(page)) { + if (likely(page) && !(vmf->flags & FAULT_FLAG_TRIED)) { /* * We found the page, so try async readahead before * waiting for the lock. 
*/ do_async_mmap_readahead(vma, ra, file, page, offset); - } else { + } else if (!page) { /* No page in the page cache at all */ do_sync_mmap_readahead(vma, ra, file, offset); count_vm_event(PGMAJFAULT); -- cgit v1.2.3 From 027ef6c87853b0a9df53175063028edb4950d476 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Mon, 8 Oct 2012 16:33:27 -0700 Subject: mm: thp: fix pmd_present for split_huge_page and PROT_NONE with THP In many places !pmd_present has been converted to pmd_none. For pmds that's equivalent and pmd_none is quicker so using pmd_none is better. However (unless we delete pmd_present) we should provide an accurate pmd_present too. This will avoid the risk of code thinking the pmd is non present because it's under __split_huge_page_map, see the pmd_mknotpresent there and the comment above it. If the page has been mprotected as PROT_NONE, it would also lead to a pmd_present false negative in the same way as the race with split_huge_page. Because the PSE bit stays on at all times (both during split_huge_page and when the _PAGE_PROTNONE bit get set), we could only check for the PSE bit, but checking the PROTNONE bit too is still good to remember pmd_present must always keep PROT_NONE into account. This explains a not reproducible BUG_ON that was seldom reported on the lists. The same issue is in pmd_large, it would go wrong with both PROT_NONE and if it races with split_huge_page. 
Signed-off-by: Andrea Arcangeli Acked-by: Rik van Riel Cc: Johannes Weiner Cc: Hugh Dickins Cc: Mel Gorman Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/include/asm/pgtable.h | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index fc9948465293..a1f780d45f76 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -146,8 +146,7 @@ static inline unsigned long pmd_pfn(pmd_t pmd) static inline int pmd_large(pmd_t pte) { - return (pmd_flags(pte) & (_PAGE_PSE | _PAGE_PRESENT)) == - (_PAGE_PSE | _PAGE_PRESENT); + return pmd_flags(pte) & _PAGE_PSE; } #ifdef CONFIG_TRANSPARENT_HUGEPAGE @@ -415,7 +414,13 @@ static inline int pte_hidden(pte_t pte) static inline int pmd_present(pmd_t pmd) { - return pmd_flags(pmd) & _PAGE_PRESENT; + /* + * Checking for _PAGE_PSE is needed too because + * split_huge_page will temporarily clear the present bit (but + * the _PAGE_PSE flag will remain set at all times while the + * _PAGE_PRESENT bit is clear). + */ + return pmd_flags(pmd) & (_PAGE_PRESENT | _PAGE_PROTNONE | _PAGE_PSE); } static inline int pmd_none(pmd_t pmd) -- cgit v1.2.3 From b113da65785d5f3f9ff1451ec0fe43d6d76da25b Mon Sep 17 00:00:00 2001 From: David Miller Date: Mon, 8 Oct 2012 16:34:25 -0700 Subject: mm: Add and use update_mmu_cache_pmd() in transparent huge page code. The transparent huge page code passes a PMD pointer in as the third argument of update_mmu_cache(), which expects a PTE pointer. This never got noticed because X86 implements update_mmu_cache() as a macro and thus we don't get any type checking, and X86 is the only architecture which supports transparent huge pages currently. Before other architectures can support transparent huge pages properly we need to add a new interface which will take a PMD pointer as the third argument rather than a PTE pointer. 
[akpm@linux-foundation.org: implement update_mmu_cache_pmd() for s390] Signed-off-by: David S. Miller Cc: Andrea Arcangeli Cc: Johannes Weiner Cc: Gerald Schaefer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/s390/include/asm/pgtable.h | 1 + arch/x86/include/asm/pgtable_32.h | 1 + arch/x86/include/asm/pgtable_64.h | 1 + mm/huge_memory.c | 6 +++--- 4 files changed, 6 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h index ed14fc2db6e0..979fe3dc0788 100644 --- a/arch/s390/include/asm/pgtable.h +++ b/arch/s390/include/asm/pgtable.h @@ -42,6 +42,7 @@ extern void fault_init(void); * tables contain all the necessary information. */ #define update_mmu_cache(vma, address, ptep) do { } while (0) +#define update_mmu_cache_pmd(vma, address, ptep) do { } while (0) /* * ZERO_PAGE is a global shared page that is always zero; used diff --git a/arch/x86/include/asm/pgtable_32.h b/arch/x86/include/asm/pgtable_32.h index 0c92113c4cb6..8faa215a503e 100644 --- a/arch/x86/include/asm/pgtable_32.h +++ b/arch/x86/include/asm/pgtable_32.h @@ -71,6 +71,7 @@ do { \ * tables contain all the necessary information. 
*/ #define update_mmu_cache(vma, address, ptep) do { } while (0) +#define update_mmu_cache_pmd(vma, address, pmd) do { } while (0) #endif /* !__ASSEMBLY__ */ diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h index 8251be02301e..47356f9df82e 100644 --- a/arch/x86/include/asm/pgtable_64.h +++ b/arch/x86/include/asm/pgtable_64.h @@ -143,6 +143,7 @@ static inline int pgd_large(pgd_t pgd) { return 0; } #define pte_unmap(pte) ((void)(pte))/* NOP */ #define update_mmu_cache(vma, address, ptep) do { } while (0) +#define update_mmu_cache_pmd(vma, address, pmd) do { } while (0) /* Encode and de-code a swap entry */ #if _PAGE_BIT_FILE < _PAGE_BIT_PROTNONE diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 3a8d6b7d95db..68a3c93036f6 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -900,7 +900,7 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, entry = pmd_mkyoung(orig_pmd); entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); if (pmdp_set_access_flags(vma, haddr, pmd, entry, 1)) - update_mmu_cache(vma, address, pmd); + update_mmu_cache_pmd(vma, address, pmd); ret |= VM_FAULT_WRITE; goto out_unlock; } @@ -956,7 +956,7 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, pmdp_clear_flush(vma, haddr, pmd); page_add_new_anon_rmap(new_page, vma, haddr); set_pmd_at(mm, haddr, pmd, entry); - update_mmu_cache(vma, address, pmd); + update_mmu_cache_pmd(vma, address, pmd); page_remove_rmap(page); put_page(page); ret |= VM_FAULT_WRITE; @@ -2041,7 +2041,7 @@ static void collapse_huge_page(struct mm_struct *mm, BUG_ON(!pmd_none(*pmd)); page_add_new_anon_rmap(new_page, vma, address); set_pmd_at(mm, address, pmd, _pmd); - update_mmu_cache(vma, address, pmd); + update_mmu_cache_pmd(vma, address, pmd); pgtable_trans_huge_deposit(mm, pgtable); spin_unlock(&mm->page_table_lock); -- cgit v1.2.3