From 02390b87a9459937cdb299e6b34ff33992512ec7 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Wed, 14 Feb 2018 14:16:49 +0300 Subject: mm/zsmalloc: Prepare for variable MAX_PHYSMEM_BITS With boot-time switching between paging modes we will have a variable MAX_PHYSMEM_BITS. Let's use the maximum value possible for the CONFIG_X86_5LEVEL=y configuration to define the zsmalloc data structures. The patch introduces MAX_POSSIBLE_PHYSMEM_BITS to cover this case. It is also well suited to handling the PAE special case. Signed-off-by: Kirill A. Shutemov Reviewed-by: Nitin Gupta Acked-by: Minchan Kim Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Sergey Senozhatsky Cc: Thomas Gleixner Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/20180214111656.88514-3-kirill.shutemov@linux.intel.com Signed-off-by: Ingo Molnar --- mm/zsmalloc.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) (limited to 'mm') diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index c3013505c305..b7f61cd1c709 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -84,18 +84,19 @@ * This is made more complicated by various memory models and PAE. */ -#ifndef MAX_PHYSMEM_BITS -#ifdef CONFIG_HIGHMEM64G -#define MAX_PHYSMEM_BITS 36 -#else /* !CONFIG_HIGHMEM64G */ +#ifndef MAX_POSSIBLE_PHYSMEM_BITS +#ifdef MAX_PHYSMEM_BITS +#define MAX_POSSIBLE_PHYSMEM_BITS MAX_PHYSMEM_BITS +#else /* * If this definition of MAX_PHYSMEM_BITS is used, OBJ_INDEX_BITS will just * be PAGE_SHIFT */ -#define MAX_PHYSMEM_BITS BITS_PER_LONG +#define MAX_POSSIBLE_PHYSMEM_BITS BITS_PER_LONG #endif #endif -#define _PFN_BITS (MAX_PHYSMEM_BITS - PAGE_SHIFT) + +#define _PFN_BITS (MAX_POSSIBLE_PHYSMEM_BITS - PAGE_SHIFT) /* * Memory for allocating for handle keeps object position by -- cgit v1.2.3 From c65e774fb3f6af212641538694b9778ff9ab4300 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Wed, 14 Feb 2018 14:16:53 +0300 Subject: x86/mm: Make PGDIR_SHIFT and PTRS_PER_P4D variable For boot-time switching between 4- and 5-level paging we need to be able to fold the p4d page table level at runtime. It requires a variable PGDIR_SHIFT and PTRS_PER_P4D. The change doesn't affect the kernel image size much: text data bss dec hex filename 8628091 4734304 1368064 14730459 e0c4db vmlinux.before 8628393 4734340 1368064 14730797 e0c62d vmlinux.after Signed-off-by: Kirill A.
Shutemov Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/20180214111656.88514-7-kirill.shutemov@linux.intel.com Signed-off-by: Ingo Molnar --- arch/x86/boot/compressed/kaslr.c | 2 ++ arch/x86/include/asm/pgtable_32.h | 2 ++ arch/x86/include/asm/pgtable_64_types.h | 19 ++++++++++++------- arch/x86/kernel/cpu/mcheck/mce.c | 18 ++++++------------ arch/x86/kernel/head64.c | 6 +++++- arch/x86/mm/dump_pagetables.c | 12 +++++------- arch/x86/mm/init_64.c | 2 +- arch/x86/mm/kasan_init_64.c | 2 +- arch/x86/platform/efi/efi_64.c | 4 ++-- include/asm-generic/5level-fixup.h | 1 + include/asm-generic/pgtable-nop4d.h | 9 +++++---- include/linux/kasan.h | 2 +- mm/kasan/kasan_init.c | 2 +- 13 files changed, 44 insertions(+), 37 deletions(-) (limited to 'mm') diff --git a/arch/x86/boot/compressed/kaslr.c b/arch/x86/boot/compressed/kaslr.c index bd69e1830fbe..b18e8f9512de 100644 --- a/arch/x86/boot/compressed/kaslr.c +++ b/arch/x86/boot/compressed/kaslr.c @@ -48,6 +48,8 @@ #ifdef CONFIG_X86_5LEVEL unsigned int pgtable_l5_enabled __ro_after_init = 1; +unsigned int pgdir_shift __ro_after_init = 48; +unsigned int ptrs_per_p4d __ro_after_init = 512; #endif extern unsigned long get_cmd_line_ptr(void); diff --git a/arch/x86/include/asm/pgtable_32.h b/arch/x86/include/asm/pgtable_32.h index e67c0620aec2..d829360e26bd 100644 --- a/arch/x86/include/asm/pgtable_32.h +++ b/arch/x86/include/asm/pgtable_32.h @@ -33,6 +33,8 @@ static inline void pgtable_cache_init(void) { } static inline void check_pgt_cache(void) { } void paging_init(void); +static inline int pgd_large(pgd_t pgd) { return 0; } + /* * Define this if things work differently on an i386 and an i486: * it will (on an i486) warn about kernel memory accesses that are diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h index 903e4d054bcb..0c48d80e11d4 100644 --- a/arch/x86/include/asm/pgtable_64_types.h +++ b/arch/x86/include/asm/pgtable_64_types.h @@ -26,6 +26,9 @@ extern unsigned int pgtable_l5_enabled; #define pgtable_l5_enabled 0 #endif +extern unsigned int pgdir_shift; +extern unsigned int ptrs_per_p4d; + #endif /* !__ASSEMBLY__ */ #define SHARED_KERNEL_PMD 0 @@ -35,16 +38,17 @@ extern unsigned int pgtable_l5_enabled; /* * PGDIR_SHIFT determines what a top-level page table entry can map */ -#define PGDIR_SHIFT 48 +#define PGDIR_SHIFT pgdir_shift #define PTRS_PER_PGD 512 /* * 4th level page in 5-level paging case */ -#define P4D_SHIFT 39 -#define PTRS_PER_P4D 512 -#define P4D_SIZE (_AC(1, UL) << P4D_SHIFT) -#define P4D_MASK (~(P4D_SIZE - 1)) +#define P4D_SHIFT 39 +#define MAX_PTRS_PER_P4D 512 +#define PTRS_PER_P4D ptrs_per_p4d +#define P4D_SIZE (_AC(1, UL) << P4D_SHIFT) +#define P4D_MASK (~(P4D_SIZE - 1)) #define MAX_POSSIBLE_PHYSMEM_BITS 52 @@ -53,8 +57,9 @@ extern unsigned int pgtable_l5_enabled; /* * PGDIR_SHIFT determines what a top-level page table entry can map */ -#define PGDIR_SHIFT 39 -#define PTRS_PER_PGD 512 +#define PGDIR_SHIFT 39 +#define PTRS_PER_PGD 512 +#define MAX_PTRS_PER_P4D 1 #endif /* CONFIG_X86_5LEVEL */ diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 3a8e88a611eb..cbb3af721291 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -1082,19 +1082,7 @@ void arch_unmap_kpfn(unsigned long pfn) * a legal address. */ -/* - * Build time check to see if we have a spare virtual bit. 
Don't want - * to leave this until run time because most developers don't have a - * system that can exercise this code path. This will only become a - * problem if/when we move beyond 5-level page tables. - * - * Hard code "9" here because cpp doesn't grok ilog2(PTRS_PER_PGD) - */ -#if PGDIR_SHIFT + 9 < 63 decoy_addr = (pfn << PAGE_SHIFT) + (PAGE_OFFSET ^ BIT(63)); -#else -#error "no unused virtual bit available" -#endif if (set_memory_np(decoy_addr, 1)) pr_warn("Could not invalidate pfn=0x%lx from 1:1 map\n", pfn); @@ -2328,6 +2316,12 @@ static __init int mcheck_init_device(void) { int err; + /* + * Check if we have a spare virtual bit. This will only become + * a problem if/when we move beyond 5-level page tables. + */ + MAYBE_BUILD_BUG_ON(__VIRTUAL_MASK_SHIFT >= 63); + if (!mce_available(&boot_cpu_data)) { err = -EIO; goto err_out; diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index 17d00d1886de..98b0ff49b220 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -42,6 +42,10 @@ pmdval_t early_pmd_flags = __PAGE_KERNEL_LARGE & ~(_PAGE_GLOBAL | _PAGE_NX); #ifdef CONFIG_X86_5LEVEL unsigned int pgtable_l5_enabled __ro_after_init = 1; EXPORT_SYMBOL(pgtable_l5_enabled); +unsigned int pgdir_shift __ro_after_init = 48; +EXPORT_SYMBOL(pgdir_shift); +unsigned int ptrs_per_p4d __ro_after_init = 512; +EXPORT_SYMBOL(ptrs_per_p4d); #endif #ifdef CONFIG_DYNAMIC_MEMORY_LAYOUT @@ -336,7 +340,7 @@ asmlinkage __visible void __init x86_64_start_kernel(char * real_mode_data) BUILD_BUG_ON((__START_KERNEL_map & ~PMD_MASK) != 0); BUILD_BUG_ON((MODULES_VADDR & ~PMD_MASK) != 0); BUILD_BUG_ON(!(MODULES_VADDR > __START_KERNEL)); - BUILD_BUG_ON(!(((MODULES_END - 1) & PGDIR_MASK) == + MAYBE_BUILD_BUG_ON(!(((MODULES_END - 1) & PGDIR_MASK) == (__START_KERNEL & PGDIR_MASK))); BUILD_BUG_ON(__fix_to_virt(__end_of_fixed_addresses) <= MODULES_END); diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c index a89f2dbc3531..420058b05d39 100644 --- a/arch/x86/mm/dump_pagetables.c +++ b/arch/x86/mm/dump_pagetables.c @@ -428,14 +428,15 @@ static void walk_pud_level(struct seq_file *m, struct pg_state *st, p4d_t addr, #define p4d_none(a) pud_none(__pud(p4d_val(a))) #endif -#if PTRS_PER_P4D > 1 - static void walk_p4d_level(struct seq_file *m, struct pg_state *st, pgd_t addr, unsigned long P) { int i; p4d_t *start, *p4d_start; pgprotval_t prot; + if (PTRS_PER_P4D == 1) + return walk_pud_level(m, st, __p4d(pgd_val(addr)), P); + p4d_start = start = (p4d_t *)pgd_page_vaddr(addr); for (i = 0; i < PTRS_PER_P4D; i++) { @@ -455,11 +456,8 @@ static void walk_p4d_level(struct seq_file *m, struct pg_state *st, pgd_t addr, } } -#else -#define walk_p4d_level(m,s,a,p) walk_pud_level(m,s,__p4d(pgd_val(a)),p) -#define pgd_large(a) p4d_large(__p4d(pgd_val(a))) -#define pgd_none(a) p4d_none(__p4d(pgd_val(a))) -#endif +#define pgd_large(a) (pgtable_l5_enabled ? pgd_large(a) : p4d_large(__p4d(pgd_val(a)))) +#define pgd_none(a) (pgtable_l5_enabled ? pgd_none(a) : p4d_none(__p4d(pgd_val(a)))) static inline bool is_hypervisor_range(int idx) { diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 1ab42c852069..6a4b20bc7527 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -143,7 +143,7 @@ void sync_global_pgds(unsigned long start, unsigned long end) * With folded p4d, pgd_none() is always false, we need to * handle synchonization on p4d level. 
*/ - BUILD_BUG_ON(pgd_none(*pgd_ref)); + MAYBE_BUILD_BUG_ON(pgd_none(*pgd_ref)); p4d_ref = p4d_offset(pgd_ref, addr); if (p4d_none(*p4d_ref)) diff --git a/arch/x86/mm/kasan_init_64.c b/arch/x86/mm/kasan_init_64.c index af6f2f9c6a26..12ec90f62457 100644 --- a/arch/x86/mm/kasan_init_64.c +++ b/arch/x86/mm/kasan_init_64.c @@ -19,7 +19,7 @@ extern struct range pfn_mapped[E820_MAX_ENTRIES]; -static p4d_t tmp_p4d_table[PTRS_PER_P4D] __initdata __aligned(PAGE_SIZE); +static p4d_t tmp_p4d_table[MAX_PTRS_PER_P4D] __initdata __aligned(PAGE_SIZE); static __init void *early_alloc(size_t size, int nid, bool panic) { diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c index 780460aa5ea5..d52aaa7dc088 100644 --- a/arch/x86/platform/efi/efi_64.c +++ b/arch/x86/platform/efi/efi_64.c @@ -257,8 +257,8 @@ void efi_sync_low_kernel_mappings(void) * only span a single PGD entry and that the entry also maps * other important kernel regions. */ - BUILD_BUG_ON(pgd_index(EFI_VA_END) != pgd_index(MODULES_END)); - BUILD_BUG_ON((EFI_VA_START & PGDIR_MASK) != + MAYBE_BUILD_BUG_ON(pgd_index(EFI_VA_END) != pgd_index(MODULES_END)); + MAYBE_BUILD_BUG_ON((EFI_VA_START & PGDIR_MASK) != (EFI_VA_END & PGDIR_MASK)); pgd_efi = efi_pgd + pgd_index(PAGE_OFFSET); diff --git a/include/asm-generic/5level-fixup.h b/include/asm-generic/5level-fixup.h index dfbd9d990637..9c2e0708eb82 100644 --- a/include/asm-generic/5level-fixup.h +++ b/include/asm-generic/5level-fixup.h @@ -8,6 +8,7 @@ #define P4D_SHIFT PGDIR_SHIFT #define P4D_SIZE PGDIR_SIZE #define P4D_MASK PGDIR_MASK +#define MAX_PTRS_PER_P4D 1 #define PTRS_PER_P4D 1 #define p4d_t pgd_t diff --git a/include/asm-generic/pgtable-nop4d.h b/include/asm-generic/pgtable-nop4d.h index 8f22f55de17a..1a29b2a0282b 100644 --- a/include/asm-generic/pgtable-nop4d.h +++ b/include/asm-generic/pgtable-nop4d.h @@ -8,10 +8,11 @@ typedef struct { pgd_t pgd; } p4d_t; -#define P4D_SHIFT PGDIR_SHIFT -#define PTRS_PER_P4D 1 -#define P4D_SIZE (1UL << P4D_SHIFT) -#define P4D_MASK (~(P4D_SIZE-1)) +#define P4D_SHIFT PGDIR_SHIFT +#define MAX_PTRS_PER_P4D 1 +#define PTRS_PER_P4D 1 +#define P4D_SIZE (1UL << P4D_SHIFT) +#define P4D_MASK (~(P4D_SIZE-1)) /* * The "pgd_xxx()" functions here are trivial for a folded two-level diff --git a/include/linux/kasan.h b/include/linux/kasan.h index adc13474a53b..d6459bd1376d 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -18,7 +18,7 @@ extern unsigned char kasan_zero_page[PAGE_SIZE]; extern pte_t kasan_zero_pte[PTRS_PER_PTE]; extern pmd_t kasan_zero_pmd[PTRS_PER_PMD]; extern pud_t kasan_zero_pud[PTRS_PER_PUD]; -extern p4d_t kasan_zero_p4d[PTRS_PER_P4D]; +extern p4d_t kasan_zero_p4d[MAX_PTRS_PER_P4D]; void kasan_populate_zero_shadow(const void *shadow_start, const void *shadow_end); diff --git a/mm/kasan/kasan_init.c b/mm/kasan/kasan_init.c index 554e4c0f23a2..f436246ccc79 100644 --- a/mm/kasan/kasan_init.c +++ b/mm/kasan/kasan_init.c @@ -31,7 +31,7 @@ unsigned char kasan_zero_page[PAGE_SIZE] __page_aligned_bss; #if CONFIG_PGTABLE_LEVELS > 4 -p4d_t kasan_zero_p4d[PTRS_PER_P4D] __page_aligned_bss; +p4d_t kasan_zero_p4d[MAX_PTRS_PER_P4D] __page_aligned_bss; #endif #if CONFIG_PGTABLE_LEVELS > 3 pud_t kasan_zero_pud[PTRS_PER_PUD] __page_aligned_bss; -- cgit v1.2.3 From 15d9f3d116c02a485441d758d9ca0a2e4f3b30be Mon Sep 17 00:00:00 2001 From: Dennis Zhou Date: Thu, 15 Feb 2018 10:08:14 -0600 Subject: percpu: match chunk allocator declarations with definitions At some point the function declaration parameters got out of sync with the function 
definitions in percpu-vm.c and percpu-km.c. This patch makes them match again. Signed-off-by: Dennis Zhou Acked-by: Christoph Lameter Signed-off-by: Tejun Heo --- mm/percpu.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/percpu.c b/mm/percpu.c index 50e7fdf84055..e1ea41002173 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -1277,8 +1277,10 @@ static void pcpu_chunk_depopulated(struct pcpu_chunk *chunk, * pcpu_addr_to_page - translate address to physical address * pcpu_verify_alloc_info - check alloc_info is acceptable during init */ -static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size); -static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int off, int size); +static int pcpu_populate_chunk(struct pcpu_chunk *chunk, + int page_start, int page_end); +static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, + int page_start, int page_end); static struct pcpu_chunk *pcpu_create_chunk(void); static void pcpu_destroy_chunk(struct pcpu_chunk *chunk); static struct page *pcpu_addr_to_page(void *addr); -- cgit v1.2.3 From 47504ee04b9241548ae2c28be7d0b01cff3b7aa6 Mon Sep 17 00:00:00 2001 From: Dennis Zhou Date: Fri, 16 Feb 2018 12:07:19 -0600 Subject: percpu: add __GFP_NORETRY semantics to the percpu balancing path Percpu memory using the vmalloc area based chunk allocator lazily populates chunks by first requesting the full virtual address space required for the chunk and subsequently adding pages as allocations come through. To ensure atomic allocations can succeed, a workqueue item is used to maintain a minimum number of empty pages. In certain scenarios, such as reported in [1], it is possible that physical memory becomes quite scarce, which can result in either a rather long time spent trying to find free pages or, worse, a kernel panic. This patch adds support for __GFP_NORETRY and __GFP_NOWARN, passing them through to the underlying allocators. This should prevent any unnecessary panics potentially caused by the workqueue item. The gfp flags are passed around as additional flags rather than as a full set of flags; the next patch will change this to caller-passed semantics. V2: Added const modifier to gfp flags in the balance path. Removed an extra whitespace.
[1] https://lkml.org/lkml/2018/2/12/551 Signed-off-by: Dennis Zhou Suggested-by: Daniel Borkmann Reported-by: syzbot+adb03f3f0bb57ce3acda@syzkaller.appspotmail.com Acked-by: Christoph Lameter Signed-off-by: Tejun Heo --- mm/percpu-km.c | 8 ++++---- mm/percpu-vm.c | 18 +++++++++++------- mm/percpu.c | 44 +++++++++++++++++++++++++++----------------- 3 files changed, 42 insertions(+), 28 deletions(-) (limited to 'mm') diff --git a/mm/percpu-km.c b/mm/percpu-km.c index d2a76642c4ae..0d88d7bd5706 100644 --- a/mm/percpu-km.c +++ b/mm/percpu-km.c @@ -34,7 +34,7 @@ #include static int pcpu_populate_chunk(struct pcpu_chunk *chunk, - int page_start, int page_end) + int page_start, int page_end, gfp_t gfp) { return 0; } @@ -45,18 +45,18 @@ static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, /* nada */ } -static struct pcpu_chunk *pcpu_create_chunk(void) +static struct pcpu_chunk *pcpu_create_chunk(gfp_t gfp) { const int nr_pages = pcpu_group_sizes[0] >> PAGE_SHIFT; struct pcpu_chunk *chunk; struct page *pages; int i; - chunk = pcpu_alloc_chunk(); + chunk = pcpu_alloc_chunk(gfp); if (!chunk) return NULL; - pages = alloc_pages(GFP_KERNEL, order_base_2(nr_pages)); + pages = alloc_pages(gfp | GFP_KERNEL, order_base_2(nr_pages)); if (!pages) { pcpu_free_chunk(chunk); return NULL; diff --git a/mm/percpu-vm.c b/mm/percpu-vm.c index 9158e5a81391..0af71eb2fff0 100644 --- a/mm/percpu-vm.c +++ b/mm/percpu-vm.c @@ -37,7 +37,7 @@ static struct page **pcpu_get_pages(void) lockdep_assert_held(&pcpu_alloc_mutex); if (!pages) - pages = pcpu_mem_zalloc(pages_size); + pages = pcpu_mem_zalloc(pages_size, 0); return pages; } @@ -73,18 +73,21 @@ static void pcpu_free_pages(struct pcpu_chunk *chunk, * @pages: array to put the allocated pages into, indexed by pcpu_page_idx() * @page_start: page index of the first page to be allocated * @page_end: page index of the last page to be allocated + 1 + * @gfp: allocation flags passed to the underlying allocator * * Allocate pages [@page_start,@page_end) into @pages for all units. * The allocation is for @chunk. Percpu core doesn't care about the * content of @pages and will pass it verbatim to pcpu_map_pages(). */ static int pcpu_alloc_pages(struct pcpu_chunk *chunk, - struct page **pages, int page_start, int page_end) + struct page **pages, int page_start, int page_end, + gfp_t gfp) { - const gfp_t gfp = GFP_KERNEL | __GFP_HIGHMEM; unsigned int cpu, tcpu; int i; + gfp |= GFP_KERNEL | __GFP_HIGHMEM; + for_each_possible_cpu(cpu) { for (i = page_start; i < page_end; i++) { struct page **pagep = &pages[pcpu_page_idx(cpu, i)]; @@ -262,6 +265,7 @@ static void pcpu_post_map_flush(struct pcpu_chunk *chunk, * @chunk: chunk of interest * @page_start: the start page * @page_end: the end page + * @gfp: allocation flags passed to the underlying memory allocator * * For each cpu, populate and map pages [@page_start,@page_end) into * @chunk. @@ -270,7 +274,7 @@ static void pcpu_post_map_flush(struct pcpu_chunk *chunk, * pcpu_alloc_mutex, does GFP_KERNEL allocation. 
*/ static int pcpu_populate_chunk(struct pcpu_chunk *chunk, - int page_start, int page_end) + int page_start, int page_end, gfp_t gfp) { struct page **pages; @@ -278,7 +282,7 @@ static int pcpu_populate_chunk(struct pcpu_chunk *chunk, if (!pages) return -ENOMEM; - if (pcpu_alloc_pages(chunk, pages, page_start, page_end)) + if (pcpu_alloc_pages(chunk, pages, page_start, page_end, gfp)) return -ENOMEM; if (pcpu_map_pages(chunk, pages, page_start, page_end)) { @@ -325,12 +329,12 @@ static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, pcpu_free_pages(chunk, pages, page_start, page_end); } -static struct pcpu_chunk *pcpu_create_chunk(void) +static struct pcpu_chunk *pcpu_create_chunk(gfp_t gfp) { struct pcpu_chunk *chunk; struct vm_struct **vms; - chunk = pcpu_alloc_chunk(); + chunk = pcpu_alloc_chunk(gfp); if (!chunk) return NULL; diff --git a/mm/percpu.c b/mm/percpu.c index e1ea41002173..f97443d488a8 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -447,10 +447,12 @@ static void pcpu_next_fit_region(struct pcpu_chunk *chunk, int alloc_bits, /** * pcpu_mem_zalloc - allocate memory * @size: bytes to allocate + * @gfp: allocation flags * * Allocate @size bytes. If @size is smaller than PAGE_SIZE, - * kzalloc() is used; otherwise, vzalloc() is used. The returned - * memory is always zeroed. + * kzalloc() is used; otherwise, the equivalent of vzalloc() is used. + * This is to facilitate passing through whitelisted flags. The + * returned memory is always zeroed. * * CONTEXT: * Does GFP_KERNEL allocation. @@ -458,15 +460,16 @@ static void pcpu_next_fit_region(struct pcpu_chunk *chunk, int alloc_bits, * RETURNS: * Pointer to the allocated area on success, NULL on failure. */ -static void *pcpu_mem_zalloc(size_t size) +static void *pcpu_mem_zalloc(size_t size, gfp_t gfp) { if (WARN_ON_ONCE(!slab_is_available())) return NULL; if (size <= PAGE_SIZE) - return kzalloc(size, GFP_KERNEL); + return kzalloc(size, gfp | GFP_KERNEL); else - return vzalloc(size); + return __vmalloc(size, gfp | GFP_KERNEL | __GFP_ZERO, + PAGE_KERNEL); } /** @@ -1154,12 +1157,12 @@ static struct pcpu_chunk * __init pcpu_alloc_first_chunk(unsigned long tmp_addr, return chunk; } -static struct pcpu_chunk *pcpu_alloc_chunk(void) +static struct pcpu_chunk *pcpu_alloc_chunk(gfp_t gfp) { struct pcpu_chunk *chunk; int region_bits; - chunk = pcpu_mem_zalloc(pcpu_chunk_struct_size); + chunk = pcpu_mem_zalloc(pcpu_chunk_struct_size, gfp); if (!chunk) return NULL; @@ -1168,17 +1171,17 @@ static struct pcpu_chunk *pcpu_alloc_chunk(void) region_bits = pcpu_chunk_map_bits(chunk); chunk->alloc_map = pcpu_mem_zalloc(BITS_TO_LONGS(region_bits) * - sizeof(chunk->alloc_map[0])); + sizeof(chunk->alloc_map[0]), gfp); if (!chunk->alloc_map) goto alloc_map_fail; chunk->bound_map = pcpu_mem_zalloc(BITS_TO_LONGS(region_bits + 1) * - sizeof(chunk->bound_map[0])); + sizeof(chunk->bound_map[0]), gfp); if (!chunk->bound_map) goto bound_map_fail; chunk->md_blocks = pcpu_mem_zalloc(pcpu_chunk_nr_blocks(chunk) * - sizeof(chunk->md_blocks[0])); + sizeof(chunk->md_blocks[0]), gfp); if (!chunk->md_blocks) goto md_blocks_fail; @@ -1278,10 +1281,10 @@ static void pcpu_chunk_depopulated(struct pcpu_chunk *chunk, * pcpu_verify_alloc_info - check alloc_info is acceptable during init */ static int pcpu_populate_chunk(struct pcpu_chunk *chunk, - int page_start, int page_end); + int page_start, int page_end, gfp_t gfp); static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int page_start, int page_end); -static struct pcpu_chunk *pcpu_create_chunk(void); +static 
struct pcpu_chunk *pcpu_create_chunk(gfp_t gfp); static void pcpu_destroy_chunk(struct pcpu_chunk *chunk); static struct page *pcpu_addr_to_page(void *addr); static int __init pcpu_verify_alloc_info(const struct pcpu_alloc_info *ai); @@ -1423,7 +1426,7 @@ restart: } if (list_empty(&pcpu_slot[pcpu_nr_slots - 1])) { - chunk = pcpu_create_chunk(); + chunk = pcpu_create_chunk(0); if (!chunk) { err = "failed to allocate new chunk"; goto fail; } @@ -1452,7 +1455,7 @@ area_found: page_start, page_end) { WARN_ON(chunk->immutable); - ret = pcpu_populate_chunk(chunk, rs, re); + ret = pcpu_populate_chunk(chunk, rs, re, 0); spin_lock_irqsave(&pcpu_lock, flags); if (ret) { @@ -1563,10 +1566,17 @@ void __percpu *__alloc_reserved_percpu(size_t size, size_t align) * pcpu_balance_workfn - manage the amount of free chunks and populated pages * @work: unused * - * Reclaim all fully free chunks except for the first one. + * Reclaim all fully free chunks except for the first one. This is also + * responsible for maintaining the pool of empty populated pages. However, + * it is possible that this is called when physical memory is scarce causing + * OOM killer to be triggered. We should avoid doing so until an actual + * allocation causes the failure as it is possible that requests can be + * serviced from already backed regions. */ static void pcpu_balance_workfn(struct work_struct *work) { + /* gfp flags passed to underlying allocators */ + const gfp_t gfp = __GFP_NORETRY | __GFP_NOWARN; LIST_HEAD(to_free); struct list_head *free_head = &pcpu_slot[pcpu_nr_slots - 1]; struct pcpu_chunk *chunk, *next; @@ -1647,7 +1657,7 @@ retry_pop: chunk->nr_pages) { int nr = min(re - rs, nr_to_pop); - ret = pcpu_populate_chunk(chunk, rs, rs + nr); + ret = pcpu_populate_chunk(chunk, rs, rs + nr, gfp); if (!ret) { nr_to_pop -= nr; spin_lock_irq(&pcpu_lock); @@ -1664,7 +1674,7 @@ retry_pop: if (nr_to_pop) { /* ran out of chunks to populate, create a new one and retry */ - chunk = pcpu_create_chunk(); + chunk = pcpu_create_chunk(gfp); if (chunk) { spin_lock_irq(&pcpu_lock); pcpu_chunk_relocate(chunk, -1); -- cgit v1.2.3 From 554fef1c39ee148623a496e04569dabb11463406 Mon Sep 17 00:00:00 2001 From: Dennis Zhou Date: Fri, 16 Feb 2018 12:09:58 -0600 Subject: percpu: allow select gfp to be passed to underlying allocators The prior patch added support for passing gfp flags through to the underlying allocators. This patch allows users to pass along gfp flags (currently only __GFP_NORETRY and __GFP_NOWARN) to the underlying allocators. This should allow users to decide whether they are OK with allocations failing and recovering in a more graceful way. Additionally, gfp passing was done as additional flags in the previous patch. Instead, change this to caller-passed semantics. GFP_KERNEL is also removed as the default flag. It continues to be used for internally caused underlying percpu allocations. V2: Removed gfp_percpu_mask in favor of doing it inline. Removed GFP_KERNEL as a default flag for __alloc_percpu_gfp.
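As an illustration of the new caller-passed semantics (a minimal sketch, not part of the patch; struct foo_stats and the error path are hypothetical), a caller that prefers graceful failure over prolonged reclaim could do:

    struct foo_stats __percpu *stats;

    /* opt in to failing fast: no heavy retry loops, no failure warning */
    stats = alloc_percpu_gfp(struct foo_stats,
                             GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN);
    if (!stats)
        return -ENOMEM;    /* recover instead of pressing reclaim harder */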
Signed-off-by: Dennis Zhou Suggested-by: Daniel Borkmann Acked-by: Christoph Lameter Signed-off-by: Tejun Heo --- mm/percpu-km.c | 2 +- mm/percpu-vm.c | 4 ++-- mm/percpu.c | 16 +++++++--------- 3 files changed, 10 insertions(+), 12 deletions(-) (limited to 'mm') diff --git a/mm/percpu-km.c b/mm/percpu-km.c index 0d88d7bd5706..38de70ab1a0d 100644 --- a/mm/percpu-km.c +++ b/mm/percpu-km.c @@ -56,7 +56,7 @@ static struct pcpu_chunk *pcpu_create_chunk(gfp_t gfp) if (!chunk) return NULL; - pages = alloc_pages(gfp | GFP_KERNEL, order_base_2(nr_pages)); + pages = alloc_pages(gfp, order_base_2(nr_pages)); if (!pages) { pcpu_free_chunk(chunk); return NULL; diff --git a/mm/percpu-vm.c b/mm/percpu-vm.c index 0af71eb2fff0..d8078de912de 100644 --- a/mm/percpu-vm.c +++ b/mm/percpu-vm.c @@ -37,7 +37,7 @@ static struct page **pcpu_get_pages(void) lockdep_assert_held(&pcpu_alloc_mutex); if (!pages) - pages = pcpu_mem_zalloc(pages_size, 0); + pages = pcpu_mem_zalloc(pages_size, GFP_KERNEL); return pages; } @@ -86,7 +86,7 @@ static int pcpu_alloc_pages(struct pcpu_chunk *chunk, unsigned int cpu, tcpu; int i; - gfp |= GFP_KERNEL | __GFP_HIGHMEM; + gfp |= __GFP_HIGHMEM; for_each_possible_cpu(cpu) { for (i = page_start; i < page_end; i++) { diff --git a/mm/percpu.c b/mm/percpu.c index f97443d488a8..fa3f854634a1 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -454,9 +454,6 @@ static void pcpu_next_fit_region(struct pcpu_chunk *chunk, int alloc_bits, * This is to facilitate passing through whitelisted flags. The * returned memory is always zeroed. * - * CONTEXT: - * Does GFP_KERNEL allocation. - * * RETURNS: * Pointer to the allocated area on success, NULL on failure. */ @@ -466,10 +463,9 @@ static void *pcpu_mem_zalloc(size_t size, gfp_t gfp) return NULL; if (size <= PAGE_SIZE) - return kzalloc(size, gfp | GFP_KERNEL); + return kzalloc(size, gfp); else - return __vmalloc(size, gfp | GFP_KERNEL | __GFP_ZERO, - PAGE_KERNEL); + return __vmalloc(size, gfp | __GFP_ZERO, PAGE_KERNEL); } /** @@ -1344,6 +1340,8 @@ static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr) static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved, gfp_t gfp) { + /* whitelisted flags that can be passed to the backing allocators */ + gfp_t pcpu_gfp = gfp & (GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN); bool is_atomic = (gfp & GFP_KERNEL) != GFP_KERNEL; bool do_warn = !(gfp & __GFP_NOWARN); static int warn_limit = 10; @@ -1426,7 +1424,7 @@ restart: } if (list_empty(&pcpu_slot[pcpu_nr_slots - 1])) { - chunk = pcpu_create_chunk(0); + chunk = pcpu_create_chunk(pcpu_gfp); if (!chunk) { err = "failed to allocate new chunk"; goto fail; @@ -1455,7 +1453,7 @@ area_found: page_start, page_end) { WARN_ON(chunk->immutable); - ret = pcpu_populate_chunk(chunk, rs, re, 0); + ret = pcpu_populate_chunk(chunk, rs, re, pcpu_gfp); spin_lock_irqsave(&pcpu_lock, flags); if (ret) { @@ -1576,7 +1574,7 @@ void __percpu *__alloc_reserved_percpu(size_t size, size_t align) static void pcpu_balance_workfn(struct work_struct *work) { /* gfp flags passed to underlying allocators */ - const gfp_t gfp = __GFP_NORETRY | __GFP_NOWARN; + const gfp_t gfp = GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN; LIST_HEAD(to_free); struct list_head *free_head = &pcpu_slot[pcpu_nr_slots - 1]; struct pcpu_chunk *chunk, *next; -- cgit v1.2.3 From 5f171577b4f35b44795a73bde8cf2c49b4073925 Mon Sep 17 00:00:00 2001 From: James Hogan Date: Tue, 24 Oct 2017 16:52:32 +0100 Subject: Drop a bunch of metag references Now that arch/metag/ has been removed, drop a bunch of metag references 
in various codes across the whole tree: - VM_GROWSUP and __VM_ARCH_SPECIFIC_1. - MT_METAG_* ELF note types. - METAG Kconfig dependencies (FRAME_POINTER) and ranges (MAX_STACK_SIZE_MB). - metag cases in tools (checkstack.pl, recordmcount.c, perf). Signed-off-by: James Hogan Acked-by: Steven Rostedt (VMware) Acked-by: Peter Zijlstra (Intel) Reviewed-by: Guenter Roeck Cc: Ingo Molnar Cc: Arnaldo Carvalho de Melo Cc: Alexander Shishkin Cc: Jiri Olsa Cc: Namhyung Kim Cc: linux-mm@kvack.org Cc: linux-metag@vger.kernel.org --- include/linux/cpuhotplug.h | 1 - include/linux/mm.h | 2 -- include/trace/events/mmflags.h | 2 +- include/uapi/linux/elf.h | 3 --- lib/Kconfig.debug | 2 +- mm/Kconfig | 7 +++---- scripts/checkstack.pl | 4 ---- scripts/recordmcount.c | 20 -------------------- tools/perf/perf-sys.h | 4 ---- 9 files changed, 5 insertions(+), 40 deletions(-) (limited to 'mm') diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index 5172ad0daa7c..c7a950681f3a 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -108,7 +108,6 @@ enum cpuhp_state { CPUHP_AP_PERF_X86_CQM_STARTING, CPUHP_AP_PERF_X86_CSTATE_STARTING, CPUHP_AP_PERF_XTENSA_STARTING, - CPUHP_AP_PERF_METAG_STARTING, CPUHP_AP_MIPS_OP_LOONGSON3_STARTING, CPUHP_AP_ARM_SDEI_STARTING, CPUHP_AP_ARM_VFP_STARTING, diff --git a/include/linux/mm.h b/include/linux/mm.h index ad06d42adb1a..ccac10682ce5 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -241,8 +241,6 @@ extern unsigned int kobjsize(const void *objp); # define VM_SAO VM_ARCH_1 /* Strong Access Ordering (powerpc) */ #elif defined(CONFIG_PARISC) # define VM_GROWSUP VM_ARCH_1 -#elif defined(CONFIG_METAG) -# define VM_GROWSUP VM_ARCH_1 #elif defined(CONFIG_IA64) # define VM_GROWSUP VM_ARCH_1 #elif !defined(CONFIG_MMU) diff --git a/include/trace/events/mmflags.h b/include/trace/events/mmflags.h index dbe1bb058c09..a81cffb76d89 100644 --- a/include/trace/events/mmflags.h +++ b/include/trace/events/mmflags.h @@ -115,7 +115,7 @@ IF_HAVE_PG_IDLE(PG_idle, "idle" ) #define __VM_ARCH_SPECIFIC_1 {VM_PAT, "pat" } #elif defined(CONFIG_PPC) #define __VM_ARCH_SPECIFIC_1 {VM_SAO, "sao" } -#elif defined(CONFIG_PARISC) || defined(CONFIG_METAG) || defined(CONFIG_IA64) +#elif defined(CONFIG_PARISC) || defined(CONFIG_IA64) #define __VM_ARCH_SPECIFIC_1 {VM_GROWSUP, "growsup" } #elif !defined(CONFIG_MMU) #define __VM_ARCH_SPECIFIC_1 {VM_MAPPED_COPY,"mappedcopy" } diff --git a/include/uapi/linux/elf.h b/include/uapi/linux/elf.h index 3bf73fb58045..e2535d6dcec7 100644 --- a/include/uapi/linux/elf.h +++ b/include/uapi/linux/elf.h @@ -420,9 +420,6 @@ typedef struct elf64_shdr { #define NT_ARM_HW_WATCH 0x403 /* ARM hardware watchpoint registers */ #define NT_ARM_SYSTEM_CALL 0x404 /* ARM system call number */ #define NT_ARM_SVE 0x405 /* ARM Scalable Vector Extension registers */ -#define NT_METAG_CBUF 0x500 /* Metag catch buffer registers */ -#define NT_METAG_RPIPE 0x501 /* Metag read pipeline state */ -#define NT_METAG_TLS 0x502 /* Metag TLS pointer */ #define NT_ARC_V2 0x600 /* ARCv2 accumulator/extra registers */ /* Note header in a PT_NOTE section */ diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 6088408ef26c..d1c523e408e9 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -356,7 +356,7 @@ config FRAME_POINTER bool "Compile the kernel with frame pointers" depends on DEBUG_KERNEL && \ (CRIS || M68K || FRV || UML || \ - SUPERH || BLACKFIN || MN10300 || METAG) || \ + SUPERH || BLACKFIN || MN10300) || \ ARCH_WANT_FRAME_POINTERS default y if 
(DEBUG_INFO && UML) || ARCH_WANT_FRAME_POINTERS help diff --git a/mm/Kconfig b/mm/Kconfig index c782e8fb7235..abefa573bcd8 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -627,15 +627,14 @@ config GENERIC_EARLY_IOREMAP config MAX_STACK_SIZE_MB int "Maximum user stack size for 32-bit processes (MB)" default 80 - range 8 256 if METAG range 8 2048 depends on STACK_GROWSUP && (!64BIT || COMPAT) help This is the maximum stack size in Megabytes in the VM layout of 32-bit user processes when the stack grows upwards (currently only on parisc - and metag arch). The stack will be located at the highest memory - address minus the given value, unless the RLIMIT_STACK hard limit is - changed to a smaller value in which case that is used. + arch). The stack will be located at the highest memory address minus + the given value, unless the RLIMIT_STACK hard limit is changed to a + smaller value in which case that is used. A sane initial value is 80 MB. diff --git a/scripts/checkstack.pl b/scripts/checkstack.pl index cb993801e4b2..eeb9ac8dbcfb 100755 --- a/scripts/checkstack.pl +++ b/scripts/checkstack.pl @@ -64,10 +64,6 @@ my (@stack, $re, $dre, $x, $xs, $funcre); # 2b6c: 4e56 fb70 linkw %fp,#-1168 # 1df770: defc ffe4 addaw #-28,%sp $re = qr/.*(?:linkw %fp,|addaw )#-([0-9]{1,4})(?:,%sp)?$/o; - } elsif ($arch eq 'metag') { - #400026fc: 40 00 00 82 ADD A0StP,A0StP,#0x8 - $re = qr/.*ADD.*A0StP,A0StP,\#(0x$x{1,8})/o; - $funcre = qr/^$x* <[^\$](.*)>:$/; } elsif ($arch eq 'mips64') { #8800402c: 67bdfff0 daddiu sp,sp,-16 $re = qr/.*daddiu.*sp,sp,-(([0-9]{2}|[3-9])[0-9]{2})/o; diff --git a/scripts/recordmcount.c b/scripts/recordmcount.c index 16e086dcc567..8c9691c3329e 100644 --- a/scripts/recordmcount.c +++ b/scripts/recordmcount.c @@ -33,20 +33,6 @@ #include #include -/* - * glibc synced up and added the metag number but didn't add the relocations. - * Work around this in a crude manner for now. - */ -#ifndef EM_METAG -#define EM_METAG 174 -#endif -#ifndef R_METAG_ADDR32 -#define R_METAG_ADDR32 2 -#endif -#ifndef R_METAG_NONE -#define R_METAG_NONE 3 -#endif - #ifndef EM_AARCH64 #define EM_AARCH64 183 #define R_AARCH64_NONE 0 @@ -538,12 +524,6 @@ do_file(char const *const fname) gpfx = '_'; break; case EM_IA_64: reltype = R_IA64_IMM64; gpfx = '_'; break; - case EM_METAG: reltype = R_METAG_ADDR32; - altmcount = "_mcount_wrapper"; - rel_type_nop = R_METAG_NONE; - /* We happen to have the same requirement as MIPS */ - is_fake_mcount32 = MIPS32_is_fake_mcount; - break; case EM_MIPS: /* reltype: e_class */ gpfx = '_'; break; case EM_PPC: reltype = R_PPC_ADDR32; gpfx = '_'; break; case EM_PPC64: reltype = R_PPC64_ADDR64; gpfx = '_'; break; diff --git a/tools/perf/perf-sys.h b/tools/perf/perf-sys.h index 36673f98d66b..3eb7a39169f6 100644 --- a/tools/perf/perf-sys.h +++ b/tools/perf/perf-sys.h @@ -46,10 +46,6 @@ #define CPUINFO_PROC {"Processor"} #endif -#ifdef __metag__ -#define CPUINFO_PROC {"CPU"} -#endif - #ifdef __xtensa__ #define CPUINFO_PROC {"core ID"} #endif -- cgit v1.2.3 From accd4f36a7d11c2d54544007eb65e10604dcf2f5 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 23 Feb 2018 08:12:42 -0800 Subject: percpu: add a schedule point in pcpu_balance_workfn() When a large BPF percpu map is destroyed, I have seen pcpu_balance_workfn() holding cpu for hundreds of milliseconds. On KASAN config and 112 hyperthreads, average time to destroy a chunk is ~4 ms. [ 2489.841376] destroy chunk 1 in 4148689 ns ... 
[ 2490.093428] destroy chunk 32 in 4072718 ns Signed-off-by: Eric Dumazet Signed-off-by: Tejun Heo --- mm/percpu.c | 1 + 1 file changed, 1 insertion(+) (limited to 'mm') diff --git a/mm/percpu.c b/mm/percpu.c index fa3f854634a1..36e7b65ba6cf 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -1610,6 +1610,7 @@ static void pcpu_balance_workfn(struct work_struct *work) spin_unlock_irq(&pcpu_lock); } pcpu_destroy_chunk(chunk); + cond_resched(); } /* -- cgit v1.2.3 From 4704dea36dd9e5b4bf37ed20f7f15e70632ccdd0 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Fri, 9 Mar 2018 15:50:55 -0800 Subject: hugetlb: fix surplus pages accounting Dan Rue has noticed that the libhugetlbfs test suite fails the counter test: # mount_point="/mnt/hugetlb/" # echo 200 > /proc/sys/vm/nr_hugepages # mkdir -p "${mount_point}" # mount -t hugetlbfs hugetlbfs "${mount_point}" # export LD_LIBRARY_PATH=/root/libhugetlbfs/libhugetlbfs-2.20/obj64 # /root/libhugetlbfs/libhugetlbfs-2.20/tests/obj64/counters Starting testcase "/root/libhugetlbfs/libhugetlbfs-2.20/tests/obj64/counters", pid 3319 Base pool size: 0 Clean... FAIL Line 326: Bad HugePages_Total: expected 0, actual 1 The bug was bisected to 0c397daea1d4 ("mm, hugetlb: further simplify hugetlb allocation API"). The reason is that alloc_surplus_huge_page() misaccounts per-node surplus pages. We should increase surplus_huge_pages_node rather than nr_huge_pages_node, which is already handled by alloc_fresh_huge_page. Link: http://lkml.kernel.org/r/20180221191439.GM2231@dhcp22.suse.cz Fixes: 0c397daea1d4 ("mm, hugetlb: further simplify hugetlb allocation API") Signed-off-by: Michal Hocko Reported-by: Dan Rue Tested-by: Dan Rue Reviewed-by: Mike Kravetz Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 7c204e3d132b..a963f2034dfc 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1583,7 +1583,7 @@ static struct page *alloc_surplus_huge_page(struct hstate *h, gfp_t gfp_mask, page = NULL; } else { h->surplus_huge_pages++; - h->nr_huge_pages_node[page_to_nid(page)]++; + h->surplus_huge_pages_node[page_to_nid(page)]++; } out_unlock: -- cgit v1.2.3 From 96312e61282ae3f6537a562625706498cbc75594 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Fri, 9 Mar 2018 15:51:06 -0800 Subject: mm/gup.c: teach get_user_pages_unlocked to handle FOLL_NOWAIT KVM is hanging during postcopy live migration with userfaultfd because get_user_pages_unlocked is not capable of handling FOLL_NOWAIT. Earlier FOLL_NOWAIT was only ever passed to get_user_pages. Specifically, faultin_page (the callee of the get_user_pages_unlocked caller) doesn't know that if FAULT_FLAG_RETRY_NOWAIT was set in the page fault flags, when VM_FAULT_RETRY is returned, the mmap_sem wasn't actually released (even if nonblocking is not NULL). So it sets *nonblocking to zero and the caller won't release the mmap_sem thinking it was already released, but it wasn't because of FOLL_NOWAIT. Link: http://lkml.kernel.org/r/20180302174343.5421-2-aarcange@redhat.com Fixes: ce53053ce378c ("kvm: switch get_user_page_nowait() to get_user_pages_unlocked()") Signed-off-by: Andrea Arcangeli Reported-by: Dr. David Alan Gilbert Tested-by: Dr.
David Alan Gilbert Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/gup.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/gup.c b/mm/gup.c index 1b46e6e74881..6afae32571ca 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -516,7 +516,7 @@ static int faultin_page(struct task_struct *tsk, struct vm_area_struct *vma, } if (ret & VM_FAULT_RETRY) { - if (nonblocking) + if (nonblocking && !(fault_flags & FAULT_FLAG_RETRY_NOWAIT)) *nonblocking = 0; return -EBUSY; } @@ -890,7 +890,10 @@ static __always_inline long __get_user_pages_locked(struct task_struct *tsk, break; } if (*locked) { - /* VM_FAULT_RETRY didn't trigger */ + /* + * VM_FAULT_RETRY didn't trigger or it was a + * FOLL_NOWAIT. + */ if (!pages_done) pages_done = ret; break; -- cgit v1.2.3 From 379b03b7fa05f7db521b7732a52692448a3c34fe Mon Sep 17 00:00:00 2001 From: Daniel Vacek Date: Fri, 9 Mar 2018 15:51:09 -0800 Subject: mm/memblock.c: hardcode the end_pfn being -1 This is just a cleanup. It aids handling the special end case in the next commit. [akpm@linux-foundation.org: make it work against current -linus, not against -mm] [akpm@linux-foundation.org: make it work against current -linus, not against -mm some more] Link: http://lkml.kernel.org/r/1ca478d4269125a99bcfb1ca04d7b88ac1aee924.1520011944.git.neelx@redhat.com Signed-off-by: Daniel Vacek Cc: Michal Hocko Cc: Vlastimil Babka Cc: Mel Gorman Cc: Pavel Tatashin Cc: Paul Burton Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memblock.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'mm') diff --git a/mm/memblock.c b/mm/memblock.c index 5a9ca2a1751b..b6ba6b7adadc 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -1107,7 +1107,7 @@ unsigned long __init_memblock memblock_next_valid_pfn(unsigned long pfn, struct memblock_type *type = &memblock.memory; unsigned int right = type->cnt; unsigned int mid, left = 0; - phys_addr_t addr = PFN_PHYS(pfn + 1); + phys_addr_t addr = PFN_PHYS(++pfn); do { mid = (right + left) / 2; @@ -1118,15 +1118,15 @@ unsigned long __init_memblock memblock_next_valid_pfn(unsigned long pfn, type->regions[mid].size)) left = mid + 1; else { - /* addr is within the region, so pfn + 1 is valid */ - return min(pfn + 1, max_pfn); + /* addr is within the region, so pfn is valid */ + return pfn; } } while (left < right); if (right == type->cnt) - return max_pfn; + return -1UL; else - return min(PHYS_PFN(type->regions[right].base), max_pfn); + return PHYS_PFN(type->regions[right].base); } /** -- cgit v1.2.3 From 864b75f9d6b0100bb24fdd9a20d156e7cda9b5ae Mon Sep 17 00:00:00 2001 From: Daniel Vacek Date: Fri, 9 Mar 2018 15:51:13 -0800 Subject: mm/page_alloc: fix memmap_init_zone pageblock alignment Commit b92df1de5d28 ("mm: page_alloc: skip over regions of invalid pfns where possible") introduced a bug where move_freepages() triggers a VM_BUG_ON() on uninitialized page structure due to pageblock alignment. To fix this, simply align the skipped pfns in memmap_init_zone() the same way as in move_freepages_block(). Seen in one of the RHEL reports: crash> log | grep -e BUG -e RIP -e Call.Trace -e move_freepages_block -e rmqueue -e freelist -A1 kernel BUG at mm/page_alloc.c:1389! 
invalid opcode: 0000 [#1] SMP -- RIP: 0010:[] [] move_freepages+0x15e/0x160 RSP: 0018:ffff88054d727688 EFLAGS: 00010087 -- Call Trace: [] move_freepages_block+0x73/0x80 [] __rmqueue+0x263/0x460 [] get_page_from_freelist+0x7e1/0x9e0 [] __alloc_pages_nodemask+0x176/0x420 -- RIP [] move_freepages+0x15e/0x160 RSP crash> page_init_bug -v | grep RAM 1000 - 9bfff System RAM (620.00 KiB) 100000 - 430bffff System RAM ( 1.05 GiB = 1071.75 MiB = 1097472.00 KiB) 4b0c8000 - 4bf9cfff System RAM ( 14.83 MiB = 15188.00 KiB) 4bfac000 - 646b1fff System RAM (391.02 MiB = 400408.00 KiB) 7b788000 - 7b7fffff System RAM (480.00 KiB) 100000000 - 67fffffff System RAM ( 22.00 GiB) crash> page_init_bug | head -6 7b788000 - 7b7fffff System RAM (480.00 KiB) 1fffff00000000 0 1 DMA32 4096 1048575 505736 505344 505855 0 0 0 DMA 1 4095 1fffff00000400 0 1 DMA32 4096 1048575 BUG, zones differ! Note that this range follows two not populated sections 68000000-77ffffff in this zone. 7b788000-7b7fffff is the first one after a gap. This makes memmap_init_zone() skip all the pfns up to the beginning of this range. But this range is not pageblock (2M) aligned. In fact no range has to be. crash> kmem -p 77fff000 78000000 7b5ff000 7b600000 7b787000 7b788000 PAGE PHYSICAL MAPPING INDEX CNT FLAGS ffffea0001e00000 78000000 0 0 0 0 ffffea0001ed7fc0 7b5ff000 0 0 0 0 ffffea0001ed8000 7b600000 0 0 0 0 <<<< ffffea0001ede1c0 7b787000 0 0 0 0 ffffea0001ede200 7b788000 0 0 1 1fffff00000000 Top part of page flags should contain nodeid and zonenr, which is not the case for page ffffea0001ed8000 here (<<<<). crash> log | grep -o fffea0001ed[^\ ]* | sort -u fffea0001ed8000 fffea0001eded20 fffea0001edffc0 crash> bt -r | grep -o fffea0001ed[^\ ]* | sort -u fffea0001ed8000 fffea0001eded00 fffea0001eded20 fffea0001edffc0 Initialization of the whole beginning of the section is skipped up to the start of the range due to the commit b92df1de5d28. Now any code calling move_freepages_block() (like reusing the page from a freelist as in this example) with a page from the beginning of the range will get the page rounded down to start_page ffffea0001ed8000 and passed to move_freepages() which crashes on assertion getting wrong zonenr. > VM_BUG_ON(page_zone(start_page) != page_zone(end_page)); Note, page_zone() derives the zone from page flags here. From similar machine before commit b92df1de5d28: crash> kmem -p 77fff000 78000000 7b5ff000 7b600000 7b7fe000 7b7ff000 PAGE PHYSICAL MAPPING INDEX CNT FLAGS fffff73941e00000 78000000 0 0 1 1fffff00000000 fffff73941ed7fc0 7b5ff000 0 0 1 1fffff00000000 fffff73941ed8000 7b600000 0 0 1 1fffff00000000 fffff73941edff80 7b7fe000 0 0 1 1fffff00000000 fffff73941edffc0 7b7ff000 ffff8e67e04d3ae0 ad84 1 1fffff00020068 uptodate,lru,active,mappedtodisk All the pages since the beginning of the section are initialized. move_freepages()' not gonna blow up. The same machine with this fix applied: crash> kmem -p 77fff000 78000000 7b5ff000 7b600000 7b7fe000 7b7ff000 PAGE PHYSICAL MAPPING INDEX CNT FLAGS ffffea0001e00000 78000000 0 0 0 0 ffffea0001e00000 7b5ff000 0 0 0 0 ffffea0001ed8000 7b600000 0 0 1 1fffff00000000 ffffea0001edff80 7b7fe000 0 0 1 1fffff00000000 ffffea0001edffc0 7b7ff000 ffff88017fb13720 8 2 1fffff00020068 uptodate,lru,active,mappedtodisk At least the bare minimum of pages is initialized preventing the crash as well. Customers started to report this as soon as 7.4 (where b92df1de5d28 was merged in RHEL) was released. I remember reports from September/October-ish times. 
It's not easily reproduced and happens on a handful of machines only. I guess that's why. But that does not make it less serious, I think. Though there actually is a report here: https://bugzilla.kernel.org/show_bug.cgi?id=196443 And there are reports for Fedora from July: https://bugzilla.redhat.com/show_bug.cgi?id=1473242 and CentOS: https://bugs.centos.org/view.php?id=13964 and we internally track several dozen reports for RHEL bug https://bugzilla.redhat.com/show_bug.cgi?id=1525121 Link: http://lkml.kernel.org/r/0485727b2e82da7efbce5f6ba42524b429d0391a.1520011945.git.neelx@redhat.com Fixes: b92df1de5d28 ("mm: page_alloc: skip over regions of invalid pfns where possible") Signed-off-by: Daniel Vacek Cc: Mel Gorman Cc: Michal Hocko Cc: Paul Burton Cc: Pavel Tatashin Cc: Vlastimil Babka Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index cb416723538f..3d974cb2a1a1 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5359,9 +5359,14 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, /* * Skip to the pfn preceding the next valid one (or * end_pfn), such that we hit a valid pfn (or end_pfn) - * on our next iteration of the loop. + * on our next iteration of the loop. Note that it needs + * to be pageblock aligned even when the region itself + * is not. move_freepages_block() can shift ahead of + * the valid region but still depends on correct page + * metadata. */ - pfn = memblock_next_valid_pfn(pfn, end_pfn) - 1; + pfn = (memblock_next_valid_pfn(pfn, end_pfn) & + ~(pageblock_nr_pages-1)) - 1; #endif continue; } -- cgit v1.2.3 From 3e04040df6d4613a8af5a80882d5f7f298f49810 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Wed, 14 Mar 2018 19:29:37 +0000 Subject: Revert "mm/page_alloc: fix memmap_init_zone pageblock alignment" This reverts commit 864b75f9d6b0100bb24fdd9a20d156e7cda9b5ae. Commit 864b75f9d6b0 ("mm/page_alloc: fix memmap_init_zone pageblock alignment") modified the logic in memmap_init_zone() to initialize struct pages associated with invalid PFNs, to appease a VM_BUG_ON() in move_freepages(), which is redundant by its own admission, and dereferences struct page fields to obtain the zone without checking whether the struct pages in question are valid to begin with. Commit 864b75f9d6b0 only makes it worse, since the rounding it does may cause pfn to assume the same value it had in a prior iteration of the loop, resulting in an infinite loop and a hang very early in the boot. Also, since it doesn't perform the same rounding on start_pfn itself but only on intermediate values following an invalid PFN, we may still hit the same VM_BUG_ON() as before. So instead, let's fix this at the core, and ensure that the BUG check doesn't dereference struct page fields of invalid pages.
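To see the lack of forward progress with the numbers from the crash report quoted in the reverted commit (an illustrative walk-through, assuming 4K pages so that pageblock_nr_pages == 512): the hole spans pfns 0x78000-0x7b787 and the first valid pfn after it, 0x7b788, is not pageblock aligned.

    pfn  = 0x7b600;                                /* invalid pfn inside the hole */
    next = memblock_next_valid_pfn(pfn, end_pfn);  /* returns 0x7b788             */
    pfn  = (next & ~(pageblock_nr_pages - 1)) - 1; /* 0x7b600 - 1 = 0x7b5ff       */
    /* the loop's pfn++ then lands back on 0x7b600: an infinite loop */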
Fixes: 864b75f9d6b0 ("mm/page_alloc: fix memmap_init_zone pageblock alignment") Tested-by: Jan Glauber Tested-by: Shanker Donthineni Cc: Daniel Vacek Cc: Mel Gorman Cc: Michal Hocko Cc: Paul Burton Cc: Pavel Tatashin Cc: Vlastimil Babka Cc: Andrew Morton Signed-off-by: Ard Biesheuvel Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 3d974cb2a1a1..635d7dd29d7f 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1910,7 +1910,9 @@ static int move_freepages(struct zone *zone, * Remove at a later date when no bug reports exist related to * grouping pages by mobility */ - VM_BUG_ON(page_zone(start_page) != page_zone(end_page)); + VM_BUG_ON(pfn_valid(page_to_pfn(start_page)) && + pfn_valid(page_to_pfn(end_page)) && + page_zone(start_page) != page_zone(end_page)); #endif if (num_movable) @@ -5359,14 +5361,9 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, /* * Skip to the pfn preceding the next valid one (or * end_pfn), such that we hit a valid pfn (or end_pfn) - * on our next iteration of the loop. Note that it needs - * to be pageblock aligned even when the region itself - * is not. move_freepages_block() can shift ahead of - * the valid region but still depends on correct page - * metadata. + * on our next iteration of the loop. */ - pfn = (memblock_next_valid_pfn(pfn, end_pfn) & - ~(pageblock_nr_pages-1)) - 1; + pfn = memblock_next_valid_pfn(pfn, end_pfn) - 1; #endif continue; } -- cgit v1.2.3 From 1a8429132e1d2ada3832db5b4a0802c49affb750 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 9 Mar 2018 23:11:26 +0100 Subject: mm: remove blackfin MPU support The CONFIG_MPU option was only defined on blackfin, and that architecture is now being removed, so the respective code can be simplified. A lot of other microcontrollers have an MPU, but I suspect that if we want to bring that support back, we'd do it differently anyway. 
Signed-off-by: Arnd Bergmann --- kernel/module.c | 4 ---- mm/nommu.c | 20 -------------------- 2 files changed, 24 deletions(-) (limited to 'mm') diff --git a/kernel/module.c b/kernel/module.c index ad2d420024f6..2c1df850029b 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -2181,10 +2181,6 @@ static void free_module(struct module *mod) /* Finally, free the core (containing the module structure) */ disable_ro_nx(&mod->core_layout); module_memfree(mod->core_layout.base); - -#ifdef CONFIG_MPU - update_protections(current->mm); -#endif } void *__symbol_get(const char *symbol) diff --git a/mm/nommu.c b/mm/nommu.c index ebb6e618dade..838a8fdec5c2 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -662,22 +662,6 @@ static void put_nommu_region(struct vm_region *region) __put_nommu_region(region); } -/* - * update protection on a vma - */ -static void protect_vma(struct vm_area_struct *vma, unsigned long flags) -{ -#ifdef CONFIG_MPU - struct mm_struct *mm = vma->vm_mm; - long start = vma->vm_start & PAGE_MASK; - while (start < vma->vm_end) { - protect_page(mm, start, flags); - start += PAGE_SIZE; - } - update_protections(mm); -#endif -} - /* * add a VMA into a process's mm_struct in the appropriate place in the list * and tree and add to the address space's page tree also if not an anonymous @@ -695,8 +679,6 @@ static void add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma) mm->map_count++; vma->vm_mm = mm; - protect_vma(vma, vma->vm_flags); - /* add the VMA to the mapping */ if (vma->vm_file) { mapping = vma->vm_file->f_mapping; @@ -757,8 +739,6 @@ static void delete_vma_from_mm(struct vm_area_struct *vma) struct mm_struct *mm = vma->vm_mm; struct task_struct *curr = current; - protect_vma(vma, 0); - mm->map_count--; for (i = 0; i < VMACACHE_SIZE; i++) { /* if the vma is cached, invalidate the entire cache */ -- cgit v1.2.3 From 79375ea3ec527f746d5beae8c8f6e8a58740d4a8 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 9 Mar 2018 23:14:56 +0100 Subject: mm: remove obsolete alloc_remap() Tile was the only remaining architecture to implement alloc_remap(), and since that is being removed, there is no point in keeping this function. Removing all callers simplifies the mem_map handling. 
Reviewed-by: Pavel Tatashin Signed-off-by: Arnd Bergmann --- include/linux/bootmem.h | 9 --------- mm/page_alloc.c | 5 +---- mm/sparse.c | 15 --------------- 3 files changed, 1 insertion(+), 28 deletions(-) (limited to 'mm') diff --git a/include/linux/bootmem.h b/include/linux/bootmem.h index a53063e9d7d8..7942a96b1a9d 100644 --- a/include/linux/bootmem.h +++ b/include/linux/bootmem.h @@ -364,15 +364,6 @@ static inline void __init memblock_free_late( } #endif /* defined(CONFIG_HAVE_MEMBLOCK) && defined(CONFIG_NO_BOOTMEM) */ -#ifdef CONFIG_HAVE_ARCH_ALLOC_REMAP -extern void *alloc_remap(int nid, unsigned long size); -#else -static inline void *alloc_remap(int nid, unsigned long size) -{ - return NULL; -} -#endif /* CONFIG_HAVE_ARCH_ALLOC_REMAP */ - extern void *alloc_large_system_hash(const char *tablename, unsigned long bucketsize, unsigned long numentries, diff --git a/mm/page_alloc.c b/mm/page_alloc.c index cb416723538f..484e21062228 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -6199,10 +6199,7 @@ static void __ref alloc_node_mem_map(struct pglist_data *pgdat) end = pgdat_end_pfn(pgdat); end = ALIGN(end, MAX_ORDER_NR_PAGES); size = (end - start) * sizeof(struct page); - map = alloc_remap(pgdat->node_id, size); - if (!map) - map = memblock_virt_alloc_node_nopanic(size, - pgdat->node_id); + map = memblock_virt_alloc_node_nopanic(size, pgdat->node_id); pgdat->node_mem_map = map + offset; } pr_debug("%s: node %d, pgdat %08lx, node_mem_map %08lx\n", diff --git a/mm/sparse.c b/mm/sparse.c index 7af5e7a92528..65bb52599f90 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -427,10 +427,6 @@ struct page __init *sparse_mem_map_populate(unsigned long pnum, int nid, struct page *map; unsigned long size; - map = alloc_remap(nid, sizeof(struct page) * PAGES_PER_SECTION); - if (map) - return map; - size = PAGE_ALIGN(sizeof(struct page) * PAGES_PER_SECTION); map = memblock_virt_alloc_try_nid(size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS), @@ -446,17 +442,6 @@ void __init sparse_mem_maps_populate_node(struct page **map_map, unsigned long pnum; unsigned long size = sizeof(struct page) * PAGES_PER_SECTION; - map = alloc_remap(nodeid, size * map_count); - if (map) { - for (pnum = pnum_begin; pnum < pnum_end; pnum++) { - if (!present_section_nr(pnum)) - continue; - map_map[pnum] = map; - map += size; - } - return; - } - size = PAGE_ALIGN(size); map = memblock_virt_alloc_try_nid_raw(size * map_count, PAGE_SIZE, __pa(MAX_DMA_ADDRESS), -- cgit v1.2.3 From 71546d100422bcc2c543dadeb9328728997cd23a Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 14 Mar 2018 08:27:26 -0700 Subject: percpu: include linux/sched.h for cond_resched() microblaze build broke due to missing declaration of the cond_resched() invocation added recently. Let's include linux/sched.h explicitly. Signed-off-by: Tejun Heo Reported-by: kbuild test robot --- mm/percpu.c | 1 + 1 file changed, 1 insertion(+) (limited to 'mm') diff --git a/mm/percpu.c b/mm/percpu.c index 36e7b65ba6cf..15a398c00791 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -80,6 +80,7 @@ #include #include #include +#include #include #include -- cgit v1.2.3 From f52ba1fef7b92e74d58efef8eae7b6f48c6d218d Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Mon, 19 Mar 2018 18:32:10 +0300 Subject: mm: Allow to kill tasks doing pcpu_alloc() and waiting for pcpu_balance_workfn() In case of memory deficit and low percpu memory pages, pcpu_balance_workfn() takes pcpu_alloc_mutex for a long time (as it makes memory allocations itself and waits for memory reclaim). 
If tasks doing pcpu_alloc() are chosen by the OOM killer, they can't exit, because they are waiting for the mutex. The patch makes pcpu_alloc() care about the killing signal and use mutex_lock_killable() when it's allowed by the GFP flags. This guarantees that a task does not miss SIGKILL from the OOM killer. Signed-off-by: Kirill Tkhai Signed-off-by: Tejun Heo --- mm/percpu.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/percpu.c b/mm/percpu.c index 15a398c00791..9297098519a6 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -1373,8 +1373,17 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved, return NULL; } - if (!is_atomic) - mutex_lock(&pcpu_alloc_mutex); + if (!is_atomic) { + /* + * pcpu_balance_workfn() allocates memory under this mutex, + * and it may wait for memory reclaim. Allow current task + * to become OOM victim, in case of memory pressure. + */ + if (gfp & __GFP_NOFAIL) + mutex_lock(&pcpu_alloc_mutex); + else if (mutex_lock_killable(&pcpu_alloc_mutex)) + return NULL; + } spin_lock_irqsave(&pcpu_lock, flags); -- cgit v1.2.3 From 8970a63e965b43288c4f5f40efbc2bbf80de7f16 Mon Sep 17 00:00:00 2001 From: Yisheng Xie Date: Thu, 22 Mar 2018 16:17:02 -0700 Subject: mm/mempolicy.c: avoid use of uninitialized preferred_node Alexander reported a use of uninitialized memory in __mpol_equal(), which is caused by incorrect use of preferred_node. When a mempolicy is in mode MPOL_PREFERRED with flag MPOL_F_LOCAL, it uses numa_node_id() instead of preferred_node; however, __mpol_equal() uses preferred_node without checking whether it is MPOL_F_LOCAL or not. [akpm@linux-foundation.org: slight comment tweak] Link: http://lkml.kernel.org/r/4ebee1c2-57f6-bcb8-0e2d-1833d1ee0bb7@huawei.com Fixes: fc36b8d3d819 ("mempolicy: use MPOL_F_LOCAL to Indicate Preferred Local Policy") Signed-off-by: Yisheng Xie Reported-by: Alexander Potapenko Tested-by: Alexander Potapenko Reviewed-by: Andrew Morton Cc: Dmitriy Vyukov Cc: Vlastimil Babka Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempolicy.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'mm') diff --git a/mm/mempolicy.c b/mm/mempolicy.c index d879f1d8a44a..32cba0332787 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -2124,6 +2124,9 @@ bool __mpol_equal(struct mempolicy *a, struct mempolicy *b) case MPOL_INTERLEAVE: return !!nodes_equal(a->v.nodes, b->v.nodes); case MPOL_PREFERRED: + /* a's ->flags is the same as b's */ + if (a->flags & MPOL_F_LOCAL) + return true; return a->v.preferred_node == b->v.preferred_node; default: BUG(); -- cgit v1.2.3 From 2e517d681632326ed98399cb4dd99519efe3e32c Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Thu, 22 Mar 2018 16:17:10 -0700 Subject: lockdep: fix fs_reclaim warning Dave Jones reported fs_reclaim lockdep warnings.
============================================ WARNING: possible recursive locking detected 4.15.0-rc9-backup-debug+ #1 Not tainted -------------------------------------------- sshd/24800 is trying to acquire lock: (fs_reclaim){+.+.}, at: [<0000000084f438c2>] fs_reclaim_acquire.part.102+0x5/0x30 but task is already holding lock: (fs_reclaim){+.+.}, at: [<0000000084f438c2>] fs_reclaim_acquire.part.102+0x5/0x30 other info that might help us debug this: Possible unsafe locking scenario: CPU0 ---- lock(fs_reclaim); lock(fs_reclaim); *** DEADLOCK *** May be due to missing lock nesting notation 2 locks held by sshd/24800: #0: (sk_lock-AF_INET6){+.+.}, at: [<000000001a069652>] tcp_sendmsg+0x19/0x40 #1: (fs_reclaim){+.+.}, at: [<0000000084f438c2>] fs_reclaim_acquire.part.102+0x5/0x30 stack backtrace: CPU: 3 PID: 24800 Comm: sshd Not tainted 4.15.0-rc9-backup-debug+ #1 Call Trace: dump_stack+0xbc/0x13f __lock_acquire+0xa09/0x2040 lock_acquire+0x12e/0x350 fs_reclaim_acquire.part.102+0x29/0x30 kmem_cache_alloc+0x3d/0x2c0 alloc_extent_state+0xa7/0x410 __clear_extent_bit+0x3ea/0x570 try_release_extent_mapping+0x21a/0x260 __btrfs_releasepage+0xb0/0x1c0 btrfs_releasepage+0x161/0x170 try_to_release_page+0x162/0x1c0 shrink_page_list+0x1d5a/0x2fb0 shrink_inactive_list+0x451/0x940 shrink_node_memcg.constprop.88+0x4c9/0x5e0 shrink_node+0x12d/0x260 try_to_free_pages+0x418/0xaf0 __alloc_pages_slowpath+0x976/0x1790 __alloc_pages_nodemask+0x52c/0x5c0 new_slab+0x374/0x3f0 ___slab_alloc.constprop.81+0x47e/0x5a0 __slab_alloc.constprop.80+0x32/0x60 __kmalloc_track_caller+0x267/0x310 __kmalloc_reserve.isra.40+0x29/0x80 __alloc_skb+0xee/0x390 sk_stream_alloc_skb+0xb8/0x340 tcp_sendmsg_locked+0x8e6/0x1d30 tcp_sendmsg+0x27/0x40 inet_sendmsg+0xd0/0x310 sock_write_iter+0x17a/0x240 __vfs_write+0x2ab/0x380 vfs_write+0xfb/0x260 SyS_write+0xb6/0x140 do_syscall_64+0x1e5/0xc05 entry_SYSCALL64_slow_path+0x25/0x25 This warning is caused by commit d92a8cfcb37e ("locking/lockdep: Rework FS_RECLAIM annotation"), which replaced the use of lockdep_{set,clear}_current_reclaim_state() in __perform_reclaim() and lockdep_trace_alloc() in slab_pre_alloc_hook() with fs_reclaim_acquire()/ fs_reclaim_release(). Since __kmalloc_reserve() from __alloc_skb() adds __GFP_NOMEMALLOC | __GFP_NOWARN to gfp_mask, and all reclaim paths simply propagate __GFP_NOMEMALLOC, fs_reclaim_acquire() in slab_pre_alloc_hook() tries to grab the 'fake' lock again when __perform_reclaim() has already grabbed the 'fake' lock. The /* this guy won't enter reclaim */ if ((current->flags & PF_MEMALLOC) && !(gfp_mask & __GFP_NOMEMALLOC)) return false; test which causes slab_pre_alloc_hook() to try to grab the 'fake' lock was added by commit cf40bd16fdad ("lockdep: annotate reclaim context (__GFP_NOFS)"). But that test is outdated, because a PF_MEMALLOC thread won't enter reclaim regardless of __GFP_NOMEMALLOC after commit 341ce06f69ab ("page allocator: calculate the alloc_flags for allocation only once") added the PF_MEMALLOC safeguard ( /* Avoid recursion of direct reclaim */ if (p->flags & PF_MEMALLOC) goto nopage; in __alloc_pages_slowpath()). Thus, let's fix the outdated test by removing the __GFP_NOMEMALLOC check and allowing __need_fs_reclaim() to return false.
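To read the one-line fix more easily than in the diff below, here is a minimal before/after sketch of the guard in __need_fs_reclaim(), condensed from this patch (not a complete function):

	/* Before: for a __GFP_NOMEMALLOC allocation this condition is false,
	 * so a task already inside direct reclaim did not bail out here and
	 * fs_reclaim_acquire() grabbed the 'fake' lock a second time. */
	if ((current->flags & PF_MEMALLOC) && !(gfp_mask & __GFP_NOMEMALLOC))
		return false;

	/* After: any PF_MEMALLOC task skips the annotation entirely, since
	 * __alloc_pages_slowpath() bails out on PF_MEMALLOC and such a task
	 * can never recurse into reclaim, whatever its gfp_mask says. */
	if (current->flags & PF_MEMALLOC)
		return false;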
Link: http://lkml.kernel.org/r/201802280650.FJC73911.FOSOMLJVFFQtHO@I-love.SAKURA.ne.jp Fixes: d92a8cfcb37ecd13 ("locking/lockdep: Rework FS_RECLAIM annotation") Signed-off-by: Tetsuo Handa Reported-by: Dave Jones Tested-by: Dave Jones Cc: Peter Zijlstra Cc: Nick Piggin Cc: Ingo Molnar Cc: Nikolay Borisov Cc: Michal Hocko Cc: [4.14+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 635d7dd29d7f..010dee0f089e 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3596,7 +3596,7 @@ static bool __need_fs_reclaim(gfp_t gfp_mask) return false; /* this guy won't enter reclaim */ - if ((current->flags & PF_MEMALLOC) && !(gfp_mask & __GFP_NOMEMALLOC)) + if (current->flags & PF_MEMALLOC) return false; /* We're only interested __GFP_FS allocations for now */ -- cgit v1.2.3 From 63489f8e821144000e0bdca7e65a8d1cc23a7ee7 Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Thu, 22 Mar 2018 16:17:13 -0700 Subject: hugetlbfs: check for pgoff value overflow A vma with vm_pgoff large enough to overflow a loff_t type when converted to a byte offset can be passed via the remap_file_pages system call. The hugetlbfs mmap routine uses the byte offset to calculate reservations and file size. A sequence such as: mmap(0x20a00000, 0x600000, 0, 0x66033, -1, 0); remap_file_pages(0x20a00000, 0x600000, 0, 0x20000000000000, 0); will result in the following when the task exits or the file is closed: kernel BUG at mm/hugetlb.c:749! Call Trace: hugetlbfs_evict_inode+0x2f/0x40 evict+0xcb/0x190 __dentry_kill+0xcb/0x150 __fput+0x164/0x1e0 task_work_run+0x84/0xa0 exit_to_usermode_loop+0x7d/0x80 do_syscall_64+0x18b/0x190 entry_SYSCALL_64_after_hwframe+0x3d/0xa2 The overflowed pgoff value causes hugetlbfs to try to set up a mapping with a negative range (end < start), leaving invalid state that causes the BUG. The previous overflow fix to this code was incomplete and did not take the remap_file_pages system call into account. [mike.kravetz@oracle.com: v3] Link: http://lkml.kernel.org/r/20180309002726.7248-1-mike.kravetz@oracle.com [akpm@linux-foundation.org: include mmdebug.h] [akpm@linux-foundation.org: fix -ve left shift count on sh] Link: http://lkml.kernel.org/r/20180308210502.15952-1-mike.kravetz@oracle.com Fixes: 045c7a3f53d9 ("hugetlbfs: fix offset overflow in hugetlbfs mmap") Signed-off-by: Mike Kravetz Reported-by: Nic Losby Acked-by: Michal Hocko Cc: "Kirill A . Shutemov" Cc: Yisheng Xie Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/hugetlbfs/inode.c | 17 ++++++++++++++--- mm/hugetlb.c | 7 +++++++ 2 files changed, 21 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 8fe1b0aa2896..b9a254dcc0e7 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -108,6 +108,16 @@ static void huge_pagevec_release(struct pagevec *pvec) pagevec_reinit(pvec); } +/* + * Mask used when checking the page offset value passed in via system + * calls. This value will be converted to a loff_t which is signed. + * Therefore, we want to check the upper PAGE_SHIFT + 1 bits of the + * value. The extra bit (- 1 in the shift value) is to take the sign + * bit into account.
+ */ +#define PGOFF_LOFFT_MAX \ + (((1UL << (PAGE_SHIFT + 1)) - 1) << (BITS_PER_LONG - (PAGE_SHIFT + 1))) + static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma) { struct inode *inode = file_inode(file); @@ -127,12 +137,13 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma) vma->vm_ops = &hugetlb_vm_ops; /* - * Offset passed to mmap (before page shift) could have been - * negative when represented as a (l)off_t. + * page based offset in vm_pgoff could be sufficiently large to + * overflow a (l)off_t when converted to byte offset. */ - if (((loff_t)vma->vm_pgoff << PAGE_SHIFT) < 0) + if (vma->vm_pgoff & PGOFF_LOFFT_MAX) return -EINVAL; + /* must be huge page aligned */ if (vma->vm_pgoff & (~huge_page_mask(h) >> PAGE_SHIFT)) return -EINVAL; diff --git a/mm/hugetlb.c b/mm/hugetlb.c index a963f2034dfc..976bbc5646fe 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -4374,6 +4375,12 @@ int hugetlb_reserve_pages(struct inode *inode, struct resv_map *resv_map; long gbl_reserve; + /* This should never happen */ + if (from > to) { + VM_WARN(1, "%s called with a negative range\n", __func__); + return -EINVAL; + } + /* * Only apply hugepage reservation if asked. At fault time, an * attempt will be made for VM_NORESERVE to allocate a page -- cgit v1.2.3 From fece2029a9e65b9a990831afe2a2b83290cbbe26 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Thu, 22 Mar 2018 16:17:28 -0700 Subject: mm/khugepaged.c: convert VM_BUG_ON() to collapse fail khugepaged is not yet able to convert PTE-mapped huge pages back to PMD-mapped ones. We do not collapse such pages; see the check in khugepaged_scan_pmd(). But if, between khugepaged_scan_pmd() and __collapse_huge_page_isolate(), somebody managed to instantiate a THP in the range and then split the PMD back to PTEs, we would have a problem -- VM_BUG_ON_PAGE(PageCompound(page)) will get triggered. This is possible since we drop mmap_sem during collapse and re-take it for write. Replace the VM_BUG_ON() with a graceful collapse failure. Link: http://lkml.kernel.org/r/20180315152353.27989-1-kirill.shutemov@linux.intel.com Fixes: b1caa957ae6d ("khugepaged: ignore pmd tables with THP mapped with ptes") Signed-off-by: Kirill A. Shutemov Cc: Laura Abbott Cc: Jerome Marchand Cc: Vlastimil Babka Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/khugepaged.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/khugepaged.c b/mm/khugepaged.c index b7e2268dfc9a..c15da1ea7e63 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -530,7 +530,12 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, goto out; } - VM_BUG_ON_PAGE(PageCompound(page), page); + /* TODO: teach khugepaged to collapse THP mapped with pte */ + if (PageCompound(page)) { + result = SCAN_PAGE_COMPOUND; + goto out; + } + VM_BUG_ON_PAGE(!PageAnon(page), page); /* -- cgit v1.2.3 From fa41b900c30b45fab03783724932dc30cd46a6be Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Thu, 22 Mar 2018 16:17:31 -0700 Subject: mm/thp: do not wait for lock_page() in deferred_split_scan() deferred_split_scan() gets called from the reclaim path. Waiting for the page lock there may lead to deadlock. Replace lock_page() with trylock_page() and skip the page if we failed to lock it. We will get to the page on the next scan.
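As a condensed sketch, the loop in deferred_split_scan() after this change reads roughly as follows (the exact change is in the diff below); the point is that a shrinker runs in the reclaim path and must back off rather than sleep on a page lock:

	list_for_each_safe(pos, next, &list) {
		page = list_entry((void *)pos, struct page, mapping);
		/* The lock holder may itself be blocked on reclaim;
		 * do not sleep here, just revisit on the next scan. */
		if (!trylock_page(page))
			goto next;
		/* split_huge_page() removes page from list on success */
		if (!split_huge_page(page))
			split++;
		unlock_page(page);
next:
		put_page(page);
	}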
Link: http://lkml.kernel.org/r/20180315150747.31945-1-kirill.shutemov@linux.intel.com Fixes: 9a982250f773 ("thp: introduce deferred_split_huge_page()") Signed-off-by: Kirill A. Shutemov Acked-by: Michal Hocko Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 87ab9b8f56b5..529cf36b7edb 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2783,11 +2783,13 @@ static unsigned long deferred_split_scan(struct shrinker *shrink, list_for_each_safe(pos, next, &list) { page = list_entry((void *)pos, struct page, mapping); - lock_page(page); + if (!trylock_page(page)) + goto next; /* split_huge_page() removes page from list on success */ if (!split_huge_page(page)) split++; unlock_page(page); +next: put_page(page); } -- cgit v1.2.3 From b3cd54b257ad95d344d121dc563d943ca39b0921 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Thu, 22 Mar 2018 16:17:35 -0700 Subject: mm/shmem: do not wait for lock_page() in shmem_unused_huge_shrink() shmem_unused_huge_shrink() gets called from the reclaim path. Waiting for the page lock there may lead to deadlock. There was a bug report that may be attributed to this: http://lkml.kernel.org/r/alpine.LRH.2.11.1801242349220.30642@mail.ewheeler.net Replace lock_page() with trylock_page() and skip the page if we failed to lock it. We will get to the page on the next scan. We can test PageTransHuge() outside the page lock as we only need protection against splitting the page under us. Holding a pin on the page is enough for this. Link: http://lkml.kernel.org/r/20180316210830.43738-1-kirill.shutemov@linux.intel.com Fixes: 779750d20b93 ("shmem: split huge pages beyond i_size under memory pressure") Signed-off-by: Kirill A. Shutemov Reported-by: Eric Wheeler Acked-by: Michal Hocko Reviewed-by: Andrew Morton Cc: Tetsuo Handa Cc: Hugh Dickins Cc: [4.8+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/shmem.c | 31 ++++++++++++++++++++----------- 1 file changed, 20 insertions(+), 11 deletions(-) (limited to 'mm') diff --git a/mm/shmem.c b/mm/shmem.c index 1907688b75ee..b85919243399 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -493,36 +493,45 @@ next: info = list_entry(pos, struct shmem_inode_info, shrinklist); inode = &info->vfs_inode; - if (nr_to_split && split >= nr_to_split) { - iput(inode); - continue; - } + if (nr_to_split && split >= nr_to_split) + goto leave; - page = find_lock_page(inode->i_mapping, + page = find_get_page(inode->i_mapping, (inode->i_size & HPAGE_PMD_MASK) >> PAGE_SHIFT); if (!page) goto drop; + /* No huge page at the end of the file: nothing to split */ if (!PageTransHuge(page)) { - unlock_page(page); put_page(page); goto drop; } + /* + * Leave the inode on the list if we failed to lock + * the page at this time. + * + * Waiting for the lock may lead to deadlock in the + * reclaim path.
+ */ + if (!trylock_page(page)) { + put_page(page); + goto leave; + } + ret = split_huge_page(page); unlock_page(page); put_page(page); - if (ret) { - /* split failed: leave it on the list */ - iput(inode); - continue; - } + /* If split failed leave the inode on the list */ + if (ret) + goto leave; split++; drop: list_del_init(&info->shrinklist); removed++; +leave: iput(inode); } -- cgit v1.2.3 From f59f1caf72ba00d519c793c3deb32cd3be32edc2 Mon Sep 17 00:00:00 2001 From: Daniel Vacek Date: Thu, 22 Mar 2018 16:17:38 -0700 Subject: Revert "mm: page_alloc: skip over regions of invalid pfns where possible" This reverts commit b92df1de5d28 ("mm: page_alloc: skip over regions of invalid pfns where possible"). The commit was meant as a boot-time init speedup, skipping the loop in memmap_init_zone() for invalid pfns. But given certain memory mappings on x86_64 (or, in theory, anywhere but on arm with CONFIG_HAVE_ARCH_PFN_VALID), the implementation also skips valid pfns, which is plain wrong and causes 'kernel BUG at mm/page_alloc.c:1389!' crash> log | grep -e BUG -e RIP -e Call.Trace -e move_freepages_block -e rmqueue -e freelist -A1 kernel BUG at mm/page_alloc.c:1389! invalid opcode: 0000 [#1] SMP -- RIP: 0010: move_freepages+0x15e/0x160 -- Call Trace: move_freepages_block+0x73/0x80 __rmqueue+0x263/0x460 get_page_from_freelist+0x7e1/0x9e0 __alloc_pages_nodemask+0x176/0x420 -- crash> page_init_bug -v | grep RAM 1000 - 9bfff System RAM (620.00 KiB) 100000 - 430bffff System RAM ( 1.05 GiB = 1071.75 MiB = 1097472.00 KiB) 4b0c8000 - 4bf9cfff System RAM ( 14.83 MiB = 15188.00 KiB) 4bfac000 - 646b1fff System RAM (391.02 MiB = 400408.00 KiB) 7b788000 - 7b7fffff System RAM (480.00 KiB) 100000000 - 67fffffff System RAM ( 22.00 GiB) crash> page_init_bug | head -6 7b788000 - 7b7fffff System RAM (480.00 KiB) 1fffff00000000 0 1 DMA32 4096 1048575 505736 505344 505855 0 0 0 DMA 1 4095 1fffff00000400 0 1 DMA32 4096 1048575 BUG, zones differ!
crash> kmem -p 77fff000 78000000 7b5ff000 7b600000 7b787000 7b788000 PAGE PHYSICAL MAPPING INDEX CNT FLAGS ffffea0001e00000 78000000 0 0 0 0 ffffea0001ed7fc0 7b5ff000 0 0 0 0 ffffea0001ed8000 7b600000 0 0 0 0 <<<< ffffea0001ede1c0 7b787000 0 0 0 0 ffffea0001ede200 7b788000 0 0 1 1fffff00000000 Link: http://lkml.kernel.org/r/20180316143855.29838-1-neelx@redhat.com Fixes: b92df1de5d28 ("mm: page_alloc: skip over regions of invalid pfns where possible") Signed-off-by: Daniel Vacek Acked-by: Ard Biesheuvel Acked-by: Michal Hocko Reviewed-by: Andrew Morton Cc: Vlastimil Babka Cc: Mel Gorman Cc: Pavel Tatashin Cc: Paul Burton Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memblock.h | 1 - mm/memblock.c | 28 ---------------------------- mm/page_alloc.c | 11 +---------- 3 files changed, 1 insertion(+), 39 deletions(-) (limited to 'mm') diff --git a/include/linux/memblock.h b/include/linux/memblock.h index 8be5077efb5f..f92ea7783652 100644 --- a/include/linux/memblock.h +++ b/include/linux/memblock.h @@ -187,7 +187,6 @@ int memblock_search_pfn_nid(unsigned long pfn, unsigned long *start_pfn, unsigned long *end_pfn); void __next_mem_pfn_range(int *idx, int nid, unsigned long *out_start_pfn, unsigned long *out_end_pfn, int *out_nid); -unsigned long memblock_next_valid_pfn(unsigned long pfn, unsigned long max_pfn); /** * for_each_mem_pfn_range - early memory pfn range iterator diff --git a/mm/memblock.c b/mm/memblock.c index b6ba6b7adadc..48376bd33274 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -1101,34 +1101,6 @@ void __init_memblock __next_mem_pfn_range(int *idx, int nid, *out_nid = r->nid; } -unsigned long __init_memblock memblock_next_valid_pfn(unsigned long pfn, - unsigned long max_pfn) -{ - struct memblock_type *type = &memblock.memory; - unsigned int right = type->cnt; - unsigned int mid, left = 0; - phys_addr_t addr = PFN_PHYS(++pfn); - - do { - mid = (right + left) / 2; - - if (addr < type->regions[mid].base) - right = mid; - else if (addr >= (type->regions[mid].base + - type->regions[mid].size)) - left = mid + 1; - else { - /* addr is within the region, so pfn is valid */ - return pfn; - } - } while (left < right); - - if (right == type->cnt) - return -1UL; - else - return PHYS_PFN(type->regions[right].base); -} - /** * memblock_set_node - set node ID on memblock regions * @base: base of area to set node ID for diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 010dee0f089e..1741dd23e7c1 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5356,17 +5356,8 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, if (context != MEMMAP_EARLY) goto not_early; - if (!early_pfn_valid(pfn)) { -#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP - /* - * Skip to the pfn preceding the next valid one (or - * end_pfn), such that we hit a valid pfn (or end_pfn) - * on our next iteration of the loop. - */ - pfn = memblock_next_valid_pfn(pfn, end_pfn) - 1; -#endif + if (!early_pfn_valid(pfn)) continue; - } if (!early_pfn_in_nid(pfn, nid)) continue; if (!update_defer_init(pgdat, pfn, end_pfn, &nr_initialised)) -- cgit v1.2.3 From 1c610d5f93c709df56787f50b3576704ac271826 Mon Sep 17 00:00:00 2001 From: Andrey Ryabinin Date: Thu, 22 Mar 2018 16:17:42 -0700 Subject: mm/vmscan: wake up flushers for legacy cgroups too Commit 726d061fbd36 ("mm: vmscan: kick flushers when we encounter dirty pages on the LRU") added flusher invocation to shrink_inactive_list() when many dirty pages on the LRU are encountered. 
However, shrink_inactive_list() doesn't wake up flushers for legacy cgroup reclaim, so the next commit bbef938429f5 ("mm: vmscan: remove old flusher wakeup from direct reclaim path") removed the only source of flusher wakeup in the legacy mem cgroup reclaim path. This leads to premature OOM if there are too many dirty pages in the cgroup: # mkdir /sys/fs/cgroup/memory/test # echo $$ > /sys/fs/cgroup/memory/test/tasks # echo 50M > /sys/fs/cgroup/memory/test/memory.limit_in_bytes # dd if=/dev/zero of=tmp_file bs=1M count=100 Killed dd invoked oom-killer: gfp_mask=0x14000c0(GFP_KERNEL), nodemask=(null), order=0, oom_score_adj=0 Call Trace: dump_stack+0x46/0x65 dump_header+0x6b/0x2ac oom_kill_process+0x21c/0x4a0 out_of_memory+0x2a5/0x4b0 mem_cgroup_out_of_memory+0x3b/0x60 mem_cgroup_oom_synchronize+0x2ed/0x330 pagefault_out_of_memory+0x24/0x54 __do_page_fault+0x521/0x540 page_fault+0x45/0x50 Task in /test killed as a result of limit of /test memory: usage 51200kB, limit 51200kB, failcnt 73 memory+swap: usage 51200kB, limit 9007199254740988kB, failcnt 0 kmem: usage 296kB, limit 9007199254740988kB, failcnt 0 Memory cgroup stats for /test: cache:49632KB rss:1056KB rss_huge:0KB shmem:0KB mapped_file:0KB dirty:49500KB writeback:0KB swap:0KB inactive_anon:0KB active_anon:1168KB inactive_file:24760KB active_file:24960KB unevictable:0KB Memory cgroup out of memory: Kill process 3861 (bash) score 88 or sacrifice child Killed process 3876 (dd) total-vm:8484kB, anon-rss:1052kB, file-rss:1720kB, shmem-rss:0kB oom_reaper: reaped process 3876 (dd), now anon-rss:0kB, file-rss:0kB, shmem-rss:0kB Wake up flushers in legacy cgroup reclaim too. Link: http://lkml.kernel.org/r/20180315164553.17856-1-aryabinin@virtuozzo.com Fixes: bbef938429f5 ("mm: vmscan: remove old flusher wakeup from direct reclaim path") Signed-off-by: Andrey Ryabinin Tested-by: Shakeel Butt Acked-by: Michal Hocko Cc: Mel Gorman Cc: Tejun Heo Cc: Johannes Weiner Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 31 ++++++++++++++++--------------- 1 file changed, 16 insertions(+), 15 deletions(-) (limited to 'mm') diff --git a/mm/vmscan.c b/mm/vmscan.c index bee53495a829..cd5dc3faaa57 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1779,6 +1779,20 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, if (stat.nr_writeback && stat.nr_writeback == nr_taken) set_bit(PGDAT_WRITEBACK, &pgdat->flags); + /* + * If dirty pages are scanned that are not queued for IO, it + * implies that flushers are not doing their job. This can + * happen when memory pressure pushes dirty pages to the end of + * the LRU before the dirty limits are breached and the dirty + * data has expired. It can also happen when the proportion of + * dirty pages grows not through writes but through memory + * pressure reclaiming all the clean cache. And in some cases, + * the flushers simply cannot keep up with the allocation + * rate. Nudge the flusher threads in case they are asleep. + */ + if (stat.nr_unqueued_dirty == nr_taken) + wakeup_flusher_threads(WB_REASON_VMSCAN); + /* * Legacy memcg will stall in page writeback so avoid forcibly * stalling here. @@ -1791,22 +1805,9 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, if (stat.nr_dirty && stat.nr_dirty == stat.nr_congested) set_bit(PGDAT_CONGESTED, &pgdat->flags); - /* - * If dirty pages are scanned that are not queued for IO, it - * implies that flushers are not doing their job.
This can - * happen when memory pressure pushes dirty pages to the end of - * the LRU before the dirty limits are breached and the dirty - * data has expired. It can also happen when the proportion of - * dirty pages grows not through writes but through memory - * pressure reclaiming all the clean cache. And in some cases, - * the flushers simply cannot keep up with the allocation - * rate. Nudge the flusher threads in case they are asleep, but - * also allow kswapd to start writing pages during reclaim. - */ - if (stat.nr_unqueued_dirty == nr_taken) { - wakeup_flusher_threads(WB_REASON_VMSCAN); + /* Allow kswapd to start writing pages during reclaim. */ + if (stat.nr_unqueued_dirty == nr_taken) set_bit(PGDAT_DIRTY, &pgdat->flags); - } /* * If kswapd scans pages marked marked for immediate -- cgit v1.2.3 From 9d3c3354bb85bab4d865fe95039443f09a4c8394 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Thu, 22 Mar 2018 16:17:45 -0700 Subject: mm, thp: do not cause memcg oom for thp Commit 2516035499b9 ("mm, thp: remove __GFP_NORETRY from khugepaged and madvised allocations") changed the page allocator to no longer detect thp allocations based on __GFP_NORETRY. It did not, however, modify the mem cgroup try_charge() path to avoid oom kill for either khugepaged collapsing or thp faulting. It is never expected to oom kill a process to allocate a hugepage for thp; reclaim is governed by the thp defrag mode and MADV_HUGEPAGE, but allocations (and charging) should fallback instead of oom killing processes. Link: http://lkml.kernel.org/r/alpine.DEB.2.20.1803191409420.124411@chino.kir.corp.google.com Fixes: 2516035499b9 ("mm, thp: remove __GFP_NORETRY from khugepaged and madvised allocations") Signed-off-by: David Rientjes Cc: "Kirill A. Shutemov" Cc: Michal Hocko Cc: Vlastimil Babka Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 5 +++-- mm/khugepaged.c | 8 ++++++-- 2 files changed, 9 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 529cf36b7edb..5a68730eebd6 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -555,7 +555,8 @@ static int __do_huge_pmd_anonymous_page(struct vm_fault *vmf, struct page *page, VM_BUG_ON_PAGE(!PageCompound(page), page); - if (mem_cgroup_try_charge(page, vma->vm_mm, gfp, &memcg, true)) { + if (mem_cgroup_try_charge(page, vma->vm_mm, gfp | __GFP_NORETRY, &memcg, + true)) { put_page(page); count_vm_event(THP_FAULT_FALLBACK); return VM_FAULT_FALLBACK; @@ -1316,7 +1317,7 @@ alloc: } if (unlikely(mem_cgroup_try_charge(new_page, vma->vm_mm, - huge_gfp, &memcg, true))) { + huge_gfp | __GFP_NORETRY, &memcg, true))) { put_page(new_page); split_huge_pmd(vma, vmf->pmd, vmf->address); if (page) diff --git a/mm/khugepaged.c b/mm/khugepaged.c index c15da1ea7e63..e42568284e06 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -965,7 +965,9 @@ static void collapse_huge_page(struct mm_struct *mm, goto out_nolock; } - if (unlikely(mem_cgroup_try_charge(new_page, mm, gfp, &memcg, true))) { + /* Do not oom kill for khugepaged charges */ + if (unlikely(mem_cgroup_try_charge(new_page, mm, gfp | __GFP_NORETRY, + &memcg, true))) { result = SCAN_CGROUP_CHARGE_FAIL; goto out_nolock; } @@ -1324,7 +1326,9 @@ static void collapse_shmem(struct mm_struct *mm, goto out; } - if (unlikely(mem_cgroup_try_charge(new_page, mm, gfp, &memcg, true))) { + /* Do not oom kill for khugepaged charges */ + if (unlikely(mem_cgroup_try_charge(new_page, mm, gfp | __GFP_NORETRY, + &memcg, true))) { result = 
SCAN_CGROUP_CHARGE_FAIL; goto out; } -- cgit v1.2.3 From a687a5337063af99ebd0eebaa6f4b4cf2e07c21b Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 7 Mar 2018 23:30:54 +0100 Subject: treewide: simplify Kconfig dependencies for removed archs A lot of Kconfig symbols have architecture specific dependencies. In those cases that depend on architectures we have already removed, they can be omitted. Acked-by: Kalle Valo Acked-by: Alexandre Belloni Signed-off-by: Arnd Bergmann --- block/bounce.c | 2 +- drivers/ide/Kconfig | 2 +- drivers/ide/ide-generic.c | 12 +----------- drivers/input/joystick/analog.c | 2 +- drivers/isdn/hisax/Kconfig | 10 +++++----- drivers/net/ethernet/davicom/Kconfig | 2 +- drivers/net/ethernet/smsc/Kconfig | 6 +++--- drivers/net/wireless/cisco/Kconfig | 2 +- drivers/pwm/Kconfig | 2 +- drivers/rtc/Kconfig | 2 +- drivers/spi/Kconfig | 4 ++-- drivers/usb/musb/Kconfig | 2 +- drivers/video/console/Kconfig | 3 +-- drivers/watchdog/Kconfig | 6 ------ drivers/watchdog/Makefile | 6 ------ fs/Kconfig.binfmt | 5 ++--- fs/minix/Kconfig | 2 +- include/linux/ide.h | 7 +------ init/Kconfig | 5 ++--- lib/Kconfig.debug | 13 +++++-------- lib/test_user_copy.c | 2 -- mm/Kconfig | 7 ------- mm/percpu.c | 4 ---- 23 files changed, 31 insertions(+), 77 deletions(-) (limited to 'mm') diff --git a/block/bounce.c b/block/bounce.c index 6a3e68292273..dd0b93f2a871 100644 --- a/block/bounce.c +++ b/block/bounce.c @@ -31,7 +31,7 @@ static struct bio_set *bounce_bio_set, *bounce_bio_split; static mempool_t *page_pool, *isa_page_pool; -#if defined(CONFIG_HIGHMEM) || defined(CONFIG_NEED_BOUNCE_POOL) +#if defined(CONFIG_HIGHMEM) static __init int init_emergency_pool(void) { #if defined(CONFIG_HIGHMEM) && !defined(CONFIG_MEMORY_HOTPLUG) diff --git a/drivers/ide/Kconfig b/drivers/ide/Kconfig index cf1fb3fb5d26..901b8833847f 100644 --- a/drivers/ide/Kconfig +++ b/drivers/ide/Kconfig @@ -200,7 +200,7 @@ comment "IDE chipset support/bugfixes" config IDE_GENERIC tristate "generic/default IDE chipset support" - depends on ALPHA || X86 || IA64 || M32R || MIPS || ARCH_RPC + depends on ALPHA || X86 || IA64 || MIPS || ARCH_RPC default ARM && ARCH_RPC help This is the generic IDE driver. 
This driver attaches to the diff --git a/drivers/ide/ide-generic.c b/drivers/ide/ide-generic.c index 54d7c4685d23..80c0d69b83ac 100644 --- a/drivers/ide/ide-generic.c +++ b/drivers/ide/ide-generic.c @@ -13,13 +13,10 @@ #include #include -/* FIXME: convert arm and m32r to use ide_platform host driver */ +/* FIXME: convert arm to use ide_platform host driver */ #ifdef CONFIG_ARM #include #endif -#ifdef CONFIG_M32R -#include -#endif #define DRV_NAME "ide_generic" @@ -35,13 +32,6 @@ static const struct ide_port_info ide_generic_port_info = { #ifdef CONFIG_ARM static const u16 legacy_bases[] = { 0x1f0 }; static const int legacy_irqs[] = { IRQ_HARDDISK }; -#elif defined(CONFIG_PLAT_M32700UT) || defined(CONFIG_PLAT_MAPPI2) || \ - defined(CONFIG_PLAT_OPSPUT) -static const u16 legacy_bases[] = { 0x1f0 }; -static const int legacy_irqs[] = { PLD_IRQ_CFIREQ }; -#elif defined(CONFIG_PLAT_MAPPI3) -static const u16 legacy_bases[] = { 0x1f0, 0x170 }; -static const int legacy_irqs[] = { PLD_IRQ_CFIREQ, PLD_IRQ_IDEIREQ }; #elif defined(CONFIG_ALPHA) static const u16 legacy_bases[] = { 0x1f0, 0x170, 0x1e8, 0x168 }; static const int legacy_irqs[] = { 14, 15, 11, 10 }; diff --git a/drivers/input/joystick/analog.c b/drivers/input/joystick/analog.c index be1b4921f22a..eefac7978f93 100644 --- a/drivers/input/joystick/analog.c +++ b/drivers/input/joystick/analog.c @@ -163,7 +163,7 @@ static unsigned int get_time_pit(void) #define GET_TIME(x) do { x = (unsigned int)rdtsc(); } while (0) #define DELTA(x,y) ((y)-(x)) #define TIME_NAME "TSC" -#elif defined(__alpha__) || defined(CONFIG_ARM) || defined(CONFIG_ARM64) || defined(CONFIG_RISCV) || defined(CONFIG_TILE) +#elif defined(__alpha__) || defined(CONFIG_ARM) || defined(CONFIG_ARM64) || defined(CONFIG_RISCV) #define GET_TIME(x) do { x = get_cycles(); } while (0) #define DELTA(x,y) ((y)-(x)) #define TIME_NAME "get_cycles" diff --git a/drivers/isdn/hisax/Kconfig b/drivers/isdn/hisax/Kconfig index eb83d94ab4fe..38cfc8baae19 100644 --- a/drivers/isdn/hisax/Kconfig +++ b/drivers/isdn/hisax/Kconfig @@ -109,7 +109,7 @@ config HISAX_16_3 config HISAX_TELESPCI bool "Teles PCI" - depends on PCI && (BROKEN || !(SPARC || PPC || PARISC || M68K || (MIPS && !CPU_LITTLE_ENDIAN) || FRV || (XTENSA && !CPU_LITTLE_ENDIAN))) + depends on PCI && (BROKEN || !(SPARC || PPC || PARISC || M68K || (MIPS && !CPU_LITTLE_ENDIAN) || (XTENSA && !CPU_LITTLE_ENDIAN))) help This enables HiSax support for the Teles PCI. See on how to configure it. 
@@ -237,7 +237,7 @@ config HISAX_MIC config HISAX_NETJET bool "NETjet card" - depends on PCI && (BROKEN || !(PPC || PARISC || M68K || (MIPS && !CPU_LITTLE_ENDIAN) || FRV || (XTENSA && !CPU_LITTLE_ENDIAN) || MICROBLAZE)) + depends on PCI && (BROKEN || !(PPC || PARISC || M68K || (MIPS && !CPU_LITTLE_ENDIAN) || (XTENSA && !CPU_LITTLE_ENDIAN) || MICROBLAZE)) depends on VIRT_TO_BUS help This enables HiSax support for the NetJet from Traverse @@ -249,7 +249,7 @@ config HISAX_NETJET config HISAX_NETJET_U bool "NETspider U card" - depends on PCI && (BROKEN || !(PPC || PARISC || M68K || (MIPS && !CPU_LITTLE_ENDIAN) || FRV || (XTENSA && !CPU_LITTLE_ENDIAN) || MICROBLAZE)) + depends on PCI && (BROKEN || !(PPC || PARISC || M68K || (MIPS && !CPU_LITTLE_ENDIAN) || (XTENSA && !CPU_LITTLE_ENDIAN) || MICROBLAZE)) depends on VIRT_TO_BUS help This enables HiSax support for the Netspider U interface ISDN card @@ -318,7 +318,7 @@ config HISAX_GAZEL config HISAX_HFC_PCI bool "HFC PCI-Bus cards" - depends on PCI && (BROKEN || !(SPARC || PPC || PARISC || M68K || (MIPS && !CPU_LITTLE_ENDIAN) || FRV || (XTENSA && !CPU_LITTLE_ENDIAN))) + depends on PCI && (BROKEN || !(SPARC || PPC || PARISC || M68K || (MIPS && !CPU_LITTLE_ENDIAN) || (XTENSA && !CPU_LITTLE_ENDIAN))) help This enables HiSax support for the HFC-S PCI 2BDS0 based cards. @@ -343,7 +343,7 @@ config HISAX_HFC_SX config HISAX_ENTERNOW_PCI bool "Formula-n enter:now PCI card" - depends on HISAX_NETJET && PCI && (BROKEN || !(SPARC || PPC || PARISC || M68K || (MIPS && !CPU_LITTLE_ENDIAN) || FRV || (XTENSA && !CPU_LITTLE_ENDIAN))) + depends on HISAX_NETJET && PCI && (BROKEN || !(SPARC || PPC || PARISC || M68K || (MIPS && !CPU_LITTLE_ENDIAN) || (XTENSA && !CPU_LITTLE_ENDIAN))) help This enables HiSax support for the Formula-n enter:now PCI ISDN card. diff --git a/drivers/net/ethernet/davicom/Kconfig b/drivers/net/ethernet/davicom/Kconfig index 7ec2d74f94d3..680a6d983f37 100644 --- a/drivers/net/ethernet/davicom/Kconfig +++ b/drivers/net/ethernet/davicom/Kconfig @@ -4,7 +4,7 @@ config DM9000 tristate "DM9000 support" - depends on ARM || BLACKFIN || MIPS || COLDFIRE || NIOS2 + depends on ARM || MIPS || COLDFIRE || NIOS2 select CRC32 select MII ---help--- diff --git a/drivers/net/ethernet/smsc/Kconfig b/drivers/net/ethernet/smsc/Kconfig index 948603e9b905..3da0c573d2ab 100644 --- a/drivers/net/ethernet/smsc/Kconfig +++ b/drivers/net/ethernet/smsc/Kconfig @@ -5,8 +5,8 @@ config NET_VENDOR_SMSC bool "SMC (SMSC)/Western Digital devices" default y - depends on ARM || ARM64 || ATARI_ETHERNAT || BLACKFIN || COLDFIRE || \ - ISA || M32R || MAC || MIPS || NIOS2 || PCI || \ + depends on ARM || ARM64 || ATARI_ETHERNAT || COLDFIRE || \ + ISA || MAC || MIPS || NIOS2 || PCI || \ PCMCIA || SUPERH || XTENSA || H8300 ---help--- If you have a network (Ethernet) card belonging to this class, say Y. 
@@ -37,7 +37,7 @@ config SMC91X select CRC32 select MII depends on !OF || GPIOLIB - depends on ARM || ARM64 || ATARI_ETHERNAT || BLACKFIN || COLDFIRE || \ + depends on ARM || ARM64 || ATARI_ETHERNAT || COLDFIRE || \ M32R || MIPS || NIOS2 || SUPERH || XTENSA || H8300 ---help--- This is a driver for SMC's 91x series of Ethernet chipsets, diff --git a/drivers/net/wireless/cisco/Kconfig b/drivers/net/wireless/cisco/Kconfig index b22567dff893..8ed0b154bb33 100644 --- a/drivers/net/wireless/cisco/Kconfig +++ b/drivers/net/wireless/cisco/Kconfig @@ -33,7 +33,7 @@ config AIRO config AIRO_CS tristate "Cisco/Aironet 34X/35X/4500/4800 PCMCIA cards" - depends on CFG80211 && PCMCIA && (BROKEN || !M32R) + depends on CFG80211 && PCMCIA select WIRELESS_EXT select WEXT_SPY select WEXT_PRIV diff --git a/drivers/pwm/Kconfig b/drivers/pwm/Kconfig index 763ee50ea57d..f16aad3bf5d6 100644 --- a/drivers/pwm/Kconfig +++ b/drivers/pwm/Kconfig @@ -43,7 +43,7 @@ config PWM_AB8500 config PWM_ATMEL tristate "Atmel PWM support" - depends on ARCH_AT91 || AVR32 + depends on ARCH_AT91 help Generic PWM framework driver for Atmel SoC. diff --git a/drivers/rtc/Kconfig b/drivers/rtc/Kconfig index be5a3dc99c11..46af10ac45fc 100644 --- a/drivers/rtc/Kconfig +++ b/drivers/rtc/Kconfig @@ -868,7 +868,7 @@ comment "Platform RTC drivers" config RTC_DRV_CMOS tristate "PC-style 'CMOS'" - depends on X86 || ARM || M32R || PPC || MIPS || SPARC64 + depends on X86 || ARM || PPC || MIPS || SPARC64 default y if X86 select RTC_MC146818_LIB help diff --git a/drivers/spi/Kconfig b/drivers/spi/Kconfig index 603783976b81..103c13fcefa0 100644 --- a/drivers/spi/Kconfig +++ b/drivers/spi/Kconfig @@ -72,10 +72,10 @@ config SPI_ARMADA_3700 config SPI_ATMEL tristate "Atmel SPI Controller" depends on HAS_DMA - depends on (ARCH_AT91 || AVR32 || COMPILE_TEST) + depends on ARCH_AT91 || COMPILE_TEST help This selects a driver for the Atmel SPI Controller, present on - many AT32 (AVR32) and AT91 (ARM) chips. + many AT91 ARM chips. config SPI_AU1550 tristate "Au1550/Au1200/Au1300 SPI Controller" diff --git a/drivers/usb/musb/Kconfig b/drivers/usb/musb/Kconfig index 5506a9c03c1f..e757afc1cfd0 100644 --- a/drivers/usb/musb/Kconfig +++ b/drivers/usb/musb/Kconfig @@ -87,7 +87,7 @@ config USB_MUSB_DA8XX config USB_MUSB_TUSB6010 tristate "TUSB6010" depends on HAS_IOMEM - depends on (ARCH_OMAP2PLUS || COMPILE_TEST) && !BLACKFIN + depends on ARCH_OMAP2PLUS || COMPILE_TEST depends on NOP_USB_XCEIV = USB_MUSB_HDRC # both built-in or both modules config USB_MUSB_OMAP2PLUS diff --git a/drivers/video/console/Kconfig b/drivers/video/console/Kconfig index 005ed87c8216..a9e398c144f8 100644 --- a/drivers/video/console/Kconfig +++ b/drivers/video/console/Kconfig @@ -6,8 +6,7 @@ menu "Console display driver support" config VGA_CONSOLE bool "VGA text console" if EXPERT || !X86 - depends on !4xx && !PPC_8xx && !SPARC && !M68K && !PARISC && !FRV && \ - !SUPERH && !BLACKFIN && !AVR32 && !CRIS && \ + depends on !4xx && !PPC_8xx && !SPARC && !M68K && !PARISC && !SUPERH && \ (!ARM || ARCH_FOOTBRIDGE || ARCH_INTEGRATOR || ARCH_NETWINDER) && \ !ARM64 && !ARC && !MICROBLAZE && !OPENRISC default y diff --git a/drivers/watchdog/Kconfig b/drivers/watchdog/Kconfig index 0e19679348d1..79020ce95de2 100644 --- a/drivers/watchdog/Kconfig +++ b/drivers/watchdog/Kconfig @@ -828,10 +828,6 @@ config BFIN_WDT To compile this driver as a module, choose M here: the module will be called bfin_wdt. 
-# CRIS Architecture - -# FRV Architecture - # X86 (i386 + ia64 + x86_64) Architecture config ACQUIRE_WDT @@ -1431,8 +1427,6 @@ config NIC7018_WDT To compile this driver as a module, choose M here: the module will be called nic7018_wdt. -# M32R Architecture - # M68K Architecture config M54xx_WATCHDOG diff --git a/drivers/watchdog/Makefile b/drivers/watchdog/Makefile index 0474d38aa854..1f9a0235f22c 100644 --- a/drivers/watchdog/Makefile +++ b/drivers/watchdog/Makefile @@ -94,10 +94,6 @@ obj-$(CONFIG_SPRD_WATCHDOG) += sprd_wdt.o # BLACKFIN Architecture obj-$(CONFIG_BFIN_WDT) += bfin_wdt.o -# CRIS Architecture - -# FRV Architecture - # X86 (i386 + ia64 + x86_64) Architecture obj-$(CONFIG_ACQUIRE_WDT) += acquirewdt.o obj-$(CONFIG_ADVANTECH_WDT) += advantechwdt.o @@ -146,8 +142,6 @@ obj-$(CONFIG_INTEL_MEI_WDT) += mei_wdt.o obj-$(CONFIG_NI903X_WDT) += ni903x_wdt.o obj-$(CONFIG_NIC7018_WDT) += nic7018_wdt.o -# M32R Architecture - # M68K Architecture obj-$(CONFIG_M54xx_WATCHDOG) += m54xx_wdt.o diff --git a/fs/Kconfig.binfmt b/fs/Kconfig.binfmt index 58c2bbd385ad..57a27c42b5ac 100644 --- a/fs/Kconfig.binfmt +++ b/fs/Kconfig.binfmt @@ -1,6 +1,6 @@ config BINFMT_ELF bool "Kernel support for ELF binaries" - depends on MMU && (BROKEN || !FRV) + depends on MMU select ELFCORE default y ---help--- @@ -35,7 +35,7 @@ config ARCH_BINFMT_ELF_STATE config BINFMT_ELF_FDPIC bool "Kernel support for FDPIC ELF binaries" default y if !BINFMT_ELF - depends on (ARM || FRV || BLACKFIN || (SUPERH32 && !MMU) || C6X) + depends on (ARM || (SUPERH32 && !MMU) || C6X) select ELFCORE help ELF FDPIC binaries are based on ELF, but allow the individual load @@ -90,7 +90,6 @@ config BINFMT_SCRIPT config BINFMT_FLAT bool "Kernel support for flat binaries" depends on !MMU || ARM || M68K - depends on !FRV || BROKEN help Support uClinux FLAT format binaries. 
diff --git a/fs/minix/Kconfig b/fs/minix/Kconfig index f2a0cfcef11d..bcd53a79156f 100644 --- a/fs/minix/Kconfig +++ b/fs/minix/Kconfig @@ -18,7 +18,7 @@ config MINIX_FS config MINIX_FS_NATIVE_ENDIAN def_bool MINIX_FS - depends on M32R || MICROBLAZE || MIPS || S390 || SUPERH || SPARC || XTENSA || (M68K && !MMU) + depends on MICROBLAZE || MIPS || S390 || SUPERH || SPARC || XTENSA || (M68K && !MMU) config MINIX_FS_BIG_ENDIAN_16BIT_INDEXED def_bool MINIX_FS diff --git a/include/linux/ide.h b/include/linux/ide.h index 20d42c0d9fb6..1d6f16110eae 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -25,15 +25,10 @@ #include #include -#if defined(CONFIG_CRIS) || defined(CONFIG_FRV) -# define SUPPORT_VLB_SYNC 0 -#else -# define SUPPORT_VLB_SYNC 1 -#endif - /* * Probably not wise to fiddle with these */ +#define SUPPORT_VLB_SYNC 1 #define IDE_DEFAULT_MAX_FAILURES 1 #define ERROR_MAX 8 /* Max read/write errors per sector */ #define ERROR_RESET 3 /* Reset controller every 4th retry */ diff --git a/init/Kconfig b/init/Kconfig index a14bcc9724a2..2852692d7c9c 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -998,7 +998,6 @@ config RELAY config BLK_DEV_INITRD bool "Initial RAM filesystem and RAM disk (initramfs/initrd) support" - depends on BROKEN || !FRV help The initial RAM filesystem is a ramfs which is loaded by the boot loader (loadlin or lilo) and that is mounted as root @@ -1108,7 +1107,7 @@ config MULTIUSER config SGETMASK_SYSCALL bool "sgetmask/ssetmask syscalls support" if EXPERT - def_bool PARISC || BLACKFIN || M68K || PPC || MIPS || X86 || SPARC || CRIS || MICROBLAZE || SUPERH + def_bool PARISC || M68K || PPC || MIPS || X86 || SPARC || MICROBLAZE || SUPERH ---help--- sys_sgetmask and sys_ssetmask are obsolete system calls no longer supported in libc but still enabled by default in some @@ -1370,7 +1369,7 @@ config KALLSYMS_ABSOLUTE_PERCPU config KALLSYMS_BASE_RELATIVE bool depends on KALLSYMS - default !IA64 && !(TILE && 64BIT) + default !IA64 help Instead of emitting them as absolute values in the native word size, emit the symbol references in the kallsyms table as 32-bit entries, diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 41ac9d294245..6927c6d8d185 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -165,7 +165,7 @@ config DEBUG_INFO_REDUCED config DEBUG_INFO_SPLIT bool "Produce split debuginfo in .dwo files" - depends on DEBUG_INFO && !FRV + depends on DEBUG_INFO help Generate debug info into separate .dwo files. 
This significantly reduces the build directory size for builds with DEBUG_INFO, @@ -354,10 +354,7 @@ config ARCH_WANT_FRAME_POINTERS config FRAME_POINTER bool "Compile the kernel with frame pointers" - depends on DEBUG_KERNEL && \ - (CRIS || M68K || FRV || UML || \ - SUPERH || BLACKFIN) || \ - ARCH_WANT_FRAME_POINTERS + depends on DEBUG_KERNEL && (M68K || UML || SUPERH) || ARCH_WANT_FRAME_POINTERS default y if (DEBUG_INFO && UML) || ARCH_WANT_FRAME_POINTERS help If you say Y here the resulting kernel image will be slightly @@ -1138,7 +1135,7 @@ config LOCKDEP bool depends on DEBUG_KERNEL && TRACE_IRQFLAGS_SUPPORT && STACKTRACE_SUPPORT && LOCKDEP_SUPPORT select STACKTRACE - select FRAME_POINTER if !MIPS && !PPC && !ARM_UNWIND && !S390 && !MICROBLAZE && !ARC && !SCORE && !X86 + select FRAME_POINTER if !MIPS && !PPC && !ARM_UNWIND && !S390 && !MICROBLAZE && !ARC && !X86 select KALLSYMS select KALLSYMS_ALL @@ -1571,7 +1568,7 @@ config FAULT_INJECTION_STACKTRACE_FILTER depends on FAULT_INJECTION_DEBUG_FS && STACKTRACE_SUPPORT depends on !X86_64 select STACKTRACE - select FRAME_POINTER if !MIPS && !PPC && !S390 && !MICROBLAZE && !ARM_UNWIND && !ARC && !SCORE && !X86 + select FRAME_POINTER if !MIPS && !PPC && !S390 && !MICROBLAZE && !ARM_UNWIND && !ARC && !X86 help Provide stacktrace filter for fault-injection capabilities @@ -1969,7 +1966,7 @@ config STRICT_DEVMEM bool "Filter access to /dev/mem" depends on MMU && DEVMEM depends on ARCH_HAS_DEVMEM_IS_ALLOWED - default y if TILE || PPC || X86 || ARM64 + default y if PPC || X86 || ARM64 ---help--- If this option is disabled, you allow userspace (root) access to all of memory, including kernel and userspace memory. Accidental diff --git a/lib/test_user_copy.c b/lib/test_user_copy.c index a6556f3364d1..e161f0498f42 100644 --- a/lib/test_user_copy.c +++ b/lib/test_user_copy.c @@ -31,8 +31,6 @@ * their capability at compile-time, we just have to opt-out certain archs. */ #if BITS_PER_LONG == 64 || (!(defined(CONFIG_ARM) && !defined(MMU)) && \ - !defined(CONFIG_BLACKFIN) && \ - !defined(CONFIG_M32R) && \ !defined(CONFIG_M68K) && \ !defined(CONFIG_MICROBLAZE) && \ !defined(CONFIG_NIOS2) && \ diff --git a/mm/Kconfig b/mm/Kconfig index abefa573bcd8..d5004d82a1d6 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -278,13 +278,6 @@ config BOUNCE by default when ZONE_DMA or HIGHMEM is selected, but you may say n to override this. -# On the 'tile' arch, USB OHCI needs the bounce pool since tilegx will often -# have more than 4GB of memory, but we don't currently use the IOTLB to present -# a 32-bit address to OHCI. So we need to use a bounce pool instead. -config NEED_BOUNCE_POOL - bool - default y if TILE && USB_OHCI_HCD - config NR_QUICK int depends on QUICKLIST diff --git a/mm/percpu.c b/mm/percpu.c index 50e7fdf84055..79e3549cab0f 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -2719,11 +2719,7 @@ void __init setup_per_cpu_areas(void) if (pcpu_setup_first_chunk(ai, fc) < 0) panic("Failed to initialize percpu areas."); -#ifdef CONFIG_CRIS -#warning "the CRIS architecture has physical and virtual addresses confused" -#else pcpu_free_alloc_info(ai); -#endif } #endif /* CONFIG_SMP */ -- cgit v1.2.3 From fc5d1073cae299de4517755a910df4f12a6a438f Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Mon, 26 Mar 2018 23:27:21 -0700 Subject: x86/mm/32: Remove unused node_memmap_size_bytes() & CONFIG_NEED_NODE_MEMMAP_SIZE logic node_memmap_size_bytes() has been unused since the v3.9 kernel, so remove it. 
Signed-off-by: David Rientjes Cc: Dave Hansen Cc: Laura Abbott Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-mm@kvack.org Fixes: f03574f2d5b2 ("x86-32, mm: Rip out x86_32 NUMA remapping code") Link: http://lkml.kernel.org/r/alpine.DEB.2.20.1803262325540.256524@chino.kir.corp.google.com Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 4 ---- arch/x86/mm/numa_32.c | 11 ----------- include/linux/mmzone.h | 5 ----- mm/sparse.c | 22 ---------------------- 4 files changed, 42 deletions(-) (limited to 'mm') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 18233e459bff..739aff253d17 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1608,10 +1608,6 @@ config ARCH_HAVE_MEMORY_PRESENT def_bool y depends on X86_32 && DISCONTIGMEM -config NEED_NODE_MEMMAP_SIZE - def_bool y - depends on X86_32 && (DISCONTIGMEM || SPARSEMEM) - config ARCH_FLATMEM_ENABLE def_bool y depends on X86_32 && !NUMA diff --git a/arch/x86/mm/numa_32.c b/arch/x86/mm/numa_32.c index aca6295350f3..e8a4a09e20f1 100644 --- a/arch/x86/mm/numa_32.c +++ b/arch/x86/mm/numa_32.c @@ -60,17 +60,6 @@ void memory_present(int nid, unsigned long start, unsigned long end) } printk(KERN_CONT "\n"); } - -unsigned long node_memmap_size_bytes(int nid, unsigned long start_pfn, - unsigned long end_pfn) -{ - unsigned long nr_pages = end_pfn - start_pfn; - - if (!nr_pages) - return 0; - - return (nr_pages + 1) * sizeof(struct page); -} #endif extern unsigned long highend_pfn, highstart_pfn; diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 7522a6987595..a2db4576e499 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -816,10 +816,6 @@ int local_memory_node(int node_id); static inline int local_memory_node(int node_id) { return node_id; }; #endif -#ifdef CONFIG_NEED_NODE_MEMMAP_SIZE -unsigned long __init node_memmap_size_bytes(int, unsigned long, unsigned long); -#endif - /* * zone_idx() returns 0 for the ZONE_DMA zone, 1 for the ZONE_NORMAL zone, etc. */ @@ -1289,7 +1285,6 @@ struct mminit_pfnnid_cache { #endif void memory_present(int nid, unsigned long start, unsigned long end); -unsigned long __init node_memmap_size_bytes(int, unsigned long, unsigned long); /* * If it is possible to have holes within a MAX_ORDER_NR_PAGES, then we diff --git a/mm/sparse.c b/mm/sparse.c index 7af5e7a92528..79b26f98d793 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -235,28 +235,6 @@ void __init memory_present(int nid, unsigned long start, unsigned long end) } } -/* - * Only used by the i386 NUMA architecures, but relatively - * generic code. - */ -unsigned long __init node_memmap_size_bytes(int nid, unsigned long start_pfn, - unsigned long end_pfn) -{ - unsigned long pfn; - unsigned long nr_pages = 0; - - mminit_validate_memmodel_limits(&start_pfn, &end_pfn); - for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) { - if (nid != early_pfn_to_nid(pfn)) - continue; - - if (pfn_present(pfn)) - nr_pages += PAGES_PER_SECTION; - } - - return nr_pages * sizeof(struct page); -} - /* * Subtle, we encode the real pfn into the mem_map such that * the identity pfn - section_mem_map will return the actual -- cgit v1.2.3 From 880cd276dff17ea29e9a8404275c9502b265afa7 Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Wed, 28 Mar 2018 16:00:57 -0700 Subject: mm, slab: memcg_link the SLAB's kmem_cache All the root caches are linked into slab_root_caches, which was introduced by commit 510ded33e075 ("slab: implement slab_root_caches list"), but that commit missed adding SLAB's kmem_cache.
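As a minimal sketch of the missing wiring in SLAB's kmem_cache_init() (the actual one-line fix is in the diff below), every root cache must be both on slab_caches and memcg-linked into slab_root_caches:

	/* The statically bootstrapped kmem_cache was only getting
	 * the first of these two registrations. */
	list_add(&kmem_cache->list, &slab_caches);
	memcg_link_cache(kmem_cache);	/* the call this patch adds */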
While experimenting with opt-in/opt-out kmem accounting, I noticed system crashes due to NULL dereference inside cache_from_memcg_idx() while dereferencing kmem_cache.memcg_params.memcg_caches. A clean upstream kernel will not see these crashes, but SLAB should be consistent with SLUB, which does link its boot caches (kmem_cache_node and kmem_cache) into slab_root_caches. Link: http://lkml.kernel.org/r/20180319210020.60289-1-shakeelb@google.com Fixes: 510ded33e075c ("slab: implement slab_root_caches list") Signed-off-by: Shakeel Butt Cc: Tejun Heo Cc: Vladimir Davydov Cc: Greg Thelen Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: Johannes Weiner Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.c | 1 + 1 file changed, 1 insertion(+) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index 324446621b3e..9095c3945425 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -1283,6 +1283,7 @@ void __init kmem_cache_init(void) nr_node_ids * sizeof(struct kmem_cache_node *), SLAB_HWCACHE_ALIGN, 0, 0); list_add(&kmem_cache->list, &slab_caches); + memcg_link_cache(kmem_cache); slab_state = PARTIAL; /* -- cgit v1.2.3 From 299815a4fba9f3c7a81434dba0072148f1690608 Mon Sep 17 00:00:00 2001 From: Maninder Singh Date: Wed, 28 Mar 2018 16:01:05 -0700 Subject: mm/page_owner: fix recursion bug after changing skip entries This patch fixes commit 5f48f0bd4e36 ("mm, page_owner: skip unnecessary stack_trace entries"). If we skip the first two entries, the logic that checks for a count of 2 to detect recursion is broken, and the code goes one level deep into recursion. So we need to check for only one occurrence of _RET_IP (__set_page_owner) when checking for recursion. Current backtrace while checking for recursion: (save_stack) from (__set_page_owner) // (But recursion returns true here) (__set_page_owner) from (get_page_from_freelist) (get_page_from_freelist) from (__alloc_pages_nodemask) (__alloc_pages_nodemask) from (depot_save_stack) (depot_save_stack) from (save_stack) // recursion should return true here (save_stack) from (__set_page_owner) (__set_page_owner) from (get_page_from_freelist) (get_page_from_freelist) from (__alloc_pages_nodemask+) (__alloc_pages_nodemask) from (depot_save_stack) (depot_save_stack) from (save_stack) (save_stack) from (__set_page_owner) (__set_page_owner) from (get_page_from_freelist) Correct backtrace with the fix: (save_stack) from (__set_page_owner) // recursion returned true here (__set_page_owner) from (get_page_from_freelist) (get_page_from_freelist) from (__alloc_pages_nodemask+) (__alloc_pages_nodemask) from (depot_save_stack) (depot_save_stack) from (save_stack) (save_stack) from (__set_page_owner) (__set_page_owner) from (get_page_from_freelist) Link: http://lkml.kernel.org/r/1521607043-34670-1-git-send-email-maninder1.s@samsung.com Fixes: 5f48f0bd4e36 ("mm, page_owner: skip unnecessary stack_trace entries") Signed-off-by: Maninder Singh Signed-off-by: Vaneet Narang Acked-by: Vlastimil Babka Cc: Michal Hocko Cc: Oscar Salvador Cc: Greg Kroah-Hartman Cc: Ayush Mittal Cc: Prakash Gupta Cc: Vinayak Menon Cc: Vasyl Gomonovych Cc: Amit Sahrawat Cc: Cc: Vaneet Narang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_owner.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/page_owner.c b/mm/page_owner.c index 9886c6073828..7172e0a80e13 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c @@ -123,13 +123,13 @@ void __reset_page_owner(struct page *page, unsigned int order) static
inline bool check_recursive_alloc(struct stack_trace *trace, unsigned long ip) { - int i, count; + int i; if (!trace->nr_entries) return false; - for (i = 0, count = 0; i < trace->nr_entries; i++) { - if (trace->entries[i] == ip && ++count == 2) + for (i = 0; i < trace->nr_entries; i++) { + if (trace->entries[i] == ip) return true; } -- cgit v1.2.3 From c7f26ccfb2c31eb1bf810ba13d044fcf583232db Mon Sep 17 00:00:00 2001 From: "Steven J. Hill" Date: Wed, 28 Mar 2018 16:01:09 -0700 Subject: mm/vmstat.c: fix vmstat_update() preemption BUG Attempting to hotplug CPUs with CONFIG_VM_EVENT_COUNTERS enabled can cause vmstat_update() to report a BUG due to preemption not being disabled around smp_processor_id(). Discovered on Ubiquiti EdgeRouter Pro with Cavium Octeon II processor. BUG: using smp_processor_id() in preemptible [00000000] code: kworker/1:1/269 caller is vmstat_update+0x50/0xa0 CPU: 0 PID: 269 Comm: kworker/1:1 Not tainted 4.16.0-rc4-Cavium-Octeon-00009-gf83bbd5-dirty #1 Workqueue: mm_percpu_wq vmstat_update Call Trace: show_stack+0x94/0x128 dump_stack+0xa4/0xe0 check_preemption_disabled+0x118/0x120 vmstat_update+0x50/0xa0 process_one_work+0x144/0x348 worker_thread+0x150/0x4b8 kthread+0x110/0x140 ret_from_kernel_thread+0x14/0x1c Link: http://lkml.kernel.org/r/1520881552-25659-1-git-send-email-steven.hill@cavium.com Signed-off-by: Steven J. Hill Reviewed-by: Andrew Morton Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmstat.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'mm') diff --git a/mm/vmstat.c b/mm/vmstat.c index 40b2db6db6b1..33581be705f0 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1839,9 +1839,11 @@ static void vmstat_update(struct work_struct *w) * to occur in the future. Keep on running the * update worker thread. */ + preempt_disable(); queue_delayed_work_on(smp_processor_id(), mm_percpu_wq, this_cpu_ptr(&vmstat_work), round_jiffies_relative(sysctl_stat_interval)); + preempt_enable(); } } -- cgit v1.2.3 From b213b54fbf9d282dc545252313d727f3972be8e0 Mon Sep 17 00:00:00 2001 From: Honglei Wang Date: Wed, 28 Mar 2018 16:01:12 -0700 Subject: mm/memcontrol.c: fix parameter description mismatch There are a couple of places where parameter description and function name do not match the actual code. Fix it. Link: http://lkml.kernel.org/r/1520843448-17347-1-git-send-email-honglei.wang@oracle.com Signed-off-by: Honglei Wang Acked-by: Tejun Heo Cc: Johannes Weiner Cc: Michal Hocko Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 670e99b68aa6..9ec024b862ac 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -714,9 +714,9 @@ static struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm) * invocations for reference counting, or use mem_cgroup_iter_break() * to cancel a hierarchy walk before the round-trip is complete. * - * Reclaimers can specify a zone and a priority level in @reclaim to + * Reclaimers can specify a node and a priority level in @reclaim to * divide up the memcgs in the hierarchy among all concurrent - * reclaimers operating on the same zone and priority. + * reclaimers operating on the same node and priority. 
*/ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root, struct mem_cgroup *prev, @@ -2299,7 +2299,7 @@ void memcg_kmem_put_cache(struct kmem_cache *cachep) } /** - * memcg_kmem_charge: charge a kmem page + * memcg_kmem_charge_memcg: charge a kmem page * @page: page to charge * @gfp: reclaim mode * @order: allocation order -- cgit v1.2.3 From 914b6dfff790544d9b77dfd1723adb3745ec9700 Mon Sep 17 00:00:00 2001 From: Vinayak Menon Date: Wed, 28 Mar 2018 16:01:16 -0700 Subject: mm/kmemleak.c: wait for scan completion before disabling free A crash is observed when kmemleak_scan accesses object->pointer, likely due to the following race: TASK A: kmemleak_write (with "scan" and NOT "scan=on") -> kmemleak_scan(); TASK B: create_object -> kmem_cache_alloc fails -> kmemleak_disable -> kmemleak_do_cleanup -> kmemleak_free_enabled = 0; TASK C: kfree -> kmemleak_free bails out (kmemleak_free_enabled is 0) -> slub frees object->pointer; TASK A (scan still running): update_checksum -> crash - object->pointer freed (DEBUG_PAGEALLOC). kmemleak_do_cleanup waits for the scan thread to complete, but not for a direct call to kmemleak_scan() via kmemleak_write(). So add a wait for kmemleak_scan completion before disabling kmemleak_free, and, while at it, fix the comment on stop_scan_thread(). [vinmenon@codeaurora.org: fix stop_scan_thread comment] Link: http://lkml.kernel.org/r/1522219972-22809-1-git-send-email-vinmenon@codeaurora.org Link: http://lkml.kernel.org/r/1522063429-18992-1-git-send-email-vinmenon@codeaurora.org Signed-off-by: Vinayak Menon Reviewed-by: Catalin Marinas Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/kmemleak.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'mm') diff --git a/mm/kmemleak.c b/mm/kmemleak.c index e83987c55a08..46c2290a08f1 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -1657,8 +1657,7 @@ static void start_scan_thread(void) } /* - * Stop the automatic memory scanning thread. This function must be called - * with the scan_mutex held. + * Stop the automatic memory scanning thread. */ static void stop_scan_thread(void) { @@ -1921,12 +1920,15 @@ static void kmemleak_do_cleanup(struct work_struct *work) { stop_scan_thread(); + mutex_lock(&scan_mutex); /* - * Once the scan thread has stopped, it is safe to no longer track - * object freeing. Ordering of the scan thread stopping and the memory - * accesses below is guaranteed by the kthread_stop() function. + * Once it is made sure that kmemleak_scan has stopped, it is safe to no + * longer track object freeing. Ordering of the scan thread stopping and + * the memory accesses below is guaranteed by the kthread_stop() + * function. */ kmemleak_free_enabled = 0; + mutex_unlock(&scan_mutex); if (!kmemleak_found_leaks) __kmemleak_do_cleanup(); -- cgit v1.2.3 From b6e9b0babb7a02ae4f00f053974609000f00950e Mon Sep 17 00:00:00 2001 From: Dominik Brodowski Date: Sat, 17 Mar 2018 16:00:25 +0100 Subject: mm: add kernel_migrate_pages() helper, move compat syscall to mm/mempolicy.c Move compat_sys_migrate_pages() to mm/mempolicy.c and make it call a newly introduced helper -- kernel_migrate_pages() -- instead of the syscall. This patch is part of a series which removes in-kernel calls to syscalls. On this basis, the syscall entry path can be streamlined.
For details, see http://lkml.kernel.org/r/20180325162527.GA17492@light.dominikbrodowski.net Cc: Al Viro Cc: linux-mm@kvack.org Cc: Andrew Morton Signed-off-by: Dominik Brodowski --- kernel/compat.c | 33 --------------------------------- mm/mempolicy.c | 48 ++++++++++++++++++++++++++++++++++++++++++++---- 2 files changed, 44 insertions(+), 37 deletions(-) (limited to 'mm') diff --git a/kernel/compat.c b/kernel/compat.c index 3f5fa8902e7d..51bdf1808943 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -508,39 +508,6 @@ COMPAT_SYSCALL_DEFINE6(move_pages, pid_t, pid, compat_ulong_t, nr_pages, } return sys_move_pages(pid, nr_pages, pages, nodes, status, flags); } - -COMPAT_SYSCALL_DEFINE4(migrate_pages, compat_pid_t, pid, - compat_ulong_t, maxnode, - const compat_ulong_t __user *, old_nodes, - const compat_ulong_t __user *, new_nodes) -{ - unsigned long __user *old = NULL; - unsigned long __user *new = NULL; - nodemask_t tmp_mask; - unsigned long nr_bits; - unsigned long size; - - nr_bits = min_t(unsigned long, maxnode - 1, MAX_NUMNODES); - size = ALIGN(nr_bits, BITS_PER_LONG) / 8; - if (old_nodes) { - if (compat_get_bitmap(nodes_addr(tmp_mask), old_nodes, nr_bits)) - return -EFAULT; - old = compat_alloc_user_space(new_nodes ? size * 2 : size); - if (new_nodes) - new = old + size / sizeof(unsigned long); - if (copy_to_user(old, nodes_addr(tmp_mask), size)) - return -EFAULT; - } - if (new_nodes) { - if (compat_get_bitmap(nodes_addr(tmp_mask), new_nodes, nr_bits)) - return -EFAULT; - if (new == NULL) - new = compat_alloc_user_space(size); - if (copy_to_user(new, nodes_addr(tmp_mask), size)) - return -EFAULT; - } - return sys_migrate_pages(pid, nr_bits + 1, old, new); -} #endif /* diff --git a/mm/mempolicy.c b/mm/mempolicy.c index d879f1d8a44a..7399ede02b5f 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1377,9 +1377,9 @@ SYSCALL_DEFINE3(set_mempolicy, int, mode, const unsigned long __user *, nmask, return do_set_mempolicy(mode, flags, &nodes); } -SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode, - const unsigned long __user *, old_nodes, - const unsigned long __user *, new_nodes) +static int kernel_migrate_pages(pid_t pid, unsigned long maxnode, + const unsigned long __user *old_nodes, + const unsigned long __user *new_nodes) { struct mm_struct *mm = NULL; struct task_struct *task; @@ -1469,6 +1469,13 @@ out_put: } +SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode, + const unsigned long __user *, old_nodes, + const unsigned long __user *, new_nodes) +{ + return kernel_migrate_pages(pid, maxnode, old_nodes, new_nodes); +} + /* Retrieve NUMA policy */ SYSCALL_DEFINE5(get_mempolicy, int __user *, policy, @@ -1571,7 +1578,40 @@ COMPAT_SYSCALL_DEFINE6(mbind, compat_ulong_t, start, compat_ulong_t, len, return sys_mbind(start, len, mode, nm, nr_bits+1, flags); } -#endif +COMPAT_SYSCALL_DEFINE4(migrate_pages, compat_pid_t, pid, + compat_ulong_t, maxnode, + const compat_ulong_t __user *, old_nodes, + const compat_ulong_t __user *, new_nodes) +{ + unsigned long __user *old = NULL; + unsigned long __user *new = NULL; + nodemask_t tmp_mask; + unsigned long nr_bits; + unsigned long size; + + nr_bits = min_t(unsigned long, maxnode - 1, MAX_NUMNODES); + size = ALIGN(nr_bits, BITS_PER_LONG) / 8; + if (old_nodes) { + if (compat_get_bitmap(nodes_addr(tmp_mask), old_nodes, nr_bits)) + return -EFAULT; + old = compat_alloc_user_space(new_nodes ? 
size * 2 : size); + if (new_nodes) + new = old + size / sizeof(unsigned long); + if (copy_to_user(old, nodes_addr(tmp_mask), size)) + return -EFAULT; + } + if (new_nodes) { + if (compat_get_bitmap(nodes_addr(tmp_mask), new_nodes, nr_bits)) + return -EFAULT; + if (new == NULL) + new = compat_alloc_user_space(size); + if (copy_to_user(new, nodes_addr(tmp_mask), size)) + return -EFAULT; + } + return kernel_migrate_pages(pid, nr_bits + 1, old, new); +} + +#endif /* CONFIG_COMPAT */ struct mempolicy *__get_vma_policy(struct vm_area_struct *vma, unsigned long addr) -- cgit v1.2.3 From 7addf44388255f6fa99c83e3e2ad79cef0813698 Mon Sep 17 00:00:00 2001 From: Dominik Brodowski Date: Sat, 17 Mar 2018 16:08:03 +0100 Subject: mm: add kernel_move_pages() helper, move compat syscall to mm/migrate.c Move compat_sys_move_pages() to mm/migrate.c and make it call a newly introduced helper -- kernel_move_pages() -- instead of the syscall. This patch is part of a series which removes in-kernel calls to syscalls. On this basis, the syscall entry path can be streamlined. For details, see http://lkml.kernel.org/r/20180325162527.GA17492@light.dominikbrodowski.net Cc: Al Viro Cc: linux-mm@kvack.org Cc: Andrew Morton Signed-off-by: Dominik Brodowski --- kernel/compat.c | 22 ---------------------- mm/migrate.c | 39 +++++++++++++++++++++++++++++++++++---- 2 files changed, 35 insertions(+), 26 deletions(-) (limited to 'mm') diff --git a/kernel/compat.c b/kernel/compat.c index 51bdf1808943..6d21894806b4 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -488,28 +488,6 @@ get_compat_sigset(sigset_t *set, const compat_sigset_t __user *compat) } EXPORT_SYMBOL_GPL(get_compat_sigset); -#ifdef CONFIG_NUMA -COMPAT_SYSCALL_DEFINE6(move_pages, pid_t, pid, compat_ulong_t, nr_pages, - compat_uptr_t __user *, pages32, - const int __user *, nodes, - int __user *, status, - int, flags) -{ - const void __user * __user *pages; - int i; - - pages = compat_alloc_user_space(nr_pages * sizeof(void *)); - for (i = 0; i < nr_pages; i++) { - compat_uptr_t p; - - if (get_user(p, pages32 + i) || - put_user(compat_ptr(p), pages + i)) - return -EFAULT; - } - return sys_move_pages(pid, nr_pages, pages, nodes, status, flags); -} -#endif - /* * Allocate user-space memory for the duration of a single system call, * in order to marshall parameters inside a compat thunk. diff --git a/mm/migrate.c b/mm/migrate.c index 1e5525a25691..003886606a22 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -34,6 +34,7 @@ #include #include #include +#include #include #include #include @@ -1745,10 +1746,10 @@ static int do_pages_stat(struct mm_struct *mm, unsigned long nr_pages, * Move a list of pages in the address space of the currently executing * process. 
*/ -SYSCALL_DEFINE6(move_pages, pid_t, pid, unsigned long, nr_pages, - const void __user * __user *, pages, - const int __user *, nodes, - int __user *, status, int, flags) +static int kernel_move_pages(pid_t pid, unsigned long nr_pages, + const void __user * __user *pages, + const int __user *nodes, + int __user *status, int flags) { struct task_struct *task; struct mm_struct *mm; @@ -1807,6 +1808,36 @@ out: return err; } +SYSCALL_DEFINE6(move_pages, pid_t, pid, unsigned long, nr_pages, + const void __user * __user *, pages, + const int __user *, nodes, + int __user *, status, int, flags) +{ + return kernel_move_pages(pid, nr_pages, pages, nodes, status, flags); +} + +#ifdef CONFIG_COMPAT +COMPAT_SYSCALL_DEFINE6(move_pages, pid_t, pid, compat_ulong_t, nr_pages, + compat_uptr_t __user *, pages32, + const int __user *, nodes, + int __user *, status, + int, flags) +{ + const void __user * __user *pages; + int i; + + pages = compat_alloc_user_space(nr_pages * sizeof(void *)); + for (i = 0; i < nr_pages; i++) { + compat_uptr_t p; + + if (get_user(p, pages32 + i) || + put_user(compat_ptr(p), pages + i)) + return -EFAULT; + } + return kernel_move_pages(pid, nr_pages, pages, nodes, status, flags); +} +#endif /* CONFIG_COMPAT */ + #ifdef CONFIG_NUMA_BALANCING /* * Returns true if this is a safe migration target node for misplaced NUMA -- cgit v1.2.3 From e7dc9ad6e98eb8cc49b454d54e361f91aebc395f Mon Sep 17 00:00:00 2001 From: Dominik Brodowski Date: Sat, 17 Mar 2018 16:12:22 +0100 Subject: mm: add kernel_mbind() helper; remove in-kernel call to syscall Using the mm-internal kernel_mbind() helper allows us to get rid of the mm-internal call to the sys_mbind() syscall. This patch is part of a series which removes in-kernel calls to syscalls. On this basis, the syscall entry path can be streamlined. For details, see http://lkml.kernel.org/r/20180325162527.GA17492@light.dominikbrodowski.net Cc: Al Viro Cc: linux-mm@kvack.org Cc: Andrew Morton Signed-off-by: Dominik Brodowski --- mm/mempolicy.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 7399ede02b5f..e4d7d4c0b253 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1336,9 +1336,9 @@ static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode, return copy_to_user(mask, nodes_addr(*nodes), copy) ? 
-EFAULT : 0; } -SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len, - unsigned long, mode, const unsigned long __user *, nmask, - unsigned long, maxnode, unsigned, flags) +static long kernel_mbind(unsigned long start, unsigned long len, + unsigned long mode, const unsigned long __user *nmask, + unsigned long maxnode, unsigned int flags) { nodemask_t nodes; int err; @@ -1357,6 +1357,13 @@ SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len, return do_mbind(start, len, mode, mode_flags, &nodes, flags); } +SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len, + unsigned long, mode, const unsigned long __user *, nmask, + unsigned long, maxnode, unsigned int, flags) +{ + return kernel_mbind(start, len, mode, nmask, maxnode, flags); +} + /* Set the process memory policy */ SYSCALL_DEFINE3(set_mempolicy, int, mode, const unsigned long __user *, nmask, unsigned long, maxnode) @@ -1575,7 +1582,7 @@ COMPAT_SYSCALL_DEFINE6(mbind, compat_ulong_t, start, compat_ulong_t, len, return -EFAULT; } - return sys_mbind(start, len, mode, nm, nr_bits+1, flags); + return kernel_mbind(start, len, mode, nm, nr_bits+1, flags); } COMPAT_SYSCALL_DEFINE4(migrate_pages, compat_pid_t, pid, -- cgit v1.2.3 From af03c4acb728dd9ed850d329a1cff71d52e7f3a8 Mon Sep 17 00:00:00 2001 From: Dominik Brodowski Date: Sat, 17 Mar 2018 16:20:01 +0100 Subject: mm: add kernel_[sg]et_mempolicy() helpers; remove in-kernel calls to syscalls Using the mm-internal kernel_[sg]et_mempolicy() helper allows us to get rid of the mm-internal calls to the sys_[sg]et_mempolicy() syscalls. This patch is part of a series which removes in-kernel calls to syscalls. On this basis, the syscall entry path can be streamlined. For details, see http://lkml.kernel.org/r/20180325162527.GA17492@light.dominikbrodowski.net Cc: Al Viro Cc: linux-mm@kvack.org Cc: Andrew Morton Signed-off-by: Dominik Brodowski --- mm/mempolicy.c | 29 ++++++++++++++++++++++------- 1 file changed, 22 insertions(+), 7 deletions(-) (limited to 'mm') diff --git a/mm/mempolicy.c b/mm/mempolicy.c index e4d7d4c0b253..ca817e768d0e 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1365,8 +1365,8 @@ SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len, } /* Set the process memory policy */ -SYSCALL_DEFINE3(set_mempolicy, int, mode, const unsigned long __user *, nmask, - unsigned long, maxnode) +static long kernel_set_mempolicy(int mode, const unsigned long __user *nmask, + unsigned long maxnode) { int err; nodemask_t nodes; @@ -1384,6 +1384,12 @@ SYSCALL_DEFINE3(set_mempolicy, int, mode, const unsigned long __user *, nmask, return do_set_mempolicy(mode, flags, &nodes); } +SYSCALL_DEFINE3(set_mempolicy, int, mode, const unsigned long __user *, nmask, + unsigned long, maxnode) +{ + return kernel_set_mempolicy(mode, nmask, maxnode); +} + static int kernel_migrate_pages(pid_t pid, unsigned long maxnode, const unsigned long __user *old_nodes, const unsigned long __user *new_nodes) @@ -1485,9 +1491,11 @@ SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode, /* Retrieve NUMA policy */ -SYSCALL_DEFINE5(get_mempolicy, int __user *, policy, - unsigned long __user *, nmask, unsigned long, maxnode, - unsigned long, addr, unsigned long, flags) +static int kernel_get_mempolicy(int __user *policy, + unsigned long __user *nmask, + unsigned long maxnode, + unsigned long addr, + unsigned long flags) { int err; int uninitialized_var(pval); @@ -1510,6 +1518,13 @@ SYSCALL_DEFINE5(get_mempolicy, int __user *, policy, return err; } 
+SYSCALL_DEFINE5(get_mempolicy, int __user *, policy, + unsigned long __user *, nmask, unsigned long, maxnode, + unsigned long, addr, unsigned long, flags) +{ + return kernel_get_mempolicy(policy, nmask, maxnode, addr, flags); +} + #ifdef CONFIG_COMPAT COMPAT_SYSCALL_DEFINE5(get_mempolicy, int __user *, policy, @@ -1528,7 +1543,7 @@ COMPAT_SYSCALL_DEFINE5(get_mempolicy, int __user *, policy, if (nmask) nm = compat_alloc_user_space(alloc_size); - err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags); + err = kernel_get_mempolicy(policy, nm, nr_bits+1, addr, flags); if (!err && nmask) { unsigned long copy_size; @@ -1560,7 +1575,7 @@ COMPAT_SYSCALL_DEFINE3(set_mempolicy, int, mode, compat_ulong_t __user *, nmask, return -EFAULT; } - return sys_set_mempolicy(mode, nm, nr_bits+1); + return kernel_set_mempolicy(mode, nm, nr_bits+1); } COMPAT_SYSCALL_DEFINE6(mbind, compat_ulong_t, start, compat_ulong_t, len, -- cgit v1.2.3 From 9d5b7c956b09daab955fb2a42447d5d89ff15093 Mon Sep 17 00:00:00 2001 From: Dominik Brodowski Date: Sun, 11 Mar 2018 11:34:45 +0100 Subject: mm: add ksys_fadvise64_64() helper; remove in-kernel call to sys_fadvise64_64() Using the ksys_fadvise64_64() helper allows us to avoid the in-kernel calls to the sys_fadvise64_64() syscall. The ksys_ prefix denotes that this function is meant as a drop-in replacement for the syscall. In particular, it uses the same calling convention as sys_fadvise64_64(). Some compat stubs called sys_fadvise64(), which then just passed through the arguments to sys_fadvise64_64(). Get rid of this indirection, and call ksys_fadvise64_64() directly. This patch is part of a series which removes in-kernel calls to syscalls. On this basis, the syscall entry path can be streamlined. For details, see http://lkml.kernel.org/r/20180325162527.GA17492@light.dominikbrodowski.net Cc: Andrew Morton Cc: linux-mm@kvack.org Signed-off-by: Dominik Brodowski --- arch/arm/kernel/sys_arm.c | 2 +- arch/mips/kernel/linux32.c | 2 +- arch/parisc/kernel/sys_parisc.c | 2 +- arch/powerpc/kernel/sys_ppc32.c | 4 ++-- arch/powerpc/kernel/syscalls.c | 4 ++-- arch/s390/kernel/compat_linux.c | 5 +++-- arch/sh/kernel/sys_sh32.c | 8 ++++---- arch/sparc/kernel/sys_sparc32.c | 10 +++++----- arch/x86/ia32/sys_ia32.c | 12 ++++++------ arch/xtensa/kernel/syscall.c | 2 +- include/linux/syscalls.h | 9 +++++++++ mm/fadvise.c | 10 ++++++++-- 12 files changed, 43 insertions(+), 27 deletions(-) (limited to 'mm') diff --git a/arch/arm/kernel/sys_arm.c b/arch/arm/kernel/sys_arm.c index 3151f5623d0e..bdf7514204ab 100644 --- a/arch/arm/kernel/sys_arm.c +++ b/arch/arm/kernel/sys_arm.c @@ -35,5 +35,5 @@ asmlinkage long sys_arm_fadvise64_64(int fd, int advice, loff_t offset, loff_t len) { - return sys_fadvise64_64(fd, offset, len, advice); + return ksys_fadvise64_64(fd, offset, len, advice); } diff --git a/arch/mips/kernel/linux32.c b/arch/mips/kernel/linux32.c index 0779d474c8ad..1c5785e72db4 100644 --- a/arch/mips/kernel/linux32.c +++ b/arch/mips/kernel/linux32.c @@ -149,7 +149,7 @@ asmlinkage long sys32_fadvise64_64(int fd, int __pad, unsigned long a4, unsigned long a5, int flags) { - return sys_fadvise64_64(fd, + return ksys_fadvise64_64(fd, merge_64(a2, a3), merge_64(a4, a5), flags); } diff --git a/arch/parisc/kernel/sys_parisc.c b/arch/parisc/kernel/sys_parisc.c index 588fab336ddd..f36ab1f09595 100644 --- a/arch/parisc/kernel/sys_parisc.c +++ b/arch/parisc/kernel/sys_parisc.c @@ -352,7 +352,7 @@ asmlinkage long parisc_fadvise64_64(int fd, unsigned int high_off, unsigned int low_off, unsigned
int high_len, unsigned int low_len, int advice) { - return sys_fadvise64_64(fd, (loff_t)high_off << 32 | low_off, + return ksys_fadvise64_64(fd, (loff_t)high_off << 32 | low_off, (loff_t)high_len << 32 | low_len, advice); } diff --git a/arch/powerpc/kernel/sys_ppc32.c b/arch/powerpc/kernel/sys_ppc32.c index 68f11e1065f8..0b95fa13307f 100644 --- a/arch/powerpc/kernel/sys_ppc32.c +++ b/arch/powerpc/kernel/sys_ppc32.c @@ -113,8 +113,8 @@ asmlinkage int compat_sys_ftruncate64(unsigned int fd, u32 reg4, unsigned long h long ppc32_fadvise64(int fd, u32 unused, u32 offset_high, u32 offset_low, size_t len, int advice) { - return sys_fadvise64(fd, (u64)offset_high << 32 | offset_low, len, - advice); + return ksys_fadvise64_64(fd, (u64)offset_high << 32 | offset_low, len, + advice); } asmlinkage long compat_sys_sync_file_range2(int fd, unsigned int flags, diff --git a/arch/powerpc/kernel/syscalls.c b/arch/powerpc/kernel/syscalls.c index a877bf8269fe..ecb981eea74b 100644 --- a/arch/powerpc/kernel/syscalls.c +++ b/arch/powerpc/kernel/syscalls.c @@ -119,8 +119,8 @@ long ppc64_personality(unsigned long personality) long ppc_fadvise64_64(int fd, int advice, u32 offset_high, u32 offset_low, u32 len_high, u32 len_low) { - return sys_fadvise64(fd, (u64)offset_high << 32 | offset_low, - (u64)len_high << 32 | len_low, advice); + return ksys_fadvise64_64(fd, (u64)offset_high << 32 | offset_low, + (u64)len_high << 32 | len_low, advice); } long sys_switch_endian(void) diff --git a/arch/s390/kernel/compat_linux.c b/arch/s390/kernel/compat_linux.c index 039858f9f128..9bb897e443a6 100644 --- a/arch/s390/kernel/compat_linux.c +++ b/arch/s390/kernel/compat_linux.c @@ -483,7 +483,8 @@ COMPAT_SYSCALL_DEFINE5(s390_fadvise64, int, fd, u32, high, u32, low, compat_size advise = POSIX_FADV_DONTNEED; else if (advise == 5) advise = POSIX_FADV_NOREUSE; - return sys_fadvise64(fd, (unsigned long)high << 32 | low, len, advise); + return ksys_fadvise64_64(fd, (unsigned long)high << 32 | low, len, + advise); } struct fadvise64_64_args { @@ -503,7 +504,7 @@ COMPAT_SYSCALL_DEFINE1(s390_fadvise64_64, struct fadvise64_64_args __user *, arg a.advice = POSIX_FADV_DONTNEED; else if (a.advice == 5) a.advice = POSIX_FADV_NOREUSE; - return sys_fadvise64_64(a.fd, a.offset, a.len, a.advice); + return ksys_fadvise64_64(a.fd, a.offset, a.len, a.advice); } COMPAT_SYSCALL_DEFINE6(s390_sync_file_range, int, fd, u32, offhigh, u32, offlow, diff --git a/arch/sh/kernel/sys_sh32.c b/arch/sh/kernel/sys_sh32.c index c37ee3d0c803..9dca568509a5 100644 --- a/arch/sh/kernel/sys_sh32.c +++ b/arch/sh/kernel/sys_sh32.c @@ -52,10 +52,10 @@ asmlinkage int sys_fadvise64_64_wrapper(int fd, u32 offset0, u32 offset1, u32 len0, u32 len1, int advice) { #ifdef __LITTLE_ENDIAN__ - return sys_fadvise64_64(fd, (u64)offset1 << 32 | offset0, - (u64)len1 << 32 | len0, advice); + return ksys_fadvise64_64(fd, (u64)offset1 << 32 | offset0, + (u64)len1 << 32 | len0, advice); #else - return sys_fadvise64_64(fd, (u64)offset0 << 32 | offset1, - (u64)len0 << 32 | len1, advice); + return ksys_fadvise64_64(fd, (u64)offset0 << 32 | offset1, + (u64)len0 << 32 | len1, advice); #endif } diff --git a/arch/sparc/kernel/sys_sparc32.c b/arch/sparc/kernel/sys_sparc32.c index 4ba62d676632..4da66aed50b4 100644 --- a/arch/sparc/kernel/sys_sparc32.c +++ b/arch/sparc/kernel/sys_sparc32.c @@ -225,7 +225,7 @@ long compat_sys_fadvise64(int fd, unsigned long offlo, compat_size_t len, int advice) { - return sys_fadvise64_64(fd, (offhi << 32) | offlo, len, advice); + return ksys_fadvise64_64(fd, (offhi << 
32) | offlo, len, advice); } long compat_sys_fadvise64_64(int fd, @@ -233,10 +233,10 @@ long compat_sys_fadvise64_64(int fd, unsigned long lenhi, unsigned long lenlo, int advice) { - return sys_fadvise64_64(fd, - (offhi << 32) | offlo, - (lenhi << 32) | lenlo, - advice); + return ksys_fadvise64_64(fd, + (offhi << 32) | offlo, + (lenhi << 32) | lenlo, + advice); } long sys32_sync_file_range(unsigned int fd, unsigned long off_high, unsigned long off_low, unsigned long nb_high, unsigned long nb_low, unsigned int flags) diff --git a/arch/x86/ia32/sys_ia32.c b/arch/x86/ia32/sys_ia32.c index df2acb13623f..401bd8ec9cf0 100644 --- a/arch/x86/ia32/sys_ia32.c +++ b/arch/x86/ia32/sys_ia32.c @@ -194,10 +194,10 @@ COMPAT_SYSCALL_DEFINE6(x86_fadvise64_64, int, fd, __u32, offset_low, __u32, offset_high, __u32, len_low, __u32, len_high, int, advice) { - return sys_fadvise64_64(fd, - (((u64)offset_high)<<32) | offset_low, - (((u64)len_high)<<32) | len_low, - advice); + return ksys_fadvise64_64(fd, + (((u64)offset_high)<<32) | offset_low, + (((u64)len_high)<<32) | len_low, + advice); } COMPAT_SYSCALL_DEFINE4(x86_readahead, int, fd, unsigned int, off_lo, @@ -218,8 +218,8 @@ COMPAT_SYSCALL_DEFINE6(x86_sync_file_range, int, fd, unsigned int, off_low, COMPAT_SYSCALL_DEFINE5(x86_fadvise64, int, fd, unsigned int, offset_lo, unsigned int, offset_hi, size_t, len, int, advice) { - return sys_fadvise64_64(fd, ((u64)offset_hi << 32) | offset_lo, - len, advice); + return ksys_fadvise64_64(fd, ((u64)offset_hi << 32) | offset_lo, + len, advice); } COMPAT_SYSCALL_DEFINE6(x86_fallocate, int, fd, int, mode, diff --git a/arch/xtensa/kernel/syscall.c b/arch/xtensa/kernel/syscall.c index 74afbf02d07e..8201748da05b 100644 --- a/arch/xtensa/kernel/syscall.c +++ b/arch/xtensa/kernel/syscall.c @@ -55,7 +55,7 @@ asmlinkage long xtensa_shmat(int shmid, char __user *shmaddr, int shmflg) asmlinkage long xtensa_fadvise64_64(int fd, int advice, unsigned long long offset, unsigned long long len) { - return sys_fadvise64_64(fd, offset, len, advice); + return ksys_fadvise64_64(fd, offset, len, advice); } #ifdef CONFIG_MMU diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 613b8127834d..466d408deefd 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -970,6 +970,15 @@ ssize_t ksys_pread64(unsigned int fd, char __user *buf, size_t count, ssize_t ksys_pwrite64(unsigned int fd, const char __user *buf, size_t count, loff_t pos); int ksys_fallocate(int fd, int mode, loff_t offset, loff_t len); +#ifdef CONFIG_ADVISE_SYSCALLS +int ksys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice); +#else +static inline int ksys_fadvise64_64(int fd, loff_t offset, loff_t len, + int advice) +{ + return -EINVAL; +} +#endif /* * The following kernel syscall equivalents are just wrappers to fs-internal diff --git a/mm/fadvise.c b/mm/fadvise.c index 767887f5f3bf..afa41491d324 100644 --- a/mm/fadvise.c +++ b/mm/fadvise.c @@ -26,7 +26,8 @@ * POSIX_FADV_WILLNEED could set PG_Referenced, and POSIX_FADV_NOREUSE could * deactivate the pages and clear PG_Referenced. 
*/ -SYSCALL_DEFINE4(fadvise64_64, int, fd, loff_t, offset, loff_t, len, int, advice) + +int ksys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice) { struct fd f = fdget(fd); struct inode *inode; @@ -185,11 +186,16 @@ out: return ret; } +SYSCALL_DEFINE4(fadvise64_64, int, fd, loff_t, offset, loff_t, len, int, advice) +{ + return ksys_fadvise64_64(fd, offset, len, advice); +} + #ifdef __ARCH_WANT_SYS_FADVISE64 SYSCALL_DEFINE4(fadvise64, int, fd, loff_t, offset, size_t, len, int, advice) { - return sys_fadvise64_64(fd, offset, len, advice); + return ksys_fadvise64_64(fd, offset, len, advice); } #endif -- cgit v1.2.3 From a90f590a1bee36fc2129cfb38ceec24a555bb12d Mon Sep 17 00:00:00 2001 From: Dominik Brodowski Date: Sun, 11 Mar 2018 11:34:46 +0100 Subject: mm: add ksys_mmap_pgoff() helper; remove in-kernel calls to sys_mmap_pgoff() Using this helper allows us to avoid the in-kernel calls to the sys_mmap_pgoff() syscall. The ksys_ prefix denotes that this function is meant as a drop-in replacement for the syscall. In particular, it uses the same calling convention as sys_mmap_pgoff(). This patch is part of a series which removes in-kernel calls to syscalls. On this basis, the syscall entry path can be streamlined. For details, see http://lkml.kernel.org/r/20180325162527.GA17492@light.dominikbrodowski.net Cc: Andrew Morton Cc: linux-mm@kvack.org Signed-off-by: Dominik Brodowski --- arch/alpha/kernel/osf_sys.c | 2 +- arch/arm64/kernel/sys.c | 2 +- arch/ia64/kernel/sys_ia64.c | 4 ++-- arch/m68k/kernel/sys_m68k.c | 2 +- arch/microblaze/kernel/sys_microblaze.c | 6 +++--- arch/mips/kernel/linux32.c | 4 ++-- arch/mips/kernel/syscall.c | 6 ++++-- arch/parisc/kernel/sys_parisc.c | 6 +++--- arch/powerpc/kernel/syscalls.c | 2 +- arch/riscv/kernel/sys_riscv.c | 4 ++-- arch/s390/kernel/compat_linux.c | 6 +++--- arch/s390/kernel/sys_s390.c | 2 +- arch/sh/kernel/sys_sh.c | 4 ++-- arch/sparc/kernel/sys_sparc_32.c | 6 +++--- arch/sparc/kernel/sys_sparc_64.c | 2 +- arch/um/kernel/syscall.c | 2 +- arch/x86/ia32/sys_ia32.c | 2 +- arch/x86/kernel/sys_x86_64.c | 2 +- include/linux/syscalls.h | 3 +++ mm/mmap.c | 17 ++++++++++++----- mm/nommu.c | 17 ++++++++++++----- 21 files changed, 60 insertions(+), 41 deletions(-) (limited to 'mm') diff --git a/arch/alpha/kernel/osf_sys.c b/arch/alpha/kernel/osf_sys.c index fa1a392ca9a2..89faa6f4de47 100644 --- a/arch/alpha/kernel/osf_sys.c +++ b/arch/alpha/kernel/osf_sys.c @@ -189,7 +189,7 @@ SYSCALL_DEFINE6(osf_mmap, unsigned long, addr, unsigned long, len, goto out; if (off & ~PAGE_MASK) goto out; - ret = sys_mmap_pgoff(addr, len, prot, flags, fd, off >> PAGE_SHIFT); + ret = ksys_mmap_pgoff(addr, len, prot, flags, fd, off >> PAGE_SHIFT); out: return ret; } diff --git a/arch/arm64/kernel/sys.c b/arch/arm64/kernel/sys.c index 26fe8ea93ea2..72981bae10eb 100644 --- a/arch/arm64/kernel/sys.c +++ b/arch/arm64/kernel/sys.c @@ -34,7 +34,7 @@ asmlinkage long sys_mmap(unsigned long addr, unsigned long len, if (offset_in_page(off) != 0) return -EINVAL; - return sys_mmap_pgoff(addr, len, prot, flags, fd, off >> PAGE_SHIFT); + return ksys_mmap_pgoff(addr, len, prot, flags, fd, off >> PAGE_SHIFT); } SYSCALL_DEFINE1(arm64_personality, unsigned int, personality) diff --git a/arch/ia64/kernel/sys_ia64.c b/arch/ia64/kernel/sys_ia64.c index 085adfcc74a4..9ebe1d633abc 100644 --- a/arch/ia64/kernel/sys_ia64.c +++ b/arch/ia64/kernel/sys_ia64.c @@ -139,7 +139,7 @@ int ia64_mmap_check(unsigned long addr, unsigned long len, asmlinkage unsigned long sys_mmap2 (unsigned long addr, unsigned long 
len, int prot, int flags, int fd, long pgoff) { - addr = sys_mmap_pgoff(addr, len, prot, flags, fd, pgoff); + addr = ksys_mmap_pgoff(addr, len, prot, flags, fd, pgoff); if (!IS_ERR((void *) addr)) force_successful_syscall_return(); return addr; @@ -151,7 +151,7 @@ sys_mmap (unsigned long addr, unsigned long len, int prot, int flags, int fd, lo if (offset_in_page(off) != 0) return -EINVAL; - addr = sys_mmap_pgoff(addr, len, prot, flags, fd, off >> PAGE_SHIFT); + addr = ksys_mmap_pgoff(addr, len, prot, flags, fd, off >> PAGE_SHIFT); if (!IS_ERR((void *) addr)) force_successful_syscall_return(); return addr; diff --git a/arch/m68k/kernel/sys_m68k.c b/arch/m68k/kernel/sys_m68k.c index 27e10af5153a..6363ec83a290 100644 --- a/arch/m68k/kernel/sys_m68k.c +++ b/arch/m68k/kernel/sys_m68k.c @@ -46,7 +46,7 @@ asmlinkage long sys_mmap2(unsigned long addr, unsigned long len, * so we need to shift the argument down by 1; m68k mmap64(3) * (in libc) expects the last argument of mmap2 in 4Kb units. */ - return sys_mmap_pgoff(addr, len, prot, flags, fd, pgoff); + return ksys_mmap_pgoff(addr, len, prot, flags, fd, pgoff); } /* Convert virtual (user) address VADDR to physical address PADDR */ diff --git a/arch/microblaze/kernel/sys_microblaze.c b/arch/microblaze/kernel/sys_microblaze.c index f1e1f666ddde..ed9f34da1a2a 100644 --- a/arch/microblaze/kernel/sys_microblaze.c +++ b/arch/microblaze/kernel/sys_microblaze.c @@ -40,7 +40,7 @@ SYSCALL_DEFINE6(mmap, unsigned long, addr, unsigned long, len, if (pgoff & ~PAGE_MASK) return -EINVAL; - return sys_mmap_pgoff(addr, len, prot, flags, fd, pgoff >> PAGE_SHIFT); + return ksys_mmap_pgoff(addr, len, prot, flags, fd, pgoff >> PAGE_SHIFT); } SYSCALL_DEFINE6(mmap2, unsigned long, addr, unsigned long, len, @@ -50,6 +50,6 @@ SYSCALL_DEFINE6(mmap2, unsigned long, addr, unsigned long, len, if (pgoff & (~PAGE_MASK >> 12)) return -EINVAL; - return sys_mmap_pgoff(addr, len, prot, flags, fd, - pgoff >> (PAGE_SHIFT - 12)); + return ksys_mmap_pgoff(addr, len, prot, flags, fd, + pgoff >> (PAGE_SHIFT - 12)); } diff --git a/arch/mips/kernel/linux32.c b/arch/mips/kernel/linux32.c index 1c5785e72db4..0571ab7b68b0 100644 --- a/arch/mips/kernel/linux32.c +++ b/arch/mips/kernel/linux32.c @@ -67,8 +67,8 @@ SYSCALL_DEFINE6(32_mmap2, unsigned long, addr, unsigned long, len, { if (pgoff & (~PAGE_MASK >> 12)) return -EINVAL; - return sys_mmap_pgoff(addr, len, prot, flags, fd, - pgoff >> (PAGE_SHIFT-12)); + return ksys_mmap_pgoff(addr, len, prot, flags, fd, + pgoff >> (PAGE_SHIFT-12)); } #define RLIM_INFINITY32 0x7fffffff diff --git a/arch/mips/kernel/syscall.c b/arch/mips/kernel/syscall.c index 58c6f634b550..69c17b549fd3 100644 --- a/arch/mips/kernel/syscall.c +++ b/arch/mips/kernel/syscall.c @@ -63,7 +63,8 @@ SYSCALL_DEFINE6(mips_mmap, unsigned long, addr, unsigned long, len, { if (offset & ~PAGE_MASK) return -EINVAL; - return sys_mmap_pgoff(addr, len, prot, flags, fd, offset >> PAGE_SHIFT); + return ksys_mmap_pgoff(addr, len, prot, flags, fd, + offset >> PAGE_SHIFT); } SYSCALL_DEFINE6(mips_mmap2, unsigned long, addr, unsigned long, len, @@ -73,7 +74,8 @@ SYSCALL_DEFINE6(mips_mmap2, unsigned long, addr, unsigned long, len, if (pgoff & (~PAGE_MASK >> 12)) return -EINVAL; - return sys_mmap_pgoff(addr, len, prot, flags, fd, pgoff >> (PAGE_SHIFT-12)); + return ksys_mmap_pgoff(addr, len, prot, flags, fd, + pgoff >> (PAGE_SHIFT - 12)); } save_static_function(sys_fork); diff --git a/arch/parisc/kernel/sys_parisc.c b/arch/parisc/kernel/sys_parisc.c index f36ab1f09595..080d566654ea 100644 --- 
a/arch/parisc/kernel/sys_parisc.c +++ b/arch/parisc/kernel/sys_parisc.c @@ -270,8 +270,8 @@ asmlinkage unsigned long sys_mmap2(unsigned long addr, unsigned long len, { /* Make sure the shift for mmap2 is constant (12), no matter what PAGE_SIZE we have. */ - return sys_mmap_pgoff(addr, len, prot, flags, fd, - pgoff >> (PAGE_SHIFT - 12)); + return ksys_mmap_pgoff(addr, len, prot, flags, fd, + pgoff >> (PAGE_SHIFT - 12)); } asmlinkage unsigned long sys_mmap(unsigned long addr, unsigned long len, @@ -279,7 +279,7 @@ asmlinkage unsigned long sys_mmap(unsigned long addr, unsigned long len, unsigned long offset) { if (!(offset & ~PAGE_MASK)) { - return sys_mmap_pgoff(addr, len, prot, flags, fd, + return ksys_mmap_pgoff(addr, len, prot, flags, fd, offset >> PAGE_SHIFT); } else { return -EINVAL; diff --git a/arch/powerpc/kernel/syscalls.c b/arch/powerpc/kernel/syscalls.c index ecb981eea74b..1ef3b80b62a6 100644 --- a/arch/powerpc/kernel/syscalls.c +++ b/arch/powerpc/kernel/syscalls.c @@ -57,7 +57,7 @@ static inline long do_mmap2(unsigned long addr, size_t len, off >>= shift; } - ret = sys_mmap_pgoff(addr, len, prot, flags, fd, off); + ret = ksys_mmap_pgoff(addr, len, prot, flags, fd, off); out: return ret; } diff --git a/arch/riscv/kernel/sys_riscv.c b/arch/riscv/kernel/sys_riscv.c index 79c78668258e..f7181ed8aafc 100644 --- a/arch/riscv/kernel/sys_riscv.c +++ b/arch/riscv/kernel/sys_riscv.c @@ -24,8 +24,8 @@ static long riscv_sys_mmap(unsigned long addr, unsigned long len, { if (unlikely(offset & (~PAGE_MASK >> page_shift_offset))) return -EINVAL; - return sys_mmap_pgoff(addr, len, prot, flags, fd, - offset >> (PAGE_SHIFT - page_shift_offset)); + return ksys_mmap_pgoff(addr, len, prot, flags, fd, + offset >> (PAGE_SHIFT - page_shift_offset)); } #ifdef CONFIG_64BIT diff --git a/arch/s390/kernel/compat_linux.c b/arch/s390/kernel/compat_linux.c index 9bb897e443a6..da5ef7718254 100644 --- a/arch/s390/kernel/compat_linux.c +++ b/arch/s390/kernel/compat_linux.c @@ -442,8 +442,8 @@ COMPAT_SYSCALL_DEFINE1(s390_old_mmap, struct mmap_arg_struct_emu31 __user *, arg return -EFAULT; if (a.offset & ~PAGE_MASK) return -EINVAL; - return sys_mmap_pgoff(a.addr, a.len, a.prot, a.flags, a.fd, - a.offset >> PAGE_SHIFT); + return ksys_mmap_pgoff(a.addr, a.len, a.prot, a.flags, a.fd, + a.offset >> PAGE_SHIFT); } COMPAT_SYSCALL_DEFINE1(s390_mmap2, struct mmap_arg_struct_emu31 __user *, arg) @@ -452,7 +452,7 @@ COMPAT_SYSCALL_DEFINE1(s390_mmap2, struct mmap_arg_struct_emu31 __user *, arg) if (copy_from_user(&a, arg, sizeof(a))) return -EFAULT; - return sys_mmap_pgoff(a.addr, a.len, a.prot, a.flags, a.fd, a.offset); + return ksys_mmap_pgoff(a.addr, a.len, a.prot, a.flags, a.fd, a.offset); } COMPAT_SYSCALL_DEFINE3(s390_read, unsigned int, fd, char __user *, buf, compat_size_t, count) diff --git a/arch/s390/kernel/sys_s390.c b/arch/s390/kernel/sys_s390.c index 0090037ab148..31cefe0c28c0 100644 --- a/arch/s390/kernel/sys_s390.c +++ b/arch/s390/kernel/sys_s390.c @@ -53,7 +53,7 @@ SYSCALL_DEFINE1(mmap2, struct s390_mmap_arg_struct __user *, arg) if (copy_from_user(&a, arg, sizeof(a))) goto out; - error = sys_mmap_pgoff(a.addr, a.len, a.prot, a.flags, a.fd, a.offset); + error = ksys_mmap_pgoff(a.addr, a.len, a.prot, a.flags, a.fd, a.offset); out: return error; } diff --git a/arch/sh/kernel/sys_sh.c b/arch/sh/kernel/sys_sh.c index 724911c59e7d..f8afc014e084 100644 --- a/arch/sh/kernel/sys_sh.c +++ b/arch/sh/kernel/sys_sh.c @@ -35,7 +35,7 @@ asmlinkage int old_mmap(unsigned long addr, unsigned long len, { if (off & ~PAGE_MASK) 
return -EINVAL; - return sys_mmap_pgoff(addr, len, prot, flags, fd, off>>PAGE_SHIFT); + return ksys_mmap_pgoff(addr, len, prot, flags, fd, off>>PAGE_SHIFT); } asmlinkage long sys_mmap2(unsigned long addr, unsigned long len, @@ -51,7 +51,7 @@ asmlinkage long sys_mmap2(unsigned long addr, unsigned long len, pgoff >>= PAGE_SHIFT - 12; - return sys_mmap_pgoff(addr, len, prot, flags, fd, pgoff); + return ksys_mmap_pgoff(addr, len, prot, flags, fd, pgoff); } /* sys_cacheflush -- flush (part of) the processor cache. */ diff --git a/arch/sparc/kernel/sys_sparc_32.c b/arch/sparc/kernel/sys_sparc_32.c index 990703b7cf4d..d980da4ffd7b 100644 --- a/arch/sparc/kernel/sys_sparc_32.c +++ b/arch/sparc/kernel/sys_sparc_32.c @@ -104,8 +104,8 @@ asmlinkage long sys_mmap2(unsigned long addr, unsigned long len, { /* Make sure the shift for mmap2 is constant (12), no matter what PAGE_SIZE we have. */ - return sys_mmap_pgoff(addr, len, prot, flags, fd, - pgoff >> (PAGE_SHIFT - 12)); + return ksys_mmap_pgoff(addr, len, prot, flags, fd, + pgoff >> (PAGE_SHIFT - 12)); } asmlinkage long sys_mmap(unsigned long addr, unsigned long len, @@ -113,7 +113,7 @@ asmlinkage long sys_mmap(unsigned long addr, unsigned long len, unsigned long off) { /* no alignment check? */ - return sys_mmap_pgoff(addr, len, prot, flags, fd, off >> PAGE_SHIFT); + return ksys_mmap_pgoff(addr, len, prot, flags, fd, off >> PAGE_SHIFT); } long sparc_remap_file_pages(unsigned long start, unsigned long size, diff --git a/arch/sparc/kernel/sys_sparc_64.c b/arch/sparc/kernel/sys_sparc_64.c index 55416db482ad..ebb84dc8a5a7 100644 --- a/arch/sparc/kernel/sys_sparc_64.c +++ b/arch/sparc/kernel/sys_sparc_64.c @@ -458,7 +458,7 @@ SYSCALL_DEFINE6(mmap, unsigned long, addr, unsigned long, len, goto out; if (off & ~PAGE_MASK) goto out; - retval = sys_mmap_pgoff(addr, len, prot, flags, fd, off >> PAGE_SHIFT); + retval = ksys_mmap_pgoff(addr, len, prot, flags, fd, off >> PAGE_SHIFT); out: return retval; } diff --git a/arch/um/kernel/syscall.c b/arch/um/kernel/syscall.c index 6258676bed85..35f7047bdebc 100644 --- a/arch/um/kernel/syscall.c +++ b/arch/um/kernel/syscall.c @@ -22,7 +22,7 @@ long old_mmap(unsigned long addr, unsigned long len, if (offset & ~PAGE_MASK) goto out; - err = sys_mmap_pgoff(addr, len, prot, flags, fd, offset >> PAGE_SHIFT); + err = ksys_mmap_pgoff(addr, len, prot, flags, fd, offset >> PAGE_SHIFT); out: return err; } diff --git a/arch/x86/ia32/sys_ia32.c b/arch/x86/ia32/sys_ia32.c index 401bd8ec9cf0..bff71b9ae3f5 100644 --- a/arch/x86/ia32/sys_ia32.c +++ b/arch/x86/ia32/sys_ia32.c @@ -166,7 +166,7 @@ COMPAT_SYSCALL_DEFINE1(x86_mmap, struct mmap_arg_struct32 __user *, arg) if (a.offset & ~PAGE_MASK) return -EINVAL; - return sys_mmap_pgoff(a.addr, a.len, a.prot, a.flags, a.fd, + return ksys_mmap_pgoff(a.addr, a.len, a.prot, a.flags, a.fd, a.offset>>PAGE_SHIFT); } diff --git a/arch/x86/kernel/sys_x86_64.c b/arch/x86/kernel/sys_x86_64.c index 676774b9bb8d..a3f15ed545b5 100644 --- a/arch/x86/kernel/sys_x86_64.c +++ b/arch/x86/kernel/sys_x86_64.c @@ -97,7 +97,7 @@ SYSCALL_DEFINE6(mmap, unsigned long, addr, unsigned long, len, if (off & ~PAGE_MASK) goto out; - error = sys_mmap_pgoff(addr, len, prot, flags, fd, off >> PAGE_SHIFT); + error = ksys_mmap_pgoff(addr, len, prot, flags, fd, off >> PAGE_SHIFT); out: return error; } diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 466d408deefd..ec866c959e7d 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -979,6 +979,9 @@ static inline int ksys_fadvise64_64(int 
fd, loff_t offset, loff_t len, return -EINVAL; } #endif +unsigned long ksys_mmap_pgoff(unsigned long addr, unsigned long len, + unsigned long prot, unsigned long flags, + unsigned long fd, unsigned long pgoff); /* * The following kernel syscall equivalents are just wrappers to fs-internal diff --git a/mm/mmap.c b/mm/mmap.c index 9efdc021ad22..aa0dc8231c0d 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1488,9 +1488,9 @@ unsigned long do_mmap(struct file *file, unsigned long addr, return addr; } -SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len, - unsigned long, prot, unsigned long, flags, - unsigned long, fd, unsigned long, pgoff) +unsigned long ksys_mmap_pgoff(unsigned long addr, unsigned long len, + unsigned long prot, unsigned long flags, + unsigned long fd, unsigned long pgoff) { struct file *file = NULL; unsigned long retval; @@ -1537,6 +1537,13 @@ out_fput: return retval; } +SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len, + unsigned long, prot, unsigned long, flags, + unsigned long, fd, unsigned long, pgoff) +{ + return ksys_mmap_pgoff(addr, len, prot, flags, fd, pgoff); +} + #ifdef __ARCH_WANT_SYS_OLD_MMAP struct mmap_arg_struct { unsigned long addr; @@ -1556,8 +1563,8 @@ SYSCALL_DEFINE1(old_mmap, struct mmap_arg_struct __user *, arg) if (offset_in_page(a.offset)) return -EINVAL; - return sys_mmap_pgoff(a.addr, a.len, a.prot, a.flags, a.fd, - a.offset >> PAGE_SHIFT); + return ksys_mmap_pgoff(a.addr, a.len, a.prot, a.flags, a.fd, + a.offset >> PAGE_SHIFT); } #endif /* __ARCH_WANT_SYS_OLD_MMAP */ diff --git a/mm/nommu.c b/mm/nommu.c index ebb6e618dade..cad329629530 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -1423,9 +1423,9 @@ error_getting_region: return -ENOMEM; } -SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len, - unsigned long, prot, unsigned long, flags, - unsigned long, fd, unsigned long, pgoff) +unsigned long ksys_mmap_pgoff(unsigned long addr, unsigned long len, + unsigned long prot, unsigned long flags, + unsigned long fd, unsigned long pgoff) { struct file *file = NULL; unsigned long retval = -EBADF; @@ -1447,6 +1447,13 @@ out: return retval; } +SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len, + unsigned long, prot, unsigned long, flags, + unsigned long, fd, unsigned long, pgoff) +{ + return ksys_mmap_pgoff(addr, len, prot, flags, fd, pgoff); +} + #ifdef __ARCH_WANT_SYS_OLD_MMAP struct mmap_arg_struct { unsigned long addr; @@ -1466,8 +1473,8 @@ SYSCALL_DEFINE1(old_mmap, struct mmap_arg_struct __user *, arg) if (offset_in_page(a.offset)) return -EINVAL; - return sys_mmap_pgoff(a.addr, a.len, a.prot, a.flags, a.fd, - a.offset >> PAGE_SHIFT); + return ksys_mmap_pgoff(a.addr, a.len, a.prot, a.flags, a.fd, + a.offset >> PAGE_SHIFT); } #endif /* __ARCH_WANT_SYS_OLD_MMAP */ -- cgit v1.2.3 From c7b95d5156a9ee70f800bd2e47a9eba677be73e1 Mon Sep 17 00:00:00 2001 From: Dominik Brodowski Date: Mon, 19 Mar 2018 17:51:36 +0100 Subject: mm: add ksys_readahead() helper; remove in-kernel calls to sys_readahead() Using this helper allows us to avoid the in-kernel calls to the sys_readahead() syscall. The ksys_ prefix denotes that this function is meant as a drop-in replacement for the syscall. In particular, it uses the same calling convention as sys_readahead(). This patch is part of a series which removes in-kernel calls to syscalls. On this basis, the syscall entry path can be streamlined. 
For details, see http://lkml.kernel.org/r/20180325162527.GA17492@light.dominikbrodowski.net Cc: Andrew Morton Cc: linux-mm@kvack.org Signed-off-by: Dominik Brodowski --- arch/mips/kernel/linux32.c | 2 +- arch/parisc/kernel/sys_parisc.c | 2 +- arch/powerpc/kernel/sys_ppc32.c | 2 +- arch/s390/kernel/compat_linux.c | 2 +- arch/sparc/kernel/sys_sparc32.c | 2 +- arch/x86/ia32/sys_ia32.c | 2 +- include/linux/syscalls.h | 1 + mm/readahead.c | 7 ++++++- 8 files changed, 13 insertions(+), 7 deletions(-) (limited to 'mm') diff --git a/arch/mips/kernel/linux32.c b/arch/mips/kernel/linux32.c index 0571ab7b68b0..318f1c05c5b3 100644 --- a/arch/mips/kernel/linux32.c +++ b/arch/mips/kernel/linux32.c @@ -131,7 +131,7 @@ SYSCALL_DEFINE1(32_personality, unsigned long, personality) asmlinkage ssize_t sys32_readahead(int fd, u32 pad0, u64 a2, u64 a3, size_t count) { - return sys_readahead(fd, merge_64(a2, a3), count); + return ksys_readahead(fd, merge_64(a2, a3), count); } asmlinkage long sys32_sync_file_range(int fd, int __pad, diff --git a/arch/parisc/kernel/sys_parisc.c b/arch/parisc/kernel/sys_parisc.c index 080d566654ea..8c99ebbe2bac 100644 --- a/arch/parisc/kernel/sys_parisc.c +++ b/arch/parisc/kernel/sys_parisc.c @@ -345,7 +345,7 @@ asmlinkage ssize_t parisc_pwrite64(unsigned int fd, const char __user *buf, asmlinkage ssize_t parisc_readahead(int fd, unsigned int high, unsigned int low, size_t count) { - return sys_readahead(fd, (loff_t)high << 32 | low, count); + return ksys_readahead(fd, (loff_t)high << 32 | low, count); } asmlinkage long parisc_fadvise64_64(int fd, diff --git a/arch/powerpc/kernel/sys_ppc32.c b/arch/powerpc/kernel/sys_ppc32.c index 0b95fa13307f..c11c73373691 100644 --- a/arch/powerpc/kernel/sys_ppc32.c +++ b/arch/powerpc/kernel/sys_ppc32.c @@ -88,7 +88,7 @@ compat_ssize_t compat_sys_pwrite64(unsigned int fd, const char __user *ubuf, com compat_ssize_t compat_sys_readahead(int fd, u32 r4, u32 offhi, u32 offlo, u32 count) { - return sys_readahead(fd, ((loff_t)offhi << 32) | offlo, count); + return ksys_readahead(fd, ((loff_t)offhi << 32) | offlo, count); } asmlinkage int compat_sys_truncate64(const char __user * path, u32 reg4, diff --git a/arch/s390/kernel/compat_linux.c b/arch/s390/kernel/compat_linux.c index da5ef7718254..8ac38d51ed7d 100644 --- a/arch/s390/kernel/compat_linux.c +++ b/arch/s390/kernel/compat_linux.c @@ -328,7 +328,7 @@ COMPAT_SYSCALL_DEFINE5(s390_pwrite64, unsigned int, fd, const char __user *, ubu COMPAT_SYSCALL_DEFINE4(s390_readahead, int, fd, u32, high, u32, low, s32, count) { - return sys_readahead(fd, (unsigned long)high << 32 | low, count); + return ksys_readahead(fd, (unsigned long)high << 32 | low, count); } struct stat64_emu31 { diff --git a/arch/sparc/kernel/sys_sparc32.c b/arch/sparc/kernel/sys_sparc32.c index 4da66aed50b4..f166e5bbf506 100644 --- a/arch/sparc/kernel/sys_sparc32.c +++ b/arch/sparc/kernel/sys_sparc32.c @@ -217,7 +217,7 @@ asmlinkage long compat_sys_readahead(int fd, unsigned long offlo, compat_size_t count) { - return sys_readahead(fd, (offhi << 32) | offlo, count); + return ksys_readahead(fd, (offhi << 32) | offlo, count); } long compat_sys_fadvise64(int fd, diff --git a/arch/x86/ia32/sys_ia32.c b/arch/x86/ia32/sys_ia32.c index bff71b9ae3f5..bd8a7020b9a7 100644 --- a/arch/x86/ia32/sys_ia32.c +++ b/arch/x86/ia32/sys_ia32.c @@ -203,7 +203,7 @@ COMPAT_SYSCALL_DEFINE6(x86_fadvise64_64, int, fd, __u32, offset_low, COMPAT_SYSCALL_DEFINE4(x86_readahead, int, fd, unsigned int, off_lo, unsigned int, off_hi, size_t, count) { - return 
sys_readahead(fd, ((u64)off_hi << 32) | off_lo, count); + return ksys_readahead(fd, ((u64)off_hi << 32) | off_lo, count); } COMPAT_SYSCALL_DEFINE6(x86_sync_file_range, int, fd, unsigned int, off_low, diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index ec866c959e7d..815fbdd9cca1 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -982,6 +982,7 @@ static inline int ksys_fadvise64_64(int fd, loff_t offset, loff_t len, unsigned long ksys_mmap_pgoff(unsigned long addr, unsigned long len, unsigned long prot, unsigned long flags, unsigned long fd, unsigned long pgoff); +ssize_t ksys_readahead(int fd, loff_t offset, size_t count); /* * The following kernel syscall equivalents are just wrappers to fs-internal diff --git a/mm/readahead.c b/mm/readahead.c index c4ca70239233..4d57b4644f98 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -573,7 +573,7 @@ do_readahead(struct address_space *mapping, struct file *filp, return force_page_cache_readahead(mapping, filp, index, nr); } -SYSCALL_DEFINE3(readahead, int, fd, loff_t, offset, size_t, count) +ssize_t ksys_readahead(int fd, loff_t offset, size_t count) { ssize_t ret; struct fd f; @@ -592,3 +592,8 @@ SYSCALL_DEFINE3(readahead, int, fd, loff_t, offset, size_t, count) } return ret; } + +SYSCALL_DEFINE3(readahead, int, fd, loff_t, offset, size_t, count) +{ + return ksys_readahead(fd, offset, count); +} -- cgit v1.2.3
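
The helper-conversion patches above all apply the same mechanical transformation: the body of a syscall moves into an mm-internal kernel_*() or ksys_*() helper with the same calling convention, the SYSCALL_DEFINE*() entry point becomes a one-line wrapper, and in-kernel callers (compat stubs, arch wrappers) switch from the sys_*() entry point to the helper. As a minimal sketch of that shape -- "example" is a made-up syscall used purely for illustration, not one of the syscalls touched in this series:

#include <linux/syscalls.h>
#include <linux/compat.h>
#include <linux/mm.h>

/*
 * The implementation lives in an internal helper that keeps the same
 * calling convention as the syscall it replaces ...
 */
static long kernel_example(unsigned long addr, unsigned long len)
{
	if (offset_in_page(addr))	/* reject unaligned addresses */
		return -EINVAL;
	/* ... the real work would happen here ... */
	return 0;
}

/* ... the syscall entry point shrinks to a trivial wrapper ... */
SYSCALL_DEFINE2(example, unsigned long, addr, unsigned long, len)
{
	return kernel_example(addr, len);
}

/*
 * ... and in-kernel callers such as compat stubs invoke the helper
 * directly, so no calls to sys_example() remain inside the kernel.
 */
COMPAT_SYSCALL_DEFINE2(example, compat_ulong_t, addr, compat_ulong_t, len)
{
	return kernel_example(addr, len);
}

Because each helper preserves the syscall's calling convention, converting a call site is a pure rename (sys_example() becomes kernel_example()), which is what makes the per-architecture churn in the diffs above safe to apply mechanically.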