From 6be24bed9da367c29b04e6fba8c9f27db39aa665 Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Wed, 30 Jun 2021 18:47:04 -0700 Subject: mm: hugetlb: introduce a new config HUGETLB_PAGE_FREE_VMEMMAP The option HUGETLB_PAGE_FREE_VMEMMAP allows for the freeing of some vmemmap pages associated with pre-allocated HugeTLB pages. For example, on X86_64 6 vmemmap pages of size 4KB each can be saved for each 2MB HugeTLB page, and 4094 vmemmap pages of size 4KB each can be saved for each 1GB HugeTLB page. (With a 64-byte struct page, a 2MB HugeTLB page is described by 512 struct pages occupying 8 vmemmap pages, and a 1GB page by 4096 vmemmap pages; two vmemmap pages stay reserved in each case -- see RESERVE_VMEMMAP_NR below -- hence 6 and 4094.) When a HugeTLB page is allocated or freed, the vmemmap array representing the range associated with the page will need to be remapped. When a page is allocated, vmemmap pages are freed after remapping. When a page is freed, previously discarded vmemmap pages must be allocated before remapping. The config option is introduced early so that supporting code can be written to depend on the option. The initial version of the code only provides support for x86-64. If config HAVE_BOOTMEM_INFO_NODE is enabled, the vmemmap freeing code depends on it to free vmemmap pages. Otherwise, just use free_reserved_page() to free vmemmap pages. The routine register_page_bootmem_info() is used to register bootmem info. Therefore, make sure register_page_bootmem_info is enabled if HUGETLB_PAGE_FREE_VMEMMAP is defined. Link: https://lkml.kernel.org/r/20210510030027.56044-3-songmuchun@bytedance.com Signed-off-by: Muchun Song Reviewed-by: Oscar Salvador Acked-by: Mike Kravetz Reviewed-by: Miaohe Lin Tested-by: Chen Huang Tested-by: Bodeddula Balasubramaniam Reviewed-by: Balbir Singh Cc: Alexander Viro Cc: Andy Lutomirski Cc: Anshuman Khandual Cc: Barry Song Cc: Borislav Petkov Cc: Dave Hansen Cc: David Hildenbrand Cc: David Rientjes Cc: HORIGUCHI NAOYA Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Joao Martins Cc: Joerg Roedel Cc: Jonathan Corbet Cc: Matthew Wilcox Cc: Michal Hocko Cc: Mina Almasry Cc: Oliver Neukum Cc: Paul E. McKenney Cc: Pawan Gupta Cc: Peter Zijlstra Cc: Randy Dunlap Cc: Thomas Gleixner Cc: Xiongchun Duan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/Kconfig | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'fs') diff --git a/fs/Kconfig b/fs/Kconfig index 141a856c50e7..58a53455d1fe 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -240,6 +240,11 @@ config HUGETLBFS config HUGETLB_PAGE def_bool HUGETLBFS +config HUGETLB_PAGE_FREE_VMEMMAP + def_bool HUGETLB_PAGE + depends on X86_64 + depends on SPARSEMEM_VMEMMAP + config MEMFD_CREATE def_bool TMPFS || HUGETLBFS -- cgit v1.2.3 From e6be37b2e7bddfe0c76585ee7c7eee5acc8efeab Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Wed, 30 Jun 2021 18:47:50 -0700 Subject: mm/huge_memory.c: add missing read-only THP checking in transparent_hugepage_enabled() Since commit 99cb0dbd47a1 ("mm,thp: add read-only THP support for (non-shmem) FS"), read-only THP file mapping is supported, but the corresponding check was never added to transparent_hugepage_enabled(). To fix it, add a check for read-only THP file mappings and also introduce the helper transhuge_vma_enabled() to check whether THP is enabled for a specified vma, reducing duplicated code. We rename transparent_hugepage_enabled to transparent_hugepage_active to make the code easier to follow as suggested by David Hildenbrand.
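The rename is visible to userspace only through the THPeligible field that show_smap() prints (see the task_mmu.c hunk below). As an illustration only, not part of the patch, a minimal userspace sketch that reads that field for the current process:

	#include <stdio.h>
	#include <string.h>

	int main(void)
	{
		char line[256];
		FILE *f = fopen("/proc/self/smaps", "r");

		if (!f)
			return 1;
		/* "THPeligible:" is the per-vma field emitted by show_smap(). */
		while (fgets(line, sizeof(line), f))
			if (!strncmp(line, "THPeligible:", 12))
				fputs(line, stdout);
		fclose(f);
		return 0;
	}

After this patch the printed 0/1 values reflect transparent_hugepage_active(), including the read-only THP file mapping case.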
[linmiaohe@huawei.com: define transhuge_vma_enabled next to transhuge_vma_suitable] Link: https://lkml.kernel.org/r/20210514093007.4117906-1-linmiaohe@huawei.com Link: https://lkml.kernel.org/r/20210511134857.1581273-4-linmiaohe@huawei.com Fixes: 99cb0dbd47a1 ("mm,thp: add read-only THP support for (non-shmem) FS") Signed-off-by: Miaohe Lin Reviewed-by: Yang Shi Cc: Alexey Dobriyan Cc: "Aneesh Kumar K . V" Cc: Anshuman Khandual Cc: David Hildenbrand Cc: Hugh Dickins Cc: Johannes Weiner Cc: Kirill A. Shutemov Cc: Matthew Wilcox Cc: Minchan Kim Cc: Ralph Campbell Cc: Rik van Riel Cc: Song Liu Cc: William Kucharski Cc: Zi Yan Cc: Mike Kravetz Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/task_mmu.c | 2 +- include/linux/huge_mm.h | 57 ++++++++++++++++++++++++++++++------------------- mm/huge_memory.c | 11 +++++++++- mm/khugepaged.c | 4 +--- mm/shmem.c | 3 +-- 5 files changed, 48 insertions(+), 29 deletions(-) (limited to 'fs') diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 66965ad88d8b..9feda7792882 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -832,7 +832,7 @@ static int show_smap(struct seq_file *m, void *v) __show_smap(m, &mss, false); seq_printf(m, "THPeligible: %d\n", - transparent_hugepage_enabled(vma)); + transparent_hugepage_active(vma)); if (arch_pkeys_enabled()) seq_printf(m, "ProtectionKey: %8u\n", vma_pkey(vma)); diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 8a5f49abcfa2..b4e1ebaae825 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -115,9 +115,34 @@ extern struct kobj_attribute shmem_enabled_attr; extern unsigned long transparent_hugepage_flags; +static inline bool transhuge_vma_suitable(struct vm_area_struct *vma, + unsigned long haddr) +{ + /* Don't have to check pgoff for anonymous vma */ + if (!vma_is_anonymous(vma)) { + if (!IS_ALIGNED((vma->vm_start >> PAGE_SHIFT) - vma->vm_pgoff, + HPAGE_PMD_NR)) + return false; + } + + if (haddr < vma->vm_start || haddr + HPAGE_PMD_SIZE > vma->vm_end) + return false; + return true; +} + +static inline bool transhuge_vma_enabled(struct vm_area_struct *vma, + unsigned long vm_flags) +{ + /* Explicitly disabled through madvise. */ + if ((vm_flags & VM_NOHUGEPAGE) || + test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags)) + return false; + return true; +} + /* * to be used on vmas which are known to support THP. 
- * Use transparent_hugepage_enabled otherwise + * Use transparent_hugepage_active otherwise */ static inline bool __transparent_hugepage_enabled(struct vm_area_struct *vma) { @@ -128,15 +153,12 @@ static inline bool __transparent_hugepage_enabled(struct vm_area_struct *vma) if (transparent_hugepage_flags & (1 << TRANSPARENT_HUGEPAGE_NEVER_DAX)) return false; - if (vma->vm_flags & VM_NOHUGEPAGE) + if (!transhuge_vma_enabled(vma, vma->vm_flags)) return false; if (vma_is_temporary_stack(vma)) return false; - if (test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags)) - return false; - if (transparent_hugepage_flags & (1 << TRANSPARENT_HUGEPAGE_FLAG)) return true; @@ -150,22 +172,7 @@ static inline bool __transparent_hugepage_enabled(struct vm_area_struct *vma) return false; } -bool transparent_hugepage_enabled(struct vm_area_struct *vma); - -static inline bool transhuge_vma_suitable(struct vm_area_struct *vma, - unsigned long haddr) -{ - /* Don't have to check pgoff for anonymous vma */ - if (!vma_is_anonymous(vma)) { - if (!IS_ALIGNED((vma->vm_start >> PAGE_SHIFT) - vma->vm_pgoff, - HPAGE_PMD_NR)) - return false; - } - - if (haddr < vma->vm_start || haddr + HPAGE_PMD_SIZE > vma->vm_end) - return false; - return true; -} +bool transparent_hugepage_active(struct vm_area_struct *vma); #define transparent_hugepage_use_zero_page() \ (transparent_hugepage_flags & \ @@ -352,7 +359,7 @@ static inline bool __transparent_hugepage_enabled(struct vm_area_struct *vma) return false; } -static inline bool transparent_hugepage_enabled(struct vm_area_struct *vma) +static inline bool transparent_hugepage_active(struct vm_area_struct *vma) { return false; } @@ -363,6 +370,12 @@ static inline bool transhuge_vma_suitable(struct vm_area_struct *vma, return false; } +static inline bool transhuge_vma_enabled(struct vm_area_struct *vma, + unsigned long vm_flags) +{ + return false; +} + static inline void prep_transhuge_page(struct page *page) {} static inline bool is_transparent_hugepage(struct page *page) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index a76b3835251d..8529ba6af151 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -64,7 +64,14 @@ static atomic_t huge_zero_refcount; struct page *huge_zero_page __read_mostly; unsigned long huge_zero_pfn __read_mostly = ~0UL; -bool transparent_hugepage_enabled(struct vm_area_struct *vma) +static inline bool file_thp_enabled(struct vm_area_struct *vma) +{ + return transhuge_vma_enabled(vma, vma->vm_flags) && vma->vm_file && + !inode_is_open_for_write(vma->vm_file->f_inode) && + (vma->vm_flags & VM_EXEC); +} + +bool transparent_hugepage_active(struct vm_area_struct *vma) { /* The addr is used to check if the vma size fits */ unsigned long addr = (vma->vm_end & HPAGE_PMD_MASK) - HPAGE_PMD_SIZE; @@ -75,6 +82,8 @@ bool transparent_hugepage_enabled(struct vm_area_struct *vma) return __transparent_hugepage_enabled(vma); if (vma_is_shmem(vma)) return shmem_huge_enabled(vma); + if (IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS)) + return file_thp_enabled(vma); return false; } diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 6c0185fdd815..d97b20fad6e8 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -442,9 +442,7 @@ static inline int khugepaged_test_exit(struct mm_struct *mm) static bool hugepage_vma_check(struct vm_area_struct *vma, unsigned long vm_flags) { - /* Explicitly disabled through madvise. 
*/ - if ((vm_flags & VM_NOHUGEPAGE) || - test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags) + if (!transhuge_vma_enabled(vma, vm_flags)) return false; /* Enabled via shmem mount options or sysfs settings. */ diff --git a/mm/shmem.c b/mm/shmem.c index e72931b9246c..1155279a6276 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -4040,8 +4040,7 @@ bool shmem_huge_enabled(struct vm_area_struct *vma) loff_t i_size; pgoff_t off; - if ((vma->vm_flags & VM_NOHUGEPAGE) || - test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags)) + if (!transhuge_vma_enabled(vma, vma->vm_flags)) return false; if (shmem_huge == SHMEM_HUGE_FORCE) return true; -- cgit v1.2.3 From e6d41f12df0efcaa6e30b575d40f2529024cfce9 Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Wed, 30 Jun 2021 18:48:28 -0700 Subject: mm: hugetlb: introduce CONFIG_HUGETLB_PAGE_FREE_VMEMMAP_DEFAULT_ON When using HUGETLB_PAGE_FREE_VMEMMAP, the freeing of unused vmemmap pages associated with each HugeTLB page is off by default. The vmemmap is now PMD mapped, so there is no side effect when this feature is enabled with no HugeTLB pages in the system. Someone may want to enable this feature at compile time instead of via the boot command line, so add a config option that makes it default on for those who do not want to enable it on the command line. Link: https://lkml.kernel.org/r/20210616094915.34432-4-songmuchun@bytedance.com Signed-off-by: Muchun Song Cc: Chen Huang Cc: David Hildenbrand Cc: Jonathan Corbet Cc: Michal Hocko Cc: Mike Kravetz Cc: Oscar Salvador Cc: Xiongchun Duan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/admin-guide/kernel-parameters.txt | 3 +++ fs/Kconfig | 10 ++++++++++ mm/hugetlb_vmemmap.c | 6 ++++-- 3 files changed, 17 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 7a7da02b3bc6..c4e34420a485 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -1577,6 +1577,9 @@ on: enable the feature off: disable the feature + Built with CONFIG_HUGETLB_PAGE_FREE_VMEMMAP_DEFAULT_ON=y, + the default is on. + This is not compatible with memory_hotplug.memmap_on_memory. If both parameters are enabled, hugetlb_free_vmemmap takes precedence over memory_hotplug.memmap_on_memory. diff --git a/fs/Kconfig b/fs/Kconfig index 58a53455d1fe..a7749c126b8e 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -245,6 +245,16 @@ config HUGETLB_PAGE_FREE_VMEMMAP depends on X86_64 depends on SPARSEMEM_VMEMMAP +config HUGETLB_PAGE_FREE_VMEMMAP_DEFAULT_ON + bool "Default freeing vmemmap pages of HugeTLB to on" + default n + depends on HUGETLB_PAGE_FREE_VMEMMAP + help + When using HUGETLB_PAGE_FREE_VMEMMAP, the freeing unused vmemmap + pages associated with each HugeTLB page is default off. Say Y here + to enable freeing vmemmap pages of HugeTLB by default. It can then + be disabled on the command line via hugetlb_free_vmemmap=off.
+ config MEMFD_CREATE def_bool TMPFS || HUGETLBFS diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c index 06802056f296..c540c21e26f5 100644 --- a/mm/hugetlb_vmemmap.c +++ b/mm/hugetlb_vmemmap.c @@ -182,7 +182,7 @@ #define RESERVE_VMEMMAP_NR 2U #define RESERVE_VMEMMAP_SIZE (RESERVE_VMEMMAP_NR << PAGE_SHIFT) -bool hugetlb_free_vmemmap_enabled; +bool hugetlb_free_vmemmap_enabled = IS_ENABLED(CONFIG_HUGETLB_PAGE_FREE_VMEMMAP_DEFAULT_ON); static int __init early_hugetlb_free_vmemmap_param(char *buf) { @@ -197,7 +197,9 @@ static int __init early_hugetlb_free_vmemmap_param(char *buf) if (!strcmp(buf, "on")) hugetlb_free_vmemmap_enabled = true; - else if (strcmp(buf, "off")) + else if (!strcmp(buf, "off")) + hugetlb_free_vmemmap_enabled = false; + else return -EINVAL; return 0; -- cgit v1.2.3 From 00b151f21f390f1e0b294720a3660506abaf49cd Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Wed, 30 Jun 2021 18:49:06 -0700 Subject: mm/userfaultfd: fail uffd-wp registration if not supported We should fail uffd-wp registration immediately if the arch does not even have CONFIG_HAVE_ARCH_USERFAULTFD_WP defined. That will also block the relevant ioctls, e.g. UFFDIO_WRITEPROTECT, because they check against VM_UFFD_WP, which can only be applied by a successful registration. Also remove the WP feature bit for those archs when handling the UFFDIO_API ioctl. Link: https://lkml.kernel.org/r/20210428225030.9708-5-peterx@redhat.com Signed-off-by: Peter Xu Cc: Alexander Viro Cc: Andrea Arcangeli Cc: Axel Rasmussen Cc: Brian Geffon Cc: "Dr . David Alan Gilbert" Cc: Hugh Dickins Cc: Jerome Glisse Cc: Joe Perches Cc: Kirill A. Shutemov Cc: Lokesh Gidra Cc: Mike Kravetz Cc: Mike Rapoport Cc: Mina Almasry Cc: Oliver Upton Cc: Shaohua Li Cc: Shuah Khan Cc: Stephen Rothwell Cc: Wang Qing Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/userfaultfd.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 14f92285d04f..5dd78238cc15 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -1304,8 +1304,12 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, vm_flags = 0; if (uffdio_register.mode & UFFDIO_REGISTER_MODE_MISSING) vm_flags |= VM_UFFD_MISSING; - if (uffdio_register.mode & UFFDIO_REGISTER_MODE_WP) + if (uffdio_register.mode & UFFDIO_REGISTER_MODE_WP) { +#ifndef CONFIG_HAVE_ARCH_USERFAULTFD_WP + goto out; +#endif vm_flags |= VM_UFFD_WP; + } if (uffdio_register.mode & UFFDIO_REGISTER_MODE_MINOR) { #ifndef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR goto out; @@ -1942,6 +1946,9 @@ static int userfaultfd_api(struct userfaultfd_ctx *ctx, uffdio_api.features = UFFD_API_FEATURES; #ifndef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR uffdio_api.features &= ~UFFD_FEATURE_MINOR_HUGETLBFS; +#endif +#ifndef CONFIG_HAVE_ARCH_USERFAULTFD_WP + uffdio_api.features &= ~UFFD_FEATURE_PAGEFAULT_FLAG_WP; #endif uffdio_api.ioctls = UFFD_API_IOCTLS; ret = -EFAULT; -- cgit v1.2.3 From fb8e37f35a2fe1f983ac21850e856e2c7498d469 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Wed, 30 Jun 2021 18:49:09 -0700 Subject: mm/pagemap: export uffd-wp protection information Export the PTE/PMD status of uffd-wp to pagemap too. Link: https://lkml.kernel.org/r/20210428225030.9708-6-peterx@redhat.com Signed-off-by: Peter Xu Cc: Alexander Viro Cc: Andrea Arcangeli Cc: Axel Rasmussen Cc: Brian Geffon Cc: "Dr . David Alan Gilbert" Cc: Hugh Dickins Cc: Jerome Glisse Cc: Joe Perches Cc: Kirill A.
Shutemov Cc: Lokesh Gidra Cc: Mike Kravetz Cc: Mike Rapoport Cc: Mina Almasry Cc: Oliver Upton Cc: Shaohua Li Cc: Shuah Khan Cc: Stephen Rothwell Cc: Wang Qing Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/admin-guide/mm/pagemap.rst | 2 ++ fs/proc/task_mmu.c | 9 +++++++++ 2 files changed, 11 insertions(+) (limited to 'fs') diff --git a/Documentation/admin-guide/mm/pagemap.rst b/Documentation/admin-guide/mm/pagemap.rst index 340a5aee9b80..fb578fbbb76c 100644 --- a/Documentation/admin-guide/mm/pagemap.rst +++ b/Documentation/admin-guide/mm/pagemap.rst @@ -21,6 +21,8 @@ There are four components to pagemap: * Bit 55 pte is soft-dirty (see :ref:`Documentation/admin-guide/mm/soft-dirty.rst `) * Bit 56 page exclusively mapped (since 4.2) + * Bit 57 pte is uffd-wp write-protected (since 5.13) (see + :ref:`Documentation/admin-guide/mm/userfaultfd.rst `) * Bits 57-60 zero * Bit 61 page is file-page or shared-anon (since 3.5) * Bit 62 page swapped diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 9feda7792882..95c8f1e8fea6 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -1302,6 +1302,7 @@ struct pagemapread { #define PM_PFRAME_MASK GENMASK_ULL(PM_PFRAME_BITS - 1, 0) #define PM_SOFT_DIRTY BIT_ULL(55) #define PM_MMAP_EXCLUSIVE BIT_ULL(56) +#define PM_UFFD_WP BIT_ULL(57) #define PM_FILE BIT_ULL(61) #define PM_SWAP BIT_ULL(62) #define PM_PRESENT BIT_ULL(63) @@ -1375,10 +1376,14 @@ static pagemap_entry_t pte_to_pagemap_entry(struct pagemapread *pm, page = vm_normal_page(vma, addr, pte); if (pte_soft_dirty(pte)) flags |= PM_SOFT_DIRTY; + if (pte_uffd_wp(pte)) + flags |= PM_UFFD_WP; } else if (is_swap_pte(pte)) { swp_entry_t entry; if (pte_swp_soft_dirty(pte)) flags |= PM_SOFT_DIRTY; + if (pte_swp_uffd_wp(pte)) + flags |= PM_UFFD_WP; entry = pte_to_swp_entry(pte); if (pm->show_pfn) frame = swp_type(entry) | @@ -1426,6 +1431,8 @@ static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end, flags |= PM_PRESENT; if (pmd_soft_dirty(pmd)) flags |= PM_SOFT_DIRTY; + if (pmd_uffd_wp(pmd)) + flags |= PM_UFFD_WP; if (pm->show_pfn) frame = pmd_pfn(pmd) + ((addr & ~PMD_MASK) >> PAGE_SHIFT); @@ -1444,6 +1451,8 @@ static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end, flags |= PM_SWAP; if (pmd_swp_soft_dirty(pmd)) flags |= PM_SOFT_DIRTY; + if (pmd_swp_uffd_wp(pmd)) + flags |= PM_UFFD_WP; VM_BUG_ON(!is_pmd_migration_entry(pmd)); page = migration_entry_to_page(entry); } -- cgit v1.2.3 From c949b097ef2e332fa90708127c972b823fb58ec1 Mon Sep 17 00:00:00 2001 From: Axel Rasmussen Date: Wed, 30 Jun 2021 18:49:20 -0700 Subject: userfaultfd/shmem: support minor fault registration for shmem This patch allows shmem-backed VMAs to be registered for minor faults. Minor faults are appropriately relayed to userspace in the fault path, for VMAs with the relevant flag. This commit doesn't hook up the UFFDIO_CONTINUE ioctl for shmem-backed minor faults, though, so userspace doesn't yet have a way to resolve such faults. Because of this, we also don't yet advertise this as a supported feature. That will be done in a separate commit when the feature is fully implemented. Link: https://lkml.kernel.org/r/20210503180737.2487560-4-axelrasmussen@google.com Signed-off-by: Axel Rasmussen Acked-by: Peter Xu Acked-by: Hugh Dickins Cc: Alexander Viro Cc: Andrea Arcangeli Cc: Brian Geffon Cc: "Dr . David Alan Gilbert" Cc: Jerome Glisse Cc: Joe Perches Cc: Kirill A. 
Shutemov Cc: Lokesh Gidra Cc: Mike Kravetz Cc: Mike Rapoport Cc: Mina Almasry Cc: Oliver Upton Cc: Shaohua Li Cc: Shuah Khan Cc: Stephen Rothwell Cc: Wang Qing Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/userfaultfd.c | 3 +-- mm/memory.c | 8 +++++--- mm/shmem.c | 12 +++++++++++- 3 files changed, 17 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 5dd78238cc15..82ef253d66b6 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -1267,8 +1267,7 @@ static inline bool vma_can_userfault(struct vm_area_struct *vma, } if (vm_flags & VM_UFFD_MINOR) { - /* FIXME: Add minor fault interception for shmem. */ - if (!is_vm_hugetlb_page(vma)) + if (!(is_vm_hugetlb_page(vma) || vma_is_shmem(vma))) return false; } diff --git a/mm/memory.c b/mm/memory.c index 07a71a016c18..b535a1d0317d 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -4026,9 +4026,11 @@ static vm_fault_t do_read_fault(struct vm_fault *vmf) * something). */ if (vma->vm_ops->map_pages && fault_around_bytes >> PAGE_SHIFT > 1) { - ret = do_fault_around(vmf); - if (ret) - return ret; + if (likely(!userfaultfd_minor(vmf->vma))) { + ret = do_fault_around(vmf); + if (ret) + return ret; + } } ret = __do_fault(vmf); diff --git a/mm/shmem.c b/mm/shmem.c index 9b66a2b2680c..0d186b91a2f9 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1797,7 +1797,7 @@ unlock: * vm. If we swap it in we mark it dirty since we also free the swap * entry since a page cannot live in both the swap and page cache. * - * vmf and fault_type are only supplied by shmem_fault: + * vma, vmf, and fault_type are only supplied by shmem_fault: * otherwise they are NULL. */ static int shmem_getpage_gfp(struct inode *inode, pgoff_t index, @@ -1832,6 +1832,16 @@ repeat: page = pagecache_get_page(mapping, index, FGP_ENTRY | FGP_HEAD | FGP_LOCK, 0); + + if (page && vma && userfaultfd_minor(vma)) { + if (!xa_is_value(page)) { + unlock_page(page); + put_page(page); + } + *fault_type = handle_userfault(vmf, VM_UFFD_MINOR); + return 0; + } + if (xa_is_value(page)) { error = shmem_swapin_page(inode, index, &page, sgp, gfp, vma, fault_type); -- cgit v1.2.3 From 964ab0040ff9598783bf37776b5e31b27b50e293 Mon Sep 17 00:00:00 2001 From: Axel Rasmussen Date: Wed, 30 Jun 2021 18:49:27 -0700 Subject: userfaultfd/shmem: advertise shmem minor fault support Now that the feature is fully implemented (the faulting path hooks exist so userspace is notified, and the ioctl to resolve such faults is available), advertise this as a supported feature. Link: https://lkml.kernel.org/r/20210503180737.2487560-6-axelrasmussen@google.com Signed-off-by: Axel Rasmussen Acked-by: Hugh Dickins Acked-by: Peter Xu Cc: Alexander Viro Cc: Andrea Arcangeli Cc: Brian Geffon Cc: "Dr . David Alan Gilbert" Cc: Jerome Glisse Cc: Joe Perches Cc: Kirill A. 
Shutemov Cc: Lokesh Gidra Cc: Mike Kravetz Cc: Mike Rapoport Cc: Mina Almasry Cc: Oliver Upton Cc: Shaohua Li Cc: Shuah Khan Cc: Stephen Rothwell Cc: Wang Qing Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/admin-guide/mm/userfaultfd.rst | 3 ++- fs/userfaultfd.c | 3 ++- include/uapi/linux/userfaultfd.h | 7 ++++++- 3 files changed, 10 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/Documentation/admin-guide/mm/userfaultfd.rst b/Documentation/admin-guide/mm/userfaultfd.rst index 3aa38e8b8361..6528036093e1 100644 --- a/Documentation/admin-guide/mm/userfaultfd.rst +++ b/Documentation/admin-guide/mm/userfaultfd.rst @@ -77,7 +77,8 @@ events, except page fault notifications, may be generated: - ``UFFD_FEATURE_MINOR_HUGETLBFS`` indicates that the kernel supports ``UFFDIO_REGISTER_MODE_MINOR`` registration for hugetlbfs virtual memory - areas. + areas. ``UFFD_FEATURE_MINOR_SHMEM`` is the analogous feature indicating + support for shmem virtual memory areas. The userland application should set the feature flags it intends to use when invoking the ``UFFDIO_API`` ioctl, to request that those features be diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 82ef253d66b6..19ebae443ade 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -1944,7 +1944,8 @@ static int userfaultfd_api(struct userfaultfd_ctx *ctx, /* report all available features and ioctls to userland */ uffdio_api.features = UFFD_API_FEATURES; #ifndef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR - uffdio_api.features &= ~UFFD_FEATURE_MINOR_HUGETLBFS; + uffdio_api.features &= + ~(UFFD_FEATURE_MINOR_HUGETLBFS | UFFD_FEATURE_MINOR_SHMEM); #endif #ifndef CONFIG_HAVE_ARCH_USERFAULTFD_WP uffdio_api.features &= ~UFFD_FEATURE_PAGEFAULT_FLAG_WP; diff --git a/include/uapi/linux/userfaultfd.h b/include/uapi/linux/userfaultfd.h index 650480f41f1d..05b31d60acf6 100644 --- a/include/uapi/linux/userfaultfd.h +++ b/include/uapi/linux/userfaultfd.h @@ -31,7 +31,8 @@ UFFD_FEATURE_MISSING_SHMEM | \ UFFD_FEATURE_SIGBUS | \ UFFD_FEATURE_THREAD_ID | \ - UFFD_FEATURE_MINOR_HUGETLBFS) + UFFD_FEATURE_MINOR_HUGETLBFS | \ + UFFD_FEATURE_MINOR_SHMEM) #define UFFD_API_IOCTLS \ ((__u64)1 << _UFFDIO_REGISTER | \ (__u64)1 << _UFFDIO_UNREGISTER | \ @@ -185,6 +186,9 @@ struct uffdio_api { * UFFD_FEATURE_MINOR_HUGETLBFS indicates that minor faults * can be intercepted (via REGISTER_MODE_MINOR) for * hugetlbfs-backed pages. + * + * UFFD_FEATURE_MINOR_SHMEM indicates the same support as + * UFFD_FEATURE_MINOR_HUGETLBFS, but for shmem-backed pages instead. */ #define UFFD_FEATURE_PAGEFAULT_FLAG_WP (1<<0) #define UFFD_FEATURE_EVENT_FORK (1<<1) @@ -196,6 +200,7 @@ struct uffdio_api { #define UFFD_FEATURE_SIGBUS (1<<7) #define UFFD_FEATURE_THREAD_ID (1<<8) #define UFFD_FEATURE_MINOR_HUGETLBFS (1<<9) +#define UFFD_FEATURE_MINOR_SHMEM (1<<10) __u64 features; __u64 ioctls; -- cgit v1.2.3 From 3c36b419b111e28a657e6534aae07964a98a5ca9 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 30 Jun 2021 18:50:03 -0700 Subject: fs/proc/kcore: drop KCORE_REMAP and KCORE_OTHER Patch series "fs/proc/kcore: don't read offline sections, logically offline pages and hwpoisoned pages", v3. Looking for places where the kernel might unconditionally read PageOffline() pages, I stumbled over /proc/kcore; turns out /proc/kcore needs some more love to not touch some other pages we really don't want to read -- i.e., hwpoisoned ones. 
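Taken together, the userfaultfd commits above give userspace the following handshake. Purely an illustrative sketch, not code from the patches, assuming a kernel with this series applied:

	#include <fcntl.h>
	#include <linux/userfaultfd.h>
	#include <sys/ioctl.h>
	#include <sys/syscall.h>
	#include <unistd.h>

	/* Negotiate the API, confirm shmem minor faults are supported,
	 * then register a shmem mapping for minor-fault interception. */
	static int uffd_register_minor(void *addr, unsigned long len)
	{
		struct uffdio_api api = { .api = UFFD_API };
		struct uffdio_register reg = {
			.range = { .start = (unsigned long)addr, .len = len },
			.mode = UFFDIO_REGISTER_MODE_MINOR,
		};
		int fd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);

		if (fd < 0)
			return -1;
		/* Feature bits the kernel masked off in userfaultfd_api()
		 * (e.g. on archs without CONFIG_HAVE_ARCH_USERFAULTFD_MINOR)
		 * read back as unsupported here. */
		if (ioctl(fd, UFFDIO_API, &api) ||
		    !(api.features & UFFD_FEATURE_MINOR_SHMEM) ||
		    ioctl(fd, UFFDIO_REGISTER, &reg)) {
			close(fd);
			return -1;
		}
		return fd;
	}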
Examples for PageOffline() pages are pages inflated in a balloon, memory unplugged via virtio-mem, and partially-present sections in memory added by the Hyper-V balloon. When reading pages inflated in a balloon, we essentially produce unnecessary load in the hypervisor; holes in partially present sections in case of Hyper-V are not accessible and already were a problem for /proc/vmcore, fixed in makedumpfile by detecting PageOffline() pages. In the future, virtio-mem might disallow reading unplugged memory -- marked as PageOffline() -- in some environments, resulting in undefined behavior when accessed; therefore, I'm trying to identify and rework all these (corner) cases. With this series, there is really only access via /dev/mem, /proc/vmcore and kdb left after I ripped out /dev/kmem. kdb is an advanced corner-case use case -- we won't care for now if someone explicitly tries to do nasty things by reading from/writing to physical addresses we better not touch. /dev/mem is a use case we won't support for virtio-mem, at least for now, so we'll simply disallow mapping any virtio-mem memory via /dev/mem next. /proc/vmcore is really only a problem when dumping the old kernel via something that's not makedumpfile (read: basically never), however, we'll try sanitizing that as well in the second kernel in the future. Tested via kcore_dump: https://github.com/schlafwandler/kcore_dump This patch (of 6): Commit db779ef67ffe ("proc/kcore: Remove unused kclist_add_remap()") removed the last user of KCORE_REMAP. Commit 595dd46ebfc1 ("vfs/proc/kcore, x86/mm/kcore: Fix SMAP fault when dumping vsyscall user page") removed the last user of KCORE_OTHER. Let's drop both types. While at it, also drop vaddr in "struct kcore_list", used by KCORE_REMAP only. Link: https://lkml.kernel.org/r/20210526093041.8800-1-david@redhat.com Link: https://lkml.kernel.org/r/20210526093041.8800-2-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Mike Rapoport Cc: "Michael S. Tsirkin" Cc: Jason Wang Cc: Alexey Dobriyan Cc: "Matthew Wilcox (Oracle)" Cc: Oscar Salvador Cc: Michal Hocko Cc: Roman Gushchin Cc: Alex Shi Cc: Steven Price Cc: Mike Kravetz Cc: Aili Yao Cc: Jiri Bohac Cc: "K. Y. 
Srinivasan" Cc: Haiyang Zhang Cc: Stephen Hemminger Cc: Wei Liu Cc: Naoya Horiguchi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/kcore.c | 7 ++----- include/linux/kcore.h | 3 --- 2 files changed, 2 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c index 4d2e64e9016c..09f77d3c6e15 100644 --- a/fs/proc/kcore.c +++ b/fs/proc/kcore.c @@ -380,11 +380,8 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos) phdr->p_type = PT_LOAD; phdr->p_flags = PF_R | PF_W | PF_X; phdr->p_offset = kc_vaddr_to_offset(m->addr) + data_offset; - if (m->type == KCORE_REMAP) - phdr->p_vaddr = (size_t)m->vaddr; - else - phdr->p_vaddr = (size_t)m->addr; - if (m->type == KCORE_RAM || m->type == KCORE_REMAP) + phdr->p_vaddr = (size_t)m->addr; + if (m->type == KCORE_RAM) phdr->p_paddr = __pa(m->addr); else if (m->type == KCORE_TEXT) phdr->p_paddr = __pa_symbol(m->addr); diff --git a/include/linux/kcore.h b/include/linux/kcore.h index da676cdbd727..86c0f1d18998 100644 --- a/include/linux/kcore.h +++ b/include/linux/kcore.h @@ -11,14 +11,11 @@ enum kcore_type { KCORE_RAM, KCORE_VMEMMAP, KCORE_USER, - KCORE_OTHER, - KCORE_REMAP, }; struct kcore_list { struct list_head list; unsigned long addr; - unsigned long vaddr; size_t size; int type; }; -- cgit v1.2.3 From 2711032c64a9c151a6469d53fdc7f9f4df7f6e45 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 30 Jun 2021 18:50:07 -0700 Subject: fs/proc/kcore: pfn_is_ram check only applies to KCORE_RAM Let's resturcture the code, using switch-case, and checking pfn_is_ram() only when we are dealing with KCORE_RAM. Link: https://lkml.kernel.org/r/20210526093041.8800-3-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Mike Rapoport Cc: Aili Yao Cc: Alexey Dobriyan Cc: Alex Shi Cc: Haiyang Zhang Cc: Jason Wang Cc: Jiri Bohac Cc: "K. Y. Srinivasan" Cc: "Matthew Wilcox (Oracle)" Cc: "Michael S. 
Tsirkin" Cc: Michal Hocko Cc: Mike Kravetz Cc: Naoya Horiguchi Cc: Oscar Salvador Cc: Roman Gushchin Cc: Stephen Hemminger Cc: Steven Price Cc: Wei Liu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/kcore.c | 35 +++++++++++++++++++++++++++-------- 1 file changed, 27 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c index 09f77d3c6e15..ed6fbb3bd50c 100644 --- a/fs/proc/kcore.c +++ b/fs/proc/kcore.c @@ -483,25 +483,36 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos) goto out; } m = NULL; /* skip the list anchor */ - } else if (!pfn_is_ram(__pa(start) >> PAGE_SHIFT)) { - if (clear_user(buffer, tsz)) { - ret = -EFAULT; - goto out; - } - } else if (m->type == KCORE_VMALLOC) { + goto skip; + } + + switch (m->type) { + case KCORE_VMALLOC: vread(buf, (char *)start, tsz); /* we have to zero-fill user buffer even if no read */ if (copy_to_user(buffer, buf, tsz)) { ret = -EFAULT; goto out; } - } else if (m->type == KCORE_USER) { + break; + case KCORE_USER: /* User page is handled prior to normal kernel page: */ if (copy_to_user(buffer, (char *)start, tsz)) { ret = -EFAULT; goto out; } - } else { + break; + case KCORE_RAM: + if (!pfn_is_ram(__pa(start) >> PAGE_SHIFT)) { + if (clear_user(buffer, tsz)) { + ret = -EFAULT; + goto out; + } + break; + } + fallthrough; + case KCORE_VMEMMAP: + case KCORE_TEXT: if (kern_addr_valid(start)) { /* * Using bounce buffer to bypass the @@ -525,7 +536,15 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos) goto out; } } + break; + default: + pr_warn_once("Unhandled KCORE type: %d\n", m->type); + if (clear_user(buffer, tsz)) { + ret = -EFAULT; + goto out; + } } +skip: buflen -= tsz; *fpos += tsz; buffer += tsz; -- cgit v1.2.3 From 0daa322b8ff94d8ee4081c2c6868a1aaf1309642 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 30 Jun 2021 18:50:10 -0700 Subject: fs/proc/kcore: don't read offline sections, logically offline pages and hwpoisoned pages Let's avoid reading: 1) Offline memory sections: the content of offline memory sections is stale as the memory is effectively unused by the kernel. On s390x with standby memory, offline memory sections (belonging to offline storage increments) are not accessible. With virtio-mem and the hyper-v balloon, we can have unavailable memory chunks that should not be accessed inside offline memory sections. Last but not least, offline memory sections might contain hwpoisoned pages which we can no longer identify because the memmap is stale. 2) PG_offline pages: logically offline pages that are documented as "The content of these pages is effectively stale. Such pages should not be touched (read/write/dump/save) except by their owner.". Examples include pages inflated in a balloon or unavailble memory ranges inside hotplugged memory sections with virtio-mem or the hyper-v balloon. 3) PG_hwpoison pages: Reading pages marked as hwpoisoned can be fatal. As documented: "Accessing is not safe since it may cause another machine check. Don't touch!" Introduce is_page_hwpoison(), adding a comment that it is inherently racy but best we can really do. Reading /proc/kcore now performs similar checks as when reading /proc/vmcore for kdump via makedumpfile: problematic pages are exclude. It's also similar to hibernation code, however, we don't skip hwpoisoned pages when processing pages in kernel/power/snapshot.c:saveable_page() yet. 
Note 1: we can race against memory offlining code, especially memory going offline and getting unplugged: however, we will properly tear down the identity mapping and handle faults gracefully when accessing this memory from kcore code. Note 2: we can race against drivers setting PageOffline() and turning memory inaccessible in the hypervisor. We'll handle this in a follow-up patch. Link: https://lkml.kernel.org/r/20210526093041.8800-4-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Mike Rapoport Reviewed-by: Oscar Salvador Cc: Aili Yao Cc: Alexey Dobriyan Cc: Alex Shi Cc: Haiyang Zhang Cc: Jason Wang Cc: Jiri Bohac Cc: "K. Y. Srinivasan" Cc: "Matthew Wilcox (Oracle)" Cc: "Michael S. Tsirkin" Cc: Michal Hocko Cc: Mike Kravetz Cc: Naoya Horiguchi Cc: Roman Gushchin Cc: Stephen Hemminger Cc: Steven Price Cc: Wei Liu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/kcore.c | 14 +++++++++++++- include/linux/page-flags.h | 12 ++++++++++++ 2 files changed, 25 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c index ed6fbb3bd50c..92ff1e4436cb 100644 --- a/fs/proc/kcore.c +++ b/fs/proc/kcore.c @@ -465,6 +465,9 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos) m = NULL; while (buflen) { + struct page *page; + unsigned long pfn; + /* * If this is the first iteration or the address is not within * the previous entry, search for a matching entry. @@ -503,7 +506,16 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos) } break; case KCORE_RAM: - if (!pfn_is_ram(__pa(start) >> PAGE_SHIFT)) { + pfn = __pa(start) >> PAGE_SHIFT; + page = pfn_to_online_page(pfn); + + /* + * Don't read offline sections, logically offline pages + * (e.g., inflated in a balloon), hwpoisoned pages, + * and explicitly excluded physical ranges. + */ + if (!page || PageOffline(page) || + is_page_hwpoison(page) || !pfn_is_ram(pfn)) { if (clear_user(buffer, tsz)) { ret = -EFAULT; goto out; diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index d8e26243db25..613295588848 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -694,6 +694,18 @@ PAGEFLAG_FALSE(DoubleMap) TESTSCFLAG_FALSE(DoubleMap) #endif +/* + * Check if a page is currently marked HWPoisoned. Note that this check is + * best effort only and inherently racy: there is no way to synchronize with + * failing hardware. + */ +static inline bool is_page_hwpoison(struct page *page) +{ + if (PageHWPoison(page)) + return true; + return PageHuge(page) && PageHWPoison(compound_head(page)); +} + /* * For pages that are never mapped to userspace (and aren't PageSlab), * page_type may be used. Because it is initialised to -1, we invert the -- cgit v1.2.3 From c6d9eee2a68619b5ba1c25e406a9403f33b56902 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 30 Jun 2021 18:50:21 -0700 Subject: fs/proc/kcore: use page_offline_(freeze|thaw) Let's properly synchronize with drivers that set PageOffline(). Unfreeze/thaw every now and then, so drivers that want to set PageOffline() can make progress. Link: https://lkml.kernel.org/r/20210526093041.8800-7-david@redhat.com Signed-off-by: David Hildenbrand Acked-by: Mike Rapoport Reviewed-by: Oscar Salvador Cc: Aili Yao Cc: Alexey Dobriyan Cc: Alex Shi Cc: Haiyang Zhang Cc: Jason Wang Cc: Jiri Bohac Cc: "K. Y. Srinivasan" Cc: "Matthew Wilcox (Oracle)" Cc: "Michael S. 
Tsirkin" Cc: Michal Hocko Cc: Mike Kravetz Cc: Naoya Horiguchi Cc: Roman Gushchin Cc: Stephen Hemminger Cc: Steven Price Cc: Wei Liu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/kcore.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'fs') diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c index 92ff1e4436cb..982e694aae77 100644 --- a/fs/proc/kcore.c +++ b/fs/proc/kcore.c @@ -313,6 +313,7 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos) { char *buf = file->private_data; size_t phdrs_offset, notes_offset, data_offset; + size_t page_offline_frozen = 1; size_t phdrs_len, notes_len; struct kcore_list *m; size_t tsz; @@ -322,6 +323,11 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos) int ret = 0; down_read(&kclist_lock); + /* + * Don't race against drivers that set PageOffline() and expect no + * further page access. + */ + page_offline_freeze(); get_kcore_size(&nphdr, &phdrs_len, ¬es_len, &data_offset); phdrs_offset = sizeof(struct elfhdr); @@ -480,6 +486,12 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos) } } + if (page_offline_frozen++ % MAX_ORDER_NR_PAGES == 0) { + page_offline_thaw(); + cond_resched(); + page_offline_freeze(); + } + if (&m->list == &kclist_head) { if (clear_user(buffer, tsz)) { ret = -EFAULT; @@ -565,6 +577,7 @@ skip: } out: + page_offline_thaw(); up_read(&kclist_lock); if (ret) return ret; -- cgit v1.2.3 From eb6ecbed0aa27360712d0674bf132843a9567344 Mon Sep 17 00:00:00 2001 From: Collin Fijalkovich Date: Wed, 30 Jun 2021 18:51:32 -0700 Subject: mm, thp: relax the VM_DENYWRITE constraint on file-backed THPs Transparent huge pages are supported for read-only non-shmem files, but are only used for vmas with VM_DENYWRITE. This condition ensures that file THPs are protected from writes while an application is running (ETXTBSY). Any existing file THPs are then dropped from the page cache when a file is opened for write in do_dentry_open(). Since sys_mmap ignores MAP_DENYWRITE, this constrains the use of file THPs to vmas produced by execve(). Systems that make heavy use of shared libraries (e.g. Android) are unable to apply VM_DENYWRITE through the dynamic linker, preventing them from benefiting from the resultant reduced contention on the TLB. This patch reduces the constraint on file THPs allowing use with any executable mapping from a file not opened for write (see inode_is_open_for_write()). It also introduces additional conditions to ensure that files opened for write will never be backed by file THPs. Restricting the use of THPs to executable mappings eliminates the risk that a read-only file later opened for write would encounter significant latencies due to page cache truncation. The ld linker flag '-z max-page-size=(hugepage size)' can be used to produce executables with the necessary layout. The dynamic linker must map these file's segments at a hugepage size aligned vma for the mapping to be backed with THPs. Comparison of the performance characteristics of 4KB and 2MB-backed libraries follows; the Android dex2oat tool was used to AOT compile an example application on a single ARM core. 4KB Pages: ========== count event_name # count / runtime 598,995,035,942 cpu-cycles # 1.800861 GHz 81,195,620,851 raw-stall-frontend # 244.112 M/sec 347,754,466,597 iTLB-loads # 1.046 G/sec 2,970,248,900 iTLB-load-misses # 0.854122% miss rate Total test time: 332.854998 seconds. 
2MB Pages:
==========
count            event_name          # count / runtime
592,872,663,047  cpu-cycles          # 1.800358 GHz
76,485,624,143   raw-stall-frontend  # 232.261 M/sec
350,478,413,710  iTLB-loads          # 1.064 G/sec
803,233,322      iTLB-load-misses    # 0.229182% miss rate

Total test time: 329.826087 seconds.

A check of /proc/$(pidof dex2oat64)/smaps shows THPs in use:
/apex/com.android.art/lib64/libart.so FilePmdMapped: 4096 kB
/apex/com.android.art/lib64/libart-compiler.so FilePmdMapped: 2048 kB

Link: https://lkml.kernel.org/r/20210406000930.3455850-1-cfijalkovich@google.com Signed-off-by: Collin Fijalkovich Acked-by: Hugh Dickins Reviewed-by: William Kucharski Acked-by: Song Liu Cc: Suren Baghdasaryan Cc: Hridya Valsaraju Cc: Kalesh Singh Cc: Tim Murray Cc: Matthew Wilcox Cc: Alexander Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/open.c | 13 +++++++++++-- mm/khugepaged.c | 16 +++++++++++++++- 2 files changed, 26 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/open.c b/fs/open.c index e53af13b5835..f76e960d10ea 100644 --- a/fs/open.c +++ b/fs/open.c @@ -852,8 +852,17 @@ static int do_dentry_open(struct file *f, * XXX: Huge page cache doesn't support writing yet. Drop all page * cache for this file before processing writes. */ - if ((f->f_mode & FMODE_WRITE) && filemap_nr_thps(inode->i_mapping)) - truncate_pagecache(inode, 0); + if (f->f_mode & FMODE_WRITE) { + /* + * Paired with smp_mb() in collapse_file() to ensure nr_thps + * is up to date and the update to i_writecount by + * get_write_access() is visible. Ensures subsequent insertion + * of THPs into the page cache will fail. + */ + smp_mb(); + if (filemap_nr_thps(inode->i_mapping)) + truncate_pagecache(inode, 0); + } return 0; diff --git a/mm/khugepaged.c b/mm/khugepaged.c index d97b20fad6e8..b0412be08fa2 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -457,7 +457,8 @@ static bool hugepage_vma_check(struct vm_area_struct *vma, /* Read-only file mappings need to be aligned for THP to work. */ if (IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) && vma->vm_file && - (vm_flags & VM_DENYWRITE)) { + !inode_is_open_for_write(vma->vm_file->f_inode) && + (vm_flags & VM_EXEC)) { return IS_ALIGNED((vma->vm_start >> PAGE_SHIFT) - vma->vm_pgoff, HPAGE_PMD_NR); } @@ -1862,6 +1863,19 @@ out_unlock: else { __mod_lruvec_page_state(new_page, NR_FILE_THPS, nr); filemap_nr_thps_inc(mapping); + /* + * Paired with smp_mb() in do_dentry_open() to ensure + * i_writecount is up to date and the update to nr_thps is + * visible. Ensures the page cache will be truncated if the + * file is opened writable. + */ + smp_mb(); + if (inode_is_open_for_write(mapping->host)) { + result = SCAN_FAIL; + __mod_lruvec_page_state(new_page, NR_FILE_THPS, -nr); + filemap_nr_thps_dec(mapping); + goto xa_locked; + } } if (nr_none) { -- cgit v1.2.3 From af5cdaf82238fb3637a0d0fff4670e5be71c611c Mon Sep 17 00:00:00 2001 From: Alistair Popple Date: Wed, 30 Jun 2021 18:54:06 -0700 Subject: mm: remove special swap entry functions Patch series "Add support for SVM atomics in Nouveau", v11. Introduction ============ Some devices have features such as atomic PTE bits that can be used to implement atomic access to system memory. To support atomic operations to a shared virtual memory page, such a device needs access to that page which is exclusive of the CPU. This series introduces a mechanism to temporarily unmap pages, granting a device exclusive access.
These changes are required to support OpenCL atomic operations in Nouveau to shared virtual memory (SVM) regions allocated with the CL_MEM_SVM_ATOMICS clSVMAlloc flag. A more complete description of the OpenCL SVM feature is available at https://www.khronos.org/registry/OpenCL/specs/3.0-unified/html/OpenCL_API.html#_shared_virtual_memory . Implementation ============== Exclusive device access is implemented by adding a new swap entry type (SWAP_DEVICE_EXCLUSIVE) which is similar to a migration entry. The main difference is that on fault the original entry is immediately restored by the fault handler instead of waiting. Restoring the entry triggers calls to MMU notifiers, which allows a device driver to revoke the atomic access permission from the GPU prior to the CPU finalising the entry. Patches ======= Patches 1 & 2 refactor existing migration and device private entry functions. Patches 3 & 4 rework try_to_unmap_one() by splitting out unrelated functionality into separate functions - try_to_migrate_one() and try_to_munlock_one(). Patch 5 renames some existing code but does not introduce functionality. Patch 6 is a small clean-up to swap entry handling in copy_pte_range(). Patch 7 contains the bulk of the implementation for device exclusive memory. Patch 8 contains some additions to the HMM selftests to ensure everything works as expected. Patch 9 is a cleanup for the Nouveau SVM implementation. Patch 10 contains the implementation of atomic access for the Nouveau driver. Testing ======= This has been tested with upstream Mesa 21.1.0 and a simple OpenCL program which checks that GPU atomic accesses to system memory are atomic. Without this series the test fails, as there is no way of write-protecting the page mapping, which results in the device clobbering CPU writes. For reference the test is available at https://ozlabs.org/~apopple/opencl_svm_atomics/ Further testing has been performed by adding support for testing exclusive access to the hmm-tests kselftests. This patch (of 10): Remove multiple similar inline functions for dealing with different types of special swap entries. Both migration and device private swap entries use the swap offset to store a pfn. Instead of multiple inline functions to obtain a struct page for each swap entry type, use a common function pfn_swap_entry_to_page(). Also open-code the various entry_to_pfn() functions, as this results in shorter code that is easier to understand.
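For illustration only (a sketch based on the definitions in the diff below, not code from the patch): because both entry flavours store the pfn in the swap offset, the single helper recovers the page either way.

	/* A migration entry made for a page... */
	swp_entry_t entry = swp_entry(SWP_MIGRATION_READ, page_to_pfn(page));
	/* ...maps straight back to that page via the common helper. */
	struct page *p = pfn_swap_entry_to_page(entry);	/* p == page */

The same call works for a device private entry, which is what lets the separate migration_entry_to_page() and device_private_entry_to_page() helpers below collapse into one.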
Link: https://lkml.kernel.org/r/20210616105937.23201-1-apopple@nvidia.com Link: https://lkml.kernel.org/r/20210616105937.23201-2-apopple@nvidia.com Signed-off-by: Alistair Popple Reviewed-by: Ralph Campbell Reviewed-by: Christoph Hellwig Cc: "Matthew Wilcox (Oracle)" Cc: Hugh Dickins Cc: Peter Xu Cc: Shakeel Butt Cc: Ben Skeggs Cc: Jason Gunthorpe Cc: John Hubbard Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/s390/mm/pgtable.c | 2 +- fs/proc/task_mmu.c | 23 ++++++----------- include/linux/swap.h | 4 +-- include/linux/swapops.h | 69 +++++++++++++++++-------------------------------- mm/hmm.c | 5 ++-- mm/huge_memory.c | 6 ++--- mm/memcontrol.c | 2 +- mm/memory.c | 10 +++---- mm/migrate.c | 6 ++--- mm/page_vma_mapped.c | 6 ++--- 10 files changed, 51 insertions(+), 82 deletions(-) (limited to 'fs') diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c index 18205f851c24..eec3a9d7176e 100644 --- a/arch/s390/mm/pgtable.c +++ b/arch/s390/mm/pgtable.c @@ -691,7 +691,7 @@ static void ptep_zap_swap_entry(struct mm_struct *mm, swp_entry_t entry) if (!non_swap_entry(entry)) dec_mm_counter(mm, MM_SWAPENTS); else if (is_migration_entry(entry)) { - struct page *page = migration_entry_to_page(entry); + struct page *page = pfn_swap_entry_to_page(entry); dec_mm_counter(mm, mm_counter(page)); } diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 95c8f1e8fea6..eb97468dfe4c 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -514,10 +514,8 @@ static void smaps_pte_entry(pte_t *pte, unsigned long addr, } else { mss->swap_pss += (u64)PAGE_SIZE << PSS_SHIFT; } - } else if (is_migration_entry(swpent)) - page = migration_entry_to_page(swpent); - else if (is_device_private_entry(swpent)) - page = device_private_entry_to_page(swpent); + } else if (is_pfn_swap_entry(swpent)) + page = pfn_swap_entry_to_page(swpent); } else if (unlikely(IS_ENABLED(CONFIG_SHMEM) && mss->check_shmem_swap && pte_none(*pte))) { page = xa_load(&vma->vm_file->f_mapping->i_pages, @@ -549,7 +547,7 @@ static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr, swp_entry_t entry = pmd_to_swp_entry(*pmd); if (is_migration_entry(entry)) - page = migration_entry_to_page(entry); + page = pfn_swap_entry_to_page(entry); } if (IS_ERR_OR_NULL(page)) return; @@ -694,10 +692,8 @@ static int smaps_hugetlb_range(pte_t *pte, unsigned long hmask, } else if (is_swap_pte(*pte)) { swp_entry_t swpent = pte_to_swp_entry(*pte); - if (is_migration_entry(swpent)) - page = migration_entry_to_page(swpent); - else if (is_device_private_entry(swpent)) - page = device_private_entry_to_page(swpent); + if (is_pfn_swap_entry(swpent)) + page = pfn_swap_entry_to_page(swpent); } if (page) { int mapcount = page_mapcount(page); @@ -1389,11 +1385,8 @@ static pagemap_entry_t pte_to_pagemap_entry(struct pagemapread *pm, frame = swp_type(entry) | (swp_offset(entry) << MAX_SWAPFILES_SHIFT); flags |= PM_SWAP; - if (is_migration_entry(entry)) - page = migration_entry_to_page(entry); - - if (is_device_private_entry(entry)) - page = device_private_entry_to_page(entry); + if (is_pfn_swap_entry(entry)) + page = pfn_swap_entry_to_page(entry); } if (page && !PageAnon(page)) @@ -1454,7 +1447,7 @@ static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end, if (pmd_swp_uffd_wp(pmd)) flags |= PM_UFFD_WP; VM_BUG_ON(!is_pmd_migration_entry(pmd)); - page = migration_entry_to_page(entry); + page = pfn_swap_entry_to_page(entry); } #endif diff --git a/include/linux/swap.h b/include/linux/swap.h index ac9bd84c905e..df7cbb6b3d3e 100644 --- 
a/include/linux/swap.h +++ b/include/linux/swap.h @@ -564,8 +564,8 @@ static inline void show_swap_cache_info(void) { } -#define free_swap_and_cache(e) ({(is_migration_entry(e) || is_device_private_entry(e));}) -#define swapcache_prepare(e) ({(is_migration_entry(e) || is_device_private_entry(e));}) +/* used to sanity check ptes in zap_pte_range when CONFIG_SWAP=0 */ +#define free_swap_and_cache(e) is_pfn_swap_entry(e) static inline int add_swap_count_continuation(swp_entry_t swp, gfp_t gfp_mask) { diff --git a/include/linux/swapops.h b/include/linux/swapops.h index 708fbeb21dd3..c24c79812bc1 100644 --- a/include/linux/swapops.h +++ b/include/linux/swapops.h @@ -128,16 +128,6 @@ static inline bool is_write_device_private_entry(swp_entry_t entry) { return unlikely(swp_type(entry) == SWP_DEVICE_WRITE); } - -static inline unsigned long device_private_entry_to_pfn(swp_entry_t entry) -{ - return swp_offset(entry); -} - -static inline struct page *device_private_entry_to_page(swp_entry_t entry) -{ - return pfn_to_page(swp_offset(entry)); -} #else /* CONFIG_DEVICE_PRIVATE */ static inline swp_entry_t make_device_private_entry(struct page *page, bool write) { @@ -157,16 +147,6 @@ static inline bool is_write_device_private_entry(swp_entry_t entry) { return false; } - -static inline unsigned long device_private_entry_to_pfn(swp_entry_t entry) -{ - return 0; -} - -static inline struct page *device_private_entry_to_page(swp_entry_t entry) -{ - return NULL; -} #endif /* CONFIG_DEVICE_PRIVATE */ #ifdef CONFIG_MIGRATION @@ -189,22 +169,6 @@ static inline int is_write_migration_entry(swp_entry_t entry) return unlikely(swp_type(entry) == SWP_MIGRATION_WRITE); } -static inline unsigned long migration_entry_to_pfn(swp_entry_t entry) -{ - return swp_offset(entry); -} - -static inline struct page *migration_entry_to_page(swp_entry_t entry) -{ - struct page *p = pfn_to_page(swp_offset(entry)); - /* - * Any use of migration entries may only occur while the - * corresponding page is locked - */ - BUG_ON(!PageLocked(compound_head(p))); - return p; -} - static inline void make_migration_entry_read(swp_entry_t *entry) { *entry = swp_entry(SWP_MIGRATION_READ, swp_offset(*entry)); @@ -224,16 +188,6 @@ static inline int is_migration_entry(swp_entry_t swp) return 0; } -static inline unsigned long migration_entry_to_pfn(swp_entry_t entry) -{ - return 0; -} - -static inline struct page *migration_entry_to_page(swp_entry_t entry) -{ - return NULL; -} - static inline void make_migration_entry_read(swp_entry_t *entryp) { } static inline void __migration_entry_wait(struct mm_struct *mm, pte_t *ptep, spinlock_t *ptl) { } @@ -248,6 +202,29 @@ static inline int is_write_migration_entry(swp_entry_t entry) #endif +static inline struct page *pfn_swap_entry_to_page(swp_entry_t entry) +{ + struct page *p = pfn_to_page(swp_offset(entry)); + + /* + * Any use of migration entries may only occur while the + * corresponding page is locked + */ + BUG_ON(is_migration_entry(entry) && !PageLocked(p)); + + return p; +} + +/* + * A pfn swap entry is a special type of swap entry that always has a pfn stored + * in the swap offset. They are used to represent unaddressable device memory + * and to restrict access to a page undergoing migration. 
+ */ +static inline bool is_pfn_swap_entry(swp_entry_t entry) +{ + return is_migration_entry(entry) || is_device_private_entry(entry); +} + struct page_vma_mapped_walk; #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION diff --git a/mm/hmm.c b/mm/hmm.c index 943cb2ba4442..3b2dda71d0ed 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -214,7 +214,7 @@ static inline bool hmm_is_device_private_entry(struct hmm_range *range, swp_entry_t entry) { return is_device_private_entry(entry) && - device_private_entry_to_page(entry)->pgmap->owner == + pfn_swap_entry_to_page(entry)->pgmap->owner == range->dev_private_owner; } @@ -257,8 +257,7 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr, cpu_flags = HMM_PFN_VALID; if (is_write_device_private_entry(entry)) cpu_flags |= HMM_PFN_WRITE; - *hmm_pfn = device_private_entry_to_pfn(entry) | - cpu_flags; + *hmm_pfn = swp_offset(entry) | cpu_flags; return 0; } diff --git a/mm/huge_memory.c b/mm/huge_memory.c index d513b0cd1161..327b8d9d8d2f 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1643,7 +1643,7 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, VM_BUG_ON(!is_pmd_migration_entry(orig_pmd)); entry = pmd_to_swp_entry(orig_pmd); - page = migration_entry_to_page(entry); + page = pfn_swap_entry_to_page(entry); flush_needed = 0; } else WARN_ONCE(1, "Non present huge pmd without pmd migration enabled!"); @@ -2012,7 +2012,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, swp_entry_t entry; entry = pmd_to_swp_entry(old_pmd); - page = migration_entry_to_page(entry); + page = pfn_swap_entry_to_page(entry); } else { page = pmd_page(old_pmd); if (!PageDirty(page) && pmd_dirty(old_pmd)) @@ -2066,7 +2066,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, swp_entry_t entry; entry = pmd_to_swp_entry(old_pmd); - page = migration_entry_to_page(entry); + page = pfn_swap_entry_to_page(entry); write = is_write_migration_entry(entry); young = false; soft_dirty = pmd_swp_soft_dirty(old_pmd); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index f63399bff018..b826dad6fa36 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -5532,7 +5532,7 @@ static struct page *mc_handle_swap_pte(struct vm_area_struct *vma, * as special swap entry in the CPU page table. 
*/ if (is_device_private_entry(ent)) { - page = device_private_entry_to_page(ent); + page = pfn_swap_entry_to_page(ent); /* * MEMORY_DEVICE_PRIVATE means ZONE_DEVICE page and which have * a refcount of 1 when free (unlike normal page) diff --git a/mm/memory.c b/mm/memory.c index 64eda960b75e..6723931085c7 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -729,7 +729,7 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, } rss[MM_SWAPENTS]++; } else if (is_migration_entry(entry)) { - page = migration_entry_to_page(entry); + page = pfn_swap_entry_to_page(entry); rss[mm_counter(page)]++; @@ -748,7 +748,7 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, set_pte_at(src_mm, addr, src_pte, pte); } } else if (is_device_private_entry(entry)) { - page = device_private_entry_to_page(entry); + page = pfn_swap_entry_to_page(entry); /* * Update rss count even for unaddressable pages, as @@ -1280,7 +1280,7 @@ again: entry = pte_to_swp_entry(ptent); if (is_device_private_entry(entry)) { - struct page *page = device_private_entry_to_page(entry); + struct page *page = pfn_swap_entry_to_page(entry); if (unlikely(details && details->check_mapping)) { /* @@ -1309,7 +1309,7 @@ again: else if (is_migration_entry(entry)) { struct page *page; - page = migration_entry_to_page(entry); + page = pfn_swap_entry_to_page(entry); rss[mm_counter(page)]--; } if (unlikely(!free_swap_and_cache(entry))) @@ -3372,7 +3372,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) migration_entry_wait(vma->vm_mm, vmf->pmd, vmf->address); } else if (is_device_private_entry(entry)) { - vmf->page = device_private_entry_to_page(entry); + vmf->page = pfn_swap_entry_to_page(entry); ret = vmf->page->pgmap->ops->migrate_to_ram(vmf); } else if (is_hwpoison_entry(entry)) { ret = VM_FAULT_HWPOISON; diff --git a/mm/migrate.c b/mm/migrate.c index 8810c1421f5d..b4abb87249e1 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -296,7 +296,7 @@ void __migration_entry_wait(struct mm_struct *mm, pte_t *ptep, if (!is_migration_entry(entry)) goto out; - page = migration_entry_to_page(entry); + page = pfn_swap_entry_to_page(entry); page = compound_head(page); /* @@ -337,7 +337,7 @@ void pmd_migration_entry_wait(struct mm_struct *mm, pmd_t *pmd) ptl = pmd_lock(mm, pmd); if (!is_pmd_migration_entry(*pmd)) goto unlock; - page = migration_entry_to_page(pmd_to_swp_entry(*pmd)); + page = pfn_swap_entry_to_page(pmd_to_swp_entry(*pmd)); if (!get_page_unless_zero(page)) goto unlock; spin_unlock(ptl); @@ -2289,7 +2289,7 @@ again: if (!is_device_private_entry(entry)) goto next; - page = device_private_entry_to_page(entry); + page = pfn_swap_entry_to_page(entry); if (!(migrate->flags & MIGRATE_VMA_SELECT_DEVICE_PRIVATE) || page->pgmap->owner != migrate->pgmap_owner) diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c index a4435311754b..4df6093e759b 100644 --- a/mm/page_vma_mapped.c +++ b/mm/page_vma_mapped.c @@ -96,7 +96,7 @@ static bool check_pte(struct page_vma_mapped_walk *pvmw) if (!is_migration_entry(entry)) return false; - pfn = migration_entry_to_pfn(entry); + pfn = swp_offset(entry); } else if (is_swap_pte(*pvmw->pte)) { swp_entry_t entry; @@ -105,7 +105,7 @@ static bool check_pte(struct page_vma_mapped_walk *pvmw) if (!is_device_private_entry(entry)) return false; - pfn = device_private_entry_to_pfn(entry); + pfn = swp_offset(entry); } else { if (!pte_present(*pvmw->pte)) return false; @@ -233,7 +233,7 @@ restart: return not_found(pvmw); entry = pmd_to_swp_entry(pmde); if (!is_migration_entry(entry) || - 
From d238692b4b9f2c36e35af4c6e6f6da36184aeb3e Mon Sep 17 00:00:00 2001
From: Marcelo Henrique Cerri
Date: Wed, 30 Jun 2021 18:54:38 -0700
Subject: proc: Avoid mixing integer types in mem_rw()

Use size_t when capping the count argument received by mem_rw(). Since
count is size_t, using min_t(int, ...) can lead to a negative value that
will later be passed to access_remote_vm(), which can cause unexpected
behavior.

Since we are capping the value to at most PAGE_SIZE, the conversion from
size_t to int when passing it to access_remote_vm() as "len" shouldn't be
a problem.

Link: https://lkml.kernel.org/r/20210512125215.3348316-1-marcelo.cerri@canonical.com
Reviewed-by: David Disseldorp
Signed-off-by: Thadeu Lima de Souza Cascardo
Signed-off-by: Marcelo Henrique Cerri
Cc: Alexey Dobriyan
Cc: Souza Cascardo
Cc: Christian Brauner
Cc: Michel Lespinasse
Cc: Helge Deller
Cc: Oleg Nesterov
Cc: Lorenzo Stoakes
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 fs/proc/base.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/proc/base.c b/fs/proc/base.c
index 9cbd915025ad..a0a2fc1c9da2 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -854,7 +854,7 @@ static ssize_t mem_rw(struct file *file, char __user *buf,
     flags = FOLL_FORCE | (write ? FOLL_WRITE : 0);

     while (count > 0) {
-        int this_len = min_t(int, count, PAGE_SIZE);
+        size_t this_len = min_t(size_t, count, PAGE_SIZE);

         if (write && copy_from_user(page, buf, this_len)) {
             copied = -EFAULT;
-- cgit v1.2.3
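The bug is easiest to see in isolation. A small userspace sketch of the same macro arithmetic (GNU C, hypothetical values, 64-bit build assumed):

    #include <stdio.h>
    #include <stddef.h>

    /* Kernel-style min_t(), which casts both sides to the given type first
     * (GNU C statement expression, as in include/linux/minmax.h). */
    #define min_t(type, x, y) ({ type __x = (x); type __y = (y); \
                                 __x < __y ? __x : __y; })
    #define PAGE_SIZE 4096UL

    int main(void)
    {
        size_t count = 0x100000000UL;   /* hypothetical 4 GiB request */

        /* Old code: count is truncated to int, giving 0 here; values such
         * as 0x80000000 become negative and flow into access_remote_vm()
         * as len. */
        int bad = min_t(int, count, PAGE_SIZE);

        /* Fixed code: compared as size_t, the cap is always <= PAGE_SIZE. */
        size_t good = min_t(size_t, count, PAGE_SIZE);

        printf("bad=%d good=%zu\n", bad, good);   /* bad=0 good=4096 */
        return 0;
    }
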
From 7bc3fa0172a423afb34e6df7a3998e5f23b1a94a Mon Sep 17 00:00:00 2001
From: Kalesh Singh
Date: Wed, 30 Jun 2021 18:54:44 -0700
Subject: procfs: allow reading fdinfo with PTRACE_MODE_READ
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Android captures per-process system memory state when certain low memory
events (e.g. a foreground app kill) occur, to identify potential memory
hoggers. In order to measure how much memory a process actually consumes,
it is necessary to include the DMA buffer sizes for that process in the
memory accounting. Since the handles to DMA buffers are raw FDs, it is
important to be able to identify which processes have FD references to a
DMA buffer.

Currently, DMA buffer FDs can be accounted using /proc/<pid>/fd/* and
/proc/<pid>/fdinfo -- both are only readable by the process owner, as
follows:

1. Do a readlink on each FD.
2. If the target path begins with "/dmabuf", then the FD is a dmabuf FD.
3. stat the file to get the dmabuf inode number.
4. Read /proc/<pid>/fdinfo/<fd> to get the DMA buffer size.

Accessing other processes' fdinfo requires root privileges. This limits
the use of the interface to debugging environments and is not suitable
for production builds. Granting root privileges even to a system process
increases the attack surface and is highly undesirable.

Since fdinfo doesn't permit reading process memory and manipulating
process state, allow accessing fdinfo under PTRACE_MODE_READ_FSCREDS.

Link: https://lkml.kernel.org/r/20210308170651.919148-1-kaleshsingh@google.com
Signed-off-by: Kalesh Singh
Suggested-by: Jann Horn
Acked-by: Christian König
Cc: Alexander Viro
Cc: Alexey Dobriyan
Cc: Alexey Gladkov
Cc: Andrei Vagin
Cc: Bernd Edlinger
Cc: Christian Brauner
Cc: Eric W. Biederman
Cc: Helge Deller
Cc: Hridya Valsaraju
Cc: James Morris
Cc: Jeff Vander Stoep
Cc: Jonathan Corbet
Cc: Kees Cook
Cc: Matthew Wilcox
Cc: Mauro Carvalho Chehab
Cc: Michal Hocko
Cc: Michel Lespinasse
Cc: Minchan Kim
Cc: Randy Dunlap
Cc: Suren Baghdasaryan
Cc: Szabolcs Nagy
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 fs/proc/base.c |  4 ++--
 fs/proc/fd.c   | 15 ++++++++++++++-
 2 files changed, 16 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/proc/base.c b/fs/proc/base.c
index a0a2fc1c9da2..e5b5f7709d48 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -3172,7 +3172,7 @@ static const struct pid_entry tgid_base_stuff[] = {
     DIR("task", S_IRUGO|S_IXUGO, proc_task_inode_operations, proc_task_operations),
     DIR("fd", S_IRUSR|S_IXUSR, proc_fd_inode_operations, proc_fd_operations),
     DIR("map_files", S_IRUSR|S_IXUSR, proc_map_files_inode_operations, proc_map_files_operations),
-    DIR("fdinfo", S_IRUSR|S_IXUSR, proc_fdinfo_inode_operations, proc_fdinfo_operations),
+    DIR("fdinfo", S_IRUGO|S_IXUGO, proc_fdinfo_inode_operations, proc_fdinfo_operations),
     DIR("ns", S_IRUSR|S_IXUGO, proc_ns_dir_inode_operations, proc_ns_dir_operations),
 #ifdef CONFIG_NET
     DIR("net", S_IRUGO|S_IXUGO, proc_net_inode_operations, proc_net_operations),
@@ -3517,7 +3517,7 @@ static const struct inode_operations proc_tid_comm_inode_operations = {
  */
 static const struct pid_entry tid_base_stuff[] = {
     DIR("fd", S_IRUSR|S_IXUSR, proc_fd_inode_operations, proc_fd_operations),
-    DIR("fdinfo", S_IRUSR|S_IXUSR, proc_fdinfo_inode_operations, proc_fdinfo_operations),
+    DIR("fdinfo", S_IRUGO|S_IXUGO, proc_fdinfo_inode_operations, proc_fdinfo_operations),
     DIR("ns", S_IRUSR|S_IXUGO, proc_ns_dir_inode_operations, proc_ns_dir_operations),
 #ifdef CONFIG_NET
     DIR("net", S_IRUGO|S_IXUGO, proc_net_inode_operations, proc_net_operations),
diff --git a/fs/proc/fd.c b/fs/proc/fd.c
index 07fc4fad2602..6a80b40fd2fe 100644
--- a/fs/proc/fd.c
+++ b/fs/proc/fd.c
@@ -6,6 +6,7 @@
 #include
 #include
 #include
+#include <linux/ptrace.h>
 #include
 #include
 #include
@@ -72,6 +73,18 @@ out:

 static int seq_fdinfo_open(struct inode *inode, struct file *file)
 {
+    bool allowed = false;
+    struct task_struct *task = get_proc_task(inode);
+
+    if (!task)
+        return -ESRCH;
+
+    allowed = ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS);
+    put_task_struct(task);
+
+    if (!allowed)
+        return -EACCES;
+
     return single_open(file, seq_show, inode);
 }

@@ -308,7 +321,7 @@ static struct dentry *proc_fdinfo_instantiate(struct dentry *dentry,
     struct proc_inode *ei;
     struct inode *inode;

-    inode = proc_pid_make_inode(dentry->d_sb, task, S_IFREG | S_IRUSR);
+    inode = proc_pid_make_inode(dentry->d_sb, task, S_IFREG | S_IRUGO);
     if (!inode)
         return ERR_PTR(-ENOENT);
-- cgit v1.2.3
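As a rough userspace illustration of what the relaxed check enables; dump_fdinfo() is a hypothetical helper, and it assumes the caller satisfies the ptrace access rules for the target (for example, a monitor holding CAP_SYS_PTRACE):

    #include <errno.h>
    #include <fcntl.h>
    #include <stdio.h>
    #include <sys/types.h>
    #include <unistd.h>

    /* With this patch the open() below is gated by the kernel's
     * ptrace_may_access(PTRACE_MODE_READ_FSCREDS) check rather than by the
     * old 0400 file mode, so a suitably privileged non-root monitor can read
     * other processes' entries; anyone else gets EACCES at open time. */
    static int dump_fdinfo(pid_t pid, int fd)
    {
        char path[64], buf[4096];
        ssize_t n;
        int f;

        snprintf(path, sizeof(path), "/proc/%d/fdinfo/%d", (int)pid, fd);
        f = open(path, O_RDONLY);
        if (f < 0)
            return -errno;    /* -EACCES if the ptrace check fails */
        while ((n = read(f, buf, sizeof(buf))) > 0)
            fwrite(buf, 1, (size_t)n, stdout);
        close(f);
        return 0;
    }
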
From 3845f256a8b527127bfbd4ced21e93d9e89aa6d7 Mon Sep 17 00:00:00 2001
From: Kalesh Singh
Date: Wed, 30 Jun 2021 18:54:49 -0700
Subject: procfs/dmabuf: add inode number to /proc/*/fdinfo
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add an 'ino' field to /proc/<pid>/fdinfo/<fd> and
/proc/<pid>/task/<tid>/fdinfo/<fd>.

The inode numbers can be used to uniquely identify DMA buffers in user
space and avoid a dependency on /proc/<pid>/fd/* when accounting
per-process DMA buffer sizes.

Link: https://lkml.kernel.org/r/20210308170651.919148-2-kaleshsingh@google.com
Signed-off-by: Kalesh Singh
Acked-by: Randy Dunlap
Acked-by: Christian König
Cc: Jann Horn
Cc: Jeff Vander Stoep
Cc: Kees Cook
Cc: Suren Baghdasaryan
Cc: Minchan Kim
Cc: Hridya Valsaraju
Cc: Matthew Wilcox
Cc: Alexander Viro
Cc: Kalesh Singh
Cc: Alexey Dobriyan
Cc: Jonathan Corbet
Cc: Mauro Carvalho Chehab
Cc: Michal Hocko
Cc: Alexey Gladkov
Cc: Szabolcs Nagy
Cc: Eric W. Biederman
Cc: Christian Brauner
Cc: Michel Lespinasse
Cc: Bernd Edlinger
Cc: Andrei Vagin
Cc: Helge Deller
Cc: James Morris
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 Documentation/filesystems/proc.rst | 37 +++++++++++++++++++++++++++++++------
 fs/proc/fd.c                       |  5 +++--
 2 files changed, 34 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/Documentation/filesystems/proc.rst b/Documentation/filesystems/proc.rst
index dd4e3e18c22b..042c418f4090 100644
--- a/Documentation/filesystems/proc.rst
+++ b/Documentation/filesystems/proc.rst
@@ -1920,18 +1920,20 @@ if precise results are needed.
 3.8 /proc/<pid>/fdinfo/<fd> - Information about opened file
 ---------------------------------------------------------------
 This file provides information associated with an opened file. The regular
-files have at least three fields -- 'pos', 'flags' and 'mnt_id'. The 'pos'
-represents the current offset of the opened file in decimal form [see lseek(2)
-for details], 'flags' denotes the octal O_xxx mask the file has been
-created with [see open(2) for details] and 'mnt_id' represents mount ID of
-the file system containing the opened file [see 3.5 /proc/<pid>/mountinfo
-for details].
+files have at least four fields -- 'pos', 'flags', 'mnt_id' and 'ino'.
+The 'pos' represents the current offset of the opened file in decimal
+form [see lseek(2) for details], 'flags' denotes the octal O_xxx mask the
+file has been created with [see open(2) for details] and 'mnt_id' represents
+mount ID of the file system containing the opened file [see 3.5
+/proc/<pid>/mountinfo for details]. 'ino' represents the inode number of
+the file.

 A typical output is::

     pos:    0
     flags:  0100002
     mnt_id: 19
+    ino:    63107

 All locks associated with a file descriptor are shown in its fdinfo too::

@@ -1948,6 +1950,7 @@ Eventfd files
     pos:    0
     flags:  04002
     mnt_id: 9
+    ino:    63107
     eventfd-count:  5a

 where 'eventfd-count' is hex value of a counter.
@@ -1960,6 +1963,7 @@ Signalfd files
     pos:    0
     flags:  04002
     mnt_id: 9
+    ino:    63107
     sigmask:    0000000000000200

 where 'sigmask' is hex value of the signal mask associated
@@ -1973,6 +1977,7 @@ Epoll files
     pos:    0
     flags:  02
     mnt_id: 9
+    ino:    63107
     tfd:        5 events:       1d data: ffffffffffffffff pos:0 ino:61af sdev:7

 where 'tfd' is a target file descriptor number in decimal form,
@@ -1989,6 +1994,8 @@ For inotify files the format is the following::

     pos:    0
     flags:  02000000
+    mnt_id: 9
+    ino:    63107
     inotify wd:3 ino:9e7e sdev:800013 mask:800afce ignored_mask:0 fhandle-bytes:8 fhandle-type:1 f_handle:7e9e0000640d1b6d

 where 'wd' is a watch descriptor in decimal form, i.e. a target file
@@ -2011,6 +2018,7 @@ For fanotify files the format is::

     pos:    0
     flags:  02
     mnt_id: 9
+    ino:    63107
     fanotify flags:10 event-flags:0
     fanotify mnt_id:12 mflags:40 mask:38 ignored_mask:40000003
     fanotify ino:4f969 sdev:800013 mflags:0 mask:3b ignored_mask:40000000 fhandle-bytes:8 fhandle-type:1 f_handle:69f90400c275b5b4
@@ -2035,6 +2043,7 @@ Timerfd files

     pos:    0
     flags:  02
     mnt_id: 9
+    ino:    63107
     clockid: 0
     ticks: 0
     settime flags: 01
@@ -2049,6 +2058,22 @@ details].
 'it_value' is remaining time until the timer expiration.
 'it_interval' is the interval for the timer. Note the timer might be set up
 with TIMER_ABSTIME option which will be shown in 'settime flags', but
 'it_value' still exhibits timer's remaining time.

+DMA Buffer files
+~~~~~~~~~~~~~~~~
+
+::
+
+    pos:    0
+    flags:  04002
+    mnt_id: 9
+    ino:    63107
+    size:   32768
+    count:  2
+    exp_name:   system-heap
+
+where 'size' is the size of the DMA buffer in bytes. 'count' is the file count of
+the DMA buffer file. 'exp_name' is the name of the DMA buffer exporter.
+
 3.9 /proc/<pid>/map_files - Information about memory mapped files
 ---------------------------------------------------------------------
 This directory contains symbolic links which represent memory mapped files
diff --git a/fs/proc/fd.c b/fs/proc/fd.c
index 6a80b40fd2fe..172c86270b31 100644
--- a/fs/proc/fd.c
+++ b/fs/proc/fd.c
@@ -54,9 +54,10 @@ static int seq_show(struct seq_file *m, void *v)
     if (ret)
         return ret;

-    seq_printf(m, "pos:\t%lli\nflags:\t0%o\nmnt_id:\t%i\n",
+    seq_printf(m, "pos:\t%lli\nflags:\t0%o\nmnt_id:\t%i\nino:\t%lu\n",
            (long long)file->f_pos, f_flags,
-           real_mount(file->f_path.mnt)->mnt_id);
+           real_mount(file->f_path.mnt)->mnt_id,
+           file_inode(file)->i_ino);

     /* show_fd_locks() never deferences files so a stale value is safe */
     show_fd_locks(m, file, files);
-- cgit v1.2.3
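Building on the format above, a sketch of how a userspace accountant might parse one fdinfo stream; parse_dmabuf_fdinfo() is a hypothetical helper, and treating the presence of an 'exp_name' line as "this fd is a DMA buffer" is an assumption based on the documented fields:

    #include <stdio.h>
    #include <string.h>

    /* Pull the 'ino' and 'size' fields out of one /proc/<pid>/fdinfo/<fd>
     * stream. Returns 1 for a dmabuf fd, else 0; callers de-duplicate
     * buffers shared across fds and processes by *ino. */
    static int parse_dmabuf_fdinfo(FILE *f, unsigned long *ino,
                                   unsigned long *size)
    {
        char line[256];
        int is_dmabuf = 0;

        *ino = *size = 0;
        while (fgets(line, sizeof(line), f)) {
            if (sscanf(line, "ino: %lu", ino) == 1 ||
                sscanf(line, "size: %lu", size) == 1)
                continue;
            if (strncmp(line, "exp_name:", 9) == 0)
                is_dmabuf = 1;
        }
        return is_dmabuf;
    }
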
From 1d31aa172a4e6728918a06ee7f1d6bcb7507172c Mon Sep 17 00:00:00 2001
From: Andy Shevchenko
Date: Wed, 30 Jun 2021 18:55:34 -0700
Subject: seq_file: introduce seq_escape_mem()

Introduce seq_escape_mem() to allow users to pass additional parameters
to string_escape_mem().

Link: https://lkml.kernel.org/r/20210504180819.73127-12-andriy.shevchenko@linux.intel.com
Suggested-by: Al Viro
Signed-off-by: Andy Shevchenko
Cc: Chuck Lever
Cc: "J. Bruce Fields"
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 fs/seq_file.c            | 25 +++++++++++++++++++++++++
 include/linux/seq_file.h |  2 ++
 2 files changed, 27 insertions(+)

(limited to 'fs')

diff --git a/fs/seq_file.c b/fs/seq_file.c
index 5059248f2d64..532cac2eae0f 100644
--- a/fs/seq_file.c
+++ b/fs/seq_file.c
@@ -355,6 +355,31 @@ int seq_release(struct inode *inode, struct file *file)
 }
 EXPORT_SYMBOL(seq_release);

+/**
+ * seq_escape_mem - print data into buffer, escaping some characters
+ * @m: target buffer
+ * @src: source buffer
+ * @len: size of source buffer
+ * @flags: flags to pass to string_escape_mem()
+ * @esc: set of characters that need escaping
+ *
+ * Puts data into buffer, replacing each occurrence of character from
+ * given class (defined by @flags and @esc) with printable escaped sequence.
+ *
+ * Use seq_has_overflowed() to check for errors.
+ */
+void seq_escape_mem(struct seq_file *m, const char *src, size_t len,
+            unsigned int flags, const char *esc)
+{
+    char *buf;
+    size_t size = seq_get_buf(m, &buf);
+    int ret;
+
+    ret = string_escape_mem(src, len, buf, size, flags, esc);
+    seq_commit(m, ret < size ? ret : -1);
+}
+EXPORT_SYMBOL(seq_escape_mem);
+
 /**
  * seq_escape - print string into buffer, escaping some characters
  * @m: target buffer
diff --git a/include/linux/seq_file.h b/include/linux/seq_file.h
index 723b1fa1177e..6de442182784 100644
--- a/include/linux/seq_file.h
+++ b/include/linux/seq_file.h
@@ -126,6 +126,8 @@ void seq_put_decimal_ll(struct seq_file *m, const char *delimiter, long long num
 void seq_put_hex_ll(struct seq_file *m, const char *delimiter,
             unsigned long long v, unsigned int width);

+void seq_escape_mem(struct seq_file *m, const char *src, size_t len,
+            unsigned int flags, const char *esc);
 void seq_escape(struct seq_file *m, const char *s, const char *esc);
 void seq_escape_mem_ascii(struct seq_file *m, const char *src, size_t isz);
-- cgit v1.2.3
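A hypothetical in-kernel caller, to show the extra flexibility over seq_escape(); demo_show() and its data are made up:

    #include <linux/seq_file.h>
    #include <linux/string_helpers.h>

    /* Quote untrusted bytes, escaping quote, backslash and tab as octal
     * (\042 and so on). The flags/esc pair is caller-chosen, which is the
     * point of the new API: seq_escape() hard-codes ESCAPE_OCTAL. */
    static int demo_show(struct seq_file *m, void *v)
    {
        static const char raw[] = "tab\there \"quoted\"";

        seq_puts(m, "value: \"");
        seq_escape_mem(m, raw, sizeof(raw) - 1, ESCAPE_OCTAL, "\"\\\t");
        seq_puts(m, "\"\n");
        return 0;
    }
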
ret : -1); +} +EXPORT_SYMBOL(seq_escape_mem); + /** * seq_escape - print string into buffer, escaping some characters * @m: target buffer diff --git a/include/linux/seq_file.h b/include/linux/seq_file.h index 723b1fa1177e..6de442182784 100644 --- a/include/linux/seq_file.h +++ b/include/linux/seq_file.h @@ -126,6 +126,8 @@ void seq_put_decimal_ll(struct seq_file *m, const char *delimiter, long long num void seq_put_hex_ll(struct seq_file *m, const char *delimiter, unsigned long long v, unsigned int width); +void seq_escape_mem(struct seq_file *m, const char *src, size_t len, + unsigned int flags, const char *esc); void seq_escape(struct seq_file *m, const char *s, const char *esc); void seq_escape_mem_ascii(struct seq_file *m, const char *src, size_t isz); -- cgit v1.2.3 From fc3de02eae89a1eb4a964b7b0a05bfb717904700 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Wed, 30 Jun 2021 18:55:40 -0700 Subject: seq_file: convert seq_escape() to use seq_escape_str() Convert seq_escape() to use seq_escape_str() rather than open coding it. Note, for now we leave it as an exported symbol due to some old code that can't tolerate ctype.h being (indirectly) included. Link: https://lkml.kernel.org/r/20210504180819.73127-14-andriy.shevchenko@linux.intel.com Signed-off-by: Andy Shevchenko Cc: Alexander Viro Cc: Chuck Lever Cc: "J. Bruce Fields" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/seq_file.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/seq_file.c b/fs/seq_file.c index 532cac2eae0f..08f54029c2b1 100644 --- a/fs/seq_file.c +++ b/fs/seq_file.c @@ -392,12 +392,7 @@ EXPORT_SYMBOL(seq_escape_mem); */ void seq_escape(struct seq_file *m, const char *s, const char *esc) { - char *buf; - size_t size = seq_get_buf(m, &buf); - int ret; - - ret = string_escape_str(s, buf, size, ESCAPE_OCTAL, esc); - seq_commit(m, ret < size ? ret : -1); + seq_escape_str(m, s, ESCAPE_OCTAL, esc); } EXPORT_SYMBOL(seq_escape); -- cgit v1.2.3 From c0546391c20f01ca98c6fa42c8cd9e247599550a Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Wed, 30 Jun 2021 18:55:43 -0700 Subject: nfsd: avoid non-flexible API in seq_quote_mem() The seq_escape_mem_ascii() is completely non-flexible and shouldn't be used. Replace it with properly called seq_escape_mem(). Link: https://lkml.kernel.org/r/20210504180819.73127-15-andriy.shevchenko@linux.intel.com Signed-off-by: Andy Shevchenko Cc: Alexander Viro Cc: Chuck Lever Cc: "J. Bruce Fields" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/nfsd/nfs4state.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index b517a8794400..cd5eac2ba054 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -2351,7 +2351,7 @@ static struct nfs4_client *get_nfsdfs_clp(struct inode *inode) static void seq_quote_mem(struct seq_file *m, char *data, int len) { seq_printf(m, "\""); - seq_escape_mem_ascii(m, data, len); + seq_escape_mem(m, data, len, ESCAPE_HEX | ESCAPE_NAP | ESCAPE_APPEND, "\"\\"); seq_printf(m, "\""); } -- cgit v1.2.3 From cc72181a65990193f54284417efa01d4580014e6 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Wed, 30 Jun 2021 18:55:46 -0700 Subject: seq_file: drop unused *_escape_mem_ascii() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There are no more users of the seq_escape_mem_ascii() followed by string_escape_mem_ascii(). Remove them for good. 
From cc72181a65990193f54284417efa01d4580014e6 Mon Sep 17 00:00:00 2001
From: Andy Shevchenko
Date: Wed, 30 Jun 2021 18:55:46 -0700
Subject: seq_file: drop unused *_escape_mem_ascii()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

There are no more users of seq_escape_mem_ascii() or of the
string_escape_mem_ascii() it wraps. Remove them for good.

Link: https://lkml.kernel.org/r/20210504180819.73127-16-andriy.shevchenko@linux.intel.com
Signed-off-by: Andy Shevchenko
Cc: Alexander Viro
Cc: Chuck Lever
Cc: "J. Bruce Fields"
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 fs/seq_file.c                  | 11 -----------
 include/linux/seq_file.h       |  1 -
 include/linux/string_helpers.h |  3 ---
 lib/string_helpers.c           | 19 -------------------
 4 files changed, 34 deletions(-)

(limited to 'fs')

diff --git a/fs/seq_file.c b/fs/seq_file.c
index 08f54029c2b1..b117b212ef28 100644
--- a/fs/seq_file.c
+++ b/fs/seq_file.c
@@ -396,17 +396,6 @@ void seq_escape(struct seq_file *m, const char *s, const char *esc)
 }
 EXPORT_SYMBOL(seq_escape);

-void seq_escape_mem_ascii(struct seq_file *m, const char *src, size_t isz)
-{
-    char *buf;
-    size_t size = seq_get_buf(m, &buf);
-    int ret;
-
-    ret = string_escape_mem_ascii(src, isz, buf, size);
-    seq_commit(m, ret < size ? ret : -1);
-}
-EXPORT_SYMBOL(seq_escape_mem_ascii);
-
 void seq_vprintf(struct seq_file *m, const char *f, va_list args)
 {
     int len;
diff --git a/include/linux/seq_file.h b/include/linux/seq_file.h
index 63f021cb1b12..dd99569595fd 100644
--- a/include/linux/seq_file.h
+++ b/include/linux/seq_file.h
@@ -136,7 +136,6 @@ static inline void seq_escape_str(struct seq_file *m, const char *src,
 }

 void seq_escape(struct seq_file *m, const char *s, const char *esc);
-void seq_escape_mem_ascii(struct seq_file *m, const char *src, size_t isz);

 void seq_hex_dump(struct seq_file *m, const char *prefix_str, int prefix_type,
           int rowsize, int groupsize, const void *buf, size_t len,
diff --git a/include/linux/string_helpers.h b/include/linux/string_helpers.h
index 9b0eca2badf2..68189c4a2eb1 100644
--- a/include/linux/string_helpers.h
+++ b/include/linux/string_helpers.h
@@ -63,9 +63,6 @@ static inline int string_unescape_any_inplace(char *buf)
 int string_escape_mem(const char *src, size_t isz, char *dst, size_t osz,
               unsigned int flags, const char *only);

-int string_escape_mem_ascii(const char *src, size_t isz, char *dst,
-                size_t osz);
-
 static inline int string_escape_mem_any_np(const char *src, size_t isz,
         char *dst, size_t osz, const char *only)
 {
diff --git a/lib/string_helpers.c b/lib/string_helpers.c
index c15aea7a82e9..5a35c7e16e96 100644
--- a/lib/string_helpers.c
+++ b/lib/string_helpers.c
@@ -582,25 +582,6 @@ int string_escape_mem(const char *src, size_t isz, char *dst, size_t osz,
 }
 EXPORT_SYMBOL(string_escape_mem);

-int string_escape_mem_ascii(const char *src, size_t isz, char *dst,
-                size_t osz)
-{
-    char *p = dst;
-    char *end = p + osz;
-
-    while (isz--) {
-        unsigned char c = *src++;
-
-        if (!isprint(c) || !isascii(c) || c == '"' || c == '\\')
-            escape_hex(c, &p, end);
-        else
-            escape_passthrough(c, &p, end);
-    }
-
-    return p - dst;
-}
-EXPORT_SYMBOL(string_escape_mem_ascii);
-
 /*
  * Return an allocated string that has been escaped of special characters
  * and double quotes, making it safe to log in quotes.
-- cgit v1.2.3
From f4048e5aa148b13da84132cc23b6503b626e2576 Mon Sep 17 00:00:00 2001
From: Colin Ian King
Date: Wed, 30 Jun 2021 18:56:34 -0700
Subject: nilfs2: remove redundant continue statement in a while-loop

The continue statement at the end of the while-loop is redundant;
remove it.

Addresses-Coverity: ("Continue has no effect")
Link: https://lkml.kernel.org/r/20210621100519.10257-1-colin.king@canonical.com
Link: https://lkml.kernel.org/r/1624557664-17159-1-git-send-email-konishi.ryusuke@gmail.com
Signed-off-by: Colin Ian King
Signed-off-by: Ryusuke Konishi
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 fs/nilfs2/btree.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'fs')

diff --git a/fs/nilfs2/btree.c b/fs/nilfs2/btree.c
index f42ab57201e7..ab9ec073330f 100644
--- a/fs/nilfs2/btree.c
+++ b/fs/nilfs2/btree.c
@@ -738,7 +738,6 @@ static int nilfs_btree_lookup_contig(const struct nilfs_bmap *btree,
             if (ptr2 != ptr + cnt || ++cnt == maxblocks)
                 goto end;
             index++;
-            continue;
         }
         if (level == maxlevel)
             break;
-- cgit v1.2.3

From 7dcae11f4c5862be62443dabe94e10a07b5639fc Mon Sep 17 00:00:00 2001
From: Zhen Lei
Date: Wed, 30 Jun 2021 18:56:37 -0700
Subject: hfsplus: remove unnecessary oom message

Fixes scripts/checkpatch.pl warning:

  WARNING: Possible unnecessary 'out of memory' message

Removing it also saves a bit of memory.

Link: https://lkml.kernel.org/r/20210617084944.1279-1-thunder.leizhen@huawei.com
Signed-off-by: Zhen Lei
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 fs/hfsplus/xattr.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'fs')

diff --git a/fs/hfsplus/xattr.c b/fs/hfsplus/xattr.c
index 4d169c5a2673..e2855ceefd39 100644
--- a/fs/hfsplus/xattr.c
+++ b/fs/hfsplus/xattr.c
@@ -204,7 +204,6 @@ check_attr_tree_state_again:

     buf = kzalloc(node_size, GFP_NOFS);
     if (!buf) {
-        pr_err("failed to allocate memory for header node\n");
         err = -ENOMEM;
         goto end_attr_file_creation;
     }
-- cgit v1.2.3

From c3eb84092b326a353725edcc8274a3782f1d1524 Mon Sep 17 00:00:00 2001
From: Chung-Chiang Cheng
Date: Wed, 30 Jun 2021 18:56:40 -0700
Subject: hfsplus: report create_date to kstat.btime

The create_date field of an hfsplus inode corresponds to kstat.btime and
can be reported in statx.

Link: https://lkml.kernel.org/r/20210416172147.8736-1-cccheng@synology.com
Signed-off-by: Chung-Chiang Cheng
Reviewed-by: Viacheslav Dubeyko
Cc: Christian Brauner
Cc: James Morris
Cc: Jens Axboe
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 fs/hfsplus/inode.c | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'fs')

diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c
index 70e8374ddac4..6fef67c2a9f0 100644
--- a/fs/hfsplus/inode.c
+++ b/fs/hfsplus/inode.c
@@ -281,6 +281,11 @@ int hfsplus_getattr(struct user_namespace *mnt_userns, const struct path *path,
     struct inode *inode = d_inode(path->dentry);
     struct hfsplus_inode_info *hip = HFSPLUS_I(inode);

+    if (request_mask & STATX_BTIME) {
+        stat->result_mask |= STATX_BTIME;
+        stat->btime = hfsp_mt2ut(hip->create_date);
+    }
+
     if (inode->i_flags & S_APPEND)
         stat->attributes |= STATX_ATTR_APPEND;
     if (inode->i_flags & S_IMMUTABLE)
-- cgit v1.2.3
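From userspace, the new birth time is reachable through statx(2); a minimal sketch (the mount path is hypothetical, and glibc 2.28 or newer is assumed for the statx() wrapper):

    #define _GNU_SOURCE
    #include <fcntl.h>
    #include <stdio.h>
    #include <sys/stat.h>

    int main(void)
    {
        struct statx stx;

        if (statx(AT_FDCWD, "/mnt/hfsplus/file", 0, STATX_BTIME, &stx) != 0) {
            perror("statx");
            return 1;
        }
        /* The kernel sets STATX_BTIME in stx_mask only when the
         * filesystem actually filled in the birth time. */
        if (stx.stx_mask & STATX_BTIME)
            printf("btime: %lld\n", (long long)stx.stx_btime.tv_sec);
        else
            puts("no btime reported by this filesystem");
        return 0;
    }
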
From bae7702a17e9a29d90a997c266296b44d7b087f0 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan
Date: Wed, 30 Jun 2021 18:56:46 -0700
Subject: exec: remove checks in __register_binfmt()

Delete the NULL check: all callers pass a valid pointer. Delete the
->load_binary check as well -- failure to provide the hook in a custom
module will be very noticeable at the very first execve call.

Link: https://lkml.kernel.org/r/YK1Gy1qXaLAR+tPl@localhost.localdomain
Signed-off-by: Alexey Dobriyan
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 fs/exec.c | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'fs')

diff --git a/fs/exec.c b/fs/exec.c
index 18594f11c31f..379afcf5d106 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -84,9 +84,6 @@ static DEFINE_RWLOCK(binfmt_lock);

 void __register_binfmt(struct linux_binfmt * fmt, int insert)
 {
-    BUG_ON(!fmt);
-    if (WARN_ON(!fmt->load_binary))
-        return;
     write_lock(&binfmt_lock);
     insert ? list_add(&fmt->lh, &formats) :
          list_add_tail(&fmt->lh, &formats);
-- cgit v1.2.3