From 0871bc0129d403747ea0272a4384895d7ad37a6c Mon Sep 17 00:00:00 2001 From: Huacai Chen Date: Wed, 10 Apr 2024 21:08:51 +0800 Subject: mm: Move lowmem_page_address() a little later LoongArch will override page_to_virt() which use page_address() in the KFENCE case (by defining WANT_PAGE_VIRTUAL/HASHED_PAGE_VIRTUAL). So move lowmem_page_address() a little later to avoid such build errors: error: implicit declaration of function 'page_address'. Acked-by: Andrew Morton Signed-off-by: Huacai Chen --- include/linux/mm.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'include/linux/mm.h') diff --git a/include/linux/mm.h b/include/linux/mm.h index 0436b919f1c7..7b0ee64225de 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2207,11 +2207,6 @@ static inline int arch_make_folio_accessible(struct folio *folio) */ #include -static __always_inline void *lowmem_page_address(const struct page *page) -{ - return page_to_virt(page); -} - #if defined(CONFIG_HIGHMEM) && !defined(WANT_PAGE_VIRTUAL) #define HASHED_PAGE_VIRTUAL #endif @@ -2234,6 +2229,11 @@ void set_page_address(struct page *page, void *virtual); void page_address_init(void); #endif +static __always_inline void *lowmem_page_address(const struct page *page) +{ + return page_to_virt(page); +} + #if !defined(HASHED_PAGE_VIRTUAL) && !defined(WANT_PAGE_VIRTUAL) #define page_address(page) lowmem_page_address(page) #define set_page_address(page, address) do { } while(0) -- cgit v1.2.3 From fd1a745ce03e37945674c14833870a9af0882e2d Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 21 Mar 2024 14:24:42 +0000 Subject: mm: support page_mapcount() on page_has_type() pages Return 0 for pages which can't be mapped. This matches how page_mapped() works. It is more convenient for users to not have to filter out these pages. Link: https://lkml.kernel.org/r/20240321142448.1645400-5-willy@infradead.org Fixes: 9c5ccf2db04b ("mm: remove HUGETLB_PAGE_DTOR") Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: David Hildenbrand Acked-by: Vlastimil Babka Cc: Miaohe Lin Cc: Muchun Song Cc: Oscar Salvador Cc: Signed-off-by: Andrew Morton --- fs/proc/page.c | 7 ++----- include/linux/mm.h | 8 +++++--- include/linux/page-flags.h | 4 ++-- 3 files changed, 9 insertions(+), 10 deletions(-) (limited to 'include/linux/mm.h') diff --git a/fs/proc/page.c b/fs/proc/page.c index 195b077c0fac..9223856c934b 100644 --- a/fs/proc/page.c +++ b/fs/proc/page.c @@ -67,7 +67,7 @@ static ssize_t kpagecount_read(struct file *file, char __user *buf, */ ppage = pfn_to_online_page(pfn); - if (!ppage || PageSlab(ppage) || page_has_type(ppage)) + if (!ppage) pcount = 0; else pcount = page_mapcount(ppage); @@ -124,11 +124,8 @@ u64 stable_page_flags(struct page *page) /* * pseudo flags for the well known (anonymous) memory mapped pages - * - * Note that page->_mapcount is overloaded in SLAB, so the - * simple test in page_mapped() is not enough. */ - if (!PageSlab(page) && page_mapped(page)) + if (page_mapped(page)) u |= 1 << KPF_MMAP; if (PageAnon(page)) u |= 1 << KPF_ANON; diff --git a/include/linux/mm.h b/include/linux/mm.h index 7b0ee64225de..b6bdaa18b9e9 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1223,14 +1223,16 @@ static inline void page_mapcount_reset(struct page *page) * a large folio, it includes the number of times this page is mapped * as part of that folio. * - * The result is undefined for pages which cannot be mapped into userspace. - * For example SLAB or special types of pages. See function page_has_type(). - * They use this field in struct page differently. + * Will report 0 for pages which cannot be mapped into userspace, eg + * slab, page tables and similar. */ static inline int page_mapcount(struct page *page) { int mapcount = atomic_read(&page->_mapcount) + 1; + /* Handle page_has_type() pages */ + if (mapcount < 0) + mapcount = 0; if (unlikely(PageCompound(page))) mapcount += folio_entire_mapcount(page_folio(page)); diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index dc1607f1415e..35a0087d0910 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -971,12 +971,12 @@ static inline bool is_page_hwpoison(struct page *page) * page_type may be used. Because it is initialised to -1, we invert the * sense of the bit, so __SetPageFoo *clears* the bit used for PageFoo, and * __ClearPageFoo *sets* the bit used for PageFoo. We reserve a few high and - * low bits so that an underflow or overflow of page_mapcount() won't be + * low bits so that an underflow or overflow of _mapcount won't be * mistaken for a page type value. */ #define PAGE_TYPE_BASE 0xf0000000 -/* Reserve 0x0000007f to catch underflows of page_mapcount */ +/* Reserve 0x0000007f to catch underflows of _mapcount */ #define PAGE_MAPCOUNT_RESERVE -128 #define PG_buddy 0x00000080 #define PG_offline 0x00000100 -- cgit v1.2.3 From d224eb0287fbd84f4f13eca042c7f08f87138f3b Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Thu, 21 Mar 2024 09:36:56 -0700 Subject: codetag: debug: mark codetags for reserved pages as empty MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit To avoid debug warnings while freeing reserved pages which were not allocated with usual allocators, mark their codetags as empty before freeing. Link: https://lkml.kernel.org/r/20240321163705.3067592-35-surenb@google.com Signed-off-by: Suren Baghdasaryan Reviewed-by: Kees Cook Tested-by: Kees Cook Cc: Alexander Viro Cc: Alex Gaynor Cc: Alice Ryhl Cc: Andreas Hindborg Cc: Benno Lossin Cc: "Björn Roy Baron" Cc: Boqun Feng Cc: Christoph Lameter Cc: Dennis Zhou Cc: Gary Guo Cc: Kent Overstreet Cc: Miguel Ojeda Cc: Pasha Tatashin Cc: Peter Zijlstra Cc: Tejun Heo Cc: Vlastimil Babka Cc: Wedson Almeida Filho Signed-off-by: Andrew Morton --- include/linux/alloc_tag.h | 1 + include/linux/mm.h | 9 +++++++++ include/linux/pgalloc_tag.h | 2 ++ mm/mm_init.c | 12 +++++++++++- 4 files changed, 23 insertions(+), 1 deletion(-) (limited to 'include/linux/mm.h') diff --git a/include/linux/alloc_tag.h b/include/linux/alloc_tag.h index 3af5d6a5ced4..afc9e259a2d3 100644 --- a/include/linux/alloc_tag.h +++ b/include/linux/alloc_tag.h @@ -47,6 +47,7 @@ static inline void set_codetag_empty(union codetag_ref *ref) #else /* CONFIG_MEM_ALLOC_PROFILING_DEBUG */ static inline bool is_codetag_empty(union codetag_ref *ref) { return false; } +static inline void set_codetag_empty(union codetag_ref *ref) {} #endif /* CONFIG_MEM_ALLOC_PROFILING_DEBUG */ diff --git a/include/linux/mm.h b/include/linux/mm.h index b6bdaa18b9e9..0dbd737cfabd 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -5,6 +5,7 @@ #include #include #include +#include #include #include #include @@ -3134,6 +3135,14 @@ extern void reserve_bootmem_region(phys_addr_t start, /* Free the reserved page into the buddy system, so it gets managed. */ static inline void free_reserved_page(struct page *page) { + if (mem_alloc_profiling_enabled()) { + union codetag_ref *ref = get_page_tag_ref(page); + + if (ref) { + set_codetag_empty(ref); + put_page_tag_ref(ref); + } + } ClearPageReserved(page); init_page_count(page); __free_page(page); diff --git a/include/linux/pgalloc_tag.h b/include/linux/pgalloc_tag.h index 50d212330bbb..62d8dad74b37 100644 --- a/include/linux/pgalloc_tag.h +++ b/include/linux/pgalloc_tag.h @@ -120,6 +120,8 @@ static inline void pgalloc_tag_sub_pages(struct alloc_tag *tag, unsigned int nr) #else /* CONFIG_MEM_ALLOC_PROFILING */ +static inline union codetag_ref *get_page_tag_ref(struct page *page) { return NULL; } +static inline void put_page_tag_ref(union codetag_ref *ref) {} static inline void pgalloc_tag_add(struct page *page, struct task_struct *task, unsigned int nr) {} static inline void pgalloc_tag_sub(struct page *page, unsigned int nr) {} diff --git a/mm/mm_init.c b/mm/mm_init.c index 2fd9bf044a79..f45c2b32ba82 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -2567,7 +2567,6 @@ void __init set_dma_reserve(unsigned long new_dma_reserve) void __init memblock_free_pages(struct page *page, unsigned long pfn, unsigned int order) { - if (IS_ENABLED(CONFIG_DEFERRED_STRUCT_PAGE_INIT)) { int nid = early_pfn_to_nid(pfn); @@ -2579,6 +2578,17 @@ void __init memblock_free_pages(struct page *page, unsigned long pfn, /* KMSAN will take care of these pages. */ return; } + + /* pages were reserved and not allocated */ + if (mem_alloc_profiling_enabled()) { + union codetag_ref *ref = get_page_tag_ref(page); + + if (ref) { + set_codetag_empty(ref); + put_page_tag_ref(ref); + } + } + __free_pages_core(page, order); } -- cgit v1.2.3 From 2c321f3f70bc284510598f712b702ce8d60c4d14 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Sun, 14 Apr 2024 19:07:31 -0700 Subject: mm: change inlined allocation helpers to account at the call site Main goal of memory allocation profiling patchset is to provide accounting that is cheap enough to run in production. To achieve that we inject counters using codetags at the allocation call sites to account every time allocation is made. This injection allows us to perform accounting efficiently because injected counters are immediately available as opposed to the alternative methods, such as using _RET_IP_, which would require counter lookup and appropriate locking that makes accounting much more expensive. This method requires all allocation functions to inject separate counters at their call sites so that their callers can be individually accounted. Counter injection is implemented by allocation hooks which should wrap all allocation functions. Inlined functions which perform allocations but do not use allocation hooks are directly charged for the allocations they perform. In most cases these functions are just specialized allocation wrappers used from multiple places to allocate objects of a specific type. It would be more useful to do the accounting at their call sites instead. Instrument these helpers to do accounting at the call site. Simple inlined allocation wrappers are converted directly into macros. More complex allocators or allocators with documentation are converted into _noprof versions and allocation hooks are added. This allows memory allocation profiling mechanism to charge allocations to the callers of these functions. Link: https://lkml.kernel.org/r/20240415020731.1152108-1-surenb@google.com Signed-off-by: Suren Baghdasaryan Acked-by: Jan Kara [jbd2] Cc: Anna Schumaker Cc: Arnd Bergmann Cc: Benjamin Tissoires Cc: Christoph Lameter Cc: David Rientjes Cc: David S. Miller Cc: Dennis Zhou Cc: Eric Dumazet Cc: Herbert Xu Cc: Jakub Kicinski Cc: Jakub Sitnicki Cc: Jiri Kosina Cc: Joerg Roedel Cc: Joonsoo Kim Cc: Kent Overstreet Cc: Matthew Wilcox (Oracle) Cc: Paolo Abeni Cc: Pekka Enberg Cc: Tejun Heo Cc: Theodore Ts'o Cc: Trond Myklebust Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton --- drivers/iommu/amd/amd_iommu.h | 5 +++-- fs/nfs/iostat.h | 5 +---- include/acpi/platform/aclinuxex.h | 19 ++++++------------ include/asm-generic/pgalloc.h | 35 ++++++++++++++++++++------------- include/crypto/hash.h | 7 ++++--- include/crypto/internal/acompress.h | 5 +++-- include/crypto/skcipher.h | 7 ++++--- include/linux/bpf.h | 33 ++++++++----------------------- include/linux/bpfptr.h | 5 +++-- include/linux/dma-fence-chain.h | 6 ++---- include/linux/hid_bpf.h | 6 ++---- include/linux/jbd2.h | 12 ++++------- include/linux/mm.h | 5 +++-- include/linux/mm_types.h | 5 +++-- include/linux/percpu.h | 3 +++ include/linux/ptr_ring.h | 28 +++++++++++++++----------- include/linux/skb_array.h | 19 ++++++++++-------- include/linux/skbuff.h | 20 ++++++++----------- include/linux/skmsg.h | 8 +++----- include/linux/slab.h | 5 +++++ include/linux/sockptr.h | 10 ++++++---- include/net/netlabel.h | 16 +++++++++------ include/net/netlink.h | 5 +++-- include/net/request_sock.h | 5 +++-- include/net/tcx.h | 5 +++-- net/sunrpc/auth_gss/auth_gss_internal.h | 6 ++++-- 26 files changed, 142 insertions(+), 143 deletions(-) (limited to 'include/linux/mm.h') diff --git a/drivers/iommu/amd/amd_iommu.h b/drivers/iommu/amd/amd_iommu.h index f482aab420f7..52575ba9a141 100644 --- a/drivers/iommu/amd/amd_iommu.h +++ b/drivers/iommu/amd/amd_iommu.h @@ -134,13 +134,14 @@ static inline int get_pci_sbdf_id(struct pci_dev *pdev) return PCI_SEG_DEVID_TO_SBDF(seg, devid); } -static inline void *alloc_pgtable_page(int nid, gfp_t gfp) +static inline void *alloc_pgtable_page_noprof(int nid, gfp_t gfp) { struct page *page; - page = alloc_pages_node(nid, gfp | __GFP_ZERO, 0); + page = alloc_pages_node_noprof(nid, gfp | __GFP_ZERO, 0); return page ? page_address(page) : NULL; } +#define alloc_pgtable_page(...) alloc_hooks(alloc_pgtable_page_noprof(__VA_ARGS__)) /* * This must be called after device probe completes. During probe diff --git a/fs/nfs/iostat.h b/fs/nfs/iostat.h index 5aa776b5a3e7..b17a9eb9b148 100644 --- a/fs/nfs/iostat.h +++ b/fs/nfs/iostat.h @@ -46,10 +46,7 @@ static inline void nfs_add_stats(const struct inode *inode, nfs_add_server_stats(NFS_SERVER(inode), stat, addend); } -static inline struct nfs_iostats __percpu *nfs_alloc_iostats(void) -{ - return alloc_percpu(struct nfs_iostats); -} +#define nfs_alloc_iostats() alloc_percpu(struct nfs_iostats) static inline void nfs_free_iostats(struct nfs_iostats __percpu *stats) { diff --git a/include/acpi/platform/aclinuxex.h b/include/acpi/platform/aclinuxex.h index 600d4e2641da..62cac266a1c8 100644 --- a/include/acpi/platform/aclinuxex.h +++ b/include/acpi/platform/aclinuxex.h @@ -47,26 +47,19 @@ acpi_status acpi_os_terminate(void); * However, boot has (system_state != SYSTEM_RUNNING) * to quiet __might_sleep() in kmalloc() and resume does not. */ -static inline void *acpi_os_allocate(acpi_size size) -{ - return kmalloc(size, irqs_disabled()? GFP_ATOMIC : GFP_KERNEL); -} +#define acpi_os_allocate(_size) \ + kmalloc(_size, irqs_disabled() ? GFP_ATOMIC : GFP_KERNEL) -static inline void *acpi_os_allocate_zeroed(acpi_size size) -{ - return kzalloc(size, irqs_disabled()? GFP_ATOMIC : GFP_KERNEL); -} +#define acpi_os_allocate_zeroed(_size) \ + kzalloc(_size, irqs_disabled() ? GFP_ATOMIC : GFP_KERNEL) static inline void acpi_os_free(void *memory) { kfree(memory); } -static inline void *acpi_os_acquire_object(acpi_cache_t * cache) -{ - return kmem_cache_zalloc(cache, - irqs_disabled()? GFP_ATOMIC : GFP_KERNEL); -} +#define acpi_os_acquire_object(_cache) \ + kmem_cache_zalloc(_cache, irqs_disabled() ? GFP_ATOMIC : GFP_KERNEL) static inline acpi_thread_id acpi_os_get_thread_id(void) { diff --git a/include/asm-generic/pgalloc.h b/include/asm-generic/pgalloc.h index 879e5f8aa5e9..7c48f5fbf8aa 100644 --- a/include/asm-generic/pgalloc.h +++ b/include/asm-generic/pgalloc.h @@ -16,15 +16,16 @@ * * Return: pointer to the allocated memory or %NULL on error */ -static inline pte_t *__pte_alloc_one_kernel(struct mm_struct *mm) +static inline pte_t *__pte_alloc_one_kernel_noprof(struct mm_struct *mm) { - struct ptdesc *ptdesc = pagetable_alloc(GFP_PGTABLE_KERNEL & + struct ptdesc *ptdesc = pagetable_alloc_noprof(GFP_PGTABLE_KERNEL & ~__GFP_HIGHMEM, 0); if (!ptdesc) return NULL; return ptdesc_address(ptdesc); } +#define __pte_alloc_one_kernel(...) alloc_hooks(__pte_alloc_one_kernel_noprof(__VA_ARGS__)) #ifndef __HAVE_ARCH_PTE_ALLOC_ONE_KERNEL /** @@ -33,10 +34,11 @@ static inline pte_t *__pte_alloc_one_kernel(struct mm_struct *mm) * * Return: pointer to the allocated memory or %NULL on error */ -static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm) +static inline pte_t *pte_alloc_one_kernel_noprof(struct mm_struct *mm) { - return __pte_alloc_one_kernel(mm); + return __pte_alloc_one_kernel_noprof(mm); } +#define pte_alloc_one_kernel(...) alloc_hooks(pte_alloc_one_kernel_noprof(__VA_ARGS__)) #endif /** @@ -61,11 +63,11 @@ static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte) * * Return: `struct page` referencing the ptdesc or %NULL on error */ -static inline pgtable_t __pte_alloc_one(struct mm_struct *mm, gfp_t gfp) +static inline pgtable_t __pte_alloc_one_noprof(struct mm_struct *mm, gfp_t gfp) { struct ptdesc *ptdesc; - ptdesc = pagetable_alloc(gfp, 0); + ptdesc = pagetable_alloc_noprof(gfp, 0); if (!ptdesc) return NULL; if (!pagetable_pte_ctor(ptdesc)) { @@ -75,6 +77,7 @@ static inline pgtable_t __pte_alloc_one(struct mm_struct *mm, gfp_t gfp) return ptdesc_page(ptdesc); } +#define __pte_alloc_one(...) alloc_hooks(__pte_alloc_one_noprof(__VA_ARGS__)) #ifndef __HAVE_ARCH_PTE_ALLOC_ONE /** @@ -85,10 +88,11 @@ static inline pgtable_t __pte_alloc_one(struct mm_struct *mm, gfp_t gfp) * * Return: `struct page` referencing the ptdesc or %NULL on error */ -static inline pgtable_t pte_alloc_one(struct mm_struct *mm) +static inline pgtable_t pte_alloc_one_noprof(struct mm_struct *mm) { - return __pte_alloc_one(mm, GFP_PGTABLE_USER); + return __pte_alloc_one_noprof(mm, GFP_PGTABLE_USER); } +#define pte_alloc_one(...) alloc_hooks(pte_alloc_one_noprof(__VA_ARGS__)) #endif /* @@ -124,14 +128,14 @@ static inline void pte_free(struct mm_struct *mm, struct page *pte_page) * * Return: pointer to the allocated memory or %NULL on error */ -static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr) +static inline pmd_t *pmd_alloc_one_noprof(struct mm_struct *mm, unsigned long addr) { struct ptdesc *ptdesc; gfp_t gfp = GFP_PGTABLE_USER; if (mm == &init_mm) gfp = GFP_PGTABLE_KERNEL; - ptdesc = pagetable_alloc(gfp, 0); + ptdesc = pagetable_alloc_noprof(gfp, 0); if (!ptdesc) return NULL; if (!pagetable_pmd_ctor(ptdesc)) { @@ -140,6 +144,7 @@ static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr) } return ptdesc_address(ptdesc); } +#define pmd_alloc_one(...) alloc_hooks(pmd_alloc_one_noprof(__VA_ARGS__)) #endif #ifndef __HAVE_ARCH_PMD_FREE @@ -157,7 +162,7 @@ static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd) #if CONFIG_PGTABLE_LEVELS > 3 -static inline pud_t *__pud_alloc_one(struct mm_struct *mm, unsigned long addr) +static inline pud_t *__pud_alloc_one_noprof(struct mm_struct *mm, unsigned long addr) { gfp_t gfp = GFP_PGTABLE_USER; struct ptdesc *ptdesc; @@ -166,13 +171,14 @@ static inline pud_t *__pud_alloc_one(struct mm_struct *mm, unsigned long addr) gfp = GFP_PGTABLE_KERNEL; gfp &= ~__GFP_HIGHMEM; - ptdesc = pagetable_alloc(gfp, 0); + ptdesc = pagetable_alloc_noprof(gfp, 0); if (!ptdesc) return NULL; pagetable_pud_ctor(ptdesc); return ptdesc_address(ptdesc); } +#define __pud_alloc_one(...) alloc_hooks(__pud_alloc_one_noprof(__VA_ARGS__)) #ifndef __HAVE_ARCH_PUD_ALLOC_ONE /** @@ -184,10 +190,11 @@ static inline pud_t *__pud_alloc_one(struct mm_struct *mm, unsigned long addr) * * Return: pointer to the allocated memory or %NULL on error */ -static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr) +static inline pud_t *pud_alloc_one_noprof(struct mm_struct *mm, unsigned long addr) { - return __pud_alloc_one(mm, addr); + return __pud_alloc_one_noprof(mm, addr); } +#define pud_alloc_one(...) alloc_hooks(pud_alloc_one_noprof(__VA_ARGS__)) #endif static inline void __pud_free(struct mm_struct *mm, pud_t *pud) diff --git a/include/crypto/hash.h b/include/crypto/hash.h index 5d61f576cfc8..e5181cc9b7c5 100644 --- a/include/crypto/hash.h +++ b/include/crypto/hash.h @@ -578,19 +578,20 @@ static inline void ahash_request_set_tfm(struct ahash_request *req, * * Return: allocated request handle in case of success, or NULL if out of memory */ -static inline struct ahash_request *ahash_request_alloc( +static inline struct ahash_request *ahash_request_alloc_noprof( struct crypto_ahash *tfm, gfp_t gfp) { struct ahash_request *req; - req = kmalloc(sizeof(struct ahash_request) + - crypto_ahash_reqsize(tfm), gfp); + req = kmalloc_noprof(sizeof(struct ahash_request) + + crypto_ahash_reqsize(tfm), gfp); if (likely(req)) ahash_request_set_tfm(req, tfm); return req; } +#define ahash_request_alloc(...) alloc_hooks(ahash_request_alloc_noprof(__VA_ARGS__)) /** * ahash_request_free() - zeroize and free the request data structure diff --git a/include/crypto/internal/acompress.h b/include/crypto/internal/acompress.h index 4ac46bafba9d..2a67793f52ad 100644 --- a/include/crypto/internal/acompress.h +++ b/include/crypto/internal/acompress.h @@ -69,15 +69,16 @@ static inline void acomp_request_complete(struct acomp_req *req, crypto_request_complete(&req->base, err); } -static inline struct acomp_req *__acomp_request_alloc(struct crypto_acomp *tfm) +static inline struct acomp_req *__acomp_request_alloc_noprof(struct crypto_acomp *tfm) { struct acomp_req *req; - req = kzalloc(sizeof(*req) + crypto_acomp_reqsize(tfm), GFP_KERNEL); + req = kzalloc_noprof(sizeof(*req) + crypto_acomp_reqsize(tfm), GFP_KERNEL); if (likely(req)) acomp_request_set_tfm(req, tfm); return req; } +#define __acomp_request_alloc(...) alloc_hooks(__acomp_request_alloc_noprof(__VA_ARGS__)) static inline void __acomp_request_free(struct acomp_req *req) { diff --git a/include/crypto/skcipher.h b/include/crypto/skcipher.h index c8857d7bdb37..6c5330e316b0 100644 --- a/include/crypto/skcipher.h +++ b/include/crypto/skcipher.h @@ -861,19 +861,20 @@ static inline struct skcipher_request *skcipher_request_cast( * * Return: allocated request handle in case of success, or NULL if out of memory */ -static inline struct skcipher_request *skcipher_request_alloc( +static inline struct skcipher_request *skcipher_request_alloc_noprof( struct crypto_skcipher *tfm, gfp_t gfp) { struct skcipher_request *req; - req = kmalloc(sizeof(struct skcipher_request) + - crypto_skcipher_reqsize(tfm), gfp); + req = kmalloc_noprof(sizeof(struct skcipher_request) + + crypto_skcipher_reqsize(tfm), gfp); if (likely(req)) skcipher_request_set_tfm(req, tfm); return req; } +#define skcipher_request_alloc(...) alloc_hooks(skcipher_request_alloc_noprof(__VA_ARGS__)) /** * skcipher_request_free() - zeroize and free request data structure diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 890e152d553e..a86da1f38b3c 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -2244,31 +2244,14 @@ void *bpf_map_kvcalloc(struct bpf_map *map, size_t n, size_t size, void __percpu *bpf_map_alloc_percpu(const struct bpf_map *map, size_t size, size_t align, gfp_t flags); #else -static inline void * -bpf_map_kmalloc_node(const struct bpf_map *map, size_t size, gfp_t flags, - int node) -{ - return kmalloc_node(size, flags, node); -} - -static inline void * -bpf_map_kzalloc(const struct bpf_map *map, size_t size, gfp_t flags) -{ - return kzalloc(size, flags); -} - -static inline void * -bpf_map_kvcalloc(struct bpf_map *map, size_t n, size_t size, gfp_t flags) -{ - return kvcalloc(n, size, flags); -} - -static inline void __percpu * -bpf_map_alloc_percpu(const struct bpf_map *map, size_t size, size_t align, - gfp_t flags) -{ - return __alloc_percpu_gfp(size, align, flags); -} +#define bpf_map_kmalloc_node(_map, _size, _flags, _node) \ + kmalloc_node(_size, _flags, _node) +#define bpf_map_kzalloc(_map, _size, _flags) \ + kzalloc(_size, _flags) +#define bpf_map_kvcalloc(_map, _n, _size, _flags) \ + kvcalloc(_n, _size, _flags) +#define bpf_map_alloc_percpu(_map, _size, _align, _flags) \ + __alloc_percpu_gfp(_size, _align, _flags) #endif static inline int diff --git a/include/linux/bpfptr.h b/include/linux/bpfptr.h index 79b2f78eec1a..1af241525a17 100644 --- a/include/linux/bpfptr.h +++ b/include/linux/bpfptr.h @@ -65,9 +65,9 @@ static inline int copy_to_bpfptr_offset(bpfptr_t dst, size_t offset, return copy_to_sockptr_offset((sockptr_t) dst, offset, src, size); } -static inline void *kvmemdup_bpfptr(bpfptr_t src, size_t len) +static inline void *kvmemdup_bpfptr_noprof(bpfptr_t src, size_t len) { - void *p = kvmalloc(len, GFP_USER | __GFP_NOWARN); + void *p = kvmalloc_noprof(len, GFP_USER | __GFP_NOWARN); if (!p) return ERR_PTR(-ENOMEM); @@ -77,6 +77,7 @@ static inline void *kvmemdup_bpfptr(bpfptr_t src, size_t len) } return p; } +#define kvmemdup_bpfptr(...) alloc_hooks(kvmemdup_bpfptr_noprof(__VA_ARGS__)) static inline long strncpy_from_bpfptr(char *dst, bpfptr_t src, size_t count) { diff --git a/include/linux/dma-fence-chain.h b/include/linux/dma-fence-chain.h index 4bdf0b96da28..ad9e2506c2f4 100644 --- a/include/linux/dma-fence-chain.h +++ b/include/linux/dma-fence-chain.h @@ -86,10 +86,8 @@ dma_fence_chain_contained(struct dma_fence *fence) * * Returns a new struct dma_fence_chain object or NULL on failure. */ -static inline struct dma_fence_chain *dma_fence_chain_alloc(void) -{ - return kmalloc(sizeof(struct dma_fence_chain), GFP_KERNEL); -}; +#define dma_fence_chain_alloc() \ + ((struct dma_fence_chain *)kmalloc(sizeof(struct dma_fence_chain), GFP_KERNEL)) /** * dma_fence_chain_free diff --git a/include/linux/hid_bpf.h b/include/linux/hid_bpf.h index 7118ac28d468..ca70eeb6393e 100644 --- a/include/linux/hid_bpf.h +++ b/include/linux/hid_bpf.h @@ -149,10 +149,8 @@ static inline int hid_bpf_connect_device(struct hid_device *hdev) { return 0; } static inline void hid_bpf_disconnect_device(struct hid_device *hdev) {} static inline void hid_bpf_destroy_device(struct hid_device *hid) {} static inline void hid_bpf_device_init(struct hid_device *hid) {} -static inline u8 *call_hid_bpf_rdesc_fixup(struct hid_device *hdev, u8 *rdesc, unsigned int *size) -{ - return kmemdup(rdesc, *size, GFP_KERNEL); -} +#define call_hid_bpf_rdesc_fixup(_hdev, _rdesc, _size) \ + ((u8 *)kmemdup(_rdesc, *(_size), GFP_KERNEL)) #endif /* CONFIG_HID_BPF */ diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index 971f3e826e15..9512fe332668 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -1586,10 +1586,8 @@ void jbd2_journal_put_journal_head(struct journal_head *jh); */ extern struct kmem_cache *jbd2_handle_cache; -static inline handle_t *jbd2_alloc_handle(gfp_t gfp_flags) -{ - return kmem_cache_zalloc(jbd2_handle_cache, gfp_flags); -} +#define jbd2_alloc_handle(_gfp_flags) \ + ((handle_t *)kmem_cache_zalloc(jbd2_handle_cache, _gfp_flags)) static inline void jbd2_free_handle(handle_t *handle) { @@ -1602,10 +1600,8 @@ static inline void jbd2_free_handle(handle_t *handle) */ extern struct kmem_cache *jbd2_inode_cache; -static inline struct jbd2_inode *jbd2_alloc_inode(gfp_t gfp_flags) -{ - return kmem_cache_alloc(jbd2_inode_cache, gfp_flags); -} +#define jbd2_alloc_inode(_gfp_flags) \ + ((struct jbd2_inode *)kmem_cache_alloc(jbd2_inode_cache, _gfp_flags)) static inline void jbd2_free_inode(struct jbd2_inode *jinode) { diff --git a/include/linux/mm.h b/include/linux/mm.h index 0dbd737cfabd..60eabc6c8e00 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2860,12 +2860,13 @@ static inline bool pagetable_is_reserved(struct ptdesc *pt) * * Return: The ptdesc describing the allocated page tables. */ -static inline struct ptdesc *pagetable_alloc(gfp_t gfp, unsigned int order) +static inline struct ptdesc *pagetable_alloc_noprof(gfp_t gfp, unsigned int order) { - struct page *page = alloc_pages(gfp | __GFP_COMP, order); + struct page *page = alloc_pages_noprof(gfp | __GFP_COMP, order); return page_ptdesc(page); } +#define pagetable_alloc(...) alloc_hooks(pagetable_alloc_noprof(__VA_ARGS__)) /** * pagetable_free - Free pagetables diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 4ae4684d1add..6fef56938a17 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -1170,14 +1170,15 @@ static inline void mm_init_cid(struct mm_struct *mm) cpumask_clear(mm_cidmask(mm)); } -static inline int mm_alloc_cid(struct mm_struct *mm) +static inline int mm_alloc_cid_noprof(struct mm_struct *mm) { - mm->pcpu_cid = alloc_percpu(struct mm_cid); + mm->pcpu_cid = alloc_percpu_noprof(struct mm_cid); if (!mm->pcpu_cid) return -ENOMEM; mm_init_cid(mm); return 0; } +#define mm_alloc_cid(...) alloc_hooks(mm_alloc_cid_noprof(__VA_ARGS__)) static inline void mm_destroy_cid(struct mm_struct *mm) { diff --git a/include/linux/percpu.h b/include/linux/percpu.h index 13a82f11e4fd..03053de557cf 100644 --- a/include/linux/percpu.h +++ b/include/linux/percpu.h @@ -151,6 +151,9 @@ extern size_t pcpu_alloc_size(void __percpu *__pdata); #define alloc_percpu(type) \ (typeof(type) __percpu *)__alloc_percpu(sizeof(type), \ __alignof__(type)) +#define alloc_percpu_noprof(type) \ + ((typeof(type) __percpu *)pcpu_alloc_noprof(sizeof(type), \ + __alignof__(type), false, GFP_KERNEL)) extern void free_percpu(void __percpu *__pdata); diff --git a/include/linux/ptr_ring.h b/include/linux/ptr_ring.h index 808f9d3ee546..fd037c127bb0 100644 --- a/include/linux/ptr_ring.h +++ b/include/linux/ptr_ring.h @@ -464,11 +464,11 @@ static inline int ptr_ring_consume_batched_bh(struct ptr_ring *r, /* Not all gfp_t flags (besides GFP_KERNEL) are allowed. See * documentation for vmalloc for which of them are legal. */ -static inline void **__ptr_ring_init_queue_alloc(unsigned int size, gfp_t gfp) +static inline void **__ptr_ring_init_queue_alloc_noprof(unsigned int size, gfp_t gfp) { if (size > KMALLOC_MAX_SIZE / sizeof(void *)) return NULL; - return kvmalloc_array(size, sizeof(void *), gfp | __GFP_ZERO); + return kvmalloc_array_noprof(size, sizeof(void *), gfp | __GFP_ZERO); } static inline void __ptr_ring_set_size(struct ptr_ring *r, int size) @@ -484,9 +484,9 @@ static inline void __ptr_ring_set_size(struct ptr_ring *r, int size) r->batch = 1; } -static inline int ptr_ring_init(struct ptr_ring *r, int size, gfp_t gfp) +static inline int ptr_ring_init_noprof(struct ptr_ring *r, int size, gfp_t gfp) { - r->queue = __ptr_ring_init_queue_alloc(size, gfp); + r->queue = __ptr_ring_init_queue_alloc_noprof(size, gfp); if (!r->queue) return -ENOMEM; @@ -497,6 +497,7 @@ static inline int ptr_ring_init(struct ptr_ring *r, int size, gfp_t gfp) return 0; } +#define ptr_ring_init(...) alloc_hooks(ptr_ring_init_noprof(__VA_ARGS__)) /* * Return entries into ring. Destroy entries that don't fit. @@ -587,11 +588,11 @@ static inline void **__ptr_ring_swap_queue(struct ptr_ring *r, void **queue, * In particular if you consume ring in interrupt or BH context, you must * disable interrupts/BH when doing so. */ -static inline int ptr_ring_resize(struct ptr_ring *r, int size, gfp_t gfp, +static inline int ptr_ring_resize_noprof(struct ptr_ring *r, int size, gfp_t gfp, void (*destroy)(void *)) { unsigned long flags; - void **queue = __ptr_ring_init_queue_alloc(size, gfp); + void **queue = __ptr_ring_init_queue_alloc_noprof(size, gfp); void **old; if (!queue) @@ -609,6 +610,7 @@ static inline int ptr_ring_resize(struct ptr_ring *r, int size, gfp_t gfp, return 0; } +#define ptr_ring_resize(...) alloc_hooks(ptr_ring_resize_noprof(__VA_ARGS__)) /* * Note: producer lock is nested within consumer lock, so if you @@ -616,21 +618,21 @@ static inline int ptr_ring_resize(struct ptr_ring *r, int size, gfp_t gfp, * In particular if you consume ring in interrupt or BH context, you must * disable interrupts/BH when doing so. */ -static inline int ptr_ring_resize_multiple(struct ptr_ring **rings, - unsigned int nrings, - int size, - gfp_t gfp, void (*destroy)(void *)) +static inline int ptr_ring_resize_multiple_noprof(struct ptr_ring **rings, + unsigned int nrings, + int size, + gfp_t gfp, void (*destroy)(void *)) { unsigned long flags; void ***queues; int i; - queues = kmalloc_array(nrings, sizeof(*queues), gfp); + queues = kmalloc_array_noprof(nrings, sizeof(*queues), gfp); if (!queues) goto noqueues; for (i = 0; i < nrings; ++i) { - queues[i] = __ptr_ring_init_queue_alloc(size, gfp); + queues[i] = __ptr_ring_init_queue_alloc_noprof(size, gfp); if (!queues[i]) goto nomem; } @@ -660,6 +662,8 @@ nomem: noqueues: return -ENOMEM; } +#define ptr_ring_resize_multiple(...) \ + alloc_hooks(ptr_ring_resize_multiple_noprof(__VA_ARGS__)) static inline void ptr_ring_cleanup(struct ptr_ring *r, void (*destroy)(void *)) { diff --git a/include/linux/skb_array.h b/include/linux/skb_array.h index e2d45b7cb619..926496c9cc9c 100644 --- a/include/linux/skb_array.h +++ b/include/linux/skb_array.h @@ -177,10 +177,11 @@ static inline int skb_array_peek_len_any(struct skb_array *a) return PTR_RING_PEEK_CALL_ANY(&a->ring, __skb_array_len_with_tag); } -static inline int skb_array_init(struct skb_array *a, int size, gfp_t gfp) +static inline int skb_array_init_noprof(struct skb_array *a, int size, gfp_t gfp) { - return ptr_ring_init(&a->ring, size, gfp); + return ptr_ring_init_noprof(&a->ring, size, gfp); } +#define skb_array_init(...) alloc_hooks(skb_array_init_noprof(__VA_ARGS__)) static void __skb_array_destroy_skb(void *ptr) { @@ -198,15 +199,17 @@ static inline int skb_array_resize(struct skb_array *a, int size, gfp_t gfp) return ptr_ring_resize(&a->ring, size, gfp, __skb_array_destroy_skb); } -static inline int skb_array_resize_multiple(struct skb_array **rings, - int nrings, unsigned int size, - gfp_t gfp) +static inline int skb_array_resize_multiple_noprof(struct skb_array **rings, + int nrings, unsigned int size, + gfp_t gfp) { BUILD_BUG_ON(offsetof(struct skb_array, ring)); - return ptr_ring_resize_multiple((struct ptr_ring **)rings, - nrings, size, gfp, - __skb_array_destroy_skb); + return ptr_ring_resize_multiple_noprof((struct ptr_ring **)rings, + nrings, size, gfp, + __skb_array_destroy_skb); } +#define skb_array_resize_multiple(...) \ + alloc_hooks(skb_array_resize_multiple_noprof(__VA_ARGS__)) static inline void skb_array_cleanup(struct skb_array *a) { diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 9d24aec064e8..b72920c9887b 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -3371,7 +3371,7 @@ void __napi_kfree_skb(struct sk_buff *skb, enum skb_drop_reason reason); * * %NULL is returned if there is no free memory. */ -static inline struct page *__dev_alloc_pages(gfp_t gfp_mask, +static inline struct page *__dev_alloc_pages_noprof(gfp_t gfp_mask, unsigned int order) { /* This piece of code contains several assumptions. @@ -3384,13 +3384,11 @@ static inline struct page *__dev_alloc_pages(gfp_t gfp_mask, */ gfp_mask |= __GFP_COMP | __GFP_MEMALLOC; - return alloc_pages_node(NUMA_NO_NODE, gfp_mask, order); + return alloc_pages_node_noprof(NUMA_NO_NODE, gfp_mask, order); } +#define __dev_alloc_pages(...) alloc_hooks(__dev_alloc_pages_noprof(__VA_ARGS__)) -static inline struct page *dev_alloc_pages(unsigned int order) -{ - return __dev_alloc_pages(GFP_ATOMIC | __GFP_NOWARN, order); -} +#define dev_alloc_pages(_order) __dev_alloc_pages(GFP_ATOMIC | __GFP_NOWARN, _order) /** * __dev_alloc_page - allocate a page for network Rx @@ -3400,15 +3398,13 @@ static inline struct page *dev_alloc_pages(unsigned int order) * * %NULL is returned if there is no free memory. */ -static inline struct page *__dev_alloc_page(gfp_t gfp_mask) +static inline struct page *__dev_alloc_page_noprof(gfp_t gfp_mask) { - return __dev_alloc_pages(gfp_mask, 0); + return __dev_alloc_pages_noprof(gfp_mask, 0); } +#define __dev_alloc_page(...) alloc_hooks(__dev_alloc_page_noprof(__VA_ARGS__)) -static inline struct page *dev_alloc_page(void) -{ - return dev_alloc_pages(0); -} +#define dev_alloc_page() dev_alloc_pages(0) /** * dev_page_is_reusable - check whether a page can be reused for network Rx diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h index e65ec3fd2799..78efc5b20284 100644 --- a/include/linux/skmsg.h +++ b/include/linux/skmsg.h @@ -410,11 +410,9 @@ void sk_psock_stop_verdict(struct sock *sk, struct sk_psock *psock); int sk_psock_msg_verdict(struct sock *sk, struct sk_psock *psock, struct sk_msg *msg); -static inline struct sk_psock_link *sk_psock_init_link(void) -{ - return kzalloc(sizeof(struct sk_psock_link), - GFP_ATOMIC | __GFP_NOWARN); -} +#define sk_psock_init_link() \ + ((struct sk_psock_link *)kzalloc(sizeof(struct sk_psock_link), \ + GFP_ATOMIC | __GFP_NOWARN)) static inline void sk_psock_free_link(struct sk_psock_link *link) { diff --git a/include/linux/slab.h b/include/linux/slab.h index c0be1cd03cf6..4cc37ef22aae 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -744,6 +744,9 @@ void *kmalloc_node_track_caller_noprof(size_t size, gfp_t flags, int node, */ #define kmalloc_track_caller(...) kmalloc_node_track_caller(__VA_ARGS__, NUMA_NO_NODE) +#define kmalloc_track_caller_noprof(...) \ + kmalloc_node_track_caller_noprof(__VA_ARGS__, NUMA_NO_NODE, _RET_IP_) + static inline __alloc_size(1, 2) void *kmalloc_array_node_noprof(size_t n, size_t size, gfp_t flags, int node) { @@ -781,6 +784,7 @@ extern void *kvmalloc_node_noprof(size_t size, gfp_t flags, int node) __alloc_si #define kvmalloc_node(...) alloc_hooks(kvmalloc_node_noprof(__VA_ARGS__)) #define kvmalloc(_size, _flags) kvmalloc_node(_size, _flags, NUMA_NO_NODE) +#define kvmalloc_noprof(_size, _flags) kvmalloc_node_noprof(_size, _flags, NUMA_NO_NODE) #define kvzalloc(_size, _flags) kvmalloc(_size, _flags|__GFP_ZERO) #define kvzalloc_node(_size, _flags, _node) kvmalloc_node(_size, _flags|__GFP_ZERO, _node) @@ -797,6 +801,7 @@ static inline __alloc_size(1, 2) void *kvmalloc_array_noprof(size_t n, size_t si #define kvmalloc_array(...) alloc_hooks(kvmalloc_array_noprof(__VA_ARGS__)) #define kvcalloc(_n, _size, _flags) kvmalloc_array(_n, _size, _flags|__GFP_ZERO) +#define kvcalloc_noprof(_n, _size, _flags) kvmalloc_array_noprof(_n, _size, _flags|__GFP_ZERO) extern void *kvrealloc_noprof(const void *p, size_t oldsize, size_t newsize, gfp_t flags) __realloc_size(3); diff --git a/include/linux/sockptr.h b/include/linux/sockptr.h index 317200cd3a60..fc5a206c4043 100644 --- a/include/linux/sockptr.h +++ b/include/linux/sockptr.h @@ -117,9 +117,9 @@ static inline int copy_to_sockptr(sockptr_t dst, const void *src, size_t size) return copy_to_sockptr_offset(dst, 0, src, size); } -static inline void *memdup_sockptr(sockptr_t src, size_t len) +static inline void *memdup_sockptr_noprof(sockptr_t src, size_t len) { - void *p = kmalloc_track_caller(len, GFP_USER | __GFP_NOWARN); + void *p = kmalloc_track_caller_noprof(len, GFP_USER | __GFP_NOWARN); if (!p) return ERR_PTR(-ENOMEM); @@ -129,10 +129,11 @@ static inline void *memdup_sockptr(sockptr_t src, size_t len) } return p; } +#define memdup_sockptr(...) alloc_hooks(memdup_sockptr_noprof(__VA_ARGS__)) -static inline void *memdup_sockptr_nul(sockptr_t src, size_t len) +static inline void *memdup_sockptr_nul_noprof(sockptr_t src, size_t len) { - char *p = kmalloc_track_caller(len + 1, GFP_KERNEL); + char *p = kmalloc_track_caller_noprof(len + 1, GFP_KERNEL); if (!p) return ERR_PTR(-ENOMEM); @@ -143,6 +144,7 @@ static inline void *memdup_sockptr_nul(sockptr_t src, size_t len) p[len] = '\0'; return p; } +#define memdup_sockptr_nul(...) alloc_hooks(memdup_sockptr_nul_noprof(__VA_ARGS__)) static inline long strncpy_from_sockptr(char *dst, sockptr_t src, size_t count) { diff --git a/include/net/netlabel.h b/include/net/netlabel.h index f3ab0b8a4b18..c31bd96dafdb 100644 --- a/include/net/netlabel.h +++ b/include/net/netlabel.h @@ -274,15 +274,17 @@ struct netlbl_calipso_ops { * on success, NULL on failure. * */ -static inline struct netlbl_lsm_cache *netlbl_secattr_cache_alloc(gfp_t flags) +static inline struct netlbl_lsm_cache *netlbl_secattr_cache_alloc_noprof(gfp_t flags) { struct netlbl_lsm_cache *cache; - cache = kzalloc(sizeof(*cache), flags); + cache = kzalloc_noprof(sizeof(*cache), flags); if (cache) refcount_set(&cache->refcount, 1); return cache; } +#define netlbl_secattr_cache_alloc(...) \ + alloc_hooks(netlbl_secattr_cache_alloc_noprof(__VA_ARGS__)) /** * netlbl_secattr_cache_free - Frees a netlbl_lsm_cache struct @@ -311,10 +313,11 @@ static inline void netlbl_secattr_cache_free(struct netlbl_lsm_cache *cache) * on failure. * */ -static inline struct netlbl_lsm_catmap *netlbl_catmap_alloc(gfp_t flags) +static inline struct netlbl_lsm_catmap *netlbl_catmap_alloc_noprof(gfp_t flags) { - return kzalloc(sizeof(struct netlbl_lsm_catmap), flags); + return kzalloc_noprof(sizeof(struct netlbl_lsm_catmap), flags); } +#define netlbl_catmap_alloc(...) alloc_hooks(netlbl_catmap_alloc_noprof(__VA_ARGS__)) /** * netlbl_catmap_free - Free a LSM secattr catmap @@ -376,10 +379,11 @@ static inline void netlbl_secattr_destroy(struct netlbl_lsm_secattr *secattr) * pointer on success, or NULL on failure. * */ -static inline struct netlbl_lsm_secattr *netlbl_secattr_alloc(gfp_t flags) +static inline struct netlbl_lsm_secattr *netlbl_secattr_alloc_noprof(gfp_t flags) { - return kzalloc(sizeof(struct netlbl_lsm_secattr), flags); + return kzalloc_noprof(sizeof(struct netlbl_lsm_secattr), flags); } +#define netlbl_secattr_alloc(...) alloc_hooks(netlbl_secattr_alloc_noprof(__VA_ARGS__)) /** * netlbl_secattr_free - Frees a netlbl_lsm_secattr struct diff --git a/include/net/netlink.h b/include/net/netlink.h index c19ff921b661..972b5484fa6f 100644 --- a/include/net/netlink.h +++ b/include/net/netlink.h @@ -1891,10 +1891,11 @@ static inline struct nla_bitfield32 nla_get_bitfield32(const struct nlattr *nla) * @src: netlink attribute to duplicate from * @gfp: GFP mask */ -static inline void *nla_memdup(const struct nlattr *src, gfp_t gfp) +static inline void *nla_memdup_noprof(const struct nlattr *src, gfp_t gfp) { - return kmemdup(nla_data(src), nla_len(src), gfp); + return kmemdup_noprof(nla_data(src), nla_len(src), gfp); } +#define nla_memdup(...) alloc_hooks(nla_memdup_noprof(__VA_ARGS__)) /** * nla_nest_start_noflag - Start a new level of nested attributes diff --git a/include/net/request_sock.h b/include/net/request_sock.h index 004e651e6067..29495c331d20 100644 --- a/include/net/request_sock.h +++ b/include/net/request_sock.h @@ -127,12 +127,12 @@ static inline struct sock *skb_steal_sock(struct sk_buff *skb, } static inline struct request_sock * -reqsk_alloc(const struct request_sock_ops *ops, struct sock *sk_listener, +reqsk_alloc_noprof(const struct request_sock_ops *ops, struct sock *sk_listener, bool attach_listener) { struct request_sock *req; - req = kmem_cache_alloc(ops->slab, GFP_ATOMIC | __GFP_NOWARN); + req = kmem_cache_alloc_noprof(ops->slab, GFP_ATOMIC | __GFP_NOWARN); if (!req) return NULL; req->rsk_listener = NULL; @@ -157,6 +157,7 @@ reqsk_alloc(const struct request_sock_ops *ops, struct sock *sk_listener, return req; } +#define reqsk_alloc(...) alloc_hooks(reqsk_alloc_noprof(__VA_ARGS__)) static inline void __reqsk_free(struct request_sock *req) { diff --git a/include/net/tcx.h b/include/net/tcx.h index 04be9377785d..72a3e75e539f 100644 --- a/include/net/tcx.h +++ b/include/net/tcx.h @@ -75,9 +75,9 @@ tcx_entry_fetch(struct net_device *dev, bool ingress) return rcu_dereference_rtnl(dev->tcx_egress); } -static inline struct bpf_mprog_entry *tcx_entry_create(void) +static inline struct bpf_mprog_entry *tcx_entry_create_noprof(void) { - struct tcx_entry *tcx = kzalloc(sizeof(*tcx), GFP_KERNEL); + struct tcx_entry *tcx = kzalloc_noprof(sizeof(*tcx), GFP_KERNEL); if (tcx) { bpf_mprog_bundle_init(&tcx->bundle); @@ -85,6 +85,7 @@ static inline struct bpf_mprog_entry *tcx_entry_create(void) } return NULL; } +#define tcx_entry_create(...) alloc_hooks(tcx_entry_create_noprof(__VA_ARGS__)) static inline void tcx_entry_free(struct bpf_mprog_entry *entry) { diff --git a/net/sunrpc/auth_gss/auth_gss_internal.h b/net/sunrpc/auth_gss/auth_gss_internal.h index c53b329092d4..4ebc1b7043d9 100644 --- a/net/sunrpc/auth_gss/auth_gss_internal.h +++ b/net/sunrpc/auth_gss/auth_gss_internal.h @@ -23,7 +23,7 @@ simple_get_bytes(const void *p, const void *end, void *res, size_t len) } static inline const void * -simple_get_netobj(const void *p, const void *end, struct xdr_netobj *dest) +simple_get_netobj_noprof(const void *p, const void *end, struct xdr_netobj *dest) { const void *q; unsigned int len; @@ -35,7 +35,7 @@ simple_get_netobj(const void *p, const void *end, struct xdr_netobj *dest) if (unlikely(q > end || q < p)) return ERR_PTR(-EFAULT); if (len) { - dest->data = kmemdup(p, len, GFP_KERNEL); + dest->data = kmemdup_noprof(p, len, GFP_KERNEL); if (unlikely(dest->data == NULL)) return ERR_PTR(-ENOMEM); } else @@ -43,3 +43,5 @@ simple_get_netobj(const void *p, const void *end, struct xdr_netobj *dest) dest->len = len; return q; } + +#define simple_get_netobj(...) alloc_hooks(simple_get_netobj_noprof(__VA_ARGS__)) -- cgit v1.2.3 From e0932b6c1f942fa747258e152cdce0d0b2b5be5c Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 20 Mar 2024 14:02:15 -0400 Subject: mm: page_alloc: consolidate free page accounting Free page accounting currently happens a bit too high up the call stack, where it has to deal with guard pages, compaction capturing, block stealing and even page isolation. This is subtle and fragile, and makes it difficult to hack on the code. Now that type violations on the freelists have been fixed, push the accounting down to where pages enter and leave the freelist. [hannes@cmpxchg.org: undo unrelated drive-by line wrap] Link: https://lkml.kernel.org/r/20240327185736.GA7597@cmpxchg.org [hannes@cmpxchg.org: remove unused page parameter from account_freepages()] Link: https://lkml.kernel.org/r/20240327185831.GB7597@cmpxchg.org [baolin.wang@linux.alibaba.com: fix free page accounting] Link: https://lkml.kernel.org/r/a2a48baca69f103aa431fd201f8a06e3b95e203d.1712648441.git.baolin.wang@linux.alibaba.com [andriy.shevchenko@linux.intel.com: avoid defining unused function] Link: https://lkml.kernel.org/r/20240423161506.2637177-1-andriy.shevchenko@linux.intel.com Link: https://lkml.kernel.org/r/20240320180429.678181-11-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Signed-off-by: Andy Shevchenko Signed-off-by: Baolin Wang Reviewed-by: Vlastimil Babka Tested-by: Baolin Wang Cc: David Hildenbrand Cc: "Huang, Ying" Cc: Mel Gorman Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/mm.h | 18 +++-- include/linux/vmstat.h | 8 --- mm/debug_page_alloc.c | 12 +--- mm/internal.h | 5 -- mm/page_alloc.c | 192 +++++++++++++++++++++++++++---------------------- 5 files changed, 118 insertions(+), 117 deletions(-) (limited to 'include/linux/mm.h') diff --git a/include/linux/mm.h b/include/linux/mm.h index 60eabc6c8e00..1588fe15a38e 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3797,24 +3797,22 @@ static inline bool page_is_guard(struct page *page) return PageGuard(page); } -bool __set_page_guard(struct zone *zone, struct page *page, unsigned int order, - int migratetype); +bool __set_page_guard(struct zone *zone, struct page *page, unsigned int order); static inline bool set_page_guard(struct zone *zone, struct page *page, - unsigned int order, int migratetype) + unsigned int order) { if (!debug_guardpage_enabled()) return false; - return __set_page_guard(zone, page, order, migratetype); + return __set_page_guard(zone, page, order); } -void __clear_page_guard(struct zone *zone, struct page *page, unsigned int order, - int migratetype); +void __clear_page_guard(struct zone *zone, struct page *page, unsigned int order); static inline void clear_page_guard(struct zone *zone, struct page *page, - unsigned int order, int migratetype) + unsigned int order) { if (!debug_guardpage_enabled()) return; - __clear_page_guard(zone, page, order, migratetype); + __clear_page_guard(zone, page, order); } #else /* CONFIG_DEBUG_PAGEALLOC */ @@ -3824,9 +3822,9 @@ static inline unsigned int debug_guardpage_minorder(void) { return 0; } static inline bool debug_guardpage_enabled(void) { return false; } static inline bool page_is_guard(struct page *page) { return false; } static inline bool set_page_guard(struct zone *zone, struct page *page, - unsigned int order, int migratetype) { return false; } + unsigned int order) { return false; } static inline void clear_page_guard(struct zone *zone, struct page *page, - unsigned int order, int migratetype) {} + unsigned int order) {} #endif /* CONFIG_DEBUG_PAGEALLOC */ #ifdef __HAVE_ARCH_GATE_AREA diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index 343906a98d6e..735eae6e272c 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h @@ -487,14 +487,6 @@ static inline void node_stat_sub_folio(struct folio *folio, mod_node_page_state(folio_pgdat(folio), item, -folio_nr_pages(folio)); } -static inline void __mod_zone_freepage_state(struct zone *zone, int nr_pages, - int migratetype) -{ - __mod_zone_page_state(zone, NR_FREE_PAGES, nr_pages); - if (is_migrate_cma(migratetype)) - __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, nr_pages); -} - extern const char * const vmstat_text[]; static inline const char *zone_stat_name(enum zone_stat_item item) diff --git a/mm/debug_page_alloc.c b/mm/debug_page_alloc.c index 6755f0c9d4a3..d46acf989dde 100644 --- a/mm/debug_page_alloc.c +++ b/mm/debug_page_alloc.c @@ -32,8 +32,7 @@ static int __init debug_guardpage_minorder_setup(char *buf) } early_param("debug_guardpage_minorder", debug_guardpage_minorder_setup); -bool __set_page_guard(struct zone *zone, struct page *page, unsigned int order, - int migratetype) +bool __set_page_guard(struct zone *zone, struct page *page, unsigned int order) { if (order >= debug_guardpage_minorder()) return false; @@ -41,19 +40,12 @@ bool __set_page_guard(struct zone *zone, struct page *page, unsigned int order, __SetPageGuard(page); INIT_LIST_HEAD(&page->buddy_list); set_page_private(page, order); - /* Guard pages are not available for any usage */ - if (!is_migrate_isolate(migratetype)) - __mod_zone_freepage_state(zone, -(1 << order), migratetype); return true; } -void __clear_page_guard(struct zone *zone, struct page *page, unsigned int order, - int migratetype) +void __clear_page_guard(struct zone *zone, struct page *page, unsigned int order) { __ClearPageGuard(page); - set_page_private(page, 0); - if (!is_migrate_isolate(migratetype)) - __mod_zone_freepage_state(zone, (1 << order), migratetype); } diff --git a/mm/internal.h b/mm/internal.h index cbc40f07537c..fb219e31f0f0 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -1039,11 +1039,6 @@ static inline bool is_migrate_highatomic(enum migratetype migratetype) return migratetype == MIGRATE_HIGHATOMIC; } -static inline bool is_migrate_highatomic_page(struct page *page) -{ - return get_pageblock_migratetype(page) == MIGRATE_HIGHATOMIC; -} - void setup_zone_pageset(struct zone *zone); struct migration_target_control { diff --git a/mm/page_alloc.c b/mm/page_alloc.c index fe7853eb3ea0..b3b98a91d6a7 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -643,23 +643,33 @@ compaction_capture(struct capture_control *capc, struct page *page, } #endif /* CONFIG_COMPACTION */ -/* Used for pages not on another list */ -static inline void add_to_free_list(struct page *page, struct zone *zone, - unsigned int order, int migratetype) +static inline void account_freepages(struct zone *zone, int nr_pages, + int migratetype) { - struct free_area *area = &zone->free_area[order]; + if (is_migrate_isolate(migratetype)) + return; - list_add(&page->buddy_list, &area->free_list[migratetype]); - area->nr_free++; + __mod_zone_page_state(zone, NR_FREE_PAGES, nr_pages); + + if (is_migrate_cma(migratetype)) + __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, nr_pages); } /* Used for pages not on another list */ -static inline void add_to_free_list_tail(struct page *page, struct zone *zone, - unsigned int order, int migratetype) +static inline void __add_to_free_list(struct page *page, struct zone *zone, + unsigned int order, int migratetype, + bool tail) { struct free_area *area = &zone->free_area[order]; - list_add_tail(&page->buddy_list, &area->free_list[migratetype]); + VM_WARN_ONCE(get_pageblock_migratetype(page) != migratetype, + "page type is %lu, passed migratetype is %d (nr=%d)\n", + get_pageblock_migratetype(page), migratetype, 1 << order); + + if (tail) + list_add_tail(&page->buddy_list, &area->free_list[migratetype]); + else + list_add(&page->buddy_list, &area->free_list[migratetype]); area->nr_free++; } @@ -669,16 +679,28 @@ static inline void add_to_free_list_tail(struct page *page, struct zone *zone, * allocation again (e.g., optimization for memory onlining). */ static inline void move_to_free_list(struct page *page, struct zone *zone, - unsigned int order, int migratetype) + unsigned int order, int old_mt, int new_mt) { struct free_area *area = &zone->free_area[order]; - list_move_tail(&page->buddy_list, &area->free_list[migratetype]); + /* Free page moving can fail, so it happens before the type update */ + VM_WARN_ONCE(get_pageblock_migratetype(page) != old_mt, + "page type is %lu, passed migratetype is %d (nr=%d)\n", + get_pageblock_migratetype(page), old_mt, 1 << order); + + list_move_tail(&page->buddy_list, &area->free_list[new_mt]); + + account_freepages(zone, -(1 << order), old_mt); + account_freepages(zone, 1 << order, new_mt); } -static inline void del_page_from_free_list(struct page *page, struct zone *zone, - unsigned int order) +static inline void __del_page_from_free_list(struct page *page, struct zone *zone, + unsigned int order, int migratetype) { + VM_WARN_ONCE(get_pageblock_migratetype(page) != migratetype, + "page type is %lu, passed migratetype is %d (nr=%d)\n", + get_pageblock_migratetype(page), migratetype, 1 << order); + /* clear reported state and update reported page count */ if (page_reported(page)) __ClearPageReported(page); @@ -689,6 +711,13 @@ static inline void del_page_from_free_list(struct page *page, struct zone *zone, zone->free_area[order].nr_free--; } +static inline void del_page_from_free_list(struct page *page, struct zone *zone, + unsigned int order, int migratetype) +{ + __del_page_from_free_list(page, zone, order, migratetype); + account_freepages(zone, -(1 << order), migratetype); +} + static inline struct page *get_page_from_free_area(struct free_area *area, int migratetype) { @@ -760,16 +789,16 @@ static inline void __free_one_page(struct page *page, VM_BUG_ON_PAGE(page->flags & PAGE_FLAGS_CHECK_AT_PREP, page); VM_BUG_ON(migratetype == -1); - if (likely(!is_migrate_isolate(migratetype))) - __mod_zone_freepage_state(zone, 1 << order, migratetype); - VM_BUG_ON_PAGE(pfn & ((1 << order) - 1), page); VM_BUG_ON_PAGE(bad_range(zone, page), page); + account_freepages(zone, 1 << order, migratetype); + while (order < MAX_PAGE_ORDER) { + int buddy_mt = migratetype; + if (compaction_capture(capc, page, order, migratetype)) { - __mod_zone_freepage_state(zone, -(1 << order), - migratetype); + account_freepages(zone, -(1 << order), migratetype); return; } @@ -784,19 +813,12 @@ static inline void __free_one_page(struct page *page, * pageblock isolation could cause incorrect freepage or CMA * accounting or HIGHATOMIC accounting. */ - int buddy_mt = get_pfnblock_migratetype(buddy, buddy_pfn); + buddy_mt = get_pfnblock_migratetype(buddy, buddy_pfn); - if (migratetype != buddy_mt) { - if (!migratetype_is_mergeable(migratetype) || - !migratetype_is_mergeable(buddy_mt)) - goto done_merging; - /* - * Match buddy type. This ensures that - * an expand() down the line puts the - * sub-blocks on the right freelists. - */ - set_pageblock_migratetype(buddy, migratetype); - } + if (migratetype != buddy_mt && + (!migratetype_is_mergeable(migratetype) || + !migratetype_is_mergeable(buddy_mt))) + goto done_merging; } /* @@ -804,9 +826,19 @@ static inline void __free_one_page(struct page *page, * merge with it and move up one order. */ if (page_is_guard(buddy)) - clear_page_guard(zone, buddy, order, migratetype); + clear_page_guard(zone, buddy, order); else - del_page_from_free_list(buddy, zone, order); + __del_page_from_free_list(buddy, zone, order, buddy_mt); + + if (unlikely(buddy_mt != migratetype)) { + /* + * Match buddy type. This ensures that an + * expand() down the line puts the sub-blocks + * on the right freelists. + */ + set_pageblock_migratetype(buddy, migratetype); + } + combined_pfn = buddy_pfn & pfn; page = page + (combined_pfn - pfn); pfn = combined_pfn; @@ -823,10 +855,7 @@ done_merging: else to_tail = buddy_merge_likely(pfn, buddy_pfn, page, order); - if (to_tail) - add_to_free_list_tail(page, zone, order, migratetype); - else - add_to_free_list(page, zone, order, migratetype); + __add_to_free_list(page, zone, order, migratetype, to_tail); /* Notify page reporting subsystem of freed page */ if (!(fpi_flags & FPI_SKIP_REPORT_NOTIFY)) @@ -1318,10 +1347,10 @@ static inline void expand(struct zone *zone, struct page *page, * Corresponding page table entries will not be touched, * pages will stay not present in virtual address space */ - if (set_page_guard(zone, &page[size], high, migratetype)) + if (set_page_guard(zone, &page[size], high)) continue; - add_to_free_list(&page[size], zone, high, migratetype); + add_to_free_list(&page[size], zone, high, migratetype, false); set_buddy_order(&page[size], high); } } @@ -1492,7 +1521,7 @@ struct page *__rmqueue_smallest(struct zone *zone, unsigned int order, page = get_page_from_free_area(area, migratetype); if (!page) continue; - del_page_from_free_list(page, zone, current_order); + del_page_from_free_list(page, zone, current_order, migratetype); expand(zone, page, order, current_order, migratetype); trace_mm_page_alloc_zone_locked(page, order, migratetype, pcp_allowed_order(order) && @@ -1532,7 +1561,7 @@ static inline struct page *__rmqueue_cma_fallback(struct zone *zone, * type's freelist. */ static int move_freepages(struct zone *zone, unsigned long start_pfn, - unsigned long end_pfn, int migratetype) + unsigned long end_pfn, int old_mt, int new_mt) { struct page *page; unsigned long pfn; @@ -1554,12 +1583,14 @@ static int move_freepages(struct zone *zone, unsigned long start_pfn, VM_BUG_ON_PAGE(page_zone(page) != zone, page); order = buddy_order(page); - move_to_free_list(page, zone, order, migratetype); + + move_to_free_list(page, zone, order, old_mt, new_mt); + pfn += 1 << order; pages_moved += 1 << order; } - set_pageblock_migratetype(pfn_to_page(start_pfn), migratetype); + set_pageblock_migratetype(pfn_to_page(start_pfn), new_mt); return pages_moved; } @@ -1617,7 +1648,7 @@ static bool prep_move_freepages_block(struct zone *zone, struct page *page, } static int move_freepages_block(struct zone *zone, struct page *page, - int migratetype) + int old_mt, int new_mt) { unsigned long start_pfn, end_pfn; @@ -1625,7 +1656,7 @@ static int move_freepages_block(struct zone *zone, struct page *page, NULL, NULL)) return -1; - return move_freepages(zone, start_pfn, end_pfn, migratetype); + return move_freepages(zone, start_pfn, end_pfn, old_mt, new_mt); } #ifdef CONFIG_MEMORY_ISOLATION @@ -1697,7 +1728,6 @@ bool move_freepages_block_isolate(struct zone *zone, struct page *page, int migratetype) { unsigned long start_pfn, end_pfn, pfn; - int nr_moved, mt; if (!prep_move_freepages_block(zone, page, &start_pfn, &end_pfn, NULL, NULL)) @@ -1712,11 +1742,9 @@ bool move_freepages_block_isolate(struct zone *zone, struct page *page, if (pfn != start_pfn) { struct page *buddy = pfn_to_page(pfn); int order = buddy_order(buddy); - int mt = get_pfnblock_migratetype(buddy, pfn); - if (!is_migrate_isolate(mt)) - __mod_zone_freepage_state(zone, -(1UL << order), mt); - del_page_from_free_list(buddy, zone, order); + del_page_from_free_list(buddy, zone, order, + get_pfnblock_migratetype(buddy, pfn)); set_pageblock_migratetype(page, migratetype); split_large_buddy(zone, buddy, pfn, order); return true; @@ -1724,23 +1752,17 @@ bool move_freepages_block_isolate(struct zone *zone, struct page *page, /* We're the starting block of a larger buddy */ if (PageBuddy(page) && buddy_order(page) > pageblock_order) { - int mt = get_pfnblock_migratetype(page, pfn); int order = buddy_order(page); - if (!is_migrate_isolate(mt)) - __mod_zone_freepage_state(zone, -(1UL << order), mt); - del_page_from_free_list(page, zone, order); + del_page_from_free_list(page, zone, order, + get_pfnblock_migratetype(page, pfn)); set_pageblock_migratetype(page, migratetype); split_large_buddy(zone, page, pfn, order); return true; } move: - mt = get_pfnblock_migratetype(page, start_pfn); - nr_moved = move_freepages(zone, start_pfn, end_pfn, migratetype); - if (!is_migrate_isolate(mt)) - __mod_zone_freepage_state(zone, -nr_moved, mt); - else if (!is_migrate_isolate(migratetype)) - __mod_zone_freepage_state(zone, nr_moved, migratetype); + move_freepages(zone, start_pfn, end_pfn, + get_pfnblock_migratetype(page, start_pfn), migratetype); return true; } #endif /* CONFIG_MEMORY_ISOLATION */ @@ -1854,7 +1876,7 @@ steal_suitable_fallback(struct zone *zone, struct page *page, /* Take ownership for orders >= pageblock_order */ if (current_order >= pageblock_order) { - del_page_from_free_list(page, zone, current_order); + del_page_from_free_list(page, zone, current_order, block_type); change_pageblock_range(page, current_order, start_type); expand(zone, page, order, current_order, start_type); return page; @@ -1904,12 +1926,12 @@ steal_suitable_fallback(struct zone *zone, struct page *page, */ if (free_pages + alike_pages >= (1 << (pageblock_order-1)) || page_group_by_mobility_disabled) { - move_freepages(zone, start_pfn, end_pfn, start_type); + move_freepages(zone, start_pfn, end_pfn, block_type, start_type); return __rmqueue_smallest(zone, order, start_type); } single_page: - del_page_from_free_list(page, zone, current_order); + del_page_from_free_list(page, zone, current_order, block_type); expand(zone, page, order, current_order, block_type); return page; } @@ -1979,7 +2001,7 @@ static void reserve_highatomic_pageblock(struct page *page, struct zone *zone) mt = get_pageblock_migratetype(page); /* Only reserve normal pageblocks (i.e., they can merge with others) */ if (migratetype_is_mergeable(mt)) - if (move_freepages_block(zone, page, + if (move_freepages_block(zone, page, mt, MIGRATE_HIGHATOMIC) != -1) zone->nr_reserved_highatomic += pageblock_nr_pages; @@ -2020,11 +2042,13 @@ static bool unreserve_highatomic_pageblock(const struct alloc_context *ac, spin_lock_irqsave(&zone->lock, flags); for (order = 0; order < NR_PAGE_ORDERS; order++) { struct free_area *area = &(zone->free_area[order]); + int mt; page = get_page_from_free_area(area, MIGRATE_HIGHATOMIC); if (!page) continue; + mt = get_pageblock_migratetype(page); /* * In page freeing path, migratetype change is racy so * we can counter several free pages in a pageblock @@ -2032,7 +2056,7 @@ static bool unreserve_highatomic_pageblock(const struct alloc_context *ac, * from highatomic to ac->migratetype. So we should * adjust the count once. */ - if (is_migrate_highatomic_page(page)) { + if (is_migrate_highatomic(mt)) { /* * It should never happen but changes to * locking could inadvertently allow a per-cpu @@ -2054,7 +2078,8 @@ static bool unreserve_highatomic_pageblock(const struct alloc_context *ac, * of pageblocks that cannot be completely freed * may increase. */ - ret = move_freepages_block(zone, page, ac->migratetype); + ret = move_freepages_block(zone, page, mt, + ac->migratetype); /* * Reserving this block already succeeded, so this should * not fail on zone boundaries. @@ -2225,12 +2250,7 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, * pages are ordered properly. */ list_add_tail(&page->pcp_list, list); - if (is_migrate_cma(get_pageblock_migratetype(page))) - __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, - -(1 << order)); } - - __mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order)); spin_unlock_irqrestore(&zone->lock, flags); return i; @@ -2723,11 +2743,9 @@ int __isolate_free_page(struct page *page, unsigned int order) watermark = zone->_watermark[WMARK_MIN] + (1UL << order); if (!zone_watermark_ok(zone, 0, watermark, 0, ALLOC_CMA)) return 0; - - __mod_zone_freepage_state(zone, -(1UL << order), mt); } - del_page_from_free_list(page, zone, order); + del_page_from_free_list(page, zone, order, mt); /* * Set the pageblock if the isolated page is at least half of a @@ -2742,7 +2760,7 @@ int __isolate_free_page(struct page *page, unsigned int order) * with others) */ if (migratetype_is_mergeable(mt)) - move_freepages_block(zone, page, + move_freepages_block(zone, page, mt, MIGRATE_MOVABLE); } } @@ -2827,8 +2845,6 @@ struct page *rmqueue_buddy(struct zone *preferred_zone, struct zone *zone, return NULL; } } - __mod_zone_freepage_state(zone, -(1 << order), - get_pageblock_migratetype(page)); spin_unlock_irqrestore(&zone->lock, flags); } while (check_new_pages(page, order)); @@ -6716,8 +6732,9 @@ void __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn) BUG_ON(page_count(page)); BUG_ON(!PageBuddy(page)); + VM_WARN_ON(get_pageblock_migratetype(page) != MIGRATE_ISOLATE); order = buddy_order(page); - del_page_from_free_list(page, zone, order); + del_page_from_free_list(page, zone, order, MIGRATE_ISOLATE); pfn += (1 << order); } spin_unlock_irqrestore(&zone->lock, flags); @@ -6745,6 +6762,14 @@ bool is_free_buddy_page(struct page *page) EXPORT_SYMBOL(is_free_buddy_page); #ifdef CONFIG_MEMORY_FAILURE +static inline void add_to_free_list(struct page *page, struct zone *zone, + unsigned int order, int migratetype, + bool tail) +{ + __add_to_free_list(page, zone, order, migratetype, tail); + account_freepages(zone, 1 << order, migratetype); +} + /* * Break down a higher-order page in sub-pages, and keep our target out of * buddy allocator. @@ -6767,10 +6792,10 @@ static void break_down_buddy_pages(struct zone *zone, struct page *page, current_buddy = page + size; } - if (set_page_guard(zone, current_buddy, high, migratetype)) + if (set_page_guard(zone, current_buddy, high)) continue; - add_to_free_list(current_buddy, zone, high, migratetype); + add_to_free_list(current_buddy, zone, high, migratetype, false); set_buddy_order(current_buddy, high); } } @@ -6796,12 +6821,11 @@ bool take_page_off_buddy(struct page *page) int migratetype = get_pfnblock_migratetype(page_head, pfn_head); - del_page_from_free_list(page_head, zone, page_order); + del_page_from_free_list(page_head, zone, page_order, + migratetype); break_down_buddy_pages(zone, page_head, page, 0, page_order, migratetype); SetPageHWPoisonTakenOff(page); - if (!is_migrate_isolate(migratetype)) - __mod_zone_freepage_state(zone, -1, migratetype); ret = true; break; } @@ -6909,7 +6933,7 @@ static bool try_to_accept_memory_one(struct zone *zone) list_del(&page->lru); last = list_empty(&zone->unaccepted_pages); - __mod_zone_freepage_state(zone, -MAX_ORDER_NR_PAGES, MIGRATE_MOVABLE); + account_freepages(zone, -MAX_ORDER_NR_PAGES, MIGRATE_MOVABLE); __mod_zone_page_state(zone, NR_UNACCEPTED, -MAX_ORDER_NR_PAGES); spin_unlock_irqrestore(&zone->lock, flags); @@ -6961,7 +6985,7 @@ static bool __free_unaccepted(struct page *page) spin_lock_irqsave(&zone->lock, flags); first = list_empty(&zone->unaccepted_pages); list_add_tail(&page->lru, &zone->unaccepted_pages); - __mod_zone_freepage_state(zone, MAX_ORDER_NR_PAGES, MIGRATE_MOVABLE); + account_freepages(zone, MAX_ORDER_NR_PAGES, MIGRATE_MOVABLE); __mod_zone_page_state(zone, NR_UNACCEPTED, MAX_ORDER_NR_PAGES); spin_unlock_irqrestore(&zone->lock, flags); -- cgit v1.2.3 From ebb34f78d72c2320620ba6d55cb22a52949047a1 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Tue, 27 Feb 2024 21:15:48 +0100 Subject: mm: convert folio_estimated_sharers() to folio_likely_mapped_shared() Callers of folio_estimated_sharers() only care about "mapped shared vs. mapped exclusively", not the exact estimate of sharers. Let's consolidate and unify the condition users are checking. While at it clarify the semantics and extend the discussion on the fuzziness. Use the "likely mapped shared" terminology to better express what the (adjusted) function actually checks. Whether a partially-mappable folio is more likely to not be partially mapped than partially mapped is debatable. In the future, we might be able to improve our estimate for partially-mappable folios, though. Note that we will now consistently detect "mapped shared" only if the first subpage is actually mapped multiple times. When the first subpage is not mapped, we will consistently detect it as "mapped exclusively". This change should currently only affect the usage in madvise_free_pte_range() and queue_folios_pte_range() for large folios: if the first page was already unmapped, we would have skipped the folio. [david@redhat.com: folio_likely_mapped_shared() kerneldoc fixup] Link: https://lkml.kernel.org/r/dd0ad9f2-2d7a-45f3-9ba3-979488c7dd27@redhat.com Link: https://lkml.kernel.org/r/20240227201548.857831-1-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Khalid Aziz Acked-by: Barry Song Reviewed-by: Vishal Moola (Oracle) Reviewed-by: Ryan Roberts Reviewed-by: Zi Yan Signed-off-by: Andrew Morton --- include/linux/mm.h | 48 ++++++++++++++++++++++++++++++++++++++---------- mm/huge_memory.c | 2 +- mm/madvise.c | 6 +++--- mm/memory.c | 2 +- mm/mempolicy.c | 14 ++++++-------- mm/migrate.c | 8 ++++---- 6 files changed, 53 insertions(+), 27 deletions(-) (limited to 'include/linux/mm.h') diff --git a/include/linux/mm.h b/include/linux/mm.h index 1588fe15a38e..fe66b515aaab 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2165,21 +2165,49 @@ static inline size_t folio_size(struct folio *folio) } /** - * folio_estimated_sharers - Estimate the number of sharers of a folio. + * folio_likely_mapped_shared - Estimate if the folio is mapped into the page + * tables of more than one MM * @folio: The folio. * - * folio_estimated_sharers() aims to serve as a function to efficiently - * estimate the number of processes sharing a folio. This is done by - * looking at the precise mapcount of the first subpage in the folio, and - * assuming the other subpages are the same. This may not be true for large - * folios. If you want exact mapcounts for exact calculations, look at - * page_mapcount() or folio_total_mapcount(). + * This function checks if the folio is currently mapped into more than one + * MM ("mapped shared"), or if the folio is only mapped into a single MM + * ("mapped exclusively"). * - * Return: The estimated number of processes sharing a folio. + * As precise information is not easily available for all folios, this function + * estimates the number of MMs ("sharers") that are currently mapping a folio + * using the number of times the first page of the folio is currently mapped + * into page tables. + * + * For small anonymous folios (except KSM folios) and anonymous hugetlb folios, + * the return value will be exactly correct, because they can only be mapped + * at most once into an MM, and they cannot be partially mapped. + * + * For other folios, the result can be fuzzy: + * #. For partially-mappable large folios (THP), the return value can wrongly + * indicate "mapped exclusively" (false negative) when the folio is + * only partially mapped into at least one MM. + * #. For pagecache folios (including hugetlb), the return value can wrongly + * indicate "mapped shared" (false positive) when two VMAs in the same MM + * cover the same file range. + * #. For (small) KSM folios, the return value can wrongly indicate "mapped + * shared" (false negative), when the folio is mapped multiple times into + * the same MM. + * + * Further, this function only considers current page table mappings that + * are tracked using the folio mapcount(s). + * + * This function does not consider: + * #. If the folio might get mapped in the (near) future (e.g., swapcache, + * pagecache, temporary unmapping for migration). + * #. If the folio is mapped differently (VM_PFNMAP). + * #. If hugetlb page table sharing applies. Callers might want to check + * hugetlb_pmd_shared(). + * + * Return: Whether the folio is estimated to be mapped into more than one MM. */ -static inline int folio_estimated_sharers(struct folio *folio) +static inline bool folio_likely_mapped_shared(struct folio *folio) { - return page_mapcount(folio_page(folio, 0)); + return page_mapcount(folio_page(folio, 0)) > 1; } #ifndef HAVE_ARCH_MAKE_PAGE_ACCESSIBLE diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 16b2c5622fb1..1170fc22ed89 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1821,7 +1821,7 @@ bool madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, * If other processes are mapping this folio, we couldn't discard * the folio unless they all do MADV_FREE so let's skip the folio. */ - if (folio_estimated_sharers(folio) != 1) + if (folio_likely_mapped_shared(folio)) goto out; if (!folio_trylock(folio)) diff --git a/mm/madvise.c b/mm/madvise.c index a2dd70c4a2e6..7625830d6ae9 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -366,7 +366,7 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd, folio = pfn_folio(pmd_pfn(orig_pmd)); /* Do not interfere with other mappings of this folio */ - if (folio_estimated_sharers(folio) != 1) + if (folio_likely_mapped_shared(folio)) goto huge_unlock; if (pageout_anon_only_filter && !folio_test_anon(folio)) @@ -453,7 +453,7 @@ restart: if (folio_test_large(folio)) { int err; - if (folio_estimated_sharers(folio) > 1) + if (folio_likely_mapped_shared(folio)) break; if (pageout_anon_only_filter && !folio_test_anon(folio)) break; @@ -677,7 +677,7 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr, if (folio_test_large(folio)) { int err; - if (folio_estimated_sharers(folio) != 1) + if (folio_likely_mapped_shared(folio)) break; if (!folio_trylock(folio)) break; diff --git a/mm/memory.c b/mm/memory.c index 805cebb6fd72..bcd29104f5ea 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -5112,7 +5112,7 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf) * Flag if the folio is shared between multiple address spaces. This * is later used when determining whether to group tasks together */ - if (folio_estimated_sharers(folio) > 1 && (vma->vm_flags & VM_SHARED)) + if (folio_likely_mapped_shared(folio) && (vma->vm_flags & VM_SHARED)) flags |= TNF_SHARED; nid = folio_nid(folio); diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 72626b2cefa9..913cff5da5a3 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -642,12 +642,11 @@ static int queue_folios_hugetlb(pte_t *pte, unsigned long hmask, * Unless MPOL_MF_MOVE_ALL, we try to avoid migrating a shared folio. * Choosing not to migrate a shared folio is not counted as a failure. * - * To check if the folio is shared, ideally we want to make sure - * every page is mapped to the same process. Doing that is very - * expensive, so check the estimated sharers of the folio instead. + * See folio_likely_mapped_shared() on possible imprecision when we + * cannot easily detect if a folio is shared. */ if ((flags & MPOL_MF_MOVE_ALL) || - (folio_estimated_sharers(folio) == 1 && !hugetlb_pmd_shared(pte))) + (!folio_likely_mapped_shared(folio) && !hugetlb_pmd_shared(pte))) if (!isolate_hugetlb(folio, qp->pagelist)) qp->nr_failed++; unlock: @@ -1032,11 +1031,10 @@ static bool migrate_folio_add(struct folio *folio, struct list_head *foliolist, * Unless MPOL_MF_MOVE_ALL, we try to avoid migrating a shared folio. * Choosing not to migrate a shared folio is not counted as a failure. * - * To check if the folio is shared, ideally we want to make sure - * every page is mapped to the same process. Doing that is very - * expensive, so check the estimated sharers of the folio instead. + * See folio_likely_mapped_shared() on possible imprecision when we + * cannot easily detect if a folio is shared. */ - if ((flags & MPOL_MF_MOVE_ALL) || folio_estimated_sharers(folio) == 1) { + if ((flags & MPOL_MF_MOVE_ALL) || !folio_likely_mapped_shared(folio)) { if (folio_isolate_lru(folio)) { list_add_tail(&folio->lru, foliolist); node_stat_mod_folio(folio, diff --git a/mm/migrate.c b/mm/migrate.c index 63c97bb639d7..a31aa75d223d 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -2593,11 +2593,11 @@ int migrate_misplaced_folio(struct folio *folio, struct vm_area_struct *vma, /* * Don't migrate file folios that are mapped in multiple processes * with execute permissions as they are probably shared libraries. - * To check if the folio is shared, ideally we want to make sure - * every page is mapped to the same process. Doing that is very - * expensive, so check the estimated mapcount of the folio instead. + * + * See folio_likely_mapped_shared() on possible imprecision when we + * cannot easily detect if a folio is shared. */ - if (folio_estimated_sharers(folio) != 1 && folio_is_file_lru(folio) && + if (folio_likely_mapped_shared(folio) && folio_is_file_lru(folio) && (vma->vm_flags & VM_EXEC)) goto out; -- cgit v1.2.3 From 6600a6b10c3210280e7623b8ba7feac9619cc2cd Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Mon, 25 Mar 2024 22:56:42 +0800 Subject: mm/mm_init.c: remove the useless dma_reserve Now nobody calls set_dma_reserve() to set value for dma_reserve, remove set_dma_reserve(), global variable dma_reserve and the codes using it. Link: https://lkml.kernel.org/r/20240325145646.1044760-3-bhe@redhat.com Signed-off-by: Baoquan He Reviewed-by: Mike Rapoport (IBM) Signed-off-by: Andrew Morton --- include/linux/mm.h | 1 - mm/mm_init.c | 23 ----------------------- 2 files changed, 24 deletions(-) (limited to 'include/linux/mm.h') diff --git a/include/linux/mm.h b/include/linux/mm.h index fe66b515aaab..22ae6ab621da 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3250,7 +3250,6 @@ static inline int early_pfn_to_nid(unsigned long pfn) extern int __meminit early_pfn_to_nid(unsigned long pfn); #endif -extern void set_dma_reserve(unsigned long new_dma_reserve); extern void mem_init(void); extern void __init mmap_init(void); diff --git a/mm/mm_init.c b/mm/mm_init.c index f45c2b32ba82..91bb8600c0fa 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -227,7 +227,6 @@ static unsigned long required_movablecore_percent __initdata; static unsigned long nr_kernel_pages __initdata; static unsigned long nr_all_pages __initdata; -static unsigned long dma_reserve __initdata; static bool deferred_struct_pages __meminitdata; @@ -1584,12 +1583,6 @@ static void __init free_area_init_core(struct pglist_data *pgdat) zone_names[j], memmap_pages, freesize); } - /* Account for reserved pages */ - if (j == 0 && freesize > dma_reserve) { - freesize -= dma_reserve; - pr_debug(" %s zone: %lu pages reserved\n", zone_names[0], dma_reserve); - } - if (!is_highmem_idx(j)) nr_kernel_pages += freesize; /* Charge for highmem memmap if there are enough kernel pages */ @@ -2548,22 +2541,6 @@ void *__init alloc_large_system_hash(const char *tablename, return table; } -/** - * set_dma_reserve - set the specified number of pages reserved in the first zone - * @new_dma_reserve: The number of pages to mark reserved - * - * The per-cpu batchsize and zone watermarks are determined by managed_pages. - * In the DMA zone, a significant percentage may be consumed by kernel image - * and other unfreeable allocations which can skew the watermarks badly. This - * function may optionally be used to account for unfreeable pages in the - * first zone (e.g., ZONE_DMA). The effect will be lower watermarks and - * smaller per-cpu batchsize. - */ -void __init set_dma_reserve(unsigned long new_dma_reserve) -{ - dma_reserve = new_dma_reserve; -} - void __init memblock_free_pages(struct page *page, unsigned long pfn, unsigned int order) { -- cgit v1.2.3 From 0b52663f7547520fbf54e2c07f61f8bd1b5cb6eb Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Mon, 25 Mar 2024 22:56:46 +0800 Subject: mm/mm_init.c: remove arch_reserved_kernel_pages() Since the current calculation of calc_nr_kernel_pages() has taken into consideration of kernel reserved memory, no need to have arch_reserved_kernel_pages() any more. Link: https://lkml.kernel.org/r/20240325145646.1044760-7-bhe@redhat.com Signed-off-by: Baoquan He Reviewed-by: Mike Rapoport (IBM) Signed-off-by: Andrew Morton --- arch/powerpc/include/asm/mmu.h | 4 ---- arch/powerpc/kernel/fadump.c | 5 ----- include/linux/mm.h | 3 --- mm/mm_init.c | 12 ------------ 4 files changed, 24 deletions(-) (limited to 'include/linux/mm.h') diff --git a/arch/powerpc/include/asm/mmu.h b/arch/powerpc/include/asm/mmu.h index 3b72c7ed24cf..aa5c0fd5edb1 100644 --- a/arch/powerpc/include/asm/mmu.h +++ b/arch/powerpc/include/asm/mmu.h @@ -406,9 +406,5 @@ extern void *abatron_pteptrs[2]; #include #endif -#if defined(CONFIG_FA_DUMP) || defined(CONFIG_PRESERVE_FA_DUMP) -#define __HAVE_ARCH_RESERVED_KERNEL_PAGES -#endif - #endif /* __KERNEL__ */ #endif /* _ASM_POWERPC_MMU_H_ */ diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c index d14eda1e8589..ae8c7619e597 100644 --- a/arch/powerpc/kernel/fadump.c +++ b/arch/powerpc/kernel/fadump.c @@ -1735,8 +1735,3 @@ static void __init fadump_reserve_crash_area(u64 base) memblock_reserve(mstart, msize); } } - -unsigned long __init arch_reserved_kernel_pages(void) -{ - return memblock_reserved_size() / PAGE_SIZE; -} diff --git a/include/linux/mm.h b/include/linux/mm.h index 22ae6ab621da..dd47ba7a1c04 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3261,9 +3261,6 @@ static inline void show_mem(void) extern long si_mem_available(void); extern void si_meminfo(struct sysinfo * val); extern void si_meminfo_node(struct sysinfo *val, int nid); -#ifdef __HAVE_ARCH_RESERVED_KERNEL_PAGES -extern unsigned long arch_reserved_kernel_pages(void); -#endif extern __printf(3, 4) void warn_alloc(gfp_t gfp_mask, nodemask_t *nodemask, const char *fmt, ...); diff --git a/mm/mm_init.c b/mm/mm_init.c index 8c261572ca6e..0d32bcc301e2 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -2374,17 +2374,6 @@ void __init page_alloc_init_late(void) page_alloc_sysctl_init(); } -#ifndef __HAVE_ARCH_RESERVED_KERNEL_PAGES -/* - * Returns the number of pages that arch has reserved but - * is not known to alloc_large_system_hash(). - */ -static unsigned long __init arch_reserved_kernel_pages(void) -{ - return 0; -} -#endif - /* * Adaptive scale is meant to reduce sizes of hash tables on large memory * machines. As memory size is increased the scale is also increased but at @@ -2427,7 +2416,6 @@ void *__init alloc_large_system_hash(const char *tablename, if (!numentries) { /* round applicable memory size up to nearest megabyte */ numentries = nr_kernel_pages; - numentries -= arch_reserved_kernel_pages(); /* It isn't necessary when PAGE_SIZE >= 1MB */ if (PAGE_SIZE < SZ_1M) -- cgit v1.2.3 From cb10c28ac82c9b7a5e9b3b1dc7157036c20c36dd Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 25 Mar 2024 07:45:41 +0800 Subject: mm: remove follow_pfn Remove follow_pfn now that the last user is gone. Link: https://lkml.kernel.org/r/20240324234542.2038726-3-hch@lst.de Signed-off-by: Christoph Hellwig Reviewed-by: David Hildenbrand Cc: Andy Lutomirski Cc: Dave Hansen Cc: Fei Li Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Nathan Chancellor Signed-off-by: Andrew Morton --- include/linux/mm.h | 2 -- mm/memory.c | 36 ++---------------------------------- mm/nommu.c | 21 --------------------- 3 files changed, 2 insertions(+), 57 deletions(-) (limited to 'include/linux/mm.h') diff --git a/include/linux/mm.h b/include/linux/mm.h index dd47ba7a1c04..cb8e6158b577 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2424,8 +2424,6 @@ int copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma); int follow_pte(struct mm_struct *mm, unsigned long address, pte_t **ptepp, spinlock_t **ptlp); -int follow_pfn(struct vm_area_struct *vma, unsigned long address, - unsigned long *pfn); int follow_phys(struct vm_area_struct *vma, unsigned long address, unsigned int flags, unsigned long *prot, resource_size_t *phys); int generic_access_phys(struct vm_area_struct *vma, unsigned long addr, diff --git a/mm/memory.c b/mm/memory.c index bcd29104f5ea..652370703e41 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -5884,8 +5884,8 @@ int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address) * Only IO mappings and raw PFN mappings are allowed. The mmap semaphore * should be taken for read. * - * KVM uses this function. While it is arguably less bad than ``follow_pfn``, - * it is not a good general-purpose API. + * KVM uses this function. While it is arguably less bad than the historic + * ``follow_pfn``, it is not a good general-purpose API. * * Return: zero on success, -ve otherwise. */ @@ -5927,38 +5927,6 @@ out: } EXPORT_SYMBOL_GPL(follow_pte); -/** - * follow_pfn - look up PFN at a user virtual address - * @vma: memory mapping - * @address: user virtual address - * @pfn: location to store found PFN - * - * Only IO mappings and raw PFN mappings are allowed. - * - * This function does not allow the caller to read the permissions - * of the PTE. Do not use it. - * - * Return: zero and the pfn at @pfn on success, -ve otherwise. - */ -int follow_pfn(struct vm_area_struct *vma, unsigned long address, - unsigned long *pfn) -{ - int ret = -EINVAL; - spinlock_t *ptl; - pte_t *ptep; - - if (!(vma->vm_flags & (VM_IO | VM_PFNMAP))) - return ret; - - ret = follow_pte(vma->vm_mm, address, &ptep, &ptl); - if (ret) - return ret; - *pfn = pte_pfn(ptep_get(ptep)); - pte_unmap_unlock(ptep, ptl); - return 0; -} -EXPORT_SYMBOL(follow_pfn); - #ifdef CONFIG_HAVE_IOREMAP_PROT int follow_phys(struct vm_area_struct *vma, unsigned long address, unsigned int flags, diff --git a/mm/nommu.c b/mm/nommu.c index 88b646ea6d30..331d2f778695 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -110,27 +110,6 @@ unsigned int kobjsize(const void *objp) return page_size(page); } -/** - * follow_pfn - look up PFN at a user virtual address - * @vma: memory mapping - * @address: user virtual address - * @pfn: location to store found PFN - * - * Only IO mappings and raw PFN mappings are allowed. - * - * Returns zero and the pfn at @pfn on success, -ve otherwise. - */ -int follow_pfn(struct vm_area_struct *vma, unsigned long address, - unsigned long *pfn) -{ - if (!(vma->vm_flags & (VM_IO | VM_PFNMAP))) - return -EINVAL; - - *pfn = address >> PAGE_SHIFT; - return 0; -} -EXPORT_SYMBOL(follow_pfn); - void vfree(const void *addr) { kfree(addr); -- cgit v1.2.3 From 5b34b76cb0cd8a21dee5c7677eae98480b0d05cc Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 25 Mar 2024 07:45:42 +0800 Subject: mm: move follow_phys to arch/x86/mm/pat/memtype.c follow_phys is only used by two callers in arch/x86/mm/pat/memtype.c. Move it there and hardcode the two arguments that get the same values passed by both callers. [david@redhat.com: conflict resolutions] Link: https://lkml.kernel.org/r/20240403212131.929421-4-david@redhat.com Link: https://lkml.kernel.org/r/20240324234542.2038726-4-hch@lst.de Signed-off-by: Christoph Hellwig Signed-off-by: David Hildenbrand Reviewed-by: David Hildenbrand Cc: Andy Lutomirski Cc: Dave Hansen Cc: Fei Li Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Nathan Chancellor Signed-off-by: Andrew Morton --- arch/x86/mm/pat/memtype.c | 29 ++++++++++++++++++++++++++++- include/linux/mm.h | 2 -- mm/memory.c | 32 -------------------------------- 3 files changed, 28 insertions(+), 35 deletions(-) (limited to 'include/linux/mm.h') diff --git a/arch/x86/mm/pat/memtype.c b/arch/x86/mm/pat/memtype.c index 36b603d0cdde..d01c3b0bd6eb 100644 --- a/arch/x86/mm/pat/memtype.c +++ b/arch/x86/mm/pat/memtype.c @@ -39,6 +39,7 @@ #include #include #include +#include #include #include @@ -947,6 +948,32 @@ static void free_pfn_range(u64 paddr, unsigned long size) memtype_free(paddr, paddr + size); } +static int follow_phys(struct vm_area_struct *vma, unsigned long *prot, + resource_size_t *phys) +{ + pte_t *ptep, pte; + spinlock_t *ptl; + + if (!(vma->vm_flags & (VM_IO | VM_PFNMAP))) + return -EINVAL; + + if (follow_pte(vma->vm_mm, vma->vm_start, &ptep, &ptl)) + return -EINVAL; + + pte = ptep_get(ptep); + + /* Never return PFNs of anon folios in COW mappings. */ + if (vm_normal_folio(vma, vma->vm_start, pte)) { + pte_unmap_unlock(ptep, ptl); + return -EINVAL; + } + + *prot = pgprot_val(pte_pgprot(pte)); + *phys = (resource_size_t)pte_pfn(pte) << PAGE_SHIFT; + pte_unmap_unlock(ptep, ptl); + return 0; +} + static int get_pat_info(struct vm_area_struct *vma, resource_size_t *paddr, pgprot_t *pgprot) { @@ -964,7 +991,7 @@ static int get_pat_info(struct vm_area_struct *vma, resource_size_t *paddr, * detect the PFN. If we need the cachemode as well, we're out of luck * for now and have to fail fork(). */ - if (!follow_phys(vma, vma->vm_start, 0, &prot, paddr)) { + if (!follow_phys(vma, &prot, paddr)) { if (pgprot) *pgprot = __pgprot(prot); return 0; diff --git a/include/linux/mm.h b/include/linux/mm.h index cb8e6158b577..5dc65618e386 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2424,8 +2424,6 @@ int copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma); int follow_pte(struct mm_struct *mm, unsigned long address, pte_t **ptepp, spinlock_t **ptlp); -int follow_phys(struct vm_area_struct *vma, unsigned long address, - unsigned int flags, unsigned long *prot, resource_size_t *phys); int generic_access_phys(struct vm_area_struct *vma, unsigned long addr, void *buf, int len, int write); diff --git a/mm/memory.c b/mm/memory.c index 652370703e41..62ee4a15092a 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -5928,38 +5928,6 @@ out: EXPORT_SYMBOL_GPL(follow_pte); #ifdef CONFIG_HAVE_IOREMAP_PROT -int follow_phys(struct vm_area_struct *vma, - unsigned long address, unsigned int flags, - unsigned long *prot, resource_size_t *phys) -{ - int ret = -EINVAL; - pte_t *ptep, pte; - spinlock_t *ptl; - - if (!(vma->vm_flags & (VM_IO | VM_PFNMAP))) - goto out; - - if (follow_pte(vma->vm_mm, address, &ptep, &ptl)) - goto out; - pte = ptep_get(ptep); - - /* Never return PFNs of anon folios in COW mappings. */ - if (vm_normal_folio(vma, address, pte)) - goto unlock; - - if ((flags & FOLL_WRITE) && !pte_write(pte)) - goto unlock; - - *prot = pgprot_val(pte_pgprot(pte)); - *phys = (resource_size_t)pte_pfn(pte) << PAGE_SHIFT; - - ret = 0; -unlock: - pte_unmap_unlock(ptep, ptl); -out: - return ret; -} - /** * generic_access_phys - generic implementation for iomem mmap access * @vma: the vma to access -- cgit v1.2.3 From b84fd2835c70e0149e2522fd746d3fb7049a1e19 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 26 Mar 2024 17:10:28 +0000 Subject: mm: make page_mapped() take a const argument None of the functions called by page_mapped() modify the page or folio, so mark them all as const. Link: https://lkml.kernel.org/r/20240326171045.410737-7-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: David Hildenbrand Signed-off-by: Andrew Morton --- include/linux/mm.h | 12 ++++++------ mm/internal.h | 7 ++++--- mm/rmap.c | 2 +- 3 files changed, 11 insertions(+), 10 deletions(-) (limited to 'include/linux/mm.h') diff --git a/include/linux/mm.h b/include/linux/mm.h index 5dc65618e386..9c0e441664d4 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1200,7 +1200,7 @@ static inline int is_vmalloc_or_module_addr(const void *x) * debugging purposes - it does not include PTE-mapped sub-pages; look * at folio_mapcount() or page_mapcount() instead. */ -static inline int folio_entire_mapcount(struct folio *folio) +static inline int folio_entire_mapcount(const struct folio *folio) { VM_BUG_ON_FOLIO(!folio_test_large(folio), folio); return atomic_read(&folio->_entire_mapcount) + 1; @@ -1240,7 +1240,7 @@ static inline int page_mapcount(struct page *page) return mapcount; } -int folio_total_mapcount(struct folio *folio); +int folio_total_mapcount(const struct folio *folio); /** * folio_mapcount() - Calculate the number of mappings of this folio. @@ -1253,14 +1253,14 @@ int folio_total_mapcount(struct folio *folio); * * Return: The number of times this folio is mapped. */ -static inline int folio_mapcount(struct folio *folio) +static inline int folio_mapcount(const struct folio *folio) { if (likely(!folio_test_large(folio))) return atomic_read(&folio->_mapcount) + 1; return folio_total_mapcount(folio); } -static inline bool folio_large_is_mapped(struct folio *folio) +static inline bool folio_large_is_mapped(const struct folio *folio) { /* * Reading _entire_mapcount below could be omitted if hugetlb @@ -1288,7 +1288,7 @@ static inline bool folio_mapped(struct folio *folio) * For compound page it returns true if any sub-page of compound page is mapped, * even if this particular sub-page is not itself mapped by any PTE or PMD. */ -static inline bool page_mapped(struct page *page) +static inline bool page_mapped(const struct page *page) { if (likely(!PageCompound(page))) return atomic_read(&page->_mapcount) >= 0; @@ -2070,7 +2070,7 @@ static inline void set_page_links(struct page *page, enum zone_type zone, * * Return: A positive power of two. */ -static inline long folio_nr_pages(struct folio *folio) +static inline long folio_nr_pages(const struct folio *folio) { if (!folio_test_large(folio)) return 1; diff --git a/mm/internal.h b/mm/internal.h index 5dbfa1c12e89..8e11f7b2da21 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -71,7 +71,7 @@ void page_writeback_init(void); * How many individual pages have an elevated _mapcount. Excludes * the folio's entire_mapcount. */ -static inline int folio_nr_pages_mapped(struct folio *folio) +static inline int folio_nr_pages_mapped(const struct folio *folio) { return atomic_read(&folio->_nr_pages_mapped) & FOLIO_PAGES_MAPPED; } @@ -81,7 +81,8 @@ static inline int folio_nr_pages_mapped(struct folio *folio) * folio. We cannot rely on folio->swap as there is no guarantee that it has * been initialized. Used for calling arch_swap_restore() */ -static inline swp_entry_t folio_swap(swp_entry_t entry, struct folio *folio) +static inline swp_entry_t folio_swap(swp_entry_t entry, + const struct folio *folio) { swp_entry_t swap = { .val = ALIGN_DOWN(entry.val, folio_nr_pages(folio)), @@ -90,7 +91,7 @@ static inline swp_entry_t folio_swap(swp_entry_t entry, struct folio *folio) return swap; } -static inline void *folio_raw_mapping(struct folio *folio) +static inline void *folio_raw_mapping(const struct folio *folio) { unsigned long mapping = (unsigned long)folio->mapping; diff --git a/mm/rmap.c b/mm/rmap.c index 3746a5531018..d52759aa3ff7 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1134,7 +1134,7 @@ int pfn_mkclean_range(unsigned long pfn, unsigned long nr_pages, pgoff_t pgoff, return page_vma_mkclean_one(&pvmw); } -int folio_total_mapcount(struct folio *folio) +int folio_total_mapcount(const struct folio *folio) { int mapcount = folio_entire_mapcount(folio); int nr_pages; -- cgit v1.2.3 From b6dd94596f7fafd162da236b438e31b12d352187 Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Tue, 26 Mar 2024 14:11:29 +0800 Subject: mm: make __absent_pages_in_range() as static It's only called in mm/mm_init.c now. Link: https://lkml.kernel.org/r/20240326061134.1055295-4-bhe@redhat.com Signed-off-by: Baoquan He Reviewed-by: Mike Rapoport (IBM) Cc: Mel Gorman Signed-off-by: Andrew Morton --- include/linux/mm.h | 2 -- mm/mm_init.c | 2 +- 2 files changed, 1 insertion(+), 3 deletions(-) (limited to 'include/linux/mm.h') diff --git a/include/linux/mm.h b/include/linux/mm.h index 9c0e441664d4..00a46a5d5c9d 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3229,8 +3229,6 @@ static inline unsigned long get_num_physpages(void) */ void free_area_init(unsigned long *max_zone_pfn); unsigned long node_map_pfn_alignment(void); -unsigned long __absent_pages_in_range(int nid, unsigned long start_pfn, - unsigned long end_pfn); extern unsigned long absent_pages_in_range(unsigned long start_pfn, unsigned long end_pfn); extern void get_pfn_range_for_nid(unsigned int nid, diff --git a/mm/mm_init.c b/mm/mm_init.c index 0fe466abde43..19c78b54fc8a 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -1144,7 +1144,7 @@ static void __init adjust_zone_range_for_zone_movable(int nid, * Return the number of holes in a range on a node. If nid is MAX_NUMNODES, * then all holes in the requested range will be accounted for. */ -unsigned long __init __absent_pages_in_range(int nid, +static unsigned long __init __absent_pages_in_range(int nid, unsigned long range_start_pfn, unsigned long range_end_pfn) { -- cgit v1.2.3 From 239e9a90c887d33e9339870a650d2bd621b95674 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Wed, 27 Mar 2024 11:23:23 -0400 Subject: mm: introduce vma_pgtable_walk_{begin|end}() Introduce per-vma begin()/end() helpers for pgtable walks. This is a preparation work to merge hugetlb pgtable walkers with generic mm. The helpers need to be called before and after a pgtable walk, will start to be needed if the pgtable walker code supports hugetlb pages. It's a hook point for any type of VMA, but for now only hugetlb uses it to stablize the pgtable pages from getting away (due to possible pmd unsharing). Link: https://lkml.kernel.org/r/20240327152332.950956-5-peterx@redhat.com Signed-off-by: Peter Xu Reviewed-by: Christoph Hellwig Reviewed-by: Muchun Song Tested-by: Ryan Roberts Cc: Andrea Arcangeli Cc: Andrew Jones Cc: Aneesh Kumar K.V (IBM) Cc: Axel Rasmussen Cc: Christophe Leroy Cc: David Hildenbrand Cc: James Houghton Cc: Jason Gunthorpe Cc: John Hubbard Cc: Kirill A. Shutemov Cc: Lorenzo Stoakes Cc: Matthew Wilcox (Oracle) Cc: Michael Ellerman Cc: "Mike Rapoport (IBM)" Cc: Rik van Riel Cc: Vlastimil Babka Cc: Yang Shi Signed-off-by: Andrew Morton --- include/linux/mm.h | 3 +++ mm/memory.c | 12 ++++++++++++ 2 files changed, 15 insertions(+) (limited to 'include/linux/mm.h') diff --git a/include/linux/mm.h b/include/linux/mm.h index 00a46a5d5c9d..3dda0dbcae14 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -4230,4 +4230,7 @@ static inline bool pfn_is_unaccepted_memory(unsigned long pfn) return range_contains_unaccepted_memory(paddr, paddr + PAGE_SIZE); } +void vma_pgtable_walk_begin(struct vm_area_struct *vma); +void vma_pgtable_walk_end(struct vm_area_struct *vma); + #endif /* _LINUX_MM_H */ diff --git a/mm/memory.c b/mm/memory.c index 62ee4a15092a..0b92336bcebd 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -6378,3 +6378,15 @@ void ptlock_free(struct ptdesc *ptdesc) kmem_cache_free(page_ptl_cachep, ptdesc->ptl); } #endif + +void vma_pgtable_walk_begin(struct vm_area_struct *vma) +{ + if (is_vm_hugetlb_page(vma)) + hugetlb_vma_lock_read(vma); +} + +void vma_pgtable_walk_end(struct vm_area_struct *vma) +{ + if (is_vm_hugetlb_page(vma)) + hugetlb_vma_unlock_read(vma); +} -- cgit v1.2.3 From 8a0fe564bb2e744f21ca94e3eabe96d232cf5f41 Mon Sep 17 00:00:00 2001 From: Rick Edgecombe Date: Mon, 25 Mar 2024 19:16:47 -0700 Subject: mm: use get_unmapped_area_vmflags() When memory is being placed, mmap() will take care to respect the guard gaps of certain types of memory (VM_SHADOWSTACK, VM_GROWSUP and VM_GROWSDOWN). In order to ensure guard gaps between mappings, mmap() needs to consider two things: 1. That the new mapping isn't placed in an any existing mappings guard gaps. 2. That the new mapping isn't placed such that any existing mappings are not in *its* guard gaps. The long standing behavior of mmap() is to ensure 1, but not take any care around 2. So for example, if there is a PAGE_SIZE free area, and a mmap() with a PAGE_SIZE size, and a type that has a guard gap is being placed, mmap() may place the shadow stack in the PAGE_SIZE free area. Then the mapping that is supposed to have a guard gap will not have a gap to the adjacent VMA. Use mm_get_unmapped_area_vmflags() in the do_mmap() so future changes can cause shadow stack mappings to be placed with a guard gap. Also use the THP variant that takes vm_flags, such that THP shadow stack can get the same treatment. Adjust the vm_flags calculation to happen earlier so that the vm_flags can be passed into __get_unmapped_area(). Link: https://lkml.kernel.org/r/20240326021656.202649-6-rick.p.edgecombe@intel.com Signed-off-by: Rick Edgecombe Reviewed-by: Christophe Leroy Cc: Alexei Starovoitov Cc: Andy Lutomirski Cc: Aneesh Kumar K.V Cc: Borislav Petkov (AMD) Cc: Dan Williams Cc: Dave Hansen Cc: Deepak Gupta Cc: Guo Ren Cc: Helge Deller Cc: H. Peter Anvin (Intel) Cc: Ingo Molnar Cc: "James E.J. Bottomley" Cc: Kees Cook Cc: Kirill A. Shutemov Cc: Liam R. Howlett Cc: Mark Brown Cc: Michael Ellerman Cc: Naveen N. Rao Cc: Nicholas Piggin Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Andrew Morton --- include/linux/mm.h | 11 ++++++++++- mm/mmap.c | 32 ++++++++++++++++---------------- 2 files changed, 26 insertions(+), 17 deletions(-) (limited to 'include/linux/mm.h') diff --git a/include/linux/mm.h b/include/linux/mm.h index 3dda0dbcae14..1b6903f4c57b 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3413,7 +3413,16 @@ extern int install_special_mapping(struct mm_struct *mm, unsigned long randomize_stack_top(unsigned long stack_top); unsigned long randomize_page(unsigned long start, unsigned long range); -extern unsigned long get_unmapped_area(struct file *, unsigned long, unsigned long, unsigned long, unsigned long); +unsigned long +__get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, + unsigned long pgoff, unsigned long flags, vm_flags_t vm_flags); + +static inline unsigned long +get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, + unsigned long pgoff, unsigned long flags) +{ + return __get_unmapped_area(file, addr, len, pgoff, flags, 0); +} extern unsigned long mmap_region(struct file *file, unsigned long addr, unsigned long len, vm_flags_t vm_flags, unsigned long pgoff, diff --git a/mm/mmap.c b/mm/mmap.c index 30baba087c65..a37c8dd29d64 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1255,18 +1255,6 @@ unsigned long do_mmap(struct file *file, unsigned long addr, if (mm->map_count > sysctl_max_map_count) return -ENOMEM; - /* Obtain the address to map to. we verify (or select) it and ensure - * that it represents a valid section of the address space. - */ - addr = get_unmapped_area(file, addr, len, pgoff, flags); - if (IS_ERR_VALUE(addr)) - return addr; - - if (flags & MAP_FIXED_NOREPLACE) { - if (find_vma_intersection(mm, addr, addr + len)) - return -EEXIST; - } - if (prot == PROT_EXEC) { pkey = execute_only_pkey(mm); if (pkey < 0) @@ -1280,6 +1268,18 @@ unsigned long do_mmap(struct file *file, unsigned long addr, vm_flags |= calc_vm_prot_bits(prot, pkey) | calc_vm_flag_bits(flags) | mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC; + /* Obtain the address to map to. we verify (or select) it and ensure + * that it represents a valid section of the address space. + */ + addr = __get_unmapped_area(file, addr, len, pgoff, flags, vm_flags); + if (IS_ERR_VALUE(addr)) + return addr; + + if (flags & MAP_FIXED_NOREPLACE) { + if (find_vma_intersection(mm, addr, addr + len)) + return -EEXIST; + } + if (flags & MAP_LOCKED) if (!can_do_mlock()) return -EPERM; @@ -1836,8 +1836,8 @@ unsigned long mm_get_unmapped_area_vmflags(struct mm_struct *mm, struct file *fi } unsigned long -get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, - unsigned long pgoff, unsigned long flags) +__get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, + unsigned long pgoff, unsigned long flags, vm_flags_t vm_flags) { unsigned long (*get_area)(struct file *, unsigned long, unsigned long, unsigned long, unsigned long) @@ -1872,8 +1872,8 @@ get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, if (get_area) addr = get_area(file, addr, len, pgoff, flags); else - addr = mm_get_unmapped_area(current->mm, file, addr, len, - pgoff, flags); + addr = mm_get_unmapped_area_vmflags(current->mm, file, addr, len, + pgoff, flags, vm_flags); if (IS_ERR_VALUE(addr)) return addr; -- cgit v1.2.3 From 44bd7ace9fd6490dce3b001b21d5c6bac869171b Mon Sep 17 00:00:00 2001 From: Rick Edgecombe Date: Mon, 25 Mar 2024 19:16:53 -0700 Subject: mm: take placement mappings gap into account When memory is being placed, mmap() will take care to respect the guard gaps of certain types of memory (VM_SHADOWSTACK, VM_GROWSUP and VM_GROWSDOWN). In order to ensure guard gaps between mappings, mmap() needs to consider two things: 1. That the new mapping isn't placed in an any existing mappings guard gaps. 2. That the new mapping isn't placed such that any existing mappings are not in *its* guard gaps. The longstanding behavior of mmap() is to ensure 1, but not take any care around 2. So for example, if there is a PAGE_SIZE free area, and a mmap() with a PAGE_SIZE size, and a type that has a guard gap is being placed, mmap() may place the shadow stack in the PAGE_SIZE free area. Then the mapping that is supposed to have a guard gap will not have a gap to the adjacent VMA. For MAP_GROWSDOWN/VM_GROWSDOWN and MAP_GROWSUP/VM_GROWSUP this has not been a problem in practice because applications place these kinds of mappings very early, when there is not many mappings to find a space between. But for shadow stacks, they may be placed throughout the lifetime of the application. Use the start_gap field to find a space that includes the guard gap for the new mapping. Take care to not interfere with the alignment. Link: https://lkml.kernel.org/r/20240326021656.202649-12-rick.p.edgecombe@intel.com Signed-off-by: Rick Edgecombe Reviewed-by: Christophe Leroy Cc: Alexei Starovoitov Cc: Andy Lutomirski Cc: Aneesh Kumar K.V Cc: Borislav Petkov (AMD) Cc: Dan Williams Cc: Dave Hansen Cc: Deepak Gupta Cc: Guo Ren Cc: Helge Deller Cc: H. Peter Anvin (Intel) Cc: Ingo Molnar Cc: "James E.J. Bottomley" Cc: Kees Cook Cc: Kirill A. Shutemov Cc: Liam R. Howlett Cc: Mark Brown Cc: Michael Ellerman Cc: Naveen N. Rao Cc: Nicholas Piggin Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Andrew Morton --- include/linux/mm.h | 1 + mm/mmap.c | 12 +++++++++--- 2 files changed, 10 insertions(+), 3 deletions(-) (limited to 'include/linux/mm.h') diff --git a/include/linux/mm.h b/include/linux/mm.h index 1b6903f4c57b..2d5e492ef57f 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3468,6 +3468,7 @@ struct vm_unmapped_area_info { unsigned long high_limit; unsigned long align_mask; unsigned long align_offset; + unsigned long start_gap; }; extern unsigned long vm_unmapped_area(struct vm_unmapped_area_info *info); diff --git a/mm/mmap.c b/mm/mmap.c index 4ad386f3f63f..d3c2ca8efa53 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1579,7 +1579,7 @@ static unsigned long unmapped_area(struct vm_unmapped_area_info *info) VMA_ITERATOR(vmi, current->mm, 0); /* Adjust search length to account for worst case alignment overhead */ - length = info->length + info->align_mask; + length = info->length + info->align_mask + info->start_gap; if (length < info->length) return -ENOMEM; @@ -1591,7 +1591,13 @@ retry: if (vma_iter_area_lowest(&vmi, low_limit, high_limit, length)) return -ENOMEM; - gap = vma_iter_addr(&vmi); + /* + * Adjust for the gap first so it doesn't interfere with the + * later alignment. The first step is the minimum needed to + * fulill the start gap, the next steps is the minimum to align + * that. It is the minimum needed to fulill both. + */ + gap = vma_iter_addr(&vmi) + info->start_gap; gap += (info->align_offset - gap) & info->align_mask; tmp = vma_next(&vmi); if (tmp && (tmp->vm_flags & VM_STARTGAP_FLAGS)) { /* Avoid prev check if possible */ @@ -1630,7 +1636,7 @@ static unsigned long unmapped_area_topdown(struct vm_unmapped_area_info *info) VMA_ITERATOR(vmi, current->mm, 0); /* Adjust search length to account for worst case alignment overhead */ - length = info->length + info->align_mask; + length = info->length + info->align_mask + info->start_gap; if (length < info->length) return -ENOMEM; -- cgit v1.2.3 From ba42b524a0408b5f92bd41edaee1ea84309ab9ae Mon Sep 17 00:00:00 2001 From: York Jasper Niebuhr Date: Fri, 29 Mar 2024 15:56:05 +0100 Subject: mm: init_mlocked_on_free_v3 Implements the "init_mlocked_on_free" boot option. When this boot option is enabled, any mlock'ed pages are zeroed on free. If the pages are munlock'ed beforehand, no initialization takes place. This boot option is meant to combat the performance hit of "init_on_free" as reported in commit 6471384af2a6 ("mm: security: introduce init_on_alloc=1 and init_on_free=1 boot options"). With "init_mlocked_on_free=1" only relevant data is freed while everything else is left untouched by the kernel. Correspondingly, this patch introduces no performance hit for unmapping non-mlock'ed memory. The unmapping overhead for purely mlocked memory was measured to be approximately 13%. Realistically, most systems mlock only a fraction of the total memory so the real-world system overhead should be close to zero. Optimally, userspace programs clear any key material or other confidential memory before exit and munlock the according memory regions. If a program crashes, userspace key managers fail to do this job. Accordingly, no munlock operations are performed so the data is caught and zeroed by the kernel. Should the program not crash, all memory will ideally be munlocked so no overhead is caused. CONFIG_INIT_MLOCKED_ON_FREE_DEFAULT_ON can be set to enable "init_mlocked_on_free" by default. Link: https://lkml.kernel.org/r/20240329145605.149917-1-yjnworkstation@gmail.com Signed-off-by: York Jasper Niebuhr Cc: Matthew Wilcox (Oracle) Cc: York Jasper Niebuhr Cc: Kees Cook Signed-off-by: Andrew Morton --- Documentation/admin-guide/kernel-parameters.txt | 6 ++++ include/linux/mm.h | 9 +++++- mm/internal.h | 1 + mm/memory.c | 6 ++++ mm/mm_init.c | 43 +++++++++++++++++++++---- mm/page_alloc.c | 2 +- security/Kconfig.hardening | 15 +++++++++ 7 files changed, 73 insertions(+), 9 deletions(-) (limited to 'include/linux/mm.h') diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 902ecd92a29f..3ff97de349da 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -2148,6 +2148,12 @@ Format: 0 | 1 Default set by CONFIG_INIT_ON_FREE_DEFAULT_ON. + init_mlocked_on_free= [MM] Fill freed userspace memory with zeroes if + it was mlock'ed and not explicitly munlock'ed + afterwards. + Format: 0 | 1 + Default set by CONFIG_INIT_MLOCKED_ON_FREE_DEFAULT_ON + init_pkru= [X86] Specify the default memory protection keys rights register contents for all processes. 0x55555554 by default (disallow access to all but pkey 0). Can diff --git a/include/linux/mm.h b/include/linux/mm.h index 2d5e492ef57f..4f4e460d7853 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3762,7 +3762,14 @@ DECLARE_STATIC_KEY_MAYBE(CONFIG_INIT_ON_FREE_DEFAULT_ON, init_on_free); static inline bool want_init_on_free(void) { return static_branch_maybe(CONFIG_INIT_ON_FREE_DEFAULT_ON, - &init_on_free); + &init_on_free); +} + +DECLARE_STATIC_KEY_MAYBE(CONFIG_INIT_MLOCKED_ON_FREE_DEFAULT_ON, init_mlocked_on_free); +static inline bool want_init_mlocked_on_free(void) +{ + return static_branch_maybe(CONFIG_INIT_MLOCKED_ON_FREE_DEFAULT_ON, + &init_mlocked_on_free); } extern bool _debug_pagealloc_enabled_early; diff --git a/mm/internal.h b/mm/internal.h index 6614ba4ca9de..cf7799e29391 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -506,6 +506,7 @@ extern void __putback_isolated_page(struct page *page, unsigned int order, extern void memblock_free_pages(struct page *page, unsigned long pfn, unsigned int order); extern void __free_pages_core(struct page *page, unsigned int order); +extern void kernel_init_pages(struct page *page, int numpages); /* * This will have no effect, other than possibly generating a warning, if the diff --git a/mm/memory.c b/mm/memory.c index 0b92336bcebd..80944acb5b4e 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1506,6 +1506,12 @@ static __always_inline void zap_present_folio_ptes(struct mmu_gather *tlb, if (unlikely(page_mapcount(page) < 0)) print_bad_pte(vma, addr, ptent, page); } + + if (want_init_mlocked_on_free() && folio_test_mlocked(folio) && + !delay_rmap && folio_test_anon(folio)) { + kernel_init_pages(page, folio_nr_pages(folio)); + } + if (unlikely(__tlb_remove_folio_pages(tlb, page, nr, delay_rmap))) { *force_flush = true; *force_break = true; diff --git a/mm/mm_init.c b/mm/mm_init.c index d01912b8a597..2c8f3af4430f 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -2522,6 +2522,9 @@ EXPORT_SYMBOL(init_on_alloc); DEFINE_STATIC_KEY_MAYBE(CONFIG_INIT_ON_FREE_DEFAULT_ON, init_on_free); EXPORT_SYMBOL(init_on_free); +DEFINE_STATIC_KEY_MAYBE(CONFIG_INIT_MLOCKED_ON_FREE_DEFAULT_ON, init_mlocked_on_free); +EXPORT_SYMBOL(init_mlocked_on_free); + static bool _init_on_alloc_enabled_early __read_mostly = IS_ENABLED(CONFIG_INIT_ON_ALLOC_DEFAULT_ON); static int __init early_init_on_alloc(char *buf) @@ -2539,6 +2542,14 @@ static int __init early_init_on_free(char *buf) } early_param("init_on_free", early_init_on_free); +static bool _init_mlocked_on_free_enabled_early __read_mostly + = IS_ENABLED(CONFIG_INIT_MLOCKED_ON_FREE_DEFAULT_ON); +static int __init early_init_mlocked_on_free(char *buf) +{ + return kstrtobool(buf, &_init_mlocked_on_free_enabled_early); +} +early_param("init_mlocked_on_free", early_init_mlocked_on_free); + DEFINE_STATIC_KEY_MAYBE(CONFIG_DEBUG_VM, check_pages_enabled); /* @@ -2566,12 +2577,21 @@ static void __init mem_debugging_and_hardening_init(void) } #endif - if ((_init_on_alloc_enabled_early || _init_on_free_enabled_early) && + if ((_init_on_alloc_enabled_early || _init_on_free_enabled_early || + _init_mlocked_on_free_enabled_early) && page_poisoning_requested) { pr_info("mem auto-init: CONFIG_PAGE_POISONING is on, " - "will take precedence over init_on_alloc and init_on_free\n"); + "will take precedence over init_on_alloc, init_on_free " + "and init_mlocked_on_free\n"); _init_on_alloc_enabled_early = false; _init_on_free_enabled_early = false; + _init_mlocked_on_free_enabled_early = false; + } + + if (_init_mlocked_on_free_enabled_early && _init_on_free_enabled_early) { + pr_info("mem auto-init: init_on_free is on, " + "will take precedence over init_mlocked_on_free\n"); + _init_mlocked_on_free_enabled_early = false; } if (_init_on_alloc_enabled_early) { @@ -2588,9 +2608,17 @@ static void __init mem_debugging_and_hardening_init(void) static_branch_disable(&init_on_free); } - if (IS_ENABLED(CONFIG_KMSAN) && - (_init_on_alloc_enabled_early || _init_on_free_enabled_early)) - pr_info("mem auto-init: please make sure init_on_alloc and init_on_free are disabled when running KMSAN\n"); + if (_init_mlocked_on_free_enabled_early) { + want_check_pages = true; + static_branch_enable(&init_mlocked_on_free); + } else { + static_branch_disable(&init_mlocked_on_free); + } + + if (IS_ENABLED(CONFIG_KMSAN) && (_init_on_alloc_enabled_early || + _init_on_free_enabled_early || _init_mlocked_on_free_enabled_early)) + pr_info("mem auto-init: please make sure init_on_alloc, init_on_free and " + "init_mlocked_on_free are disabled when running KMSAN\n"); #ifdef CONFIG_DEBUG_PAGEALLOC if (debug_pagealloc_enabled()) { @@ -2629,9 +2657,10 @@ static void __init report_meminit(void) else stack = "off"; - pr_info("mem auto-init: stack:%s, heap alloc:%s, heap free:%s\n", + pr_info("mem auto-init: stack:%s, heap alloc:%s, heap free:%s, mlocked free:%s\n", stack, want_init_on_alloc(GFP_KERNEL) ? "on" : "off", - want_init_on_free() ? "on" : "off"); + want_init_on_free() ? "on" : "off", + want_init_mlocked_on_free() ? "on" : "off"); if (want_init_on_free()) pr_info("mem auto-init: clearing system memory may take some time...\n"); } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 47ab0297838a..e030ccf9d5bc 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1032,7 +1032,7 @@ static inline bool should_skip_kasan_poison(struct page *page) return page_kasan_tag(page) == KASAN_TAG_KERNEL; } -static void kernel_init_pages(struct page *page, int numpages) +void kernel_init_pages(struct page *page, int numpages) { int i; diff --git a/security/Kconfig.hardening b/security/Kconfig.hardening index 2cff851ebfd7..effbf5982be1 100644 --- a/security/Kconfig.hardening +++ b/security/Kconfig.hardening @@ -255,6 +255,21 @@ config INIT_ON_FREE_DEFAULT_ON touching "cold" memory areas. Most cases see 3-5% impact. Some synthetic workloads have measured as high as 8%. +config INIT_MLOCKED_ON_FREE_DEFAULT_ON + bool "Enable mlocked memory zeroing on free" + depends on !KMSAN + help + This config has the effect of setting "init_mlocked_on_free=1" + on the kernel command line. If it is enabled, all mlocked process + memory is zeroed when freed. This restriction to mlocked memory + improves performance over "init_on_free" but can still be used to + protect confidential data like key material from content exposures + to other processes, as well as live forensics and cold boot attacks. + Any non-mlocked memory is not cleared before it is reassigned. This + configuration can be overwritten by setting "init_mlocked_on_free=0" + on the command line. The "init_on_free" boot option takes + precedence over "init_mlocked_on_free". + config CC_HAS_ZERO_CALL_USED_REGS def_bool $(cc-option,-fzero-call-used-regs=used-gpr) # https://github.com/ClangBuiltLinux/linux/issues/1766 -- cgit v1.2.3 From 2542b1ac9a46ac58f9565de0048457956898d481 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 5 Apr 2024 16:32:25 +0100 Subject: mm: inline destroy_large_folio() into __folio_put_large() destroy_large_folio() has only one caller, move its contents there. Link: https://lkml.kernel.org/r/20240405153228.2563754-4-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Zi Yan Signed-off-by: Andrew Morton --- include/linux/mm.h | 2 -- mm/page_alloc.c | 14 -------------- mm/swap.c | 13 ++++++++++--- 3 files changed, 10 insertions(+), 19 deletions(-) (limited to 'include/linux/mm.h') diff --git a/include/linux/mm.h b/include/linux/mm.h index 4f4e460d7853..cc9410d772dd 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1318,8 +1318,6 @@ void folio_copy(struct folio *dst, struct folio *src); unsigned long nr_free_buffer_pages(void); -void destroy_large_folio(struct folio *folio); - /* Returns the number of bytes in this potentially compound page. */ static inline unsigned long page_size(struct page *page) { diff --git a/mm/page_alloc.c b/mm/page_alloc.c index c01757251de2..22e8b9f1d710 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -565,20 +565,6 @@ void prep_compound_page(struct page *page, unsigned int order) prep_compound_head(page, order); } -void destroy_large_folio(struct folio *folio) -{ - if (folio_test_hugetlb(folio)) { - free_huge_folio(folio); - return; - } - - if (folio_test_large_rmappable(folio)) - folio_undo_large_rmappable(folio); - - mem_cgroup_uncharge(folio); - free_unref_page(&folio->page, folio_order(folio)); -} - static inline void set_buddy_order(struct page *page, unsigned int order) { set_page_private(page, order); diff --git a/mm/swap.c b/mm/swap.c index d7db3cd4e80a..6e3bd03673ea 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -127,9 +127,16 @@ static void __folio_put_large(struct folio *folio) * (it's never listed to any LRU lists) and no memcg routines should * be called for hugetlb (it has a separate hugetlb_cgroup.) */ - if (!folio_test_hugetlb(folio)) - page_cache_release(folio); - destroy_large_folio(folio); + if (folio_test_hugetlb(folio)) { + free_huge_folio(folio); + return; + } + + page_cache_release(folio); + if (folio_test_large_rmappable(folio)) + folio_undo_large_rmappable(folio); + mem_cgroup_uncharge(folio); + free_unref_page(&folio->page, folio_order(folio)); } void __folio_put(struct folio *folio) -- cgit v1.2.3 From 38bc9c28c3780d8d0ca1ed1a1fbfe8c91e20f5f2 Mon Sep 17 00:00:00 2001 From: Hao Ge Date: Sun, 7 Apr 2024 14:26:53 +0800 Subject: mm/mmap: make vma_wants_writenotify return bool vma_wants_writenotify() should return bool, so change it. Link: https://lkml.kernel.org/r/20240407062653.803142-1-gehao@kylinos.cn Signed-off-by: Hao Ge Cc: Liam R. Howlett Cc: Lorenzo Stoakes Cc: Matthew Wilcox (Oracle) Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/mm.h | 2 +- mm/mmap.c | 12 ++++++------ 2 files changed, 7 insertions(+), 7 deletions(-) (limited to 'include/linux/mm.h') diff --git a/include/linux/mm.h b/include/linux/mm.h index cc9410d772dd..0b933f9f4864 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2575,7 +2575,7 @@ extern unsigned long move_page_tables(struct vm_area_struct *vma, MM_CP_UFFD_WP_RESOLVE) bool vma_needs_dirty_tracking(struct vm_area_struct *vma); -int vma_wants_writenotify(struct vm_area_struct *vma, pgprot_t vm_page_prot); +bool vma_wants_writenotify(struct vm_area_struct *vma, pgprot_t vm_page_prot); static inline bool vma_wants_manual_pte_write_upgrade(struct vm_area_struct *vma) { /* diff --git a/mm/mmap.c b/mm/mmap.c index d3c2ca8efa53..c9367454e19f 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1514,32 +1514,32 @@ bool vma_needs_dirty_tracking(struct vm_area_struct *vma) * to the private version (using protection_map[] without the * VM_SHARED bit). */ -int vma_wants_writenotify(struct vm_area_struct *vma, pgprot_t vm_page_prot) +bool vma_wants_writenotify(struct vm_area_struct *vma, pgprot_t vm_page_prot) { /* If it was private or non-writable, the write bit is already clear */ if (!vma_is_shared_writable(vma)) - return 0; + return false; /* The backer wishes to know when pages are first written to? */ if (vm_ops_needs_writenotify(vma->vm_ops)) - return 1; + return true; /* The open routine did something to the protections that pgprot_modify * won't preserve? */ if (pgprot_val(vm_page_prot) != pgprot_val(vm_pgprot_modify(vm_page_prot, vma->vm_flags))) - return 0; + return false; /* * Do we need to track softdirty? hugetlb does not support softdirty * tracking yet. */ if (vma_soft_dirty_enabled(vma) && !is_vm_hugetlb_page(vma)) - return 1; + return true; /* Do we need write faults for uffd-wp tracking? */ if (userfaultfd_wp(vma)) - return 1; + return true; /* Can the mapping track the dirty pages? */ return vma_fs_can_writeback(vma); -- cgit v1.2.3 From 29ae7d96d166fa08c7232daf8a314ef5ba1efd20 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 10 Apr 2024 17:55:26 +0200 Subject: mm: pass VMA instead of MM to follow_pte() ... and centralize the VM_IO/VM_PFNMAP sanity check in there. We'll now also perform these sanity checks for direct follow_pte() invocations. For generic_access_phys(), we might now check multiple times: nothing to worry about, really. Link: https://lkml.kernel.org/r/20240410155527.474777-3-david@redhat.com Signed-off-by: David Hildenbrand Acked-by: Sean Christopherson [KVM] Cc: Alex Williamson Cc: Christoph Hellwig Cc: Fei Li Cc: Gerald Schaefer Cc: Heiko Carstens Cc: Ingo Molnar Cc: Paolo Bonzini Cc: Yonghua Huang Signed-off-by: Andrew Morton --- arch/s390/pci/pci_mmio.c | 4 ++-- arch/x86/mm/pat/memtype.c | 5 +---- drivers/vfio/vfio_iommu_type1.c | 4 ++-- drivers/virt/acrn/mm.c | 3 +-- include/linux/mm.h | 2 +- mm/memory.c | 15 ++++++++------- virt/kvm/kvm_main.c | 4 ++-- 7 files changed, 17 insertions(+), 20 deletions(-) (limited to 'include/linux/mm.h') diff --git a/arch/s390/pci/pci_mmio.c b/arch/s390/pci/pci_mmio.c index a90499c087f0..5398729bfe1b 100644 --- a/arch/s390/pci/pci_mmio.c +++ b/arch/s390/pci/pci_mmio.c @@ -169,7 +169,7 @@ SYSCALL_DEFINE3(s390_pci_mmio_write, unsigned long, mmio_addr, if (!(vma->vm_flags & VM_WRITE)) goto out_unlock_mmap; - ret = follow_pte(vma->vm_mm, mmio_addr, &ptep, &ptl); + ret = follow_pte(vma, mmio_addr, &ptep, &ptl); if (ret) goto out_unlock_mmap; @@ -308,7 +308,7 @@ SYSCALL_DEFINE3(s390_pci_mmio_read, unsigned long, mmio_addr, if (!(vma->vm_flags & VM_WRITE)) goto out_unlock_mmap; - ret = follow_pte(vma->vm_mm, mmio_addr, &ptep, &ptl); + ret = follow_pte(vma, mmio_addr, &ptep, &ptl); if (ret) goto out_unlock_mmap; diff --git a/arch/x86/mm/pat/memtype.c b/arch/x86/mm/pat/memtype.c index d01c3b0bd6eb..bdc2a240c2aa 100644 --- a/arch/x86/mm/pat/memtype.c +++ b/arch/x86/mm/pat/memtype.c @@ -954,10 +954,7 @@ static int follow_phys(struct vm_area_struct *vma, unsigned long *prot, pte_t *ptep, pte; spinlock_t *ptl; - if (!(vma->vm_flags & (VM_IO | VM_PFNMAP))) - return -EINVAL; - - if (follow_pte(vma->vm_mm, vma->vm_start, &ptep, &ptl)) + if (follow_pte(vma, vma->vm_start, &ptep, &ptl)) return -EINVAL; pte = ptep_get(ptep); diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c index b5c15fe8f9fc..3a0218171cfa 100644 --- a/drivers/vfio/vfio_iommu_type1.c +++ b/drivers/vfio/vfio_iommu_type1.c @@ -518,7 +518,7 @@ static int follow_fault_pfn(struct vm_area_struct *vma, struct mm_struct *mm, spinlock_t *ptl; int ret; - ret = follow_pte(vma->vm_mm, vaddr, &ptep, &ptl); + ret = follow_pte(vma, vaddr, &ptep, &ptl); if (ret) { bool unlocked = false; @@ -532,7 +532,7 @@ static int follow_fault_pfn(struct vm_area_struct *vma, struct mm_struct *mm, if (ret) return ret; - ret = follow_pte(vma->vm_mm, vaddr, &ptep, &ptl); + ret = follow_pte(vma, vaddr, &ptep, &ptl); if (ret) return ret; } diff --git a/drivers/virt/acrn/mm.c b/drivers/virt/acrn/mm.c index 2d98e1e185c4..db8ff1d0ac23 100644 --- a/drivers/virt/acrn/mm.c +++ b/drivers/virt/acrn/mm.c @@ -187,8 +187,7 @@ int acrn_vm_ram_map(struct acrn_vm *vm, struct acrn_vm_memmap *memmap) } for (i = 0; i < nr_pages; i++) { - ret = follow_pte(vma->vm_mm, - memmap->vma_base + i * PAGE_SIZE, + ret = follow_pte(vma, memmap->vma_base + i * PAGE_SIZE, &ptep, &ptl); if (ret) break; diff --git a/include/linux/mm.h b/include/linux/mm.h index 0b933f9f4864..c25ce6992951 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2420,7 +2420,7 @@ void free_pgd_range(struct mmu_gather *tlb, unsigned long addr, unsigned long end, unsigned long floor, unsigned long ceiling); int copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma); -int follow_pte(struct mm_struct *mm, unsigned long address, +int follow_pte(struct vm_area_struct *vma, unsigned long address, pte_t **ptepp, spinlock_t **ptlp); int generic_access_phys(struct vm_area_struct *vma, unsigned long addr, void *buf, int len, int write); diff --git a/mm/memory.c b/mm/memory.c index 59c05dc8b18a..1d45d25c1bba 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -5926,7 +5926,7 @@ int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address) /** * follow_pte - look up PTE at a user virtual address - * @mm: the mm_struct of the target address space + * @vma: the memory mapping * @address: user virtual address * @ptepp: location to store found PTE * @ptlp: location to store the lock for the PTE @@ -5945,15 +5945,19 @@ int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address) * * Return: zero on success, -ve otherwise. */ -int follow_pte(struct mm_struct *mm, unsigned long address, +int follow_pte(struct vm_area_struct *vma, unsigned long address, pte_t **ptepp, spinlock_t **ptlp) { + struct mm_struct *mm = vma->vm_mm; pgd_t *pgd; p4d_t *p4d; pud_t *pud; pmd_t *pmd; pte_t *ptep; + if (!(vma->vm_flags & (VM_IO | VM_PFNMAP))) + goto out; + pgd = pgd_offset(mm, address); if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd))) goto out; @@ -6007,11 +6011,8 @@ int generic_access_phys(struct vm_area_struct *vma, unsigned long addr, int offset = offset_in_page(addr); int ret = -EINVAL; - if (!(vma->vm_flags & (VM_IO | VM_PFNMAP))) - return -EINVAL; - retry: - if (follow_pte(vma->vm_mm, addr, &ptep, &ptl)) + if (follow_pte(vma, addr, &ptep, &ptl)) return -EINVAL; pte = ptep_get(ptep); pte_unmap_unlock(ptep, ptl); @@ -6026,7 +6027,7 @@ retry: if (!maddr) return -ENOMEM; - if (follow_pte(vma->vm_mm, addr, &ptep, &ptl)) + if (follow_pte(vma, addr, &ptep, &ptl)) goto out_unmap; if (!pte_same(pte, ptep_get(ptep))) { diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index fb49c2a60200..f57dbacb8689 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -2902,7 +2902,7 @@ static int hva_to_pfn_remapped(struct vm_area_struct *vma, spinlock_t *ptl; int r; - r = follow_pte(vma->vm_mm, addr, &ptep, &ptl); + r = follow_pte(vma, addr, &ptep, &ptl); if (r) { /* * get_user_pages fails for VM_IO and VM_PFNMAP vmas and does @@ -2917,7 +2917,7 @@ static int hva_to_pfn_remapped(struct vm_area_struct *vma, if (r) return r; - r = follow_pte(vma->vm_mm, addr, &ptep, &ptl); + r = follow_pte(vma, addr, &ptep, &ptl); if (r) return r; } -- cgit v1.2.3 From 02faa73f174c4d1e11cb9a421f9a8eac0dd881f1 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Tue, 9 Apr 2024 21:22:44 +0200 Subject: mm: allow for detecting underflows with page_mapcount() again Patch series "mm: mapcount for large folios + page_mapcount() cleanups". This series tracks the mapcount of large folios in a single value, so it can be read efficiently and atomically, just like the mapcount of small folios. folio_mapcount() is then used in a couple more places, most notably to reduce false negatives in folio_likely_mapped_shared(), and many users of page_mapcount() are cleaned up (that's maybe why you got CCed on the full series, sorry sh+xtensa folks! :) ). The remaining s390x user and one KSM user of page_mapcount() are getting removed separately on the list right now. I have patches to handle the other KSM one, the khugepaged one and the kpagecount one; as they are not as "obvious", I will send them out separately in the future. Once that is all in place, I'm planning on moving page_mapcount() into fs/proc/task_mmu.c, the remaining user for the time being (and we can discuss at LSF/MM details on that :) ). I proposed the mapcount for large folios (previously called total mapcount) originally in part of [1] and I later included it in [2] where it is a requirement. In the meantime, I changed the patch a bit so I dropped all RB's. During the discussion of [1], Peter Xu correctly raised that this additional tracking might affect the performance when PMD->PTE remapping THPs. In the meantime. I addressed that by batching RMAP operations during fork(), unmap/zap and when PMD->PTE remapping THPs. Running some of my micro-benchmarks [3] (fork,munmap,cow-byte,remap) on 1 GiB of memory backed by folios with the same order, I observe the following on an Intel(R) Xeon(R) Silver 4210R CPU @ 2.40GHz tuned for reproducible results as much as possible: Standard deviation is mostly < 1%, except for order-9, where it's < 2% for fork() and munmap(). (1) Small folios are not affected (< 1%) in all 4 microbenchmarks. (2) Order-4 folios are not affected (< 1%) in all 4 microbenchmarks. A bit weird comapred to the other orders ... (3) PMD->PTE remapping of order-9 THPs is not affected (< 1%) (4) COW-byte (COWing a single page by writing a single byte) is not affected for any order (< 1 %). The page copy_fault overhead dominates everything. (5) fork() is mostly not affected (< 1%), except order-2, where we have a slowdown of ~4%. Already for order-3 folios, we're down to a slowdown of < 1%. (6) munmap() sees a slowdown by < 3% for some orders (order-5, order-6, order-9), but less for others (< 1% for order-4 and order-8, < 2% for order-2, order-3, order-7). Especially the fork() and munmap() benchmark are sensitive to each added instruction and other system noise, so I suspect some of the change and observed weirdness (order-4) is due to code layout changes and other factors, but not really due to the added atomics. So in the common case where we can batch, the added atomics don't really make a big difference, especially in light of the recent improvements for large folios that we recently gained due to batching. Surprisingly, for some cases where we cannot batch (e.g., COW), the added atomics don't seem to matter, because other overhead dominates. My fork and munmap micro-benchmarks don't cover cases where we cannot batch-process bigger parts of large folios. As this is not the common case, I'm not worrying about that right now. Future work is batching RMAP operations during swapout and folio migration. [1] https://lore.kernel.org/all/20230809083256.699513-1-david@redhat.com/ [2] https://lore.kernel.org/all/20231124132626.235350-1-david@redhat.com/ [3] https://gitlab.com/davidhildenbrand/scratchspace/-/raw/main/pte-mapped-folio-benchmarks.c?ref_type=heads This patch (of 18): Commit 53277bcf126d ("mm: support page_mapcount() on page_has_type() pages") made it impossible to detect mapcount underflows by treating any negative raw mapcount value as a mapcount of 0. We perform such underflow checks in zap_present_folio_ptes() and zap_huge_pmd(), which would currently no longer trigger. Let's check against PAGE_MAPCOUNT_RESERVE instead by using page_type_has_type(), like page_has_type() would, so we can still catch some underflows. [david@redhat.com: make page_mapcount() slighly more efficient] Link: https://lkml.kernel.org/r/1af4fd61-7926-47c8-be45-833c0dbec08b@redhat.com Link: https://lkml.kernel.org/r/20240409192301.907377-1-david@redhat.com Link: https://lkml.kernel.org/r/20240409192301.907377-2-david@redhat.com Fixes: 53277bcf126d ("mm: support page_mapcount() on page_has_type() pages") Signed-off-by: David Hildenbrand Cc: Chris Zankel Cc: Hugh Dickins Cc: John Paul Adrian Glaubitz Cc: Jonathan Corbet Cc: Matthew Wilcox (Oracle) Cc: Max Filippov Cc: Miaohe Lin Cc: Muchun Song Cc: Naoya Horiguchi Cc: Peter Xu Cc: Richard Chang Cc: Rich Felker Cc: Ryan Roberts Cc: Yang Shi Cc: Yin Fengwei Cc: Yoshinori Sato Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/mm.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux/mm.h') diff --git a/include/linux/mm.h b/include/linux/mm.h index c25ce6992951..8fa4d78bcc09 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1232,7 +1232,7 @@ static inline int page_mapcount(struct page *page) int mapcount = atomic_read(&page->_mapcount) + 1; /* Handle page_has_type() pages */ - if (mapcount < 0) + if (mapcount < PAGE_MAPCOUNT_RESERVE + 1) mapcount = 0; if (unlikely(PageCompound(page))) mapcount += folio_entire_mapcount(page_folio(page)); -- cgit v1.2.3 From 05c5323b2a344c19c51cd1b91a4ab9ae90853794 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Tue, 9 Apr 2024 21:22:47 +0200 Subject: mm: track mapcount of large folios in single value Let's track the mapcount of large folios in a single value. The mapcount of a large folio currently corresponds to the sum of the entire mapcount and all page mapcounts. This sum is what we actually want to know in folio_mapcount() and it is also sufficient for implementing folio_mapped(). With PTE-mapped THP becoming more important and more widely used, we want to avoid looping over all pages of a folio just to obtain the mapcount of large folios. The comment "In the common case, avoid the loop when no pages mapped by PTE" in folio_total_mapcount() does no longer hold for mTHP that are always mapped by PTE. Further, we are planning on using folio_mapcount() more frequently, and might even want to remove page mapcounts for large folios in some kernel configs. Therefore, allow for reading the mapcount of large folios efficiently and atomically without looping over any pages. Maintain the mapcount also for hugetlb pages for simplicity. Use the new mapcount to implement folio_mapcount() and folio_mapped(). Make page_mapped() simply call folio_mapped(). We can now get rid of folio_large_is_mapped(). _nr_pages_mapped is now only used in rmap code and for debugging purposes. Keep folio_nr_pages_mapped() around, but document that its use should be limited to rmap internals and debugging purposes. This change implies one additional atomic add/sub whenever mapping/unmapping (parts of) a large folio. As we now batch RMAP operations for PTE-mapped THP during fork(), during unmap/zap, and when PTE-remapping a PMD-mapped THP, and we adjust the large mapcount for a PTE batch only once, the added overhead in the common case is small. Only when unmapping individual pages of a large folio (e.g., during COW), the overhead might be bigger in comparison, but it's essentially one additional atomic operation. Note that before the new mapcount would overflow, already our refcount would overflow: each mapping requires a folio reference. Extend the focumentation of folio_mapcount(). Link: https://lkml.kernel.org/r/20240409192301.907377-5-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Yin Fengwei Cc: Chris Zankel Cc: Hugh Dickins Cc: John Paul Adrian Glaubitz Cc: Jonathan Corbet Cc: Matthew Wilcox (Oracle) Cc: Max Filippov Cc: Miaohe Lin Cc: Muchun Song Cc: Naoya Horiguchi Cc: Peter Xu Cc: Richard Chang Cc: Rich Felker Cc: Ryan Roberts Cc: Yang Shi Cc: Yoshinori Sato Cc: Zi Yan Signed-off-by: Andrew Morton --- Documentation/mm/transhuge.rst | 12 ++++++------ include/linux/mm.h | 44 +++++++++++++++++++----------------------- include/linux/mm_types.h | 5 +++-- include/linux/rmap.h | 10 ++++++++++ mm/debug.c | 3 ++- mm/hugetlb.c | 4 ++-- mm/internal.h | 3 +++ mm/khugepaged.c | 2 +- mm/page_alloc.c | 4 ++++ mm/rmap.c | 34 +++++++++++--------------------- 10 files changed, 62 insertions(+), 59 deletions(-) (limited to 'include/linux/mm.h') diff --git a/Documentation/mm/transhuge.rst b/Documentation/mm/transhuge.rst index 93c9239b9ebe..1ba0ad63246c 100644 --- a/Documentation/mm/transhuge.rst +++ b/Documentation/mm/transhuge.rst @@ -116,14 +116,14 @@ pages: succeeds on tail pages. - map/unmap of a PMD entry for the whole THP increment/decrement - folio->_entire_mapcount and also increment/decrement - folio->_nr_pages_mapped by ENTIRELY_MAPPED when _entire_mapcount - goes from -1 to 0 or 0 to -1. + folio->_entire_mapcount, increment/decrement folio->_large_mapcount + and also increment/decrement folio->_nr_pages_mapped by ENTIRELY_MAPPED + when _entire_mapcount goes from -1 to 0 or 0 to -1. - map/unmap of individual pages with PTE entry increment/decrement - page->_mapcount and also increment/decrement folio->_nr_pages_mapped - when page->_mapcount goes from -1 to 0 or 0 to -1 as this counts - the number of pages mapped by PTE. + page->_mapcount, increment/decrement folio->_large_mapcount and also + increment/decrement folio->_nr_pages_mapped when page->_mapcount goes + from -1 to 0 or 0 to -1 as this counts the number of pages mapped by PTE. split_huge_page internally has to distribute the refcounts in the head page to the tail pages before clearing all PG_head/tail bits from the page diff --git a/include/linux/mm.h b/include/linux/mm.h index 8fa4d78bcc09..059477821625 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1240,16 +1240,26 @@ static inline int page_mapcount(struct page *page) return mapcount; } -int folio_total_mapcount(const struct folio *folio); +static inline int folio_large_mapcount(const struct folio *folio) +{ + VM_WARN_ON_FOLIO(!folio_test_large(folio), folio); + return atomic_read(&folio->_large_mapcount) + 1; +} /** - * folio_mapcount() - Calculate the number of mappings of this folio. + * folio_mapcount() - Number of mappings of this folio. * @folio: The folio. * - * A large folio tracks both how many times the entire folio is mapped, - * and how many times each individual page in the folio is mapped. - * This function calculates the total number of times the folio is - * mapped. + * The folio mapcount corresponds to the number of present user page table + * entries that reference any part of a folio. Each such present user page + * table entry must be paired with exactly on folio reference. + * + * For ordindary folios, each user page table entry (PTE/PMD/PUD/...) counts + * exactly once. + * + * For hugetlb folios, each abstracted "hugetlb" user page table entry that + * references the entire folio counts exactly once, even when such special + * page table entries are comprised of multiple ordinary page table entries. * * Return: The number of times this folio is mapped. */ @@ -1257,17 +1267,7 @@ static inline int folio_mapcount(const struct folio *folio) { if (likely(!folio_test_large(folio))) return atomic_read(&folio->_mapcount) + 1; - return folio_total_mapcount(folio); -} - -static inline bool folio_large_is_mapped(const struct folio *folio) -{ - /* - * Reading _entire_mapcount below could be omitted if hugetlb - * participated in incrementing nr_pages_mapped when compound mapped. - */ - return atomic_read(&folio->_nr_pages_mapped) > 0 || - atomic_read(&folio->_entire_mapcount) >= 0; + return folio_large_mapcount(folio); } /** @@ -1276,11 +1276,9 @@ static inline bool folio_large_is_mapped(const struct folio *folio) * * Return: True if any page in this folio is referenced by user page tables. */ -static inline bool folio_mapped(struct folio *folio) +static inline bool folio_mapped(const struct folio *folio) { - if (likely(!folio_test_large(folio))) - return atomic_read(&folio->_mapcount) >= 0; - return folio_large_is_mapped(folio); + return folio_mapcount(folio) >= 1; } /* @@ -1290,9 +1288,7 @@ static inline bool folio_mapped(struct folio *folio) */ static inline bool page_mapped(const struct page *page) { - if (likely(!PageCompound(page))) - return atomic_read(&page->_mapcount) >= 0; - return folio_large_is_mapped(page_folio(page)); + return folio_mapped(page_folio(page)); } static inline struct page *virt_to_head_page(const void *x) diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index fa0d6995706f..db0adf5721cc 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -289,7 +289,8 @@ typedef struct { * @virtual: Virtual address in the kernel direct map. * @_last_cpupid: IDs of last CPU and last process that accessed the folio. * @_entire_mapcount: Do not use directly, call folio_entire_mapcount(). - * @_nr_pages_mapped: Do not use directly, call folio_mapcount(). + * @_large_mapcount: Do not use directly, call folio_mapcount(). + * @_nr_pages_mapped: Do not use outside of rmap and debug code. * @_pincount: Do not use directly, call folio_maybe_dma_pinned(). * @_folio_nr_pages: Do not use directly, call folio_nr_pages(). * @_hugetlb_subpool: Do not use directly, use accessor in hugetlb.h. @@ -348,8 +349,8 @@ struct folio { struct { unsigned long _flags_1; unsigned long _head_1; - unsigned long _folio_avail; /* public: */ + atomic_t _large_mapcount; atomic_t _entire_mapcount; atomic_t _nr_pages_mapped; atomic_t _pincount; diff --git a/include/linux/rmap.h b/include/linux/rmap.h index 327f1ca5a487..0f906dc6d280 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -273,6 +273,7 @@ static inline int hugetlb_try_dup_anon_rmap(struct folio *folio, ClearPageAnonExclusive(&folio->page); } atomic_inc(&folio->_entire_mapcount); + atomic_inc(&folio->_large_mapcount); return 0; } @@ -306,6 +307,7 @@ static inline void hugetlb_add_file_rmap(struct folio *folio) VM_WARN_ON_FOLIO(folio_test_anon(folio), folio); atomic_inc(&folio->_entire_mapcount); + atomic_inc(&folio->_large_mapcount); } static inline void hugetlb_remove_rmap(struct folio *folio) @@ -313,11 +315,14 @@ static inline void hugetlb_remove_rmap(struct folio *folio) VM_WARN_ON_FOLIO(!folio_test_hugetlb(folio), folio); atomic_dec(&folio->_entire_mapcount); + atomic_dec(&folio->_large_mapcount); } static __always_inline void __folio_dup_file_rmap(struct folio *folio, struct page *page, int nr_pages, enum rmap_level level) { + const int orig_nr_pages = nr_pages; + __folio_rmap_sanity_checks(folio, page, nr_pages, level); switch (level) { @@ -330,9 +335,11 @@ static __always_inline void __folio_dup_file_rmap(struct folio *folio, do { atomic_inc(&page->_mapcount); } while (page++, --nr_pages > 0); + atomic_add(orig_nr_pages, &folio->_large_mapcount); break; case RMAP_LEVEL_PMD: atomic_inc(&folio->_entire_mapcount); + atomic_inc(&folio->_large_mapcount); break; } } @@ -382,6 +389,7 @@ static __always_inline int __folio_try_dup_anon_rmap(struct folio *folio, struct page *page, int nr_pages, struct vm_area_struct *src_vma, enum rmap_level level) { + const int orig_nr_pages = nr_pages; bool maybe_pinned; int i; @@ -423,6 +431,7 @@ static __always_inline int __folio_try_dup_anon_rmap(struct folio *folio, ClearPageAnonExclusive(page); atomic_inc(&page->_mapcount); } while (page++, --nr_pages > 0); + atomic_add(orig_nr_pages, &folio->_large_mapcount); break; case RMAP_LEVEL_PMD: if (PageAnonExclusive(page)) { @@ -431,6 +440,7 @@ static __always_inline int __folio_try_dup_anon_rmap(struct folio *folio, ClearPageAnonExclusive(page); } atomic_inc(&folio->_entire_mapcount); + atomic_inc(&folio->_large_mapcount); break; } return 0; diff --git a/mm/debug.c b/mm/debug.c index b71186f1fb0b..d064db42af54 100644 --- a/mm/debug.c +++ b/mm/debug.c @@ -68,8 +68,9 @@ static void __dump_folio(struct folio *folio, struct page *page, folio_ref_count(folio), mapcount, mapping, folio->index + idx, pfn); if (folio_test_large(folio)) { - pr_warn("head: order:%u entire_mapcount:%d nr_pages_mapped:%d pincount:%d\n", + pr_warn("head: order:%u mapcount:%d entire_mapcount:%d nr_pages_mapped:%d pincount:%d\n", folio_order(folio), + folio_mapcount(folio), folio_entire_mapcount(folio), folio_nr_pages_mapped(folio), atomic_read(&folio->_pincount)); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 5dc3f5ea3a2e..d74289d3f30d 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1517,7 +1517,7 @@ static void __destroy_compound_gigantic_folio(struct folio *folio, struct page *p; atomic_set(&folio->_entire_mapcount, 0); - atomic_set(&folio->_nr_pages_mapped, 0); + atomic_set(&folio->_large_mapcount, 0); atomic_set(&folio->_pincount, 0); for (i = 1; i < nr_pages; i++) { @@ -2120,7 +2120,7 @@ static bool __prep_compound_gigantic_folio(struct folio *folio, /* we rely on prep_new_hugetlb_folio to set the hugetlb flag */ folio_set_order(folio, order); atomic_set(&folio->_entire_mapcount, -1); - atomic_set(&folio->_nr_pages_mapped, 0); + atomic_set(&folio->_large_mapcount, -1); atomic_set(&folio->_pincount, 0); return true; diff --git a/mm/internal.h b/mm/internal.h index 22152e0c8494..2adc3f616b71 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -72,6 +72,8 @@ void page_writeback_init(void); /* * How many individual pages have an elevated _mapcount. Excludes * the folio's entire_mapcount. + * + * Don't use this function outside of debugging code. */ static inline int folio_nr_pages_mapped(const struct folio *folio) { @@ -611,6 +613,7 @@ static inline void prep_compound_head(struct page *page, unsigned int order) struct folio *folio = (struct folio *)page; folio_set_order(folio, order); + atomic_set(&folio->_large_mapcount, -1); atomic_set(&folio->_entire_mapcount, -1); atomic_set(&folio->_nr_pages_mapped, 0); atomic_set(&folio->_pincount, 0); diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 89e2624fb3ff..2f73d2aa9ae8 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -1358,7 +1358,7 @@ static int hpage_collapse_scan_pmd(struct mm_struct *mm, * Check if the page has any GUP (or other external) pins. * * Here the check may be racy: - * it may see total_mapcount > refcount in some cases? + * it may see folio_mapcount() > folio_ref_count(). * But such case is ephemeral we could always retry collapse * later. However it may report false positive if the page * has excessive GUP pins (i.e. 512). Anyway the same check diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 22e8b9f1d710..dd4265c760ff 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -935,6 +935,10 @@ static int free_tail_page_prepare(struct page *head_page, struct page *page) bad_page(page, "nonzero entire_mapcount"); goto out; } + if (unlikely(folio_large_mapcount(folio))) { + bad_page(page, "nonzero large_mapcount"); + goto out; + } if (unlikely(atomic_read(&folio->_nr_pages_mapped))) { bad_page(page, "nonzero nr_pages_mapped"); goto out; diff --git a/mm/rmap.c b/mm/rmap.c index 4bde6d60db6c..2608c40dffad 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1138,34 +1138,12 @@ int pfn_mkclean_range(unsigned long pfn, unsigned long nr_pages, pgoff_t pgoff, return page_vma_mkclean_one(&pvmw); } -int folio_total_mapcount(const struct folio *folio) -{ - int mapcount = folio_entire_mapcount(folio); - int nr_pages; - int i; - - /* In the common case, avoid the loop when no pages mapped by PTE */ - if (folio_nr_pages_mapped(folio) == 0) - return mapcount; - /* - * Add all the PTE mappings of those pages mapped by PTE. - * Limit the loop to folio_nr_pages_mapped()? - * Perhaps: given all the raciness, that may be a good or a bad idea. - */ - nr_pages = folio_nr_pages(folio); - for (i = 0; i < nr_pages; i++) - mapcount += atomic_read(&folio_page(folio, i)->_mapcount); - - /* But each of those _mapcounts was based on -1 */ - mapcount += nr_pages; - return mapcount; -} - static __always_inline unsigned int __folio_add_rmap(struct folio *folio, struct page *page, int nr_pages, enum rmap_level level, int *nr_pmdmapped) { atomic_t *mapped = &folio->_nr_pages_mapped; + const int orig_nr_pages = nr_pages; int first, nr = 0; __folio_rmap_sanity_checks(folio, page, nr_pages, level); @@ -1185,6 +1163,7 @@ static __always_inline unsigned int __folio_add_rmap(struct folio *folio, nr++; } } while (page++, --nr_pages > 0); + atomic_add(orig_nr_pages, &folio->_large_mapcount); break; case RMAP_LEVEL_PMD: first = atomic_inc_and_test(&folio->_entire_mapcount); @@ -1201,6 +1180,7 @@ static __always_inline unsigned int __folio_add_rmap(struct folio *folio, nr = 0; } } + atomic_inc(&folio->_large_mapcount); break; } return nr; @@ -1436,10 +1416,14 @@ void folio_add_new_anon_rmap(struct folio *folio, struct vm_area_struct *vma, SetPageAnonExclusive(page); } + /* increment count (starts at -1) */ + atomic_set(&folio->_large_mapcount, nr - 1); atomic_set(&folio->_nr_pages_mapped, nr); } else { /* increment count (starts at -1) */ atomic_set(&folio->_entire_mapcount, 0); + /* increment count (starts at -1) */ + atomic_set(&folio->_large_mapcount, 0); atomic_set(&folio->_nr_pages_mapped, ENTIRELY_MAPPED); SetPageAnonExclusive(&folio->page); __lruvec_stat_mod_folio(folio, NR_ANON_THPS, nr); @@ -1522,6 +1506,7 @@ static __always_inline void __folio_remove_rmap(struct folio *folio, break; } + atomic_sub(nr_pages, &folio->_large_mapcount); do { last = atomic_add_negative(-1, &page->_mapcount); if (last) { @@ -1532,6 +1517,7 @@ static __always_inline void __folio_remove_rmap(struct folio *folio, } while (page++, --nr_pages > 0); break; case RMAP_LEVEL_PMD: + atomic_dec(&folio->_large_mapcount); last = atomic_add_negative(-1, &folio->_entire_mapcount); if (last) { nr = atomic_sub_return_relaxed(ENTIRELY_MAPPED, mapped); @@ -2714,6 +2700,7 @@ void hugetlb_add_anon_rmap(struct folio *folio, struct vm_area_struct *vma, VM_WARN_ON_FOLIO(!folio_test_anon(folio), folio); atomic_inc(&folio->_entire_mapcount); + atomic_inc(&folio->_large_mapcount); if (flags & RMAP_EXCLUSIVE) SetPageAnonExclusive(&folio->page); VM_WARN_ON_FOLIO(folio_entire_mapcount(folio) > 1 && @@ -2728,6 +2715,7 @@ void hugetlb_add_new_anon_rmap(struct folio *folio, BUG_ON(address < vma->vm_start || address >= vma->vm_end); /* increment count (starts at -1) */ atomic_set(&folio->_entire_mapcount, 0); + atomic_set(&folio->_large_mapcount, 0); folio_clear_hugetlb_restore_reserve(folio); __folio_set_anon(folio, vma, address, true); SetPageAnonExclusive(&folio->page); -- cgit v1.2.3 From eefb9b2725e395d58119a92eb8eaea8f2504b5eb Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Tue, 9 Apr 2024 21:22:48 +0200 Subject: mm: improve folio_likely_mapped_shared() using the mapcount of large folios We can now read the mapcount of large folios very efficiently. Use it to improve our handling of partially-mappable folios, falling back to making a guess only in case the folio is not "obviously mapped shared". We can now better detect partially-mappable folios where the first page is not mapped as "mapped shared", reducing "false negatives"; but false negatives are still possible. While at it, fixup a wrong comment (false positive vs. false negative) for KSM folios. Link: https://lkml.kernel.org/r/20240409192301.907377-6-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Yin Fengwei Cc: Chris Zankel Cc: Hugh Dickins Cc: John Paul Adrian Glaubitz Cc: Jonathan Corbet Cc: Matthew Wilcox (Oracle) Cc: Max Filippov Cc: Miaohe Lin Cc: Muchun Song Cc: Naoya Horiguchi Cc: Peter Xu Cc: Richard Chang Cc: Rich Felker Cc: Ryan Roberts Cc: Yang Shi Cc: Yoshinori Sato Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/mm.h | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) (limited to 'include/linux/mm.h') diff --git a/include/linux/mm.h b/include/linux/mm.h index 059477821625..c7470da5797f 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2184,7 +2184,7 @@ static inline size_t folio_size(struct folio *folio) * indicate "mapped shared" (false positive) when two VMAs in the same MM * cover the same file range. * #. For (small) KSM folios, the return value can wrongly indicate "mapped - * shared" (false negative), when the folio is mapped multiple times into + * shared" (false positive), when the folio is mapped multiple times into * the same MM. * * Further, this function only considers current page table mappings that @@ -2201,7 +2201,22 @@ static inline size_t folio_size(struct folio *folio) */ static inline bool folio_likely_mapped_shared(struct folio *folio) { - return page_mapcount(folio_page(folio, 0)) > 1; + int mapcount = folio_mapcount(folio); + + /* Only partially-mappable folios require more care. */ + if (!folio_test_large(folio) || unlikely(folio_test_hugetlb(folio))) + return mapcount > 1; + + /* A single mapping implies "mapped exclusively". */ + if (mapcount <= 1) + return false; + + /* If any page is mapped more than once we treat it "mapped shared". */ + if (folio_entire_mapcount(folio) || mapcount > folio_nr_pages(folio)) + return true; + + /* Let's guess based on the first subpage. */ + return atomic_read(&folio->_mapcount) > 0; } #ifndef HAVE_ARCH_MAKE_PAGE_ACCESSIBLE -- cgit v1.2.3 From 4103b93b07bceac30bc83cbce81693bb2ea93c22 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Tue, 9 Apr 2024 21:22:49 +0200 Subject: mm: make folio_mapcount() return 0 for small typed folios We already handle it properly for large folios. Let's also return "0" for small typed folios, like page_mapcount() currently would. Consequently, folio_mapcount() will never return negative values for typed folios, but may return negative values for underflows. [david@redhat.com: make folio_mapcount() slightly more efficient] Link: https://lkml.kernel.org/r/c30fcda1-ed87-46f5-8297-cdedbddac009@redhat.com Link: https://lkml.kernel.org/r/20240409192301.907377-7-david@redhat.com Signed-off-by: David Hildenbrand Cc: Chris Zankel Cc: Hugh Dickins Cc: John Paul Adrian Glaubitz Cc: Jonathan Corbet Cc: Matthew Wilcox (Oracle) Cc: Max Filippov Cc: Miaohe Lin Cc: Muchun Song Cc: Naoya Horiguchi Cc: Peter Xu Cc: Richard Chang Cc: Rich Felker Cc: Ryan Roberts Cc: Yang Shi Cc: Yin Fengwei Cc: Yoshinori Sato Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/mm.h | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) (limited to 'include/linux/mm.h') diff --git a/include/linux/mm.h b/include/linux/mm.h index c7470da5797f..78e583b50e42 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1261,12 +1261,22 @@ static inline int folio_large_mapcount(const struct folio *folio) * references the entire folio counts exactly once, even when such special * page table entries are comprised of multiple ordinary page table entries. * + * Will report 0 for pages which cannot be mapped into userspace, such as + * slab, page tables and similar. + * * Return: The number of times this folio is mapped. */ static inline int folio_mapcount(const struct folio *folio) { - if (likely(!folio_test_large(folio))) - return atomic_read(&folio->_mapcount) + 1; + int mapcount; + + if (likely(!folio_test_large(folio))) { + mapcount = atomic_read(&folio->_mapcount) + 1; + /* Handle page_has_type() pages */ + if (mapcount < PAGE_MAPCOUNT_RESERVE + 1) + mapcount = 0; + return mapcount; + } return folio_large_mapcount(folio); } -- cgit v1.2.3 From fed5348ee2b136c84c5a27d6fceef14066beeb66 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 12 Apr 2024 20:35:02 +0100 Subject: mm/memory-failure: convert shake_page() to shake_folio() Removes two calls to compound_head(). Move the prototype to internal.h; we definitely don't want code outside mm using it. Link: https://lkml.kernel.org/r/20240412193510.2356957-6-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Jane Chu Acked-by: Miaohe Lin Cc: Dan Williams Cc: Miaohe Lin Cc: Oscar Salvador Signed-off-by: Andrew Morton --- include/linux/mm.h | 1 - mm/hwpoison-inject.c | 11 ++++++----- mm/internal.h | 1 + mm/memory-failure.c | 15 ++++++++++----- 4 files changed, 17 insertions(+), 11 deletions(-) (limited to 'include/linux/mm.h') diff --git a/include/linux/mm.h b/include/linux/mm.h index 78e583b50e42..b9ac49c9eb00 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -4033,7 +4033,6 @@ int mf_dax_kill_procs(struct address_space *mapping, pgoff_t index, extern int memory_failure(unsigned long pfn, int flags); extern void memory_failure_queue_kick(int cpu); extern int unpoison_memory(unsigned long pfn); -extern void shake_page(struct page *p); extern atomic_long_t num_poisoned_pages __read_mostly; extern int soft_offline_page(unsigned long pfn, int flags); #ifdef CONFIG_MEMORY_FAILURE diff --git a/mm/hwpoison-inject.c b/mm/hwpoison-inject.c index d0548e382b6b..c9d653f51e45 100644 --- a/mm/hwpoison-inject.c +++ b/mm/hwpoison-inject.c @@ -15,7 +15,7 @@ static int hwpoison_inject(void *data, u64 val) { unsigned long pfn = val; struct page *p; - struct page *hpage; + struct folio *folio; int err; if (!capable(CAP_SYS_ADMIN)) @@ -25,16 +25,17 @@ static int hwpoison_inject(void *data, u64 val) return -ENXIO; p = pfn_to_page(pfn); - hpage = compound_head(p); + folio = page_folio(p); if (!hwpoison_filter_enable) goto inject; - shake_page(hpage); + shake_folio(folio); /* * This implies unable to support non-LRU pages except free page. */ - if (!PageLRU(hpage) && !PageHuge(p) && !is_free_buddy_page(p)) + if (!folio_test_lru(folio) && !folio_test_hugetlb(folio) && + !is_free_buddy_page(p)) return 0; /* @@ -42,7 +43,7 @@ static int hwpoison_inject(void *data, u64 val) * the targeted owner (or on a free page). * memory_failure() will redo the check reliably inside page lock. */ - err = hwpoison_filter(hpage); + err = hwpoison_filter(&folio->page); if (err) return 0; diff --git a/mm/internal.h b/mm/internal.h index 5d5e49b86fe3..6803c7b17c1f 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -1037,6 +1037,7 @@ static inline int find_next_best_node(int node, nodemask_t *used_node_mask) /* * mm/memory-failure.c */ +void shake_folio(struct folio *folio); extern int hwpoison_filter(struct page *p); extern u32 hwpoison_filter_dev_major; diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 12e5d2844cb1..4daf581e3878 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -369,20 +369,25 @@ static int kill_proc(struct to_kill *tk, unsigned long pfn, int flags) * Unknown page type encountered. Try to check whether it can turn PageLRU by * lru_add_drain_all. */ -void shake_page(struct page *p) +void shake_folio(struct folio *folio) { - if (PageHuge(p)) + if (folio_test_hugetlb(folio)) return; /* * TODO: Could shrink slab caches here if a lightweight range-based * shrinker will be available. */ - if (PageSlab(p)) + if (folio_test_slab(folio)) return; lru_add_drain_all(); } -EXPORT_SYMBOL_GPL(shake_page); +EXPORT_SYMBOL_GPL(shake_folio); + +static void shake_page(struct page *page) +{ + shake_folio(page_folio(page)); +} static unsigned long dev_pagemap_mapping_shift(struct vm_area_struct *vma, unsigned long address) @@ -1639,7 +1644,7 @@ static bool hwpoison_user_mappings(struct page *p, unsigned long pfn, * shake_page() again to ensure that it's flushed. */ if (mlocked) - shake_page(hpage); + shake_folio(folio); /* * Now that the dirty bit has been propagated to the -- cgit v1.2.3 From 6785c54a1b43c44ade9064ce46db4aa4d09e8cef Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 24 Apr 2024 20:19:07 +0100 Subject: mm: remove put_devmap_managed_page() It only has one caller; convert that caller to use put_devmap_managed_page_refs() instead. Link: https://lkml.kernel.org/r/20240424191914.361554-3-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/mm.h | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) (limited to 'include/linux/mm.h') diff --git a/include/linux/mm.h b/include/linux/mm.h index b9ac49c9eb00..deb51f8dc283 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1457,11 +1457,6 @@ static inline bool put_devmap_managed_page_refs(struct page *page, int refs) } #endif /* CONFIG_ZONE_DEVICE && CONFIG_FS_DAX */ -static inline bool put_devmap_managed_page(struct page *page) -{ - return put_devmap_managed_page_refs(page, 1); -} - /* 127: arbitrary random number, small enough to assemble well */ #define folio_ref_zero_or_close_to_overflow(folio) \ ((unsigned int) folio_ref_count(folio) + 127u <= 127u) @@ -1580,7 +1575,7 @@ static inline void put_page(struct page *page) * For some devmap managed pages we need to catch refcount transition * from 2 to 1: */ - if (put_devmap_managed_page(&folio->page)) + if (put_devmap_managed_page_refs(&folio->page, 1)) return; folio_put(folio); } -- cgit v1.2.3 From 53e45c4f6d4f6cd7e62f4cb016018ba31c2ac8b4 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 24 Apr 2024 20:19:08 +0100 Subject: mm: convert put_devmap_managed_page_refs() to put_devmap_managed_folio_refs() All callers have a folio so we can remove this use of page_ref_sub_return(). Link: https://lkml.kernel.org/r/20240424191914.361554-4-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/mm.h | 12 ++++++------ mm/gup.c | 6 +++--- mm/memremap.c | 10 +++++----- mm/swap.c | 2 +- 4 files changed, 15 insertions(+), 15 deletions(-) (limited to 'include/linux/mm.h') diff --git a/include/linux/mm.h b/include/linux/mm.h index deb51f8dc283..9849dfda44d4 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1441,17 +1441,17 @@ vm_fault_t finish_fault(struct vm_fault *vmf); #if defined(CONFIG_ZONE_DEVICE) && defined(CONFIG_FS_DAX) DECLARE_STATIC_KEY_FALSE(devmap_managed_key); -bool __put_devmap_managed_page_refs(struct page *page, int refs); -static inline bool put_devmap_managed_page_refs(struct page *page, int refs) +bool __put_devmap_managed_folio_refs(struct folio *folio, int refs); +static inline bool put_devmap_managed_folio_refs(struct folio *folio, int refs) { if (!static_branch_unlikely(&devmap_managed_key)) return false; - if (!is_zone_device_page(page)) + if (!folio_is_zone_device(folio)) return false; - return __put_devmap_managed_page_refs(page, refs); + return __put_devmap_managed_folio_refs(folio, refs); } #else /* CONFIG_ZONE_DEVICE && CONFIG_FS_DAX */ -static inline bool put_devmap_managed_page_refs(struct page *page, int refs) +static inline bool put_devmap_managed_folio_refs(struct folio *folio, int refs) { return false; } @@ -1575,7 +1575,7 @@ static inline void put_page(struct page *page) * For some devmap managed pages we need to catch refcount transition * from 2 to 1: */ - if (put_devmap_managed_page_refs(&folio->page, 1)) + if (put_devmap_managed_folio_refs(folio, 1)) return; folio_put(folio); } diff --git a/mm/gup.c b/mm/gup.c index 8dcbeae714e2..44c6ceee7e1f 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -89,7 +89,7 @@ retry: * belongs to this folio. */ if (unlikely(page_folio(page) != folio)) { - if (!put_devmap_managed_page_refs(&folio->page, refs)) + if (!put_devmap_managed_folio_refs(folio, refs)) folio_put_refs(folio, refs); goto retry; } @@ -156,7 +156,7 @@ struct folio *try_grab_folio(struct page *page, int refs, unsigned int flags) */ if (unlikely((flags & FOLL_LONGTERM) && !folio_is_longterm_pinnable(folio))) { - if (!put_devmap_managed_page_refs(&folio->page, refs)) + if (!put_devmap_managed_folio_refs(folio, refs)) folio_put_refs(folio, refs); return NULL; } @@ -198,7 +198,7 @@ static void gup_put_folio(struct folio *folio, int refs, unsigned int flags) refs *= GUP_PIN_COUNTING_BIAS; } - if (!put_devmap_managed_page_refs(&folio->page, refs)) + if (!put_devmap_managed_folio_refs(folio, refs)) folio_put_refs(folio, refs); } diff --git a/mm/memremap.c b/mm/memremap.c index e1776693e2ea..40d4547ce514 100644 --- a/mm/memremap.c +++ b/mm/memremap.c @@ -512,9 +512,9 @@ void zone_device_page_init(struct page *page) EXPORT_SYMBOL_GPL(zone_device_page_init); #ifdef CONFIG_FS_DAX -bool __put_devmap_managed_page_refs(struct page *page, int refs) +bool __put_devmap_managed_folio_refs(struct folio *folio, int refs) { - if (page->pgmap->type != MEMORY_DEVICE_FS_DAX) + if (folio->page.pgmap->type != MEMORY_DEVICE_FS_DAX) return false; /* @@ -522,9 +522,9 @@ bool __put_devmap_managed_page_refs(struct page *page, int refs) * refcount is 1, then the page is free and the refcount is * stable because nobody holds a reference on the page. */ - if (page_ref_sub_return(page, refs) == 1) - wake_up_var(&page->_refcount); + if (folio_ref_sub_return(folio, refs) == 1) + wake_up_var(&folio->_refcount); return true; } -EXPORT_SYMBOL(__put_devmap_managed_page_refs); +EXPORT_SYMBOL(__put_devmap_managed_folio_refs); #endif /* CONFIG_FS_DAX */ diff --git a/mm/swap.c b/mm/swap.c index 8ae5cd4ed180..bfce6d53b237 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -980,7 +980,7 @@ void folios_put_refs(struct folio_batch *folios, unsigned int *refs) unlock_page_lruvec_irqrestore(lruvec, flags); lruvec = NULL; } - if (put_devmap_managed_page_refs(&folio->page, nr_refs)) + if (put_devmap_managed_folio_refs(folio, nr_refs)) continue; if (folio_ref_sub_and_test(folio, nr_refs)) free_zone_device_folio(folio); -- cgit v1.2.3