summaryrefslogtreecommitdiff
path: root/include/linux/mm.h
diff options
context:
space:
mode:
Diffstat (limited to 'include/linux/mm.h')
-rw-r--r--include/linux/mm.h2017
1 files changed, 1341 insertions, 676 deletions
diff --git a/include/linux/mm.h b/include/linux/mm.h
index b1c3db9cf355..abb4963c1f06 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2,6 +2,7 @@
#ifndef _LINUX_MM_H
#define _LINUX_MM_H
+#include <linux/args.h>
#include <linux/errno.h>
#include <linux/mmdebug.h>
#include <linux/gfp.h>
@@ -12,6 +13,7 @@
#include <linux/rbtree.h>
#include <linux/atomic.h>
#include <linux/debug_locks.h>
+#include <linux/compiler.h>
#include <linux/mm_types.h>
#include <linux/mmap_lock.h>
#include <linux/range.h>
@@ -32,6 +34,10 @@
#include <linux/memremap.h>
#include <linux/slab.h>
#include <linux/cacheinfo.h>
+#include <linux/rcuwait.h>
+#include <linux/bitmap.h>
+#include <linux/bitops.h>
+#include <linux/iommu-debug-pagealloc.h>
struct mempolicy;
struct anon_vma;
@@ -40,22 +46,11 @@ struct user_struct;
struct pt_regs;
struct folio_batch;
-extern int sysctl_page_lock_unfairness;
-
+void arch_mm_preinit(void);
+void mm_core_init_early(void);
void mm_core_init(void);
void init_mm_internals(void);
-#ifndef CONFIG_NUMA /* Don't use mapnrs, do it properly */
-extern unsigned long max_mapnr;
-
-static inline void set_max_mapnr(unsigned long limit)
-{
- max_mapnr = limit;
-}
-#else
-static inline void set_max_mapnr(unsigned long limit) { }
-#endif
-
extern atomic_long_t _totalram_pages;
static inline unsigned long totalram_pages(void)
{
@@ -78,8 +73,15 @@ static inline void totalram_pages_add(long count)
}
extern void * high_memory;
-extern int page_cluster;
-extern const int page_cluster_max;
+
+/*
+ * Convert between pages and MB
+ * 20 is the shift for 1MB (2^20 = 1MB)
+ * PAGE_SHIFT is the shift for page size (e.g., 12 for 4KB pages)
+ * So (20 - PAGE_SHIFT) converts between pages and MB
+ */
+#define PAGES_TO_MB(pages) ((pages) >> (20 - PAGE_SHIFT))
+#define MB_TO_PAGES(mb) ((mb) << (20 - PAGE_SHIFT))
#ifdef CONFIG_SYSCTL
extern int sysctl_legacy_va_layout;
@@ -106,6 +108,8 @@ extern int mmap_rnd_compat_bits __read_mostly;
# endif
#endif
+#define INVALID_PHYS_ADDR (~(phys_addr_t)0)
+
#include <asm/page.h>
#include <asm/processor.h>
@@ -209,23 +213,14 @@ extern int sysctl_max_map_count;
extern unsigned long sysctl_user_reserve_kbytes;
extern unsigned long sysctl_admin_reserve_kbytes;
-extern int sysctl_overcommit_memory;
-extern int sysctl_overcommit_ratio;
-extern unsigned long sysctl_overcommit_kbytes;
-
-int overcommit_ratio_handler(const struct ctl_table *, int, void *, size_t *,
- loff_t *);
-int overcommit_kbytes_handler(const struct ctl_table *, int, void *, size_t *,
- loff_t *);
-int overcommit_policy_handler(const struct ctl_table *, int, void *, size_t *,
- loff_t *);
-
#if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP)
-#define nth_page(page,n) pfn_to_page(page_to_pfn((page)) + (n))
-#define folio_page_idx(folio, p) (page_to_pfn(p) - folio_pfn(folio))
+bool page_range_contiguous(const struct page *page, unsigned long nr_pages);
#else
-#define nth_page(page,n) ((page) + (n))
-#define folio_page_idx(folio, p) ((p) - &(folio)->page)
+static inline bool page_range_contiguous(const struct page *page,
+ unsigned long nr_pages)
+{
+ return true;
+}
#endif
/* to align the pointer to the (next) page boundary */
@@ -237,6 +232,20 @@ int overcommit_policy_handler(const struct ctl_table *, int, void *, size_t *,
/* test whether an address (unsigned long or pointer) is aligned to PAGE_SIZE */
#define PAGE_ALIGNED(addr) IS_ALIGNED((unsigned long)(addr), PAGE_SIZE)
+/**
+ * folio_page_idx - Return the number of a page in a folio.
+ * @folio: The folio.
+ * @page: The folio page.
+ *
+ * This function expects that the page is actually part of the folio.
+ * The returned number is relative to the start of the folio.
+ */
+static inline unsigned long folio_page_idx(const struct folio *folio,
+ const struct page *page)
+{
+ return page - &folio->page;
+}
+
static inline struct folio *lru_to_folio(struct list_head *head)
{
return list_entry((head)->prev, struct folio, lru);
@@ -257,8 +266,6 @@ void setup_initial_init_mm(void *start_code, void *end_code,
struct vm_area_struct *vm_area_alloc(struct mm_struct *);
struct vm_area_struct *vm_area_dup(struct vm_area_struct *);
void vm_area_free(struct vm_area_struct *);
-/* Use only if VMA has no other users */
-void __vm_area_free(struct vm_area_struct *vma);
#ifndef CONFIG_MMU
extern struct rb_root nommu_region_tree;
@@ -271,178 +278,236 @@ extern unsigned int kobjsize(const void *objp);
* vm_flags in vm_area_struct, see mm_types.h.
* When changing, update also include/trace/events/mmflags.h
*/
-#define VM_NONE 0x00000000
-#define VM_READ 0x00000001 /* currently active flags */
-#define VM_WRITE 0x00000002
-#define VM_EXEC 0x00000004
-#define VM_SHARED 0x00000008
+#define VM_NONE 0x00000000
-/* mprotect() hardcodes VM_MAYREAD >> 4 == VM_READ, and so for r/w/x bits. */
-#define VM_MAYREAD 0x00000010 /* limits for mprotect() etc */
-#define VM_MAYWRITE 0x00000020
-#define VM_MAYEXEC 0x00000040
-#define VM_MAYSHARE 0x00000080
+/**
+ * typedef vma_flag_t - specifies an individual VMA flag by bit number.
+ *
+ * This value is made type safe by sparse to avoid passing invalid flag values
+ * around.
+ */
+typedef int __bitwise vma_flag_t;
-#define VM_GROWSDOWN 0x00000100 /* general info on the segment */
+#define DECLARE_VMA_BIT(name, bitnum) \
+ VMA_ ## name ## _BIT = ((__force vma_flag_t)bitnum)
+#define DECLARE_VMA_BIT_ALIAS(name, aliased) \
+ VMA_ ## name ## _BIT = (VMA_ ## aliased ## _BIT)
+enum {
+ DECLARE_VMA_BIT(READ, 0),
+ DECLARE_VMA_BIT(WRITE, 1),
+ DECLARE_VMA_BIT(EXEC, 2),
+ DECLARE_VMA_BIT(SHARED, 3),
+ /* mprotect() hardcodes VM_MAYREAD >> 4 == VM_READ, and so for r/w/x bits. */
+ DECLARE_VMA_BIT(MAYREAD, 4), /* limits for mprotect() etc. */
+ DECLARE_VMA_BIT(MAYWRITE, 5),
+ DECLARE_VMA_BIT(MAYEXEC, 6),
+ DECLARE_VMA_BIT(MAYSHARE, 7),
+ DECLARE_VMA_BIT(GROWSDOWN, 8), /* general info on the segment */
#ifdef CONFIG_MMU
-#define VM_UFFD_MISSING 0x00000200 /* missing pages tracking */
-#else /* CONFIG_MMU */
-#define VM_MAYOVERLAY 0x00000200 /* nommu: R/O MAP_PRIVATE mapping that might overlay a file mapping */
-#define VM_UFFD_MISSING 0
+ DECLARE_VMA_BIT(UFFD_MISSING, 9),/* missing pages tracking */
+#else
+ /* nommu: R/O MAP_PRIVATE mapping that might overlay a file mapping */
+ DECLARE_VMA_BIT(MAYOVERLAY, 9),
#endif /* CONFIG_MMU */
-#define VM_PFNMAP 0x00000400 /* Page-ranges managed without "struct page", just pure PFN */
-#define VM_UFFD_WP 0x00001000 /* wrprotect pages tracking */
-
-#define VM_LOCKED 0x00002000
-#define VM_IO 0x00004000 /* Memory mapped I/O or similar */
-
- /* Used by sys_madvise() */
-#define VM_SEQ_READ 0x00008000 /* App will access data sequentially */
-#define VM_RAND_READ 0x00010000 /* App will not benefit from clustered reads */
-
-#define VM_DONTCOPY 0x00020000 /* Do not copy this vma on fork */
-#define VM_DONTEXPAND 0x00040000 /* Cannot expand with mremap() */
-#define VM_LOCKONFAULT 0x00080000 /* Lock the pages covered when they are faulted in */
-#define VM_ACCOUNT 0x00100000 /* Is a VM accounted object */
-#define VM_NORESERVE 0x00200000 /* should the VM suppress accounting */
-#define VM_HUGETLB 0x00400000 /* Huge TLB Page VM */
-#define VM_SYNC 0x00800000 /* Synchronous page faults */
-#define VM_ARCH_1 0x01000000 /* Architecture-specific flag */
-#define VM_WIPEONFORK 0x02000000 /* Wipe VMA contents in child. */
-#define VM_DONTDUMP 0x04000000 /* Do not include in the core dump */
-
+ /* Page-ranges managed without "struct page", just pure PFN */
+ DECLARE_VMA_BIT(PFNMAP, 10),
+ DECLARE_VMA_BIT(MAYBE_GUARD, 11),
+ DECLARE_VMA_BIT(UFFD_WP, 12), /* wrprotect pages tracking */
+ DECLARE_VMA_BIT(LOCKED, 13),
+ DECLARE_VMA_BIT(IO, 14), /* Memory mapped I/O or similar */
+ DECLARE_VMA_BIT(SEQ_READ, 15), /* App will access data sequentially */
+ DECLARE_VMA_BIT(RAND_READ, 16), /* App will not benefit from clustered reads */
+ DECLARE_VMA_BIT(DONTCOPY, 17), /* Do not copy this vma on fork */
+ DECLARE_VMA_BIT(DONTEXPAND, 18),/* Cannot expand with mremap() */
+ DECLARE_VMA_BIT(LOCKONFAULT, 19),/* Lock pages covered when faulted in */
+ DECLARE_VMA_BIT(ACCOUNT, 20), /* Is a VM accounted object */
+ DECLARE_VMA_BIT(NORESERVE, 21), /* should the VM suppress accounting */
+ DECLARE_VMA_BIT(HUGETLB, 22), /* Huge TLB Page VM */
+ DECLARE_VMA_BIT(SYNC, 23), /* Synchronous page faults */
+ DECLARE_VMA_BIT(ARCH_1, 24), /* Architecture-specific flag */
+ DECLARE_VMA_BIT(WIPEONFORK, 25),/* Wipe VMA contents in child. */
+ DECLARE_VMA_BIT(DONTDUMP, 26), /* Do not include in the core dump */
+ DECLARE_VMA_BIT(SOFTDIRTY, 27), /* NOT soft dirty clean area */
+ DECLARE_VMA_BIT(MIXEDMAP, 28), /* Can contain struct page and pure PFN pages */
+ DECLARE_VMA_BIT(HUGEPAGE, 29), /* MADV_HUGEPAGE marked this vma */
+ DECLARE_VMA_BIT(NOHUGEPAGE, 30),/* MADV_NOHUGEPAGE marked this vma */
+ DECLARE_VMA_BIT(MERGEABLE, 31), /* KSM may merge identical pages */
+ /* These bits are reused, we define specific uses below. */
+ DECLARE_VMA_BIT(HIGH_ARCH_0, 32),
+ DECLARE_VMA_BIT(HIGH_ARCH_1, 33),
+ DECLARE_VMA_BIT(HIGH_ARCH_2, 34),
+ DECLARE_VMA_BIT(HIGH_ARCH_3, 35),
+ DECLARE_VMA_BIT(HIGH_ARCH_4, 36),
+ DECLARE_VMA_BIT(HIGH_ARCH_5, 37),
+ DECLARE_VMA_BIT(HIGH_ARCH_6, 38),
+ /*
+ * This flag is used to connect VFIO to arch specific KVM code. It
+ * indicates that the memory under this VMA is safe for use with any
+ * non-cachable memory type inside KVM. Some VFIO devices, on some
+ * platforms, are thought to be unsafe and can cause machine crashes
+ * if KVM does not lock down the memory type.
+ */
+ DECLARE_VMA_BIT(ALLOW_ANY_UNCACHED, 39),
+#ifdef CONFIG_PPC32
+ DECLARE_VMA_BIT_ALIAS(DROPPABLE, ARCH_1),
+#else
+ DECLARE_VMA_BIT(DROPPABLE, 40),
+#endif
+ DECLARE_VMA_BIT(UFFD_MINOR, 41),
+ DECLARE_VMA_BIT(SEALED, 42),
+ /* Flags that reuse flags above. */
+ DECLARE_VMA_BIT_ALIAS(PKEY_BIT0, HIGH_ARCH_0),
+ DECLARE_VMA_BIT_ALIAS(PKEY_BIT1, HIGH_ARCH_1),
+ DECLARE_VMA_BIT_ALIAS(PKEY_BIT2, HIGH_ARCH_2),
+ DECLARE_VMA_BIT_ALIAS(PKEY_BIT3, HIGH_ARCH_3),
+ DECLARE_VMA_BIT_ALIAS(PKEY_BIT4, HIGH_ARCH_4),
+#if defined(CONFIG_X86_USER_SHADOW_STACK) || defined(CONFIG_RISCV_USER_CFI)
+ /*
+ * VM_SHADOW_STACK should not be set with VM_SHARED because of lack of
+ * support core mm.
+ *
+ * These VMAs will get a single end guard page. This helps userspace
+ * protect itself from attacks. A single page is enough for current
+ * shadow stack archs (x86). See the comments near alloc_shstk() in
+ * arch/x86/kernel/shstk.c for more details on the guard size.
+ */
+ DECLARE_VMA_BIT_ALIAS(SHADOW_STACK, HIGH_ARCH_5),
+#elif defined(CONFIG_ARM64_GCS)
+ /*
+ * arm64's Guarded Control Stack implements similar functionality and
+ * has similar constraints to shadow stacks.
+ */
+ DECLARE_VMA_BIT_ALIAS(SHADOW_STACK, HIGH_ARCH_6),
+#endif
+ DECLARE_VMA_BIT_ALIAS(SAO, ARCH_1), /* Strong Access Ordering (powerpc) */
+ DECLARE_VMA_BIT_ALIAS(GROWSUP, ARCH_1), /* parisc */
+ DECLARE_VMA_BIT_ALIAS(SPARC_ADI, ARCH_1), /* sparc64 */
+ DECLARE_VMA_BIT_ALIAS(ARM64_BTI, ARCH_1), /* arm64 */
+ DECLARE_VMA_BIT_ALIAS(ARCH_CLEAR, ARCH_1), /* sparc64, arm64 */
+ DECLARE_VMA_BIT_ALIAS(MAPPED_COPY, ARCH_1), /* !CONFIG_MMU */
+ DECLARE_VMA_BIT_ALIAS(MTE, HIGH_ARCH_4), /* arm64 */
+ DECLARE_VMA_BIT_ALIAS(MTE_ALLOWED, HIGH_ARCH_5),/* arm64 */
+#ifdef CONFIG_STACK_GROWSUP
+ DECLARE_VMA_BIT_ALIAS(STACK, GROWSUP),
+ DECLARE_VMA_BIT_ALIAS(STACK_EARLY, GROWSDOWN),
+#else
+ DECLARE_VMA_BIT_ALIAS(STACK, GROWSDOWN),
+#endif
+};
+#undef DECLARE_VMA_BIT
+#undef DECLARE_VMA_BIT_ALIAS
+
+#define INIT_VM_FLAG(name) BIT((__force int) VMA_ ## name ## _BIT)
+#define VM_READ INIT_VM_FLAG(READ)
+#define VM_WRITE INIT_VM_FLAG(WRITE)
+#define VM_EXEC INIT_VM_FLAG(EXEC)
+#define VM_SHARED INIT_VM_FLAG(SHARED)
+#define VM_MAYREAD INIT_VM_FLAG(MAYREAD)
+#define VM_MAYWRITE INIT_VM_FLAG(MAYWRITE)
+#define VM_MAYEXEC INIT_VM_FLAG(MAYEXEC)
+#define VM_MAYSHARE INIT_VM_FLAG(MAYSHARE)
+#define VM_GROWSDOWN INIT_VM_FLAG(GROWSDOWN)
+#ifdef CONFIG_MMU
+#define VM_UFFD_MISSING INIT_VM_FLAG(UFFD_MISSING)
+#else
+#define VM_UFFD_MISSING VM_NONE
+#define VM_MAYOVERLAY INIT_VM_FLAG(MAYOVERLAY)
+#endif
+#define VM_PFNMAP INIT_VM_FLAG(PFNMAP)
+#define VM_MAYBE_GUARD INIT_VM_FLAG(MAYBE_GUARD)
+#define VM_UFFD_WP INIT_VM_FLAG(UFFD_WP)
+#define VM_LOCKED INIT_VM_FLAG(LOCKED)
+#define VM_IO INIT_VM_FLAG(IO)
+#define VM_SEQ_READ INIT_VM_FLAG(SEQ_READ)
+#define VM_RAND_READ INIT_VM_FLAG(RAND_READ)
+#define VM_DONTCOPY INIT_VM_FLAG(DONTCOPY)
+#define VM_DONTEXPAND INIT_VM_FLAG(DONTEXPAND)
+#define VM_LOCKONFAULT INIT_VM_FLAG(LOCKONFAULT)
+#define VM_ACCOUNT INIT_VM_FLAG(ACCOUNT)
+#define VM_NORESERVE INIT_VM_FLAG(NORESERVE)
+#define VM_HUGETLB INIT_VM_FLAG(HUGETLB)
+#define VM_SYNC INIT_VM_FLAG(SYNC)
+#define VM_ARCH_1 INIT_VM_FLAG(ARCH_1)
+#define VM_WIPEONFORK INIT_VM_FLAG(WIPEONFORK)
+#define VM_DONTDUMP INIT_VM_FLAG(DONTDUMP)
#ifdef CONFIG_MEM_SOFT_DIRTY
-# define VM_SOFTDIRTY 0x08000000 /* Not soft dirty clean area */
+#define VM_SOFTDIRTY INIT_VM_FLAG(SOFTDIRTY)
#else
-# define VM_SOFTDIRTY 0
+#define VM_SOFTDIRTY VM_NONE
+#endif
+#define VM_MIXEDMAP INIT_VM_FLAG(MIXEDMAP)
+#define VM_HUGEPAGE INIT_VM_FLAG(HUGEPAGE)
+#define VM_NOHUGEPAGE INIT_VM_FLAG(NOHUGEPAGE)
+#define VM_MERGEABLE INIT_VM_FLAG(MERGEABLE)
+#define VM_STACK INIT_VM_FLAG(STACK)
+#ifdef CONFIG_STACK_GROWSUP
+#define VM_STACK_EARLY INIT_VM_FLAG(STACK_EARLY)
+#else
+#define VM_STACK_EARLY VM_NONE
#endif
-
-#define VM_MIXEDMAP 0x10000000 /* Can contain "struct page" and pure PFN pages */
-#define VM_HUGEPAGE 0x20000000 /* MADV_HUGEPAGE marked this vma */
-#define VM_NOHUGEPAGE 0x40000000 /* MADV_NOHUGEPAGE marked this vma */
-#define VM_MERGEABLE 0x80000000 /* KSM may merge identical pages */
-
-#ifdef CONFIG_ARCH_USES_HIGH_VMA_FLAGS
-#define VM_HIGH_ARCH_BIT_0 32 /* bit only usable on 64-bit architectures */
-#define VM_HIGH_ARCH_BIT_1 33 /* bit only usable on 64-bit architectures */
-#define VM_HIGH_ARCH_BIT_2 34 /* bit only usable on 64-bit architectures */
-#define VM_HIGH_ARCH_BIT_3 35 /* bit only usable on 64-bit architectures */
-#define VM_HIGH_ARCH_BIT_4 36 /* bit only usable on 64-bit architectures */
-#define VM_HIGH_ARCH_BIT_5 37 /* bit only usable on 64-bit architectures */
-#define VM_HIGH_ARCH_BIT_6 38 /* bit only usable on 64-bit architectures */
-#define VM_HIGH_ARCH_0 BIT(VM_HIGH_ARCH_BIT_0)
-#define VM_HIGH_ARCH_1 BIT(VM_HIGH_ARCH_BIT_1)
-#define VM_HIGH_ARCH_2 BIT(VM_HIGH_ARCH_BIT_2)
-#define VM_HIGH_ARCH_3 BIT(VM_HIGH_ARCH_BIT_3)
-#define VM_HIGH_ARCH_4 BIT(VM_HIGH_ARCH_BIT_4)
-#define VM_HIGH_ARCH_5 BIT(VM_HIGH_ARCH_BIT_5)
-#define VM_HIGH_ARCH_6 BIT(VM_HIGH_ARCH_BIT_6)
-#endif /* CONFIG_ARCH_USES_HIGH_VMA_FLAGS */
-
#ifdef CONFIG_ARCH_HAS_PKEYS
-# define VM_PKEY_SHIFT VM_HIGH_ARCH_BIT_0
-# define VM_PKEY_BIT0 VM_HIGH_ARCH_0
-# define VM_PKEY_BIT1 VM_HIGH_ARCH_1
-# define VM_PKEY_BIT2 VM_HIGH_ARCH_2
+#define VM_PKEY_SHIFT ((__force int)VMA_HIGH_ARCH_0_BIT)
+/* Despite the naming, these are FLAGS not bits. */
+#define VM_PKEY_BIT0 INIT_VM_FLAG(PKEY_BIT0)
+#define VM_PKEY_BIT1 INIT_VM_FLAG(PKEY_BIT1)
+#define VM_PKEY_BIT2 INIT_VM_FLAG(PKEY_BIT2)
#if CONFIG_ARCH_PKEY_BITS > 3
-# define VM_PKEY_BIT3 VM_HIGH_ARCH_3
+#define VM_PKEY_BIT3 INIT_VM_FLAG(PKEY_BIT3)
#else
-# define VM_PKEY_BIT3 0
-#endif
+#define VM_PKEY_BIT3 VM_NONE
+#endif /* CONFIG_ARCH_PKEY_BITS > 3 */
#if CONFIG_ARCH_PKEY_BITS > 4
-# define VM_PKEY_BIT4 VM_HIGH_ARCH_4
+#define VM_PKEY_BIT4 INIT_VM_FLAG(PKEY_BIT4)
#else
-# define VM_PKEY_BIT4 0
-#endif
+#define VM_PKEY_BIT4 VM_NONE
+#endif /* CONFIG_ARCH_PKEY_BITS > 4 */
#endif /* CONFIG_ARCH_HAS_PKEYS */
-
-#ifdef CONFIG_X86_USER_SHADOW_STACK
-/*
- * VM_SHADOW_STACK should not be set with VM_SHARED because of lack of
- * support core mm.
- *
- * These VMAs will get a single end guard page. This helps userspace protect
- * itself from attacks. A single page is enough for current shadow stack archs
- * (x86). See the comments near alloc_shstk() in arch/x86/kernel/shstk.c
- * for more details on the guard size.
- */
-# define VM_SHADOW_STACK VM_HIGH_ARCH_5
-#endif
-
-#if defined(CONFIG_ARM64_GCS)
-/*
- * arm64's Guarded Control Stack implements similar functionality and
- * has similar constraints to shadow stacks.
- */
-# define VM_SHADOW_STACK VM_HIGH_ARCH_6
-#endif
-
-#ifndef VM_SHADOW_STACK
-# define VM_SHADOW_STACK VM_NONE
+#if defined(CONFIG_X86_USER_SHADOW_STACK) || defined(CONFIG_ARM64_GCS) || \
+ defined(CONFIG_RISCV_USER_CFI)
+#define VM_SHADOW_STACK INIT_VM_FLAG(SHADOW_STACK)
+#else
+#define VM_SHADOW_STACK VM_NONE
#endif
-
-#if defined(CONFIG_X86)
-# define VM_PAT VM_ARCH_1 /* PAT reserves whole VMA at once (x86) */
-#elif defined(CONFIG_PPC64)
-# define VM_SAO VM_ARCH_1 /* Strong Access Ordering (powerpc) */
+#if defined(CONFIG_PPC64)
+#define VM_SAO INIT_VM_FLAG(SAO)
#elif defined(CONFIG_PARISC)
-# define VM_GROWSUP VM_ARCH_1
+#define VM_GROWSUP INIT_VM_FLAG(GROWSUP)
#elif defined(CONFIG_SPARC64)
-# define VM_SPARC_ADI VM_ARCH_1 /* Uses ADI tag for access control */
-# define VM_ARCH_CLEAR VM_SPARC_ADI
+#define VM_SPARC_ADI INIT_VM_FLAG(SPARC_ADI)
+#define VM_ARCH_CLEAR INIT_VM_FLAG(ARCH_CLEAR)
#elif defined(CONFIG_ARM64)
-# define VM_ARM64_BTI VM_ARCH_1 /* BTI guarded page, a.k.a. GP bit */
-# define VM_ARCH_CLEAR VM_ARM64_BTI
+#define VM_ARM64_BTI INIT_VM_FLAG(ARM64_BTI)
+#define VM_ARCH_CLEAR INIT_VM_FLAG(ARCH_CLEAR)
#elif !defined(CONFIG_MMU)
-# define VM_MAPPED_COPY VM_ARCH_1 /* T if mapped copy of data (nommu mmap) */
+#define VM_MAPPED_COPY INIT_VM_FLAG(MAPPED_COPY)
#endif
-
-#if defined(CONFIG_ARM64_MTE)
-# define VM_MTE VM_HIGH_ARCH_4 /* Use Tagged memory for access control */
-# define VM_MTE_ALLOWED VM_HIGH_ARCH_5 /* Tagged memory permitted */
-#else
-# define VM_MTE VM_NONE
-# define VM_MTE_ALLOWED VM_NONE
-#endif
-
#ifndef VM_GROWSUP
-# define VM_GROWSUP VM_NONE
+#define VM_GROWSUP VM_NONE
+#endif
+#ifdef CONFIG_ARM64_MTE
+#define VM_MTE INIT_VM_FLAG(MTE)
+#define VM_MTE_ALLOWED INIT_VM_FLAG(MTE_ALLOWED)
+#else
+#define VM_MTE VM_NONE
+#define VM_MTE_ALLOWED VM_NONE
#endif
-
#ifdef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR
-# define VM_UFFD_MINOR_BIT 38
-# define VM_UFFD_MINOR BIT(VM_UFFD_MINOR_BIT) /* UFFD minor faults */
-#else /* !CONFIG_HAVE_ARCH_USERFAULTFD_MINOR */
-# define VM_UFFD_MINOR VM_NONE
-#endif /* CONFIG_HAVE_ARCH_USERFAULTFD_MINOR */
-
-/*
- * This flag is used to connect VFIO to arch specific KVM code. It
- * indicates that the memory under this VMA is safe for use with any
- * non-cachable memory type inside KVM. Some VFIO devices, on some
- * platforms, are thought to be unsafe and can cause machine crashes
- * if KVM does not lock down the memory type.
- */
-#ifdef CONFIG_64BIT
-#define VM_ALLOW_ANY_UNCACHED_BIT 39
-#define VM_ALLOW_ANY_UNCACHED BIT(VM_ALLOW_ANY_UNCACHED_BIT)
+#define VM_UFFD_MINOR INIT_VM_FLAG(UFFD_MINOR)
#else
-#define VM_ALLOW_ANY_UNCACHED VM_NONE
+#define VM_UFFD_MINOR VM_NONE
#endif
-
#ifdef CONFIG_64BIT
-#define VM_DROPPABLE_BIT 40
-#define VM_DROPPABLE BIT(VM_DROPPABLE_BIT)
-#elif defined(CONFIG_PPC32)
-#define VM_DROPPABLE VM_ARCH_1
+#define VM_ALLOW_ANY_UNCACHED INIT_VM_FLAG(ALLOW_ANY_UNCACHED)
+#define VM_SEALED INIT_VM_FLAG(SEALED)
#else
-#define VM_DROPPABLE VM_NONE
+#define VM_ALLOW_ANY_UNCACHED VM_NONE
+#define VM_SEALED VM_NONE
#endif
-
-#ifdef CONFIG_64BIT
-/* VM is sealed, in vm_flags */
-#define VM_SEALED _BITUL(63)
+#if defined(CONFIG_64BIT) || defined(CONFIG_PPC32)
+#define VM_DROPPABLE INIT_VM_FLAG(DROPPABLE)
+#else
+#define VM_DROPPABLE VM_NONE
#endif
/* Bits set in the VMA until the stack is in its final location */
@@ -468,12 +533,10 @@ extern unsigned int kobjsize(const void *objp);
#define VM_STARTGAP_FLAGS (VM_GROWSDOWN | VM_SHADOW_STACK)
-#ifdef CONFIG_STACK_GROWSUP
-#define VM_STACK VM_GROWSUP
-#define VM_STACK_EARLY VM_GROWSDOWN
+#ifdef CONFIG_MSEAL_SYSTEM_MAPPINGS
+#define VM_SEALED_SYSMAP VM_SEALED
#else
-#define VM_STACK VM_GROWSDOWN
-#define VM_STACK_EARLY 0
+#define VM_SEALED_SYSMAP VM_NONE
#endif
#define VM_STACK_FLAGS (VM_STACK | VM_STACK_DEFAULT_FLAGS | VM_ACCOUNT)
@@ -481,12 +544,27 @@ extern unsigned int kobjsize(const void *objp);
/* VMA basic access permission flags */
#define VM_ACCESS_FLAGS (VM_READ | VM_WRITE | VM_EXEC)
-
/*
* Special vmas that are non-mergable, non-mlock()able.
*/
#define VM_SPECIAL (VM_IO | VM_DONTEXPAND | VM_PFNMAP | VM_MIXEDMAP)
+/*
+ * Physically remapped pages are special. Tell the
+ * rest of the world about it:
+ * IO tells people not to look at these pages
+ * (accesses can have side effects).
+ * PFNMAP tells the core MM that the base pages are just
+ * raw PFN mappings, and do not have a "struct page" associated
+ * with them.
+ * DONTEXPAND
+ * Disable vma merging and expanding with mremap().
+ * DONTDUMP
+ * Omit vma from core dump, even when VM_IO turned off.
+ */
+#define VMA_REMAP_FLAGS mk_vma_flags(VMA_IO_BIT, VMA_PFNMAP_BIT, \
+ VMA_DONTEXPAND_BIT, VMA_DONTDUMP_BIT)
+
/* This mask prevents VMA from being scanned with khugepaged */
#define VM_NO_KHUGEPAGED (VM_SPECIAL | VM_HUGETLB)
@@ -496,13 +574,73 @@ extern unsigned int kobjsize(const void *objp);
/* This mask represents all the VMA flag bits used by mlock */
#define VM_LOCKED_MASK (VM_LOCKED | VM_LOCKONFAULT)
+/* These flags can be updated atomically via VMA/mmap read lock. */
+#define VM_ATOMIC_SET_ALLOWED VM_MAYBE_GUARD
+
/* Arch-specific flags to clear when updating VM flags on protection change */
#ifndef VM_ARCH_CLEAR
-# define VM_ARCH_CLEAR VM_NONE
+#define VM_ARCH_CLEAR VM_NONE
#endif
#define VM_FLAGS_CLEAR (ARCH_VM_PKEY_FLAGS | VM_ARCH_CLEAR)
/*
+ * Flags which should be 'sticky' on merge - that is, flags which, when one VMA
+ * possesses it but the other does not, the merged VMA should nonetheless have
+ * applied to it:
+ *
+ * VM_SOFTDIRTY - if a VMA is marked soft-dirty, that is has not had its
+ * references cleared via /proc/$pid/clear_refs, any merged VMA
+ * should be considered soft-dirty also as it operates at a VMA
+ * granularity.
+ *
+ * VM_MAYBE_GUARD - If a VMA may have guard regions in place it implies that
+ * mapped page tables may contain metadata not described by the
+ * VMA and thus any merged VMA may also contain this metadata,
+ * and thus we must make this flag sticky.
+ */
+#define VM_STICKY (VM_SOFTDIRTY | VM_MAYBE_GUARD)
+
+/*
+ * VMA flags we ignore for the purposes of merge, i.e. one VMA possessing one
+ * of these flags and the other not does not preclude a merge.
+ *
+ * VM_STICKY - When merging VMAs, VMA flags must match, unless they are
+ * 'sticky'. If any sticky flags exist in either VMA, we simply
+ * set all of them on the merged VMA.
+ */
+#define VM_IGNORE_MERGE VM_STICKY
+
+/*
+ * Flags which should result in page tables being copied on fork. These are
+ * flags which indicate that the VMA maps page tables which cannot be
+ * reconsistuted upon page fault, so necessitate page table copying upon fork.
+ *
+ * Note that these flags should be compared with the DESTINATION VMA not the
+ * source, as VM_UFFD_WP may not be propagated to destination, while all other
+ * flags will be.
+ *
+ * VM_PFNMAP / VM_MIXEDMAP - These contain kernel-mapped data which cannot be
+ * reasonably reconstructed on page fault.
+ *
+ * VM_UFFD_WP - Encodes metadata about an installed uffd
+ * write protect handler, which cannot be
+ * reconstructed on page fault.
+ *
+ * We always copy pgtables when dst_vma has uffd-wp
+ * enabled even if it's file-backed
+ * (e.g. shmem). Because when uffd-wp is enabled,
+ * pgtable contains uffd-wp protection information,
+ * that's something we can't retrieve from page cache,
+ * and skip copying will lose those info.
+ *
+ * VM_MAYBE_GUARD - Could contain page guard region markers which
+ * by design are a property of the page tables
+ * only and thus cannot be reconstructed on page
+ * fault.
+ */
+#define VM_COPY_ON_FORK (VM_PFNMAP | VM_MIXEDMAP | VM_UFFD_WP | VM_MAYBE_GUARD)
+
+/*
* mapping from the currently active vm_flags protection bits (the
* low four bits) to a page protection mask..
*/
@@ -673,13 +811,21 @@ struct vm_operations_struct {
struct mempolicy *(*get_policy)(struct vm_area_struct *vma,
unsigned long addr, pgoff_t *ilx);
#endif
+#ifdef CONFIG_FIND_NORMAL_PAGE
/*
- * Called by vm_normal_page() for special PTEs to find the
- * page for @addr. This is useful if the default behavior
- * (using pte_page()) would not find the correct page.
+ * Called by vm_normal_page() for special PTEs in @vma at @addr. This
+ * allows for returning a "normal" page from vm_normal_page() even
+ * though the PTE indicates that the "struct page" either does not exist
+ * or should not be touched: "special".
+ *
+ * Do not add new users: this really only works when a "normal" page
+ * was mapped, but then the PTE got changed to something weird (+
+ * marked special) that would not make pte_pfn() identify the originally
+ * inserted page.
*/
- struct page *(*find_special_page)(struct vm_area_struct *vma,
- unsigned long addr);
+ struct page *(*find_normal_page)(struct vm_area_struct *vma,
+ unsigned long addr);
+#endif /* CONFIG_FIND_NORMAL_PAGE */
};
#ifdef CONFIG_NUMA_BALANCING
@@ -696,109 +842,11 @@ static inline void vma_numab_state_init(struct vm_area_struct *vma) {}
static inline void vma_numab_state_free(struct vm_area_struct *vma) {}
#endif /* CONFIG_NUMA_BALANCING */
-#ifdef CONFIG_PER_VMA_LOCK
/*
- * Try to read-lock a vma. The function is allowed to occasionally yield false
- * locked result to avoid performance overhead, in which case we fall back to
- * using mmap_lock. The function should never yield false unlocked result.
+ * These must be here rather than mmap_lock.h as dependent on vm_fault type,
+ * declared in this header.
*/
-static inline bool vma_start_read(struct vm_area_struct *vma)
-{
- /*
- * Check before locking. A race might cause false locked result.
- * We can use READ_ONCE() for the mm_lock_seq here, and don't need
- * ACQUIRE semantics, because this is just a lockless check whose result
- * we don't rely on for anything - the mm_lock_seq read against which we
- * need ordering is below.
- */
- if (READ_ONCE(vma->vm_lock_seq) == READ_ONCE(vma->vm_mm->mm_lock_seq))
- return false;
-
- if (unlikely(down_read_trylock(&vma->vm_lock->lock) == 0))
- return false;
-
- /*
- * Overflow might produce false locked result.
- * False unlocked result is impossible because we modify and check
- * vma->vm_lock_seq under vma->vm_lock protection and mm->mm_lock_seq
- * modification invalidates all existing locks.
- *
- * We must use ACQUIRE semantics for the mm_lock_seq so that if we are
- * racing with vma_end_write_all(), we only start reading from the VMA
- * after it has been unlocked.
- * This pairs with RELEASE semantics in vma_end_write_all().
- */
- if (unlikely(vma->vm_lock_seq == smp_load_acquire(&vma->vm_mm->mm_lock_seq))) {
- up_read(&vma->vm_lock->lock);
- return false;
- }
- return true;
-}
-
-static inline void vma_end_read(struct vm_area_struct *vma)
-{
- rcu_read_lock(); /* keeps vma alive till the end of up_read */
- up_read(&vma->vm_lock->lock);
- rcu_read_unlock();
-}
-
-/* WARNING! Can only be used if mmap_lock is expected to be write-locked */
-static bool __is_vma_write_locked(struct vm_area_struct *vma, int *mm_lock_seq)
-{
- mmap_assert_write_locked(vma->vm_mm);
-
- /*
- * current task is holding mmap_write_lock, both vma->vm_lock_seq and
- * mm->mm_lock_seq can't be concurrently modified.
- */
- *mm_lock_seq = vma->vm_mm->mm_lock_seq;
- return (vma->vm_lock_seq == *mm_lock_seq);
-}
-
-/*
- * Begin writing to a VMA.
- * Exclude concurrent readers under the per-VMA lock until the currently
- * write-locked mmap_lock is dropped or downgraded.
- */
-static inline void vma_start_write(struct vm_area_struct *vma)
-{
- int mm_lock_seq;
-
- if (__is_vma_write_locked(vma, &mm_lock_seq))
- return;
-
- down_write(&vma->vm_lock->lock);
- /*
- * We should use WRITE_ONCE() here because we can have concurrent reads
- * from the early lockless pessimistic check in vma_start_read().
- * We don't really care about the correctness of that early check, but
- * we should use WRITE_ONCE() for cleanliness and to keep KCSAN happy.
- */
- WRITE_ONCE(vma->vm_lock_seq, mm_lock_seq);
- up_write(&vma->vm_lock->lock);
-}
-
-static inline void vma_assert_write_locked(struct vm_area_struct *vma)
-{
- int mm_lock_seq;
-
- VM_BUG_ON_VMA(!__is_vma_write_locked(vma, &mm_lock_seq), vma);
-}
-
-static inline void vma_assert_locked(struct vm_area_struct *vma)
-{
- if (!rwsem_is_locked(&vma->vm_lock->lock))
- vma_assert_write_locked(vma);
-}
-
-static inline void vma_mark_detached(struct vm_area_struct *vma, bool detached)
-{
- /* When detaching vma should be write-locked */
- if (detached)
- vma_assert_write_locked(vma);
- vma->detached = detached;
-}
-
+#ifdef CONFIG_PER_VMA_LOCK
static inline void release_fault_lock(struct vm_fault *vmf)
{
if (vmf->flags & FAULT_FLAG_VMA_LOCK)
@@ -807,72 +855,73 @@ static inline void release_fault_lock(struct vm_fault *vmf)
mmap_read_unlock(vmf->vma->vm_mm);
}
-static inline void assert_fault_locked(struct vm_fault *vmf)
+static inline void assert_fault_locked(const struct vm_fault *vmf)
{
if (vmf->flags & FAULT_FLAG_VMA_LOCK)
vma_assert_locked(vmf->vma);
else
mmap_assert_locked(vmf->vma->vm_mm);
}
+#else
+static inline void release_fault_lock(struct vm_fault *vmf)
+{
+ mmap_read_unlock(vmf->vma->vm_mm);
+}
-struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm,
- unsigned long address);
-
-#else /* CONFIG_PER_VMA_LOCK */
+static inline void assert_fault_locked(const struct vm_fault *vmf)
+{
+ mmap_assert_locked(vmf->vma->vm_mm);
+}
+#endif /* CONFIG_PER_VMA_LOCK */
-static inline bool vma_start_read(struct vm_area_struct *vma)
- { return false; }
-static inline void vma_end_read(struct vm_area_struct *vma) {}
-static inline void vma_start_write(struct vm_area_struct *vma) {}
-static inline void vma_assert_write_locked(struct vm_area_struct *vma)
- { mmap_assert_write_locked(vma->vm_mm); }
-static inline void vma_mark_detached(struct vm_area_struct *vma,
- bool detached) {}
+static inline bool mm_flags_test(int flag, const struct mm_struct *mm)
+{
+ return test_bit(flag, ACCESS_PRIVATE(&mm->flags, __mm_flags));
+}
-static inline struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm,
- unsigned long address)
+static inline bool mm_flags_test_and_set(int flag, struct mm_struct *mm)
{
- return NULL;
+ return test_and_set_bit(flag, ACCESS_PRIVATE(&mm->flags, __mm_flags));
}
-static inline void vma_assert_locked(struct vm_area_struct *vma)
+static inline bool mm_flags_test_and_clear(int flag, struct mm_struct *mm)
{
- mmap_assert_locked(vma->vm_mm);
+ return test_and_clear_bit(flag, ACCESS_PRIVATE(&mm->flags, __mm_flags));
}
-static inline void release_fault_lock(struct vm_fault *vmf)
+static inline void mm_flags_set(int flag, struct mm_struct *mm)
{
- mmap_read_unlock(vmf->vma->vm_mm);
+ set_bit(flag, ACCESS_PRIVATE(&mm->flags, __mm_flags));
}
-static inline void assert_fault_locked(struct vm_fault *vmf)
+static inline void mm_flags_clear(int flag, struct mm_struct *mm)
{
- mmap_assert_locked(vmf->vma->vm_mm);
+ clear_bit(flag, ACCESS_PRIVATE(&mm->flags, __mm_flags));
}
-#endif /* CONFIG_PER_VMA_LOCK */
+static inline void mm_flags_clear_all(struct mm_struct *mm)
+{
+ bitmap_zero(ACCESS_PRIVATE(&mm->flags, __mm_flags), NUM_MM_FLAG_BITS);
+}
extern const struct vm_operations_struct vma_dummy_vm_ops;
-/*
- * WARNING: vma_init does not initialize vma->vm_lock.
- * Use vm_area_alloc()/vm_area_free() if vma needs locking.
- */
static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm)
{
memset(vma, 0, sizeof(*vma));
vma->vm_mm = mm;
vma->vm_ops = &vma_dummy_vm_ops;
INIT_LIST_HEAD(&vma->anon_vma_chain);
- vma_mark_detached(vma, false);
- vma_numab_state_init(vma);
+ vma_lock_init(vma, false);
}
/* Use when VMA is not part of the VMA tree and needs no locking */
static inline void vm_flags_init(struct vm_area_struct *vma,
vm_flags_t flags)
{
- ACCESS_PRIVATE(vma, __vm_flags) = flags;
+ VM_WARN_ON_ONCE(!pgtable_supports_soft_dirty() && (flags & VM_SOFTDIRTY));
+ vma_flags_clear_all(&vma->flags);
+ vma_flags_overwrite_word(&vma->flags, flags);
}
/*
@@ -883,6 +932,7 @@ static inline void vm_flags_init(struct vm_area_struct *vma,
static inline void vm_flags_reset(struct vm_area_struct *vma,
vm_flags_t flags)
{
+ VM_WARN_ON_ONCE(!pgtable_supports_soft_dirty() && (flags & VM_SOFTDIRTY));
vma_assert_write_locked(vma);
vm_flags_init(vma, flags);
}
@@ -891,21 +941,33 @@ static inline void vm_flags_reset_once(struct vm_area_struct *vma,
vm_flags_t flags)
{
vma_assert_write_locked(vma);
- WRITE_ONCE(ACCESS_PRIVATE(vma, __vm_flags), flags);
+ /*
+ * If VMA flags exist beyond the first system word, also clear these. It
+ * is assumed the write once behaviour is required only for the first
+ * system word.
+ */
+ if (NUM_VMA_FLAG_BITS > BITS_PER_LONG) {
+ unsigned long *bitmap = vma->flags.__vma_flags;
+
+ bitmap_zero(&bitmap[1], NUM_VMA_FLAG_BITS - BITS_PER_LONG);
+ }
+
+ vma_flags_overwrite_word_once(&vma->flags, flags);
}
static inline void vm_flags_set(struct vm_area_struct *vma,
vm_flags_t flags)
{
vma_start_write(vma);
- ACCESS_PRIVATE(vma, __vm_flags) |= flags;
+ vma_flags_set_word(&vma->flags, flags);
}
static inline void vm_flags_clear(struct vm_area_struct *vma,
vm_flags_t flags)
{
+ VM_WARN_ON_ONCE(!pgtable_supports_soft_dirty() && (flags & VM_SOFTDIRTY));
vma_start_write(vma);
- ACCESS_PRIVATE(vma, __vm_flags) &= ~flags;
+ vma_flags_clear_word(&vma->flags, flags);
}
/*
@@ -929,6 +991,242 @@ static inline void vm_flags_mod(struct vm_area_struct *vma,
__vm_flags_mod(vma, set, clear);
}
+static inline bool __vma_atomic_valid_flag(struct vm_area_struct *vma, vma_flag_t bit)
+{
+ const vm_flags_t mask = BIT((__force int)bit);
+
+ /* Only specific flags are permitted */
+ if (WARN_ON_ONCE(!(mask & VM_ATOMIC_SET_ALLOWED)))
+ return false;
+
+ return true;
+}
+
+/*
+ * Set VMA flag atomically. Requires only VMA/mmap read lock. Only specific
+ * valid flags are allowed to do this.
+ */
+static inline void vma_set_atomic_flag(struct vm_area_struct *vma, vma_flag_t bit)
+{
+ unsigned long *bitmap = vma->flags.__vma_flags;
+
+ vma_assert_stabilised(vma);
+ if (__vma_atomic_valid_flag(vma, bit))
+ set_bit((__force int)bit, bitmap);
+}
+
+/*
+ * Test for VMA flag atomically. Requires no locks. Only specific valid flags
+ * are allowed to do this.
+ *
+ * This is necessarily racey, so callers must ensure that serialisation is
+ * achieved through some other means, or that races are permissible.
+ */
+static inline bool vma_test_atomic_flag(struct vm_area_struct *vma, vma_flag_t bit)
+{
+ if (__vma_atomic_valid_flag(vma, bit))
+ return test_bit((__force int)bit, &vma->vm_flags);
+
+ return false;
+}
+
+/* Set an individual VMA flag in flags, non-atomically. */
+static inline void vma_flag_set(vma_flags_t *flags, vma_flag_t bit)
+{
+ unsigned long *bitmap = flags->__vma_flags;
+
+ __set_bit((__force int)bit, bitmap);
+}
+
+static inline vma_flags_t __mk_vma_flags(size_t count, const vma_flag_t *bits)
+{
+ vma_flags_t flags;
+ int i;
+
+ vma_flags_clear_all(&flags);
+ for (i = 0; i < count; i++)
+ vma_flag_set(&flags, bits[i]);
+ return flags;
+}
+
+/*
+ * Helper macro which bitwise-or combines the specified input flags into a
+ * vma_flags_t bitmap value. E.g.:
+ *
+ * vma_flags_t flags = mk_vma_flags(VMA_IO_BIT, VMA_PFNMAP_BIT,
+ * VMA_DONTEXPAND_BIT, VMA_DONTDUMP_BIT);
+ *
+ * The compiler cleverly optimises away all of the work and this ends up being
+ * equivalent to aggregating the values manually.
+ */
+#define mk_vma_flags(...) __mk_vma_flags(COUNT_ARGS(__VA_ARGS__), \
+ (const vma_flag_t []){__VA_ARGS__})
+
+/* Test each of to_test flags in flags, non-atomically. */
+static __always_inline bool vma_flags_test_mask(const vma_flags_t *flags,
+ vma_flags_t to_test)
+{
+ const unsigned long *bitmap = flags->__vma_flags;
+ const unsigned long *bitmap_to_test = to_test.__vma_flags;
+
+ return bitmap_intersects(bitmap_to_test, bitmap, NUM_VMA_FLAG_BITS);
+}
+
+/*
+ * Test whether any specified VMA flag is set, e.g.:
+ *
+ * if (vma_flags_test(flags, VMA_READ_BIT, VMA_MAYREAD_BIT)) { ... }
+ */
+#define vma_flags_test(flags, ...) \
+ vma_flags_test_mask(flags, mk_vma_flags(__VA_ARGS__))
+
+/* Test that ALL of the to_test flags are set, non-atomically. */
+static __always_inline bool vma_flags_test_all_mask(const vma_flags_t *flags,
+ vma_flags_t to_test)
+{
+ const unsigned long *bitmap = flags->__vma_flags;
+ const unsigned long *bitmap_to_test = to_test.__vma_flags;
+
+ return bitmap_subset(bitmap_to_test, bitmap, NUM_VMA_FLAG_BITS);
+}
+
+/*
+ * Test whether ALL specified VMA flags are set, e.g.:
+ *
+ * if (vma_flags_test_all(flags, VMA_READ_BIT, VMA_MAYREAD_BIT)) { ... }
+ */
+#define vma_flags_test_all(flags, ...) \
+ vma_flags_test_all_mask(flags, mk_vma_flags(__VA_ARGS__))
+
+/* Set each of the to_set flags in flags, non-atomically. */
+static __always_inline void vma_flags_set_mask(vma_flags_t *flags, vma_flags_t to_set)
+{
+ unsigned long *bitmap = flags->__vma_flags;
+ const unsigned long *bitmap_to_set = to_set.__vma_flags;
+
+ bitmap_or(bitmap, bitmap, bitmap_to_set, NUM_VMA_FLAG_BITS);
+}
+
+/*
+ * Set all specified VMA flags, e.g.:
+ *
+ * vma_flags_set(&flags, VMA_READ_BIT, VMA_WRITE_BIT, VMA_EXEC_BIT);
+ */
+#define vma_flags_set(flags, ...) \
+ vma_flags_set_mask(flags, mk_vma_flags(__VA_ARGS__))
+
+/* Clear all of the to-clear flags in flags, non-atomically. */
+static __always_inline void vma_flags_clear_mask(vma_flags_t *flags, vma_flags_t to_clear)
+{
+ unsigned long *bitmap = flags->__vma_flags;
+ const unsigned long *bitmap_to_clear = to_clear.__vma_flags;
+
+ bitmap_andnot(bitmap, bitmap, bitmap_to_clear, NUM_VMA_FLAG_BITS);
+}
+
+/*
+ * Clear all specified individual flags, e.g.:
+ *
+ * vma_flags_clear(&flags, VMA_READ_BIT, VMA_WRITE_BIT, VMA_EXEC_BIT);
+ */
+#define vma_flags_clear(flags, ...) \
+ vma_flags_clear_mask(flags, mk_vma_flags(__VA_ARGS__))
+
+/*
+ * Helper to test that ALL specified flags are set in a VMA.
+ *
+ * Note: appropriate locks must be held, this function does not acquire them for
+ * you.
+ */
+static inline bool vma_test_all_flags_mask(const struct vm_area_struct *vma,
+ vma_flags_t flags)
+{
+ return vma_flags_test_all_mask(&vma->flags, flags);
+}
+
+/*
+ * Helper macro for checking that ALL specified flags are set in a VMA, e.g.:
+ *
+ * if (vma_test_all_flags(vma, VMA_READ_BIT, VMA_MAYREAD_BIT) { ... }
+ */
+#define vma_test_all_flags(vma, ...) \
+ vma_test_all_flags_mask(vma, mk_vma_flags(__VA_ARGS__))
+
+/*
+ * Helper to set all VMA flags in a VMA.
+ *
+ * Note: appropriate locks must be held, this function does not acquire them for
+ * you.
+ */
+static inline void vma_set_flags_mask(struct vm_area_struct *vma,
+ vma_flags_t flags)
+{
+ vma_flags_set_mask(&vma->flags, flags);
+}
+
+/*
+ * Helper macro for specifying VMA flags in a VMA, e.g.:
+ *
+ * vma_set_flags(vma, VMA_IO_BIT, VMA_PFNMAP_BIT, VMA_DONTEXPAND_BIT,
+ * VMA_DONTDUMP_BIT);
+ *
+ * Note: appropriate locks must be held, this function does not acquire them for
+ * you.
+ */
+#define vma_set_flags(vma, ...) \
+ vma_set_flags_mask(vma, mk_vma_flags(__VA_ARGS__))
+
+/* Helper to test all VMA flags in a VMA descriptor. */
+static inline bool vma_desc_test_flags_mask(const struct vm_area_desc *desc,
+ vma_flags_t flags)
+{
+ return vma_flags_test_mask(&desc->vma_flags, flags);
+}
+
+/*
+ * Helper macro for testing VMA flags for an input pointer to a struct
+ * vm_area_desc object describing a proposed VMA, e.g.:
+ *
+ * if (vma_desc_test_flags(desc, VMA_IO_BIT, VMA_PFNMAP_BIT,
+ * VMA_DONTEXPAND_BIT, VMA_DONTDUMP_BIT)) { ... }
+ */
+#define vma_desc_test_flags(desc, ...) \
+ vma_desc_test_flags_mask(desc, mk_vma_flags(__VA_ARGS__))
+
+/* Helper to set all VMA flags in a VMA descriptor. */
+static inline void vma_desc_set_flags_mask(struct vm_area_desc *desc,
+ vma_flags_t flags)
+{
+ vma_flags_set_mask(&desc->vma_flags, flags);
+}
+
+/*
+ * Helper macro for specifying VMA flags for an input pointer to a struct
+ * vm_area_desc object describing a proposed VMA, e.g.:
+ *
+ * vma_desc_set_flags(desc, VMA_IO_BIT, VMA_PFNMAP_BIT, VMA_DONTEXPAND_BIT,
+ * VMA_DONTDUMP_BIT);
+ */
+#define vma_desc_set_flags(desc, ...) \
+ vma_desc_set_flags_mask(desc, mk_vma_flags(__VA_ARGS__))
+
+/* Helper to clear all VMA flags in a VMA descriptor. */
+static inline void vma_desc_clear_flags_mask(struct vm_area_desc *desc,
+ vma_flags_t flags)
+{
+ vma_flags_clear_mask(&desc->vma_flags, flags);
+}
+
+/*
+ * Helper macro for clearing VMA flags for an input pointer to a struct
+ * vm_area_desc object describing a proposed VMA, e.g.:
+ *
+ * vma_desc_clear_flags(desc, VMA_IO_BIT, VMA_PFNMAP_BIT, VMA_DONTEXPAND_BIT,
+ * VMA_DONTDUMP_BIT);
+ */
+#define vma_desc_clear_flags(desc, ...) \
+ vma_desc_clear_flags_mask(desc, mk_vma_flags(__VA_ARGS__))
+
static inline void vma_set_anonymous(struct vm_area_struct *vma)
{
vma->vm_ops = NULL;
@@ -964,7 +1262,7 @@ static inline bool vma_is_initial_stack(const struct vm_area_struct *vma)
vma->vm_end >= vma->vm_mm->start_stack;
}
-static inline bool vma_is_temporary_stack(struct vm_area_struct *vma)
+static inline bool vma_is_temporary_stack(const struct vm_area_struct *vma)
{
int maybe_stack = vma->vm_flags & (VM_GROWSDOWN | VM_GROWSUP);
@@ -978,7 +1276,7 @@ static inline bool vma_is_temporary_stack(struct vm_area_struct *vma)
return false;
}
-static inline bool vma_is_foreign(struct vm_area_struct *vma)
+static inline bool vma_is_foreign(const struct vm_area_struct *vma)
{
if (!current->mm)
return true;
@@ -989,20 +1287,25 @@ static inline bool vma_is_foreign(struct vm_area_struct *vma)
return false;
}
-static inline bool vma_is_accessible(struct vm_area_struct *vma)
+static inline bool vma_is_accessible(const struct vm_area_struct *vma)
{
return vma->vm_flags & VM_ACCESS_FLAGS;
}
-static inline bool is_shared_maywrite(vm_flags_t vm_flags)
+static inline bool is_shared_maywrite_vm_flags(vm_flags_t vm_flags)
{
return (vm_flags & (VM_SHARED | VM_MAYWRITE)) ==
(VM_SHARED | VM_MAYWRITE);
}
-static inline bool vma_is_shared_maywrite(struct vm_area_struct *vma)
+static inline bool is_shared_maywrite(const vma_flags_t *flags)
+{
+ return vma_flags_test_all(flags, VMA_SHARED_BIT, VMA_MAYWRITE_BIT);
+}
+
+static inline bool vma_is_shared_maywrite(const struct vm_area_struct *vma)
{
- return is_shared_maywrite(vma->vm_flags);
+ return is_shared_maywrite(&vma->flags);
}
static inline
@@ -1058,6 +1361,7 @@ static inline int vma_iter_bulk_store(struct vma_iterator *vmi,
if (unlikely(mas_is_err(&vmi->mas)))
return -ENOMEM;
+ vma_mark_attached(vma);
return 0;
}
@@ -1083,14 +1387,14 @@ static inline void vma_iter_set(struct vma_iterator *vmi, unsigned long addr)
* The vma_is_shmem is not inline because it is used only by slow
* paths in userfault.
*/
-bool vma_is_shmem(struct vm_area_struct *vma);
-bool vma_is_anon_shmem(struct vm_area_struct *vma);
+bool vma_is_shmem(const struct vm_area_struct *vma);
+bool vma_is_anon_shmem(const struct vm_area_struct *vma);
#else
-static inline bool vma_is_shmem(struct vm_area_struct *vma) { return false; }
-static inline bool vma_is_anon_shmem(struct vm_area_struct *vma) { return false; }
+static inline bool vma_is_shmem(const struct vm_area_struct *vma) { return false; }
+static inline bool vma_is_anon_shmem(const struct vm_area_struct *vma) { return false; }
#endif
-int vma_is_stack_for_current(struct vm_area_struct *vma);
+int vma_is_stack_for_current(const struct vm_area_struct *vma);
/* flush_tlb_range() takes a vma, not a mm, and can care about flags */
#define TLB_FLUSH_VMA(mm,flags) { .vm_mm = (mm), .vm_flags = (flags) }
@@ -1098,6 +1402,25 @@ int vma_is_stack_for_current(struct vm_area_struct *vma);
struct mmu_gather;
struct inode;
+extern void prep_compound_page(struct page *page, unsigned int order);
+
+static inline unsigned int folio_large_order(const struct folio *folio)
+{
+ return folio->_flags_1 & 0xff;
+}
+
+#ifdef NR_PAGES_IN_LARGE_FOLIO
+static inline unsigned long folio_large_nr_pages(const struct folio *folio)
+{
+ return folio->_nr_pages;
+}
+#else
+static inline unsigned long folio_large_nr_pages(const struct folio *folio)
+{
+ return 1L << folio_large_order(folio);
+}
+#endif
+
/*
* compound_order() can be called without holding a reference, which means
* that niceties like page_folio() don't work. These callers should be
@@ -1105,13 +1428,13 @@ struct inode;
* set before the order is initialised, or this may be a tail page.
* See compaction.c for some good examples.
*/
-static inline unsigned int compound_order(struct page *page)
+static inline unsigned int compound_order(const struct page *page)
{
- struct folio *folio = (struct folio *)page;
+ const struct folio *folio = (struct folio *)page;
- if (!test_bit(PG_head, &folio->flags))
+ if (!test_bit(PG_head, &folio->flags.f))
return 0;
- return folio->_flags_1 & 0xff;
+ return folio_large_order(folio);
}
/**
@@ -1127,7 +1450,24 @@ static inline unsigned int folio_order(const struct folio *folio)
{
if (!folio_test_large(folio))
return 0;
- return folio->_flags_1 & 0xff;
+ return folio_large_order(folio);
+}
+
+/**
+ * folio_reset_order - Reset the folio order and derived _nr_pages
+ * @folio: The folio.
+ *
+ * Reset the order and derived _nr_pages to 0. Must only be used in the
+ * process of splitting large folios.
+ */
+static inline void folio_reset_order(struct folio *folio)
+{
+ if (WARN_ON_ONCE(!folio_test_large(folio)))
+ return;
+ folio->_flags_1 &= ~0xffUL;
+#ifdef NR_PAGES_IN_LARGE_FOLIO
+ folio->_nr_pages = 0;
+#endif
}
#include <linux/huge_mm.h>
@@ -1220,6 +1560,8 @@ static inline int is_vmalloc_or_module_addr(const void *x)
static inline int folio_entire_mapcount(const struct folio *folio)
{
VM_BUG_ON_FOLIO(!folio_test_large(folio), folio);
+ if (!IS_ENABLED(CONFIG_64BIT) && unlikely(folio_large_order(folio) == 1))
+ return 0;
return atomic_read(&folio->_entire_mapcount) + 1;
}
@@ -1306,7 +1648,7 @@ int folio_mc_copy(struct folio *dst, struct folio *src);
unsigned long nr_free_buffer_pages(void);
/* Returns the number of bytes in this potentially compound page. */
-static inline unsigned long page_size(struct page *page)
+static inline unsigned long page_size(const struct page *page)
{
return PAGE_SIZE << compound_order(page);
}
@@ -1352,7 +1694,7 @@ static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma)
return pte;
}
-vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page);
+vm_fault_t do_set_pmd(struct vm_fault *vmf, struct folio *folio, struct page *page);
void set_pte_range(struct vm_fault *vmf, struct folio *folio,
struct page *page, unsigned int nr, unsigned long addr);
@@ -1393,9 +1735,9 @@ vm_fault_t finish_fault(struct vm_fault *vmf);
* the page's disk buffers. PG_private must be set to tell the VM to call
* into the filesystem to release these pages.
*
- * A page may belong to an inode's memory mapping. In this case, page->mapping
- * is the pointer to the inode, and page->index is the file offset of the page,
- * in units of PAGE_SIZE.
+ * A folio may belong to an inode's memory mapping. In this case,
+ * folio->mapping points to the inode, and folio->index is the file
+ * offset of the folio, in units of PAGE_SIZE.
*
* If pagecache pages are not associated with an inode, they are said to be
* anonymous pages. These may become associated with the swapcache, and in that
@@ -1419,25 +1761,6 @@ vm_fault_t finish_fault(struct vm_fault *vmf);
* back into memory.
*/
-#if defined(CONFIG_ZONE_DEVICE) && defined(CONFIG_FS_DAX)
-DECLARE_STATIC_KEY_FALSE(devmap_managed_key);
-
-bool __put_devmap_managed_folio_refs(struct folio *folio, int refs);
-static inline bool put_devmap_managed_folio_refs(struct folio *folio, int refs)
-{
- if (!static_branch_unlikely(&devmap_managed_key))
- return false;
- if (!folio_is_zone_device(folio))
- return false;
- return __put_devmap_managed_folio_refs(folio, refs);
-}
-#else /* CONFIG_ZONE_DEVICE && CONFIG_FS_DAX */
-static inline bool put_devmap_managed_folio_refs(struct folio *folio, int refs)
-{
- return false;
-}
-#endif /* CONFIG_ZONE_DEVICE && CONFIG_FS_DAX */
-
/* 127: arbitrary random number, small enough to assemble well */
#define folio_ref_zero_or_close_to_overflow(folio) \
((unsigned int) folio_ref_count(folio) + 127u <= 127u)
@@ -1458,7 +1781,12 @@ static inline void folio_get(struct folio *folio)
static inline void get_page(struct page *page)
{
- folio_get(page_folio(page));
+ struct folio *folio = page_folio(page);
+ if (WARN_ON_ONCE(folio_test_slab(folio)))
+ return;
+ if (WARN_ON_ONCE(folio_test_large_kmalloc(folio)))
+ return;
+ folio_get(folio);
}
static inline __must_check bool try_get_page(struct page *page)
@@ -1552,12 +1880,9 @@ static inline void put_page(struct page *page)
{
struct folio *folio = page_folio(page);
- /*
- * For some devmap managed pages we need to catch refcount transition
- * from 2 to 1:
- */
- if (put_devmap_managed_folio_refs(folio, 1))
+ if (folio_test_slab(folio) || folio_test_large_kmalloc(folio))
return;
+
folio_put(folio);
}
@@ -1608,6 +1933,14 @@ static inline bool is_cow_mapping(vm_flags_t flags)
return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
}
+static inline bool vma_desc_is_cow_mapping(struct vm_area_desc *desc)
+{
+ const vma_flags_t *flags = &desc->vma_flags;
+
+ return vma_flags_test(flags, VMA_MAYWRITE_BIT) &&
+ !vma_flags_test(flags, VMA_SHARED_BIT);
+}
+
#ifndef CONFIG_MMU
static inline bool is_nommu_shared_mapping(vm_flags_t flags)
{
@@ -1621,6 +1954,11 @@ static inline bool is_nommu_shared_mapping(vm_flags_t flags)
*/
return flags & (VM_MAYSHARE | VM_MAYOVERLAY);
}
+
+static inline bool is_nommu_shared_vma_flags(const vma_flags_t *flags)
+{
+ return vma_flags_test(flags, VMA_MAYSHARE_BIT, VMA_MAYOVERLAY_BIT);
+}
#endif
#if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP)
@@ -1637,21 +1975,26 @@ static inline bool is_nommu_shared_mapping(vm_flags_t flags)
*/
static inline int page_zone_id(struct page *page)
{
- return (page->flags >> ZONEID_PGSHIFT) & ZONEID_MASK;
+ return (page->flags.f >> ZONEID_PGSHIFT) & ZONEID_MASK;
}
#ifdef NODE_NOT_IN_PAGE_FLAGS
-int page_to_nid(const struct page *page);
+int memdesc_nid(memdesc_flags_t mdf);
#else
-static inline int page_to_nid(const struct page *page)
+static inline int memdesc_nid(memdesc_flags_t mdf)
{
- return (PF_POISONED_CHECK(page)->flags >> NODES_PGSHIFT) & NODES_MASK;
+ return (mdf.f >> NODES_PGSHIFT) & NODES_MASK;
}
#endif
+static inline int page_to_nid(const struct page *page)
+{
+ return memdesc_nid(PF_POISONED_CHECK(page)->flags);
+}
+
static inline int folio_nid(const struct folio *folio)
{
- return page_to_nid(&folio->page);
+ return memdesc_nid(folio->flags);
}
#ifdef CONFIG_NUMA_BALANCING
@@ -1720,14 +2063,14 @@ static inline void page_cpupid_reset_last(struct page *page)
#else
static inline int folio_last_cpupid(struct folio *folio)
{
- return (folio->flags >> LAST_CPUPID_PGSHIFT) & LAST_CPUPID_MASK;
+ return (folio->flags.f >> LAST_CPUPID_PGSHIFT) & LAST_CPUPID_MASK;
}
int folio_xchg_last_cpupid(struct folio *folio, int cpupid);
static inline void page_cpupid_reset_last(struct page *page)
{
- page->flags |= LAST_CPUPID_MASK << LAST_CPUPID_PGSHIFT;
+ page->flags.f |= LAST_CPUPID_MASK << LAST_CPUPID_PGSHIFT;
}
#endif /* LAST_CPUPID_NOT_IN_PAGE_FLAGS */
@@ -1823,7 +2166,7 @@ static inline u8 page_kasan_tag(const struct page *page)
u8 tag = KASAN_TAG_KERNEL;
if (kasan_enabled()) {
- tag = (page->flags >> KASAN_TAG_PGSHIFT) & KASAN_TAG_MASK;
+ tag = (page->flags.f >> KASAN_TAG_PGSHIFT) & KASAN_TAG_MASK;
tag ^= 0xff;
}
@@ -1838,12 +2181,12 @@ static inline void page_kasan_tag_set(struct page *page, u8 tag)
return;
tag ^= 0xff;
- old_flags = READ_ONCE(page->flags);
+ old_flags = READ_ONCE(page->flags.f);
do {
flags = old_flags;
flags &= ~(KASAN_TAG_MASK << KASAN_TAG_PGSHIFT);
flags |= (tag & KASAN_TAG_MASK) << KASAN_TAG_PGSHIFT;
- } while (unlikely(!try_cmpxchg(&page->flags, &old_flags, flags)));
+ } while (unlikely(!try_cmpxchg(&page->flags.f, &old_flags, flags)));
}
static inline void page_kasan_tag_reset(struct page *page)
@@ -1874,28 +2217,33 @@ static inline pg_data_t *page_pgdat(const struct page *page)
return NODE_DATA(page_to_nid(page));
}
-static inline struct zone *folio_zone(const struct folio *folio)
+static inline pg_data_t *folio_pgdat(const struct folio *folio)
{
- return page_zone(&folio->page);
+ return NODE_DATA(folio_nid(folio));
}
-static inline pg_data_t *folio_pgdat(const struct folio *folio)
+static inline struct zone *folio_zone(const struct folio *folio)
{
- return page_pgdat(&folio->page);
+ return &folio_pgdat(folio)->node_zones[folio_zonenum(folio)];
}
#ifdef SECTION_IN_PAGE_FLAGS
static inline void set_page_section(struct page *page, unsigned long section)
{
- page->flags &= ~(SECTIONS_MASK << SECTIONS_PGSHIFT);
- page->flags |= (section & SECTIONS_MASK) << SECTIONS_PGSHIFT;
+ page->flags.f &= ~(SECTIONS_MASK << SECTIONS_PGSHIFT);
+ page->flags.f |= (section & SECTIONS_MASK) << SECTIONS_PGSHIFT;
}
-static inline unsigned long page_to_section(const struct page *page)
+static inline unsigned long memdesc_section(memdesc_flags_t mdf)
{
- return (page->flags >> SECTIONS_PGSHIFT) & SECTIONS_MASK;
+ return (mdf.f >> SECTIONS_PGSHIFT) & SECTIONS_MASK;
}
-#endif
+#else /* !SECTION_IN_PAGE_FLAGS */
+static inline unsigned long memdesc_section(memdesc_flags_t mdf)
+{
+ return 0;
+}
+#endif /* SECTION_IN_PAGE_FLAGS */
/**
* folio_pfn - Return the Page Frame Number of a folio.
@@ -1916,6 +2264,69 @@ static inline struct folio *pfn_folio(unsigned long pfn)
return page_folio(pfn_to_page(pfn));
}
+#ifdef CONFIG_MMU
+static inline pte_t mk_pte(const struct page *page, pgprot_t pgprot)
+{
+ return pfn_pte(page_to_pfn(page), pgprot);
+}
+
+/**
+ * folio_mk_pte - Create a PTE for this folio
+ * @folio: The folio to create a PTE for
+ * @pgprot: The page protection bits to use
+ *
+ * Create a page table entry for the first page of this folio.
+ * This is suitable for passing to set_ptes().
+ *
+ * Return: A page table entry suitable for mapping this folio.
+ */
+static inline pte_t folio_mk_pte(const struct folio *folio, pgprot_t pgprot)
+{
+ return pfn_pte(folio_pfn(folio), pgprot);
+}
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+/**
+ * folio_mk_pmd - Create a PMD for this folio
+ * @folio: The folio to create a PMD for
+ * @pgprot: The page protection bits to use
+ *
+ * Create a page table entry for the first page of this folio.
+ * This is suitable for passing to set_pmd_at().
+ *
+ * Return: A page table entry suitable for mapping this folio.
+ */
+static inline pmd_t folio_mk_pmd(const struct folio *folio, pgprot_t pgprot)
+{
+ return pmd_mkhuge(pfn_pmd(folio_pfn(folio), pgprot));
+}
+
+#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
+/**
+ * folio_mk_pud - Create a PUD for this folio
+ * @folio: The folio to create a PUD for
+ * @pgprot: The page protection bits to use
+ *
+ * Create a page table entry for the first page of this folio.
+ * This is suitable for passing to set_pud_at().
+ *
+ * Return: A page table entry suitable for mapping this folio.
+ */
+static inline pud_t folio_mk_pud(const struct folio *folio, pgprot_t pgprot)
+{
+ return pud_mkhuge(pfn_pud(folio_pfn(folio), pgprot));
+}
+#endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+#endif /* CONFIG_MMU */
+
+static inline bool folio_has_pincount(const struct folio *folio)
+{
+ if (IS_ENABLED(CONFIG_64BIT))
+ return folio_test_large(folio);
+ return folio_order(folio) > 1;
+}
+
/**
* folio_maybe_dma_pinned - Report if a folio may be pinned for DMA.
* @folio: The folio.
@@ -1932,7 +2343,7 @@ static inline struct folio *pfn_folio(unsigned long pfn)
* get that many refcounts, and b) all the callers of this routine are
* expected to be able to deal gracefully with a false positive.
*
- * For large folios, the result will be exactly correct. That's because
+ * For most large folios, the result will be exactly correct. That's because
* we have more tracking data available: the _pincount field is used
* instead of the GUP_PIN_COUNTING_BIAS scheme.
*
@@ -1943,7 +2354,7 @@ static inline struct folio *pfn_folio(unsigned long pfn)
*/
static inline bool folio_maybe_dma_pinned(struct folio *folio)
{
- if (folio_test_large(folio))
+ if (folio_has_pincount(folio))
return atomic_read(&folio->_pincount) > 0;
/*
@@ -1969,7 +2380,7 @@ static inline bool folio_needs_cow_for_dma(struct vm_area_struct *vma,
{
VM_BUG_ON(!(raw_read_seqcount(&vma->vm_mm->write_protect_seq) & 1));
- if (!test_bit(MMF_HAS_PINNED, &vma->vm_mm->flags))
+ if (!mm_flags_test(MMF_HAS_PINNED, vma->vm_mm))
return false;
return folio_maybe_dma_pinned(folio);
@@ -2015,6 +2426,13 @@ static inline bool folio_is_longterm_pinnable(struct folio *folio)
if (folio_is_device_coherent(folio))
return false;
+ /*
+ * Filesystems can only tolerate transient delays to truncate and
+ * hole-punch operations
+ */
+ if (folio_is_fsdax(folio))
+ return false;
+
/* Otherwise, non-movable zone folios can be pinned. */
return !folio_is_zone_movable(folio);
@@ -2028,14 +2446,14 @@ static inline bool folio_is_longterm_pinnable(struct folio *folio)
static inline void set_page_zone(struct page *page, enum zone_type zone)
{
- page->flags &= ~(ZONES_MASK << ZONES_PGSHIFT);
- page->flags |= (zone & ZONES_MASK) << ZONES_PGSHIFT;
+ page->flags.f &= ~(ZONES_MASK << ZONES_PGSHIFT);
+ page->flags.f |= (zone & ZONES_MASK) << ZONES_PGSHIFT;
}
static inline void set_page_node(struct page *page, unsigned long node)
{
- page->flags &= ~(NODES_MASK << NODES_PGSHIFT);
- page->flags |= (node & NODES_MASK) << NODES_PGSHIFT;
+ page->flags.f &= ~(NODES_MASK << NODES_PGSHIFT);
+ page->flags.f |= (node & NODES_MASK) << NODES_PGSHIFT;
}
static inline void set_page_links(struct page *page, enum zone_type zone,
@@ -2054,49 +2472,55 @@ static inline void set_page_links(struct page *page, enum zone_type zone,
*
* Return: A positive power of two.
*/
-static inline long folio_nr_pages(const struct folio *folio)
+static inline unsigned long folio_nr_pages(const struct folio *folio)
{
if (!folio_test_large(folio))
return 1;
-#ifdef CONFIG_64BIT
- return folio->_folio_nr_pages;
-#else
- return 1L << (folio->_flags_1 & 0xff);
-#endif
+ return folio_large_nr_pages(folio);
}
-/* Only hugetlbfs can allocate folios larger than MAX_ORDER */
-#ifdef CONFIG_ARCH_HAS_GIGANTIC_PAGE
-#define MAX_FOLIO_NR_PAGES (1UL << PUD_ORDER)
+#if !defined(CONFIG_HAVE_GIGANTIC_FOLIOS)
+/*
+ * We don't expect any folios that exceed buddy sizes (and consequently
+ * memory sections).
+ */
+#define MAX_FOLIO_ORDER MAX_PAGE_ORDER
+#elif defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP)
+/*
+ * Only pages within a single memory section are guaranteed to be
+ * contiguous. By limiting folios to a single memory section, all folio
+ * pages are guaranteed to be contiguous.
+ */
+#define MAX_FOLIO_ORDER PFN_SECTION_SHIFT
+#elif defined(CONFIG_HUGETLB_PAGE)
+/*
+ * There is no real limit on the folio size. We limit them to the maximum we
+ * currently expect (see CONFIG_HAVE_GIGANTIC_FOLIOS): with hugetlb, we expect
+ * no folios larger than 16 GiB on 64bit and 1 GiB on 32bit.
+ */
+#define MAX_FOLIO_ORDER get_order(IS_ENABLED(CONFIG_64BIT) ? SZ_16G : SZ_1G)
#else
-#define MAX_FOLIO_NR_PAGES MAX_ORDER_NR_PAGES
+/*
+ * Without hugetlb, gigantic folios that are bigger than a single PUD are
+ * currently impossible.
+ */
+#define MAX_FOLIO_ORDER PUD_ORDER
#endif
+#define MAX_FOLIO_NR_PAGES (1UL << MAX_FOLIO_ORDER)
+
/*
* compound_nr() returns the number of pages in this potentially compound
* page. compound_nr() can be called on a tail page, and is defined to
* return 1 in that case.
*/
-static inline unsigned long compound_nr(struct page *page)
+static inline unsigned long compound_nr(const struct page *page)
{
- struct folio *folio = (struct folio *)page;
+ const struct folio *folio = (struct folio *)page;
- if (!test_bit(PG_head, &folio->flags))
+ if (!test_bit(PG_head, &folio->flags.f))
return 1;
-#ifdef CONFIG_64BIT
- return folio->_folio_nr_pages;
-#else
- return 1L << (folio->_flags_1 & 0xff);
-#endif
-}
-
-/**
- * thp_nr_pages - The number of regular pages in this huge page.
- * @page: The head page of a huge page.
- */
-static inline int thp_nr_pages(struct page *page)
-{
- return folio_nr_pages((struct folio *)page);
+ return folio_large_nr_pages(folio);
}
/**
@@ -2149,23 +2573,18 @@ static inline size_t folio_size(const struct folio *folio)
}
/**
- * folio_likely_mapped_shared - Estimate if the folio is mapped into the page
- * tables of more than one MM
+ * folio_maybe_mapped_shared - Whether the folio is mapped into the page
+ * tables of more than one MM
* @folio: The folio.
*
- * This function checks if the folio is currently mapped into more than one
- * MM ("mapped shared"), or if the folio is only mapped into a single MM
- * ("mapped exclusively").
+ * This function checks if the folio maybe currently mapped into more than one
+ * MM ("maybe mapped shared"), or if the folio is certainly mapped into a single
+ * MM ("mapped exclusively").
*
* For KSM folios, this function also returns "mapped shared" when a folio is
* mapped multiple times into the same MM, because the individual page mappings
* are independent.
*
- * As precise information is not easily available for all folios, this function
- * estimates the number of MMs ("sharers") that are currently mapping a folio
- * using the number of times the first page of the folio is currently mapped
- * into page tables.
- *
* For small anonymous folios and anonymous hugetlb folios, the return
* value will be exactly correct: non-KSM folios can only be mapped at most once
* into an MM, and they cannot be partially mapped. KSM folios are
@@ -2173,8 +2592,8 @@ static inline size_t folio_size(const struct folio *folio)
*
* For other folios, the result can be fuzzy:
* #. For partially-mappable large folios (THP), the return value can wrongly
- * indicate "mapped exclusively" (false negative) when the folio is
- * only partially mapped into at least one MM.
+ * indicate "mapped shared" (false positive) if a folio was mapped by
+ * more than two MMs at one point in time.
* #. For pagecache folios (including hugetlb), the return value can wrongly
* indicate "mapped shared" (false positive) when two VMAs in the same MM
* cover the same file range.
@@ -2191,7 +2610,7 @@ static inline size_t folio_size(const struct folio *folio)
*
* Return: Whether the folio is estimated to be mapped into more than one MM.
*/
-static inline bool folio_likely_mapped_shared(struct folio *folio)
+static inline bool folio_maybe_mapped_shared(struct folio *folio)
{
int mapcount = folio_mapcount(folio);
@@ -2199,16 +2618,77 @@ static inline bool folio_likely_mapped_shared(struct folio *folio)
if (!folio_test_large(folio) || unlikely(folio_test_hugetlb(folio)))
return mapcount > 1;
- /* A single mapping implies "mapped exclusively". */
+ /*
+ * vm_insert_page() without CONFIG_TRANSPARENT_HUGEPAGE ...
+ * simply assume "mapped shared", nobody should really care
+ * about this for arbitrary kernel allocations.
+ */
+ if (!IS_ENABLED(CONFIG_MM_ID))
+ return true;
+
+ /*
+ * A single mapping implies "mapped exclusively", even if the
+ * folio flag says something different: it's easier to handle this
+ * case here instead of on the RMAP hot path.
+ */
if (mapcount <= 1)
return false;
+ return test_bit(FOLIO_MM_IDS_SHARED_BITNUM, &folio->_mm_ids);
+}
- /* If any page is mapped more than once we treat it "mapped shared". */
- if (folio_entire_mapcount(folio) || mapcount > folio_nr_pages(folio))
- return true;
+/**
+ * folio_expected_ref_count - calculate the expected folio refcount
+ * @folio: the folio
+ *
+ * Calculate the expected folio refcount, taking references from the pagecache,
+ * swapcache, PG_private and page table mappings into account. Useful in
+ * combination with folio_ref_count() to detect unexpected references (e.g.,
+ * GUP or other temporary references).
+ *
+ * Does currently not consider references from the LRU cache. If the folio
+ * was isolated from the LRU (which is the case during migration or split),
+ * the LRU cache does not apply.
+ *
+ * Calling this function on an unmapped folio -- !folio_mapped() -- that is
+ * locked will return a stable result.
+ *
+ * Calling this function on a mapped folio will not result in a stable result,
+ * because nothing stops additional page table mappings from coming (e.g.,
+ * fork()) or going (e.g., munmap()).
+ *
+ * Calling this function without the folio lock will also not result in a
+ * stable result: for example, the folio might get dropped from the swapcache
+ * concurrently.
+ *
+ * However, even when called without the folio lock or on a mapped folio,
+ * this function can be used to detect unexpected references early (for example,
+ * if it makes sense to even lock the folio and unmap it).
+ *
+ * The caller must add any reference (e.g., from folio_try_get()) it might be
+ * holding itself to the result.
+ *
+ * Returns the expected folio refcount.
+ */
+static inline int folio_expected_ref_count(const struct folio *folio)
+{
+ const int order = folio_order(folio);
+ int ref_count = 0;
+
+ if (WARN_ON_ONCE(page_has_type(&folio->page) && !folio_test_hugetlb(folio)))
+ return 0;
- /* Let's guess based on the first subpage. */
- return atomic_read(&folio->_mapcount) > 0;
+ /* One reference per page from the swapcache. */
+ ref_count += folio_test_swapcache(folio) << order;
+
+ if (!folio_test_anon(folio)) {
+ /* One reference per page from the pagecache. */
+ ref_count += !!folio->mapping << order;
+ /* One reference from PG_private. */
+ ref_count += folio_test_private(folio);
+ }
+
+ /* One reference per page table mapping. */
+ return ref_count + folio_mapcount(folio);
}
#ifndef HAVE_ARCH_MAKE_FOLIO_ACCESSIBLE
@@ -2311,7 +2791,6 @@ static inline void clear_page_pfmemalloc(struct page *page)
extern void pagefault_out_of_memory(void);
#define offset_in_page(p) ((unsigned long)(p) & ~PAGE_MASK)
-#define offset_in_thp(page, p) ((unsigned long)(p) & (thp_size(page) - 1))
#define offset_in_folio(folio, p) ((unsigned long)(p) & (folio_size(folio) - 1))
/*
@@ -2320,6 +2799,7 @@ extern void pagefault_out_of_memory(void);
struct zap_details {
struct folio *single_folio; /* Locked folio to be unmapped */
bool even_cows; /* Zap COWed private pages too? */
+ bool reclaim_pt; /* Need reclaim page tables? */
zap_flags_t zap_flags; /* Extra flags for zapping */
};
@@ -2333,31 +2813,6 @@ struct zap_details {
/* Set in unmap_vmas() to indicate a final unmap call. Only used by hugetlb */
#define ZAP_FLAG_UNMAP ((__force zap_flags_t) BIT(1))
-#ifdef CONFIG_SCHED_MM_CID
-void sched_mm_cid_before_execve(struct task_struct *t);
-void sched_mm_cid_after_execve(struct task_struct *t);
-void sched_mm_cid_fork(struct task_struct *t);
-void sched_mm_cid_exit_signals(struct task_struct *t);
-static inline int task_mm_cid(struct task_struct *t)
-{
- return t->mm_cid;
-}
-#else
-static inline void sched_mm_cid_before_execve(struct task_struct *t) { }
-static inline void sched_mm_cid_after_execve(struct task_struct *t) { }
-static inline void sched_mm_cid_fork(struct task_struct *t) { }
-static inline void sched_mm_cid_exit_signals(struct task_struct *t) { }
-static inline int task_mm_cid(struct task_struct *t)
-{
- /*
- * Use the processor id as a fall-back when the mm cid feature is
- * disabled. This provides functional per-cpu data structure accesses
- * in user-space, althrough it won't provide the memory usage benefits.
- */
- return raw_smp_processor_id();
-}
-#endif
-
#ifdef CONFIG_MMU
extern bool can_do_mlock(void);
#else
@@ -2374,6 +2829,8 @@ struct folio *vm_normal_folio_pmd(struct vm_area_struct *vma,
unsigned long addr, pmd_t pmd);
struct page *vm_normal_page_pmd(struct vm_area_struct *vma, unsigned long addr,
pmd_t pmd);
+struct page *vm_normal_page_pud(struct vm_area_struct *vma, unsigned long addr,
+ pud_t pud);
void zap_vma_ptes(struct vm_area_struct *vma, unsigned long address,
unsigned long size);
@@ -2384,10 +2841,6 @@ static inline void zap_vma_pages(struct vm_area_struct *vma)
zap_page_range_single(vma, vma->vm_start,
vma->vm_end - vma->vm_start, NULL);
}
-void unmap_vmas(struct mmu_gather *tlb, struct ma_state *mas,
- struct vm_area_struct *start_vma, unsigned long start,
- unsigned long end, unsigned long tree_end, bool mm_wr_locked);
-
struct mmu_notifier_range;
void free_pgd_range(struct mmu_gather *tlb, unsigned long addr,
@@ -2416,11 +2869,13 @@ struct follow_pfnmap_args {
* Outputs:
*
* @pfn: the PFN of the address
+ * @addr_mask: address mask covering pfn
* @pgprot: the pgprot_t of the mapping
* @writable: whether the mapping is writable
* @special: whether the mapping is a special mapping (real PFN maps)
*/
unsigned long pfn;
+ unsigned long addr_mask;
pgprot_t pgprot;
bool writable;
bool special;
@@ -2485,6 +2940,11 @@ extern int access_process_vm(struct task_struct *tsk, unsigned long addr,
extern int access_remote_vm(struct mm_struct *mm, unsigned long addr,
void *buf, int len, unsigned int gup_flags);
+#ifdef CONFIG_BPF_SYSCALL
+extern int copy_remote_vm_str(struct task_struct *tsk, unsigned long addr,
+ void *buf, int len, unsigned int gup_flags);
+#endif
+
long get_user_pages_remote(struct mm_struct *mm,
unsigned long start, unsigned long nr_pages,
unsigned int gup_flags, struct page **pages,
@@ -2545,10 +3005,10 @@ void folio_add_pin(struct folio *folio);
int account_locked_vm(struct mm_struct *mm, unsigned long pages, bool inc);
int __account_locked_vm(struct mm_struct *mm, unsigned long pages, bool inc,
- struct task_struct *task, bool bypass_rlim);
+ const struct task_struct *task, bool bypass_rlim);
struct kvec;
-struct page *get_dump_page(unsigned long addr);
+struct page *get_dump_page(unsigned long addr, int *locked);
bool folio_mark_dirty(struct folio *folio);
bool folio_mark_dirty_lock(struct folio *folio);
@@ -2584,7 +3044,7 @@ extern long change_protection(struct mmu_gather *tlb,
unsigned long end, unsigned long cp_flags);
extern int mprotect_fixup(struct vma_iterator *vmi, struct mmu_gather *tlb,
struct vm_area_struct *vma, struct vm_area_struct **pprev,
- unsigned long start, unsigned long end, unsigned long newflags);
+ unsigned long start, unsigned long end, vm_flags_t newflags);
/*
* doesn't attempt to fault and will return short.
@@ -2605,6 +3065,11 @@ static inline unsigned long get_mm_counter(struct mm_struct *mm, int member)
return percpu_counter_read_positive(&mm->rss_stat[member]);
}
+static inline unsigned long get_mm_counter_sum(struct mm_struct *mm, int member)
+{
+ return percpu_counter_sum_positive(&mm->rss_stat[member]);
+}
+
void mm_trace_rss_stat(struct mm_struct *mm, int member);
static inline void add_mm_counter(struct mm_struct *mm, int member, long value)
@@ -2650,6 +3115,13 @@ static inline unsigned long get_mm_rss(struct mm_struct *mm)
get_mm_counter(mm, MM_SHMEMPAGES);
}
+static inline unsigned long get_mm_rss_sum(struct mm_struct *mm)
+{
+ return get_mm_counter_sum(mm, MM_FILEPAGES) +
+ get_mm_counter_sum(mm, MM_ANONPAGES) +
+ get_mm_counter_sum(mm, MM_SHMEMPAGES);
+}
+
static inline unsigned long get_mm_hiwater_rss(struct mm_struct *mm)
{
return max(mm->hiwater_rss, get_mm_rss(mm));
@@ -2664,8 +3136,8 @@ static inline void update_hiwater_rss(struct mm_struct *mm)
{
unsigned long _rss = get_mm_rss(mm);
- if ((mm)->hiwater_rss < _rss)
- (mm)->hiwater_rss = _rss;
+ if (data_race(mm->hiwater_rss) < _rss)
+ data_race(mm->hiwater_rss = _rss);
}
static inline void update_hiwater_vm(struct mm_struct *mm)
@@ -2724,22 +3196,8 @@ static inline pud_t pud_mkspecial(pud_t pud)
}
#endif /* CONFIG_ARCH_SUPPORTS_PUD_PFNMAP */
-#ifndef CONFIG_ARCH_HAS_PTE_DEVMAP
-static inline int pte_devmap(pte_t pte)
-{
- return 0;
-}
-#endif
-
-extern pte_t *__get_locked_pte(struct mm_struct *mm, unsigned long addr,
- spinlock_t **ptl);
-static inline pte_t *get_locked_pte(struct mm_struct *mm, unsigned long addr,
- spinlock_t **ptl)
-{
- pte_t *ptep;
- __cond_lock(*ptl, ptep = __get_locked_pte(mm, addr, ptl));
- return ptep;
-}
+extern pte_t *get_locked_pte(struct mm_struct *mm, unsigned long addr,
+ spinlock_t **ptl);
#ifdef __PAGETABLE_P4D_FOLDED
static inline int __p4d_alloc(struct mm_struct *mm, pgd_t *pgd,
@@ -2864,16 +3322,23 @@ static inline pmd_t *pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long a
}
#endif /* CONFIG_MMU */
+enum pt_flags {
+ PT_kernel = PG_referenced,
+ PT_reserved = PG_reserved,
+ /* High bits are used for zone/node/section */
+};
+
static inline struct ptdesc *virt_to_ptdesc(const void *x)
{
return page_ptdesc(virt_to_page(x));
}
-static inline void *ptdesc_to_virt(const struct ptdesc *pt)
-{
- return page_to_virt(ptdesc_page(pt));
-}
-
+/**
+ * ptdesc_address - Virtual address of page table.
+ * @pt: Page table descriptor.
+ *
+ * Return: The first byte of the page table described by @pt.
+ */
static inline void *ptdesc_address(const struct ptdesc *pt)
{
return folio_address(ptdesc_folio(pt));
@@ -2881,7 +3346,47 @@ static inline void *ptdesc_address(const struct ptdesc *pt)
static inline bool pagetable_is_reserved(struct ptdesc *pt)
{
- return folio_test_reserved(ptdesc_folio(pt));
+ return test_bit(PT_reserved, &pt->pt_flags.f);
+}
+
+/**
+ * ptdesc_set_kernel - Mark a ptdesc used to map the kernel
+ * @ptdesc: The ptdesc to be marked
+ *
+ * Kernel page tables often need special handling. Set a flag so that
+ * the handling code knows this ptdesc will not be used for userspace.
+ */
+static inline void ptdesc_set_kernel(struct ptdesc *ptdesc)
+{
+ set_bit(PT_kernel, &ptdesc->pt_flags.f);
+}
+
+/**
+ * ptdesc_clear_kernel - Mark a ptdesc as no longer used to map the kernel
+ * @ptdesc: The ptdesc to be unmarked
+ *
+ * Use when the ptdesc is no longer used to map the kernel and no longer
+ * needs special handling.
+ */
+static inline void ptdesc_clear_kernel(struct ptdesc *ptdesc)
+{
+ /*
+ * Note: the 'PG_referenced' bit does not strictly need to be
+ * cleared before freeing the page. But this is nice for
+ * symmetry.
+ */
+ clear_bit(PT_kernel, &ptdesc->pt_flags.f);
+}
+
+/**
+ * ptdesc_test_kernel - Check if a ptdesc is used to map the kernel
+ * @ptdesc: The ptdesc being tested
+ *
+ * Call to tell if the ptdesc used to map the kernel.
+ */
+static inline bool ptdesc_test_kernel(const struct ptdesc *ptdesc)
+{
+ return test_bit(PT_kernel, &ptdesc->pt_flags.f);
}
/**
@@ -2902,6 +3407,21 @@ static inline struct ptdesc *pagetable_alloc_noprof(gfp_t gfp, unsigned int orde
}
#define pagetable_alloc(...) alloc_hooks(pagetable_alloc_noprof(__VA_ARGS__))
+static inline void __pagetable_free(struct ptdesc *pt)
+{
+ struct page *page = ptdesc_page(pt);
+
+ __free_pages(page, compound_order(page));
+}
+
+#ifdef CONFIG_ASYNC_KERNEL_PGTABLE_FREE
+void pagetable_free_kernel(struct ptdesc *pt);
+#else
+static inline void pagetable_free_kernel(struct ptdesc *pt)
+{
+ __pagetable_free(pt);
+}
+#endif
/**
* pagetable_free - Free pagetables
* @pt: The page table descriptor
@@ -2911,9 +3431,12 @@ static inline struct ptdesc *pagetable_alloc_noprof(gfp_t gfp, unsigned int orde
*/
static inline void pagetable_free(struct ptdesc *pt)
{
- struct page *page = ptdesc_page(pt);
-
- __free_pages(page, compound_order(page));
+ if (ptdesc_test_kernel(pt)) {
+ ptdesc_clear_kernel(pt);
+ pagetable_free_kernel(pt);
+ } else {
+ __pagetable_free(pt);
+ }
}
#if defined(CONFIG_SPLIT_PTE_PTLOCKS)
@@ -2991,18 +3514,15 @@ static inline bool ptlock_init(struct ptdesc *ptdesc) { return true; }
static inline void ptlock_free(struct ptdesc *ptdesc) {}
#endif /* defined(CONFIG_SPLIT_PTE_PTLOCKS) */
-static inline bool pagetable_pte_ctor(struct ptdesc *ptdesc)
+static inline void __pagetable_ctor(struct ptdesc *ptdesc)
{
struct folio *folio = ptdesc_folio(ptdesc);
- if (!ptlock_init(ptdesc))
- return false;
__folio_set_pgtable(folio);
lruvec_stat_add_folio(folio, NR_PAGETABLE);
- return true;
}
-static inline void pagetable_pte_dtor(struct ptdesc *ptdesc)
+static inline void pagetable_dtor(struct ptdesc *ptdesc)
{
struct folio *folio = ptdesc_folio(ptdesc);
@@ -3011,31 +3531,30 @@ static inline void pagetable_pte_dtor(struct ptdesc *ptdesc)
lruvec_stat_sub_folio(folio, NR_PAGETABLE);
}
-pte_t *___pte_offset_map(pmd_t *pmd, unsigned long addr, pmd_t *pmdvalp);
-static inline pte_t *__pte_offset_map(pmd_t *pmd, unsigned long addr,
- pmd_t *pmdvalp)
+static inline void pagetable_dtor_free(struct ptdesc *ptdesc)
{
- pte_t *pte;
+ pagetable_dtor(ptdesc);
+ pagetable_free(ptdesc);
+}
- __cond_lock(RCU, pte = ___pte_offset_map(pmd, addr, pmdvalp));
- return pte;
+static inline bool pagetable_pte_ctor(struct mm_struct *mm,
+ struct ptdesc *ptdesc)
+{
+ if (mm != &init_mm && !ptlock_init(ptdesc))
+ return false;
+ __pagetable_ctor(ptdesc);
+ return true;
}
+
+pte_t *__pte_offset_map(pmd_t *pmd, unsigned long addr, pmd_t *pmdvalp);
+
static inline pte_t *pte_offset_map(pmd_t *pmd, unsigned long addr)
{
return __pte_offset_map(pmd, addr, NULL);
}
-pte_t *__pte_offset_map_lock(struct mm_struct *mm, pmd_t *pmd,
- unsigned long addr, spinlock_t **ptlp);
-static inline pte_t *pte_offset_map_lock(struct mm_struct *mm, pmd_t *pmd,
- unsigned long addr, spinlock_t **ptlp)
-{
- pte_t *pte;
-
- __cond_lock(RCU, __cond_lock(*ptlp,
- pte = __pte_offset_map_lock(mm, pmd, addr, ptlp)));
- return pte;
-}
+pte_t *pte_offset_map_lock(struct mm_struct *mm, pmd_t *pmd,
+ unsigned long addr, spinlock_t **ptlp);
pte_t *pte_offset_map_ro_nolock(struct mm_struct *mm, pmd_t *pmd,
unsigned long addr, spinlock_t **ptlp);
@@ -3087,14 +3606,6 @@ static inline bool pmd_ptlock_init(struct ptdesc *ptdesc)
return ptlock_init(ptdesc);
}
-static inline void pmd_ptlock_free(struct ptdesc *ptdesc)
-{
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
- VM_BUG_ON_PAGE(ptdesc->pmd_huge_pte, ptdesc_page(ptdesc));
-#endif
- ptlock_free(ptdesc);
-}
-
#define pmd_huge_pte(mm, pmd) (pmd_ptdesc(pmd)->pmd_huge_pte)
#else
@@ -3105,7 +3616,6 @@ static inline spinlock_t *pmd_lockptr(struct mm_struct *mm, pmd_t *pmd)
}
static inline bool pmd_ptlock_init(struct ptdesc *ptdesc) { return true; }
-static inline void pmd_ptlock_free(struct ptdesc *ptdesc) {}
#define pmd_huge_pte(mm, pmd) ((mm)->pmd_huge_pte)
@@ -3118,27 +3628,16 @@ static inline spinlock_t *pmd_lock(struct mm_struct *mm, pmd_t *pmd)
return ptl;
}
-static inline bool pagetable_pmd_ctor(struct ptdesc *ptdesc)
+static inline bool pagetable_pmd_ctor(struct mm_struct *mm,
+ struct ptdesc *ptdesc)
{
- struct folio *folio = ptdesc_folio(ptdesc);
-
- if (!pmd_ptlock_init(ptdesc))
+ if (mm != &init_mm && !pmd_ptlock_init(ptdesc))
return false;
- __folio_set_pgtable(folio);
ptdesc_pmd_pts_init(ptdesc);
- lruvec_stat_add_folio(folio, NR_PAGETABLE);
+ __pagetable_ctor(ptdesc);
return true;
}
-static inline void pagetable_pmd_dtor(struct ptdesc *ptdesc)
-{
- struct folio *folio = ptdesc_folio(ptdesc);
-
- pmd_ptlock_free(ptdesc);
- __folio_clear_pgtable(folio);
- lruvec_stat_sub_folio(folio, NR_PAGETABLE);
-}
-
/*
* No scalability reason to split PUD locks yet, but follow the same pattern
* as the PMD locks to make it easier if we decide to. The VM should not be
@@ -3160,18 +3659,17 @@ static inline spinlock_t *pud_lock(struct mm_struct *mm, pud_t *pud)
static inline void pagetable_pud_ctor(struct ptdesc *ptdesc)
{
- struct folio *folio = ptdesc_folio(ptdesc);
-
- __folio_set_pgtable(folio);
- lruvec_stat_add_folio(folio, NR_PAGETABLE);
+ __pagetable_ctor(ptdesc);
}
-static inline void pagetable_pud_dtor(struct ptdesc *ptdesc)
+static inline void pagetable_p4d_ctor(struct ptdesc *ptdesc)
{
- struct folio *folio = ptdesc_folio(ptdesc);
+ __pagetable_ctor(ptdesc);
+}
- __folio_clear_pgtable(folio);
- lruvec_stat_sub_folio(folio, NR_PAGETABLE);
+static inline void pagetable_pgd_ctor(struct ptdesc *ptdesc)
+{
+ __pagetable_ctor(ptdesc);
}
extern void __init pagecache_init(void);
@@ -3193,7 +3691,6 @@ extern void reserve_bootmem_region(phys_addr_t start,
/* Free the reserved page into the buddy system, so it gets managed. */
void free_reserved_page(struct page *page);
-#define free_highmem_page(page) free_reserved_page(page)
static inline void mark_page_reserved(struct page *page)
{
@@ -3232,7 +3729,7 @@ static inline unsigned long get_num_physpages(void)
}
/*
- * Using memblock node mappings, an architecture may initialise its
+ * FIXME: Using memblock node mappings, an architecture may initialise its
* zones, allocate the backing mem_map and account for memory holes in an
* architecture independent manner.
*
@@ -3247,7 +3744,7 @@ static inline unsigned long get_num_physpages(void)
* memblock_add_node(base, size, nid, MEMBLOCK_NONE)
* free_area_init(max_zone_pfns);
*/
-void free_area_init(unsigned long *max_zone_pfn);
+void arch_zone_limits_init(unsigned long *max_zone_pfn);
unsigned long node_map_pfn_alignment(void);
extern unsigned long absent_pages_in_range(unsigned long start_pfn,
unsigned long end_pfn);
@@ -3293,6 +3790,8 @@ void vma_interval_tree_insert_after(struct vm_area_struct *node,
struct rb_root_cached *root);
void vma_interval_tree_remove(struct vm_area_struct *node,
struct rb_root_cached *root);
+struct vm_area_struct *vma_interval_tree_subtree_search(struct vm_area_struct *node,
+ unsigned long start, unsigned long last);
struct vm_area_struct *vma_interval_tree_iter_first(struct rb_root_cached *root,
unsigned long start, unsigned long last);
struct vm_area_struct *vma_interval_tree_iter_next(struct vm_area_struct *node,
@@ -3320,10 +3819,11 @@ void anon_vma_interval_tree_verify(struct anon_vma_chain *node);
avc; avc = anon_vma_interval_tree_iter_next(avc, start, last))
/* mmap.c */
-extern int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin);
+extern int __vm_enough_memory(const struct mm_struct *mm, long pages, int cap_sys_admin);
extern int insert_vm_struct(struct mm_struct *, struct vm_area_struct *);
extern void exit_mmap(struct mm_struct *);
-int relocate_vma_down(struct vm_area_struct *vma, unsigned long shift);
+bool mmap_read_lock_maybe_expand(struct mm_struct *mm, struct vm_area_struct *vma,
+ unsigned long addr, bool write);
static inline int check_data_rlimit(unsigned long rlim,
unsigned long new,
@@ -3352,9 +3852,9 @@ extern void vm_stat_account(struct mm_struct *, vm_flags_t, long npages);
extern bool vma_is_special_mapping(const struct vm_area_struct *vma,
const struct vm_special_mapping *sm);
-extern struct vm_area_struct *_install_special_mapping(struct mm_struct *mm,
+struct vm_area_struct *_install_special_mapping(struct mm_struct *mm,
unsigned long addr, unsigned long len,
- unsigned long flags,
+ vm_flags_t vm_flags,
const struct vm_special_mapping *spec);
unsigned long randomize_stack_top(unsigned long stack_top);
@@ -3371,9 +3871,6 @@ get_unmapped_area(struct file *file, unsigned long addr, unsigned long len,
return __get_unmapped_area(file, addr, len, pgoff, flags, 0);
}
-extern unsigned long mmap_region(struct file *file, unsigned long addr,
- unsigned long len, vm_flags_t vm_flags, unsigned long pgoff,
- struct list_head *uf);
extern unsigned long do_mmap(struct file *file, unsigned long addr,
unsigned long len, unsigned long prot, unsigned long flags,
vm_flags_t vm_flags, unsigned long pgoff, unsigned long *populate,
@@ -3421,10 +3918,10 @@ struct vm_unmapped_area_info {
extern unsigned long vm_unmapped_area(struct vm_unmapped_area_info *info);
/* truncate.c */
-extern void truncate_inode_pages(struct address_space *, loff_t);
-extern void truncate_inode_pages_range(struct address_space *,
- loff_t lstart, loff_t lend);
-extern void truncate_inode_pages_final(struct address_space *);
+void truncate_inode_pages(struct address_space *mapping, loff_t lstart);
+void truncate_inode_pages_range(struct address_space *mapping, loff_t lstart,
+ uoff_t lend);
+void truncate_inode_pages_final(struct address_space *mapping);
/* generic vm_area_ops exported for stackable file systems */
extern vm_fault_t filemap_fault(struct vm_fault *vmf);
@@ -3437,9 +3934,6 @@ extern unsigned long stack_guard_gap;
int expand_stack_locked(struct vm_area_struct *vma, unsigned long address);
struct vm_area_struct *expand_stack(struct mm_struct * mm, unsigned long addr);
-/* CONFIG_STACK_GROWSUP still needs to grow downwards at some places */
-int expand_downwards(struct vm_area_struct *vma, unsigned long address);
-
/* Look up the first VMA which satisfies addr < vm_end, NULL if none. */
extern struct vm_area_struct * find_vma(struct mm_struct * mm, unsigned long addr);
extern struct vm_area_struct * find_vma_prev(struct mm_struct * mm, unsigned long addr,
@@ -3465,7 +3959,7 @@ struct vm_area_struct *vma_lookup(struct mm_struct *mm, unsigned long addr)
return mtree_load(&mm->mm_mt, addr);
}
-static inline unsigned long stack_guard_start_gap(struct vm_area_struct *vma)
+static inline unsigned long stack_guard_start_gap(const struct vm_area_struct *vma)
{
if (vma->vm_flags & VM_GROWSDOWN)
return stack_guard_gap;
@@ -3477,7 +3971,7 @@ static inline unsigned long stack_guard_start_gap(struct vm_area_struct *vma)
return 0;
}
-static inline unsigned long vm_start_gap(struct vm_area_struct *vma)
+static inline unsigned long vm_start_gap(const struct vm_area_struct *vma)
{
unsigned long gap = stack_guard_start_gap(vma);
unsigned long vm_start = vma->vm_start;
@@ -3488,7 +3982,7 @@ static inline unsigned long vm_start_gap(struct vm_area_struct *vma)
return vm_start;
}
-static inline unsigned long vm_end_gap(struct vm_area_struct *vma)
+static inline unsigned long vm_end_gap(const struct vm_area_struct *vma)
{
unsigned long vm_end = vma->vm_end;
@@ -3500,11 +3994,95 @@ static inline unsigned long vm_end_gap(struct vm_area_struct *vma)
return vm_end;
}
-static inline unsigned long vma_pages(struct vm_area_struct *vma)
+static inline unsigned long vma_pages(const struct vm_area_struct *vma)
{
return (vma->vm_end - vma->vm_start) >> PAGE_SHIFT;
}
+static inline unsigned long vma_desc_size(const struct vm_area_desc *desc)
+{
+ return desc->end - desc->start;
+}
+
+static inline unsigned long vma_desc_pages(const struct vm_area_desc *desc)
+{
+ return vma_desc_size(desc) >> PAGE_SHIFT;
+}
+
+/**
+ * mmap_action_remap - helper for mmap_prepare hook to specify that a pure PFN
+ * remap is required.
+ * @desc: The VMA descriptor for the VMA requiring remap.
+ * @start: The virtual address to start the remap from, must be within the VMA.
+ * @start_pfn: The first PFN in the range to remap.
+ * @size: The size of the range to remap, in bytes, at most spanning to the end
+ * of the VMA.
+ */
+static inline void mmap_action_remap(struct vm_area_desc *desc,
+ unsigned long start,
+ unsigned long start_pfn,
+ unsigned long size)
+{
+ struct mmap_action *action = &desc->action;
+
+ /* [start, start + size) must be within the VMA. */
+ WARN_ON_ONCE(start < desc->start || start >= desc->end);
+ WARN_ON_ONCE(start + size > desc->end);
+
+ action->type = MMAP_REMAP_PFN;
+ action->remap.start = start;
+ action->remap.start_pfn = start_pfn;
+ action->remap.size = size;
+ action->remap.pgprot = desc->page_prot;
+}
+
+/**
+ * mmap_action_remap_full - helper for mmap_prepare hook to specify that the
+ * entirety of a VMA should be PFN remapped.
+ * @desc: The VMA descriptor for the VMA requiring remap.
+ * @start_pfn: The first PFN in the range to remap.
+ */
+static inline void mmap_action_remap_full(struct vm_area_desc *desc,
+ unsigned long start_pfn)
+{
+ mmap_action_remap(desc, desc->start, start_pfn, vma_desc_size(desc));
+}
+
+/**
+ * mmap_action_ioremap - helper for mmap_prepare hook to specify that a pure PFN
+ * I/O remap is required.
+ * @desc: The VMA descriptor for the VMA requiring remap.
+ * @start: The virtual address to start the remap from, must be within the VMA.
+ * @start_pfn: The first PFN in the range to remap.
+ * @size: The size of the range to remap, in bytes, at most spanning to the end
+ * of the VMA.
+ */
+static inline void mmap_action_ioremap(struct vm_area_desc *desc,
+ unsigned long start,
+ unsigned long start_pfn,
+ unsigned long size)
+{
+ mmap_action_remap(desc, start, start_pfn, size);
+ desc->action.type = MMAP_IO_REMAP_PFN;
+}
+
+/**
+ * mmap_action_ioremap_full - helper for mmap_prepare hook to specify that the
+ * entirety of a VMA should be PFN I/O remapped.
+ * @desc: The VMA descriptor for the VMA requiring remap.
+ * @start_pfn: The first PFN in the range to remap.
+ */
+static inline void mmap_action_ioremap_full(struct vm_area_desc *desc,
+ unsigned long start_pfn)
+{
+ mmap_action_ioremap(desc, desc->start, start_pfn, vma_desc_size(desc));
+}
+
+void mmap_action_prepare(struct mmap_action *action,
+ struct vm_area_desc *desc);
+int mmap_action_complete(struct mmap_action *action,
+ struct vm_area_struct *vma);
+
/* Look up the first VMA which exactly match the interval vm_start ... vm_end */
static inline struct vm_area_struct *find_exact_vma(struct mm_struct *mm,
unsigned long vm_start, unsigned long vm_end)
@@ -3517,17 +4095,17 @@ static inline struct vm_area_struct *find_exact_vma(struct mm_struct *mm,
return vma;
}
-static inline bool range_in_vma(struct vm_area_struct *vma,
+static inline bool range_in_vma(const struct vm_area_struct *vma,
unsigned long start, unsigned long end)
{
return (vma && vma->vm_start <= start && end <= vma->vm_end);
}
#ifdef CONFIG_MMU
-pgprot_t vm_get_page_prot(unsigned long vm_flags);
+pgprot_t vm_get_page_prot(vm_flags_t vm_flags);
void vma_set_page_prot(struct vm_area_struct *vma);
#else
-static inline pgprot_t vm_get_page_prot(unsigned long vm_flags)
+static inline pgprot_t vm_get_page_prot(vm_flags_t vm_flags)
{
return __pgprot(0);
}
@@ -3546,10 +4124,9 @@ unsigned long change_prot_numa(struct vm_area_struct *vma,
struct vm_area_struct *find_extend_vma_locked(struct mm_struct *,
unsigned long addr);
-int remap_pfn_range(struct vm_area_struct *, unsigned long addr,
- unsigned long pfn, unsigned long size, pgprot_t);
-int remap_pfn_range_notrack(struct vm_area_struct *vma, unsigned long addr,
- unsigned long pfn, unsigned long size, pgprot_t prot);
+int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
+ unsigned long pfn, unsigned long size, pgprot_t pgprot);
+
int vm_insert_page(struct vm_area_struct *, unsigned long addr, struct page *);
int vm_insert_pages(struct vm_area_struct *vma, unsigned long addr,
struct page **pages, unsigned long *num);
@@ -3557,14 +4134,16 @@ int vm_map_pages(struct vm_area_struct *vma, struct page **pages,
unsigned long num);
int vm_map_pages_zero(struct vm_area_struct *vma, struct page **pages,
unsigned long num);
+vm_fault_t vmf_insert_page_mkwrite(struct vm_fault *vmf, struct page *page,
+ bool write);
vm_fault_t vmf_insert_pfn(struct vm_area_struct *vma, unsigned long addr,
unsigned long pfn);
vm_fault_t vmf_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr,
unsigned long pfn, pgprot_t pgprot);
vm_fault_t vmf_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
- pfn_t pfn);
+ unsigned long pfn);
vm_fault_t vmf_insert_mixed_mkwrite(struct vm_area_struct *vma,
- unsigned long addr, pfn_t pfn);
+ unsigned long addr, unsigned long pfn);
int vm_iomap_memory(struct vm_area_struct *vma, phys_addr_t start, unsigned long len);
static inline vm_fault_t vmf_insert_page(struct vm_area_struct *vma,
@@ -3580,15 +4159,24 @@ static inline vm_fault_t vmf_insert_page(struct vm_area_struct *vma,
return VM_FAULT_NOPAGE;
}
-#ifndef io_remap_pfn_range
-static inline int io_remap_pfn_range(struct vm_area_struct *vma,
- unsigned long addr, unsigned long pfn,
- unsigned long size, pgprot_t prot)
+#ifndef io_remap_pfn_range_pfn
+static inline unsigned long io_remap_pfn_range_pfn(unsigned long pfn,
+ unsigned long size)
{
- return remap_pfn_range(vma, addr, pfn, size, pgprot_decrypted(prot));
+ return pfn;
}
#endif
+static inline int io_remap_pfn_range(struct vm_area_struct *vma,
+ unsigned long addr, unsigned long orig_pfn,
+ unsigned long size, pgprot_t orig_prot)
+{
+ const unsigned long pfn = io_remap_pfn_range_pfn(orig_pfn, size);
+ const pgprot_t prot = pgprot_decrypted(orig_prot);
+
+ return remap_pfn_range(vma, addr, pfn, size, prot);
+}
+
static inline vm_fault_t vmf_error(int err)
{
if (err == -ENOMEM)
@@ -3631,7 +4219,7 @@ static inline int vm_fault_to_errno(vm_fault_t vm_fault, int foll_flags)
* Indicates whether GUP can follow a PROT_NONE mapped page, or whether
* a (NUMA hinting) fault is required.
*/
-static inline bool gup_can_follow_protnone(struct vm_area_struct *vma,
+static inline bool gup_can_follow_protnone(const struct vm_area_struct *vma,
unsigned int flags)
{
/*
@@ -3738,12 +4326,16 @@ extern void __kernel_map_pages(struct page *page, int numpages, int enable);
#ifdef CONFIG_DEBUG_PAGEALLOC
static inline void debug_pagealloc_map_pages(struct page *page, int numpages)
{
+ iommu_debug_check_unmapped(page, numpages);
+
if (debug_pagealloc_enabled_static())
__kernel_map_pages(page, numpages, 1);
}
static inline void debug_pagealloc_unmap_pages(struct page *page, int numpages)
{
+ iommu_debug_check_unmapped(page, numpages);
+
if (debug_pagealloc_enabled_static())
__kernel_map_pages(page, numpages, 0);
}
@@ -3761,7 +4353,7 @@ static inline bool debug_guardpage_enabled(void)
return static_branch_unlikely(&_debug_guardpage_enabled);
}
-static inline bool page_is_guard(struct page *page)
+static inline bool page_is_guard(const struct page *page)
{
if (!debug_guardpage_enabled())
return false;
@@ -3792,13 +4384,68 @@ static inline void debug_pagealloc_map_pages(struct page *page, int numpages) {}
static inline void debug_pagealloc_unmap_pages(struct page *page, int numpages) {}
static inline unsigned int debug_guardpage_minorder(void) { return 0; }
static inline bool debug_guardpage_enabled(void) { return false; }
-static inline bool page_is_guard(struct page *page) { return false; }
+static inline bool page_is_guard(const struct page *page) { return false; }
static inline bool set_page_guard(struct zone *zone, struct page *page,
unsigned int order) { return false; }
static inline void clear_page_guard(struct zone *zone, struct page *page,
unsigned int order) {}
#endif /* CONFIG_DEBUG_PAGEALLOC */
+#ifndef clear_pages
+/**
+ * clear_pages() - clear a page range for kernel-internal use.
+ * @addr: start address
+ * @npages: number of pages
+ *
+ * Use clear_user_pages() instead when clearing a page range to be
+ * mapped to user space.
+ *
+ * Does absolutely no exception handling.
+ *
+ * Note that even though the clearing operation is preemptible, clear_pages()
+ * does not (and on architectures where it reduces to a few long-running
+ * instructions, might not be able to) call cond_resched() to check if
+ * rescheduling is required.
+ *
+ * When running under preemptible models this is not a problem. Under
+ * cooperatively scheduled models, however, the caller is expected to
+ * limit @npages to no more than PROCESS_PAGES_NON_PREEMPT_BATCH.
+ */
+static inline void clear_pages(void *addr, unsigned int npages)
+{
+ do {
+ clear_page(addr);
+ addr += PAGE_SIZE;
+ } while (--npages);
+}
+#endif
+
+#ifndef PROCESS_PAGES_NON_PREEMPT_BATCH
+#ifdef clear_pages
+/*
+ * The architecture defines clear_pages(), and we assume that it is
+ * generally "fast". So choose a batch size large enough to allow the processor
+ * headroom for optimizing the operation and yet small enough that we see
+ * reasonable preemption latency for when this optimization is not possible
+ * (ex. slow microarchitectures, memory bandwidth saturation.)
+ *
+ * With a value of 32MB and assuming a memory bandwidth of ~10GBps, this should
+ * result in worst case preemption latency of around 3ms when clearing pages.
+ *
+ * (See comment above clear_pages() for why preemption latency is a concern
+ * here.)
+ */
+#define PROCESS_PAGES_NON_PREEMPT_BATCH (SZ_32M >> PAGE_SHIFT)
+#else /* !clear_pages */
+/*
+ * The architecture does not provide a clear_pages() implementation. Assume
+ * that clear_page() -- which clear_pages() will fallback to -- is relatively
+ * slow and choose a small value for PROCESS_PAGES_NON_PREEMPT_BATCH.
+ */
+#define PROCESS_PAGES_NON_PREEMPT_BATCH 1
+#endif
+#endif
+
#ifdef __HAVE_ARCH_GATE_AREA
extern struct vm_area_struct *get_gate_vma(struct mm_struct *mm);
extern int in_gate_area_no_mm(unsigned long addr);
@@ -3815,13 +4462,7 @@ static inline int in_gate_area(struct mm_struct *mm, unsigned long addr)
}
#endif /* __HAVE_ARCH_GATE_AREA */
-extern bool process_shares_mm(struct task_struct *p, struct mm_struct *mm);
-
-#ifdef CONFIG_SYSCTL
-extern int sysctl_drop_caches;
-int drop_caches_sysctl_handler(const struct ctl_table *, int, void *, size_t *,
- loff_t *);
-#endif
+bool process_shares_mm(const struct task_struct *p, const struct mm_struct *mm);
void drop_slab(void);
@@ -3841,6 +4482,7 @@ static inline void print_vma_addr(char *prefix, unsigned long rip)
#endif
void *sparse_buffer_alloc(unsigned long size);
+unsigned long section_map_size(void);
struct page * __populate_section_memmap(unsigned long pfn,
unsigned long nr_pages, int nid, struct vmem_altmap *altmap,
struct dev_pagemap *pgmap);
@@ -3849,7 +4491,8 @@ p4d_t *vmemmap_p4d_populate(pgd_t *pgd, unsigned long addr, int node);
pud_t *vmemmap_pud_populate(p4d_t *p4d, unsigned long addr, int node);
pmd_t *vmemmap_pmd_populate(pud_t *pud, unsigned long addr, int node);
pte_t *vmemmap_pte_populate(pmd_t *pmd, unsigned long addr, int node,
- struct vmem_altmap *altmap, struct page *reuse);
+ struct vmem_altmap *altmap, unsigned long ptpfn,
+ unsigned long flags);
void *vmemmap_alloc_block(unsigned long size, int node);
struct vmem_altmap;
void *vmemmap_alloc_block_buf(unsigned long size, int node,
@@ -3865,6 +4508,12 @@ int vmemmap_populate_hugepages(unsigned long start, unsigned long end,
int node, struct vmem_altmap *altmap);
int vmemmap_populate(unsigned long start, unsigned long end, int node,
struct vmem_altmap *altmap);
+int vmemmap_populate_hvo(unsigned long start, unsigned long end, int node,
+ unsigned long headsize);
+int vmemmap_undo_hvo(unsigned long start, unsigned long end, int node,
+ unsigned long headsize);
+void vmemmap_wrprotect_hvo(unsigned long start, unsigned long end, int node,
+ unsigned long headsize);
void vmemmap_populate_print_last(void);
#ifdef CONFIG_MEMORY_HOTPLUG
void vmemmap_free(unsigned long start, unsigned long end,
@@ -3872,7 +4521,7 @@ void vmemmap_free(unsigned long start, unsigned long end,
#endif
#ifdef CONFIG_SPARSEMEM_VMEMMAP
-static inline unsigned long vmem_altmap_offset(struct vmem_altmap *altmap)
+static inline unsigned long vmem_altmap_offset(const struct vmem_altmap *altmap)
{
/* number of pfns from base where pfn_to_page() is valid */
if (altmap)
@@ -3886,7 +4535,7 @@ static inline void vmem_altmap_free(struct vmem_altmap *altmap,
altmap->alloc -= nr_pfns;
}
#else
-static inline unsigned long vmem_altmap_offset(struct vmem_altmap *altmap)
+static inline unsigned long vmem_altmap_offset(const struct vmem_altmap *altmap)
{
return 0;
}
@@ -3931,9 +4580,6 @@ static inline bool vmemmap_can_optimize(struct vmem_altmap *altmap,
}
#endif
-void register_page_bootmem_memmap(unsigned long section_nr, struct page *map,
- unsigned long nr_pages);
-
enum mf_flags {
MF_COUNT_INCREASED = 1 << 0,
MF_ACTION_REQUIRED = 1 << 1,
@@ -3947,7 +4593,6 @@ enum mf_flags {
int mf_dax_kill_procs(struct address_space *mapping, pgoff_t index,
unsigned long count, int mf_flags);
extern int memory_failure(unsigned long pfn, int flags);
-extern void memory_failure_queue_kick(int cpu);
extern int unpoison_memory(unsigned long pfn);
extern atomic_long_t num_poisoned_pages __read_mostly;
extern int soft_offline_page(unsigned long pfn, int flags);
@@ -4039,6 +4684,7 @@ enum mf_action_page_type {
MF_MSG_DAX,
MF_MSG_UNSPLIT_THP,
MF_MSG_ALREADY_POISONED,
+ MF_MSG_PFN_MAP,
MF_MSG_UNKNOWN,
};
@@ -4094,78 +4740,15 @@ unsigned long wp_shared_mapping_range(struct address_space *mapping,
pgoff_t first_index, pgoff_t nr);
#endif
-extern int sysctl_nr_trim_pages;
-
-#ifdef CONFIG_PRINTK
-void mem_dump_obj(void *object);
-#else
-static inline void mem_dump_obj(void *object) {}
-#endif
-
-static inline bool is_write_sealed(int seals)
-{
- return seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE);
-}
-
-/**
- * is_readonly_sealed - Checks whether write-sealed but mapped read-only,
- * in which case writes should be disallowing moving
- * forwards.
- * @seals: the seals to check
- * @vm_flags: the VMA flags to check
- *
- * Returns whether readonly sealed, in which case writess should be disallowed
- * going forward.
- */
-static inline bool is_readonly_sealed(int seals, vm_flags_t vm_flags)
-{
- /*
- * Since an F_SEAL_[FUTURE_]WRITE sealed memfd can be mapped as
- * MAP_SHARED and read-only, take care to not allow mprotect to
- * revert protections on such mappings. Do this only for shared
- * mappings. For private mappings, don't need to mask
- * VM_MAYWRITE as we still want them to be COW-writable.
- */
- if (is_write_sealed(seals) &&
- ((vm_flags & (VM_SHARED | VM_WRITE)) == VM_SHARED))
- return true;
-
- return false;
-}
-
-/**
- * seal_check_write - Check for F_SEAL_WRITE or F_SEAL_FUTURE_WRITE flags and
- * handle them.
- * @seals: the seals to check
- * @vma: the vma to operate on
- *
- * Check whether F_SEAL_WRITE or F_SEAL_FUTURE_WRITE are set; if so, do proper
- * check/handling on the vma flags. Return 0 if check pass, or <0 for errors.
- */
-static inline int seal_check_write(int seals, struct vm_area_struct *vma)
-{
- if (!is_write_sealed(seals))
- return 0;
-
- /*
- * New PROT_WRITE and MAP_SHARED mmaps are not allowed when
- * write seals are active.
- */
- if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_WRITE))
- return -EPERM;
-
- return 0;
-}
-
#ifdef CONFIG_ANON_VMA_NAME
-int madvise_set_anon_name(struct mm_struct *mm, unsigned long start,
- unsigned long len_in,
- struct anon_vma_name *anon_name);
+int set_anon_vma_name(unsigned long addr, unsigned long size,
+ const char __user *uname);
#else
-static inline int
-madvise_set_anon_name(struct mm_struct *mm, unsigned long start,
- unsigned long len_in, struct anon_vma_name *anon_name) {
- return 0;
+static inline
+int set_anon_vma_name(unsigned long addr, unsigned long size,
+ const char __user *uname)
+{
+ return -EINVAL;
}
#endif
@@ -4197,6 +4780,7 @@ void vma_pgtable_walk_begin(struct vm_area_struct *vma);
void vma_pgtable_walk_end(struct vm_area_struct *vma);
int reserve_mem_find_by_name(const char *name, phys_addr_t *start, phys_addr_t *size);
+int reserve_mem_release_by_name(const char *name);
#ifdef CONFIG_64BIT
int do_mseal(unsigned long start, size_t len_in, unsigned long flags);
@@ -4229,4 +4813,85 @@ int arch_get_shadow_stack_status(struct task_struct *t, unsigned long __user *st
int arch_set_shadow_stack_status(struct task_struct *t, unsigned long status);
int arch_lock_shadow_stack_status(struct task_struct *t, unsigned long status);
+/*
+ * DMA mapping IDs for page_pool
+ *
+ * When DMA-mapping a page, page_pool allocates an ID (from an xarray) and
+ * stashes it in the upper bits of page->pp_magic. We always want to be able to
+ * unambiguously identify page pool pages (using page_pool_page_is_pp()). Non-PP
+ * pages can have arbitrary kernel pointers stored in the same field as pp_magic
+ * (since it overlaps with page->lru.next), so we must ensure that we cannot
+ * mistake a valid kernel pointer with any of the values we write into this
+ * field.
+ *
+ * On architectures that set POISON_POINTER_DELTA, this is already ensured,
+ * since this value becomes part of PP_SIGNATURE; meaning we can just use the
+ * space between the PP_SIGNATURE value (without POISON_POINTER_DELTA), and the
+ * lowest bits of POISON_POINTER_DELTA. On arches where POISON_POINTER_DELTA is
+ * 0, we use the lowest bit of PAGE_OFFSET as the boundary if that value is
+ * known at compile-time.
+ *
+ * If the value of PAGE_OFFSET is not known at compile time, or if it is too
+ * small to leave at least 8 bits available above PP_SIGNATURE, we define the
+ * number of bits to be 0, which turns off the DMA index tracking altogether
+ * (see page_pool_register_dma_index()).
+ */
+#define PP_DMA_INDEX_SHIFT (1 + __fls(PP_SIGNATURE - POISON_POINTER_DELTA))
+#if POISON_POINTER_DELTA > 0
+/* PP_SIGNATURE includes POISON_POINTER_DELTA, so limit the size of the DMA
+ * index to not overlap with that if set
+ */
+#define PP_DMA_INDEX_BITS MIN(32, __ffs(POISON_POINTER_DELTA) - PP_DMA_INDEX_SHIFT)
+#else
+/* Use the lowest bit of PAGE_OFFSET if there's at least 8 bits available; see above */
+#define PP_DMA_INDEX_MIN_OFFSET (1 << (PP_DMA_INDEX_SHIFT + 8))
+#define PP_DMA_INDEX_BITS ((__builtin_constant_p(PAGE_OFFSET) && \
+ PAGE_OFFSET >= PP_DMA_INDEX_MIN_OFFSET && \
+ !(PAGE_OFFSET & (PP_DMA_INDEX_MIN_OFFSET - 1))) ? \
+ MIN(32, __ffs(PAGE_OFFSET) - PP_DMA_INDEX_SHIFT) : 0)
+
+#endif
+
+#define PP_DMA_INDEX_MASK GENMASK(PP_DMA_INDEX_BITS + PP_DMA_INDEX_SHIFT - 1, \
+ PP_DMA_INDEX_SHIFT)
+
+/* Mask used for checking in page_pool_page_is_pp() below. page->pp_magic is
+ * OR'ed with PP_SIGNATURE after the allocation in order to preserve bit 0 for
+ * the head page of compound page and bit 1 for pfmemalloc page, as well as the
+ * bits used for the DMA index. page_is_pfmemalloc() is checked in
+ * __page_pool_put_page() to avoid recycling the pfmemalloc page.
+ */
+#define PP_MAGIC_MASK ~(PP_DMA_INDEX_MASK | 0x3UL)
+
+#ifdef CONFIG_PAGE_POOL
+static inline bool page_pool_page_is_pp(const struct page *page)
+{
+ return (page->pp_magic & PP_MAGIC_MASK) == PP_SIGNATURE;
+}
+#else
+static inline bool page_pool_page_is_pp(const struct page *page)
+{
+ return false;
+}
+#endif
+
+#define PAGE_SNAPSHOT_FAITHFUL (1 << 0)
+#define PAGE_SNAPSHOT_PG_BUDDY (1 << 1)
+#define PAGE_SNAPSHOT_PG_IDLE (1 << 2)
+
+struct page_snapshot {
+ struct folio folio_snapshot;
+ struct page page_snapshot;
+ unsigned long pfn;
+ unsigned long idx;
+ unsigned long flags;
+};
+
+static inline bool snapshot_page_is_faithful(const struct page_snapshot *ps)
+{
+ return ps->flags & PAGE_SNAPSHOT_FAITHFUL;
+}
+
+void snapshot_page(struct page_snapshot *ps, const struct page *page);
+
#endif /* _LINUX_MM_H */