Diffstat (limited to 'include/linux/mm.h')
-rw-r--r--  include/linux/mm.h | 941
1 file changed, 587 insertions(+), 354 deletions(-)
diff --git a/include/linux/mm.h b/include/linux/mm.h
index b6a4d6471b4a..059ca4767e14 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -5,6 +5,7 @@
#include <linux/errno.h>
#include <linux/mmdebug.h>
#include <linux/gfp.h>
+#include <linux/pgalloc_tag.h>
#include <linux/bug.h>
#include <linux/list.h>
#include <linux/mmzone.h>
@@ -36,6 +37,7 @@ struct anon_vma;
struct anon_vma_chain;
struct user_struct;
struct pt_regs;
+struct folio_batch;
extern int sysctl_page_lock_unfairness;
@@ -86,7 +88,7 @@ extern int sysctl_legacy_va_layout;
#ifdef CONFIG_HAVE_ARCH_MMAP_RND_BITS
extern const int mmap_rnd_bits_min;
-extern const int mmap_rnd_bits_max;
+extern int mmap_rnd_bits_max __ro_after_init;
extern int mmap_rnd_bits __read_mostly;
#endif
#ifdef CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS
@@ -96,7 +98,11 @@ extern int mmap_rnd_compat_bits __read_mostly;
#endif
#ifndef PHYSMEM_END
+# ifdef MAX_PHYSMEM_BITS
# define PHYSMEM_END ((1ULL << MAX_PHYSMEM_BITS) - 1)
+# else
+# define PHYSMEM_END (((phys_addr_t)-1) & ~(1ULL << 63))
+# endif
#endif
#include <asm/page.h>
@@ -206,11 +212,11 @@ extern int sysctl_overcommit_memory;
extern int sysctl_overcommit_ratio;
extern unsigned long sysctl_overcommit_kbytes;
-int overcommit_ratio_handler(struct ctl_table *, int, void *, size_t *,
+int overcommit_ratio_handler(const struct ctl_table *, int, void *, size_t *,
loff_t *);
-int overcommit_kbytes_handler(struct ctl_table *, int, void *, size_t *,
+int overcommit_kbytes_handler(const struct ctl_table *, int, void *, size_t *,
loff_t *);
-int overcommit_policy_handler(struct ctl_table *, int, void *, size_t *,
+int overcommit_policy_handler(const struct ctl_table *, int, void *, size_t *,
loff_t *);
#if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP)
@@ -230,7 +236,6 @@ int overcommit_policy_handler(struct ctl_table *, int, void *, size_t *,
/* test whether an address (unsigned long or pointer) is aligned to PAGE_SIZE */
#define PAGE_ALIGNED(addr) IS_ALIGNED((unsigned long)(addr), PAGE_SIZE)
-#define lru_to_page(head) (list_entry((head)->prev, struct page, lru))
static inline struct folio *lru_to_folio(struct list_head *head)
{
return list_entry((head)->prev, struct folio, lru);
@@ -333,12 +338,16 @@ extern unsigned int kobjsize(const void *objp);
#endif /* CONFIG_ARCH_USES_HIGH_VMA_FLAGS */
#ifdef CONFIG_ARCH_HAS_PKEYS
-# define VM_PKEY_SHIFT VM_HIGH_ARCH_BIT_0
-# define VM_PKEY_BIT0 VM_HIGH_ARCH_0 /* A protection key is a 4-bit value */
-# define VM_PKEY_BIT1 VM_HIGH_ARCH_1 /* on x86 and 5-bit value on ppc64 */
-# define VM_PKEY_BIT2 VM_HIGH_ARCH_2
-# define VM_PKEY_BIT3 VM_HIGH_ARCH_3
-#ifdef CONFIG_PPC
+# define VM_PKEY_SHIFT VM_HIGH_ARCH_BIT_0
+# define VM_PKEY_BIT0 VM_HIGH_ARCH_0
+# define VM_PKEY_BIT1 VM_HIGH_ARCH_1
+# define VM_PKEY_BIT2 VM_HIGH_ARCH_2
+#if CONFIG_ARCH_PKEY_BITS > 3
+# define VM_PKEY_BIT3 VM_HIGH_ARCH_3
+#else
+# define VM_PKEY_BIT3 0
+#endif
+#if CONFIG_ARCH_PKEY_BITS > 4
# define VM_PKEY_BIT4 VM_HIGH_ARCH_4
#else
# define VM_PKEY_BIT4 0
@@ -362,12 +371,10 @@ extern unsigned int kobjsize(const void *objp);
#if defined(CONFIG_X86)
# define VM_PAT VM_ARCH_1 /* PAT reserves whole VMA at once (x86) */
-#elif defined(CONFIG_PPC)
+#elif defined(CONFIG_PPC64)
# define VM_SAO VM_ARCH_1 /* Strong Access Ordering (powerpc) */
#elif defined(CONFIG_PARISC)
# define VM_GROWSUP VM_ARCH_1
-#elif defined(CONFIG_IA64)
-# define VM_GROWSUP VM_ARCH_1
#elif defined(CONFIG_SPARC64)
# define VM_SPARC_ADI VM_ARCH_1 /* Uses ADI tag for access control */
# define VM_ARCH_CLEAR VM_SPARC_ADI
@@ -379,8 +386,8 @@ extern unsigned int kobjsize(const void *objp);
#endif
#if defined(CONFIG_ARM64_MTE)
-# define VM_MTE VM_HIGH_ARCH_0 /* Use Tagged memory for access control */
-# define VM_MTE_ALLOWED VM_HIGH_ARCH_1 /* Tagged memory permitted */
+# define VM_MTE VM_HIGH_ARCH_4 /* Use Tagged memory for access control */
+# define VM_MTE_ALLOWED VM_HIGH_ARCH_5 /* Tagged memory permitted */
#else
# define VM_MTE VM_NONE
# define VM_MTE_ALLOWED VM_NONE
@@ -397,6 +404,34 @@ extern unsigned int kobjsize(const void *objp);
# define VM_UFFD_MINOR VM_NONE
#endif /* CONFIG_HAVE_ARCH_USERFAULTFD_MINOR */
+/*
+ * This flag is used to connect VFIO to arch specific KVM code. It
+ * indicates that the memory under this VMA is safe for use with any
+ * non-cacheable memory type inside KVM. Some VFIO devices, on some
+ * platforms, are thought to be unsafe and can cause machine crashes
+ * if KVM does not lock down the memory type.
+ */
+#ifdef CONFIG_64BIT
+#define VM_ALLOW_ANY_UNCACHED_BIT 39
+#define VM_ALLOW_ANY_UNCACHED BIT(VM_ALLOW_ANY_UNCACHED_BIT)
+#else
+#define VM_ALLOW_ANY_UNCACHED VM_NONE
+#endif
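
A minimal opt-in sketch for the flag above, assuming a hypothetical VFIO-style driver whose device is known to tolerate uncached aliases; vm_flags_set() requires the mmap write lock, which ->mmap() handlers already hold:

        /* Hypothetical driver ->mmap() handler; not part of this diff. */
        static int example_vfio_mmap(struct file *file, struct vm_area_struct *vma)
        {
                vm_flags_set(vma, VM_ALLOW_ANY_UNCACHED);
                return 0;
        }
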
+
+#ifdef CONFIG_64BIT
+#define VM_DROPPABLE_BIT 40
+#define VM_DROPPABLE BIT(VM_DROPPABLE_BIT)
+#elif defined(CONFIG_PPC32)
+#define VM_DROPPABLE VM_ARCH_1
+#else
+#define VM_DROPPABLE VM_NONE
+#endif
+
+#ifdef CONFIG_64BIT
+/* VM is sealed, in vm_flags */
+#define VM_SEALED _BITUL(63)
+#endif
+
/* Bits set in the VMA until the stack is in its final location */
#define VM_STACK_INCOMPLETE_SETUP (VM_RAND_READ | VM_SEQ_READ | VM_STACK_EARLY)
@@ -623,7 +658,7 @@ struct vm_operations_struct {
* policy.
*/
struct mempolicy *(*get_policy)(struct vm_area_struct *vma,
- unsigned long addr);
+ unsigned long addr, pgoff_t *ilx);
#endif
/*
* Called by vm_normal_page() for special PTEs to find the
@@ -787,6 +822,11 @@ static inline struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm,
return NULL;
}
+static inline void vma_assert_locked(struct vm_area_struct *vma)
+{
+ mmap_assert_locked(vma->vm_mm);
+}
+
static inline void release_fault_lock(struct vm_fault *vmf)
{
mmap_read_unlock(vmf->vma->vm_mm);
@@ -892,8 +932,8 @@ static inline bool vma_is_anonymous(struct vm_area_struct *vma)
*/
static inline bool vma_is_initial_heap(const struct vm_area_struct *vma)
{
- return vma->vm_start <= vma->vm_mm->brk &&
- vma->vm_end >= vma->vm_mm->start_brk;
+ return vma->vm_start < vma->vm_mm->brk &&
+ vma->vm_end > vma->vm_mm->start_brk;
}
/*
@@ -907,8 +947,8 @@ static inline bool vma_is_initial_stack(const struct vm_area_struct *vma)
* its "stack". It's not even well-defined for programs written
* in languages like Go.
*/
- return vma->vm_start <= vma->vm_mm->start_stack &&
- vma->vm_end >= vma->vm_mm->start_stack;
+ return vma->vm_start <= vma->vm_mm->start_stack &&
+ vma->vm_end >= vma->vm_mm->start_stack;
}
static inline bool vma_is_temporary_stack(struct vm_area_struct *vma)
@@ -941,6 +981,17 @@ static inline bool vma_is_accessible(struct vm_area_struct *vma)
return vma->vm_flags & VM_ACCESS_FLAGS;
}
+static inline bool is_shared_maywrite(vm_flags_t vm_flags)
+{
+ return (vm_flags & (VM_SHARED | VM_MAYWRITE)) ==
+ (VM_SHARED | VM_MAYWRITE);
+}
+
+static inline bool vma_is_shared_maywrite(struct vm_area_struct *vma)
+{
+ return is_shared_maywrite(vma->vm_flags);
+}
+
static inline
struct vm_area_struct *vma_find(struct vma_iterator *vmi, unsigned long max)
{
@@ -968,25 +1019,15 @@ static inline struct vm_area_struct *vma_prev(struct vma_iterator *vmi)
return mas_prev(&vmi->mas, 0);
}
-static inline
-struct vm_area_struct *vma_iter_prev_range(struct vma_iterator *vmi)
-{
- return mas_prev_range(&vmi->mas, 0);
-}
-
-static inline unsigned long vma_iter_addr(struct vma_iterator *vmi)
+static inline int vma_iter_clear_gfp(struct vma_iterator *vmi,
+ unsigned long start, unsigned long end, gfp_t gfp)
{
- return vmi->mas.index;
-}
+ __mas_set_range(&vmi->mas, start, end - 1);
+ mas_store_gfp(&vmi->mas, NULL, gfp);
+ if (unlikely(mas_is_err(&vmi->mas)))
+ return -ENOMEM;
-static inline unsigned long vma_iter_end(struct vma_iterator *vmi)
-{
- return vmi->mas.last + 1;
-}
-static inline int vma_iter_bulk_alloc(struct vma_iterator *vmi,
- unsigned long count)
-{
- return mas_expected_entries(&vmi->mas, count);
+ return 0;
}
/* Free any unused preallocations */
@@ -1069,7 +1110,7 @@ static inline unsigned int compound_order(struct page *page)
*
* Return: The order of the folio.
*/
-static inline unsigned int folio_order(struct folio *folio)
+static inline unsigned int folio_order(const struct folio *folio)
{
if (!folio_test_large(folio))
return 0;
@@ -1161,84 +1202,51 @@ static inline int is_vmalloc_or_module_addr(const void *x)
/*
* How many times the entire folio is mapped as a single unit (eg by a
* PMD or PUD entry). This is probably not what you want, except for
- * debugging purposes - it does not include PTE-mapped sub-pages; look
- * at folio_mapcount() or page_mapcount() or total_mapcount() instead.
+ * debugging purposes or implementation of other core folio_*() primitives.
*/
-static inline int folio_entire_mapcount(struct folio *folio)
+static inline int folio_entire_mapcount(const struct folio *folio)
{
VM_BUG_ON_FOLIO(!folio_test_large(folio), folio);
return atomic_read(&folio->_entire_mapcount) + 1;
}
-/*
- * The atomic page->_mapcount, starts from -1: so that transitions
- * both from it and to it can be tracked, using atomic_inc_and_test
- * and atomic_add_negative(-1).
- */
-static inline void page_mapcount_reset(struct page *page)
+static inline int folio_large_mapcount(const struct folio *folio)
{
- atomic_set(&(page)->_mapcount, -1);
+ VM_WARN_ON_FOLIO(!folio_test_large(folio), folio);
+ return atomic_read(&folio->_large_mapcount) + 1;
}
/**
- * page_mapcount() - Number of times this precise page is mapped.
- * @page: The page.
+ * folio_mapcount() - Number of mappings of this folio.
+ * @folio: The folio.
*
- * The number of times this page is mapped. If this page is part of
- * a large folio, it includes the number of times this page is mapped
- * as part of that folio.
+ * The folio mapcount corresponds to the number of present user page table
+ * entries that reference any part of a folio. Each such present user page
+ * table entry must be paired with exactly one folio reference.
*
- * Will report 0 for pages which cannot be mapped into userspace, eg
- * slab, page tables and similar.
- */
-static inline int page_mapcount(struct page *page)
-{
- int mapcount = atomic_read(&page->_mapcount) + 1;
-
- /* Handle page_has_type() pages */
- if (mapcount < 0)
- mapcount = 0;
- if (unlikely(PageCompound(page)))
- mapcount += folio_entire_mapcount(page_folio(page));
-
- return mapcount;
-}
-
-int folio_total_mapcount(struct folio *folio);
-
-/**
- * folio_mapcount() - Calculate the number of mappings of this folio.
- * @folio: The folio.
+ * For ordinary folios, each user page table entry (PTE/PMD/PUD/...) counts
+ * exactly once.
*
- * A large folio tracks both how many times the entire folio is mapped,
- * and how many times each individual page in the folio is mapped.
- * This function calculates the total number of times the folio is
- * mapped.
+ * For hugetlb folios, each abstracted "hugetlb" user page table entry that
+ * references the entire folio counts exactly once, even when such special
+ * page table entries are comprised of multiple ordinary page table entries.
+ *
+ * Will report 0 for pages which cannot be mapped into userspace, such as
+ * slab, page tables and similar.
*
* Return: The number of times this folio is mapped.
*/
-static inline int folio_mapcount(struct folio *folio)
+static inline int folio_mapcount(const struct folio *folio)
{
- if (likely(!folio_test_large(folio)))
- return atomic_read(&folio->_mapcount) + 1;
- return folio_total_mapcount(folio);
-}
+ int mapcount;
-static inline int total_mapcount(struct page *page)
-{
- if (likely(!PageCompound(page)))
- return atomic_read(&page->_mapcount) + 1;
- return folio_total_mapcount(page_folio(page));
-}
-
-static inline bool folio_large_is_mapped(struct folio *folio)
-{
- /*
- * Reading _entire_mapcount below could be omitted if hugetlb
- * participated in incrementing nr_pages_mapped when compound mapped.
- */
- return atomic_read(&folio->_nr_pages_mapped) > 0 ||
- atomic_read(&folio->_entire_mapcount) >= 0;
+ if (likely(!folio_test_large(folio))) {
+ mapcount = atomic_read(&folio->_mapcount) + 1;
+ if (page_mapcount_is_type(mapcount))
+ mapcount = 0;
+ return mapcount;
+ }
+ return folio_large_mapcount(folio);
}
/**
@@ -1247,11 +1255,9 @@ static inline bool folio_large_is_mapped(struct folio *folio)
*
* Return: True if any page in this folio is referenced by user page tables.
*/
-static inline bool folio_mapped(struct folio *folio)
+static inline bool folio_mapped(const struct folio *folio)
{
- if (likely(!folio_test_large(folio)))
- return atomic_read(&folio->_mapcount) >= 0;
- return folio_large_is_mapped(folio);
+ return folio_mapcount(folio) >= 1;
}
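
The rewrite makes the contract explicit: folio_mapped() is now defined purely in terms of folio_mapcount(). A small sketch of the counting rules described in the kernel-doc above, with an assumed caller:

        /* Sketch: one PMD mapping plus one PTE mapping of the same large
         * folio gives folio_mapcount() == 2; any count >= 1 means mapped. */
        static bool example_folio_mapped_exclusively(const struct folio *folio)
        {
                return folio_mapcount(folio) == 1;
        }
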
/*
@@ -1259,11 +1265,9 @@ static inline bool folio_mapped(struct folio *folio)
* For compound page it returns true if any sub-page of compound page is mapped,
* even if this particular sub-page is not itself mapped by any PTE or PMD.
*/
-static inline bool page_mapped(struct page *page)
+static inline bool page_mapped(const struct page *page)
{
- if (likely(!PageCompound(page)))
- return atomic_read(&page->_mapcount) >= 0;
- return folio_large_is_mapped(page_folio(page));
+ return folio_mapped(page_folio(page));
}
static inline struct page *virt_to_head_page(const void *x)
@@ -1286,11 +1290,10 @@ void put_pages_list(struct list_head *pages);
void split_page(struct page *page, unsigned int order);
void folio_copy(struct folio *dst, struct folio *src);
+int folio_mc_copy(struct folio *dst, struct folio *src);
unsigned long nr_free_buffer_pages(void);
-void destroy_large_folio(struct folio *folio);
-
/* Returns the number of bytes in this potentially compound page. */
static inline unsigned long page_size(struct page *page)
{
@@ -1343,7 +1346,6 @@ void set_pte_range(struct vm_fault *vmf, struct folio *folio,
struct page *page, unsigned int nr, unsigned long addr);
vm_fault_t finish_fault(struct vm_fault *vmf);
-vm_fault_t finish_mkwrite_fault(struct vm_fault *vmf);
#endif
/*
@@ -1409,27 +1411,22 @@ vm_fault_t finish_mkwrite_fault(struct vm_fault *vmf);
#if defined(CONFIG_ZONE_DEVICE) && defined(CONFIG_FS_DAX)
DECLARE_STATIC_KEY_FALSE(devmap_managed_key);
-bool __put_devmap_managed_page_refs(struct page *page, int refs);
-static inline bool put_devmap_managed_page_refs(struct page *page, int refs)
+bool __put_devmap_managed_folio_refs(struct folio *folio, int refs);
+static inline bool put_devmap_managed_folio_refs(struct folio *folio, int refs)
{
if (!static_branch_unlikely(&devmap_managed_key))
return false;
- if (!is_zone_device_page(page))
+ if (!folio_is_zone_device(folio))
return false;
- return __put_devmap_managed_page_refs(page, refs);
+ return __put_devmap_managed_folio_refs(folio, refs);
}
#else /* CONFIG_ZONE_DEVICE && CONFIG_FS_DAX */
-static inline bool put_devmap_managed_page_refs(struct page *page, int refs)
+static inline bool put_devmap_managed_folio_refs(struct folio *folio, int refs)
{
return false;
}
#endif /* CONFIG_ZONE_DEVICE && CONFIG_FS_DAX */
-static inline bool put_devmap_managed_page(struct page *page)
-{
- return put_devmap_managed_page_refs(page, 1);
-}
-
/* 127: arbitrary random number, small enough to assemble well */
#define folio_ref_zero_or_close_to_overflow(folio) \
((unsigned int) folio_ref_count(folio) + 127u <= 127u)
@@ -1501,6 +1498,8 @@ static inline void folio_put_refs(struct folio *folio, int refs)
__folio_put(folio);
}
+void folios_put_refs(struct folio_batch *folios, unsigned int *refs);
+
/*
* union release_pages_arg - an array of pages or folios
*
@@ -1523,18 +1522,19 @@ void release_pages(release_pages_arg, int nr);
/**
* folios_put - Decrement the reference count on an array of folios.
* @folios: The folios.
- * @nr: How many folios there are.
*
- * Like folio_put(), but for an array of folios. This is more efficient
- * than writing the loop yourself as it will optimise the locks which
- * need to be taken if the folios are freed.
+ * Like folio_put(), but for a batch of folios. This is more efficient
+ * than writing the loop yourself as it will optimise the locks which need
+ * to be taken if the folios are freed. The folios batch is returned
+ * empty and ready to be reused for another batch; there is no need to
+ * reinitialise it.
*
* Context: May be called in process or interrupt context, but not in NMI
* context. May be called while holding a spinlock.
*/
-static inline void folios_put(struct folio **folios, unsigned int nr)
+static inline void folios_put(struct folio_batch *folios)
{
- release_pages(folios, nr);
+ folios_put_refs(folios, NULL);
}
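
A minimal batching sketch for the new folio_batch-based folios_put(), assuming a hypothetical caller; struct folio_batch comes from <linux/pagevec.h>:

        static void example_put_batch(struct folio **folios, unsigned int nr)
        {
                struct folio_batch fbatch;
                unsigned int i;

                folio_batch_init(&fbatch);
                for (i = 0; i < nr; i++) {
                        if (!folio_batch_add(&fbatch, folios[i]))
                                folios_put(&fbatch);    /* batch full: drop refs */
                }
                folios_put(&fbatch);    /* returned empty, no re-init needed */
        }
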
static inline void put_page(struct page *page)
@@ -1545,7 +1545,7 @@ static inline void put_page(struct page *page)
* For some devmap managed pages we need to catch refcount transition
* from 2 to 1:
*/
- if (put_devmap_managed_page(&folio->page))
+ if (put_devmap_managed_folio_refs(folio, 1))
return;
folio_put(folio);
}
@@ -1577,17 +1577,20 @@ static inline void put_page(struct page *page)
* issue.
*
* Locking: the lockless algorithm described in folio_try_get_rcu()
- * provides safe operation for get_user_pages(), page_mkclean() and
+ * provides safe operation for get_user_pages(), folio_mkclean() and
* other calls that race to set up page table entries.
*/
#define GUP_PIN_COUNTING_BIAS (1U << 10)
void unpin_user_page(struct page *page);
+void unpin_folio(struct folio *folio);
void unpin_user_pages_dirty_lock(struct page **pages, unsigned long npages,
bool make_dirty);
void unpin_user_page_range_dirty_lock(struct page *page, unsigned long npages,
bool make_dirty);
void unpin_user_pages(struct page **pages, unsigned long npages);
+void unpin_user_folio(struct folio *folio, unsigned long npages);
+void unpin_folios(struct folio **folios, unsigned long nfolios);
static inline bool is_cow_mapping(vm_flags_t flags)
{
@@ -1627,13 +1630,11 @@ static inline int page_zone_id(struct page *page)
}
#ifdef NODE_NOT_IN_PAGE_FLAGS
-extern int page_to_nid(const struct page *page);
+int page_to_nid(const struct page *page);
#else
static inline int page_to_nid(const struct page *page)
{
- struct page *p = (struct page *)page;
-
- return (PF_POISONED_CHECK(p)->flags >> NODES_PGSHIFT) & NODES_MASK;
+ return (PF_POISONED_CHECK(page)->flags >> NODES_PGSHIFT) & NODES_MASK;
}
#endif
@@ -1692,26 +1693,26 @@ static inline bool __cpupid_match_pid(pid_t task_pid, int cpupid)
#define cpupid_match_pid(task, cpupid) __cpupid_match_pid(task->pid, cpupid)
#ifdef LAST_CPUPID_NOT_IN_PAGE_FLAGS
-static inline int page_cpupid_xchg_last(struct page *page, int cpupid)
+static inline int folio_xchg_last_cpupid(struct folio *folio, int cpupid)
{
- return xchg(&page->_last_cpupid, cpupid & LAST_CPUPID_MASK);
+ return xchg(&folio->_last_cpupid, cpupid & LAST_CPUPID_MASK);
}
-static inline int page_cpupid_last(struct page *page)
+static inline int folio_last_cpupid(struct folio *folio)
{
- return page->_last_cpupid;
+ return folio->_last_cpupid;
}
static inline void page_cpupid_reset_last(struct page *page)
{
page->_last_cpupid = -1 & LAST_CPUPID_MASK;
}
#else
-static inline int page_cpupid_last(struct page *page)
+static inline int folio_last_cpupid(struct folio *folio)
{
- return (page->flags >> LAST_CPUPID_PGSHIFT) & LAST_CPUPID_MASK;
+ return (folio->flags >> LAST_CPUPID_PGSHIFT) & LAST_CPUPID_MASK;
}
-extern int page_cpupid_xchg_last(struct page *page, int cpupid);
+int folio_xchg_last_cpupid(struct folio *folio, int cpupid);
static inline void page_cpupid_reset_last(struct page *page)
{
@@ -1719,11 +1720,12 @@ static inline void page_cpupid_reset_last(struct page *page)
}
#endif /* LAST_CPUPID_NOT_IN_PAGE_FLAGS */
-static inline int xchg_page_access_time(struct page *page, int time)
+static inline int folio_xchg_access_time(struct folio *folio, int time)
{
int last_time;
- last_time = page_cpupid_xchg_last(page, time >> PAGE_ACCESS_TIME_BUCKETS);
+ last_time = folio_xchg_last_cpupid(folio,
+ time >> PAGE_ACCESS_TIME_BUCKETS);
return last_time << PAGE_ACCESS_TIME_BUCKETS;
}
@@ -1736,20 +1738,22 @@ static inline void vma_set_access_pid_bit(struct vm_area_struct *vma)
__set_bit(pid_bit, &vma->numab_state->pids_active[1]);
}
}
+
+bool folio_use_access_time(struct folio *folio);
#else /* !CONFIG_NUMA_BALANCING */
-static inline int page_cpupid_xchg_last(struct page *page, int cpupid)
+static inline int folio_xchg_last_cpupid(struct folio *folio, int cpupid)
{
- return page_to_nid(page); /* XXX */
+ return folio_nid(folio); /* XXX */
}
-static inline int xchg_page_access_time(struct page *page, int time)
+static inline int folio_xchg_access_time(struct folio *folio, int time)
{
return 0;
}
-static inline int page_cpupid_last(struct page *page)
+static inline int folio_last_cpupid(struct folio *folio)
{
- return page_to_nid(page); /* XXX */
+ return folio_nid(folio); /* XXX */
}
static inline int cpupid_to_nid(int cpupid)
@@ -1789,6 +1793,10 @@ static inline bool cpupid_match_pid(struct task_struct *task, int cpupid)
static inline void vma_set_access_pid_bit(struct vm_area_struct *vma)
{
}
+static inline bool folio_use_access_time(struct folio *folio)
+{
+ return false;
+}
#endif /* CONFIG_NUMA_BALANCING */
#if defined(CONFIG_KASAN_SW_TAGS) || defined(CONFIG_KASAN_HW_TAGS)
@@ -1801,7 +1809,7 @@ static inline void vma_set_access_pid_bit(struct vm_area_struct *vma)
static inline u8 page_kasan_tag(const struct page *page)
{
- u8 tag = 0xff;
+ u8 tag = KASAN_TAG_KERNEL;
if (kasan_enabled()) {
tag = (page->flags >> KASAN_TAG_PGSHIFT) & KASAN_TAG_MASK;
@@ -1830,7 +1838,7 @@ static inline void page_kasan_tag_set(struct page *page, u8 tag)
static inline void page_kasan_tag_reset(struct page *page)
{
if (kasan_enabled())
- page_kasan_tag_set(page, 0xff);
+ page_kasan_tag_set(page, KASAN_TAG_KERNEL);
}
#else /* CONFIG_KASAN_SW_TAGS || CONFIG_KASAN_HW_TAGS */
@@ -1919,8 +1927,8 @@ static inline struct folio *pfn_folio(unsigned long pfn)
*
* For more information, please see Documentation/core-api/pin_user_pages.rst.
*
- * Return: True, if it is likely that the page has been "dma-pinned".
- * False, if the page is definitely not dma-pinned.
+ * Return: True, if it is likely that the folio has been "dma-pinned".
+ * False, if the folio is definitely not dma-pinned.
*/
static inline bool folio_maybe_dma_pinned(struct folio *folio)
{
@@ -1939,26 +1947,21 @@ static inline bool folio_maybe_dma_pinned(struct folio *folio)
GUP_PIN_COUNTING_BIAS;
}
-static inline bool page_maybe_dma_pinned(struct page *page)
-{
- return folio_maybe_dma_pinned(page_folio(page));
-}
-
/*
* This should most likely only be called during fork() to see whether we
* should break the cow immediately for an anon page on the src mm.
*
* The caller has to hold the PT lock and the vma->vm_mm->write_protect_seq.
*/
-static inline bool page_needs_cow_for_dma(struct vm_area_struct *vma,
- struct page *page)
+static inline bool folio_needs_cow_for_dma(struct vm_area_struct *vma,
+ struct folio *folio)
{
VM_BUG_ON(!(raw_read_seqcount(&vma->vm_mm->write_protect_seq) & 1));
if (!test_bit(MMF_HAS_PINNED, &vma->vm_mm->flags))
return false;
- return page_maybe_dma_pinned(page);
+ return folio_maybe_dma_pinned(folio);
}
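
Worked arithmetic for the pin-counting bias used above: each FOLL_PIN acquisition adds GUP_PIN_COUNTING_BIAS (1024) references to a small folio, so one ordinary reference plus one pin yields a refcount of 1025 and folio_maybe_dma_pinned() reports true, while 1024 ordinary references alone would be a false positive that the scheme knowingly tolerates. A sketch of the small-folio case only, as an assumption rather than the full implementation:

        static inline bool example_small_folio_maybe_pinned(struct folio *folio)
        {
                /* 1 ref + 1 pin = 1 + 1024 >= 1024 -> "maybe pinned" */
                return folio_ref_count(folio) >= GUP_PIN_COUNTING_BIAS;
        }
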
/**
@@ -2040,7 +2043,7 @@ static inline void set_page_links(struct page *page, enum zone_type zone,
*
* Return: A positive power of two.
*/
-static inline long folio_nr_pages(struct folio *folio)
+static inline long folio_nr_pages(const struct folio *folio)
{
if (!folio_test_large(folio))
return 1;
@@ -2051,6 +2054,13 @@ static inline long folio_nr_pages(struct folio *folio)
#endif
}
+/* Only hugetlbfs can allocate folios larger than MAX_ORDER */
+#ifdef CONFIG_ARCH_HAS_GIGANTIC_PAGE
+#define MAX_FOLIO_NR_PAGES (1UL << PUD_ORDER)
+#else
+#define MAX_FOLIO_NR_PAGES MAX_ORDER_NR_PAGES
+#endif
+
/*
* compound_nr() returns the number of pages in this potentially compound
* page. compound_nr() can be called on a tail page, and is defined to
@@ -2109,7 +2119,7 @@ static inline struct folio *folio_next(struct folio *folio)
* it from being split. It is not necessary for the folio to be locked.
* Return: The base-2 logarithm of the size of this folio.
*/
-static inline unsigned int folio_shift(struct folio *folio)
+static inline unsigned int folio_shift(const struct folio *folio)
{
return PAGE_SHIFT + folio_order(folio);
}
@@ -2122,49 +2132,78 @@ static inline unsigned int folio_shift(struct folio *folio)
* it from being split. It is not necessary for the folio to be locked.
* Return: The number of bytes in this folio.
*/
-static inline size_t folio_size(struct folio *folio)
+static inline size_t folio_size(const struct folio *folio)
{
return PAGE_SIZE << folio_order(folio);
}
/**
- * folio_estimated_sharers - Estimate the number of sharers of a folio.
+ * folio_likely_mapped_shared - Estimate if the folio is mapped into the page
+ * tables of more than one MM
* @folio: The folio.
*
- * folio_estimated_sharers() aims to serve as a function to efficiently
- * estimate the number of processes sharing a folio. This is done by
- * looking at the precise mapcount of the first subpage in the folio, and
- * assuming the other subpages are the same. This may not be true for large
- * folios. If you want exact mapcounts for exact calculations, look at
- * page_mapcount() or folio_total_mapcount().
+ * This function checks if the folio is currently mapped into more than one
+ * MM ("mapped shared"), or if the folio is only mapped into a single MM
+ * ("mapped exclusively").
+ *
+ * For KSM folios, this function also returns "mapped shared" when a folio is
+ * mapped multiple times into the same MM, because the individual page mappings
+ * are independent.
+ *
+ * As precise information is not easily available for all folios, this function
+ * estimates the number of MMs ("sharers") that are currently mapping a folio
+ * using the number of times the first page of the folio is currently mapped
+ * into page tables.
+ *
+ * For small anonymous folios and anonymous hugetlb folios, the return
+ * value will be exactly correct: non-KSM folios can only be mapped at most once
+ * into an MM, and they cannot be partially mapped. KSM folios are
+ * considered shared even if mapped multiple times into the same MM.
+ *
+ * For other folios, the result can be fuzzy:
+ * #. For partially-mappable large folios (THP), the return value can wrongly
+ * indicate "mapped exclusively" (false negative) when the folio is
+ * only partially mapped into at least one MM.
+ * #. For pagecache folios (including hugetlb), the return value can wrongly
+ * indicate "mapped shared" (false positive) when two VMAs in the same MM
+ * cover the same file range.
*
- * Return: The estimated number of processes sharing a folio.
+ * Further, this function only considers current page table mappings that
+ * are tracked using the folio mapcount(s).
+ *
+ * This function does not consider:
+ * #. If the folio might get mapped in the (near) future (e.g., swapcache,
+ * pagecache, temporary unmapping for migration).
+ * #. If the folio is mapped differently (VM_PFNMAP).
+ * #. If hugetlb page table sharing applies. Callers might want to check
+ * hugetlb_pmd_shared().
+ *
+ * Return: Whether the folio is estimated to be mapped into more than one MM.
*/
-static inline int folio_estimated_sharers(struct folio *folio)
+static inline bool folio_likely_mapped_shared(struct folio *folio)
{
- return page_mapcount(folio_page(folio, 0));
-}
+ int mapcount = folio_mapcount(folio);
-#ifndef HAVE_ARCH_MAKE_PAGE_ACCESSIBLE
-static inline int arch_make_page_accessible(struct page *page)
-{
- return 0;
+ /* Only partially-mappable folios require more care. */
+ if (!folio_test_large(folio) || unlikely(folio_test_hugetlb(folio)))
+ return mapcount > 1;
+
+ /* A single mapping implies "mapped exclusively". */
+ if (mapcount <= 1)
+ return false;
+
+ /* If any page is mapped more than once we treat it "mapped shared". */
+ if (folio_entire_mapcount(folio) || mapcount > folio_nr_pages(folio))
+ return true;
+
+ /* Let's guess based on the first subpage. */
+ return atomic_read(&folio->_mapcount) > 0;
}
-#endif
#ifndef HAVE_ARCH_MAKE_FOLIO_ACCESSIBLE
static inline int arch_make_folio_accessible(struct folio *folio)
{
- int ret;
- long i, nr = folio_nr_pages(folio);
-
- for (i = 0; i < nr; i++) {
- ret = arch_make_page_accessible(folio_page(folio, i));
- if (ret)
- break;
- }
-
- return ret;
+ return 0;
}
#endif
@@ -2173,11 +2212,6 @@ static inline int arch_make_folio_accessible(struct folio *folio)
*/
#include <linux/vmstat.h>
-static __always_inline void *lowmem_page_address(const struct page *page)
-{
- return page_to_virt(page);
-}
-
#if defined(CONFIG_HIGHMEM) && !defined(WANT_PAGE_VIRTUAL)
#define HASHED_PAGE_VIRTUAL
#endif
@@ -2200,6 +2234,11 @@ void set_page_address(struct page *page, void *virtual);
void page_address_init(void);
#endif
+static __always_inline void *lowmem_page_address(const struct page *page)
+{
+ return page_to_virt(page);
+}
+
#if !defined(HASHED_PAGE_VIRTUAL) && !defined(WANT_PAGE_VIRTUAL)
#define page_address(page) lowmem_page_address(page)
#define set_page_address(page, address) do { } while(0)
@@ -2211,19 +2250,6 @@ static inline void *folio_address(const struct folio *folio)
return page_address(&folio->page);
}
-extern pgoff_t __page_file_index(struct page *page);
-
-/*
- * Return the pagecache index of the passed page. Regular pagecache pages
- * use ->index whereas swapcache pages use swp_offset(->private)
- */
-static inline pgoff_t page_index(struct page *page)
-{
- if (unlikely(PageSwapCache(page)))
- return __page_file_index(page);
- return page->index;
-}
-
/*
* Return true only if the page has been allocated with
* ALLOC_NO_WATERMARKS and the low watermark was not
@@ -2333,6 +2359,8 @@ struct folio *vm_normal_folio(struct vm_area_struct *vma, unsigned long addr,
pte_t pte);
struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
pte_t pte);
+struct folio *vm_normal_folio_pmd(struct vm_area_struct *vma,
+ unsigned long addr, pmd_t pmd);
struct page *vm_normal_page_pmd(struct vm_area_struct *vma, unsigned long addr,
pmd_t pmd);
@@ -2355,20 +2383,46 @@ void free_pgd_range(struct mmu_gather *tlb, unsigned long addr,
unsigned long end, unsigned long floor, unsigned long ceiling);
int
copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma);
-int follow_pte(struct mm_struct *mm, unsigned long address,
- pte_t **ptepp, spinlock_t **ptlp);
-int follow_pfn(struct vm_area_struct *vma, unsigned long address,
- unsigned long *pfn);
-int follow_phys(struct vm_area_struct *vma, unsigned long address,
- unsigned int flags, unsigned long *prot, resource_size_t *phys);
int generic_access_phys(struct vm_area_struct *vma, unsigned long addr,
void *buf, int len, int write);
+struct follow_pfnmap_args {
+ /**
+ * Inputs:
+ * @vma: Pointer to @vm_area_struct struct
+ * @address: the virtual address to walk
+ */
+ struct vm_area_struct *vma;
+ unsigned long address;
+ /**
+ * Internals:
+ *
+ * The caller shouldn't touch any of these.
+ */
+ spinlock_t *lock;
+ pte_t *ptep;
+ /**
+ * Outputs:
+ *
+ * @pfn: the PFN of the address
+ * @pgprot: the pgprot_t of the mapping
+ * @writable: whether the mapping is writable
+ * @special: whether the mapping is a special mapping (real PFN maps)
+ */
+ unsigned long pfn;
+ pgprot_t pgprot;
+ bool writable;
+ bool special;
+};
+int follow_pfnmap_start(struct follow_pfnmap_args *args);
+void follow_pfnmap_end(struct follow_pfnmap_args *args);
+
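A minimal call-sequence sketch for the new follow_pfnmap API, assuming a hypothetical caller: start/end bracket the lookup, and the output fields are only stable between the two calls while the lock is held:

        static int example_addr_to_pfn(struct vm_area_struct *vma,
                                       unsigned long addr, unsigned long *pfn)
        {
                struct follow_pfnmap_args args = { .vma = vma, .address = addr };
                int err = follow_pfnmap_start(&args);

                if (err)
                        return err;
                *pfn = args.pfn;        /* copy outputs before dropping the lock */
                follow_pfnmap_end(&args);
                return 0;
        }
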
extern void truncate_pagecache(struct inode *inode, loff_t new);
extern void truncate_setsize(struct inode *inode, loff_t newsize);
void pagecache_isize_extended(struct inode *inode, loff_t from, loff_t to);
void truncate_pagecache_range(struct inode *inode, loff_t offset, loff_t end);
-int generic_error_remove_page(struct address_space *mapping, struct page *page);
+int generic_error_remove_folio(struct address_space *mapping,
+ struct folio *folio);
struct vm_area_struct *lock_mm_and_find_vma(struct mm_struct *mm,
unsigned long address, struct pt_regs *regs);
@@ -2419,8 +2473,6 @@ extern int access_process_vm(struct task_struct *tsk, unsigned long addr,
void *buf, int len, unsigned int gup_flags);
extern int access_remote_vm(struct mm_struct *mm, unsigned long addr,
void *buf, int len, unsigned int gup_flags);
-extern int __access_remote_vm(struct mm_struct *mm, unsigned long addr,
- void *buf, int len, unsigned int gup_flags);
long get_user_pages_remote(struct mm_struct *mm,
unsigned long start, unsigned long nr_pages,
@@ -2431,6 +2483,9 @@ long pin_user_pages_remote(struct mm_struct *mm,
unsigned int gup_flags, struct page **pages,
int *locked);
+/*
+ * Retrieves a single page alongside its VMA. Does not support FOLL_NOWAIT.
+ */
static inline struct page *get_user_page_vma_remote(struct mm_struct *mm,
unsigned long addr,
int gup_flags,
@@ -2438,12 +2493,15 @@ static inline struct page *get_user_page_vma_remote(struct mm_struct *mm,
{
struct page *page;
struct vm_area_struct *vma;
- int got = get_user_pages_remote(mm, addr, 1, gup_flags, &page, NULL);
+ int got;
+
+ if (WARN_ON_ONCE(unlikely(gup_flags & FOLL_NOWAIT)))
+ return ERR_PTR(-EINVAL);
+
+ got = get_user_pages_remote(mm, addr, 1, gup_flags, &page, NULL);
if (got < 0)
return ERR_PTR(got);
- if (got == 0)
- return NULL;
vma = vma_lookup(mm, addr);
if (WARN_ON_ONCE(!vma)) {
@@ -2463,6 +2521,9 @@ long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages,
struct page **pages, unsigned int gup_flags);
long pin_user_pages_unlocked(unsigned long start, unsigned long nr_pages,
struct page **pages, unsigned int gup_flags);
+long memfd_pin_folios(struct file *memfd, loff_t start, loff_t end,
+ struct folio **folios, unsigned int max_folios,
+ pgoff_t *offset);
int get_user_pages_fast(unsigned long start, int nr_pages,
unsigned int gup_flags, struct page **pages);
@@ -2483,11 +2544,6 @@ int set_page_dirty_lock(struct page *page);
int get_cmdline(struct task_struct *task, char *buffer, int buflen);
-extern unsigned long move_page_tables(struct vm_area_struct *vma,
- unsigned long old_addr, struct vm_area_struct *new_vma,
- unsigned long new_addr, unsigned long len,
- bool need_rmap_locks);
-
/*
* Flags used by change_protection(). For now we make it a bitmap so
* that we can pass in multiple flags just like parameters. However
@@ -2508,21 +2564,6 @@ extern unsigned long move_page_tables(struct vm_area_struct *vma,
#define MM_CP_UFFD_WP_ALL (MM_CP_UFFD_WP | \
MM_CP_UFFD_WP_RESOLVE)
-bool vma_needs_dirty_tracking(struct vm_area_struct *vma);
-int vma_wants_writenotify(struct vm_area_struct *vma, pgprot_t vm_page_prot);
-static inline bool vma_wants_manual_pte_write_upgrade(struct vm_area_struct *vma)
-{
- /*
- * We want to check manually if we can change individual PTEs writable
- * if we can't do that automatically for all PTEs in a mapping. For
- * private mappings, that's always the case when we have write
- * permissions as we properly have to handle COW.
- */
- if (vma->vm_flags & VM_SHARED)
- return vma_wants_writenotify(vma, vma->vm_page_prot);
- return !!(vma->vm_flags & VM_WRITE);
-
-}
bool can_change_pte_writable(struct vm_area_struct *vma, unsigned long addr,
pte_t pte);
extern long change_protection(struct mmu_gather *tlb,
@@ -2574,19 +2615,19 @@ static inline void dec_mm_counter(struct mm_struct *mm, int member)
mm_trace_rss_stat(mm, member);
}
-/* Optimized variant when page is already known not to be PageAnon */
-static inline int mm_counter_file(struct page *page)
+/* Optimized variant when folio is already known not to be anon */
+static inline int mm_counter_file(struct folio *folio)
{
- if (PageSwapBacked(page))
+ if (folio_test_swapbacked(folio))
return MM_SHMEMPAGES;
return MM_FILEPAGES;
}
-static inline int mm_counter(struct page *page)
+static inline int mm_counter(struct folio *folio)
{
- if (PageAnon(page))
+ if (folio_test_anon(folio))
return MM_ANONPAGES;
- return mm_counter_file(page);
+ return mm_counter_file(folio);
}
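
A small accounting sketch for the folio-based counters above, assuming a simplified fault path as the caller:

        static void example_account_mapped(struct mm_struct *mm, struct folio *folio)
        {
                /* anon -> MM_ANONPAGES, shmem -> MM_SHMEMPAGES, else MM_FILEPAGES */
                add_mm_counter(mm, mm_counter(folio), folio_nr_pages(folio));
        }
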
static inline unsigned long get_mm_rss(struct mm_struct *mm)
@@ -2634,14 +2675,6 @@ static inline void setmax_mm_hiwater_rss(unsigned long *maxrss,
*maxrss = hiwater_rss;
}
-#if defined(SPLIT_RSS_COUNTING)
-void sync_mm_rss(struct mm_struct *mm);
-#else
-static inline void sync_mm_rss(struct mm_struct *mm)
-{
-}
-#endif
-
#ifndef CONFIG_ARCH_HAS_PTE_SPECIAL
static inline int pte_special(pte_t pte)
{
@@ -2654,6 +2687,30 @@ static inline pte_t pte_mkspecial(pte_t pte)
}
#endif
+#ifndef CONFIG_ARCH_SUPPORTS_PMD_PFNMAP
+static inline bool pmd_special(pmd_t pmd)
+{
+ return false;
+}
+
+static inline pmd_t pmd_mkspecial(pmd_t pmd)
+{
+ return pmd;
+}
+#endif /* CONFIG_ARCH_SUPPORTS_PMD_PFNMAP */
+
+#ifndef CONFIG_ARCH_SUPPORTS_PUD_PFNMAP
+static inline bool pud_special(pud_t pud)
+{
+ return false;
+}
+
+static inline pud_t pud_mkspecial(pud_t pud)
+{
+ return pud;
+}
+#endif /* CONFIG_ARCH_SUPPORTS_PUD_PFNMAP */
+
#ifndef CONFIG_ARCH_HAS_PTE_DEVMAP
static inline int pte_devmap(pte_t pte)
{
@@ -2824,12 +2881,13 @@ static inline bool pagetable_is_reserved(struct ptdesc *pt)
*
* Return: The ptdesc describing the allocated page tables.
*/
-static inline struct ptdesc *pagetable_alloc(gfp_t gfp, unsigned int order)
+static inline struct ptdesc *pagetable_alloc_noprof(gfp_t gfp, unsigned int order)
{
- struct page *page = alloc_pages(gfp | __GFP_COMP, order);
+ struct page *page = alloc_pages_noprof(gfp | __GFP_COMP, order);
return page_ptdesc(page);
}
+#define pagetable_alloc(...) alloc_hooks(pagetable_alloc_noprof(__VA_ARGS__))
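
A usage sketch for the alloc_hooks() split above, with an assumed caller: existing callers keep writing pagetable_alloc(), and the macro routes through the _noprof variant so memory allocation profiling charges the pages to the call site:

        static struct ptdesc *example_alloc_table(void)
        {
                /* order-0, zeroed page table; release with pagetable_free() */
                return pagetable_alloc(GFP_KERNEL | __GFP_ZERO, 0);
        }
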
/**
* pagetable_free - Free pagetables
@@ -2845,7 +2903,7 @@ static inline void pagetable_free(struct ptdesc *pt)
__free_pages(page, compound_order(page));
}
-#if USE_SPLIT_PTE_PTLOCKS
+#if defined(CONFIG_SPLIT_PTE_PTLOCKS)
#if ALLOC_SPLIT_PTLOCKS
void __init ptlock_cache_init(void);
bool ptlock_alloc(struct ptdesc *ptdesc);
@@ -2880,6 +2938,13 @@ static inline spinlock_t *pte_lockptr(struct mm_struct *mm, pmd_t *pmd)
return ptlock_ptr(page_ptdesc(pmd_page(*pmd)));
}
+static inline spinlock_t *ptep_lockptr(struct mm_struct *mm, pte_t *pte)
+{
+ BUILD_BUG_ON(IS_ENABLED(CONFIG_HIGHPTE));
+ BUILD_BUG_ON(MAX_PTRS_PER_PTE * sizeof(pte_t) > PAGE_SIZE);
+ return ptlock_ptr(virt_to_ptdesc(pte));
+}
+
static inline bool ptlock_init(struct ptdesc *ptdesc)
{
/*
@@ -2896,7 +2961,7 @@ static inline bool ptlock_init(struct ptdesc *ptdesc)
return true;
}
-#else /* !USE_SPLIT_PTE_PTLOCKS */
+#else /* !defined(CONFIG_SPLIT_PTE_PTLOCKS) */
/*
* We use mm->page_table_lock to guard all pagetable pages of the mm.
*/
@@ -2904,10 +2969,14 @@ static inline spinlock_t *pte_lockptr(struct mm_struct *mm, pmd_t *pmd)
{
return &mm->page_table_lock;
}
+static inline spinlock_t *ptep_lockptr(struct mm_struct *mm, pte_t *pte)
+{
+ return &mm->page_table_lock;
+}
static inline void ptlock_cache_init(void) {}
static inline bool ptlock_init(struct ptdesc *ptdesc) { return true; }
static inline void ptlock_free(struct ptdesc *ptdesc) {}
-#endif /* USE_SPLIT_PTE_PTLOCKS */
+#endif /* defined(CONFIG_SPLIT_PTE_PTLOCKS) */
static inline bool pagetable_pte_ctor(struct ptdesc *ptdesc)
{
@@ -2967,7 +3036,7 @@ pte_t *pte_offset_map_nolock(struct mm_struct *mm, pmd_t *pmd,
((unlikely(pmd_none(*(pmd))) && __pte_alloc_kernel(pmd))? \
NULL: pte_offset_kernel(pmd, address))
-#if USE_SPLIT_PMD_PTLOCKS
+#if defined(CONFIG_SPLIT_PMD_PTLOCKS)
static inline struct page *pmd_pgtable_page(pmd_t *pmd)
{
@@ -3031,6 +3100,7 @@ static inline bool pagetable_pmd_ctor(struct ptdesc *ptdesc)
if (!pmd_ptlock_init(ptdesc))
return false;
__folio_set_pgtable(folio);
+ ptdesc_pmd_pts_init(ptdesc);
lruvec_stat_add_folio(folio, NR_PAGETABLE);
return true;
}
@@ -3063,6 +3133,22 @@ static inline spinlock_t *pud_lock(struct mm_struct *mm, pud_t *pud)
return ptl;
}
+static inline void pagetable_pud_ctor(struct ptdesc *ptdesc)
+{
+ struct folio *folio = ptdesc_folio(ptdesc);
+
+ __folio_set_pgtable(folio);
+ lruvec_stat_add_folio(folio, NR_PAGETABLE);
+}
+
+static inline void pagetable_pud_dtor(struct ptdesc *ptdesc)
+{
+ struct folio *folio = ptdesc_folio(ptdesc);
+
+ __folio_clear_pgtable(folio);
+ lruvec_stat_sub_folio(folio, NR_PAGETABLE);
+}
+
extern void __init pagecache_init(void);
extern void free_initmem(void);
@@ -3081,13 +3167,7 @@ extern void reserve_bootmem_region(phys_addr_t start,
phys_addr_t end, int nid);
/* Free the reserved page into the buddy system, so it gets managed. */
-static inline void free_reserved_page(struct page *page)
-{
- ClearPageReserved(page);
- init_page_count(page);
- __free_page(page);
- adjust_managed_page_count(page, 1);
-}
+void free_reserved_page(struct page *page);
#define free_highmem_page(page) free_reserved_page(page)
static inline void mark_page_reserved(struct page *page)
@@ -3144,8 +3224,6 @@ static inline unsigned long get_num_physpages(void)
*/
void free_area_init(unsigned long *max_zone_pfn);
unsigned long node_map_pfn_alignment(void);
-unsigned long __absent_pages_in_range(int nid, unsigned long start_pfn,
- unsigned long end_pfn);
extern unsigned long absent_pages_in_range(unsigned long start_pfn,
unsigned long end_pfn);
extern void get_pfn_range_for_nid(unsigned int nid,
@@ -3161,7 +3239,6 @@ static inline int early_pfn_to_nid(unsigned long pfn)
extern int __meminit early_pfn_to_nid(unsigned long pfn);
#endif
-extern void set_dma_reserve(unsigned long new_dma_reserve);
extern void mem_init(void);
extern void __init mmap_init(void);
@@ -3173,9 +3250,6 @@ static inline void show_mem(void)
extern long si_mem_available(void);
extern void si_meminfo(struct sysinfo * val);
extern void si_meminfo_node(struct sysinfo *val, int nid);
-#ifdef __HAVE_ARCH_RESERVED_KERNEL_PAGES
-extern unsigned long arch_reserved_kernel_pages(void);
-#endif
extern __printf(3, 4)
void warn_alloc(gfp_t gfp_mask, nodemask_t *nodemask, const char *fmt, ...);
@@ -3222,27 +3296,9 @@ void anon_vma_interval_tree_verify(struct anon_vma_chain *node);
/* mmap.c */
extern int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin);
-extern int vma_expand(struct vma_iterator *vmi, struct vm_area_struct *vma,
- unsigned long start, unsigned long end, pgoff_t pgoff,
- struct vm_area_struct *next);
-extern int vma_shrink(struct vma_iterator *vmi, struct vm_area_struct *vma,
- unsigned long start, unsigned long end, pgoff_t pgoff);
-extern struct vm_area_struct *vma_merge(struct vma_iterator *vmi,
- struct mm_struct *, struct vm_area_struct *prev, unsigned long addr,
- unsigned long end, unsigned long vm_flags, struct anon_vma *,
- struct file *, pgoff_t, struct mempolicy *, struct vm_userfaultfd_ctx,
- struct anon_vma_name *);
-extern struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *);
-extern int __split_vma(struct vma_iterator *vmi, struct vm_area_struct *,
- unsigned long addr, int new_below);
-extern int split_vma(struct vma_iterator *vmi, struct vm_area_struct *,
- unsigned long addr, int new_below);
extern int insert_vm_struct(struct mm_struct *, struct vm_area_struct *);
-extern void unlink_file_vma(struct vm_area_struct *);
-extern struct vm_area_struct *copy_vma(struct vm_area_struct **,
- unsigned long addr, unsigned long len, pgoff_t pgoff,
- bool *need_rmap_locks);
extern void exit_mmap(struct mm_struct *);
+int relocate_vma_down(struct vm_area_struct *vma, unsigned long shift);
static inline int check_data_rlimit(unsigned long rlim,
unsigned long new,
@@ -3275,15 +3331,20 @@ extern struct vm_area_struct *_install_special_mapping(struct mm_struct *mm,
unsigned long addr, unsigned long len,
unsigned long flags,
const struct vm_special_mapping *spec);
-/* This is an obsolete alternative to _install_special_mapping. */
-extern int install_special_mapping(struct mm_struct *mm,
- unsigned long addr, unsigned long len,
- unsigned long flags, struct page **pages);
unsigned long randomize_stack_top(unsigned long stack_top);
unsigned long randomize_page(unsigned long start, unsigned long range);
-extern unsigned long get_unmapped_area(struct file *, unsigned long, unsigned long, unsigned long, unsigned long);
+unsigned long
+__get_unmapped_area(struct file *file, unsigned long addr, unsigned long len,
+ unsigned long pgoff, unsigned long flags, vm_flags_t vm_flags);
+
+static inline unsigned long
+get_unmapped_area(struct file *file, unsigned long addr, unsigned long len,
+ unsigned long pgoff, unsigned long flags)
+{
+ return __get_unmapped_area(file, addr, len, pgoff, flags, 0);
+}
extern unsigned long mmap_region(struct file *file, unsigned long addr,
unsigned long len, vm_flags_t vm_flags, unsigned long pgoff,
@@ -3295,14 +3356,14 @@ extern unsigned long do_mmap(struct file *file, unsigned long addr,
extern int do_vmi_munmap(struct vma_iterator *vmi, struct mm_struct *mm,
unsigned long start, size_t len, struct list_head *uf,
bool unlock);
+int do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma,
+ struct mm_struct *mm, unsigned long start,
+ unsigned long end, struct list_head *uf, bool unlock);
extern int do_munmap(struct mm_struct *, unsigned long, size_t,
struct list_head *uf);
extern int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int behavior);
#ifdef CONFIG_MMU
-extern int do_vma_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma,
- unsigned long start, unsigned long end,
- struct list_head *uf, bool unlock);
extern int __mm_populate(unsigned long addr, unsigned long len,
int ignore_errors);
static inline void mm_populate(unsigned long addr, unsigned long len)
@@ -3314,8 +3375,7 @@ static inline void mm_populate(unsigned long addr, unsigned long len)
static inline void mm_populate(unsigned long addr, unsigned long len) {}
#endif
-/* These take the mm semaphore themselves */
-extern int __must_check vm_brk(unsigned long, unsigned long);
+/* This takes the mm semaphore itself */
extern int __must_check vm_brk_flags(unsigned long, unsigned long, unsigned long);
extern int vm_munmap(unsigned long, size_t);
extern unsigned long __must_check vm_mmap(struct file *, unsigned long,
@@ -3330,6 +3390,7 @@ struct vm_unmapped_area_info {
unsigned long high_limit;
unsigned long align_mask;
unsigned long align_offset;
+ unsigned long start_gap;
};
extern unsigned long vm_unmapped_area(struct vm_unmapped_area_info *info);
@@ -3530,9 +3591,6 @@ static inline vm_fault_t vmf_fs_error(int err)
return VM_FAULT_SIGBUS;
}
-struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
- unsigned int foll_flags);
-
static inline int vm_fault_to_errno(vm_fault_t vm_fault, int foll_flags)
{
if (vm_fault & VM_FAULT_OOM)
@@ -3686,24 +3744,22 @@ static inline bool page_is_guard(struct page *page)
return PageGuard(page);
}
-bool __set_page_guard(struct zone *zone, struct page *page, unsigned int order,
- int migratetype);
+bool __set_page_guard(struct zone *zone, struct page *page, unsigned int order);
static inline bool set_page_guard(struct zone *zone, struct page *page,
- unsigned int order, int migratetype)
+ unsigned int order)
{
if (!debug_guardpage_enabled())
return false;
- return __set_page_guard(zone, page, order, migratetype);
+ return __set_page_guard(zone, page, order);
}
-void __clear_page_guard(struct zone *zone, struct page *page, unsigned int order,
- int migratetype);
+void __clear_page_guard(struct zone *zone, struct page *page, unsigned int order);
static inline void clear_page_guard(struct zone *zone, struct page *page,
- unsigned int order, int migratetype)
+ unsigned int order)
{
if (!debug_guardpage_enabled())
return;
- __clear_page_guard(zone, page, order, migratetype);
+ __clear_page_guard(zone, page, order);
}
#else /* CONFIG_DEBUG_PAGEALLOC */
@@ -3713,9 +3769,9 @@ static inline unsigned int debug_guardpage_minorder(void) { return 0; }
static inline bool debug_guardpage_enabled(void) { return false; }
static inline bool page_is_guard(struct page *page) { return false; }
static inline bool set_page_guard(struct zone *zone, struct page *page,
- unsigned int order, int migratetype) { return false; }
+ unsigned int order) { return false; }
static inline void clear_page_guard(struct zone *zone, struct page *page,
- unsigned int order, int migratetype) {}
+ unsigned int order) {}
#endif /* CONFIG_DEBUG_PAGEALLOC */
#ifdef __HAVE_ARCH_GATE_AREA
@@ -3738,7 +3794,7 @@ extern bool process_shares_mm(struct task_struct *p, struct mm_struct *mm);
#ifdef CONFIG_SYSCTL
extern int sysctl_drop_caches;
-int drop_caches_sysctl_handler(struct ctl_table *, int, void *, size_t *,
+int drop_caches_sysctl_handler(const struct ctl_table *, int, void *, size_t *,
loff_t *);
#endif
@@ -3763,8 +3819,9 @@ void *sparse_buffer_alloc(unsigned long size);
struct page * __populate_section_memmap(unsigned long pfn,
unsigned long nr_pages, int nid, struct vmem_altmap *altmap,
struct dev_pagemap *pgmap);
-void pmd_init(void *addr);
void pud_init(void *addr);
+void pmd_init(void *addr);
+void kernel_pte_init(void *addr);
pgd_t *vmemmap_pgd_populate(unsigned long addr, int node);
p4d_t *vmemmap_p4d_populate(pgd_t *pgd, unsigned long addr, int node);
pud_t *vmemmap_pud_populate(p4d_t *p4d, unsigned long addr, int node);
@@ -3792,6 +3849,32 @@ void vmemmap_free(unsigned long start, unsigned long end,
struct vmem_altmap *altmap);
#endif
+#ifdef CONFIG_SPARSEMEM_VMEMMAP
+static inline unsigned long vmem_altmap_offset(struct vmem_altmap *altmap)
+{
+ /* number of pfns from base where pfn_to_page() is valid */
+ if (altmap)
+ return altmap->reserve + altmap->free;
+ return 0;
+}
+
+static inline void vmem_altmap_free(struct vmem_altmap *altmap,
+ unsigned long nr_pfns)
+{
+ altmap->alloc -= nr_pfns;
+}
+#else
+static inline unsigned long vmem_altmap_offset(struct vmem_altmap *altmap)
+{
+ return 0;
+}
+
+static inline void vmem_altmap_free(struct vmem_altmap *altmap,
+ unsigned long nr_pfns)
+{
+}
+#endif
+
#define VMEMMAP_RESERVE_NR 2
#ifdef CONFIG_ARCH_WANT_OPTIMIZE_DAX_VMEMMAP
static inline bool __vmemmap_can_optimize(struct vmem_altmap *altmap,
@@ -3837,13 +3920,13 @@ enum mf_flags {
MF_UNPOISON = 1 << 4,
MF_SW_SIMULATED = 1 << 5,
MF_NO_RETRY = 1 << 6,
+ MF_MEM_PRE_REMOVE = 1 << 7,
};
int mf_dax_kill_procs(struct address_space *mapping, pgoff_t index,
unsigned long count, int mf_flags);
extern int memory_failure(unsigned long pfn, int flags);
extern void memory_failure_queue_kick(int cpu);
extern int unpoison_memory(unsigned long pfn);
-extern void shake_page(struct page *p);
extern atomic_long_t num_poisoned_pages __read_mostly;
extern int soft_offline_page(unsigned long pfn, int flags);
#ifdef CONFIG_MEMORY_FAILURE
@@ -3856,7 +3939,6 @@ extern int __get_huge_page_for_hwpoison(unsigned long pfn, int flags,
bool *migratable_cleared);
void num_poisoned_pages_inc(unsigned long pfn);
void num_poisoned_pages_sub(unsigned long pfn, long i);
-struct task_struct *task_early_kill(struct task_struct *tsk, int force_early);
#else
static inline void memory_failure_queue(unsigned long pfn, int flags)
{
@@ -3877,12 +3959,6 @@ static inline void num_poisoned_pages_sub(unsigned long pfn, long i)
}
#endif
-#if defined(CONFIG_MEMORY_FAILURE) && defined(CONFIG_KSM)
-void add_to_kill_ksm(struct task_struct *tsk, struct page *p,
- struct vm_area_struct *vma, struct list_head *to_kill,
- unsigned long ksm_addr);
-#endif
-
#if defined(CONFIG_MEMORY_FAILURE) && defined(CONFIG_MEMORY_HOTPLUG)
extern void memblk_nr_poison_inc(unsigned long pfn);
extern void memblk_nr_poison_sub(unsigned long pfn, long i);
@@ -3923,10 +3999,10 @@ enum mf_result {
enum mf_action_page_type {
MF_MSG_KERNEL,
MF_MSG_KERNEL_HIGH_ORDER,
- MF_MSG_SLAB,
MF_MSG_DIFFERENT_COMPOUND,
MF_MSG_HUGE,
MF_MSG_FREE_HUGE,
+ MF_MSG_GET_HWPOISON,
MF_MSG_UNMAP_FAILED,
MF_MSG_DIRTY_SWAPCACHE,
MF_MSG_CLEAN_SWAPCACHE,
@@ -3940,13 +4016,12 @@ enum mf_action_page_type {
MF_MSG_BUDDY,
MF_MSG_DAX,
MF_MSG_UNSPLIT_THP,
+ MF_MSG_ALREADY_POISONED,
MF_MSG_UNKNOWN,
};
#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLBFS)
-extern void clear_huge_page(struct page *page,
- unsigned long addr_hint,
- unsigned int pages_per_huge_page);
+void folio_zero_user(struct folio *folio, unsigned long addr_hint);
int copy_user_large_folio(struct folio *dst, struct folio *src,
unsigned long addr_hint,
struct vm_area_struct *vma);
@@ -4005,34 +4080,57 @@ void mem_dump_obj(void *object);
static inline void mem_dump_obj(void *object) {}
#endif
+static inline bool is_write_sealed(int seals)
+{
+ return seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE);
+}
+
+/**
+ * is_readonly_sealed - Checks whether write-sealed but mapped read-only,
+ *                      in which case writes should be disallowed going
+ *                      forward.
+ * @seals: the seals to check
+ * @vm_flags: the VMA flags to check
+ *
+ * Returns whether readonly sealed, in which case writes should be disallowed
+ * going forward.
+ */
+static inline bool is_readonly_sealed(int seals, vm_flags_t vm_flags)
+{
+ /*
+ * Since an F_SEAL_[FUTURE_]WRITE sealed memfd can be mapped as
+ * MAP_SHARED and read-only, take care to not allow mprotect to
+ * revert protections on such mappings. Do this only for shared
+ * mappings. For private mappings, don't need to mask
+ * VM_MAYWRITE as we still want them to be COW-writable.
+ */
+ if (is_write_sealed(seals) &&
+ ((vm_flags & (VM_SHARED | VM_WRITE)) == VM_SHARED))
+ return true;
+
+ return false;
+}
+
/**
- * seal_check_future_write - Check for F_SEAL_FUTURE_WRITE flag and handle it
+ * seal_check_write - Check for F_SEAL_WRITE or F_SEAL_FUTURE_WRITE flags and
+ * handle them.
* @seals: the seals to check
* @vma: the vma to operate on
*
- * Check whether F_SEAL_FUTURE_WRITE is set; if so, do proper check/handling on
- * the vma flags. Return 0 if check pass, or <0 for errors.
- */
-static inline int seal_check_future_write(int seals, struct vm_area_struct *vma)
-{
- if (seals & F_SEAL_FUTURE_WRITE) {
- /*
- * New PROT_WRITE and MAP_SHARED mmaps are not allowed when
- * "future write" seal active.
- */
- if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_WRITE))
- return -EPERM;
-
- /*
- * Since an F_SEAL_FUTURE_WRITE sealed memfd can be mapped as
- * MAP_SHARED and read-only, take care to not allow mprotect to
- * revert protections on such mappings. Do this only for shared
- * mappings. For private mappings, don't need to mask
- * VM_MAYWRITE as we still want them to be COW-writable.
- */
- if (vma->vm_flags & VM_SHARED)
- vm_flags_clear(vma, VM_MAYWRITE);
- }
+ * Check whether F_SEAL_WRITE or F_SEAL_FUTURE_WRITE are set; if so, do proper
+ * check/handling on the vma flags. Return 0 if the check passes, or <0 on error.
+ */
+static inline int seal_check_write(int seals, struct vm_area_struct *vma)
+{
+ if (!is_write_sealed(seals))
+ return 0;
+
+ /*
+ * New PROT_WRITE and MAP_SHARED mmaps are not allowed when
+ * write seals are active.
+ */
+ if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_WRITE))
+ return -EPERM;
return 0;
}
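
A usage sketch combining the three seal helpers above, assuming a hypothetical memfd-style ->mmap() handler: reject new writable shared mappings of a write-sealed file, then drop VM_MAYWRITE on read-only shared mappings so a later mprotect() cannot upgrade them:

        static int example_memfd_mmap_checks(int seals, struct vm_area_struct *vma)
        {
                int err = seal_check_write(seals, vma);

                if (err)
                        return err;
                if (is_readonly_sealed(seals, vma->vm_flags))
                        vm_flags_clear(vma, VM_MAYWRITE);
                return 0;
        }
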
@@ -4051,21 +4149,156 @@ madvise_set_anon_name(struct mm_struct *mm, unsigned long start,
#ifdef CONFIG_UNACCEPTED_MEMORY
-bool range_contains_unaccepted_memory(phys_addr_t start, phys_addr_t end);
-void accept_memory(phys_addr_t start, phys_addr_t end);
+bool range_contains_unaccepted_memory(phys_addr_t start, unsigned long size);
+void accept_memory(phys_addr_t start, unsigned long size);
#else
static inline bool range_contains_unaccepted_memory(phys_addr_t start,
- phys_addr_t end)
+ unsigned long size)
{
return false;
}
-static inline void accept_memory(phys_addr_t start, phys_addr_t end)
+static inline void accept_memory(phys_addr_t start, unsigned long size)
+{
+}
+
+#endif
+
+static inline bool pfn_is_unaccepted_memory(unsigned long pfn)
+{
+ return range_contains_unaccepted_memory(pfn << PAGE_SHIFT, PAGE_SIZE);
+}
+
+void vma_pgtable_walk_begin(struct vm_area_struct *vma);
+void vma_pgtable_walk_end(struct vm_area_struct *vma);
+
+int reserve_mem_find_by_name(const char *name, phys_addr_t *start, phys_addr_t *size);
+
+#ifdef CONFIG_64BIT
+int do_mseal(unsigned long start, size_t len_in, unsigned long flags);
+#else
+static inline int do_mseal(unsigned long start, size_t len_in, unsigned long flags)
+{
+ /* noop on 32 bit */
+ return 0;
+}
+#endif
+
+#ifdef CONFIG_MEM_ALLOC_PROFILING
+static inline void pgalloc_tag_split(struct folio *folio, int old_order, int new_order)
+{
+ int i;
+ struct alloc_tag *tag;
+ unsigned int nr_pages = 1 << new_order;
+
+ if (!mem_alloc_profiling_enabled())
+ return;
+
+ tag = pgalloc_tag_get(&folio->page);
+ if (!tag)
+ return;
+
+ for (i = nr_pages; i < (1 << old_order); i += nr_pages) {
+ union codetag_ref *ref = get_page_tag_ref(folio_page(folio, i));
+
+ if (ref) {
+ /* Set new reference to point to the original tag */
+ alloc_tag_ref_set(ref, tag);
+ put_page_tag_ref(ref);
+ }
+ }
+}
+
+static inline void pgalloc_tag_copy(struct folio *new, struct folio *old)
+{
+ struct alloc_tag *tag;
+ union codetag_ref *ref;
+
+ tag = pgalloc_tag_get(&old->page);
+ if (!tag)
+ return;
+
+ ref = get_page_tag_ref(&new->page);
+ if (!ref)
+ return;
+
+ /* Clear the old ref to the original allocation tag. */
+ clear_page_tag_ref(&old->page);
+ /* Decrement the counters of the tag on get_new_folio. */
+ alloc_tag_sub(ref, folio_nr_pages(new));
+
+ __alloc_tag_ref_set(ref, tag);
+
+ put_page_tag_ref(ref);
+}
+#else /* !CONFIG_MEM_ALLOC_PROFILING */
+static inline void pgalloc_tag_split(struct folio *folio, int old_order, int new_order)
+{
+}
+
+static inline void pgalloc_tag_copy(struct folio *new, struct folio *old)
{
}
+#endif /* CONFIG_MEM_ALLOC_PROFILING */
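
A sketch of where the tag helpers above are expected to run, with assumed orders rather than a real call site: splitting an order-4 folio into order-2 pieces keeps each resulting folio charged to the allocation tag of the original allocation:

        static void example_split_accounting(struct folio *folio)
        {
                /* after a split from order 4 into four order-2 folios */
                pgalloc_tag_split(folio, 4, 2);
        }
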
+/*
+ * DMA mapping IDs for page_pool
+ *
+ * When DMA-mapping a page, page_pool allocates an ID (from an xarray) and
+ * stashes it in the upper bits of page->pp_magic. We always want to be able to
+ * unambiguously identify page pool pages (using page_pool_page_is_pp()). Non-PP
+ * pages can have arbitrary kernel pointers stored in the same field as pp_magic
+ * (since it overlaps with page->lru.next), so we must ensure that we cannot
+ * mistake a valid kernel pointer with any of the values we write into this
+ * field.
+ *
+ * On architectures that set POISON_POINTER_DELTA, this is already ensured,
+ * since this value becomes part of PP_SIGNATURE; meaning we can just use the
+ * space between the PP_SIGNATURE value (without POISON_POINTER_DELTA), and the
+ * lowest bits of POISON_POINTER_DELTA. On arches where POISON_POINTER_DELTA is
+ * 0, we make sure that we leave the two topmost bits empty, as that guarantees
+ * we won't mistake a valid kernel pointer for a value we set, regardless of the
+ * VMSPLIT setting.
+ *
+ * Altogether, this means that the number of bits available is constrained by
+ * the size of an unsigned long (at the upper end, subtracting two bits per the
+ * above), and the definition of PP_SIGNATURE (with or without
+ * POISON_POINTER_DELTA).
+ */
+#define PP_DMA_INDEX_SHIFT (1 + __fls(PP_SIGNATURE - POISON_POINTER_DELTA))
+#if POISON_POINTER_DELTA > 0
+/* PP_SIGNATURE includes POISON_POINTER_DELTA, so limit the size of the DMA
+ * index to not overlap with that if set
+ */
+#define PP_DMA_INDEX_BITS MIN(32, __ffs(POISON_POINTER_DELTA) - PP_DMA_INDEX_SHIFT)
+#else
+/* Always leave out the topmost two; see above. */
+#define PP_DMA_INDEX_BITS MIN(32, BITS_PER_LONG - PP_DMA_INDEX_SHIFT - 2)
+#endif
+
+#define PP_DMA_INDEX_MASK GENMASK(PP_DMA_INDEX_BITS + PP_DMA_INDEX_SHIFT - 1, \
+ PP_DMA_INDEX_SHIFT)
+
+/* Mask used for checking in page_pool_page_is_pp() below. page->pp_magic is
+ * OR'ed with PP_SIGNATURE after the allocation in order to preserve bit 0 for
+ * the head page of compound page and bit 1 for pfmemalloc page, as well as the
+ * bits used for the DMA index. page_is_pfmemalloc() is checked in
+ * __page_pool_put_page() to avoid recycling the pfmemalloc page.
+ */
+#define PP_MAGIC_MASK ~(PP_DMA_INDEX_MASK | 0x3UL)
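
Worked example of the layout above, assuming a 64-bit build with POISON_POINTER_DELTA == 0 and PP_SIGNATURE == 0x40: __fls(0x40) is 6, so PP_DMA_INDEX_SHIFT is 7 and PP_DMA_INDEX_BITS is MIN(32, 64 - 7 - 2) == 32, placing the DMA index in bits 7..38. A page-pool page with DMA index 5 stores pp_magic == 0x40 | (5 << 7) == 0x2c0, and (0x2c0 & PP_MAGIC_MASK) == 0x40 still matches PP_SIGNATURE, since the mask clears bits 0-1 and the whole index field.
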
+
+#ifdef CONFIG_PAGE_POOL
+static inline bool page_pool_page_is_pp(struct page *page)
+{
+ return (page->pp_magic & PP_MAGIC_MASK) == PP_SIGNATURE;
+}
+#else
+static inline bool page_pool_page_is_pp(struct page *page)
+{
+ return false;
+}
#endif
#endif /* _LINUX_MM_H */