Diffstat (limited to 'include/linux')
-rw-r--r--  include/linux/damon.h             39
-rw-r--r--  include/linux/fs.h                 6
-rw-r--r--  include/linux/gfp.h                2
-rw-r--r--  include/linux/huge_mm.h          112
-rw-r--r--  include/linux/hugetlb.h           13
-rw-r--r--  include/linux/hugetlb_inline.h    15
-rw-r--r--  include/linux/iommu.h              4
-rw-r--r--  include/linux/kasan.h             20
-rw-r--r--  include/linux/kmsan.h              6
-rw-r--r--  include/linux/ksm.h                4
-rw-r--r--  include/linux/leafops.h          619
-rw-r--r--  include/linux/memcontrol.h        89
-rw-r--r--  include/linux/memory-failure.h    17
-rw-r--r--  include/linux/memory.h            24
-rw-r--r--  include/linux/memremap.h          57
-rw-r--r--  include/linux/migrate.h            4
-rw-r--r--  include/linux/mm.h               681
-rw-r--r--  include/linux/mm_inline.h         16
-rw-r--r--  include/linux/mm_types.h         156
-rw-r--r--  include/linux/mmap_lock.h         37
-rw-r--r--  include/linux/mmzone.h             4
-rw-r--r--  include/linux/node.h              10
-rw-r--r--  include/linux/pgtable.h           12
-rw-r--r--  include/linux/sched/mm.h          12
-rw-r--r--  include/linux/shmem_fs.h           9
-rw-r--r--  include/linux/swap.h              15
-rw-r--r--  include/linux/swapops.h          241
-rw-r--r--  include/linux/userfaultfd_k.h     96
-rw-r--r--  include/linux/vmalloc.h            8
-rw-r--r--  include/linux/vmstat.h            48
30 files changed, 1669 insertions, 707 deletions
diff --git a/include/linux/damon.h b/include/linux/damon.h
index cae8c613c5fc..3813373a9200 100644
--- a/include/linux/damon.h
+++ b/include/linux/damon.h
@@ -91,17 +91,23 @@ struct damon_region {
* @nr_regions: Number of monitoring target regions of this target.
* @regions_list: Head of the monitoring target regions of this target.
* @list: List head for siblings.
+ * @obsolete: Whether the commit destination target is obsolete.
*
* Each monitoring context could have multiple targets. For example, a context
* for virtual memory address spaces could have multiple target processes. The
* @pid should be set for appropriate &struct damon_operations including the
* virtual address spaces monitoring operations.
+ *
+ * @obsolete is used only for damon_commit_targets() source targets, to specify
+ * that the matching destination targets are obsolete. See
+ * damon_commit_targets() for how this is handled.
*/
struct damon_target {
struct pid *pid;
unsigned int nr_regions;
struct list_head regions_list;
struct list_head list;
+ bool obsolete;
};
/**
@@ -147,6 +153,8 @@ enum damos_action {
* @DAMOS_QUOTA_SOME_MEM_PSI_US: System level some memory PSI in us.
* @DAMOS_QUOTA_NODE_MEM_USED_BP: MemUsed ratio of a node.
* @DAMOS_QUOTA_NODE_MEM_FREE_BP: MemFree ratio of a node.
+ * @DAMOS_QUOTA_NODE_MEMCG_USED_BP: MemUsed ratio of a node for a cgroup.
+ * @DAMOS_QUOTA_NODE_MEMCG_FREE_BP: MemFree ratio of a node for a cgroup.
* @NR_DAMOS_QUOTA_GOAL_METRICS: Number of DAMOS quota goal metrics.
*
* Metrics equal to or larger than @NR_DAMOS_QUOTA_GOAL_METRICS are unsupported.
@@ -156,6 +164,8 @@ enum damos_quota_goal_metric {
DAMOS_QUOTA_SOME_MEM_PSI_US,
DAMOS_QUOTA_NODE_MEM_USED_BP,
DAMOS_QUOTA_NODE_MEM_FREE_BP,
+ DAMOS_QUOTA_NODE_MEMCG_USED_BP,
+ DAMOS_QUOTA_NODE_MEMCG_FREE_BP,
NR_DAMOS_QUOTA_GOAL_METRICS,
};
@@ -166,6 +176,7 @@ enum damos_quota_goal_metric {
* @current_value: Current value of @metric.
* @last_psi_total: Last measured total PSI
* @nid: Node id.
+ * @memcg_id: Memcg id.
* @list: List head for siblings.
*
* Data structure for getting the current score of the quota tuning goal. The
@@ -176,6 +187,12 @@ enum damos_quota_goal_metric {
* If @metric is DAMOS_QUOTA_USER_INPUT, @current_value should be manually
* entered by the user, probably inside the kdamond callbacks. Otherwise,
* DAMON sets @current_value with self-measured value of @metric.
+ *
+ * If @metric is DAMOS_QUOTA_NODE_MEM_{USED,FREE}_BP, @nid represents the node
+ * id of the target node to account the used/free memory.
+ *
+ * If @metric is DAMOS_QUOTA_NODE_MEMCG_{USED,FREE}_BP, @nid and @memcg_id
+ * represent the node id and the cgroup to account the used/free memory for.
*/
struct damos_quota_goal {
enum damos_quota_goal_metric metric;
@@ -184,7 +201,10 @@ struct damos_quota_goal {
/* metric-dependent fields */
union {
u64 last_psi_total;
- int nid;
+ struct {
+ int nid;
+ unsigned short memcg_id;
+ };
};
struct list_head list;
};
@@ -472,7 +492,7 @@ struct damos_migrate_dests {
* @wmarks: Watermarks for automated (in)activation of this scheme.
* @migrate_dests: Destination nodes if @action is "migrate_{hot,cold}".
* @target_nid: Destination node if @action is "migrate_{hot,cold}".
- * @filters: Additional set of &struct damos_filter for &action.
+ * @core_filters: Additional set of &struct damos_filter for &action.
* @ops_filters: ops layer handling &struct damos_filter objects list.
* @last_applied: Last @action applied ops-managing entity.
* @stat: Statistics of this scheme.
@@ -498,7 +518,7 @@ struct damos_migrate_dests {
*
* Before applying the &action to a memory region, &struct damon_operations
* implementation could check pages of the region and skip &action to respect
- * &filters
+ * &core_filters
*
* The minimum entity that @action can be applied depends on the underlying
* &struct damon_operations. Since it may not be aligned with the core layer
@@ -542,7 +562,7 @@ struct damos {
struct damos_migrate_dests migrate_dests;
};
};
- struct list_head filters;
+ struct list_head core_filters;
struct list_head ops_filters;
void *last_applied;
struct damos_stat stat;
@@ -851,11 +871,11 @@ static inline unsigned long damon_sz_region(struct damon_region *r)
#define damos_for_each_quota_goal_safe(goal, next, quota) \
list_for_each_entry_safe(goal, next, &(quota)->goals, list)
-#define damos_for_each_filter(f, scheme) \
- list_for_each_entry(f, &(scheme)->filters, list)
+#define damos_for_each_core_filter(f, scheme) \
+ list_for_each_entry(f, &(scheme)->core_filters, list)
-#define damos_for_each_filter_safe(f, next, scheme) \
- list_for_each_entry_safe(f, next, &(scheme)->filters, list)
+#define damos_for_each_core_filter_safe(f, next, scheme) \
+ list_for_each_entry_safe(f, next, &(scheme)->core_filters, list)
#define damos_for_each_ops_filter(f, scheme) \
list_for_each_entry(f, &(scheme)->ops_filters, list)
@@ -947,7 +967,8 @@ int damon_call(struct damon_ctx *ctx, struct damon_call_control *control);
int damos_walk(struct damon_ctx *ctx, struct damos_walk_control *control);
int damon_set_region_biggest_system_ram_default(struct damon_target *t,
- unsigned long *start, unsigned long *end);
+ unsigned long *start, unsigned long *end,
+ unsigned long min_sz_region);
#endif /* CONFIG_DAMON */
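For illustration, a minimal sketch of how a caller might fill in the new
memcg-aware goal metric. Field names follow the hunk above; target_value and
mem_cgroup_id() are taken from the wider tree, and the setup path itself is
hypothetical:

	struct damos_quota_goal goal = {
		.metric		= DAMOS_QUOTA_NODE_MEMCG_USED_BP,
		.target_value	= 5000,	/* 50.00%, in basis points */
		.nid		= 1,	/* account node 1 only */
		/* anonymous struct member alongside .nid */
		.memcg_id	= mem_cgroup_id(memcg),
	};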
diff --git a/include/linux/fs.h b/include/linux/fs.h
index ce25feb06727..17b38b9d7f90 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2041,14 +2041,14 @@ static inline bool can_mmap_file(struct file *file)
return true;
}
-int __compat_vma_mmap_prepare(const struct file_operations *f_op,
+int __compat_vma_mmap(const struct file_operations *f_op,
struct file *file, struct vm_area_struct *vma);
-int compat_vma_mmap_prepare(struct file *file, struct vm_area_struct *vma);
+int compat_vma_mmap(struct file *file, struct vm_area_struct *vma);
static inline int vfs_mmap(struct file *file, struct vm_area_struct *vma)
{
if (file->f_op->mmap_prepare)
- return compat_vma_mmap_prepare(file, vma);
+ return compat_vma_mmap(file, vma);
return file->f_op->mmap(file, vma);
}
diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index 623bee335383..b155929af5b1 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -387,7 +387,7 @@ extern void free_pages(unsigned long addr, unsigned int order);
#define free_page(addr) free_pages((addr), 0)
void page_alloc_init_cpuhp(void);
-int decay_pcp_high(struct zone *zone, struct per_cpu_pages *pcp);
+bool decay_pcp_high(struct zone *zone, struct per_cpu_pages *pcp);
void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp);
void drain_all_pages(struct zone *zone);
void drain_local_pages(struct zone *zone);
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 11cab07f322a..ae7f21aad0ac 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -364,20 +364,35 @@ unsigned long thp_get_unmapped_area_vmflags(struct file *filp, unsigned long add
unsigned long len, unsigned long pgoff, unsigned long flags,
vm_flags_t vm_flags);
+enum split_type {
+ SPLIT_TYPE_UNIFORM,
+ SPLIT_TYPE_NON_UNIFORM,
+};
+
bool can_split_folio(struct folio *folio, int caller_pins, int *pextra_pins);
-int split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
+int __split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
unsigned int new_order);
+int folio_split_unmapped(struct folio *folio, unsigned int new_order);
int min_order_for_split(struct folio *folio);
int split_folio_to_list(struct folio *folio, struct list_head *list);
-bool uniform_split_supported(struct folio *folio, unsigned int new_order,
- bool warns);
-bool non_uniform_split_supported(struct folio *folio, unsigned int new_order,
- bool warns);
+bool folio_split_supported(struct folio *folio, unsigned int new_order,
+ enum split_type split_type, bool warns);
int folio_split(struct folio *folio, unsigned int new_order, struct page *page,
struct list_head *list);
-/*
- * try_folio_split_to_order - try to split a @folio at @page to @new_order using
- * non uniform split.
+
+static inline int split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
+ unsigned int new_order)
+{
+ return __split_huge_page_to_list_to_order(page, list, new_order);
+}
+static inline int split_huge_page_to_order(struct page *page, unsigned int new_order)
+{
+ return split_huge_page_to_list_to_order(page, NULL, new_order);
+}
+
+/**
+ * try_folio_split_to_order() - try to split a @folio at @page to @new_order
+ * using non uniform split.
* @folio: folio to be split
* @page: split to @new_order at the given page
* @new_order: the target split order
@@ -387,14 +402,13 @@ int folio_split(struct folio *folio, unsigned int new_order, struct page *page,
* folios are put back to LRU list. Use min_order_for_split() to get the lower
* bound of @new_order.
*
- * Return: 0: split is successful, otherwise split failed.
+ * Return: 0 - split is successful, otherwise split failed.
*/
static inline int try_folio_split_to_order(struct folio *folio,
struct page *page, unsigned int new_order)
{
- if (!non_uniform_split_supported(folio, new_order, /* warns= */ false))
- return split_huge_page_to_list_to_order(&folio->page, NULL,
- new_order);
+ if (!folio_split_supported(folio, new_order, SPLIT_TYPE_NON_UNIFORM, /* warns= */ false))
+ return split_huge_page_to_order(&folio->page, new_order);
return folio_split(folio, new_order, page, NULL);
}
static inline int split_huge_page(struct page *page)
@@ -402,14 +416,43 @@ static inline int split_huge_page(struct page *page)
return split_huge_page_to_list_to_order(page, NULL, 0);
}
void deferred_split_folio(struct folio *folio, bool partially_mapped);
+#ifdef CONFIG_MEMCG
+void reparent_deferred_split_queue(struct mem_cgroup *memcg);
+#endif
void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
unsigned long address, bool freeze);
+/**
+ * pmd_is_huge() - Is this PMD either a huge PMD entry or a software leaf entry?
+ * @pmd: The PMD to check.
+ *
+ * A huge PMD entry is a non-empty entry that is either present and marked
+ * huge, or a software leaf entry. This check may be performed without the
+ * appropriate locks held, in which case the condition should be rechecked
+ * after they are acquired.
+ *
+ * Returns: true if this PMD is huge, false otherwise.
+ */
+static inline bool pmd_is_huge(pmd_t pmd)
+{
+ if (pmd_present(pmd)) {
+ return pmd_trans_huge(pmd);
+ } else if (!pmd_none(pmd)) {
+ /*
+ * Non-present PMDs must be valid huge non-present entries. We
+ * cannot assert that here due to header dependency issues.
+ */
+ return true;
+ }
+
+ return false;
+}
+
#define split_huge_pmd(__vma, __pmd, __address) \
do { \
pmd_t *____pmd = (__pmd); \
- if (is_swap_pmd(*____pmd) || pmd_trans_huge(*____pmd)) \
+ if (pmd_is_huge(*____pmd)) \
__split_huge_pmd(__vma, __pmd, __address, \
false); \
} while (0)
@@ -447,19 +490,14 @@ void vma_adjust_trans_huge(struct vm_area_struct *vma, unsigned long start,
spinlock_t *__pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma);
spinlock_t *__pud_trans_huge_lock(pud_t *pud, struct vm_area_struct *vma);
-static inline int is_swap_pmd(pmd_t pmd)
-{
- return !pmd_none(pmd) && !pmd_present(pmd);
-}
-
/* mmap_lock must be held on entry */
static inline spinlock_t *pmd_trans_huge_lock(pmd_t *pmd,
struct vm_area_struct *vma)
{
- if (is_swap_pmd(*pmd) || pmd_trans_huge(*pmd))
+ if (pmd_is_huge(*pmd))
return __pmd_trans_huge_lock(pmd, vma);
- else
- return NULL;
+
+ return NULL;
}
static inline spinlock_t *pud_trans_huge_lock(pud_t *pud,
struct vm_area_struct *vma)
@@ -473,6 +511,8 @@ static inline spinlock_t *pud_trans_huge_lock(pud_t *pud,
/**
* folio_test_pmd_mappable - Can we map this folio with a PMD?
* @folio: The folio to test
+ *
+ * Return: true - @folio can be mapped, false - @folio cannot be mapped.
*/
static inline bool folio_test_pmd_mappable(struct folio *folio)
{
@@ -481,6 +521,8 @@ static inline bool folio_test_pmd_mappable(struct folio *folio)
vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf);
+vm_fault_t do_huge_pmd_device_private(struct vm_fault *vmf);
+
extern struct folio *huge_zero_folio;
extern unsigned long huge_zero_pfn;
@@ -524,6 +566,8 @@ void split_huge_pmd_locked(struct vm_area_struct *vma, unsigned long address,
pmd_t *pmd, bool freeze);
bool unmap_huge_pmd_locked(struct vm_area_struct *vma, unsigned long addr,
pmd_t *pmdp, struct folio *folio);
+void map_anon_folio_pmd_nopf(struct folio *folio, pmd_t *pmd,
+ struct vm_area_struct *vma, unsigned long haddr);
#else /* CONFIG_TRANSPARENT_HUGEPAGE */
@@ -576,6 +620,11 @@ split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
VM_WARN_ON_ONCE_PAGE(1, page);
return -EINVAL;
}
+static inline int split_huge_page_to_order(struct page *page, unsigned int new_order)
+{
+ VM_WARN_ON_ONCE_PAGE(1, page);
+ return -EINVAL;
+}
static inline int split_huge_page(struct page *page)
{
VM_WARN_ON_ONCE_PAGE(1, page);
@@ -602,6 +651,7 @@ static inline int try_folio_split_to_order(struct folio *folio,
}
static inline void deferred_split_folio(struct folio *folio, bool partially_mapped) {}
+static inline void reparent_deferred_split_queue(struct mem_cgroup *memcg) {}
#define split_huge_pmd(__vma, __pmd, __address) \
do { } while (0)
@@ -642,10 +692,6 @@ static inline void vma_adjust_trans_huge(struct vm_area_struct *vma,
struct vm_area_struct *next)
{
}
-static inline int is_swap_pmd(pmd_t pmd)
-{
- return 0;
-}
static inline spinlock_t *pmd_trans_huge_lock(pmd_t *pmd,
struct vm_area_struct *vma)
{
@@ -662,6 +708,11 @@ static inline vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf)
return 0;
}
+static inline vm_fault_t do_huge_pmd_device_private(struct vm_fault *vmf)
+{
+ return 0;
+}
+
static inline bool is_huge_zero_folio(const struct folio *folio)
{
return false;
@@ -682,12 +733,6 @@ static inline void mm_put_huge_zero_folio(struct mm_struct *mm)
return;
}
-static inline struct page *follow_devmap_pmd(struct vm_area_struct *vma,
- unsigned long addr, pmd_t *pmd, int flags, struct dev_pagemap **pgmap)
-{
- return NULL;
-}
-
static inline bool thp_migration_supported(void)
{
return false;
@@ -720,6 +765,11 @@ static inline struct folio *get_persistent_huge_zero_folio(void)
{
return NULL;
}
+
+static inline bool pmd_is_huge(pmd_t pmd)
+{
+ return false;
+}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
static inline int split_folio_to_list_to_order(struct folio *folio,
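The pmd_is_huge() comment above describes an optimistic check that must be
rechecked under the page table lock. A sketch of that pattern, using only the
helpers from this hunk (the walker itself is illustrative):

	static void walk_one_pmd(struct vm_area_struct *vma, pmd_t *pmd)
	{
		spinlock_t *ptl;

		if (!pmd_is_huge(*pmd))		/* unlocked, may race */
			return;

		ptl = pmd_trans_huge_lock(pmd, vma); /* rechecks under lock */
		if (!ptl)
			return;			/* raced: no longer huge */
		/* ... operate on the huge or software-leaf PMD ... */
		spin_unlock(ptl);
	}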
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 8e63e46b8e1f..019a1c5281e4 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -150,8 +150,7 @@ int hugetlb_mfill_atomic_pte(pte_t *dst_pte,
struct folio **foliop);
#endif /* CONFIG_USERFAULTFD */
long hugetlb_reserve_pages(struct inode *inode, long from, long to,
- struct vm_area_struct *vma,
- vm_flags_t vm_flags);
+ struct vm_area_desc *desc, vm_flags_t vm_flags);
long hugetlb_unreserve_pages(struct inode *inode, long start, long end,
long freed);
bool folio_isolate_hugetlb(struct folio *folio, struct list_head *list);
@@ -172,7 +171,7 @@ bool hugetlbfs_pagecache_present(struct hstate *h,
struct address_space *hugetlb_folio_mapping_lock_write(struct folio *folio);
-extern int sysctl_hugetlb_shm_group;
+extern int sysctl_hugetlb_shm_group __read_mostly;
extern struct list_head huge_boot_pages[MAX_NUMNODES];
void hugetlb_bootmem_alloc(void);
@@ -275,11 +274,10 @@ void hugetlb_vma_lock_release(struct kref *kref);
long hugetlb_change_protection(struct vm_area_struct *vma,
unsigned long address, unsigned long end, pgprot_t newprot,
unsigned long cp_flags);
-bool is_hugetlb_entry_migration(pte_t pte);
-bool is_hugetlb_entry_hwpoisoned(pte_t pte);
void hugetlb_unshare_all_pmds(struct vm_area_struct *vma);
void fixup_hugetlb_reservations(struct vm_area_struct *vma);
void hugetlb_split(struct vm_area_struct *vma, unsigned long addr);
+int hugetlb_vma_lock_alloc(struct vm_area_struct *vma);
#else /* !CONFIG_HUGETLB_PAGE */
@@ -466,6 +464,11 @@ static inline void fixup_hugetlb_reservations(struct vm_area_struct *vma)
static inline void hugetlb_split(struct vm_area_struct *vma, unsigned long addr) {}
+static inline int hugetlb_vma_lock_alloc(struct vm_area_struct *vma)
+{
+ return 0;
+}
+
#endif /* !CONFIG_HUGETLB_PAGE */
#ifndef pgd_write
diff --git a/include/linux/hugetlb_inline.h b/include/linux/hugetlb_inline.h
index 0660a03d37d9..a27aa0162918 100644
--- a/include/linux/hugetlb_inline.h
+++ b/include/linux/hugetlb_inline.h
@@ -2,22 +2,27 @@
#ifndef _LINUX_HUGETLB_INLINE_H
#define _LINUX_HUGETLB_INLINE_H
-#ifdef CONFIG_HUGETLB_PAGE
-
#include <linux/mm.h>
-static inline bool is_vm_hugetlb_page(struct vm_area_struct *vma)
+#ifdef CONFIG_HUGETLB_PAGE
+
+static inline bool is_vm_hugetlb_flags(vm_flags_t vm_flags)
{
- return !!(vma->vm_flags & VM_HUGETLB);
+ return !!(vm_flags & VM_HUGETLB);
}
#else
-static inline bool is_vm_hugetlb_page(struct vm_area_struct *vma)
+static inline bool is_vm_hugetlb_flags(vm_flags_t vm_flags)
{
return false;
}
#endif
+static inline bool is_vm_hugetlb_page(struct vm_area_struct *vma)
+{
+ return is_vm_hugetlb_flags(vma->vm_flags);
+}
+
#endif
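Splitting out is_vm_hugetlb_flags() lets callers test prospective flags before
any VMA exists, e.g. on an mmap_prepare()/vm_area_desc path. A hedged sketch
(the desc layout is assumed, not part of this hunk):

	static bool desc_is_hugetlb(const struct vm_area_desc *desc)
	{
		/* same predicate, no vm_area_struct required */
		return is_vm_hugetlb_flags(desc->vm_flags);
	}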
diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index 801b2bd9e8d4..8c66284a91a8 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -1135,7 +1135,9 @@ struct iommu_sva {
struct iommu_mm_data {
u32 pasid;
+ struct mm_struct *mm;
struct list_head sva_domains;
+ struct list_head mm_list_elm;
};
int iommu_fwspec_init(struct device *dev, struct fwnode_handle *iommu_fwnode);
@@ -1616,6 +1618,7 @@ struct iommu_sva *iommu_sva_bind_device(struct device *dev,
struct mm_struct *mm);
void iommu_sva_unbind_device(struct iommu_sva *handle);
u32 iommu_sva_get_pasid(struct iommu_sva *handle);
+void iommu_sva_invalidate_kva_range(unsigned long start, unsigned long end);
#else
static inline struct iommu_sva *
iommu_sva_bind_device(struct device *dev, struct mm_struct *mm)
@@ -1640,6 +1643,7 @@ static inline u32 mm_get_enqcmd_pasid(struct mm_struct *mm)
}
static inline void mm_pasid_drop(struct mm_struct *mm) {}
+static inline void iommu_sva_invalidate_kva_range(unsigned long start, unsigned long end) {}
#endif /* CONFIG_IOMMU_SVA */
#ifdef CONFIG_IOMMU_IOPF
diff --git a/include/linux/kasan.h b/include/linux/kasan.h
index d12e1a5f5a9a..f335c1d7b61d 100644
--- a/include/linux/kasan.h
+++ b/include/linux/kasan.h
@@ -571,11 +571,27 @@ static inline void kasan_init_hw_tags(void) { }
#if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS)
void kasan_populate_early_vm_area_shadow(void *start, unsigned long size);
-int kasan_populate_vmalloc(unsigned long addr, unsigned long size, gfp_t gfp_mask);
-void kasan_release_vmalloc(unsigned long start, unsigned long end,
+int __kasan_populate_vmalloc(unsigned long addr, unsigned long size, gfp_t gfp_mask);
+static inline int kasan_populate_vmalloc(unsigned long addr,
+ unsigned long size, gfp_t gfp_mask)
+{
+ if (kasan_enabled())
+ return __kasan_populate_vmalloc(addr, size, gfp_mask);
+ return 0;
+}
+void __kasan_release_vmalloc(unsigned long start, unsigned long end,
unsigned long free_region_start,
unsigned long free_region_end,
unsigned long flags);
+static inline void kasan_release_vmalloc(unsigned long start, unsigned long end,
+ unsigned long free_region_start,
+ unsigned long free_region_end,
+ unsigned long flags)
+{
+ if (kasan_enabled())
+ return __kasan_release_vmalloc(start, end, free_region_start,
+ free_region_end, flags);
+}
#else /* CONFIG_KASAN_GENERIC || CONFIG_KASAN_SW_TAGS */
diff --git a/include/linux/kmsan.h b/include/linux/kmsan.h
index f2fd221107bb..7da9fd506b39 100644
--- a/include/linux/kmsan.h
+++ b/include/linux/kmsan.h
@@ -133,6 +133,7 @@ void kmsan_kfree_large(const void *ptr);
* @prot: page protection flags used for vmap.
* @pages: array of pages.
* @page_shift: page_shift passed to vmap_range_noflush().
+ * @gfp_mask: gfp_mask to use internally.
*
* KMSAN maps shadow and origin pages of @pages into contiguous ranges in
* vmalloc metadata address range. Returns 0 on success, callers must check
@@ -142,7 +143,8 @@ int __must_check kmsan_vmap_pages_range_noflush(unsigned long start,
unsigned long end,
pgprot_t prot,
struct page **pages,
- unsigned int page_shift);
+ unsigned int page_shift,
+ gfp_t gfp_mask);
/**
* kmsan_vunmap_kernel_range_noflush() - Notify KMSAN about a vunmap.
@@ -347,7 +349,7 @@ static inline void kmsan_kfree_large(const void *ptr)
static inline int __must_check kmsan_vmap_pages_range_noflush(
unsigned long start, unsigned long end, pgprot_t prot,
- struct page **pages, unsigned int page_shift)
+ struct page **pages, unsigned int page_shift, gfp_t gfp_mask)
{
return 0;
}
diff --git a/include/linux/ksm.h b/include/linux/ksm.h
index 067538fc4d58..c982694c987b 100644
--- a/include/linux/ksm.h
+++ b/include/linux/ksm.h
@@ -17,7 +17,7 @@
#ifdef CONFIG_KSM
int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
unsigned long end, int advice, vm_flags_t *vm_flags);
-vm_flags_t ksm_vma_flags(const struct mm_struct *mm, const struct file *file,
+vm_flags_t ksm_vma_flags(struct mm_struct *mm, const struct file *file,
vm_flags_t vm_flags);
int ksm_enable_merge_any(struct mm_struct *mm);
int ksm_disable_merge_any(struct mm_struct *mm);
@@ -103,7 +103,7 @@ bool ksm_process_mergeable(struct mm_struct *mm);
#else /* !CONFIG_KSM */
-static inline vm_flags_t ksm_vma_flags(const struct mm_struct *mm,
+static inline vm_flags_t ksm_vma_flags(struct mm_struct *mm,
const struct file *file, vm_flags_t vm_flags)
{
return vm_flags;
diff --git a/include/linux/leafops.h b/include/linux/leafops.h
new file mode 100644
index 000000000000..cfafe7a5e7b1
--- /dev/null
+++ b/include/linux/leafops.h
@@ -0,0 +1,619 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Describes operations that can be performed on software-defined page table
+ * leaf entries. These are abstracted from the hardware page table entries
+ * themselves by the softleaf_t type, see mm_types.h.
+ */
+#ifndef _LINUX_LEAFOPS_H
+#define _LINUX_LEAFOPS_H
+
+#include <linux/mm_types.h>
+#include <linux/swapops.h>
+#include <linux/swap.h>
+
+#ifdef CONFIG_MMU
+
+/* Temporary until swp_entry_t eliminated. */
+#define LEAF_TYPE_SHIFT SWP_TYPE_SHIFT
+
+enum softleaf_type {
+ /* Fundamental types. */
+ SOFTLEAF_NONE,
+ SOFTLEAF_SWAP,
+ /* Migration types. */
+ SOFTLEAF_MIGRATION_READ,
+ SOFTLEAF_MIGRATION_READ_EXCLUSIVE,
+ SOFTLEAF_MIGRATION_WRITE,
+ /* Device types. */
+ SOFTLEAF_DEVICE_PRIVATE_READ,
+ SOFTLEAF_DEVICE_PRIVATE_WRITE,
+ SOFTLEAF_DEVICE_EXCLUSIVE,
+	/* H/W poison types. */
+ SOFTLEAF_HWPOISON,
+ /* Marker types. */
+ SOFTLEAF_MARKER,
+};
+
+/**
+ * softleaf_mk_none() - Create an empty ('none') leaf entry.
+ * Returns: empty leaf entry.
+ */
+static inline softleaf_t softleaf_mk_none(void)
+{
+ return ((softleaf_t) { 0 });
+}
+
+/**
+ * softleaf_from_pte() - Obtain a leaf entry from a PTE entry.
+ * @pte: PTE entry.
+ *
+ * If @pte is present or none (and therefore not a software leaf entry), the
+ * function returns an empty leaf entry. Otherwise, it returns the decoded
+ * leaf entry.
+ *
+ * Returns: Leaf entry.
+ */
+static inline softleaf_t softleaf_from_pte(pte_t pte)
+{
+ softleaf_t arch_entry;
+
+ if (pte_present(pte) || pte_none(pte))
+ return softleaf_mk_none();
+
+ pte = pte_swp_clear_flags(pte);
+ arch_entry = __pte_to_swp_entry(pte);
+
+ /* Temporary until swp_entry_t eliminated. */
+ return swp_entry(__swp_type(arch_entry), __swp_offset(arch_entry));
+}
+
+/**
+ * softleaf_to_pte() - Obtain a PTE entry from a leaf entry.
+ * @entry: Leaf entry.
+ *
+ * This generates an architecture-specific PTE entry encoding the metadata
+ * carried by the leaf entry.
+ *
+ * Returns: Architecture-specific PTE entry encoding leaf entry.
+ */
+static inline pte_t softleaf_to_pte(softleaf_t entry)
+{
+ /* Temporary until swp_entry_t eliminated. */
+ return swp_entry_to_pte(entry);
+}
+
+#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
+/**
+ * softleaf_from_pmd() - Obtain a leaf entry from a PMD entry.
+ * @pmd: PMD entry.
+ *
+ * If @pmd is present or none (and therefore not a software leaf entry), the
+ * function returns an empty leaf entry. Otherwise, it returns the decoded
+ * leaf entry.
+ *
+ * Returns: Leaf entry.
+ */
+static inline softleaf_t softleaf_from_pmd(pmd_t pmd)
+{
+ softleaf_t arch_entry;
+
+ if (pmd_present(pmd) || pmd_none(pmd))
+ return softleaf_mk_none();
+
+ if (pmd_swp_soft_dirty(pmd))
+ pmd = pmd_swp_clear_soft_dirty(pmd);
+ if (pmd_swp_uffd_wp(pmd))
+ pmd = pmd_swp_clear_uffd_wp(pmd);
+ arch_entry = __pmd_to_swp_entry(pmd);
+
+ /* Temporary until swp_entry_t eliminated. */
+ return swp_entry(__swp_type(arch_entry), __swp_offset(arch_entry));
+}
+
+#else
+
+static inline softleaf_t softleaf_from_pmd(pmd_t pmd)
+{
+ return softleaf_mk_none();
+}
+
+#endif
+
+/**
+ * softleaf_is_none() - Is the leaf entry empty?
+ * @entry: Leaf entry.
+ *
+ * Empty entries are typically the result of a 'none' page table leaf entry
+ * being converted to a leaf entry.
+ *
+ * Returns: true if the entry is empty, false otherwise.
+ */
+static inline bool softleaf_is_none(softleaf_t entry)
+{
+ return entry.val == 0;
+}
+
+/**
+ * softleaf_type() - Identify the type of leaf entry.
+ * @entry: Leaf entry.
+ *
+ * Returns: the leaf entry type associated with @entry.
+ */
+static inline enum softleaf_type softleaf_type(softleaf_t entry)
+{
+ unsigned int type_num;
+
+ if (softleaf_is_none(entry))
+ return SOFTLEAF_NONE;
+
+ type_num = entry.val >> LEAF_TYPE_SHIFT;
+
+ if (type_num < MAX_SWAPFILES)
+ return SOFTLEAF_SWAP;
+
+ switch (type_num) {
+#ifdef CONFIG_MIGRATION
+ case SWP_MIGRATION_READ:
+ return SOFTLEAF_MIGRATION_READ;
+ case SWP_MIGRATION_READ_EXCLUSIVE:
+ return SOFTLEAF_MIGRATION_READ_EXCLUSIVE;
+ case SWP_MIGRATION_WRITE:
+ return SOFTLEAF_MIGRATION_WRITE;
+#endif
+#ifdef CONFIG_DEVICE_PRIVATE
+ case SWP_DEVICE_WRITE:
+ return SOFTLEAF_DEVICE_PRIVATE_WRITE;
+ case SWP_DEVICE_READ:
+ return SOFTLEAF_DEVICE_PRIVATE_READ;
+ case SWP_DEVICE_EXCLUSIVE:
+ return SOFTLEAF_DEVICE_EXCLUSIVE;
+#endif
+#ifdef CONFIG_MEMORY_FAILURE
+ case SWP_HWPOISON:
+ return SOFTLEAF_HWPOISON;
+#endif
+ case SWP_PTE_MARKER:
+ return SOFTLEAF_MARKER;
+ }
+
+ /* Unknown entry type. */
+ VM_WARN_ON_ONCE(1);
+ return SOFTLEAF_NONE;
+}
+
+/**
+ * softleaf_is_swap() - Is this leaf entry a swap entry?
+ * @entry: Leaf entry.
+ *
+ * Returns: true if the leaf entry is a swap entry, otherwise false.
+ */
+static inline bool softleaf_is_swap(softleaf_t entry)
+{
+ return softleaf_type(entry) == SOFTLEAF_SWAP;
+}
+
+/**
+ * softleaf_is_migration_write() - Is this leaf entry a writable migration entry?
+ * @entry: Leaf entry.
+ *
+ * Returns: true if the leaf entry is a writable migration entry, otherwise
+ * false.
+ */
+static inline bool softleaf_is_migration_write(softleaf_t entry)
+{
+ return softleaf_type(entry) == SOFTLEAF_MIGRATION_WRITE;
+}
+
+/**
+ * softleaf_is_migration_read() - Is this leaf entry a readable migration entry?
+ * @entry: Leaf entry.
+ *
+ * Returns: true if the leaf entry is a readable migration entry, otherwise
+ * false.
+ */
+static inline bool softleaf_is_migration_read(softleaf_t entry)
+{
+ return softleaf_type(entry) == SOFTLEAF_MIGRATION_READ;
+}
+
+/**
+ * softleaf_is_migration_read_exclusive() - Is this leaf entry an exclusive
+ * readable migration entry?
+ * @entry: Leaf entry.
+ *
+ * Returns: true if the leaf entry is an exclusive readable migration entry,
+ * otherwise false.
+ */
+static inline bool softleaf_is_migration_read_exclusive(softleaf_t entry)
+{
+ return softleaf_type(entry) == SOFTLEAF_MIGRATION_READ_EXCLUSIVE;
+}
+
+/**
+ * softleaf_is_migration() - Is this leaf entry a migration entry?
+ * @entry: Leaf entry.
+ *
+ * Returns: true if the leaf entry is a migration entry, otherwise false.
+ */
+static inline bool softleaf_is_migration(softleaf_t entry)
+{
+ switch (softleaf_type(entry)) {
+ case SOFTLEAF_MIGRATION_READ:
+ case SOFTLEAF_MIGRATION_READ_EXCLUSIVE:
+ case SOFTLEAF_MIGRATION_WRITE:
+ return true;
+ default:
+ return false;
+ }
+}
+
+/**
+ * softleaf_is_device_private_write() - Is this leaf entry a device private
+ * writable entry?
+ * @entry: Leaf entry.
+ *
+ * Returns: true if the leaf entry is a device private writable entry, otherwise
+ * false.
+ */
+static inline bool softleaf_is_device_private_write(softleaf_t entry)
+{
+ return softleaf_type(entry) == SOFTLEAF_DEVICE_PRIVATE_WRITE;
+}
+
+/**
+ * softleaf_is_device_private() - Is this leaf entry a device private entry?
+ * @entry: Leaf entry.
+ *
+ * Returns: true if the leaf entry is a device private entry, otherwise false.
+ */
+static inline bool softleaf_is_device_private(softleaf_t entry)
+{
+ switch (softleaf_type(entry)) {
+ case SOFTLEAF_DEVICE_PRIVATE_WRITE:
+ case SOFTLEAF_DEVICE_PRIVATE_READ:
+ return true;
+ default:
+ return false;
+ }
+}
+
+/**
+ * softleaf_is_device_exclusive() - Is this leaf entry a device-exclusive entry?
+ * @entry: Leaf entry.
+ *
+ * Returns: true if the leaf entry is a device-exclusive entry, otherwise false.
+ */
+static inline bool softleaf_is_device_exclusive(softleaf_t entry)
+{
+ return softleaf_type(entry) == SOFTLEAF_DEVICE_EXCLUSIVE;
+}
+
+/**
+ * softleaf_is_hwpoison() - Is this leaf entry a hardware poison entry?
+ * @entry: Leaf entry.
+ *
+ * Returns: true if the leaf entry is a hardware poison entry, otherwise false.
+ */
+static inline bool softleaf_is_hwpoison(softleaf_t entry)
+{
+ return softleaf_type(entry) == SOFTLEAF_HWPOISON;
+}
+
+/**
+ * softleaf_is_marker() - Is this leaf entry a marker?
+ * @entry: Leaf entry.
+ *
+ * Returns: true if the leaf entry is a marker entry, otherwise false.
+ */
+static inline bool softleaf_is_marker(softleaf_t entry)
+{
+ return softleaf_type(entry) == SOFTLEAF_MARKER;
+}
+
+/**
+ * softleaf_to_marker() - Obtain marker associated with leaf entry.
+ * @entry: Leaf entry, softleaf_is_marker(@entry) must return true.
+ *
+ * Returns: Marker associated with the leaf entry.
+ */
+static inline pte_marker softleaf_to_marker(softleaf_t entry)
+{
+ VM_WARN_ON_ONCE(!softleaf_is_marker(entry));
+
+ return swp_offset(entry) & PTE_MARKER_MASK;
+}
+
+/**
+ * softleaf_has_pfn() - Does this leaf entry encode a valid PFN number?
+ * @entry: Leaf entry.
+ *
+ * A PFN swap entry is a special type of swap entry that always has a PFN
+ * stored in the swap offset. It can be used to represent unaddressable
+ * device memory, to restrict access to a page undergoing migration, or to
+ * represent a PFN which has been hwpoisoned and unmapped.
+ *
+ * Returns: true if the leaf entry encodes a PFN, otherwise false.
+ */
+static inline bool softleaf_has_pfn(softleaf_t entry)
+{
+ /* Make sure the swp offset can always store the needed fields. */
+ BUILD_BUG_ON(SWP_TYPE_SHIFT < SWP_PFN_BITS);
+
+ if (softleaf_is_migration(entry))
+ return true;
+ if (softleaf_is_device_private(entry))
+ return true;
+ if (softleaf_is_device_exclusive(entry))
+ return true;
+ if (softleaf_is_hwpoison(entry))
+ return true;
+
+ return false;
+}
+
+/**
+ * softleaf_to_pfn() - Obtain PFN encoded within leaf entry.
+ * @entry: Leaf entry, softleaf_has_pfn(@entry) must return true.
+ *
+ * Returns: The PFN associated with the leaf entry.
+ */
+static inline unsigned long softleaf_to_pfn(softleaf_t entry)
+{
+ VM_WARN_ON_ONCE(!softleaf_has_pfn(entry));
+
+ /* Temporary until swp_entry_t eliminated. */
+ return swp_offset(entry) & SWP_PFN_MASK;
+}
+
+/**
+ * softleaf_to_page() - Obtains struct page for PFN encoded within leaf entry.
+ * @entry: Leaf entry, softleaf_has_pfn(@entry) must return true.
+ *
+ * Returns: Pointer to the struct page associated with the leaf entry's PFN.
+ */
+static inline struct page *softleaf_to_page(softleaf_t entry)
+{
+ struct page *page = pfn_to_page(softleaf_to_pfn(entry));
+
+ VM_WARN_ON_ONCE(!softleaf_has_pfn(entry));
+ /*
+ * Any use of migration entries may only occur while the
+ * corresponding page is locked
+ */
+ VM_WARN_ON_ONCE(softleaf_is_migration(entry) && !PageLocked(page));
+
+ return page;
+}
+
+/**
+ * softleaf_to_folio() - Obtains struct folio for PFN encoded within leaf entry.
+ * @entry: Leaf entry, softleaf_has_pfn(@entry) must return true.
+ *
+ * Returns: Pointer to the struct folio associated with the leaf entry's PFN.
+ */
+static inline struct folio *softleaf_to_folio(softleaf_t entry)
+{
+ struct folio *folio = pfn_folio(softleaf_to_pfn(entry));
+
+ VM_WARN_ON_ONCE(!softleaf_has_pfn(entry));
+ /*
+ * Any use of migration entries may only occur while the
+ * corresponding folio is locked.
+ */
+ VM_WARN_ON_ONCE(softleaf_is_migration(entry) &&
+ !folio_test_locked(folio));
+
+ return folio;
+}
+
+/**
+ * softleaf_is_poison_marker() - Is this leaf entry a poison marker?
+ * @entry: Leaf entry.
+ *
+ * The poison marker is set via UFFDIO_POISON. Userfaultfd-specific.
+ *
+ * Returns: true if the leaf entry is a poison marker, otherwise false.
+ */
+static inline bool softleaf_is_poison_marker(softleaf_t entry)
+{
+ if (!softleaf_is_marker(entry))
+ return false;
+
+ return softleaf_to_marker(entry) & PTE_MARKER_POISONED;
+}
+
+/**
+ * softleaf_is_guard_marker() - Is this leaf entry a guard region marker?
+ * @entry: Leaf entry.
+ *
+ * Returns: true if the leaf entry is a guard marker, otherwise false.
+ */
+static inline bool softleaf_is_guard_marker(softleaf_t entry)
+{
+ if (!softleaf_is_marker(entry))
+ return false;
+
+ return softleaf_to_marker(entry) & PTE_MARKER_GUARD;
+}
+
+/**
+ * softleaf_is_uffd_wp_marker() - Is this leaf entry a userfaultfd write protect
+ * marker?
+ * @entry: Leaf entry.
+ *
+ * Userfaultfd-specific.
+ *
+ * Returns: true if the leaf entry is a UFFD WP marker, otherwise false.
+ */
+static inline bool softleaf_is_uffd_wp_marker(softleaf_t entry)
+{
+ if (!softleaf_is_marker(entry))
+ return false;
+
+ return softleaf_to_marker(entry) & PTE_MARKER_UFFD_WP;
+}
+
+#ifdef CONFIG_MIGRATION
+
+/**
+ * softleaf_is_migration_young() - Does this migration entry contain an accessed
+ * bit?
+ * @entry: Leaf entry.
+ *
+ * If the architecture can support storing A/D bits in migration entries, this
+ * determines whether the accessed (or 'young') bit was set on the migrated page
+ * table entry.
+ *
+ * Returns: true if the entry contains an accessed bit, otherwise false.
+ */
+static inline bool softleaf_is_migration_young(softleaf_t entry)
+{
+ VM_WARN_ON_ONCE(!softleaf_is_migration(entry));
+
+ if (migration_entry_supports_ad())
+ return swp_offset(entry) & SWP_MIG_YOUNG;
+ /* Keep the old behavior of aging page after migration */
+ return false;
+}
+
+/**
+ * softleaf_is_migration_dirty() - Does this migration entry contain a dirty bit?
+ * @entry: Leaf entry.
+ *
+ * If the architecture can support storing A/D bits in migration entries, this
+ * determines whether the dirty bit was set on the migrated page table entry.
+ *
+ * Returns: true if the entry contains a dirty bit, otherwise false.
+ */
+static inline bool softleaf_is_migration_dirty(softleaf_t entry)
+{
+ VM_WARN_ON_ONCE(!softleaf_is_migration(entry));
+
+ if (migration_entry_supports_ad())
+ return swp_offset(entry) & SWP_MIG_DIRTY;
+ /* Keep the old behavior of clean page after migration */
+ return false;
+}
+
+#else /* CONFIG_MIGRATION */
+
+static inline bool softleaf_is_migration_young(softleaf_t entry)
+{
+ return false;
+}
+
+static inline bool softleaf_is_migration_dirty(softleaf_t entry)
+{
+ return false;
+}
+#endif /* CONFIG_MIGRATION */
+
+/**
+ * pte_is_marker() - Does the PTE entry encode a marker leaf entry?
+ * @pte: PTE entry.
+ *
+ * Returns: true if this PTE is a marker leaf entry, otherwise false.
+ */
+static inline bool pte_is_marker(pte_t pte)
+{
+ return softleaf_is_marker(softleaf_from_pte(pte));
+}
+
+/**
+ * pte_is_uffd_wp_marker() - Does this PTE entry encode a userfaultfd write
+ * protect marker leaf entry?
+ * @pte: PTE entry.
+ *
+ * Returns: true if this PTE is a UFFD WP marker leaf entry, otherwise false.
+ */
+static inline bool pte_is_uffd_wp_marker(pte_t pte)
+{
+ const softleaf_t entry = softleaf_from_pte(pte);
+
+ return softleaf_is_uffd_wp_marker(entry);
+}
+
+/**
+ * pte_is_uffd_marker() - Does this PTE entry encode a userfaultfd-specific
+ * marker leaf entry?
+ * @pte: PTE entry.
+ *
+ * It's useful to be able to determine which leaf entries encode UFFD-specific
+ * markers so we can handle these correctly.
+ *
+ * Returns: true if this PTE entry is a UFFD-specific marker, otherwise false.
+ */
+static inline bool pte_is_uffd_marker(pte_t pte)
+{
+ const softleaf_t entry = softleaf_from_pte(pte);
+
+ if (!softleaf_is_marker(entry))
+ return false;
+
+ /* UFFD WP, poisoned swap entries are UFFD-handled. */
+ if (softleaf_is_uffd_wp_marker(entry))
+ return true;
+ if (softleaf_is_poison_marker(entry))
+ return true;
+
+ return false;
+}
+
+#if defined(CONFIG_ZONE_DEVICE) && defined(CONFIG_ARCH_ENABLE_THP_MIGRATION)
+
+/**
+ * pmd_is_device_private_entry() - Check if PMD contains a device private swap
+ * entry.
+ * @pmd: The PMD to check.
+ *
+ * This is used for zone device private pages that have been migrated to
+ * device memory but still need special handling during various memory
+ * management operations.
+ *
+ * Return: true if the PMD contains a device private entry, false otherwise.
+ */
+static inline bool pmd_is_device_private_entry(pmd_t pmd)
+{
+ return softleaf_is_device_private(softleaf_from_pmd(pmd));
+}
+
+#else /* CONFIG_ZONE_DEVICE && CONFIG_ARCH_ENABLE_THP_MIGRATION */
+
+static inline bool pmd_is_device_private_entry(pmd_t pmd)
+{
+ return false;
+}
+
+#endif /* CONFIG_ZONE_DEVICE && CONFIG_ARCH_ENABLE_THP_MIGRATION */
+
+/**
+ * pmd_is_migration_entry() - Does this PMD entry encode a migration entry?
+ * @pmd: PMD entry.
+ *
+ * Returns: true if the PMD encodes a migration entry, otherwise false.
+ */
+static inline bool pmd_is_migration_entry(pmd_t pmd)
+{
+ return softleaf_is_migration(softleaf_from_pmd(pmd));
+}
+
+/**
+ * pmd_is_valid_softleaf() - Is this PMD entry a valid leaf entry?
+ * @pmd: PMD entry.
+ *
+ * PMD leaf entries are valid only if they are device private or migration
+ * entries. This function checks that a PMD leaf entry is valid in this
+ * respect.
+ *
+ * Returns: true if the PMD entry is a valid leaf entry, otherwise false.
+ */
+static inline bool pmd_is_valid_softleaf(pmd_t pmd)
+{
+ const softleaf_t entry = softleaf_from_pmd(pmd);
+
+ /* Only device private, migration entries valid for PMD. */
+ return softleaf_is_device_private(entry) ||
+ softleaf_is_migration(entry);
+}
+
+#endif /* CONFIG_MMU */
+#endif /* _LINUX_LEAFOPS_H */
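A usage sketch for the new leafops API: decode a PTE once with
softleaf_from_pte() and dispatch on its type, instead of chaining the old
swp_entry_t predicates (the caller is illustrative):

	static void inspect_pte(pte_t pte)
	{
		const softleaf_t entry = softleaf_from_pte(pte);

		switch (softleaf_type(entry)) {
		case SOFTLEAF_NONE:		/* present or empty PTE */
			break;
		case SOFTLEAF_SWAP:
			/* handle swap-backed entry */
			break;
		case SOFTLEAF_MIGRATION_READ:
		case SOFTLEAF_MIGRATION_READ_EXCLUSIVE:
		case SOFTLEAF_MIGRATION_WRITE:
			/* softleaf_to_folio(entry) is valid here */
			break;
		default:
			break;
		}
	}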
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 873e510d6f8d..0651865a4564 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -52,6 +52,7 @@ enum memcg_memory_event {
MEMCG_SWAP_HIGH,
MEMCG_SWAP_MAX,
MEMCG_SWAP_FAIL,
+ MEMCG_SOCK_THROTTLED,
MEMCG_NR_MEMORY_EVENTS,
};
@@ -956,17 +957,7 @@ unsigned long lruvec_page_state_local(struct lruvec *lruvec,
void mem_cgroup_flush_stats(struct mem_cgroup *memcg);
void mem_cgroup_flush_stats_ratelimited(struct mem_cgroup *memcg);
-void __mod_lruvec_kmem_state(void *p, enum node_stat_item idx, int val);
-
-static inline void mod_lruvec_kmem_state(void *p, enum node_stat_item idx,
- int val)
-{
- unsigned long flags;
-
- local_irq_save(flags);
- __mod_lruvec_kmem_state(p, idx, val);
- local_irq_restore(flags);
-}
+void mod_lruvec_kmem_state(void *p, enum node_stat_item idx, int val);
void count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx,
unsigned long count);
@@ -1001,36 +992,8 @@ static inline void count_memcg_event_mm(struct mm_struct *mm,
count_memcg_events_mm(mm, idx, 1);
}
-static inline void __memcg_memory_event(struct mem_cgroup *memcg,
- enum memcg_memory_event event,
- bool allow_spinning)
-{
- bool swap_event = event == MEMCG_SWAP_HIGH || event == MEMCG_SWAP_MAX ||
- event == MEMCG_SWAP_FAIL;
-
- /* For now only MEMCG_MAX can happen with !allow_spinning context. */
- VM_WARN_ON_ONCE(!allow_spinning && event != MEMCG_MAX);
-
- atomic_long_inc(&memcg->memory_events_local[event]);
- if (!swap_event && allow_spinning)
- cgroup_file_notify(&memcg->events_local_file);
-
- do {
- atomic_long_inc(&memcg->memory_events[event]);
- if (allow_spinning) {
- if (swap_event)
- cgroup_file_notify(&memcg->swap_events_file);
- else
- cgroup_file_notify(&memcg->events_file);
- }
-
- if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
- break;
- if (cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_LOCAL_EVENTS)
- break;
- } while ((memcg = parent_mem_cgroup(memcg)) &&
- !mem_cgroup_is_root(memcg));
-}
+void __memcg_memory_event(struct mem_cgroup *memcg,
+ enum memcg_memory_event event, bool allow_spinning);
static inline void memcg_memory_event(struct mem_cgroup *memcg,
enum memcg_memory_event event)
@@ -1430,14 +1393,6 @@ static inline void mem_cgroup_flush_stats_ratelimited(struct mem_cgroup *memcg)
{
}
-static inline void __mod_lruvec_kmem_state(void *p, enum node_stat_item idx,
- int val)
-{
- struct page *page = virt_to_head_page(p);
-
- __mod_node_page_state(page_pgdat(page), idx, val);
-}
-
static inline void mod_lruvec_kmem_state(void *p, enum node_stat_item idx,
int val)
{
@@ -1497,16 +1452,6 @@ struct slabobj_ext {
#endif
} __aligned(8);
-static inline void __inc_lruvec_kmem_state(void *p, enum node_stat_item idx)
-{
- __mod_lruvec_kmem_state(p, idx, 1);
-}
-
-static inline void __dec_lruvec_kmem_state(void *p, enum node_stat_item idx)
-{
- __mod_lruvec_kmem_state(p, idx, -1);
-}
-
static inline struct lruvec *parent_lruvec(struct lruvec *lruvec)
{
struct mem_cgroup *memcg;
@@ -1674,6 +1619,11 @@ int alloc_shrinker_info(struct mem_cgroup *memcg);
void free_shrinker_info(struct mem_cgroup *memcg);
void set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id);
void reparent_shrinker_deferred(struct mem_cgroup *memcg);
+
+static inline int shrinker_id(struct shrinker *shrinker)
+{
+ return shrinker->id;
+}
#else
#define mem_cgroup_sockets_enabled 0
@@ -1705,6 +1655,11 @@ static inline void set_shrinker_bit(struct mem_cgroup *memcg,
int nid, int shrinker_id)
{
}
+
+static inline int shrinker_id(struct shrinker *shrinker)
+{
+ return -1;
+}
#endif
#ifdef CONFIG_MEMCG
@@ -1791,6 +1746,13 @@ static inline void count_objcg_events(struct obj_cgroup *objcg,
bool mem_cgroup_node_allowed(struct mem_cgroup *memcg, int nid);
+void mem_cgroup_show_protected_memory(struct mem_cgroup *memcg);
+
+static inline bool memcg_is_dying(struct mem_cgroup *memcg)
+{
+ return memcg ? css_is_dying(&memcg->css) : false;
+}
+
#else
static inline bool mem_cgroup_kmem_disabled(void)
{
@@ -1857,6 +1819,15 @@ static inline bool mem_cgroup_node_allowed(struct mem_cgroup *memcg, int nid)
{
return true;
}
+
+static inline void mem_cgroup_show_protected_memory(struct mem_cgroup *memcg)
+{
+}
+
+static inline bool memcg_is_dying(struct mem_cgroup *memcg)
+{
+ return false;
+}
#endif /* CONFIG_MEMCG */
#if defined(CONFIG_MEMCG) && defined(CONFIG_ZSWAP)
diff --git a/include/linux/memory-failure.h b/include/linux/memory-failure.h
new file mode 100644
index 000000000000..bc326503d2d2
--- /dev/null
+++ b/include/linux/memory-failure.h
@@ -0,0 +1,17 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_MEMORY_FAILURE_H
+#define _LINUX_MEMORY_FAILURE_H
+
+#include <linux/interval_tree.h>
+
+struct pfn_address_space;
+
+struct pfn_address_space {
+ struct interval_tree_node node;
+ struct address_space *mapping;
+};
+
+int register_pfn_address_space(struct pfn_address_space *pfn_space);
+void unregister_pfn_address_space(struct pfn_address_space *pfn_space);
+
+#endif /* _LINUX_MEMORY_FAILURE_H */
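A registration sketch for the new interface; the PFN range lives in the
embedded interval_tree_node (start/last are its existing fields), everything
else here is illustrative:

	static struct pfn_address_space pfn_space;

	static int attach_pfn_range(struct address_space *mapping,
				    unsigned long start_pfn, unsigned long nr)
	{
		pfn_space.node.start = start_pfn;
		pfn_space.node.last = start_pfn + nr - 1;
		pfn_space.mapping = mapping;
		return register_pfn_address_space(&pfn_space);
	}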
diff --git a/include/linux/memory.h b/include/linux/memory.h
index ba1515160894..faeaa921e55b 100644
--- a/include/linux/memory.h
+++ b/include/linux/memory.h
@@ -64,9 +64,19 @@ struct memory_group {
};
};
+enum memory_block_state {
+ /* These states are exposed to userspace as text strings in sysfs */
+ MEM_ONLINE, /* exposed to userspace */
+ MEM_GOING_OFFLINE, /* exposed to userspace */
+ MEM_OFFLINE, /* exposed to userspace */
+ MEM_GOING_ONLINE,
+ MEM_CANCEL_ONLINE,
+ MEM_CANCEL_OFFLINE,
+};
+
struct memory_block {
unsigned long start_section_nr;
- unsigned long state; /* serialized by the dev->lock */
+ enum memory_block_state state; /* serialized by the dev->lock */
int online_type; /* for passing data to online routine */
int nid; /* NID for this memory block */
/*
@@ -89,14 +99,6 @@ int arch_get_memory_phys_device(unsigned long start_pfn);
unsigned long memory_block_size_bytes(void);
int set_memory_block_size_order(unsigned int order);
-/* These states are exposed to userspace as text strings in sysfs */
-#define MEM_ONLINE (1<<0) /* exposed to userspace */
-#define MEM_GOING_OFFLINE (1<<1) /* exposed to userspace */
-#define MEM_OFFLINE (1<<2) /* exposed to userspace */
-#define MEM_GOING_ONLINE (1<<3)
-#define MEM_CANCEL_ONLINE (1<<4)
-#define MEM_CANCEL_OFFLINE (1<<5)
-
struct memory_notify {
unsigned long start_pfn;
unsigned long nr_pages;
@@ -130,7 +132,7 @@ static inline int register_memory_notifier(struct notifier_block *nb)
static inline void unregister_memory_notifier(struct notifier_block *nb)
{
}
-static inline int memory_notify(unsigned long val, void *v)
+static inline int memory_notify(enum memory_block_state state, void *v)
{
return 0;
}
@@ -154,7 +156,7 @@ int create_memory_block_devices(unsigned long start, unsigned long size,
struct memory_group *group);
void remove_memory_block_devices(unsigned long start, unsigned long size);
extern void memory_dev_init(void);
-extern int memory_notify(unsigned long val, void *v);
+extern int memory_notify(enum memory_block_state state, void *v);
extern struct memory_block *find_memory_block(unsigned long section_nr);
typedef int (*walk_memory_blocks_func_t)(struct memory_block *, void *);
extern int walk_memory_blocks(unsigned long start, unsigned long size,
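Consumers are unchanged apart from the constants' type: the state still
arrives as the notifier action. A minimal sketch against the existing
notifier API (the callback body is illustrative):

	static int my_mem_cb(struct notifier_block *nb, unsigned long action,
			     void *arg)
	{
		struct memory_notify *mn = arg;

		switch ((enum memory_block_state)action) {
		case MEM_GOING_OFFLINE:
			/* quiesce users of mn->start_pfn .. +mn->nr_pages */
			break;
		default:
			break;
		}
		return NOTIFY_OK;
	}

	static struct notifier_block my_mem_nb = { .notifier_call = my_mem_cb };
	/* registered via register_memory_notifier(&my_mem_nb) */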
diff --git a/include/linux/memremap.h b/include/linux/memremap.h
index 30c7aecbd245..713ec0435b48 100644
--- a/include/linux/memremap.h
+++ b/include/linux/memremap.h
@@ -76,11 +76,11 @@ enum memory_type {
struct dev_pagemap_ops {
/*
- * Called once the page refcount reaches 0. The reference count will be
+ * Called once the folio refcount reaches 0. The reference count will be
* reset to one by the core code after the method is called to prepare
- * for handing out the page again.
+ * for handing out the folio again.
*/
- void (*page_free)(struct page *page);
+ void (*folio_free)(struct folio *folio);
/*
* Used for private (un-addressable) device memory only. Must migrate
@@ -99,6 +99,13 @@ struct dev_pagemap_ops {
*/
int (*memory_failure)(struct dev_pagemap *pgmap, unsigned long pfn,
unsigned long nr_pages, int mf_flags);
+
+ /*
+ * Used for private (un-addressable) device memory only.
+	 * This callback is invoked when a folio is split into
+	 * smaller folios.
+ */
+ void (*folio_split)(struct folio *head, struct folio *tail);
};
#define PGMAP_ALTMAP_VALID (1 << 0)
@@ -176,6 +183,18 @@ static inline bool folio_is_pci_p2pdma(const struct folio *folio)
folio->pgmap->type == MEMORY_DEVICE_PCI_P2PDMA;
}
+static inline void *folio_zone_device_data(const struct folio *folio)
+{
+ VM_WARN_ON_FOLIO(!folio_is_device_private(folio), folio);
+ return folio->page.zone_device_data;
+}
+
+static inline void folio_set_zone_device_data(struct folio *folio, void *data)
+{
+ VM_WARN_ON_FOLIO(!folio_is_device_private(folio), folio);
+ folio->page.zone_device_data = data;
+}
+
static inline bool is_pci_p2pdma_page(const struct page *page)
{
return IS_ENABLED(CONFIG_PCI_P2PDMA) &&
@@ -205,7 +224,7 @@ static inline bool is_fsdax_page(const struct page *page)
}
#ifdef CONFIG_ZONE_DEVICE
-void zone_device_page_init(struct page *page);
+void zone_device_page_init(struct page *page, unsigned int order);
void *memremap_pages(struct dev_pagemap *pgmap, int nid);
void memunmap_pages(struct dev_pagemap *pgmap);
void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap);
@@ -214,6 +233,31 @@ struct dev_pagemap *get_dev_pagemap(unsigned long pfn);
bool pgmap_pfn_valid(struct dev_pagemap *pgmap, unsigned long pfn);
unsigned long memremap_compat_align(void);
+
+static inline void zone_device_folio_init(struct folio *folio, unsigned int order)
+{
+ zone_device_page_init(&folio->page, order);
+ if (order)
+ folio_set_large_rmappable(folio);
+}
+
+static inline void zone_device_private_split_cb(struct folio *original_folio,
+ struct folio *new_folio)
+{
+ if (folio_is_device_private(original_folio)) {
+ if (!original_folio->pgmap->ops->folio_split) {
+ if (new_folio) {
+ new_folio->pgmap = original_folio->pgmap;
+ new_folio->page.mapping =
+ original_folio->page.mapping;
+ }
+ } else {
+ original_folio->pgmap->ops->folio_split(original_folio,
+ new_folio);
+ }
+ }
+}
+
#else
static inline void *devm_memremap_pages(struct device *dev,
struct dev_pagemap *pgmap)
@@ -247,6 +291,11 @@ static inline unsigned long memremap_compat_align(void)
{
return PAGE_SIZE;
}
+
+static inline void zone_device_private_split_cb(struct folio *original_folio,
+ struct folio *new_folio)
+{
+}
#endif /* CONFIG_ZONE_DEVICE */
static inline void put_dev_pagemap(struct dev_pagemap *pgmap)
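Driver-side sketch of the reworked dev_pagemap_ops: folio_free replaces
page_free, and the optional folio_split callback lets a driver fix up
per-folio state after a split. Bodies are illustrative; only the helpers from
this hunk are assumed:

	static void my_folio_free(struct folio *folio)
	{
		/* refcount hit zero; reclaim the backing device memory */
	}

	static void my_folio_split(struct folio *head, struct folio *tail)
	{
		/* carry driver-private state over to the new @tail folio */
		folio_set_zone_device_data(tail, folio_zone_device_data(head));
	}

	static const struct dev_pagemap_ops my_pgmap_ops = {
		.folio_free	= my_folio_free,
		.folio_split	= my_folio_split,
	};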
diff --git a/include/linux/migrate.h b/include/linux/migrate.h
index 1f0ac122c3bf..26ca00c325d9 100644
--- a/include/linux/migrate.h
+++ b/include/linux/migrate.h
@@ -65,7 +65,7 @@ bool isolate_folio_to_list(struct folio *folio, struct list_head *list);
int migrate_huge_page_move_mapping(struct address_space *mapping,
struct folio *dst, struct folio *src);
-void migration_entry_wait_on_locked(swp_entry_t entry, spinlock_t *ptl)
+void migration_entry_wait_on_locked(softleaf_t entry, spinlock_t *ptl)
__releases(ptl);
void folio_migrate_flags(struct folio *newfolio, struct folio *folio);
int folio_migrate_mapping(struct address_space *mapping,
@@ -125,6 +125,7 @@ static inline int migrate_misplaced_folio(struct folio *folio, int node)
#define MIGRATE_PFN_VALID (1UL << 0)
#define MIGRATE_PFN_MIGRATE (1UL << 1)
#define MIGRATE_PFN_WRITE (1UL << 3)
+#define MIGRATE_PFN_COMPOUND (1UL << 4)
#define MIGRATE_PFN_SHIFT 6
static inline struct page *migrate_pfn_to_page(unsigned long mpfn)
@@ -143,6 +144,7 @@ enum migrate_vma_direction {
MIGRATE_VMA_SELECT_SYSTEM = 1 << 0,
MIGRATE_VMA_SELECT_DEVICE_PRIVATE = 1 << 1,
MIGRATE_VMA_SELECT_DEVICE_COHERENT = 1 << 2,
+ MIGRATE_VMA_SELECT_COMPOUND = 1 << 3,
};
struct migrate_vma {
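Encoding sketch for the new compound-page support: migrate_pfn() is the
existing encoder and migrate_pfn_to_page() the decoder; only
MIGRATE_PFN_COMPOUND is new here, and how a driver chooses to set it is
illustrative:

	/* mark a source entry as a migratable, writable, THP-sized page */
	unsigned long mpfn = migrate_pfn(page_to_pfn(page)) |
			     MIGRATE_PFN_MIGRATE | MIGRATE_PFN_WRITE |
			     MIGRATE_PFN_COMPOUND;

	struct page *p = migrate_pfn_to_page(mpfn);	/* decode back */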
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 8dc0a07570cc..7a1819c20643 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -105,6 +105,8 @@ extern int mmap_rnd_compat_bits __read_mostly;
# endif
#endif
+#define INVALID_PHYS_ADDR (~(phys_addr_t)0)
+
#include <asm/page.h>
#include <asm/processor.h>
@@ -273,178 +275,235 @@ extern unsigned int kobjsize(const void *objp);
* vm_flags in vm_area_struct, see mm_types.h.
* When changing, update also include/trace/events/mmflags.h
*/
-#define VM_NONE 0x00000000
-#define VM_READ 0x00000001 /* currently active flags */
-#define VM_WRITE 0x00000002
-#define VM_EXEC 0x00000004
-#define VM_SHARED 0x00000008
+#define VM_NONE 0x00000000
-/* mprotect() hardcodes VM_MAYREAD >> 4 == VM_READ, and so for r/w/x bits. */
-#define VM_MAYREAD 0x00000010 /* limits for mprotect() etc */
-#define VM_MAYWRITE 0x00000020
-#define VM_MAYEXEC 0x00000040
-#define VM_MAYSHARE 0x00000080
+/**
+ * typedef vma_flag_t - specifies an individual VMA flag by bit number.
+ *
+ * This value is made type safe by sparse to avoid passing invalid flag values
+ * around.
+ */
+typedef int __bitwise vma_flag_t;
-#define VM_GROWSDOWN 0x00000100 /* general info on the segment */
+#define DECLARE_VMA_BIT(name, bitnum) \
+ VMA_ ## name ## _BIT = ((__force vma_flag_t)bitnum)
+#define DECLARE_VMA_BIT_ALIAS(name, aliased) \
+ VMA_ ## name ## _BIT = (VMA_ ## aliased ## _BIT)
+enum {
+ DECLARE_VMA_BIT(READ, 0),
+ DECLARE_VMA_BIT(WRITE, 1),
+ DECLARE_VMA_BIT(EXEC, 2),
+ DECLARE_VMA_BIT(SHARED, 3),
+ /* mprotect() hardcodes VM_MAYREAD >> 4 == VM_READ, and so for r/w/x bits. */
+ DECLARE_VMA_BIT(MAYREAD, 4), /* limits for mprotect() etc. */
+ DECLARE_VMA_BIT(MAYWRITE, 5),
+ DECLARE_VMA_BIT(MAYEXEC, 6),
+ DECLARE_VMA_BIT(MAYSHARE, 7),
+ DECLARE_VMA_BIT(GROWSDOWN, 8), /* general info on the segment */
#ifdef CONFIG_MMU
-#define VM_UFFD_MISSING 0x00000200 /* missing pages tracking */
-#else /* CONFIG_MMU */
-#define VM_MAYOVERLAY 0x00000200 /* nommu: R/O MAP_PRIVATE mapping that might overlay a file mapping */
-#define VM_UFFD_MISSING 0
+ DECLARE_VMA_BIT(UFFD_MISSING, 9),/* missing pages tracking */
+#else
+ /* nommu: R/O MAP_PRIVATE mapping that might overlay a file mapping */
+ DECLARE_VMA_BIT(MAYOVERLAY, 9),
#endif /* CONFIG_MMU */
-#define VM_PFNMAP 0x00000400 /* Page-ranges managed without "struct page", just pure PFN */
-#define VM_UFFD_WP 0x00001000 /* wrprotect pages tracking */
-
-#define VM_LOCKED 0x00002000
-#define VM_IO 0x00004000 /* Memory mapped I/O or similar */
-
- /* Used by sys_madvise() */
-#define VM_SEQ_READ 0x00008000 /* App will access data sequentially */
-#define VM_RAND_READ 0x00010000 /* App will not benefit from clustered reads */
-
-#define VM_DONTCOPY 0x00020000 /* Do not copy this vma on fork */
-#define VM_DONTEXPAND 0x00040000 /* Cannot expand with mremap() */
-#define VM_LOCKONFAULT 0x00080000 /* Lock the pages covered when they are faulted in */
-#define VM_ACCOUNT 0x00100000 /* Is a VM accounted object */
-#define VM_NORESERVE 0x00200000 /* should the VM suppress accounting */
-#define VM_HUGETLB 0x00400000 /* Huge TLB Page VM */
-#define VM_SYNC 0x00800000 /* Synchronous page faults */
-#define VM_ARCH_1 0x01000000 /* Architecture-specific flag */
-#define VM_WIPEONFORK 0x02000000 /* Wipe VMA contents in child. */
-#define VM_DONTDUMP 0x04000000 /* Do not include in the core dump */
-
+ /* Page-ranges managed without "struct page", just pure PFN */
+ DECLARE_VMA_BIT(PFNMAP, 10),
+ DECLARE_VMA_BIT(MAYBE_GUARD, 11),
+ DECLARE_VMA_BIT(UFFD_WP, 12), /* wrprotect pages tracking */
+ DECLARE_VMA_BIT(LOCKED, 13),
+ DECLARE_VMA_BIT(IO, 14), /* Memory mapped I/O or similar */
+ DECLARE_VMA_BIT(SEQ_READ, 15), /* App will access data sequentially */
+ DECLARE_VMA_BIT(RAND_READ, 16), /* App will not benefit from clustered reads */
+ DECLARE_VMA_BIT(DONTCOPY, 17), /* Do not copy this vma on fork */
+ DECLARE_VMA_BIT(DONTEXPAND, 18),/* Cannot expand with mremap() */
+ DECLARE_VMA_BIT(LOCKONFAULT, 19),/* Lock pages covered when faulted in */
+ DECLARE_VMA_BIT(ACCOUNT, 20), /* Is a VM accounted object */
+ DECLARE_VMA_BIT(NORESERVE, 21), /* should the VM suppress accounting */
+ DECLARE_VMA_BIT(HUGETLB, 22), /* Huge TLB Page VM */
+ DECLARE_VMA_BIT(SYNC, 23), /* Synchronous page faults */
+ DECLARE_VMA_BIT(ARCH_1, 24), /* Architecture-specific flag */
+ DECLARE_VMA_BIT(WIPEONFORK, 25),/* Wipe VMA contents in child. */
+ DECLARE_VMA_BIT(DONTDUMP, 26), /* Do not include in the core dump */
+ DECLARE_VMA_BIT(SOFTDIRTY, 27), /* NOT soft dirty clean area */
+ DECLARE_VMA_BIT(MIXEDMAP, 28), /* Can contain struct page and pure PFN pages */
+ DECLARE_VMA_BIT(HUGEPAGE, 29), /* MADV_HUGEPAGE marked this vma */
+ DECLARE_VMA_BIT(NOHUGEPAGE, 30),/* MADV_NOHUGEPAGE marked this vma */
+ DECLARE_VMA_BIT(MERGEABLE, 31), /* KSM may merge identical pages */
+ /* These bits are reused, we define specific uses below. */
+ DECLARE_VMA_BIT(HIGH_ARCH_0, 32),
+ DECLARE_VMA_BIT(HIGH_ARCH_1, 33),
+ DECLARE_VMA_BIT(HIGH_ARCH_2, 34),
+ DECLARE_VMA_BIT(HIGH_ARCH_3, 35),
+ DECLARE_VMA_BIT(HIGH_ARCH_4, 36),
+ DECLARE_VMA_BIT(HIGH_ARCH_5, 37),
+ DECLARE_VMA_BIT(HIGH_ARCH_6, 38),
+ /*
+ * This flag is used to connect VFIO to arch specific KVM code. It
+ * indicates that the memory under this VMA is safe for use with any
+ * non-cachable memory type inside KVM. Some VFIO devices, on some
+ * platforms, are thought to be unsafe and can cause machine crashes
+ * if KVM does not lock down the memory type.
+ */
+ DECLARE_VMA_BIT(ALLOW_ANY_UNCACHED, 39),
+#ifdef CONFIG_PPC32
+ DECLARE_VMA_BIT_ALIAS(DROPPABLE, ARCH_1),
+#else
+ DECLARE_VMA_BIT(DROPPABLE, 40),
+#endif
+ DECLARE_VMA_BIT(UFFD_MINOR, 41),
+ DECLARE_VMA_BIT(SEALED, 42),
+ /* Flags that reuse flags above. */
+ DECLARE_VMA_BIT_ALIAS(PKEY_BIT0, HIGH_ARCH_0),
+ DECLARE_VMA_BIT_ALIAS(PKEY_BIT1, HIGH_ARCH_1),
+ DECLARE_VMA_BIT_ALIAS(PKEY_BIT2, HIGH_ARCH_2),
+ DECLARE_VMA_BIT_ALIAS(PKEY_BIT3, HIGH_ARCH_3),
+ DECLARE_VMA_BIT_ALIAS(PKEY_BIT4, HIGH_ARCH_4),
+#if defined(CONFIG_X86_USER_SHADOW_STACK)
+ /*
+	 * VM_SHADOW_STACK should not be set with VM_SHARED because of lack of
+	 * support in core mm.
+ *
+ * These VMAs will get a single end guard page. This helps userspace
+ * protect itself from attacks. A single page is enough for current
+ * shadow stack archs (x86). See the comments near alloc_shstk() in
+ * arch/x86/kernel/shstk.c for more details on the guard size.
+ */
+ DECLARE_VMA_BIT_ALIAS(SHADOW_STACK, HIGH_ARCH_5),
+#elif defined(CONFIG_ARM64_GCS)
+ /*
+ * arm64's Guarded Control Stack implements similar functionality and
+ * has similar constraints to shadow stacks.
+ */
+ DECLARE_VMA_BIT_ALIAS(SHADOW_STACK, HIGH_ARCH_6),
+#endif
+ DECLARE_VMA_BIT_ALIAS(SAO, ARCH_1), /* Strong Access Ordering (powerpc) */
+ DECLARE_VMA_BIT_ALIAS(GROWSUP, ARCH_1), /* parisc */
+ DECLARE_VMA_BIT_ALIAS(SPARC_ADI, ARCH_1), /* sparc64 */
+ DECLARE_VMA_BIT_ALIAS(ARM64_BTI, ARCH_1), /* arm64 */
+ DECLARE_VMA_BIT_ALIAS(ARCH_CLEAR, ARCH_1), /* sparc64, arm64 */
+ DECLARE_VMA_BIT_ALIAS(MAPPED_COPY, ARCH_1), /* !CONFIG_MMU */
+ DECLARE_VMA_BIT_ALIAS(MTE, HIGH_ARCH_4), /* arm64 */
+ DECLARE_VMA_BIT_ALIAS(MTE_ALLOWED, HIGH_ARCH_5),/* arm64 */
+#ifdef CONFIG_STACK_GROWSUP
+ DECLARE_VMA_BIT_ALIAS(STACK, GROWSUP),
+ DECLARE_VMA_BIT_ALIAS(STACK_EARLY, GROWSDOWN),
+#else
+ DECLARE_VMA_BIT_ALIAS(STACK, GROWSDOWN),
+#endif
+};
+#undef DECLARE_VMA_BIT
+#undef DECLARE_VMA_BIT_ALIAS
+
+#define INIT_VM_FLAG(name) BIT((__force int) VMA_ ## name ## _BIT)
+#define VM_READ INIT_VM_FLAG(READ)
+#define VM_WRITE INIT_VM_FLAG(WRITE)
+#define VM_EXEC INIT_VM_FLAG(EXEC)
+#define VM_SHARED INIT_VM_FLAG(SHARED)
+#define VM_MAYREAD INIT_VM_FLAG(MAYREAD)
+#define VM_MAYWRITE INIT_VM_FLAG(MAYWRITE)
+#define VM_MAYEXEC INIT_VM_FLAG(MAYEXEC)
+#define VM_MAYSHARE INIT_VM_FLAG(MAYSHARE)
+#define VM_GROWSDOWN INIT_VM_FLAG(GROWSDOWN)
+#ifdef CONFIG_MMU
+#define VM_UFFD_MISSING INIT_VM_FLAG(UFFD_MISSING)
+#else
+#define VM_UFFD_MISSING VM_NONE
+#define VM_MAYOVERLAY INIT_VM_FLAG(MAYOVERLAY)
+#endif
+#define VM_PFNMAP INIT_VM_FLAG(PFNMAP)
+#define VM_MAYBE_GUARD INIT_VM_FLAG(MAYBE_GUARD)
+#define VM_UFFD_WP INIT_VM_FLAG(UFFD_WP)
+#define VM_LOCKED INIT_VM_FLAG(LOCKED)
+#define VM_IO INIT_VM_FLAG(IO)
+#define VM_SEQ_READ INIT_VM_FLAG(SEQ_READ)
+#define VM_RAND_READ INIT_VM_FLAG(RAND_READ)
+#define VM_DONTCOPY INIT_VM_FLAG(DONTCOPY)
+#define VM_DONTEXPAND INIT_VM_FLAG(DONTEXPAND)
+#define VM_LOCKONFAULT INIT_VM_FLAG(LOCKONFAULT)
+#define VM_ACCOUNT INIT_VM_FLAG(ACCOUNT)
+#define VM_NORESERVE INIT_VM_FLAG(NORESERVE)
+#define VM_HUGETLB INIT_VM_FLAG(HUGETLB)
+#define VM_SYNC INIT_VM_FLAG(SYNC)
+#define VM_ARCH_1 INIT_VM_FLAG(ARCH_1)
+#define VM_WIPEONFORK INIT_VM_FLAG(WIPEONFORK)
+#define VM_DONTDUMP INIT_VM_FLAG(DONTDUMP)
#ifdef CONFIG_MEM_SOFT_DIRTY
-# define VM_SOFTDIRTY 0x08000000 /* Not soft dirty clean area */
+#define VM_SOFTDIRTY INIT_VM_FLAG(SOFTDIRTY)
#else
-# define VM_SOFTDIRTY 0
+#define VM_SOFTDIRTY VM_NONE
+#endif
+#define VM_MIXEDMAP INIT_VM_FLAG(MIXEDMAP)
+#define VM_HUGEPAGE INIT_VM_FLAG(HUGEPAGE)
+#define VM_NOHUGEPAGE INIT_VM_FLAG(NOHUGEPAGE)
+#define VM_MERGEABLE INIT_VM_FLAG(MERGEABLE)
+#define VM_STACK INIT_VM_FLAG(STACK)
+#ifdef CONFIG_STACK_GROWSUP
+#define VM_STACK_EARLY INIT_VM_FLAG(STACK_EARLY)
+#else
+#define VM_STACK_EARLY VM_NONE
#endif
-
-#define VM_MIXEDMAP 0x10000000 /* Can contain "struct page" and pure PFN pages */
-#define VM_HUGEPAGE 0x20000000 /* MADV_HUGEPAGE marked this vma */
-#define VM_NOHUGEPAGE 0x40000000 /* MADV_NOHUGEPAGE marked this vma */
-#define VM_MERGEABLE BIT(31) /* KSM may merge identical pages */
-
-#ifdef CONFIG_ARCH_USES_HIGH_VMA_FLAGS
-#define VM_HIGH_ARCH_BIT_0 32 /* bit only usable on 64-bit architectures */
-#define VM_HIGH_ARCH_BIT_1 33 /* bit only usable on 64-bit architectures */
-#define VM_HIGH_ARCH_BIT_2 34 /* bit only usable on 64-bit architectures */
-#define VM_HIGH_ARCH_BIT_3 35 /* bit only usable on 64-bit architectures */
-#define VM_HIGH_ARCH_BIT_4 36 /* bit only usable on 64-bit architectures */
-#define VM_HIGH_ARCH_BIT_5 37 /* bit only usable on 64-bit architectures */
-#define VM_HIGH_ARCH_BIT_6 38 /* bit only usable on 64-bit architectures */
-#define VM_HIGH_ARCH_0 BIT(VM_HIGH_ARCH_BIT_0)
-#define VM_HIGH_ARCH_1 BIT(VM_HIGH_ARCH_BIT_1)
-#define VM_HIGH_ARCH_2 BIT(VM_HIGH_ARCH_BIT_2)
-#define VM_HIGH_ARCH_3 BIT(VM_HIGH_ARCH_BIT_3)
-#define VM_HIGH_ARCH_4 BIT(VM_HIGH_ARCH_BIT_4)
-#define VM_HIGH_ARCH_5 BIT(VM_HIGH_ARCH_BIT_5)
-#define VM_HIGH_ARCH_6 BIT(VM_HIGH_ARCH_BIT_6)
-#endif /* CONFIG_ARCH_USES_HIGH_VMA_FLAGS */
-
#ifdef CONFIG_ARCH_HAS_PKEYS
-# define VM_PKEY_SHIFT VM_HIGH_ARCH_BIT_0
-# define VM_PKEY_BIT0 VM_HIGH_ARCH_0
-# define VM_PKEY_BIT1 VM_HIGH_ARCH_1
-# define VM_PKEY_BIT2 VM_HIGH_ARCH_2
+#define VM_PKEY_SHIFT ((__force int)VMA_HIGH_ARCH_0_BIT)
+/* Despite the naming, these are FLAGS not bits. */
+#define VM_PKEY_BIT0 INIT_VM_FLAG(PKEY_BIT0)
+#define VM_PKEY_BIT1 INIT_VM_FLAG(PKEY_BIT1)
+#define VM_PKEY_BIT2 INIT_VM_FLAG(PKEY_BIT2)
#if CONFIG_ARCH_PKEY_BITS > 3
-# define VM_PKEY_BIT3 VM_HIGH_ARCH_3
+#define VM_PKEY_BIT3 INIT_VM_FLAG(PKEY_BIT3)
#else
-# define VM_PKEY_BIT3 0
-#endif
+#define VM_PKEY_BIT3 VM_NONE
+#endif /* CONFIG_ARCH_PKEY_BITS > 3 */
#if CONFIG_ARCH_PKEY_BITS > 4
-# define VM_PKEY_BIT4 VM_HIGH_ARCH_4
+#define VM_PKEY_BIT4 INIT_VM_FLAG(PKEY_BIT4)
#else
-# define VM_PKEY_BIT4 0
-#endif
+#define VM_PKEY_BIT4 VM_NONE
+#endif /* CONFIG_ARCH_PKEY_BITS > 4 */
#endif /* CONFIG_ARCH_HAS_PKEYS */
-
-#ifdef CONFIG_X86_USER_SHADOW_STACK
-/*
- * VM_SHADOW_STACK should not be set with VM_SHARED because of lack of
- * support core mm.
- *
- * These VMAs will get a single end guard page. This helps userspace protect
- * itself from attacks. A single page is enough for current shadow stack archs
- * (x86). See the comments near alloc_shstk() in arch/x86/kernel/shstk.c
- * for more details on the guard size.
- */
-# define VM_SHADOW_STACK VM_HIGH_ARCH_5
-#endif
-
-#if defined(CONFIG_ARM64_GCS)
-/*
- * arm64's Guarded Control Stack implements similar functionality and
- * has similar constraints to shadow stacks.
- */
-# define VM_SHADOW_STACK VM_HIGH_ARCH_6
-#endif
-
-#ifndef VM_SHADOW_STACK
-# define VM_SHADOW_STACK VM_NONE
+#if defined(CONFIG_X86_USER_SHADOW_STACK) || defined(CONFIG_ARM64_GCS)
+#define VM_SHADOW_STACK INIT_VM_FLAG(SHADOW_STACK)
+#else
+#define VM_SHADOW_STACK VM_NONE
#endif
-
#if defined(CONFIG_PPC64)
-# define VM_SAO VM_ARCH_1 /* Strong Access Ordering (powerpc) */
+#define VM_SAO INIT_VM_FLAG(SAO)
#elif defined(CONFIG_PARISC)
-# define VM_GROWSUP VM_ARCH_1
+#define VM_GROWSUP INIT_VM_FLAG(GROWSUP)
#elif defined(CONFIG_SPARC64)
-# define VM_SPARC_ADI VM_ARCH_1 /* Uses ADI tag for access control */
-# define VM_ARCH_CLEAR VM_SPARC_ADI
+#define VM_SPARC_ADI INIT_VM_FLAG(SPARC_ADI)
+#define VM_ARCH_CLEAR INIT_VM_FLAG(ARCH_CLEAR)
#elif defined(CONFIG_ARM64)
-# define VM_ARM64_BTI VM_ARCH_1 /* BTI guarded page, a.k.a. GP bit */
-# define VM_ARCH_CLEAR VM_ARM64_BTI
+#define VM_ARM64_BTI INIT_VM_FLAG(ARM64_BTI)
+#define VM_ARCH_CLEAR INIT_VM_FLAG(ARCH_CLEAR)
#elif !defined(CONFIG_MMU)
-# define VM_MAPPED_COPY VM_ARCH_1 /* T if mapped copy of data (nommu mmap) */
-#endif
-
-#if defined(CONFIG_ARM64_MTE)
-# define VM_MTE VM_HIGH_ARCH_4 /* Use Tagged memory for access control */
-# define VM_MTE_ALLOWED VM_HIGH_ARCH_5 /* Tagged memory permitted */
-#else
-# define VM_MTE VM_NONE
-# define VM_MTE_ALLOWED VM_NONE
+#define VM_MAPPED_COPY INIT_VM_FLAG(MAPPED_COPY)
#endif
-
#ifndef VM_GROWSUP
-# define VM_GROWSUP VM_NONE
+#define VM_GROWSUP VM_NONE
+#endif
+#ifdef CONFIG_ARM64_MTE
+#define VM_MTE INIT_VM_FLAG(MTE)
+#define VM_MTE_ALLOWED INIT_VM_FLAG(MTE_ALLOWED)
+#else
+#define VM_MTE VM_NONE
+#define VM_MTE_ALLOWED VM_NONE
#endif
-
#ifdef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR
-# define VM_UFFD_MINOR_BIT 41
-# define VM_UFFD_MINOR BIT(VM_UFFD_MINOR_BIT) /* UFFD minor faults */
-#else /* !CONFIG_HAVE_ARCH_USERFAULTFD_MINOR */
-# define VM_UFFD_MINOR VM_NONE
-#endif /* CONFIG_HAVE_ARCH_USERFAULTFD_MINOR */
-
-/*
- * This flag is used to connect VFIO to arch specific KVM code. It
- * indicates that the memory under this VMA is safe for use with any
- * non-cachable memory type inside KVM. Some VFIO devices, on some
- * platforms, are thought to be unsafe and can cause machine crashes
- * if KVM does not lock down the memory type.
- */
-#ifdef CONFIG_64BIT
-#define VM_ALLOW_ANY_UNCACHED_BIT 39
-#define VM_ALLOW_ANY_UNCACHED BIT(VM_ALLOW_ANY_UNCACHED_BIT)
+#define VM_UFFD_MINOR INIT_VM_FLAG(UFFD_MINOR)
#else
-#define VM_ALLOW_ANY_UNCACHED VM_NONE
+#define VM_UFFD_MINOR VM_NONE
#endif
-
#ifdef CONFIG_64BIT
-#define VM_DROPPABLE_BIT 40
-#define VM_DROPPABLE BIT(VM_DROPPABLE_BIT)
-#elif defined(CONFIG_PPC32)
-#define VM_DROPPABLE VM_ARCH_1
+#define VM_ALLOW_ANY_UNCACHED INIT_VM_FLAG(ALLOW_ANY_UNCACHED)
+#define VM_SEALED INIT_VM_FLAG(SEALED)
#else
-#define VM_DROPPABLE VM_NONE
+#define VM_ALLOW_ANY_UNCACHED VM_NONE
+#define VM_SEALED VM_NONE
#endif
-
-#ifdef CONFIG_64BIT
-#define VM_SEALED_BIT 42
-#define VM_SEALED BIT(VM_SEALED_BIT)
+#if defined(CONFIG_64BIT) || defined(CONFIG_PPC32)
+#define VM_DROPPABLE INIT_VM_FLAG(DROPPABLE)
#else
-#define VM_SEALED VM_NONE
+#define VM_DROPPABLE VM_NONE
#endif
/* Bits set in the VMA until the stack is in its final location */
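A minimal sketch of how the new pattern composes, assuming DECLARE_VMA_BIT(name, nr) declares an enumerator VMA_<name>_BIT = nr (the enum head lies above this hunk); the names below are illustrative only:

	/* Illustrative only, not part of the patch: */
	enum { EXAMPLE_VMA_READ_BIT = 0 };                /* DECLARE_VMA_BIT(READ, 0) */
	#define EXAMPLE_VM_READ BIT(EXAMPLE_VMA_READ_BIT) /* INIT_VM_FLAG(READ) */
	/* EXAMPLE_VM_READ == 0x00000001, matching the old VM_READ literal. */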
@@ -470,12 +529,10 @@ extern unsigned int kobjsize(const void *objp);
#define VM_STARTGAP_FLAGS (VM_GROWSDOWN | VM_SHADOW_STACK)
-#ifdef CONFIG_STACK_GROWSUP
-#define VM_STACK VM_GROWSUP
-#define VM_STACK_EARLY VM_GROWSDOWN
+#ifdef CONFIG_MSEAL_SYSTEM_MAPPINGS
+#define VM_SEALED_SYSMAP VM_SEALED
#else
-#define VM_STACK VM_GROWSDOWN
-#define VM_STACK_EARLY 0
+#define VM_SEALED_SYSMAP VM_NONE
#endif
#define VM_STACK_FLAGS (VM_STACK | VM_STACK_DEFAULT_FLAGS | VM_ACCOUNT)
@@ -483,12 +540,26 @@ extern unsigned int kobjsize(const void *objp);
/* VMA basic access permission flags */
#define VM_ACCESS_FLAGS (VM_READ | VM_WRITE | VM_EXEC)
-
/*
* Special vmas that are non-mergable, non-mlock()able.
*/
#define VM_SPECIAL (VM_IO | VM_DONTEXPAND | VM_PFNMAP | VM_MIXEDMAP)
+/*
+ * Physically remapped pages are special. Tell the
+ * rest of the world about it:
+ * VM_IO tells people not to look at these pages
+ * (accesses can have side effects).
+ * VM_PFNMAP tells the core MM that the base pages are just
+ * raw PFN mappings, and do not have a "struct page" associated
+ * with them.
+ * VM_DONTEXPAND
+ * Disable vma merging and expanding with mremap().
+ * VM_DONTDUMP
+ * Omit vma from core dump, even when VM_IO turned off.
+ */
+#define VM_REMAP_FLAGS (VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP)
+
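A usage sketch for the new mask; hedged, since the canonical consumer (a PFN-remapping path such as remap_pfn_range()) lies outside this diff:

	/* Illustrative only: flag a VMA that is about to be PFN-remapped. */
	vm_flags_set(vma, VM_REMAP_FLAGS);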
/* This mask prevents VMA from being scanned with khugepaged */
#define VM_NO_KHUGEPAGED (VM_SPECIAL | VM_HUGETLB)
@@ -498,13 +569,69 @@ extern unsigned int kobjsize(const void *objp);
/* This mask represents all the VMA flag bits used by mlock */
#define VM_LOCKED_MASK (VM_LOCKED | VM_LOCKONFAULT)
+/* These flags can be updated atomically via VMA/mmap read lock. */
+#define VM_ATOMIC_SET_ALLOWED VM_MAYBE_GUARD
+
/* Arch-specific flags to clear when updating VM flags on protection change */
#ifndef VM_ARCH_CLEAR
-# define VM_ARCH_CLEAR VM_NONE
+#define VM_ARCH_CLEAR VM_NONE
#endif
#define VM_FLAGS_CLEAR (ARCH_VM_PKEY_FLAGS | VM_ARCH_CLEAR)
/*
+ * Flags which should be 'sticky' on merge - that is, flags which, when one VMA
+ * possesses it but the other does not, the merged VMA should nonetheless have
+ * applied to it:
+ *
+ * VM_SOFTDIRTY - if a VMA is marked soft-dirty, that is, has not had its
+ *                references cleared via /proc/$pid/clear_refs, any merged VMA
+ *                should also be considered soft-dirty, as the flag operates at
+ *                VMA granularity.
+ *
+ * VM_MAYBE_GUARD - If a VMA may have guard regions in place, its mapped page
+ *                  tables may contain metadata not described by the VMA; any
+ *                  merged VMA may also contain this metadata, so the flag must
+ *                  be sticky.
+ */
+#define VM_STICKY (VM_SOFTDIRTY | VM_MAYBE_GUARD)
+
+/*
+ * VMA flags we ignore for the purposes of merge, i.e. one VMA possessing one
+ * of these flags and the other not does not preclude a merge.
+ *
+ * VM_STICKY - When merging VMAs, VMA flags must match, unless they are
+ * 'sticky'. If any sticky flags exist in either VMA, we simply
+ * set all of them on the merged VMA.
+ */
+#define VM_IGNORE_MERGE VM_STICKY
+
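A sketch of how the two masks would interact in a merge check; the predicate below is hypothetical, with the real logic living in the VMA merge code:

	/* Hypothetical illustration of the merge rules described above. */
	static bool example_flags_mergeable(vm_flags_t a, vm_flags_t b)
	{
		/* Flags must match once sticky flags are ignored... */
		return !((a ^ b) & ~VM_IGNORE_MERGE);
	}
	/* ...and any sticky bits present in either VMA are then OR-ed into
	 * the merged VMA: merged = a | ((a | b) & VM_STICKY);
	 */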
+/*
+ * Flags which should result in page tables being copied on fork. These are
+ * flags which indicate that the VMA maps page tables which cannot be
+ * reconstructed upon page fault, and so necessitate page table copying on
+ * fork:
+ *
+ * VM_PFNMAP / VM_MIXEDMAP - These contain kernel-mapped data which cannot be
+ * reasonably reconstructed on page fault.
+ *
+ * VM_UFFD_WP - Encodes metadata about an installed uffd
+ * write protect handler, which cannot be
+ * reconstructed on page fault.
+ *
+ *                  We always copy page tables when dst_vma has uffd-wp
+ *                  enabled, even if it is file-backed (e.g. shmem), because
+ *                  with uffd-wp enabled the page tables contain uffd-wp
+ *                  protection information which cannot be retrieved from
+ *                  the page cache; skipping the copy would lose it.
+ *
+ * VM_MAYBE_GUARD - Could contain page guard region markers which
+ * by design are a property of the page tables
+ * only and thus cannot be reconstructed on page
+ * fault.
+ */
+#define VM_COPY_ON_FORK (VM_PFNMAP | VM_MIXEDMAP | VM_UFFD_WP | VM_MAYBE_GUARD)
+
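A sketch of how this mask would be consumed at fork time; the helper below is hypothetical (the real decision also weighs, e.g., anon_vma presence):

	/* Hypothetical illustration: must page tables be copied eagerly? */
	static bool example_needs_pgtable_copy(const struct vm_area_struct *vma)
	{
		return !!(vma->vm_flags & VM_COPY_ON_FORK);
	}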
+/*
* mapping from the currently active vm_flags protection bits (the
* low four bits) to a page protection mask..
*/
@@ -783,7 +910,9 @@ static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm)
static inline void vm_flags_init(struct vm_area_struct *vma,
vm_flags_t flags)
{
- ACCESS_PRIVATE(vma, __vm_flags) = flags;
+ VM_WARN_ON_ONCE(!pgtable_supports_soft_dirty() && (flags & VM_SOFTDIRTY));
+ vma_flags_clear_all(&vma->flags);
+ vma_flags_overwrite_word(&vma->flags, flags);
}
/*
@@ -794,6 +923,7 @@ static inline void vm_flags_init(struct vm_area_struct *vma,
static inline void vm_flags_reset(struct vm_area_struct *vma,
vm_flags_t flags)
{
+ VM_WARN_ON_ONCE(!pgtable_supports_soft_dirty() && (flags & VM_SOFTDIRTY));
vma_assert_write_locked(vma);
vm_flags_init(vma, flags);
}
@@ -802,21 +932,33 @@ static inline void vm_flags_reset_once(struct vm_area_struct *vma,
vm_flags_t flags)
{
vma_assert_write_locked(vma);
- WRITE_ONCE(ACCESS_PRIVATE(vma, __vm_flags), flags);
+ /*
+ * If VMA flags exist beyond the first system word, also clear these. It
+ * is assumed the write once behaviour is required only for the first
+ * system word.
+ */
+ if (NUM_VMA_FLAG_BITS > BITS_PER_LONG) {
+ unsigned long *bitmap = ACCESS_PRIVATE(&vma->flags, __vma_flags);
+
+ bitmap_zero(&bitmap[1], NUM_VMA_FLAG_BITS - BITS_PER_LONG);
+ }
+
+ vma_flags_overwrite_word_once(&vma->flags, flags);
}
static inline void vm_flags_set(struct vm_area_struct *vma,
vm_flags_t flags)
{
vma_start_write(vma);
- ACCESS_PRIVATE(vma, __vm_flags) |= flags;
+ vma_flags_set_word(&vma->flags, flags);
}
static inline void vm_flags_clear(struct vm_area_struct *vma,
vm_flags_t flags)
{
+ VM_WARN_ON_ONCE(!pgtable_supports_soft_dirty() && (flags & VM_SOFTDIRTY));
vma_start_write(vma);
- ACCESS_PRIVATE(vma, __vm_flags) &= ~flags;
+ vma_flags_clear_word(&vma->flags, flags);
}
/*
@@ -840,6 +982,51 @@ static inline void vm_flags_mod(struct vm_area_struct *vma,
__vm_flags_mod(vma, set, clear);
}
+static inline bool __vma_flag_atomic_valid(struct vm_area_struct *vma,
+ vma_flag_t bit)
+{
+ const vm_flags_t mask = BIT((__force int)bit);
+
+ /* Only specific flags are permitted */
+ if (WARN_ON_ONCE(!(mask & VM_ATOMIC_SET_ALLOWED)))
+ return false;
+
+ return true;
+}
+
+/*
+ * Set VMA flag atomically. Requires only VMA/mmap read lock. Only specific
+ * valid flags are allowed to do this.
+ */
+static inline void vma_flag_set_atomic(struct vm_area_struct *vma,
+ vma_flag_t bit)
+{
+ unsigned long *bitmap = ACCESS_PRIVATE(&vma->flags, __vma_flags);
+
+ /* mmap read lock/VMA read lock must be held. */
+ if (!rwsem_is_locked(&vma->vm_mm->mmap_lock))
+ vma_assert_locked(vma);
+
+ if (__vma_flag_atomic_valid(vma, bit))
+ set_bit((__force int)bit, bitmap);
+}
+
+/*
+ * Test for VMA flag atomically. Requires no locks. Only specific valid flags
+ * are allowed to do this.
+ *
+ * This is necessarily racy, so callers must ensure that serialisation is
+ * achieved through some other means, or that races are permissible.
+ */
+static inline bool vma_flag_test_atomic(struct vm_area_struct *vma,
+ vma_flag_t bit)
+{
+ if (__vma_flag_atomic_valid(vma, bit))
+ return test_bit((__force int)bit, &vma->vm_flags);
+
+ return false;
+}
+
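A usage sketch for the atomic helpers; the caller below is hypothetical, relying on VMA_MAYBE_GUARD_BIT being in VM_ATOMIC_SET_ALLOWED:

	/* Hypothetical caller: note that this VMA may now hold guard markers. */
	static void example_note_guard_region(struct vm_area_struct *vma)
	{
		/* The mmap or per-VMA read lock is sufficient for this bit. */
		vma_flag_set_atomic(vma, VMA_MAYBE_GUARD_BIT);
	}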
static inline void vma_set_anonymous(struct vm_area_struct *vma)
{
vma->vm_ops = NULL;
@@ -2438,7 +2625,7 @@ static inline void zap_vma_pages(struct vm_area_struct *vma)
}
void unmap_vmas(struct mmu_gather *tlb, struct ma_state *mas,
struct vm_area_struct *start_vma, unsigned long start,
- unsigned long end, unsigned long tree_end, bool mm_wr_locked);
+ unsigned long end, unsigned long tree_end);
struct mmu_notifier_range;
@@ -2922,6 +3109,7 @@ static inline pmd_t *pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long a
#endif /* CONFIG_MMU */
enum pt_flags {
+ PT_kernel = PG_referenced,
PT_reserved = PG_reserved,
/* High bits are used for zone/node/section */
};
@@ -2948,6 +3136,46 @@ static inline bool pagetable_is_reserved(struct ptdesc *pt)
}
/**
+ * ptdesc_set_kernel - Mark a ptdesc as used to map the kernel
+ * @ptdesc: The ptdesc to be marked
+ *
+ * Kernel page tables often need special handling. Set a flag so that
+ * the handling code knows this ptdesc will not be used for userspace.
+ */
+static inline void ptdesc_set_kernel(struct ptdesc *ptdesc)
+{
+ set_bit(PT_kernel, &ptdesc->pt_flags.f);
+}
+
+/**
+ * ptdesc_clear_kernel - Mark a ptdesc as no longer used to map the kernel
+ * @ptdesc: The ptdesc to be unmarked
+ *
+ * Use when the ptdesc is no longer used to map the kernel and no longer
+ * needs special handling.
+ */
+static inline void ptdesc_clear_kernel(struct ptdesc *ptdesc)
+{
+ /*
+ * Note: the 'PG_referenced' bit does not strictly need to be
+ * cleared before freeing the page. But this is nice for
+ * symmetry.
+ */
+ clear_bit(PT_kernel, &ptdesc->pt_flags.f);
+}
+
+/**
+ * ptdesc_test_kernel - Check if a ptdesc is used to map the kernel
+ * @ptdesc: The ptdesc being tested
+ *
+ * Call to tell if the ptdesc is used to map the kernel.
+ */
+static inline bool ptdesc_test_kernel(const struct ptdesc *ptdesc)
+{
+ return test_bit(PT_kernel, &ptdesc->pt_flags.f);
+}
+
+/**
* pagetable_alloc - Allocate pagetables
* @gfp: GFP flags
* @order: desired pagetable order
@@ -2965,6 +3193,21 @@ static inline struct ptdesc *pagetable_alloc_noprof(gfp_t gfp, unsigned int orde
}
#define pagetable_alloc(...) alloc_hooks(pagetable_alloc_noprof(__VA_ARGS__))
+static inline void __pagetable_free(struct ptdesc *pt)
+{
+ struct page *page = ptdesc_page(pt);
+
+ __free_pages(page, compound_order(page));
+}
+
+#ifdef CONFIG_ASYNC_KERNEL_PGTABLE_FREE
+void pagetable_free_kernel(struct ptdesc *pt);
+#else
+static inline void pagetable_free_kernel(struct ptdesc *pt)
+{
+ __pagetable_free(pt);
+}
+#endif
/**
* pagetable_free - Free pagetables
* @pt: The page table descriptor
@@ -2974,9 +3217,12 @@ static inline struct ptdesc *pagetable_alloc_noprof(gfp_t gfp, unsigned int orde
*/
static inline void pagetable_free(struct ptdesc *pt)
{
- struct page *page = ptdesc_page(pt);
-
- __free_pages(page, compound_order(page));
+ if (ptdesc_test_kernel(pt)) {
+ ptdesc_clear_kernel(pt);
+ pagetable_free_kernel(pt);
+ } else {
+ __pagetable_free(pt);
+ }
}
#if defined(CONFIG_SPLIT_PTE_PTLOCKS)
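A lifecycle sketch tying the ptdesc helpers together (illustrative; GFP flags and error handling abbreviated). Marking the ptdesc routes its release through pagetable_free_kernel(), which may defer the free when CONFIG_ASYNC_KERNEL_PGTABLE_FREE is set:

	/* Illustrative only: */
	struct ptdesc *pt = pagetable_alloc(GFP_KERNEL | __GFP_ZERO, 0);

	if (pt) {
		ptdesc_set_kernel(pt);	/* never handed to userspace */
		/* ... use as a kernel page table, then tear down ... */
		pagetable_free(pt);	/* routed via pagetable_free_kernel() */
	}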
@@ -3560,6 +3806,90 @@ static inline unsigned long vma_pages(const struct vm_area_struct *vma)
return (vma->vm_end - vma->vm_start) >> PAGE_SHIFT;
}
+static inline unsigned long vma_desc_size(const struct vm_area_desc *desc)
+{
+ return desc->end - desc->start;
+}
+
+static inline unsigned long vma_desc_pages(const struct vm_area_desc *desc)
+{
+ return vma_desc_size(desc) >> PAGE_SHIFT;
+}
+
+/**
+ * mmap_action_remap - helper for mmap_prepare hook to specify that a pure PFN
+ * remap is required.
+ * @desc: The VMA descriptor for the VMA requiring remap.
+ * @start: The virtual address to start the remap from, must be within the VMA.
+ * @start_pfn: The first PFN in the range to remap.
+ * @size: The size of the range to remap, in bytes, at most spanning to the end
+ * of the VMA.
+ */
+static inline void mmap_action_remap(struct vm_area_desc *desc,
+ unsigned long start,
+ unsigned long start_pfn,
+ unsigned long size)
+{
+ struct mmap_action *action = &desc->action;
+
+ /* [start, start + size) must be within the VMA. */
+ WARN_ON_ONCE(start < desc->start || start >= desc->end);
+ WARN_ON_ONCE(start + size > desc->end);
+
+ action->type = MMAP_REMAP_PFN;
+ action->remap.start = start;
+ action->remap.start_pfn = start_pfn;
+ action->remap.size = size;
+ action->remap.pgprot = desc->page_prot;
+}
+
+/**
+ * mmap_action_remap_full - helper for mmap_prepare hook to specify that the
+ * entirety of a VMA should be PFN remapped.
+ * @desc: The VMA descriptor for the VMA requiring remap.
+ * @start_pfn: The first PFN in the range to remap.
+ */
+static inline void mmap_action_remap_full(struct vm_area_desc *desc,
+ unsigned long start_pfn)
+{
+ mmap_action_remap(desc, desc->start, start_pfn, vma_desc_size(desc));
+}
+
+/**
+ * mmap_action_ioremap - helper for mmap_prepare hook to specify that a pure PFN
+ * I/O remap is required.
+ * @desc: The VMA descriptor for the VMA requiring remap.
+ * @start: The virtual address to start the remap from, must be within the VMA.
+ * @start_pfn: The first PFN in the range to remap.
+ * @size: The size of the range to remap, in bytes, at most spanning to the end
+ * of the VMA.
+ */
+static inline void mmap_action_ioremap(struct vm_area_desc *desc,
+ unsigned long start,
+ unsigned long start_pfn,
+ unsigned long size)
+{
+ mmap_action_remap(desc, start, start_pfn, size);
+ desc->action.type = MMAP_IO_REMAP_PFN;
+}
+
+/**
+ * mmap_action_ioremap_full - helper for mmap_prepare hook to specify that the
+ * entirety of a VMA should be PFN I/O remapped.
+ * @desc: The VMA descriptor for the VMA requiring remap.
+ * @start_pfn: The first PFN in the range to remap.
+ */
+static inline void mmap_action_ioremap_full(struct vm_area_desc *desc,
+ unsigned long start_pfn)
+{
+ mmap_action_ioremap(desc, desc->start, start_pfn, vma_desc_size(desc));
+}
+
+void mmap_action_prepare(struct mmap_action *action,
+ struct vm_area_desc *desc);
+int mmap_action_complete(struct mmap_action *action,
+ struct vm_area_struct *vma);
+
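A sketch of a driver-side .mmap_prepare hook built on these helpers; the device structure and its fields are hypothetical:

	/* Hypothetical driver hook: */
	static int example_mmap_prepare(struct vm_area_desc *desc)
	{
		struct example_dev *dev = desc->vm_file->private_data;

		if (vma_desc_size(desc) > dev->region_size)
			return -EINVAL;

		/* Ask the core to PFN-remap the whole VMA once it is set up. */
		mmap_action_remap_full(desc, dev->base_pfn);
		return 0;
	}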
/* Look up the first VMA which exactly match the interval vm_start ... vm_end */
static inline struct vm_area_struct *find_exact_vma(struct mm_struct *mm,
unsigned long vm_start, unsigned long vm_end)
@@ -3601,10 +3931,9 @@ unsigned long change_prot_numa(struct vm_area_struct *vma,
struct vm_area_struct *find_extend_vma_locked(struct mm_struct *,
unsigned long addr);
-int remap_pfn_range(struct vm_area_struct *, unsigned long addr,
- unsigned long pfn, unsigned long size, pgprot_t);
-int remap_pfn_range_notrack(struct vm_area_struct *vma, unsigned long addr,
- unsigned long pfn, unsigned long size, pgprot_t prot);
+int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
+ unsigned long pfn, unsigned long size, pgprot_t pgprot);
+
int vm_insert_page(struct vm_area_struct *, unsigned long addr, struct page *);
int vm_insert_pages(struct vm_area_struct *vma, unsigned long addr,
struct page **pages, unsigned long *num);
@@ -3637,15 +3966,24 @@ static inline vm_fault_t vmf_insert_page(struct vm_area_struct *vma,
return VM_FAULT_NOPAGE;
}
-#ifndef io_remap_pfn_range
-static inline int io_remap_pfn_range(struct vm_area_struct *vma,
- unsigned long addr, unsigned long pfn,
- unsigned long size, pgprot_t prot)
+#ifndef io_remap_pfn_range_pfn
+static inline unsigned long io_remap_pfn_range_pfn(unsigned long pfn,
+ unsigned long size)
{
- return remap_pfn_range(vma, addr, pfn, size, pgprot_decrypted(prot));
+ return pfn;
}
#endif
+static inline int io_remap_pfn_range(struct vm_area_struct *vma,
+ unsigned long addr, unsigned long orig_pfn,
+ unsigned long size, pgprot_t orig_prot)
+{
+ const unsigned long pfn = io_remap_pfn_range_pfn(orig_pfn, size);
+ const pgprot_t prot = pgprot_decrypted(orig_prot);
+
+ return remap_pfn_range(vma, addr, pfn, size, prot);
+}
+
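A sketch of the arch-side hook this split enables; the override below is hypothetical (the generic fallback above is the identity mapping):

	/* Hypothetical arch override, provided ahead of this header: */
	#define io_remap_pfn_range_pfn io_remap_pfn_range_pfn
	static inline unsigned long io_remap_pfn_range_pfn(unsigned long pfn,
							   unsigned long size)
	{
		/* EXAMPLE_IOSPACE_PFN_BASE is a made-up translation. */
		return pfn | EXAMPLE_IOSPACE_PFN_BASE;
	}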
static inline vm_fault_t vmf_error(int err)
{
if (err == -ENOMEM)
@@ -4094,6 +4432,7 @@ enum mf_action_page_type {
MF_MSG_DAX,
MF_MSG_UNSPLIT_THP,
MF_MSG_ALREADY_POISONED,
+ MF_MSG_PFN_MAP,
MF_MSG_UNKNOWN,
};
@@ -4222,16 +4561,6 @@ int arch_get_shadow_stack_status(struct task_struct *t, unsigned long __user *st
int arch_set_shadow_stack_status(struct task_struct *t, unsigned long status);
int arch_lock_shadow_stack_status(struct task_struct *t, unsigned long status);
-
-/*
- * mseal of userspace process's system mappings.
- */
-#ifdef CONFIG_MSEAL_SYSTEM_MAPPINGS
-#define VM_SEALED_SYSMAP VM_SEALED
-#else
-#define VM_SEALED_SYSMAP VM_NONE
-#endif
-
/*
* DMA mapping IDs for page_pool
*
diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h
index f6a2b2d20016..fa2d6ba811b5 100644
--- a/include/linux/mm_inline.h
+++ b/include/linux/mm_inline.h
@@ -8,7 +8,7 @@
#include <linux/swap.h>
#include <linux/string.h>
#include <linux/userfaultfd_k.h>
-#include <linux/swapops.h>
+#include <linux/leafops.h>
/**
* folio_is_file_lru - Should the folio be on a file LRU or anon LRU?
@@ -44,7 +44,7 @@ static __always_inline void __update_lru_size(struct lruvec *lruvec,
lockdep_assert_held(&lruvec->lru_lock);
WARN_ON_ONCE(nr_pages != (int)nr_pages);
- __mod_lruvec_state(lruvec, NR_LRU_BASE + lru, nr_pages);
+ mod_lruvec_state(lruvec, NR_LRU_BASE + lru, nr_pages);
__mod_zone_page_state(&pgdat->node_zones[zid],
NR_ZONE_LRU_BASE + lru, nr_pages);
}
@@ -541,9 +541,9 @@ static inline bool mm_tlb_flush_nested(const struct mm_struct *mm)
* The caller should insert a new pte created with make_pte_marker().
*/
static inline pte_marker copy_pte_marker(
- swp_entry_t entry, struct vm_area_struct *dst_vma)
+ softleaf_t entry, struct vm_area_struct *dst_vma)
{
- pte_marker srcm = pte_marker_get(entry);
+ const pte_marker srcm = softleaf_to_marker(entry);
/* Always copy error entries. */
pte_marker dstm = srcm & (PTE_MARKER_POISONED | PTE_MARKER_GUARD);
@@ -553,7 +553,6 @@ static inline pte_marker copy_pte_marker(
return dstm;
}
-#endif
/*
* If this pte is wr-protected by uffd-wp in any form, arm the special pte to
@@ -571,9 +570,11 @@ static inline bool
pte_install_uffd_wp_if_needed(struct vm_area_struct *vma, unsigned long addr,
pte_t *pte, pte_t pteval)
{
-#ifdef CONFIG_PTE_MARKER_UFFD_WP
bool arm_uffd_pte = false;
+ if (!uffd_supports_wp_marker())
+ return false;
+
/* The current status of the pte should be "cleared" before calling */
WARN_ON_ONCE(!pte_none(ptep_get(pte)));
@@ -602,7 +603,7 @@ pte_install_uffd_wp_if_needed(struct vm_area_struct *vma, unsigned long addr,
make_pte_marker(PTE_MARKER_UFFD_WP));
return true;
}
-#endif
+
return false;
}
@@ -616,6 +617,7 @@ static inline bool vma_has_recency(const struct vm_area_struct *vma)
return true;
}
+#endif
/**
* num_pages_contiguous() - determine the number of contiguous pages
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 3b7d05e7169c..9f6de068295d 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -286,6 +286,31 @@ typedef struct {
unsigned long val;
} swp_entry_t;
+/**
+ * typedef softleaf_t - Describes a page table software leaf entry, abstracted
+ * from its architecture-specific encoding.
+ *
+ * Page table leaf entries are those which do not reference any descendant page
+ * tables, but rather reference a data page, are empty (or 'none') entries, or
+ * contain a non-present entry.
+ *
+ * If the entry references another page table or a data page, it is pertinent
+ * to hardware - that is, it tells the hardware how to decode the page table
+ * entry.
+ *
+ * Otherwise it is a software-defined leaf page table entry, which this type
+ * describes. See leafops.h and specifically @softleaf_type for a list of all
+ * possible kinds of software leaf entry.
+ *
+ * A softleaf_t entry is abstracted from the hardware page table entry, so is
+ * not architecture-specific.
+ *
+ * NOTE: While we transition from the confusing swp_entry_t type used for this
+ *       purpose, we simply alias that type; the alias will be removed once
+ *       the transition is complete.
+ */
+typedef swp_entry_t softleaf_t;
+
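A conceptual sketch of how the alias is meant to be consumed; the accessors are assumed from leafops.h (outside this diff) and the handler is hypothetical:

	/* Illustrative only: classify a non-present PTE. */
	softleaf_t leaf = softleaf_from_pte(ptep_get(ptep));	/* assumed accessor */

	if (softleaf_is_marker(leaf))				/* assumed accessor */
		example_handle_marker(softleaf_to_marker(leaf));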
#if defined(CONFIG_MEMCG) || defined(CONFIG_SLAB_OBJ_EXT)
/* We have some extra room after the refcount in tail pages. */
#define NR_PAGES_IN_LARGE_FOLIO
@@ -774,6 +799,65 @@ struct pfnmap_track_ctx {
};
#endif
+/* What action should be taken after an .mmap_prepare call is complete? */
+enum mmap_action_type {
+ MMAP_NOTHING, /* Mapping is complete, no further action. */
+ MMAP_REMAP_PFN, /* Remap PFN range. */
+ MMAP_IO_REMAP_PFN, /* I/O remap PFN range. */
+};
+
+/*
+ * Describes an action an mmap_prepare hook can request be taken to complete
+ * the mapping of a VMA. Specified in vm_area_desc.
+ */
+struct mmap_action {
+ union {
+ /* Remap range. */
+ struct {
+ unsigned long start;
+ unsigned long start_pfn;
+ unsigned long size;
+ pgprot_t pgprot;
+ } remap;
+ };
+ enum mmap_action_type type;
+
+ /*
+ * If specified, this hook is invoked after the selected action has been
+	 * successfully completed. Note that the VMA write lock is still held.
+ *
+ * The absolute minimum ought to be done here.
+ *
+ * Returns 0 on success, or an error code.
+ */
+ int (*success_hook)(const struct vm_area_struct *vma);
+
+ /*
+	 * If specified, this hook is invoked when an error occurs while
+	 * attempting the selected action.
+ *
+ * The hook can return an error code in order to filter the error, but
+ * it is not valid to clear the error here.
+ */
+ int (*error_hook)(int err);
+
+ /*
+	 * This should be set in rare instances where the operation requires
+	 * that rmap not be able to access the VMA until it is completely
+	 * set up.
+ */
+ bool hide_from_rmap_until_complete :1;
+};
+
+/*
+ * Opaque type representing current VMA (vm_area_struct) flag state. Must be
+ * accessed via vma_flags_xxx() helper functions.
+ */
+#define NUM_VMA_FLAG_BITS BITS_PER_LONG
+typedef struct {
+ DECLARE_BITMAP(__vma_flags, NUM_VMA_FLAG_BITS);
+} __private vma_flags_t;
+
/*
* Describes a VMA that is about to be mmap()'ed. Drivers may choose to
* manipulate mutable fields which will cause those fields to be updated in the
@@ -791,12 +875,18 @@ struct vm_area_desc {
/* Mutable fields. Populated with initial state. */
pgoff_t pgoff;
struct file *vm_file;
- vm_flags_t vm_flags;
+ union {
+ vm_flags_t vm_flags;
+ vma_flags_t vma_flags;
+ };
pgprot_t page_prot;
/* Write-only fields. */
const struct vm_operations_struct *vm_ops;
void *private_data;
+
+ /* Take further action? */
+ struct mmap_action action;
};
/*
@@ -833,10 +923,12 @@ struct vm_area_struct {
/*
* Flags, see mm.h.
* To modify use vm_flags_{init|reset|set|clear|mod} functions.
+ * Preferably, use vma_flags_xxx() functions.
*/
union {
+ /* Temporary while VMA flags are being converted. */
const vm_flags_t vm_flags;
- vm_flags_t __private __vm_flags;
+ vma_flags_t flags;
};
#ifdef CONFIG_PER_VMA_LOCK
@@ -917,6 +1009,52 @@ struct vm_area_struct {
#endif
} __randomize_layout;
+/* Clears all bits in the VMA flags bitmap, non-atomically. */
+static inline void vma_flags_clear_all(vma_flags_t *flags)
+{
+ bitmap_zero(ACCESS_PRIVATE(flags, __vma_flags), NUM_VMA_FLAG_BITS);
+}
+
+/*
+ * Copy value to the first system word of VMA flags, non-atomically.
+ *
+ * IMPORTANT: This does not overwrite bytes past the first system word. The
+ * caller must account for this.
+ */
+static inline void vma_flags_overwrite_word(vma_flags_t *flags, unsigned long value)
+{
+ *ACCESS_PRIVATE(flags, __vma_flags) = value;
+}
+
+/*
+ * Copy value to the first system word of VMA flags ONCE, non-atomically.
+ *
+ * IMPORTANT: This does not overwrite bytes past the first system word. The
+ * caller must account for this.
+ */
+static inline void vma_flags_overwrite_word_once(vma_flags_t *flags, unsigned long value)
+{
+ unsigned long *bitmap = ACCESS_PRIVATE(flags, __vma_flags);
+
+ WRITE_ONCE(*bitmap, value);
+}
+
+/* Update the first system word of VMA flags setting bits, non-atomically. */
+static inline void vma_flags_set_word(vma_flags_t *flags, unsigned long value)
+{
+ unsigned long *bitmap = ACCESS_PRIVATE(flags, __vma_flags);
+
+ *bitmap |= value;
+}
+
+/* Update the first system word of VMA flags clearing bits, non-atomically. */
+static inline void vma_flags_clear_word(vma_flags_t *flags, unsigned long value)
+{
+ unsigned long *bitmap = ACCESS_PRIVATE(flags, __vma_flags);
+
+ *bitmap &= ~value;
+}
+
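A sketch of how the word helpers compose, mirroring vm_flags_init() in mm.h and assuming a configuration where NUM_VMA_FLAG_BITS == BITS_PER_LONG, so the first word covers the whole bitmap:

	/* Illustrative only: */
	vma_flags_t flags;

	vma_flags_clear_all(&flags);			/* zero every bit */
	vma_flags_overwrite_word(&flags, VM_READ | VM_MAYREAD);
	vma_flags_set_word(&flags, VM_WRITE);		/* OR bits in */
	vma_flags_clear_word(&flags, VM_MAYREAD);	/* mask bits out */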
#ifdef CONFIG_NUMA
#define vma_policy(vma) ((vma)->vm_policy)
#else
@@ -1194,15 +1332,13 @@ struct mm_struct {
unsigned long cpu_bitmap[];
};
-/* Set the first system word of mm flags, non-atomically. */
-static inline void __mm_flags_set_word(struct mm_struct *mm, unsigned long value)
+/* Copy value to the first system word of mm flags, non-atomically. */
+static inline void __mm_flags_overwrite_word(struct mm_struct *mm, unsigned long value)
{
- unsigned long *bitmap = ACCESS_PRIVATE(&mm->flags, __mm_flags);
-
- bitmap_copy(bitmap, &value, BITS_PER_LONG);
+ *ACCESS_PRIVATE(&mm->flags, __mm_flags) = value;
}
-/* Obtain a read-only view of the bitmap. */
+/* Obtain a read-only view of the mm flags bitmap. */
static inline const unsigned long *__mm_flags_get_bitmap(const struct mm_struct *mm)
{
return (const unsigned long *)ACCESS_PRIVATE(&mm->flags, __mm_flags);
@@ -1211,9 +1347,7 @@ static inline const unsigned long *__mm_flags_get_bitmap(const struct mm_struct
/* Read the first system word of mm flags, non-atomically. */
static inline unsigned long __mm_flags_get_word(const struct mm_struct *mm)
{
- const unsigned long *bitmap = __mm_flags_get_bitmap(mm);
-
- return bitmap_read(bitmap, 0, BITS_PER_LONG);
+ return *__mm_flags_get_bitmap(mm);
}
/*
diff --git a/include/linux/mmap_lock.h b/include/linux/mmap_lock.h
index 2c9fffa58714..d53f72dba7fe 100644
--- a/include/linux/mmap_lock.h
+++ b/include/linux/mmap_lock.h
@@ -130,7 +130,7 @@ static inline bool is_vma_writer_only(int refcnt)
* a detached vma happens only in vma_mark_detached() and is a rare
* case, therefore most of the time there will be no unnecessary wakeup.
*/
- return refcnt & VMA_LOCK_OFFSET && refcnt <= VMA_LOCK_OFFSET + 1;
+ return (refcnt & VMA_LOCK_OFFSET) && refcnt <= VMA_LOCK_OFFSET + 1;
}
static inline void vma_refcount_put(struct vm_area_struct *vma)
@@ -183,7 +183,7 @@ static inline void vma_end_read(struct vm_area_struct *vma)
}
/* WARNING! Can only be used if mmap_lock is expected to be write-locked */
-static bool __is_vma_write_locked(struct vm_area_struct *vma, unsigned int *mm_lock_seq)
+static inline bool __is_vma_write_locked(struct vm_area_struct *vma, unsigned int *mm_lock_seq)
{
mmap_assert_write_locked(vma->vm_mm);
@@ -195,7 +195,8 @@ static bool __is_vma_write_locked(struct vm_area_struct *vma, unsigned int *mm_l
return (vma->vm_lock_seq == *mm_lock_seq);
}
-void __vma_start_write(struct vm_area_struct *vma, unsigned int mm_lock_seq);
+int __vma_start_write(struct vm_area_struct *vma, unsigned int mm_lock_seq,
+ int state);
/*
* Begin writing to a VMA.
@@ -209,7 +210,30 @@ static inline void vma_start_write(struct vm_area_struct *vma)
if (__is_vma_write_locked(vma, &mm_lock_seq))
return;
- __vma_start_write(vma, mm_lock_seq);
+ __vma_start_write(vma, mm_lock_seq, TASK_UNINTERRUPTIBLE);
+}
+
+/**
+ * vma_start_write_killable - Begin writing to a VMA.
+ * @vma: The VMA we are going to modify.
+ *
+ * Exclude concurrent readers under the per-VMA lock until the currently
+ * write-locked mmap_lock is dropped or downgraded.
+ *
+ * Context: May sleep while waiting for readers to drop the vma read lock.
+ * Caller must already hold the mmap_lock for write.
+ *
+ * Return: 0 for a successful acquisition. -EINTR if a fatal signal was
+ * received.
+ */
+static inline __must_check
+int vma_start_write_killable(struct vm_area_struct *vma)
+{
+ unsigned int mm_lock_seq;
+
+ if (__is_vma_write_locked(vma, &mm_lock_seq))
+ return 0;
+ return __vma_start_write(vma, mm_lock_seq, TASK_KILLABLE);
}
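A usage sketch; the surrounding context is hypothetical, but the pattern follows the kernel-doc above (mmap_lock held for write, bail out on -EINTR):

	/* Hypothetical caller: */
	int err = vma_start_write_killable(vma);

	if (err)
		return err;	/* fatal signal pending; abort the operation */
	/* ... safe to modify the VMA until the mmap write lock is dropped ... */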
static inline void vma_assert_write_locked(struct vm_area_struct *vma)
@@ -281,11 +305,10 @@ static inline bool mmap_lock_speculate_retry(struct mm_struct *mm, unsigned int
return true;
}
static inline void vma_lock_init(struct vm_area_struct *vma, bool reset_refcnt) {}
-static inline struct vm_area_struct *vma_start_read(struct mm_struct *mm,
- struct vm_area_struct *vma)
- { return NULL; }
static inline void vma_end_read(struct vm_area_struct *vma) {}
static inline void vma_start_write(struct vm_area_struct *vma) {}
+static inline __must_check
+int vma_start_write_killable(struct vm_area_struct *vma) { return 0; }
static inline void vma_assert_write_locked(struct vm_area_struct *vma)
{ mmap_assert_write_locked(vma->vm_mm); }
static inline void vma_assert_attached(struct vm_area_struct *vma) {}
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 7fb7331c5725..4398e027f450 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -1060,10 +1060,6 @@ struct zone {
} ____cacheline_internodealigned_in_smp;
enum pgdat_flags {
- PGDAT_DIRTY, /* reclaim scanning has recently found
- * many dirty file pages at the tail
- * of the LRU.
- */
PGDAT_WRITEBACK, /* reclaim scanning has recently found
* many pages under writeback
*/
diff --git a/include/linux/node.h b/include/linux/node.h
index 866e3323f1fd..0269b064ba65 100644
--- a/include/linux/node.h
+++ b/include/linux/node.h
@@ -132,8 +132,6 @@ static inline void register_memory_blocks_under_nodes(void)
}
#endif
-extern void unregister_node(struct node *node);
-
struct node_notify {
int nid;
};
@@ -176,8 +174,8 @@ static inline int hotplug_node_notifier(notifier_fn_t fn, int pri)
#ifdef CONFIG_NUMA
extern void node_dev_init(void);
/* Core of the node registration - only memory hotplug should use this */
-extern int register_one_node(int nid);
-extern void unregister_one_node(int nid);
+int register_node(int nid);
+void unregister_node(int nid);
extern int register_cpu_under_node(unsigned int cpu, unsigned int nid);
extern int unregister_cpu_under_node(unsigned int cpu, unsigned int nid);
extern void unregister_memory_block_under_nodes(struct memory_block *mem_blk);
@@ -189,11 +187,11 @@ extern int register_memory_node_under_compute_node(unsigned int mem_nid,
static inline void node_dev_init(void)
{
}
-static inline int register_one_node(int nid)
+static inline int register_node(int nid)
{
return 0;
}
-static inline int unregister_one_node(int nid)
+static inline int unregister_node(int nid)
{
return 0;
}
diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
index ee3148ef87f6..652f287c1ef6 100644
--- a/include/linux/pgtable.h
+++ b/include/linux/pgtable.h
@@ -1557,6 +1557,18 @@ static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot)
#define arch_start_context_switch(prev) do {} while (0)
#endif
+/*
+ * Some platforms can customize the PTE soft-dirty bit, making it unavailable
+ * even when the architecture provides it. This hook allows architectures to
+ * add their own checks for the platform on which the kernel is running.
+ * Note: when overriding it, please make sure CONFIG_MEM_SOFT_DIRTY remains
+ * part of this macro's check.
+ */
+#ifndef pgtable_supports_soft_dirty
+#define pgtable_supports_soft_dirty() IS_ENABLED(CONFIG_MEM_SOFT_DIRTY)
+#endif
+
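A sketch of an arch override under the rule stated above; the platform predicate is hypothetical, and CONFIG_MEM_SOFT_DIRTY deliberately remains part of the check:

	/* Hypothetical override in an arch's <asm/pgtable.h>: */
	#define pgtable_supports_soft_dirty()			\
		(IS_ENABLED(CONFIG_MEM_SOFT_DIRTY) &&		\
		 example_platform_has_soft_dirty())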
#ifdef CONFIG_HAVE_ARCH_SOFT_DIRTY
#ifndef CONFIG_ARCH_ENABLE_THP_MIGRATION
static inline pmd_t pmd_swp_mksoft_dirty(pmd_t pmd)
diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h
index 0232d983b715..0e1d73955fa5 100644
--- a/include/linux/sched/mm.h
+++ b/include/linux/sched/mm.h
@@ -189,12 +189,11 @@ arch_get_unmapped_area_topdown(struct file *filp, unsigned long addr,
unsigned long len, unsigned long pgoff,
unsigned long flags, vm_flags_t);
-unsigned long mm_get_unmapped_area(struct mm_struct *mm, struct file *filp,
- unsigned long addr, unsigned long len,
- unsigned long pgoff, unsigned long flags);
+unsigned long mm_get_unmapped_area(struct file *filp, unsigned long addr,
+ unsigned long len, unsigned long pgoff,
+ unsigned long flags);
-unsigned long mm_get_unmapped_area_vmflags(struct mm_struct *mm,
- struct file *filp,
+unsigned long mm_get_unmapped_area_vmflags(struct file *filp,
unsigned long addr,
unsigned long len,
unsigned long pgoff,
@@ -318,6 +317,9 @@ static inline void might_alloc(gfp_t gfp_mask)
fs_reclaim_acquire(gfp_mask);
fs_reclaim_release(gfp_mask);
+ if (current->flags & PF_MEMALLOC)
+ return;
+
might_sleep_if(gfpflags_allow_blocking(gfp_mask));
}
diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h
index 774efe592a9a..5e4b3c1ae5c2 100644
--- a/include/linux/shmem_fs.h
+++ b/include/linux/shmem_fs.h
@@ -94,7 +94,8 @@ extern struct file *shmem_kernel_file_setup(const char *name, loff_t size,
unsigned long flags);
extern struct file *shmem_file_setup_with_mnt(struct vfsmount *mnt,
const char *name, loff_t size, unsigned long flags);
-extern int shmem_zero_setup(struct vm_area_struct *);
+int shmem_zero_setup(struct vm_area_struct *vma);
+int shmem_zero_setup_desc(struct vm_area_desc *desc);
extern unsigned long shmem_get_unmapped_area(struct file *, unsigned long addr,
unsigned long len, unsigned long pgoff, unsigned long flags);
extern int shmem_lock(struct file *file, int lock, struct ucounts *ucounts);
@@ -135,11 +136,16 @@ static inline bool shmem_hpage_pmd_enabled(void)
#ifdef CONFIG_SHMEM
extern unsigned long shmem_swap_usage(struct vm_area_struct *vma);
+extern void shmem_uncharge(struct inode *inode, long pages);
#else
static inline unsigned long shmem_swap_usage(struct vm_area_struct *vma)
{
return 0;
}
+
+static inline void shmem_uncharge(struct inode *inode, long pages)
+{
+}
#endif
extern unsigned long shmem_partial_swap_usage(struct address_space *mapping,
pgoff_t start, pgoff_t end);
@@ -193,7 +199,6 @@ static inline pgoff_t shmem_fallocend(struct inode *inode, pgoff_t eof)
}
extern bool shmem_charge(struct inode *inode, long pages);
-extern void shmem_uncharge(struct inode *inode, long pages);
#ifdef CONFIG_USERFAULTFD
#ifdef CONFIG_SHMEM
diff --git a/include/linux/swap.h b/include/linux/swap.h
index e818fbade1e2..38ca3df68716 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -301,16 +301,7 @@ struct swap_info_struct {
struct work_struct discard_work; /* discard worker */
struct work_struct reclaim_work; /* reclaim worker */
struct list_head discard_clusters; /* discard clusters list */
- struct plist_node avail_lists[]; /*
- * entries in swap_avail_heads, one
- * entry per node.
- * Must be last as the number of the
- * array is nr_node_ids, which is not
- * a fixed value so have to allocate
- * dynamically.
- * And it has to be an array so that
- * plist_for_each_* can work.
- */
+ struct plist_node avail_list; /* entry in swap_avail_head */
};
static inline swp_entry_t page_swap_entry(struct page *page)
@@ -462,7 +453,7 @@ static inline long get_nr_swap_pages(void)
}
extern void si_swapinfo(struct sysinfo *);
-int folio_alloc_swap(struct folio *folio, gfp_t gfp_mask);
+int folio_alloc_swap(struct folio *folio);
bool folio_free_swap(struct folio *folio);
void put_swap_folio(struct folio *folio, swp_entry_t entry);
extern swp_entry_t get_swap_page_of_type(int);
@@ -560,7 +551,7 @@ static inline int swp_swapcount(swp_entry_t entry)
return 0;
}
-static inline int folio_alloc_swap(struct folio *folio, gfp_t gfp_mask)
+static inline int folio_alloc_swap(struct folio *folio)
{
return -EINVAL;
}
diff --git a/include/linux/swapops.h b/include/linux/swapops.h
index 64ea151a7ae3..8cfc966eae48 100644
--- a/include/linux/swapops.h
+++ b/include/linux/swapops.h
@@ -28,7 +28,7 @@
#define SWP_OFFSET_MASK ((1UL << SWP_TYPE_SHIFT) - 1)
/*
- * Definitions only for PFN swap entries (see is_pfn_swap_entry()). To
+ * Definitions only for PFN swap entries (see softleaf_has_pfn()). To
* store PFN, we only need SWP_PFN_BITS bits. Each of the pfn swap entries
* can use the extra bits to store other information besides PFN.
*/
@@ -66,8 +66,6 @@
#define SWP_MIG_YOUNG BIT(SWP_MIG_YOUNG_BIT)
#define SWP_MIG_DIRTY BIT(SWP_MIG_DIRTY_BIT)
-static inline bool is_pfn_swap_entry(swp_entry_t entry);
-
/* Clear all flags but only keep swp_entry_t related information */
static inline pte_t pte_swp_clear_flags(pte_t pte)
{
@@ -110,36 +108,6 @@ static inline pgoff_t swp_offset(swp_entry_t entry)
}
/*
- * This should only be called upon a pfn swap entry to get the PFN stored
- * in the swap entry. Please refers to is_pfn_swap_entry() for definition
- * of pfn swap entry.
- */
-static inline unsigned long swp_offset_pfn(swp_entry_t entry)
-{
- VM_BUG_ON(!is_pfn_swap_entry(entry));
- return swp_offset(entry) & SWP_PFN_MASK;
-}
-
-/* check whether a pte points to a swap entry */
-static inline int is_swap_pte(pte_t pte)
-{
- return !pte_none(pte) && !pte_present(pte);
-}
-
-/*
- * Convert the arch-dependent pte representation of a swp_entry_t into an
- * arch-independent swp_entry_t.
- */
-static inline swp_entry_t pte_to_swp_entry(pte_t pte)
-{
- swp_entry_t arch_entry;
-
- pte = pte_swp_clear_flags(pte);
- arch_entry = __pte_to_swp_entry(pte);
- return swp_entry(__swp_type(arch_entry), __swp_offset(arch_entry));
-}
-
-/*
* Convert the arch-independent representation of a swp_entry_t into the
* arch-dependent pte representation.
*/
@@ -175,27 +143,11 @@ static inline swp_entry_t make_writable_device_private_entry(pgoff_t offset)
return swp_entry(SWP_DEVICE_WRITE, offset);
}
-static inline bool is_device_private_entry(swp_entry_t entry)
-{
- int type = swp_type(entry);
- return type == SWP_DEVICE_READ || type == SWP_DEVICE_WRITE;
-}
-
-static inline bool is_writable_device_private_entry(swp_entry_t entry)
-{
- return unlikely(swp_type(entry) == SWP_DEVICE_WRITE);
-}
-
static inline swp_entry_t make_device_exclusive_entry(pgoff_t offset)
{
return swp_entry(SWP_DEVICE_EXCLUSIVE, offset);
}
-static inline bool is_device_exclusive_entry(swp_entry_t entry)
-{
- return swp_type(entry) == SWP_DEVICE_EXCLUSIVE;
-}
-
#else /* CONFIG_DEVICE_PRIVATE */
static inline swp_entry_t make_readable_device_private_entry(pgoff_t offset)
{
@@ -207,50 +159,14 @@ static inline swp_entry_t make_writable_device_private_entry(pgoff_t offset)
return swp_entry(0, 0);
}
-static inline bool is_device_private_entry(swp_entry_t entry)
-{
- return false;
-}
-
-static inline bool is_writable_device_private_entry(swp_entry_t entry)
-{
- return false;
-}
-
static inline swp_entry_t make_device_exclusive_entry(pgoff_t offset)
{
return swp_entry(0, 0);
}
-static inline bool is_device_exclusive_entry(swp_entry_t entry)
-{
- return false;
-}
-
#endif /* CONFIG_DEVICE_PRIVATE */
#ifdef CONFIG_MIGRATION
-static inline int is_migration_entry(swp_entry_t entry)
-{
- return unlikely(swp_type(entry) == SWP_MIGRATION_READ ||
- swp_type(entry) == SWP_MIGRATION_READ_EXCLUSIVE ||
- swp_type(entry) == SWP_MIGRATION_WRITE);
-}
-
-static inline int is_writable_migration_entry(swp_entry_t entry)
-{
- return unlikely(swp_type(entry) == SWP_MIGRATION_WRITE);
-}
-
-static inline int is_readable_migration_entry(swp_entry_t entry)
-{
- return unlikely(swp_type(entry) == SWP_MIGRATION_READ);
-}
-
-static inline int is_readable_exclusive_migration_entry(swp_entry_t entry)
-{
- return unlikely(swp_type(entry) == SWP_MIGRATION_READ_EXCLUSIVE);
-}
static inline swp_entry_t make_readable_migration_entry(pgoff_t offset)
{
@@ -289,14 +205,6 @@ static inline swp_entry_t make_migration_entry_young(swp_entry_t entry)
return entry;
}
-static inline bool is_migration_entry_young(swp_entry_t entry)
-{
- if (migration_entry_supports_ad())
- return swp_offset(entry) & SWP_MIG_YOUNG;
- /* Keep the old behavior of aging page after migration */
- return false;
-}
-
static inline swp_entry_t make_migration_entry_dirty(swp_entry_t entry)
{
if (migration_entry_supports_ad())
@@ -305,14 +213,6 @@ static inline swp_entry_t make_migration_entry_dirty(swp_entry_t entry)
return entry;
}
-static inline bool is_migration_entry_dirty(swp_entry_t entry)
-{
- if (migration_entry_supports_ad())
- return swp_offset(entry) & SWP_MIG_DIRTY;
- /* Keep the old behavior of clean page after migration */
- return false;
-}
-
extern void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd,
unsigned long address);
extern void migration_entry_wait_huge(struct vm_area_struct *vma, unsigned long addr, pte_t *pte);
@@ -332,43 +232,21 @@ static inline swp_entry_t make_writable_migration_entry(pgoff_t offset)
return swp_entry(0, 0);
}
-static inline int is_migration_entry(swp_entry_t swp)
-{
- return 0;
-}
-
static inline void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd,
unsigned long address) { }
static inline void migration_entry_wait_huge(struct vm_area_struct *vma,
unsigned long addr, pte_t *pte) { }
-static inline int is_writable_migration_entry(swp_entry_t entry)
-{
- return 0;
-}
-static inline int is_readable_migration_entry(swp_entry_t entry)
-{
- return 0;
-}
static inline swp_entry_t make_migration_entry_young(swp_entry_t entry)
{
return entry;
}
-static inline bool is_migration_entry_young(swp_entry_t entry)
-{
- return false;
-}
-
static inline swp_entry_t make_migration_entry_dirty(swp_entry_t entry)
{
return entry;
}
-static inline bool is_migration_entry_dirty(swp_entry_t entry)
-{
- return false;
-}
#endif /* CONFIG_MIGRATION */
#ifdef CONFIG_MEMORY_FAILURE
@@ -426,21 +304,6 @@ static inline swp_entry_t make_pte_marker_entry(pte_marker marker)
return swp_entry(SWP_PTE_MARKER, marker);
}
-static inline bool is_pte_marker_entry(swp_entry_t entry)
-{
- return swp_type(entry) == SWP_PTE_MARKER;
-}
-
-static inline pte_marker pte_marker_get(swp_entry_t entry)
-{
- return swp_offset(entry) & PTE_MARKER_MASK;
-}
-
-static inline bool is_pte_marker(pte_t pte)
-{
- return is_swap_pte(pte) && is_pte_marker_entry(pte_to_swp_entry(pte));
-}
-
static inline pte_t make_pte_marker(pte_marker marker)
{
return swp_entry_to_pte(make_pte_marker_entry(marker));
@@ -451,83 +314,11 @@ static inline swp_entry_t make_poisoned_swp_entry(void)
return make_pte_marker_entry(PTE_MARKER_POISONED);
}
-static inline int is_poisoned_swp_entry(swp_entry_t entry)
-{
- return is_pte_marker_entry(entry) &&
- (pte_marker_get(entry) & PTE_MARKER_POISONED);
-
-}
-
static inline swp_entry_t make_guard_swp_entry(void)
{
return make_pte_marker_entry(PTE_MARKER_GUARD);
}
-static inline int is_guard_swp_entry(swp_entry_t entry)
-{
- return is_pte_marker_entry(entry) &&
- (pte_marker_get(entry) & PTE_MARKER_GUARD);
-}
-
-/*
- * This is a special version to check pte_none() just to cover the case when
- * the pte is a pte marker. It existed because in many cases the pte marker
- * should be seen as a none pte; it's just that we have stored some information
- * onto the none pte so it becomes not-none any more.
- *
- * It should be used when the pte is file-backed, ram-based and backing
- * userspace pages, like shmem. It is not needed upon pgtables that do not
- * support pte markers at all. For example, it's not needed on anonymous
- * memory, kernel-only memory (including when the system is during-boot),
- * non-ram based generic file-system. It's fine to be used even there, but the
- * extra pte marker check will be pure overhead.
- */
-static inline int pte_none_mostly(pte_t pte)
-{
- return pte_none(pte) || is_pte_marker(pte);
-}
-
-static inline struct page *pfn_swap_entry_to_page(swp_entry_t entry)
-{
- struct page *p = pfn_to_page(swp_offset_pfn(entry));
-
- /*
- * Any use of migration entries may only occur while the
- * corresponding page is locked
- */
- BUG_ON(is_migration_entry(entry) && !PageLocked(p));
-
- return p;
-}
-
-static inline struct folio *pfn_swap_entry_folio(swp_entry_t entry)
-{
- struct folio *folio = pfn_folio(swp_offset_pfn(entry));
-
- /*
- * Any use of migration entries may only occur while the
- * corresponding folio is locked
- */
- BUG_ON(is_migration_entry(entry) && !folio_test_locked(folio));
-
- return folio;
-}
-
-/*
- * A pfn swap entry is a special type of swap entry that always has a pfn stored
- * in the swap offset. They can either be used to represent unaddressable device
- * memory, to restrict access to a page undergoing migration or to represent a
- * pfn which has been hwpoisoned and unmapped.
- */
-static inline bool is_pfn_swap_entry(swp_entry_t entry)
-{
- /* Make sure the swp offset can always store the needed fields */
- BUILD_BUG_ON(SWP_TYPE_SHIFT < SWP_PFN_BITS);
-
- return is_migration_entry(entry) || is_device_private_entry(entry) ||
- is_device_exclusive_entry(entry) || is_hwpoison_entry(entry);
-}
-
struct page_vma_mapped_walk;
#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
@@ -539,18 +330,6 @@ extern void remove_migration_pmd(struct page_vma_mapped_walk *pvmw,
extern void pmd_migration_entry_wait(struct mm_struct *mm, pmd_t *pmd);
-static inline swp_entry_t pmd_to_swp_entry(pmd_t pmd)
-{
- swp_entry_t arch_entry;
-
- if (pmd_swp_soft_dirty(pmd))
- pmd = pmd_swp_clear_soft_dirty(pmd);
- if (pmd_swp_uffd_wp(pmd))
- pmd = pmd_swp_clear_uffd_wp(pmd);
- arch_entry = __pmd_to_swp_entry(pmd);
- return swp_entry(__swp_type(arch_entry), __swp_offset(arch_entry));
-}
-
static inline pmd_t swp_entry_to_pmd(swp_entry_t entry)
{
swp_entry_t arch_entry;
@@ -559,10 +338,6 @@ static inline pmd_t swp_entry_to_pmd(swp_entry_t entry)
return __swp_entry_to_pmd(arch_entry);
}
-static inline int is_pmd_migration_entry(pmd_t pmd)
-{
- return is_swap_pmd(pmd) && is_migration_entry(pmd_to_swp_entry(pmd));
-}
#else /* CONFIG_ARCH_ENABLE_THP_MIGRATION */
static inline int set_pmd_migration_entry(struct page_vma_mapped_walk *pvmw,
struct page *page)
@@ -578,26 +353,12 @@ static inline void remove_migration_pmd(struct page_vma_mapped_walk *pvmw,
static inline void pmd_migration_entry_wait(struct mm_struct *m, pmd_t *p) { }
-static inline swp_entry_t pmd_to_swp_entry(pmd_t pmd)
-{
- return swp_entry(0, 0);
-}
-
static inline pmd_t swp_entry_to_pmd(swp_entry_t entry)
{
return __pmd(0);
}
-static inline int is_pmd_migration_entry(pmd_t pmd)
-{
- return 0;
-}
#endif /* CONFIG_ARCH_ENABLE_THP_MIGRATION */
-static inline int non_swap_entry(swp_entry_t entry)
-{
- return swp_type(entry) >= MAX_SWAPFILES;
-}
-
#endif /* CONFIG_MMU */
#endif /* _LINUX_SWAPOPS_H */
diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h
index c0e716aec26a..fd5f42765497 100644
--- a/include/linux/userfaultfd_k.h
+++ b/include/linux/userfaultfd_k.h
@@ -16,7 +16,7 @@
#include <linux/fcntl.h>
#include <linux/mm.h>
#include <linux/swap.h>
-#include <linux/swapops.h>
+#include <linux/leafops.h>
#include <asm-generic/pgtable_uffd.h>
#include <linux/hugetlb_inline.h>
@@ -228,15 +228,14 @@ static inline bool vma_can_userfault(struct vm_area_struct *vma,
if (wp_async && (vm_flags == VM_UFFD_WP))
return true;
-#ifndef CONFIG_PTE_MARKER_UFFD_WP
/*
* If user requested uffd-wp but not enabled pte markers for
* uffd-wp, then shmem & hugetlbfs are not supported but only
* anonymous.
*/
- if ((vm_flags & VM_UFFD_WP) && !vma_is_anonymous(vma))
+ if (!uffd_supports_wp_marker() && (vm_flags & VM_UFFD_WP) &&
+ !vma_is_anonymous(vma))
return false;
-#endif
/* By default, allow any of anon|shmem|hugetlb */
return vma_is_anonymous(vma) || is_vm_hugetlb_page(vma) ||
@@ -291,6 +290,43 @@ void userfaultfd_release_new(struct userfaultfd_ctx *ctx);
void userfaultfd_release_all(struct mm_struct *mm,
struct userfaultfd_ctx *ctx);
+static inline bool userfaultfd_wp_use_markers(struct vm_area_struct *vma)
+{
+ /* Only wr-protect mode uses pte markers */
+ if (!userfaultfd_wp(vma))
+ return false;
+
+ /* File-based uffd-wp always need markers */
+ if (!vma_is_anonymous(vma))
+ return true;
+
+ /*
+ * Anonymous uffd-wp only needs the markers if WP_UNPOPULATED is
+ * enabled (to apply markers on zero pages).
+ */
+ return userfaultfd_wp_unpopulated(vma);
+}
+
+/*
+ * Returns true if this is a swap pte and was uffd-wp wr-protected in either
+ * form (a pte marker or a normal swap pte), false otherwise.
+ */
+static inline bool pte_swp_uffd_wp_any(pte_t pte)
+{
+ if (!uffd_supports_wp_marker())
+ return false;
+
+ if (pte_present(pte))
+ return false;
+
+ if (pte_swp_uffd_wp(pte))
+ return true;
+
+ if (pte_is_uffd_wp_marker(pte))
+ return true;
+
+ return false;
+}
#else /* CONFIG_USERFAULTFD */
/* mm helpers */
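
The consolidated pte_swp_uffd_wp_any() now answers for both encodings in
one place. A hedged caller sketch of the property it is meant to preserve
across a non-present pte, modelled on the usual swap-in pattern; the
helper below is illustrative, not from this patch:

static pte_t example_preserve_uffd_wp(pte_t newpte, pte_t orig_pte)
{
	/* Either a wr-protected swap pte or a uffd-wp pte marker. */
	if (pte_swp_uffd_wp_any(orig_pte))
		newpte = pte_mkuffd_wp(newpte);

	return newpte;
}
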
@@ -415,49 +451,9 @@ static inline bool vma_has_uffd_without_event_remap(struct vm_area_struct *vma)
return false;
}
-#endif /* CONFIG_USERFAULTFD */
-
static inline bool userfaultfd_wp_use_markers(struct vm_area_struct *vma)
{
- /* Only wr-protect mode uses pte markers */
- if (!userfaultfd_wp(vma))
- return false;
-
- /* File-based uffd-wp always need markers */
- if (!vma_is_anonymous(vma))
- return true;
-
- /*
- * Anonymous uffd-wp only needs the markers if WP_UNPOPULATED
- * enabled (to apply markers on zero pages).
- */
- return userfaultfd_wp_unpopulated(vma);
-}
-
-static inline bool pte_marker_entry_uffd_wp(swp_entry_t entry)
-{
-#ifdef CONFIG_PTE_MARKER_UFFD_WP
- return is_pte_marker_entry(entry) &&
- (pte_marker_get(entry) & PTE_MARKER_UFFD_WP);
-#else
- return false;
-#endif
-}
-
-static inline bool pte_marker_uffd_wp(pte_t pte)
-{
-#ifdef CONFIG_PTE_MARKER_UFFD_WP
- swp_entry_t entry;
-
- if (!is_swap_pte(pte))
- return false;
-
- entry = pte_to_swp_entry(pte);
-
- return pte_marker_entry_uffd_wp(entry);
-#else
return false;
-#endif
}
/*
@@ -466,17 +462,7 @@ static inline bool pte_marker_uffd_wp(pte_t pte)
*/
static inline bool pte_swp_uffd_wp_any(pte_t pte)
{
-#ifdef CONFIG_PTE_MARKER_UFFD_WP
- if (!is_swap_pte(pte))
- return false;
-
- if (pte_swp_uffd_wp(pte))
- return true;
-
- if (pte_marker_uffd_wp(pte))
- return true;
-#endif
return false;
}
-
+#endif /* CONFIG_USERFAULTFD */
#endif /* _LINUX_USERFAULTFD_K_H */
diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
index eb54b7b3202f..e8e94f90d686 100644
--- a/include/linux/vmalloc.h
+++ b/include/linux/vmalloc.h
@@ -50,7 +50,11 @@ struct iov_iter; /* in uio.h */
#endif
struct vm_struct {
- struct vm_struct *next;
+ union {
+ struct vm_struct *next; /* Early registration of vm_areas. */
+ struct llist_node llnode; /* Asynchronous freeing on error paths. */
+ };
+
void *addr;
unsigned long size;
unsigned long flags;
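
The new llnode member lets error paths hand a vm_struct to a lockless
list instead of freeing it in place. A sketch of the pattern under
assumed names (the list and both functions are illustrative, not part of
this patch):

static LLIST_HEAD(example_purge_list);

static void example_defer_free(struct vm_struct *vm)
{
	/* Safe from contexts that must not free synchronously. */
	llist_add(&vm->llnode, &example_purge_list);
}

static void example_drain(struct work_struct *unused)
{
	struct vm_struct *vm, *tmp;

	llist_for_each_entry_safe(vm, tmp,
				  llist_del_all(&example_purge_list), llnode)
		kfree(vm);
}
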
@@ -328,4 +332,6 @@ bool vmalloc_dump_obj(void *object);
static inline bool vmalloc_dump_obj(void *object) { return false; }
#endif
+unsigned int memalloc_apply_gfp_scope(gfp_t gfp_mask);
+void memalloc_restore_scope(unsigned int flags);
#endif /* _LINUX_VMALLOC_H */
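
The two new declarations pair like the existing memalloc_nofs_save()/
memalloc_nofs_restore() scope helpers: derive task scope flags from a gfp
mask, then put them back. Their exact semantics are not shown in this
excerpt, so the usage below is an assumption and the wrapper function is
made up:

static void *example_scoped_vmalloc(unsigned long size, gfp_t gfp_mask)
{
	unsigned int scope_flags;
	void *p;

	scope_flags = memalloc_apply_gfp_scope(gfp_mask);
	/* Nested allocations observe the constraints of gfp_mask. */
	p = vmalloc(size);
	memalloc_restore_scope(scope_flags);

	return p;
}
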
diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h
index c287998908bf..3398a345bda8 100644
--- a/include/linux/vmstat.h
+++ b/include/linux/vmstat.h
@@ -520,32 +520,12 @@ static inline const char *vm_event_name(enum vm_event_item item)
#ifdef CONFIG_MEMCG
-void __mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
+void mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
int val);
-static inline void mod_lruvec_state(struct lruvec *lruvec,
- enum node_stat_item idx, int val)
-{
- unsigned long flags;
-
- local_irq_save(flags);
- __mod_lruvec_state(lruvec, idx, val);
- local_irq_restore(flags);
-}
-
-void __lruvec_stat_mod_folio(struct folio *folio,
+void lruvec_stat_mod_folio(struct folio *folio,
enum node_stat_item idx, int val);
-static inline void lruvec_stat_mod_folio(struct folio *folio,
- enum node_stat_item idx, int val)
-{
- unsigned long flags;
-
- local_irq_save(flags);
- __lruvec_stat_mod_folio(folio, idx, val);
- local_irq_restore(flags);
-}
-
static inline void mod_lruvec_page_state(struct page *page,
enum node_stat_item idx, int val)
{
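
With the irq-saving wrappers folded into the out-of-line functions,
mod_lruvec_state() and lruvec_stat_mod_folio() become the single entry
points under CONFIG_MEMCG. The caller-side effect, sketched with an
illustrative function (NR_FILE_PAGES is an existing node_stat_item):

static void example_account_file_folio(struct folio *folio)
{
	/* Formerly a choice between __lruvec_stat_mod_folio() and this. */
	lruvec_stat_mod_folio(folio, NR_FILE_PAGES, folio_nr_pages(folio));
}
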
@@ -554,24 +534,12 @@ static inline void mod_lruvec_page_state(struct page *page,
#else
-static inline void __mod_lruvec_state(struct lruvec *lruvec,
- enum node_stat_item idx, int val)
-{
- __mod_node_page_state(lruvec_pgdat(lruvec), idx, val);
-}
-
static inline void mod_lruvec_state(struct lruvec *lruvec,
enum node_stat_item idx, int val)
{
mod_node_page_state(lruvec_pgdat(lruvec), idx, val);
}
-static inline void __lruvec_stat_mod_folio(struct folio *folio,
- enum node_stat_item idx, int val)
-{
- __mod_node_page_state(folio_pgdat(folio), idx, val);
-}
-
static inline void lruvec_stat_mod_folio(struct folio *folio,
enum node_stat_item idx, int val)
{
@@ -586,18 +554,6 @@ static inline void mod_lruvec_page_state(struct page *page,
#endif /* CONFIG_MEMCG */
-static inline void __lruvec_stat_add_folio(struct folio *folio,
- enum node_stat_item idx)
-{
- __lruvec_stat_mod_folio(folio, idx, folio_nr_pages(folio));
-}
-
-static inline void __lruvec_stat_sub_folio(struct folio *folio,
- enum node_stat_item idx)
-{
- __lruvec_stat_mod_folio(folio, idx, -folio_nr_pages(folio));
-}
-
static inline void lruvec_stat_add_folio(struct folio *folio,
enum node_stat_item idx)
{