diff options
Diffstat (limited to 'include/linux')
30 files changed, 1669 insertions, 707 deletions
diff --git a/include/linux/damon.h b/include/linux/damon.h index cae8c613c5fc..3813373a9200 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -91,17 +91,23 @@ struct damon_region { * @nr_regions: Number of monitoring target regions of this target. * @regions_list: Head of the monitoring target regions of this target. * @list: List head for siblings. + * @obsolete: Whether the commit destination target is obsolete. * * Each monitoring context could have multiple targets. For example, a context * for virtual memory address spaces could have multiple target processes. The * @pid should be set for appropriate &struct damon_operations including the * virtual address spaces monitoring operations. + * + * @obsolete is used only for damon_commit_targets() source targets, to specify + * the matching destination targets are obsolete. Read damon_commit_targets() + * to see how it is handled. */ struct damon_target { struct pid *pid; unsigned int nr_regions; struct list_head regions_list; struct list_head list; + bool obsolete; }; /** @@ -147,6 +153,8 @@ enum damos_action { * @DAMOS_QUOTA_SOME_MEM_PSI_US: System level some memory PSI in us. * @DAMOS_QUOTA_NODE_MEM_USED_BP: MemUsed ratio of a node. * @DAMOS_QUOTA_NODE_MEM_FREE_BP: MemFree ratio of a node. + * @DAMOS_QUOTA_NODE_MEMCG_USED_BP: MemUsed ratio of a node for a cgroup. + * @DAMOS_QUOTA_NODE_MEMCG_FREE_BP: MemFree ratio of a node for a cgroup. * @NR_DAMOS_QUOTA_GOAL_METRICS: Number of DAMOS quota goal metrics. * * Metrics equal to larger than @NR_DAMOS_QUOTA_GOAL_METRICS are unsupported. @@ -156,6 +164,8 @@ enum damos_quota_goal_metric { DAMOS_QUOTA_SOME_MEM_PSI_US, DAMOS_QUOTA_NODE_MEM_USED_BP, DAMOS_QUOTA_NODE_MEM_FREE_BP, + DAMOS_QUOTA_NODE_MEMCG_USED_BP, + DAMOS_QUOTA_NODE_MEMCG_FREE_BP, NR_DAMOS_QUOTA_GOAL_METRICS, }; @@ -166,6 +176,7 @@ enum damos_quota_goal_metric { * @current_value: Current value of @metric. * @last_psi_total: Last measured total PSI * @nid: Node id. + * @memcg_id: Memcg id. * @list: List head for siblings. * * Data structure for getting the current score of the quota tuning goal. The @@ -176,6 +187,12 @@ enum damos_quota_goal_metric { * If @metric is DAMOS_QUOTA_USER_INPUT, @current_value should be manually * entered by the user, probably inside the kdamond callbacks. Otherwise, * DAMON sets @current_value with self-measured value of @metric. + * + * If @metric is DAMOS_QUOTA_NODE_MEM_{USED,FREE}_BP, @nid represents the node + * id of the target node to account the used/free memory. + * + * If @metric is DAMOS_QUOTA_NODE_MEMCG_{USED,FREE}_BP, @nid and @memcg_id + * represents the node id and the cgroup to account the used memory for. */ struct damos_quota_goal { enum damos_quota_goal_metric metric; @@ -184,7 +201,10 @@ struct damos_quota_goal { /* metric-dependent fields */ union { u64 last_psi_total; - int nid; + struct { + int nid; + unsigned short memcg_id; + }; }; struct list_head list; }; @@ -472,7 +492,7 @@ struct damos_migrate_dests { * @wmarks: Watermarks for automated (in)activation of this scheme. * @migrate_dests: Destination nodes if @action is "migrate_{hot,cold}". * @target_nid: Destination node if @action is "migrate_{hot,cold}". - * @filters: Additional set of &struct damos_filter for &action. + * @core_filters: Additional set of &struct damos_filter for &action. * @ops_filters: ops layer handling &struct damos_filter objects list. * @last_applied: Last @action applied ops-managing entity. * @stat: Statistics of this scheme. @@ -498,7 +518,7 @@ struct damos_migrate_dests { * * Before applying the &action to a memory region, &struct damon_operations * implementation could check pages of the region and skip &action to respect - * &filters + * &core_filters * * The minimum entity that @action can be applied depends on the underlying * &struct damon_operations. Since it may not be aligned with the core layer @@ -542,7 +562,7 @@ struct damos { struct damos_migrate_dests migrate_dests; }; }; - struct list_head filters; + struct list_head core_filters; struct list_head ops_filters; void *last_applied; struct damos_stat stat; @@ -851,11 +871,11 @@ static inline unsigned long damon_sz_region(struct damon_region *r) #define damos_for_each_quota_goal_safe(goal, next, quota) \ list_for_each_entry_safe(goal, next, &(quota)->goals, list) -#define damos_for_each_filter(f, scheme) \ - list_for_each_entry(f, &(scheme)->filters, list) +#define damos_for_each_core_filter(f, scheme) \ + list_for_each_entry(f, &(scheme)->core_filters, list) -#define damos_for_each_filter_safe(f, next, scheme) \ - list_for_each_entry_safe(f, next, &(scheme)->filters, list) +#define damos_for_each_core_filter_safe(f, next, scheme) \ + list_for_each_entry_safe(f, next, &(scheme)->core_filters, list) #define damos_for_each_ops_filter(f, scheme) \ list_for_each_entry(f, &(scheme)->ops_filters, list) @@ -947,7 +967,8 @@ int damon_call(struct damon_ctx *ctx, struct damon_call_control *control); int damos_walk(struct damon_ctx *ctx, struct damos_walk_control *control); int damon_set_region_biggest_system_ram_default(struct damon_target *t, - unsigned long *start, unsigned long *end); + unsigned long *start, unsigned long *end, + unsigned long min_sz_region); #endif /* CONFIG_DAMON */ diff --git a/include/linux/fs.h b/include/linux/fs.h index ce25feb06727..17b38b9d7f90 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2041,14 +2041,14 @@ static inline bool can_mmap_file(struct file *file) return true; } -int __compat_vma_mmap_prepare(const struct file_operations *f_op, +int __compat_vma_mmap(const struct file_operations *f_op, struct file *file, struct vm_area_struct *vma); -int compat_vma_mmap_prepare(struct file *file, struct vm_area_struct *vma); +int compat_vma_mmap(struct file *file, struct vm_area_struct *vma); static inline int vfs_mmap(struct file *file, struct vm_area_struct *vma) { if (file->f_op->mmap_prepare) - return compat_vma_mmap_prepare(file, vma); + return compat_vma_mmap(file, vma); return file->f_op->mmap(file, vma); } diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 623bee335383..b155929af5b1 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -387,7 +387,7 @@ extern void free_pages(unsigned long addr, unsigned int order); #define free_page(addr) free_pages((addr), 0) void page_alloc_init_cpuhp(void); -int decay_pcp_high(struct zone *zone, struct per_cpu_pages *pcp); +bool decay_pcp_high(struct zone *zone, struct per_cpu_pages *pcp); void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp); void drain_all_pages(struct zone *zone); void drain_local_pages(struct zone *zone); diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 11cab07f322a..ae7f21aad0ac 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -364,20 +364,35 @@ unsigned long thp_get_unmapped_area_vmflags(struct file *filp, unsigned long add unsigned long len, unsigned long pgoff, unsigned long flags, vm_flags_t vm_flags); +enum split_type { + SPLIT_TYPE_UNIFORM, + SPLIT_TYPE_NON_UNIFORM, +}; + bool can_split_folio(struct folio *folio, int caller_pins, int *pextra_pins); -int split_huge_page_to_list_to_order(struct page *page, struct list_head *list, +int __split_huge_page_to_list_to_order(struct page *page, struct list_head *list, unsigned int new_order); +int folio_split_unmapped(struct folio *folio, unsigned int new_order); int min_order_for_split(struct folio *folio); int split_folio_to_list(struct folio *folio, struct list_head *list); -bool uniform_split_supported(struct folio *folio, unsigned int new_order, - bool warns); -bool non_uniform_split_supported(struct folio *folio, unsigned int new_order, - bool warns); +bool folio_split_supported(struct folio *folio, unsigned int new_order, + enum split_type split_type, bool warns); int folio_split(struct folio *folio, unsigned int new_order, struct page *page, struct list_head *list); -/* - * try_folio_split_to_order - try to split a @folio at @page to @new_order using - * non uniform split. + +static inline int split_huge_page_to_list_to_order(struct page *page, struct list_head *list, + unsigned int new_order) +{ + return __split_huge_page_to_list_to_order(page, list, new_order); +} +static inline int split_huge_page_to_order(struct page *page, unsigned int new_order) +{ + return split_huge_page_to_list_to_order(page, NULL, new_order); +} + +/** + * try_folio_split_to_order() - try to split a @folio at @page to @new_order + * using non uniform split. * @folio: folio to be split * @page: split to @new_order at the given page * @new_order: the target split order @@ -387,14 +402,13 @@ int folio_split(struct folio *folio, unsigned int new_order, struct page *page, * folios are put back to LRU list. Use min_order_for_split() to get the lower * bound of @new_order. * - * Return: 0: split is successful, otherwise split failed. + * Return: 0 - split is successful, otherwise split failed. */ static inline int try_folio_split_to_order(struct folio *folio, struct page *page, unsigned int new_order) { - if (!non_uniform_split_supported(folio, new_order, /* warns= */ false)) - return split_huge_page_to_list_to_order(&folio->page, NULL, - new_order); + if (!folio_split_supported(folio, new_order, SPLIT_TYPE_NON_UNIFORM, /* warns= */ false)) + return split_huge_page_to_order(&folio->page, new_order); return folio_split(folio, new_order, page, NULL); } static inline int split_huge_page(struct page *page) @@ -402,14 +416,43 @@ static inline int split_huge_page(struct page *page) return split_huge_page_to_list_to_order(page, NULL, 0); } void deferred_split_folio(struct folio *folio, bool partially_mapped); +#ifdef CONFIG_MEMCG +void reparent_deferred_split_queue(struct mem_cgroup *memcg); +#endif void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, unsigned long address, bool freeze); +/** + * pmd_is_huge() - Is this PMD either a huge PMD entry or a software leaf entry? + * @pmd: The PMD to check. + * + * A huge PMD entry is a non-empty entry which is present and marked huge or a + * software leaf entry. This check be performed without the appropriate locks + * held, in which case the condition should be rechecked after they are + * acquired. + * + * Returns: true if this PMD is huge, false otherwise. + */ +static inline bool pmd_is_huge(pmd_t pmd) +{ + if (pmd_present(pmd)) { + return pmd_trans_huge(pmd); + } else if (!pmd_none(pmd)) { + /* + * Non-present PMDs must be valid huge non-present entries. We + * cannot assert that here due to header dependency issues. + */ + return true; + } + + return false; +} + #define split_huge_pmd(__vma, __pmd, __address) \ do { \ pmd_t *____pmd = (__pmd); \ - if (is_swap_pmd(*____pmd) || pmd_trans_huge(*____pmd)) \ + if (pmd_is_huge(*____pmd)) \ __split_huge_pmd(__vma, __pmd, __address, \ false); \ } while (0) @@ -447,19 +490,14 @@ void vma_adjust_trans_huge(struct vm_area_struct *vma, unsigned long start, spinlock_t *__pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma); spinlock_t *__pud_trans_huge_lock(pud_t *pud, struct vm_area_struct *vma); -static inline int is_swap_pmd(pmd_t pmd) -{ - return !pmd_none(pmd) && !pmd_present(pmd); -} - /* mmap_lock must be held on entry */ static inline spinlock_t *pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma) { - if (is_swap_pmd(*pmd) || pmd_trans_huge(*pmd)) + if (pmd_is_huge(*pmd)) return __pmd_trans_huge_lock(pmd, vma); - else - return NULL; + + return NULL; } static inline spinlock_t *pud_trans_huge_lock(pud_t *pud, struct vm_area_struct *vma) @@ -473,6 +511,8 @@ static inline spinlock_t *pud_trans_huge_lock(pud_t *pud, /** * folio_test_pmd_mappable - Can we map this folio with a PMD? * @folio: The folio to test + * + * Return: true - @folio can be mapped, false - @folio cannot be mapped. */ static inline bool folio_test_pmd_mappable(struct folio *folio) { @@ -481,6 +521,8 @@ static inline bool folio_test_pmd_mappable(struct folio *folio) vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf); +vm_fault_t do_huge_pmd_device_private(struct vm_fault *vmf); + extern struct folio *huge_zero_folio; extern unsigned long huge_zero_pfn; @@ -524,6 +566,8 @@ void split_huge_pmd_locked(struct vm_area_struct *vma, unsigned long address, pmd_t *pmd, bool freeze); bool unmap_huge_pmd_locked(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmdp, struct folio *folio); +void map_anon_folio_pmd_nopf(struct folio *folio, pmd_t *pmd, + struct vm_area_struct *vma, unsigned long haddr); #else /* CONFIG_TRANSPARENT_HUGEPAGE */ @@ -576,6 +620,11 @@ split_huge_page_to_list_to_order(struct page *page, struct list_head *list, VM_WARN_ON_ONCE_PAGE(1, page); return -EINVAL; } +static inline int split_huge_page_to_order(struct page *page, unsigned int new_order) +{ + VM_WARN_ON_ONCE_PAGE(1, page); + return -EINVAL; +} static inline int split_huge_page(struct page *page) { VM_WARN_ON_ONCE_PAGE(1, page); @@ -602,6 +651,7 @@ static inline int try_folio_split_to_order(struct folio *folio, } static inline void deferred_split_folio(struct folio *folio, bool partially_mapped) {} +static inline void reparent_deferred_split_queue(struct mem_cgroup *memcg) {} #define split_huge_pmd(__vma, __pmd, __address) \ do { } while (0) @@ -642,10 +692,6 @@ static inline void vma_adjust_trans_huge(struct vm_area_struct *vma, struct vm_area_struct *next) { } -static inline int is_swap_pmd(pmd_t pmd) -{ - return 0; -} static inline spinlock_t *pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma) { @@ -662,6 +708,11 @@ static inline vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf) return 0; } +static inline vm_fault_t do_huge_pmd_device_private(struct vm_fault *vmf) +{ + return 0; +} + static inline bool is_huge_zero_folio(const struct folio *folio) { return false; @@ -682,12 +733,6 @@ static inline void mm_put_huge_zero_folio(struct mm_struct *mm) return; } -static inline struct page *follow_devmap_pmd(struct vm_area_struct *vma, - unsigned long addr, pmd_t *pmd, int flags, struct dev_pagemap **pgmap) -{ - return NULL; -} - static inline bool thp_migration_supported(void) { return false; @@ -720,6 +765,11 @@ static inline struct folio *get_persistent_huge_zero_folio(void) { return NULL; } + +static inline bool pmd_is_huge(pmd_t pmd) +{ + return false; +} #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ static inline int split_folio_to_list_to_order(struct folio *folio, diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 8e63e46b8e1f..019a1c5281e4 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -150,8 +150,7 @@ int hugetlb_mfill_atomic_pte(pte_t *dst_pte, struct folio **foliop); #endif /* CONFIG_USERFAULTFD */ long hugetlb_reserve_pages(struct inode *inode, long from, long to, - struct vm_area_struct *vma, - vm_flags_t vm_flags); + struct vm_area_desc *desc, vm_flags_t vm_flags); long hugetlb_unreserve_pages(struct inode *inode, long start, long end, long freed); bool folio_isolate_hugetlb(struct folio *folio, struct list_head *list); @@ -172,7 +171,7 @@ bool hugetlbfs_pagecache_present(struct hstate *h, struct address_space *hugetlb_folio_mapping_lock_write(struct folio *folio); -extern int sysctl_hugetlb_shm_group; +extern int sysctl_hugetlb_shm_group __read_mostly; extern struct list_head huge_boot_pages[MAX_NUMNODES]; void hugetlb_bootmem_alloc(void); @@ -275,11 +274,10 @@ void hugetlb_vma_lock_release(struct kref *kref); long hugetlb_change_protection(struct vm_area_struct *vma, unsigned long address, unsigned long end, pgprot_t newprot, unsigned long cp_flags); -bool is_hugetlb_entry_migration(pte_t pte); -bool is_hugetlb_entry_hwpoisoned(pte_t pte); void hugetlb_unshare_all_pmds(struct vm_area_struct *vma); void fixup_hugetlb_reservations(struct vm_area_struct *vma); void hugetlb_split(struct vm_area_struct *vma, unsigned long addr); +int hugetlb_vma_lock_alloc(struct vm_area_struct *vma); #else /* !CONFIG_HUGETLB_PAGE */ @@ -466,6 +464,11 @@ static inline void fixup_hugetlb_reservations(struct vm_area_struct *vma) static inline void hugetlb_split(struct vm_area_struct *vma, unsigned long addr) {} +static inline int hugetlb_vma_lock_alloc(struct vm_area_struct *vma) +{ + return 0; +} + #endif /* !CONFIG_HUGETLB_PAGE */ #ifndef pgd_write diff --git a/include/linux/hugetlb_inline.h b/include/linux/hugetlb_inline.h index 0660a03d37d9..a27aa0162918 100644 --- a/include/linux/hugetlb_inline.h +++ b/include/linux/hugetlb_inline.h @@ -2,22 +2,27 @@ #ifndef _LINUX_HUGETLB_INLINE_H #define _LINUX_HUGETLB_INLINE_H -#ifdef CONFIG_HUGETLB_PAGE - #include <linux/mm.h> -static inline bool is_vm_hugetlb_page(struct vm_area_struct *vma) +#ifdef CONFIG_HUGETLB_PAGE + +static inline bool is_vm_hugetlb_flags(vm_flags_t vm_flags) { - return !!(vma->vm_flags & VM_HUGETLB); + return !!(vm_flags & VM_HUGETLB); } #else -static inline bool is_vm_hugetlb_page(struct vm_area_struct *vma) +static inline bool is_vm_hugetlb_flags(vm_flags_t vm_flags) { return false; } #endif +static inline bool is_vm_hugetlb_page(struct vm_area_struct *vma) +{ + return is_vm_hugetlb_flags(vma->vm_flags); +} + #endif diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 801b2bd9e8d4..8c66284a91a8 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -1135,7 +1135,9 @@ struct iommu_sva { struct iommu_mm_data { u32 pasid; + struct mm_struct *mm; struct list_head sva_domains; + struct list_head mm_list_elm; }; int iommu_fwspec_init(struct device *dev, struct fwnode_handle *iommu_fwnode); @@ -1616,6 +1618,7 @@ struct iommu_sva *iommu_sva_bind_device(struct device *dev, struct mm_struct *mm); void iommu_sva_unbind_device(struct iommu_sva *handle); u32 iommu_sva_get_pasid(struct iommu_sva *handle); +void iommu_sva_invalidate_kva_range(unsigned long start, unsigned long end); #else static inline struct iommu_sva * iommu_sva_bind_device(struct device *dev, struct mm_struct *mm) @@ -1640,6 +1643,7 @@ static inline u32 mm_get_enqcmd_pasid(struct mm_struct *mm) } static inline void mm_pasid_drop(struct mm_struct *mm) {} +static inline void iommu_sva_invalidate_kva_range(unsigned long start, unsigned long end) {} #endif /* CONFIG_IOMMU_SVA */ #ifdef CONFIG_IOMMU_IOPF diff --git a/include/linux/kasan.h b/include/linux/kasan.h index d12e1a5f5a9a..f335c1d7b61d 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -571,11 +571,27 @@ static inline void kasan_init_hw_tags(void) { } #if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS) void kasan_populate_early_vm_area_shadow(void *start, unsigned long size); -int kasan_populate_vmalloc(unsigned long addr, unsigned long size, gfp_t gfp_mask); -void kasan_release_vmalloc(unsigned long start, unsigned long end, +int __kasan_populate_vmalloc(unsigned long addr, unsigned long size, gfp_t gfp_mask); +static inline int kasan_populate_vmalloc(unsigned long addr, + unsigned long size, gfp_t gfp_mask) +{ + if (kasan_enabled()) + return __kasan_populate_vmalloc(addr, size, gfp_mask); + return 0; +} +void __kasan_release_vmalloc(unsigned long start, unsigned long end, unsigned long free_region_start, unsigned long free_region_end, unsigned long flags); +static inline void kasan_release_vmalloc(unsigned long start, unsigned long end, + unsigned long free_region_start, + unsigned long free_region_end, + unsigned long flags) +{ + if (kasan_enabled()) + return __kasan_release_vmalloc(start, end, free_region_start, + free_region_end, flags); +} #else /* CONFIG_KASAN_GENERIC || CONFIG_KASAN_SW_TAGS */ diff --git a/include/linux/kmsan.h b/include/linux/kmsan.h index f2fd221107bb..7da9fd506b39 100644 --- a/include/linux/kmsan.h +++ b/include/linux/kmsan.h @@ -133,6 +133,7 @@ void kmsan_kfree_large(const void *ptr); * @prot: page protection flags used for vmap. * @pages: array of pages. * @page_shift: page_shift passed to vmap_range_noflush(). + * @gfp_mask: gfp_mask to use internally. * * KMSAN maps shadow and origin pages of @pages into contiguous ranges in * vmalloc metadata address range. Returns 0 on success, callers must check @@ -142,7 +143,8 @@ int __must_check kmsan_vmap_pages_range_noflush(unsigned long start, unsigned long end, pgprot_t prot, struct page **pages, - unsigned int page_shift); + unsigned int page_shift, + gfp_t gfp_mask); /** * kmsan_vunmap_kernel_range_noflush() - Notify KMSAN about a vunmap. @@ -347,7 +349,7 @@ static inline void kmsan_kfree_large(const void *ptr) static inline int __must_check kmsan_vmap_pages_range_noflush( unsigned long start, unsigned long end, pgprot_t prot, - struct page **pages, unsigned int page_shift) + struct page **pages, unsigned int page_shift, gfp_t gfp_mask) { return 0; } diff --git a/include/linux/ksm.h b/include/linux/ksm.h index 067538fc4d58..c982694c987b 100644 --- a/include/linux/ksm.h +++ b/include/linux/ksm.h @@ -17,7 +17,7 @@ #ifdef CONFIG_KSM int ksm_madvise(struct vm_area_struct *vma, unsigned long start, unsigned long end, int advice, vm_flags_t *vm_flags); -vm_flags_t ksm_vma_flags(const struct mm_struct *mm, const struct file *file, +vm_flags_t ksm_vma_flags(struct mm_struct *mm, const struct file *file, vm_flags_t vm_flags); int ksm_enable_merge_any(struct mm_struct *mm); int ksm_disable_merge_any(struct mm_struct *mm); @@ -103,7 +103,7 @@ bool ksm_process_mergeable(struct mm_struct *mm); #else /* !CONFIG_KSM */ -static inline vm_flags_t ksm_vma_flags(const struct mm_struct *mm, +static inline vm_flags_t ksm_vma_flags(struct mm_struct *mm, const struct file *file, vm_flags_t vm_flags) { return vm_flags; diff --git a/include/linux/leafops.h b/include/linux/leafops.h new file mode 100644 index 000000000000..cfafe7a5e7b1 --- /dev/null +++ b/include/linux/leafops.h @@ -0,0 +1,619 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Describes operations that can be performed on software-defined page table + * leaf entries. These are abstracted from the hardware page table entries + * themselves by the softleaf_t type, see mm_types.h. + */ +#ifndef _LINUX_LEAFOPS_H +#define _LINUX_LEAFOPS_H + +#include <linux/mm_types.h> +#include <linux/swapops.h> +#include <linux/swap.h> + +#ifdef CONFIG_MMU + +/* Temporary until swp_entry_t eliminated. */ +#define LEAF_TYPE_SHIFT SWP_TYPE_SHIFT + +enum softleaf_type { + /* Fundamental types. */ + SOFTLEAF_NONE, + SOFTLEAF_SWAP, + /* Migration types. */ + SOFTLEAF_MIGRATION_READ, + SOFTLEAF_MIGRATION_READ_EXCLUSIVE, + SOFTLEAF_MIGRATION_WRITE, + /* Device types. */ + SOFTLEAF_DEVICE_PRIVATE_READ, + SOFTLEAF_DEVICE_PRIVATE_WRITE, + SOFTLEAF_DEVICE_EXCLUSIVE, + /* H/W posion types. */ + SOFTLEAF_HWPOISON, + /* Marker types. */ + SOFTLEAF_MARKER, +}; + +/** + * softleaf_mk_none() - Create an empty ('none') leaf entry. + * Returns: empty leaf entry. + */ +static inline softleaf_t softleaf_mk_none(void) +{ + return ((softleaf_t) { 0 }); +} + +/** + * softleaf_from_pte() - Obtain a leaf entry from a PTE entry. + * @pte: PTE entry. + * + * If @pte is present (therefore not a leaf entry) the function returns an empty + * leaf entry. Otherwise, it returns a leaf entry. + * + * Returns: Leaf entry. + */ +static inline softleaf_t softleaf_from_pte(pte_t pte) +{ + softleaf_t arch_entry; + + if (pte_present(pte) || pte_none(pte)) + return softleaf_mk_none(); + + pte = pte_swp_clear_flags(pte); + arch_entry = __pte_to_swp_entry(pte); + + /* Temporary until swp_entry_t eliminated. */ + return swp_entry(__swp_type(arch_entry), __swp_offset(arch_entry)); +} + +/** + * softleaf_to_pte() - Obtain a PTE entry from a leaf entry. + * @entry: Leaf entry. + * + * This generates an architecture-specific PTE entry that can be utilised to + * encode the metadata the leaf entry encodes. + * + * Returns: Architecture-specific PTE entry encoding leaf entry. + */ +static inline pte_t softleaf_to_pte(softleaf_t entry) +{ + /* Temporary until swp_entry_t eliminated. */ + return swp_entry_to_pte(entry); +} + +#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION +/** + * softleaf_from_pmd() - Obtain a leaf entry from a PMD entry. + * @pmd: PMD entry. + * + * If @pmd is present (therefore not a leaf entry) the function returns an empty + * leaf entry. Otherwise, it returns a leaf entry. + * + * Returns: Leaf entry. + */ +static inline softleaf_t softleaf_from_pmd(pmd_t pmd) +{ + softleaf_t arch_entry; + + if (pmd_present(pmd) || pmd_none(pmd)) + return softleaf_mk_none(); + + if (pmd_swp_soft_dirty(pmd)) + pmd = pmd_swp_clear_soft_dirty(pmd); + if (pmd_swp_uffd_wp(pmd)) + pmd = pmd_swp_clear_uffd_wp(pmd); + arch_entry = __pmd_to_swp_entry(pmd); + + /* Temporary until swp_entry_t eliminated. */ + return swp_entry(__swp_type(arch_entry), __swp_offset(arch_entry)); +} + +#else + +static inline softleaf_t softleaf_from_pmd(pmd_t pmd) +{ + return softleaf_mk_none(); +} + +#endif + +/** + * softleaf_is_none() - Is the leaf entry empty? + * @entry: Leaf entry. + * + * Empty entries are typically the result of a 'none' page table leaf entry + * being converted to a leaf entry. + * + * Returns: true if the entry is empty, false otherwise. + */ +static inline bool softleaf_is_none(softleaf_t entry) +{ + return entry.val == 0; +} + +/** + * softleaf_type() - Identify the type of leaf entry. + * @enntry: Leaf entry. + * + * Returns: the leaf entry type associated with @entry. + */ +static inline enum softleaf_type softleaf_type(softleaf_t entry) +{ + unsigned int type_num; + + if (softleaf_is_none(entry)) + return SOFTLEAF_NONE; + + type_num = entry.val >> LEAF_TYPE_SHIFT; + + if (type_num < MAX_SWAPFILES) + return SOFTLEAF_SWAP; + + switch (type_num) { +#ifdef CONFIG_MIGRATION + case SWP_MIGRATION_READ: + return SOFTLEAF_MIGRATION_READ; + case SWP_MIGRATION_READ_EXCLUSIVE: + return SOFTLEAF_MIGRATION_READ_EXCLUSIVE; + case SWP_MIGRATION_WRITE: + return SOFTLEAF_MIGRATION_WRITE; +#endif +#ifdef CONFIG_DEVICE_PRIVATE + case SWP_DEVICE_WRITE: + return SOFTLEAF_DEVICE_PRIVATE_WRITE; + case SWP_DEVICE_READ: + return SOFTLEAF_DEVICE_PRIVATE_READ; + case SWP_DEVICE_EXCLUSIVE: + return SOFTLEAF_DEVICE_EXCLUSIVE; +#endif +#ifdef CONFIG_MEMORY_FAILURE + case SWP_HWPOISON: + return SOFTLEAF_HWPOISON; +#endif + case SWP_PTE_MARKER: + return SOFTLEAF_MARKER; + } + + /* Unknown entry type. */ + VM_WARN_ON_ONCE(1); + return SOFTLEAF_NONE; +} + +/** + * softleaf_is_swap() - Is this leaf entry a swap entry? + * @entry: Leaf entry. + * + * Returns: true if the leaf entry is a swap entry, otherwise false. + */ +static inline bool softleaf_is_swap(softleaf_t entry) +{ + return softleaf_type(entry) == SOFTLEAF_SWAP; +} + +/** + * softleaf_is_migration_write() - Is this leaf entry a writable migration entry? + * @entry: Leaf entry. + * + * Returns: true if the leaf entry is a writable migration entry, otherwise + * false. + */ +static inline bool softleaf_is_migration_write(softleaf_t entry) +{ + return softleaf_type(entry) == SOFTLEAF_MIGRATION_WRITE; +} + +/** + * softleaf_is_migration_read() - Is this leaf entry a readable migration entry? + * @entry: Leaf entry. + * + * Returns: true if the leaf entry is a readable migration entry, otherwise + * false. + */ +static inline bool softleaf_is_migration_read(softleaf_t entry) +{ + return softleaf_type(entry) == SOFTLEAF_MIGRATION_READ; +} + +/** + * softleaf_is_migration_read_exclusive() - Is this leaf entry an exclusive + * readable migration entry? + * @entry: Leaf entry. + * + * Returns: true if the leaf entry is an exclusive readable migration entry, + * otherwise false. + */ +static inline bool softleaf_is_migration_read_exclusive(softleaf_t entry) +{ + return softleaf_type(entry) == SOFTLEAF_MIGRATION_READ_EXCLUSIVE; +} + +/** + * softleaf_is_migration() - Is this leaf entry a migration entry? + * @entry: Leaf entry. + * + * Returns: true if the leaf entry is a migration entry, otherwise false. + */ +static inline bool softleaf_is_migration(softleaf_t entry) +{ + switch (softleaf_type(entry)) { + case SOFTLEAF_MIGRATION_READ: + case SOFTLEAF_MIGRATION_READ_EXCLUSIVE: + case SOFTLEAF_MIGRATION_WRITE: + return true; + default: + return false; + } +} + +/** + * softleaf_is_device_private_write() - Is this leaf entry a device private + * writable entry? + * @entry: Leaf entry. + * + * Returns: true if the leaf entry is a device private writable entry, otherwise + * false. + */ +static inline bool softleaf_is_device_private_write(softleaf_t entry) +{ + return softleaf_type(entry) == SOFTLEAF_DEVICE_PRIVATE_WRITE; +} + +/** + * softleaf_is_device_private() - Is this leaf entry a device private entry? + * @entry: Leaf entry. + * + * Returns: true if the leaf entry is a device private entry, otherwise false. + */ +static inline bool softleaf_is_device_private(softleaf_t entry) +{ + switch (softleaf_type(entry)) { + case SOFTLEAF_DEVICE_PRIVATE_WRITE: + case SOFTLEAF_DEVICE_PRIVATE_READ: + return true; + default: + return false; + } +} + +/** + * softleaf_is_device_exclusive() - Is this leaf entry a device-exclusive entry? + * @entry: Leaf entry. + * + * Returns: true if the leaf entry is a device-exclusive entry, otherwise false. + */ +static inline bool softleaf_is_device_exclusive(softleaf_t entry) +{ + return softleaf_type(entry) == SOFTLEAF_DEVICE_EXCLUSIVE; +} + +/** + * softleaf_is_hwpoison() - Is this leaf entry a hardware poison entry? + * @entry: Leaf entry. + * + * Returns: true if the leaf entry is a hardware poison entry, otherwise false. + */ +static inline bool softleaf_is_hwpoison(softleaf_t entry) +{ + return softleaf_type(entry) == SOFTLEAF_HWPOISON; +} + +/** + * softleaf_is_marker() - Is this leaf entry a marker? + * @entry: Leaf entry. + * + * Returns: true if the leaf entry is a marker entry, otherwise false. + */ +static inline bool softleaf_is_marker(softleaf_t entry) +{ + return softleaf_type(entry) == SOFTLEAF_MARKER; +} + +/** + * softleaf_to_marker() - Obtain marker associated with leaf entry. + * @entry: Leaf entry, softleaf_is_marker(@entry) must return true. + * + * Returns: Marker associated with the leaf entry. + */ +static inline pte_marker softleaf_to_marker(softleaf_t entry) +{ + VM_WARN_ON_ONCE(!softleaf_is_marker(entry)); + + return swp_offset(entry) & PTE_MARKER_MASK; +} + +/** + * softleaf_has_pfn() - Does this leaf entry encode a valid PFN number? + * @entry: Leaf entry. + * + * A pfn swap entry is a special type of swap entry that always has a pfn stored + * in the swap offset. They can either be used to represent unaddressable device + * memory, to restrict access to a page undergoing migration or to represent a + * pfn which has been hwpoisoned and unmapped. + * + * Returns: true if the leaf entry encodes a PFN, otherwise false. + */ +static inline bool softleaf_has_pfn(softleaf_t entry) +{ + /* Make sure the swp offset can always store the needed fields. */ + BUILD_BUG_ON(SWP_TYPE_SHIFT < SWP_PFN_BITS); + + if (softleaf_is_migration(entry)) + return true; + if (softleaf_is_device_private(entry)) + return true; + if (softleaf_is_device_exclusive(entry)) + return true; + if (softleaf_is_hwpoison(entry)) + return true; + + return false; +} + +/** + * softleaf_to_pfn() - Obtain PFN encoded within leaf entry. + * @entry: Leaf entry, softleaf_has_pfn(@entry) must return true. + * + * Returns: The PFN associated with the leaf entry. + */ +static inline unsigned long softleaf_to_pfn(softleaf_t entry) +{ + VM_WARN_ON_ONCE(!softleaf_has_pfn(entry)); + + /* Temporary until swp_entry_t eliminated. */ + return swp_offset(entry) & SWP_PFN_MASK; +} + +/** + * softleaf_to_page() - Obtains struct page for PFN encoded within leaf entry. + * @entry: Leaf entry, softleaf_has_pfn(@entry) must return true. + * + * Returns: Pointer to the struct page associated with the leaf entry's PFN. + */ +static inline struct page *softleaf_to_page(softleaf_t entry) +{ + struct page *page = pfn_to_page(softleaf_to_pfn(entry)); + + VM_WARN_ON_ONCE(!softleaf_has_pfn(entry)); + /* + * Any use of migration entries may only occur while the + * corresponding page is locked + */ + VM_WARN_ON_ONCE(softleaf_is_migration(entry) && !PageLocked(page)); + + return page; +} + +/** + * softleaf_to_folio() - Obtains struct folio for PFN encoded within leaf entry. + * @entry: Leaf entry, softleaf_has_pfn(@entry) must return true. + * + * Returns: Pointer to the struct folio associated with the leaf entry's PFN. + */ +static inline struct folio *softleaf_to_folio(softleaf_t entry) +{ + struct folio *folio = pfn_folio(softleaf_to_pfn(entry)); + + VM_WARN_ON_ONCE(!softleaf_has_pfn(entry)); + /* + * Any use of migration entries may only occur while the + * corresponding folio is locked. + */ + VM_WARN_ON_ONCE(softleaf_is_migration(entry) && + !folio_test_locked(folio)); + + return folio; +} + +/** + * softleaf_is_poison_marker() - Is this leaf entry a poison marker? + * @entry: Leaf entry. + * + * The poison marker is set via UFFDIO_POISON. Userfaultfd-specific. + * + * Returns: true if the leaf entry is a poison marker, otherwise false. + */ +static inline bool softleaf_is_poison_marker(softleaf_t entry) +{ + if (!softleaf_is_marker(entry)) + return false; + + return softleaf_to_marker(entry) & PTE_MARKER_POISONED; +} + +/** + * softleaf_is_guard_marker() - Is this leaf entry a guard region marker? + * @entry: Leaf entry. + * + * Returns: true if the leaf entry is a guard marker, otherwise false. + */ +static inline bool softleaf_is_guard_marker(softleaf_t entry) +{ + if (!softleaf_is_marker(entry)) + return false; + + return softleaf_to_marker(entry) & PTE_MARKER_GUARD; +} + +/** + * softleaf_is_uffd_wp_marker() - Is this leaf entry a userfautlfd write protect + * marker? + * @entry: Leaf entry. + * + * Userfaultfd-specific. + * + * Returns: true if the leaf entry is a UFFD WP marker, otherwise false. + */ +static inline bool softleaf_is_uffd_wp_marker(softleaf_t entry) +{ + if (!softleaf_is_marker(entry)) + return false; + + return softleaf_to_marker(entry) & PTE_MARKER_UFFD_WP; +} + +#ifdef CONFIG_MIGRATION + +/** + * softleaf_is_migration_young() - Does this migration entry contain an accessed + * bit? + * @entry: Leaf entry. + * + * If the architecture can support storing A/D bits in migration entries, this + * determines whether the accessed (or 'young') bit was set on the migrated page + * table entry. + * + * Returns: true if the entry contains an accessed bit, otherwise false. + */ +static inline bool softleaf_is_migration_young(softleaf_t entry) +{ + VM_WARN_ON_ONCE(!softleaf_is_migration(entry)); + + if (migration_entry_supports_ad()) + return swp_offset(entry) & SWP_MIG_YOUNG; + /* Keep the old behavior of aging page after migration */ + return false; +} + +/** + * softleaf_is_migration_dirty() - Does this migration entry contain a dirty bit? + * @entry: Leaf entry. + * + * If the architecture can support storing A/D bits in migration entries, this + * determines whether the dirty bit was set on the migrated page table entry. + * + * Returns: true if the entry contains a dirty bit, otherwise false. + */ +static inline bool softleaf_is_migration_dirty(softleaf_t entry) +{ + VM_WARN_ON_ONCE(!softleaf_is_migration(entry)); + + if (migration_entry_supports_ad()) + return swp_offset(entry) & SWP_MIG_DIRTY; + /* Keep the old behavior of clean page after migration */ + return false; +} + +#else /* CONFIG_MIGRATION */ + +static inline bool softleaf_is_migration_young(softleaf_t entry) +{ + return false; +} + +static inline bool softleaf_is_migration_dirty(softleaf_t entry) +{ + return false; +} +#endif /* CONFIG_MIGRATION */ + +/** + * pte_is_marker() - Does the PTE entry encode a marker leaf entry? + * @pte: PTE entry. + * + * Returns: true if this PTE is a marker leaf entry, otherwise false. + */ +static inline bool pte_is_marker(pte_t pte) +{ + return softleaf_is_marker(softleaf_from_pte(pte)); +} + +/** + * pte_is_uffd_wp_marker() - Does this PTE entry encode a userfaultfd write + * protect marker leaf entry? + * @pte: PTE entry. + * + * Returns: true if this PTE is a UFFD WP marker leaf entry, otherwise false. + */ +static inline bool pte_is_uffd_wp_marker(pte_t pte) +{ + const softleaf_t entry = softleaf_from_pte(pte); + + return softleaf_is_uffd_wp_marker(entry); +} + +/** + * pte_is_uffd_marker() - Does this PTE entry encode a userfault-specific marker + * leaf entry? + * @entry: Leaf entry. + * + * It's useful to be able to determine which leaf entries encode UFFD-specific + * markers so we can handle these correctly. + * + * Returns: true if this PTE entry is a UFFD-specific marker, otherwise false. + */ +static inline bool pte_is_uffd_marker(pte_t pte) +{ + const softleaf_t entry = softleaf_from_pte(pte); + + if (!softleaf_is_marker(entry)) + return false; + + /* UFFD WP, poisoned swap entries are UFFD-handled. */ + if (softleaf_is_uffd_wp_marker(entry)) + return true; + if (softleaf_is_poison_marker(entry)) + return true; + + return false; +} + +#if defined(CONFIG_ZONE_DEVICE) && defined(CONFIG_ARCH_ENABLE_THP_MIGRATION) + +/** + * pmd_is_device_private_entry() - Check if PMD contains a device private swap + * entry. + * @pmd: The PMD to check. + * + * Returns true if the PMD contains a swap entry that represents a device private + * page mapping. This is used for zone device private pages that have been + * swapped out but still need special handling during various memory management + * operations. + * + * Return: true if PMD contains device private entry, false otherwise + */ +static inline bool pmd_is_device_private_entry(pmd_t pmd) +{ + return softleaf_is_device_private(softleaf_from_pmd(pmd)); +} + +#else /* CONFIG_ZONE_DEVICE && CONFIG_ARCH_ENABLE_THP_MIGRATION */ + +static inline bool pmd_is_device_private_entry(pmd_t pmd) +{ + return false; +} + +#endif /* CONFIG_ZONE_DEVICE && CONFIG_ARCH_ENABLE_THP_MIGRATION */ + +/** + * pmd_is_migration_entry() - Does this PMD entry encode a migration entry? + * @pmd: PMD entry. + * + * Returns: true if the PMD encodes a migration entry, otherwise false. + */ +static inline bool pmd_is_migration_entry(pmd_t pmd) +{ + return softleaf_is_migration(softleaf_from_pmd(pmd)); +} + +/** + * pmd_is_valid_softleaf() - Is this PMD entry a valid leaf entry? + * @pmd: PMD entry. + * + * PMD leaf entries are valid only if they are device private or migration + * entries. This function asserts that a PMD leaf entry is valid in this + * respect. + * + * Returns: true if the PMD entry is a valid leaf entry, otherwise false. + */ +static inline bool pmd_is_valid_softleaf(pmd_t pmd) +{ + const softleaf_t entry = softleaf_from_pmd(pmd); + + /* Only device private, migration entries valid for PMD. */ + return softleaf_is_device_private(entry) || + softleaf_is_migration(entry); +} + +#endif /* CONFIG_MMU */ +#endif /* _LINUX_LEAFOPS_H */ diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 873e510d6f8d..0651865a4564 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -52,6 +52,7 @@ enum memcg_memory_event { MEMCG_SWAP_HIGH, MEMCG_SWAP_MAX, MEMCG_SWAP_FAIL, + MEMCG_SOCK_THROTTLED, MEMCG_NR_MEMORY_EVENTS, }; @@ -956,17 +957,7 @@ unsigned long lruvec_page_state_local(struct lruvec *lruvec, void mem_cgroup_flush_stats(struct mem_cgroup *memcg); void mem_cgroup_flush_stats_ratelimited(struct mem_cgroup *memcg); -void __mod_lruvec_kmem_state(void *p, enum node_stat_item idx, int val); - -static inline void mod_lruvec_kmem_state(void *p, enum node_stat_item idx, - int val) -{ - unsigned long flags; - - local_irq_save(flags); - __mod_lruvec_kmem_state(p, idx, val); - local_irq_restore(flags); -} +void mod_lruvec_kmem_state(void *p, enum node_stat_item idx, int val); void count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx, unsigned long count); @@ -1001,36 +992,8 @@ static inline void count_memcg_event_mm(struct mm_struct *mm, count_memcg_events_mm(mm, idx, 1); } -static inline void __memcg_memory_event(struct mem_cgroup *memcg, - enum memcg_memory_event event, - bool allow_spinning) -{ - bool swap_event = event == MEMCG_SWAP_HIGH || event == MEMCG_SWAP_MAX || - event == MEMCG_SWAP_FAIL; - - /* For now only MEMCG_MAX can happen with !allow_spinning context. */ - VM_WARN_ON_ONCE(!allow_spinning && event != MEMCG_MAX); - - atomic_long_inc(&memcg->memory_events_local[event]); - if (!swap_event && allow_spinning) - cgroup_file_notify(&memcg->events_local_file); - - do { - atomic_long_inc(&memcg->memory_events[event]); - if (allow_spinning) { - if (swap_event) - cgroup_file_notify(&memcg->swap_events_file); - else - cgroup_file_notify(&memcg->events_file); - } - - if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) - break; - if (cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_LOCAL_EVENTS) - break; - } while ((memcg = parent_mem_cgroup(memcg)) && - !mem_cgroup_is_root(memcg)); -} +void __memcg_memory_event(struct mem_cgroup *memcg, + enum memcg_memory_event event, bool allow_spinning); static inline void memcg_memory_event(struct mem_cgroup *memcg, enum memcg_memory_event event) @@ -1430,14 +1393,6 @@ static inline void mem_cgroup_flush_stats_ratelimited(struct mem_cgroup *memcg) { } -static inline void __mod_lruvec_kmem_state(void *p, enum node_stat_item idx, - int val) -{ - struct page *page = virt_to_head_page(p); - - __mod_node_page_state(page_pgdat(page), idx, val); -} - static inline void mod_lruvec_kmem_state(void *p, enum node_stat_item idx, int val) { @@ -1497,16 +1452,6 @@ struct slabobj_ext { #endif } __aligned(8); -static inline void __inc_lruvec_kmem_state(void *p, enum node_stat_item idx) -{ - __mod_lruvec_kmem_state(p, idx, 1); -} - -static inline void __dec_lruvec_kmem_state(void *p, enum node_stat_item idx) -{ - __mod_lruvec_kmem_state(p, idx, -1); -} - static inline struct lruvec *parent_lruvec(struct lruvec *lruvec) { struct mem_cgroup *memcg; @@ -1674,6 +1619,11 @@ int alloc_shrinker_info(struct mem_cgroup *memcg); void free_shrinker_info(struct mem_cgroup *memcg); void set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id); void reparent_shrinker_deferred(struct mem_cgroup *memcg); + +static inline int shrinker_id(struct shrinker *shrinker) +{ + return shrinker->id; +} #else #define mem_cgroup_sockets_enabled 0 @@ -1705,6 +1655,11 @@ static inline void set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id) { } + +static inline int shrinker_id(struct shrinker *shrinker) +{ + return -1; +} #endif #ifdef CONFIG_MEMCG @@ -1791,6 +1746,13 @@ static inline void count_objcg_events(struct obj_cgroup *objcg, bool mem_cgroup_node_allowed(struct mem_cgroup *memcg, int nid); +void mem_cgroup_show_protected_memory(struct mem_cgroup *memcg); + +static inline bool memcg_is_dying(struct mem_cgroup *memcg) +{ + return memcg ? css_is_dying(&memcg->css) : false; +} + #else static inline bool mem_cgroup_kmem_disabled(void) { @@ -1857,6 +1819,15 @@ static inline bool mem_cgroup_node_allowed(struct mem_cgroup *memcg, int nid) { return true; } + +static inline void mem_cgroup_show_protected_memory(struct mem_cgroup *memcg) +{ +} + +static inline bool memcg_is_dying(struct mem_cgroup *memcg) +{ + return false; +} #endif /* CONFIG_MEMCG */ #if defined(CONFIG_MEMCG) && defined(CONFIG_ZSWAP) diff --git a/include/linux/memory-failure.h b/include/linux/memory-failure.h new file mode 100644 index 000000000000..bc326503d2d2 --- /dev/null +++ b/include/linux/memory-failure.h @@ -0,0 +1,17 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_MEMORY_FAILURE_H +#define _LINUX_MEMORY_FAILURE_H + +#include <linux/interval_tree.h> + +struct pfn_address_space; + +struct pfn_address_space { + struct interval_tree_node node; + struct address_space *mapping; +}; + +int register_pfn_address_space(struct pfn_address_space *pfn_space); +void unregister_pfn_address_space(struct pfn_address_space *pfn_space); + +#endif /* _LINUX_MEMORY_FAILURE_H */ diff --git a/include/linux/memory.h b/include/linux/memory.h index ba1515160894..faeaa921e55b 100644 --- a/include/linux/memory.h +++ b/include/linux/memory.h @@ -64,9 +64,19 @@ struct memory_group { }; }; +enum memory_block_state { + /* These states are exposed to userspace as text strings in sysfs */ + MEM_ONLINE, /* exposed to userspace */ + MEM_GOING_OFFLINE, /* exposed to userspace */ + MEM_OFFLINE, /* exposed to userspace */ + MEM_GOING_ONLINE, + MEM_CANCEL_ONLINE, + MEM_CANCEL_OFFLINE, +}; + struct memory_block { unsigned long start_section_nr; - unsigned long state; /* serialized by the dev->lock */ + enum memory_block_state state; /* serialized by the dev->lock */ int online_type; /* for passing data to online routine */ int nid; /* NID for this memory block */ /* @@ -89,14 +99,6 @@ int arch_get_memory_phys_device(unsigned long start_pfn); unsigned long memory_block_size_bytes(void); int set_memory_block_size_order(unsigned int order); -/* These states are exposed to userspace as text strings in sysfs */ -#define MEM_ONLINE (1<<0) /* exposed to userspace */ -#define MEM_GOING_OFFLINE (1<<1) /* exposed to userspace */ -#define MEM_OFFLINE (1<<2) /* exposed to userspace */ -#define MEM_GOING_ONLINE (1<<3) -#define MEM_CANCEL_ONLINE (1<<4) -#define MEM_CANCEL_OFFLINE (1<<5) - struct memory_notify { unsigned long start_pfn; unsigned long nr_pages; @@ -130,7 +132,7 @@ static inline int register_memory_notifier(struct notifier_block *nb) static inline void unregister_memory_notifier(struct notifier_block *nb) { } -static inline int memory_notify(unsigned long val, void *v) +static inline int memory_notify(enum memory_block_state state, void *v) { return 0; } @@ -154,7 +156,7 @@ int create_memory_block_devices(unsigned long start, unsigned long size, struct memory_group *group); void remove_memory_block_devices(unsigned long start, unsigned long size); extern void memory_dev_init(void); -extern int memory_notify(unsigned long val, void *v); +extern int memory_notify(enum memory_block_state state, void *v); extern struct memory_block *find_memory_block(unsigned long section_nr); typedef int (*walk_memory_blocks_func_t)(struct memory_block *, void *); extern int walk_memory_blocks(unsigned long start, unsigned long size, diff --git a/include/linux/memremap.h b/include/linux/memremap.h index 30c7aecbd245..713ec0435b48 100644 --- a/include/linux/memremap.h +++ b/include/linux/memremap.h @@ -76,11 +76,11 @@ enum memory_type { struct dev_pagemap_ops { /* - * Called once the page refcount reaches 0. The reference count will be + * Called once the folio refcount reaches 0. The reference count will be * reset to one by the core code after the method is called to prepare - * for handing out the page again. + * for handing out the folio again. */ - void (*page_free)(struct page *page); + void (*folio_free)(struct folio *folio); /* * Used for private (un-addressable) device memory only. Must migrate @@ -99,6 +99,13 @@ struct dev_pagemap_ops { */ int (*memory_failure)(struct dev_pagemap *pgmap, unsigned long pfn, unsigned long nr_pages, int mf_flags); + + /* + * Used for private (un-addressable) device memory only. + * This callback is used when a folio is split into + * a smaller folio + */ + void (*folio_split)(struct folio *head, struct folio *tail); }; #define PGMAP_ALTMAP_VALID (1 << 0) @@ -176,6 +183,18 @@ static inline bool folio_is_pci_p2pdma(const struct folio *folio) folio->pgmap->type == MEMORY_DEVICE_PCI_P2PDMA; } +static inline void *folio_zone_device_data(const struct folio *folio) +{ + VM_WARN_ON_FOLIO(!folio_is_device_private(folio), folio); + return folio->page.zone_device_data; +} + +static inline void folio_set_zone_device_data(struct folio *folio, void *data) +{ + VM_WARN_ON_FOLIO(!folio_is_device_private(folio), folio); + folio->page.zone_device_data = data; +} + static inline bool is_pci_p2pdma_page(const struct page *page) { return IS_ENABLED(CONFIG_PCI_P2PDMA) && @@ -205,7 +224,7 @@ static inline bool is_fsdax_page(const struct page *page) } #ifdef CONFIG_ZONE_DEVICE -void zone_device_page_init(struct page *page); +void zone_device_page_init(struct page *page, unsigned int order); void *memremap_pages(struct dev_pagemap *pgmap, int nid); void memunmap_pages(struct dev_pagemap *pgmap); void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap); @@ -214,6 +233,31 @@ struct dev_pagemap *get_dev_pagemap(unsigned long pfn); bool pgmap_pfn_valid(struct dev_pagemap *pgmap, unsigned long pfn); unsigned long memremap_compat_align(void); + +static inline void zone_device_folio_init(struct folio *folio, unsigned int order) +{ + zone_device_page_init(&folio->page, order); + if (order) + folio_set_large_rmappable(folio); +} + +static inline void zone_device_private_split_cb(struct folio *original_folio, + struct folio *new_folio) +{ + if (folio_is_device_private(original_folio)) { + if (!original_folio->pgmap->ops->folio_split) { + if (new_folio) { + new_folio->pgmap = original_folio->pgmap; + new_folio->page.mapping = + original_folio->page.mapping; + } + } else { + original_folio->pgmap->ops->folio_split(original_folio, + new_folio); + } + } +} + #else static inline void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap) @@ -247,6 +291,11 @@ static inline unsigned long memremap_compat_align(void) { return PAGE_SIZE; } + +static inline void zone_device_private_split_cb(struct folio *original_folio, + struct folio *new_folio) +{ +} #endif /* CONFIG_ZONE_DEVICE */ static inline void put_dev_pagemap(struct dev_pagemap *pgmap) diff --git a/include/linux/migrate.h b/include/linux/migrate.h index 1f0ac122c3bf..26ca00c325d9 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -65,7 +65,7 @@ bool isolate_folio_to_list(struct folio *folio, struct list_head *list); int migrate_huge_page_move_mapping(struct address_space *mapping, struct folio *dst, struct folio *src); -void migration_entry_wait_on_locked(swp_entry_t entry, spinlock_t *ptl) +void migration_entry_wait_on_locked(softleaf_t entry, spinlock_t *ptl) __releases(ptl); void folio_migrate_flags(struct folio *newfolio, struct folio *folio); int folio_migrate_mapping(struct address_space *mapping, @@ -125,6 +125,7 @@ static inline int migrate_misplaced_folio(struct folio *folio, int node) #define MIGRATE_PFN_VALID (1UL << 0) #define MIGRATE_PFN_MIGRATE (1UL << 1) #define MIGRATE_PFN_WRITE (1UL << 3) +#define MIGRATE_PFN_COMPOUND (1UL << 4) #define MIGRATE_PFN_SHIFT 6 static inline struct page *migrate_pfn_to_page(unsigned long mpfn) @@ -143,6 +144,7 @@ enum migrate_vma_direction { MIGRATE_VMA_SELECT_SYSTEM = 1 << 0, MIGRATE_VMA_SELECT_DEVICE_PRIVATE = 1 << 1, MIGRATE_VMA_SELECT_DEVICE_COHERENT = 1 << 2, + MIGRATE_VMA_SELECT_COMPOUND = 1 << 3, }; struct migrate_vma { diff --git a/include/linux/mm.h b/include/linux/mm.h index 8dc0a07570cc..7a1819c20643 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -105,6 +105,8 @@ extern int mmap_rnd_compat_bits __read_mostly; # endif #endif +#define INVALID_PHYS_ADDR (~(phys_addr_t)0) + #include <asm/page.h> #include <asm/processor.h> @@ -273,178 +275,235 @@ extern unsigned int kobjsize(const void *objp); * vm_flags in vm_area_struct, see mm_types.h. * When changing, update also include/trace/events/mmflags.h */ -#define VM_NONE 0x00000000 -#define VM_READ 0x00000001 /* currently active flags */ -#define VM_WRITE 0x00000002 -#define VM_EXEC 0x00000004 -#define VM_SHARED 0x00000008 +#define VM_NONE 0x00000000 -/* mprotect() hardcodes VM_MAYREAD >> 4 == VM_READ, and so for r/w/x bits. */ -#define VM_MAYREAD 0x00000010 /* limits for mprotect() etc */ -#define VM_MAYWRITE 0x00000020 -#define VM_MAYEXEC 0x00000040 -#define VM_MAYSHARE 0x00000080 +/** + * typedef vma_flag_t - specifies an individual VMA flag by bit number. + * + * This value is made type safe by sparse to avoid passing invalid flag values + * around. + */ +typedef int __bitwise vma_flag_t; -#define VM_GROWSDOWN 0x00000100 /* general info on the segment */ +#define DECLARE_VMA_BIT(name, bitnum) \ + VMA_ ## name ## _BIT = ((__force vma_flag_t)bitnum) +#define DECLARE_VMA_BIT_ALIAS(name, aliased) \ + VMA_ ## name ## _BIT = (VMA_ ## aliased ## _BIT) +enum { + DECLARE_VMA_BIT(READ, 0), + DECLARE_VMA_BIT(WRITE, 1), + DECLARE_VMA_BIT(EXEC, 2), + DECLARE_VMA_BIT(SHARED, 3), + /* mprotect() hardcodes VM_MAYREAD >> 4 == VM_READ, and so for r/w/x bits. */ + DECLARE_VMA_BIT(MAYREAD, 4), /* limits for mprotect() etc. */ + DECLARE_VMA_BIT(MAYWRITE, 5), + DECLARE_VMA_BIT(MAYEXEC, 6), + DECLARE_VMA_BIT(MAYSHARE, 7), + DECLARE_VMA_BIT(GROWSDOWN, 8), /* general info on the segment */ #ifdef CONFIG_MMU -#define VM_UFFD_MISSING 0x00000200 /* missing pages tracking */ -#else /* CONFIG_MMU */ -#define VM_MAYOVERLAY 0x00000200 /* nommu: R/O MAP_PRIVATE mapping that might overlay a file mapping */ -#define VM_UFFD_MISSING 0 + DECLARE_VMA_BIT(UFFD_MISSING, 9),/* missing pages tracking */ +#else + /* nommu: R/O MAP_PRIVATE mapping that might overlay a file mapping */ + DECLARE_VMA_BIT(MAYOVERLAY, 9), #endif /* CONFIG_MMU */ -#define VM_PFNMAP 0x00000400 /* Page-ranges managed without "struct page", just pure PFN */ -#define VM_UFFD_WP 0x00001000 /* wrprotect pages tracking */ - -#define VM_LOCKED 0x00002000 -#define VM_IO 0x00004000 /* Memory mapped I/O or similar */ - - /* Used by sys_madvise() */ -#define VM_SEQ_READ 0x00008000 /* App will access data sequentially */ -#define VM_RAND_READ 0x00010000 /* App will not benefit from clustered reads */ - -#define VM_DONTCOPY 0x00020000 /* Do not copy this vma on fork */ -#define VM_DONTEXPAND 0x00040000 /* Cannot expand with mremap() */ -#define VM_LOCKONFAULT 0x00080000 /* Lock the pages covered when they are faulted in */ -#define VM_ACCOUNT 0x00100000 /* Is a VM accounted object */ -#define VM_NORESERVE 0x00200000 /* should the VM suppress accounting */ -#define VM_HUGETLB 0x00400000 /* Huge TLB Page VM */ -#define VM_SYNC 0x00800000 /* Synchronous page faults */ -#define VM_ARCH_1 0x01000000 /* Architecture-specific flag */ -#define VM_WIPEONFORK 0x02000000 /* Wipe VMA contents in child. */ -#define VM_DONTDUMP 0x04000000 /* Do not include in the core dump */ - + /* Page-ranges managed without "struct page", just pure PFN */ + DECLARE_VMA_BIT(PFNMAP, 10), + DECLARE_VMA_BIT(MAYBE_GUARD, 11), + DECLARE_VMA_BIT(UFFD_WP, 12), /* wrprotect pages tracking */ + DECLARE_VMA_BIT(LOCKED, 13), + DECLARE_VMA_BIT(IO, 14), /* Memory mapped I/O or similar */ + DECLARE_VMA_BIT(SEQ_READ, 15), /* App will access data sequentially */ + DECLARE_VMA_BIT(RAND_READ, 16), /* App will not benefit from clustered reads */ + DECLARE_VMA_BIT(DONTCOPY, 17), /* Do not copy this vma on fork */ + DECLARE_VMA_BIT(DONTEXPAND, 18),/* Cannot expand with mremap() */ + DECLARE_VMA_BIT(LOCKONFAULT, 19),/* Lock pages covered when faulted in */ + DECLARE_VMA_BIT(ACCOUNT, 20), /* Is a VM accounted object */ + DECLARE_VMA_BIT(NORESERVE, 21), /* should the VM suppress accounting */ + DECLARE_VMA_BIT(HUGETLB, 22), /* Huge TLB Page VM */ + DECLARE_VMA_BIT(SYNC, 23), /* Synchronous page faults */ + DECLARE_VMA_BIT(ARCH_1, 24), /* Architecture-specific flag */ + DECLARE_VMA_BIT(WIPEONFORK, 25),/* Wipe VMA contents in child. */ + DECLARE_VMA_BIT(DONTDUMP, 26), /* Do not include in the core dump */ + DECLARE_VMA_BIT(SOFTDIRTY, 27), /* NOT soft dirty clean area */ + DECLARE_VMA_BIT(MIXEDMAP, 28), /* Can contain struct page and pure PFN pages */ + DECLARE_VMA_BIT(HUGEPAGE, 29), /* MADV_HUGEPAGE marked this vma */ + DECLARE_VMA_BIT(NOHUGEPAGE, 30),/* MADV_NOHUGEPAGE marked this vma */ + DECLARE_VMA_BIT(MERGEABLE, 31), /* KSM may merge identical pages */ + /* These bits are reused, we define specific uses below. */ + DECLARE_VMA_BIT(HIGH_ARCH_0, 32), + DECLARE_VMA_BIT(HIGH_ARCH_1, 33), + DECLARE_VMA_BIT(HIGH_ARCH_2, 34), + DECLARE_VMA_BIT(HIGH_ARCH_3, 35), + DECLARE_VMA_BIT(HIGH_ARCH_4, 36), + DECLARE_VMA_BIT(HIGH_ARCH_5, 37), + DECLARE_VMA_BIT(HIGH_ARCH_6, 38), + /* + * This flag is used to connect VFIO to arch specific KVM code. It + * indicates that the memory under this VMA is safe for use with any + * non-cachable memory type inside KVM. Some VFIO devices, on some + * platforms, are thought to be unsafe and can cause machine crashes + * if KVM does not lock down the memory type. + */ + DECLARE_VMA_BIT(ALLOW_ANY_UNCACHED, 39), +#ifdef CONFIG_PPC32 + DECLARE_VMA_BIT_ALIAS(DROPPABLE, ARCH_1), +#else + DECLARE_VMA_BIT(DROPPABLE, 40), +#endif + DECLARE_VMA_BIT(UFFD_MINOR, 41), + DECLARE_VMA_BIT(SEALED, 42), + /* Flags that reuse flags above. */ + DECLARE_VMA_BIT_ALIAS(PKEY_BIT0, HIGH_ARCH_0), + DECLARE_VMA_BIT_ALIAS(PKEY_BIT1, HIGH_ARCH_1), + DECLARE_VMA_BIT_ALIAS(PKEY_BIT2, HIGH_ARCH_2), + DECLARE_VMA_BIT_ALIAS(PKEY_BIT3, HIGH_ARCH_3), + DECLARE_VMA_BIT_ALIAS(PKEY_BIT4, HIGH_ARCH_4), +#if defined(CONFIG_X86_USER_SHADOW_STACK) + /* + * VM_SHADOW_STACK should not be set with VM_SHARED because of lack of + * support core mm. + * + * These VMAs will get a single end guard page. This helps userspace + * protect itself from attacks. A single page is enough for current + * shadow stack archs (x86). See the comments near alloc_shstk() in + * arch/x86/kernel/shstk.c for more details on the guard size. + */ + DECLARE_VMA_BIT_ALIAS(SHADOW_STACK, HIGH_ARCH_5), +#elif defined(CONFIG_ARM64_GCS) + /* + * arm64's Guarded Control Stack implements similar functionality and + * has similar constraints to shadow stacks. + */ + DECLARE_VMA_BIT_ALIAS(SHADOW_STACK, HIGH_ARCH_6), +#endif + DECLARE_VMA_BIT_ALIAS(SAO, ARCH_1), /* Strong Access Ordering (powerpc) */ + DECLARE_VMA_BIT_ALIAS(GROWSUP, ARCH_1), /* parisc */ + DECLARE_VMA_BIT_ALIAS(SPARC_ADI, ARCH_1), /* sparc64 */ + DECLARE_VMA_BIT_ALIAS(ARM64_BTI, ARCH_1), /* arm64 */ + DECLARE_VMA_BIT_ALIAS(ARCH_CLEAR, ARCH_1), /* sparc64, arm64 */ + DECLARE_VMA_BIT_ALIAS(MAPPED_COPY, ARCH_1), /* !CONFIG_MMU */ + DECLARE_VMA_BIT_ALIAS(MTE, HIGH_ARCH_4), /* arm64 */ + DECLARE_VMA_BIT_ALIAS(MTE_ALLOWED, HIGH_ARCH_5),/* arm64 */ +#ifdef CONFIG_STACK_GROWSUP + DECLARE_VMA_BIT_ALIAS(STACK, GROWSUP), + DECLARE_VMA_BIT_ALIAS(STACK_EARLY, GROWSDOWN), +#else + DECLARE_VMA_BIT_ALIAS(STACK, GROWSDOWN), +#endif +}; +#undef DECLARE_VMA_BIT +#undef DECLARE_VMA_BIT_ALIAS + +#define INIT_VM_FLAG(name) BIT((__force int) VMA_ ## name ## _BIT) +#define VM_READ INIT_VM_FLAG(READ) +#define VM_WRITE INIT_VM_FLAG(WRITE) +#define VM_EXEC INIT_VM_FLAG(EXEC) +#define VM_SHARED INIT_VM_FLAG(SHARED) +#define VM_MAYREAD INIT_VM_FLAG(MAYREAD) +#define VM_MAYWRITE INIT_VM_FLAG(MAYWRITE) +#define VM_MAYEXEC INIT_VM_FLAG(MAYEXEC) +#define VM_MAYSHARE INIT_VM_FLAG(MAYSHARE) +#define VM_GROWSDOWN INIT_VM_FLAG(GROWSDOWN) +#ifdef CONFIG_MMU +#define VM_UFFD_MISSING INIT_VM_FLAG(UFFD_MISSING) +#else +#define VM_UFFD_MISSING VM_NONE +#define VM_MAYOVERLAY INIT_VM_FLAG(MAYOVERLAY) +#endif +#define VM_PFNMAP INIT_VM_FLAG(PFNMAP) +#define VM_MAYBE_GUARD INIT_VM_FLAG(MAYBE_GUARD) +#define VM_UFFD_WP INIT_VM_FLAG(UFFD_WP) +#define VM_LOCKED INIT_VM_FLAG(LOCKED) +#define VM_IO INIT_VM_FLAG(IO) +#define VM_SEQ_READ INIT_VM_FLAG(SEQ_READ) +#define VM_RAND_READ INIT_VM_FLAG(RAND_READ) +#define VM_DONTCOPY INIT_VM_FLAG(DONTCOPY) +#define VM_DONTEXPAND INIT_VM_FLAG(DONTEXPAND) +#define VM_LOCKONFAULT INIT_VM_FLAG(LOCKONFAULT) +#define VM_ACCOUNT INIT_VM_FLAG(ACCOUNT) +#define VM_NORESERVE INIT_VM_FLAG(NORESERVE) +#define VM_HUGETLB INIT_VM_FLAG(HUGETLB) +#define VM_SYNC INIT_VM_FLAG(SYNC) +#define VM_ARCH_1 INIT_VM_FLAG(ARCH_1) +#define VM_WIPEONFORK INIT_VM_FLAG(WIPEONFORK) +#define VM_DONTDUMP INIT_VM_FLAG(DONTDUMP) #ifdef CONFIG_MEM_SOFT_DIRTY -# define VM_SOFTDIRTY 0x08000000 /* Not soft dirty clean area */ +#define VM_SOFTDIRTY INIT_VM_FLAG(SOFTDIRTY) #else -# define VM_SOFTDIRTY 0 +#define VM_SOFTDIRTY VM_NONE +#endif +#define VM_MIXEDMAP INIT_VM_FLAG(MIXEDMAP) +#define VM_HUGEPAGE INIT_VM_FLAG(HUGEPAGE) +#define VM_NOHUGEPAGE INIT_VM_FLAG(NOHUGEPAGE) +#define VM_MERGEABLE INIT_VM_FLAG(MERGEABLE) +#define VM_STACK INIT_VM_FLAG(STACK) +#ifdef CONFIG_STACK_GROWS_UP +#define VM_STACK_EARLY INIT_VM_FLAG(STACK_EARLY) +#else +#define VM_STACK_EARLY VM_NONE #endif - -#define VM_MIXEDMAP 0x10000000 /* Can contain "struct page" and pure PFN pages */ -#define VM_HUGEPAGE 0x20000000 /* MADV_HUGEPAGE marked this vma */ -#define VM_NOHUGEPAGE 0x40000000 /* MADV_NOHUGEPAGE marked this vma */ -#define VM_MERGEABLE BIT(31) /* KSM may merge identical pages */ - -#ifdef CONFIG_ARCH_USES_HIGH_VMA_FLAGS -#define VM_HIGH_ARCH_BIT_0 32 /* bit only usable on 64-bit architectures */ -#define VM_HIGH_ARCH_BIT_1 33 /* bit only usable on 64-bit architectures */ -#define VM_HIGH_ARCH_BIT_2 34 /* bit only usable on 64-bit architectures */ -#define VM_HIGH_ARCH_BIT_3 35 /* bit only usable on 64-bit architectures */ -#define VM_HIGH_ARCH_BIT_4 36 /* bit only usable on 64-bit architectures */ -#define VM_HIGH_ARCH_BIT_5 37 /* bit only usable on 64-bit architectures */ -#define VM_HIGH_ARCH_BIT_6 38 /* bit only usable on 64-bit architectures */ -#define VM_HIGH_ARCH_0 BIT(VM_HIGH_ARCH_BIT_0) -#define VM_HIGH_ARCH_1 BIT(VM_HIGH_ARCH_BIT_1) -#define VM_HIGH_ARCH_2 BIT(VM_HIGH_ARCH_BIT_2) -#define VM_HIGH_ARCH_3 BIT(VM_HIGH_ARCH_BIT_3) -#define VM_HIGH_ARCH_4 BIT(VM_HIGH_ARCH_BIT_4) -#define VM_HIGH_ARCH_5 BIT(VM_HIGH_ARCH_BIT_5) -#define VM_HIGH_ARCH_6 BIT(VM_HIGH_ARCH_BIT_6) -#endif /* CONFIG_ARCH_USES_HIGH_VMA_FLAGS */ - #ifdef CONFIG_ARCH_HAS_PKEYS -# define VM_PKEY_SHIFT VM_HIGH_ARCH_BIT_0 -# define VM_PKEY_BIT0 VM_HIGH_ARCH_0 -# define VM_PKEY_BIT1 VM_HIGH_ARCH_1 -# define VM_PKEY_BIT2 VM_HIGH_ARCH_2 +#define VM_PKEY_SHIFT ((__force int)VMA_HIGH_ARCH_0_BIT) +/* Despite the naming, these are FLAGS not bits. */ +#define VM_PKEY_BIT0 INIT_VM_FLAG(PKEY_BIT0) +#define VM_PKEY_BIT1 INIT_VM_FLAG(PKEY_BIT1) +#define VM_PKEY_BIT2 INIT_VM_FLAG(PKEY_BIT2) #if CONFIG_ARCH_PKEY_BITS > 3 -# define VM_PKEY_BIT3 VM_HIGH_ARCH_3 +#define VM_PKEY_BIT3 INIT_VM_FLAG(PKEY_BIT3) #else -# define VM_PKEY_BIT3 0 -#endif +#define VM_PKEY_BIT3 VM_NONE +#endif /* CONFIG_ARCH_PKEY_BITS > 3 */ #if CONFIG_ARCH_PKEY_BITS > 4 -# define VM_PKEY_BIT4 VM_HIGH_ARCH_4 +#define VM_PKEY_BIT4 INIT_VM_FLAG(PKEY_BIT4) #else -# define VM_PKEY_BIT4 0 -#endif +#define VM_PKEY_BIT4 VM_NONE +#endif /* CONFIG_ARCH_PKEY_BITS > 4 */ #endif /* CONFIG_ARCH_HAS_PKEYS */ - -#ifdef CONFIG_X86_USER_SHADOW_STACK -/* - * VM_SHADOW_STACK should not be set with VM_SHARED because of lack of - * support core mm. - * - * These VMAs will get a single end guard page. This helps userspace protect - * itself from attacks. A single page is enough for current shadow stack archs - * (x86). See the comments near alloc_shstk() in arch/x86/kernel/shstk.c - * for more details on the guard size. - */ -# define VM_SHADOW_STACK VM_HIGH_ARCH_5 -#endif - -#if defined(CONFIG_ARM64_GCS) -/* - * arm64's Guarded Control Stack implements similar functionality and - * has similar constraints to shadow stacks. - */ -# define VM_SHADOW_STACK VM_HIGH_ARCH_6 -#endif - -#ifndef VM_SHADOW_STACK -# define VM_SHADOW_STACK VM_NONE +#if defined(CONFIG_X86_USER_SHADOW_STACK) || defined(CONFIG_ARM64_GCS) +#define VM_SHADOW_STACK INIT_VM_FLAG(SHADOW_STACK) +#else +#define VM_SHADOW_STACK VM_NONE #endif - #if defined(CONFIG_PPC64) -# define VM_SAO VM_ARCH_1 /* Strong Access Ordering (powerpc) */ +#define VM_SAO INIT_VM_FLAG(SAO) #elif defined(CONFIG_PARISC) -# define VM_GROWSUP VM_ARCH_1 +#define VM_GROWSUP INIT_VM_FLAG(GROWSUP) #elif defined(CONFIG_SPARC64) -# define VM_SPARC_ADI VM_ARCH_1 /* Uses ADI tag for access control */ -# define VM_ARCH_CLEAR VM_SPARC_ADI +#define VM_SPARC_ADI INIT_VM_FLAG(SPARC_ADI) +#define VM_ARCH_CLEAR INIT_VM_FLAG(ARCH_CLEAR) #elif defined(CONFIG_ARM64) -# define VM_ARM64_BTI VM_ARCH_1 /* BTI guarded page, a.k.a. GP bit */ -# define VM_ARCH_CLEAR VM_ARM64_BTI +#define VM_ARM64_BTI INIT_VM_FLAG(ARM64_BTI) +#define VM_ARCH_CLEAR INIT_VM_FLAG(ARCH_CLEAR) #elif !defined(CONFIG_MMU) -# define VM_MAPPED_COPY VM_ARCH_1 /* T if mapped copy of data (nommu mmap) */ -#endif - -#if defined(CONFIG_ARM64_MTE) -# define VM_MTE VM_HIGH_ARCH_4 /* Use Tagged memory for access control */ -# define VM_MTE_ALLOWED VM_HIGH_ARCH_5 /* Tagged memory permitted */ -#else -# define VM_MTE VM_NONE -# define VM_MTE_ALLOWED VM_NONE +#define VM_MAPPED_COPY INIT_VM_FLAG(MAPPED_COPY) #endif - #ifndef VM_GROWSUP -# define VM_GROWSUP VM_NONE +#define VM_GROWSUP VM_NONE +#endif +#ifdef CONFIG_ARM64_MTE +#define VM_MTE INIT_VM_FLAG(MTE) +#define VM_MTE_ALLOWED INIT_VM_FLAG(MTE_ALLOWED) +#else +#define VM_MTE VM_NONE +#define VM_MTE_ALLOWED VM_NONE #endif - #ifdef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR -# define VM_UFFD_MINOR_BIT 41 -# define VM_UFFD_MINOR BIT(VM_UFFD_MINOR_BIT) /* UFFD minor faults */ -#else /* !CONFIG_HAVE_ARCH_USERFAULTFD_MINOR */ -# define VM_UFFD_MINOR VM_NONE -#endif /* CONFIG_HAVE_ARCH_USERFAULTFD_MINOR */ - -/* - * This flag is used to connect VFIO to arch specific KVM code. It - * indicates that the memory under this VMA is safe for use with any - * non-cachable memory type inside KVM. Some VFIO devices, on some - * platforms, are thought to be unsafe and can cause machine crashes - * if KVM does not lock down the memory type. - */ -#ifdef CONFIG_64BIT -#define VM_ALLOW_ANY_UNCACHED_BIT 39 -#define VM_ALLOW_ANY_UNCACHED BIT(VM_ALLOW_ANY_UNCACHED_BIT) +#define VM_UFFD_MINOR INIT_VM_FLAG(UFFD_MINOR) #else -#define VM_ALLOW_ANY_UNCACHED VM_NONE +#define VM_UFFD_MINOR VM_NONE #endif - #ifdef CONFIG_64BIT -#define VM_DROPPABLE_BIT 40 -#define VM_DROPPABLE BIT(VM_DROPPABLE_BIT) -#elif defined(CONFIG_PPC32) -#define VM_DROPPABLE VM_ARCH_1 +#define VM_ALLOW_ANY_UNCACHED INIT_VM_FLAG(ALLOW_ANY_UNCACHED) +#define VM_SEALED INIT_VM_FLAG(SEALED) #else -#define VM_DROPPABLE VM_NONE +#define VM_ALLOW_ANY_UNCACHED VM_NONE +#define VM_SEALED VM_NONE #endif - -#ifdef CONFIG_64BIT -#define VM_SEALED_BIT 42 -#define VM_SEALED BIT(VM_SEALED_BIT) +#if defined(CONFIG_64BIT) || defined(CONFIG_PPC32) +#define VM_DROPPABLE INIT_VM_FLAG(DROPPABLE) #else -#define VM_SEALED VM_NONE +#define VM_DROPPABLE VM_NONE #endif /* Bits set in the VMA until the stack is in its final location */ @@ -470,12 +529,10 @@ extern unsigned int kobjsize(const void *objp); #define VM_STARTGAP_FLAGS (VM_GROWSDOWN | VM_SHADOW_STACK) -#ifdef CONFIG_STACK_GROWSUP -#define VM_STACK VM_GROWSUP -#define VM_STACK_EARLY VM_GROWSDOWN +#ifdef CONFIG_MSEAL_SYSTEM_MAPPINGS +#define VM_SEALED_SYSMAP VM_SEALED #else -#define VM_STACK VM_GROWSDOWN -#define VM_STACK_EARLY 0 +#define VM_SEALED_SYSMAP VM_NONE #endif #define VM_STACK_FLAGS (VM_STACK | VM_STACK_DEFAULT_FLAGS | VM_ACCOUNT) @@ -483,12 +540,26 @@ extern unsigned int kobjsize(const void *objp); /* VMA basic access permission flags */ #define VM_ACCESS_FLAGS (VM_READ | VM_WRITE | VM_EXEC) - /* * Special vmas that are non-mergable, non-mlock()able. */ #define VM_SPECIAL (VM_IO | VM_DONTEXPAND | VM_PFNMAP | VM_MIXEDMAP) +/* + * Physically remapped pages are special. Tell the + * rest of the world about it: + * VM_IO tells people not to look at these pages + * (accesses can have side effects). + * VM_PFNMAP tells the core MM that the base pages are just + * raw PFN mappings, and do not have a "struct page" associated + * with them. + * VM_DONTEXPAND + * Disable vma merging and expanding with mremap(). + * VM_DONTDUMP + * Omit vma from core dump, even when VM_IO turned off. + */ +#define VM_REMAP_FLAGS (VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP) + /* This mask prevents VMA from being scanned with khugepaged */ #define VM_NO_KHUGEPAGED (VM_SPECIAL | VM_HUGETLB) @@ -498,13 +569,69 @@ extern unsigned int kobjsize(const void *objp); /* This mask represents all the VMA flag bits used by mlock */ #define VM_LOCKED_MASK (VM_LOCKED | VM_LOCKONFAULT) +/* These flags can be updated atomically via VMA/mmap read lock. */ +#define VM_ATOMIC_SET_ALLOWED VM_MAYBE_GUARD + /* Arch-specific flags to clear when updating VM flags on protection change */ #ifndef VM_ARCH_CLEAR -# define VM_ARCH_CLEAR VM_NONE +#define VM_ARCH_CLEAR VM_NONE #endif #define VM_FLAGS_CLEAR (ARCH_VM_PKEY_FLAGS | VM_ARCH_CLEAR) /* + * Flags which should be 'sticky' on merge - that is, flags which, when one VMA + * possesses it but the other does not, the merged VMA should nonetheless have + * applied to it: + * + * VM_SOFTDIRTY - if a VMA is marked soft-dirty, that is has not had its + * references cleared via /proc/$pid/clear_refs, any merged VMA + * should be considered soft-dirty also as it operates at a VMA + * granularity. + * + * VM_MAYBE_GUARD - If a VMA may have guard regions in place it implies that + * mapped page tables may contain metadata not described by the + * VMA and thus any merged VMA may also contain this metadata, + * and thus we must make this flag sticky. + */ +#define VM_STICKY (VM_SOFTDIRTY | VM_MAYBE_GUARD) + +/* + * VMA flags we ignore for the purposes of merge, i.e. one VMA possessing one + * of these flags and the other not does not preclude a merge. + * + * VM_STICKY - When merging VMAs, VMA flags must match, unless they are + * 'sticky'. If any sticky flags exist in either VMA, we simply + * set all of them on the merged VMA. + */ +#define VM_IGNORE_MERGE VM_STICKY + +/* + * Flags which should result in page tables being copied on fork. These are + * flags which indicate that the VMA maps page tables which cannot be + * reconsistuted upon page fault, so necessitate page table copying upon + * + * VM_PFNMAP / VM_MIXEDMAP - These contain kernel-mapped data which cannot be + * reasonably reconstructed on page fault. + * + * VM_UFFD_WP - Encodes metadata about an installed uffd + * write protect handler, which cannot be + * reconstructed on page fault. + * + * We always copy pgtables when dst_vma has uffd-wp + * enabled even if it's file-backed + * (e.g. shmem). Because when uffd-wp is enabled, + * pgtable contains uffd-wp protection information, + * that's something we can't retrieve from page cache, + * and skip copying will lose those info. + * + * VM_MAYBE_GUARD - Could contain page guard region markers which + * by design are a property of the page tables + * only and thus cannot be reconstructed on page + * fault. + */ +#define VM_COPY_ON_FORK (VM_PFNMAP | VM_MIXEDMAP | VM_UFFD_WP | VM_MAYBE_GUARD) + +/* * mapping from the currently active vm_flags protection bits (the * low four bits) to a page protection mask.. */ @@ -783,7 +910,9 @@ static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm) static inline void vm_flags_init(struct vm_area_struct *vma, vm_flags_t flags) { - ACCESS_PRIVATE(vma, __vm_flags) = flags; + VM_WARN_ON_ONCE(!pgtable_supports_soft_dirty() && (flags & VM_SOFTDIRTY)); + vma_flags_clear_all(&vma->flags); + vma_flags_overwrite_word(&vma->flags, flags); } /* @@ -794,6 +923,7 @@ static inline void vm_flags_init(struct vm_area_struct *vma, static inline void vm_flags_reset(struct vm_area_struct *vma, vm_flags_t flags) { + VM_WARN_ON_ONCE(!pgtable_supports_soft_dirty() && (flags & VM_SOFTDIRTY)); vma_assert_write_locked(vma); vm_flags_init(vma, flags); } @@ -802,21 +932,33 @@ static inline void vm_flags_reset_once(struct vm_area_struct *vma, vm_flags_t flags) { vma_assert_write_locked(vma); - WRITE_ONCE(ACCESS_PRIVATE(vma, __vm_flags), flags); + /* + * If VMA flags exist beyond the first system word, also clear these. It + * is assumed the write once behaviour is required only for the first + * system word. + */ + if (NUM_VMA_FLAG_BITS > BITS_PER_LONG) { + unsigned long *bitmap = ACCESS_PRIVATE(&vma->flags, __vma_flags); + + bitmap_zero(&bitmap[1], NUM_VMA_FLAG_BITS - BITS_PER_LONG); + } + + vma_flags_overwrite_word_once(&vma->flags, flags); } static inline void vm_flags_set(struct vm_area_struct *vma, vm_flags_t flags) { vma_start_write(vma); - ACCESS_PRIVATE(vma, __vm_flags) |= flags; + vma_flags_set_word(&vma->flags, flags); } static inline void vm_flags_clear(struct vm_area_struct *vma, vm_flags_t flags) { + VM_WARN_ON_ONCE(!pgtable_supports_soft_dirty() && (flags & VM_SOFTDIRTY)); vma_start_write(vma); - ACCESS_PRIVATE(vma, __vm_flags) &= ~flags; + vma_flags_clear_word(&vma->flags, flags); } /* @@ -840,6 +982,51 @@ static inline void vm_flags_mod(struct vm_area_struct *vma, __vm_flags_mod(vma, set, clear); } +static inline bool __vma_flag_atomic_valid(struct vm_area_struct *vma, + vma_flag_t bit) +{ + const vm_flags_t mask = BIT((__force int)bit); + + /* Only specific flags are permitted */ + if (WARN_ON_ONCE(!(mask & VM_ATOMIC_SET_ALLOWED))) + return false; + + return true; +} + +/* + * Set VMA flag atomically. Requires only VMA/mmap read lock. Only specific + * valid flags are allowed to do this. + */ +static inline void vma_flag_set_atomic(struct vm_area_struct *vma, + vma_flag_t bit) +{ + unsigned long *bitmap = ACCESS_PRIVATE(&vma->flags, __vma_flags); + + /* mmap read lock/VMA read lock must be held. */ + if (!rwsem_is_locked(&vma->vm_mm->mmap_lock)) + vma_assert_locked(vma); + + if (__vma_flag_atomic_valid(vma, bit)) + set_bit((__force int)bit, bitmap); +} + +/* + * Test for VMA flag atomically. Requires no locks. Only specific valid flags + * are allowed to do this. + * + * This is necessarily racey, so callers must ensure that serialisation is + * achieved through some other means, or that races are permissible. + */ +static inline bool vma_flag_test_atomic(struct vm_area_struct *vma, + vma_flag_t bit) +{ + if (__vma_flag_atomic_valid(vma, bit)) + return test_bit((__force int)bit, &vma->vm_flags); + + return false; +} + static inline void vma_set_anonymous(struct vm_area_struct *vma) { vma->vm_ops = NULL; @@ -2438,7 +2625,7 @@ static inline void zap_vma_pages(struct vm_area_struct *vma) } void unmap_vmas(struct mmu_gather *tlb, struct ma_state *mas, struct vm_area_struct *start_vma, unsigned long start, - unsigned long end, unsigned long tree_end, bool mm_wr_locked); + unsigned long end, unsigned long tree_end); struct mmu_notifier_range; @@ -2922,6 +3109,7 @@ static inline pmd_t *pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long a #endif /* CONFIG_MMU */ enum pt_flags { + PT_kernel = PG_referenced, PT_reserved = PG_reserved, /* High bits are used for zone/node/section */ }; @@ -2948,6 +3136,46 @@ static inline bool pagetable_is_reserved(struct ptdesc *pt) } /** + * ptdesc_set_kernel - Mark a ptdesc used to map the kernel + * @ptdesc: The ptdesc to be marked + * + * Kernel page tables often need special handling. Set a flag so that + * the handling code knows this ptdesc will not be used for userspace. + */ +static inline void ptdesc_set_kernel(struct ptdesc *ptdesc) +{ + set_bit(PT_kernel, &ptdesc->pt_flags.f); +} + +/** + * ptdesc_clear_kernel - Mark a ptdesc as no longer used to map the kernel + * @ptdesc: The ptdesc to be unmarked + * + * Use when the ptdesc is no longer used to map the kernel and no longer + * needs special handling. + */ +static inline void ptdesc_clear_kernel(struct ptdesc *ptdesc) +{ + /* + * Note: the 'PG_referenced' bit does not strictly need to be + * cleared before freeing the page. But this is nice for + * symmetry. + */ + clear_bit(PT_kernel, &ptdesc->pt_flags.f); +} + +/** + * ptdesc_test_kernel - Check if a ptdesc is used to map the kernel + * @ptdesc: The ptdesc being tested + * + * Call to tell if the ptdesc used to map the kernel. + */ +static inline bool ptdesc_test_kernel(const struct ptdesc *ptdesc) +{ + return test_bit(PT_kernel, &ptdesc->pt_flags.f); +} + +/** * pagetable_alloc - Allocate pagetables * @gfp: GFP flags * @order: desired pagetable order @@ -2965,6 +3193,21 @@ static inline struct ptdesc *pagetable_alloc_noprof(gfp_t gfp, unsigned int orde } #define pagetable_alloc(...) alloc_hooks(pagetable_alloc_noprof(__VA_ARGS__)) +static inline void __pagetable_free(struct ptdesc *pt) +{ + struct page *page = ptdesc_page(pt); + + __free_pages(page, compound_order(page)); +} + +#ifdef CONFIG_ASYNC_KERNEL_PGTABLE_FREE +void pagetable_free_kernel(struct ptdesc *pt); +#else +static inline void pagetable_free_kernel(struct ptdesc *pt) +{ + __pagetable_free(pt); +} +#endif /** * pagetable_free - Free pagetables * @pt: The page table descriptor @@ -2974,9 +3217,12 @@ static inline struct ptdesc *pagetable_alloc_noprof(gfp_t gfp, unsigned int orde */ static inline void pagetable_free(struct ptdesc *pt) { - struct page *page = ptdesc_page(pt); - - __free_pages(page, compound_order(page)); + if (ptdesc_test_kernel(pt)) { + ptdesc_clear_kernel(pt); + pagetable_free_kernel(pt); + } else { + __pagetable_free(pt); + } } #if defined(CONFIG_SPLIT_PTE_PTLOCKS) @@ -3560,6 +3806,90 @@ static inline unsigned long vma_pages(const struct vm_area_struct *vma) return (vma->vm_end - vma->vm_start) >> PAGE_SHIFT; } +static inline unsigned long vma_desc_size(const struct vm_area_desc *desc) +{ + return desc->end - desc->start; +} + +static inline unsigned long vma_desc_pages(const struct vm_area_desc *desc) +{ + return vma_desc_size(desc) >> PAGE_SHIFT; +} + +/** + * mmap_action_remap - helper for mmap_prepare hook to specify that a pure PFN + * remap is required. + * @desc: The VMA descriptor for the VMA requiring remap. + * @start: The virtual address to start the remap from, must be within the VMA. + * @start_pfn: The first PFN in the range to remap. + * @size: The size of the range to remap, in bytes, at most spanning to the end + * of the VMA. + */ +static inline void mmap_action_remap(struct vm_area_desc *desc, + unsigned long start, + unsigned long start_pfn, + unsigned long size) +{ + struct mmap_action *action = &desc->action; + + /* [start, start + size) must be within the VMA. */ + WARN_ON_ONCE(start < desc->start || start >= desc->end); + WARN_ON_ONCE(start + size > desc->end); + + action->type = MMAP_REMAP_PFN; + action->remap.start = start; + action->remap.start_pfn = start_pfn; + action->remap.size = size; + action->remap.pgprot = desc->page_prot; +} + +/** + * mmap_action_remap_full - helper for mmap_prepare hook to specify that the + * entirety of a VMA should be PFN remapped. + * @desc: The VMA descriptor for the VMA requiring remap. + * @start_pfn: The first PFN in the range to remap. + */ +static inline void mmap_action_remap_full(struct vm_area_desc *desc, + unsigned long start_pfn) +{ + mmap_action_remap(desc, desc->start, start_pfn, vma_desc_size(desc)); +} + +/** + * mmap_action_ioremap - helper for mmap_prepare hook to specify that a pure PFN + * I/O remap is required. + * @desc: The VMA descriptor for the VMA requiring remap. + * @start: The virtual address to start the remap from, must be within the VMA. + * @start_pfn: The first PFN in the range to remap. + * @size: The size of the range to remap, in bytes, at most spanning to the end + * of the VMA. + */ +static inline void mmap_action_ioremap(struct vm_area_desc *desc, + unsigned long start, + unsigned long start_pfn, + unsigned long size) +{ + mmap_action_remap(desc, start, start_pfn, size); + desc->action.type = MMAP_IO_REMAP_PFN; +} + +/** + * mmap_action_ioremap_full - helper for mmap_prepare hook to specify that the + * entirety of a VMA should be PFN I/O remapped. + * @desc: The VMA descriptor for the VMA requiring remap. + * @start_pfn: The first PFN in the range to remap. + */ +static inline void mmap_action_ioremap_full(struct vm_area_desc *desc, + unsigned long start_pfn) +{ + mmap_action_ioremap(desc, desc->start, start_pfn, vma_desc_size(desc)); +} + +void mmap_action_prepare(struct mmap_action *action, + struct vm_area_desc *desc); +int mmap_action_complete(struct mmap_action *action, + struct vm_area_struct *vma); + /* Look up the first VMA which exactly match the interval vm_start ... vm_end */ static inline struct vm_area_struct *find_exact_vma(struct mm_struct *mm, unsigned long vm_start, unsigned long vm_end) @@ -3601,10 +3931,9 @@ unsigned long change_prot_numa(struct vm_area_struct *vma, struct vm_area_struct *find_extend_vma_locked(struct mm_struct *, unsigned long addr); -int remap_pfn_range(struct vm_area_struct *, unsigned long addr, - unsigned long pfn, unsigned long size, pgprot_t); -int remap_pfn_range_notrack(struct vm_area_struct *vma, unsigned long addr, - unsigned long pfn, unsigned long size, pgprot_t prot); +int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, + unsigned long pfn, unsigned long size, pgprot_t pgprot); + int vm_insert_page(struct vm_area_struct *, unsigned long addr, struct page *); int vm_insert_pages(struct vm_area_struct *vma, unsigned long addr, struct page **pages, unsigned long *num); @@ -3637,15 +3966,24 @@ static inline vm_fault_t vmf_insert_page(struct vm_area_struct *vma, return VM_FAULT_NOPAGE; } -#ifndef io_remap_pfn_range -static inline int io_remap_pfn_range(struct vm_area_struct *vma, - unsigned long addr, unsigned long pfn, - unsigned long size, pgprot_t prot) +#ifndef io_remap_pfn_range_pfn +static inline unsigned long io_remap_pfn_range_pfn(unsigned long pfn, + unsigned long size) { - return remap_pfn_range(vma, addr, pfn, size, pgprot_decrypted(prot)); + return pfn; } #endif +static inline int io_remap_pfn_range(struct vm_area_struct *vma, + unsigned long addr, unsigned long orig_pfn, + unsigned long size, pgprot_t orig_prot) +{ + const unsigned long pfn = io_remap_pfn_range_pfn(orig_pfn, size); + const pgprot_t prot = pgprot_decrypted(orig_prot); + + return remap_pfn_range(vma, addr, pfn, size, prot); +} + static inline vm_fault_t vmf_error(int err) { if (err == -ENOMEM) @@ -4094,6 +4432,7 @@ enum mf_action_page_type { MF_MSG_DAX, MF_MSG_UNSPLIT_THP, MF_MSG_ALREADY_POISONED, + MF_MSG_PFN_MAP, MF_MSG_UNKNOWN, }; @@ -4222,16 +4561,6 @@ int arch_get_shadow_stack_status(struct task_struct *t, unsigned long __user *st int arch_set_shadow_stack_status(struct task_struct *t, unsigned long status); int arch_lock_shadow_stack_status(struct task_struct *t, unsigned long status); - -/* - * mseal of userspace process's system mappings. - */ -#ifdef CONFIG_MSEAL_SYSTEM_MAPPINGS -#define VM_SEALED_SYSMAP VM_SEALED -#else -#define VM_SEALED_SYSMAP VM_NONE -#endif - /* * DMA mapping IDs for page_pool * diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h index f6a2b2d20016..fa2d6ba811b5 100644 --- a/include/linux/mm_inline.h +++ b/include/linux/mm_inline.h @@ -8,7 +8,7 @@ #include <linux/swap.h> #include <linux/string.h> #include <linux/userfaultfd_k.h> -#include <linux/swapops.h> +#include <linux/leafops.h> /** * folio_is_file_lru - Should the folio be on a file LRU or anon LRU? @@ -44,7 +44,7 @@ static __always_inline void __update_lru_size(struct lruvec *lruvec, lockdep_assert_held(&lruvec->lru_lock); WARN_ON_ONCE(nr_pages != (int)nr_pages); - __mod_lruvec_state(lruvec, NR_LRU_BASE + lru, nr_pages); + mod_lruvec_state(lruvec, NR_LRU_BASE + lru, nr_pages); __mod_zone_page_state(&pgdat->node_zones[zid], NR_ZONE_LRU_BASE + lru, nr_pages); } @@ -541,9 +541,9 @@ static inline bool mm_tlb_flush_nested(const struct mm_struct *mm) * The caller should insert a new pte created with make_pte_marker(). */ static inline pte_marker copy_pte_marker( - swp_entry_t entry, struct vm_area_struct *dst_vma) + softleaf_t entry, struct vm_area_struct *dst_vma) { - pte_marker srcm = pte_marker_get(entry); + const pte_marker srcm = softleaf_to_marker(entry); /* Always copy error entries. */ pte_marker dstm = srcm & (PTE_MARKER_POISONED | PTE_MARKER_GUARD); @@ -553,7 +553,6 @@ static inline pte_marker copy_pte_marker( return dstm; } -#endif /* * If this pte is wr-protected by uffd-wp in any form, arm the special pte to @@ -571,9 +570,11 @@ static inline bool pte_install_uffd_wp_if_needed(struct vm_area_struct *vma, unsigned long addr, pte_t *pte, pte_t pteval) { -#ifdef CONFIG_PTE_MARKER_UFFD_WP bool arm_uffd_pte = false; + if (!uffd_supports_wp_marker()) + return false; + /* The current status of the pte should be "cleared" before calling */ WARN_ON_ONCE(!pte_none(ptep_get(pte))); @@ -602,7 +603,7 @@ pte_install_uffd_wp_if_needed(struct vm_area_struct *vma, unsigned long addr, make_pte_marker(PTE_MARKER_UFFD_WP)); return true; } -#endif + return false; } @@ -616,6 +617,7 @@ static inline bool vma_has_recency(const struct vm_area_struct *vma) return true; } +#endif /** * num_pages_contiguous() - determine the number of contiguous pages diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 3b7d05e7169c..9f6de068295d 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -286,6 +286,31 @@ typedef struct { unsigned long val; } swp_entry_t; +/** + * typedef softleaf_t - Describes a page table software leaf entry, abstracted + * from its architecture-specific encoding. + * + * Page table leaf entries are those which do not reference any descendent page + * tables but rather either reference a data page, are an empty (or 'none' + * entry), or contain a non-present entry. + * + * If referencing another page table or a data page then the page table entry is + * pertinent to hardware - that is it tells the hardware how to decode the page + * table entry. + * + * Otherwise it is a software-defined leaf page table entry, which this type + * describes. See leafops.h and specifically @softleaf_type for a list of all + * possible kinds of software leaf entry. + * + * A softleaf_t entry is abstracted from the hardware page table entry, so is + * not architecture-specific. + * + * NOTE: While we transition from the confusing swp_entry_t type used for this + * purpose, we simply alias this type. This will be removed once the + * transition is complete. + */ +typedef swp_entry_t softleaf_t; + #if defined(CONFIG_MEMCG) || defined(CONFIG_SLAB_OBJ_EXT) /* We have some extra room after the refcount in tail pages. */ #define NR_PAGES_IN_LARGE_FOLIO @@ -774,6 +799,65 @@ struct pfnmap_track_ctx { }; #endif +/* What action should be taken after an .mmap_prepare call is complete? */ +enum mmap_action_type { + MMAP_NOTHING, /* Mapping is complete, no further action. */ + MMAP_REMAP_PFN, /* Remap PFN range. */ + MMAP_IO_REMAP_PFN, /* I/O remap PFN range. */ +}; + +/* + * Describes an action an mmap_prepare hook can instruct to be taken to complete + * the mapping of a VMA. Specified in vm_area_desc. + */ +struct mmap_action { + union { + /* Remap range. */ + struct { + unsigned long start; + unsigned long start_pfn; + unsigned long size; + pgprot_t pgprot; + } remap; + }; + enum mmap_action_type type; + + /* + * If specified, this hook is invoked after the selected action has been + * successfully completed. Note that the VMA write lock still held. + * + * The absolute minimum ought to be done here. + * + * Returns 0 on success, or an error code. + */ + int (*success_hook)(const struct vm_area_struct *vma); + + /* + * If specified, this hook is invoked when an error occurred when + * attempting the selection action. + * + * The hook can return an error code in order to filter the error, but + * it is not valid to clear the error here. + */ + int (*error_hook)(int err); + + /* + * This should be set in rare instances where the operation required + * that the rmap should not be able to access the VMA until + * completely set up. + */ + bool hide_from_rmap_until_complete :1; +}; + +/* + * Opaque type representing current VMA (vm_area_struct) flag state. Must be + * accessed via vma_flags_xxx() helper functions. + */ +#define NUM_VMA_FLAG_BITS BITS_PER_LONG +typedef struct { + DECLARE_BITMAP(__vma_flags, NUM_VMA_FLAG_BITS); +} __private vma_flags_t; + /* * Describes a VMA that is about to be mmap()'ed. Drivers may choose to * manipulate mutable fields which will cause those fields to be updated in the @@ -791,12 +875,18 @@ struct vm_area_desc { /* Mutable fields. Populated with initial state. */ pgoff_t pgoff; struct file *vm_file; - vm_flags_t vm_flags; + union { + vm_flags_t vm_flags; + vma_flags_t vma_flags; + }; pgprot_t page_prot; /* Write-only fields. */ const struct vm_operations_struct *vm_ops; void *private_data; + + /* Take further action? */ + struct mmap_action action; }; /* @@ -833,10 +923,12 @@ struct vm_area_struct { /* * Flags, see mm.h. * To modify use vm_flags_{init|reset|set|clear|mod} functions. + * Preferably, use vma_flags_xxx() functions. */ union { + /* Temporary while VMA flags are being converted. */ const vm_flags_t vm_flags; - vm_flags_t __private __vm_flags; + vma_flags_t flags; }; #ifdef CONFIG_PER_VMA_LOCK @@ -917,6 +1009,52 @@ struct vm_area_struct { #endif } __randomize_layout; +/* Clears all bits in the VMA flags bitmap, non-atomically. */ +static inline void vma_flags_clear_all(vma_flags_t *flags) +{ + bitmap_zero(ACCESS_PRIVATE(flags, __vma_flags), NUM_VMA_FLAG_BITS); +} + +/* + * Copy value to the first system word of VMA flags, non-atomically. + * + * IMPORTANT: This does not overwrite bytes past the first system word. The + * caller must account for this. + */ +static inline void vma_flags_overwrite_word(vma_flags_t *flags, unsigned long value) +{ + *ACCESS_PRIVATE(flags, __vma_flags) = value; +} + +/* + * Copy value to the first system word of VMA flags ONCE, non-atomically. + * + * IMPORTANT: This does not overwrite bytes past the first system word. The + * caller must account for this. + */ +static inline void vma_flags_overwrite_word_once(vma_flags_t *flags, unsigned long value) +{ + unsigned long *bitmap = ACCESS_PRIVATE(flags, __vma_flags); + + WRITE_ONCE(*bitmap, value); +} + +/* Update the first system word of VMA flags setting bits, non-atomically. */ +static inline void vma_flags_set_word(vma_flags_t *flags, unsigned long value) +{ + unsigned long *bitmap = ACCESS_PRIVATE(flags, __vma_flags); + + *bitmap |= value; +} + +/* Update the first system word of VMA flags clearing bits, non-atomically. */ +static inline void vma_flags_clear_word(vma_flags_t *flags, unsigned long value) +{ + unsigned long *bitmap = ACCESS_PRIVATE(flags, __vma_flags); + + *bitmap &= ~value; +} + #ifdef CONFIG_NUMA #define vma_policy(vma) ((vma)->vm_policy) #else @@ -1194,15 +1332,13 @@ struct mm_struct { unsigned long cpu_bitmap[]; }; -/* Set the first system word of mm flags, non-atomically. */ -static inline void __mm_flags_set_word(struct mm_struct *mm, unsigned long value) +/* Copy value to the first system word of mm flags, non-atomically. */ +static inline void __mm_flags_overwrite_word(struct mm_struct *mm, unsigned long value) { - unsigned long *bitmap = ACCESS_PRIVATE(&mm->flags, __mm_flags); - - bitmap_copy(bitmap, &value, BITS_PER_LONG); + *ACCESS_PRIVATE(&mm->flags, __mm_flags) = value; } -/* Obtain a read-only view of the bitmap. */ +/* Obtain a read-only view of the mm flags bitmap. */ static inline const unsigned long *__mm_flags_get_bitmap(const struct mm_struct *mm) { return (const unsigned long *)ACCESS_PRIVATE(&mm->flags, __mm_flags); @@ -1211,9 +1347,7 @@ static inline const unsigned long *__mm_flags_get_bitmap(const struct mm_struct /* Read the first system word of mm flags, non-atomically. */ static inline unsigned long __mm_flags_get_word(const struct mm_struct *mm) { - const unsigned long *bitmap = __mm_flags_get_bitmap(mm); - - return bitmap_read(bitmap, 0, BITS_PER_LONG); + return *__mm_flags_get_bitmap(mm); } /* diff --git a/include/linux/mmap_lock.h b/include/linux/mmap_lock.h index 2c9fffa58714..d53f72dba7fe 100644 --- a/include/linux/mmap_lock.h +++ b/include/linux/mmap_lock.h @@ -130,7 +130,7 @@ static inline bool is_vma_writer_only(int refcnt) * a detached vma happens only in vma_mark_detached() and is a rare * case, therefore most of the time there will be no unnecessary wakeup. */ - return refcnt & VMA_LOCK_OFFSET && refcnt <= VMA_LOCK_OFFSET + 1; + return (refcnt & VMA_LOCK_OFFSET) && refcnt <= VMA_LOCK_OFFSET + 1; } static inline void vma_refcount_put(struct vm_area_struct *vma) @@ -183,7 +183,7 @@ static inline void vma_end_read(struct vm_area_struct *vma) } /* WARNING! Can only be used if mmap_lock is expected to be write-locked */ -static bool __is_vma_write_locked(struct vm_area_struct *vma, unsigned int *mm_lock_seq) +static inline bool __is_vma_write_locked(struct vm_area_struct *vma, unsigned int *mm_lock_seq) { mmap_assert_write_locked(vma->vm_mm); @@ -195,7 +195,8 @@ static bool __is_vma_write_locked(struct vm_area_struct *vma, unsigned int *mm_l return (vma->vm_lock_seq == *mm_lock_seq); } -void __vma_start_write(struct vm_area_struct *vma, unsigned int mm_lock_seq); +int __vma_start_write(struct vm_area_struct *vma, unsigned int mm_lock_seq, + int state); /* * Begin writing to a VMA. @@ -209,7 +210,30 @@ static inline void vma_start_write(struct vm_area_struct *vma) if (__is_vma_write_locked(vma, &mm_lock_seq)) return; - __vma_start_write(vma, mm_lock_seq); + __vma_start_write(vma, mm_lock_seq, TASK_UNINTERRUPTIBLE); +} + +/** + * vma_start_write_killable - Begin writing to a VMA. + * @vma: The VMA we are going to modify. + * + * Exclude concurrent readers under the per-VMA lock until the currently + * write-locked mmap_lock is dropped or downgraded. + * + * Context: May sleep while waiting for readers to drop the vma read lock. + * Caller must already hold the mmap_lock for write. + * + * Return: 0 for a successful acquisition. -EINTR if a fatal signal was + * received. + */ +static inline __must_check +int vma_start_write_killable(struct vm_area_struct *vma) +{ + unsigned int mm_lock_seq; + + if (__is_vma_write_locked(vma, &mm_lock_seq)) + return 0; + return __vma_start_write(vma, mm_lock_seq, TASK_KILLABLE); } static inline void vma_assert_write_locked(struct vm_area_struct *vma) @@ -281,11 +305,10 @@ static inline bool mmap_lock_speculate_retry(struct mm_struct *mm, unsigned int return true; } static inline void vma_lock_init(struct vm_area_struct *vma, bool reset_refcnt) {} -static inline struct vm_area_struct *vma_start_read(struct mm_struct *mm, - struct vm_area_struct *vma) - { return NULL; } static inline void vma_end_read(struct vm_area_struct *vma) {} static inline void vma_start_write(struct vm_area_struct *vma) {} +static inline __must_check +int vma_start_write_killable(struct vm_area_struct *vma) { return 0; } static inline void vma_assert_write_locked(struct vm_area_struct *vma) { mmap_assert_write_locked(vma->vm_mm); } static inline void vma_assert_attached(struct vm_area_struct *vma) {} diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 7fb7331c5725..4398e027f450 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -1060,10 +1060,6 @@ struct zone { } ____cacheline_internodealigned_in_smp; enum pgdat_flags { - PGDAT_DIRTY, /* reclaim scanning has recently found - * many dirty file pages at the tail - * of the LRU. - */ PGDAT_WRITEBACK, /* reclaim scanning has recently found * many pages under writeback */ diff --git a/include/linux/node.h b/include/linux/node.h index 866e3323f1fd..0269b064ba65 100644 --- a/include/linux/node.h +++ b/include/linux/node.h @@ -132,8 +132,6 @@ static inline void register_memory_blocks_under_nodes(void) } #endif -extern void unregister_node(struct node *node); - struct node_notify { int nid; }; @@ -176,8 +174,8 @@ static inline int hotplug_node_notifier(notifier_fn_t fn, int pri) #ifdef CONFIG_NUMA extern void node_dev_init(void); /* Core of the node registration - only memory hotplug should use this */ -extern int register_one_node(int nid); -extern void unregister_one_node(int nid); +int register_node(int nid); +void unregister_node(int nid); extern int register_cpu_under_node(unsigned int cpu, unsigned int nid); extern int unregister_cpu_under_node(unsigned int cpu, unsigned int nid); extern void unregister_memory_block_under_nodes(struct memory_block *mem_blk); @@ -189,11 +187,11 @@ extern int register_memory_node_under_compute_node(unsigned int mem_nid, static inline void node_dev_init(void) { } -static inline int register_one_node(int nid) +static inline int register_node(int nid) { return 0; } -static inline int unregister_one_node(int nid) +static inline int unregister_node(int nid) { return 0; } diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index ee3148ef87f6..652f287c1ef6 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -1557,6 +1557,18 @@ static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot) #define arch_start_context_switch(prev) do {} while (0) #endif +/* + * Some platforms can customize the PTE soft-dirty bit making it unavailable + * even if the architecture provides the resource. + * Adding this API allows architectures to add their own checks for the + * devices on which the kernel is running. + * Note: When overriding it, please make sure the CONFIG_MEM_SOFT_DIRTY + * is part of this macro. + */ +#ifndef pgtable_supports_soft_dirty +#define pgtable_supports_soft_dirty() IS_ENABLED(CONFIG_MEM_SOFT_DIRTY) +#endif + #ifdef CONFIG_HAVE_ARCH_SOFT_DIRTY #ifndef CONFIG_ARCH_ENABLE_THP_MIGRATION static inline pmd_t pmd_swp_mksoft_dirty(pmd_t pmd) diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h index 0232d983b715..0e1d73955fa5 100644 --- a/include/linux/sched/mm.h +++ b/include/linux/sched/mm.h @@ -189,12 +189,11 @@ arch_get_unmapped_area_topdown(struct file *filp, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags, vm_flags_t); -unsigned long mm_get_unmapped_area(struct mm_struct *mm, struct file *filp, - unsigned long addr, unsigned long len, - unsigned long pgoff, unsigned long flags); +unsigned long mm_get_unmapped_area(struct file *filp, unsigned long addr, + unsigned long len, unsigned long pgoff, + unsigned long flags); -unsigned long mm_get_unmapped_area_vmflags(struct mm_struct *mm, - struct file *filp, +unsigned long mm_get_unmapped_area_vmflags(struct file *filp, unsigned long addr, unsigned long len, unsigned long pgoff, @@ -318,6 +317,9 @@ static inline void might_alloc(gfp_t gfp_mask) fs_reclaim_acquire(gfp_mask); fs_reclaim_release(gfp_mask); + if (current->flags & PF_MEMALLOC) + return; + might_sleep_if(gfpflags_allow_blocking(gfp_mask)); } diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h index 774efe592a9a..5e4b3c1ae5c2 100644 --- a/include/linux/shmem_fs.h +++ b/include/linux/shmem_fs.h @@ -94,7 +94,8 @@ extern struct file *shmem_kernel_file_setup(const char *name, loff_t size, unsigned long flags); extern struct file *shmem_file_setup_with_mnt(struct vfsmount *mnt, const char *name, loff_t size, unsigned long flags); -extern int shmem_zero_setup(struct vm_area_struct *); +int shmem_zero_setup(struct vm_area_struct *vma); +int shmem_zero_setup_desc(struct vm_area_desc *desc); extern unsigned long shmem_get_unmapped_area(struct file *, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags); extern int shmem_lock(struct file *file, int lock, struct ucounts *ucounts); @@ -135,11 +136,16 @@ static inline bool shmem_hpage_pmd_enabled(void) #ifdef CONFIG_SHMEM extern unsigned long shmem_swap_usage(struct vm_area_struct *vma); +extern void shmem_uncharge(struct inode *inode, long pages); #else static inline unsigned long shmem_swap_usage(struct vm_area_struct *vma) { return 0; } + +static inline void shmem_uncharge(struct inode *inode, long pages) +{ +} #endif extern unsigned long shmem_partial_swap_usage(struct address_space *mapping, pgoff_t start, pgoff_t end); @@ -193,7 +199,6 @@ static inline pgoff_t shmem_fallocend(struct inode *inode, pgoff_t eof) } extern bool shmem_charge(struct inode *inode, long pages); -extern void shmem_uncharge(struct inode *inode, long pages); #ifdef CONFIG_USERFAULTFD #ifdef CONFIG_SHMEM diff --git a/include/linux/swap.h b/include/linux/swap.h index e818fbade1e2..38ca3df68716 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -301,16 +301,7 @@ struct swap_info_struct { struct work_struct discard_work; /* discard worker */ struct work_struct reclaim_work; /* reclaim worker */ struct list_head discard_clusters; /* discard clusters list */ - struct plist_node avail_lists[]; /* - * entries in swap_avail_heads, one - * entry per node. - * Must be last as the number of the - * array is nr_node_ids, which is not - * a fixed value so have to allocate - * dynamically. - * And it has to be an array so that - * plist_for_each_* can work. - */ + struct plist_node avail_list; /* entry in swap_avail_head */ }; static inline swp_entry_t page_swap_entry(struct page *page) @@ -462,7 +453,7 @@ static inline long get_nr_swap_pages(void) } extern void si_swapinfo(struct sysinfo *); -int folio_alloc_swap(struct folio *folio, gfp_t gfp_mask); +int folio_alloc_swap(struct folio *folio); bool folio_free_swap(struct folio *folio); void put_swap_folio(struct folio *folio, swp_entry_t entry); extern swp_entry_t get_swap_page_of_type(int); @@ -560,7 +551,7 @@ static inline int swp_swapcount(swp_entry_t entry) return 0; } -static inline int folio_alloc_swap(struct folio *folio, gfp_t gfp_mask) +static inline int folio_alloc_swap(struct folio *folio) { return -EINVAL; } diff --git a/include/linux/swapops.h b/include/linux/swapops.h index 64ea151a7ae3..8cfc966eae48 100644 --- a/include/linux/swapops.h +++ b/include/linux/swapops.h @@ -28,7 +28,7 @@ #define SWP_OFFSET_MASK ((1UL << SWP_TYPE_SHIFT) - 1) /* - * Definitions only for PFN swap entries (see is_pfn_swap_entry()). To + * Definitions only for PFN swap entries (see leafeant_has_pfn()). To * store PFN, we only need SWP_PFN_BITS bits. Each of the pfn swap entries * can use the extra bits to store other information besides PFN. */ @@ -66,8 +66,6 @@ #define SWP_MIG_YOUNG BIT(SWP_MIG_YOUNG_BIT) #define SWP_MIG_DIRTY BIT(SWP_MIG_DIRTY_BIT) -static inline bool is_pfn_swap_entry(swp_entry_t entry); - /* Clear all flags but only keep swp_entry_t related information */ static inline pte_t pte_swp_clear_flags(pte_t pte) { @@ -110,36 +108,6 @@ static inline pgoff_t swp_offset(swp_entry_t entry) } /* - * This should only be called upon a pfn swap entry to get the PFN stored - * in the swap entry. Please refers to is_pfn_swap_entry() for definition - * of pfn swap entry. - */ -static inline unsigned long swp_offset_pfn(swp_entry_t entry) -{ - VM_BUG_ON(!is_pfn_swap_entry(entry)); - return swp_offset(entry) & SWP_PFN_MASK; -} - -/* check whether a pte points to a swap entry */ -static inline int is_swap_pte(pte_t pte) -{ - return !pte_none(pte) && !pte_present(pte); -} - -/* - * Convert the arch-dependent pte representation of a swp_entry_t into an - * arch-independent swp_entry_t. - */ -static inline swp_entry_t pte_to_swp_entry(pte_t pte) -{ - swp_entry_t arch_entry; - - pte = pte_swp_clear_flags(pte); - arch_entry = __pte_to_swp_entry(pte); - return swp_entry(__swp_type(arch_entry), __swp_offset(arch_entry)); -} - -/* * Convert the arch-independent representation of a swp_entry_t into the * arch-dependent pte representation. */ @@ -175,27 +143,11 @@ static inline swp_entry_t make_writable_device_private_entry(pgoff_t offset) return swp_entry(SWP_DEVICE_WRITE, offset); } -static inline bool is_device_private_entry(swp_entry_t entry) -{ - int type = swp_type(entry); - return type == SWP_DEVICE_READ || type == SWP_DEVICE_WRITE; -} - -static inline bool is_writable_device_private_entry(swp_entry_t entry) -{ - return unlikely(swp_type(entry) == SWP_DEVICE_WRITE); -} - static inline swp_entry_t make_device_exclusive_entry(pgoff_t offset) { return swp_entry(SWP_DEVICE_EXCLUSIVE, offset); } -static inline bool is_device_exclusive_entry(swp_entry_t entry) -{ - return swp_type(entry) == SWP_DEVICE_EXCLUSIVE; -} - #else /* CONFIG_DEVICE_PRIVATE */ static inline swp_entry_t make_readable_device_private_entry(pgoff_t offset) { @@ -207,50 +159,14 @@ static inline swp_entry_t make_writable_device_private_entry(pgoff_t offset) return swp_entry(0, 0); } -static inline bool is_device_private_entry(swp_entry_t entry) -{ - return false; -} - -static inline bool is_writable_device_private_entry(swp_entry_t entry) -{ - return false; -} - static inline swp_entry_t make_device_exclusive_entry(pgoff_t offset) { return swp_entry(0, 0); } -static inline bool is_device_exclusive_entry(swp_entry_t entry) -{ - return false; -} - #endif /* CONFIG_DEVICE_PRIVATE */ #ifdef CONFIG_MIGRATION -static inline int is_migration_entry(swp_entry_t entry) -{ - return unlikely(swp_type(entry) == SWP_MIGRATION_READ || - swp_type(entry) == SWP_MIGRATION_READ_EXCLUSIVE || - swp_type(entry) == SWP_MIGRATION_WRITE); -} - -static inline int is_writable_migration_entry(swp_entry_t entry) -{ - return unlikely(swp_type(entry) == SWP_MIGRATION_WRITE); -} - -static inline int is_readable_migration_entry(swp_entry_t entry) -{ - return unlikely(swp_type(entry) == SWP_MIGRATION_READ); -} - -static inline int is_readable_exclusive_migration_entry(swp_entry_t entry) -{ - return unlikely(swp_type(entry) == SWP_MIGRATION_READ_EXCLUSIVE); -} static inline swp_entry_t make_readable_migration_entry(pgoff_t offset) { @@ -289,14 +205,6 @@ static inline swp_entry_t make_migration_entry_young(swp_entry_t entry) return entry; } -static inline bool is_migration_entry_young(swp_entry_t entry) -{ - if (migration_entry_supports_ad()) - return swp_offset(entry) & SWP_MIG_YOUNG; - /* Keep the old behavior of aging page after migration */ - return false; -} - static inline swp_entry_t make_migration_entry_dirty(swp_entry_t entry) { if (migration_entry_supports_ad()) @@ -305,14 +213,6 @@ static inline swp_entry_t make_migration_entry_dirty(swp_entry_t entry) return entry; } -static inline bool is_migration_entry_dirty(swp_entry_t entry) -{ - if (migration_entry_supports_ad()) - return swp_offset(entry) & SWP_MIG_DIRTY; - /* Keep the old behavior of clean page after migration */ - return false; -} - extern void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd, unsigned long address); extern void migration_entry_wait_huge(struct vm_area_struct *vma, unsigned long addr, pte_t *pte); @@ -332,43 +232,21 @@ static inline swp_entry_t make_writable_migration_entry(pgoff_t offset) return swp_entry(0, 0); } -static inline int is_migration_entry(swp_entry_t swp) -{ - return 0; -} - static inline void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd, unsigned long address) { } static inline void migration_entry_wait_huge(struct vm_area_struct *vma, unsigned long addr, pte_t *pte) { } -static inline int is_writable_migration_entry(swp_entry_t entry) -{ - return 0; -} -static inline int is_readable_migration_entry(swp_entry_t entry) -{ - return 0; -} static inline swp_entry_t make_migration_entry_young(swp_entry_t entry) { return entry; } -static inline bool is_migration_entry_young(swp_entry_t entry) -{ - return false; -} - static inline swp_entry_t make_migration_entry_dirty(swp_entry_t entry) { return entry; } -static inline bool is_migration_entry_dirty(swp_entry_t entry) -{ - return false; -} #endif /* CONFIG_MIGRATION */ #ifdef CONFIG_MEMORY_FAILURE @@ -426,21 +304,6 @@ static inline swp_entry_t make_pte_marker_entry(pte_marker marker) return swp_entry(SWP_PTE_MARKER, marker); } -static inline bool is_pte_marker_entry(swp_entry_t entry) -{ - return swp_type(entry) == SWP_PTE_MARKER; -} - -static inline pte_marker pte_marker_get(swp_entry_t entry) -{ - return swp_offset(entry) & PTE_MARKER_MASK; -} - -static inline bool is_pte_marker(pte_t pte) -{ - return is_swap_pte(pte) && is_pte_marker_entry(pte_to_swp_entry(pte)); -} - static inline pte_t make_pte_marker(pte_marker marker) { return swp_entry_to_pte(make_pte_marker_entry(marker)); @@ -451,83 +314,11 @@ static inline swp_entry_t make_poisoned_swp_entry(void) return make_pte_marker_entry(PTE_MARKER_POISONED); } -static inline int is_poisoned_swp_entry(swp_entry_t entry) -{ - return is_pte_marker_entry(entry) && - (pte_marker_get(entry) & PTE_MARKER_POISONED); - -} - static inline swp_entry_t make_guard_swp_entry(void) { return make_pte_marker_entry(PTE_MARKER_GUARD); } -static inline int is_guard_swp_entry(swp_entry_t entry) -{ - return is_pte_marker_entry(entry) && - (pte_marker_get(entry) & PTE_MARKER_GUARD); -} - -/* - * This is a special version to check pte_none() just to cover the case when - * the pte is a pte marker. It existed because in many cases the pte marker - * should be seen as a none pte; it's just that we have stored some information - * onto the none pte so it becomes not-none any more. - * - * It should be used when the pte is file-backed, ram-based and backing - * userspace pages, like shmem. It is not needed upon pgtables that do not - * support pte markers at all. For example, it's not needed on anonymous - * memory, kernel-only memory (including when the system is during-boot), - * non-ram based generic file-system. It's fine to be used even there, but the - * extra pte marker check will be pure overhead. - */ -static inline int pte_none_mostly(pte_t pte) -{ - return pte_none(pte) || is_pte_marker(pte); -} - -static inline struct page *pfn_swap_entry_to_page(swp_entry_t entry) -{ - struct page *p = pfn_to_page(swp_offset_pfn(entry)); - - /* - * Any use of migration entries may only occur while the - * corresponding page is locked - */ - BUG_ON(is_migration_entry(entry) && !PageLocked(p)); - - return p; -} - -static inline struct folio *pfn_swap_entry_folio(swp_entry_t entry) -{ - struct folio *folio = pfn_folio(swp_offset_pfn(entry)); - - /* - * Any use of migration entries may only occur while the - * corresponding folio is locked - */ - BUG_ON(is_migration_entry(entry) && !folio_test_locked(folio)); - - return folio; -} - -/* - * A pfn swap entry is a special type of swap entry that always has a pfn stored - * in the swap offset. They can either be used to represent unaddressable device - * memory, to restrict access to a page undergoing migration or to represent a - * pfn which has been hwpoisoned and unmapped. - */ -static inline bool is_pfn_swap_entry(swp_entry_t entry) -{ - /* Make sure the swp offset can always store the needed fields */ - BUILD_BUG_ON(SWP_TYPE_SHIFT < SWP_PFN_BITS); - - return is_migration_entry(entry) || is_device_private_entry(entry) || - is_device_exclusive_entry(entry) || is_hwpoison_entry(entry); -} - struct page_vma_mapped_walk; #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION @@ -539,18 +330,6 @@ extern void remove_migration_pmd(struct page_vma_mapped_walk *pvmw, extern void pmd_migration_entry_wait(struct mm_struct *mm, pmd_t *pmd); -static inline swp_entry_t pmd_to_swp_entry(pmd_t pmd) -{ - swp_entry_t arch_entry; - - if (pmd_swp_soft_dirty(pmd)) - pmd = pmd_swp_clear_soft_dirty(pmd); - if (pmd_swp_uffd_wp(pmd)) - pmd = pmd_swp_clear_uffd_wp(pmd); - arch_entry = __pmd_to_swp_entry(pmd); - return swp_entry(__swp_type(arch_entry), __swp_offset(arch_entry)); -} - static inline pmd_t swp_entry_to_pmd(swp_entry_t entry) { swp_entry_t arch_entry; @@ -559,10 +338,6 @@ static inline pmd_t swp_entry_to_pmd(swp_entry_t entry) return __swp_entry_to_pmd(arch_entry); } -static inline int is_pmd_migration_entry(pmd_t pmd) -{ - return is_swap_pmd(pmd) && is_migration_entry(pmd_to_swp_entry(pmd)); -} #else /* CONFIG_ARCH_ENABLE_THP_MIGRATION */ static inline int set_pmd_migration_entry(struct page_vma_mapped_walk *pvmw, struct page *page) @@ -578,26 +353,12 @@ static inline void remove_migration_pmd(struct page_vma_mapped_walk *pvmw, static inline void pmd_migration_entry_wait(struct mm_struct *m, pmd_t *p) { } -static inline swp_entry_t pmd_to_swp_entry(pmd_t pmd) -{ - return swp_entry(0, 0); -} - static inline pmd_t swp_entry_to_pmd(swp_entry_t entry) { return __pmd(0); } -static inline int is_pmd_migration_entry(pmd_t pmd) -{ - return 0; -} #endif /* CONFIG_ARCH_ENABLE_THP_MIGRATION */ -static inline int non_swap_entry(swp_entry_t entry) -{ - return swp_type(entry) >= MAX_SWAPFILES; -} - #endif /* CONFIG_MMU */ #endif /* _LINUX_SWAPOPS_H */ diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h index c0e716aec26a..fd5f42765497 100644 --- a/include/linux/userfaultfd_k.h +++ b/include/linux/userfaultfd_k.h @@ -16,7 +16,7 @@ #include <linux/fcntl.h> #include <linux/mm.h> #include <linux/swap.h> -#include <linux/swapops.h> +#include <linux/leafops.h> #include <asm-generic/pgtable_uffd.h> #include <linux/hugetlb_inline.h> @@ -228,15 +228,14 @@ static inline bool vma_can_userfault(struct vm_area_struct *vma, if (wp_async && (vm_flags == VM_UFFD_WP)) return true; -#ifndef CONFIG_PTE_MARKER_UFFD_WP /* * If user requested uffd-wp but not enabled pte markers for * uffd-wp, then shmem & hugetlbfs are not supported but only * anonymous. */ - if ((vm_flags & VM_UFFD_WP) && !vma_is_anonymous(vma)) + if (!uffd_supports_wp_marker() && (vm_flags & VM_UFFD_WP) && + !vma_is_anonymous(vma)) return false; -#endif /* By default, allow any of anon|shmem|hugetlb */ return vma_is_anonymous(vma) || is_vm_hugetlb_page(vma) || @@ -291,6 +290,43 @@ void userfaultfd_release_new(struct userfaultfd_ctx *ctx); void userfaultfd_release_all(struct mm_struct *mm, struct userfaultfd_ctx *ctx); +static inline bool userfaultfd_wp_use_markers(struct vm_area_struct *vma) +{ + /* Only wr-protect mode uses pte markers */ + if (!userfaultfd_wp(vma)) + return false; + + /* File-based uffd-wp always need markers */ + if (!vma_is_anonymous(vma)) + return true; + + /* + * Anonymous uffd-wp only needs the markers if WP_UNPOPULATED + * enabled (to apply markers on zero pages). + */ + return userfaultfd_wp_unpopulated(vma); +} + +/* + * Returns true if this is a swap pte and was uffd-wp wr-protected in either + * forms (pte marker or a normal swap pte), false otherwise. + */ +static inline bool pte_swp_uffd_wp_any(pte_t pte) +{ + if (!uffd_supports_wp_marker()) + return false; + + if (pte_present(pte)) + return false; + + if (pte_swp_uffd_wp(pte)) + return true; + + if (pte_is_uffd_wp_marker(pte)) + return true; + + return false; +} #else /* CONFIG_USERFAULTFD */ /* mm helpers */ @@ -415,49 +451,9 @@ static inline bool vma_has_uffd_without_event_remap(struct vm_area_struct *vma) return false; } -#endif /* CONFIG_USERFAULTFD */ - static inline bool userfaultfd_wp_use_markers(struct vm_area_struct *vma) { - /* Only wr-protect mode uses pte markers */ - if (!userfaultfd_wp(vma)) - return false; - - /* File-based uffd-wp always need markers */ - if (!vma_is_anonymous(vma)) - return true; - - /* - * Anonymous uffd-wp only needs the markers if WP_UNPOPULATED - * enabled (to apply markers on zero pages). - */ - return userfaultfd_wp_unpopulated(vma); -} - -static inline bool pte_marker_entry_uffd_wp(swp_entry_t entry) -{ -#ifdef CONFIG_PTE_MARKER_UFFD_WP - return is_pte_marker_entry(entry) && - (pte_marker_get(entry) & PTE_MARKER_UFFD_WP); -#else - return false; -#endif -} - -static inline bool pte_marker_uffd_wp(pte_t pte) -{ -#ifdef CONFIG_PTE_MARKER_UFFD_WP - swp_entry_t entry; - - if (!is_swap_pte(pte)) - return false; - - entry = pte_to_swp_entry(pte); - - return pte_marker_entry_uffd_wp(entry); -#else return false; -#endif } /* @@ -466,17 +462,7 @@ static inline bool pte_marker_uffd_wp(pte_t pte) */ static inline bool pte_swp_uffd_wp_any(pte_t pte) { -#ifdef CONFIG_PTE_MARKER_UFFD_WP - if (!is_swap_pte(pte)) - return false; - - if (pte_swp_uffd_wp(pte)) - return true; - - if (pte_marker_uffd_wp(pte)) - return true; -#endif return false; } - +#endif /* CONFIG_USERFAULTFD */ #endif /* _LINUX_USERFAULTFD_K_H */ diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index eb54b7b3202f..e8e94f90d686 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -50,7 +50,11 @@ struct iov_iter; /* in uio.h */ #endif struct vm_struct { - struct vm_struct *next; + union { + struct vm_struct *next; /* Early registration of vm_areas. */ + struct llist_node llnode; /* Asynchronous freeing on error paths. */ + }; + void *addr; unsigned long size; unsigned long flags; @@ -328,4 +332,6 @@ bool vmalloc_dump_obj(void *object); static inline bool vmalloc_dump_obj(void *object) { return false; } #endif +unsigned int memalloc_apply_gfp_scope(gfp_t gfp_mask); +void memalloc_restore_scope(unsigned int flags); #endif /* _LINUX_VMALLOC_H */ diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index c287998908bf..3398a345bda8 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h @@ -520,32 +520,12 @@ static inline const char *vm_event_name(enum vm_event_item item) #ifdef CONFIG_MEMCG -void __mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, +void mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, int val); -static inline void mod_lruvec_state(struct lruvec *lruvec, - enum node_stat_item idx, int val) -{ - unsigned long flags; - - local_irq_save(flags); - __mod_lruvec_state(lruvec, idx, val); - local_irq_restore(flags); -} - -void __lruvec_stat_mod_folio(struct folio *folio, +void lruvec_stat_mod_folio(struct folio *folio, enum node_stat_item idx, int val); -static inline void lruvec_stat_mod_folio(struct folio *folio, - enum node_stat_item idx, int val) -{ - unsigned long flags; - - local_irq_save(flags); - __lruvec_stat_mod_folio(folio, idx, val); - local_irq_restore(flags); -} - static inline void mod_lruvec_page_state(struct page *page, enum node_stat_item idx, int val) { @@ -554,24 +534,12 @@ static inline void mod_lruvec_page_state(struct page *page, #else -static inline void __mod_lruvec_state(struct lruvec *lruvec, - enum node_stat_item idx, int val) -{ - __mod_node_page_state(lruvec_pgdat(lruvec), idx, val); -} - static inline void mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, int val) { mod_node_page_state(lruvec_pgdat(lruvec), idx, val); } -static inline void __lruvec_stat_mod_folio(struct folio *folio, - enum node_stat_item idx, int val) -{ - __mod_node_page_state(folio_pgdat(folio), idx, val); -} - static inline void lruvec_stat_mod_folio(struct folio *folio, enum node_stat_item idx, int val) { @@ -586,18 +554,6 @@ static inline void mod_lruvec_page_state(struct page *page, #endif /* CONFIG_MEMCG */ -static inline void __lruvec_stat_add_folio(struct folio *folio, - enum node_stat_item idx) -{ - __lruvec_stat_mod_folio(folio, idx, folio_nr_pages(folio)); -} - -static inline void __lruvec_stat_sub_folio(struct folio *folio, - enum node_stat_item idx) -{ - __lruvec_stat_mod_folio(folio, idx, -folio_nr_pages(folio)); -} - static inline void lruvec_stat_add_folio(struct folio *folio, enum node_stat_item idx) { |
