diff options
Diffstat (limited to 'include/linux')
44 files changed, 991 insertions, 542 deletions
diff --git a/include/linux/alloc_tag.h b/include/linux/alloc_tag.h index 8c61ccd161ba..1f0a9ff23a2c 100644 --- a/include/linux/alloc_tag.h +++ b/include/linux/alloc_tag.h @@ -70,7 +70,7 @@ static inline struct alloc_tag *ct_to_alloc_tag(struct codetag *ct) /* * When percpu variables are required to be defined as weak, static percpu * variables can't be used inside a function (see comments for DECLARE_PER_CPU_SECTION). - * Instead we will accound all module allocations to a single counter. + * Instead we will account all module allocations to a single counter. */ DECLARE_PER_CPU(struct alloc_tag_counters, _shared_alloc_tag); @@ -137,7 +137,16 @@ static inline void alloc_tag_sub_check(union codetag_ref *ref) {} /* Caller should verify both ref and tag to be valid */ static inline void __alloc_tag_ref_set(union codetag_ref *ref, struct alloc_tag *tag) { + alloc_tag_add_check(ref, tag); + if (!ref || !tag) + return; + ref->ct = &tag->ct; +} + +static inline void alloc_tag_ref_set(union codetag_ref *ref, struct alloc_tag *tag) +{ + __alloc_tag_ref_set(ref, tag); /* * We need in increment the call counter every time we have a new * allocation or when we split a large allocation into smaller ones. @@ -147,22 +156,9 @@ static inline void __alloc_tag_ref_set(union codetag_ref *ref, struct alloc_tag this_cpu_inc(tag->counters->calls); } -static inline void alloc_tag_ref_set(union codetag_ref *ref, struct alloc_tag *tag) -{ - alloc_tag_add_check(ref, tag); - if (!ref || !tag) - return; - - __alloc_tag_ref_set(ref, tag); -} - static inline void alloc_tag_add(union codetag_ref *ref, struct alloc_tag *tag, size_t bytes) { - alloc_tag_add_check(ref, tag); - if (!ref || !tag) - return; - - __alloc_tag_ref_set(ref, tag); + alloc_tag_ref_set(ref, tag); this_cpu_add(tag->counters->bytes, bytes); } diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index eb0f6f349496..47ae4c4d924c 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -172,7 +172,11 @@ struct cgroup_subsys_state { /* reference count - access via css_[try]get() and css_put() */ struct percpu_ref refcnt; - /* siblings list anchored at the parent's ->children */ + /* + * siblings list anchored at the parent's ->children + * + * linkage is protected by cgroup_mutex or RCU + */ struct list_head sibling; struct list_head children; @@ -789,6 +793,11 @@ struct cgroup_subsys { extern struct percpu_rw_semaphore cgroup_threadgroup_rwsem; +struct cgroup_of_peak { + unsigned long value; + struct list_head list; +}; + /** * cgroup_threadgroup_change_begin - threadgroup exclusion for cgroups * @tsk: target task diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index c60ba0ab1462..3e0563753cc3 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -11,6 +11,7 @@ #include <linux/sched.h> #include <linux/nodemask.h> +#include <linux/list.h> #include <linux/rculist.h> #include <linux/cgroupstats.h> #include <linux/fs.h> @@ -854,4 +855,6 @@ static inline void cgroup_bpf_put(struct cgroup *cgrp) {} struct cgroup *task_get_cgroup1(struct task_struct *tsk, int hierarchy_id); +struct cgroup_of_peak *of_peak(struct kernfs_open_file *of); + #endif /* _LINUX_CGROUP_H */ diff --git a/include/linux/cma.h b/include/linux/cma.h index 9db877506ea8..d15b64f51336 100644 --- a/include/linux/cma.h +++ b/include/linux/cma.h @@ -52,4 +52,20 @@ extern bool cma_release(struct cma *cma, const struct page *pages, unsigned long extern int cma_for_each_area(int (*it)(struct cma *cma, void *data), void *data); extern void cma_reserve_pages_on_error(struct cma *cma); + +#ifdef CONFIG_CMA +struct folio *cma_alloc_folio(struct cma *cma, int order, gfp_t gfp); +bool cma_free_folio(struct cma *cma, const struct folio *folio); +#else +static inline struct folio *cma_alloc_folio(struct cma *cma, int order, gfp_t gfp) +{ + return NULL; +} + +static inline bool cma_free_folio(struct cma *cma, const struct folio *folio) +{ + return false; +} +#endif + #endif diff --git a/include/linux/damon.h b/include/linux/damon.h index 27c546bfc6d4..a67f2c4940e9 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -233,7 +233,6 @@ struct damos_quota { unsigned long charge_addr_from; /* For prioritization */ - unsigned long histogram[DAMOS_MAX_SCORE + 1]; unsigned int min_score; /* For feedback loop */ @@ -630,6 +629,8 @@ struct damon_ctx { unsigned long next_ops_update_sis; /* for waiting until the execution of the kdamond_fn is started */ struct completion kdamond_started; + /* for scheme quotas prioritization */ + unsigned long *regions_score_histogram; /* public: */ struct task_struct *kdamond; diff --git a/include/linux/err.h b/include/linux/err.h index b5d9bb2a2349..a4dacd745fcf 100644 --- a/include/linux/err.h +++ b/include/linux/err.h @@ -41,6 +41,9 @@ static inline void * __must_check ERR_PTR(long error) return (void *) error; } +/* Return the pointer in the percpu address space. */ +#define ERR_PTR_PCPU(error) ((void __percpu *)(unsigned long)ERR_PTR(error)) + /** * PTR_ERR - Extract the error code from an error pointer. * @ptr: An error pointer. @@ -51,6 +54,9 @@ static inline long __must_check PTR_ERR(__force const void *ptr) return (long) ptr; } +/* Read an error pointer from the percpu address space. */ +#define PTR_ERR_PCPU(ptr) (PTR_ERR((const void *)(__force const unsigned long)(ptr))) + /** * IS_ERR - Detect an error pointer. * @ptr: The pointer to check. @@ -61,6 +67,9 @@ static inline bool __must_check IS_ERR(__force const void *ptr) return IS_ERR_VALUE((unsigned long)ptr); } +/* Read an error pointer from the percpu address space. */ +#define IS_ERR_PCPU(ptr) (IS_ERR((const void *)(__force const unsigned long)(ptr))) + /** * IS_ERR_OR_NULL - Detect an error pointer or a null pointer. * @ptr: The pointer to check. diff --git a/include/linux/fs.h b/include/linux/fs.h index 0df3e5f0dd2b..6b8df574729c 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3513,7 +3513,6 @@ static inline int kiocb_set_rw_flags(struct kiocb *ki, rwf_t flags, if (flags & RWF_NOWAIT) { if (!(ki->ki_filp->f_mode & FMODE_NOWAIT)) return -EOPNOTSUPP; - kiocb_flags |= IOCB_NOIO; } if (flags & RWF_ATOMIC) { if (rw_type != WRITE) diff --git a/include/linux/gfp.h b/include/linux/gfp.h index f53f76e0b17e..a951de920e20 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -319,7 +319,7 @@ static inline struct page *alloc_pages_mpol_noprof(gfp_t gfp, unsigned int order } static inline struct folio *folio_alloc_noprof(gfp_t gfp, unsigned int order) { - return __folio_alloc_node(gfp, order, numa_node_id()); + return __folio_alloc_node_noprof(gfp, order, numa_node_id()); } static inline struct folio *folio_alloc_mpol_noprof(gfp_t gfp, unsigned int order, struct mempolicy *mpol, pgoff_t ilx, int nid) @@ -446,4 +446,27 @@ extern struct page *alloc_contig_pages_noprof(unsigned long nr_pages, gfp_t gfp_ #endif void free_contig_range(unsigned long pfn, unsigned long nr_pages); +#ifdef CONFIG_CONTIG_ALLOC +static inline struct folio *folio_alloc_gigantic_noprof(int order, gfp_t gfp, + int nid, nodemask_t *node) +{ + struct page *page; + + if (WARN_ON(!order || !(gfp & __GFP_COMP))) + return NULL; + + page = alloc_contig_pages_noprof(1 << order, gfp, nid, node); + + return page ? page_folio(page) : NULL; +} +#else +static inline struct folio *folio_alloc_gigantic_noprof(int order, gfp_t gfp, + int nid, nodemask_t *node) +{ + return NULL; +} +#endif +/* This should be paired with folio_put() rather than free_contig_range(). */ +#define folio_alloc_gigantic(...) alloc_hooks(folio_alloc_gigantic_noprof(__VA_ARGS__)) + #endif /* __LINUX_GFP_H */ diff --git a/include/linux/gfp_types.h b/include/linux/gfp_types.h index 313be4ad79fd..65db9349f905 100644 --- a/include/linux/gfp_types.h +++ b/include/linux/gfp_types.h @@ -215,7 +215,8 @@ enum { * the caller still has to check for failures) while costly requests try to be * not disruptive and back off even without invoking the OOM killer. * The following three modifiers might be used to override some of these - * implicit rules. + * implicit rules. Please note that all of them must be used along with + * %__GFP_DIRECT_RECLAIM flag. * * %__GFP_NORETRY: The VM implementation will try only very lightweight * memory direct reclaim to get some memory under memory pressure (thus @@ -246,11 +247,14 @@ enum { * cannot handle allocation failures. The allocation could block * indefinitely but will never return with failure. Testing for * failure is pointless. + * It _must_ be blockable and used together with __GFP_DIRECT_RECLAIM. + * It should _never_ be used in non-sleepable contexts. * New users should be evaluated carefully (and the flag should be * used only when there is no reasonable failure policy) but it is * definitely preferable to use the flag rather than opencode endless * loop around allocator. - * Using this flag for costly allocations is _highly_ discouraged. + * Allocating pages from the buddy with __GFP_NOFAIL and order > 1 is + * not supported. Please consider using kvmalloc() instead. */ #define __GFP_IO ((__force gfp_t)___GFP_IO) #define __GFP_FS ((__force gfp_t)___GFP_FS) diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 7a570e0437c9..67d0ab3c3bba 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -76,9 +76,9 @@ extern struct kobj_attribute thpsize_shmem_enabled_attr; /* * Mask of all large folio orders supported for file THP. Folios in a DAX * file is never split and the MAX_PAGECACHE_ORDER limit does not apply to - * it. + * it. Same to PFNMAPs where there's neither page* nor pagecache. */ -#define THP_ORDERS_ALL_FILE_DAX \ +#define THP_ORDERS_ALL_SPECIAL \ (BIT(PMD_ORDER) | BIT(PUD_ORDER)) #define THP_ORDERS_ALL_FILE_DEFAULT \ ((BIT(MAX_PAGECACHE_ORDER + 1) - 1) & ~BIT(0)) @@ -87,7 +87,7 @@ extern struct kobj_attribute thpsize_shmem_enabled_attr; * Mask of all large folio orders supported for THP. */ #define THP_ORDERS_ALL \ - (THP_ORDERS_ALL_ANON | THP_ORDERS_ALL_FILE_DAX | THP_ORDERS_ALL_FILE_DEFAULT) + (THP_ORDERS_ALL_ANON | THP_ORDERS_ALL_SPECIAL | THP_ORDERS_ALL_FILE_DEFAULT) #define TVA_SMAPS (1 << 0) /* Will be used for procfs */ #define TVA_IN_PF (1 << 1) /* Page fault handler */ @@ -116,6 +116,53 @@ extern struct kobj_attribute thpsize_shmem_enabled_attr; #define HPAGE_PUD_MASK (~(HPAGE_PUD_SIZE - 1)) #define HPAGE_PUD_SIZE ((1UL) << HPAGE_PUD_SHIFT) +enum mthp_stat_item { + MTHP_STAT_ANON_FAULT_ALLOC, + MTHP_STAT_ANON_FAULT_FALLBACK, + MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE, + MTHP_STAT_SWPOUT, + MTHP_STAT_SWPOUT_FALLBACK, + MTHP_STAT_SHMEM_ALLOC, + MTHP_STAT_SHMEM_FALLBACK, + MTHP_STAT_SHMEM_FALLBACK_CHARGE, + MTHP_STAT_SPLIT, + MTHP_STAT_SPLIT_FAILED, + MTHP_STAT_SPLIT_DEFERRED, + MTHP_STAT_NR_ANON, + MTHP_STAT_NR_ANON_PARTIALLY_MAPPED, + __MTHP_STAT_COUNT +}; + +#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && defined(CONFIG_SYSFS) +struct mthp_stat { + unsigned long stats[ilog2(MAX_PTRS_PER_PTE) + 1][__MTHP_STAT_COUNT]; +}; + +DECLARE_PER_CPU(struct mthp_stat, mthp_stats); + +static inline void mod_mthp_stat(int order, enum mthp_stat_item item, int delta) +{ + if (order <= 0 || order > PMD_ORDER) + return; + + this_cpu_add(mthp_stats.stats[order][item], delta); +} + +static inline void count_mthp_stat(int order, enum mthp_stat_item item) +{ + mod_mthp_stat(order, item, 1); +} + +#else +static inline void mod_mthp_stat(int order, enum mthp_stat_item item, int delta) +{ +} + +static inline void count_mthp_stat(int order, enum mthp_stat_item item) +{ +} +#endif + #ifdef CONFIG_TRANSPARENT_HUGEPAGE extern unsigned long transparent_hugepage_flags; @@ -271,41 +318,6 @@ struct thpsize { #define to_thpsize(kobj) container_of(kobj, struct thpsize, kobj) -enum mthp_stat_item { - MTHP_STAT_ANON_FAULT_ALLOC, - MTHP_STAT_ANON_FAULT_FALLBACK, - MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE, - MTHP_STAT_SWPOUT, - MTHP_STAT_SWPOUT_FALLBACK, - MTHP_STAT_SHMEM_ALLOC, - MTHP_STAT_SHMEM_FALLBACK, - MTHP_STAT_SHMEM_FALLBACK_CHARGE, - MTHP_STAT_SPLIT, - MTHP_STAT_SPLIT_FAILED, - MTHP_STAT_SPLIT_DEFERRED, - __MTHP_STAT_COUNT -}; - -struct mthp_stat { - unsigned long stats[ilog2(MAX_PTRS_PER_PTE) + 1][__MTHP_STAT_COUNT]; -}; - -#ifdef CONFIG_SYSFS -DECLARE_PER_CPU(struct mthp_stat, mthp_stats); - -static inline void count_mthp_stat(int order, enum mthp_stat_item item) -{ - if (order <= 0 || order > PMD_ORDER) - return; - - this_cpu_inc(mthp_stats.stats[order][item]); -} -#else -static inline void count_mthp_stat(int order, enum mthp_stat_item item) -{ -} -#endif - #define transparent_hugepage_use_zero_page() \ (transparent_hugepage_flags & \ (1<<TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG)) @@ -316,7 +328,7 @@ unsigned long thp_get_unmapped_area_vmflags(struct file *filp, unsigned long add unsigned long len, unsigned long pgoff, unsigned long flags, vm_flags_t vm_flags); -bool can_split_folio(struct folio *folio, int *pextra_pins); +bool can_split_folio(struct folio *folio, int caller_pins, int *pextra_pins); int split_huge_page_to_list_to_order(struct page *page, struct list_head *list, unsigned int new_order); int min_order_for_split(struct folio *folio); @@ -338,7 +350,7 @@ static inline int split_huge_page(struct page *page) */ return split_huge_page_to_list_to_order(page, NULL, ret); } -void deferred_split_folio(struct folio *folio); +void deferred_split_folio(struct folio *folio, bool partially_mapped); void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, unsigned long address, bool freeze, struct folio *folio); @@ -359,6 +371,17 @@ void split_huge_pmd_address(struct vm_area_struct *vma, unsigned long address, void __split_huge_pud(struct vm_area_struct *vma, pud_t *pud, unsigned long address); +#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD +int change_huge_pud(struct mmu_gather *tlb, struct vm_area_struct *vma, + pud_t *pudp, unsigned long addr, pgprot_t newprot, + unsigned long cp_flags); +#else +static inline int +change_huge_pud(struct mmu_gather *tlb, struct vm_area_struct *vma, + pud_t *pudp, unsigned long addr, pgprot_t newprot, + unsigned long cp_flags) { return 0; } +#endif + #define split_huge_pud(__vma, __pud, __address) \ do { \ pud_t *____pud = (__pud); \ @@ -427,11 +450,6 @@ static inline bool is_huge_zero_pmd(pmd_t pmd) return pmd_present(pmd) && READ_ONCE(huge_zero_pfn) == pmd_pfn(pmd); } -static inline bool is_huge_zero_pud(pud_t pud) -{ - return false; -} - struct folio *mm_get_huge_zero_folio(struct mm_struct *mm); void mm_put_huge_zero_folio(struct mm_struct *mm); @@ -487,7 +505,7 @@ thp_get_unmapped_area_vmflags(struct file *filp, unsigned long addr, } static inline bool -can_split_folio(struct folio *folio, int *pextra_pins) +can_split_folio(struct folio *folio, int caller_pins, int *pextra_pins) { return false; } @@ -507,7 +525,7 @@ static inline int split_folio_to_list(struct folio *folio, struct list_head *lis return 0; } -static inline void deferred_split_folio(struct folio *folio) {} +static inline void deferred_split_folio(struct folio *folio, bool partially_mapped) {} #define split_huge_pmd(__vma, __pmd, __address) \ do { } while (0) @@ -578,11 +596,6 @@ static inline bool is_huge_zero_pmd(pmd_t pmd) return false; } -static inline bool is_huge_zero_pud(pud_t pud) -{ - return false; -} - static inline void mm_put_huge_zero_folio(struct mm_struct *mm) { return; @@ -608,6 +621,19 @@ static inline int next_order(unsigned long *orders, int prev) { return 0; } + +static inline void __split_huge_pud(struct vm_area_struct *vma, pud_t *pud, + unsigned long address) +{ +} + +static inline int change_huge_pud(struct mmu_gather *tlb, + struct vm_area_struct *vma, pud_t *pudp, + unsigned long addr, pgprot_t newprot, + unsigned long cp_flags) +{ + return 0; +} #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ static inline int split_folio_to_list_to_order(struct folio *folio, diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 45bf05ad5c53..98c47c394b89 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -127,9 +127,6 @@ int move_hugetlb_page_tables(struct vm_area_struct *vma, unsigned long len); int copy_hugetlb_page_range(struct mm_struct *, struct mm_struct *, struct vm_area_struct *, struct vm_area_struct *); -struct page *hugetlb_follow_page_mask(struct vm_area_struct *vma, - unsigned long address, unsigned int flags, - unsigned int *page_mask); void unmap_hugepage_range(struct vm_area_struct *, unsigned long, unsigned long, struct page *, zap_flags_t); @@ -899,10 +896,11 @@ static inline bool hugepage_movable_supported(struct hstate *h) /* Movability of hugepages depends on migration support. */ static inline gfp_t htlb_alloc_mask(struct hstate *h) { - if (hugepage_movable_supported(h)) - return GFP_HIGHUSER_MOVABLE; - else - return GFP_HIGHUSER; + gfp_t gfp = __GFP_COMP | __GFP_NOWARN; + + gfp |= hugepage_movable_supported(h) ? GFP_HIGHUSER_MOVABLE : GFP_HIGHUSER; + + return gfp; } static inline gfp_t htlb_modify_alloc_mask(struct hstate *h, gfp_t gfp_mask) @@ -1251,7 +1249,7 @@ static inline __init void hugetlb_cma_reserve(int order) } #endif -#ifdef CONFIG_ARCH_WANT_HUGE_PMD_SHARE +#ifdef CONFIG_HUGETLB_PMD_PAGE_TABLE_SHARING static inline bool hugetlb_pmd_shared(pte_t *pte) { return page_count(virt_to_page(pte)) > 1; @@ -1287,8 +1285,7 @@ bool __vma_private_lock(struct vm_area_struct *vma); static inline pte_t * hugetlb_walk(struct vm_area_struct *vma, unsigned long addr, unsigned long sz) { -#if defined(CONFIG_HUGETLB_PAGE) && \ - defined(CONFIG_ARCH_WANT_HUGE_PMD_SHARE) && defined(CONFIG_LOCKDEP) +#if defined(CONFIG_HUGETLB_PMD_PAGE_TABLE_SHARING) && defined(CONFIG_LOCKDEP) struct hugetlb_vma_lock *vma_lock = vma->vm_private_data; /* diff --git a/include/linux/kernel-page-flags.h b/include/linux/kernel-page-flags.h index 859f4b0c1b2b..196778a087c4 100644 --- a/include/linux/kernel-page-flags.h +++ b/include/linux/kernel-page-flags.h @@ -10,12 +10,11 @@ */ #define KPF_RESERVED 32 #define KPF_MLOCKED 33 -#define KPF_MAPPEDTODISK 34 +#define KPF_OWNER_2 34 #define KPF_PRIVATE 35 #define KPF_PRIVATE_2 36 #define KPF_OWNER_PRIVATE 37 #define KPF_ARCH 38 -#define KPF_UNCACHED 39 #define KPF_SOFTDIRTY 40 #define KPF_ARCH_2 41 #define KPF_ARCH_3 42 diff --git a/include/linux/kfence.h b/include/linux/kfence.h index 88100cc9caba..0ad1ddbb8b99 100644 --- a/include/linux/kfence.h +++ b/include/linux/kfence.h @@ -124,7 +124,7 @@ static __always_inline void *kfence_alloc(struct kmem_cache *s, size_t size, gfp if (!static_branch_likely(&kfence_allocation_key)) return NULL; #endif - if (likely(atomic_read(&kfence_allocation_gate))) + if (likely(atomic_read(&kfence_allocation_gate) > 0)) return NULL; return __kfence_alloc(s, size, flags); } diff --git a/include/linux/khugepaged.h b/include/linux/khugepaged.h index f68865e19b0b..30baae91b225 100644 --- a/include/linux/khugepaged.h +++ b/include/linux/khugepaged.h @@ -4,6 +4,7 @@ #include <linux/sched/coredump.h> /* MMF_VM_HUGEPAGE */ +extern unsigned int khugepaged_max_ptes_none __read_mostly; #ifdef CONFIG_TRANSPARENT_HUGEPAGE extern struct attribute_group khugepaged_attr_group; diff --git a/include/linux/maple_tree.h b/include/linux/maple_tree.h index a53ad4dabd7e..c2c11004085e 100644 --- a/include/linux/maple_tree.h +++ b/include/linux/maple_tree.h @@ -52,9 +52,9 @@ * bit in the node type. This is possible by using bit 1 to indicate if bit 2 * is part of the type or the slot. * - * Once the type is decided, the decision of an allocation range type or a range - * type is done by examining the immutable tree flag for the MAPLE_ALLOC_RANGE - * flag. + * Once the type is decided, the decision of an allocation range type or a + * range type is done by examining the immutable tree flag for the + * MT_FLAGS_ALLOC_RANGE flag. * * Node types: * 0x??1 = Root @@ -148,6 +148,18 @@ enum maple_type { maple_arange_64, }; +enum store_type { + wr_invalid, + wr_new_root, + wr_store_root, + wr_exact_fit, + wr_spanning_store, + wr_split_store, + wr_rebalance, + wr_append, + wr_node_store, + wr_slot_store, +}; /** * DOC: Maple tree flags @@ -436,6 +448,7 @@ struct ma_state { unsigned char offset; unsigned char mas_flags; unsigned char end; /* The end of the node */ + enum store_type store_type; /* The type of store needed for this operation */ }; struct ma_wr_state { @@ -477,6 +490,7 @@ struct ma_wr_state { .max = ULONG_MAX, \ .alloc = NULL, \ .mas_flags = 0, \ + .store_type = wr_invalid, \ } #define MA_WR_STATE(name, ma_state, wr_entry) \ diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 0e5bf25d324f..34d2da05f2f1 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -57,7 +57,7 @@ enum memcg_memory_event { struct mem_cgroup_reclaim_cookie { pg_data_t *pgdat; - unsigned int generation; + int generation; }; #ifdef CONFIG_MEMCG @@ -70,6 +70,7 @@ struct mem_cgroup_id { }; struct memcg_vmstats_percpu; +struct memcg1_events_percpu; struct memcg_vmstats; struct lruvec_stats_percpu; struct lruvec_stats; @@ -77,7 +78,7 @@ struct lruvec_stats; struct mem_cgroup_reclaim_iter { struct mem_cgroup *position; /* scan generation, increased every round-trip */ - unsigned int generation; + atomic_t generation; }; /* @@ -193,6 +194,11 @@ struct mem_cgroup { struct page_counter memsw; /* v1 only */ }; + /* registered local peak watchers */ + struct list_head memory_peaks; + struct list_head swap_peaks; + spinlock_t peaks_lock; + /* Range enforcement for interrupt charges */ struct work_struct high_work; @@ -270,6 +276,8 @@ struct mem_cgroup { struct page_counter kmem; /* v1 only */ struct page_counter tcpmem; /* v1 only */ + struct memcg1_events_percpu __percpu *events_percpu; + unsigned long soft_limit; /* protected by memcg_oom_lock */ @@ -361,11 +369,11 @@ static inline bool folio_memcg_kmem(struct folio *folio); * After the initialization objcg->memcg is always pointing at * a valid memcg, but can be atomically swapped to the parent memcg. * - * The caller must ensure that the returned memcg won't be released: - * e.g. acquire the rcu_read_lock or css_set_lock. + * The caller must ensure that the returned memcg won't be released. */ static inline struct mem_cgroup *obj_cgroup_memcg(struct obj_cgroup *objcg) { + lockdep_assert_once(rcu_read_lock_held() || lockdep_is_held(&cgroup_mutex)); return READ_ONCE(objcg->memcg); } @@ -439,6 +447,19 @@ static inline struct mem_cgroup *folio_memcg(struct folio *folio) return __folio_memcg(folio); } +/* + * folio_memcg_charged - If a folio is charged to a memory cgroup. + * @folio: Pointer to the folio. + * + * Returns true if folio is charged to a memory cgroup, otherwise returns false. + */ +static inline bool folio_memcg_charged(struct folio *folio) +{ + if (folio_memcg_kmem(folio)) + return __folio_objcg(folio) != NULL; + return __folio_memcg(folio) != NULL; +} + /** * folio_memcg_rcu - Locklessly get the memory cgroup associated with a folio. * @folio: Pointer to the folio. @@ -455,7 +476,6 @@ static inline struct mem_cgroup *folio_memcg_rcu(struct folio *folio) unsigned long memcg_data = READ_ONCE(folio->memcg_data); VM_BUG_ON_FOLIO(folio_test_slab(folio), folio); - WARN_ON_ONCE(!rcu_read_lock_held()); if (memcg_data & MEMCG_DATA_KMEM) { struct obj_cgroup *objcg; @@ -464,6 +484,8 @@ static inline struct mem_cgroup *folio_memcg_rcu(struct folio *folio) return obj_cgroup_memcg(objcg); } + WARN_ON_ONCE(!rcu_read_lock_held()); + return (struct mem_cgroup *)(memcg_data & ~OBJEXTS_FLAGS_MASK); } @@ -677,7 +699,8 @@ int mem_cgroup_hugetlb_try_charge(struct mem_cgroup *memcg, gfp_t gfp, int mem_cgroup_swapin_charge_folio(struct folio *folio, struct mm_struct *mm, gfp_t gfp, swp_entry_t entry); -void mem_cgroup_swapin_uncharge_swap(swp_entry_t entry); + +void mem_cgroup_swapin_uncharge_swap(swp_entry_t entry, unsigned int nr_pages); void __mem_cgroup_uncharge(struct folio *folio); @@ -762,6 +785,8 @@ struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm); struct mem_cgroup *get_mem_cgroup_from_current(void); +struct mem_cgroup *get_mem_cgroup_from_folio(struct folio *folio); + struct lruvec *folio_lruvec_lock(struct folio *folio); struct lruvec *folio_lruvec_lock_irq(struct folio *folio); struct lruvec *folio_lruvec_lock_irqsave(struct folio *folio, @@ -1006,8 +1031,8 @@ static inline void count_memcg_folio_events(struct folio *folio, count_memcg_events(memcg, idx, nr); } -static inline void count_memcg_event_mm(struct mm_struct *mm, - enum vm_event_item idx) +static inline void count_memcg_events_mm(struct mm_struct *mm, + enum vm_event_item idx, unsigned long count) { struct mem_cgroup *memcg; @@ -1017,10 +1042,16 @@ static inline void count_memcg_event_mm(struct mm_struct *mm, rcu_read_lock(); memcg = mem_cgroup_from_task(rcu_dereference(mm->owner)); if (likely(memcg)) - count_memcg_events(memcg, idx, 1); + count_memcg_events(memcg, idx, count); rcu_read_unlock(); } +static inline void count_memcg_event_mm(struct mm_struct *mm, + enum vm_event_item idx) +{ + count_memcg_events_mm(mm, idx, 1); +} + static inline void memcg_memory_event(struct mem_cgroup *memcg, enum memcg_memory_event event) { @@ -1176,7 +1207,7 @@ static inline int mem_cgroup_swapin_charge_folio(struct folio *folio, return 0; } -static inline void mem_cgroup_swapin_uncharge_swap(swp_entry_t entry) +static inline void mem_cgroup_swapin_uncharge_swap(swp_entry_t entry, unsigned int nr) { } @@ -1240,6 +1271,11 @@ static inline struct mem_cgroup *get_mem_cgroup_from_current(void) return NULL; } +static inline struct mem_cgroup *get_mem_cgroup_from_folio(struct folio *folio) +{ + return NULL; +} + static inline struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *css) { @@ -1462,6 +1498,11 @@ static inline void count_memcg_folio_events(struct folio *folio, { } +static inline void count_memcg_events_mm(struct mm_struct *mm, + enum vm_event_item idx, unsigned long count) +{ +} + static inline void count_memcg_event_mm(struct mm_struct *mm, enum vm_event_item idx) { @@ -1717,7 +1758,6 @@ static inline int memcg_kmem_id(struct mem_cgroup *memcg) return memcg ? memcg->kmemcg_id : -1; } -struct mem_cgroup *mem_cgroup_from_obj(void *p); struct mem_cgroup *mem_cgroup_from_slab_obj(void *p); static inline void count_objcg_event(struct obj_cgroup *objcg, @@ -1780,11 +1820,6 @@ static inline int memcg_kmem_id(struct mem_cgroup *memcg) return -1; } -static inline struct mem_cgroup *mem_cgroup_from_obj(void *p) -{ - return NULL; -} - static inline struct mem_cgroup *mem_cgroup_from_slab_obj(void *p) { return NULL; diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index ebe876930e78..b27ddce5d324 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -16,54 +16,6 @@ struct resource; struct vmem_altmap; struct dev_pagemap; -#ifdef CONFIG_HAVE_ARCH_NODEDATA_EXTENSION -/* - * For supporting node-hotadd, we have to allocate a new pgdat. - * - * If an arch has generic style NODE_DATA(), - * node_data[nid] = kzalloc() works well. But it depends on the architecture. - * - * In general, generic_alloc_nodedata() is used. - * - */ -extern pg_data_t *arch_alloc_nodedata(int nid); -extern void arch_refresh_nodedata(int nid, pg_data_t *pgdat); - -#else /* CONFIG_HAVE_ARCH_NODEDATA_EXTENSION */ - -#define arch_alloc_nodedata(nid) generic_alloc_nodedata(nid) - -#ifdef CONFIG_NUMA -/* - * XXX: node aware allocation can't work well to get new node's memory at this time. - * Because, pgdat for the new node is not allocated/initialized yet itself. - * To use new node's memory, more consideration will be necessary. - */ -#define generic_alloc_nodedata(nid) \ -({ \ - memblock_alloc(sizeof(*pgdat), SMP_CACHE_BYTES); \ -}) - -extern pg_data_t *node_data[]; -static inline void arch_refresh_nodedata(int nid, pg_data_t *pgdat) -{ - node_data[nid] = pgdat; -} - -#else /* !CONFIG_NUMA */ - -/* never called */ -static inline pg_data_t *generic_alloc_nodedata(int nid) -{ - BUG(); - return NULL; -} -static inline void arch_refresh_nodedata(int nid, pg_data_t *pgdat) -{ -} -#endif /* CONFIG_NUMA */ -#endif /* CONFIG_HAVE_ARCH_NODEDATA_EXTENSION */ - #ifdef CONFIG_MEMORY_HOTPLUG struct page *pfn_to_online_page(unsigned long pfn); diff --git a/include/linux/migrate.h b/include/linux/migrate.h index 644be30b69c8..002e49b2ebd9 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -70,6 +70,7 @@ int migrate_pages(struct list_head *l, new_folio_t new, free_folio_t free, unsigned int *ret_succeeded); struct folio *alloc_migration_target(struct folio *src, unsigned long private); bool isolate_movable_page(struct page *page, isolate_mode_t mode); +bool isolate_folio_to_list(struct folio *folio, struct list_head *list); int migrate_huge_page_move_mapping(struct address_space *mapping, struct folio *dst, struct folio *src); @@ -91,6 +92,8 @@ static inline struct folio *alloc_migration_target(struct folio *src, { return NULL; } static inline bool isolate_movable_page(struct page *page, isolate_mode_t mode) { return false; } +static inline bool isolate_folio_to_list(struct folio *folio, struct list_head *list) + { return false; } static inline int migrate_huge_page_move_mapping(struct address_space *mapping, struct folio *dst, struct folio *src) diff --git a/include/linux/mm.h b/include/linux/mm.h index 13bff7cf03b7..b62437447077 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -98,7 +98,11 @@ extern int mmap_rnd_compat_bits __read_mostly; #endif #ifndef PHYSMEM_END +# ifdef MAX_PHYSMEM_BITS # define PHYSMEM_END ((1ULL << MAX_PHYSMEM_BITS) - 1) +# else +# define PHYSMEM_END (-1ULL) +# endif #endif #include <asm/page.h> @@ -1015,27 +1019,6 @@ static inline struct vm_area_struct *vma_prev(struct vma_iterator *vmi) return mas_prev(&vmi->mas, 0); } -static inline -struct vm_area_struct *vma_iter_prev_range(struct vma_iterator *vmi) -{ - return mas_prev_range(&vmi->mas, 0); -} - -static inline unsigned long vma_iter_addr(struct vma_iterator *vmi) -{ - return vmi->mas.index; -} - -static inline unsigned long vma_iter_end(struct vma_iterator *vmi) -{ - return vmi->mas.last + 1; -} -static inline int vma_iter_bulk_alloc(struct vma_iterator *vmi, - unsigned long count) -{ - return mas_expected_entries(&vmi->mas, count); -} - static inline int vma_iter_clear_gfp(struct vma_iterator *vmi, unsigned long start, unsigned long end, gfp_t gfp) { @@ -1259,8 +1242,7 @@ static inline int folio_mapcount(const struct folio *folio) if (likely(!folio_test_large(folio))) { mapcount = atomic_read(&folio->_mapcount) + 1; - /* Handle page_has_type() pages */ - if (mapcount < PAGE_MAPCOUNT_RESERVE + 1) + if (page_mapcount_is_type(mapcount)) mapcount = 0; return mapcount; } @@ -1756,6 +1738,8 @@ static inline void vma_set_access_pid_bit(struct vm_area_struct *vma) __set_bit(pid_bit, &vma->numab_state->pids_active[1]); } } + +bool folio_use_access_time(struct folio *folio); #else /* !CONFIG_NUMA_BALANCING */ static inline int folio_xchg_last_cpupid(struct folio *folio, int cpupid) { @@ -1809,6 +1793,10 @@ static inline bool cpupid_match_pid(struct task_struct *task, int cpupid) static inline void vma_set_access_pid_bit(struct vm_area_struct *vma) { } +static inline bool folio_use_access_time(struct folio *folio) +{ + return false; +} #endif /* CONFIG_NUMA_BALANCING */ #if defined(CONFIG_KASAN_SW_TAGS) || defined(CONFIG_KASAN_HW_TAGS) @@ -2158,14 +2146,19 @@ static inline size_t folio_size(const struct folio *folio) * MM ("mapped shared"), or if the folio is only mapped into a single MM * ("mapped exclusively"). * + * For KSM folios, this function also returns "mapped shared" when a folio is + * mapped multiple times into the same MM, because the individual page mappings + * are independent. + * * As precise information is not easily available for all folios, this function * estimates the number of MMs ("sharers") that are currently mapping a folio * using the number of times the first page of the folio is currently mapped * into page tables. * - * For small anonymous folios (except KSM folios) and anonymous hugetlb folios, - * the return value will be exactly correct, because they can only be mapped - * at most once into an MM, and they cannot be partially mapped. + * For small anonymous folios and anonymous hugetlb folios, the return + * value will be exactly correct: non-KSM folios can only be mapped at most once + * into an MM, and they cannot be partially mapped. KSM folios are + * considered shared even if mapped multiple times into the same MM. * * For other folios, the result can be fuzzy: * #. For partially-mappable large folios (THP), the return value can wrongly @@ -2174,9 +2167,6 @@ static inline size_t folio_size(const struct folio *folio) * #. For pagecache folios (including hugetlb), the return value can wrongly * indicate "mapped shared" (false positive) when two VMAs in the same MM * cover the same file range. - * #. For (small) KSM folios, the return value can wrongly indicate "mapped - * shared" (false positive), when the folio is mapped multiple times into - * the same MM. * * Further, this function only considers current page table mappings that * are tracked using the folio mapcount(s). @@ -2210,26 +2200,10 @@ static inline bool folio_likely_mapped_shared(struct folio *folio) return atomic_read(&folio->_mapcount) > 0; } -#ifndef HAVE_ARCH_MAKE_PAGE_ACCESSIBLE -static inline int arch_make_page_accessible(struct page *page) -{ - return 0; -} -#endif - #ifndef HAVE_ARCH_MAKE_FOLIO_ACCESSIBLE static inline int arch_make_folio_accessible(struct folio *folio) { - int ret; - long i, nr = folio_nr_pages(folio); - - for (i = 0; i < nr; i++) { - ret = arch_make_page_accessible(folio_page(folio, i)); - if (ret) - break; - } - - return ret; + return 0; } #endif @@ -2409,11 +2383,40 @@ void free_pgd_range(struct mmu_gather *tlb, unsigned long addr, unsigned long end, unsigned long floor, unsigned long ceiling); int copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma); -int follow_pte(struct vm_area_struct *vma, unsigned long address, - pte_t **ptepp, spinlock_t **ptlp); int generic_access_phys(struct vm_area_struct *vma, unsigned long addr, void *buf, int len, int write); +struct follow_pfnmap_args { + /** + * Inputs: + * @vma: Pointer to @vm_area_struct struct + * @address: the virtual address to walk + */ + struct vm_area_struct *vma; + unsigned long address; + /** + * Internals: + * + * The caller shouldn't touch any of these. + */ + spinlock_t *lock; + pte_t *ptep; + /** + * Outputs: + * + * @pfn: the PFN of the address + * @pgprot: the pgprot_t of the mapping + * @writable: whether the mapping is writable + * @special: whether the mapping is a special mapping (real PFN maps) + */ + unsigned long pfn; + pgprot_t pgprot; + bool writable; + bool special; +}; +int follow_pfnmap_start(struct follow_pfnmap_args *args); +void follow_pfnmap_end(struct follow_pfnmap_args *args); + extern void truncate_pagecache(struct inode *inode, loff_t new); extern void truncate_setsize(struct inode *inode, loff_t newsize); void pagecache_isize_extended(struct inode *inode, loff_t from, loff_t to); @@ -2541,11 +2544,6 @@ int set_page_dirty_lock(struct page *page); int get_cmdline(struct task_struct *task, char *buffer, int buflen); -extern unsigned long move_page_tables(struct vm_area_struct *vma, - unsigned long old_addr, struct vm_area_struct *new_vma, - unsigned long new_addr, unsigned long len, - bool need_rmap_locks, bool for_stack); - /* * Flags used by change_protection(). For now we make it a bitmap so * that we can pass in multiple flags just like parameters. However @@ -2566,21 +2564,6 @@ extern unsigned long move_page_tables(struct vm_area_struct *vma, #define MM_CP_UFFD_WP_ALL (MM_CP_UFFD_WP | \ MM_CP_UFFD_WP_RESOLVE) -bool vma_needs_dirty_tracking(struct vm_area_struct *vma); -bool vma_wants_writenotify(struct vm_area_struct *vma, pgprot_t vm_page_prot); -static inline bool vma_wants_manual_pte_write_upgrade(struct vm_area_struct *vma) -{ - /* - * We want to check manually if we can change individual PTEs writable - * if we can't do that automatically for all PTEs in a mapping. For - * private mappings, that's always the case when we have write - * permissions as we properly have to handle COW. - */ - if (vma->vm_flags & VM_SHARED) - return vma_wants_writenotify(vma, vma->vm_page_prot); - return !!(vma->vm_flags & VM_WRITE); - -} bool can_change_pte_writable(struct vm_area_struct *vma, unsigned long addr, pte_t pte); extern long change_protection(struct mmu_gather *tlb, @@ -2704,6 +2687,30 @@ static inline pte_t pte_mkspecial(pte_t pte) } #endif +#ifndef CONFIG_ARCH_SUPPORTS_PMD_PFNMAP +static inline bool pmd_special(pmd_t pmd) +{ + return false; +} + +static inline pmd_t pmd_mkspecial(pmd_t pmd) +{ + return pmd; +} +#endif /* CONFIG_ARCH_SUPPORTS_PMD_PFNMAP */ + +#ifndef CONFIG_ARCH_SUPPORTS_PUD_PFNMAP +static inline bool pud_special(pud_t pud) +{ + return false; +} + +static inline pud_t pud_mkspecial(pud_t pud) +{ + return pud; +} +#endif /* CONFIG_ARCH_SUPPORTS_PUD_PFNMAP */ + #ifndef CONFIG_ARCH_HAS_PTE_DEVMAP static inline int pte_devmap(pte_t pte) { @@ -2896,7 +2903,7 @@ static inline void pagetable_free(struct ptdesc *pt) __free_pages(page, compound_order(page)); } -#if USE_SPLIT_PTE_PTLOCKS +#if defined(CONFIG_SPLIT_PTE_PTLOCKS) #if ALLOC_SPLIT_PTLOCKS void __init ptlock_cache_init(void); bool ptlock_alloc(struct ptdesc *ptdesc); @@ -2954,7 +2961,7 @@ static inline bool ptlock_init(struct ptdesc *ptdesc) return true; } -#else /* !USE_SPLIT_PTE_PTLOCKS */ +#else /* !defined(CONFIG_SPLIT_PTE_PTLOCKS) */ /* * We use mm->page_table_lock to guard all pagetable pages of the mm. */ @@ -2969,7 +2976,7 @@ static inline spinlock_t *ptep_lockptr(struct mm_struct *mm, pte_t *pte) static inline void ptlock_cache_init(void) {} static inline bool ptlock_init(struct ptdesc *ptdesc) { return true; } static inline void ptlock_free(struct ptdesc *ptdesc) {} -#endif /* USE_SPLIT_PTE_PTLOCKS */ +#endif /* defined(CONFIG_SPLIT_PTE_PTLOCKS) */ static inline bool pagetable_pte_ctor(struct ptdesc *ptdesc) { @@ -3029,7 +3036,7 @@ pte_t *pte_offset_map_nolock(struct mm_struct *mm, pmd_t *pmd, ((unlikely(pmd_none(*(pmd))) && __pte_alloc_kernel(pmd))? \ NULL: pte_offset_kernel(pmd, address)) -#if USE_SPLIT_PMD_PTLOCKS +#if defined(CONFIG_SPLIT_PMD_PTLOCKS) static inline struct page *pmd_pgtable_page(pmd_t *pmd) { @@ -3288,78 +3295,9 @@ void anon_vma_interval_tree_verify(struct anon_vma_chain *node); /* mmap.c */ extern int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin); -extern int vma_expand(struct vma_iterator *vmi, struct vm_area_struct *vma, - unsigned long start, unsigned long end, pgoff_t pgoff, - struct vm_area_struct *next); -extern int vma_shrink(struct vma_iterator *vmi, struct vm_area_struct *vma, - unsigned long start, unsigned long end, pgoff_t pgoff); -extern struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *); extern int insert_vm_struct(struct mm_struct *, struct vm_area_struct *); -extern void unlink_file_vma(struct vm_area_struct *); -extern struct vm_area_struct *copy_vma(struct vm_area_struct **, - unsigned long addr, unsigned long len, pgoff_t pgoff, - bool *need_rmap_locks); extern void exit_mmap(struct mm_struct *); -struct vm_area_struct *vma_modify(struct vma_iterator *vmi, - struct vm_area_struct *prev, - struct vm_area_struct *vma, - unsigned long start, unsigned long end, - unsigned long vm_flags, - struct mempolicy *policy, - struct vm_userfaultfd_ctx uffd_ctx, - struct anon_vma_name *anon_name); - -/* We are about to modify the VMA's flags. */ -static inline struct vm_area_struct -*vma_modify_flags(struct vma_iterator *vmi, - struct vm_area_struct *prev, - struct vm_area_struct *vma, - unsigned long start, unsigned long end, - unsigned long new_flags) -{ - return vma_modify(vmi, prev, vma, start, end, new_flags, - vma_policy(vma), vma->vm_userfaultfd_ctx, - anon_vma_name(vma)); -} - -/* We are about to modify the VMA's flags and/or anon_name. */ -static inline struct vm_area_struct -*vma_modify_flags_name(struct vma_iterator *vmi, - struct vm_area_struct *prev, - struct vm_area_struct *vma, - unsigned long start, - unsigned long end, - unsigned long new_flags, - struct anon_vma_name *new_name) -{ - return vma_modify(vmi, prev, vma, start, end, new_flags, - vma_policy(vma), vma->vm_userfaultfd_ctx, new_name); -} - -/* We are about to modify the VMA's memory policy. */ -static inline struct vm_area_struct -*vma_modify_policy(struct vma_iterator *vmi, - struct vm_area_struct *prev, - struct vm_area_struct *vma, - unsigned long start, unsigned long end, - struct mempolicy *new_pol) -{ - return vma_modify(vmi, prev, vma, start, end, vma->vm_flags, - new_pol, vma->vm_userfaultfd_ctx, anon_vma_name(vma)); -} - -/* We are about to modify the VMA's flags and/or uffd context. */ -static inline struct vm_area_struct -*vma_modify_flags_uffd(struct vma_iterator *vmi, - struct vm_area_struct *prev, - struct vm_area_struct *vma, - unsigned long start, unsigned long end, - unsigned long new_flags, - struct vm_userfaultfd_ctx new_ctx) -{ - return vma_modify(vmi, prev, vma, start, end, new_flags, - vma_policy(vma), new_ctx, anon_vma_name(vma)); -} +int relocate_vma_down(struct vm_area_struct *vma, unsigned long shift); static inline int check_data_rlimit(unsigned long rlim, unsigned long new, @@ -3392,10 +3330,6 @@ extern struct vm_area_struct *_install_special_mapping(struct mm_struct *mm, unsigned long addr, unsigned long len, unsigned long flags, const struct vm_special_mapping *spec); -/* This is an obsolete alternative to _install_special_mapping. */ -extern int install_special_mapping(struct mm_struct *mm, - unsigned long addr, unsigned long len, - unsigned long flags, struct page **pages); unsigned long randomize_stack_top(unsigned long stack_top); unsigned long randomize_page(unsigned long start, unsigned long range); @@ -3421,14 +3355,14 @@ extern unsigned long do_mmap(struct file *file, unsigned long addr, extern int do_vmi_munmap(struct vma_iterator *vmi, struct mm_struct *mm, unsigned long start, size_t len, struct list_head *uf, bool unlock); +int do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma, + struct mm_struct *mm, unsigned long start, + unsigned long end, struct list_head *uf, bool unlock); extern int do_munmap(struct mm_struct *, unsigned long, size_t, struct list_head *uf); extern int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int behavior); #ifdef CONFIG_MMU -extern int do_vma_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma, - unsigned long start, unsigned long end, - struct list_head *uf, bool unlock); extern int __mm_populate(unsigned long addr, unsigned long len, int ignore_errors); static inline void mm_populate(unsigned long addr, unsigned long len) @@ -3656,9 +3590,6 @@ static inline vm_fault_t vmf_fs_error(int err) return VM_FAULT_SIGBUS; } -struct page *follow_page(struct vm_area_struct *vma, unsigned long address, - unsigned int foll_flags); - static inline int vm_fault_to_errno(vm_fault_t vm_fault, int foll_flags) { if (vm_fault & VM_FAULT_OOM) @@ -4194,18 +4125,18 @@ madvise_set_anon_name(struct mm_struct *mm, unsigned long start, #ifdef CONFIG_UNACCEPTED_MEMORY -bool range_contains_unaccepted_memory(phys_addr_t start, phys_addr_t end); -void accept_memory(phys_addr_t start, phys_addr_t end); +bool range_contains_unaccepted_memory(phys_addr_t start, unsigned long size); +void accept_memory(phys_addr_t start, unsigned long size); #else static inline bool range_contains_unaccepted_memory(phys_addr_t start, - phys_addr_t end) + unsigned long size) { return false; } -static inline void accept_memory(phys_addr_t start, phys_addr_t end) +static inline void accept_memory(phys_addr_t start, unsigned long size) { } @@ -4213,9 +4144,7 @@ static inline void accept_memory(phys_addr_t start, phys_addr_t end) static inline bool pfn_is_unaccepted_memory(unsigned long pfn) { - phys_addr_t paddr = pfn << PAGE_SHIFT; - - return range_contains_unaccepted_memory(paddr, paddr + PAGE_SIZE); + return range_contains_unaccepted_memory(pfn << PAGE_SHIFT, PAGE_SIZE); } void vma_pgtable_walk_begin(struct vm_area_struct *vma); @@ -4233,4 +4162,61 @@ static inline int do_mseal(unsigned long start, size_t len_in, unsigned long fla } #endif +#ifdef CONFIG_MEM_ALLOC_PROFILING +static inline void pgalloc_tag_split(struct folio *folio, int old_order, int new_order) +{ + int i; + struct alloc_tag *tag; + unsigned int nr_pages = 1 << new_order; + + if (!mem_alloc_profiling_enabled()) + return; + + tag = pgalloc_tag_get(&folio->page); + if (!tag) + return; + + for (i = nr_pages; i < (1 << old_order); i += nr_pages) { + union codetag_ref *ref = get_page_tag_ref(folio_page(folio, i)); + + if (ref) { + /* Set new reference to point to the original tag */ + alloc_tag_ref_set(ref, tag); + put_page_tag_ref(ref); + } + } +} + +static inline void pgalloc_tag_copy(struct folio *new, struct folio *old) +{ + struct alloc_tag *tag; + union codetag_ref *ref; + + tag = pgalloc_tag_get(&old->page); + if (!tag) + return; + + ref = get_page_tag_ref(&new->page); + if (!ref) + return; + + /* Clear the old ref to the original allocation tag. */ + clear_page_tag_ref(&old->page); + /* Decrement the counters of the tag on get_new_folio. */ + alloc_tag_sub(ref, folio_nr_pages(new)); + + __alloc_tag_ref_set(ref, tag); + + put_page_tag_ref(ref); +} +#else /* !CONFIG_MEM_ALLOC_PROFILING */ +static inline void pgalloc_tag_split(struct folio *folio, int old_order, int new_order) +{ +} + +static inline void pgalloc_tag_copy(struct folio *new, struct folio *old) +{ +} +#endif /* CONFIG_MEM_ALLOC_PROFILING */ + #endif /* _LINUX_MM_H */ diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 485424979254..6e3bdf8e38bc 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -109,7 +109,7 @@ struct page { /** * @private: Mapping-private opaque data. * Usually used for buffer_heads if PagePrivate. - * Used for swp_entry_t if PageSwapCache. + * Used for swp_entry_t if swapcache flag set. * Indicates order in the buddy system if PageBuddy. */ unsigned long private; @@ -660,6 +660,9 @@ struct vma_numab_state { * per VM-area/task. A VM area is any part of the process virtual memory * space that has a special rule for the page-fault handlers (ie a shared * library, the executable area etc). + * + * Only explicitly marked struct members may be accessed by RCU readers before + * getting a stable reference. */ struct vm_area_struct { /* The first cache line has the info for VMA tree walking. */ @@ -675,7 +678,11 @@ struct vm_area_struct { #endif }; - struct mm_struct *vm_mm; /* The address space we belong to. */ + /* + * The address space we belong to. + * Unstable RCU readers are allowed to read this. + */ + struct mm_struct *vm_mm; pgprot_t vm_page_prot; /* Access permissions of this VMA. */ /* @@ -688,7 +695,10 @@ struct vm_area_struct { }; #ifdef CONFIG_PER_VMA_LOCK - /* Flag to indicate areas detached from the mm->mm_mt tree */ + /* + * Flag to indicate areas detached from the mm->mm_mt tree. + * Unstable RCU readers are allowed to read this. + */ bool detached; /* @@ -706,6 +716,7 @@ struct vm_area_struct { * slowpath. */ int vm_lock_seq; + /* Unstable RCU readers are allowed to read this. */ struct vma_lock *vm_lock; #endif @@ -947,7 +958,7 @@ struct mm_struct { #ifdef CONFIG_MMU_NOTIFIER struct mmu_notifier_subscriptions *notifier_subscriptions; #endif -#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS +#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !defined(CONFIG_SPLIT_PMD_PTLOCKS) pgtable_t pmd_huge_pte; /* protected by page_table_lock */ #endif #ifdef CONFIG_NUMA_BALANCING @@ -1313,6 +1324,9 @@ struct vm_special_mapping { int (*mremap)(const struct vm_special_mapping *sm, struct vm_area_struct *new_vma); + + void (*close)(const struct vm_special_mapping *sm, + struct vm_area_struct *vma); }; enum tlb_flush_reason { diff --git a/include/linux/mm_types_task.h b/include/linux/mm_types_task.h index a2f6179b672b..bff5706b76e1 100644 --- a/include/linux/mm_types_task.h +++ b/include/linux/mm_types_task.h @@ -16,9 +16,6 @@ #include <asm/tlbbatch.h> #endif -#define USE_SPLIT_PTE_PTLOCKS (NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS) -#define USE_SPLIT_PMD_PTLOCKS (USE_SPLIT_PTE_PTLOCKS && \ - IS_ENABLED(CONFIG_ARCH_ENABLE_SPLIT_PMD_PTLOCK)) #define ALLOC_SPLIT_PTLOCKS (SPINLOCK_SIZE > BITS_PER_LONG/8) /* diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 1dc6248feb83..17506e4a2835 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -666,11 +666,6 @@ enum zone_watermarks { #define NR_LOWORDER_PCP_LISTS (MIGRATE_PCPTYPES * (PAGE_ALLOC_COSTLY_ORDER + 1)) #define NR_PCP_LISTS (NR_LOWORDER_PCP_LISTS + NR_PCP_THP) -#define min_wmark_pages(z) (z->_watermark[WMARK_MIN] + z->watermark_boost) -#define low_wmark_pages(z) (z->_watermark[WMARK_LOW] + z->watermark_boost) -#define high_wmark_pages(z) (z->_watermark[WMARK_HIGH] + z->watermark_boost) -#define wmark_pages(z, i) (z->_watermark[i] + z->watermark_boost) - /* * Flags used in pcp->flags field. * @@ -1016,6 +1011,32 @@ enum zone_flags { ZONE_BELOW_HIGH, /* zone is below high watermark. */ }; +static inline unsigned long wmark_pages(const struct zone *z, + enum zone_watermarks w) +{ + return z->_watermark[w] + z->watermark_boost; +} + +static inline unsigned long min_wmark_pages(const struct zone *z) +{ + return wmark_pages(z, WMARK_MIN); +} + +static inline unsigned long low_wmark_pages(const struct zone *z) +{ + return wmark_pages(z, WMARK_LOW); +} + +static inline unsigned long high_wmark_pages(const struct zone *z) +{ + return wmark_pages(z, WMARK_HIGH); +} + +static inline unsigned long promo_wmark_pages(const struct zone *z) +{ + return wmark_pages(z, WMARK_PROMO); +} + static inline unsigned long zone_managed_pages(struct zone *zone) { return (unsigned long)atomic_long_read(&zone->managed_pages); @@ -1688,7 +1709,7 @@ static inline struct zoneref *first_zones_zonelist(struct zonelist *zonelist, zone = zonelist_zone(z)) #define for_next_zone_zonelist_nodemask(zone, z, highidx, nodemask) \ - for (zone = z->zone; \ + for (zone = zonelist_zone(z); \ zone; \ z = next_zones_zonelist(++z, highidx, nodemask), \ zone = zonelist_zone(z)) @@ -1724,7 +1745,7 @@ static inline bool movable_only_nodes(nodemask_t *nodes) nid = first_node(*nodes); zonelist = &NODE_DATA(nid)->node_zonelists[ZONELIST_FALLBACK]; z = first_zones_zonelist(zonelist, ZONE_NORMAL, nodes); - return (!z->zone) ? true : false; + return (!zonelist_zone(z)) ? true : false; } diff --git a/include/linux/numa.h b/include/linux/numa.h index eb19503604fe..3567e40329eb 100644 --- a/include/linux/numa.h +++ b/include/linux/numa.h @@ -30,6 +30,12 @@ static inline bool numa_valid_node(int nid) #ifdef CONFIG_NUMA #include <asm/sparsemem.h> +extern struct pglist_data *node_data[]; +#define NODE_DATA(nid) (node_data[nid]) + +void __init alloc_node_data(int nid); +void __init alloc_offline_node_data(int nid); + /* Generic implementation available */ int numa_nearest_node(int node, unsigned int state); @@ -57,6 +63,8 @@ static inline int phys_to_target_node(u64 start) { return 0; } + +static inline void alloc_offline_node_data(int nid) {} #endif #define numa_map_to_online_node(node) numa_nearest_node(node, N_ONLINE) diff --git a/include/linux/numa_memblks.h b/include/linux/numa_memblks.h new file mode 100644 index 000000000000..cfad6ce7e1bd --- /dev/null +++ b/include/linux/numa_memblks.h @@ -0,0 +1,58 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __NUMA_MEMBLKS_H +#define __NUMA_MEMBLKS_H + +#ifdef CONFIG_NUMA_MEMBLKS +#include <linux/types.h> + +#define NR_NODE_MEMBLKS (MAX_NUMNODES * 2) + +void __init numa_set_distance(int from, int to, int distance); +void __init numa_reset_distance(void); + +struct numa_memblk { + u64 start; + u64 end; + int nid; +}; + +struct numa_meminfo { + int nr_blks; + struct numa_memblk blk[NR_NODE_MEMBLKS]; +}; + +int __init numa_add_memblk(int nodeid, u64 start, u64 end); +void __init numa_remove_memblk_from(int idx, struct numa_meminfo *mi); + +int __init numa_cleanup_meminfo(struct numa_meminfo *mi); + +int __init numa_memblks_init(int (*init_func)(void), + bool memblock_force_top_down); + +#ifdef CONFIG_NUMA_EMU +int numa_emu_cmdline(char *str); +void __init numa_emu_update_cpu_to_node(int *emu_nid_to_phys, + unsigned int nr_emu_nids); +u64 __init numa_emu_dma_end(void); +void __init numa_emulation(struct numa_meminfo *numa_meminfo, + int numa_dist_cnt); +#else +static inline void numa_emulation(struct numa_meminfo *numa_meminfo, + int numa_dist_cnt) +{ } +static inline int numa_emu_cmdline(char *str) +{ + return -EINVAL; +} +#endif /* CONFIG_NUMA_EMU */ + +#ifdef CONFIG_NUMA_KEEP_MEMINFO +extern int phys_to_target_node(u64 start); +#define phys_to_target_node phys_to_target_node +extern int memory_add_physaddr_to_nid(u64 start); +#define memory_add_physaddr_to_nid memory_add_physaddr_to_nid +#endif /* CONFIG_NUMA_KEEP_MEMINFO */ + +#endif /* CONFIG_NUMA_MEMBLKS */ + +#endif /* __NUMA_MEMBLKS_H */ diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 5769fe6e4950..1b3a76710487 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -66,8 +66,6 @@ * PG_referenced, PG_reclaim are used for page reclaim for anonymous and * file-backed pagecache (see mm/vmscan.c). * - * PG_error is set to indicate that an I/O error occurred on this page. - * * PG_arch_1 is an architecture specific page state bit. The generic code * guarantees that this bit is cleared for a page when it first is entered into * the page cache. @@ -103,22 +101,18 @@ enum pageflags { PG_waiters, /* Page has waiters, check its waitqueue. Must be bit #7 and in the same byte as "PG_locked" */ PG_active, PG_workingset, - PG_error, - PG_owner_priv_1, /* Owner use. If pagecache, fs may use*/ + PG_owner_priv_1, /* Owner use. If pagecache, fs may use */ + PG_owner_2, /* Owner use. If pagecache, fs may use */ PG_arch_1, PG_reserved, PG_private, /* If pagecache, has fs-private data */ PG_private_2, /* If pagecache, has fs aux data */ - PG_mappedtodisk, /* Has blocks allocated on-disk */ PG_reclaim, /* To be reclaimed asap */ PG_swapbacked, /* Page is backed by RAM/swap */ PG_unevictable, /* Page is "unevictable" */ #ifdef CONFIG_MMU PG_mlocked, /* Page is vma mlocked */ #endif -#ifdef CONFIG_ARCH_USES_PG_UNCACHED - PG_uncached, /* Page has been mapped as uncached */ -#endif #ifdef CONFIG_MEMORY_FAILURE PG_hwpoison, /* hardware poisoned page. Don't touch */ #endif @@ -126,14 +120,21 @@ enum pageflags { PG_young, PG_idle, #endif -#ifdef CONFIG_ARCH_USES_PG_ARCH_X +#ifdef CONFIG_ARCH_USES_PG_ARCH_2 PG_arch_2, +#endif +#ifdef CONFIG_ARCH_USES_PG_ARCH_3 PG_arch_3, #endif __NR_PAGEFLAGS, PG_readahead = PG_reclaim, + /* Anonymous memory (and shmem) */ + PG_swapcache = PG_owner_priv_1, /* Swap page: swp_entry_t in private */ + /* Some filesystems */ + PG_checked = PG_owner_priv_1, + /* * Depending on the way an anonymous folio can be mapped into a page * table (e.g., single PMD/PUD/CONT of the head page vs. PTE-mapped @@ -141,13 +142,13 @@ enum pageflags { * tail pages of an anonymous folio. For now, we only expect it to be * set on tail pages for PTE-mapped THP. */ - PG_anon_exclusive = PG_mappedtodisk, - - /* Filesystems */ - PG_checked = PG_owner_priv_1, + PG_anon_exclusive = PG_owner_2, - /* SwapBacked */ - PG_swapcache = PG_owner_priv_1, /* Swap page: swp_entry_t in private */ + /* + * Set if all buffer heads in the folio are mapped. + * Filesystems which do not use BHs can use it for their own purpose. + */ + PG_mappedtodisk = PG_owner_2, /* Two page bits are conscripted by FS-Cache to maintain local caching * state. These bits are set on pages belonging to the netfs's inodes @@ -183,8 +184,9 @@ enum pageflags { */ /* At least one page in this folio has the hwpoison flag set */ - PG_has_hwpoisoned = PG_error, + PG_has_hwpoisoned = PG_active, PG_large_rmappable = PG_workingset, /* anon or file-backed */ + PG_partially_mapped = PG_reclaim, /* was identified to be partially mapped */ }; #define PAGEFLAGS_MASK ((1UL << NR_PAGEFLAGS) - 1) @@ -235,7 +237,7 @@ static __always_inline int page_is_fake_head(const struct page *page) return page_fixed_fake_head(page) != page; } -static inline unsigned long _compound_head(const struct page *page) +static __always_inline unsigned long _compound_head(const struct page *page) { unsigned long head = READ_ONCE(page->compound_head); @@ -506,7 +508,6 @@ static inline int TestClearPage##uname(struct page *page) { return 0; } __PAGEFLAG(Locked, locked, PF_NO_TAIL) FOLIO_FLAG(waiters, FOLIO_HEAD_PAGE) -PAGEFLAG(Error, error, PF_NO_TAIL) TESTCLEARFLAG(Error, error, PF_NO_TAIL) FOLIO_FLAG(referenced, FOLIO_HEAD_PAGE) FOLIO_TEST_CLEAR_FLAG(referenced, FOLIO_HEAD_PAGE) __FOLIO_SET_FLAG(referenced, FOLIO_HEAD_PAGE) @@ -514,8 +515,9 @@ PAGEFLAG(Dirty, dirty, PF_HEAD) TESTSCFLAG(Dirty, dirty, PF_HEAD) __CLEARPAGEFLAG(Dirty, dirty, PF_HEAD) PAGEFLAG(LRU, lru, PF_HEAD) __CLEARPAGEFLAG(LRU, lru, PF_HEAD) TESTCLEARFLAG(LRU, lru, PF_HEAD) -PAGEFLAG(Active, active, PF_HEAD) __CLEARPAGEFLAG(Active, active, PF_HEAD) - TESTCLEARFLAG(Active, active, PF_HEAD) +FOLIO_FLAG(active, FOLIO_HEAD_PAGE) + __FOLIO_CLEAR_FLAG(active, FOLIO_HEAD_PAGE) + FOLIO_TEST_CLEAR_FLAG(active, FOLIO_HEAD_PAGE) PAGEFLAG(Workingset, workingset, PF_HEAD) TESTCLEARFLAG(Workingset, workingset, PF_HEAD) PAGEFLAG(Checked, checked, PF_NO_COMPOUND) /* Used by some filesystems */ @@ -531,9 +533,9 @@ PAGEFLAG(XenRemapped, xen_remapped, PF_NO_COMPOUND) PAGEFLAG(Reserved, reserved, PF_NO_COMPOUND) __CLEARPAGEFLAG(Reserved, reserved, PF_NO_COMPOUND) __SETPAGEFLAG(Reserved, reserved, PF_NO_COMPOUND) -PAGEFLAG(SwapBacked, swapbacked, PF_NO_TAIL) - __CLEARPAGEFLAG(SwapBacked, swapbacked, PF_NO_TAIL) - __SETPAGEFLAG(SwapBacked, swapbacked, PF_NO_TAIL) +FOLIO_FLAG(swapbacked, FOLIO_HEAD_PAGE) + __FOLIO_CLEAR_FLAG(swapbacked, FOLIO_HEAD_PAGE) + __FOLIO_SET_FLAG(swapbacked, FOLIO_HEAD_PAGE) /* * Private page markings that may be used by the filesystem that owns the page @@ -542,8 +544,9 @@ PAGEFLAG(SwapBacked, swapbacked, PF_NO_TAIL) */ PAGEFLAG(Private, private, PF_ANY) PAGEFLAG(Private2, private_2, PF_ANY) TESTSCFLAG(Private2, private_2, PF_ANY) -PAGEFLAG(OwnerPriv1, owner_priv_1, PF_ANY) - TESTCLEARFLAG(OwnerPriv1, owner_priv_1, PF_ANY) + +/* owner_2 can be set on tail pages for anon memory */ +FOLIO_FLAG(owner_2, FOLIO_HEAD_PAGE) /* * Only test-and-set exist for PG_writeback. The unconditional operators are @@ -556,8 +559,8 @@ PAGEFLAG(MappedToDisk, mappedtodisk, PF_NO_TAIL) /* PG_readahead is only used for reads; PG_reclaim is only for writes */ PAGEFLAG(Reclaim, reclaim, PF_NO_TAIL) TESTCLEARFLAG(Reclaim, reclaim, PF_NO_TAIL) -PAGEFLAG(Readahead, readahead, PF_NO_COMPOUND) - TESTCLEARFLAG(Readahead, readahead, PF_NO_COMPOUND) +FOLIO_FLAG(readahead, FOLIO_HEAD_PAGE) + FOLIO_TEST_CLEAR_FLAG(readahead, FOLIO_HEAD_PAGE) #ifdef CONFIG_HIGHMEM /* @@ -577,34 +580,26 @@ static __always_inline bool folio_test_swapcache(const struct folio *folio) test_bit(PG_swapcache, const_folio_flags(folio, 0)); } -static __always_inline bool PageSwapCache(const struct page *page) -{ - return folio_test_swapcache(page_folio(page)); -} - -SETPAGEFLAG(SwapCache, swapcache, PF_NO_TAIL) -CLEARPAGEFLAG(SwapCache, swapcache, PF_NO_TAIL) +FOLIO_SET_FLAG(swapcache, FOLIO_HEAD_PAGE) +FOLIO_CLEAR_FLAG(swapcache, FOLIO_HEAD_PAGE) #else -PAGEFLAG_FALSE(SwapCache, swapcache) +FOLIO_FLAG_FALSE(swapcache) #endif -PAGEFLAG(Unevictable, unevictable, PF_HEAD) - __CLEARPAGEFLAG(Unevictable, unevictable, PF_HEAD) - TESTCLEARFLAG(Unevictable, unevictable, PF_HEAD) +FOLIO_FLAG(unevictable, FOLIO_HEAD_PAGE) + __FOLIO_CLEAR_FLAG(unevictable, FOLIO_HEAD_PAGE) + FOLIO_TEST_CLEAR_FLAG(unevictable, FOLIO_HEAD_PAGE) #ifdef CONFIG_MMU -PAGEFLAG(Mlocked, mlocked, PF_NO_TAIL) - __CLEARPAGEFLAG(Mlocked, mlocked, PF_NO_TAIL) - TESTSCFLAG(Mlocked, mlocked, PF_NO_TAIL) -#else -PAGEFLAG_FALSE(Mlocked, mlocked) __CLEARPAGEFLAG_NOOP(Mlocked, mlocked) - TESTSCFLAG_FALSE(Mlocked, mlocked) -#endif - -#ifdef CONFIG_ARCH_USES_PG_UNCACHED -PAGEFLAG(Uncached, uncached, PF_NO_COMPOUND) +FOLIO_FLAG(mlocked, FOLIO_HEAD_PAGE) + __FOLIO_CLEAR_FLAG(mlocked, FOLIO_HEAD_PAGE) + FOLIO_TEST_CLEAR_FLAG(mlocked, FOLIO_HEAD_PAGE) + FOLIO_TEST_SET_FLAG(mlocked, FOLIO_HEAD_PAGE) #else -PAGEFLAG_FALSE(Uncached, uncached) +FOLIO_FLAG_FALSE(mlocked) + __FOLIO_CLEAR_FLAG_NOOP(mlocked) + FOLIO_TEST_CLEAR_FLAG_FALSE(mlocked) + FOLIO_TEST_SET_FLAG_FALSE(mlocked) #endif #ifdef CONFIG_MEMORY_FAILURE @@ -865,8 +860,18 @@ static inline void ClearPageCompound(struct page *page) ClearPageHead(page); } FOLIO_FLAG(large_rmappable, FOLIO_SECOND_PAGE) +FOLIO_TEST_FLAG(partially_mapped, FOLIO_SECOND_PAGE) +/* + * PG_partially_mapped is protected by deferred_split split_queue_lock, + * so its safe to use non-atomic set/clear. + */ +__FOLIO_SET_FLAG(partially_mapped, FOLIO_SECOND_PAGE) +__FOLIO_CLEAR_FLAG(partially_mapped, FOLIO_SECOND_PAGE) #else FOLIO_FLAG_FALSE(large_rmappable) +FOLIO_TEST_FLAG_FALSE(partially_mapped) +__FOLIO_SET_FLAG_NOOP(partially_mapped) +__FOLIO_CLEAR_FLAG_NOOP(partially_mapped) #endif #define PG_head_mask ((1UL << PG_head)) @@ -927,79 +932,74 @@ PAGEFLAG_FALSE(HasHWPoisoned, has_hwpoisoned) #endif /* - * For pages that are never mapped to userspace, - * page_type may be used. Because it is initialised to -1, we invert the - * sense of the bit, so __SetPageFoo *clears* the bit used for PageFoo, and - * __ClearPageFoo *sets* the bit used for PageFoo. We reserve a few high and - * low bits so that an underflow or overflow of _mapcount won't be - * mistaken for a page type value. + * For pages that do not use mapcount, page_type may be used. + * The low 24 bits of pagetype may be used for your own purposes, as long + * as you are careful to not affect the top 8 bits. The low bits of + * pagetype will be overwritten when you clear the page_type from the page. */ - enum pagetype { - PG_buddy = 0x40000000, - PG_offline = 0x20000000, - PG_table = 0x10000000, - PG_guard = 0x08000000, - PG_hugetlb = 0x04000000, - PG_slab = 0x02000000, - PG_zsmalloc = 0x01000000, - - PAGE_TYPE_BASE = 0x80000000, - - /* - * Reserve 0xffff0000 - 0xfffffffe to catch _mapcount underflows and - * allow owners that set a type to reuse the lower 16 bit for their own - * purposes. - */ - PAGE_MAPCOUNT_RESERVE = ~0x0000ffff, + /* 0x00-0x7f are positive numbers, ie mapcount */ + /* Reserve 0x80-0xef for mapcount overflow. */ + PGTY_buddy = 0xf0, + PGTY_offline = 0xf1, + PGTY_table = 0xf2, + PGTY_guard = 0xf3, + PGTY_hugetlb = 0xf4, + PGTY_slab = 0xf5, + PGTY_zsmalloc = 0xf6, + PGTY_unaccepted = 0xf7, + + PGTY_mapcount_underflow = 0xff }; -#define PageType(page, flag) \ - ((READ_ONCE(page->page_type) & (PAGE_TYPE_BASE | flag)) == PAGE_TYPE_BASE) -#define folio_test_type(folio, flag) \ - ((READ_ONCE(folio->page.page_type) & (PAGE_TYPE_BASE | flag)) == PAGE_TYPE_BASE) +static inline bool page_type_has_type(int page_type) +{ + return page_type < (PGTY_mapcount_underflow << 24); +} -static inline int page_type_has_type(unsigned int page_type) +/* This takes a mapcount which is one more than page->_mapcount */ +static inline bool page_mapcount_is_type(unsigned int mapcount) { - return (int)page_type < PAGE_MAPCOUNT_RESERVE; + return page_type_has_type(mapcount - 1); } -static inline int page_has_type(const struct page *page) +static inline bool page_has_type(const struct page *page) { - return page_type_has_type(READ_ONCE(page->page_type)); + return page_mapcount_is_type(data_race(page->page_type)); } #define FOLIO_TYPE_OPS(lname, fname) \ -static __always_inline bool folio_test_##fname(const struct folio *folio)\ +static __always_inline bool folio_test_##fname(const struct folio *folio) \ { \ - return folio_test_type(folio, PG_##lname); \ + return data_race(folio->page.page_type >> 24) == PGTY_##lname; \ } \ static __always_inline void __folio_set_##fname(struct folio *folio) \ { \ - VM_BUG_ON_FOLIO(!folio_test_type(folio, 0), folio); \ - folio->page.page_type &= ~PG_##lname; \ + VM_BUG_ON_FOLIO(data_race(folio->page.page_type) != UINT_MAX, \ + folio); \ + folio->page.page_type = (unsigned int)PGTY_##lname << 24; \ } \ static __always_inline void __folio_clear_##fname(struct folio *folio) \ { \ VM_BUG_ON_FOLIO(!folio_test_##fname(folio), folio); \ - folio->page.page_type |= PG_##lname; \ + folio->page.page_type = UINT_MAX; \ } #define PAGE_TYPE_OPS(uname, lname, fname) \ FOLIO_TYPE_OPS(lname, fname) \ static __always_inline int Page##uname(const struct page *page) \ { \ - return PageType(page, PG_##lname); \ + return data_race(page->page_type >> 24) == PGTY_##lname; \ } \ static __always_inline void __SetPage##uname(struct page *page) \ { \ - VM_BUG_ON_PAGE(!PageType(page, 0), page); \ - page->page_type &= ~PG_##lname; \ + VM_BUG_ON_PAGE(data_race(page->page_type) != UINT_MAX, page); \ + page->page_type = (unsigned int)PGTY_##lname << 24; \ } \ static __always_inline void __ClearPage##uname(struct page *page) \ { \ VM_BUG_ON_PAGE(!Page##uname(page), page); \ - page->page_type |= PG_##lname; \ + page->page_type = UINT_MAX; \ } /* @@ -1076,6 +1076,13 @@ FOLIO_TEST_FLAG_FALSE(hugetlb) PAGE_TYPE_OPS(Zsmalloc, zsmalloc, zsmalloc) +/* + * Mark pages that has to be accepted before touched for the first time. + * + * Serialized with zone lock. + */ +PAGE_TYPE_OPS(Unaccepted, unaccepted, unaccepted) + /** * PageHuge - Determine if the page belongs to hugetlbfs * @page: The page to test. @@ -1175,25 +1182,20 @@ static __always_inline void __ClearPageAnonExclusive(struct page *page) */ #define PAGE_FLAGS_SECOND \ (0xffUL /* order */ | 1UL << PG_has_hwpoisoned | \ - 1UL << PG_large_rmappable) + 1UL << PG_large_rmappable | 1UL << PG_partially_mapped) #define PAGE_FLAGS_PRIVATE \ (1UL << PG_private | 1UL << PG_private_2) /** - * page_has_private - Determine if page has private stuff - * @page: The page to be checked + * folio_has_private - Determine if folio has private stuff + * @folio: The folio to be checked * - * Determine if a page has private stuff, indicating that release routines + * Determine if a folio has private stuff, indicating that release routines * should be invoked upon it. */ -static inline int page_has_private(const struct page *page) -{ - return !!(page->flags & PAGE_FLAGS_PRIVATE); -} - -static inline bool folio_has_private(const struct folio *folio) +static inline int folio_has_private(const struct folio *folio) { - return page_has_private(&folio->page); + return !!(folio->flags & PAGE_FLAGS_PRIVATE); } #undef PF_ANY diff --git a/include/linux/page_counter.h b/include/linux/page_counter.h index 904c52f97284..79dbd8bc35a7 100644 --- a/include/linux/page_counter.h +++ b/include/linux/page_counter.h @@ -26,11 +26,14 @@ struct page_counter { atomic_long_t children_low_usage; unsigned long watermark; + /* Latest cg2 reset watermark */ + unsigned long local_watermark; unsigned long failcnt; /* Keep all the read most fields in a separete cacheline. */ CACHELINE_PADDING(_pad2_); + bool protection_support; unsigned long min; unsigned long low; unsigned long high; @@ -44,12 +47,17 @@ struct page_counter { #define PAGE_COUNTER_MAX (LONG_MAX / PAGE_SIZE) #endif +/* + * Protection is supported only for the first counter (with id 0). + */ static inline void page_counter_init(struct page_counter *counter, - struct page_counter *parent) + struct page_counter *parent, + bool protection_support) { - atomic_long_set(&counter->usage, 0); + counter->usage = (atomic_long_t)ATOMIC_LONG_INIT(0); counter->max = PAGE_COUNTER_MAX; counter->parent = parent; + counter->protection_support = protection_support; } static inline unsigned long page_counter_read(struct page_counter *counter) @@ -78,11 +86,24 @@ int page_counter_memparse(const char *buf, const char *max, static inline void page_counter_reset_watermark(struct page_counter *counter) { - counter->watermark = page_counter_read(counter); + unsigned long usage = page_counter_read(counter); + + /* + * Update local_watermark first, so it's always <= watermark + * (modulo CPU/compiler re-ordering) + */ + counter->local_watermark = usage; + counter->watermark = usage; } +#ifdef CONFIG_MEMCG void page_counter_calculate_protection(struct page_counter *root, struct page_counter *counter, bool recursive_protection); +#else +static inline void page_counter_calculate_protection(struct page_counter *root, + struct page_counter *counter, + bool recursive_protection) {} +#endif #endif /* _LINUX_PAGE_COUNTER_H */ diff --git a/include/linux/pagewalk.h b/include/linux/pagewalk.h index 27cd1e59ccf7..f5eb5a32aeed 100644 --- a/include/linux/pagewalk.h +++ b/include/linux/pagewalk.h @@ -130,4 +130,62 @@ int walk_page_mapping(struct address_space *mapping, pgoff_t first_index, pgoff_t nr, const struct mm_walk_ops *ops, void *private); +typedef int __bitwise folio_walk_flags_t; + +/* + * Walk migration entries as well. Careful: a large folio might get split + * concurrently. + */ +#define FW_MIGRATION ((__force folio_walk_flags_t)BIT(0)) + +/* Walk shared zeropages (small + huge) as well. */ +#define FW_ZEROPAGE ((__force folio_walk_flags_t)BIT(1)) + +enum folio_walk_level { + FW_LEVEL_PTE, + FW_LEVEL_PMD, + FW_LEVEL_PUD, +}; + +/** + * struct folio_walk - folio_walk_start() / folio_walk_end() data + * @page: exact folio page referenced (if applicable) + * @level: page table level identifying the entry type + * @pte: pointer to the page table entry (FW_LEVEL_PTE). + * @pmd: pointer to the page table entry (FW_LEVEL_PMD). + * @pud: pointer to the page table entry (FW_LEVEL_PUD). + * @ptl: pointer to the page table lock. + * + * (see folio_walk_start() documentation for more details) + */ +struct folio_walk { + /* public */ + struct page *page; + enum folio_walk_level level; + union { + pte_t *ptep; + pud_t *pudp; + pmd_t *pmdp; + }; + union { + pte_t pte; + pud_t pud; + pmd_t pmd; + }; + /* private */ + struct vm_area_struct *vma; + spinlock_t *ptl; +}; + +struct folio *folio_walk_start(struct folio_walk *fw, + struct vm_area_struct *vma, unsigned long addr, + folio_walk_flags_t flags); + +#define folio_walk_end(__fw, __vma) do { \ + spin_unlock((__fw)->ptl); \ + if (likely((__fw)->level == FW_LEVEL_PTE)) \ + pte_unmap((__fw)->ptep); \ + vma_pgtable_walk_end(__vma); \ +} while (0) + #endif /* _LINUX_PAGEWALK_H */ diff --git a/include/linux/percpu.h b/include/linux/percpu.h index 4b2047b78b67..b6321fc49159 100644 --- a/include/linux/percpu.h +++ b/include/linux/percpu.h @@ -135,7 +135,6 @@ extern void __init setup_per_cpu_areas(void); extern void __percpu *pcpu_alloc_noprof(size_t size, size_t align, bool reserved, gfp_t gfp) __alloc_size(1); -extern size_t pcpu_alloc_size(void __percpu *__pdata); #define __alloc_percpu_gfp(_size, _align, _gfp) \ alloc_hooks(pcpu_alloc_noprof(_size, _align, false, _gfp)) diff --git a/include/linux/pgalloc_tag.h b/include/linux/pgalloc_tag.h index 207f0c83c8e9..59a3deb792a8 100644 --- a/include/linux/pgalloc_tag.h +++ b/include/linux/pgalloc_tag.h @@ -80,36 +80,6 @@ static inline void pgalloc_tag_sub(struct page *page, unsigned int nr) } } -static inline void pgalloc_tag_split(struct page *page, unsigned int nr) -{ - int i; - struct page_ext *first_page_ext; - struct page_ext *page_ext; - union codetag_ref *ref; - struct alloc_tag *tag; - - if (!mem_alloc_profiling_enabled()) - return; - - first_page_ext = page_ext = page_ext_get(page); - if (unlikely(!page_ext)) - return; - - ref = codetag_ref_from_page_ext(page_ext); - if (!ref->ct) - goto out; - - tag = ct_to_alloc_tag(ref->ct); - page_ext = page_ext_next(page_ext); - for (i = 1; i < nr; i++) { - /* Set new reference to point to the original tag */ - alloc_tag_ref_set(codetag_ref_from_page_ext(page_ext), tag); - page_ext = page_ext_next(page_ext); - } -out: - page_ext_put(first_page_ext); -} - static inline struct alloc_tag *pgalloc_tag_get(struct page *page) { struct alloc_tag *tag = NULL; @@ -142,7 +112,6 @@ static inline void clear_page_tag_ref(struct page *page) {} static inline void pgalloc_tag_add(struct page *page, struct task_struct *task, unsigned int nr) {} static inline void pgalloc_tag_sub(struct page *page, unsigned int nr) {} -static inline void pgalloc_tag_split(struct page *page, unsigned int nr) {} static inline struct alloc_tag *pgalloc_tag_get(struct page *page) { return NULL; } static inline void pgalloc_tag_sub_pages(struct alloc_tag *tag, unsigned int nr) {} diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index 2a6a3cccfc36..e8b2ac6bd2ae 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -447,6 +447,12 @@ static inline void arch_check_zapped_pmd(struct vm_area_struct *vma, } #endif +#ifndef arch_check_zapped_pud +static inline void arch_check_zapped_pud(struct vm_area_struct *vma, pud_t pud) +{ +} +#endif + #ifndef __HAVE_ARCH_PTEP_GET_AND_CLEAR static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long address, @@ -1950,6 +1956,18 @@ typedef unsigned int pgtbl_mod_mask; #define MAX_PTRS_PER_P4D PTRS_PER_P4D #endif +#ifndef pte_pgprot +#define pte_pgprot(x) ((pgprot_t) {0}) +#endif + +#ifndef pmd_pgprot +#define pmd_pgprot(x) ((pgprot_t) {0}) +#endif + +#ifndef pud_pgprot +#define pud_pgprot(x) ((pgprot_t) {0}) +#endif + /* description of effects of mapping type and prot in current implementation. * this is due to the limited x86 page protection hardware. The expected * behavior is in parens: diff --git a/include/linux/rmap.h b/include/linux/rmap.h index 0978c64f49d8..d5e93e44322e 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -331,7 +331,7 @@ static __always_inline void __folio_dup_file_rmap(struct folio *folio, switch (level) { case RMAP_LEVEL_PTE: if (!folio_test_large(folio)) { - atomic_inc(&page->_mapcount); + atomic_inc(&folio->_mapcount); break; } @@ -425,7 +425,7 @@ static __always_inline int __folio_try_dup_anon_rmap(struct folio *folio, if (!folio_test_large(folio)) { if (PageAnonExclusive(page)) ClearPageAnonExclusive(page); - atomic_inc(&page->_mapcount); + atomic_inc(&folio->_mapcount); break; } @@ -745,7 +745,12 @@ int folio_mkclean(struct folio *); int pfn_mkclean_range(unsigned long pfn, unsigned long nr_pages, pgoff_t pgoff, struct vm_area_struct *vma); -void remove_migration_ptes(struct folio *src, struct folio *dst, bool locked); +enum rmp_flags { + RMP_LOCKED = 1 << 0, + RMP_USE_SHARED_ZEROPAGE = 1 << 1, +}; + +void remove_migration_ptes(struct folio *src, struct folio *dst, int flags); /* * rmap_walk_control: To control rmap traversing for specific needs diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h index 91546493c43d..07bb8d4181d7 100644 --- a/include/linux/sched/mm.h +++ b/include/linux/sched/mm.h @@ -179,27 +179,20 @@ static inline void mm_update_next_owner(struct mm_struct *mm) extern void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack); -extern unsigned long -arch_get_unmapped_area(struct file *, unsigned long, unsigned long, - unsigned long, unsigned long); -extern unsigned long + +unsigned long +arch_get_unmapped_area(struct file *filp, unsigned long addr, + unsigned long len, unsigned long pgoff, + unsigned long flags, vm_flags_t vm_flags); +unsigned long arch_get_unmapped_area_topdown(struct file *filp, unsigned long addr, - unsigned long len, unsigned long pgoff, - unsigned long flags); + unsigned long len, unsigned long pgoff, + unsigned long flags, vm_flags_t); unsigned long mm_get_unmapped_area(struct mm_struct *mm, struct file *filp, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags); -unsigned long -arch_get_unmapped_area_vmflags(struct file *filp, unsigned long addr, - unsigned long len, unsigned long pgoff, - unsigned long flags, vm_flags_t vm_flags); -unsigned long -arch_get_unmapped_area_topdown_vmflags(struct file *filp, unsigned long addr, - unsigned long len, unsigned long pgoff, - unsigned long flags, vm_flags_t); - unsigned long mm_get_unmapped_area_vmflags(struct mm_struct *mm, struct file *filp, unsigned long addr, @@ -211,11 +204,11 @@ unsigned long mm_get_unmapped_area_vmflags(struct mm_struct *mm, unsigned long generic_get_unmapped_area(struct file *filp, unsigned long addr, unsigned long len, unsigned long pgoff, - unsigned long flags); + unsigned long flags, vm_flags_t vm_flags); unsigned long generic_get_unmapped_area_topdown(struct file *filp, unsigned long addr, unsigned long len, unsigned long pgoff, - unsigned long flags); + unsigned long flags, vm_flags_t vm_flags); #else static inline void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack) {} diff --git a/include/linux/sched/task_stack.h b/include/linux/sched/task_stack.h index ccd72b978e1f..bf10bdb487dd 100644 --- a/include/linux/sched/task_stack.h +++ b/include/linux/sched/task_stack.h @@ -95,23 +95,11 @@ static inline int object_is_on_stack(const void *obj) extern void thread_stack_cache_init(void); #ifdef CONFIG_DEBUG_STACK_USAGE +unsigned long stack_not_used(struct task_struct *p); +#else static inline unsigned long stack_not_used(struct task_struct *p) { - unsigned long *n = end_of_stack(p); - - do { /* Skip over canary */ -# ifdef CONFIG_STACK_GROWSUP - n--; -# else - n++; -# endif - } while (!*n); - -# ifdef CONFIG_STACK_GROWSUP - return (unsigned long)end_of_stack(p) - (unsigned long)n; -# else - return (unsigned long)n - (unsigned long)end_of_stack(p); -# endif + return 0; } #endif extern void set_task_stack_end_magic(struct task_struct *tsk); diff --git a/include/linux/set_memory.h b/include/linux/set_memory.h index 95ac8398ee72..e7aec20fb44f 100644 --- a/include/linux/set_memory.h +++ b/include/linux/set_memory.h @@ -8,10 +8,10 @@ #ifdef CONFIG_ARCH_HAS_SET_MEMORY #include <asm/set_memory.h> #else -static inline int set_memory_ro(unsigned long addr, int numpages) { return 0; } -static inline int set_memory_rw(unsigned long addr, int numpages) { return 0; } -static inline int set_memory_x(unsigned long addr, int numpages) { return 0; } -static inline int set_memory_nx(unsigned long addr, int numpages) { return 0; } +static inline int __must_check set_memory_ro(unsigned long addr, int numpages) { return 0; } +static inline int __must_check set_memory_rw(unsigned long addr, int numpages) { return 0; } +static inline int __must_check set_memory_x(unsigned long addr, int numpages) { return 0; } +static inline int __must_check set_memory_nx(unsigned long addr, int numpages) { return 0; } #endif #ifndef set_memory_rox diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h index 1d06b1e5408a..515a9a6a3c6f 100644 --- a/include/linux/shmem_fs.h +++ b/include/linux/shmem_fs.h @@ -111,20 +111,13 @@ extern void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end); int shmem_unuse(unsigned int type); #ifdef CONFIG_TRANSPARENT_HUGEPAGE -extern bool shmem_is_huge(struct inode *inode, pgoff_t index, bool shmem_huge_force, - struct mm_struct *mm, unsigned long vm_flags); unsigned long shmem_allowable_huge_orders(struct inode *inode, struct vm_area_struct *vma, pgoff_t index, - bool global_huge); + loff_t write_end, bool shmem_huge_force); #else -static __always_inline bool shmem_is_huge(struct inode *inode, pgoff_t index, bool shmem_huge_force, - struct mm_struct *mm, unsigned long vm_flags) -{ - return false; -} static inline unsigned long shmem_allowable_huge_orders(struct inode *inode, struct vm_area_struct *vma, pgoff_t index, - bool global_huge) + loff_t write_end, bool shmem_huge_force) { return 0; } @@ -150,8 +143,8 @@ enum sgp_type { SGP_FALLOC, /* like SGP_WRITE, but make existing page Uptodate */ }; -int shmem_get_folio(struct inode *inode, pgoff_t index, struct folio **foliop, - enum sgp_type sgp); +int shmem_get_folio(struct inode *inode, pgoff_t index, loff_t write_end, + struct folio **foliop, enum sgp_type sgp); struct folio *shmem_read_folio_gfp(struct address_space *mapping, pgoff_t index, gfp_t gfp); diff --git a/include/linux/slab.h b/include/linux/slab.h index da3a546571e7..b35e2db7eb0e 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -930,6 +930,16 @@ static inline __alloc_size(1, 2) void *kmalloc_array_noprof(size_t n, size_t siz * @new_n: new number of elements to alloc * @new_size: new size of a single member of the array * @flags: the type of memory to allocate (see kmalloc) + * + * If __GFP_ZERO logic is requested, callers must ensure that, starting with the + * initial memory allocation, every subsequent call to this API for the same + * memory allocation is flagged with __GFP_ZERO. Otherwise, it is possible that + * __GFP_ZERO is not fully honored by this API. + * + * See krealloc_noprof() for further details. + * + * In any case, the contents of the object pointed to are preserved up to the + * lesser of the new and old sizes. */ static inline __realloc_size(2, 3) void * __must_check krealloc_array_noprof(void *p, size_t new_n, @@ -1038,8 +1048,8 @@ kvmalloc_array_node_noprof(size_t n, size_t size, gfp_t flags, int node) #define kvcalloc_node(...) alloc_hooks(kvcalloc_node_noprof(__VA_ARGS__)) #define kvcalloc(...) alloc_hooks(kvcalloc_noprof(__VA_ARGS__)) -extern void *kvrealloc_noprof(const void *p, size_t oldsize, size_t newsize, gfp_t flags) - __realloc_size(3); +void *kvrealloc_noprof(const void *p, size_t size, gfp_t flags) + __realloc_size(2); #define kvrealloc(...) alloc_hooks(kvrealloc_noprof(__VA_ARGS__)) extern void kvfree(const void *addr); diff --git a/include/linux/swap.h b/include/linux/swap.h index ba7ea95d1c57..ca533b478c21 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -243,22 +243,24 @@ enum { * free clusters are organized into a list. We fetch an entry from the list to * get a free cluster. * - * The data field stores next cluster if the cluster is free or cluster usage - * counter otherwise. The flags field determines if a cluster is free. This is - * protected by swap_info_struct.lock. + * The flags field determines if a cluster is free. This is + * protected by cluster lock. */ struct swap_cluster_info { spinlock_t lock; /* * Protect swap_cluster_info fields - * and swap_info_struct->swap_map - * elements correspond to the swap - * cluster + * other than list, and swap_info_struct->swap_map + * elements corresponding to the swap cluster. */ - unsigned int data:24; - unsigned int flags:8; + u16 count; + u8 flags; + u8 order; + struct list_head list; }; #define CLUSTER_FLAG_FREE 1 /* This cluster is free */ -#define CLUSTER_FLAG_NEXT_NULL 2 /* This cluster has no next cluster */ +#define CLUSTER_FLAG_NONFULL 2 /* This cluster is on nonfull list */ +#define CLUSTER_FLAG_FRAG 4 /* This cluster is on nonfull list */ +#define CLUSTER_FLAG_FULL 8 /* This cluster is on full list */ /* * The first page in the swap file is the swap header, which is always marked @@ -283,11 +285,6 @@ struct percpu_cluster { unsigned int next[SWAP_NR_ORDERS]; /* Likely next allocation offset */ }; -struct swap_cluster_list { - struct swap_cluster_info head; - struct swap_cluster_info tail; -}; - /* * The in-memory structure used to track swap areas. */ @@ -299,8 +296,15 @@ struct swap_info_struct { signed char type; /* strange name for an index */ unsigned int max; /* extent of the swap_map */ unsigned char *swap_map; /* vmalloc'ed array of usage counts */ + unsigned long *zeromap; /* kvmalloc'ed bitmap to track zero pages */ struct swap_cluster_info *cluster_info; /* cluster info. Only for SSD */ - struct swap_cluster_list free_clusters; /* free clusters list */ + struct list_head free_clusters; /* free clusters list */ + struct list_head full_clusters; /* full clusters list */ + struct list_head nonfull_clusters[SWAP_NR_ORDERS]; + /* list of cluster that contains at least one free slot */ + struct list_head frag_clusters[SWAP_NR_ORDERS]; + /* list of cluster that are fragmented or contented */ + unsigned int frag_cluster_nr[SWAP_NR_ORDERS]; unsigned int lowest_bit; /* index of first free in swap_map */ unsigned int highest_bit; /* index of last free in swap_map */ unsigned int pages; /* total of usable pages of swap */ @@ -331,7 +335,7 @@ struct swap_info_struct { * list. */ struct work_struct discard_work; /* discard worker */ - struct swap_cluster_list discard_clusters; /* discard clusters list */ + struct list_head discard_clusters; /* discard clusters list */ struct plist_node avail_lists[]; /* * entries in swap_avail_heads, one * entry per node. @@ -478,9 +482,9 @@ void put_swap_folio(struct folio *folio, swp_entry_t entry); extern swp_entry_t get_swap_page_of_type(int); extern int get_swap_pages(int n, swp_entry_t swp_entries[], int order); extern int add_swap_count_continuation(swp_entry_t, gfp_t); -extern void swap_shmem_alloc(swp_entry_t); +extern void swap_shmem_alloc(swp_entry_t, int); extern int swap_duplicate(swp_entry_t); -extern int swapcache_prepare(swp_entry_t); +extern int swapcache_prepare(swp_entry_t entry, int nr); extern void swap_free_nr(swp_entry_t entry, int nr_pages); extern void swapcache_free_entries(swp_entry_t *entries, int n); extern void free_swap_and_cache_nr(swp_entry_t entry, int nr); @@ -545,7 +549,7 @@ static inline int add_swap_count_continuation(swp_entry_t swp, gfp_t gfp_mask) return 0; } -static inline void swap_shmem_alloc(swp_entry_t swp) +static inline void swap_shmem_alloc(swp_entry_t swp, int nr) { } @@ -554,7 +558,7 @@ static inline int swap_duplicate(swp_entry_t swp) return 0; } -static inline int swapcache_prepare(swp_entry_t swp) +static inline int swapcache_prepare(swp_entry_t swp, int nr) { return 0; } diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h index a12bcf042551..9fc6ce15c499 100644 --- a/include/linux/userfaultfd_k.h +++ b/include/linux/userfaultfd_k.h @@ -267,6 +267,25 @@ extern void userfaultfd_unmap_complete(struct mm_struct *mm, extern bool userfaultfd_wp_unpopulated(struct vm_area_struct *vma); extern bool userfaultfd_wp_async(struct vm_area_struct *vma); +void userfaultfd_reset_ctx(struct vm_area_struct *vma); + +struct vm_area_struct *userfaultfd_clear_vma(struct vma_iterator *vmi, + struct vm_area_struct *prev, + struct vm_area_struct *vma, + unsigned long start, + unsigned long end); + +int userfaultfd_register_range(struct userfaultfd_ctx *ctx, + struct vm_area_struct *vma, + unsigned long vm_flags, + unsigned long start, unsigned long end, + bool wp_async); + +void userfaultfd_release_new(struct userfaultfd_ctx *ctx); + +void userfaultfd_release_all(struct mm_struct *mm, + struct userfaultfd_ctx *ctx); + #else /* CONFIG_USERFAULTFD */ /* mm helpers */ diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h index 747943bc8cc2..aed952d04132 100644 --- a/include/linux/vm_event_item.h +++ b/include/linux/vm_event_item.h @@ -50,6 +50,7 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, PGSTEAL_ANON, PGSTEAL_FILE, #ifdef CONFIG_NUMA + PGSCAN_ZONE_RECLAIM_SUCCESS, PGSCAN_ZONE_RECLAIM_FAILED, #endif PGINODESTEAL, SLABS_SCANNED, KSWAPD_INODESTEAL, @@ -104,6 +105,7 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, THP_SPLIT_PAGE, THP_SPLIT_PAGE_FAILED, THP_DEFERRED_SPLIT_PAGE, + THP_UNDERUSED_SPLIT_PAGE, THP_SPLIT_PMD, THP_SCAN_EXCEED_NONE_PTE, THP_SCAN_EXCEED_SWAP_PTE, @@ -154,6 +156,30 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, VMA_LOCK_RETRY, VMA_LOCK_MISS, #endif +#ifdef CONFIG_DEBUG_STACK_USAGE + KSTACK_1K, +#if THREAD_SIZE > 1024 + KSTACK_2K, +#endif +#if THREAD_SIZE > 2048 + KSTACK_4K, +#endif +#if THREAD_SIZE > 4096 + KSTACK_8K, +#endif +#if THREAD_SIZE > 8192 + KSTACK_16K, +#endif +#if THREAD_SIZE > 16384 + KSTACK_32K, +#endif +#if THREAD_SIZE > 32768 + KSTACK_64K, +#endif +#if THREAD_SIZE > 65536 + KSTACK_REST, +#endif +#endif /* CONFIG_DEBUG_STACK_USAGE */ NR_VM_EVENT_ITEMS }; diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index e4a631ec430b..ad2ce7a6ab7a 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -189,6 +189,10 @@ extern void *__vcalloc_noprof(size_t n, size_t size, gfp_t flags) __alloc_size(1 extern void *vcalloc_noprof(size_t n, size_t size) __alloc_size(1, 2); #define vcalloc(...) alloc_hooks(vcalloc_noprof(__VA_ARGS__)) +void * __must_check vrealloc_noprof(const void *p, size_t size, gfp_t flags) + __realloc_size(2); +#define vrealloc(...) alloc_hooks(vrealloc_noprof(__VA_ARGS__)) + extern void vfree(const void *addr); extern void vfree_atomic(const void *addr); diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index 9eb77c9007e6..d2761bf8ff32 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h @@ -32,6 +32,7 @@ struct reclaim_stat { unsigned nr_ref_keep; unsigned nr_unmap_fail; unsigned nr_lazyfree_fail; + unsigned nr_demoted; }; /* Stat data for system wide items */ diff --git a/include/linux/writeback.h b/include/linux/writeback.h index 8f651bb0a1a5..d6db822e4bb3 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -79,6 +79,9 @@ struct writeback_control { */ struct swap_iocb **swap_plug; + /* Target list for splitting a large folio */ + struct list_head *list; + /* internal fields used by the ->writepages implementation: */ struct folio_batch fbatch; pgoff_t index; diff --git a/include/linux/zstd.h b/include/linux/zstd.h index 113408eef6ec..b2c7cf310c8f 100644 --- a/include/linux/zstd.h +++ b/include/linux/zstd.h @@ -77,6 +77,30 @@ int zstd_min_clevel(void); */ int zstd_max_clevel(void); +/** + * zstd_default_clevel() - default compression level + * + * Return: Default compression level. + */ +int zstd_default_clevel(void); + +/** + * struct zstd_custom_mem - custom memory allocation + */ +typedef ZSTD_customMem zstd_custom_mem; + +/** + * struct zstd_dict_load_method - Dictionary load method. + * See zstd_lib.h. + */ +typedef ZSTD_dictLoadMethod_e zstd_dict_load_method; + +/** + * struct zstd_dict_content_type - Dictionary context type. + * See zstd_lib.h. + */ +typedef ZSTD_dictContentType_e zstd_dict_content_type; + /* ====== Parameter Selection ====== */ /** @@ -136,6 +160,19 @@ typedef ZSTD_parameters zstd_parameters; zstd_parameters zstd_get_params(int level, unsigned long long estimated_src_size); + +/** + * zstd_get_cparams() - returns zstd_compression_parameters for selected level + * @level: The compression level + * @estimated_src_size: The estimated source size to compress or 0 + * if unknown. + * @dict_size: Dictionary size. + * + * Return: The selected zstd_compression_parameters. + */ +zstd_compression_parameters zstd_get_cparams(int level, + unsigned long long estimated_src_size, size_t dict_size); + /* ====== Single-pass Compression ====== */ typedef ZSTD_CCtx zstd_cctx; @@ -180,6 +217,71 @@ zstd_cctx *zstd_init_cctx(void *workspace, size_t workspace_size); size_t zstd_compress_cctx(zstd_cctx *cctx, void *dst, size_t dst_capacity, const void *src, size_t src_size, const zstd_parameters *parameters); +/** + * zstd_create_cctx_advanced() - Create compression context + * @custom_mem: Custom allocator. + * + * Return: NULL on error, pointer to compression context otherwise. + */ +zstd_cctx *zstd_create_cctx_advanced(zstd_custom_mem custom_mem); + +/** + * zstd_free_cctx() - Free compression context + * @cdict: Pointer to compression context. + * + * Return: Always 0. + */ +size_t zstd_free_cctx(zstd_cctx* cctx); + +/** + * struct zstd_cdict - Compression dictionary. + * See zstd_lib.h. + */ +typedef ZSTD_CDict zstd_cdict; + +/** + * zstd_create_cdict_byreference() - Create compression dictionary + * @dict: Pointer to dictionary buffer. + * @dict_size: Size of the dictionary buffer. + * @dict_load_method: Dictionary load method. + * @dict_content_type: Dictionary content type. + * @custom_mem: Memory allocator. + * + * Note, this uses @dict by reference (ZSTD_dlm_byRef), so it should be + * free before zstd_cdict is destroyed. + * + * Return: NULL on error, pointer to compression dictionary + * otherwise. + */ +zstd_cdict *zstd_create_cdict_byreference(const void *dict, size_t dict_size, + zstd_compression_parameters cparams, + zstd_custom_mem custom_mem); + +/** + * zstd_free_cdict() - Free compression dictionary + * @cdict: Pointer to compression dictionary. + * + * Return: Always 0. + */ +size_t zstd_free_cdict(zstd_cdict* cdict); + +/** + * zstd_compress_using_cdict() - compress src into dst using a dictionary + * @cctx: The context. Must have been initialized with zstd_init_cctx(). + * @dst: The buffer to compress src into. + * @dst_capacity: The size of the destination buffer. May be any size, but + * ZSTD_compressBound(srcSize) is guaranteed to be large enough. + * @src: The data to compress. + * @src_size: The size of the data to compress. + * @cdict: The dictionary to be used. + * + * Return: The compressed size or an error, which can be checked using + * zstd_is_error(). + */ +size_t zstd_compress_using_cdict(zstd_cctx *cctx, void *dst, + size_t dst_capacity, const void *src, size_t src_size, + const zstd_cdict *cdict); + /* ====== Single-pass Decompression ====== */ typedef ZSTD_DCtx zstd_dctx; @@ -220,6 +322,71 @@ zstd_dctx *zstd_init_dctx(void *workspace, size_t workspace_size); size_t zstd_decompress_dctx(zstd_dctx *dctx, void *dst, size_t dst_capacity, const void *src, size_t src_size); +/** + * struct zstd_ddict - Decompression dictionary. + * See zstd_lib.h. + */ +typedef ZSTD_DDict zstd_ddict; + +/** + * zstd_create_ddict_byreference() - Create decompression dictionary + * @dict: Pointer to dictionary buffer. + * @dict_size: Size of the dictionary buffer. + * @dict_load_method: Dictionary load method. + * @dict_content_type: Dictionary content type. + * @custom_mem: Memory allocator. + * + * Note, this uses @dict by reference (ZSTD_dlm_byRef), so it should be + * free before zstd_ddict is destroyed. + * + * Return: NULL on error, pointer to decompression dictionary + * otherwise. + */ +zstd_ddict *zstd_create_ddict_byreference(const void *dict, size_t dict_size, + zstd_custom_mem custom_mem); +/** + * zstd_free_ddict() - Free decompression dictionary + * @dict: Pointer to the dictionary. + * + * Return: Always 0. + */ +size_t zstd_free_ddict(zstd_ddict *ddict); + +/** + * zstd_create_dctx_advanced() - Create decompression context + * @custom_mem: Custom allocator. + * + * Return: NULL on error, pointer to decompression context otherwise. + */ +zstd_dctx *zstd_create_dctx_advanced(zstd_custom_mem custom_mem); + +/** + * zstd_free_dctx() -- Free decompression context + * @dctx: Pointer to decompression context. + * Return: Always 0. + */ +size_t zstd_free_dctx(zstd_dctx *dctx); + +/** + * zstd_decompress_using_ddict() - decompress src into dst using a dictionary + * @dctx: The decompression context. + * @dst: The buffer to decompress src into. + * @dst_capacity: The size of the destination buffer. Must be at least as large + * as the decompressed size. If the caller cannot upper bound the + * decompressed size, then it's better to use the streaming API. + * @src: The zstd compressed data to decompress. Multiple concatenated + * frames and skippable frames are allowed. + * @src_size: The exact size of the data to decompress. + * @ddict: The dictionary to be used. + * + * Return: The decompressed size or an error, which can be checked using + * zstd_is_error(). + */ +size_t zstd_decompress_using_ddict(zstd_dctx *dctx, + void *dst, size_t dst_capacity, const void *src, size_t src_size, + const zstd_ddict *ddict); + + /* ====== Streaming Buffers ====== */ /** diff --git a/include/linux/zswap.h b/include/linux/zswap.h index 6cecb4a4f68b..9cd1beef0654 100644 --- a/include/linux/zswap.h +++ b/include/linux/zswap.h @@ -13,17 +13,15 @@ extern atomic_t zswap_stored_pages; struct zswap_lruvec_state { /* - * Number of pages in zswap that should be protected from the shrinker. - * This number is an estimate of the following counts: + * Number of swapped in pages from disk, i.e not found in the zswap pool. * - * a) Recent page faults. - * b) Recent insertion to the zswap LRU. This includes new zswap stores, - * as well as recent zswap LRU rotations. - * - * These pages are likely to be warm, and might incur IO if the are written - * to swap. + * This is consumed and subtracted from the lru size in + * zswap_shrinker_count() to penalize past overshrinking that led to disk + * swapins. The idea is that had we considered this many more pages in the + * LRU active/protected and not written them back, we would not have had to + * swapped them in. */ - atomic_long_t nr_zswap_protected; + atomic_long_t nr_disk_swapins; }; unsigned long zswap_total_pages(void); |