diff options
Diffstat (limited to 'include')
30 files changed, 787 insertions, 405 deletions
diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h index cab7cfebf40b..b46617207c93 100644 --- a/include/asm-generic/tlb.h +++ b/include/asm-generic/tlb.h @@ -246,7 +246,7 @@ struct mmu_gather_batch { struct mmu_gather_batch *next; unsigned int nr; unsigned int max; - struct page *pages[]; + struct encoded_page *encoded_pages[]; }; #define MAX_GATHER_BATCH \ @@ -260,8 +260,31 @@ struct mmu_gather_batch { */ #define MAX_GATHER_BATCH_COUNT (10000UL/MAX_GATHER_BATCH) -extern bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, +extern bool __tlb_remove_page_size(struct mmu_gather *tlb, + struct encoded_page *page, int page_size); + +#ifdef CONFIG_SMP +/* + * This both sets 'delayed_rmap', and returns true. It would be an inline + * function, except we define it before the 'struct mmu_gather'. + */ +#define tlb_delay_rmap(tlb) (((tlb)->delayed_rmap = 1), true) +extern void tlb_flush_rmaps(struct mmu_gather *tlb, struct vm_area_struct *vma); +#endif + +#endif + +/* + * We have a no-op version of the rmap removal that doesn't + * delay anything. That is used on S390, which flushes remote + * TLBs synchronously, and on UP, which doesn't have any + * remote TLBs to flush and is not preemptible due to this + * all happening under the page table lock. + */ +#ifndef tlb_delay_rmap +#define tlb_delay_rmap(tlb) (false) +static inline void tlb_flush_rmaps(struct mmu_gather *tlb, struct vm_area_struct *vma) { } #endif /* @@ -295,6 +318,11 @@ struct mmu_gather { unsigned int freed_tables : 1; /* + * Do we have pending delayed rmap removals? + */ + unsigned int delayed_rmap : 1; + + /* * at which levels have we cleared entries? */ unsigned int cleared_ptes : 1; @@ -435,13 +463,13 @@ static inline void tlb_flush_mmu_tlbonly(struct mmu_gather *tlb) static inline void tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, int page_size) { - if (__tlb_remove_page_size(tlb, page, page_size)) + if (__tlb_remove_page_size(tlb, encode_page(page, 0), page_size)) tlb_flush_mmu(tlb); } -static inline bool __tlb_remove_page(struct mmu_gather *tlb, struct page *page) +static __always_inline bool __tlb_remove_page(struct mmu_gather *tlb, struct page *page, unsigned int flags) { - return __tlb_remove_page_size(tlb, page, PAGE_SIZE); + return __tlb_remove_page_size(tlb, encode_page(page, flags), PAGE_SIZE); } /* tlb_remove_page diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index 439815cc1ab9..fbad4fcd408e 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -102,8 +102,18 @@ static inline unsigned long wb_stat_error(void) #endif } +/* BDI ratio is expressed as part per 1000000 for finer granularity. */ +#define BDI_RATIO_SCALE 10000 + +u64 bdi_get_min_bytes(struct backing_dev_info *bdi); +u64 bdi_get_max_bytes(struct backing_dev_info *bdi); int bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio); int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned int max_ratio); +int bdi_set_min_ratio_no_scale(struct backing_dev_info *bdi, unsigned int min_ratio); +int bdi_set_max_ratio_no_scale(struct backing_dev_info *bdi, unsigned int max_ratio); +int bdi_set_min_bytes(struct backing_dev_info *bdi, u64 min_bytes); +int bdi_set_max_bytes(struct backing_dev_info *bdi, u64 max_bytes); +int bdi_set_strict_limit(struct backing_dev_info *bdi, unsigned int strict_limit); /* * Flags in backing_dev_info::capability diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h index f55a37efdb97..7af9e34ec261 100644 --- a/include/linux/compiler-gcc.h +++ b/include/linux/compiler-gcc.h @@ -82,26 +82,21 @@ #define __noscs __attribute__((__no_sanitize__("shadow-call-stack"))) #endif -#if __has_attribute(__no_sanitize_address__) -#define __no_sanitize_address __attribute__((no_sanitize_address)) -#else -#define __no_sanitize_address -#endif +#define __no_sanitize_address __attribute__((__no_sanitize_address__)) -#if defined(__SANITIZE_THREAD__) && __has_attribute(__no_sanitize_thread__) -#define __no_sanitize_thread __attribute__((no_sanitize_thread)) +#if defined(__SANITIZE_THREAD__) +#define __no_sanitize_thread __attribute__((__no_sanitize_thread__)) #else #define __no_sanitize_thread #endif -#if __has_attribute(__no_sanitize_undefined__) -#define __no_sanitize_undefined __attribute__((no_sanitize_undefined)) -#else -#define __no_sanitize_undefined -#endif +#define __no_sanitize_undefined __attribute__((__no_sanitize_undefined__)) +/* + * Only supported since gcc >= 12 + */ #if defined(CONFIG_KCOV) && __has_attribute(__no_sanitize_coverage__) -#define __no_sanitize_coverage __attribute__((no_sanitize_coverage)) +#define __no_sanitize_coverage __attribute__((__no_sanitize_coverage__)) #else #define __no_sanitize_coverage #endif diff --git a/include/linux/damon.h b/include/linux/damon.h index 84525b9cdf6e..ad15a5b88e3a 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -357,6 +357,7 @@ struct damon_operations { * @after_wmarks_check: Called after each schemes' watermarks check. * @after_sampling: Called after each sampling. * @after_aggregation: Called after each aggregation. + * @before_damos_apply: Called before applying DAMOS action. * @before_terminate: Called before terminating the monitoring. * @private: User private data. * @@ -385,6 +386,10 @@ struct damon_callback { int (*after_wmarks_check)(struct damon_ctx *context); int (*after_sampling)(struct damon_ctx *context); int (*after_aggregation)(struct damon_ctx *context); + int (*before_damos_apply)(struct damon_ctx *context, + struct damon_target *target, + struct damon_region *region, + struct damos *scheme); void (*before_terminate)(struct damon_ctx *context); }; diff --git a/include/linux/dax.h b/include/linux/dax.h index ba985333e26b..2b5ecb591059 100644 --- a/include/linux/dax.h +++ b/include/linux/dax.h @@ -205,6 +205,8 @@ static inline void dax_unlock_mapping_entry(struct address_space *mapping, } #endif +int dax_file_unshare(struct inode *inode, loff_t pos, loff_t len, + const struct iomap_ops *ops); int dax_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero, const struct iomap_ops *ops); int dax_truncate_page(struct inode *inode, loff_t pos, bool *did_zero, diff --git a/include/linux/highmem.h b/include/linux/highmem.h index e9912da5441b..44242268f53b 100644 --- a/include/linux/highmem.h +++ b/include/linux/highmem.h @@ -319,6 +319,32 @@ static inline void copy_user_highpage(struct page *to, struct page *from, #endif +#ifdef copy_mc_to_kernel +static inline int copy_mc_user_highpage(struct page *to, struct page *from, + unsigned long vaddr, struct vm_area_struct *vma) +{ + unsigned long ret; + char *vfrom, *vto; + + vfrom = kmap_local_page(from); + vto = kmap_local_page(to); + ret = copy_mc_to_kernel(vto, vfrom, PAGE_SIZE); + if (!ret) + kmsan_unpoison_memory(page_address(to), PAGE_SIZE); + kunmap_local(vto); + kunmap_local(vfrom); + + return ret; +} +#else +static inline int copy_mc_user_highpage(struct page *to, struct page *from, + unsigned long vaddr, struct vm_area_struct *vma) +{ + copy_user_highpage(to, from, vaddr, vma); + return 0; +} +#endif + #ifndef __HAVE_ARCH_COPY_HIGHPAGE static inline void copy_highpage(struct page *to, struct page *from) diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 8b4f93e84868..551834cd5299 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -33,22 +33,9 @@ typedef struct { unsigned long pd; } hugepd_t; /* * For HugeTLB page, there are more metadata to save in the struct page. But * the head struct page cannot meet our needs, so we have to abuse other tail - * struct page to store the metadata. In order to avoid conflicts caused by - * subsequent use of more tail struct pages, we gather these discrete indexes - * of tail struct page here. + * struct page to store the metadata. */ -enum { - SUBPAGE_INDEX_SUBPOOL = 1, /* reuse page->private */ -#ifdef CONFIG_CGROUP_HUGETLB - SUBPAGE_INDEX_CGROUP, /* reuse page->private */ - SUBPAGE_INDEX_CGROUP_RSVD, /* reuse page->private */ - __MAX_CGROUP_SUBPAGE_INDEX = SUBPAGE_INDEX_CGROUP_RSVD, -#endif -#ifdef CONFIG_MEMORY_FAILURE - SUBPAGE_INDEX_HWPOISON, -#endif - __NR_USED_SUBPAGE, -}; +#define __NR_USED_SUBPAGE 3 struct hugepage_subpool { spinlock_t lock; @@ -149,6 +136,8 @@ int move_hugetlb_page_tables(struct vm_area_struct *vma, unsigned long len); int copy_hugetlb_page_range(struct mm_struct *, struct mm_struct *, struct vm_area_struct *, struct vm_area_struct *); +struct page *hugetlb_follow_page_mask(struct vm_area_struct *vma, + unsigned long address, unsigned int flags); long follow_hugetlb_page(struct mm_struct *, struct vm_area_struct *, struct page **, struct vm_area_struct **, unsigned long *, unsigned long *, long, unsigned int, @@ -181,10 +170,11 @@ bool hugetlb_reserve_pages(struct inode *inode, long from, long to, long hugetlb_unreserve_pages(struct inode *inode, long start, long end, long freed); int isolate_hugetlb(struct page *page, struct list_head *list); -int get_hwpoison_huge_page(struct page *page, bool *hugetlb); -int get_huge_page_for_hwpoison(unsigned long pfn, int flags); +int get_hwpoison_huge_page(struct page *page, bool *hugetlb, bool unpoison); +int get_huge_page_for_hwpoison(unsigned long pfn, int flags, + bool *migratable_cleared); void putback_active_hugepage(struct page *page); -void move_hugetlb_state(struct page *oldpage, struct page *newpage, int reason); +void move_hugetlb_state(struct folio *old_folio, struct folio *new_folio, int reason); void free_huge_page(struct page *page); void hugetlb_fix_reserve_counts(struct inode *inode); extern struct mutex *hugetlb_fault_mutex_table; @@ -209,17 +199,6 @@ int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, pte_t *ptep); void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma, unsigned long *start, unsigned long *end); -struct page *follow_huge_addr(struct mm_struct *mm, unsigned long address, - int write); -struct page *follow_huge_pd(struct vm_area_struct *vma, - unsigned long address, hugepd_t hpd, - int flags, int pdshift); -struct page *follow_huge_pmd_pte(struct vm_area_struct *vma, unsigned long address, - int flags); -struct page *follow_huge_pud(struct mm_struct *mm, unsigned long address, - pud_t *pud, int flags); -struct page *follow_huge_pgd(struct mm_struct *mm, unsigned long address, - pgd_t *pgd, int flags); void hugetlb_vma_lock_read(struct vm_area_struct *vma); void hugetlb_vma_unlock_read(struct vm_area_struct *vma); @@ -272,6 +251,12 @@ static inline void adjust_range_if_pmd_sharing_possible( { } +static inline struct page *hugetlb_follow_page_mask(struct vm_area_struct *vma, + unsigned long address, unsigned int flags) +{ + BUILD_BUG(); /* should never be compiled in if !CONFIG_HUGETLB_PAGE*/ +} + static inline long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, struct page **pages, struct vm_area_struct **vmas, unsigned long *position, @@ -282,12 +267,6 @@ static inline long follow_hugetlb_page(struct mm_struct *mm, return 0; } -static inline struct page *follow_huge_addr(struct mm_struct *mm, - unsigned long address, int write) -{ - return ERR_PTR(-EINVAL); -} - static inline int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, struct vm_area_struct *dst_vma, @@ -320,31 +299,6 @@ static inline void hugetlb_show_meminfo_node(int nid) { } -static inline struct page *follow_huge_pd(struct vm_area_struct *vma, - unsigned long address, hugepd_t hpd, int flags, - int pdshift) -{ - return NULL; -} - -static inline struct page *follow_huge_pmd_pte(struct vm_area_struct *vma, - unsigned long address, int flags) -{ - return NULL; -} - -static inline struct page *follow_huge_pud(struct mm_struct *mm, - unsigned long address, pud_t *pud, int flags) -{ - return NULL; -} - -static inline struct page *follow_huge_pgd(struct mm_struct *mm, - unsigned long address, pgd_t *pgd, int flags) -{ - return NULL; -} - static inline int prepare_hugepage_range(struct file *file, unsigned long addr, unsigned long len) { @@ -425,12 +379,13 @@ static inline int isolate_hugetlb(struct page *page, struct list_head *list) return -EBUSY; } -static inline int get_hwpoison_huge_page(struct page *page, bool *hugetlb) +static inline int get_hwpoison_huge_page(struct page *page, bool *hugetlb, bool unpoison) { return 0; } -static inline int get_huge_page_for_hwpoison(unsigned long pfn, int flags) +static inline int get_huge_page_for_hwpoison(unsigned long pfn, int flags, + bool *migratable_cleared) { return 0; } @@ -439,8 +394,8 @@ static inline void putback_active_hugepage(struct page *page) { } -static inline void move_hugetlb_state(struct page *oldpage, - struct page *newpage, int reason) +static inline void move_hugetlb_state(struct folio *old_folio, + struct folio *new_folio, int reason) { } @@ -623,26 +578,50 @@ enum hugetlb_page_flags { */ #ifdef CONFIG_HUGETLB_PAGE #define TESTHPAGEFLAG(uname, flname) \ +static __always_inline \ +bool folio_test_hugetlb_##flname(struct folio *folio) \ + { void *private = &folio->private; \ + return test_bit(HPG_##flname, private); \ + } \ static inline int HPage##uname(struct page *page) \ { return test_bit(HPG_##flname, &(page->private)); } #define SETHPAGEFLAG(uname, flname) \ +static __always_inline \ +void folio_set_hugetlb_##flname(struct folio *folio) \ + { void *private = &folio->private; \ + set_bit(HPG_##flname, private); \ + } \ static inline void SetHPage##uname(struct page *page) \ { set_bit(HPG_##flname, &(page->private)); } #define CLEARHPAGEFLAG(uname, flname) \ +static __always_inline \ +void folio_clear_hugetlb_##flname(struct folio *folio) \ + { void *private = &folio->private; \ + clear_bit(HPG_##flname, private); \ + } \ static inline void ClearHPage##uname(struct page *page) \ { clear_bit(HPG_##flname, &(page->private)); } #else #define TESTHPAGEFLAG(uname, flname) \ +static inline bool \ +folio_test_hugetlb_##flname(struct folio *folio) \ + { return 0; } \ static inline int HPage##uname(struct page *page) \ { return 0; } #define SETHPAGEFLAG(uname, flname) \ +static inline void \ +folio_set_hugetlb_##flname(struct folio *folio) \ + { } \ static inline void SetHPage##uname(struct page *page) \ { } #define CLEARHPAGEFLAG(uname, flname) \ +static inline void \ +folio_clear_hugetlb_##flname(struct folio *folio) \ + { } \ static inline void ClearHPage##uname(struct page *page) \ { } #endif @@ -728,18 +707,29 @@ extern unsigned int default_hstate_idx; #define default_hstate (hstates[default_hstate_idx]) +static inline struct hugepage_subpool *hugetlb_folio_subpool(struct folio *folio) +{ + return folio->_hugetlb_subpool; +} + /* - * hugetlb page subpool pointer located in hpage[1].private + * hugetlb page subpool pointer located in hpage[2].hugetlb_subpool */ static inline struct hugepage_subpool *hugetlb_page_subpool(struct page *hpage) { - return (void *)page_private(hpage + SUBPAGE_INDEX_SUBPOOL); + return hugetlb_folio_subpool(page_folio(hpage)); +} + +static inline void hugetlb_set_folio_subpool(struct folio *folio, + struct hugepage_subpool *subpool) +{ + folio->_hugetlb_subpool = subpool; } static inline void hugetlb_set_page_subpool(struct page *hpage, struct hugepage_subpool *subpool) { - set_page_private(hpage + SUBPAGE_INDEX_SUBPOOL, (unsigned long)subpool); + hugetlb_set_folio_subpool(page_folio(hpage), subpool); } static inline struct hstate *hstate_file(struct file *f) @@ -823,10 +813,15 @@ static inline pte_t arch_make_huge_pte(pte_t entry, unsigned int shift, } #endif +static inline struct hstate *folio_hstate(struct folio *folio) +{ + VM_BUG_ON_FOLIO(!folio_test_hugetlb(folio), folio); + return size_to_hstate(folio_size(folio)); +} + static inline struct hstate *page_hstate(struct page *page) { - VM_BUG_ON_PAGE(!PageHuge(page), page); - return size_to_hstate(page_size(page)); + return folio_hstate(page_folio(page)); } static inline unsigned hstate_index_to_shift(unsigned index) @@ -983,6 +978,11 @@ void hugetlb_unregister_node(struct node *node); #else /* CONFIG_HUGETLB_PAGE */ struct hstate {}; +static inline struct hugepage_subpool *hugetlb_folio_subpool(struct folio *folio) +{ + return NULL; +} + static inline struct hugepage_subpool *hugetlb_page_subpool(struct page *hpage) { return NULL; @@ -1035,6 +1035,11 @@ static inline struct hstate *hstate_vma(struct vm_area_struct *vma) return NULL; } +static inline struct hstate *folio_hstate(struct folio *folio) +{ + return NULL; +} + static inline struct hstate *page_hstate(struct page *page) { return NULL; diff --git a/include/linux/hugetlb_cgroup.h b/include/linux/hugetlb_cgroup.h index 630cd255d0cf..f706626a8063 100644 --- a/include/linux/hugetlb_cgroup.h +++ b/include/linux/hugetlb_cgroup.h @@ -24,12 +24,10 @@ struct file_region; #ifdef CONFIG_CGROUP_HUGETLB /* * Minimum page order trackable by hugetlb cgroup. - * At least 4 pages are necessary for all the tracking information. - * The second tail page (hpage[SUBPAGE_INDEX_CGROUP]) is the fault - * usage cgroup. The third tail page (hpage[SUBPAGE_INDEX_CGROUP_RSVD]) - * is the reservation usage cgroup. + * At least 3 pages are necessary for all the tracking information. + * The second tail page contains all of the hugetlb-specific fields. */ -#define HUGETLB_CGROUP_MIN_ORDER order_base_2(__MAX_CGROUP_SUBPAGE_INDEX + 1) +#define HUGETLB_CGROUP_MIN_ORDER order_base_2(__NR_USED_SUBPAGE) enum hugetlb_memory_event { HUGETLB_MAX, @@ -67,54 +65,50 @@ struct hugetlb_cgroup { }; static inline struct hugetlb_cgroup * -__hugetlb_cgroup_from_page(struct page *page, bool rsvd) +__hugetlb_cgroup_from_folio(struct folio *folio, bool rsvd) { - VM_BUG_ON_PAGE(!PageHuge(page), page); - - if (compound_order(page) < HUGETLB_CGROUP_MIN_ORDER) + VM_BUG_ON_FOLIO(!folio_test_hugetlb(folio), folio); + if (folio_order(folio) < HUGETLB_CGROUP_MIN_ORDER) return NULL; if (rsvd) - return (void *)page_private(page + SUBPAGE_INDEX_CGROUP_RSVD); + return folio->_hugetlb_cgroup_rsvd; else - return (void *)page_private(page + SUBPAGE_INDEX_CGROUP); + return folio->_hugetlb_cgroup; } -static inline struct hugetlb_cgroup *hugetlb_cgroup_from_page(struct page *page) +static inline struct hugetlb_cgroup *hugetlb_cgroup_from_folio(struct folio *folio) { - return __hugetlb_cgroup_from_page(page, false); + return __hugetlb_cgroup_from_folio(folio, false); } static inline struct hugetlb_cgroup * -hugetlb_cgroup_from_page_rsvd(struct page *page) +hugetlb_cgroup_from_folio_rsvd(struct folio *folio) { - return __hugetlb_cgroup_from_page(page, true); + return __hugetlb_cgroup_from_folio(folio, true); } -static inline void __set_hugetlb_cgroup(struct page *page, +static inline void __set_hugetlb_cgroup(struct folio *folio, struct hugetlb_cgroup *h_cg, bool rsvd) { - VM_BUG_ON_PAGE(!PageHuge(page), page); - - if (compound_order(page) < HUGETLB_CGROUP_MIN_ORDER) + VM_BUG_ON_FOLIO(!folio_test_hugetlb(folio), folio); + if (folio_order(folio) < HUGETLB_CGROUP_MIN_ORDER) return; if (rsvd) - set_page_private(page + SUBPAGE_INDEX_CGROUP_RSVD, - (unsigned long)h_cg); + folio->_hugetlb_cgroup_rsvd = h_cg; else - set_page_private(page + SUBPAGE_INDEX_CGROUP, - (unsigned long)h_cg); + folio->_hugetlb_cgroup = h_cg; } -static inline void set_hugetlb_cgroup(struct page *page, +static inline void set_hugetlb_cgroup(struct folio *folio, struct hugetlb_cgroup *h_cg) { - __set_hugetlb_cgroup(page, h_cg, false); + __set_hugetlb_cgroup(folio, h_cg, false); } -static inline void set_hugetlb_cgroup_rsvd(struct page *page, +static inline void set_hugetlb_cgroup_rsvd(struct folio *folio, struct hugetlb_cgroup *h_cg) { - __set_hugetlb_cgroup(page, h_cg, true); + __set_hugetlb_cgroup(folio, h_cg, true); } static inline bool hugetlb_cgroup_disabled(void) @@ -151,10 +145,10 @@ extern void hugetlb_cgroup_commit_charge(int idx, unsigned long nr_pages, extern void hugetlb_cgroup_commit_charge_rsvd(int idx, unsigned long nr_pages, struct hugetlb_cgroup *h_cg, struct page *page); -extern void hugetlb_cgroup_uncharge_page(int idx, unsigned long nr_pages, - struct page *page); -extern void hugetlb_cgroup_uncharge_page_rsvd(int idx, unsigned long nr_pages, - struct page *page); +extern void hugetlb_cgroup_uncharge_folio(int idx, unsigned long nr_pages, + struct folio *folio); +extern void hugetlb_cgroup_uncharge_folio_rsvd(int idx, unsigned long nr_pages, + struct folio *folio); extern void hugetlb_cgroup_uncharge_cgroup(int idx, unsigned long nr_pages, struct hugetlb_cgroup *h_cg); @@ -170,8 +164,8 @@ extern void hugetlb_cgroup_uncharge_file_region(struct resv_map *resv, bool region_del); extern void hugetlb_cgroup_file_init(void) __init; -extern void hugetlb_cgroup_migrate(struct page *oldhpage, - struct page *newhpage); +extern void hugetlb_cgroup_migrate(struct folio *old_folio, + struct folio *new_folio); #else static inline void hugetlb_cgroup_uncharge_file_region(struct resv_map *resv, @@ -181,29 +175,23 @@ static inline void hugetlb_cgroup_uncharge_file_region(struct resv_map *resv, { } -static inline struct hugetlb_cgroup *hugetlb_cgroup_from_page(struct page *page) -{ - return NULL; -} - -static inline struct hugetlb_cgroup * -hugetlb_cgroup_from_page_resv(struct page *page) +static inline struct hugetlb_cgroup *hugetlb_cgroup_from_folio(struct folio *folio) { return NULL; } static inline struct hugetlb_cgroup * -hugetlb_cgroup_from_page_rsvd(struct page *page) +hugetlb_cgroup_from_folio_rsvd(struct folio *folio) { return NULL; } -static inline void set_hugetlb_cgroup(struct page *page, +static inline void set_hugetlb_cgroup(struct folio *folio, struct hugetlb_cgroup *h_cg) { } -static inline void set_hugetlb_cgroup_rsvd(struct page *page, +static inline void set_hugetlb_cgroup_rsvd(struct folio *folio, struct hugetlb_cgroup *h_cg) { } @@ -253,14 +241,14 @@ hugetlb_cgroup_commit_charge_rsvd(int idx, unsigned long nr_pages, { } -static inline void hugetlb_cgroup_uncharge_page(int idx, unsigned long nr_pages, - struct page *page) +static inline void hugetlb_cgroup_uncharge_folio(int idx, unsigned long nr_pages, + struct folio *folio) { } -static inline void hugetlb_cgroup_uncharge_page_rsvd(int idx, +static inline void hugetlb_cgroup_uncharge_folio_rsvd(int idx, unsigned long nr_pages, - struct page *page) + struct folio *folio) { } static inline void hugetlb_cgroup_uncharge_cgroup(int idx, @@ -285,8 +273,8 @@ static inline void hugetlb_cgroup_file_init(void) { } -static inline void hugetlb_cgroup_migrate(struct page *oldhpage, - struct page *newhpage) +static inline void hugetlb_cgroup_migrate(struct folio *old_folio, + struct folio *new_folio) { } diff --git a/include/linux/khugepaged.h b/include/linux/khugepaged.h index 70162d707caf..f68865e19b0b 100644 --- a/include/linux/khugepaged.h +++ b/include/linux/khugepaged.h @@ -15,6 +15,7 @@ extern void __khugepaged_exit(struct mm_struct *mm); extern void khugepaged_enter_vma(struct vm_area_struct *vma, unsigned long vm_flags); extern void khugepaged_min_free_kbytes_update(void); +extern bool current_is_khugepaged(void); #ifdef CONFIG_SHMEM extern int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr, bool install_pmd); @@ -57,6 +58,11 @@ static inline int collapse_pte_mapped_thp(struct mm_struct *mm, static inline void khugepaged_min_free_kbytes_update(void) { } + +static inline bool current_is_khugepaged(void) +{ + return false; +} #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #endif /* _LINUX_KHUGEPAGED_H */ diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index e1644a24009c..d3c8203cab6c 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -615,28 +615,32 @@ static inline void mem_cgroup_protection(struct mem_cgroup *root, void mem_cgroup_calculate_protection(struct mem_cgroup *root, struct mem_cgroup *memcg); -static inline bool mem_cgroup_supports_protection(struct mem_cgroup *memcg) +static inline bool mem_cgroup_unprotected(struct mem_cgroup *target, + struct mem_cgroup *memcg) { /* * The root memcg doesn't account charges, and doesn't support - * protection. + * protection. The target memcg's protection is ignored, see + * mem_cgroup_calculate_protection() and mem_cgroup_protection() */ - return !mem_cgroup_disabled() && !mem_cgroup_is_root(memcg); - + return mem_cgroup_disabled() || mem_cgroup_is_root(memcg) || + memcg == target; } -static inline bool mem_cgroup_below_low(struct mem_cgroup *memcg) +static inline bool mem_cgroup_below_low(struct mem_cgroup *target, + struct mem_cgroup *memcg) { - if (!mem_cgroup_supports_protection(memcg)) + if (mem_cgroup_unprotected(target, memcg)) return false; return READ_ONCE(memcg->memory.elow) >= page_counter_read(&memcg->memory); } -static inline bool mem_cgroup_below_min(struct mem_cgroup *memcg) +static inline bool mem_cgroup_below_min(struct mem_cgroup *target, + struct mem_cgroup *memcg) { - if (!mem_cgroup_supports_protection(memcg)) + if (mem_cgroup_unprotected(target, memcg)) return false; return READ_ONCE(memcg->memory.emin) >= @@ -1209,12 +1213,19 @@ static inline void mem_cgroup_calculate_protection(struct mem_cgroup *root, { } -static inline bool mem_cgroup_below_low(struct mem_cgroup *memcg) +static inline bool mem_cgroup_unprotected(struct mem_cgroup *target, + struct mem_cgroup *memcg) +{ + return true; +} +static inline bool mem_cgroup_below_low(struct mem_cgroup *target, + struct mem_cgroup *memcg) { return false; } -static inline bool mem_cgroup_below_min(struct mem_cgroup *memcg) +static inline bool mem_cgroup_below_min(struct mem_cgroup *target, + struct mem_cgroup *memcg) { return false; } diff --git a/include/linux/memory-tiers.h b/include/linux/memory-tiers.h index 965009aa01d7..fc9647b1b4f9 100644 --- a/include/linux/memory-tiers.h +++ b/include/linux/memory-tiers.h @@ -18,7 +18,6 @@ * the same memory tier. */ #define MEMTIER_ADISTANCE_DRAM ((4 * MEMTIER_CHUNK_SIZE) + (MEMTIER_CHUNK_SIZE >> 1)) -#define MEMTIER_HOTPLUG_PRIO 100 struct memory_tier; struct memory_dev_type { diff --git a/include/linux/memory.h b/include/linux/memory.h index aa619464a1df..31343566c221 100644 --- a/include/linux/memory.h +++ b/include/linux/memory.h @@ -19,7 +19,6 @@ #include <linux/node.h> #include <linux/compiler.h> #include <linux/mutex.h> -#include <linux/notifier.h> #define MIN_MEMORY_BLOCK_SIZE (1UL << SECTION_SIZE_BITS) @@ -85,6 +84,9 @@ struct memory_block { unsigned long nr_vmemmap_pages; struct memory_group *group; /* group (if any) for this block */ struct list_head group_next; /* next block inside memory group */ +#if defined(CONFIG_MEMORY_FAILURE) && defined(CONFIG_MEMORY_HOTPLUG) + atomic_long_t nr_hwpoison; +#endif }; int arch_get_memory_phys_device(unsigned long start_pfn); @@ -113,8 +115,13 @@ struct mem_section; * Priorities for the hotplug memory callback routines (stored in decreasing * order in the callback chain) */ -#define SLAB_CALLBACK_PRI 1 -#define IPC_CALLBACK_PRI 10 +#define DEFAULT_CALLBACK_PRI 0 +#define SLAB_CALLBACK_PRI 1 +#define HMAT_CALLBACK_PRI 2 +#define MM_COMPUTE_BATCH_PRI 10 +#define CPUSET_CALLBACK_PRI 10 +#define MEMTIER_HOTPLUG_PRI 100 +#define KSM_CALLBACK_PRI 100 #ifndef CONFIG_MEMORY_HOTPLUG static inline void memory_dev_init(void) @@ -136,9 +143,6 @@ static inline int hotplug_memory_notifier(notifier_fn_t fn, int pri) { return 0; } -/* These aren't inline functions due to a GCC bug. */ -#define register_hotmemory_notifier(nb) ({ (void)(nb); 0; }) -#define unregister_hotmemory_notifier(nb) ({ (void)(nb); }) #else /* CONFIG_MEMORY_HOTPLUG */ extern int register_memory_notifier(struct notifier_block *nb); extern void unregister_memory_notifier(struct notifier_block *nb); @@ -166,8 +170,6 @@ int walk_dynamic_memory_groups(int nid, walk_memory_groups_func_t func, { .notifier_call = fn, .priority = pri };\ register_memory_notifier(&fn##_mem_nb); \ }) -#define register_hotmemory_notifier(nb) register_memory_notifier(nb) -#define unregister_hotmemory_notifier(nb) unregister_memory_notifier(nb) #ifdef CONFIG_NUMA void memory_block_add_nid(struct memory_block *mem, int nid, diff --git a/include/linux/mm.h b/include/linux/mm.h index 6a05a3bc0a28..8178fe894e2e 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -74,6 +74,7 @@ static inline void totalram_pages_add(long count) extern void * high_memory; extern int page_cluster; +extern const int page_cluster_max; #ifdef CONFIG_SYSCTL extern int sysctl_legacy_va_layout; @@ -549,7 +550,7 @@ struct vm_operations_struct { /* * Called by mprotect() to make driver-specific permission * checks before mprotect() is finalised. The VMA must not - * be modified. Returns 0 if eprotect() can proceed. + * be modified. Returns 0 if mprotect() can proceed. */ int (*mprotect)(struct vm_area_struct *vma, unsigned long start, unsigned long end, unsigned long newflags); @@ -699,8 +700,10 @@ static inline unsigned long vma_iter_addr(struct vma_iterator *vmi) * paths in userfault. */ bool vma_is_shmem(struct vm_area_struct *vma); +bool vma_is_anon_shmem(struct vm_area_struct *vma); #else static inline bool vma_is_shmem(struct vm_area_struct *vma) { return false; } +static inline bool vma_is_anon_shmem(struct vm_area_struct *vma) { return false; } #endif int vma_is_stack_for_current(struct vm_area_struct *vma); @@ -817,8 +820,8 @@ static inline int is_vmalloc_or_module_addr(const void *x) /* * How many times the entire folio is mapped as a single unit (eg by a * PMD or PUD entry). This is probably not what you want, except for - * debugging purposes; look at folio_mapcount() or page_mapcount() - * instead. + * debugging purposes - it does not include PTE-mapped sub-pages; look + * at folio_mapcount() or page_mapcount() or total_mapcount() instead. */ static inline int folio_entire_mapcount(struct folio *folio) { @@ -828,12 +831,29 @@ static inline int folio_entire_mapcount(struct folio *folio) /* * Mapcount of compound page as a whole, does not include mapped sub-pages. - * - * Must be called only for compound pages. + * Must be called only on head of compound page. */ -static inline int compound_mapcount(struct page *page) +static inline int head_compound_mapcount(struct page *head) { - return folio_entire_mapcount(page_folio(page)); + return atomic_read(compound_mapcount_ptr(head)) + 1; +} + +/* + * If a 16GB hugetlb page were mapped by PTEs of all of its 4kB sub-pages, + * its subpages_mapcount would be 0x400000: choose the COMPOUND_MAPPED bit + * above that range, instead of 2*(PMD_SIZE/PAGE_SIZE). Hugetlb currently + * leaves subpages_mapcount at 0, but avoid surprise if it participates later. + */ +#define COMPOUND_MAPPED 0x800000 +#define SUBPAGES_MAPPED (COMPOUND_MAPPED - 1) + +/* + * Number of sub-pages mapped by PTE, does not include compound mapcount. + * Must be called only on head of compound page. + */ +static inline int head_subpages_mapcount(struct page *head) +{ + return atomic_read(subpages_mapcount_ptr(head)) & SUBPAGES_MAPPED; } /* @@ -846,11 +866,9 @@ static inline void page_mapcount_reset(struct page *page) atomic_set(&(page)->_mapcount, -1); } -int __page_mapcount(struct page *page); - /* * Mapcount of 0-order page; when compound sub-page, includes - * compound_mapcount(). + * compound_mapcount of compound_head of page. * * Result is undefined for pages which cannot be mapped into userspace. * For example SLAB or special types of pages. See function page_has_type(). @@ -858,25 +876,75 @@ int __page_mapcount(struct page *page); */ static inline int page_mapcount(struct page *page) { - if (unlikely(PageCompound(page))) - return __page_mapcount(page); - return atomic_read(&page->_mapcount) + 1; + int mapcount = atomic_read(&page->_mapcount) + 1; + + if (likely(!PageCompound(page))) + return mapcount; + page = compound_head(page); + return head_compound_mapcount(page) + mapcount; } -int folio_mapcount(struct folio *folio); +int total_compound_mapcount(struct page *head); -#ifdef CONFIG_TRANSPARENT_HUGEPAGE -static inline int total_mapcount(struct page *page) +/** + * folio_mapcount() - Calculate the number of mappings of this folio. + * @folio: The folio. + * + * A large folio tracks both how many times the entire folio is mapped, + * and how many times each individual page in the folio is mapped. + * This function calculates the total number of times the folio is + * mapped. + * + * Return: The number of times this folio is mapped. + */ +static inline int folio_mapcount(struct folio *folio) { - return folio_mapcount(page_folio(page)); + if (likely(!folio_test_large(folio))) + return atomic_read(&folio->_mapcount) + 1; + return total_compound_mapcount(&folio->page); } -#else static inline int total_mapcount(struct page *page) { - return page_mapcount(page); + if (likely(!PageCompound(page))) + return atomic_read(&page->_mapcount) + 1; + return total_compound_mapcount(compound_head(page)); +} + +static inline bool folio_large_is_mapped(struct folio *folio) +{ + /* + * Reading folio_mapcount_ptr() below could be omitted if hugetlb + * participated in incrementing subpages_mapcount when compound mapped. + */ + return atomic_read(folio_subpages_mapcount_ptr(folio)) > 0 || + atomic_read(folio_mapcount_ptr(folio)) >= 0; +} + +/** + * folio_mapped - Is this folio mapped into userspace? + * @folio: The folio. + * + * Return: True if any page in this folio is referenced by user page tables. + */ +static inline bool folio_mapped(struct folio *folio) +{ + if (likely(!folio_test_large(folio))) + return atomic_read(&folio->_mapcount) >= 0; + return folio_large_is_mapped(folio); +} + +/* + * Return true if this page is mapped into pagetables. + * For compound page it returns true if any sub-page of compound page is mapped, + * even if this particular sub-page is not itself mapped by any PTE or PMD. + */ +static inline bool page_mapped(struct page *page) +{ + if (likely(!PageCompound(page))) + return atomic_read(&page->_mapcount) >= 0; + return folio_large_is_mapped(page_folio(page)); } -#endif static inline struct page *virt_to_head_page(const void *x) { @@ -929,6 +997,13 @@ static inline void set_compound_page_dtor(struct page *page, page[1].compound_dtor = compound_dtor; } +static inline void folio_set_compound_dtor(struct folio *folio, + enum compound_dtor_id compound_dtor) +{ + VM_BUG_ON_FOLIO(compound_dtor >= NR_COMPOUND_DTORS, folio); + folio->_folio_dtor = compound_dtor; +} + void destroy_large_folio(struct folio *folio); static inline int head_compound_pincount(struct page *head) @@ -944,6 +1019,22 @@ static inline void set_compound_order(struct page *page, unsigned int order) #endif } +/* + * folio_set_compound_order is generally passed a non-zero order to + * initialize a large folio. However, hugetlb code abuses this by + * passing in zero when 'dissolving' a large folio. + */ +static inline void folio_set_compound_order(struct folio *folio, + unsigned int order) +{ + VM_BUG_ON_FOLIO(!folio_test_large(folio), folio); + + folio->_folio_order = order; +#ifdef CONFIG_64BIT + folio->_folio_nr_pages = order ? 1U << order : 0; +#endif +} + /* Returns the number of pages in this potentially compound page. */ static inline unsigned long compound_nr(struct page *page) { @@ -1179,7 +1270,24 @@ static inline void folio_put_refs(struct folio *folio, int refs) __folio_put(folio); } -void release_pages(struct page **pages, int nr); +/** + * release_pages - release an array of pages or folios + * + * This just releases a simple array of multiple pages, and + * accepts various different forms of said page array: either + * a regular old boring array of pages, an array of folios, or + * an array of encoded page pointers. + * + * The transparent union syntax for this kind of "any of these + * argument types" is all kinds of ugly, so look away. + */ +typedef union { + struct page **pages; + struct folio **folios; + struct encoded_page **encoded_pages; +} release_pages_arg __attribute__ ((__transparent_union__)); + +void release_pages(release_pages_arg, int nr); /** * folios_put - Decrement the reference count on an array of folios. @@ -1195,7 +1303,7 @@ void release_pages(struct page **pages, int nr); */ static inline void folios_put(struct folio **folios, unsigned int nr) { - release_pages((struct page **)folios, nr); + release_pages(folios, nr); } static inline void put_page(struct page *page) @@ -1799,9 +1907,6 @@ static inline pgoff_t page_index(struct page *page) return page->index; } -bool page_mapped(struct page *page); -bool folio_mapped(struct folio *folio); - /* * Return true only if the page has been allocated with * ALLOC_NO_WATERMARKS and the low watermark was not @@ -2025,6 +2130,22 @@ extern unsigned long move_page_tables(struct vm_area_struct *vma, #define MM_CP_UFFD_WP_ALL (MM_CP_UFFD_WP | \ MM_CP_UFFD_WP_RESOLVE) +int vma_wants_writenotify(struct vm_area_struct *vma, pgprot_t vm_page_prot); +static inline bool vma_wants_manual_pte_write_upgrade(struct vm_area_struct *vma) +{ + /* + * We want to check manually if we can change individual PTEs writable + * if we can't do that automatically for all PTEs in a mapping. For + * private mappings, that's always the case when we have write + * permissions as we properly have to handle COW. + */ + if (vma->vm_flags & VM_SHARED) + return vma_wants_writenotify(vma, vma->vm_page_prot); + return !!(vma->vm_flags & VM_WRITE); + +} +bool can_change_pte_writable(struct vm_area_struct *vma, unsigned long addr, + pte_t pte); extern unsigned long change_protection(struct mmu_gather *tlb, struct vm_area_struct *vma, unsigned long start, unsigned long end, pgprot_t newprot, @@ -2051,40 +2172,30 @@ static inline bool get_user_page_fast_only(unsigned long addr, */ static inline unsigned long get_mm_counter(struct mm_struct *mm, int member) { - long val = atomic_long_read(&mm->rss_stat.count[member]); - -#ifdef SPLIT_RSS_COUNTING - /* - * counter is updated in asynchronous manner and may go to minus. - * But it's never be expected number for users. - */ - if (val < 0) - val = 0; -#endif - return (unsigned long)val; + return percpu_counter_read_positive(&mm->rss_stat[member]); } -void mm_trace_rss_stat(struct mm_struct *mm, int member, long count); +void mm_trace_rss_stat(struct mm_struct *mm, int member); static inline void add_mm_counter(struct mm_struct *mm, int member, long value) { - long count = atomic_long_add_return(value, &mm->rss_stat.count[member]); + percpu_counter_add(&mm->rss_stat[member], value); - mm_trace_rss_stat(mm, member, count); + mm_trace_rss_stat(mm, member); } static inline void inc_mm_counter(struct mm_struct *mm, int member) { - long count = atomic_long_inc_return(&mm->rss_stat.count[member]); + percpu_counter_inc(&mm->rss_stat[member]); - mm_trace_rss_stat(mm, member, count); + mm_trace_rss_stat(mm, member); } static inline void dec_mm_counter(struct mm_struct *mm, int member) { - long count = atomic_long_dec_return(&mm->rss_stat.count[member]); + percpu_counter_dec(&mm->rss_stat[member]); - mm_trace_rss_stat(mm, member, count); + mm_trace_rss_stat(mm, member); } /* Optimized variant when page is already known not to be PageAnon */ @@ -2174,8 +2285,6 @@ static inline int pte_devmap(pte_t pte) } #endif -int vma_wants_writenotify(struct vm_area_struct *vma, pgprot_t vm_page_prot); - extern pte_t *__get_locked_pte(struct mm_struct *mm, unsigned long addr, spinlock_t **ptl); static inline pte_t *get_locked_pte(struct mm_struct *mm, unsigned long addr, @@ -2424,7 +2533,7 @@ static inline void pgtable_pte_page_dtor(struct page *page) #if USE_SPLIT_PMD_PTLOCKS -static struct page *pmd_to_page(pmd_t *pmd) +static inline struct page *pmd_pgtable_page(pmd_t *pmd) { unsigned long mask = ~(PTRS_PER_PMD * sizeof(pmd_t) - 1); return virt_to_page((void *)((unsigned long) pmd & mask)); @@ -2432,7 +2541,7 @@ static struct page *pmd_to_page(pmd_t *pmd) static inline spinlock_t *pmd_lockptr(struct mm_struct *mm, pmd_t *pmd) { - return ptlock_ptr(pmd_to_page(pmd)); + return ptlock_ptr(pmd_pgtable_page(pmd)); } static inline bool pmd_ptlock_init(struct page *page) @@ -2451,7 +2560,7 @@ static inline void pmd_ptlock_free(struct page *page) ptlock_free(page); } -#define pmd_huge_pte(mm, pmd) (pmd_to_page(pmd)->pmd_huge_pte) +#define pmd_huge_pte(mm, pmd) (pmd_pgtable_page(pmd)->pmd_huge_pte) #else @@ -2971,7 +3080,6 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address, * and return without waiting upon it */ #define FOLL_NOFAULT 0x80 /* do not fault in pages */ #define FOLL_HWPOISON 0x100 /* check page is hwpoisoned */ -#define FOLL_MIGRATION 0x400 /* wait for page to replace migration entry */ #define FOLL_TRIED 0x800 /* a retry, previous pass started an IO */ #define FOLL_REMOTE 0x2000 /* we are working on non-current tsk/mm */ #define FOLL_ANON 0x8000 /* don't do file mappings */ @@ -3064,8 +3172,12 @@ static inline int vm_fault_to_errno(vm_fault_t vm_fault, int foll_flags) * Must be called with the (sub)page that's actually referenced via the * page table entry, which might not necessarily be the head page for a * PTE-mapped THP. + * + * If the vma is NULL, we're coming from the GUP-fast path and might have + * to fallback to the slow path just to lookup the vma. */ -static inline bool gup_must_unshare(unsigned int flags, struct page *page) +static inline bool gup_must_unshare(struct vm_area_struct *vma, + unsigned int flags, struct page *page) { /* * FOLL_WRITE is implicitly handled correctly as the page table entry @@ -3078,8 +3190,25 @@ static inline bool gup_must_unshare(unsigned int flags, struct page *page) * Note: PageAnon(page) is stable until the page is actually getting * freed. */ - if (!PageAnon(page)) - return false; + if (!PageAnon(page)) { + /* + * We only care about R/O long-term pining: R/O short-term + * pinning does not have the semantics to observe successive + * changes through the process page tables. + */ + if (!(flags & FOLL_LONGTERM)) + return false; + + /* We really need the vma ... */ + if (!vma) + return true; + + /* + * ... because we only care about writable private ("COW") + * mappings where we have to break COW early. + */ + return is_cow_mapping(vma->vm_flags); + } /* Paired with a memory barrier in page_try_share_anon_rmap(). */ if (IS_ENABLED(CONFIG_HAVE_FAST_GUP)) @@ -3255,6 +3384,8 @@ void *sparse_buffer_alloc(unsigned long size); struct page * __populate_section_memmap(unsigned long pfn, unsigned long nr_pages, int nid, struct vmem_altmap *altmap, struct dev_pagemap *pgmap); +void pmd_init(void *addr); +void pud_init(void *addr); pgd_t *vmemmap_pgd_populate(unsigned long addr, int node); p4d_t *vmemmap_p4d_populate(pgd_t *pgd, unsigned long addr, int node); pud_t *vmemmap_pud_populate(p4d_t *p4d, unsigned long addr, int node); @@ -3266,8 +3397,14 @@ struct vmem_altmap; void *vmemmap_alloc_block_buf(unsigned long size, int node, struct vmem_altmap *altmap); void vmemmap_verify(pte_t *, int, unsigned long, unsigned long); +void vmemmap_set_pmd(pmd_t *pmd, void *p, int node, + unsigned long addr, unsigned long next); +int vmemmap_check_pmd(pmd_t *pmd, int node, + unsigned long addr, unsigned long next); int vmemmap_populate_basepages(unsigned long start, unsigned long end, int node, struct vmem_altmap *altmap); +int vmemmap_populate_hugepages(unsigned long start, unsigned long end, + int node, struct vmem_altmap *altmap); int vmemmap_populate(unsigned long start, unsigned long end, int node, struct vmem_altmap *altmap); void vmemmap_populate_print_last(void); @@ -3290,7 +3427,6 @@ enum mf_flags { int mf_dax_kill_procs(struct address_space *mapping, pgoff_t index, unsigned long count, int mf_flags); extern int memory_failure(unsigned long pfn, int flags); -extern void memory_failure_queue(unsigned long pfn, int flags); extern void memory_failure_queue_kick(int cpu); extern int unpoison_memory(unsigned long pfn); extern int sysctl_memory_failure_early_kill; @@ -3299,12 +3435,42 @@ extern void shake_page(struct page *p); extern atomic_long_t num_poisoned_pages __read_mostly; extern int soft_offline_page(unsigned long pfn, int flags); #ifdef CONFIG_MEMORY_FAILURE -extern int __get_huge_page_for_hwpoison(unsigned long pfn, int flags); +extern void memory_failure_queue(unsigned long pfn, int flags); +extern int __get_huge_page_for_hwpoison(unsigned long pfn, int flags, + bool *migratable_cleared); +void num_poisoned_pages_inc(unsigned long pfn); +void num_poisoned_pages_sub(unsigned long pfn, long i); #else -static inline int __get_huge_page_for_hwpoison(unsigned long pfn, int flags) +static inline void memory_failure_queue(unsigned long pfn, int flags) +{ +} + +static inline int __get_huge_page_for_hwpoison(unsigned long pfn, int flags, + bool *migratable_cleared) { return 0; } + +static inline void num_poisoned_pages_inc(unsigned long pfn) +{ +} + +static inline void num_poisoned_pages_sub(unsigned long pfn, long i) +{ +} +#endif + +#if defined(CONFIG_MEMORY_FAILURE) && defined(CONFIG_MEMORY_HOTPLUG) +extern void memblk_nr_poison_inc(unsigned long pfn); +extern void memblk_nr_poison_sub(unsigned long pfn, long i); +#else +static inline void memblk_nr_poison_inc(unsigned long pfn) +{ +} + +static inline void memblk_nr_poison_sub(unsigned long pfn, long i) +{ +} #endif #ifndef arch_memory_failure diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 500e536796ca..3b8475007734 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -18,6 +18,7 @@ #include <linux/page-flags-layout.h> #include <linux/workqueue.h> #include <linux/seqlock.h> +#include <linux/percpu_counter.h> #include <asm/mmu.h> @@ -67,7 +68,7 @@ struct mem_cgroup; #ifdef CONFIG_HAVE_ALIGNED_STRUCT_PAGE #define _struct_page_alignment __aligned(2 * sizeof(unsigned long)) #else -#define _struct_page_alignment +#define _struct_page_alignment __aligned(sizeof(unsigned long)) #endif struct page { @@ -103,7 +104,10 @@ struct page { }; /* See page-flags.h for PAGE_MAPPING_FLAGS */ struct address_space *mapping; - pgoff_t index; /* Our offset within mapping. */ + union { + pgoff_t index; /* Our offset within mapping. */ + unsigned long share; /* share count for fsdax */ + }; /** * @private: Mapping-private opaque data. * Usually used for buffer_heads if PagePrivate. @@ -141,17 +145,26 @@ struct page { unsigned char compound_dtor; unsigned char compound_order; atomic_t compound_mapcount; + atomic_t subpages_mapcount; atomic_t compound_pincount; #ifdef CONFIG_64BIT unsigned int compound_nr; /* 1 << compound_order */ #endif }; - struct { /* Second tail page of compound page */ + struct { /* Second tail page of transparent huge page */ unsigned long _compound_pad_1; /* compound_head */ unsigned long _compound_pad_2; /* For both global and memcg */ struct list_head deferred_list; }; + struct { /* Second tail page of hugetlb page */ + unsigned long _hugetlb_pad_1; /* compound_head */ + void *hugetlb_subpool; + void *hugetlb_cgroup; + void *hugetlb_cgroup_rsvd; + void *hugetlb_hwpoison; + /* No more space on 32-bit: use third tail if more */ + }; struct { /* Page table pages */ unsigned long _pt_pad_1; /* compound_head */ pgtable_t pmd_huge_pte; /* protected by page->ptl */ @@ -241,6 +254,38 @@ struct page { #endif } _struct_page_alignment; +/* + * struct encoded_page - a nonexistent type marking this pointer + * + * An 'encoded_page' pointer is a pointer to a regular 'struct page', but + * with the low bits of the pointer indicating extra context-dependent + * information. Not super-common, but happens in mmu_gather and mlock + * handling, and this acts as a type system check on that use. + * + * We only really have two guaranteed bits in general, although you could + * play with 'struct page' alignment (see CONFIG_HAVE_ALIGNED_STRUCT_PAGE) + * for more. + * + * Use the supplied helper functions to endcode/decode the pointer and bits. + */ +struct encoded_page; +#define ENCODE_PAGE_BITS 3ul +static __always_inline struct encoded_page *encode_page(struct page *page, unsigned long flags) +{ + BUILD_BUG_ON(flags > ENCODE_PAGE_BITS); + return (struct encoded_page *)(flags | (unsigned long)page); +} + +static inline unsigned long encoded_page_flags(struct encoded_page *page) +{ + return ENCODE_PAGE_BITS & (unsigned long)page; +} + +static inline struct page *encoded_page_ptr(struct encoded_page *page) +{ + return (struct page *)(~ENCODE_PAGE_BITS & (unsigned long)page); +} + /** * struct folio - Represents a contiguous set of bytes. * @flags: Identical to the page flags. @@ -258,12 +303,19 @@ struct page { * to find how many references there are to this folio. * @memcg_data: Memory Control Group data. * @_flags_1: For large folios, additional page flags. - * @__head: Points to the folio. Do not use. + * @_head_1: Points to the folio. Do not use. * @_folio_dtor: Which destructor to use for this folio. * @_folio_order: Do not use directly, call folio_order(). - * @_total_mapcount: Do not use directly, call folio_entire_mapcount(). + * @_compound_mapcount: Do not use directly, call folio_entire_mapcount(). + * @_subpages_mapcount: Do not use directly, call folio_mapcount(). * @_pincount: Do not use directly, call folio_maybe_dma_pinned(). * @_folio_nr_pages: Do not use directly, call folio_nr_pages(). + * @_flags_2: For alignment. Do not use. + * @_head_2: Points to the folio. Do not use. + * @_hugetlb_subpool: Do not use directly, use accessor in hugetlb.h. + * @_hugetlb_cgroup: Do not use directly, use accessor in hugetlb_cgroup.h. + * @_hugetlb_cgroup_rsvd: Do not use directly, use accessor in hugetlb_cgroup.h. + * @_hugetlb_hwpoison: Do not use directly, call raw_hwp_list_head(). * * A folio is a physically, virtually and logically contiguous set * of bytes. It is a power-of-two in size, and it is aligned to that @@ -302,15 +354,32 @@ struct folio { }; struct page page; }; - unsigned long _flags_1; - unsigned long __head; - unsigned char _folio_dtor; - unsigned char _folio_order; - atomic_t _total_mapcount; - atomic_t _pincount; + union { + struct { + unsigned long _flags_1; + unsigned long _head_1; + unsigned char _folio_dtor; + unsigned char _folio_order; + atomic_t _compound_mapcount; + atomic_t _subpages_mapcount; + atomic_t _pincount; #ifdef CONFIG_64BIT - unsigned int _folio_nr_pages; + unsigned int _folio_nr_pages; #endif + }; + struct page __page_1; + }; + union { + struct { + unsigned long _flags_2; + unsigned long _head_2; + void *_hugetlb_subpool; + void *_hugetlb_cgroup; + void *_hugetlb_cgroup_rsvd; + void *_hugetlb_hwpoison; + }; + struct page __page_2; + }; }; #define FOLIO_MATCH(pg, fl) \ @@ -331,15 +400,26 @@ FOLIO_MATCH(memcg_data, memcg_data); static_assert(offsetof(struct folio, fl) == \ offsetof(struct page, pg) + sizeof(struct page)) FOLIO_MATCH(flags, _flags_1); -FOLIO_MATCH(compound_head, __head); +FOLIO_MATCH(compound_head, _head_1); FOLIO_MATCH(compound_dtor, _folio_dtor); FOLIO_MATCH(compound_order, _folio_order); -FOLIO_MATCH(compound_mapcount, _total_mapcount); +FOLIO_MATCH(compound_mapcount, _compound_mapcount); +FOLIO_MATCH(subpages_mapcount, _subpages_mapcount); FOLIO_MATCH(compound_pincount, _pincount); #ifdef CONFIG_64BIT FOLIO_MATCH(compound_nr, _folio_nr_pages); #endif #undef FOLIO_MATCH +#define FOLIO_MATCH(pg, fl) \ + static_assert(offsetof(struct folio, fl) == \ + offsetof(struct page, pg) + 2 * sizeof(struct page)) +FOLIO_MATCH(flags, _flags_2); +FOLIO_MATCH(compound_head, _head_2); +FOLIO_MATCH(hugetlb_subpool, _hugetlb_subpool); +FOLIO_MATCH(hugetlb_cgroup, _hugetlb_cgroup); +FOLIO_MATCH(hugetlb_cgroup_rsvd, _hugetlb_cgroup_rsvd); +FOLIO_MATCH(hugetlb_hwpoison, _hugetlb_hwpoison); +#undef FOLIO_MATCH static inline atomic_t *folio_mapcount_ptr(struct folio *folio) { @@ -347,11 +427,22 @@ static inline atomic_t *folio_mapcount_ptr(struct folio *folio) return &tail->compound_mapcount; } +static inline atomic_t *folio_subpages_mapcount_ptr(struct folio *folio) +{ + struct page *tail = &folio->page + 1; + return &tail->subpages_mapcount; +} + static inline atomic_t *compound_mapcount_ptr(struct page *page) { return &page[1].compound_mapcount; } +static inline atomic_t *subpages_mapcount_ptr(struct page *page) +{ + return &page[1].subpages_mapcount; +} + static inline atomic_t *compound_pincount_ptr(struct page *page) { return &page[1].compound_pincount; @@ -461,21 +552,11 @@ struct vm_area_struct { * For areas with an address space and backing store, * linkage into the address_space->i_mmap interval tree. * - * For private anonymous mappings, a pointer to a null terminated string - * containing the name given to the vma, or NULL if unnamed. */ - - union { - struct { - struct rb_node rb; - unsigned long rb_subtree_last; - } shared; - /* - * Serialized by mmap_sem. Never use directly because it is - * valid only when vm_file is NULL. Use anon_vma_name instead. - */ - struct anon_vma_name *anon_name; - }; + struct { + struct rb_node rb; + unsigned long rb_subtree_last; + } shared; /* * A file's MAP_PRIVATE vma can be in both i_mmap tree and anon_vma @@ -496,6 +577,14 @@ struct vm_area_struct { struct file * vm_file; /* File we map to (can be NULL). */ void * vm_private_data; /* was vm_pte (shared mem) */ +#ifdef CONFIG_ANON_VMA_NAME + /* + * For private and shared anonymous mappings, a pointer to a null + * terminated string containing the name given to the vma, or NULL if + * unnamed. Serialized by mmap_sem. Use anon_vma_name to access. + */ + struct anon_vma_name *anon_name; +#endif #ifdef CONFIG_SWAP atomic_long_t swap_readahead_info; #endif @@ -612,11 +701,7 @@ struct mm_struct { unsigned long saved_auxv[AT_VECTOR_SIZE]; /* for /proc/PID/auxv */ - /* - * Special counters, in some configurations protected by the - * page_table_lock, in other configurations by being atomic. - */ - struct mm_rss_stat rss_stat; + struct percpu_counter rss_stat[NR_MM_COUNTERS]; struct linux_binfmt *binfmt; @@ -847,7 +932,6 @@ typedef __bitwise unsigned int vm_fault_t; * @VM_FAULT_OOM: Out Of Memory * @VM_FAULT_SIGBUS: Bad access * @VM_FAULT_MAJOR: Page read from storage - * @VM_FAULT_WRITE: Special case for get_user_pages * @VM_FAULT_HWPOISON: Hit poisoned small page * @VM_FAULT_HWPOISON_LARGE: Hit poisoned large page. Index encoded * in upper bits @@ -868,7 +952,6 @@ enum vm_fault_reason { VM_FAULT_OOM = (__force vm_fault_t)0x000001, VM_FAULT_SIGBUS = (__force vm_fault_t)0x000002, VM_FAULT_MAJOR = (__force vm_fault_t)0x000004, - VM_FAULT_WRITE = (__force vm_fault_t)0x000008, VM_FAULT_HWPOISON = (__force vm_fault_t)0x000010, VM_FAULT_HWPOISON_LARGE = (__force vm_fault_t)0x000020, VM_FAULT_SIGSEGV = (__force vm_fault_t)0x000040, @@ -894,7 +977,6 @@ enum vm_fault_reason { { VM_FAULT_OOM, "OOM" }, \ { VM_FAULT_SIGBUS, "SIGBUS" }, \ { VM_FAULT_MAJOR, "MAJOR" }, \ - { VM_FAULT_WRITE, "WRITE" }, \ { VM_FAULT_HWPOISON, "HWPOISON" }, \ { VM_FAULT_HWPOISON_LARGE, "HWPOISON_LARGE" }, \ { VM_FAULT_SIGSEGV, "SIGSEGV" }, \ @@ -957,9 +1039,9 @@ typedef struct { * @FAULT_FLAG_REMOTE: The fault is not for current task/mm. * @FAULT_FLAG_INSTRUCTION: The fault was during an instruction fetch. * @FAULT_FLAG_INTERRUPTIBLE: The fault can be interrupted by non-fatal signals. - * @FAULT_FLAG_UNSHARE: The fault is an unsharing request to unshare (and mark - * exclusive) a possibly shared anonymous page that is - * mapped R/O. + * @FAULT_FLAG_UNSHARE: The fault is an unsharing request to break COW in a + * COW mapping, making sure that an exclusive anon page is + * mapped after the fault. * @FAULT_FLAG_ORIG_PTE_VALID: whether the fault has vmf->orig_pte cached. * We should only access orig_pte if this flag set. * @@ -984,7 +1066,7 @@ typedef struct { * * The combination FAULT_FLAG_WRITE|FAULT_FLAG_UNSHARE is illegal. * FAULT_FLAG_UNSHARE is ignored and treated like an ordinary read fault when - * no existing R/O-mapped anonymous page is encountered. + * applied to mappings that are not COW mappings. */ enum fault_flag { FAULT_FLAG_WRITE = 1 << 0, diff --git a/include/linux/mm_types_task.h b/include/linux/mm_types_task.h index 0bb4b6da9993..5414b5c6a103 100644 --- a/include/linux/mm_types_task.h +++ b/include/linux/mm_types_task.h @@ -36,19 +36,6 @@ enum { NR_MM_COUNTERS }; -#if USE_SPLIT_PTE_PTLOCKS && defined(CONFIG_MMU) -#define SPLIT_RSS_COUNTING -/* per-thread cached information, */ -struct task_rss_stat { - int events; /* for synchronization threshold */ - int count[NR_MM_COUNTERS]; -}; -#endif /* USE_SPLIT_PTE_PTLOCKS */ - -struct mm_rss_stat { - atomic_long_t count[NR_MM_COUNTERS]; -}; - struct page_frag { struct page *page; #if (BITS_PER_LONG > 32) || (PAGE_SIZE >= 65536) diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 9c49ec5d0e25..cd28a100d9e4 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -1224,7 +1224,7 @@ typedef struct pglist_data { /* start time in ms of current promote threshold adjustment period */ unsigned int nbp_th_start; /* - * number of promote candidate pages at stat time of current promote + * number of promote candidate pages at start time of current promote * threshold adjustment period */ unsigned long nbp_th_nr_cand; diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 0b0ae5084e60..9aec9fd8c50b 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -176,9 +176,6 @@ enum pageflags { /* SLOB */ PG_slob_free = PG_private, - /* Compound pages. Stored in first tail page's flags */ - PG_double_map = PG_workingset, - #ifdef CONFIG_MEMORY_FAILURE /* * Compound pages. Stored in first tail page's flags. @@ -641,7 +638,7 @@ PAGEFLAG_FALSE(VmemmapSelfHosted, vmemmap_self_hosted) * Different with flags above, this flag is used only for fsdax mode. It * indicates that this page->mapping is now under reflink case. */ -#define PAGE_MAPPING_DAX_COW 0x1 +#define PAGE_MAPPING_DAX_SHARED ((void *)0x1) static __always_inline bool folio_mapping_flags(struct folio *folio) { @@ -874,29 +871,11 @@ static inline int PageTransTail(struct page *page) { return PageTail(page); } - -/* - * PageDoubleMap indicates that the compound page is mapped with PTEs as well - * as PMDs. - * - * This is required for optimization of rmap operations for THP: we can postpone - * per small page mapcount accounting (and its overhead from atomic operations) - * until the first PMD split. - * - * For the page PageDoubleMap means ->_mapcount in all sub-pages is offset up - * by one. This reference will go away with last compound_mapcount. - * - * See also __split_huge_pmd_locked() and page_remove_anon_compound_rmap(). - */ -PAGEFLAG(DoubleMap, double_map, PF_SECOND) - TESTSCFLAG(DoubleMap, double_map, PF_SECOND) #else TESTPAGEFLAG_FALSE(TransHuge, transhuge) TESTPAGEFLAG_FALSE(TransCompound, transcompound) TESTPAGEFLAG_FALSE(TransCompoundMap, transcompoundmap) TESTPAGEFLAG_FALSE(TransTail, transtail) -PAGEFLAG_FALSE(DoubleMap, double_map) - TESTSCFLAG_FALSE(DoubleMap, double_map) #endif #if defined(CONFIG_MEMORY_FAILURE) && defined(CONFIG_TRANSPARENT_HUGEPAGE) diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index bbccb4044222..29e1f9e76eb6 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -504,9 +504,8 @@ pgoff_t page_cache_prev_miss(struct address_space *mapping, #define FGP_NOFS 0x00000010 #define FGP_NOWAIT 0x00000020 #define FGP_FOR_MMAP 0x00000040 -#define FGP_HEAD 0x00000080 -#define FGP_ENTRY 0x00000100 -#define FGP_STABLE 0x00000200 +#define FGP_ENTRY 0x00000080 +#define FGP_STABLE 0x00000100 struct folio *__filemap_get_folio(struct address_space *mapping, pgoff_t index, int fgp_flags, gfp_t gfp); @@ -1102,12 +1101,10 @@ int add_to_page_cache_lru(struct page *page, struct address_space *mapping, int filemap_add_folio(struct address_space *mapping, struct folio *folio, pgoff_t index, gfp_t gfp); void filemap_remove_folio(struct folio *folio); -void delete_from_page_cache(struct page *page); void __filemap_remove_folio(struct folio *folio, void *shadow); -void replace_page_cache_page(struct page *old, struct page *new); +void replace_page_cache_folio(struct folio *old, struct folio *new); void delete_from_page_cache_batch(struct address_space *mapping, struct folio_batch *fbatch); -int try_to_release_page(struct page *page, gfp_t gfp); bool filemap_release_folio(struct folio *folio, gfp_t gfp); loff_t mapping_seek_hole_data(struct address_space *, loff_t start, loff_t end, int whence); diff --git a/include/linux/pagewalk.h b/include/linux/pagewalk.h index f3fafb731ffd..959f52e5867d 100644 --- a/include/linux/pagewalk.h +++ b/include/linux/pagewalk.h @@ -27,6 +27,8 @@ struct mm_walk; * "do page table walk over the current vma", returning * a negative value means "abort current page table walk * right now" and returning 1 means "skip the current vma" + * Note that this callback is not called when the caller + * passes in a single VMA as for walk_page_vma(). * @pre_vma: if set, called before starting walk on a non-null vma. * @post_vma: if set, called after a walk on a non-null vma, provided * that @pre_vma and the vma walk succeeded. @@ -99,6 +101,9 @@ int walk_page_range_novma(struct mm_struct *mm, unsigned long start, unsigned long end, const struct mm_walk_ops *ops, pgd_t *pgd, void *private); +int walk_page_range_vma(struct vm_area_struct *vma, unsigned long start, + unsigned long end, const struct mm_walk_ops *ops, + void *private); int walk_page_vma(struct vm_area_struct *vma, const struct mm_walk_ops *ops, void *private); int walk_page_mapping(struct address_space *mapping, pgoff_t first_index, diff --git a/include/linux/percpu_counter.h b/include/linux/percpu_counter.h index 8ed5fba6d156..a3aae8d57a42 100644 --- a/include/linux/percpu_counter.h +++ b/include/linux/percpu_counter.h @@ -13,7 +13,6 @@ #include <linux/threads.h> #include <linux/percpu.h> #include <linux/types.h> -#include <linux/gfp.h> /* percpu_counter batch for local add or sub */ #define PERCPU_COUNTER_LOCAL_BATCH INT_MAX @@ -46,6 +45,7 @@ void percpu_counter_set(struct percpu_counter *fbc, s64 amount); void percpu_counter_add_batch(struct percpu_counter *fbc, s64 amount, s32 batch); s64 __percpu_counter_sum(struct percpu_counter *fbc); +s64 percpu_counter_sum_all(struct percpu_counter *fbc); int __percpu_counter_compare(struct percpu_counter *fbc, s64 rhs, s32 batch); void percpu_counter_sync(struct percpu_counter *fbc); @@ -194,6 +194,11 @@ static inline s64 percpu_counter_sum(struct percpu_counter *fbc) return percpu_counter_read(fbc); } +static inline s64 percpu_counter_sum_all(struct percpu_counter *fbc) +{ + return percpu_counter_read(fbc); +} + static inline bool percpu_counter_initialized(struct percpu_counter *fbc) { return true; diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index 5f0d7d0b9471..dfabd549d2e7 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -425,9 +425,7 @@ static inline pte_t ptep_get_and_clear_full(struct mm_struct *mm, unsigned long address, pte_t *ptep, int full) { - pte_t pte; - pte = ptep_get_and_clear(mm, address, ptep); - return pte; + return ptep_get_and_clear(mm, address, ptep); } #endif @@ -503,30 +501,6 @@ static inline pte_t pte_sw_mkyoung(pte_t pte) #define pte_sw_mkyoung pte_sw_mkyoung #endif -#ifndef pte_savedwrite -#define pte_savedwrite pte_write -#endif - -#ifndef pte_mk_savedwrite -#define pte_mk_savedwrite pte_mkwrite -#endif - -#ifndef pte_clear_savedwrite -#define pte_clear_savedwrite pte_wrprotect -#endif - -#ifndef pmd_savedwrite -#define pmd_savedwrite pmd_write -#endif - -#ifndef pmd_mk_savedwrite -#define pmd_mk_savedwrite pmd_mkwrite -#endif - -#ifndef pmd_clear_savedwrite -#define pmd_clear_savedwrite pmd_wrprotect -#endif - #ifndef __HAVE_ARCH_PMDP_SET_WRPROTECT #ifdef CONFIG_TRANSPARENT_HUGEPAGE static inline void pmdp_set_wrprotect(struct mm_struct *mm, diff --git a/include/linux/sched.h b/include/linux/sched.h index 5affff14993d..853d08f7562b 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -870,9 +870,6 @@ struct task_struct { struct mm_struct *mm; struct mm_struct *active_mm; -#ifdef SPLIT_RSS_COUNTING - struct task_rss_stat rss_stat; -#endif int exit_state; int exit_code; int exit_signal; diff --git a/include/linux/shrinker.h b/include/linux/shrinker.h index 08e6054e061f..71310efe2fab 100644 --- a/include/linux/shrinker.h +++ b/include/linux/shrinker.h @@ -2,6 +2,9 @@ #ifndef _LINUX_SHRINKER_H #define _LINUX_SHRINKER_H +#include <linux/atomic.h> +#include <linux/types.h> + /* * This struct is used to pass information from page reclaim to the shrinkers. * We consolidate the values for easier extension later. diff --git a/include/linux/swap.h b/include/linux/swap.h index a18cf4b7c724..2787b84eaf12 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -55,22 +55,14 @@ static inline int current_is_kswapd(void) * actions on faults. */ -#define SWP_SWAPIN_ERROR_NUM 1 -#define SWP_SWAPIN_ERROR (MAX_SWAPFILES + SWP_HWPOISON_NUM + \ - SWP_MIGRATION_NUM + SWP_DEVICE_NUM + \ - SWP_PTE_MARKER_NUM) /* - * PTE markers are used to persist information onto PTEs that are mapped with - * file-backed memories. As its name "PTE" hints, it should only be applied to - * the leaves of pgtables. + * PTE markers are used to persist information onto PTEs that otherwise + * should be a none pte. As its name "PTE" hints, it should only be + * applied to the leaves of pgtables. */ -#ifdef CONFIG_PTE_MARKER #define SWP_PTE_MARKER_NUM 1 #define SWP_PTE_MARKER (MAX_SWAPFILES + SWP_HWPOISON_NUM + \ SWP_MIGRATION_NUM + SWP_DEVICE_NUM) -#else -#define SWP_PTE_MARKER_NUM 0 -#endif /* * Unaddressable device memory support. See include/linux/hmm.h and @@ -125,7 +117,7 @@ static inline int current_is_kswapd(void) #define MAX_SWAPFILES \ ((1 << MAX_SWAPFILES_SHIFT) - SWP_DEVICE_NUM - \ SWP_MIGRATION_NUM - SWP_HWPOISON_NUM - \ - SWP_PTE_MARKER_NUM - SWP_SWAPIN_ERROR_NUM) + SWP_PTE_MARKER_NUM) /* * Magic header for a swap area. The first part of the union is @@ -384,11 +376,11 @@ extern unsigned long totalreserve_pages; /* linux/mm/swap.c */ -void lru_note_cost(struct lruvec *lruvec, bool file, unsigned int nr_pages); -void lru_note_cost_folio(struct folio *); +void lru_note_cost(struct lruvec *lruvec, bool file, + unsigned int nr_io, unsigned int nr_rotated); +void lru_note_cost_refault(struct folio *); void folio_add_lru(struct folio *); void folio_add_lru_vma(struct folio *, struct vm_area_struct *); -void lru_cache_add(struct page *); void mark_page_accessed(struct page *); void folio_mark_accessed(struct folio *); @@ -426,7 +418,8 @@ extern unsigned long try_to_free_pages(struct zonelist *zonelist, int order, extern unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, unsigned long nr_pages, gfp_t gfp_mask, - unsigned int reclaim_options); + unsigned int reclaim_options, + nodemask_t *nodemask); extern unsigned long mem_cgroup_shrink_node(struct mem_cgroup *mem, gfp_t gfp_mask, bool noswap, pg_data_t *pgdat, @@ -470,7 +463,7 @@ static inline unsigned long total_swapcache_pages(void) extern void free_swap_cache(struct page *page); extern void free_page_and_swap_cache(struct page *); -extern void free_pages_and_swap_cache(struct page **, int); +extern void free_pages_and_swap_cache(struct encoded_page **, int); /* linux/mm/swapfile.c */ extern atomic_long_t nr_swap_pages; extern long total_swap_pages; diff --git a/include/linux/swapops.h b/include/linux/swapops.h index b07b277d6a16..b982dd614572 100644 --- a/include/linux/swapops.h +++ b/include/linux/swapops.h @@ -164,16 +164,6 @@ static inline void *swp_to_radix_entry(swp_entry_t entry) return xa_mk_value(entry.val); } -static inline swp_entry_t make_swapin_error_entry(struct page *page) -{ - return swp_entry(SWP_SWAPIN_ERROR, page_to_pfn(page)); -} - -static inline int is_swapin_error_entry(swp_entry_t entry) -{ - return swp_type(entry) == SWP_SWAPIN_ERROR; -} - #if IS_ENABLED(CONFIG_DEVICE_PRIVATE) static inline swp_entry_t make_readable_device_private_entry(pgoff_t offset) { @@ -411,10 +401,9 @@ static inline bool is_migration_entry_dirty(swp_entry_t entry) typedef unsigned long pte_marker; -#define PTE_MARKER_UFFD_WP BIT(0) -#define PTE_MARKER_MASK (PTE_MARKER_UFFD_WP) - -#ifdef CONFIG_PTE_MARKER +#define PTE_MARKER_UFFD_WP BIT(0) +#define PTE_MARKER_SWAPIN_ERROR BIT(1) +#define PTE_MARKER_MASK (BIT(2) - 1) static inline swp_entry_t make_pte_marker_entry(pte_marker marker) { @@ -436,35 +425,20 @@ static inline bool is_pte_marker(pte_t pte) return is_swap_pte(pte) && is_pte_marker_entry(pte_to_swp_entry(pte)); } -#else /* CONFIG_PTE_MARKER */ - -static inline swp_entry_t make_pte_marker_entry(pte_marker marker) -{ - /* This should never be called if !CONFIG_PTE_MARKER */ - WARN_ON_ONCE(1); - return swp_entry(0, 0); -} - -static inline bool is_pte_marker_entry(swp_entry_t entry) -{ - return false; -} - -static inline pte_marker pte_marker_get(swp_entry_t entry) +static inline pte_t make_pte_marker(pte_marker marker) { - return 0; + return swp_entry_to_pte(make_pte_marker_entry(marker)); } -static inline bool is_pte_marker(pte_t pte) +static inline swp_entry_t make_swapin_error_entry(void) { - return false; + return make_pte_marker_entry(PTE_MARKER_SWAPIN_ERROR); } -#endif /* CONFIG_PTE_MARKER */ - -static inline pte_t make_pte_marker(pte_marker marker) +static inline int is_swapin_error_entry(swp_entry_t entry) { - return swp_entry_to_pte(make_pte_marker_entry(marker)); + return is_pte_marker_entry(entry) && + (pte_marker_get(entry) & PTE_MARKER_SWAPIN_ERROR); } /* @@ -479,9 +453,6 @@ static inline pte_t make_pte_marker(pte_marker marker) * memory, kernel-only memory (including when the system is during-boot), * non-ram based generic file-system. It's fine to be used even there, but the * extra pte marker check will be pure overhead. - * - * For systems configured with !CONFIG_PTE_MARKER this will be automatically - * optimized to pte_none(). */ static inline int pte_none_mostly(pte_t pte) { @@ -583,8 +554,6 @@ static inline int is_pmd_migration_entry(pmd_t pmd) #ifdef CONFIG_MEMORY_FAILURE -extern atomic_long_t num_poisoned_pages __read_mostly; - /* * Support for hardware poisoned pages */ @@ -599,17 +568,7 @@ static inline int is_hwpoison_entry(swp_entry_t entry) return swp_type(entry) == SWP_HWPOISON; } -static inline void num_poisoned_pages_inc(void) -{ - atomic_long_inc(&num_poisoned_pages); -} - -static inline void num_poisoned_pages_sub(long i) -{ - atomic_long_sub(i, &num_poisoned_pages); -} - -#else /* CONFIG_MEMORY_FAILURE */ +#else static inline swp_entry_t make_hwpoison_entry(struct page *page) { @@ -620,15 +579,7 @@ static inline int is_hwpoison_entry(swp_entry_t swp) { return 0; } - -static inline void num_poisoned_pages_inc(void) -{ -} - -static inline void num_poisoned_pages_sub(long i) -{ -} -#endif /* CONFIG_MEMORY_FAILURE */ +#endif static inline int non_swap_entry(swp_entry_t entry) { diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h index 3518dba1e02f..7f5d1caf5890 100644 --- a/include/linux/vm_event_item.h +++ b/include/linux/vm_event_item.h @@ -40,10 +40,13 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, PGREUSE, PGSTEAL_KSWAPD, PGSTEAL_DIRECT, + PGSTEAL_KHUGEPAGED, PGDEMOTE_KSWAPD, PGDEMOTE_DIRECT, + PGDEMOTE_KHUGEPAGED, PGSCAN_KSWAPD, PGSCAN_DIRECT, + PGSCAN_KHUGEPAGED, PGSCAN_DIRECT_THROTTLE, PGSCAN_ANON, PGSCAN_FILE, diff --git a/include/linux/zsmalloc.h b/include/linux/zsmalloc.h index 2a430e713ce5..a48cd0ffe57d 100644 --- a/include/linux/zsmalloc.h +++ b/include/linux/zsmalloc.h @@ -55,5 +55,7 @@ void zs_unmap_object(struct zs_pool *pool, unsigned long handle); unsigned long zs_get_total_pages(struct zs_pool *pool); unsigned long zs_compact(struct zs_pool *pool); +unsigned int zs_lookup_class_index(struct zs_pool *pool, unsigned int size); + void zs_pool_stats(struct zs_pool *pool, struct zs_pool_stats *stats); #endif diff --git a/include/trace/events/huge_memory.h b/include/trace/events/huge_memory.h index 760455dfa860..3e6fb05852f9 100644 --- a/include/trace/events/huge_memory.h +++ b/include/trace/events/huge_memory.h @@ -203,5 +203,43 @@ TRACE_EVENT(mm_khugepaged_scan_file, __print_symbolic(__entry->result, SCAN_STATUS)) ); +TRACE_EVENT(mm_khugepaged_collapse_file, + TP_PROTO(struct mm_struct *mm, struct page *hpage, pgoff_t index, + bool is_shmem, unsigned long addr, struct file *file, + int nr, int result), + TP_ARGS(mm, hpage, index, addr, is_shmem, file, nr, result), + TP_STRUCT__entry( + __field(struct mm_struct *, mm) + __field(unsigned long, hpfn) + __field(pgoff_t, index) + __field(unsigned long, addr) + __field(bool, is_shmem) + __string(filename, file->f_path.dentry->d_iname) + __field(int, nr) + __field(int, result) + ), + + TP_fast_assign( + __entry->mm = mm; + __entry->hpfn = hpage ? page_to_pfn(hpage) : -1; + __entry->index = index; + __entry->addr = addr; + __entry->is_shmem = is_shmem; + __assign_str(filename, file->f_path.dentry->d_iname); + __entry->nr = nr; + __entry->result = result; + ), + + TP_printk("mm=%p, hpage_pfn=0x%lx, index=%ld, addr=%ld, is_shmem=%d, filename=%s, nr=%d, result=%s", + __entry->mm, + __entry->hpfn, + __entry->index, + __entry->addr, + __entry->is_shmem, + __get_str(filename), + __entry->nr, + __print_symbolic(__entry->result, SCAN_STATUS)) +); + #endif /* __HUGE_MEMORY_H */ #include <trace/define_trace.h> diff --git a/include/trace/events/kmem.h b/include/trace/events/kmem.h index 243073cfc29d..58688768ef0f 100644 --- a/include/trace/events/kmem.h +++ b/include/trace/events/kmem.h @@ -346,10 +346,9 @@ TRACE_MM_PAGES TRACE_EVENT(rss_stat, TP_PROTO(struct mm_struct *mm, - int member, - long count), + int member), - TP_ARGS(mm, member, count), + TP_ARGS(mm, member), TP_STRUCT__entry( __field(unsigned int, mm_id) @@ -362,7 +361,8 @@ TRACE_EVENT(rss_stat, __entry->mm_id = mm_ptr_to_hash(mm); __entry->curr = !!(current->mm == mm); __entry->member = member; - __entry->size = (count << PAGE_SHIFT); + __entry->size = (percpu_counter_sum_positive(&mm->rss_stat[member]) + << PAGE_SHIFT); ), TP_printk("mm_id=%u curr=%d type=%s size=%ldB", diff --git a/include/trace/events/vmalloc.h b/include/trace/events/vmalloc.h new file mode 100644 index 000000000000..ad4e02191f35 --- /dev/null +++ b/include/trace/events/vmalloc.h @@ -0,0 +1,123 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM vmalloc + +#if !defined(_TRACE_VMALLOC_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_VMALLOC_H + +#include <linux/tracepoint.h> + +/** + * alloc_vmap_area - called when a new vmap allocation occurs + * @addr: an allocated address + * @size: a requested size + * @align: a requested alignment + * @vstart: a requested start range + * @vend: a requested end range + * @failed: an allocation failed or not + * + * This event is used for a debug purpose, it can give an extra + * information for a developer about how often it occurs and which + * parameters are passed for further validation. + */ +TRACE_EVENT(alloc_vmap_area, + + TP_PROTO(unsigned long addr, unsigned long size, unsigned long align, + unsigned long vstart, unsigned long vend, int failed), + + TP_ARGS(addr, size, align, vstart, vend, failed), + + TP_STRUCT__entry( + __field(unsigned long, addr) + __field(unsigned long, size) + __field(unsigned long, align) + __field(unsigned long, vstart) + __field(unsigned long, vend) + __field(int, failed) + ), + + TP_fast_assign( + __entry->addr = addr; + __entry->size = size; + __entry->align = align; + __entry->vstart = vstart; + __entry->vend = vend; + __entry->failed = failed; + ), + + TP_printk("va_start: %lu size=%lu align=%lu vstart=0x%lx vend=0x%lx failed=%d", + __entry->addr, __entry->size, __entry->align, + __entry->vstart, __entry->vend, __entry->failed) +); + +/** + * purge_vmap_area_lazy - called when vmap areas were lazily freed + * @start: purging start address + * @end: purging end address + * @npurged: numbed of purged vmap areas + * + * This event is used for a debug purpose. It gives some + * indication about start:end range and how many objects + * are released. + */ +TRACE_EVENT(purge_vmap_area_lazy, + + TP_PROTO(unsigned long start, unsigned long end, + unsigned int npurged), + + TP_ARGS(start, end, npurged), + + TP_STRUCT__entry( + __field(unsigned long, start) + __field(unsigned long, end) + __field(unsigned int, npurged) + ), + + TP_fast_assign( + __entry->start = start; + __entry->end = end; + __entry->npurged = npurged; + ), + + TP_printk("start=0x%lx end=0x%lx num_purged=%u", + __entry->start, __entry->end, __entry->npurged) +); + +/** + * free_vmap_area_noflush - called when a vmap area is freed + * @va_start: a start address of VA + * @nr_lazy: number of current lazy pages + * @nr_lazy_max: number of maximum lazy pages + * + * This event is used for a debug purpose. It gives some + * indication about a VA that is released, number of current + * outstanding areas and a maximum allowed threshold before + * dropping all of them. + */ +TRACE_EVENT(free_vmap_area_noflush, + + TP_PROTO(unsigned long va_start, unsigned long nr_lazy, + unsigned long nr_lazy_max), + + TP_ARGS(va_start, nr_lazy, nr_lazy_max), + + TP_STRUCT__entry( + __field(unsigned long, va_start) + __field(unsigned long, nr_lazy) + __field(unsigned long, nr_lazy_max) + ), + + TP_fast_assign( + __entry->va_start = va_start; + __entry->nr_lazy = nr_lazy; + __entry->nr_lazy_max = nr_lazy_max; + ), + + TP_printk("va_start=0x%lx nr_lazy=%lu nr_lazy_max=%lu", + __entry->va_start, __entry->nr_lazy, __entry->nr_lazy_max) +); + +#endif /* _TRACE_VMALLOC_H */ + +/* This part must be outside protection */ +#include <trace/define_trace.h> |