diff options
Diffstat (limited to 'mm')
-rw-r--r-- | mm/Kconfig | 23 | ||||
-rw-r--r-- | mm/Kconfig.debug | 6 | ||||
-rw-r--r-- | mm/Makefile | 2 | ||||
-rw-r--r-- | mm/backing-dev.c | 14 | ||||
-rw-r--r-- | mm/compaction.c | 80 | ||||
-rw-r--r-- | mm/damon/paddr.c | 5 | ||||
-rw-r--r-- | mm/folio-compat.c | 4 | ||||
-rw-r--r-- | mm/huge_memory.c | 25 | ||||
-rw-r--r-- | mm/hugetlb.c | 65 | ||||
-rw-r--r-- | mm/kfence/Makefile | 2 | ||||
-rw-r--r-- | mm/kfence/core.c | 46 | ||||
-rw-r--r-- | mm/khugepaged.c | 4 | ||||
-rw-r--r-- | mm/kmsan/hooks.c | 55 | ||||
-rw-r--r-- | mm/kmsan/shadow.c | 27 | ||||
-rw-r--r-- | mm/ksm.c | 11 | ||||
-rw-r--r-- | mm/maccess.c | 16 | ||||
-rw-r--r-- | mm/madvise.c | 9 | ||||
-rw-r--r-- | mm/memory-failure.c | 36 | ||||
-rw-r--r-- | mm/memory.c | 16 | ||||
-rw-r--r-- | mm/mempolicy.c | 104 | ||||
-rw-r--r-- | mm/migrate.c | 185 | ||||
-rw-r--r-- | mm/mincore.c | 2 | ||||
-rw-r--r-- | mm/mmap.c | 60 | ||||
-rw-r--r-- | mm/mprotect.c | 4 | ||||
-rw-r--r-- | mm/page-writeback.c | 40 | ||||
-rw-r--r-- | mm/page_alloc.c | 22 | ||||
-rw-r--r-- | mm/shmem.c | 4 | ||||
-rw-r--r-- | mm/slab.c | 2 | ||||
-rw-r--r-- | mm/slab.h | 61 | ||||
-rw-r--r-- | mm/slab_common.c | 7 | ||||
-rw-r--r-- | mm/slob.c | 757 | ||||
-rw-r--r-- | mm/slub.c | 2 | ||||
-rw-r--r-- | mm/swap.c | 2 | ||||
-rw-r--r-- | mm/swapfile.c | 3 | ||||
-rw-r--r-- | mm/usercopy.c | 2 | ||||
-rw-r--r-- | mm/vmalloc.c | 46 | ||||
-rw-r--r-- | mm/vmscan.c | 4 | ||||
-rw-r--r-- | mm/zpool.c | 1 | ||||
-rw-r--r-- | mm/zswap.c | 1 |
39 files changed, 594 insertions, 1161 deletions
diff --git a/mm/Kconfig b/mm/Kconfig index 4751031f3f05..9c40844b7bc9 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -238,30 +238,8 @@ config SLUB and has enhanced diagnostics. SLUB is the default choice for a slab allocator. -config SLOB_DEPRECATED - depends on EXPERT - bool "SLOB (Simple Allocator - DEPRECATED)" - depends on !PREEMPT_RT - help - Deprecated and scheduled for removal in a few cycles. SLUB - recommended as replacement. CONFIG_SLUB_TINY can be considered - on systems with 16MB or less RAM. - - If you need SLOB to stay, please contact linux-mm@kvack.org and - people listed in the SLAB ALLOCATOR section of MAINTAINERS file, - with your use case. - - SLOB replaces the stock allocator with a drastically simpler - allocator. SLOB is generally more space efficient but - does not perform as well on large systems. - endchoice -config SLOB - bool - default y - depends on SLOB_DEPRECATED - config SLUB_TINY bool "Configure SLUB for minimal memory footprint" depends on SLUB && EXPERT @@ -686,7 +664,6 @@ config BOUNCE config MMU_NOTIFIER bool - select SRCU select INTERVAL_TREE config KSM diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug index c3547a373c9c..59c83ad976f7 100644 --- a/mm/Kconfig.debug +++ b/mm/Kconfig.debug @@ -60,9 +60,9 @@ config SLUB_DEBUG select STACKDEPOT if STACKTRACE_SUPPORT help SLUB has extensive debug support features. Disabling these can - result in significant savings in code size. This also disables - SLUB sysfs support. /sys/slab will not exist and there will be - no support for cache validation etc. + result in significant savings in code size. While /sys/kernel/slab + will still exist (with SYSFS enabled), it will not provide e.g. cache + validation. config SLUB_DEBUG_ON bool "SLUB debugging on by default" diff --git a/mm/Makefile b/mm/Makefile index 8e105e5b3e29..e347958fc6b2 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -22,7 +22,6 @@ KCSAN_INSTRUMENT_BARRIERS := y # flaky coverage that is not a function of syscall inputs. E.g. 
slab is out of # free pages, or a task is migrated between nodes. KCOV_INSTRUMENT_slab_common.o := n -KCOV_INSTRUMENT_slob.o := n KCOV_INSTRUMENT_slab.o := n KCOV_INSTRUMENT_slub.o := n KCOV_INSTRUMENT_page_alloc.o := n @@ -81,7 +80,6 @@ obj-$(CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP) += hugetlb_vmemmap.o obj-$(CONFIG_NUMA) += mempolicy.o obj-$(CONFIG_SPARSEMEM) += sparse.o obj-$(CONFIG_SPARSEMEM_VMEMMAP) += sparse-vmemmap.o -obj-$(CONFIG_SLOB) += slob.o obj-$(CONFIG_MMU_NOTIFIER) += mmu_notifier.o obj-$(CONFIG_KSM) += ksm.o obj-$(CONFIG_PAGE_POISONING) += page_poison.o diff --git a/mm/backing-dev.c b/mm/backing-dev.c index a53b9360b72e..43b48750b491 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -347,7 +347,7 @@ ATTRIBUTE_GROUPS(bdi_dev); static __init int bdi_class_init(void) { - bdi_class = class_create(THIS_MODULE, "bdi"); + bdi_class = class_create("bdi"); if (IS_ERR(bdi_class)) return PTR_ERR(bdi_class); @@ -507,6 +507,15 @@ static LIST_HEAD(offline_cgwbs); static void cleanup_offline_cgwbs_workfn(struct work_struct *work); static DECLARE_WORK(cleanup_offline_cgwbs_work, cleanup_offline_cgwbs_workfn); +static void cgwb_free_rcu(struct rcu_head *rcu_head) +{ + struct bdi_writeback *wb = container_of(rcu_head, + struct bdi_writeback, rcu); + + percpu_ref_exit(&wb->refcnt); + kfree(wb); +} + static void cgwb_release_workfn(struct work_struct *work) { struct bdi_writeback *wb = container_of(work, struct bdi_writeback, @@ -529,11 +538,10 @@ static void cgwb_release_workfn(struct work_struct *work) list_del(&wb->offline_node); spin_unlock_irq(&cgwb_lock); - percpu_ref_exit(&wb->refcnt); wb_exit(wb); bdi_put(bdi); WARN_ON_ONCE(!list_empty(&wb->b_attached)); - kfree_rcu(wb, rcu); + call_rcu(&wb->rcu, cgwb_free_rcu); } static void cgwb_release(struct percpu_ref *refcnt) diff --git a/mm/compaction.c b/mm/compaction.c index 5a9501e0ae01..9ff71239b1fc 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -1716,7 +1716,14 @@ typedef enum { * Allow userspace to 
control policy on scanning the unevictable LRU for * compactable pages. */ -int sysctl_compact_unevictable_allowed __read_mostly = CONFIG_COMPACT_UNEVICTABLE_DEFAULT; +static int sysctl_compact_unevictable_allowed __read_mostly = CONFIG_COMPACT_UNEVICTABLE_DEFAULT; +/* + * Tunable for proactive compaction. It determines how + * aggressively the kernel should compact memory in the + * background. It takes values in the range [0, 100]. + */ +static unsigned int __read_mostly sysctl_compaction_proactiveness = 20; +static int sysctl_extfrag_threshold = 500; static inline void update_fast_start_pfn(struct compact_control *cc, unsigned long pfn) @@ -2572,8 +2579,6 @@ static enum compact_result compact_zone_order(struct zone *zone, int order, return ret; } -int sysctl_extfrag_threshold = 500; - /** * try_to_compact_pages - Direct compact to satisfy a high-order allocation * @gfp_mask: The GFP mask of the current allocation @@ -2730,14 +2735,7 @@ static void compact_nodes(void) compact_node(nid); } -/* - * Tunable for proactive compaction. It determines how - * aggressively the kernel should compact memory in the - * background. It takes values in the range [0, 100]. 
- */ -unsigned int __read_mostly sysctl_compaction_proactiveness = 20; - -int compaction_proactiveness_sysctl_handler(struct ctl_table *table, int write, +static int compaction_proactiveness_sysctl_handler(struct ctl_table *table, int write, void *buffer, size_t *length, loff_t *ppos) { int rc, nid; @@ -2767,7 +2765,7 @@ int compaction_proactiveness_sysctl_handler(struct ctl_table *table, int write, * This is the entry point for compacting all nodes via * /proc/sys/vm/compact_memory */ -int sysctl_compaction_handler(struct ctl_table *table, int write, +static int sysctl_compaction_handler(struct ctl_table *table, int write, void *buffer, size_t *length, loff_t *ppos) { if (write) @@ -3063,6 +3061,63 @@ static int kcompactd_cpu_online(unsigned int cpu) return 0; } +static int proc_dointvec_minmax_warn_RT_change(struct ctl_table *table, + int write, void *buffer, size_t *lenp, loff_t *ppos) +{ + int ret, old; + + if (!IS_ENABLED(CONFIG_PREEMPT_RT) || !write) + return proc_dointvec_minmax(table, write, buffer, lenp, ppos); + + old = *(int *)table->data; + ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); + if (ret) + return ret; + if (old != *(int *)table->data) + pr_warn_once("sysctl attribute %s changed by %s[%d]\n", + table->procname, current->comm, + task_pid_nr(current)); + return ret; +} + +static struct ctl_table vm_compaction[] = { + { + .procname = "compact_memory", + .data = NULL, + .maxlen = sizeof(int), + .mode = 0200, + .proc_handler = sysctl_compaction_handler, + }, + { + .procname = "compaction_proactiveness", + .data = &sysctl_compaction_proactiveness, + .maxlen = sizeof(sysctl_compaction_proactiveness), + .mode = 0644, + .proc_handler = compaction_proactiveness_sysctl_handler, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE_HUNDRED, + }, + { + .procname = "extfrag_threshold", + .data = &sysctl_extfrag_threshold, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = 
SYSCTL_ONE_THOUSAND, + }, + { + .procname = "compact_unevictable_allowed", + .data = &sysctl_compact_unevictable_allowed, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax_warn_RT_change, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, + { } +}; + static int __init kcompactd_init(void) { int nid; @@ -3078,6 +3133,7 @@ static int __init kcompactd_init(void) for_each_node_state(nid, N_MEMORY) kcompactd_run(nid); + register_sysctl_init("vm", vm_compaction); return 0; } subsys_initcall(kcompactd_init) diff --git a/mm/damon/paddr.c b/mm/damon/paddr.c index 6c655d9b5639..dd9c33fbe805 100644 --- a/mm/damon/paddr.c +++ b/mm/damon/paddr.c @@ -130,7 +130,6 @@ static bool damon_pa_young(unsigned long paddr, unsigned long *folio_sz) accessed = false; else accessed = true; - folio_put(folio); goto out; } @@ -144,10 +143,10 @@ static bool damon_pa_young(unsigned long paddr, unsigned long *folio_sz) if (need_lock) folio_unlock(folio); - folio_put(folio); out: *folio_sz = folio_size(folio); + folio_put(folio); return accessed; } @@ -281,8 +280,8 @@ static inline unsigned long damon_pa_mark_accessed_or_deactivate( folio_mark_accessed(folio); else folio_deactivate(folio); - folio_put(folio); applied += folio_nr_pages(folio); + folio_put(folio); } return applied * PAGE_SIZE; } diff --git a/mm/folio-compat.c b/mm/folio-compat.c index cabcd1de9ecb..a71523a06ccd 100644 --- a/mm/folio-compat.c +++ b/mm/folio-compat.c @@ -106,9 +106,7 @@ EXPORT_SYMBOL(pagecache_get_page); struct page *grab_cache_page_write_begin(struct address_space *mapping, pgoff_t index) { - unsigned fgp_flags = FGP_LOCK | FGP_WRITE | FGP_CREAT | FGP_STABLE; - - return pagecache_get_page(mapping, index, fgp_flags, + return pagecache_get_page(mapping, index, FGP_WRITEBEGIN, mapping_gfp_mask(mapping)); } EXPORT_SYMBOL(grab_cache_page_write_begin); diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 4fc43859e59a..3fae2d2496ab 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ 
-1838,10 +1838,10 @@ int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, if (is_swap_pmd(*pmd)) { swp_entry_t entry = pmd_to_swp_entry(*pmd); struct page *page = pfn_swap_entry_to_page(entry); + pmd_t newpmd; VM_BUG_ON(!is_pmd_migration_entry(*pmd)); if (is_writable_migration_entry(entry)) { - pmd_t newpmd; /* * A protection check is difficult so * just be safe and disable write @@ -1855,8 +1855,16 @@ int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, newpmd = pmd_swp_mksoft_dirty(newpmd); if (pmd_swp_uffd_wp(*pmd)) newpmd = pmd_swp_mkuffd_wp(newpmd); - set_pmd_at(mm, addr, pmd, newpmd); + } else { + newpmd = *pmd; } + + if (uffd_wp) + newpmd = pmd_swp_mkuffd_wp(newpmd); + else if (uffd_wp_resolve) + newpmd = pmd_swp_clear_uffd_wp(newpmd); + if (!pmd_same(*pmd, newpmd)) + set_pmd_at(mm, addr, pmd, newpmd); goto unlock; } #endif @@ -2037,7 +2045,7 @@ static void __split_huge_zero_page_pmd(struct vm_area_struct *vma, { struct mm_struct *mm = vma->vm_mm; pgtable_t pgtable; - pmd_t _pmd; + pmd_t _pmd, old_pmd; int i; /* @@ -2048,7 +2056,7 @@ static void __split_huge_zero_page_pmd(struct vm_area_struct *vma, * * See Documentation/mm/mmu_notifier.rst */ - pmdp_huge_clear_flush(vma, haddr, pmd); + old_pmd = pmdp_huge_clear_flush(vma, haddr, pmd); pgtable = pgtable_trans_huge_withdraw(mm, pmd); pmd_populate(mm, &_pmd, pgtable); @@ -2057,6 +2065,8 @@ static void __split_huge_zero_page_pmd(struct vm_area_struct *vma, pte_t *pte, entry; entry = pfn_pte(my_zero_pfn(haddr), vma->vm_page_prot); entry = pte_mkspecial(entry); + if (pmd_uffd_wp(old_pmd)) + entry = pte_mkuffd_wp(entry); pte = pte_offset_map(&_pmd, haddr); VM_BUG_ON(!pte_none(*pte)); set_pte_at(mm, haddr, pte, entry); @@ -2655,9 +2665,10 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) VM_BUG_ON_FOLIO(!folio_test_large(folio), folio); is_hzp = is_huge_zero_page(&folio->page); - VM_WARN_ON_ONCE_FOLIO(is_hzp, folio); - if (is_hzp) + if (is_hzp) { + 
pr_warn_ratelimited("Called split_huge_page for huge zero page\n"); return -EBUSY; + } if (folio_test_writeback(folio)) return -EBUSY; @@ -3249,6 +3260,8 @@ int set_pmd_migration_entry(struct page_vma_mapped_walk *pvmw, pmdswp = swp_entry_to_pmd(entry); if (pmd_soft_dirty(pmdval)) pmdswp = pmd_swp_mksoft_dirty(pmdswp); + if (pmd_uffd_wp(pmdval)) + pmdswp = pmd_swp_mkuffd_wp(pmdswp); set_pmd_at(mm, address, pvmw->pmd, pmdswp); page_remove_rmap(page, vma, true); put_page(page); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 07abcb6eb203..a93e070ab175 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -4202,6 +4202,12 @@ static void __init hugetlb_sysfs_init(void) hugetlb_register_all_nodes(); } +#ifdef CONFIG_SYSCTL +static void hugetlb_sysctl_init(void); +#else +static inline void hugetlb_sysctl_init(void) { } +#endif + static int __init hugetlb_init(void) { int i; @@ -4257,6 +4263,7 @@ static int __init hugetlb_init(void) hugetlb_sysfs_init(); hugetlb_cgroup_file_init(); + hugetlb_sysctl_init(); #ifdef CONFIG_SMP num_fault_mutexes = roundup_pow_of_two(8 * num_possible_cpus()); @@ -4588,7 +4595,7 @@ out: return ret; } -int hugetlb_sysctl_handler(struct ctl_table *table, int write, +static int hugetlb_sysctl_handler(struct ctl_table *table, int write, void *buffer, size_t *length, loff_t *ppos) { @@ -4597,7 +4604,7 @@ int hugetlb_sysctl_handler(struct ctl_table *table, int write, } #ifdef CONFIG_NUMA -int hugetlb_mempolicy_sysctl_handler(struct ctl_table *table, int write, +static int hugetlb_mempolicy_sysctl_handler(struct ctl_table *table, int write, void *buffer, size_t *length, loff_t *ppos) { return hugetlb_sysctl_handler_common(true, table, write, @@ -4605,7 +4612,7 @@ int hugetlb_mempolicy_sysctl_handler(struct ctl_table *table, int write, } #endif /* CONFIG_NUMA */ -int hugetlb_overcommit_handler(struct ctl_table *table, int write, +static int hugetlb_overcommit_handler(struct ctl_table *table, int write, void *buffer, size_t *length, loff_t *ppos) { struct 
hstate *h = &default_hstate; @@ -4634,6 +4641,44 @@ out: return ret; } +static struct ctl_table hugetlb_table[] = { + { + .procname = "nr_hugepages", + .data = NULL, + .maxlen = sizeof(unsigned long), + .mode = 0644, + .proc_handler = hugetlb_sysctl_handler, + }, +#ifdef CONFIG_NUMA + { + .procname = "nr_hugepages_mempolicy", + .data = NULL, + .maxlen = sizeof(unsigned long), + .mode = 0644, + .proc_handler = &hugetlb_mempolicy_sysctl_handler, + }, +#endif + { + .procname = "hugetlb_shm_group", + .data = &sysctl_hugetlb_shm_group, + .maxlen = sizeof(gid_t), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "nr_overcommit_hugepages", + .data = NULL, + .maxlen = sizeof(unsigned long), + .mode = 0644, + .proc_handler = hugetlb_overcommit_handler, + }, + { } +}; + +static void hugetlb_sysctl_init(void) +{ + register_sysctl_init("vm", hugetlb_table); +} #endif /* CONFIG_SYSCTL */ void hugetlb_report_meminfo(struct seq_file *m) @@ -5478,7 +5523,7 @@ static vm_fault_t hugetlb_wp(struct mm_struct *mm, struct vm_area_struct *vma, struct folio *pagecache_folio, spinlock_t *ptl) { const bool unshare = flags & FAULT_FLAG_UNSHARE; - pte_t pte; + pte_t pte = huge_ptep_get(ptep); struct hstate *h = hstate_vma(vma); struct page *old_page; struct folio *new_folio; @@ -5488,6 +5533,17 @@ static vm_fault_t hugetlb_wp(struct mm_struct *mm, struct vm_area_struct *vma, struct mmu_notifier_range range; /* + * Never handle CoW for uffd-wp protected pages. It should be only + * handled when the uffd-wp protection is removed. + * + * Note that only the CoW optimization path (in hugetlb_no_page()) + * can trigger this, because hugetlb_fault() will always resolve + * uffd-wp bit first. + */ + if (!unshare && huge_pte_uffd_wp(pte)) + return 0; + + /* * hugetlb does not support FOLL_FORCE-style write faults that keep the * PTE mapped R/O such as maybe_mkwrite() would do. 
*/ @@ -5500,7 +5556,6 @@ static vm_fault_t hugetlb_wp(struct mm_struct *mm, struct vm_area_struct *vma, return 0; } - pte = huge_ptep_get(ptep); old_page = pte_page(pte); delayacct_wpcopy_start(); diff --git a/mm/kfence/Makefile b/mm/kfence/Makefile index 0bb95728a784..2de2a58d11a1 100644 --- a/mm/kfence/Makefile +++ b/mm/kfence/Makefile @@ -2,5 +2,5 @@ obj-y := core.o report.o -CFLAGS_kfence_test.o := -g -fno-omit-frame-pointer -fno-optimize-sibling-calls +CFLAGS_kfence_test.o := -fno-omit-frame-pointer -fno-optimize-sibling-calls obj-$(CONFIG_KFENCE_KUNIT_TEST) += kfence_test.o diff --git a/mm/kfence/core.c b/mm/kfence/core.c index 5349c37a5dac..7d01a2c76e80 100644 --- a/mm/kfence/core.c +++ b/mm/kfence/core.c @@ -556,15 +556,11 @@ static unsigned long kfence_init_pool(void) * enters __slab_free() slow-path. */ for (i = 0; i < KFENCE_POOL_SIZE / PAGE_SIZE; i++) { - struct slab *slab = page_slab(&pages[i]); + struct slab *slab = page_slab(nth_page(pages, i)); if (!i || (i % 2)) continue; - /* Verify we do not have a compound head page. */ - if (WARN_ON(compound_head(&pages[i]) != &pages[i])) - return addr; - __folio_set_slab(slab_folio(slab)); #ifdef CONFIG_MEMCG slab->memcg_data = (unsigned long)&kfence_metadata[i / 2 - 1].objcg | @@ -597,12 +593,26 @@ static unsigned long kfence_init_pool(void) /* Protect the right redzone. */ if (unlikely(!kfence_protect(addr + PAGE_SIZE))) - return addr; + goto reset_slab; addr += 2 * PAGE_SIZE; } return 0; + +reset_slab: + for (i = 0; i < KFENCE_POOL_SIZE / PAGE_SIZE; i++) { + struct slab *slab = page_slab(nth_page(pages, i)); + + if (!i || (i % 2)) + continue; +#ifdef CONFIG_MEMCG + slab->memcg_data = 0; +#endif + __folio_clear_slab(slab_folio(slab)); + } + + return addr; } static bool __init kfence_init_pool_early(void) @@ -632,16 +642,6 @@ static bool __init kfence_init_pool_early(void) * fails for the first page, and therefore expect addr==__kfence_pool in * most failure cases. 
*/ - for (char *p = (char *)addr; p < __kfence_pool + KFENCE_POOL_SIZE; p += PAGE_SIZE) { - struct slab *slab = virt_to_slab(p); - - if (!slab) - continue; -#ifdef CONFIG_MEMCG - slab->memcg_data = 0; -#endif - __folio_clear_slab(slab_folio(slab)); - } memblock_free_late(__pa(addr), KFENCE_POOL_SIZE - (addr - (unsigned long)__kfence_pool)); __kfence_pool = NULL; return false; @@ -726,10 +726,14 @@ static const struct seq_operations objects_sops = { }; DEFINE_SEQ_ATTRIBUTE(objects); -static int __init kfence_debugfs_init(void) +static int kfence_debugfs_init(void) { - struct dentry *kfence_dir = debugfs_create_dir("kfence", NULL); + struct dentry *kfence_dir; + + if (!READ_ONCE(kfence_enabled)) + return 0; + kfence_dir = debugfs_create_dir("kfence", NULL); debugfs_create_file("stats", 0444, kfence_dir, NULL, &stats_fops); debugfs_create_file("objects", 0400, kfence_dir, NULL, &objects_fops); return 0; @@ -814,6 +818,10 @@ void __init kfence_alloc_pool(void) if (!kfence_sample_interval) return; + /* if the pool has already been initialized by arch, skip the below. 
*/ + if (__kfence_pool) + return; + __kfence_pool = memblock_alloc(KFENCE_POOL_SIZE, PAGE_SIZE); if (!__kfence_pool) @@ -883,6 +891,8 @@ static int kfence_init_late(void) } kfence_init_enable(); + kfence_debugfs_init(); + return 0; } diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 92e6f56a932d..0ec69b96b497 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -572,6 +572,10 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, result = SCAN_PTE_NON_PRESENT; goto out; } + if (pte_uffd_wp(pteval)) { + result = SCAN_PTE_UFFD_WP; + goto out; + } page = vm_normal_page(vma, address, pteval); if (unlikely(!page) || unlikely(is_zone_device_page(page))) { result = SCAN_PAGE_NULL; diff --git a/mm/kmsan/hooks.c b/mm/kmsan/hooks.c index 3807502766a3..ec0da72e65aa 100644 --- a/mm/kmsan/hooks.c +++ b/mm/kmsan/hooks.c @@ -148,35 +148,74 @@ void kmsan_vunmap_range_noflush(unsigned long start, unsigned long end) * into the virtual memory. If those physical pages already had shadow/origin, * those are ignored. 
*/ -void kmsan_ioremap_page_range(unsigned long start, unsigned long end, - phys_addr_t phys_addr, pgprot_t prot, - unsigned int page_shift) +int kmsan_ioremap_page_range(unsigned long start, unsigned long end, + phys_addr_t phys_addr, pgprot_t prot, + unsigned int page_shift) { gfp_t gfp_mask = GFP_KERNEL | __GFP_ZERO; struct page *shadow, *origin; unsigned long off = 0; - int nr; + int nr, err = 0, clean = 0, mapped; if (!kmsan_enabled || kmsan_in_runtime()) - return; + return 0; nr = (end - start) / PAGE_SIZE; kmsan_enter_runtime(); - for (int i = 0; i < nr; i++, off += PAGE_SIZE) { + for (int i = 0; i < nr; i++, off += PAGE_SIZE, clean = i) { shadow = alloc_pages(gfp_mask, 1); origin = alloc_pages(gfp_mask, 1); - __vmap_pages_range_noflush( + if (!shadow || !origin) { + err = -ENOMEM; + goto ret; + } + mapped = __vmap_pages_range_noflush( vmalloc_shadow(start + off), vmalloc_shadow(start + off + PAGE_SIZE), prot, &shadow, PAGE_SHIFT); - __vmap_pages_range_noflush( + if (mapped) { + err = mapped; + goto ret; + } + shadow = NULL; + mapped = __vmap_pages_range_noflush( vmalloc_origin(start + off), vmalloc_origin(start + off + PAGE_SIZE), prot, &origin, PAGE_SHIFT); + if (mapped) { + __vunmap_range_noflush( + vmalloc_shadow(start + off), + vmalloc_shadow(start + off + PAGE_SIZE)); + err = mapped; + goto ret; + } + origin = NULL; + } + /* Page mapping loop finished normally, nothing to clean up. */ + clean = 0; + +ret: + if (clean > 0) { + /* + * Something went wrong. Clean up shadow/origin pages allocated + * on the last loop iteration, then delete mappings created + * during the previous iterations. 
+ */ + if (shadow) + __free_pages(shadow, 1); + if (origin) + __free_pages(origin, 1); + __vunmap_range_noflush( + vmalloc_shadow(start), + vmalloc_shadow(start + clean * PAGE_SIZE)); + __vunmap_range_noflush( + vmalloc_origin(start), + vmalloc_origin(start + clean * PAGE_SIZE)); } flush_cache_vmap(vmalloc_shadow(start), vmalloc_shadow(end)); flush_cache_vmap(vmalloc_origin(start), vmalloc_origin(end)); kmsan_leave_runtime(); + return err; } void kmsan_iounmap_page_range(unsigned long start, unsigned long end) diff --git a/mm/kmsan/shadow.c b/mm/kmsan/shadow.c index a787c04e9583..b8bb95eea5e3 100644 --- a/mm/kmsan/shadow.c +++ b/mm/kmsan/shadow.c @@ -216,27 +216,29 @@ void kmsan_free_page(struct page *page, unsigned int order) kmsan_leave_runtime(); } -void kmsan_vmap_pages_range_noflush(unsigned long start, unsigned long end, - pgprot_t prot, struct page **pages, - unsigned int page_shift) +int kmsan_vmap_pages_range_noflush(unsigned long start, unsigned long end, + pgprot_t prot, struct page **pages, + unsigned int page_shift) { unsigned long shadow_start, origin_start, shadow_end, origin_end; struct page **s_pages, **o_pages; - int nr, mapped; + int nr, mapped, err = 0; if (!kmsan_enabled) - return; + return 0; shadow_start = vmalloc_meta((void *)start, KMSAN_META_SHADOW); shadow_end = vmalloc_meta((void *)end, KMSAN_META_SHADOW); if (!shadow_start) - return; + return 0; nr = (end - start) / PAGE_SIZE; s_pages = kcalloc(nr, sizeof(*s_pages), GFP_KERNEL); o_pages = kcalloc(nr, sizeof(*o_pages), GFP_KERNEL); - if (!s_pages || !o_pages) + if (!s_pages || !o_pages) { + err = -ENOMEM; goto ret; + } for (int i = 0; i < nr; i++) { s_pages[i] = shadow_page_for(pages[i]); o_pages[i] = origin_page_for(pages[i]); @@ -249,10 +251,16 @@ void kmsan_vmap_pages_range_noflush(unsigned long start, unsigned long end, kmsan_enter_runtime(); mapped = __vmap_pages_range_noflush(shadow_start, shadow_end, prot, s_pages, page_shift); - KMSAN_WARN_ON(mapped); + if (mapped) { + err = 
mapped; + goto ret; + } mapped = __vmap_pages_range_noflush(origin_start, origin_end, prot, o_pages, page_shift); - KMSAN_WARN_ON(mapped); + if (mapped) { + err = mapped; + goto ret; + } kmsan_leave_runtime(); flush_tlb_kernel_range(shadow_start, shadow_end); flush_tlb_kernel_range(origin_start, origin_end); @@ -262,6 +270,7 @@ void kmsan_vmap_pages_range_noflush(unsigned long start, unsigned long end, ret: kfree(s_pages); kfree(o_pages); + return err; } /* Allocate metadata for pages allocated at boot time. */ @@ -988,9 +988,15 @@ static int unmerge_and_remove_all_rmap_items(void) mm = mm_slot->slot.mm; mmap_read_lock(mm); + + /* + * Exit right away if mm is exiting to avoid lockdep issue in + * the maple tree + */ + if (ksm_test_exit(mm)) + goto mm_exiting; + for_each_vma(vmi, vma) { - if (ksm_test_exit(mm)) - break; if (!(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma) continue; err = unmerge_ksm_pages(vma, @@ -999,6 +1005,7 @@ static int unmerge_and_remove_all_rmap_items(void) goto error; } +mm_exiting: remove_trailing_rmap_items(&mm_slot->rmap_list); mmap_read_unlock(mm); diff --git a/mm/maccess.c b/mm/maccess.c index 074f6b086671..518a25667323 100644 --- a/mm/maccess.c +++ b/mm/maccess.c @@ -5,6 +5,7 @@ #include <linux/export.h> #include <linux/mm.h> #include <linux/uaccess.h> +#include <asm/tlb.h> bool __weak copy_from_kernel_nofault_allowed(const void *unsafe_src, size_t size) @@ -113,11 +114,16 @@ Efault: long copy_from_user_nofault(void *dst, const void __user *src, size_t size) { long ret = -EFAULT; - if (access_ok(src, size)) { - pagefault_disable(); - ret = __copy_from_user_inatomic(dst, src, size); - pagefault_enable(); - } + + if (!__access_ok(src, size)) + return ret; + + if (!nmi_uaccess_okay()) + return ret; + + pagefault_disable(); + ret = __copy_from_user_inatomic(dst, src, size); + pagefault_enable(); if (ret) return -EFAULT; diff --git a/mm/madvise.c b/mm/madvise.c index 340125d08c03..9f389c5304d2 100644 --- a/mm/madvise.c +++ b/mm/madvise.c 
@@ -1456,7 +1456,7 @@ SYSCALL_DEFINE5(process_madvise, int, pidfd, const struct iovec __user *, vec, size_t, vlen, int, behavior, unsigned int, flags) { ssize_t ret; - struct iovec iovstack[UIO_FASTIOV], iovec; + struct iovec iovstack[UIO_FASTIOV]; struct iovec *iov = iovstack; struct iov_iter iter; struct task_struct *task; @@ -1503,12 +1503,11 @@ SYSCALL_DEFINE5(process_madvise, int, pidfd, const struct iovec __user *, vec, total_len = iov_iter_count(&iter); while (iov_iter_count(&iter)) { - iovec = iov_iter_iovec(&iter); - ret = do_madvise(mm, (unsigned long)iovec.iov_base, - iovec.iov_len, behavior); + ret = do_madvise(mm, (unsigned long)iter_iov_addr(&iter), + iter_iov_len(&iter), behavior); if (ret < 0) break; - iov_iter_advance(&iter, iovec.iov_len); + iov_iter_advance(&iter, iter_iov_len(&iter)); } ret = (total_len - iov_iter_count(&iter)) ? : ret; diff --git a/mm/memory-failure.c b/mm/memory-failure.c index fae9baf3be16..10e60b6b2447 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -62,13 +62,14 @@ #include <linux/page-isolation.h> #include <linux/pagewalk.h> #include <linux/shmem_fs.h> +#include <linux/sysctl.h> #include "swap.h" #include "internal.h" #include "ras/ras_event.h" -int sysctl_memory_failure_early_kill __read_mostly = 0; +static int sysctl_memory_failure_early_kill __read_mostly; -int sysctl_memory_failure_recovery __read_mostly = 1; +static int sysctl_memory_failure_recovery __read_mostly = 1; atomic_long_t num_poisoned_pages __read_mostly = ATOMIC_LONG_INIT(0); @@ -122,6 +123,37 @@ const struct attribute_group memory_failure_attr_group = { .attrs = memory_failure_attr, }; +#ifdef CONFIG_SYSCTL +static struct ctl_table memory_failure_table[] = { + { + .procname = "memory_failure_early_kill", + .data = &sysctl_memory_failure_early_kill, + .maxlen = sizeof(sysctl_memory_failure_early_kill), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, + { + .procname = 
"memory_failure_recovery", + .data = &sysctl_memory_failure_recovery, + .maxlen = sizeof(sysctl_memory_failure_recovery), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, + { } +}; + +static int __init memory_failure_sysctl_init(void) +{ + register_sysctl_init("vm", memory_failure_table); + return 0; +} +late_initcall(memory_failure_sysctl_init); +#endif /* CONFIG_SYSCTL */ + /* * Return values: * 1: the page is dissolved (if needed) and taken off from buddy, diff --git a/mm/memory.c b/mm/memory.c index f456f3b5049c..01a23ad48a04 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3563,8 +3563,21 @@ static vm_fault_t remove_device_exclusive_entry(struct vm_fault *vmf) struct vm_area_struct *vma = vmf->vma; struct mmu_notifier_range range; - if (!folio_lock_or_retry(folio, vma->vm_mm, vmf->flags)) + /* + * We need a reference to lock the folio because we don't hold + * the PTL so a racing thread can remove the device-exclusive + * entry and unmap it. If the folio is free the entry must + * have been removed already. If it happens to have already + * been re-allocated after being freed all we do is lock and + * unlock it. + */ + if (!folio_try_get(folio)) + return 0; + + if (!folio_lock_or_retry(folio, vma->vm_mm, vmf->flags)) { + folio_put(folio); return VM_FAULT_RETRY; + } mmu_notifier_range_init_owner(&range, MMU_NOTIFY_EXCLUSIVE, 0, vma->vm_mm, vmf->address & PAGE_MASK, (vmf->address & PAGE_MASK) + PAGE_SIZE, NULL); @@ -3577,6 +3590,7 @@ static vm_fault_t remove_device_exclusive_entry(struct vm_fault *vmf) pte_unmap_unlock(vmf->pte, vmf->ptl); folio_unlock(folio); + folio_put(folio); mmu_notifier_invalidate_range_end(&range); return 0; diff --git a/mm/mempolicy.c b/mm/mempolicy.c index a256a241fd1d..2068b594dc88 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -790,61 +790,50 @@ static int vma_replace_policy(struct vm_area_struct *vma, return err; } -/* Step 2: apply policy to a range and do splits. 
*/ -static int mbind_range(struct mm_struct *mm, unsigned long start, - unsigned long end, struct mempolicy *new_pol) +/* Split or merge the VMA (if required) and apply the new policy */ +static int mbind_range(struct vma_iterator *vmi, struct vm_area_struct *vma, + struct vm_area_struct **prev, unsigned long start, + unsigned long end, struct mempolicy *new_pol) { - VMA_ITERATOR(vmi, mm, start); - struct vm_area_struct *prev; - struct vm_area_struct *vma; - int err = 0; + struct vm_area_struct *merged; + unsigned long vmstart, vmend; pgoff_t pgoff; + int err; - prev = vma_prev(&vmi); - vma = vma_find(&vmi, end); - if (WARN_ON(!vma)) + vmend = min(end, vma->vm_end); + if (start > vma->vm_start) { + *prev = vma; + vmstart = start; + } else { + vmstart = vma->vm_start; + } + + if (mpol_equal(vma_policy(vma), new_pol)) return 0; - if (start > vma->vm_start) - prev = vma; - - do { - unsigned long vmstart = max(start, vma->vm_start); - unsigned long vmend = min(end, vma->vm_end); - - if (mpol_equal(vma_policy(vma), new_pol)) - goto next; - - pgoff = vma->vm_pgoff + - ((vmstart - vma->vm_start) >> PAGE_SHIFT); - prev = vma_merge(&vmi, mm, prev, vmstart, vmend, vma->vm_flags, - vma->anon_vma, vma->vm_file, pgoff, - new_pol, vma->vm_userfaultfd_ctx, - anon_vma_name(vma)); - if (prev) { - vma = prev; - goto replace; - } - if (vma->vm_start != vmstart) { - err = split_vma(&vmi, vma, vmstart, 1); - if (err) - goto out; - } - if (vma->vm_end != vmend) { - err = split_vma(&vmi, vma, vmend, 0); - if (err) - goto out; - } -replace: - err = vma_replace_policy(vma, new_pol); + pgoff = vma->vm_pgoff + ((vmstart - vma->vm_start) >> PAGE_SHIFT); + merged = vma_merge(vmi, vma->vm_mm, *prev, vmstart, vmend, vma->vm_flags, + vma->anon_vma, vma->vm_file, pgoff, new_pol, + vma->vm_userfaultfd_ctx, anon_vma_name(vma)); + if (merged) { + *prev = merged; + return vma_replace_policy(merged, new_pol); + } + + if (vma->vm_start != vmstart) { + err = split_vma(vmi, vma, vmstart, 1); if (err) - 
goto out; -next: - prev = vma; - } for_each_vma_range(vmi, vma, end); + return err; + } -out: - return err; + if (vma->vm_end != vmend) { + err = split_vma(vmi, vma, vmend, 0); + if (err) + return err; + } + + *prev = vma; + return vma_replace_policy(vma, new_pol); } /* Set the process memory policy */ @@ -1259,6 +1248,8 @@ static long do_mbind(unsigned long start, unsigned long len, nodemask_t *nmask, unsigned long flags) { struct mm_struct *mm = current->mm; + struct vm_area_struct *vma, *prev; + struct vma_iterator vmi; struct mempolicy *new; unsigned long end; int err; @@ -1328,7 +1319,13 @@ static long do_mbind(unsigned long start, unsigned long len, goto up_out; } - err = mbind_range(mm, start, end, new); + vma_iter_init(&vmi, mm, start); + prev = vma_prev(&vmi); + for_each_vma_range(vmi, vma, end) { + err = mbind_range(&vmi, vma, &prev, start, end, new); + if (err) + break; + } if (!err) { int nr_failed = 0; @@ -1489,10 +1486,8 @@ SYSCALL_DEFINE4(set_mempolicy_home_node, unsigned long, start, unsigned long, le unsigned long, home_node, unsigned long, flags) { struct mm_struct *mm = current->mm; - struct vm_area_struct *vma; + struct vm_area_struct *vma, *prev; struct mempolicy *new, *old; - unsigned long vmstart; - unsigned long vmend; unsigned long end; int err = -ENOENT; VMA_ITERATOR(vmi, mm, start); @@ -1521,6 +1516,7 @@ SYSCALL_DEFINE4(set_mempolicy_home_node, unsigned long, start, unsigned long, le if (end == start) return 0; mmap_write_lock(mm); + prev = vma_prev(&vmi); for_each_vma_range(vmi, vma, end) { /* * If any vma in the range got policy other than MPOL_BIND @@ -1541,9 +1537,7 @@ SYSCALL_DEFINE4(set_mempolicy_home_node, unsigned long, start, unsigned long, le } new->home_node = home_node; - vmstart = max(start, vma->vm_start); - vmend = min(end, vma->vm_end); - err = mbind_range(mm, vmstart, vmend, new); + err = mbind_range(&vmi, vma, &prev, start, end, new); mpol_put(new); if (err) break; diff --git a/mm/migrate.c b/mm/migrate.c index 
98f1c11197a8..db3f154446af 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1112,9 +1112,8 @@ static void migrate_folio_done(struct folio *src, /* Obtain the lock on page, remove all ptes. */ static int migrate_folio_unmap(new_page_t get_new_page, free_page_t put_new_page, unsigned long private, struct folio *src, - struct folio **dstp, int force, bool avoid_force_lock, - enum migrate_mode mode, enum migrate_reason reason, - struct list_head *ret) + struct folio **dstp, enum migrate_mode mode, + enum migrate_reason reason, struct list_head *ret) { struct folio *dst; int rc = -EAGAIN; @@ -1144,7 +1143,7 @@ static int migrate_folio_unmap(new_page_t get_new_page, free_page_t put_new_page dst->private = NULL; if (!folio_trylock(src)) { - if (!force || mode == MIGRATE_ASYNC) + if (mode == MIGRATE_ASYNC) goto out; /* @@ -1163,17 +1162,6 @@ static int migrate_folio_unmap(new_page_t get_new_page, free_page_t put_new_page if (current->flags & PF_MEMALLOC) goto out; - /* - * We have locked some folios and are going to wait to lock - * this folio. To avoid a potential deadlock, let's bail - * out and not do that. The locked folios will be moved and - * unlocked, then we can wait to lock this folio. - */ - if (avoid_force_lock) { - rc = -EDEADLOCK; - goto out; - } - folio_lock(src); } locked = true; @@ -1193,8 +1181,6 @@ static int migrate_folio_unmap(new_page_t get_new_page, free_page_t put_new_page rc = -EBUSY; goto out; } - if (!force) - goto out; folio_wait_writeback(src); } @@ -1253,7 +1239,7 @@ static int migrate_folio_unmap(new_page_t get_new_page, free_page_t put_new_page /* Establish migration ptes */ VM_BUG_ON_FOLIO(folio_test_anon(src) && !folio_test_ksm(src) && !anon_vma, src); - try_to_migrate(src, TTU_BATCH_FLUSH); + try_to_migrate(src, mode == MIGRATE_ASYNC ? TTU_BATCH_FLUSH : 0); page_was_mapped = 1; } @@ -1267,7 +1253,7 @@ out: * A folio that has not been unmapped will be restored to * right list unless we want to retry. 
*/ - if (rc == -EAGAIN || rc == -EDEADLOCK) + if (rc == -EAGAIN) ret = NULL; migrate_folio_undo_src(src, page_was_mapped, anon_vma, locked, ret); @@ -1508,6 +1494,9 @@ static inline int try_split_folio(struct folio *folio, struct list_head *split_f #define NR_MAX_BATCHED_MIGRATION 512 #endif #define NR_MAX_MIGRATE_PAGES_RETRY 10 +#define NR_MAX_MIGRATE_ASYNC_RETRY 3 +#define NR_MAX_MIGRATE_SYNC_RETRY \ + (NR_MAX_MIGRATE_PAGES_RETRY - NR_MAX_MIGRATE_ASYNC_RETRY) struct migrate_pages_stats { int nr_succeeded; /* Normal and large folios migrated successfully, in @@ -1618,13 +1607,19 @@ static int migrate_hugetlbs(struct list_head *from, new_page_t get_new_page, /* * migrate_pages_batch() first unmaps folios in the from list as many as * possible, then move the unmapped folios. + * + * We only batch migration if mode == MIGRATE_ASYNC to avoid to wait a + * lock or bit when we have locked more than one folio. Which may cause + * deadlock (e.g., for loop device). So, if mode != MIGRATE_ASYNC, the + * length of the from list must be <= 1. 
*/ static int migrate_pages_batch(struct list_head *from, new_page_t get_new_page, free_page_t put_new_page, unsigned long private, enum migrate_mode mode, int reason, struct list_head *ret_folios, - struct migrate_pages_stats *stats) + struct list_head *split_folios, struct migrate_pages_stats *stats, + int nr_pass) { - int retry; + int retry = 1; int large_retry = 1; int thp_retry = 1; int nr_failed = 0; @@ -1634,21 +1629,15 @@ static int migrate_pages_batch(struct list_head *from, new_page_t get_new_page, bool is_large = false; bool is_thp = false; struct folio *folio, *folio2, *dst = NULL, *dst2; - int rc, rc_saved, nr_pages; - LIST_HEAD(split_folios); + int rc, rc_saved = 0, nr_pages; LIST_HEAD(unmap_folios); LIST_HEAD(dst_folios); bool nosplit = (reason == MR_NUMA_MISPLACED); - bool no_split_folio_counting = false; - bool avoid_force_lock; -retry: - rc_saved = 0; - avoid_force_lock = false; - retry = 1; - for (pass = 0; - pass < NR_MAX_MIGRATE_PAGES_RETRY && (retry || large_retry); - pass++) { + VM_WARN_ON_ONCE(mode != MIGRATE_ASYNC && + !list_empty(from) && !list_is_singular(from)); + + for (pass = 0; pass < nr_pass && (retry || large_retry); pass++) { retry = 0; large_retry = 0; thp_retry = 0; @@ -1679,7 +1668,7 @@ retry: if (!thp_migration_supported() && is_thp) { nr_large_failed++; stats->nr_thp_failed++; - if (!try_split_folio(folio, &split_folios)) { + if (!try_split_folio(folio, split_folios)) { stats->nr_thp_split++; continue; } @@ -1689,15 +1678,13 @@ retry: } rc = migrate_folio_unmap(get_new_page, put_new_page, private, - folio, &dst, pass > 2, avoid_force_lock, - mode, reason, ret_folios); + folio, &dst, mode, reason, ret_folios); /* * The rules are: * Success: folio will be freed * Unmap: folio will be put on unmap_folios list, * dst folio put on dst_folios list * -EAGAIN: stay on the from list - * -EDEADLOCK: stay on the from list * -ENOMEM: stay on the from list * Other errno: put on ret_folios list */ @@ -1712,7 +1699,7 @@ retry: 
stats->nr_thp_failed += is_thp; /* Large folio NUMA faulting doesn't split to retry. */ if (!nosplit) { - int ret = try_split_folio(folio, &split_folios); + int ret = try_split_folio(folio, split_folios); if (!ret) { stats->nr_thp_split += is_thp; @@ -1729,18 +1716,11 @@ retry: break; } } - } else if (!no_split_folio_counting) { + } else { nr_failed++; } stats->nr_failed_pages += nr_pages + nr_retry_pages; - /* - * There might be some split folios of fail-to-migrate large - * folios left in split_folios list. Move them to ret_folios - * list so that they could be put back to the right list by - * the caller otherwise the folio refcnt will be leaked. - */ - list_splice_init(&split_folios, ret_folios); /* nr_failed isn't updated for not used */ nr_large_failed += large_retry; stats->nr_thp_failed += thp_retry; @@ -1749,19 +1729,11 @@ retry: goto out; else goto move; - case -EDEADLOCK: - /* - * The folio cannot be locked for potential deadlock. - * Go move (and unlock) all locked folios. Then we can - * try again. - */ - rc_saved = rc; - goto move; case -EAGAIN: if (is_large) { large_retry++; thp_retry += is_thp; - } else if (!no_split_folio_counting) { + } else { retry++; } nr_retry_pages += nr_pages; @@ -1771,11 +1743,6 @@ retry: stats->nr_thp_succeeded += is_thp; break; case MIGRATEPAGE_UNMAP: - /* - * We have locked some folios, don't force lock - * to avoid deadlock. 
- */ - avoid_force_lock = true; list_move_tail(&folio->lru, &unmap_folios); list_add_tail(&dst->lru, &dst_folios); break; @@ -1789,7 +1756,7 @@ retry: if (is_large) { nr_large_failed++; stats->nr_thp_failed += is_thp; - } else if (!no_split_folio_counting) { + } else { nr_failed++; } @@ -1807,9 +1774,7 @@ move: try_to_unmap_flush(); retry = 1; - for (pass = 0; - pass < NR_MAX_MIGRATE_PAGES_RETRY && (retry || large_retry); - pass++) { + for (pass = 0; pass < nr_pass && (retry || large_retry); pass++) { retry = 0; large_retry = 0; thp_retry = 0; @@ -1838,7 +1803,7 @@ move: if (is_large) { large_retry++; thp_retry += is_thp; - } else if (!no_split_folio_counting) { + } else { retry++; } nr_retry_pages += nr_pages; @@ -1851,7 +1816,7 @@ move: if (is_large) { nr_large_failed++; stats->nr_thp_failed += is_thp; - } else if (!no_split_folio_counting) { + } else { nr_failed++; } @@ -1888,30 +1853,52 @@ out: dst2 = list_next_entry(dst, lru); } - /* - * Try to migrate split folios of fail-to-migrate large folios, no - * nr_failed counting in this round, since all split folios of a - * large folio is counted as 1 failure in the first round. - */ - if (rc >= 0 && !list_empty(&split_folios)) { - /* - * Move non-migrated folios (after NR_MAX_MIGRATE_PAGES_RETRY - * retries) to ret_folios to avoid migrating them again. 
- */ - list_splice_init(from, ret_folios); - list_splice_init(&split_folios, from); - no_split_folio_counting = true; - goto retry; - } + return rc; +} +static int migrate_pages_sync(struct list_head *from, new_page_t get_new_page, + free_page_t put_new_page, unsigned long private, + enum migrate_mode mode, int reason, struct list_head *ret_folios, + struct list_head *split_folios, struct migrate_pages_stats *stats) +{ + int rc, nr_failed = 0; + LIST_HEAD(folios); + struct migrate_pages_stats astats; + + memset(&astats, 0, sizeof(astats)); + /* Try to migrate in batch with MIGRATE_ASYNC mode firstly */ + rc = migrate_pages_batch(from, get_new_page, put_new_page, private, MIGRATE_ASYNC, + reason, &folios, split_folios, &astats, + NR_MAX_MIGRATE_ASYNC_RETRY); + stats->nr_succeeded += astats.nr_succeeded; + stats->nr_thp_succeeded += astats.nr_thp_succeeded; + stats->nr_thp_split += astats.nr_thp_split; + if (rc < 0) { + stats->nr_failed_pages += astats.nr_failed_pages; + stats->nr_thp_failed += astats.nr_thp_failed; + list_splice_tail(&folios, ret_folios); + return rc; + } + stats->nr_thp_failed += astats.nr_thp_split; + nr_failed += astats.nr_thp_split; /* - * We have unlocked all locked folios, so we can force lock now, let's - * try again. + * Fall back to migrate all failed folios one by one synchronously. 
All + * failed folios except split THPs will be retried, so their failure + * isn't counted */ - if (rc == -EDEADLOCK) - goto retry; + list_splice_tail_init(&folios, from); + while (!list_empty(from)) { + list_move(from->next, &folios); + rc = migrate_pages_batch(&folios, get_new_page, put_new_page, + private, mode, reason, ret_folios, + split_folios, stats, NR_MAX_MIGRATE_SYNC_RETRY); + list_splice_tail_init(&folios, ret_folios); + if (rc < 0) + return rc; + nr_failed += rc; + } - return rc; + return nr_failed; } /* @@ -1949,6 +1936,7 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page, struct folio *folio, *folio2; LIST_HEAD(folios); LIST_HEAD(ret_folios); + LIST_HEAD(split_folios); struct migrate_pages_stats stats; trace_mm_migrate_pages_start(mode, reason); @@ -1959,6 +1947,7 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page, mode, reason, &stats, &ret_folios); if (rc_gather < 0) goto out; + again: nr_pages = 0; list_for_each_entry_safe(folio, folio2, from, lru) { @@ -1969,20 +1958,36 @@ again: } nr_pages += folio_nr_pages(folio); - if (nr_pages > NR_MAX_BATCHED_MIGRATION) + if (nr_pages >= NR_MAX_BATCHED_MIGRATION) break; } - if (nr_pages > NR_MAX_BATCHED_MIGRATION) - list_cut_before(&folios, from, &folio->lru); + if (nr_pages >= NR_MAX_BATCHED_MIGRATION) + list_cut_before(&folios, from, &folio2->lru); else list_splice_init(from, &folios); - rc = migrate_pages_batch(&folios, get_new_page, put_new_page, private, - mode, reason, &ret_folios, &stats); + if (mode == MIGRATE_ASYNC) + rc = migrate_pages_batch(&folios, get_new_page, put_new_page, private, + mode, reason, &ret_folios, &split_folios, &stats, + NR_MAX_MIGRATE_PAGES_RETRY); + else + rc = migrate_pages_sync(&folios, get_new_page, put_new_page, private, + mode, reason, &ret_folios, &split_folios, &stats); list_splice_tail_init(&folios, &ret_folios); if (rc < 0) { rc_gather = rc; + list_splice_tail(&split_folios, &ret_folios); goto out; } + if (!list_empty(&split_folios)) 
{ + /* + * Failure isn't counted since all split folios of a large folio + * is counted as 1 failure already. And, we only try to migrate + * with minimal effort, force MIGRATE_ASYNC mode and retry once. + */ + migrate_pages_batch(&split_folios, get_new_page, put_new_page, private, + MIGRATE_ASYNC, reason, &ret_folios, NULL, &stats, 1); + list_splice_tail_init(&split_folios, &ret_folios); + } rc_gather += rc; if (!list_empty(from)) goto again; diff --git a/mm/mincore.c b/mm/mincore.c index cd69b9db0081..d359650b0f75 100644 --- a/mm/mincore.c +++ b/mm/mincore.c @@ -33,7 +33,7 @@ static int mincore_hugetlb(pte_t *pte, unsigned long hmask, unsigned long addr, * Hugepages under user process are always in RAM and never * swapped out, but theoretically it needs to be checked. */ - present = pte && !huge_pte_none(huge_ptep_get(pte)); + present = pte && !huge_pte_none_mostly(huge_ptep_get(pte)); for (; addr != end; vec++, addr += PAGE_SIZE) *vec = present; walk->private = vec; diff --git a/mm/mmap.c b/mm/mmap.c index 740b54be3ed4..eefa6f0cda28 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -978,7 +978,7 @@ struct vm_area_struct *vma_merge(struct vma_iterator *vmi, struct mm_struct *mm, vma = next; /* case 3 */ vma_start = addr; vma_end = next->vm_end; - vma_pgoff = mid->vm_pgoff; + vma_pgoff = next->vm_pgoff - pglen; err = 0; if (mid != next) { /* case 8 */ remove = mid; @@ -1518,7 +1518,8 @@ static inline int accountable_mapping(struct file *file, vm_flags_t vm_flags) */ static unsigned long unmapped_area(struct vm_unmapped_area_info *info) { - unsigned long length, gap; + unsigned long length, gap, low_limit; + struct vm_area_struct *tmp; MA_STATE(mas, ¤t->mm->mm_mt, 0, 0); @@ -1527,12 +1528,29 @@ static unsigned long unmapped_area(struct vm_unmapped_area_info *info) if (length < info->length) return -ENOMEM; - if (mas_empty_area(&mas, info->low_limit, info->high_limit - 1, - length)) + low_limit = info->low_limit; +retry: + if (mas_empty_area(&mas, low_limit, 
info->high_limit - 1, length)) return -ENOMEM; gap = mas.index; gap += (info->align_offset - gap) & info->align_mask; + tmp = mas_next(&mas, ULONG_MAX); + if (tmp && (tmp->vm_flags & VM_GROWSDOWN)) { /* Avoid prev check if possible */ + if (vm_start_gap(tmp) < gap + length - 1) { + low_limit = tmp->vm_end; + mas_reset(&mas); + goto retry; + } + } else { + tmp = mas_prev(&mas, 0); + if (tmp && vm_end_gap(tmp) > gap) { + low_limit = vm_end_gap(tmp); + mas_reset(&mas); + goto retry; + } + } + return gap; } @@ -1548,7 +1566,8 @@ static unsigned long unmapped_area(struct vm_unmapped_area_info *info) */ static unsigned long unmapped_area_topdown(struct vm_unmapped_area_info *info) { - unsigned long length, gap; + unsigned long length, gap, high_limit, gap_end; + struct vm_area_struct *tmp; MA_STATE(mas, ¤t->mm->mm_mt, 0, 0); /* Adjust search length to account for worst case alignment overhead */ @@ -1556,12 +1575,31 @@ static unsigned long unmapped_area_topdown(struct vm_unmapped_area_info *info) if (length < info->length) return -ENOMEM; - if (mas_empty_area_rev(&mas, info->low_limit, info->high_limit - 1, + high_limit = info->high_limit; +retry: + if (mas_empty_area_rev(&mas, info->low_limit, high_limit - 1, length)) return -ENOMEM; gap = mas.last + 1 - info->length; gap -= (gap - info->align_offset) & info->align_mask; + gap_end = mas.last; + tmp = mas_next(&mas, ULONG_MAX); + if (tmp && (tmp->vm_flags & VM_GROWSDOWN)) { /* Avoid prev check if possible */ + if (vm_start_gap(tmp) <= gap_end) { + high_limit = vm_start_gap(tmp); + mas_reset(&mas); + goto retry; + } + } else { + tmp = mas_prev(&mas, 0); + if (tmp && vm_end_gap(tmp) > gap) { + high_limit = tmp->vm_start; + mas_reset(&mas); + goto retry; + } + } + return gap; } @@ -2277,7 +2315,7 @@ do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma, int count = 0; int error = -ENOMEM; MA_STATE(mas_detach, &mt_detach, 0, 0); - mt_init_flags(&mt_detach, MT_FLAGS_LOCK_EXTERN); + mt_init_flags(&mt_detach, 
vmi->mas.tree->ma_flags & MT_FLAGS_LOCK_MASK); mt_set_external_lock(&mt_detach, &mm->mmap_lock); /* @@ -2621,12 +2659,7 @@ cannot_expand: if (map_deny_write_exec(vma, vma->vm_flags)) { error = -EACCES; - if (file) - goto close_and_free_vma; - else if (vma->vm_file) - goto unmap_and_free_vma; - else - goto free_vma; + goto close_and_free_vma; } /* Allow architectures to sanity-check the vm_flags */ @@ -3042,6 +3075,7 @@ void exit_mmap(struct mm_struct *mm) */ set_bit(MMF_OOM_SKIP, &mm->flags); mmap_write_lock(mm); + mt_clear_in_rcu(&mm->mm_mt); free_pgtables(&tlb, &mm->mm_mt, vma, FIRST_USER_ADDRESS, USER_PGTABLES_CEILING); tlb_finish_mmu(&tlb); diff --git a/mm/mprotect.c b/mm/mprotect.c index 231929f119d9..36351a00c0e8 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -805,7 +805,7 @@ static int do_mprotect_pkey(unsigned long start, size_t len, if (map_deny_write_exec(vma, newflags)) { error = -EACCES; - goto out; + break; } /* Allow architectures to sanity-check the new flags */ @@ -838,7 +838,7 @@ static int do_mprotect_pkey(unsigned long start, size_t len, } tlb_finish_mmu(&tlb); - if (vma_iter_end(&vmi) < end) + if (!error && vma_iter_end(&vmi) < end) error = -ENOMEM; out: diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 516b1aa247e8..db7943999007 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -2583,46 +2583,6 @@ int do_writepages(struct address_space *mapping, struct writeback_control *wbc) return ret; } -/** - * folio_write_one - write out a single folio and wait on I/O. - * @folio: The folio to write. - * - * The folio must be locked by the caller and will be unlocked upon return. - * - * Note that the mapping's AS_EIO/AS_ENOSPC flags will be cleared when this - * function returns. 
- * - * Return: %0 on success, negative error code otherwise - */ -int folio_write_one(struct folio *folio) -{ - struct address_space *mapping = folio->mapping; - int ret = 0; - struct writeback_control wbc = { - .sync_mode = WB_SYNC_ALL, - .nr_to_write = folio_nr_pages(folio), - }; - - BUG_ON(!folio_test_locked(folio)); - - folio_wait_writeback(folio); - - if (folio_clear_dirty_for_io(folio)) { - folio_get(folio); - ret = mapping->a_ops->writepage(&folio->page, &wbc); - if (ret == 0) - folio_wait_writeback(folio); - folio_put(folio); - } else { - folio_unlock(folio); - } - - if (!ret) - ret = filemap_check_errors(mapping); - return ret; -} -EXPORT_SYMBOL(folio_write_one); - /* * For address_spaces which do not use buffers nor write back. */ diff --git a/mm/page_alloc.c b/mm/page_alloc.c index ac1fc986af44..8e39705c7bdc 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1398,6 +1398,7 @@ static __always_inline bool free_pages_prepare(struct page *page, unsigned int order, bool check_free, fpi_t fpi_flags) { int bad = 0; + bool skip_kasan_poison = should_skip_kasan_poison(page, fpi_flags); bool init = want_init_on_free(); VM_BUG_ON_PAGE(PageTail(page), page); @@ -1470,7 +1471,7 @@ static __always_inline bool free_pages_prepare(struct page *page, * With hardware tag-based KASAN, memory tags must be set before the * page becomes unavailable via debug_pagealloc or arch_free_page. */ - if (!should_skip_kasan_poison(page, fpi_flags)) { + if (!skip_kasan_poison) { kasan_poison_pages(page, order, init); /* Memory is already initialized if KASAN did it internally. */ @@ -6631,7 +6632,21 @@ static void __build_all_zonelists(void *data) int nid; int __maybe_unused cpu; pg_data_t *self = data; + unsigned long flags; + /* + * Explicitly disable this CPU's interrupts before taking seqlock + * to prevent any IRQ handler from calling into the page allocator + * (e.g. GFP_ATOMIC) that could hit zonelist_iter_begin and livelock. 
+ */ + local_irq_save(flags); + /* + * Explicitly disable this CPU's synchronous printk() before taking + * seqlock to prevent any printk() from trying to hold port->lock, for + * tty_insert_flip_string_and_push_buffer() on other CPU might be + * calling kmalloc(GFP_ATOMIC | __GFP_NOWARN) with port->lock held. + */ + printk_deferred_enter(); write_seqlock(&zonelist_update_seq); #ifdef CONFIG_NUMA @@ -6670,6 +6685,8 @@ static void __build_all_zonelists(void *data) } write_sequnlock(&zonelist_update_seq); + printk_deferred_exit(); + local_irq_restore(flags); } static noinline void __init @@ -9449,6 +9466,9 @@ static bool pfn_range_valid_contig(struct zone *z, unsigned long start_pfn, if (PageReserved(page)) return false; + + if (PageHuge(page)) + return false; } return true; } diff --git a/mm/shmem.c b/mm/shmem.c index 448f393d8ab2..b76521ed372d 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -3339,10 +3339,6 @@ static const struct xattr_handler shmem_trusted_xattr_handler = { }; static const struct xattr_handler *shmem_xattr_handlers[] = { -#ifdef CONFIG_TMPFS_POSIX_ACL - &posix_acl_access_xattr_handler, - &posix_acl_default_xattr_handler, -#endif &shmem_security_xattr_handler, &shmem_trusted_xattr_handler, NULL diff --git a/mm/slab.c b/mm/slab.c index dabc2a671fc6..edbe722fb906 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -839,7 +839,7 @@ static int init_cache_node(struct kmem_cache *cachep, int node, gfp_t gfp) return 0; } -#if (defined(CONFIG_NUMA) && defined(CONFIG_MEMORY_HOTPLUG)) || defined(CONFIG_SMP) +#if defined(CONFIG_NUMA) || defined(CONFIG_SMP) /* * Allocates and initializes node for a node on each slab cache, used for * either memory or cpu hotplug. 
If memory is being hot-added, the kmem_cache_node diff --git a/mm/slab.h b/mm/slab.h index 43966aa5fadf..399966b3ce52 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -51,14 +51,6 @@ struct slab { }; unsigned int __unused; -#elif defined(CONFIG_SLOB) - - struct list_head slab_list; - void *__unused_1; - void *freelist; /* first free block */ - long units; - unsigned int __unused_2; - #else #error "Unexpected slab allocator configured" #endif @@ -72,11 +64,7 @@ struct slab { #define SLAB_MATCH(pg, sl) \ static_assert(offsetof(struct page, pg) == offsetof(struct slab, sl)) SLAB_MATCH(flags, __page_flags); -#ifndef CONFIG_SLOB SLAB_MATCH(compound_head, slab_cache); /* Ensure bit 0 is clear */ -#else -SLAB_MATCH(compound_head, slab_list); /* Ensure bit 0 is clear */ -#endif SLAB_MATCH(_refcount, __page_refcount); #ifdef CONFIG_MEMCG SLAB_MATCH(memcg_data, memcg_data); @@ -200,31 +188,6 @@ static inline size_t slab_size(const struct slab *slab) return PAGE_SIZE << slab_order(slab); } -#ifdef CONFIG_SLOB -/* - * Common fields provided in kmem_cache by all slab allocators - * This struct is either used directly by the allocator (SLOB) - * or the allocator must include definitions for all fields - * provided in kmem_cache_common in their definition of kmem_cache. - * - * Once we can do anonymous structs (C11 standard) we could put a - * anonymous struct definition in these allocators so that the - * separate allocations in the kmem_cache structure of SLAB and - * SLUB is no longer needed. 
- */ -struct kmem_cache { - unsigned int object_size;/* The original size of the object */ - unsigned int size; /* The aligned/padded/added on size */ - unsigned int align; /* Alignment as calculated */ - slab_flags_t flags; /* Active flags on the slab */ - const char *name; /* Slab name for sysfs */ - int refcount; /* Use counter */ - void (*ctor)(void *); /* Called on object slot creation */ - struct list_head list; /* List of all slab caches on the system */ -}; - -#endif /* CONFIG_SLOB */ - #ifdef CONFIG_SLAB #include <linux/slab_def.h> #endif @@ -274,7 +237,6 @@ extern const struct kmalloc_info_struct { unsigned int size; } kmalloc_info[]; -#ifndef CONFIG_SLOB /* Kmalloc array related functions */ void setup_kmalloc_cache_index_table(void); void create_kmalloc_caches(slab_flags_t); @@ -286,7 +248,6 @@ void *__kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, int node, size_t orig_size, unsigned long caller); void __kmem_cache_free(struct kmem_cache *s, void *x, unsigned long caller); -#endif gfp_t kmalloc_fix_flags(gfp_t flags); @@ -303,33 +264,16 @@ extern void create_boot_cache(struct kmem_cache *, const char *name, int slab_unmergeable(struct kmem_cache *s); struct kmem_cache *find_mergeable(unsigned size, unsigned align, slab_flags_t flags, const char *name, void (*ctor)(void *)); -#ifndef CONFIG_SLOB struct kmem_cache * __kmem_cache_alias(const char *name, unsigned int size, unsigned int align, slab_flags_t flags, void (*ctor)(void *)); slab_flags_t kmem_cache_flags(unsigned int object_size, slab_flags_t flags, const char *name); -#else -static inline struct kmem_cache * -__kmem_cache_alias(const char *name, unsigned int size, unsigned int align, - slab_flags_t flags, void (*ctor)(void *)) -{ return NULL; } - -static inline slab_flags_t kmem_cache_flags(unsigned int object_size, - slab_flags_t flags, const char *name) -{ - return flags; -} -#endif static inline bool is_kmalloc_cache(struct kmem_cache *s) { -#ifndef CONFIG_SLOB return (s->flags & 
SLAB_KMALLOC); -#else - return false; -#endif } /* Legal flag mask for kmem_cache_create(), for various configurations */ @@ -634,7 +578,6 @@ static inline void memcg_slab_free_hook(struct kmem_cache *s, struct slab *slab, } #endif /* CONFIG_MEMCG_KMEM */ -#ifndef CONFIG_SLOB static inline struct kmem_cache *virt_to_cache(const void *obj) { struct slab *slab; @@ -684,8 +627,6 @@ static inline struct kmem_cache *cache_from_obj(struct kmem_cache *s, void *x) void free_large_kmalloc(struct folio *folio, void *object); -#endif /* CONFIG_SLOB */ - size_t __ksize(const void *objp); static inline size_t slab_ksize(const struct kmem_cache *s) @@ -777,7 +718,6 @@ static inline void slab_post_alloc_hook(struct kmem_cache *s, memcg_slab_post_alloc_hook(s, objcg, flags, size, p); } -#ifndef CONFIG_SLOB /* * The slab lists for all objects. */ @@ -824,7 +764,6 @@ static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node) for (__node = 0; __node < nr_node_ids; __node++) \ if ((__n = get_node(__s, __node))) -#endif #if defined(CONFIG_SLAB) || defined(CONFIG_SLUB_DEBUG) void dump_unreclaimable_slab(void); diff --git a/mm/slab_common.c b/mm/slab_common.c index bf4e777cfe90..607249785c07 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -625,7 +625,6 @@ void kmem_dump_obj(void *object) EXPORT_SYMBOL_GPL(kmem_dump_obj); #endif -#ifndef CONFIG_SLOB /* Create a cache during boot when no slab services are available yet */ void __init create_boot_cache(struct kmem_cache *s, const char *name, unsigned int size, slab_flags_t flags, @@ -990,12 +989,9 @@ EXPORT_SYMBOL(__kmalloc_node_track_caller); /** * kfree - free previously allocated memory - * @object: pointer returned by kmalloc. + * @object: pointer returned by kmalloc() or kmem_cache_alloc() * * If @object is NULL, no operation is performed. - * - * Don't free memory not originally allocated by kmalloc() - * or you will run into trouble. 
*/ void kfree(const void *object) { @@ -1079,7 +1075,6 @@ void *kmalloc_node_trace(struct kmem_cache *s, gfp_t gfpflags, return ret; } EXPORT_SYMBOL(kmalloc_node_trace); -#endif /* !CONFIG_SLOB */ gfp_t kmalloc_fix_flags(gfp_t flags) { diff --git a/mm/slob.c b/mm/slob.c deleted file mode 100644 index fe567fcfa3a3..000000000000 --- a/mm/slob.c +++ /dev/null @@ -1,757 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * SLOB Allocator: Simple List Of Blocks - * - * Matt Mackall <mpm@selenic.com> 12/30/03 - * - * NUMA support by Paul Mundt, 2007. - * - * How SLOB works: - * - * The core of SLOB is a traditional K&R style heap allocator, with - * support for returning aligned objects. The granularity of this - * allocator is as little as 2 bytes, however typically most architectures - * will require 4 bytes on 32-bit and 8 bytes on 64-bit. - * - * The slob heap is a set of linked list of pages from alloc_pages(), - * and within each page, there is a singly-linked list of free blocks - * (slob_t). The heap is grown on demand. To reduce fragmentation, - * heap pages are segregated into three lists, with objects less than - * 256 bytes, objects less than 1024 bytes, and all other objects. - * - * Allocation from heap involves first searching for a page with - * sufficient free blocks (using a next-fit-like approach) followed by - * a first-fit scan of the page. Deallocation inserts objects back - * into the free list in address order, so this is effectively an - * address-ordered first fit. - * - * Above this is an implementation of kmalloc/kfree. Blocks returned - * from kmalloc are prepended with a 4-byte header with the kmalloc size. - * If kmalloc is asked for objects of PAGE_SIZE or larger, it calls - * alloc_pages() directly, allocating compound pages so the page order - * does not have to be separately tracked. - * These objects are detected in kfree() because folio_test_slab() - * is false for them. 
- * - * SLAB is emulated on top of SLOB by simply calling constructors and - * destructors for every SLAB allocation. Objects are returned with the - * 4-byte alignment unless the SLAB_HWCACHE_ALIGN flag is set, in which - * case the low-level allocator will fragment blocks to create the proper - * alignment. Again, objects of page-size or greater are allocated by - * calling alloc_pages(). As SLAB objects know their size, no separate - * size bookkeeping is necessary and there is essentially no allocation - * space overhead, and compound pages aren't needed for multi-page - * allocations. - * - * NUMA support in SLOB is fairly simplistic, pushing most of the real - * logic down to the page allocator, and simply doing the node accounting - * on the upper levels. In the event that a node id is explicitly - * provided, __alloc_pages_node() with the specified node id is used - * instead. The common case (or when the node id isn't explicitly provided) - * will default to the current node, as per numa_node_id(). - * - * Node aware pages are still inserted in to the global freelist, and - * these are scanned for by matching against the node id encoded in the - * page flags. As a result, block allocations that can be satisfied from - * the freelist will only be done so on pages residing on the same node, - * in order to prevent random node placement. - */ - -#include <linux/kernel.h> -#include <linux/slab.h> - -#include <linux/mm.h> -#include <linux/swap.h> /* struct reclaim_state */ -#include <linux/cache.h> -#include <linux/init.h> -#include <linux/export.h> -#include <linux/rcupdate.h> -#include <linux/list.h> -#include <linux/kmemleak.h> - -#include <trace/events/kmem.h> - -#include <linux/atomic.h> - -#include "slab.h" -/* - * slob_block has a field 'units', which indicates size of block if +ve, - * or offset of next block if -ve (in SLOB_UNITs). - * - * Free blocks of size 1 unit simply contain the offset of the next block. 
- * Those with larger size contain their size in the first SLOB_UNIT of - * memory, and the offset of the next free block in the second SLOB_UNIT. - */ -#if PAGE_SIZE <= (32767 * 2) -typedef s16 slobidx_t; -#else -typedef s32 slobidx_t; -#endif - -struct slob_block { - slobidx_t units; -}; -typedef struct slob_block slob_t; - -/* - * All partially free slob pages go on these lists. - */ -#define SLOB_BREAK1 256 -#define SLOB_BREAK2 1024 -static LIST_HEAD(free_slob_small); -static LIST_HEAD(free_slob_medium); -static LIST_HEAD(free_slob_large); - -/* - * slob_page_free: true for pages on free_slob_pages list. - */ -static inline int slob_page_free(struct slab *slab) -{ - return PageSlobFree(slab_page(slab)); -} - -static void set_slob_page_free(struct slab *slab, struct list_head *list) -{ - list_add(&slab->slab_list, list); - __SetPageSlobFree(slab_page(slab)); -} - -static inline void clear_slob_page_free(struct slab *slab) -{ - list_del(&slab->slab_list); - __ClearPageSlobFree(slab_page(slab)); -} - -#define SLOB_UNIT sizeof(slob_t) -#define SLOB_UNITS(size) DIV_ROUND_UP(size, SLOB_UNIT) - -/* - * struct slob_rcu is inserted at the tail of allocated slob blocks, which - * were created with a SLAB_TYPESAFE_BY_RCU slab. slob_rcu is used to free - * the block using call_rcu. - */ -struct slob_rcu { - struct rcu_head head; - int size; -}; - -/* - * slob_lock protects all slob allocator structures. - */ -static DEFINE_SPINLOCK(slob_lock); - -/* - * Encode the given size and next info into a free slob block s. - */ -static void set_slob(slob_t *s, slobidx_t size, slob_t *next) -{ - slob_t *base = (slob_t *)((unsigned long)s & PAGE_MASK); - slobidx_t offset = next - base; - - if (size > 1) { - s[0].units = size; - s[1].units = offset; - } else - s[0].units = -offset; -} - -/* - * Return the size of a slob block. 
- */ -static slobidx_t slob_units(slob_t *s) -{ - if (s->units > 0) - return s->units; - return 1; -} - -/* - * Return the next free slob block pointer after this one. - */ -static slob_t *slob_next(slob_t *s) -{ - slob_t *base = (slob_t *)((unsigned long)s & PAGE_MASK); - slobidx_t next; - - if (s[0].units < 0) - next = -s[0].units; - else - next = s[1].units; - return base+next; -} - -/* - * Returns true if s is the last free block in its page. - */ -static int slob_last(slob_t *s) -{ - return !((unsigned long)slob_next(s) & ~PAGE_MASK); -} - -static void *slob_new_pages(gfp_t gfp, int order, int node) -{ - struct page *page; - -#ifdef CONFIG_NUMA - if (node != NUMA_NO_NODE) - page = __alloc_pages_node(node, gfp, order); - else -#endif - page = alloc_pages(gfp, order); - - if (!page) - return NULL; - - mod_node_page_state(page_pgdat(page), NR_SLAB_UNRECLAIMABLE_B, - PAGE_SIZE << order); - return page_address(page); -} - -static void slob_free_pages(void *b, int order) -{ - struct page *sp = virt_to_page(b); - - if (current->reclaim_state) - current->reclaim_state->reclaimed_slab += 1 << order; - - mod_node_page_state(page_pgdat(sp), NR_SLAB_UNRECLAIMABLE_B, - -(PAGE_SIZE << order)); - __free_pages(sp, order); -} - -/* - * slob_page_alloc() - Allocate a slob block within a given slob_page sp. - * @sp: Page to look in. - * @size: Size of the allocation. - * @align: Allocation alignment. - * @align_offset: Offset in the allocated block that will be aligned. - * @page_removed_from_list: Return parameter. - * - * Tries to find a chunk of memory at least @size bytes big within @page. - * - * Return: Pointer to memory if allocated, %NULL otherwise. If the - * allocation fills up @page then the page is removed from the - * freelist, in this case @page_removed_from_list will be set to - * true (set to false otherwise). 
- */ -static void *slob_page_alloc(struct slab *sp, size_t size, int align, - int align_offset, bool *page_removed_from_list) -{ - slob_t *prev, *cur, *aligned = NULL; - int delta = 0, units = SLOB_UNITS(size); - - *page_removed_from_list = false; - for (prev = NULL, cur = sp->freelist; ; prev = cur, cur = slob_next(cur)) { - slobidx_t avail = slob_units(cur); - - /* - * 'aligned' will hold the address of the slob block so that the - * address 'aligned'+'align_offset' is aligned according to the - * 'align' parameter. This is for kmalloc() which prepends the - * allocated block with its size, so that the block itself is - * aligned when needed. - */ - if (align) { - aligned = (slob_t *) - (ALIGN((unsigned long)cur + align_offset, align) - - align_offset); - delta = aligned - cur; - } - if (avail >= units + delta) { /* room enough? */ - slob_t *next; - - if (delta) { /* need to fragment head to align? */ - next = slob_next(cur); - set_slob(aligned, avail - delta, next); - set_slob(cur, delta, aligned); - prev = cur; - cur = aligned; - avail = slob_units(cur); - } - - next = slob_next(cur); - if (avail == units) { /* exact fit? unlink. */ - if (prev) - set_slob(prev, slob_units(prev), next); - else - sp->freelist = next; - } else { /* fragment */ - if (prev) - set_slob(prev, slob_units(prev), cur + units); - else - sp->freelist = cur + units; - set_slob(cur + units, avail - units, next); - } - - sp->units -= units; - if (!sp->units) { - clear_slob_page_free(sp); - *page_removed_from_list = true; - } - return cur; - } - if (slob_last(cur)) - return NULL; - } -} - -/* - * slob_alloc: entry point into the slob allocator. 
- */ -static void *slob_alloc(size_t size, gfp_t gfp, int align, int node, - int align_offset) -{ - struct folio *folio; - struct slab *sp; - struct list_head *slob_list; - slob_t *b = NULL; - unsigned long flags; - bool _unused; - - if (size < SLOB_BREAK1) - slob_list = &free_slob_small; - else if (size < SLOB_BREAK2) - slob_list = &free_slob_medium; - else - slob_list = &free_slob_large; - - spin_lock_irqsave(&slob_lock, flags); - /* Iterate through each partially free page, try to find room */ - list_for_each_entry(sp, slob_list, slab_list) { - bool page_removed_from_list = false; -#ifdef CONFIG_NUMA - /* - * If there's a node specification, search for a partial - * page with a matching node id in the freelist. - */ - if (node != NUMA_NO_NODE && slab_nid(sp) != node) - continue; -#endif - /* Enough room on this page? */ - if (sp->units < SLOB_UNITS(size)) - continue; - - b = slob_page_alloc(sp, size, align, align_offset, &page_removed_from_list); - if (!b) - continue; - - /* - * If slob_page_alloc() removed sp from the list then we - * cannot call list functions on sp. If so allocation - * did not fragment the page anyway so optimisation is - * unnecessary. - */ - if (!page_removed_from_list) { - /* - * Improve fragment distribution and reduce our average - * search time by starting our next search here. 
(see - * Knuth vol 1, sec 2.5, pg 449) - */ - if (!list_is_first(&sp->slab_list, slob_list)) - list_rotate_to_front(&sp->slab_list, slob_list); - } - break; - } - spin_unlock_irqrestore(&slob_lock, flags); - - /* Not enough space: must allocate a new page */ - if (!b) { - b = slob_new_pages(gfp & ~__GFP_ZERO, 0, node); - if (!b) - return NULL; - folio = virt_to_folio(b); - __folio_set_slab(folio); - sp = folio_slab(folio); - - spin_lock_irqsave(&slob_lock, flags); - sp->units = SLOB_UNITS(PAGE_SIZE); - sp->freelist = b; - INIT_LIST_HEAD(&sp->slab_list); - set_slob(b, SLOB_UNITS(PAGE_SIZE), b + SLOB_UNITS(PAGE_SIZE)); - set_slob_page_free(sp, slob_list); - b = slob_page_alloc(sp, size, align, align_offset, &_unused); - BUG_ON(!b); - spin_unlock_irqrestore(&slob_lock, flags); - } - if (unlikely(gfp & __GFP_ZERO)) - memset(b, 0, size); - return b; -} - -/* - * slob_free: entry point into the slob allocator. - */ -static void slob_free(void *block, int size) -{ - struct slab *sp; - slob_t *prev, *next, *b = (slob_t *)block; - slobidx_t units; - unsigned long flags; - struct list_head *slob_list; - - if (unlikely(ZERO_OR_NULL_PTR(block))) - return; - BUG_ON(!size); - - sp = virt_to_slab(block); - units = SLOB_UNITS(size); - - spin_lock_irqsave(&slob_lock, flags); - - if (sp->units + units == SLOB_UNITS(PAGE_SIZE)) { - /* Go directly to page allocator. Do not pass slob allocator */ - if (slob_page_free(sp)) - clear_slob_page_free(sp); - spin_unlock_irqrestore(&slob_lock, flags); - __folio_clear_slab(slab_folio(sp)); - slob_free_pages(b, 0); - return; - } - - if (!slob_page_free(sp)) { - /* This slob page is about to become partially free. Easy! 
*/ - sp->units = units; - sp->freelist = b; - set_slob(b, units, - (void *)((unsigned long)(b + - SLOB_UNITS(PAGE_SIZE)) & PAGE_MASK)); - if (size < SLOB_BREAK1) - slob_list = &free_slob_small; - else if (size < SLOB_BREAK2) - slob_list = &free_slob_medium; - else - slob_list = &free_slob_large; - set_slob_page_free(sp, slob_list); - goto out; - } - - /* - * Otherwise the page is already partially free, so find reinsertion - * point. - */ - sp->units += units; - - if (b < (slob_t *)sp->freelist) { - if (b + units == sp->freelist) { - units += slob_units(sp->freelist); - sp->freelist = slob_next(sp->freelist); - } - set_slob(b, units, sp->freelist); - sp->freelist = b; - } else { - prev = sp->freelist; - next = slob_next(prev); - while (b > next) { - prev = next; - next = slob_next(prev); - } - - if (!slob_last(prev) && b + units == next) { - units += slob_units(next); - set_slob(b, units, slob_next(next)); - } else - set_slob(b, units, next); - - if (prev + slob_units(prev) == b) { - units = slob_units(b) + slob_units(prev); - set_slob(prev, units, slob_next(b)); - } else - set_slob(prev, slob_units(prev), b); - } -out: - spin_unlock_irqrestore(&slob_lock, flags); -} - -#ifdef CONFIG_PRINTK -void __kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct slab *slab) -{ - kpp->kp_ptr = object; - kpp->kp_slab = slab; -} -#endif - -/* - * End of slob allocator proper. Begin kmem_cache_alloc and kmalloc frontend. - */ - -static __always_inline void * -__do_kmalloc_node(size_t size, gfp_t gfp, int node, unsigned long caller) -{ - unsigned int *m; - unsigned int minalign; - void *ret; - - minalign = max_t(unsigned int, ARCH_KMALLOC_MINALIGN, - arch_slab_minalign()); - gfp &= gfp_allowed_mask; - - might_alloc(gfp); - - if (size < PAGE_SIZE - minalign) { - int align = minalign; - - /* - * For power of two sizes, guarantee natural alignment for - * kmalloc()'d objects. 
- */ - if (is_power_of_2(size)) - align = max_t(unsigned int, minalign, size); - - if (!size) - return ZERO_SIZE_PTR; - - m = slob_alloc(size + minalign, gfp, align, node, minalign); - - if (!m) - return NULL; - *m = size; - ret = (void *)m + minalign; - - trace_kmalloc(caller, ret, size, size + minalign, gfp, node); - } else { - unsigned int order = get_order(size); - - if (likely(order)) - gfp |= __GFP_COMP; - ret = slob_new_pages(gfp, order, node); - - trace_kmalloc(caller, ret, size, PAGE_SIZE << order, gfp, node); - } - - kmemleak_alloc(ret, size, 1, gfp); - return ret; -} - -void *__kmalloc(size_t size, gfp_t gfp) -{ - return __do_kmalloc_node(size, gfp, NUMA_NO_NODE, _RET_IP_); -} -EXPORT_SYMBOL(__kmalloc); - -void *__kmalloc_node_track_caller(size_t size, gfp_t gfp, - int node, unsigned long caller) -{ - return __do_kmalloc_node(size, gfp, node, caller); -} -EXPORT_SYMBOL(__kmalloc_node_track_caller); - -void kfree(const void *block) -{ - struct folio *sp; - - trace_kfree(_RET_IP_, block); - - if (unlikely(ZERO_OR_NULL_PTR(block))) - return; - kmemleak_free(block); - - sp = virt_to_folio(block); - if (folio_test_slab(sp)) { - unsigned int align = max_t(unsigned int, - ARCH_KMALLOC_MINALIGN, - arch_slab_minalign()); - unsigned int *m = (unsigned int *)(block - align); - - slob_free(m, *m + align); - } else { - unsigned int order = folio_order(sp); - - mod_node_page_state(folio_pgdat(sp), NR_SLAB_UNRECLAIMABLE_B, - -(PAGE_SIZE << order)); - __free_pages(folio_page(sp, 0), order); - - } -} -EXPORT_SYMBOL(kfree); - -size_t kmalloc_size_roundup(size_t size) -{ - /* Short-circuit the 0 size case. */ - if (unlikely(size == 0)) - return 0; - /* Short-circuit saturated "too-large" case. 
*/ - if (unlikely(size == SIZE_MAX)) - return SIZE_MAX; - - return ALIGN(size, ARCH_KMALLOC_MINALIGN); -} - -EXPORT_SYMBOL(kmalloc_size_roundup); - -/* can't use ksize for kmem_cache_alloc memory, only kmalloc */ -size_t __ksize(const void *block) -{ - struct folio *folio; - unsigned int align; - unsigned int *m; - - BUG_ON(!block); - if (unlikely(block == ZERO_SIZE_PTR)) - return 0; - - folio = virt_to_folio(block); - if (unlikely(!folio_test_slab(folio))) - return folio_size(folio); - - align = max_t(unsigned int, ARCH_KMALLOC_MINALIGN, - arch_slab_minalign()); - m = (unsigned int *)(block - align); - return SLOB_UNITS(*m) * SLOB_UNIT; -} - -int __kmem_cache_create(struct kmem_cache *c, slab_flags_t flags) -{ - if (flags & SLAB_TYPESAFE_BY_RCU) { - /* leave room for rcu footer at the end of object */ - c->size += sizeof(struct slob_rcu); - } - - /* Actual size allocated */ - c->size = SLOB_UNITS(c->size) * SLOB_UNIT; - c->flags = flags; - return 0; -} - -static void *slob_alloc_node(struct kmem_cache *c, gfp_t flags, int node) -{ - void *b; - - flags &= gfp_allowed_mask; - - might_alloc(flags); - - if (c->size < PAGE_SIZE) { - b = slob_alloc(c->size, flags, c->align, node, 0); - trace_kmem_cache_alloc(_RET_IP_, b, c, flags, node); - } else { - b = slob_new_pages(flags, get_order(c->size), node); - trace_kmem_cache_alloc(_RET_IP_, b, c, flags, node); - } - - if (b && c->ctor) { - WARN_ON_ONCE(flags & __GFP_ZERO); - c->ctor(b); - } - - kmemleak_alloc_recursive(b, c->size, 1, c->flags, flags); - return b; -} - -void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags) -{ - return slob_alloc_node(cachep, flags, NUMA_NO_NODE); -} -EXPORT_SYMBOL(kmem_cache_alloc); - - -void *kmem_cache_alloc_lru(struct kmem_cache *cachep, struct list_lru *lru, gfp_t flags) -{ - return slob_alloc_node(cachep, flags, NUMA_NO_NODE); -} -EXPORT_SYMBOL(kmem_cache_alloc_lru); - -void *__kmalloc_node(size_t size, gfp_t gfp, int node) -{ - return __do_kmalloc_node(size, gfp, node, 
_RET_IP_); -} -EXPORT_SYMBOL(__kmalloc_node); - -void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t gfp, int node) -{ - return slob_alloc_node(cachep, gfp, node); -} -EXPORT_SYMBOL(kmem_cache_alloc_node); - -static void __kmem_cache_free(void *b, int size) -{ - if (size < PAGE_SIZE) - slob_free(b, size); - else - slob_free_pages(b, get_order(size)); -} - -static void kmem_rcu_free(struct rcu_head *head) -{ - struct slob_rcu *slob_rcu = (struct slob_rcu *)head; - void *b = (void *)slob_rcu - (slob_rcu->size - sizeof(struct slob_rcu)); - - __kmem_cache_free(b, slob_rcu->size); -} - -void kmem_cache_free(struct kmem_cache *c, void *b) -{ - kmemleak_free_recursive(b, c->flags); - trace_kmem_cache_free(_RET_IP_, b, c); - if (unlikely(c->flags & SLAB_TYPESAFE_BY_RCU)) { - struct slob_rcu *slob_rcu; - slob_rcu = b + (c->size - sizeof(struct slob_rcu)); - slob_rcu->size = c->size; - call_rcu(&slob_rcu->head, kmem_rcu_free); - } else { - __kmem_cache_free(b, c->size); - } -} -EXPORT_SYMBOL(kmem_cache_free); - -void kmem_cache_free_bulk(struct kmem_cache *s, size_t nr, void **p) -{ - size_t i; - - for (i = 0; i < nr; i++) { - if (s) - kmem_cache_free(s, p[i]); - else - kfree(p[i]); - } -} -EXPORT_SYMBOL(kmem_cache_free_bulk); - -int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t nr, - void **p) -{ - size_t i; - - for (i = 0; i < nr; i++) { - void *x = p[i] = kmem_cache_alloc(s, flags); - - if (!x) { - kmem_cache_free_bulk(s, i, p); - return 0; - } - } - return i; -} -EXPORT_SYMBOL(kmem_cache_alloc_bulk); - -int __kmem_cache_shutdown(struct kmem_cache *c) -{ - /* No way to check for remaining objects */ - return 0; -} - -void __kmem_cache_release(struct kmem_cache *c) -{ -} - -int __kmem_cache_shrink(struct kmem_cache *d) -{ - return 0; -} - -static struct kmem_cache kmem_cache_boot = { - .name = "kmem_cache", - .size = sizeof(struct kmem_cache), - .flags = SLAB_PANIC, - .align = ARCH_KMALLOC_MINALIGN, -}; - -void __init kmem_cache_init(void) -{ - 
kmem_cache = &kmem_cache_boot; - slab_state = UP; -} - -void __init kmem_cache_init_late(void) -{ - slab_state = FULL; -} diff --git a/mm/slub.c b/mm/slub.c index 39327e98fce3..28ca576d988d 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -6059,7 +6059,7 @@ static const struct sysfs_ops slab_sysfs_ops = { .store = slab_attr_store, }; -static struct kobj_type slab_ktype = { +static const struct kobj_type slab_ktype = { .sysfs_ops = &slab_sysfs_ops, .release = kmem_cache_release, }; diff --git a/mm/swap.c b/mm/swap.c index 57cb01b042f6..423199ee8478 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -222,7 +222,7 @@ static void folio_batch_move_lru(struct folio_batch *fbatch, move_fn_t move_fn) if (lruvec) unlock_page_lruvec_irqrestore(lruvec, flags); folios_put(fbatch->folios, folio_batch_count(fbatch)); - folio_batch_init(fbatch); + folio_batch_reinit(fbatch); } static void folio_batch_add_and_move(struct folio_batch *fbatch, diff --git a/mm/swapfile.c b/mm/swapfile.c index 62ba2bf577d7..2c718f45745f 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -679,6 +679,7 @@ static void __del_from_avail_list(struct swap_info_struct *p) { int nid; + assert_spin_locked(&p->lock); for_each_node(nid) plist_del(&p->avail_lists[nid], &swap_avail_heads[nid]); } @@ -2434,8 +2435,8 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) spin_unlock(&swap_lock); goto out_dput; } - del_from_avail_list(p); spin_lock(&p->lock); + del_from_avail_list(p); if (p->prio < 0) { struct swap_info_struct *si = p; int nid; diff --git a/mm/usercopy.c b/mm/usercopy.c index 4c3164beacec..83c164aba6e0 100644 --- a/mm/usercopy.c +++ b/mm/usercopy.c @@ -173,7 +173,7 @@ static inline void check_heap_object(const void *ptr, unsigned long n, return; } - if (is_vmalloc_addr(ptr)) { + if (is_vmalloc_addr(ptr) && !pagefault_disabled()) { struct vmap_area *area = find_vmap_area(addr); if (!area) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index ef910bf349e1..31ff782d368b 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ 
-313,8 +313,8 @@ int ioremap_page_range(unsigned long addr, unsigned long end, ioremap_max_page_shift); flush_cache_vmap(addr, end); if (!err) - kmsan_ioremap_page_range(addr, end, phys_addr, prot, - ioremap_max_page_shift); + err = kmsan_ioremap_page_range(addr, end, phys_addr, prot, + ioremap_max_page_shift); return err; } @@ -605,7 +605,11 @@ int __vmap_pages_range_noflush(unsigned long addr, unsigned long end, int vmap_pages_range_noflush(unsigned long addr, unsigned long end, pgprot_t prot, struct page **pages, unsigned int page_shift) { - kmsan_vmap_pages_range_noflush(addr, end, prot, pages, page_shift); + int ret = kmsan_vmap_pages_range_noflush(addr, end, prot, pages, + page_shift); + + if (ret) + return ret; return __vmap_pages_range_noflush(addr, end, prot, pages, page_shift); } @@ -2883,6 +2887,8 @@ vm_area_alloc_pages(gfp_t gfp, int nid, unsigned int order, unsigned int nr_pages, struct page **pages) { unsigned int nr_allocated = 0; + gfp_t alloc_gfp = gfp; + bool nofail = false; struct page *page; int i; @@ -2893,6 +2899,7 @@ vm_area_alloc_pages(gfp_t gfp, int nid, * more permissive. */ if (!order) { + /* bulk allocator doesn't support nofail req. officially */ gfp_t bulk_gfp = gfp & ~__GFP_NOFAIL; while (nr_allocated < nr_pages) { @@ -2931,20 +2938,35 @@ vm_area_alloc_pages(gfp_t gfp, int nid, if (nr != nr_pages_request) break; } + } else if (gfp & __GFP_NOFAIL) { + /* + * Higher order nofail allocations are really expensive and + * potentially dangerous (premature OOM, disruptive reclaim + * and compaction etc.). + */ + alloc_gfp &= ~__GFP_NOFAIL; + nofail = true; } /* High-order pages or fallback path if "bulk" fails.
*/ - while (nr_allocated < nr_pages) { if (fatal_signal_pending(current)) break; if (nid == NUMA_NO_NODE) - page = alloc_pages(gfp, order); + page = alloc_pages(alloc_gfp, order); else - page = alloc_pages_node(nid, gfp, order); - if (unlikely(!page)) - break; + page = alloc_pages_node(nid, alloc_gfp, order); + if (unlikely(!page)) { + if (!nofail) + break; + + /* fall back to the zero order allocations */ + alloc_gfp |= __GFP_NOFAIL; + order = 0; + continue; + } + /* * Higher order allocations must be able to be treated as * independent small pages by callers (as they can with @@ -3024,9 +3046,11 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, * allocation request, free them via vfree() if any. */ if (area->nr_pages != nr_small_pages) { - warn_alloc(gfp_mask, NULL, - "vmalloc error: size %lu, page order %u, failed to allocate pages", - area->nr_pages * PAGE_SIZE, page_order); + /* vm_area_alloc_pages() can also fail due to a fatal signal */ + if (!fatal_signal_pending(current)) + warn_alloc(gfp_mask, NULL, + "vmalloc error: size %lu, page order %u, failed to allocate pages", + area->nr_pages * PAGE_SIZE, page_order); goto fail; } diff --git a/mm/vmscan.c b/mm/vmscan.c index 9c1c5e8b24b8..7ba6bfdd9a5f 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1151,12 +1151,12 @@ void reclaim_throttle(pg_data_t *pgdat, enum vmscan_throttle_state reason) DEFINE_WAIT(wait); /* - * Do not throttle IO workers, kthreads other than kswapd or + * Do not throttle user workers, kthreads other than kswapd or * workqueues. They may be required for reclaim to make * forward progress (e.g. journalling workqueues or kthreads).
*/ if (!current_is_kswapd() && - current->flags & (PF_IO_WORKER|PF_KTHREAD)) { + current->flags & (PF_USER_WORKER|PF_KTHREAD)) { cond_resched(); return; } diff --git a/mm/zpool.c b/mm/zpool.c index 571f5c5031dd..6a19c4a58f77 100644 --- a/mm/zpool.c +++ b/mm/zpool.c @@ -395,6 +395,5 @@ bool zpool_can_sleep_mapped(struct zpool *zpool) return zpool->driver->sleep_mapped; } -MODULE_LICENSE("GPL"); MODULE_AUTHOR("Dan Streetman <ddstreet@ieee.org>"); MODULE_DESCRIPTION("Common API for compressed memory storage"); diff --git a/mm/zswap.c b/mm/zswap.c index f6c89049cf70..f2fc0373b967 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -1540,6 +1540,5 @@ cache_fail: /* must be late so crypto has time to come up */ late_initcall(init_zswap); -MODULE_LICENSE("GPL"); MODULE_AUTHOR("Seth Jennings <sjennings@variantweb.net>"); MODULE_DESCRIPTION("Compressed cache for swap pages"); |