author     Linus Torvalds <torvalds@linux-foundation.org>  2022-05-27 21:40:49 +0300
committer  Linus Torvalds <torvalds@linux-foundation.org>  2022-05-27 21:40:49 +0300
commit     8291eaafed36f575f23951f3ce18407f480e9ecf
tree       279b61422ba2df7b8579af8ccc81331de80affa8
parent     77fb622de1393b1d54f24f4f7ed98f84feeda502
parent     fa020a2b87d24016723fff4a4237deb612478a32
download   linux-8291eaafed36f575f23951f3ce18407f480e9ecf.tar.xz
Merge tag 'mm-stable-2022-05-27' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
Pull more MM updates from Andrew Morton:
- Two follow-on fixes for the post-5.19 series "Use pageblock_order for
cma and alloc_contig_range alignment", from Zi Yan.
- A series of z3fold cleanups and fixes from Miaohe Lin.
- Some memcg selftests work from Michal Koutný <mkoutny@suse.com>.
- Some swap fixes and cleanups from Miaohe Lin.
- Several individual minor fixups.
* tag 'mm-stable-2022-05-27' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm: (25 commits)
mm/shmem.c: suppress shift warning
mm: Kconfig: reorganize misplaced mm options
mm: kasan: fix input of vmalloc_to_page()
mm: fix is_pinnable_page against a cma page
mm: filter out swapin error entry in shmem mapping
mm/shmem: fix infinite loop when swap in shmem error at swapoff time
mm/madvise: free hwpoison and swapin error entry in madvise_free_pte_range
mm/swapfile: fix lost swap bits in unuse_pte()
mm/swapfile: unuse_pte can map random data if swap read fails
selftests: memcg: factor out common parts of memory.{low,min} tests
selftests: memcg: remove protection from top level memcg
selftests: memcg: adjust expected reclaim values of protected cgroups
selftests: memcg: expect no low events in unprotected sibling
selftests: memcg: fix compilation
mm/z3fold: fix z3fold_page_migrate races with z3fold_map
mm/z3fold: fix z3fold_reclaim_page races with z3fold_free
mm/z3fold: always clear PAGE_CLAIMED under z3fold page lock
mm/z3fold: put z3fold page back into unbuddied list when reclaim or migration fails
revert "mm/z3fold.c: allow __GFP_HIGHMEM in z3fold_alloc"
mm/z3fold: throw warning on failure of trylock_page in z3fold_alloc
...
-rw-r--r--  MAINTAINERS                                           1
-rw-r--r--  include/linux/mm.h                                    9
-rw-r--r--  include/linux/swap.h                                  7
-rw-r--r--  include/linux/swapops.h                              10
-rw-r--r--  init/Kconfig                                         54
-rw-r--r--  lib/Kconfig.debug                                    35
-rw-r--r--  mm/Kconfig                                           56
-rw-r--r--  mm/Kconfig.debug                                     33
-rw-r--r--  mm/internal.h                                         4
-rw-r--r--  mm/kasan/report.c                                     2
-rw-r--r--  mm/madvise.c                                         18
-rw-r--r--  mm/memory.c                                           5
-rw-r--r--  mm/page_alloc.c                                      32
-rw-r--r--  mm/page_isolation.c                                  36
-rw-r--r--  mm/shmem.c                                           41
-rw-r--r--  mm/swap_state.c                                       3
-rw-r--r--  mm/swapfile.c                                        21
-rw-r--r--  mm/z3fold.c                                          97
-rw-r--r--  tools/testing/selftests/cgroup/memcg_protection.m   89
-rw-r--r--  tools/testing/selftests/cgroup/test_memcontrol.c   247
20 files changed, 436 insertions, 364 deletions
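
Editor's note: several of the swap fixes in this pull (the swapin-error hunks in include/linux/swap.h, include/linux/swapops.h, mm/swapfile.c, mm/shmem.c, mm/madvise.c and mm/memory.c below) revolve around one idea: a pte or xarray slot can hold a swp_entry_t whose type field lies above MAX_SWAPFILES, so it marks a software condition (migration, hwpoison, and with this series a failed swapin) rather than a real swap location. The userspace sketch below only illustrates that encoding; the shift width, the MAX_SWAPFILES value and the slot picked for SWP_SWAPIN_ERROR are simplified assumptions, not the kernel's actual layout.

#include <stdio.h>
#include <stdbool.h>

#define SWP_TYPE_SHIFT    58                  /* illustrative type/offset split */
#define MAX_SWAPFILES     27                  /* illustrative value */
#define SWP_SWAPIN_ERROR  (MAX_SWAPFILES + 4) /* one slot past the other markers */

typedef struct { unsigned long long val; } swp_entry_t;

static swp_entry_t swp_entry(unsigned long long type, unsigned long long offset)
{
	swp_entry_t e = { (type << SWP_TYPE_SHIFT) | offset };
	return e;
}

static unsigned long long swp_type(swp_entry_t e)
{
	return e.val >> SWP_TYPE_SHIFT;
}

static unsigned long long swp_offset(swp_entry_t e)
{
	return e.val & ((1ULL << SWP_TYPE_SHIFT) - 1);
}

static bool is_swapin_error_entry(swp_entry_t e)
{
	return swp_type(e) == SWP_SWAPIN_ERROR;
}

int main(void)
{
	/* Pretend swapin of the page at pfn 0x1234 failed: remember that in
	 * the slot instead of leaving the stale swap entry behind. */
	swp_entry_t err = swp_entry(SWP_SWAPIN_ERROR, 0x1234);

	printf("type=%llu offset=%#llx swapin_error=%d\n",
	       swp_type(err), swp_offset(err), is_swapin_error_entry(err));
	return 0;
}

With such an entry in place, do_swap_page() can fail the fault with VM_FAULT_SIGBUS instead of retrying a swapin that can never succeed, and the lookup paths (shmem, madvise, swapoff) skip or clear the marker, which is what the hunks below do.
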
diff --git a/MAINTAINERS b/MAINTAINERS index 58548f79e50a..392467e9ab73 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -5062,6 +5062,7 @@ L: linux-mm@kvack.org S: Maintained F: mm/memcontrol.c F: mm/swap_cgroup.c +F: tools/testing/selftests/cgroup/memcg_protection.m F: tools/testing/selftests/cgroup/test_kmem.c F: tools/testing/selftests/cgroup/test_memcontrol.c diff --git a/include/linux/mm.h b/include/linux/mm.h index f85e5a4c070b..bc8f326be0ce 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1594,8 +1594,13 @@ static inline bool page_needs_cow_for_dma(struct vm_area_struct *vma, #ifdef CONFIG_MIGRATION static inline bool is_pinnable_page(struct page *page) { - return !(is_zone_movable_page(page) || is_migrate_cma_page(page)) || - is_zero_pfn(page_to_pfn(page)); +#ifdef CONFIG_CMA + int mt = get_pageblock_migratetype(page); + + if (mt == MIGRATE_CMA || mt == MIGRATE_ISOLATE) + return false; +#endif + return !(is_zone_movable_page(page) || is_zero_pfn(page_to_pfn(page))); } #else static inline bool is_pinnable_page(struct page *page) diff --git a/include/linux/swap.h b/include/linux/swap.h index f3ae17b43f20..0c0fed1b348f 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -55,6 +55,10 @@ static inline int current_is_kswapd(void) * actions on faults. */ +#define SWP_SWAPIN_ERROR_NUM 1 +#define SWP_SWAPIN_ERROR (MAX_SWAPFILES + SWP_HWPOISON_NUM + \ + SWP_MIGRATION_NUM + SWP_DEVICE_NUM + \ + SWP_PTE_MARKER_NUM) /* * PTE markers are used to persist information onto PTEs that are mapped with * file-backed memories. As its name "PTE" hints, it should only be applied to @@ -120,7 +124,8 @@ static inline int current_is_kswapd(void) #define MAX_SWAPFILES \ ((1 << MAX_SWAPFILES_SHIFT) - SWP_DEVICE_NUM - \ - SWP_MIGRATION_NUM - SWP_HWPOISON_NUM - SWP_PTE_MARKER_NUM) + SWP_MIGRATION_NUM - SWP_HWPOISON_NUM - \ + SWP_PTE_MARKER_NUM - SWP_SWAPIN_ERROR_NUM) /* * Magic header for a swap area. The first part of the union is diff --git a/include/linux/swapops.h b/include/linux/swapops.h index fe220df499f1..f24775b41880 100644 --- a/include/linux/swapops.h +++ b/include/linux/swapops.h @@ -108,6 +108,16 @@ static inline void *swp_to_radix_entry(swp_entry_t entry) return xa_mk_value(entry.val); } +static inline swp_entry_t make_swapin_error_entry(struct page *page) +{ + return swp_entry(SWP_SWAPIN_ERROR, page_to_pfn(page)); +} + +static inline int is_swapin_error_entry(swp_entry_t entry) +{ + return swp_type(entry) == SWP_SWAPIN_ERROR; +} + #if IS_ENABLED(CONFIG_DEVICE_PRIVATE) static inline swp_entry_t make_readable_device_private_entry(pgoff_t offset) { diff --git a/init/Kconfig b/init/Kconfig index 7efe7c56893f..583675727b85 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1842,60 +1842,6 @@ config DEBUG_PERF_USE_VMALLOC endmenu -config VM_EVENT_COUNTERS - default y - bool "Enable VM event counters for /proc/vmstat" if EXPERT - help - VM event counters are needed for event counts to be shown. - This option allows the disabling of the VM event counters - on EXPERT systems. /proc/vmstat will only show page counts - if VM event counters are disabled. - -config SLUB_DEBUG - default y - bool "Enable SLUB debugging support" if EXPERT - depends on SLUB && SYSFS - select STACKDEPOT if STACKTRACE_SUPPORT - help - SLUB has extensive debug support features. Disabling these can - result in significant savings in code size. This also disables - SLUB sysfs support. /sys/slab will not exist and there will be - no support for cache validation etc. 
- -config COMPAT_BRK - bool "Disable heap randomization" - default y - help - Randomizing heap placement makes heap exploits harder, but it - also breaks ancient binaries (including anything libc5 based). - This option changes the bootup default to heap randomization - disabled, and can be overridden at runtime by setting - /proc/sys/kernel/randomize_va_space to 2. - - On non-ancient distros (post-2000 ones) N is usually a safe choice. - -config MMAP_ALLOW_UNINITIALIZED - bool "Allow mmapped anonymous memory to be uninitialized" - depends on EXPERT && !MMU - default n - help - Normally, and according to the Linux spec, anonymous memory obtained - from mmap() has its contents cleared before it is passed to - userspace. Enabling this config option allows you to request that - mmap() skip that if it is given an MAP_UNINITIALIZED flag, thus - providing a huge performance boost. If this option is not enabled, - then the flag will be ignored. - - This is taken advantage of by uClibc's malloc(), and also by - ELF-FDPIC binfmt's brk and stack allocator. - - Because of the obvious security issues, this option should only be - enabled on embedded devices where you control what is run in - userspace. Since that isn't generally a problem on no-MMU systems, - it is normally safe to say Y here. - - See Documentation/admin-guide/mm/nommu-mmap.rst for more information. - config SYSTEM_DATA_VERIFICATION def_bool n select SYSTEM_TRUSTED_KEYRING diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index b1e506f1d328..4cea85a83321 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -699,41 +699,6 @@ config DEBUG_OBJECTS_ENABLE_DEFAULT help Debug objects boot parameter default value -config DEBUG_SLAB - bool "Debug slab memory allocations" - depends on DEBUG_KERNEL && SLAB - help - Say Y here to have the kernel do limited verification on memory - allocation as well as poisoning memory on free to catch use of freed - memory. This can make kmalloc/kfree-intensive workloads much slower. - -config SLUB_DEBUG_ON - bool "SLUB debugging on by default" - depends on SLUB && SLUB_DEBUG - select STACKDEPOT_ALWAYS_INIT if STACKTRACE_SUPPORT - default n - help - Boot with debugging on by default. SLUB boots by default with - the runtime debug capabilities switched off. Enabling this is - equivalent to specifying the "slub_debug" parameter on boot. - There is no support for more fine grained debug control like - possible with slub_debug=xxx. SLUB debugging may be switched - off in a kernel built with CONFIG_SLUB_DEBUG_ON by specifying - "slub_debug=-". - -config SLUB_STATS - default n - bool "Enable SLUB performance statistics" - depends on SLUB && SYSFS - help - SLUB statistics are useful to debug SLUBs allocation behavior in - order find ways to optimize the allocator. This should never be - enabled for production use since keeping statistics slows down - the allocator by a few percentage points. The slabinfo command - supports the determination of the most active slabs to figure - out which slabs are relevant to a particular load. - Try running: slabinfo -DA - config HAVE_DEBUG_KMEMLEAK bool diff --git a/mm/Kconfig b/mm/Kconfig index 905c205e14f3..169e64192e48 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -270,6 +270,19 @@ config SLAB_FREELIST_HARDENED sanity-checking than others. This option is most effective with CONFIG_SLUB. 
+config SLUB_STATS + default n + bool "Enable SLUB performance statistics" + depends on SLUB && SYSFS + help + SLUB statistics are useful to debug SLUBs allocation behavior in + order find ways to optimize the allocator. This should never be + enabled for production use since keeping statistics slows down + the allocator by a few percentage points. The slabinfo command + supports the determination of the most active slabs to figure + out which slabs are relevant to a particular load. + Try running: slabinfo -DA + config SLUB_CPU_PARTIAL default y depends on SLUB && SMP @@ -307,6 +320,40 @@ config SHUFFLE_PAGE_ALLOCATOR Say Y if unsure. +config COMPAT_BRK + bool "Disable heap randomization" + default y + help + Randomizing heap placement makes heap exploits harder, but it + also breaks ancient binaries (including anything libc5 based). + This option changes the bootup default to heap randomization + disabled, and can be overridden at runtime by setting + /proc/sys/kernel/randomize_va_space to 2. + + On non-ancient distros (post-2000 ones) N is usually a safe choice. + +config MMAP_ALLOW_UNINITIALIZED + bool "Allow mmapped anonymous memory to be uninitialized" + depends on EXPERT && !MMU + default n + help + Normally, and according to the Linux spec, anonymous memory obtained + from mmap() has its contents cleared before it is passed to + userspace. Enabling this config option allows you to request that + mmap() skip that if it is given an MAP_UNINITIALIZED flag, thus + providing a huge performance boost. If this option is not enabled, + then the flag will be ignored. + + This is taken advantage of by uClibc's malloc(), and also by + ELF-FDPIC binfmt's brk and stack allocator. + + Because of the obvious security issues, this option should only be + enabled on embedded devices where you control what is run in + userspace. Since that isn't generally a problem on no-MMU systems, + it is normally safe to say Y here. + + See Documentation/admin-guide/mm/nommu-mmap.rst for more information. + config SELECT_MEMORY_MODEL def_bool y depends on ARCH_SELECT_MEMORY_MODEL @@ -964,6 +1011,15 @@ config ARCH_USES_HIGH_VMA_FLAGS config ARCH_HAS_PKEYS bool +config VM_EVENT_COUNTERS + default y + bool "Enable VM event counters for /proc/vmstat" if EXPERT + help + VM event counters are needed for event counts to be shown. + This option allows the disabling of the VM event counters + on EXPERT systems. /proc/vmstat will only show page counts + if VM event counters are disabled. + config PERCPU_STATS bool "Collect percpu memory statistics" help diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug index 5bd5bb097252..ce8dded36de9 100644 --- a/mm/Kconfig.debug +++ b/mm/Kconfig.debug @@ -45,6 +45,39 @@ config DEBUG_PAGEALLOC_ENABLE_DEFAULT Enable debug page memory allocations by default? This value can be overridden by debug_pagealloc=off|on. +config DEBUG_SLAB + bool "Debug slab memory allocations" + depends on DEBUG_KERNEL && SLAB + help + Say Y here to have the kernel do limited verification on memory + allocation as well as poisoning memory on free to catch use of freed + memory. This can make kmalloc/kfree-intensive workloads much slower. + +config SLUB_DEBUG + default y + bool "Enable SLUB debugging support" if EXPERT + depends on SLUB && SYSFS + select STACKDEPOT if STACKTRACE_SUPPORT + help + SLUB has extensive debug support features. Disabling these can + result in significant savings in code size. This also disables + SLUB sysfs support. 
/sys/slab will not exist and there will be + no support for cache validation etc. + +config SLUB_DEBUG_ON + bool "SLUB debugging on by default" + depends on SLUB && SLUB_DEBUG + select STACKDEPOT_ALWAYS_INIT if STACKTRACE_SUPPORT + default n + help + Boot with debugging on by default. SLUB boots by default with + the runtime debug capabilities switched off. Enabling this is + equivalent to specifying the "slub_debug" parameter on boot. + There is no support for more fine grained debug control like + possible with slub_debug=xxx. SLUB debugging may be switched + off in a kernel built with CONFIG_SLUB_DEBUG_ON by specifying + "slub_debug=-". + config PAGE_OWNER bool "Track page owner" depends on DEBUG_KERNEL && STACKTRACE_SUPPORT diff --git a/mm/internal.h b/mm/internal.h index 64e61b032dac..c0f8fbe0445b 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -374,8 +374,8 @@ extern void *memmap_alloc(phys_addr_t size, phys_addr_t align, phys_addr_t min_addr, int nid, bool exact_nid); -void split_free_page(struct page *free_page, - int order, unsigned long split_pfn_offset); +int split_free_page(struct page *free_page, + unsigned int order, unsigned long split_pfn_offset); #if defined CONFIG_COMPACTION || defined CONFIG_CMA diff --git a/mm/kasan/report.c b/mm/kasan/report.c index 199d77cce21a..b341a191651d 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c @@ -347,7 +347,7 @@ static void print_address_description(void *addr, u8 tag) va->addr, va->addr + va->size, va->caller); pr_err("\n"); - page = vmalloc_to_page(page); + page = vmalloc_to_page(addr); } } diff --git a/mm/madvise.c b/mm/madvise.c index 4d6592488b51..d7b4f2602949 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -248,10 +248,13 @@ static void force_shm_swapin_readahead(struct vm_area_struct *vma, if (!xa_is_value(page)) continue; + swap = radix_to_swp_entry(page); + /* There might be swapin error entries in shmem mapping. 
*/ + if (non_swap_entry(swap)) + continue; xas_pause(&xas); rcu_read_unlock(); - swap = radix_to_swp_entry(page); page = read_swap_cache_async(swap, GFP_HIGHUSER_MOVABLE, NULL, 0, false, &splug); if (page) @@ -624,11 +627,14 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr, swp_entry_t entry; entry = pte_to_swp_entry(ptent); - if (non_swap_entry(entry)) - continue; - nr_swap--; - free_swap_and_cache(entry); - pte_clear_not_present_full(mm, addr, pte, tlb->fullmm); + if (!non_swap_entry(entry)) { + nr_swap--; + free_swap_and_cache(entry); + pte_clear_not_present_full(mm, addr, pte, tlb->fullmm); + } else if (is_hwpoison_entry(entry) || + is_swapin_error_entry(entry)) { + pte_clear_not_present_full(mm, addr, pte, tlb->fullmm); + } continue; } diff --git a/mm/memory.c b/mm/memory.c index 54bcd5327b74..21dadf03f089 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1487,7 +1487,8 @@ again: /* Only drop the uffd-wp marker if explicitly requested */ if (!zap_drop_file_uffd_wp(details)) continue; - } else if (is_hwpoison_entry(entry)) { + } else if (is_hwpoison_entry(entry) || + is_swapin_error_entry(entry)) { if (!should_zap_cows(details)) continue; } else { @@ -3727,6 +3728,8 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) ret = vmf->page->pgmap->ops->migrate_to_ram(vmf); } else if (is_hwpoison_entry(entry)) { ret = VM_FAULT_HWPOISON; + } else if (is_swapin_error_entry(entry)) { + ret = VM_FAULT_SIGBUS; } else if (is_pte_marker_entry(entry)) { ret = handle_pte_marker(vmf); } else { diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 149f2ab5063b..e008a3df0485 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -482,8 +482,12 @@ unsigned long __get_pfnblock_flags_mask(const struct page *page, bitidx = pfn_to_bitidx(page, pfn); word_bitidx = bitidx / BITS_PER_LONG; bitidx &= (BITS_PER_LONG-1); - - word = bitmap[word_bitidx]; + /* + * This races, without locks, with set_pfnblock_flags_mask(). Ensure + * a consistent read of the memory array, so that results, even though + * racy, are not corrupted. + */ + word = READ_ONCE(bitmap[word_bitidx]); return (word >> bitidx) & mask; } @@ -1100,30 +1104,44 @@ done_merging: * @order: the order of the page * @split_pfn_offset: split offset within the page * + * Return -ENOENT if the free page is changed, otherwise 0 + * * It is used when the free page crosses two pageblocks with different migratetypes * at split_pfn_offset within the page. The split free page will be put into * separate migratetype lists afterwards. Otherwise, the function achieves * nothing. */ -void split_free_page(struct page *free_page, - int order, unsigned long split_pfn_offset) +int split_free_page(struct page *free_page, + unsigned int order, unsigned long split_pfn_offset) { struct zone *zone = page_zone(free_page); unsigned long free_page_pfn = page_to_pfn(free_page); unsigned long pfn; unsigned long flags; int free_page_order; + int mt; + int ret = 0; if (split_pfn_offset == 0) - return; + return ret; spin_lock_irqsave(&zone->lock, flags); + + if (!PageBuddy(free_page) || buddy_order(free_page) != order) { + ret = -ENOENT; + goto out; + } + + mt = get_pageblock_migratetype(free_page); + if (likely(!is_migrate_isolate(mt))) + __mod_zone_freepage_state(zone, -(1UL << order), mt); + del_page_from_free_list(free_page, zone, order); for (pfn = free_page_pfn; pfn < free_page_pfn + (1UL << order);) { int mt = get_pfnblock_migratetype(pfn_to_page(pfn), pfn); - free_page_order = min_t(int, + free_page_order = min_t(unsigned int, pfn ? 
__ffs(pfn) : order, __fls(split_pfn_offset)); __free_one_page(pfn_to_page(pfn), pfn, zone, free_page_order, @@ -1134,7 +1152,9 @@ void split_free_page(struct page *free_page, if (split_pfn_offset == 0) split_pfn_offset = (1UL << order) - (pfn - free_page_pfn); } +out: spin_unlock_irqrestore(&zone->lock, flags); + return ret; } /* * A bad page could be due to a number of fields. Instead of multiple branches, diff --git a/mm/page_isolation.c b/mm/page_isolation.c index c643c8420809..6021f8444b5a 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c @@ -300,7 +300,7 @@ __first_valid_page(unsigned long pfn, unsigned long nr_pages) * the in-use page then splitting the free page. */ static int isolate_single_pageblock(unsigned long boundary_pfn, int flags, - gfp_t gfp_flags, bool isolate_before) + gfp_t gfp_flags, bool isolate_before, bool skip_isolation) { unsigned char saved_mt; unsigned long start_pfn; @@ -327,11 +327,16 @@ static int isolate_single_pageblock(unsigned long boundary_pfn, int flags, zone->zone_start_pfn); saved_mt = get_pageblock_migratetype(pfn_to_page(isolate_pageblock)); - ret = set_migratetype_isolate(pfn_to_page(isolate_pageblock), saved_mt, flags, - isolate_pageblock, isolate_pageblock + pageblock_nr_pages); - if (ret) - return ret; + if (skip_isolation) + VM_BUG_ON(!is_migrate_isolate(saved_mt)); + else { + ret = set_migratetype_isolate(pfn_to_page(isolate_pageblock), saved_mt, flags, + isolate_pageblock, isolate_pageblock + pageblock_nr_pages); + + if (ret) + return ret; + } /* * Bail out early when the to-be-isolated pageblock does not form @@ -366,9 +371,13 @@ static int isolate_single_pageblock(unsigned long boundary_pfn, int flags, if (PageBuddy(page)) { int order = buddy_order(page); - if (pfn + (1UL << order) > boundary_pfn) - split_free_page(page, order, boundary_pfn - pfn); - pfn += (1UL << order); + if (pfn + (1UL << order) > boundary_pfn) { + /* free page changed before split, check it again */ + if (split_free_page(page, order, boundary_pfn - pfn)) + continue; + } + + pfn += 1UL << order; continue; } /* @@ -463,7 +472,8 @@ static int isolate_single_pageblock(unsigned long boundary_pfn, int flags, return 0; failed: /* restore the original migratetype */ - unset_migratetype_isolate(pfn_to_page(isolate_pageblock), saved_mt); + if (!skip_isolation) + unset_migratetype_isolate(pfn_to_page(isolate_pageblock), saved_mt); return -EBUSY; } @@ -522,14 +532,18 @@ int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn, unsigned long isolate_start = ALIGN_DOWN(start_pfn, pageblock_nr_pages); unsigned long isolate_end = ALIGN(end_pfn, pageblock_nr_pages); int ret; + bool skip_isolation = false; /* isolate [isolate_start, isolate_start + pageblock_nr_pages) pageblock */ - ret = isolate_single_pageblock(isolate_start, flags, gfp_flags, false); + ret = isolate_single_pageblock(isolate_start, flags, gfp_flags, false, skip_isolation); if (ret) return ret; + if (isolate_start == isolate_end - pageblock_nr_pages) + skip_isolation = true; + /* isolate [isolate_end - pageblock_nr_pages, isolate_end) pageblock */ - ret = isolate_single_pageblock(isolate_end, flags, gfp_flags, true); + ret = isolate_single_pageblock(isolate_end, flags, gfp_flags, true, skip_isolation); if (ret) { unset_migratetype_isolate(pfn_to_page(isolate_start), migratetype); return ret; diff --git a/mm/shmem.c b/mm/shmem.c index da30c769b376..a6f565308133 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1174,6 +1174,10 @@ static int shmem_find_swap_entries(struct address_space *mapping, 
continue; entry = radix_to_swp_entry(folio); + /* + * swapin error entries can be found in the mapping. But they're + * deliberately ignored here as we've done everything we can do. + */ if (swp_type(entry) != type) continue; @@ -1671,6 +1675,36 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp, return error; } +static void shmem_set_folio_swapin_error(struct inode *inode, pgoff_t index, + struct folio *folio, swp_entry_t swap) +{ + struct address_space *mapping = inode->i_mapping; + struct shmem_inode_info *info = SHMEM_I(inode); + swp_entry_t swapin_error; + void *old; + + swapin_error = make_swapin_error_entry(&folio->page); + old = xa_cmpxchg_irq(&mapping->i_pages, index, + swp_to_radix_entry(swap), + swp_to_radix_entry(swapin_error), 0); + if (old != swp_to_radix_entry(swap)) + return; + + folio_wait_writeback(folio); + delete_from_swap_cache(&folio->page); + spin_lock_irq(&info->lock); + /* + * Don't treat swapin error folio as alloced. Otherwise inode->i_blocks won't + * be 0 when inode is released and thus trigger WARN_ON(inode->i_blocks) in + * shmem_evict_inode. + */ + info->alloced--; + info->swapped--; + shmem_recalc_inode(inode); + spin_unlock_irq(&info->lock); + swap_free(swap); +} + /* * Swap in the page pointed to by *pagep. * Caller has to make sure that *pagep contains a valid swapped page. @@ -1694,6 +1728,9 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index, swap = radix_to_swp_entry(*foliop); *foliop = NULL; + if (is_swapin_error_entry(swap)) + return -EIO; + /* Look it up and read it in.. */ page = lookup_swap_cache(swap, NULL, 0); if (!page) { @@ -1761,6 +1798,8 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index, failed: if (!shmem_confirm_swap(mapping, index, swap)) error = -EEXIST; + if (error == -EIO) + shmem_set_folio_swapin_error(inode, index, folio, swap); unlock: if (folio) { folio_unlock(folio); @@ -1906,7 +1945,7 @@ alloc_nohuge: spin_lock_irq(&info->lock); info->alloced += folio_nr_pages(folio); - inode->i_blocks += BLOCKS_PER_PAGE << folio_order(folio); + inode->i_blocks += (blkcnt_t)BLOCKS_PER_PAGE << folio_order(folio); shmem_recalc_inode(inode); spin_unlock_irq(&info->lock); alloced = true; diff --git a/mm/swap_state.c b/mm/swap_state.c index b9e4ed2e90bf..778d57d2d92d 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -410,6 +410,9 @@ struct page *find_get_incore_page(struct address_space *mapping, pgoff_t index) return NULL; swp = radix_to_swp_entry(page); + /* There might be swapin error entries in shmem mapping. 
*/ + if (non_swap_entry(swp)) + return NULL; /* Prevent swapoff from happening to us */ si = get_swap_device(swp); if (!si) diff --git a/mm/swapfile.c b/mm/swapfile.c index 94b4ff43ead0..a2e66d855b19 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1775,7 +1775,7 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, { struct page *swapcache; spinlock_t *ptl; - pte_t *pte; + pte_t *pte, new_pte; int ret = 1; swapcache = page; @@ -1789,6 +1789,17 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, goto out; } + if (unlikely(!PageUptodate(page))) { + pte_t pteval; + + dec_mm_counter(vma->vm_mm, MM_SWAPENTS); + pteval = swp_entry_to_pte(make_swapin_error_entry(page)); + set_pte_at(vma->vm_mm, addr, pte, pteval); + swap_free(entry); + ret = 0; + goto out; + } + /* See do_swap_page() */ BUG_ON(!PageAnon(page) && PageMappedToDisk(page)); BUG_ON(PageAnon(page) && PageAnonExclusive(page)); @@ -1813,8 +1824,12 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, page_add_new_anon_rmap(page, vma, addr); lru_cache_add_inactive_or_unevictable(page, vma); } - set_pte_at(vma->vm_mm, addr, pte, - pte_mkold(mk_pte(page, vma->vm_page_prot))); + new_pte = pte_mkold(mk_pte(page, vma->vm_page_prot)); + if (pte_swp_soft_dirty(*pte)) + new_pte = pte_mksoft_dirty(new_pte); + if (pte_swp_uffd_wp(*pte)) + new_pte = pte_mkuffd_wp(new_pte); + set_pte_at(vma->vm_mm, addr, pte, new_pte); swap_free(entry); out: pte_unmap_unlock(pte, ptl); diff --git a/mm/z3fold.c b/mm/z3fold.c index 83b5a3514427..f41f8b0d9e9a 100644 --- a/mm/z3fold.c +++ b/mm/z3fold.c @@ -181,6 +181,7 @@ enum z3fold_page_flags { NEEDS_COMPACTING, PAGE_STALE, PAGE_CLAIMED, /* by either reclaim or free */ + PAGE_MIGRATED, /* page is migrated and soon to be released */ }; /* @@ -212,10 +213,8 @@ static int size_to_chunks(size_t size) static inline struct z3fold_buddy_slots *alloc_slots(struct z3fold_pool *pool, gfp_t gfp) { - struct z3fold_buddy_slots *slots; - - slots = kmem_cache_zalloc(pool->c_handle, - (gfp & ~(__GFP_HIGHMEM | __GFP_MOVABLE))); + struct z3fold_buddy_slots *slots = kmem_cache_zalloc(pool->c_handle, + gfp); if (slots) { /* It will be freed separately in free_handle(). */ @@ -272,8 +271,13 @@ static inline struct z3fold_header *get_z3fold_header(unsigned long handle) zhdr = (struct z3fold_header *)(addr & PAGE_MASK); locked = z3fold_page_trylock(zhdr); read_unlock(&slots->lock); - if (locked) - break; + if (locked) { + struct page *page = virt_to_page(zhdr); + + if (!test_bit(PAGE_MIGRATED, &page->private)) + break; + z3fold_page_unlock(zhdr); + } cpu_relax(); } while (true); } else { @@ -391,6 +395,7 @@ static struct z3fold_header *init_z3fold_page(struct page *page, bool headless, clear_bit(NEEDS_COMPACTING, &page->private); clear_bit(PAGE_STALE, &page->private); clear_bit(PAGE_CLAIMED, &page->private); + clear_bit(PAGE_MIGRATED, &page->private); if (headless) return zhdr; @@ -521,13 +526,6 @@ static void __release_z3fold_page(struct z3fold_header *zhdr, bool locked) atomic64_dec(&pool->pages_nr); } -static void release_z3fold_page(struct kref *ref) -{ - struct z3fold_header *zhdr = container_of(ref, struct z3fold_header, - refcount); - __release_z3fold_page(zhdr, false); -} - static void release_z3fold_page_locked(struct kref *ref) { struct z3fold_header *zhdr = container_of(ref, struct z3fold_header, @@ -940,10 +938,19 @@ lookup: } } - if (zhdr && !zhdr->slots) - zhdr->slots = alloc_slots(pool, - can_sleep ? 
GFP_NOIO : GFP_ATOMIC); + if (zhdr && !zhdr->slots) { + zhdr->slots = alloc_slots(pool, GFP_ATOMIC); + if (!zhdr->slots) + goto out_fail; + } return zhdr; + +out_fail: + if (!kref_put(&zhdr->refcount, release_z3fold_page_locked)) { + add_to_unbuddied(pool, zhdr); + z3fold_page_unlock(zhdr); + } + return NULL; } /* @@ -1066,7 +1073,7 @@ static int z3fold_alloc(struct z3fold_pool *pool, size_t size, gfp_t gfp, enum buddy bud; bool can_sleep = gfpflags_allow_blocking(gfp); - if (!size) + if (!size || (gfp & __GFP_HIGHMEM)) return -EINVAL; if (size > PAGE_SIZE) @@ -1093,28 +1100,7 @@ retry: bud = FIRST; } - page = NULL; - if (can_sleep) { - spin_lock(&pool->stale_lock); - zhdr = list_first_entry_or_null(&pool->stale, - struct z3fold_header, buddy); - /* - * Before allocating a page, let's see if we can take one from - * the stale pages list. cancel_work_sync() can sleep so we - * limit this case to the contexts where we can sleep - */ - if (zhdr) { - list_del(&zhdr->buddy); - spin_unlock(&pool->stale_lock); - cancel_work_sync(&zhdr->work); - page = virt_to_page(zhdr); - } else { - spin_unlock(&pool->stale_lock); - } - } - if (!page) - page = alloc_page(gfp); - + page = alloc_page(gfp); if (!page) return -ENOMEM; @@ -1134,10 +1120,9 @@ retry: __SetPageMovable(page, pool->inode->i_mapping); unlock_page(page); } else { - if (trylock_page(page)) { - __SetPageMovable(page, pool->inode->i_mapping); - unlock_page(page); - } + WARN_ON(!trylock_page(page)); + __SetPageMovable(page, pool->inode->i_mapping); + unlock_page(page); } z3fold_page_lock(zhdr); @@ -1236,8 +1221,8 @@ static void z3fold_free(struct z3fold_pool *pool, unsigned long handle) return; } if (test_and_set_bit(NEEDS_COMPACTING, &page->private)) { - put_z3fold_header(zhdr); clear_bit(PAGE_CLAIMED, &page->private); + put_z3fold_header(zhdr); return; } if (zhdr->cpu < 0 || !cpu_online(zhdr->cpu)) { @@ -1332,12 +1317,7 @@ static int z3fold_reclaim_page(struct z3fold_pool *pool, unsigned int retries) break; } - if (kref_get_unless_zero(&zhdr->refcount) == 0) { - zhdr = NULL; - break; - } if (!z3fold_page_trylock(zhdr)) { - kref_put(&zhdr->refcount, release_z3fold_page); zhdr = NULL; continue; /* can't evict at this point */ } @@ -1348,14 +1328,14 @@ static int z3fold_reclaim_page(struct z3fold_pool *pool, unsigned int retries) */ if (zhdr->foreign_handles || test_and_set_bit(PAGE_CLAIMED, &page->private)) { - if (!kref_put(&zhdr->refcount, - release_z3fold_page_locked)) - z3fold_page_unlock(zhdr); + z3fold_page_unlock(zhdr); zhdr = NULL; continue; /* can't evict such page */ } list_del_init(&zhdr->buddy); zhdr->cpu = -1; + /* See comment in __z3fold_alloc. 
*/ + kref_get(&zhdr->refcount); break; } @@ -1437,8 +1417,10 @@ next: spin_lock(&pool->lock); list_add(&page->lru, &pool->lru); spin_unlock(&pool->lock); - z3fold_page_unlock(zhdr); + if (list_empty(&zhdr->buddy)) + add_to_unbuddied(pool, zhdr); clear_bit(PAGE_CLAIMED, &page->private); + z3fold_page_unlock(zhdr); } /* We started off locked to we need to lock the pool back */ @@ -1590,8 +1572,8 @@ static int z3fold_page_migrate(struct address_space *mapping, struct page *newpa if (!z3fold_page_trylock(zhdr)) return -EAGAIN; if (zhdr->mapped_count != 0 || zhdr->foreign_handles != 0) { - z3fold_page_unlock(zhdr); clear_bit(PAGE_CLAIMED, &page->private); + z3fold_page_unlock(zhdr); return -EBUSY; } if (work_pending(&zhdr->work)) { @@ -1601,7 +1583,7 @@ static int z3fold_page_migrate(struct address_space *mapping, struct page *newpa new_zhdr = page_address(newpage); memcpy(new_zhdr, zhdr, PAGE_SIZE); newpage->private = page->private; - page->private = 0; + set_bit(PAGE_MIGRATED, &page->private); z3fold_page_unlock(zhdr); spin_lock_init(&new_zhdr->page_lock); INIT_WORK(&new_zhdr->work, compact_page_work); @@ -1631,7 +1613,8 @@ static int z3fold_page_migrate(struct address_space *mapping, struct page *newpa queue_work_on(new_zhdr->cpu, pool->compact_wq, &new_zhdr->work); - clear_bit(PAGE_CLAIMED, &page->private); + /* PAGE_CLAIMED and PAGE_MIGRATED are cleared now. */ + page->private = 0; put_page(page); return 0; } @@ -1653,6 +1636,8 @@ static void z3fold_page_putback(struct page *page) spin_lock(&pool->lock); list_add(&page->lru, &pool->lru); spin_unlock(&pool->lock); + if (list_empty(&zhdr->buddy)) + add_to_unbuddied(pool, zhdr); clear_bit(PAGE_CLAIMED, &page->private); z3fold_page_unlock(zhdr); } diff --git a/tools/testing/selftests/cgroup/memcg_protection.m b/tools/testing/selftests/cgroup/memcg_protection.m new file mode 100644 index 000000000000..051daa3477b6 --- /dev/null +++ b/tools/testing/selftests/cgroup/memcg_protection.m @@ -0,0 +1,89 @@ +% SPDX-License-Identifier: GPL-2.0 +% +% run as: octave-cli memcg_protection.m +% +% This script simulates reclaim protection behavior on a single level of memcg +% hierarchy to illustrate how overcommitted protection spreads among siblings +% (as it depends also on their current consumption). +% +% Simulation assumes siblings consumed the initial amount of memory (w/out +% reclaim) and then the reclaim starts, all memory is reclaimable, i.e. treated +% same. It simulates only non-low reclaim and assumes all memory.min = 0. 
+% +% Input configurations +% -------------------- +% E number parent effective protection +% n vector nominal protection of siblings set at the given level (memory.low) +% c vector current consumption -,,- (memory.current) + +% example from testcase (values in GB) +E = 50 / 1024; +n = [75 25 0 500 ] / 1024; +c = [50 50 50 0] / 1024; + +% Reclaim parameters +% ------------------ + +% Minimal reclaim amount (GB) +cluster = 32*4 / 2**20; + +% Reclaim coefficient (think as 0.5^sc->priority) +alpha = .1 + +% Simulation parameters +% --------------------- +epsilon = 1e-7; +timeout = 1000; + +% Simulation loop +% --------------- + +ch = []; +eh = []; +rh = []; + +for t = 1:timeout + % low_usage + u = min(c, n); + siblings = sum(u); + + % effective_protection() + protected = min(n, c); % start with nominal + e = protected * min(1, E / siblings); % normalize overcommit + + % recursive protection + unclaimed = max(0, E - siblings); + parent_overuse = sum(c) - siblings; + if (unclaimed > 0 && parent_overuse > 0) + overuse = max(0, c - protected); + e += unclaimed * (overuse / parent_overuse); + endif + + % get_scan_count() + r = alpha * c; % assume all memory is in a single LRU list + + % commit 1bc63fb1272b ("mm, memcg: make scan aggression always exclude protection") + sz = max(e, c); + r .*= (1 - (e+epsilon) ./ (sz+epsilon)); + + % uncomment to debug prints + % e, c, r + + % nothing to reclaim, reached equilibrium + if max(r) < epsilon + break; + endif + + % SWAP_CLUSTER_MAX roundup + r = max(r, (r > epsilon) .* cluster); + % XXX here I do parallel reclaim of all siblings + % in reality reclaim is serialized and each sibling recalculates own residual + c = max(c - r, 0); + + ch = [ch ; c]; + eh = [eh ; e]; + rh = [rh ; r]; +endfor + +t +c, e diff --git a/tools/testing/selftests/cgroup/test_memcontrol.c b/tools/testing/selftests/cgroup/test_memcontrol.c index 44a974ec472c..8833359556f3 100644 --- a/tools/testing/selftests/cgroup/test_memcontrol.c +++ b/tools/testing/selftests/cgroup/test_memcontrol.c @@ -190,13 +190,6 @@ cleanup: return ret; } -static int alloc_pagecache_50M(const char *cgroup, void *arg) -{ - int fd = (long)arg; - - return alloc_pagecache(fd, MB(50)); -} - static int alloc_pagecache_50M_noexit(const char *cgroup, void *arg) { int fd = (long)arg; @@ -247,33 +240,39 @@ static int cg_test_proc_killed(const char *cgroup) /* * First, this test creates the following hierarchy: - * A memory.min = 50M, memory.max = 200M - * A/B memory.min = 50M, memory.current = 50M + * A memory.min = 0, memory.max = 200M + * A/B memory.min = 50M * A/B/C memory.min = 75M, memory.current = 50M * A/B/D memory.min = 25M, memory.current = 50M * A/B/E memory.min = 0, memory.current = 50M * A/B/F memory.min = 500M, memory.current = 0 * - * Usages are pagecache, but the test keeps a running + * (or memory.low if we test soft protection) + * + * Usages are pagecache and the test keeps a running * process in every leaf cgroup. * Then it creates A/G and creates a significant - * memory pressure in it. + * memory pressure in A. * + * Then it checks actual memory usages and expects that: * A/B memory.current ~= 50M - * A/B/C memory.current ~= 33M - * A/B/D memory.current ~= 17M - * A/B/F memory.current ~= 0 + * A/B/C memory.current ~= 29M + * A/B/D memory.current ~= 21M + * A/B/E memory.current ~= 0 + * A/B/F memory.current = 0 + * (for origin of the numbers, see model in memcg_protection.m.) 
* * After that it tries to allocate more than there is - * unprotected memory in A available, and checks - * checks that memory.min protects pagecache even - * in this case. + * unprotected memory in A available, and checks that: + * a) memory.min protects pagecache even in this case, + * b) memory.low allows reclaiming page cache with low events. */ -static int test_memcg_min(const char *root) +static int test_memcg_protection(const char *root, bool min) { - int ret = KSFT_FAIL; + int ret = KSFT_FAIL, rc; char *parent[3] = {NULL}; char *children[4] = {NULL}; + const char *attribute = min ? "memory.min" : "memory.low"; long c[4]; int i, attempts; int fd; @@ -297,8 +296,10 @@ static int test_memcg_min(const char *root) if (cg_create(parent[0])) goto cleanup; - if (cg_read_long(parent[0], "memory.min")) { - ret = KSFT_SKIP; + if (cg_read_long(parent[0], attribute)) { + /* No memory.min on older kernels is fine */ + if (min) + ret = KSFT_SKIP; goto cleanup; } @@ -335,17 +336,15 @@ static int test_memcg_min(const char *root) (void *)(long)fd); } - if (cg_write(parent[0], "memory.min", "50M")) + if (cg_write(parent[1], attribute, "50M")) goto cleanup; - if (cg_write(parent[1], "memory.min", "50M")) + if (cg_write(children[0], attribute, "75M")) goto cleanup; - if (cg_write(children[0], "memory.min", "75M")) + if (cg_write(children[1], attribute, "25M")) goto cleanup; - if (cg_write(children[1], "memory.min", "25M")) + if (cg_write(children[2], attribute, "0")) goto cleanup; - if (cg_write(children[2], "memory.min", "0")) - goto cleanup; - if (cg_write(children[3], "memory.min", "500M")) + if (cg_write(children[3], attribute, "500M")) goto cleanup; attempts = 0; @@ -365,170 +364,35 @@ static int test_memcg_min(const char *root) for (i = 0; i < ARRAY_SIZE(children); i++) c[i] = cg_read_long(children[i], "memory.current"); - if (!values_close(c[0], MB(33), 10)) + if (!values_close(c[0], MB(29), 10)) goto cleanup; - if (!values_close(c[1], MB(17), 10)) + if (!values_close(c[1], MB(21), 10)) goto cleanup; if (c[3] != 0) goto cleanup; - if (!cg_run(parent[2], alloc_anon, (void *)MB(170))) - goto cleanup; - - if (!values_close(cg_read_long(parent[1], "memory.current"), MB(50), 3)) - goto cleanup; - - ret = KSFT_PASS; - -cleanup: - for (i = ARRAY_SIZE(children) - 1; i >= 0; i--) { - if (!children[i]) - continue; - - cg_destroy(children[i]); - free(children[i]); - } - - for (i = ARRAY_SIZE(parent) - 1; i >= 0; i--) { - if (!parent[i]) - continue; - - cg_destroy(parent[i]); - free(parent[i]); - } - close(fd); - return ret; -} - -/* - * First, this test creates the following hierarchy: - * A memory.low = 50M, memory.max = 200M - * A/B memory.low = 50M, memory.current = 50M - * A/B/C memory.low = 75M, memory.current = 50M - * A/B/D memory.low = 25M, memory.current = 50M - * A/B/E memory.low = 0, memory.current = 50M - * A/B/F memory.low = 500M, memory.current = 0 - * - * Usages are pagecache. - * Then it creates A/G an creates a significant - * memory pressure in it. - * - * Then it checks actual memory usages and expects that: - * A/B memory.current ~= 50M - * A/B/ memory.current ~= 33M - * A/B/D memory.current ~= 17M - * A/B/F memory.current ~= 0 - * - * After that it tries to allocate more than there is - * unprotected memory in A available, - * and checks low and oom events in memory.events. 
- */ -static int test_memcg_low(const char *root) -{ - int ret = KSFT_FAIL; - char *parent[3] = {NULL}; - char *children[4] = {NULL}; - long low, oom; - long c[4]; - int i; - int fd; - - fd = get_temp_fd(); - if (fd < 0) - goto cleanup; - - parent[0] = cg_name(root, "memcg_test_0"); - if (!parent[0]) - goto cleanup; - - parent[1] = cg_name(parent[0], "memcg_test_1"); - if (!parent[1]) - goto cleanup; - - parent[2] = cg_name(parent[0], "memcg_test_2"); - if (!parent[2]) - goto cleanup; - - if (cg_create(parent[0])) - goto cleanup; - - if (cg_read_long(parent[0], "memory.low")) - goto cleanup; - - if (cg_write(parent[0], "cgroup.subtree_control", "+memory")) - goto cleanup; - - if (cg_write(parent[0], "memory.max", "200M")) - goto cleanup; - - if (cg_write(parent[0], "memory.swap.max", "0")) - goto cleanup; - - if (cg_create(parent[1])) - goto cleanup; - - if (cg_write(parent[1], "cgroup.subtree_control", "+memory")) + rc = cg_run(parent[2], alloc_anon, (void *)MB(170)); + if (min && !rc) goto cleanup; - - if (cg_create(parent[2])) + else if (!min && rc) { + fprintf(stderr, + "memory.low prevents from allocating anon memory\n"); goto cleanup; - - for (i = 0; i < ARRAY_SIZE(children); i++) { - children[i] = cg_name_indexed(parent[1], "child_memcg", i); - if (!children[i]) - goto cleanup; - - if (cg_create(children[i])) - goto cleanup; - - if (i > 2) - continue; - - if (cg_run(children[i], alloc_pagecache_50M, (void *)(long)fd)) - goto cleanup; } - if (cg_write(parent[0], "memory.low", "50M")) - goto cleanup; - if (cg_write(parent[1], "memory.low", "50M")) - goto cleanup; - if (cg_write(children[0], "memory.low", "75M")) - goto cleanup; - if (cg_write(children[1], "memory.low", "25M")) - goto cleanup; - if (cg_write(children[2], "memory.low", "0")) - goto cleanup; - if (cg_write(children[3], "memory.low", "500M")) - goto cleanup; - - if (cg_run(parent[2], alloc_anon, (void *)MB(148))) - goto cleanup; - if (!values_close(cg_read_long(parent[1], "memory.current"), MB(50), 3)) goto cleanup; - for (i = 0; i < ARRAY_SIZE(children); i++) - c[i] = cg_read_long(children[i], "memory.current"); - - if (!values_close(c[0], MB(33), 10)) - goto cleanup; - - if (!values_close(c[1], MB(17), 10)) - goto cleanup; - - if (c[3] != 0) - goto cleanup; - - if (cg_run(parent[2], alloc_anon, (void *)MB(166))) { - fprintf(stderr, - "memory.low prevents from allocating anon memory\n"); + if (min) { + ret = KSFT_PASS; goto cleanup; } for (i = 0; i < ARRAY_SIZE(children); i++) { - int no_low_events_index = has_recursiveprot ? 2 : 1; + int no_low_events_index = 1; + long low, oom; oom = cg_read_key_long(children[i], "memory.events", "oom "); low = cg_read_key_long(children[i], "memory.events", "low "); @@ -564,6 +428,16 @@ cleanup: return ret; } +static int test_memcg_min(const char *root) +{ + return test_memcg_protection(root, true); +} + +static int test_memcg_low(const char *root) +{ + return test_memcg_protection(root, false); +} + static int alloc_pagecache_max_30M(const char *cgroup, void *arg) { size_t size = MB(50); @@ -1241,7 +1115,16 @@ static int test_memcg_oom_group_leaf_events(const char *root) if (cg_read_key_long(child, "memory.events", "oom_kill ") <= 0) goto cleanup; - if (cg_read_key_long(parent, "memory.events", "oom_kill ") <= 0) + parent_oom_events = cg_read_key_long( + parent, "memory.events", "oom_kill "); + /* + * If memory_localevents is not enabled (the default), the parent should + * count OOM events in its children groups. Otherwise, it should not + * have observed any events. 
+ */ + if (has_localevents && parent_oom_events != 0) + goto cleanup; + else if (!has_localevents && parent_oom_events <= 0) goto cleanup; ret = KSFT_PASS; @@ -1349,20 +1232,14 @@ static int test_memcg_oom_group_score_events(const char *root) if (!cg_run(memcg, alloc_anon, (void *)MB(100))) goto cleanup; - parent_oom_events = cg_read_key_long( - parent, "memory.events", "oom_kill "); - /* - * If memory_localevents is not enabled (the default), the parent should - * count OOM events in its children groups. Otherwise, it should not - * have observed any events. - */ - if ((has_localevents && parent_oom_events == 0) || - parent_oom_events > 0) - ret = KSFT_PASS; + if (cg_read_key_long(memcg, "memory.events", "oom_kill ") != 3) + goto cleanup; if (kill(safe_pid, SIGKILL)) goto cleanup; + ret = KSFT_PASS; + cleanup: if (memcg) cg_destroy(memcg); |
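
Editor's note: the memcg selftest changes above replace the hand-derived 33M/17M expectations with values taken from the new memcg_protection.m model. The sketch below (plain userspace C, not kernel code) redoes only the first, one-pass step of that model -- low_usage plus proportional scaling of overcommitted protection -- for the hierarchy used by test_memcg_protection(); per the script's own comments, the new 29M/21M memory.current expectations come from iterating this calculation together with reclaim.

#include <stdio.h>

#define MB(x) ((double)(x) * 1024 * 1024)

int main(void)
{
	/* Parent effective protection and the four children from the test:
	 * memory.low = 75M/25M/0/500M, memory.current = 50M/50M/50M/0. */
	double E = MB(50);
	double low[4] = { MB(75), MB(25), MB(0), MB(500) };
	double cur[4] = { MB(50), MB(50), MB(50), MB(0) };
	double usage = 0.0;
	int i;

	/* low_usage: a child can only claim protection it actually uses */
	for (i = 0; i < 4; i++)
		usage += cur[i] < low[i] ? cur[i] : low[i];

	/* one-pass effective protection: scale claims down when the siblings
	 * together claim more than the parent's effective protection E */
	for (i = 0; i < 4; i++) {
		double claim = cur[i] < low[i] ? cur[i] : low[i];
		double e = usage > E ? claim * E / usage : claim;

		printf("child %d: ~%.1f MB protected\n", i, e / MB(1));
	}
	/* Prints roughly 33.3, 16.7, 0 and 0 MB: the old test assumed
	 * consumption would settle exactly at these one-pass values, while
	 * the full reclaim simulation in memcg_protection.m produces the
	 * 29M/21M figures the updated test now checks. */
	return 0;
}
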