From 3b8abb3239530c423c0b97e42af7f7e856e1ee96 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Tue, 2 May 2023 09:08:38 -0700 Subject: mm: kmem: fix a NULL pointer dereference in obj_stock_flush_required() KCSAN found an issue in obj_stock_flush_required(): stock->cached_objcg can be reset between the check and dereference: ================================================================== BUG: KCSAN: data-race in drain_all_stock / drain_obj_stock write to 0xffff888237c2a2f8 of 8 bytes by task 19625 on cpu 0: drain_obj_stock+0x408/0x4e0 mm/memcontrol.c:3306 refill_obj_stock+0x9c/0x1e0 mm/memcontrol.c:3340 obj_cgroup_uncharge+0xe/0x10 mm/memcontrol.c:3408 memcg_slab_free_hook mm/slab.h:587 [inline] __cache_free mm/slab.c:3373 [inline] __do_kmem_cache_free mm/slab.c:3577 [inline] kmem_cache_free+0x105/0x280 mm/slab.c:3602 __d_free fs/dcache.c:298 [inline] dentry_free fs/dcache.c:375 [inline] __dentry_kill+0x422/0x4a0 fs/dcache.c:621 dentry_kill+0x8d/0x1e0 dput+0x118/0x1f0 fs/dcache.c:913 __fput+0x3bf/0x570 fs/file_table.c:329 ____fput+0x15/0x20 fs/file_table.c:349 task_work_run+0x123/0x160 kernel/task_work.c:179 resume_user_mode_work include/linux/resume_user_mode.h:49 [inline] exit_to_user_mode_loop+0xcf/0xe0 kernel/entry/common.c:171 exit_to_user_mode_prepare+0x6a/0xa0 kernel/entry/common.c:203 __syscall_exit_to_user_mode_work kernel/entry/common.c:285 [inline] syscall_exit_to_user_mode+0x26/0x140 kernel/entry/common.c:296 do_syscall_64+0x4d/0xc0 arch/x86/entry/common.c:86 entry_SYSCALL_64_after_hwframe+0x63/0xcd read to 0xffff888237c2a2f8 of 8 bytes by task 19632 on cpu 1: obj_stock_flush_required mm/memcontrol.c:3319 [inline] drain_all_stock+0x174/0x2a0 mm/memcontrol.c:2361 try_charge_memcg+0x6d0/0xd10 mm/memcontrol.c:2703 try_charge mm/memcontrol.c:2837 [inline] mem_cgroup_charge_skmem+0x51/0x140 mm/memcontrol.c:7290 sock_reserve_memory+0xb1/0x390 net/core/sock.c:1025 sk_setsockopt+0x800/0x1e70 net/core/sock.c:1525 udp_lib_setsockopt+0x99/0x6c0 net/ipv4/udp.c:2692 udp_setsockopt+0x73/0xa0 net/ipv4/udp.c:2817 sock_common_setsockopt+0x61/0x70 net/core/sock.c:3668 __sys_setsockopt+0x1c3/0x230 net/socket.c:2271 __do_sys_setsockopt net/socket.c:2282 [inline] __se_sys_setsockopt net/socket.c:2279 [inline] __x64_sys_setsockopt+0x66/0x80 net/socket.c:2279 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x41/0xc0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x63/0xcd value changed: 0xffff8881382d52c0 -> 0xffff888138893740 Reported by Kernel Concurrency Sanitizer on: CPU: 1 PID: 19632 Comm: syz-executor.0 Not tainted 6.3.0-rc2-syzkaller-00387-g534293368afa #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 03/02/2023 Fix it by using READ_ONCE()/WRITE_ONCE() for all accesses to stock->cached_objcg. Link: https://lkml.kernel.org/r/20230502160839.361544-1-roman.gushchin@linux.dev Fixes: bf4f059954dc ("mm: memcg/slab: obj_cgroup API") Signed-off-by: Roman Gushchin Reported-by: syzbot+774c29891415ab0fd29d@syzkaller.appspotmail.com Reported-by: Dmitry Vyukov Link: https://lore.kernel.org/linux-mm/CACT4Y+ZfucZhM60YPphWiCLJr6+SGFhT+jjm8k1P-a_8Kkxsjg@mail.gmail.com/T/#t Reviewed-by: Yosry Ahmed Acked-by: Shakeel Butt Reviewed-by: Dmitry Vyukov Signed-off-by: Andrew Morton --- mm/memcontrol.c | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) (limited to 'mm/memcontrol.c') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 4b27e245a055..c823c35c2ed4 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3208,12 +3208,12 @@ void mod_objcg_state(struct obj_cgroup *objcg, struct pglist_data *pgdat, * accumulating over a page of vmstat data or when pgdat or idx * changes. */ - if (stock->cached_objcg != objcg) { + if (READ_ONCE(stock->cached_objcg) != objcg) { old = drain_obj_stock(stock); obj_cgroup_get(objcg); stock->nr_bytes = atomic_read(&objcg->nr_charged_bytes) ? atomic_xchg(&objcg->nr_charged_bytes, 0) : 0; - stock->cached_objcg = objcg; + WRITE_ONCE(stock->cached_objcg, objcg); stock->cached_pgdat = pgdat; } else if (stock->cached_pgdat != pgdat) { /* Flush the existing cached vmstat data */ @@ -3267,7 +3267,7 @@ static bool consume_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes) local_lock_irqsave(&memcg_stock.stock_lock, flags); stock = this_cpu_ptr(&memcg_stock); - if (objcg == stock->cached_objcg && stock->nr_bytes >= nr_bytes) { + if (objcg == READ_ONCE(stock->cached_objcg) && stock->nr_bytes >= nr_bytes) { stock->nr_bytes -= nr_bytes; ret = true; } @@ -3279,7 +3279,7 @@ static bool consume_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes) static struct obj_cgroup *drain_obj_stock(struct memcg_stock_pcp *stock) { - struct obj_cgroup *old = stock->cached_objcg; + struct obj_cgroup *old = READ_ONCE(stock->cached_objcg); if (!old) return NULL; @@ -3332,7 +3332,7 @@ static struct obj_cgroup *drain_obj_stock(struct memcg_stock_pcp *stock) stock->cached_pgdat = NULL; } - stock->cached_objcg = NULL; + WRITE_ONCE(stock->cached_objcg, NULL); /* * The `old' objects needs to be released by the caller via * obj_cgroup_put() outside of memcg_stock_pcp::stock_lock. @@ -3343,10 +3343,11 @@ static struct obj_cgroup *drain_obj_stock(struct memcg_stock_pcp *stock) static bool obj_stock_flush_required(struct memcg_stock_pcp *stock, struct mem_cgroup *root_memcg) { + struct obj_cgroup *objcg = READ_ONCE(stock->cached_objcg); struct mem_cgroup *memcg; - if (stock->cached_objcg) { - memcg = obj_cgroup_memcg(stock->cached_objcg); + if (objcg) { + memcg = obj_cgroup_memcg(objcg); if (memcg && mem_cgroup_is_descendant(memcg, root_memcg)) return true; } @@ -3365,10 +3366,10 @@ static void refill_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes, local_lock_irqsave(&memcg_stock.stock_lock, flags); stock = this_cpu_ptr(&memcg_stock); - if (stock->cached_objcg != objcg) { /* reset if necessary */ + if (READ_ONCE(stock->cached_objcg) != objcg) { /* reset if necessary */ old = drain_obj_stock(stock); obj_cgroup_get(objcg); - stock->cached_objcg = objcg; + WRITE_ONCE(stock->cached_objcg, objcg); stock->nr_bytes = atomic_read(&objcg->nr_charged_bytes) ? atomic_xchg(&objcg->nr_charged_bytes, 0) : 0; allow_uncharge = true; /* Allow uncharge when objcg changes */ -- cgit v1.2.3 From f785a8f21a9cc46fced9f53c51a6f2dc647ed484 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Tue, 2 May 2023 09:08:39 -0700 Subject: mm: memcg: use READ_ONCE()/WRITE_ONCE() to access stock->cached A memcg pointer in the percpu stock can be accessed by drain_all_stock() from another cpu in a lockless way. In theory it might lead to an issue, similar to the one which has been discovered with stock->cached_objcg, where the pointer was zeroed between the check for being NULL and dereferencing. In this case the issue is unlikely a real problem, but to make it bulletproof and similar to stock->cached_objcg, let's annotate all accesses to stock->cached with READ_ONCE()/WTRITE_ONCE(). Link: https://lkml.kernel.org/r/20230502160839.361544-2-roman.gushchin@linux.dev Signed-off-by: Roman Gushchin Acked-by: Shakeel Butt Cc: Dmitry Vyukov Cc: Yosry Ahmed Signed-off-by: Andrew Morton --- mm/memcontrol.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'mm/memcontrol.c') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index c823c35c2ed4..1e364ad495a3 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2275,7 +2275,7 @@ static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages) local_lock_irqsave(&memcg_stock.stock_lock, flags); stock = this_cpu_ptr(&memcg_stock); - if (memcg == stock->cached && stock->nr_pages >= nr_pages) { + if (memcg == READ_ONCE(stock->cached) && stock->nr_pages >= nr_pages) { stock->nr_pages -= nr_pages; ret = true; } @@ -2290,7 +2290,7 @@ static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages) */ static void drain_stock(struct memcg_stock_pcp *stock) { - struct mem_cgroup *old = stock->cached; + struct mem_cgroup *old = READ_ONCE(stock->cached); if (!old) return; @@ -2303,7 +2303,7 @@ static void drain_stock(struct memcg_stock_pcp *stock) } css_put(&old->css); - stock->cached = NULL; + WRITE_ONCE(stock->cached, NULL); } static void drain_local_stock(struct work_struct *dummy) @@ -2338,10 +2338,10 @@ static void __refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages) struct memcg_stock_pcp *stock; stock = this_cpu_ptr(&memcg_stock); - if (stock->cached != memcg) { /* reset if necessary */ + if (READ_ONCE(stock->cached) != memcg) { /* reset if necessary */ drain_stock(stock); css_get(&memcg->css); - stock->cached = memcg; + WRITE_ONCE(stock->cached, memcg); } stock->nr_pages += nr_pages; @@ -2383,7 +2383,7 @@ static void drain_all_stock(struct mem_cgroup *root_memcg) bool flush = false; rcu_read_lock(); - memcg = stock->cached; + memcg = READ_ONCE(stock->cached); if (memcg && stock->nr_pages && mem_cgroup_is_descendant(memcg, root_memcg)) flush = true; -- cgit v1.2.3 From 5b42360c73b0679505dac6ec44d234cfec61120c Mon Sep 17 00:00:00 2001 From: Yosry Ahmed Date: Fri, 28 Apr 2023 13:24:05 +0000 Subject: memcg: use seq_buf_do_printk() with mem_cgroup_print_oom_meminfo() Currently, we format all the memcg stats into a buffer in mem_cgroup_print_oom_meminfo() and use pr_info() to dump it to the logs. However, this buffer is large in size. Although it is currently working as intended, ther is a dependency between the memcg stats buffer and the printk record size limit. If we add more stats in the future and the buffer becomes larger than the printk record size limit, or if the prink record size limit is reduced, the logs may be truncated. It is safer to use seq_buf_do_printk(), which will automatically break up the buffer at line breaks and issue small printk() calls. Refactor the code to move the seq_buf from memory_stat_format() to its callers, and use seq_buf_do_printk() to print the seq_buf in mem_cgroup_print_oom_meminfo(). Link: https://lkml.kernel.org/r/20230428132406.2540811-2-yosryahmed@google.com Signed-off-by: Yosry Ahmed Acked-by: Michal Hocko Reviewed-by: Sergey Senozhatsky Acked-by: Shakeel Butt Reviewed-by: Muchun Song Cc: Johannes Weiner Cc: Michal Hocko Cc: Petr Mladek Cc: Roman Gushchin Cc: Steven Rostedt (Google) Signed-off-by: Andrew Morton --- mm/memcontrol.c | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) (limited to 'mm/memcontrol.c') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 1e364ad495a3..a79a47f7fc20 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1580,13 +1580,10 @@ static inline unsigned long memcg_page_state_output(struct mem_cgroup *memcg, return memcg_page_state(memcg, item) * memcg_page_state_unit(item); } -static void memory_stat_format(struct mem_cgroup *memcg, char *buf, int bufsize) +static void memory_stat_format(struct mem_cgroup *memcg, struct seq_buf *s) { - struct seq_buf s; int i; - seq_buf_init(&s, buf, bufsize); - /* * Provide statistics on the state of the memory subsystem as * well as cumulative event counters that show past behavior. @@ -1603,21 +1600,21 @@ static void memory_stat_format(struct mem_cgroup *memcg, char *buf, int bufsize) u64 size; size = memcg_page_state_output(memcg, memory_stats[i].idx); - seq_buf_printf(&s, "%s %llu\n", memory_stats[i].name, size); + seq_buf_printf(s, "%s %llu\n", memory_stats[i].name, size); if (unlikely(memory_stats[i].idx == NR_SLAB_UNRECLAIMABLE_B)) { size += memcg_page_state_output(memcg, NR_SLAB_RECLAIMABLE_B); - seq_buf_printf(&s, "slab %llu\n", size); + seq_buf_printf(s, "slab %llu\n", size); } } /* Accumulated memory events */ - seq_buf_printf(&s, "pgscan %lu\n", + seq_buf_printf(s, "pgscan %lu\n", memcg_events(memcg, PGSCAN_KSWAPD) + memcg_events(memcg, PGSCAN_DIRECT) + memcg_events(memcg, PGSCAN_KHUGEPAGED)); - seq_buf_printf(&s, "pgsteal %lu\n", + seq_buf_printf(s, "pgsteal %lu\n", memcg_events(memcg, PGSTEAL_KSWAPD) + memcg_events(memcg, PGSTEAL_DIRECT) + memcg_events(memcg, PGSTEAL_KHUGEPAGED)); @@ -1627,13 +1624,13 @@ static void memory_stat_format(struct mem_cgroup *memcg, char *buf, int bufsize) memcg_vm_event_stat[i] == PGPGOUT) continue; - seq_buf_printf(&s, "%s %lu\n", + seq_buf_printf(s, "%s %lu\n", vm_event_name(memcg_vm_event_stat[i]), memcg_events(memcg, memcg_vm_event_stat[i])); } /* The above should easily fit into one page */ - WARN_ON_ONCE(seq_buf_has_overflowed(&s)); + WARN_ON_ONCE(seq_buf_has_overflowed(s)); } #define K(x) ((x) << (PAGE_SHIFT-10)) @@ -1671,6 +1668,7 @@ void mem_cgroup_print_oom_meminfo(struct mem_cgroup *memcg) { /* Use static buffer, for the caller is holding oom_lock. */ static char buf[PAGE_SIZE]; + struct seq_buf s; lockdep_assert_held(&oom_lock); @@ -1693,8 +1691,9 @@ void mem_cgroup_print_oom_meminfo(struct mem_cgroup *memcg) pr_info("Memory cgroup stats for "); pr_cont_cgroup_path(memcg->css.cgroup); pr_cont(":"); - memory_stat_format(memcg, buf, sizeof(buf)); - pr_info("%s", buf); + seq_buf_init(&s, buf, sizeof(buf)); + memory_stat_format(memcg, &s); + seq_buf_do_printk(&s, KERN_INFO); } /* @@ -6635,10 +6634,12 @@ static int memory_stat_show(struct seq_file *m, void *v) { struct mem_cgroup *memcg = mem_cgroup_from_seq(m); char *buf = kmalloc(PAGE_SIZE, GFP_KERNEL); + struct seq_buf s; if (!buf) return -ENOMEM; - memory_stat_format(memcg, buf, PAGE_SIZE); + seq_buf_init(&s, buf, PAGE_SIZE); + memory_stat_format(memcg, &s); seq_puts(m, buf); kfree(buf); return 0; -- cgit v1.2.3 From dddb44ffa0d59c8a3f2a5cb9690ccebe3150810c Mon Sep 17 00:00:00 2001 From: Yosry Ahmed Date: Fri, 28 Apr 2023 13:24:06 +0000 Subject: memcg: dump memory.stat during cgroup OOM for v1 Patch series "memcg: OOM log improvements", v2. This short patch series brings back some cgroup v1 stats in OOM logs that were unnecessarily changed before. It also makes memcg OOM logs less reliant on printk() internals. This patch (of 2): Commit c8713d0b2312 ("mm: memcontrol: dump memory.stat during cgroup OOM") made sure we dump all the stats in memory.stat during a cgroup OOM, but it also introduced a slight behavioral change. The code used to print the non-hierarchical v1 cgroup stats for the entire cgroup subtree, now it only prints the v2 cgroup stats for the cgroup under OOM. For cgroup v1 users, this introduces a few problems: (a) The non-hierarchical stats of the memcg under OOM are no longer shown. (b) A couple of v1-only stats (e.g. pgpgin, pgpgout) are no longer shown. (c) We show the list of cgroup v2 stats, even in cgroup v1. This list of stats is not tracked with v1 in mind. While most of the stats seem to be working on v1, there may be some stats that are not fully or correctly tracked. Although OOM log is not set in stone, we should not change it for no reason. When upgrading the kernel version to a version including commit c8713d0b2312 ("mm: memcontrol: dump memory.stat during cgroup OOM"), these behavioral changes are noticed in cgroup v1. The fix is simple. Commit c8713d0b2312 ("mm: memcontrol: dump memory.stat during cgroup OOM") separated stats formatting from stats display for v2, to reuse the stats formatting in the OOM logs. Do the same for v1. Move the v2 specific formatting from memory_stat_format() to memcg_stat_format(), add memcg1_stat_format() for v1, and make memory_stat_format() select between them based on cgroup version. Since memory_stat_show() now works for both v1 & v2, drop memcg_stat_show(). Link: https://lkml.kernel.org/r/20230428132406.2540811-1-yosryahmed@google.com Link: https://lkml.kernel.org/r/20230428132406.2540811-3-yosryahmed@google.com Signed-off-by: Yosry Ahmed Acked-by: Shakeel Butt Acked-by: Michal Hocko Cc: Johannes Weiner Cc: Muchun Song Cc: Petr Mladek Cc: Roman Gushchin Cc: Sergey Senozhatsky Cc: Steven Rostedt (Google) Cc: Michal Hocko Signed-off-by: Andrew Morton --- mm/memcontrol.c | 60 +++++++++++++++++++++++++++++++++------------------------ 1 file changed, 35 insertions(+), 25 deletions(-) (limited to 'mm/memcontrol.c') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index a79a47f7fc20..a05c53ab5238 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1580,7 +1580,7 @@ static inline unsigned long memcg_page_state_output(struct mem_cgroup *memcg, return memcg_page_state(memcg, item) * memcg_page_state_unit(item); } -static void memory_stat_format(struct mem_cgroup *memcg, struct seq_buf *s) +static void memcg_stat_format(struct mem_cgroup *memcg, struct seq_buf *s) { int i; @@ -1633,6 +1633,17 @@ static void memory_stat_format(struct mem_cgroup *memcg, struct seq_buf *s) WARN_ON_ONCE(seq_buf_has_overflowed(s)); } +static void memcg1_stat_format(struct mem_cgroup *memcg, struct seq_buf *s); + +static void memory_stat_format(struct mem_cgroup *memcg, struct seq_buf *s) +{ + if (cgroup_subsys_on_dfl(memory_cgrp_subsys)) + memcg_stat_format(memcg, s); + else + memcg1_stat_format(memcg, s); + WARN_ON_ONCE(seq_buf_has_overflowed(s)); +} + #define K(x) ((x) << (PAGE_SHIFT-10)) /** * mem_cgroup_print_oom_context: Print OOM information relevant to @@ -4135,9 +4146,8 @@ static const unsigned int memcg1_events[] = { PGMAJFAULT, }; -static int memcg_stat_show(struct seq_file *m, void *v) +static void memcg1_stat_format(struct mem_cgroup *memcg, struct seq_buf *s) { - struct mem_cgroup *memcg = mem_cgroup_from_seq(m); unsigned long memory, memsw; struct mem_cgroup *mi; unsigned int i; @@ -4152,18 +4162,18 @@ static int memcg_stat_show(struct seq_file *m, void *v) if (memcg1_stats[i] == MEMCG_SWAP && !do_memsw_account()) continue; nr = memcg_page_state_local(memcg, memcg1_stats[i]); - seq_printf(m, "%s %lu\n", memcg1_stat_names[i], + seq_buf_printf(s, "%s %lu\n", memcg1_stat_names[i], nr * memcg_page_state_unit(memcg1_stats[i])); } for (i = 0; i < ARRAY_SIZE(memcg1_events); i++) - seq_printf(m, "%s %lu\n", vm_event_name(memcg1_events[i]), - memcg_events_local(memcg, memcg1_events[i])); + seq_buf_printf(s, "%s %lu\n", vm_event_name(memcg1_events[i]), + memcg_events_local(memcg, memcg1_events[i])); for (i = 0; i < NR_LRU_LISTS; i++) - seq_printf(m, "%s %lu\n", lru_list_name(i), - memcg_page_state_local(memcg, NR_LRU_BASE + i) * - PAGE_SIZE); + seq_buf_printf(s, "%s %lu\n", lru_list_name(i), + memcg_page_state_local(memcg, NR_LRU_BASE + i) * + PAGE_SIZE); /* Hierarchical information */ memory = memsw = PAGE_COUNTER_MAX; @@ -4171,11 +4181,11 @@ static int memcg_stat_show(struct seq_file *m, void *v) memory = min(memory, READ_ONCE(mi->memory.max)); memsw = min(memsw, READ_ONCE(mi->memsw.max)); } - seq_printf(m, "hierarchical_memory_limit %llu\n", - (u64)memory * PAGE_SIZE); + seq_buf_printf(s, "hierarchical_memory_limit %llu\n", + (u64)memory * PAGE_SIZE); if (do_memsw_account()) - seq_printf(m, "hierarchical_memsw_limit %llu\n", - (u64)memsw * PAGE_SIZE); + seq_buf_printf(s, "hierarchical_memsw_limit %llu\n", + (u64)memsw * PAGE_SIZE); for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) { unsigned long nr; @@ -4183,19 +4193,19 @@ static int memcg_stat_show(struct seq_file *m, void *v) if (memcg1_stats[i] == MEMCG_SWAP && !do_memsw_account()) continue; nr = memcg_page_state(memcg, memcg1_stats[i]); - seq_printf(m, "total_%s %llu\n", memcg1_stat_names[i], + seq_buf_printf(s, "total_%s %llu\n", memcg1_stat_names[i], (u64)nr * memcg_page_state_unit(memcg1_stats[i])); } for (i = 0; i < ARRAY_SIZE(memcg1_events); i++) - seq_printf(m, "total_%s %llu\n", - vm_event_name(memcg1_events[i]), - (u64)memcg_events(memcg, memcg1_events[i])); + seq_buf_printf(s, "total_%s %llu\n", + vm_event_name(memcg1_events[i]), + (u64)memcg_events(memcg, memcg1_events[i])); for (i = 0; i < NR_LRU_LISTS; i++) - seq_printf(m, "total_%s %llu\n", lru_list_name(i), - (u64)memcg_page_state(memcg, NR_LRU_BASE + i) * - PAGE_SIZE); + seq_buf_printf(s, "total_%s %llu\n", lru_list_name(i), + (u64)memcg_page_state(memcg, NR_LRU_BASE + i) * + PAGE_SIZE); #ifdef CONFIG_DEBUG_VM { @@ -4210,12 +4220,10 @@ static int memcg_stat_show(struct seq_file *m, void *v) anon_cost += mz->lruvec.anon_cost; file_cost += mz->lruvec.file_cost; } - seq_printf(m, "anon_cost %lu\n", anon_cost); - seq_printf(m, "file_cost %lu\n", file_cost); + seq_buf_printf(s, "anon_cost %lu\n", anon_cost); + seq_buf_printf(s, "file_cost %lu\n", file_cost); } #endif - - return 0; } static u64 mem_cgroup_swappiness_read(struct cgroup_subsys_state *css, @@ -5059,6 +5067,8 @@ static int mem_cgroup_slab_show(struct seq_file *m, void *p) } #endif +static int memory_stat_show(struct seq_file *m, void *v); + static struct cftype mem_cgroup_legacy_files[] = { { .name = "usage_in_bytes", @@ -5091,7 +5101,7 @@ static struct cftype mem_cgroup_legacy_files[] = { }, { .name = "stat", - .seq_show = memcg_stat_show, + .seq_show = memory_stat_show, }, { .name = "force_empty", -- cgit v1.2.3 From 190409caaf7e3eee6926943488e486590efe6fde Mon Sep 17 00:00:00 2001 From: Yosry Ahmed Date: Fri, 21 Apr 2023 17:40:17 +0000 Subject: memcg: flush stats non-atomically in mem_cgroup_wb_stats() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The previous patch moved the wb_over_bg_thresh()->mem_cgroup_wb_stats() code path in wb_writeback() outside the lock section. We no longer need to flush the stats atomically. Flush the stats non-atomically. Link: https://lkml.kernel.org/r/20230421174020.2994750-3-yosryahmed@google.com Signed-off-by: Yosry Ahmed Reviewed-by: Michal Koutný Acked-by: Shakeel Butt Acked-by: Tejun Heo Cc: Alexander Viro Cc: Christian Brauner Cc: Jan Kara Cc: Jens Axboe Cc: Johannes Weiner Cc: Michal Hocko Cc: Muchun Song Cc: Roman Gushchin Signed-off-by: Andrew Morton --- mm/memcontrol.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'mm/memcontrol.c') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index a05c53ab5238..929162c5f45f 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -4656,11 +4656,7 @@ void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages, struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css); struct mem_cgroup *parent; - /* - * wb_writeback() takes a spinlock and calls - * wb_over_bg_thresh()->mem_cgroup_wb_stats(). Do not sleep. - */ - mem_cgroup_flush_stats_atomic(); + mem_cgroup_flush_stats(); *pdirty = memcg_page_state(memcg, NR_FILE_DIRTY); *pwriteback = memcg_page_state(memcg, NR_WRITEBACK); -- cgit v1.2.3 From f82a7a86dbfbd0ee81a4907ca41ba78bda1846f4 Mon Sep 17 00:00:00 2001 From: Yosry Ahmed Date: Fri, 21 Apr 2023 17:40:18 +0000 Subject: memcg: calculate root usage from global state MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently, we approximate the root usage by adding the memcg stats for anon, file, and conditionally swap (for memsw). To read the memcg stats we need to invoke an rstat flush. rstat flushes can be expensive, they scale with the number of cpus and cgroups on the system. mem_cgroup_usage() is called by memcg_events()->mem_cgroup_threshold() with irqs disabled, so such an expensive operation with irqs disabled can cause problems. Instead, approximate the root usage from global state. This is not 100% accurate, but the root usage has always been ill-defined anyway. Link: https://lkml.kernel.org/r/20230421174020.2994750-4-yosryahmed@google.com Signed-off-by: Yosry Ahmed Reviewed-by: Michal Koutný Acked-by: Shakeel Butt Cc: Alexander Viro Cc: Christian Brauner Cc: Jan Kara Cc: Jens Axboe Cc: Johannes Weiner Cc: Michal Hocko Cc: Muchun Song Cc: Roman Gushchin Cc: Tejun Heo Signed-off-by: Andrew Morton --- mm/memcontrol.c | 24 +++++------------------- 1 file changed, 5 insertions(+), 19 deletions(-) (limited to 'mm/memcontrol.c') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 929162c5f45f..7474aa8e4026 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3710,27 +3710,13 @@ static unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap) if (mem_cgroup_is_root(memcg)) { /* - * We can reach here from irq context through: - * uncharge_batch() - * |--memcg_check_events() - * |--mem_cgroup_threshold() - * |--__mem_cgroup_threshold() - * |--mem_cgroup_usage - * - * rstat flushing is an expensive operation that should not be - * done from irq context; use stale stats in this case. - * Arguably, usage threshold events are not reliable on the root - * memcg anyway since its usage is ill-defined. - * - * Additionally, other call paths through memcg_check_events() - * disable irqs, so make sure we are flushing stats atomically. + * Approximate root's usage from global state. This isn't + * perfect, but the root usage was always an approximation. */ - if (in_task()) - mem_cgroup_flush_stats_atomic(); - val = memcg_page_state(memcg, NR_FILE_PAGES) + - memcg_page_state(memcg, NR_ANON_MAPPED); + val = global_node_page_state(NR_FILE_PAGES) + + global_node_page_state(NR_ANON_MAPPED); if (swap) - val += memcg_page_state(memcg, MEMCG_SWAP); + val += total_swap_pages - get_nr_swap_pages(); } else { if (!swap) val = page_counter_read(&memcg->memory); -- cgit v1.2.3 From 35822fdae3bf509532b0954088070f17de51ff15 Mon Sep 17 00:00:00 2001 From: Yosry Ahmed Date: Fri, 21 Apr 2023 17:40:19 +0000 Subject: memcg: remove mem_cgroup_flush_stats_atomic() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Previous patches removed all callers of mem_cgroup_flush_stats_atomic(). Remove the function and simplify the code. Link: https://lkml.kernel.org/r/20230421174020.2994750-5-yosryahmed@google.com Signed-off-by: Yosry Ahmed Acked-by: Shakeel Butt Cc: Alexander Viro Cc: Christian Brauner Cc: Jan Kara Cc: Jens Axboe Cc: Johannes Weiner Cc: Michal Hocko Cc: Michal Koutný Cc: Muchun Song Cc: Roman Gushchin Cc: Tejun Heo Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 5 ----- mm/memcontrol.c | 24 +++++------------------- 2 files changed, 5 insertions(+), 24 deletions(-) (limited to 'mm/memcontrol.c') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 222d7370134c..00a88cf947e1 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -1038,7 +1038,6 @@ static inline unsigned long lruvec_page_state_local(struct lruvec *lruvec, } void mem_cgroup_flush_stats(void); -void mem_cgroup_flush_stats_atomic(void); void mem_cgroup_flush_stats_ratelimited(void); void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, @@ -1537,10 +1536,6 @@ static inline void mem_cgroup_flush_stats(void) { } -static inline void mem_cgroup_flush_stats_atomic(void) -{ -} - static inline void mem_cgroup_flush_stats_ratelimited(void) { } diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 7474aa8e4026..2184a9c566f1 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -639,7 +639,7 @@ static inline void memcg_rstat_updated(struct mem_cgroup *memcg, int val) } } -static void do_flush_stats(bool atomic) +static void do_flush_stats(void) { /* * We always flush the entire tree, so concurrent flushers can just @@ -652,30 +652,16 @@ static void do_flush_stats(bool atomic) WRITE_ONCE(flush_next_time, jiffies_64 + 2*FLUSH_TIME); - if (atomic) - cgroup_rstat_flush_atomic(root_mem_cgroup->css.cgroup); - else - cgroup_rstat_flush(root_mem_cgroup->css.cgroup); + cgroup_rstat_flush(root_mem_cgroup->css.cgroup); atomic_set(&stats_flush_threshold, 0); atomic_set(&stats_flush_ongoing, 0); } -static bool should_flush_stats(void) -{ - return atomic_read(&stats_flush_threshold) > num_online_cpus(); -} - void mem_cgroup_flush_stats(void) { - if (should_flush_stats()) - do_flush_stats(false); -} - -void mem_cgroup_flush_stats_atomic(void) -{ - if (should_flush_stats()) - do_flush_stats(true); + if (atomic_read(&stats_flush_threshold) > num_online_cpus()) + do_flush_stats(); } void mem_cgroup_flush_stats_ratelimited(void) @@ -690,7 +676,7 @@ static void flush_memcg_stats_dwork(struct work_struct *w) * Always flush here so that flushing in latency-sensitive paths is * as cheap as possible. */ - do_flush_stats(false); + do_flush_stats(); queue_delayed_work(system_unbound_wq, &stats_flush_dwork, FLUSH_TIME); } -- cgit v1.2.3 From 857f21397f7113e4f764aa65fb0160eb7f404808 Mon Sep 17 00:00:00 2001 From: Haifeng Xu Date: Wed, 19 Apr 2023 03:07:38 +0000 Subject: memcg, oom: remove unnecessary check in mem_cgroup_oom_synchronize() mem_cgroup_oom_synchronize() is only used when the memcg oom handling is handed over to the edge of the #PF path. Since commit 29ef680ae7c2 ("memcg, oom: move out_of_memory back to the charge path") this is the case only when the kernel memcg oom killer is disabled (current->memcg_in_oom is only set if memcg->oom_kill_disable). Therefore a check for oom_kill_disable in mem_cgroup_oom_synchronize() is not required. Link: https://lkml.kernel.org/r/20230419030739.115845-1-haifeng.xu@shopee.com Signed-off-by: Haifeng Xu Suggested-by: Michal Hocko Acked-by: Michal Hocko Cc: Johannes Weiner Cc: Roman Gushchin Cc: Shakeel Butt Signed-off-by: Andrew Morton --- mm/memcontrol.c | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) (limited to 'mm/memcontrol.c') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 2184a9c566f1..e8aead97454b 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2024,16 +2024,9 @@ bool mem_cgroup_oom_synchronize(bool handle) if (locked) mem_cgroup_oom_notify(memcg); - if (locked && !READ_ONCE(memcg->oom_kill_disable)) { - mem_cgroup_unmark_under_oom(memcg); - finish_wait(&memcg_oom_waitq, &owait.wait); - mem_cgroup_out_of_memory(memcg, current->memcg_oom_gfp_mask, - current->memcg_oom_order); - } else { - schedule(); - mem_cgroup_unmark_under_oom(memcg); - finish_wait(&memcg_oom_waitq, &owait.wait); - } + schedule(); + mem_cgroup_unmark_under_oom(memcg); + finish_wait(&memcg_oom_waitq, &owait.wait); if (locked) { mem_cgroup_oom_unlock(memcg); -- cgit v1.2.3 From 18b1d18bc2bd8f54e8df6bc8096185361a6d1b15 Mon Sep 17 00:00:00 2001 From: Haifeng Xu Date: Wed, 19 Apr 2023 03:07:39 +0000 Subject: memcg, oom: remove explicit wakeup in mem_cgroup_oom_synchronize() Before commit 29ef680ae7c2 ("memcg, oom: move out_of_memory back to the charge path"), all memcg oom killers were delayed to page fault path. And the explicit wakeup is used in this case: thread A: ... if (locked) { // complete oom-kill, hold the lock mem_cgroup_oom_unlock(memcg); ... } ... thread B: ... if (locked && !memcg->oom_kill_disable) { ... } else { schedule(); // can't acquire the lock ... } ... The reason is that thread A kicks off the OOM-killer, which leads to wakeups from the uncharges of the exiting task. But thread B is not guaranteed to see them if it enters the OOM path after the OOM kills but before thread A releases the lock. Now only oom_kill_disable case is handled from the #PF path. In that case it is userspace to trigger the wake up not the #PF path itself. All potential paths to free some charges are responsible to call memcg_oom_recover() , so the explicit wakeup is not needed in the mem_cgroup_oom_synchronize() path which doesn't release any memory itself. Link: https://lkml.kernel.org/r/20230419030739.115845-2-haifeng.xu@shopee.com Signed-off-by: Haifeng Xu Suggested-by: Michal Hocko Acked-by: Michal Hocko Cc: Johannes Weiner Cc: Roman Gushchin Cc: Shakeel Butt Signed-off-by: Andrew Morton --- mm/memcontrol.c | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) (limited to 'mm/memcontrol.c') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index e8aead97454b..d31fb1e2cb33 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2028,15 +2028,8 @@ bool mem_cgroup_oom_synchronize(bool handle) mem_cgroup_unmark_under_oom(memcg); finish_wait(&memcg_oom_waitq, &owait.wait); - if (locked) { + if (locked) mem_cgroup_oom_unlock(memcg); - /* - * There is no guarantee that an OOM-lock contender - * sees the wakeups triggered by the OOM kill - * uncharges. Wake any sleepers explicitly. - */ - memcg_oom_recover(memcg); - } cleanup: current->memcg_in_oom = NULL; css_put(&memcg->css); -- cgit v1.2.3 From 08e0f49e9991b175f2cda7ba32a7e9f1320dcbad Mon Sep 17 00:00:00 2001 From: Haifeng Xu Date: Mon, 22 May 2023 09:52:32 +0000 Subject: mm/memcontrol: fix typo in comment Replace 'then' with 'than'. Link: https://lkml.kernel.org/r/20230522095233.4246-1-haifeng.xu@shopee.com Signed-off-by: Haifeng Xu Cc: Johannes Weiner Cc: Michal Hocko Cc: Roman Gushchin Cc: Shakeel Butt Signed-off-by: Andrew Morton --- mm/memcontrol.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm/memcontrol.c') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index d31fb1e2cb33..7c681492b47b 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -6862,7 +6862,7 @@ static unsigned long effective_protection(unsigned long usage, protected = min(usage, setting); /* * If all cgroups at this level combined claim and use more - * protection then what the parent affords them, distribute + * protection than what the parent affords them, distribute * shares in proportion to utilization. * * We are using actual utilization rather than the statically -- cgit v1.2.3 From 5c7e7a0d79072eb02780a2c0dee730b23cde711d Mon Sep 17 00:00:00 2001 From: "T.J. Alumbaugh" Date: Mon, 22 May 2023 11:20:56 +0000 Subject: mm: multi-gen LRU: cleanup lru_gen_soft_reclaim() lru_gen_soft_reclaim() gets the lruvec from the memcg and node ID to keep a cleaner interface on the caller side. Link: https://lkml.kernel.org/r/20230522112058.2965866-2-talumbau@google.com Signed-off-by: T.J. Alumbaugh Reviewed-by: Yuanchu Xie Cc: David Hildenbrand Cc: Yu Zhao Signed-off-by: Andrew Morton --- include/linux/mmzone.h | 4 ++-- mm/memcontrol.c | 2 +- mm/vmscan.c | 4 +++- 3 files changed, 6 insertions(+), 4 deletions(-) (limited to 'mm/memcontrol.c') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 3a68326c9989..5a7ada0413da 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -534,7 +534,7 @@ void lru_gen_exit_memcg(struct mem_cgroup *memcg); void lru_gen_online_memcg(struct mem_cgroup *memcg); void lru_gen_offline_memcg(struct mem_cgroup *memcg); void lru_gen_release_memcg(struct mem_cgroup *memcg); -void lru_gen_soft_reclaim(struct lruvec *lruvec); +void lru_gen_soft_reclaim(struct mem_cgroup *memcg, int nid); #else /* !CONFIG_MEMCG */ @@ -585,7 +585,7 @@ static inline void lru_gen_release_memcg(struct mem_cgroup *memcg) { } -static inline void lru_gen_soft_reclaim(struct lruvec *lruvec) +static inline void lru_gen_soft_reclaim(struct mem_cgroup *memcg, int nid) { } diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 7c681492b47b..6a3d4ce87b8a 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -485,7 +485,7 @@ static void mem_cgroup_update_tree(struct mem_cgroup *memcg, int nid) if (lru_gen_enabled()) { if (soft_limit_excess(memcg)) - lru_gen_soft_reclaim(&memcg->nodeinfo[nid]->lruvec); + lru_gen_soft_reclaim(memcg, nid); return; } diff --git a/mm/vmscan.c b/mm/vmscan.c index cafb933d609f..a51a7e0f8b63 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -4846,8 +4846,10 @@ void lru_gen_release_memcg(struct mem_cgroup *memcg) } } -void lru_gen_soft_reclaim(struct lruvec *lruvec) +void lru_gen_soft_reclaim(struct mem_cgroup *memcg, int nid) { + struct lruvec *lruvec = get_lruvec(memcg, nid); + /* see the comment on MEMCG_NR_GENS */ if (lru_gen_memcg_seg(lruvec) != MEMCG_LRU_HEAD) lru_gen_rotate_memcg(lruvec, MEMCG_LRU_HEAD); -- cgit v1.2.3 From e0e0b4126c1f1effd480777507a61bd09360dc8f Mon Sep 17 00:00:00 2001 From: "Lars R. Damerow" Date: Wed, 24 May 2023 11:17:33 -0700 Subject: mm/memcontrol: export memcg.swap watermark via sysfs for v2 memcg This patch is similar to commit 8e20d4b33266 ("mm/memcontrol: export memcg->watermark via sysfs for v2 memcg"), but exports the swap counter's watermark. We allocate jobs to our compute farm using heuristics determined by memory and swap usage from previous jobs. Tracking the peak swap usage for new jobs is important for determining when jobs are exceeding their expected bounds, or when our baseline metrics are getting outdated. Our toolset was written to use the "memory.memsw.max_usage_in_bytes" file in cgroups v1, and altering it to poll cgroups v2's "memory.swap.current" would give less accurate results as well as add complication to the code. Having this watermark exposed in sysfs is much preferred. Link: https://lkml.kernel.org/r/20230524181734.125696-1-lars@pixar.com Signed-off-by: Lars R. Damerow Acked-by: Johannes Weiner Cc: Jonathan Corbet Cc: Michal Hocko Cc: Muchun Song Cc: Roman Gushchin Cc: Shakeel Butt Cc: Tejun Heo Cc: Zefan Li Signed-off-by: Andrew Morton --- Documentation/admin-guide/cgroup-v2.rst | 7 +++++++ mm/memcontrol.c | 13 +++++++++++++ 2 files changed, 20 insertions(+) (limited to 'mm/memcontrol.c') diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst index f67c0829350b..1ffe019483ac 100644 --- a/Documentation/admin-guide/cgroup-v2.rst +++ b/Documentation/admin-guide/cgroup-v2.rst @@ -1582,6 +1582,13 @@ PAGE_SIZE multiple when read back. Healthy workloads are not expected to reach this limit. + memory.swap.peak + A read-only single value file which exists on non-root + cgroups. + + The max swap usage recorded for the cgroup and its + descendants since the creation of the cgroup. + memory.swap.max A read-write single value file which exists on non-root cgroups. The default is "max". diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 6a3d4ce87b8a..6ee433be4c3b 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -7622,6 +7622,14 @@ static u64 swap_current_read(struct cgroup_subsys_state *css, return (u64)page_counter_read(&memcg->swap) * PAGE_SIZE; } +static u64 swap_peak_read(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(css); + + return (u64)memcg->swap.watermark * PAGE_SIZE; +} + static int swap_high_show(struct seq_file *m, void *v) { return seq_puts_memcg_tunable(m, @@ -7700,6 +7708,11 @@ static struct cftype swap_files[] = { .seq_show = swap_max_show, .write = swap_max_write, }, + { + .name = "swap.peak", + .flags = CFTYPE_NOT_ON_ROOT, + .read_u64 = swap_peak_read, + }, { .name = "swap.events", .flags = CFTYPE_NOT_ON_ROOT, -- cgit v1.2.3 From 396faf88981917f975773d8589ce4f6e6c679e13 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Sat, 3 Jun 2023 15:21:16 +0800 Subject: memcg: use helper macro FLUSH_TIME Use helper macro FLUSH_TIME to indicate the flush time to improve the readability a bit. No functional change intended. Link: https://lkml.kernel.org/r/20230603072116.1101690-1-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Acked-by: Shakeel Butt Reviewed-by: Muchun Song Reviewed-by: David Hildenbrand Acked-by: Roman Gushchin Cc: Johannes Weiner Cc: Michal Hocko Signed-off-by: Andrew Morton --- mm/memcontrol.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm/memcontrol.c') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 6ee433be4c3b..caf6ab55f8e3 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -5428,7 +5428,7 @@ static int mem_cgroup_css_online(struct cgroup_subsys_state *css) if (unlikely(mem_cgroup_is_root(memcg))) queue_delayed_work(system_unbound_wq, &stats_flush_dwork, - 2UL*HZ); + FLUSH_TIME); lru_gen_online_memcg(memcg); return 0; offline_kmem: -- cgit v1.2.3 From 04dee9e85cf50a2f24738e456d66b88de109b806 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Thu, 8 Jun 2023 18:29:22 -0700 Subject: mm/various: give up if pte_offset_map[_lock]() fails MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Following the examples of nearby code, various functions can just give up if pte_offset_map() or pte_offset_map_lock() fails. And there's no need for a preliminary pmd_trans_unstable() or other such check, since such cases are now safely handled inside. Link: https://lkml.kernel.org/r/7b9bd85d-1652-cbf2-159d-f503b45e5b@google.com Signed-off-by: Hugh Dickins Cc: Alistair Popple Cc: Anshuman Khandual Cc: Axel Rasmussen Cc: Christophe Leroy Cc: Christoph Hellwig Cc: David Hildenbrand Cc: "Huang, Ying" Cc: Ira Weiny Cc: Jason Gunthorpe Cc: Kirill A. Shutemov Cc: Lorenzo Stoakes Cc: Matthew Wilcox Cc: Mel Gorman Cc: Miaohe Lin Cc: Mike Kravetz Cc: Mike Rapoport (IBM) Cc: Minchan Kim Cc: Naoya Horiguchi Cc: Pavel Tatashin Cc: Peter Xu Cc: Peter Zijlstra Cc: Qi Zheng Cc: Ralph Campbell Cc: Ryan Roberts Cc: SeongJae Park Cc: Song Liu Cc: Steven Price Cc: Suren Baghdasaryan Cc: Thomas Hellström Cc: Will Deacon Cc: Yang Shi Cc: Yu Zhao Cc: Zack Rusin Signed-off-by: Andrew Morton --- mm/gup.c | 9 ++++++--- mm/ksm.c | 7 ++++--- mm/memcontrol.c | 8 ++++---- mm/memory-failure.c | 8 +++++--- mm/migrate.c | 3 +++ 5 files changed, 22 insertions(+), 13 deletions(-) (limited to 'mm/memcontrol.c') diff --git a/mm/gup.c b/mm/gup.c index d448fd286b8c..598e8c98367b 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -545,10 +545,10 @@ static struct page *follow_page_pte(struct vm_area_struct *vma, if (WARN_ON_ONCE((flags & (FOLL_PIN | FOLL_GET)) == (FOLL_PIN | FOLL_GET))) return ERR_PTR(-EINVAL); - if (unlikely(pmd_bad(*pmd))) - return no_page_table(vma, flags); ptep = pte_offset_map_lock(mm, pmd, address, &ptl); + if (!ptep) + return no_page_table(vma, flags); pte = *ptep; if (!pte_present(pte)) goto no_page; @@ -852,8 +852,9 @@ static int get_gate_page(struct mm_struct *mm, unsigned long address, pmd = pmd_offset(pud, address); if (!pmd_present(*pmd)) return -EFAULT; - VM_BUG_ON(pmd_trans_huge(*pmd)); pte = pte_offset_map(pmd, address); + if (!pte) + return -EFAULT; if (pte_none(*pte)) goto unmap; *vma = get_gate_vma(mm); @@ -2468,6 +2469,8 @@ static int gup_pte_range(pmd_t pmd, pmd_t *pmdp, unsigned long addr, pte_t *ptep, *ptem; ptem = ptep = pte_offset_map(&pmd, addr); + if (!ptep) + return 0; do { pte_t pte = ptep_get_lockless(ptep); struct page *page; diff --git a/mm/ksm.c b/mm/ksm.c index df2aa281d49d..3dc15459dd20 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -431,10 +431,9 @@ static int break_ksm_pmd_entry(pmd_t *pmd, unsigned long addr, unsigned long nex pte_t *pte; int ret; - if (pmd_leaf(*pmd) || !pmd_present(*pmd)) - return 0; - pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl); + if (!pte) + return 0; if (pte_present(*pte)) { page = vm_normal_page(walk->vma, addr, *pte); } else if (!pte_none(*pte)) { @@ -1203,6 +1202,8 @@ static int replace_page(struct vm_area_struct *vma, struct page *page, mmu_notifier_invalidate_range_start(&range); ptep = pte_offset_map_lock(mm, pmd, addr, &ptl); + if (!ptep) + goto out_mn; if (!pte_same(*ptep, orig_pte)) { pte_unmap_unlock(ptep, ptl); goto out_mn; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index caf6ab55f8e3..77d8d2d14fcf 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -6021,9 +6021,9 @@ static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd, return 0; } - if (pmd_trans_unstable(pmd)) - return 0; pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); + if (!pte) + return 0; for (; addr != end; pte++, addr += PAGE_SIZE) if (get_mctgt_type(vma, addr, *pte, NULL)) mc.precharge++; /* increment precharge temporarily */ @@ -6241,10 +6241,10 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd, return 0; } - if (pmd_trans_unstable(pmd)) - return 0; retry: pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); + if (!pte) + return 0; for (; addr != end; addr += PAGE_SIZE) { pte_t ptent = *(pte++); bool device = false; diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 004a02f44271..d5116f0eb1b6 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -405,6 +405,8 @@ static unsigned long dev_pagemap_mapping_shift(struct vm_area_struct *vma, if (pmd_devmap(*pmd)) return PMD_SHIFT; pte = pte_offset_map(pmd, address); + if (!pte) + return 0; if (pte_present(*pte) && pte_devmap(*pte)) ret = PAGE_SHIFT; pte_unmap(pte); @@ -791,11 +793,11 @@ static int hwpoison_pte_range(pmd_t *pmdp, unsigned long addr, goto out; } - if (pmd_trans_unstable(pmdp)) - goto out; - mapped_pte = ptep = pte_offset_map_lock(walk->vma->vm_mm, pmdp, addr, &ptl); + if (!ptep) + goto out; + for (; addr != end; ptep++, addr += PAGE_SIZE) { ret = check_hwpoisoned_entry(*ptep, addr, PAGE_SHIFT, hwp->pfn, &hwp->tk); diff --git a/mm/migrate.c b/mm/migrate.c index c1f2c40441e1..363562992046 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -305,6 +305,9 @@ void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd, swp_entry_t entry; ptep = pte_offset_map_lock(mm, pmd, address, &ptl); + if (!ptep) + return; + pte = *ptep; pte_unmap(ptep); -- cgit v1.2.3 From c33c794828f21217f72ce6fc140e0d34e0d56bff Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Mon, 12 Jun 2023 16:15:45 +0100 Subject: mm: ptep_get() conversion MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Convert all instances of direct pte_t* dereferencing to instead use ptep_get() helper. This means that by default, the accesses change from a C dereference to a READ_ONCE(). This is technically the correct thing to do since where pgtables are modified by HW (for access/dirty) they are volatile and therefore we should always ensure READ_ONCE() semantics. But more importantly, by always using the helper, it can be overridden by the architecture to fully encapsulate the contents of the pte. Arch code is deliberately not converted, as the arch code knows best. It is intended that arch code (arm64) will override the default with its own implementation that can (e.g.) hide certain bits from the core code, or determine young/dirty status by mixing in state from another source. Conversion was done using Coccinelle: ---- // $ make coccicheck \ // COCCI=ptepget.cocci \ // SPFLAGS="--include-headers" \ // MODE=patch virtual patch @ depends on patch @ pte_t *v; @@ - *v + ptep_get(v) ---- Then reviewed and hand-edited to avoid multiple unnecessary calls to ptep_get(), instead opting to store the result of a single call in a variable, where it is correct to do so. This aims to negate any cost of READ_ONCE() and will benefit arch-overrides that may be more complex. Included is a fix for an issue in an earlier version of this patch that was pointed out by kernel test robot. The issue arose because config MMU=n elides definition of the ptep helper functions, including ptep_get(). HUGETLB_PAGE=n configs still define a simple huge_ptep_clear_flush() for linking purposes, which dereferences the ptep. So when both configs are disabled, this caused a build error because ptep_get() is not defined. Fix by continuing to do a direct dereference when MMU=n. This is safe because for this config the arch code cannot be trying to virtualize the ptes because none of the ptep helpers are defined. Link: https://lkml.kernel.org/r/20230612151545.3317766-4-ryan.roberts@arm.com Reported-by: kernel test robot Link: https://lore.kernel.org/oe-kbuild-all/202305120142.yXsNEo6H-lkp@intel.com/ Signed-off-by: Ryan Roberts Cc: Adrian Hunter Cc: Alexander Potapenko Cc: Alexander Shishkin Cc: Alex Williamson Cc: Al Viro Cc: Andrey Konovalov Cc: Andrey Ryabinin Cc: Christian Brauner Cc: Christoph Hellwig Cc: Daniel Vetter Cc: Dave Airlie Cc: Dimitri Sivanich Cc: Dmitry Vyukov Cc: Ian Rogers Cc: Jason Gunthorpe Cc: Jérôme Glisse Cc: Jiri Olsa Cc: Johannes Weiner Cc: Kirill A. Shutemov Cc: Lorenzo Stoakes Cc: Mark Rutland Cc: Matthew Wilcox Cc: Miaohe Lin Cc: Michal Hocko Cc: Mike Kravetz Cc: Mike Rapoport (IBM) Cc: Muchun Song Cc: Namhyung Kim Cc: Naoya Horiguchi Cc: Oleksandr Tyshchenko Cc: Pavel Tatashin Cc: Roman Gushchin Cc: SeongJae Park Cc: Shakeel Butt Cc: Uladzislau Rezki (Sony) Cc: Vincenzo Frascino Cc: Yu Zhao Signed-off-by: Andrew Morton --- drivers/gpu/drm/i915/gem/selftests/i915_gem_mman.c | 8 +- drivers/misc/sgi-gru/grufault.c | 2 +- drivers/vfio/vfio_iommu_type1.c | 7 +- drivers/xen/privcmd.c | 2 +- fs/proc/task_mmu.c | 33 +++---- fs/userfaultfd.c | 6 +- include/linux/hugetlb.h | 4 + include/linux/mm_inline.h | 2 +- include/linux/pgtable.h | 6 +- kernel/events/uprobes.c | 2 +- mm/damon/ops-common.c | 2 +- mm/damon/paddr.c | 2 +- mm/damon/vaddr.c | 10 ++- mm/filemap.c | 2 +- mm/gup.c | 21 +++-- mm/highmem.c | 12 +-- mm/hmm.c | 2 +- mm/huge_memory.c | 4 +- mm/hugetlb.c | 2 +- mm/hugetlb_vmemmap.c | 6 +- mm/kasan/init.c | 9 +- mm/kasan/shadow.c | 10 +-- mm/khugepaged.c | 22 ++--- mm/ksm.c | 22 ++--- mm/madvise.c | 6 +- mm/mapping_dirty_helpers.c | 4 +- mm/memcontrol.c | 4 +- mm/memory-failure.c | 26 +++--- mm/memory.c | 100 +++++++++++---------- mm/mempolicy.c | 6 +- mm/migrate.c | 14 +-- mm/migrate_device.c | 15 ++-- mm/mincore.c | 2 +- mm/mlock.c | 6 +- mm/mprotect.c | 8 +- mm/mremap.c | 2 +- mm/page_table_check.c | 4 +- mm/page_vma_mapped.c | 27 +++--- mm/pgtable-generic.c | 2 +- mm/rmap.c | 34 ++++--- mm/sparse-vmemmap.c | 8 +- mm/swap_state.c | 8 +- mm/swapfile.c | 20 +++-- mm/userfaultfd.c | 4 +- mm/vmalloc.c | 6 +- mm/vmscan.c | 14 +-- virt/kvm/kvm_main.c | 11 ++- 47 files changed, 301 insertions(+), 228 deletions(-) (limited to 'mm/memcontrol.c') diff --git a/drivers/gpu/drm/i915/gem/selftests/i915_gem_mman.c b/drivers/gpu/drm/i915/gem/selftests/i915_gem_mman.c index 56279908ed30..01e271b6ad21 100644 --- a/drivers/gpu/drm/i915/gem/selftests/i915_gem_mman.c +++ b/drivers/gpu/drm/i915/gem/selftests/i915_gem_mman.c @@ -1681,7 +1681,9 @@ static int igt_mmap_gpu(void *arg) static int check_present_pte(pte_t *pte, unsigned long addr, void *data) { - if (!pte_present(*pte) || pte_none(*pte)) { + pte_t ptent = ptep_get(pte); + + if (!pte_present(ptent) || pte_none(ptent)) { pr_err("missing PTE:%lx\n", (addr - (unsigned long)data) >> PAGE_SHIFT); return -EINVAL; @@ -1692,7 +1694,9 @@ static int check_present_pte(pte_t *pte, unsigned long addr, void *data) static int check_absent_pte(pte_t *pte, unsigned long addr, void *data) { - if (pte_present(*pte) && !pte_none(*pte)) { + pte_t ptent = ptep_get(pte); + + if (pte_present(ptent) && !pte_none(ptent)) { pr_err("present PTE:%lx; expected to be revoked\n", (addr - (unsigned long)data) >> PAGE_SHIFT); return -EINVAL; diff --git a/drivers/misc/sgi-gru/grufault.c b/drivers/misc/sgi-gru/grufault.c index 378cf02a2aa1..629edb6486de 100644 --- a/drivers/misc/sgi-gru/grufault.c +++ b/drivers/misc/sgi-gru/grufault.c @@ -228,7 +228,7 @@ static int atomic_pte_lookup(struct vm_area_struct *vma, unsigned long vaddr, goto err; #ifdef CONFIG_X86_64 if (unlikely(pmd_large(*pmdp))) - pte = *(pte_t *) pmdp; + pte = ptep_get((pte_t *)pmdp); else #endif pte = *pte_offset_kernel(pmdp, vaddr); diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c index 306e6f1d1c70..ebe0ad31d0b0 100644 --- a/drivers/vfio/vfio_iommu_type1.c +++ b/drivers/vfio/vfio_iommu_type1.c @@ -514,6 +514,7 @@ static int follow_fault_pfn(struct vm_area_struct *vma, struct mm_struct *mm, bool write_fault) { pte_t *ptep; + pte_t pte; spinlock_t *ptl; int ret; @@ -536,10 +537,12 @@ static int follow_fault_pfn(struct vm_area_struct *vma, struct mm_struct *mm, return ret; } - if (write_fault && !pte_write(*ptep)) + pte = ptep_get(ptep); + + if (write_fault && !pte_write(pte)) ret = -EFAULT; else - *pfn = pte_pfn(*ptep); + *pfn = pte_pfn(pte); pte_unmap_unlock(ptep, ptl); return ret; diff --git a/drivers/xen/privcmd.c b/drivers/xen/privcmd.c index e2f580e30a86..f447cd37cc4c 100644 --- a/drivers/xen/privcmd.c +++ b/drivers/xen/privcmd.c @@ -949,7 +949,7 @@ static int privcmd_mmap(struct file *file, struct vm_area_struct *vma) */ static int is_mapped_fn(pte_t *pte, unsigned long addr, void *data) { - return pte_none(*pte) ? 0 : -EBUSY; + return pte_none(ptep_get(pte)) ? 0 : -EBUSY; } static int privcmd_vma_range_is_mapped( diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 0d63b6a0f0d8..507cd4e59d07 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -538,13 +538,14 @@ static void smaps_pte_entry(pte_t *pte, unsigned long addr, bool locked = !!(vma->vm_flags & VM_LOCKED); struct page *page = NULL; bool migration = false, young = false, dirty = false; + pte_t ptent = ptep_get(pte); - if (pte_present(*pte)) { - page = vm_normal_page(vma, addr, *pte); - young = pte_young(*pte); - dirty = pte_dirty(*pte); - } else if (is_swap_pte(*pte)) { - swp_entry_t swpent = pte_to_swp_entry(*pte); + if (pte_present(ptent)) { + page = vm_normal_page(vma, addr, ptent); + young = pte_young(ptent); + dirty = pte_dirty(ptent); + } else if (is_swap_pte(ptent)) { + swp_entry_t swpent = pte_to_swp_entry(ptent); if (!non_swap_entry(swpent)) { int mapcount; @@ -732,11 +733,12 @@ static int smaps_hugetlb_range(pte_t *pte, unsigned long hmask, struct mem_size_stats *mss = walk->private; struct vm_area_struct *vma = walk->vma; struct page *page = NULL; + pte_t ptent = ptep_get(pte); - if (pte_present(*pte)) { - page = vm_normal_page(vma, addr, *pte); - } else if (is_swap_pte(*pte)) { - swp_entry_t swpent = pte_to_swp_entry(*pte); + if (pte_present(ptent)) { + page = vm_normal_page(vma, addr, ptent); + } else if (is_swap_pte(ptent)) { + swp_entry_t swpent = pte_to_swp_entry(ptent); if (is_pfn_swap_entry(swpent)) page = pfn_swap_entry_to_page(swpent); @@ -1105,7 +1107,7 @@ static inline void clear_soft_dirty(struct vm_area_struct *vma, * Documentation/admin-guide/mm/soft-dirty.rst for full description * of how soft-dirty works. */ - pte_t ptent = *pte; + pte_t ptent = ptep_get(pte); if (pte_present(ptent)) { pte_t old_pte; @@ -1194,7 +1196,7 @@ out: return 0; } for (; addr != end; pte++, addr += PAGE_SIZE) { - ptent = *pte; + ptent = ptep_get(pte); if (cp->type == CLEAR_REFS_SOFT_DIRTY) { clear_soft_dirty(vma, addr, pte); @@ -1550,7 +1552,7 @@ static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end, for (; addr < end; pte++, addr += PAGE_SIZE) { pagemap_entry_t pme; - pme = pte_to_pagemap_entry(pm, vma, addr, *pte); + pme = pte_to_pagemap_entry(pm, vma, addr, ptep_get(pte)); err = add_to_pagemap(addr, &pme, pm); if (err) break; @@ -1893,10 +1895,11 @@ static int gather_pte_stats(pmd_t *pmd, unsigned long addr, return 0; } do { - struct page *page = can_gather_numa_stats(*pte, vma, addr); + pte_t ptent = ptep_get(pte); + struct page *page = can_gather_numa_stats(ptent, vma, addr); if (!page) continue; - gather_stats(page, md, pte_dirty(*pte), 1); + gather_stats(page, md, pte_dirty(ptent), 1); } while (pte++, addr += PAGE_SIZE, addr != end); pte_unmap_unlock(orig_pte, ptl); diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index ca83423f8d54..478e2b169c13 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -335,6 +335,7 @@ static inline bool userfaultfd_must_wait(struct userfaultfd_ctx *ctx, pud_t *pud; pmd_t *pmd, _pmd; pte_t *pte; + pte_t ptent; bool ret = true; mmap_assert_locked(mm); @@ -374,9 +375,10 @@ again: * changes under us. PTE markers should be handled the same as none * ptes here. */ - if (pte_none_mostly(*pte)) + ptent = ptep_get(pte); + if (pte_none_mostly(ptent)) ret = true; - if (!pte_write(*pte) && (reason & VM_UFFD_WP)) + if (!pte_write(ptent) && (reason & VM_UFFD_WP)) ret = true; pte_unmap(pte); diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 21f942025fec..beb7c63d2871 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -1185,7 +1185,11 @@ static inline void hugetlb_count_sub(long l, struct mm_struct *mm) static inline pte_t huge_ptep_clear_flush(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) { +#ifdef CONFIG_MMU + return ptep_get(ptep); +#else return *ptep; +#endif } static inline void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h index 0e1d239a882c..08c2bcefcb2b 100644 --- a/include/linux/mm_inline.h +++ b/include/linux/mm_inline.h @@ -555,7 +555,7 @@ pte_install_uffd_wp_if_needed(struct vm_area_struct *vma, unsigned long addr, bool arm_uffd_pte = false; /* The current status of the pte should be "cleared" before calling */ - WARN_ON_ONCE(!pte_none(*pte)); + WARN_ON_ONCE(!pte_none(ptep_get(pte))); /* * NOTE: userfaultfd_wp_unpopulated() doesn't need this whole diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index fc06f6419661..5063b482e34f 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -231,7 +231,7 @@ static inline int ptep_test_and_clear_young(struct vm_area_struct *vma, unsigned long address, pte_t *ptep) { - pte_t pte = *ptep; + pte_t pte = ptep_get(ptep); int r = 1; if (!pte_young(pte)) r = 0; @@ -318,7 +318,7 @@ static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long address, pte_t *ptep) { - pte_t pte = *ptep; + pte_t pte = ptep_get(ptep); pte_clear(mm, address, ptep); page_table_check_pte_clear(mm, address, pte); return pte; @@ -519,7 +519,7 @@ extern pud_t pudp_huge_clear_flush(struct vm_area_struct *vma, struct mm_struct; static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long address, pte_t *ptep) { - pte_t old_pte = *ptep; + pte_t old_pte = ptep_get(ptep); set_pte_at(mm, address, ptep, pte_wrprotect(old_pte)); } #endif diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 607d742caa61..f0ac5b874919 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -192,7 +192,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, inc_mm_counter(mm, MM_ANONPAGES); } - flush_cache_page(vma, addr, pte_pfn(*pvmw.pte)); + flush_cache_page(vma, addr, pte_pfn(ptep_get(pvmw.pte))); ptep_clear_flush_notify(vma, addr, pvmw.pte); if (new_page) set_pte_at_notify(mm, addr, pvmw.pte, diff --git a/mm/damon/ops-common.c b/mm/damon/ops-common.c index d4ab81229136..e940802a15a4 100644 --- a/mm/damon/ops-common.c +++ b/mm/damon/ops-common.c @@ -39,7 +39,7 @@ struct folio *damon_get_folio(unsigned long pfn) void damon_ptep_mkold(pte_t *pte, struct vm_area_struct *vma, unsigned long addr) { - struct folio *folio = damon_get_folio(pte_pfn(*pte)); + struct folio *folio = damon_get_folio(pte_pfn(ptep_get(pte))); if (!folio) return; diff --git a/mm/damon/paddr.c b/mm/damon/paddr.c index 5b3a3463d078..40801e38fcf0 100644 --- a/mm/damon/paddr.c +++ b/mm/damon/paddr.c @@ -89,7 +89,7 @@ static bool __damon_pa_young(struct folio *folio, struct vm_area_struct *vma, while (page_vma_mapped_walk(&pvmw)) { addr = pvmw.address; if (pvmw.pte) { - *accessed = pte_young(*pvmw.pte) || + *accessed = pte_young(ptep_get(pvmw.pte)) || !folio_test_idle(folio) || mmu_notifier_test_young(vma->vm_mm, addr); } else { diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c index e814f66dfc2e..2fcc9731528a 100644 --- a/mm/damon/vaddr.c +++ b/mm/damon/vaddr.c @@ -323,7 +323,7 @@ static int damon_mkold_pmd_entry(pmd_t *pmd, unsigned long addr, walk->action = ACTION_AGAIN; return 0; } - if (!pte_present(*pte)) + if (!pte_present(ptep_get(pte))) goto out; damon_ptep_mkold(pte, walk->vma, addr); out: @@ -433,6 +433,7 @@ static int damon_young_pmd_entry(pmd_t *pmd, unsigned long addr, unsigned long next, struct mm_walk *walk) { pte_t *pte; + pte_t ptent; spinlock_t *ptl; struct folio *folio; struct damon_young_walk_private *priv = walk->private; @@ -471,12 +472,13 @@ regular_page: walk->action = ACTION_AGAIN; return 0; } - if (!pte_present(*pte)) + ptent = ptep_get(pte); + if (!pte_present(ptent)) goto out; - folio = damon_get_folio(pte_pfn(*pte)); + folio = damon_get_folio(pte_pfn(ptent)); if (!folio) goto out; - if (pte_young(*pte) || !folio_test_idle(folio) || + if (pte_young(ptent) || !folio_test_idle(folio) || mmu_notifier_test_young(walk->mm, addr)) priv->young = true; *priv->folio_sz = folio_size(folio); diff --git a/mm/filemap.c b/mm/filemap.c index 1893048ec9ff..00933089b8b6 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -3523,7 +3523,7 @@ again: * handled in the specific fault path, and it'll prohibit the * fault-around logic. */ - if (!pte_none(*vmf->pte)) + if (!pte_none(ptep_get(vmf->pte))) goto unlock; /* We're about to handle the fault */ diff --git a/mm/gup.c b/mm/gup.c index 838db6c0bfc2..38986e522d34 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -477,13 +477,14 @@ static int follow_pfn_pte(struct vm_area_struct *vma, unsigned long address, pte_t *pte, unsigned int flags) { if (flags & FOLL_TOUCH) { - pte_t entry = *pte; + pte_t orig_entry = ptep_get(pte); + pte_t entry = orig_entry; if (flags & FOLL_WRITE) entry = pte_mkdirty(entry); entry = pte_mkyoung(entry); - if (!pte_same(*pte, entry)) { + if (!pte_same(orig_entry, entry)) { set_pte_at(vma->vm_mm, address, pte, entry); update_mmu_cache(vma, address, pte); } @@ -549,7 +550,7 @@ static struct page *follow_page_pte(struct vm_area_struct *vma, ptep = pte_offset_map_lock(mm, pmd, address, &ptl); if (!ptep) return no_page_table(vma, flags); - pte = *ptep; + pte = ptep_get(ptep); if (!pte_present(pte)) goto no_page; if (pte_protnone(pte) && !gup_can_follow_protnone(flags)) @@ -821,6 +822,7 @@ static int get_gate_page(struct mm_struct *mm, unsigned long address, pud_t *pud; pmd_t *pmd; pte_t *pte; + pte_t entry; int ret = -EFAULT; /* user gate pages are read-only */ @@ -844,16 +846,17 @@ static int get_gate_page(struct mm_struct *mm, unsigned long address, pte = pte_offset_map(pmd, address); if (!pte) return -EFAULT; - if (pte_none(*pte)) + entry = ptep_get(pte); + if (pte_none(entry)) goto unmap; *vma = get_gate_vma(mm); if (!page) goto out; - *page = vm_normal_page(*vma, address, *pte); + *page = vm_normal_page(*vma, address, entry); if (!*page) { - if ((gup_flags & FOLL_DUMP) || !is_zero_pfn(pte_pfn(*pte))) + if ((gup_flags & FOLL_DUMP) || !is_zero_pfn(pte_pfn(entry))) goto unmap; - *page = pte_page(*pte); + *page = pte_page(entry); } ret = try_grab_page(*page, gup_flags); if (unlikely(ret)) @@ -2496,7 +2499,7 @@ static int gup_pte_range(pmd_t pmd, pmd_t *pmdp, unsigned long addr, } if (unlikely(pmd_val(pmd) != pmd_val(*pmdp)) || - unlikely(pte_val(pte) != pte_val(*ptep))) { + unlikely(pte_val(pte) != pte_val(ptep_get(ptep)))) { gup_put_folio(folio, 1, flags); goto pte_unmap; } @@ -2693,7 +2696,7 @@ static int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr, if (!folio) return 0; - if (unlikely(pte_val(pte) != pte_val(*ptep))) { + if (unlikely(pte_val(pte) != pte_val(ptep_get(ptep)))) { gup_put_folio(folio, refs, flags); return 0; } diff --git a/mm/highmem.c b/mm/highmem.c index db251e77f98f..e19269093a93 100644 --- a/mm/highmem.c +++ b/mm/highmem.c @@ -161,7 +161,7 @@ struct page *__kmap_to_page(void *vaddr) /* kmap() mappings */ if (WARN_ON_ONCE(addr >= PKMAP_ADDR(0) && addr < PKMAP_ADDR(LAST_PKMAP))) - return pte_page(pkmap_page_table[PKMAP_NR(addr)]); + return pte_page(ptep_get(&pkmap_page_table[PKMAP_NR(addr)])); /* kmap_local_page() mappings */ if (WARN_ON_ONCE(base >= __fix_to_virt(FIX_KMAP_END) && @@ -191,6 +191,7 @@ static void flush_all_zero_pkmaps(void) for (i = 0; i < LAST_PKMAP; i++) { struct page *page; + pte_t ptent; /* * zero means we don't have anything to do, @@ -203,7 +204,8 @@ static void flush_all_zero_pkmaps(void) pkmap_count[i] = 0; /* sanity check */ - BUG_ON(pte_none(pkmap_page_table[i])); + ptent = ptep_get(&pkmap_page_table[i]); + BUG_ON(pte_none(ptent)); /* * Don't need an atomic fetch-and-clear op here; @@ -212,7 +214,7 @@ static void flush_all_zero_pkmaps(void) * getting the kmap_lock (which is held here). * So no dangers, even with speculative execution. */ - page = pte_page(pkmap_page_table[i]); + page = pte_page(ptent); pte_clear(&init_mm, PKMAP_ADDR(i), &pkmap_page_table[i]); set_page_address(page, NULL); @@ -511,7 +513,7 @@ static inline bool kmap_high_unmap_local(unsigned long vaddr) { #ifdef ARCH_NEEDS_KMAP_HIGH_GET if (vaddr >= PKMAP_ADDR(0) && vaddr < PKMAP_ADDR(LAST_PKMAP)) { - kunmap_high(pte_page(pkmap_page_table[PKMAP_NR(vaddr)])); + kunmap_high(pte_page(ptep_get(&pkmap_page_table[PKMAP_NR(vaddr)]))); return true; } #endif @@ -548,7 +550,7 @@ void *__kmap_local_pfn_prot(unsigned long pfn, pgprot_t prot) idx = arch_kmap_local_map_idx(kmap_local_idx_push(), pfn); vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); kmap_pte = kmap_get_pte(vaddr, idx); - BUG_ON(!pte_none(*kmap_pte)); + BUG_ON(!pte_none(ptep_get(kmap_pte))); pteval = pfn_pte(pfn, prot); arch_kmap_local_set_pte(&init_mm, vaddr, kmap_pte, pteval); arch_kmap_local_post_map(vaddr, pteval); diff --git a/mm/hmm.c b/mm/hmm.c index b1a9159d7c92..855e25e59d8f 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -228,7 +228,7 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr, struct hmm_range *range = hmm_vma_walk->range; unsigned int required_fault; unsigned long cpu_flags; - pte_t pte = *ptep; + pte_t pte = ptep_get(ptep); uint64_t pfn_req_flags = *hmm_pfn; if (pte_none_mostly(pte)) { diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 76f970aa5b4d..e94fe292f30a 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2063,7 +2063,7 @@ static void __split_huge_zero_page_pmd(struct vm_area_struct *vma, entry = pte_mkspecial(entry); if (pmd_uffd_wp(old_pmd)) entry = pte_mkuffd_wp(entry); - VM_BUG_ON(!pte_none(*pte)); + VM_BUG_ON(!pte_none(ptep_get(pte))); set_pte_at(mm, addr, pte, entry); pte++; } @@ -2257,7 +2257,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, entry = pte_mkuffd_wp(entry); page_add_anon_rmap(page + i, vma, addr, false); } - VM_BUG_ON(!pte_none(*pte)); + VM_BUG_ON(!pte_none(ptep_get(pte))); set_pte_at(mm, addr, pte, entry); pte++; } diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 1d3d8a61b336..d76574425da3 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -7246,7 +7246,7 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma, pte = (pte_t *)pmd_alloc(mm, pud, addr); } } - BUG_ON(pte && pte_present(*pte) && !pte_huge(*pte)); + BUG_ON(pte && pte_present(ptep_get(pte)) && !pte_huge(ptep_get(pte))); return pte; } diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c index f42079b73f82..c2007ef5e9b0 100644 --- a/mm/hugetlb_vmemmap.c +++ b/mm/hugetlb_vmemmap.c @@ -105,7 +105,7 @@ static void vmemmap_pte_range(pmd_t *pmd, unsigned long addr, * remapping (which is calling @walk->remap_pte). */ if (!walk->reuse_page) { - walk->reuse_page = pte_page(*pte); + walk->reuse_page = pte_page(ptep_get(pte)); /* * Because the reuse address is part of the range that we are * walking, skip the reuse address range. @@ -239,7 +239,7 @@ static void vmemmap_remap_pte(pte_t *pte, unsigned long addr, * to the tail pages. */ pgprot_t pgprot = PAGE_KERNEL_RO; - struct page *page = pte_page(*pte); + struct page *page = pte_page(ptep_get(pte)); pte_t entry; /* Remapping the head page requires r/w */ @@ -286,7 +286,7 @@ static void vmemmap_restore_pte(pte_t *pte, unsigned long addr, struct page *page; void *to; - BUG_ON(pte_page(*pte) != walk->reuse_page); + BUG_ON(pte_page(ptep_get(pte)) != walk->reuse_page); page = list_first_entry(walk->vmemmap_pages, struct page, lru); list_del(&page->lru); diff --git a/mm/kasan/init.c b/mm/kasan/init.c index cc64ed6858c6..dcfec277e839 100644 --- a/mm/kasan/init.c +++ b/mm/kasan/init.c @@ -286,7 +286,7 @@ static void kasan_free_pte(pte_t *pte_start, pmd_t *pmd) for (i = 0; i < PTRS_PER_PTE; i++) { pte = pte_start + i; - if (!pte_none(*pte)) + if (!pte_none(ptep_get(pte))) return; } @@ -343,16 +343,19 @@ static void kasan_remove_pte_table(pte_t *pte, unsigned long addr, unsigned long end) { unsigned long next; + pte_t ptent; for (; addr < end; addr = next, pte++) { next = (addr + PAGE_SIZE) & PAGE_MASK; if (next > end) next = end; - if (!pte_present(*pte)) + ptent = ptep_get(pte); + + if (!pte_present(ptent)) continue; - if (WARN_ON(!kasan_early_shadow_page_entry(*pte))) + if (WARN_ON(!kasan_early_shadow_page_entry(ptent))) continue; pte_clear(&init_mm, addr, pte); } diff --git a/mm/kasan/shadow.c b/mm/kasan/shadow.c index 3e62728ae25d..dd772f9d0f08 100644 --- a/mm/kasan/shadow.c +++ b/mm/kasan/shadow.c @@ -226,7 +226,7 @@ static bool shadow_mapped(unsigned long addr) if (pmd_bad(*pmd)) return true; pte = pte_offset_kernel(pmd, addr); - return !pte_none(*pte); + return !pte_none(ptep_get(pte)); } static int __meminit kasan_mem_notifier(struct notifier_block *nb, @@ -317,7 +317,7 @@ static int kasan_populate_vmalloc_pte(pte_t *ptep, unsigned long addr, unsigned long page; pte_t pte; - if (likely(!pte_none(*ptep))) + if (likely(!pte_none(ptep_get(ptep)))) return 0; page = __get_free_page(GFP_KERNEL); @@ -328,7 +328,7 @@ static int kasan_populate_vmalloc_pte(pte_t *ptep, unsigned long addr, pte = pfn_pte(PFN_DOWN(__pa(page)), PAGE_KERNEL); spin_lock(&init_mm.page_table_lock); - if (likely(pte_none(*ptep))) { + if (likely(pte_none(ptep_get(ptep)))) { set_pte_at(&init_mm, addr, ptep, pte); page = 0; } @@ -418,11 +418,11 @@ static int kasan_depopulate_vmalloc_pte(pte_t *ptep, unsigned long addr, { unsigned long page; - page = (unsigned long)__va(pte_pfn(*ptep) << PAGE_SHIFT); + page = (unsigned long)__va(pte_pfn(ptep_get(ptep)) << PAGE_SHIFT); spin_lock(&init_mm.page_table_lock); - if (likely(!pte_none(*ptep))) { + if (likely(!pte_none(ptep_get(ptep)))) { pte_clear(&init_mm, addr, ptep); free_page(page); } diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 881669e738c0..0b4f00712895 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -511,7 +511,7 @@ static void release_pte_pages(pte_t *pte, pte_t *_pte, struct folio *folio, *tmp; while (--_pte >= pte) { - pte_t pteval = *_pte; + pte_t pteval = ptep_get(_pte); unsigned long pfn; if (pte_none(pteval)) @@ -555,7 +555,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, for (_pte = pte; _pte < pte + HPAGE_PMD_NR; _pte++, address += PAGE_SIZE) { - pte_t pteval = *_pte; + pte_t pteval = ptep_get(_pte); if (pte_none(pteval) || (pte_present(pteval) && is_zero_pfn(pte_pfn(pteval)))) { ++none_or_zero; @@ -699,7 +699,7 @@ static void __collapse_huge_page_copy_succeeded(pte_t *pte, for (_pte = pte; _pte < pte + HPAGE_PMD_NR; _pte++, address += PAGE_SIZE) { - pteval = *_pte; + pteval = ptep_get(_pte); if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) { add_mm_counter(vma->vm_mm, MM_ANONPAGES, 1); if (is_zero_pfn(pte_pfn(pteval))) { @@ -797,7 +797,7 @@ static int __collapse_huge_page_copy(pte_t *pte, */ for (_pte = pte, _address = address; _pte < pte + HPAGE_PMD_NR; _pte++, page++, _address += PAGE_SIZE) { - pteval = *_pte; + pteval = ptep_get(_pte); if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) { clear_user_highpage(page, _address); continue; @@ -1274,7 +1274,7 @@ static int hpage_collapse_scan_pmd(struct mm_struct *mm, for (_address = address, _pte = pte; _pte < pte + HPAGE_PMD_NR; _pte++, _address += PAGE_SIZE) { - pte_t pteval = *_pte; + pte_t pteval = ptep_get(_pte); if (is_swap_pte(pteval)) { ++unmapped; if (!cc->is_khugepaged || @@ -1650,18 +1650,19 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr, for (i = 0, addr = haddr, pte = start_pte; i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE, pte++) { struct page *page; + pte_t ptent = ptep_get(pte); /* empty pte, skip */ - if (pte_none(*pte)) + if (pte_none(ptent)) continue; /* page swapped out, abort */ - if (!pte_present(*pte)) { + if (!pte_present(ptent)) { result = SCAN_PTE_NON_PRESENT; goto abort; } - page = vm_normal_page(vma, addr, *pte); + page = vm_normal_page(vma, addr, ptent); if (WARN_ON_ONCE(page && is_zone_device_page(page))) page = NULL; /* @@ -1677,10 +1678,11 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr, for (i = 0, addr = haddr, pte = start_pte; i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE, pte++) { struct page *page; + pte_t ptent = ptep_get(pte); - if (pte_none(*pte)) + if (pte_none(ptent)) continue; - page = vm_normal_page(vma, addr, *pte); + page = vm_normal_page(vma, addr, ptent); if (WARN_ON_ONCE(page && is_zone_device_page(page))) goto abort; page_remove_rmap(page, vma, false); diff --git a/mm/ksm.c b/mm/ksm.c index 3dc15459dd20..d995779dc1fe 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -429,15 +429,17 @@ static int break_ksm_pmd_entry(pmd_t *pmd, unsigned long addr, unsigned long nex struct page *page = NULL; spinlock_t *ptl; pte_t *pte; + pte_t ptent; int ret; pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl); if (!pte) return 0; - if (pte_present(*pte)) { - page = vm_normal_page(walk->vma, addr, *pte); - } else if (!pte_none(*pte)) { - swp_entry_t entry = pte_to_swp_entry(*pte); + ptent = ptep_get(pte); + if (pte_present(ptent)) { + page = vm_normal_page(walk->vma, addr, ptent); + } else if (!pte_none(ptent)) { + swp_entry_t entry = pte_to_swp_entry(ptent); /* * As KSM pages remain KSM pages until freed, no need to wait @@ -1085,6 +1087,7 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page, int err = -EFAULT; struct mmu_notifier_range range; bool anon_exclusive; + pte_t entry; pvmw.address = page_address_in_vma(page, vma); if (pvmw.address == -EFAULT) @@ -1102,10 +1105,9 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page, goto out_unlock; anon_exclusive = PageAnonExclusive(page); - if (pte_write(*pvmw.pte) || pte_dirty(*pvmw.pte) || + entry = ptep_get(pvmw.pte); + if (pte_write(entry) || pte_dirty(entry) || anon_exclusive || mm_tlb_flush_pending(mm)) { - pte_t entry; - swapped = PageSwapCache(page); flush_cache_page(vma, pvmw.address, page_to_pfn(page)); /* @@ -1147,7 +1149,7 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page, set_pte_at_notify(mm, pvmw.address, pvmw.pte, entry); } - *orig_pte = *pvmw.pte; + *orig_pte = entry; err = 0; out_unlock: @@ -1204,7 +1206,7 @@ static int replace_page(struct vm_area_struct *vma, struct page *page, ptep = pte_offset_map_lock(mm, pmd, addr, &ptl); if (!ptep) goto out_mn; - if (!pte_same(*ptep, orig_pte)) { + if (!pte_same(ptep_get(ptep), orig_pte)) { pte_unmap_unlock(ptep, ptl); goto out_mn; } @@ -1231,7 +1233,7 @@ static int replace_page(struct vm_area_struct *vma, struct page *page, dec_mm_counter(mm, MM_ANONPAGES); } - flush_cache_page(vma, addr, pte_pfn(*ptep)); + flush_cache_page(vma, addr, pte_pfn(ptep_get(ptep))); /* * No need to notify as we are replacing a read only page with another * read only page with the same content. diff --git a/mm/madvise.c b/mm/madvise.c index 9b3c9610052f..886f06066622 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -207,7 +207,7 @@ static int swapin_walk_pmd_entry(pmd_t *pmd, unsigned long start, break; } - pte = *ptep; + pte = ptep_get(ptep); if (!is_swap_pte(pte)) continue; entry = pte_to_swp_entry(pte); @@ -438,7 +438,7 @@ regular_folio: flush_tlb_batched_pending(mm); arch_enter_lazy_mmu_mode(); for (; addr < end; pte++, addr += PAGE_SIZE) { - ptent = *pte; + ptent = ptep_get(pte); if (pte_none(ptent)) continue; @@ -642,7 +642,7 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr, flush_tlb_batched_pending(mm); arch_enter_lazy_mmu_mode(); for (; addr != end; pte++, addr += PAGE_SIZE) { - ptent = *pte; + ptent = ptep_get(pte); if (pte_none(ptent)) continue; diff --git a/mm/mapping_dirty_helpers.c b/mm/mapping_dirty_helpers.c index 87b4beeda4fa..a26dd8bcfcdb 100644 --- a/mm/mapping_dirty_helpers.c +++ b/mm/mapping_dirty_helpers.c @@ -35,7 +35,7 @@ static int wp_pte(pte_t *pte, unsigned long addr, unsigned long end, struct mm_walk *walk) { struct wp_walk *wpwalk = walk->private; - pte_t ptent = *pte; + pte_t ptent = ptep_get(pte); if (pte_write(ptent)) { pte_t old_pte = ptep_modify_prot_start(walk->vma, addr, pte); @@ -91,7 +91,7 @@ static int clean_record_pte(pte_t *pte, unsigned long addr, { struct wp_walk *wpwalk = walk->private; struct clean_walk *cwalk = to_clean_walk(wpwalk); - pte_t ptent = *pte; + pte_t ptent = ptep_get(pte); if (pte_dirty(ptent)) { pgoff_t pgoff = ((addr - walk->vma->vm_start) >> PAGE_SHIFT) + diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 77d8d2d14fcf..93056918e956 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -6025,7 +6025,7 @@ static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd, if (!pte) return 0; for (; addr != end; pte++, addr += PAGE_SIZE) - if (get_mctgt_type(vma, addr, *pte, NULL)) + if (get_mctgt_type(vma, addr, ptep_get(pte), NULL)) mc.precharge++; /* increment precharge temporarily */ pte_unmap_unlock(pte - 1, ptl); cond_resched(); @@ -6246,7 +6246,7 @@ retry: if (!pte) return 0; for (; addr != end; addr += PAGE_SIZE) { - pte_t ptent = *(pte++); + pte_t ptent = ptep_get(pte++); bool device = false; swp_entry_t ent; diff --git a/mm/memory-failure.c b/mm/memory-failure.c index d5116f0eb1b6..e245191e6b04 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -6,16 +6,16 @@ * High level machine check handler. Handles pages reported by the * hardware as being corrupted usually due to a multi-bit ECC memory or cache * failure. - * + * * In addition there is a "soft offline" entry point that allows stop using * not-yet-corrupted-by-suspicious pages without killing anything. * * Handles page cache pages in various states. The tricky part - * here is that we can access any page asynchronously in respect to - * other VM users, because memory failures could happen anytime and - * anywhere. This could violate some of their assumptions. This is why - * this code has to be extremely careful. Generally it tries to use - * normal locking rules, as in get the standard locks, even if that means + * here is that we can access any page asynchronously in respect to + * other VM users, because memory failures could happen anytime and + * anywhere. This could violate some of their assumptions. This is why + * this code has to be extremely careful. Generally it tries to use + * normal locking rules, as in get the standard locks, even if that means * the error handling takes potentially a long time. * * It can be very tempting to add handling for obscure cases here. @@ -25,12 +25,12 @@ * https://git.kernel.org/cgit/utils/cpu/mce/mce-test.git/ * - The case actually shows up as a frequent (top 10) page state in * tools/mm/page-types when running a real workload. - * + * * There are several operations here with exponential complexity because - * of unsuitable VM data structures. For example the operation to map back - * from RMAP chains to processes has to walk the complete process list and + * of unsuitable VM data structures. For example the operation to map back + * from RMAP chains to processes has to walk the complete process list and * has non linear complexity with the number. But since memory corruptions - * are rare we hope to get away with this. This avoids impacting the core + * are rare we hope to get away with this. This avoids impacting the core * VM. */ @@ -386,6 +386,7 @@ static unsigned long dev_pagemap_mapping_shift(struct vm_area_struct *vma, pud_t *pud; pmd_t *pmd; pte_t *pte; + pte_t ptent; VM_BUG_ON_VMA(address == -EFAULT, vma); pgd = pgd_offset(vma->vm_mm, address); @@ -407,7 +408,8 @@ static unsigned long dev_pagemap_mapping_shift(struct vm_area_struct *vma, pte = pte_offset_map(pmd, address); if (!pte) return 0; - if (pte_present(*pte) && pte_devmap(*pte)) + ptent = ptep_get(pte); + if (pte_present(ptent) && pte_devmap(ptent)) ret = PAGE_SHIFT; pte_unmap(pte); return ret; @@ -799,7 +801,7 @@ static int hwpoison_pte_range(pmd_t *pmdp, unsigned long addr, goto out; for (; addr != end; ptep++, addr += PAGE_SIZE) { - ret = check_hwpoisoned_entry(*ptep, addr, PAGE_SHIFT, + ret = check_hwpoisoned_entry(ptep_get(ptep), addr, PAGE_SHIFT, hwp->pfn, &hwp->tk); if (ret == 1) break; diff --git a/mm/memory.c b/mm/memory.c index 63c30f58142b..3d78b552866d 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -699,15 +699,17 @@ static void restore_exclusive_pte(struct vm_area_struct *vma, struct page *page, unsigned long address, pte_t *ptep) { + pte_t orig_pte; pte_t pte; swp_entry_t entry; + orig_pte = ptep_get(ptep); pte = pte_mkold(mk_pte(page, READ_ONCE(vma->vm_page_prot))); - if (pte_swp_soft_dirty(*ptep)) + if (pte_swp_soft_dirty(orig_pte)) pte = pte_mksoft_dirty(pte); - entry = pte_to_swp_entry(*ptep); - if (pte_swp_uffd_wp(*ptep)) + entry = pte_to_swp_entry(orig_pte); + if (pte_swp_uffd_wp(orig_pte)) pte = pte_mkuffd_wp(pte); else if (is_writable_device_exclusive_entry(entry)) pte = maybe_mkwrite(pte_mkdirty(pte), vma); @@ -744,7 +746,7 @@ static int try_restore_exclusive_pte(pte_t *src_pte, struct vm_area_struct *vma, unsigned long addr) { - swp_entry_t entry = pte_to_swp_entry(*src_pte); + swp_entry_t entry = pte_to_swp_entry(ptep_get(src_pte)); struct page *page = pfn_swap_entry_to_page(entry); if (trylock_page(page)) { @@ -768,9 +770,10 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, struct vm_area_struct *src_vma, unsigned long addr, int *rss) { unsigned long vm_flags = dst_vma->vm_flags; - pte_t pte = *src_pte; + pte_t orig_pte = ptep_get(src_pte); + pte_t pte = orig_pte; struct page *page; - swp_entry_t entry = pte_to_swp_entry(pte); + swp_entry_t entry = pte_to_swp_entry(orig_pte); if (likely(!non_swap_entry(entry))) { if (swap_duplicate(entry) < 0) @@ -785,8 +788,8 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, spin_unlock(&mmlist_lock); } /* Mark the swap entry as shared. */ - if (pte_swp_exclusive(*src_pte)) { - pte = pte_swp_clear_exclusive(*src_pte); + if (pte_swp_exclusive(orig_pte)) { + pte = pte_swp_clear_exclusive(orig_pte); set_pte_at(src_mm, addr, src_pte, pte); } rss[MM_SWAPENTS]++; @@ -805,9 +808,9 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, entry = make_readable_migration_entry( swp_offset(entry)); pte = swp_entry_to_pte(entry); - if (pte_swp_soft_dirty(*src_pte)) + if (pte_swp_soft_dirty(orig_pte)) pte = pte_swp_mksoft_dirty(pte); - if (pte_swp_uffd_wp(*src_pte)) + if (pte_swp_uffd_wp(orig_pte)) pte = pte_swp_mkuffd_wp(pte); set_pte_at(src_mm, addr, src_pte, pte); } @@ -840,7 +843,7 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, entry = make_readable_device_private_entry( swp_offset(entry)); pte = swp_entry_to_pte(entry); - if (pte_swp_uffd_wp(*src_pte)) + if (pte_swp_uffd_wp(orig_pte)) pte = pte_swp_mkuffd_wp(pte); set_pte_at(src_mm, addr, src_pte, pte); } @@ -904,7 +907,7 @@ copy_present_page(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma /* All done, just insert the new page copy in the child */ pte = mk_pte(&new_folio->page, dst_vma->vm_page_prot); pte = maybe_mkwrite(pte_mkdirty(pte), dst_vma); - if (userfaultfd_pte_wp(dst_vma, *src_pte)) + if (userfaultfd_pte_wp(dst_vma, ptep_get(src_pte))) /* Uffd-wp needs to be delivered to dest pte as well */ pte = pte_mkuffd_wp(pte); set_pte_at(dst_vma->vm_mm, addr, dst_pte, pte); @@ -922,7 +925,7 @@ copy_present_pte(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, { struct mm_struct *src_mm = src_vma->vm_mm; unsigned long vm_flags = src_vma->vm_flags; - pte_t pte = *src_pte; + pte_t pte = ptep_get(src_pte); struct page *page; struct folio *folio; @@ -1002,6 +1005,7 @@ copy_pte_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, struct mm_struct *src_mm = src_vma->vm_mm; pte_t *orig_src_pte, *orig_dst_pte; pte_t *src_pte, *dst_pte; + pte_t ptent; spinlock_t *src_ptl, *dst_ptl; int progress, ret = 0; int rss[NR_MM_COUNTERS]; @@ -1047,17 +1051,18 @@ again: spin_needbreak(src_ptl) || spin_needbreak(dst_ptl)) break; } - if (pte_none(*src_pte)) { + ptent = ptep_get(src_pte); + if (pte_none(ptent)) { progress++; continue; } - if (unlikely(!pte_present(*src_pte))) { + if (unlikely(!pte_present(ptent))) { ret = copy_nonpresent_pte(dst_mm, src_mm, dst_pte, src_pte, dst_vma, src_vma, addr, rss); if (ret == -EIO) { - entry = pte_to_swp_entry(*src_pte); + entry = pte_to_swp_entry(ptep_get(src_pte)); break; } else if (ret == -EBUSY) { break; @@ -1407,7 +1412,7 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, flush_tlb_batched_pending(mm); arch_enter_lazy_mmu_mode(); do { - pte_t ptent = *pte; + pte_t ptent = ptep_get(pte); struct page *page; if (pte_none(ptent)) @@ -1822,7 +1827,7 @@ static int validate_page_before_insert(struct page *page) static int insert_page_into_pte_locked(struct vm_area_struct *vma, pte_t *pte, unsigned long addr, struct page *page, pgprot_t prot) { - if (!pte_none(*pte)) + if (!pte_none(ptep_get(pte))) return -EBUSY; /* Ok, finally just insert the thing.. */ get_page(page); @@ -2116,7 +2121,8 @@ static vm_fault_t insert_pfn(struct vm_area_struct *vma, unsigned long addr, pte = get_locked_pte(mm, addr, &ptl); if (!pte) return VM_FAULT_OOM; - if (!pte_none(*pte)) { + entry = ptep_get(pte); + if (!pte_none(entry)) { if (mkwrite) { /* * For read faults on private mappings the PFN passed @@ -2128,11 +2134,11 @@ static vm_fault_t insert_pfn(struct vm_area_struct *vma, unsigned long addr, * allocation and mapping invalidation so just skip the * update. */ - if (pte_pfn(*pte) != pfn_t_to_pfn(pfn)) { - WARN_ON_ONCE(!is_zero_pfn(pte_pfn(*pte))); + if (pte_pfn(entry) != pfn_t_to_pfn(pfn)) { + WARN_ON_ONCE(!is_zero_pfn(pte_pfn(entry))); goto out_unlock; } - entry = pte_mkyoung(*pte); + entry = pte_mkyoung(entry); entry = maybe_mkwrite(pte_mkdirty(entry), vma); if (ptep_set_access_flags(vma, addr, pte, entry, 1)) update_mmu_cache(vma, addr, pte); @@ -2344,7 +2350,7 @@ static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd, return -ENOMEM; arch_enter_lazy_mmu_mode(); do { - BUG_ON(!pte_none(*pte)); + BUG_ON(!pte_none(ptep_get(pte))); if (!pfn_modify_allowed(pfn, prot)) { err = -EACCES; break; @@ -2585,7 +2591,7 @@ static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd, if (fn) { do { - if (create || !pte_none(*pte)) { + if (create || !pte_none(ptep_get(pte))) { err = fn(pte++, addr, data); if (err) break; @@ -2787,7 +2793,7 @@ static inline int pte_unmap_same(struct vm_fault *vmf) #if defined(CONFIG_SMP) || defined(CONFIG_PREEMPTION) if (sizeof(pte_t) > sizeof(unsigned long)) { spin_lock(vmf->ptl); - same = pte_same(*vmf->pte, vmf->orig_pte); + same = pte_same(ptep_get(vmf->pte), vmf->orig_pte); spin_unlock(vmf->ptl); } #endif @@ -2838,7 +2844,7 @@ static inline int __wp_page_copy_user(struct page *dst, struct page *src, pte_t entry; vmf->pte = pte_offset_map_lock(mm, vmf->pmd, addr, &vmf->ptl); - if (unlikely(!vmf->pte || !pte_same(*vmf->pte, vmf->orig_pte))) { + if (unlikely(!vmf->pte || !pte_same(ptep_get(vmf->pte), vmf->orig_pte))) { /* * Other thread has already handled the fault * and update local tlb only @@ -2866,7 +2872,7 @@ static inline int __wp_page_copy_user(struct page *dst, struct page *src, /* Re-validate under PTL if the page is still mapped */ vmf->pte = pte_offset_map_lock(mm, vmf->pmd, addr, &vmf->ptl); - if (unlikely(!vmf->pte || !pte_same(*vmf->pte, vmf->orig_pte))) { + if (unlikely(!vmf->pte || !pte_same(ptep_get(vmf->pte), vmf->orig_pte))) { /* The PTE changed under us, update local tlb */ if (vmf->pte) update_mmu_tlb(vma, addr, vmf->pte); @@ -3114,7 +3120,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) * Re-check the pte - we dropped the lock */ vmf->pte = pte_offset_map_lock(mm, vmf->pmd, vmf->address, &vmf->ptl); - if (likely(vmf->pte && pte_same(*vmf->pte, vmf->orig_pte))) { + if (likely(vmf->pte && pte_same(ptep_get(vmf->pte), vmf->orig_pte))) { if (old_folio) { if (!folio_test_anon(old_folio)) { dec_mm_counter(mm, mm_counter_file(&old_folio->page)); @@ -3241,7 +3247,7 @@ vm_fault_t finish_mkwrite_fault(struct vm_fault *vmf) * We might have raced with another page fault while we released the * pte_offset_map_lock. */ - if (!pte_same(*vmf->pte, vmf->orig_pte)) { + if (!pte_same(ptep_get(vmf->pte), vmf->orig_pte)) { update_mmu_tlb(vmf->vma, vmf->address, vmf->pte); pte_unmap_unlock(vmf->pte, vmf->ptl); return VM_FAULT_NOPAGE; @@ -3336,7 +3342,7 @@ static vm_fault_t do_wp_page(struct vm_fault *vmf) struct folio *folio = NULL; if (likely(!unshare)) { - if (userfaultfd_pte_wp(vma, *vmf->pte)) { + if (userfaultfd_pte_wp(vma, ptep_get(vmf->pte))) { pte_unmap_unlock(vmf->pte, vmf->ptl); return handle_userfault(vmf, VM_UFFD_WP); } @@ -3598,7 +3604,7 @@ static vm_fault_t remove_device_exclusive_entry(struct vm_fault *vmf) vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address, &vmf->ptl); - if (likely(vmf->pte && pte_same(*vmf->pte, vmf->orig_pte))) + if (likely(vmf->pte && pte_same(ptep_get(vmf->pte), vmf->orig_pte))) restore_exclusive_pte(vma, vmf->page, vmf->address, vmf->pte); if (vmf->pte) @@ -3643,7 +3649,7 @@ static vm_fault_t pte_marker_clear(struct vm_fault *vmf) * quickly from a PTE_MARKER_UFFD_WP into PTE_MARKER_SWAPIN_ERROR. * So is_pte_marker() check is not enough to safely drop the pte. */ - if (pte_same(vmf->orig_pte, *vmf->pte)) + if (pte_same(vmf->orig_pte, ptep_get(vmf->pte))) pte_clear(vmf->vma->vm_mm, vmf->address, vmf->pte); pte_unmap_unlock(vmf->pte, vmf->ptl); return 0; @@ -3739,7 +3745,8 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address, &vmf->ptl); if (unlikely(!vmf->pte || - !pte_same(*vmf->pte, vmf->orig_pte))) + !pte_same(ptep_get(vmf->pte), + vmf->orig_pte))) goto unlock; /* @@ -3816,7 +3823,8 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) */ vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address, &vmf->ptl); - if (likely(vmf->pte && pte_same(*vmf->pte, vmf->orig_pte))) + if (likely(vmf->pte && + pte_same(ptep_get(vmf->pte), vmf->orig_pte))) ret = VM_FAULT_OOM; goto unlock; } @@ -3886,7 +3894,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) */ vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address, &vmf->ptl); - if (unlikely(!vmf->pte || !pte_same(*vmf->pte, vmf->orig_pte))) + if (unlikely(!vmf->pte || !pte_same(ptep_get(vmf->pte), vmf->orig_pte))) goto out_nomap; if (unlikely(!folio_test_uptodate(folio))) { @@ -4331,9 +4339,9 @@ void do_set_pte(struct vm_fault *vmf, struct page *page, unsigned long addr) static bool vmf_pte_changed(struct vm_fault *vmf) { if (vmf->flags & FAULT_FLAG_ORIG_PTE_VALID) - return !pte_same(*vmf->pte, vmf->orig_pte); + return !pte_same(ptep_get(vmf->pte), vmf->orig_pte); - return !pte_none(*vmf->pte); + return !pte_none(ptep_get(vmf->pte)); } /** @@ -4643,7 +4651,7 @@ static vm_fault_t do_fault(struct vm_fault *vmf) * we don't have concurrent modification by hardware * followed by an update. */ - if (unlikely(pte_none(*vmf->pte))) + if (unlikely(pte_none(ptep_get(vmf->pte)))) ret = VM_FAULT_SIGBUS; else ret = VM_FAULT_NOPAGE; @@ -4699,7 +4707,7 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf) * the pfn may be screwed if the read is non atomic. */ spin_lock(vmf->ptl); - if (unlikely(!pte_same(*vmf->pte, vmf->orig_pte))) { + if (unlikely(!pte_same(ptep_get(vmf->pte), vmf->orig_pte))) { pte_unmap_unlock(vmf->pte, vmf->ptl); goto out; } @@ -4772,7 +4780,7 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf) vmf->address, &vmf->ptl); if (unlikely(!vmf->pte)) goto out; - if (unlikely(!pte_same(*vmf->pte, vmf->orig_pte))) { + if (unlikely(!pte_same(ptep_get(vmf->pte), vmf->orig_pte))) { pte_unmap_unlock(vmf->pte, vmf->ptl); goto out; } @@ -4930,7 +4938,7 @@ static vm_fault_t handle_pte_fault(struct vm_fault *vmf) spin_lock(vmf->ptl); entry = vmf->orig_pte; - if (unlikely(!pte_same(*vmf->pte, entry))) { + if (unlikely(!pte_same(ptep_get(vmf->pte), entry))) { update_mmu_tlb(vmf->vma, vmf->address, vmf->pte); goto unlock; } @@ -5416,7 +5424,7 @@ int follow_pte(struct mm_struct *mm, unsigned long address, ptep = pte_offset_map_lock(mm, pmd, address, ptlp); if (!ptep) goto out; - if (!pte_present(*ptep)) + if (!pte_present(ptep_get(ptep))) goto unlock; *ptepp = ptep; return 0; @@ -5453,7 +5461,7 @@ int follow_pfn(struct vm_area_struct *vma, unsigned long address, ret = follow_pte(vma->vm_mm, address, &ptep, &ptl); if (ret) return ret; - *pfn = pte_pfn(*ptep); + *pfn = pte_pfn(ptep_get(ptep)); pte_unmap_unlock(ptep, ptl); return 0; } @@ -5473,7 +5481,7 @@ int follow_phys(struct vm_area_struct *vma, if (follow_pte(vma->vm_mm, address, &ptep, &ptl)) goto out; - pte = *ptep; + pte = ptep_get(ptep); if ((flags & FOLL_WRITE) && !pte_write(pte)) goto unlock; @@ -5517,7 +5525,7 @@ int generic_access_phys(struct vm_area_struct *vma, unsigned long addr, retry: if (follow_pte(vma->vm_mm, addr, &ptep, &ptl)) return -EINVAL; - pte = *ptep; + pte = ptep_get(ptep); pte_unmap_unlock(ptep, ptl); prot = pgprot_val(pte_pgprot(pte)); @@ -5533,7 +5541,7 @@ retry: if (follow_pte(vma->vm_mm, addr, &ptep, &ptl)) goto out_unmap; - if (!pte_same(pte, *ptep)) { + if (!pte_same(pte, ptep_get(ptep))) { pte_unmap_unlock(ptep, ptl); iounmap(maddr); diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 0241bb64978b..edc25195f5bd 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -508,6 +508,7 @@ static int queue_folios_pte_range(pmd_t *pmd, unsigned long addr, unsigned long flags = qp->flags; bool has_unmovable = false; pte_t *pte, *mapped_pte; + pte_t ptent; spinlock_t *ptl; ptl = pmd_trans_huge_lock(pmd, vma); @@ -520,9 +521,10 @@ static int queue_folios_pte_range(pmd_t *pmd, unsigned long addr, return 0; } for (; addr != end; pte++, addr += PAGE_SIZE) { - if (!pte_present(*pte)) + ptent = ptep_get(pte); + if (!pte_present(ptent)) continue; - folio = vm_normal_folio(vma, addr, *pte); + folio = vm_normal_folio(vma, addr, ptent); if (!folio || folio_is_zone_device(folio)) continue; /* diff --git a/mm/migrate.c b/mm/migrate.c index 363562992046..ce35afdbc1e3 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -188,6 +188,7 @@ static bool remove_migration_pte(struct folio *folio, while (page_vma_mapped_walk(&pvmw)) { rmap_t rmap_flags = RMAP_NONE; + pte_t old_pte; pte_t pte; swp_entry_t entry; struct page *new; @@ -210,17 +211,18 @@ static bool remove_migration_pte(struct folio *folio, folio_get(folio); pte = mk_pte(new, READ_ONCE(vma->vm_page_prot)); - if (pte_swp_soft_dirty(*pvmw.pte)) + old_pte = ptep_get(pvmw.pte); + if (pte_swp_soft_dirty(old_pte)) pte = pte_mksoft_dirty(pte); - entry = pte_to_swp_entry(*pvmw.pte); + entry = pte_to_swp_entry(old_pte); if (!is_migration_entry_young(entry)) pte = pte_mkold(pte); if (folio_test_dirty(folio) && is_migration_entry_dirty(entry)) pte = pte_mkdirty(pte); if (is_writable_migration_entry(entry)) pte = pte_mkwrite(pte); - else if (pte_swp_uffd_wp(*pvmw.pte)) + else if (pte_swp_uffd_wp(old_pte)) pte = pte_mkuffd_wp(pte); if (folio_test_anon(folio) && !is_readable_migration_entry(entry)) @@ -234,9 +236,9 @@ static bool remove_migration_pte(struct folio *folio, entry = make_readable_device_private_entry( page_to_pfn(new)); pte = swp_entry_to_pte(entry); - if (pte_swp_soft_dirty(*pvmw.pte)) + if (pte_swp_soft_dirty(old_pte)) pte = pte_swp_mksoft_dirty(pte); - if (pte_swp_uffd_wp(*pvmw.pte)) + if (pte_swp_uffd_wp(old_pte)) pte = pte_swp_mkuffd_wp(pte); } @@ -308,7 +310,7 @@ void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd, if (!ptep) return; - pte = *ptep; + pte = ptep_get(ptep); pte_unmap(ptep); if (!is_swap_pte(pte)) diff --git a/mm/migrate_device.c b/mm/migrate_device.c index a14af6b12b04..02d272b909b5 100644 --- a/mm/migrate_device.c +++ b/mm/migrate_device.c @@ -111,7 +111,7 @@ again: swp_entry_t entry; pte_t pte; - pte = *ptep; + pte = ptep_get(ptep); if (pte_none(pte)) { if (vma_is_anonymous(vma)) { @@ -194,7 +194,7 @@ again: bool anon_exclusive; pte_t swp_pte; - flush_cache_page(vma, addr, pte_pfn(*ptep)); + flush_cache_page(vma, addr, pte_pfn(pte)); anon_exclusive = PageAnon(page) && PageAnonExclusive(page); if (anon_exclusive) { pte = ptep_clear_flush(vma, addr, ptep); @@ -573,6 +573,7 @@ static void migrate_vma_insert_page(struct migrate_vma *migrate, pud_t *pudp; pmd_t *pmdp; pte_t *ptep; + pte_t orig_pte; /* Only allow populating anonymous memory */ if (!vma_is_anonymous(vma)) @@ -628,16 +629,18 @@ static void migrate_vma_insert_page(struct migrate_vma *migrate, ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl); if (!ptep) goto abort; + orig_pte = ptep_get(ptep); + if (check_stable_address_space(mm)) goto unlock_abort; - if (pte_present(*ptep)) { - unsigned long pfn = pte_pfn(*ptep); + if (pte_present(orig_pte)) { + unsigned long pfn = pte_pfn(orig_pte); if (!is_zero_pfn(pfn)) goto unlock_abort; flush = true; - } else if (!pte_none(*ptep)) + } else if (!pte_none(orig_pte)) goto unlock_abort; /* @@ -654,7 +657,7 @@ static void migrate_vma_insert_page(struct migrate_vma *migrate, get_page(page); if (flush) { - flush_cache_page(vma, addr, pte_pfn(*ptep)); + flush_cache_page(vma, addr, pte_pfn(orig_pte)); ptep_clear_flush_notify(vma, addr, ptep); set_pte_at_notify(mm, addr, ptep, entry); update_mmu_cache(vma, addr, ptep); diff --git a/mm/mincore.c b/mm/mincore.c index f33f6a0b1ded..b7f7a516b26c 100644 --- a/mm/mincore.c +++ b/mm/mincore.c @@ -119,7 +119,7 @@ static int mincore_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, return 0; } for (; addr != end; ptep++, addr += PAGE_SIZE) { - pte_t pte = *ptep; + pte_t pte = ptep_get(ptep); /* We need to do cache lookup too for pte markers */ if (pte_none_mostly(pte)) diff --git a/mm/mlock.c b/mm/mlock.c index 9f2b1173b1b1..d7db94519884 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -312,6 +312,7 @@ static int mlock_pte_range(pmd_t *pmd, unsigned long addr, struct vm_area_struct *vma = walk->vma; spinlock_t *ptl; pte_t *start_pte, *pte; + pte_t ptent; struct folio *folio; ptl = pmd_trans_huge_lock(pmd, vma); @@ -334,9 +335,10 @@ static int mlock_pte_range(pmd_t *pmd, unsigned long addr, return 0; } for (pte = start_pte; addr != end; pte++, addr += PAGE_SIZE) { - if (!pte_present(*pte)) + ptent = ptep_get(pte); + if (!pte_present(ptent)) continue; - folio = vm_normal_folio(vma, addr, *pte); + folio = vm_normal_folio(vma, addr, ptent); if (!folio || folio_is_zone_device(folio)) continue; if (folio_test_large(folio)) diff --git a/mm/mprotect.c b/mm/mprotect.c index 64e1df0af514..327a6eb90afb 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -105,7 +105,7 @@ static long change_pte_range(struct mmu_gather *tlb, flush_tlb_batched_pending(vma->vm_mm); arch_enter_lazy_mmu_mode(); do { - oldpte = *pte; + oldpte = ptep_get(pte); if (pte_present(oldpte)) { pte_t ptent; @@ -544,7 +544,8 @@ long change_protection(struct mmu_gather *tlb, static int prot_none_pte_entry(pte_t *pte, unsigned long addr, unsigned long next, struct mm_walk *walk) { - return pfn_modify_allowed(pte_pfn(*pte), *(pgprot_t *)(walk->private)) ? + return pfn_modify_allowed(pte_pfn(ptep_get(pte)), + *(pgprot_t *)(walk->private)) ? 0 : -EACCES; } @@ -552,7 +553,8 @@ static int prot_none_hugetlb_entry(pte_t *pte, unsigned long hmask, unsigned long addr, unsigned long next, struct mm_walk *walk) { - return pfn_modify_allowed(pte_pfn(*pte), *(pgprot_t *)(walk->private)) ? + return pfn_modify_allowed(pte_pfn(ptep_get(pte)), + *(pgprot_t *)(walk->private)) ? 0 : -EACCES; } diff --git a/mm/mremap.c b/mm/mremap.c index bfc3d1902a94..8ec184ac90ff 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -188,7 +188,7 @@ static int move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, for (; old_addr < old_end; old_pte++, old_addr += PAGE_SIZE, new_pte++, new_addr += PAGE_SIZE) { - if (pte_none(*old_pte)) + if (pte_none(ptep_get(old_pte))) continue; pte = ptep_get_and_clear(mm, old_addr, old_pte); diff --git a/mm/page_table_check.c b/mm/page_table_check.c index 0c511330dbc9..8f89f9c8f0df 100644 --- a/mm/page_table_check.c +++ b/mm/page_table_check.c @@ -190,7 +190,7 @@ void __page_table_check_pte_set(struct mm_struct *mm, unsigned long addr, if (&init_mm == mm) return; - __page_table_check_pte_clear(mm, addr, *ptep); + __page_table_check_pte_clear(mm, addr, ptep_get(ptep)); if (pte_user_accessible_page(pte)) { page_table_check_set(mm, addr, pte_pfn(pte), PAGE_SIZE >> PAGE_SHIFT, @@ -243,7 +243,7 @@ void __page_table_check_pte_clear_range(struct mm_struct *mm, if (WARN_ON(!ptep)) return; for (i = 0; i < PTRS_PER_PTE; i++) { - __page_table_check_pte_clear(mm, addr, *ptep); + __page_table_check_pte_clear(mm, addr, ptep_get(ptep)); addr += PAGE_SIZE; ptep++; } diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c index 2af734274073..49e0d28f0379 100644 --- a/mm/page_vma_mapped.c +++ b/mm/page_vma_mapped.c @@ -15,6 +15,8 @@ static inline bool not_found(struct page_vma_mapped_walk *pvmw) static bool map_pte(struct page_vma_mapped_walk *pvmw, spinlock_t **ptlp) { + pte_t ptent; + if (pvmw->flags & PVMW_SYNC) { /* Use the stricter lookup */ pvmw->pte = pte_offset_map_lock(pvmw->vma->vm_mm, pvmw->pmd, @@ -35,10 +37,12 @@ static bool map_pte(struct page_vma_mapped_walk *pvmw, spinlock_t **ptlp) if (!pvmw->pte) return false; + ptent = ptep_get(pvmw->pte); + if (pvmw->flags & PVMW_MIGRATION) { - if (!is_swap_pte(*pvmw->pte)) + if (!is_swap_pte(ptent)) return false; - } else if (is_swap_pte(*pvmw->pte)) { + } else if (is_swap_pte(ptent)) { swp_entry_t entry; /* * Handle un-addressable ZONE_DEVICE memory. @@ -56,11 +60,11 @@ static bool map_pte(struct page_vma_mapped_walk *pvmw, spinlock_t **ptlp) * For more details on device private memory see HMM * (include/linux/hmm.h or mm/hmm.c). */ - entry = pte_to_swp_entry(*pvmw->pte); + entry = pte_to_swp_entry(ptent); if (!is_device_private_entry(entry) && !is_device_exclusive_entry(entry)) return false; - } else if (!pte_present(*pvmw->pte)) { + } else if (!pte_present(ptent)) { return false; } pvmw->ptl = *ptlp; @@ -90,33 +94,34 @@ static bool map_pte(struct page_vma_mapped_walk *pvmw, spinlock_t **ptlp) static bool check_pte(struct page_vma_mapped_walk *pvmw) { unsigned long pfn; + pte_t ptent = ptep_get(pvmw->pte); if (pvmw->flags & PVMW_MIGRATION) { swp_entry_t entry; - if (!is_swap_pte(*pvmw->pte)) + if (!is_swap_pte(ptent)) return false; - entry = pte_to_swp_entry(*pvmw->pte); + entry = pte_to_swp_entry(ptent); if (!is_migration_entry(entry) && !is_device_exclusive_entry(entry)) return false; pfn = swp_offset_pfn(entry); - } else if (is_swap_pte(*pvmw->pte)) { + } else if (is_swap_pte(ptent)) { swp_entry_t entry; /* Handle un-addressable ZONE_DEVICE memory */ - entry = pte_to_swp_entry(*pvmw->pte); + entry = pte_to_swp_entry(ptent); if (!is_device_private_entry(entry) && !is_device_exclusive_entry(entry)) return false; pfn = swp_offset_pfn(entry); } else { - if (!pte_present(*pvmw->pte)) + if (!pte_present(ptent)) return false; - pfn = pte_pfn(*pvmw->pte); + pfn = pte_pfn(ptent); } return (pfn - pvmw->pfn) < pvmw->nr_pages; @@ -294,7 +299,7 @@ next_pte: goto restart; } pvmw->pte++; - } while (pte_none(*pvmw->pte)); + } while (pte_none(ptep_get(pvmw->pte))); if (!pvmw->ptl) { pvmw->ptl = ptl; diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c index c7ab18a5fb77..4d454953046f 100644 --- a/mm/pgtable-generic.c +++ b/mm/pgtable-generic.c @@ -68,7 +68,7 @@ int ptep_set_access_flags(struct vm_area_struct *vma, unsigned long address, pte_t *ptep, pte_t entry, int dirty) { - int changed = !pte_same(*ptep, entry); + int changed = !pte_same(ptep_get(ptep), entry); if (changed) { set_pte_at(vma->vm_mm, address, ptep, entry); flush_tlb_fix_spurious_fault(vma, address, ptep); diff --git a/mm/rmap.c b/mm/rmap.c index cd918cb9a431..0c0d8857dfce 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -826,7 +826,8 @@ static bool folio_referenced_one(struct folio *folio, } if (pvmw.pte) { - if (lru_gen_enabled() && pte_young(*pvmw.pte)) { + if (lru_gen_enabled() && + pte_young(ptep_get(pvmw.pte))) { lru_gen_look_around(&pvmw); referenced++; } @@ -956,13 +957,13 @@ static int page_vma_mkclean_one(struct page_vma_mapped_walk *pvmw) address = pvmw->address; if (pvmw->pte) { - pte_t entry; pte_t *pte = pvmw->pte; + pte_t entry = ptep_get(pte); - if (!pte_dirty(*pte) && !pte_write(*pte)) + if (!pte_dirty(entry) && !pte_write(entry)) continue; - flush_cache_page(vma, address, pte_pfn(*pte)); + flush_cache_page(vma, address, pte_pfn(entry)); entry = ptep_clear_flush(vma, address, pte); entry = pte_wrprotect(entry); entry = pte_mkclean(entry); @@ -1137,7 +1138,7 @@ void page_move_anon_rmap(struct page *page, struct vm_area_struct *vma) * @folio: Folio which contains page. * @page: Page to add to rmap. * @vma: VM area to add page to. - * @address: User virtual address of the mapping + * @address: User virtual address of the mapping * @exclusive: the page is exclusively owned by the current process */ static void __page_set_anon_rmap(struct folio *folio, struct page *page, @@ -1458,6 +1459,7 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma, bool anon_exclusive, ret = true; struct mmu_notifier_range range; enum ttu_flags flags = (enum ttu_flags)(long)arg; + unsigned long pfn; /* * When racing against e.g. zap_pte_range() on another cpu, @@ -1508,8 +1510,8 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma, break; } - subpage = folio_page(folio, - pte_pfn(*pvmw.pte) - folio_pfn(folio)); + pfn = pte_pfn(ptep_get(pvmw.pte)); + subpage = folio_page(folio, pfn - folio_pfn(folio)); address = pvmw.address; anon_exclusive = folio_test_anon(folio) && PageAnonExclusive(subpage); @@ -1571,7 +1573,7 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma, } pteval = huge_ptep_clear_flush(vma, address, pvmw.pte); } else { - flush_cache_page(vma, address, pte_pfn(*pvmw.pte)); + flush_cache_page(vma, address, pfn); /* Nuke the page table entry. */ if (should_defer_flush(mm, flags)) { /* @@ -1818,6 +1820,7 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma, bool anon_exclusive, ret = true; struct mmu_notifier_range range; enum ttu_flags flags = (enum ttu_flags)(long)arg; + unsigned long pfn; /* * When racing against e.g. zap_pte_range() on another cpu, @@ -1877,6 +1880,8 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma, /* Unexpected PMD-mapped THP? */ VM_BUG_ON_FOLIO(!pvmw.pte, folio); + pfn = pte_pfn(ptep_get(pvmw.pte)); + if (folio_is_zone_device(folio)) { /* * Our PTE is a non-present device exclusive entry and @@ -1891,8 +1896,7 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma, VM_BUG_ON_FOLIO(folio_nr_pages(folio) > 1, folio); subpage = &folio->page; } else { - subpage = folio_page(folio, - pte_pfn(*pvmw.pte) - folio_pfn(folio)); + subpage = folio_page(folio, pfn - folio_pfn(folio)); } address = pvmw.address; anon_exclusive = folio_test_anon(folio) && @@ -1952,7 +1956,7 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma, /* Nuke the hugetlb page table entry */ pteval = huge_ptep_clear_flush(vma, address, pvmw.pte); } else { - flush_cache_page(vma, address, pte_pfn(*pvmw.pte)); + flush_cache_page(vma, address, pfn); /* Nuke the page table entry. */ if (should_defer_flush(mm, flags)) { /* @@ -2187,6 +2191,7 @@ static bool page_make_device_exclusive_one(struct folio *folio, struct mmu_notifier_range range; swp_entry_t entry; pte_t swp_pte; + pte_t ptent; mmu_notifier_range_init_owner(&range, MMU_NOTIFY_EXCLUSIVE, 0, vma->vm_mm, address, min(vma->vm_end, @@ -2198,18 +2203,19 @@ static bool page_make_device_exclusive_one(struct folio *folio, /* Unexpected PMD-mapped THP? */ VM_BUG_ON_FOLIO(!pvmw.pte, folio); - if (!pte_present(*pvmw.pte)) { + ptent = ptep_get(pvmw.pte); + if (!pte_present(ptent)) { ret = false; page_vma_mapped_walk_done(&pvmw); break; } subpage = folio_page(folio, - pte_pfn(*pvmw.pte) - folio_pfn(folio)); + pte_pfn(ptent) - folio_pfn(folio)); address = pvmw.address; /* Nuke the page table entry. */ - flush_cache_page(vma, address, pte_pfn(*pvmw.pte)); + flush_cache_page(vma, address, pte_pfn(ptent)); pteval = ptep_clear_flush(vma, address, pvmw.pte); /* Set the dirty flag on the folio now the pte is gone. */ diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c index 10d73a0dfcec..a044a130405b 100644 --- a/mm/sparse-vmemmap.c +++ b/mm/sparse-vmemmap.c @@ -133,7 +133,7 @@ static void * __meminit altmap_alloc_block_buf(unsigned long size, void __meminit vmemmap_verify(pte_t *pte, int node, unsigned long start, unsigned long end) { - unsigned long pfn = pte_pfn(*pte); + unsigned long pfn = pte_pfn(ptep_get(pte)); int actual_node = early_pfn_to_nid(pfn); if (node_distance(actual_node, node) > LOCAL_DISTANCE) @@ -146,7 +146,7 @@ pte_t * __meminit vmemmap_pte_populate(pmd_t *pmd, unsigned long addr, int node, struct page *reuse) { pte_t *pte = pte_offset_kernel(pmd, addr); - if (pte_none(*pte)) { + if (pte_none(ptep_get(pte))) { pte_t entry; void *p; @@ -414,7 +414,7 @@ static int __meminit vmemmap_populate_compound_pages(unsigned long start_pfn, * with just tail struct pages. */ return vmemmap_populate_range(start, end, node, NULL, - pte_page(*pte)); + pte_page(ptep_get(pte))); } size = min(end - start, pgmap_vmemmap_nr(pgmap) * sizeof(struct page)); @@ -438,7 +438,7 @@ static int __meminit vmemmap_populate_compound_pages(unsigned long start_pfn, */ next += PAGE_SIZE; rc = vmemmap_populate_range(next, last, node, NULL, - pte_page(*pte)); + pte_page(ptep_get(pte))); if (rc) return -ENOMEM; } diff --git a/mm/swap_state.c b/mm/swap_state.c index a33c60e0158f..4a5c7b748051 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -275,9 +275,9 @@ void clear_shadow_from_swap_cache(int type, unsigned long begin, } } -/* - * If we are the only user, then try to free up the swap cache. - * +/* + * If we are the only user, then try to free up the swap cache. + * * Its ok to check the swapcache flag without the folio lock * here because we are going to recheck again inside * folio_free_swap() _with_ the lock. @@ -294,7 +294,7 @@ void free_swap_cache(struct page *page) } } -/* +/* * Perform a free_page(), also freeing any swap cache associated with * this page if it is the last user of the page. */ diff --git a/mm/swapfile.c b/mm/swapfile.c index 74dd4d2337b7..a6945c2e0d03 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1745,7 +1745,7 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, struct page *page = folio_file_page(folio, swp_offset(entry)); struct page *swapcache; spinlock_t *ptl; - pte_t *pte, new_pte; + pte_t *pte, new_pte, old_pte; bool hwposioned = false; int ret = 1; @@ -1757,11 +1757,14 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, hwposioned = true; pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); - if (unlikely(!pte || !pte_same_as_swp(*pte, swp_entry_to_pte(entry)))) { + if (unlikely(!pte || !pte_same_as_swp(ptep_get(pte), + swp_entry_to_pte(entry)))) { ret = 0; goto out; } + old_pte = ptep_get(pte); + if (unlikely(hwposioned || !PageUptodate(page))) { swp_entry_t swp_entry; @@ -1793,7 +1796,7 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, * call and have the page locked. */ VM_BUG_ON_PAGE(PageWriteback(page), page); - if (pte_swp_exclusive(*pte)) + if (pte_swp_exclusive(old_pte)) rmap_flags |= RMAP_EXCLUSIVE; page_add_anon_rmap(page, vma, addr, rmap_flags); @@ -1802,9 +1805,9 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, lru_cache_add_inactive_or_unevictable(page, vma); } new_pte = pte_mkold(mk_pte(page, vma->vm_page_prot)); - if (pte_swp_soft_dirty(*pte)) + if (pte_swp_soft_dirty(old_pte)) new_pte = pte_mksoft_dirty(new_pte); - if (pte_swp_uffd_wp(*pte)) + if (pte_swp_uffd_wp(old_pte)) new_pte = pte_mkuffd_wp(new_pte); setpte: set_pte_at(vma->vm_mm, addr, pte, new_pte); @@ -1833,6 +1836,7 @@ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd, unsigned char swp_count; swp_entry_t entry; int ret; + pte_t ptent; if (!pte++) { pte = pte_offset_map(pmd, addr); @@ -1840,10 +1844,12 @@ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd, break; } - if (!is_swap_pte(*pte)) + ptent = ptep_get_lockless(pte); + + if (!is_swap_pte(ptent)) continue; - entry = pte_to_swp_entry(*pte); + entry = pte_to_swp_entry(ptent); if (swp_type(entry) != type) continue; diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index 5fd787158c70..a2bf37ee276d 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -97,7 +97,7 @@ int mfill_atomic_install_pte(pmd_t *dst_pmd, * registered, we firstly wr-protect a none pte which has no page cache * page backing it, then access the page. */ - if (!pte_none_mostly(*dst_pte)) + if (!pte_none_mostly(ptep_get(dst_pte))) goto out_unlock; folio = page_folio(page); @@ -230,7 +230,7 @@ static int mfill_atomic_pte_zeropage(pmd_t *dst_pmd, goto out_unlock; } ret = -EEXIST; - if (!pte_none(*dst_pte)) + if (!pte_none(ptep_get(dst_pte))) goto out_unlock; set_pte_at(dst_vma->vm_mm, dst_addr, dst_pte, _dst_pte); /* No need to invalidate - it was non-present before */ diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 7382e0a60ce1..5a3bf408251b 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -103,7 +103,7 @@ static int vmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, if (!pte) return -ENOMEM; do { - BUG_ON(!pte_none(*pte)); + BUG_ON(!pte_none(ptep_get(pte))); #ifdef CONFIG_HUGETLB_PAGE size = arch_vmap_pte_range_map_size(addr, end, pfn, max_page_shift); @@ -472,7 +472,7 @@ static int vmap_pages_pte_range(pmd_t *pmd, unsigned long addr, do { struct page *page = pages[*nr]; - if (WARN_ON(!pte_none(*pte))) + if (WARN_ON(!pte_none(ptep_get(pte)))) return -EBUSY; if (WARN_ON(!page)) return -ENOMEM; @@ -704,7 +704,7 @@ struct page *vmalloc_to_page(const void *vmalloc_addr) return NULL; ptep = pte_offset_kernel(pmd, addr); - pte = *ptep; + pte = ptep_get(ptep); if (pte_present(pte)) page = pte_page(pte); diff --git a/mm/vmscan.c b/mm/vmscan.c index 3f64c8d9f629..e305c11ec8fc 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -4037,15 +4037,16 @@ restart: for (i = pte_index(start), addr = start; addr != end; i++, addr += PAGE_SIZE) { unsigned long pfn; struct folio *folio; + pte_t ptent = ptep_get(pte + i); total++; walk->mm_stats[MM_LEAF_TOTAL]++; - pfn = get_pte_pfn(pte[i], args->vma, addr); + pfn = get_pte_pfn(ptent, args->vma, addr); if (pfn == -1) continue; - if (!pte_young(pte[i])) { + if (!pte_young(ptent)) { walk->mm_stats[MM_LEAF_OLD]++; continue; } @@ -4060,7 +4061,7 @@ restart: young++; walk->mm_stats[MM_LEAF_YOUNG]++; - if (pte_dirty(pte[i]) && !folio_test_dirty(folio) && + if (pte_dirty(ptent) && !folio_test_dirty(folio) && !(folio_test_anon(folio) && folio_test_swapbacked(folio) && !folio_test_swapcache(folio))) folio_mark_dirty(folio); @@ -4703,12 +4704,13 @@ void lru_gen_look_around(struct page_vma_mapped_walk *pvmw) for (i = 0, addr = start; addr != end; i++, addr += PAGE_SIZE) { unsigned long pfn; + pte_t ptent = ptep_get(pte + i); - pfn = get_pte_pfn(pte[i], pvmw->vma, addr); + pfn = get_pte_pfn(ptent, pvmw->vma, addr); if (pfn == -1) continue; - if (!pte_young(pte[i])) + if (!pte_young(ptent)) continue; folio = get_pfn_folio(pfn, memcg, pgdat, !walk || walk->can_swap); @@ -4720,7 +4722,7 @@ void lru_gen_look_around(struct page_vma_mapped_walk *pvmw) young++; - if (pte_dirty(pte[i]) && !folio_test_dirty(folio) && + if (pte_dirty(ptent) && !folio_test_dirty(folio) && !(folio_test_anon(folio) && folio_test_swapbacked(folio) && !folio_test_swapcache(folio))) folio_mark_dirty(folio); diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 51e4882d0873..fb37adecfc91 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -2578,6 +2578,7 @@ static int hva_to_pfn_remapped(struct vm_area_struct *vma, { kvm_pfn_t pfn; pte_t *ptep; + pte_t pte; spinlock_t *ptl; int r; @@ -2601,14 +2602,16 @@ static int hva_to_pfn_remapped(struct vm_area_struct *vma, return r; } - if (write_fault && !pte_write(*ptep)) { + pte = ptep_get(ptep); + + if (write_fault && !pte_write(pte)) { pfn = KVM_PFN_ERR_RO_FAULT; goto out; } if (writable) - *writable = pte_write(*ptep); - pfn = pte_pfn(*ptep); + *writable = pte_write(pte); + pfn = pte_pfn(pte); /* * Get a reference here because callers of *hva_to_pfn* and @@ -2626,7 +2629,7 @@ static int hva_to_pfn_remapped(struct vm_area_struct *vma, * tail pages of non-compound higher order allocations, which * would then underflow the refcount when the caller does the * required put_page. Don't allow those pages here. - */ + */ if (!kvm_try_get_pfn(pfn)) r = -EFAULT; -- cgit v1.2.3 From 6c77b607ee26472fb945aa41734281c39d06d68f Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Wed, 14 Jun 2023 22:36:12 +0800 Subject: mm: kill lock|unlock_page_memcg() Since commit c7c3dec1c9db ("mm: rmap: remove lock_page_memcg()"), no more user, kill lock_page_memcg() and unlock_page_memcg(). Link: https://lkml.kernel.org/r/20230614143612.62575-1-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Acked-by: Johannes Weiner Reviewed-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- Documentation/admin-guide/cgroup-v1/memory.rst | 2 +- include/linux/memcontrol.h | 12 +----------- mm/filemap.c | 2 +- mm/memcontrol.c | 18 ++++-------------- mm/page-writeback.c | 6 +++--- 5 files changed, 10 insertions(+), 30 deletions(-) (limited to 'mm/memcontrol.c') diff --git a/Documentation/admin-guide/cgroup-v1/memory.rst b/Documentation/admin-guide/cgroup-v1/memory.rst index 47d1d7d932a8..fabaad3fd9c2 100644 --- a/Documentation/admin-guide/cgroup-v1/memory.rst +++ b/Documentation/admin-guide/cgroup-v1/memory.rst @@ -297,7 +297,7 @@ Lock order is as follows:: Page lock (PG_locked bit of page->flags) mm->page_table_lock or split pte_lock - lock_page_memcg (memcg->move_lock) + folio_memcg_lock (memcg->move_lock) mapping->i_pages lock lruvec->lru_lock. diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 00a88cf947e1..c3d3a0c09315 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -419,7 +419,7 @@ static inline struct obj_cgroup *__folio_objcg(struct folio *folio) * * - the folio lock * - LRU isolation - * - lock_page_memcg() + * - folio_memcg_lock() * - exclusive reference * - mem_cgroup_trylock_pages() * @@ -949,8 +949,6 @@ void mem_cgroup_print_oom_group(struct mem_cgroup *memcg); void folio_memcg_lock(struct folio *folio); void folio_memcg_unlock(struct folio *folio); -void lock_page_memcg(struct page *page); -void unlock_page_memcg(struct page *page); void __mod_memcg_state(struct mem_cgroup *memcg, int idx, int val); @@ -1438,14 +1436,6 @@ mem_cgroup_print_oom_meminfo(struct mem_cgroup *memcg) { } -static inline void lock_page_memcg(struct page *page) -{ -} - -static inline void unlock_page_memcg(struct page *page) -{ -} - static inline void folio_memcg_lock(struct folio *folio) { } diff --git a/mm/filemap.c b/mm/filemap.c index 00933089b8b6..758bbdf300e7 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -117,7 +117,7 @@ * ->i_pages lock (page_remove_rmap->set_page_dirty) * bdi.wb->list_lock (page_remove_rmap->set_page_dirty) * ->inode->i_lock (page_remove_rmap->set_page_dirty) - * ->memcg->move_lock (page_remove_rmap->lock_page_memcg) + * ->memcg->move_lock (page_remove_rmap->folio_memcg_lock) * bdi.wb->list_lock (zap_pte_range->set_page_dirty) * ->inode->i_lock (zap_pte_range->set_page_dirty) * ->private_lock (zap_pte_range->block_dirty_folio) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 93056918e956..cf06b1c9b3bb 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2148,17 +2148,12 @@ again: * When charge migration first begins, we can have multiple * critical sections holding the fast-path RCU lock and one * holding the slowpath move_lock. Track the task who has the - * move_lock for unlock_page_memcg(). + * move_lock for folio_memcg_unlock(). */ memcg->move_lock_task = current; memcg->move_lock_flags = flags; } -void lock_page_memcg(struct page *page) -{ - folio_memcg_lock(page_folio(page)); -} - static void __folio_memcg_unlock(struct mem_cgroup *memcg) { if (memcg && memcg->move_lock_task == current) { @@ -2186,11 +2181,6 @@ void folio_memcg_unlock(struct folio *folio) __folio_memcg_unlock(folio_memcg(folio)); } -void unlock_page_memcg(struct page *page) -{ - folio_memcg_unlock(page_folio(page)); -} - struct memcg_stock_pcp { local_lock_t stock_lock; struct mem_cgroup *cached; /* this never be root cgroup */ @@ -2866,7 +2856,7 @@ static void commit_charge(struct folio *folio, struct mem_cgroup *memcg) * * - the page lock * - LRU isolation - * - lock_page_memcg() + * - folio_memcg_lock() * - exclusive reference * - mem_cgroup_trylock_pages() */ @@ -5829,7 +5819,7 @@ static int mem_cgroup_move_account(struct page *page, * with (un)charging, migration, LRU putback, or anything else * that would rely on a stable page's memory cgroup. * - * Note that lock_page_memcg is a memcg lock, not a page lock, + * Note that folio_memcg_lock is a memcg lock, not a page lock, * to save space. As soon as we switch page's memory cgroup to a * new memcg that isn't locked, the above state can change * concurrently again. Make sure we're truly done with it. @@ -6320,7 +6310,7 @@ static void mem_cgroup_move_charge(void) { lru_add_drain_all(); /* - * Signal lock_page_memcg() to take the memcg's move_lock + * Signal folio_memcg_lock() to take the memcg's move_lock * while we're moving its pages to another memcg. Then wait * for already started RCU-only updates to finish. */ diff --git a/mm/page-writeback.c b/mm/page-writeback.c index db7943999007..1d17fb1ec863 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -2597,7 +2597,7 @@ EXPORT_SYMBOL(noop_dirty_folio); /* * Helper function for set_page_dirty family. * - * Caller must hold lock_page_memcg(). + * Caller must hold folio_memcg_lock(). * * NOTE: This relies on being atomic wrt interrupts. */ @@ -2631,7 +2631,7 @@ static void folio_account_dirtied(struct folio *folio, /* * Helper function for deaccounting dirty page without writeback. * - * Caller must hold lock_page_memcg(). + * Caller must hold folio_memcg_lock(). */ void folio_account_cleaned(struct folio *folio, struct bdi_writeback *wb) { @@ -2650,7 +2650,7 @@ void folio_account_cleaned(struct folio *folio, struct bdi_writeback *wb) * If warn is true, then emit a warning if the folio is not uptodate and has * not been truncated. * - * The caller must hold lock_page_memcg(). Most callers have the folio + * The caller must hold folio_memcg_lock(). Most callers have the folio * locked. A few have the folio blocked from truncation through other * means (eg zap_vma_pages() has it mapped and is holding the page table * lock). This can also be called from mark_buffer_dirty(), which I -- cgit v1.2.3 From 025b7799b35d32e46988ba0614ea2f91b85d6375 Mon Sep 17 00:00:00 2001 From: ZhangPeng Date: Fri, 16 Jun 2023 14:30:30 +0800 Subject: mm/memcg: remove return value of mem_cgroup_scan_tasks() No user checks the return value of mem_cgroup_scan_tasks(). Make the return value void. Link: https://lkml.kernel.org/r/20230616063030.977586-1-zhangpeng362@huawei.com Signed-off-by: ZhangPeng Cc: Johannes Weiner Cc: Kefeng Wang Cc: Michal Hocko Cc: Muchun Song Cc: Nanyong Sun Cc: Roman Gushchin Cc: Shakeel Butt Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 7 +++---- mm/memcontrol.c | 9 ++++----- 2 files changed, 7 insertions(+), 9 deletions(-) (limited to 'mm/memcontrol.c') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index c3d3a0c09315..5818af8eca5a 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -820,8 +820,8 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *, struct mem_cgroup *, struct mem_cgroup_reclaim_cookie *); void mem_cgroup_iter_break(struct mem_cgroup *, struct mem_cgroup *); -int mem_cgroup_scan_tasks(struct mem_cgroup *, - int (*)(struct task_struct *, void *), void *); +void mem_cgroup_scan_tasks(struct mem_cgroup *memcg, + int (*)(struct task_struct *, void *), void *arg); static inline unsigned short mem_cgroup_id(struct mem_cgroup *memcg) { @@ -1364,10 +1364,9 @@ static inline void mem_cgroup_iter_break(struct mem_cgroup *root, { } -static inline int mem_cgroup_scan_tasks(struct mem_cgroup *memcg, +static inline void mem_cgroup_scan_tasks(struct mem_cgroup *memcg, int (*fn)(struct task_struct *, void *), void *arg) { - return 0; } static inline unsigned short mem_cgroup_id(struct mem_cgroup *memcg) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index cf06b1c9b3bb..a834b1edcde9 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1259,13 +1259,13 @@ static void invalidate_reclaim_iterators(struct mem_cgroup *dead_memcg) * * This function iterates over tasks attached to @memcg or to any of its * descendants and calls @fn for each task. If @fn returns a non-zero - * value, the function breaks the iteration loop and returns the value. - * Otherwise, it will iterate over all tasks and return 0. + * value, the function breaks the iteration loop. Otherwise, it will iterate + * over all tasks and return 0. * * This function must not be called for the root memory cgroup. */ -int mem_cgroup_scan_tasks(struct mem_cgroup *memcg, - int (*fn)(struct task_struct *, void *), void *arg) +void mem_cgroup_scan_tasks(struct mem_cgroup *memcg, + int (*fn)(struct task_struct *, void *), void *arg) { struct mem_cgroup *iter; int ret = 0; @@ -1285,7 +1285,6 @@ int mem_cgroup_scan_tasks(struct mem_cgroup *memcg, break; } } - return ret; } #ifdef CONFIG_DEBUG_VM -- cgit v1.2.3 From 91f0dccef141483f8399299c39ce9114d19bb147 Mon Sep 17 00:00:00 2001 From: Haifeng Xu Date: Mon, 19 Jun 2023 13:04:42 +0000 Subject: mm/memcontrol: do not tweak node in mem_cgroup_init() mem_cgroup_init() request for allocations from each possible node, and it's used to be a problem because NODE_DATA is not allocated for offline node. Things have already changed since commit 09f49dca570a9 ("mm: handle uninitialized numa nodes gracefully"), so it's unnecessary to check for !node_online nodes here. How to test? qemu-system-x86_64 \ -kernel vmlinux \ -initrd full.rootfs.cpio.gz \ -append "console=ttyS0,115200 root=/dev/ram0 nokaslr earlyprintk=serial oops=panic panic_on_warn" \ -drive format=qcow2,file=vm_disk.qcow2,media=disk,if=ide \ -enable-kvm \ -cpu host \ -m 8G,slots=2,maxmem=16G \ -smp cores=4,threads=1,sockets=2 \ -object memory-backend-ram,id=mem0,size=4G \ -object memory-backend-ram,id=mem1,size=4G \ -numa node,memdev=mem0,cpus=0-3,nodeid=0 \ -numa node,memdev=mem1,cpus=4-7,nodeid=1 \ -numa node,nodeid=2 \ -net nic,model=virtio,macaddr=52:54:00:12:34:58 \ -net user \ -nographic \ -rtc base=localtime \ -gdb tcp::6000 Guest state when booting: [ 0.048881] NUMA: Node 0 [mem 0x00000000-0x0009ffff] + [mem 0x00100000-0xbfffffff] -> [mem 0x00000000-0xbfffffff] [ 0.050489] NUMA: Node 0 [mem 0x00000000-0xbfffffff] + [mem 0x100000000-0x13fffffff] -> [mem 0x00000000-0x13fffffff] [ 0.052173] NODE_DATA(0) allocated [mem 0x13fffc000-0x13fffffff] [ 0.053164] NODE_DATA(1) allocated [mem 0x23fffa000-0x23fffdfff] [ 0.054187] Zone ranges: [ 0.054587] DMA [mem 0x0000000000001000-0x0000000000ffffff] [ 0.055551] DMA32 [mem 0x0000000001000000-0x00000000ffffffff] [ 0.056515] Normal [mem 0x0000000100000000-0x000000023fffffff] [ 0.057484] Movable zone start for each node [ 0.058149] Early memory node ranges [ 0.058705] node 0: [mem 0x0000000000001000-0x000000000009efff] [ 0.059679] node 0: [mem 0x0000000000100000-0x00000000bffdffff] [ 0.060659] node 0: [mem 0x0000000100000000-0x000000013fffffff] [ 0.061649] node 1: [mem 0x0000000140000000-0x000000023fffffff] [ 0.062638] Initmem setup node 0 [mem 0x0000000000001000-0x000000013fffffff] [ 0.063745] Initmem setup node 1 [mem 0x0000000140000000-0x000000023fffffff] [ 0.064855] DMA zone: 158 reserved pages exceeds freesize 0 [ 0.065746] Initializing node 2 as memoryless [ 0.066437] Initmem setup node 2 as memoryless [ 0.067132] DMA zone: 158 reserved pages exceeds freesize 0 [ 0.068037] On node 0, zone DMA: 1 pages in unavailable ranges [ 0.068265] On node 0, zone DMA: 97 pages in unavailable ranges [ 0.124755] On node 0, zone Normal: 32 pages in unavailable ranges cat /sys/devices/system/node/online 0-1 cat /sys/devices/system/node/possible 0-2 Link: https://lkml.kernel.org/r/20230619130442.2487-1-haifeng.xu@shopee.com Signed-off-by: Haifeng Xu Acked-by: Michal Hocko Cc: Johannes Weiner Cc: Roman Gushchin Cc: Shakeel Butt Signed-off-by: Andrew Morton --- mm/memcontrol.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'mm/memcontrol.c') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index a834b1edcde9..e8ca4bdcb03c 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -7376,8 +7376,7 @@ static int __init mem_cgroup_init(void) for_each_node(node) { struct mem_cgroup_tree_per_node *rtpn; - rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, - node_online(node) ? node : NUMA_NO_NODE); + rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, node); rtpn->rb_root = RB_ROOT; rtpn->rb_rightmost = NULL; -- cgit v1.2.3