author		Rik van Riel <riel@fb.com>	2024-12-05 05:03:16 +0300
committer	Greg Kroah-Hartman <gregkh@linuxfoundation.org>	2025-02-21 15:50:00 +0300
commit		b47002ed65ade940839b7f439ff4a194e7d5ec28
tree		d80e624b54f703a2afa6a6c54118b8a8456a4f78
parent		04bac40ad5d6c339650fc72ea9b074a899ffd303
x86/mm/tlb: Only trim the mm_cpumask once a second
[ Upstream commit 6db2526c1d694c91c6e05e2f186c085e9460f202 ]
Setting and clearing CPU bits in the mm_cpumask is only ever done
by the CPU itself, from the context switch code or the TLB flush
code.
Synchronization is handled by switch_mm_irqs_off() blocking interrupts.
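As a rough sketch of that ownership rule (a simplified illustration only, not the kernel's actual switch_mm_irqs_off() or TLB flush code; the helper names here are made up):

#include <linux/cpumask.h>
#include <linux/mm_types.h>
#include <linux/smp.h>

/*
 * Simplified sketch: a CPU only ever sets or clears its *own* bit in
 * mm_cpumask, and only while interrupts are disabled, so the bit
 * update itself needs no extra locking.
 */
static void sketch_enter_mm(struct mm_struct *next)
{
	/* Caller runs with interrupts off, as switch_mm_irqs_off() does. */
	cpumask_set_cpu(smp_processor_id(), mm_cpumask(next));
}

static void sketch_drop_stale_bit(struct mm_struct *mm)
{
	/* The TLB flush path later clears a stale bit, again only on this CPU. */
	cpumask_clear_cpu(smp_processor_id(), mm_cpumask(mm));
}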
Sending TLB flush IPIs to CPUs that are in the mm_cpumask but are no
longer running the program causes a regression in the will-it-scale
tlbflush2 test. This test is contrived, but a large regression here
might cause a small regression in some real-world workload.
Instead of always sending IPIs to CPUs that are in the mm_cpumask,
but no longer running the program, send these IPIs only once a second.
The rest of the time we can skip over CPUs where the loaded_mm is
different from the target mm.
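Condensed sketch of the resulting per-CPU filter (same logic and field names as the should_flush_tlb()/should_trim_cpumask() helpers added in the diff below, trimmed to the decision order only):

#include <linux/jiffies.h>
#include <linux/percpu.h>
#include <asm/tlbflush.h>

/* Condensed restatement of the predicate added below, for readability. */
static bool sketch_should_flush_tlb(int cpu, struct flush_tlb_info *info)
{
	if (per_cpu(cpu_tlbstate_shared.is_lazy, cpu))
		return false;			/* lazy: flushed at next context switch */
	if (!info->mm)
		return true;			/* kernel flush: always send the IPI */
	if (per_cpu(cpu_tlbstate.loaded_mm, cpu) == info->mm)
		return true;			/* mm is really loaded on this CPU */
	return info->trim_cpumask;		/* stale bit: IPI at most once a second */
}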
Reported-by: kernel test robot <oliver.sang@intel.com>
Signed-off-by: Rik van Riel <riel@surriel.com>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Link: https://lore.kernel.org/r/20241204210316.612ee573@fangorn
Closes: https://lore.kernel.org/oe-lkp/202411282207.6bd28eae-lkp@intel.com/
Signed-off-by: Sasha Levin <sashal@kernel.org>
 arch/x86/include/asm/mmu.h         |  2
 arch/x86/include/asm/mmu_context.h |  1
 arch/x86/include/asm/tlbflush.h    |  1
 arch/x86/mm/tlb.c                  | 35
 4 files changed, 36 insertions(+), 3 deletions(-)
diff --git a/arch/x86/include/asm/mmu.h b/arch/x86/include/asm/mmu.h
index 5d7494631ea9..c07c018a1c13 100644
--- a/arch/x86/include/asm/mmu.h
+++ b/arch/x86/include/asm/mmu.h
@@ -33,6 +33,8 @@ typedef struct {
 	 */
 	atomic64_t tlb_gen;
 
+	unsigned long next_trim_cpumask;
+
 #ifdef CONFIG_MODIFY_LDT_SYSCALL
 	struct rw_semaphore	ldt_usr_sem;
 	struct ldt_struct	*ldt;
diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h
index b8d40ddeab00..6f5c7584fe1e 100644
--- a/arch/x86/include/asm/mmu_context.h
+++ b/arch/x86/include/asm/mmu_context.h
@@ -106,6 +106,7 @@ static inline int init_new_context(struct task_struct *tsk,
 
 	mm->context.ctx_id = atomic64_inc_return(&last_mm_ctx_id);
 	atomic64_set(&mm->context.tlb_gen, 0);
+	mm->context.next_trim_cpumask = jiffies + HZ;
 
 #ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
 	if (cpu_feature_enabled(X86_FEATURE_OSPKE)) {
diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index cda3118f3b27..d1eb6bbfd39e 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -208,6 +208,7 @@ struct flush_tlb_info {
 	unsigned int		initiating_cpu;
 	u8			stride_shift;
 	u8			freed_tables;
+	u8			trim_cpumask;
 };
 
 void flush_tlb_local(void);
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index c1e31e9a85d7..b07e2167fceb 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -878,9 +878,36 @@ done:
 			nr_invalidate);
 }
 
-static bool tlb_is_not_lazy(int cpu, void *data)
+static bool should_flush_tlb(int cpu, void *data)
 {
-	return !per_cpu(cpu_tlbstate_shared.is_lazy, cpu);
+	struct flush_tlb_info *info = data;
+
+	/* Lazy TLB will get flushed at the next context switch. */
+	if (per_cpu(cpu_tlbstate_shared.is_lazy, cpu))
+		return false;
+
+	/* No mm means kernel memory flush. */
+	if (!info->mm)
+		return true;
+
+	/* The target mm is loaded, and the CPU is not lazy. */
+	if (per_cpu(cpu_tlbstate.loaded_mm, cpu) == info->mm)
+		return true;
+
+	/* In cpumask, but not the loaded mm? Periodically remove by flushing. */
+	if (info->trim_cpumask)
+		return true;
+
+	return false;
+}
+
+static bool should_trim_cpumask(struct mm_struct *mm)
+{
+	if (time_after(jiffies, READ_ONCE(mm->context.next_trim_cpumask))) {
+		WRITE_ONCE(mm->context.next_trim_cpumask, jiffies + HZ);
+		return true;
+	}
+	return false;
 }
 
 DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state_shared, cpu_tlbstate_shared);
@@ -914,7 +941,7 @@ STATIC_NOPV void native_flush_tlb_multi(const struct cpumask *cpumask,
 	if (info->freed_tables)
 		on_each_cpu_mask(cpumask, flush_tlb_func, (void *)info, true);
 	else
-		on_each_cpu_cond_mask(tlb_is_not_lazy, flush_tlb_func,
+		on_each_cpu_cond_mask(should_flush_tlb, flush_tlb_func,
 				(void *)info, 1, cpumask);
 }
 
@@ -965,6 +992,7 @@ static struct flush_tlb_info *get_flush_tlb_info(struct mm_struct *mm,
 	info->freed_tables	= freed_tables;
 	info->new_tlb_gen	= new_tlb_gen;
 	info->initiating_cpu	= smp_processor_id();
+	info->trim_cpumask	= 0;
 
 	return info;
 }
@@ -1007,6 +1035,7 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
 	 * flush_tlb_func_local() directly in this case.
 	 */
 	if (cpumask_any_but(mm_cpumask(mm), cpu) < nr_cpu_ids) {
+		info->trim_cpumask = should_trim_cpumask(mm);
 		flush_tlb_multi(mm_cpumask(mm), info);
 	} else if (mm == this_cpu_read(cpu_tlbstate.loaded_mm)) {
 		lockdep_assert_irqs_enabled();