From 209954cbc7d0ce1a190fc725d20ce303d74d2680 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Thu, 14 Nov 2024 10:26:16 -0500 Subject: x86/mm/tlb: Update mm_cpumask lazily On busy multi-threaded workloads, there can be significant contention on the mm_cpumask at context switch time. Reduce that contention by updating mm_cpumask lazily, setting the CPU bit at context switch time (if not already set), and clearing the CPU bit at the first TLB flush sent to a CPU where the process isn't running. When a flurry of TLB flushes for a process happen, only the first one will be sent to CPUs where the process isn't running. The others will be sent to CPUs where the process is currently running. On an AMD Milan system with 36 cores, there is a noticeable difference: $ hackbench --groups 20 --loops 10000 Before: ~4.5s +/- 0.1s After: ~4.2s +/- 0.1s Signed-off-by: Rik van Riel Signed-off-by: Ingo Molnar Cc: Dave Hansen Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Mel Gorman Link: https://lore.kernel.org/r/20241114152723.1294686-2-riel@surriel.com --- arch/x86/mm/tlb.c | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) (limited to 'arch/x86/mm/tlb.c') diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index b0d5a644fc84..cc4e57ae690f 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -606,18 +606,15 @@ void switch_mm_irqs_off(struct mm_struct *unused, struct mm_struct *next, cond_mitigation(tsk); /* - * Stop remote flushes for the previous mm. - * Skip kernel threads; we never send init_mm TLB flushing IPIs, - * but the bitmap manipulation can cause cache line contention. + * Leave this CPU in prev's mm_cpumask. Atomic writes to + * mm_cpumask can be expensive under contention. The CPU + * will be removed lazily at TLB flush time. */ - if (prev != &init_mm) { - VM_WARN_ON_ONCE(!cpumask_test_cpu(cpu, - mm_cpumask(prev))); - cpumask_clear_cpu(cpu, mm_cpumask(prev)); - } + VM_WARN_ON_ONCE(prev != &init_mm && !cpumask_test_cpu(cpu, + mm_cpumask(prev))); /* Start receiving IPIs and then read tlb_gen (and LAM below) */ - if (next != &init_mm) + if (next != &init_mm && !cpumask_test_cpu(cpu, mm_cpumask(next))) cpumask_set_cpu(cpu, mm_cpumask(next)); next_tlb_gen = atomic64_read(&next->context.tlb_gen); @@ -761,8 +758,10 @@ static void flush_tlb_func(void *info) count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED); /* Can only happen on remote CPUs */ - if (f->mm && f->mm != loaded_mm) + if (f->mm && f->mm != loaded_mm) { + cpumask_clear_cpu(raw_smp_processor_id(), mm_cpumask(f->mm)); return; + } } if (unlikely(loaded_mm == &init_mm)) -- cgit v1.2.3 From 2815a56e4b7252a836969f5674ee356ea1ce482c Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Thu, 14 Nov 2024 10:26:17 -0500 Subject: x86/mm/tlb: Add tracepoint for TLB flush IPI to stale CPU Add a tracepoint when we send a TLB flush IPI to a CPU that used to be in the mm_cpumask, but isn't any more. Suggested-by: Dave Hansen Signed-off-by: Rik van Riel Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20241114152723.1294686-3-riel@surriel.com --- arch/x86/mm/tlb.c | 1 + include/linux/mm_types.h | 1 + 2 files changed, 2 insertions(+) (limited to 'arch/x86/mm/tlb.c') diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index cc4e57ae690f..1aac4fa90d3d 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -760,6 +760,7 @@ static void flush_tlb_func(void *info) /* Can only happen on remote CPUs */ if (f->mm && f->mm != loaded_mm) { cpumask_clear_cpu(raw_smp_processor_id(), mm_cpumask(f->mm)); + trace_tlb_flush(TLB_REMOTE_WRONG_CPU, 0); return; } } diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 6e3bdf8e38bc..6b6f05404304 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -1335,6 +1335,7 @@ enum tlb_flush_reason { TLB_LOCAL_SHOOTDOWN, TLB_LOCAL_MM_SHOOTDOWN, TLB_REMOTE_SEND_IPI, + TLB_REMOTE_WRONG_CPU, NR_TLB_FLUSH_REASONS, }; -- cgit v1.2.3 From 953753db887f9d70f70f61d6ecbe5cf209107672 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Thu, 5 Dec 2024 10:46:30 -0500 Subject: x86/mm/tlb: Also remove local CPU from mm_cpumask if stale The code in flush_tlb_func() that removes a remote CPU from the cpumask if it is no longer running the target mm is also needed on the originating CPU of a TLB flush, now that CPUs are no longer cleared from the mm_cpumask at context switch time. Flushing the TLB when we are not running the target mm is harmless, because the CPU's tlb_gen only gets updated to match the mm_tlb_gen, but it does hit this warning: WARN_ON_ONCE(local_tlb_gen > mm_tlb_gen); [ 210.343902][ T4668] WARNING: CPU: 38 PID: 4668 at arch/x86/mm/tlb.c:815 flush_tlb_func (arch/x86/mm/tlb.c:815) Removing both local and remote CPUs from the mm_cpumask when doing a flush for a not currently loaded mm avoids that warning. Reported-by: kernel test robot Tested-by: kernel test robot Signed-off-by: Rik van Riel Signed-off-by: Ingo Molnar Cc: Dave Hansen Cc: Mathieu Desnoyers Cc: Andy Lutomirski Cc: Peter Zijlstra Cc: Linus Torvalds Link: https://lore.kernel.org/r/20241205104630.755706ca@fangorn Closes: https://lore.kernel.org/oe-lkp/202412051551.690e9656-lkp@intel.com --- arch/x86/mm/tlb.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'arch/x86/mm/tlb.c') diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 1aac4fa90d3d..3c30817ec6a2 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -756,13 +756,13 @@ static void flush_tlb_func(void *info) if (!local) { inc_irq_stat(irq_tlb_count); count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED); + } - /* Can only happen on remote CPUs */ - if (f->mm && f->mm != loaded_mm) { - cpumask_clear_cpu(raw_smp_processor_id(), mm_cpumask(f->mm)); - trace_tlb_flush(TLB_REMOTE_WRONG_CPU, 0); - return; - } + /* The CPU was left in the mm_cpumask of the target mm. Clear it. */ + if (f->mm && f->mm != loaded_mm) { + cpumask_clear_cpu(raw_smp_processor_id(), mm_cpumask(f->mm)); + trace_tlb_flush(TLB_REMOTE_WRONG_CPU, 0); + return; } if (unlikely(loaded_mm == &init_mm)) -- cgit v1.2.3 From 6db2526c1d694c91c6e05e2f186c085e9460f202 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Wed, 4 Dec 2024 21:03:16 -0500 Subject: x86/mm/tlb: Only trim the mm_cpumask once a second Setting and clearing CPU bits in the mm_cpumask is only ever done by the CPU itself, from the context switch code or the TLB flush code. Synchronization is handled by switch_mm_irqs_off() blocking interrupts. Sending TLB flush IPIs to CPUs that are in the mm_cpumask, but no longer running the program causes a regression in the will-it-scale tlbflush2 test. This test is contrived, but a large regression here might cause a small regression in some real world workload. Instead of always sending IPIs to CPUs that are in the mm_cpumask, but no longer running the program, send these IPIs only once a second. The rest of the time we can skip over CPUs where the loaded_mm is different from the target mm. Reported-by: kernel test roboto Signed-off-by: Rik van Riel Signed-off-by: Ingo Molnar Cc: Dave Hansen Cc: Andy Lutomirski Cc: Mathieu Desnoyers Cc: Peter Zijlstra Cc: Linus Torvalds Link: https://lore.kernel.org/r/20241204210316.612ee573@fangorn Closes: https://lore.kernel.org/oe-lkp/202411282207.6bd28eae-lkp@intel.com/ --- arch/x86/include/asm/mmu.h | 2 ++ arch/x86/include/asm/mmu_context.h | 1 + arch/x86/include/asm/tlbflush.h | 1 + arch/x86/mm/tlb.c | 35 ++++++++++++++++++++++++++++++++--- 4 files changed, 36 insertions(+), 3 deletions(-) (limited to 'arch/x86/mm/tlb.c') diff --git a/arch/x86/include/asm/mmu.h b/arch/x86/include/asm/mmu.h index ce4677b8b735..3b496cdcb74b 100644 --- a/arch/x86/include/asm/mmu.h +++ b/arch/x86/include/asm/mmu.h @@ -37,6 +37,8 @@ typedef struct { */ atomic64_t tlb_gen; + unsigned long next_trim_cpumask; + #ifdef CONFIG_MODIFY_LDT_SYSCALL struct rw_semaphore ldt_usr_sem; struct ldt_struct *ldt; diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h index 2886cb668d7f..795fdd53bd0a 100644 --- a/arch/x86/include/asm/mmu_context.h +++ b/arch/x86/include/asm/mmu_context.h @@ -151,6 +151,7 @@ static inline int init_new_context(struct task_struct *tsk, mm->context.ctx_id = atomic64_inc_return(&last_mm_ctx_id); atomic64_set(&mm->context.tlb_gen, 0); + mm->context.next_trim_cpumask = jiffies + HZ; #ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS if (cpu_feature_enabled(X86_FEATURE_OSPKE)) { diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index 69e79fff41b8..02fc2aa06e9e 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -222,6 +222,7 @@ struct flush_tlb_info { unsigned int initiating_cpu; u8 stride_shift; u8 freed_tables; + u8 trim_cpumask; }; void flush_tlb_local(void); diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 3c30817ec6a2..458a5d5be594 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -892,9 +892,36 @@ done: nr_invalidate); } -static bool tlb_is_not_lazy(int cpu, void *data) +static bool should_flush_tlb(int cpu, void *data) { - return !per_cpu(cpu_tlbstate_shared.is_lazy, cpu); + struct flush_tlb_info *info = data; + + /* Lazy TLB will get flushed at the next context switch. */ + if (per_cpu(cpu_tlbstate_shared.is_lazy, cpu)) + return false; + + /* No mm means kernel memory flush. */ + if (!info->mm) + return true; + + /* The target mm is loaded, and the CPU is not lazy. */ + if (per_cpu(cpu_tlbstate.loaded_mm, cpu) == info->mm) + return true; + + /* In cpumask, but not the loaded mm? Periodically remove by flushing. */ + if (info->trim_cpumask) + return true; + + return false; +} + +static bool should_trim_cpumask(struct mm_struct *mm) +{ + if (time_after(jiffies, READ_ONCE(mm->context.next_trim_cpumask))) { + WRITE_ONCE(mm->context.next_trim_cpumask, jiffies + HZ); + return true; + } + return false; } DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state_shared, cpu_tlbstate_shared); @@ -928,7 +955,7 @@ STATIC_NOPV void native_flush_tlb_multi(const struct cpumask *cpumask, if (info->freed_tables) on_each_cpu_mask(cpumask, flush_tlb_func, (void *)info, true); else - on_each_cpu_cond_mask(tlb_is_not_lazy, flush_tlb_func, + on_each_cpu_cond_mask(should_flush_tlb, flush_tlb_func, (void *)info, 1, cpumask); } @@ -979,6 +1006,7 @@ static struct flush_tlb_info *get_flush_tlb_info(struct mm_struct *mm, info->freed_tables = freed_tables; info->new_tlb_gen = new_tlb_gen; info->initiating_cpu = smp_processor_id(); + info->trim_cpumask = 0; return info; } @@ -1021,6 +1049,7 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start, * flush_tlb_func_local() directly in this case. */ if (cpumask_any_but(mm_cpumask(mm), cpu) < nr_cpu_ids) { + info->trim_cpumask = should_trim_cpumask(mm); flush_tlb_multi(mm_cpumask(mm), info); } else if (mm == this_cpu_read(cpu_tlbstate.loaded_mm)) { lockdep_assert_irqs_enabled(); -- cgit v1.2.3