From 258df8fce42fecc23cd04242de3d39f1fe836433 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 22 May 2026 07:22:12 -1000 Subject: mm: Add ptep_try_set() for lockless empty-slot installs Add ptep_try_set(ptep, new_pte): atomically set *ptep to new_pte iff it is currently pte_none(). Returns true on success, false if the slot was already populated or the arch has no implementation. The intended caller is the upcoming bpf_arena kernel-side fault recovery path. The install runs from a page fault that can be nested under locks held by the faulting kernel caller (e.g. a BPF program holding raw_res_spin_lock_irqsave on its arena's spinlock), so trylock-and-retry would A-A deadlock. Lock-free cmpxchg is the only viable option, which constrains this helper to special kernel page tables where concurrent writers cooperate via atomic accessors. The generic version in returns false. x86 and arm64 override with try_cmpxchg-based implementations on the underlying pteval. Other architectures get the false stub - the callers there already fall through to oops. v2: Rename to ptep_try_set(). Tighten kerneldoc. (David, Alexei) v3: Note that strict-zero cmpxchg is narrower than pte_none(). (Andrea) Suggested-by: Kumar Kartikeya Dwivedi Suggested-by: Alexei Starovoitov Signed-off-by: Tejun Heo Reviewed-by: Andrea Righi Cc: David Hildenbrand Acked-by: David Hildenbrand (arm) Link: https://lore.kernel.org/r/20260522172219.1423324-2-tj@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/pgtable.h | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) (limited to 'include/linux/pgtable.h') diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index cdd68ed3ae1a..b5739bb99fc1 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -1036,6 +1036,31 @@ static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addres } #endif +#ifndef ptep_try_set +/** + * ptep_try_set - atomically set an empty kernel PTE + * @ptep: page table entry + * @new_pte: value to install + * + * Atomically set *@ptep to @new_pte iff *@ptep is pte_none(). Return true on + * success, false if the slot was already populated or the arch has no + * implementation. + * + * For special kernel page tables only - never user page tables. The caller must + * prevent concurrent teardown of @ptep and must accept that other writers may + * race. Concurrent clearers must use ptep_get_and_clear() so racing accesses + * agree on the outcome. + * + * Architectures opt in by providing a cmpxchg-based override and defining + * ptep_try_set as an identity macro. The generic stub returns false, which is + * correct for callers that fall through to oops on failure. + */ +static inline bool ptep_try_set(pte_t *ptep, pte_t new_pte) +{ + return false; +} +#endif + #ifndef wrprotect_ptes /** * wrprotect_ptes - Write-protect PTEs that map consecutive pages of the same -- cgit v1.2.3 From f64c723741c911544cca4c838d7a291b06b3ad1d Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 1 Jun 2026 08:37:28 -1000 Subject: bpf: Replace scratch PTE atomically when allocating arena pages apply_range_set_cb() maps the pages for a new arena allocation and returned -EBUSY when the target PTE was already populated. Kernel-fault recovery leaves the per-arena scratch page in unallocated arena PTEs, so a later bpf_arena_alloc_pages() over such a page hits that -EBUSY, and every subsequent allocation of it fails the same way. Allocation must install the real page over scratch instead. Overwriting the scratch PTE in place is a valid->valid change, which arm64 forbids without break-before-make. Route through an invalid entry instead: ptep_try_set() fills only a none slot, so the PTE goes scratch->none->page. On finding scratch, clear it and flush_tlb_before_set() before retrying. The new flush_tlb_before_set() is a no-op except on arches like arm64 that need the break-before-make TLB invalidate. The loop also copes with a concurrent fault re-scratching the slot. Arches without ptep_try_set() never install the scratch page, so keep the must-be-empty check and set_pte_at() for them. Fixes: dc11a4dba246 ("bpf: Recover arena kernel faults with scratch page") Signed-off-by: Tejun Heo Cc: Alexei Starovoitov Cc: David Hildenbrand Acked-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20260601183728.1800490-1-tj@kernel.org Signed-off-by: Alexei Starovoitov --- arch/arm64/include/asm/pgtable.h | 11 +++++++++++ include/linux/pgtable.h | 18 ++++++++++++++++++ kernel/bpf/arena.c | 38 +++++++++++++++++++++++++++++++++----- 3 files changed, 62 insertions(+), 5 deletions(-) (limited to 'include/linux/pgtable.h') diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 984f0502c9d0..3ce0f2a6cab6 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -1842,6 +1842,17 @@ static inline bool ptep_try_set(pte_t *ptep, pte_t new_pte) } #define ptep_try_set ptep_try_set +/* + * arm64 mandates break-before-make: a cleared kernel PTE must have its TLB + * invalidated before a different page is installed in its place. The broadcast + * TLBI is an instruction, not an IPI, so this is safe with interrupts disabled. + */ +static inline void flush_tlb_before_set(unsigned long addr) +{ + flush_tlb_kernel_range(addr, addr + PAGE_SIZE); +} +#define flush_tlb_before_set flush_tlb_before_set + #define test_and_clear_young_ptes test_and_clear_young_ptes static inline bool test_and_clear_young_ptes(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, unsigned int nr) diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index b5739bb99fc1..4c6c4081ef71 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -1061,6 +1061,24 @@ static inline bool ptep_try_set(pte_t *ptep, pte_t new_pte) } #endif +#ifndef flush_tlb_before_set +/** + * flush_tlb_before_set - invalidate a kernel PTE's TLB before re-setting it + * @addr: kernel virtual address whose PTE was just cleared + * + * Some architectures (e.g. arm64) do not allow a live page-table entry to be + * repointed at a different page in one step. The old entry must first be made + * invalid and its translation flushed from every TLB, and only then may the new + * entry be written. + * + * This is only for the lockless atomic kernel-PTE installers (ptep_try_set()). + * It must be callable with interrupts disabled. + */ +static inline void flush_tlb_before_set(unsigned long addr) +{ +} +#endif + #ifndef wrprotect_ptes /** * wrprotect_ptes - Write-protect PTEs that map consecutive pages of the same diff --git a/kernel/bpf/arena.c b/kernel/bpf/arena.c index 9b2dea229b38..af49c154473d 100644 --- a/kernel/bpf/arena.c +++ b/kernel/bpf/arena.c @@ -144,6 +144,7 @@ static long compute_pgoff(struct bpf_arena *arena, long uaddr) struct apply_range_data { struct page **pages; + struct page *scratch_page; int i; }; @@ -156,19 +157,44 @@ static int apply_range_set_cb(pte_t *pte, unsigned long addr, void *data) { struct apply_range_data *d = data; struct page *page; + pte_t pteval; if (!data) return 0; - /* sanity check */ - if (unlikely(!pte_none(ptep_get(pte)))) - return -EBUSY; page = d->pages[d->i]; /* paranoia, similar to vmap_pages_pte_range() */ if (WARN_ON_ONCE(!pfn_valid(page_to_pfn(page)))) return -EINVAL; - set_pte_at(&init_mm, addr, pte, mk_pte(page, PAGE_KERNEL)); + pteval = mk_pte(page, PAGE_KERNEL); +#ifdef ptep_try_set + /* + * Kernel-fault recovery may have installed the scratch page here, and + * some architectures (arm64) prohibit valid->valid PTE transitions. + * Install atomically into a none slot. If scratch is present, clear it + * and flush_tlb_before_set() (break-before-make) before retrying. + */ + while (!ptep_try_set(pte, pteval)) { + pte_t old = ptep_get(pte); + + if (pte_none(old)) + continue; + if (WARN_ON_ONCE(pte_page(old) != d->scratch_page)) + return -EBUSY; + ptep_get_and_clear(&init_mm, addr, pte); + flush_tlb_before_set(addr); + } +#else + /* + * Without ptep_try_set() there is no atomic installer, but such arches + * also do not wire up bpf_arena_handle_page_fault(), so no scratch page + * is ever installed and the slot is always none here. + */ + if (unlikely(!pte_none(ptep_get(pte)))) + return -EBUSY; + set_pte_at(&init_mm, addr, pte, pteval); +#endif d->i++; return 0; } @@ -480,7 +506,8 @@ static vm_fault_t arena_vm_fault(struct vm_fault *vmf) if (ret) goto out_sigsegv_memcg; - struct apply_range_data data = { .pages = &page, .i = 0 }; + struct apply_range_data data = { .pages = &page, .i = 0, + .scratch_page = arena->scratch_page }; /* Account into memcg of the process that created bpf_arena */ ret = bpf_map_alloc_pages(map, NUMA_NO_NODE, 1, &page); if (ret) { @@ -670,6 +697,7 @@ static long arena_alloc_pages(struct bpf_arena *arena, long uaddr, long page_cnt return 0; } data.pages = pages; + data.scratch_page = arena->scratch_page; if (raw_res_spin_lock_irqsave(&arena->spinlock, flags)) goto out_free_pages; -- cgit v1.2.3