From 161e393c0f63592a3b95bdd8b55752653763fc6d Mon Sep 17 00:00:00 2001 From: Rick Edgecombe Date: Mon, 12 Jun 2023 17:10:29 -0700 Subject: mm: Make pte_mkwrite() take a VMA The x86 Shadow stack feature includes a new type of memory called shadow stack. This shadow stack memory has some unusual properties, which requires some core mm changes to function properly. One of these unusual properties is that shadow stack memory is writable, but only in limited ways. These limits are applied via a specific PTE bit combination. Nevertheless, the memory is writable, and core mm code will need to apply the writable permissions in the typical paths that call pte_mkwrite(). Future patches will make pte_mkwrite() take a VMA, so that the x86 implementation of it can know whether to create regular writable or shadow stack mappings. But there are a couple of challenges to this. Modifying the signatures of each arch pte_mkwrite() implementation would be error prone because some are generated with macros and would need to be re-implemented. Also, some pte_mkwrite() callers operate on kernel memory without a VMA. So this can be done in a three step process. First pte_mkwrite() can be renamed to pte_mkwrite_novma() in each arch, with a generic pte_mkwrite() added that just calls pte_mkwrite_novma(). Next callers without a VMA can be moved to pte_mkwrite_novma(). And lastly, pte_mkwrite() and all callers can be changed to take/pass a VMA. Previous work pte_mkwrite() renamed pte_mkwrite_novma() and converted callers that don't have a VMA were to use pte_mkwrite_novma(). So now change pte_mkwrite() to take a VMA and change the remaining callers to pass a VMA. Apply the same changes for pmd_mkwrite(). No functional change. Suggested-by: David Hildenbrand Signed-off-by: Rick Edgecombe Signed-off-by: Dave Hansen Reviewed-by: Mike Rapoport (IBM) Acked-by: David Hildenbrand Link: https://lore.kernel.org/all/20230613001108.3040476-4-rick.p.edgecombe%40intel.com --- mm/huge_memory.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'mm/huge_memory.c') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index eb3678360b97..23c2aa612926 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -551,7 +551,7 @@ __setup("transparent_hugepage=", setup_transparent_hugepage); pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma) { if (likely(vma->vm_flags & VM_WRITE)) - pmd = pmd_mkwrite(pmd); + pmd = pmd_mkwrite(pmd, vma); return pmd; } @@ -1572,7 +1572,7 @@ out_map: pmd = pmd_modify(oldpmd, vma->vm_page_prot); pmd = pmd_mkyoung(pmd); if (writable) - pmd = pmd_mkwrite(pmd); + pmd = pmd_mkwrite(pmd, vma); set_pmd_at(vma->vm_mm, haddr, vmf->pmd, pmd); update_mmu_cache_pmd(vma, vmf->address, vmf->pmd); spin_unlock(vmf->ptl); @@ -1925,7 +1925,7 @@ int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, /* See change_pte_range(). */ if ((cp_flags & MM_CP_TRY_CHANGE_WRITABLE) && !pmd_write(entry) && can_change_pmd_writable(vma, addr, entry)) - entry = pmd_mkwrite(entry); + entry = pmd_mkwrite(entry, vma); ret = HPAGE_PMD_NR; set_pmd_at(mm, addr, pmd, entry); @@ -2243,7 +2243,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, } else { entry = mk_pte(page + i, READ_ONCE(vma->vm_page_prot)); if (write) - entry = pte_mkwrite(entry); + entry = pte_mkwrite(entry, vma); if (anon_exclusive) SetPageAnonExclusive(page + i); if (!young) @@ -3287,7 +3287,7 @@ void remove_migration_pmd(struct page_vma_mapped_walk *pvmw, struct page *new) if (pmd_swp_soft_dirty(*pvmw->pmd)) pmde = pmd_mksoft_dirty(pmde); if (is_writable_migration_entry(entry)) - pmde = pmd_mkwrite(pmde); + pmde = pmd_mkwrite(pmde, vma); if (pmd_swp_uffd_wp(*pvmw->pmd)) pmde = pmd_mkuffd_wp(pmde); if (!is_migration_entry_young(entry)) -- cgit v1.2.3 From e5136e876581ba5b63220378e25fec9dcec7bad1 Mon Sep 17 00:00:00 2001 From: Rick Edgecombe Date: Mon, 12 Jun 2023 17:10:43 -0700 Subject: mm: Warn on shadow stack memory in wrong vma The x86 Control-flow Enforcement Technology (CET) feature includes a new type of memory called shadow stack. This shadow stack memory has some unusual properties, which requires some core mm changes to function properly. One sharp edge is that PTEs that are both Write=0 and Dirty=1 are treated as shadow by the CPU, but this combination used to be created by the kernel on x86. Previous patches have changed the kernel to now avoid creating these PTEs unless they are for shadow stack memory. In case any missed corners of the kernel are still creating PTEs like this for non-shadow stack memory, and to catch any re-introductions of the logic, warn if any shadow stack PTEs (Write=0, Dirty=1) are found in non-shadow stack VMAs when they are being zapped. This won't catch transient cases but should have decent coverage. In order to check if a PTE is shadow stack in core mm code, add two arch breakouts arch_check_zapped_pte/pmd(). This will allow shadow stack specific code to be kept in arch/x86. Only do the check if shadow stack is supported by the CPU and configured because in rare cases older CPUs may write Dirty=1 to a Write=0 CPU on older CPUs. This check is handled in pte_shstk()/pmd_shstk(). Signed-off-by: Rick Edgecombe Signed-off-by: Dave Hansen Reviewed-by: Mark Brown Acked-by: Mike Rapoport (IBM) Tested-by: Pengfei Xu Tested-by: John Allen Tested-by: Kees Cook Link: https://lore.kernel.org/all/20230613001108.3040476-18-rick.p.edgecombe%40intel.com --- arch/x86/include/asm/pgtable.h | 6 ++++++ arch/x86/mm/pgtable.c | 20 ++++++++++++++++++++ include/linux/pgtable.h | 14 ++++++++++++++ mm/huge_memory.c | 1 + mm/memory.c | 1 + 5 files changed, 42 insertions(+) (limited to 'mm/huge_memory.c') diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 7bab1b22a354..9255b5b7a9d9 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -1665,6 +1665,12 @@ static inline bool arch_has_hw_pte_young(void) return true; } +#define arch_check_zapped_pte arch_check_zapped_pte +void arch_check_zapped_pte(struct vm_area_struct *vma, pte_t pte); + +#define arch_check_zapped_pmd arch_check_zapped_pmd +void arch_check_zapped_pmd(struct vm_area_struct *vma, pmd_t pmd); + #ifdef CONFIG_XEN_PV #define arch_has_hw_nonleaf_pmd_young arch_has_hw_nonleaf_pmd_young static inline bool arch_has_hw_nonleaf_pmd_young(void) diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index 217c436acfd3..4bfbe4cb6978 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -886,3 +886,23 @@ pmd_t pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma) return pmd_clear_saveddirty(pmd); } + +void arch_check_zapped_pte(struct vm_area_struct *vma, pte_t pte) +{ + /* + * Hardware before shadow stack can (rarely) set Dirty=1 + * on a Write=0 PTE. So the below condition + * only indicates a software bug when shadow stack is + * supported by the HW. This checking is covered in + * pte_shstk(). + */ + VM_WARN_ON_ONCE(!(vma->vm_flags & VM_SHADOW_STACK) && + pte_shstk(pte)); +} + +void arch_check_zapped_pmd(struct vm_area_struct *vma, pmd_t pmd) +{ + /* See note in arch_check_zapped_pte() */ + VM_WARN_ON_ONCE(!(vma->vm_flags & VM_SHADOW_STACK) && + pmd_shstk(pmd)); +} diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index 9462f4a87d42..dd4637d6cfaa 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -313,6 +313,20 @@ static inline bool arch_has_hw_pte_young(void) } #endif +#ifndef arch_check_zapped_pte +static inline void arch_check_zapped_pte(struct vm_area_struct *vma, + pte_t pte) +{ +} +#endif + +#ifndef arch_check_zapped_pmd +static inline void arch_check_zapped_pmd(struct vm_area_struct *vma, + pmd_t pmd) +{ +} +#endif + #ifndef __HAVE_ARCH_PTEP_GET_AND_CLEAR static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long address, diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 23c2aa612926..554f6f82d225 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1681,6 +1681,7 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, */ orig_pmd = pmdp_huge_get_and_clear_full(vma, addr, pmd, tlb->fullmm); + arch_check_zapped_pmd(vma, orig_pmd); tlb_remove_pmd_tlb_entry(tlb, pmd, addr); if (vma_is_special_huge(vma)) { if (arch_needs_pgtable_deposit()) diff --git a/mm/memory.c b/mm/memory.c index f093c73512c5..36289f33327e 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1430,6 +1430,7 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, continue; ptent = ptep_get_and_clear_full(mm, addr, pte, tlb->fullmm); + arch_check_zapped_pte(vma, ptent); tlb_remove_tlb_entry(tlb, pte, addr); zap_install_uffd_wp_if_needed(vma, addr, pte, details, ptent); -- cgit v1.2.3