Diffstat (limited to 'arch/x86/mm')
-rw-r--r--  arch/x86/mm/init.c                  | 32
-rw-r--r--  arch/x86/mm/init_64.c               | 15
-rw-r--r--  arch/x86/mm/kaslr.c                 | 10
-rw-r--r--  arch/x86/mm/mem_encrypt_identity.c  |  4
-rw-r--r--  arch/x86/mm/pat/cpa-test.c          |  2
-rw-r--r--  arch/x86/mm/pat/memtype.c           | 52
-rw-r--r--  arch/x86/mm/pat/set_memory.c        |  6
-rw-r--r--  arch/x86/mm/tlb.c                   | 29
8 files changed, 99 insertions, 51 deletions
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index eb503f53c319..9cbc1e6057d3 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -263,28 +263,33 @@ static void __init probe_page_size_mask(void)
 }
 
 /*
- * INVLPG may not properly flush Global entries
- * on these CPUs when PCIDs are enabled.
+ * INVLPG may not properly flush Global entries on
+ * these CPUs. New microcode fixes the issue.
  */
 static const struct x86_cpu_id invlpg_miss_ids[] = {
-	X86_MATCH_VFM(INTEL_ALDERLAKE,      0),
-	X86_MATCH_VFM(INTEL_ALDERLAKE_L,    0),
-	X86_MATCH_VFM(INTEL_ATOM_GRACEMONT, 0),
-	X86_MATCH_VFM(INTEL_RAPTORLAKE,     0),
-	X86_MATCH_VFM(INTEL_RAPTORLAKE_P,   0),
-	X86_MATCH_VFM(INTEL_RAPTORLAKE_S,   0),
+	X86_MATCH_VFM(INTEL_ALDERLAKE,      0x2e),
+	X86_MATCH_VFM(INTEL_ALDERLAKE_L,    0x42c),
+	X86_MATCH_VFM(INTEL_ATOM_GRACEMONT, 0x11),
+	X86_MATCH_VFM(INTEL_RAPTORLAKE,     0x118),
+	X86_MATCH_VFM(INTEL_RAPTORLAKE_P,   0x4117),
+	X86_MATCH_VFM(INTEL_RAPTORLAKE_S,   0x2e),
 	{}
 };
 
 static void setup_pcid(void)
 {
+	const struct x86_cpu_id *invlpg_miss_match;
+
 	if (!IS_ENABLED(CONFIG_X86_64))
 		return;
 
 	if (!boot_cpu_has(X86_FEATURE_PCID))
 		return;
 
-	if (x86_match_cpu(invlpg_miss_ids)) {
+	invlpg_miss_match = x86_match_cpu(invlpg_miss_ids);
+
+	if (invlpg_miss_match &&
+	    boot_cpu_data.microcode < invlpg_miss_match->driver_data) {
 		pr_info("Incomplete global flushes, disabling PCID");
 		setup_clear_cpu_cap(X86_FEATURE_PCID);
 		return;
@@ -640,8 +645,13 @@ static void __init memory_map_top_down(unsigned long map_start,
 	 */
 	addr = memblock_phys_alloc_range(PMD_SIZE, PMD_SIZE, map_start,
 					 map_end);
-	memblock_phys_free(addr, PMD_SIZE);
-	real_end = addr + PMD_SIZE;
+	if (!addr) {
+		pr_warn("Failed to release memory for alloc_low_pages()");
+		real_end = max(map_start, ALIGN_DOWN(map_end, PMD_SIZE));
+	} else {
+		memblock_phys_free(addr, PMD_SIZE);
+		real_end = addr + PMD_SIZE;
+	}
 
 	/* step_size need to be small so pgt_buf from BRK could cover it */
 	step_size = PMD_SIZE;
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index ff253648706f..d8853afd314b 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -967,9 +967,18 @@ int add_pages(int nid, unsigned long start_pfn, unsigned long nr_pages,
 	ret = __add_pages(nid, start_pfn, nr_pages, params);
 	WARN_ON_ONCE(ret);
 
-	/* update max_pfn, max_low_pfn and high_memory */
-	update_end_of_memory_vars(start_pfn << PAGE_SHIFT,
-				  nr_pages << PAGE_SHIFT);
+	/*
+	 * Special case: add_pages() is called by memremap_pages() for adding device
+	 * private pages. Do not bump up max_pfn in the device private path,
+	 * because max_pfn changes affect dma_addressing_limited().
+	 *
+	 * dma_addressing_limited() returning true when max_pfn is the device's
+	 * addressable memory can force device drivers to use bounce buffers
+	 * and impact their performance negatively:
+	 */
+	if (!params->pgmap)
+		/* update max_pfn, max_low_pfn and high_memory */
+		update_end_of_memory_vars(start_pfn << PAGE_SHIFT, nr_pages << PAGE_SHIFT);
 
 	return ret;
 }
diff --git a/arch/x86/mm/kaslr.c b/arch/x86/mm/kaslr.c
index 230f1dee4f09..e0b0ec0f8245 100644
--- a/arch/x86/mm/kaslr.c
+++ b/arch/x86/mm/kaslr.c
@@ -109,8 +109,14 @@ void __init kernel_randomize_memory(void)
 	memory_tb = DIV_ROUND_UP(max_pfn << PAGE_SHIFT, 1UL << TB_SHIFT) +
 		CONFIG_RANDOMIZE_MEMORY_PHYSICAL_PADDING;
 
-	/* Adapt physical memory region size based on available memory */
-	if (memory_tb < kaslr_regions[0].size_tb)
+	/*
+	 * Adapt physical memory region size based on available memory,
+	 * except when CONFIG_PCI_P2PDMA is enabled. P2PDMA exposes the
+	 * device BAR space assuming the direct map space is large enough
+	 * for creating a ZONE_DEVICE mapping in the direct map corresponding
+	 * to the physical BAR address.
+	 */
+	if (!IS_ENABLED(CONFIG_PCI_P2PDMA) && (memory_tb < kaslr_regions[0].size_tb))
 		kaslr_regions[0].size_tb = memory_tb;
 
 	/*
diff --git a/arch/x86/mm/mem_encrypt_identity.c b/arch/x86/mm/mem_encrypt_identity.c
index ac33b2263a43..b922b9fea6b6 100644
--- a/arch/x86/mm/mem_encrypt_identity.c
+++ b/arch/x86/mm/mem_encrypt_identity.c
@@ -562,7 +562,7 @@ void __head sme_enable(struct boot_params *bp)
 	}
 
 	RIP_REL_REF(sme_me_mask) = me_mask;
-	physical_mask &= ~me_mask;
-	cc_vendor = CC_VENDOR_AMD;
+	RIP_REL_REF(physical_mask) &= ~me_mask;
+	RIP_REL_REF(cc_vendor) = CC_VENDOR_AMD;
 	cc_set_mask(me_mask);
 }
diff --git a/arch/x86/mm/pat/cpa-test.c b/arch/x86/mm/pat/cpa-test.c
index 3d2f7f0a6ed1..ad3c1feec990 100644
--- a/arch/x86/mm/pat/cpa-test.c
+++ b/arch/x86/mm/pat/cpa-test.c
@@ -183,7 +183,7 @@ static int pageattr_test(void)
 			break;
 
 		case 1:
-			err = change_page_attr_set(addrs, len[1], PAGE_CPA_TEST, 1);
+			err = change_page_attr_set(addrs, len[i], PAGE_CPA_TEST, 1);
 			break;
 
 		case 2:
diff --git a/arch/x86/mm/pat/memtype.c b/arch/x86/mm/pat/memtype.c
index feb8cc6a12bf..d721cc19addb 100644
--- a/arch/x86/mm/pat/memtype.c
+++ b/arch/x86/mm/pat/memtype.c
@@ -984,29 +984,42 @@ static int get_pat_info(struct vm_area_struct *vma, resource_size_t *paddr,
 	return -EINVAL;
 }
 
-/*
- * track_pfn_copy is called when vma that is covering the pfnmap gets
- * copied through copy_page_range().
- *
- * If the vma has a linear pfn mapping for the entire range, we get the prot
- * from pte and reserve the entire vma range with single reserve_pfn_range
- * call.
- */
-int track_pfn_copy(struct vm_area_struct *vma)
+int track_pfn_copy(struct vm_area_struct *dst_vma,
+		struct vm_area_struct *src_vma, unsigned long *pfn)
 {
+	const unsigned long vma_size = src_vma->vm_end - src_vma->vm_start;
 	resource_size_t paddr;
-	unsigned long vma_size = vma->vm_end - vma->vm_start;
 	pgprot_t pgprot;
+	int rc;
 
-	if (vma->vm_flags & VM_PAT) {
-		if (get_pat_info(vma, &paddr, &pgprot))
-			return -EINVAL;
-		/* reserve the whole chunk covered by vma. */
-		return reserve_pfn_range(paddr, vma_size, &pgprot, 1);
-	}
+	if (!(src_vma->vm_flags & VM_PAT))
+		return 0;
+
+	/*
+	 * Duplicate the PAT information for the dst VMA based on the src
+	 * VMA.
+	 */
+	if (get_pat_info(src_vma, &paddr, &pgprot))
+		return -EINVAL;
+	rc = reserve_pfn_range(paddr, vma_size, &pgprot, 1);
+	if (rc)
+		return rc;
 
+	/* Reservation for the destination VMA succeeded. */
+	vm_flags_set(dst_vma, VM_PAT);
+	*pfn = PHYS_PFN(paddr);
 	return 0;
 }
 
+void untrack_pfn_copy(struct vm_area_struct *dst_vma, unsigned long pfn)
+{
+	untrack_pfn(dst_vma, pfn, dst_vma->vm_end - dst_vma->vm_start, true);
+	/*
+	 * Reservation was freed, any copied page tables will get cleaned
+	 * up later, but without getting PAT involved again.
+	 */
+}
+
 /*
  * prot is passed in as a parameter for the new mapping. If the vma has
  * a linear pfn mapping for the entire range, or no vma is provided,
@@ -1095,15 +1108,6 @@ void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn,
 	}
 }
 
-/*
- * untrack_pfn_clear is called if the following situation fits:
- *
- * 1) while mremapping a pfnmap for a new region, with the old vma after
- *    its pfnmap page table has been removed. The new vma has a new pfnmap
- *    to the same pfn & cache type with VM_PAT set.
- * 2) while duplicating vm area, the new vma fails to copy the pgtable from
- *    old vma.
- */
 void untrack_pfn_clear(struct vm_area_struct *vma)
 {
 	vm_flags_clear(vma, VM_PAT);
diff --git a/arch/x86/mm/pat/set_memory.c b/arch/x86/mm/pat/set_memory.c
index 44f7b2ea6a07..69ceb967d73e 100644
--- a/arch/x86/mm/pat/set_memory.c
+++ b/arch/x86/mm/pat/set_memory.c
@@ -2422,7 +2422,7 @@ static int __set_pages_np(struct page *page, int numpages)
 				.pgd = NULL,
 				.numpages = numpages,
 				.mask_set = __pgprot(0),
-				.mask_clr = __pgprot(_PAGE_PRESENT | _PAGE_RW),
+				.mask_clr = __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY),
 				.flags = CPA_NO_CHECK_ALIAS };
 
 	/*
@@ -2501,7 +2501,7 @@ int __init kernel_map_pages_in_pgd(pgd_t *pgd, u64 pfn, unsigned long address,
 		.pgd = pgd,
 		.numpages = numpages,
 		.mask_set = __pgprot(0),
-		.mask_clr = __pgprot(~page_flags & (_PAGE_NX|_PAGE_RW)),
+		.mask_clr = __pgprot(~page_flags & (_PAGE_NX|_PAGE_RW|_PAGE_DIRTY)),
 		.flags = CPA_NO_CHECK_ALIAS,
 	};
 
@@ -2544,7 +2544,7 @@ int __init kernel_unmap_pages_in_pgd(pgd_t *pgd, unsigned long address,
 		.pgd = pgd,
 		.numpages = numpages,
 		.mask_set = __pgprot(0),
-		.mask_clr = __pgprot(_PAGE_PRESENT | _PAGE_RW),
+		.mask_clr = __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY),
 		.flags = CPA_NO_CHECK_ALIAS,
 	};
 
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 00ffa74d0dd0..8629d90fdcd9 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -389,9 +389,9 @@ static void cond_mitigation(struct task_struct *next)
 	prev_mm = this_cpu_read(cpu_tlbstate.last_user_mm_spec);
 
 	/*
-	 * Avoid user/user BTB poisoning by flushing the branch predictor
-	 * when switching between processes. This stops one process from
-	 * doing Spectre-v2 attacks on another.
+	 * Avoid user->user BTB/RSB poisoning by flushing them when switching
+	 * between processes. This stops one process from doing Spectre-v2
+	 * attacks on another.
 	 *
 	 * Both, the conditional and the always IBPB mode use the mm
 	 * pointer to avoid the IBPB when switching between tasks of the
@@ -624,7 +624,11 @@ void switch_mm_irqs_off(struct mm_struct *unused, struct mm_struct *next,
 
 		choose_new_asid(next, next_tlb_gen, &new_asid, &need_flush);
 
-		/* Let nmi_uaccess_okay() know that we're changing CR3. */
+		/*
+		 * Indicate that CR3 is about to change. nmi_uaccess_okay()
+		 * and others are sensitive to the window where mm_cpumask(),
+		 * CR3 and cpu_tlbstate.loaded_mm are not all in sync.
+		 */
 		this_cpu_write(cpu_tlbstate.loaded_mm, LOADED_MM_SWITCHING);
 		barrier();
 	}
@@ -895,8 +899,16 @@ done:
 
 static bool should_flush_tlb(int cpu, void *data)
 {
+	struct mm_struct *loaded_mm = per_cpu(cpu_tlbstate.loaded_mm, cpu);
 	struct flush_tlb_info *info = data;
 
+	/*
+	 * Order the 'loaded_mm' and 'is_lazy' against their
+	 * write ordering in switch_mm_irqs_off(). Ensure
+	 * 'is_lazy' is at least as new as 'loaded_mm'.
+	 */
+	smp_rmb();
+
 	/* Lazy TLB will get flushed at the next context switch. */
 	if (per_cpu(cpu_tlbstate_shared.is_lazy, cpu))
 		return false;
@@ -905,8 +917,15 @@ static bool should_flush_tlb(int cpu, void *data)
 	if (!info->mm)
 		return true;
 
+	/*
+	 * While switching, the remote CPU could have state from
+	 * either the prev or next mm. Assume the worst and flush.
+	 */
+	if (loaded_mm == LOADED_MM_SWITCHING)
+		return true;
+
 	/* The target mm is loaded, and the CPU is not lazy. */
-	if (per_cpu(cpu_tlbstate.loaded_mm, cpu) == info->mm)
+	if (loaded_mm == info->mm)
 		return true;
 
 	/* In cpumask, but not the loaded mm? Periodically remove by flushing. */
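For illustration, the microcode gate added to setup_pcid() in the init.c hunk follows a common pattern: the match table stores, per affected model, the first microcode revision that carries the fix, and the feature is only disabled while the running revision is older. The stand-alone C sketch below mirrors that comparison outside the kernel; the type and function names (cpu_fix, pcid_unsafe, invlpg_fix_table) and the model strings are hypothetical placeholders, not kernel identifiers.

/* Illustrative sketch only -- hypothetical names, not kernel code. */
#include <stdbool.h>
#include <stdio.h>
#include <string.h>

struct cpu_fix {
	const char *model;         /* affected CPU model */
	unsigned int fixed_ucode;  /* first microcode revision with the fix */
};

/* Analogue of invlpg_miss_ids[]: model -> minimum fixed microcode revision. */
static const struct cpu_fix invlpg_fix_table[] = {
	{ "model-a", 0x2e },
	{ "model-b", 0x42c },
	{ NULL, 0 }
};

/* True when PCID should stay disabled: affected model running old microcode. */
static bool pcid_unsafe(const char *model, unsigned int ucode)
{
	const struct cpu_fix *f;

	for (f = invlpg_fix_table; f->model; f++) {
		if (strcmp(f->model, model) == 0)
			return ucode < f->fixed_ucode;
	}
	return false; /* model not in the table: not affected */
}

int main(void)
{
	printf("%d\n", pcid_unsafe("model-a", 0x20)); /* 1: microcode too old */
	printf("%d\n", pcid_unsafe("model-a", 0x30)); /* 0: fix present */
	printf("%d\n", pcid_unsafe("model-c", 0x01)); /* 0: unaffected model */
	return 0;
}

In the patch itself, x86_match_cpu() plays the role of the table walk and the matched entry's driver_data carries the minimum fixed revision compared against boot_cpu_data.microcode.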