diff options
| author | Linus Torvalds <torvalds@linux-foundation.org> | 2026-02-19 07:50:32 +0300 |
|---|---|---|
| committer | Linus Torvalds <torvalds@linux-foundation.org> | 2026-02-19 07:50:32 +0300 |
| commit | eeccf287a2a517954b57cf9d733b3cf5d47afa34 (patch) | |
| tree | 46b5cd55d8da25cbc9aa96b38470506958851005 | |
| parent | 956b9cbd7f156c8672dac94a00de3c6a0939c692 (diff) | |
| parent | ac1ea219590c09572ed5992dc233bbf7bb70fef9 (diff) | |
| download | linux-eeccf287a2a517954b57cf9d733b3cf5d47afa34.tar.xz | |
Merge tag 'mm-stable-2026-02-18-19-48' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
Pull more MM updates from Andrew Morton:
- "mm/vmscan: fix demotion targets checks in reclaim/demotion" fixes a
couple of issues in the demotion code - pages were failing demotion
and were finding themselves demoted into disallowed nodes (Bing Jiao)
- "Remove XA_ZERO from error recovery of dup_mmap()" fixes a rare
maple tree race and performs a number of cleanups (Liam Howlett)
- "mm: add bitmap VMA flag helpers and convert all mmap_prepare to use
them" implements a lot of cleanups following on from the conversion
of the VMA flags into a bitmap (Lorenzo Stoakes)
- "support batch checking of references and unmapping for large folios"
implements batching to greatly improve the performance of reclaiming
clean file-backed large folios (Baolin Wang)
- "selftests/mm: add memory failure selftests" does as claimed (Miaohe
Lin)
* tag 'mm-stable-2026-02-18-19-48' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm: (36 commits)
mm/page_alloc: clear page->private in free_pages_prepare()
selftests/mm: add memory failure dirty pagecache test
selftests/mm: add memory failure clean pagecache test
selftests/mm: add memory failure anonymous page test
mm: rmap: support batched unmapping for file large folios
arm64: mm: implement the architecture-specific clear_flush_young_ptes()
arm64: mm: support batch clearing of the young flag for large folios
arm64: mm: factor out the address and ptep alignment into a new helper
mm: rmap: support batched checks of the references for large folios
tools/testing/vma: add VMA userland tests for VMA flag functions
tools/testing/vma: separate out vma_internal.h into logical headers
tools/testing/vma: separate VMA userland tests into separate files
mm: make vm_area_desc utilise vma_flags_t only
mm: update all remaining mmap_prepare users to use vma_flags_t
mm: update shmem_[kernel]_file_*() functions to use vma_flags_t
mm: update secretmem to use VMA flags on mmap_prepare
mm: update hugetlbfs to use VMA flags on mmap_prepare
mm: add basic VMA flag operation helper functions
tools: bitmap: add missing bitmap_[subset(), andnot()]
mm: add mk_vma_flags() bitmap flag macro helper
...
82 files changed, 3939 insertions, 2519 deletions
diff --git a/MAINTAINERS b/MAINTAINERS index dc82a6bd1a61..b8d8a5c41597 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -11845,6 +11845,7 @@ F: include/linux/memory-failure.h F: include/trace/events/memory-failure.h F: mm/hwpoison-inject.c F: mm/memory-failure.c +F: tools/testing/selftests/mm/memory-failure.c HYCON HY46XX TOUCHSCREEN SUPPORT M: Giulio Benetti <giulio.benetti@benettiengineering.com> diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index d94445b4f3df..a17eb8a76788 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -1648,10 +1648,10 @@ extern void contpte_clear_full_ptes(struct mm_struct *mm, unsigned long addr, extern pte_t contpte_get_and_clear_full_ptes(struct mm_struct *mm, unsigned long addr, pte_t *ptep, unsigned int nr, int full); -extern int contpte_ptep_test_and_clear_young(struct vm_area_struct *vma, - unsigned long addr, pte_t *ptep); -extern int contpte_ptep_clear_flush_young(struct vm_area_struct *vma, - unsigned long addr, pte_t *ptep); +int contpte_test_and_clear_young_ptes(struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep, unsigned int nr); +int contpte_clear_flush_young_ptes(struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep, unsigned int nr); extern void contpte_wrprotect_ptes(struct mm_struct *mm, unsigned long addr, pte_t *ptep, unsigned int nr); extern int contpte_ptep_set_access_flags(struct vm_area_struct *vma, @@ -1823,7 +1823,7 @@ static inline int ptep_test_and_clear_young(struct vm_area_struct *vma, if (likely(!pte_valid_cont(orig_pte))) return __ptep_test_and_clear_young(vma, addr, ptep); - return contpte_ptep_test_and_clear_young(vma, addr, ptep); + return contpte_test_and_clear_young_ptes(vma, addr, ptep, 1); } #define __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH @@ -1835,7 +1835,18 @@ static inline int ptep_clear_flush_young(struct vm_area_struct *vma, if (likely(!pte_valid_cont(orig_pte))) return __ptep_clear_flush_young(vma, addr, ptep); - 
return contpte_ptep_clear_flush_young(vma, addr, ptep); + return contpte_clear_flush_young_ptes(vma, addr, ptep, 1); +} + +#define clear_flush_young_ptes clear_flush_young_ptes +static inline int clear_flush_young_ptes(struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep, + unsigned int nr) +{ + if (likely(nr == 1 && !pte_cont(__ptep_get(ptep)))) + return __ptep_clear_flush_young(vma, addr, ptep); + + return contpte_clear_flush_young_ptes(vma, addr, ptep, nr); } #define wrprotect_ptes wrprotect_ptes diff --git a/arch/arm64/mm/contpte.c b/arch/arm64/mm/contpte.c index 589bcf878938..b929a455103f 100644 --- a/arch/arm64/mm/contpte.c +++ b/arch/arm64/mm/contpte.c @@ -26,6 +26,26 @@ static inline pte_t *contpte_align_down(pte_t *ptep) return PTR_ALIGN_DOWN(ptep, sizeof(*ptep) * CONT_PTES); } +static inline pte_t *contpte_align_addr_ptep(unsigned long *start, + unsigned long *end, pte_t *ptep, + unsigned int nr) +{ + /* + * Note: caller must ensure these nr PTEs are consecutive (present) + * PTEs that map consecutive pages of the same large folio within a + * single VMA and a single page table. 
+ */ + if (pte_cont(__ptep_get(ptep + nr - 1))) + *end = ALIGN(*end, CONT_PTE_SIZE); + + if (pte_cont(__ptep_get(ptep))) { + *start = ALIGN_DOWN(*start, CONT_PTE_SIZE); + ptep = contpte_align_down(ptep); + } + + return ptep; +} + static void contpte_try_unfold_partial(struct mm_struct *mm, unsigned long addr, pte_t *ptep, unsigned int nr) { @@ -488,8 +508,9 @@ pte_t contpte_get_and_clear_full_ptes(struct mm_struct *mm, } EXPORT_SYMBOL_GPL(contpte_get_and_clear_full_ptes); -int contpte_ptep_test_and_clear_young(struct vm_area_struct *vma, - unsigned long addr, pte_t *ptep) +int contpte_test_and_clear_young_ptes(struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep, + unsigned int nr) { /* * ptep_clear_flush_young() technically requires us to clear the access @@ -498,41 +519,45 @@ int contpte_ptep_test_and_clear_young(struct vm_area_struct *vma, * contig range when the range is covered by a single folio, we can get * away with clearing young for the whole contig range here, so we avoid * having to unfold. + * + * The 'nr' means consecutive (present) PTEs that map consecutive pages + * of the same large folio in a single VMA and a single page table. 
*/ + unsigned long end = addr + nr * PAGE_SIZE; int young = 0; - int i; - - ptep = contpte_align_down(ptep); - addr = ALIGN_DOWN(addr, CONT_PTE_SIZE); - for (i = 0; i < CONT_PTES; i++, ptep++, addr += PAGE_SIZE) + ptep = contpte_align_addr_ptep(&addr, &end, ptep, nr); + for (; addr != end; ptep++, addr += PAGE_SIZE) young |= __ptep_test_and_clear_young(vma, addr, ptep); return young; } -EXPORT_SYMBOL_GPL(contpte_ptep_test_and_clear_young); +EXPORT_SYMBOL_GPL(contpte_test_and_clear_young_ptes); -int contpte_ptep_clear_flush_young(struct vm_area_struct *vma, - unsigned long addr, pte_t *ptep) +int contpte_clear_flush_young_ptes(struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep, + unsigned int nr) { int young; - young = contpte_ptep_test_and_clear_young(vma, addr, ptep); + young = contpte_test_and_clear_young_ptes(vma, addr, ptep, nr); if (young) { + unsigned long end = addr + nr * PAGE_SIZE; + + contpte_align_addr_ptep(&addr, &end, ptep, nr); /* * See comment in __ptep_clear_flush_young(); same rationale for * eliding the trailing DSB applies here. 
*/ - addr = ALIGN_DOWN(addr, CONT_PTE_SIZE); - __flush_tlb_range_nosync(vma->vm_mm, addr, addr + CONT_PTE_SIZE, + __flush_tlb_range_nosync(vma->vm_mm, addr, end, PAGE_SIZE, true, 3); } return young; } -EXPORT_SYMBOL_GPL(contpte_ptep_clear_flush_young); +EXPORT_SYMBOL_GPL(contpte_clear_flush_young_ptes); void contpte_wrprotect_ptes(struct mm_struct *mm, unsigned long addr, pte_t *ptep, unsigned int nr) @@ -569,14 +594,7 @@ void contpte_clear_young_dirty_ptes(struct vm_area_struct *vma, unsigned long start = addr; unsigned long end = start + nr * PAGE_SIZE; - if (pte_cont(__ptep_get(ptep + nr - 1))) - end = ALIGN(end, CONT_PTE_SIZE); - - if (pte_cont(__ptep_get(ptep))) { - start = ALIGN_DOWN(start, CONT_PTE_SIZE); - ptep = contpte_align_down(ptep); - } - + ptep = contpte_align_addr_ptep(&start, &end, ptep, nr); __clear_young_dirty_ptes(vma, start, ptep, (end - start) / PAGE_SIZE, flags); } EXPORT_SYMBOL_GPL(contpte_clear_young_dirty_ptes); diff --git a/arch/x86/kernel/cpu/sgx/ioctl.c b/arch/x86/kernel/cpu/sgx/ioctl.c index 9322a9287dc7..0bc36957979d 100644 --- a/arch/x86/kernel/cpu/sgx/ioctl.c +++ b/arch/x86/kernel/cpu/sgx/ioctl.c @@ -83,7 +83,7 @@ static int sgx_encl_create(struct sgx_encl *encl, struct sgx_secs *secs) encl_size = secs->size + PAGE_SIZE; backing = shmem_file_setup("SGX backing", encl_size + (encl_size >> 5), - VM_NORESERVE); + mk_vma_flags(VMA_NORESERVE_BIT)); if (IS_ERR(backing)) { ret = PTR_ERR(backing); goto err_out_shrink; diff --git a/drivers/char/mem.c b/drivers/char/mem.c index 52039fae1594..cca4529431f8 100644 --- a/drivers/char/mem.c +++ b/drivers/char/mem.c @@ -306,7 +306,7 @@ static unsigned zero_mmap_capabilities(struct file *file) /* can't do an in-place private mapping if there's no MMU */ static inline int private_mapping_ok(struct vm_area_desc *desc) { - return is_nommu_shared_mapping(desc->vm_flags); + return is_nommu_shared_vma_flags(&desc->vma_flags); } #else @@ -360,7 +360,7 @@ static int mmap_mem_prepare(struct vm_area_desc 
*desc) desc->vm_ops = &mmap_mem_ops; - /* Remap-pfn-range will mark the range VM_IO. */ + /* Remap-pfn-range will mark the range with the I/O flag. */ mmap_action_remap_full(desc, desc->pgoff); /* We filter remap errors to -EAGAIN. */ desc->action.error_hook = mmap_filter_error; @@ -520,7 +520,7 @@ static int mmap_zero_prepare(struct vm_area_desc *desc) #ifndef CONFIG_MMU return -ENOSYS; #endif - if (desc->vm_flags & VM_SHARED) + if (vma_desc_test_flags(desc, VMA_SHARED_BIT)) return shmem_zero_setup_desc(desc); desc->action.success_hook = mmap_zero_private_success; diff --git a/drivers/dax/device.c b/drivers/dax/device.c index 22999a402e02..528e81240c4d 100644 --- a/drivers/dax/device.c +++ b/drivers/dax/device.c @@ -13,7 +13,7 @@ #include "dax-private.h" #include "bus.h" -static int __check_vma(struct dev_dax *dev_dax, vm_flags_t vm_flags, +static int __check_vma(struct dev_dax *dev_dax, vma_flags_t flags, unsigned long start, unsigned long end, struct file *file, const char *func) { @@ -24,7 +24,7 @@ static int __check_vma(struct dev_dax *dev_dax, vm_flags_t vm_flags, return -ENXIO; /* prevent private mappings from being established */ - if ((vm_flags & VM_MAYSHARE) != VM_MAYSHARE) { + if (!vma_flags_test(&flags, VMA_MAYSHARE_BIT)) { dev_info_ratelimited(dev, "%s: %s: fail, attempted private mapping\n", current->comm, func); @@ -53,7 +53,7 @@ static int __check_vma(struct dev_dax *dev_dax, vm_flags_t vm_flags, static int check_vma(struct dev_dax *dev_dax, struct vm_area_struct *vma, const char *func) { - return __check_vma(dev_dax, vma->vm_flags, vma->vm_start, vma->vm_end, + return __check_vma(dev_dax, vma->flags, vma->vm_start, vma->vm_end, vma->vm_file, func); } @@ -306,14 +306,14 @@ static int dax_mmap_prepare(struct vm_area_desc *desc) * fault time. 
*/ id = dax_read_lock(); - rc = __check_vma(dev_dax, desc->vm_flags, desc->start, desc->end, filp, + rc = __check_vma(dev_dax, desc->vma_flags, desc->start, desc->end, filp, __func__); dax_read_unlock(id); if (rc) return rc; desc->vm_ops = &dax_vm_ops; - desc->vm_flags |= VM_HUGEPAGE; + vma_desc_set_flags(desc, VMA_HUGEPAGE_BIT); return 0; } diff --git a/drivers/gpu/drm/drm_gem.c b/drivers/gpu/drm/drm_gem.c index ffa7852c8f6c..f7094c4aa97a 100644 --- a/drivers/gpu/drm/drm_gem.c +++ b/drivers/gpu/drm/drm_gem.c @@ -186,15 +186,16 @@ int drm_gem_object_init(struct drm_device *dev, struct drm_gem_object *obj, { struct vfsmount *huge_mnt; struct file *filp; + const vma_flags_t flags = mk_vma_flags(VMA_NORESERVE_BIT); drm_gem_private_object_init(dev, obj, size); huge_mnt = drm_gem_get_huge_mnt(dev); if (huge_mnt) filp = shmem_file_setup_with_mnt(huge_mnt, "drm mm object", - size, VM_NORESERVE); + size, flags); else - filp = shmem_file_setup("drm mm object", size, VM_NORESERVE); + filp = shmem_file_setup("drm mm object", size, flags); if (IS_ERR(filp)) return PTR_ERR(filp); diff --git a/drivers/gpu/drm/i915/gem/i915_gem_shmem.c b/drivers/gpu/drm/i915/gem/i915_gem_shmem.c index 6ad1d6f99363..95b13d172913 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_shmem.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_shmem.c @@ -499,7 +499,7 @@ static int __create_shmem(struct drm_i915_private *i915, resource_size_t size, unsigned int flags) { - unsigned long shmem_flags = VM_NORESERVE; + const vma_flags_t shmem_flags = mk_vma_flags(VMA_NORESERVE_BIT); struct vfsmount *huge_mnt; struct file *filp; diff --git a/drivers/gpu/drm/i915/gem/i915_gem_ttm.c b/drivers/gpu/drm/i915/gem/i915_gem_ttm.c index f65fe86c02b5..7b1a7d01db2b 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_ttm.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_ttm.c @@ -200,7 +200,8 @@ static int i915_ttm_tt_shmem_populate(struct ttm_device *bdev, struct address_space *mapping; gfp_t mask; - filp = shmem_file_setup("i915-shmem-tt", size, 
VM_NORESERVE); + filp = shmem_file_setup("i915-shmem-tt", size, + mk_vma_flags(VMA_NORESERVE_BIT)); if (IS_ERR(filp)) return PTR_ERR(filp); diff --git a/drivers/gpu/drm/i915/gt/shmem_utils.c b/drivers/gpu/drm/i915/gt/shmem_utils.c index 365c4b8b04f4..5f37c699a320 100644 --- a/drivers/gpu/drm/i915/gt/shmem_utils.c +++ b/drivers/gpu/drm/i915/gt/shmem_utils.c @@ -19,7 +19,8 @@ struct file *shmem_create_from_data(const char *name, void *data, size_t len) struct file *file; int err; - file = shmem_file_setup(name, PAGE_ALIGN(len), VM_NORESERVE); + file = shmem_file_setup(name, PAGE_ALIGN(len), + mk_vma_flags(VMA_NORESERVE_BIT)); if (IS_ERR(file)) return file; diff --git a/drivers/gpu/drm/ttm/tests/ttm_tt_test.c b/drivers/gpu/drm/ttm/tests/ttm_tt_test.c index 61ec6f580b62..bd5f7d0b9b62 100644 --- a/drivers/gpu/drm/ttm/tests/ttm_tt_test.c +++ b/drivers/gpu/drm/ttm/tests/ttm_tt_test.c @@ -143,7 +143,7 @@ static void ttm_tt_fini_shmem(struct kunit *test) err = ttm_tt_init(tt, bo, 0, caching, 0); KUNIT_ASSERT_EQ(test, err, 0); - shmem = shmem_file_setup("ttm swap", BO_SIZE, 0); + shmem = shmem_file_setup("ttm swap", BO_SIZE, EMPTY_VMA_FLAGS); tt->swap_storage = shmem; ttm_tt_fini(tt); diff --git a/drivers/gpu/drm/ttm/ttm_backup.c b/drivers/gpu/drm/ttm/ttm_backup.c index 32530c75f038..6bd4c123d94c 100644 --- a/drivers/gpu/drm/ttm/ttm_backup.c +++ b/drivers/gpu/drm/ttm/ttm_backup.c @@ -178,5 +178,6 @@ EXPORT_SYMBOL_GPL(ttm_backup_bytes_avail); */ struct file *ttm_backup_shmem_create(loff_t size) { - return shmem_file_setup("ttm shmem backup", size, 0); + return shmem_file_setup("ttm shmem backup", size, + EMPTY_VMA_FLAGS); } diff --git a/drivers/gpu/drm/ttm/ttm_tt.c b/drivers/gpu/drm/ttm/ttm_tt.c index af33fa020249..fbf713abd547 100644 --- a/drivers/gpu/drm/ttm/ttm_tt.c +++ b/drivers/gpu/drm/ttm/ttm_tt.c @@ -330,7 +330,7 @@ int ttm_tt_swapout(struct ttm_device *bdev, struct ttm_tt *ttm, struct page *to_page; int i, ret; - swap_storage = shmem_file_setup("ttm swap", size, 0); + 
swap_storage = shmem_file_setup("ttm swap", size, EMPTY_VMA_FLAGS); if (IS_ERR(swap_storage)) { pr_err("Failed allocating swap storage\n"); return PTR_ERR(swap_storage); @@ -394,7 +394,7 @@ static const struct vm_operations_struct aio_ring_vm_ops = { static int aio_ring_mmap_prepare(struct vm_area_desc *desc) { - desc->vm_flags |= VM_DONTEXPAND; + vma_desc_set_flags(desc, VMA_DONTEXPAND_BIT); desc->vm_ops = &aio_ring_vm_ops; return 0; } diff --git a/fs/erofs/data.c b/fs/erofs/data.c index a28084ef796b..f79ee80627d9 100644 --- a/fs/erofs/data.c +++ b/fs/erofs/data.c @@ -473,11 +473,12 @@ static int erofs_file_mmap_prepare(struct vm_area_desc *desc) if (!IS_DAX(file_inode(desc->file))) return generic_file_readonly_mmap_prepare(desc); - if ((desc->vm_flags & VM_SHARED) && (desc->vm_flags & VM_MAYWRITE)) + if (vma_desc_test_flags(desc, VMA_SHARED_BIT) && + vma_desc_test_flags(desc, VMA_MAYWRITE_BIT)) return -EINVAL; desc->vm_ops = &erofs_dax_vm_ops; - desc->vm_flags |= VM_HUGEPAGE; + vma_desc_set_flags(desc, VMA_HUGEPAGE_BIT); return 0; } #else diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 4320ebff74f3..f1dc5ce791a7 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -818,13 +818,13 @@ static int ext4_file_mmap_prepare(struct vm_area_desc *desc) * We don't support synchronous mappings for non-DAX files and * for DAX files if underneath dax_device is not synchronous. 
*/ - if (!daxdev_mapping_supported(desc->vm_flags, file_inode(file), dax_dev)) + if (!daxdev_mapping_supported(desc, file_inode(file), dax_dev)) return -EOPNOTSUPP; file_accessed(file); if (IS_DAX(file_inode(file))) { desc->vm_ops = &ext4_dax_vm_ops; - desc->vm_flags |= VM_HUGEPAGE; + vma_desc_set_flags(desc, VMA_HUGEPAGE_BIT); } else { desc->vm_ops = &ext4_file_vm_ops; } diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 3b4c152c5c73..95a5b23b4808 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -109,7 +109,7 @@ static int hugetlbfs_file_mmap_prepare(struct vm_area_desc *desc) loff_t len, vma_len; int ret; struct hstate *h = hstate_file(file); - vm_flags_t vm_flags; + vma_flags_t vma_flags; /* * vma address alignment (but not the pgoff alignment) has @@ -119,7 +119,7 @@ static int hugetlbfs_file_mmap_prepare(struct vm_area_desc *desc) * way when do_mmap unwinds (may be important on powerpc * and ia64). */ - desc->vm_flags |= VM_HUGETLB | VM_DONTEXPAND; + vma_desc_set_flags(desc, VMA_HUGETLB_BIT, VMA_DONTEXPAND_BIT); desc->vm_ops = &hugetlb_vm_ops; /* @@ -148,23 +148,23 @@ static int hugetlbfs_file_mmap_prepare(struct vm_area_desc *desc) ret = -ENOMEM; - vm_flags = desc->vm_flags; + vma_flags = desc->vma_flags; /* * for SHM_HUGETLB, the pages are reserved in the shmget() call so skip * reserving here. Note: only for SHM hugetlbfs file, the inode * flag S_PRIVATE is set. 
*/ if (inode->i_flags & S_PRIVATE) - vm_flags |= VM_NORESERVE; + vma_flags_set(&vma_flags, VMA_NORESERVE_BIT); if (hugetlb_reserve_pages(inode, desc->pgoff >> huge_page_order(h), len >> huge_page_shift(h), desc, - vm_flags) < 0) + vma_flags) < 0) goto out; ret = 0; - if ((desc->vm_flags & VM_WRITE) && inode->i_size < len) + if (vma_desc_test_flags(desc, VMA_WRITE_BIT) && inode->i_size < len) i_size_write(inode, len); out: inode_unlock(inode); @@ -1527,7 +1527,7 @@ static int get_hstate_idx(int page_size_log) * otherwise hugetlb_reserve_pages reserves one less hugepages than intended. */ struct file *hugetlb_file_setup(const char *name, size_t size, - vm_flags_t acctflag, int creat_flags, + vma_flags_t acctflag, int creat_flags, int page_size_log) { struct inode *inode; diff --git a/fs/ntfs3/file.c b/fs/ntfs3/file.c index ae8c47cac406..f53037e0ecb6 100644 --- a/fs/ntfs3/file.c +++ b/fs/ntfs3/file.c @@ -276,7 +276,7 @@ static int ntfs_file_mmap_prepare(struct vm_area_desc *desc) struct file *file = desc->file; struct inode *inode = file_inode(file); struct ntfs_inode *ni = ntfs_i(inode); - bool rw = desc->vm_flags & VM_WRITE; + const bool rw = vma_desc_test_flags(desc, VMA_WRITE_BIT); int err; /* Avoid any operation if inode is bad. 
*/ diff --git a/fs/orangefs/file.c b/fs/orangefs/file.c index afd610a3fc68..42591252e239 100644 --- a/fs/orangefs/file.c +++ b/fs/orangefs/file.c @@ -411,8 +411,8 @@ static int orangefs_file_mmap_prepare(struct vm_area_desc *desc) "orangefs_file_mmap: called on %pD\n", file); /* set the sequential readahead hint */ - desc->vm_flags |= VM_SEQ_READ; - desc->vm_flags &= ~VM_RAND_READ; + vma_desc_set_flags(desc, VMA_SEQ_READ_BIT); + vma_desc_clear_flags(desc, VMA_RAND_READ_BIT); file_accessed(file); desc->vm_ops = &orangefs_file_vm_ops; diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c index 77b8ca2757e0..0f8e838ece07 100644 --- a/fs/ramfs/file-nommu.c +++ b/fs/ramfs/file-nommu.c @@ -264,7 +264,7 @@ out: */ static int ramfs_nommu_mmap_prepare(struct vm_area_desc *desc) { - if (!is_nommu_shared_mapping(desc->vm_flags)) + if (!is_nommu_shared_vma_flags(&desc->vma_flags)) return -ENOSYS; file_accessed(desc->file); diff --git a/fs/resctrl/pseudo_lock.c b/fs/resctrl/pseudo_lock.c index 0bfc13c5b96d..e81d71abfe54 100644 --- a/fs/resctrl/pseudo_lock.c +++ b/fs/resctrl/pseudo_lock.c @@ -1044,7 +1044,7 @@ static int pseudo_lock_dev_mmap_prepare(struct vm_area_desc *desc) * Ensure changes are carried directly to the memory being mapped, * do not allow copy-on-write mapping. */ - if (!(desc->vm_flags & VM_SHARED)) { + if (!vma_desc_test_flags(desc, VMA_SHARED_BIT)) { mutex_unlock(&rdtgroup_mutex); return -EINVAL; } diff --git a/fs/romfs/mmap-nommu.c b/fs/romfs/mmap-nommu.c index 4b77c6dc4418..7c3a1a7fecee 100644 --- a/fs/romfs/mmap-nommu.c +++ b/fs/romfs/mmap-nommu.c @@ -63,7 +63,7 @@ static unsigned long romfs_get_unmapped_area(struct file *file, */ static int romfs_mmap_prepare(struct vm_area_desc *desc) { - return is_nommu_shared_mapping(desc->vm_flags) ? 0 : -ENOSYS; + return is_nommu_shared_vma_flags(&desc->vma_flags) ? 
0 : -ENOSYS; } static unsigned romfs_mmap_capabilities(struct file *file) diff --git a/fs/xfs/scrub/xfile.c b/fs/xfs/scrub/xfile.c index 2998c9b62f4b..bee0662fbdb6 100644 --- a/fs/xfs/scrub/xfile.c +++ b/fs/xfs/scrub/xfile.c @@ -61,7 +61,8 @@ xfile_create( if (!xf) return -ENOMEM; - xf->file = shmem_kernel_file_setup(description, isize, VM_NORESERVE); + xf->file = shmem_kernel_file_setup(description, isize, + mk_vma_flags(VMA_NORESERVE_BIT)); if (IS_ERR(xf->file)) { error = PTR_ERR(xf->file); goto out_xfile; diff --git a/fs/xfs/xfs_buf_mem.c b/fs/xfs/xfs_buf_mem.c index 0106da0a9f44..f1f23623e4a4 100644 --- a/fs/xfs/xfs_buf_mem.c +++ b/fs/xfs/xfs_buf_mem.c @@ -62,7 +62,7 @@ xmbuf_alloc( if (!btp) return -ENOMEM; - file = shmem_kernel_file_setup(descr, 0, 0); + file = shmem_kernel_file_setup(descr, 0, EMPTY_VMA_FLAGS); if (IS_ERR(file)) { error = PTR_ERR(file); goto out_free_btp; diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 43d088a3bceb..6246f34df9fd 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -2010,14 +2010,14 @@ xfs_file_mmap_prepare( * We don't support synchronous mappings for non-DAX files and * for DAX files if underneath dax_device is not synchronous. */ - if (!daxdev_mapping_supported(desc->vm_flags, file_inode(file), + if (!daxdev_mapping_supported(desc, file_inode(file), target->bt_daxdev)) return -EOPNOTSUPP; file_accessed(file); desc->vm_ops = &xfs_file_vm_ops; if (IS_DAX(inode)) - desc->vm_flags |= VM_HUGEPAGE; + vma_desc_set_flags(desc, VMA_HUGEPAGE_BIT); return 0; } diff --git a/fs/zonefs/file.c b/fs/zonefs/file.c index c1e5e30e90a0..8a7161fc49e5 100644 --- a/fs/zonefs/file.c +++ b/fs/zonefs/file.c @@ -333,7 +333,8 @@ static int zonefs_file_mmap_prepare(struct vm_area_desc *desc) * ordering between msync() and page cache writeback. 
*/ if (zonefs_inode_is_seq(file_inode(file)) && - (desc->vm_flags & VM_SHARED) && (desc->vm_flags & VM_MAYWRITE)) + vma_desc_test_flags(desc, VMA_SHARED_BIT) && + vma_desc_test_flags(desc, VMA_MAYWRITE_BIT)) return -EINVAL; file_accessed(file); diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h index cbd402b4f974..65d76a38974b 100644 --- a/include/linux/cpuset.h +++ b/include/linux/cpuset.h @@ -176,7 +176,7 @@ static inline void set_mems_allowed(nodemask_t nodemask) task_unlock(current); } -extern bool cpuset_node_allowed(struct cgroup *cgroup, int nid); +extern void cpuset_nodes_allowed(struct cgroup *cgroup, nodemask_t *mask); #else /* !CONFIG_CPUSETS */ static inline bool cpusets_enabled(void) { return false; } @@ -299,9 +299,9 @@ static inline bool read_mems_allowed_retry(unsigned int seq) return false; } -static inline bool cpuset_node_allowed(struct cgroup *cgroup, int nid) +static inline void cpuset_nodes_allowed(struct cgroup *cgroup, nodemask_t *mask) { - return true; + nodes_copy(*mask, node_states[N_MEMORY]); } #endif /* !CONFIG_CPUSETS */ diff --git a/include/linux/dax.h b/include/linux/dax.h index 9d624f4d9df6..bf103f317cac 100644 --- a/include/linux/dax.h +++ b/include/linux/dax.h @@ -65,11 +65,11 @@ size_t dax_recovery_write(struct dax_device *dax_dev, pgoff_t pgoff, /* * Check if given mapping is supported by the file / underlying device. 
*/ -static inline bool daxdev_mapping_supported(vm_flags_t vm_flags, +static inline bool daxdev_mapping_supported(const struct vm_area_desc *desc, const struct inode *inode, struct dax_device *dax_dev) { - if (!(vm_flags & VM_SYNC)) + if (!vma_desc_test_flags(desc, VMA_SYNC_BIT)) return true; if (!IS_DAX(inode)) return false; @@ -111,11 +111,11 @@ static inline void set_dax_nomc(struct dax_device *dax_dev) static inline void set_dax_synchronous(struct dax_device *dax_dev) { } -static inline bool daxdev_mapping_supported(vm_flags_t vm_flags, +static inline bool daxdev_mapping_supported(const struct vm_area_desc *desc, const struct inode *inode, struct dax_device *dax_dev) { - return !(vm_flags & VM_SYNC); + return !vma_desc_test_flags(desc, VMA_SYNC_BIT); } static inline size_t dax_recovery_write(struct dax_device *dax_dev, pgoff_t pgoff, void *addr, size_t bytes, struct iov_iter *i) diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index d2e48fa5f72e..65910437be1c 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -148,7 +148,7 @@ int hugetlb_mfill_atomic_pte(pte_t *dst_pte, struct folio **foliop); #endif /* CONFIG_USERFAULTFD */ long hugetlb_reserve_pages(struct inode *inode, long from, long to, - struct vm_area_desc *desc, vm_flags_t vm_flags); + struct vm_area_desc *desc, vma_flags_t vma_flags); long hugetlb_unreserve_pages(struct inode *inode, long start, long end, long freed); bool folio_isolate_hugetlb(struct folio *folio, struct list_head *list); @@ -527,7 +527,7 @@ static inline struct hugetlbfs_inode_info *HUGETLBFS_I(struct inode *inode) } extern const struct vm_operations_struct hugetlb_vm_ops; -struct file *hugetlb_file_setup(const char *name, size_t size, vm_flags_t acct, +struct file *hugetlb_file_setup(const char *name, size_t size, vma_flags_t acct, int creat_flags, int page_size_log); static inline bool is_file_hugepages(const struct file *file) @@ -543,7 +543,7 @@ static inline struct hstate *hstate_inode(struct 
inode *i) #define is_file_hugepages(file) false static inline struct file * -hugetlb_file_setup(const char *name, size_t size, vm_flags_t acctflag, +hugetlb_file_setup(const char *name, size_t size, vma_flags_t acctflag, int creat_flags, int page_size_log) { return ERR_PTR(-ENOSYS); diff --git a/include/linux/hugetlb_inline.h b/include/linux/hugetlb_inline.h index a27aa0162918..593f5d4e108b 100644 --- a/include/linux/hugetlb_inline.h +++ b/include/linux/hugetlb_inline.h @@ -11,6 +11,11 @@ static inline bool is_vm_hugetlb_flags(vm_flags_t vm_flags) return !!(vm_flags & VM_HUGETLB); } +static inline bool is_vma_hugetlb_flags(const vma_flags_t *flags) +{ + return vma_flags_test(flags, VMA_HUGETLB_BIT); +} + #else static inline bool is_vm_hugetlb_flags(vm_flags_t vm_flags) @@ -18,6 +23,11 @@ static inline bool is_vm_hugetlb_flags(vm_flags_t vm_flags) return false; } +static inline bool is_vma_hugetlb_flags(const vma_flags_t *flags) +{ + return false; +} + #endif static inline bool is_vm_hugetlb_page(struct vm_area_struct *vma) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 52bfe4157623..70b685a85bf4 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -1758,7 +1758,7 @@ static inline void count_objcg_events(struct obj_cgroup *objcg, rcu_read_unlock(); } -bool mem_cgroup_node_allowed(struct mem_cgroup *memcg, int nid); +void mem_cgroup_node_filter_allowed(struct mem_cgroup *memcg, nodemask_t *mask); void mem_cgroup_show_protected_memory(struct mem_cgroup *memcg); @@ -1829,9 +1829,9 @@ static inline ino_t page_cgroup_ino(struct page *page) return 0; } -static inline bool mem_cgroup_node_allowed(struct mem_cgroup *memcg, int nid) +static inline void mem_cgroup_node_filter_allowed(struct mem_cgroup *memcg, + nodemask_t *mask) { - return true; } static inline void mem_cgroup_show_protected_memory(struct mem_cgroup *memcg) diff --git a/include/linux/memory-tiers.h b/include/linux/memory-tiers.h index 
7a805796fcfd..96987d9d95a8 100644 --- a/include/linux/memory-tiers.h +++ b/include/linux/memory-tiers.h @@ -53,11 +53,11 @@ struct memory_dev_type *mt_find_alloc_memory_type(int adist, struct list_head *memory_types); void mt_put_memory_types(struct list_head *memory_types); #ifdef CONFIG_MIGRATION -int next_demotion_node(int node); +int next_demotion_node(int node, const nodemask_t *allowed_mask); void node_get_allowed_targets(pg_data_t *pgdat, nodemask_t *targets); bool node_is_toptier(int node); #else -static inline int next_demotion_node(int node) +static inline int next_demotion_node(int node, const nodemask_t *allowed_mask) { return NUMA_NO_NODE; } @@ -101,7 +101,7 @@ static inline void clear_node_memory_type(int node, struct memory_dev_type *memt } -static inline int next_demotion_node(int node) +static inline int next_demotion_node(int node, const nodemask_t *allowed_mask) { return NUMA_NO_NODE; } diff --git a/include/linux/mm.h b/include/linux/mm.h index dc1ad71a2a70..5be3d8a8f806 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2,6 +2,7 @@ #ifndef _LINUX_MM_H #define _LINUX_MM_H +#include <linux/args.h> #include <linux/errno.h> #include <linux/mmdebug.h> #include <linux/gfp.h> @@ -551,17 +552,18 @@ enum { /* * Physically remapped pages are special. Tell the * rest of the world about it: - * VM_IO tells people not to look at these pages + * IO tells people not to look at these pages * (accesses can have side effects). - * VM_PFNMAP tells the core MM that the base pages are just + * PFNMAP tells the core MM that the base pages are just * raw PFN mappings, and do not have a "struct page" associated * with them. - * VM_DONTEXPAND + * DONTEXPAND * Disable vma merging and expanding with mremap(). - * VM_DONTDUMP + * DONTDUMP * Omit vma from core dump, even when VM_IO turned off. 
*/ -#define VM_REMAP_FLAGS (VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP) +#define VMA_REMAP_FLAGS mk_vma_flags(VMA_IO_BIT, VMA_PFNMAP_BIT, \ + VMA_DONTEXPAND_BIT, VMA_DONTDUMP_BIT) /* This mask prevents VMA from being scanned with khugepaged */ #define VM_NO_KHUGEPAGED (VM_SPECIAL | VM_HUGETLB) @@ -945,7 +947,7 @@ static inline void vm_flags_reset_once(struct vm_area_struct *vma, * system word. */ if (NUM_VMA_FLAG_BITS > BITS_PER_LONG) { - unsigned long *bitmap = ACCESS_PRIVATE(&vma->flags, __vma_flags); + unsigned long *bitmap = vma->flags.__vma_flags; bitmap_zero(&bitmap[1], NUM_VMA_FLAG_BITS - BITS_PER_LONG); } @@ -989,8 +991,7 @@ static inline void vm_flags_mod(struct vm_area_struct *vma, __vm_flags_mod(vma, set, clear); } -static inline bool __vma_flag_atomic_valid(struct vm_area_struct *vma, - vma_flag_t bit) +static inline bool __vma_atomic_valid_flag(struct vm_area_struct *vma, vma_flag_t bit) { const vm_flags_t mask = BIT((__force int)bit); @@ -1005,13 +1006,12 @@ static inline bool __vma_flag_atomic_valid(struct vm_area_struct *vma, * Set VMA flag atomically. Requires only VMA/mmap read lock. Only specific * valid flags are allowed to do this. */ -static inline void vma_flag_set_atomic(struct vm_area_struct *vma, - vma_flag_t bit) +static inline void vma_set_atomic_flag(struct vm_area_struct *vma, vma_flag_t bit) { - unsigned long *bitmap = ACCESS_PRIVATE(&vma->flags, __vma_flags); + unsigned long *bitmap = vma->flags.__vma_flags; vma_assert_stabilised(vma); - if (__vma_flag_atomic_valid(vma, bit)) + if (__vma_atomic_valid_flag(vma, bit)) set_bit((__force int)bit, bitmap); } @@ -1022,15 +1022,211 @@ static inline void vma_flag_set_atomic(struct vm_area_struct *vma, * This is necessarily racey, so callers must ensure that serialisation is * achieved through some other means, or that races are permissible. 
*/ -static inline bool vma_flag_test_atomic(struct vm_area_struct *vma, - vma_flag_t bit) +static inline bool vma_test_atomic_flag(struct vm_area_struct *vma, vma_flag_t bit) { - if (__vma_flag_atomic_valid(vma, bit)) + if (__vma_atomic_valid_flag(vma, bit)) return test_bit((__force int)bit, &vma->vm_flags); return false; } +/* Set an individual VMA flag in flags, non-atomically. */ +static inline void vma_flag_set(vma_flags_t *flags, vma_flag_t bit) +{ + unsigned long *bitmap = flags->__vma_flags; + + __set_bit((__force int)bit, bitmap); +} + +static inline vma_flags_t __mk_vma_flags(size_t count, const vma_flag_t *bits) +{ + vma_flags_t flags; + int i; + + vma_flags_clear_all(&flags); + for (i = 0; i < count; i++) + vma_flag_set(&flags, bits[i]); + return flags; +} + +/* + * Helper macro which bitwise-or combines the specified input flags into a + * vma_flags_t bitmap value. E.g.: + * + * vma_flags_t flags = mk_vma_flags(VMA_IO_BIT, VMA_PFNMAP_BIT, + * VMA_DONTEXPAND_BIT, VMA_DONTDUMP_BIT); + * + * The compiler cleverly optimises away all of the work and this ends up being + * equivalent to aggregating the values manually. + */ +#define mk_vma_flags(...) __mk_vma_flags(COUNT_ARGS(__VA_ARGS__), \ + (const vma_flag_t []){__VA_ARGS__}) + +/* Test each of to_test flags in flags, non-atomically. */ +static __always_inline bool vma_flags_test_mask(const vma_flags_t *flags, + vma_flags_t to_test) +{ + const unsigned long *bitmap = flags->__vma_flags; + const unsigned long *bitmap_to_test = to_test.__vma_flags; + + return bitmap_intersects(bitmap_to_test, bitmap, NUM_VMA_FLAG_BITS); +} + +/* + * Test whether any specified VMA flag is set, e.g.: + * + * if (vma_flags_test(flags, VMA_READ_BIT, VMA_MAYREAD_BIT)) { ... } + */ +#define vma_flags_test(flags, ...) \ + vma_flags_test_mask(flags, mk_vma_flags(__VA_ARGS__)) + +/* Test that ALL of the to_test flags are set, non-atomically. 
*/ +static __always_inline bool vma_flags_test_all_mask(const vma_flags_t *flags, + vma_flags_t to_test) +{ + const unsigned long *bitmap = flags->__vma_flags; + const unsigned long *bitmap_to_test = to_test.__vma_flags; + + return bitmap_subset(bitmap_to_test, bitmap, NUM_VMA_FLAG_BITS); +} + +/* + * Test whether ALL specified VMA flags are set, e.g.: + * + * if (vma_flags_test_all(flags, VMA_READ_BIT, VMA_MAYREAD_BIT)) { ... } + */ +#define vma_flags_test_all(flags, ...) \ + vma_flags_test_all_mask(flags, mk_vma_flags(__VA_ARGS__)) + +/* Set each of the to_set flags in flags, non-atomically. */ +static __always_inline void vma_flags_set_mask(vma_flags_t *flags, vma_flags_t to_set) +{ + unsigned long *bitmap = flags->__vma_flags; + const unsigned long *bitmap_to_set = to_set.__vma_flags; + + bitmap_or(bitmap, bitmap, bitmap_to_set, NUM_VMA_FLAG_BITS); +} + +/* + * Set all specified VMA flags, e.g.: + * + * vma_flags_set(&flags, VMA_READ_BIT, VMA_WRITE_BIT, VMA_EXEC_BIT); + */ +#define vma_flags_set(flags, ...) \ + vma_flags_set_mask(flags, mk_vma_flags(__VA_ARGS__)) + +/* Clear all of the to-clear flags in flags, non-atomically. */ +static __always_inline void vma_flags_clear_mask(vma_flags_t *flags, vma_flags_t to_clear) +{ + unsigned long *bitmap = flags->__vma_flags; + const unsigned long *bitmap_to_clear = to_clear.__vma_flags; + + bitmap_andnot(bitmap, bitmap, bitmap_to_clear, NUM_VMA_FLAG_BITS); +} + +/* + * Clear all specified individual flags, e.g.: + * + * vma_flags_clear(&flags, VMA_READ_BIT, VMA_WRITE_BIT, VMA_EXEC_BIT); + */ +#define vma_flags_clear(flags, ...) \ + vma_flags_clear_mask(flags, mk_vma_flags(__VA_ARGS__)) + +/* + * Helper to test that ALL specified flags are set in a VMA. + * + * Note: appropriate locks must be held, this function does not acquire them for + * you. 
+ */ +static inline bool vma_test_all_flags_mask(const struct vm_area_struct *vma, + vma_flags_t flags) +{ + return vma_flags_test_all_mask(&vma->flags, flags); +} + +/* + * Helper macro for checking that ALL specified flags are set in a VMA, e.g.: + * + * if (vma_test_all_flags(vma, VMA_READ_BIT, VMA_MAYREAD_BIT) { ... } + */ +#define vma_test_all_flags(vma, ...) \ + vma_test_all_flags_mask(vma, mk_vma_flags(__VA_ARGS__)) + +/* + * Helper to set all VMA flags in a VMA. + * + * Note: appropriate locks must be held, this function does not acquire them for + * you. + */ +static inline void vma_set_flags_mask(struct vm_area_struct *vma, + vma_flags_t flags) +{ + vma_flags_set_mask(&vma->flags, flags); +} + +/* + * Helper macro for specifying VMA flags in a VMA, e.g.: + * + * vma_set_flags(vma, VMA_IO_BIT, VMA_PFNMAP_BIT, VMA_DONTEXPAND_BIT, + * VMA_DONTDUMP_BIT); + * + * Note: appropriate locks must be held, this function does not acquire them for + * you. + */ +#define vma_set_flags(vma, ...) \ + vma_set_flags_mask(vma, mk_vma_flags(__VA_ARGS__)) + +/* Helper to test all VMA flags in a VMA descriptor. */ +static inline bool vma_desc_test_flags_mask(const struct vm_area_desc *desc, + vma_flags_t flags) +{ + return vma_flags_test_mask(&desc->vma_flags, flags); +} + +/* + * Helper macro for testing VMA flags for an input pointer to a struct + * vm_area_desc object describing a proposed VMA, e.g.: + * + * if (vma_desc_test_flags(desc, VMA_IO_BIT, VMA_PFNMAP_BIT, + * VMA_DONTEXPAND_BIT, VMA_DONTDUMP_BIT)) { ... } + */ +#define vma_desc_test_flags(desc, ...) \ + vma_desc_test_flags_mask(desc, mk_vma_flags(__VA_ARGS__)) + +/* Helper to set all VMA flags in a VMA descriptor. 
*/ +static inline void vma_desc_set_flags_mask(struct vm_area_desc *desc, + vma_flags_t flags) +{ + vma_flags_set_mask(&desc->vma_flags, flags); +} + +/* + * Helper macro for specifying VMA flags for an input pointer to a struct + * vm_area_desc object describing a proposed VMA, e.g.: + * + * vma_desc_set_flags(desc, VMA_IO_BIT, VMA_PFNMAP_BIT, VMA_DONTEXPAND_BIT, + * VMA_DONTDUMP_BIT); + */ +#define vma_desc_set_flags(desc, ...) \ + vma_desc_set_flags_mask(desc, mk_vma_flags(__VA_ARGS__)) + +/* Helper to clear all VMA flags in a VMA descriptor. */ +static inline void vma_desc_clear_flags_mask(struct vm_area_desc *desc, + vma_flags_t flags) +{ + vma_flags_clear_mask(&desc->vma_flags, flags); +} + +/* + * Helper macro for clearing VMA flags for an input pointer to a struct + * vm_area_desc object describing a proposed VMA, e.g.: + * + * vma_desc_clear_flags(desc, VMA_IO_BIT, VMA_PFNMAP_BIT, VMA_DONTEXPAND_BIT, + * VMA_DONTDUMP_BIT); + */ +#define vma_desc_clear_flags(desc, ...) \ + vma_desc_clear_flags_mask(desc, mk_vma_flags(__VA_ARGS__)) + static inline void vma_set_anonymous(struct vm_area_struct *vma) { vma->vm_ops = NULL; @@ -1096,15 +1292,20 @@ static inline bool vma_is_accessible(const struct vm_area_struct *vma) return vma->vm_flags & VM_ACCESS_FLAGS; } -static inline bool is_shared_maywrite(vm_flags_t vm_flags) +static inline bool is_shared_maywrite_vm_flags(vm_flags_t vm_flags) { return (vm_flags & (VM_SHARED | VM_MAYWRITE)) == (VM_SHARED | VM_MAYWRITE); } +static inline bool is_shared_maywrite(const vma_flags_t *flags) +{ + return vma_flags_test_all(flags, VMA_SHARED_BIT, VMA_MAYWRITE_BIT); +} + static inline bool vma_is_shared_maywrite(const struct vm_area_struct *vma) { - return is_shared_maywrite(vma->vm_flags); + return is_shared_maywrite(&vma->flags); } static inline @@ -1732,6 +1933,14 @@ static inline bool is_cow_mapping(vm_flags_t flags) return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; } +static inline bool vma_desc_is_cow_mapping(struct 
vm_area_desc *desc) +{ + const vma_flags_t *flags = &desc->vma_flags; + + return vma_flags_test(flags, VMA_MAYWRITE_BIT) && + !vma_flags_test(flags, VMA_SHARED_BIT); +} + #ifndef CONFIG_MMU static inline bool is_nommu_shared_mapping(vm_flags_t flags) { @@ -1745,6 +1954,11 @@ static inline bool is_nommu_shared_mapping(vm_flags_t flags) */ return flags & (VM_MAYSHARE | VM_MAYOVERLAY); } + +static inline bool is_nommu_shared_vma_flags(const vma_flags_t *flags) +{ + return vma_flags_test(flags, VMA_MAYSHARE_BIT, VMA_MAYOVERLAY_BIT); +} #endif #if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP) @@ -2627,10 +2841,6 @@ static inline void zap_vma_pages(struct vm_area_struct *vma) zap_page_range_single(vma, vma->vm_start, vma->vm_end - vma->vm_start, NULL); } -void unmap_vmas(struct mmu_gather *tlb, struct ma_state *mas, - struct vm_area_struct *start_vma, unsigned long start, - unsigned long end, unsigned long tree_end); - struct mmu_notifier_range; void free_pgd_range(struct mmu_gather *tlb, unsigned long addr, diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 8731606d8d36..3cc8ae722886 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -844,7 +844,7 @@ struct mmap_action { /* * If specified, this hook is invoked when an error occurred when - * attempting the selection action. + * attempting the selected action. * * The hook can return an error code in order to filter the error, but * it is not valid to clear the error here. @@ -866,7 +866,9 @@ struct mmap_action { #define NUM_VMA_FLAG_BITS BITS_PER_LONG typedef struct { DECLARE_BITMAP(__vma_flags, NUM_VMA_FLAG_BITS); -} __private vma_flags_t; +} vma_flags_t; + +#define EMPTY_VMA_FLAGS ((vma_flags_t){ }) /* * Describes a VMA that is about to be mmap()'ed. Drivers may choose to @@ -885,10 +887,7 @@ struct vm_area_desc { /* Mutable fields. Populated with initial state. 
*/ pgoff_t pgoff; struct file *vm_file; - union { - vm_flags_t vm_flags; - vma_flags_t vma_flags; - }; + vma_flags_t vma_flags; pgprot_t page_prot; /* Write-only fields. */ @@ -1059,7 +1058,7 @@ struct vm_area_struct { /* Clears all bits in the VMA flags bitmap, non-atomically. */ static inline void vma_flags_clear_all(vma_flags_t *flags) { - bitmap_zero(ACCESS_PRIVATE(flags, __vma_flags), NUM_VMA_FLAG_BITS); + bitmap_zero(flags->__vma_flags, NUM_VMA_FLAG_BITS); } /* @@ -1070,7 +1069,9 @@ static inline void vma_flags_clear_all(vma_flags_t *flags) */ static inline void vma_flags_overwrite_word(vma_flags_t *flags, unsigned long value) { - *ACCESS_PRIVATE(flags, __vma_flags) = value; + unsigned long *bitmap = flags->__vma_flags; + + bitmap[0] = value; } /* @@ -1081,7 +1082,7 @@ static inline void vma_flags_overwrite_word(vma_flags_t *flags, unsigned long va */ static inline void vma_flags_overwrite_word_once(vma_flags_t *flags, unsigned long value) { - unsigned long *bitmap = ACCESS_PRIVATE(flags, __vma_flags); + unsigned long *bitmap = flags->__vma_flags; WRITE_ONCE(*bitmap, value); } @@ -1089,7 +1090,7 @@ static inline void vma_flags_overwrite_word_once(vma_flags_t *flags, unsigned lo /* Update the first system word of VMA flags setting bits, non-atomically. */ static inline void vma_flags_set_word(vma_flags_t *flags, unsigned long value) { - unsigned long *bitmap = ACCESS_PRIVATE(flags, __vma_flags); + unsigned long *bitmap = flags->__vma_flags; *bitmap |= value; } @@ -1097,7 +1098,7 @@ static inline void vma_flags_set_word(vma_flags_t *flags, unsigned long value) /* Update the first system word of VMA flags clearing bits, non-atomically. 
*/ static inline void vma_flags_clear_word(vma_flags_t *flags, unsigned long value) { - unsigned long *bitmap = ACCESS_PRIVATE(flags, __vma_flags); + unsigned long *bitmap = flags->__vma_flags; *bitmap &= ~value; } diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h index d1094c2d5fb6..07a2bbaf86e9 100644 --- a/include/linux/mmu_notifier.h +++ b/include/linux/mmu_notifier.h @@ -515,16 +515,17 @@ static inline void mmu_notifier_range_init_owner( range->owner = owner; } -#define ptep_clear_flush_young_notify(__vma, __address, __ptep) \ +#define clear_flush_young_ptes_notify(__vma, __address, __ptep, __nr) \ ({ \ int __young; \ struct vm_area_struct *___vma = __vma; \ unsigned long ___address = __address; \ - __young = ptep_clear_flush_young(___vma, ___address, __ptep); \ + unsigned int ___nr = __nr; \ + __young = clear_flush_young_ptes(___vma, ___address, __ptep, ___nr); \ __young |= mmu_notifier_clear_flush_young(___vma->vm_mm, \ ___address, \ ___address + \ - PAGE_SIZE); \ + ___nr * PAGE_SIZE); \ __young; \ }) @@ -650,7 +651,7 @@ static inline void mmu_notifier_subscriptions_destroy(struct mm_struct *mm) #define mmu_notifier_range_update_to_read_only(r) false -#define ptep_clear_flush_young_notify ptep_clear_flush_young +#define clear_flush_young_ptes_notify clear_flush_young_ptes #define pmdp_clear_flush_young_notify pmdp_clear_flush_young #define ptep_clear_young_notify ptep_test_and_clear_young #define pmdp_clear_young_notify pmdp_test_and_clear_young diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index 827dca25c0bc..a50df42a893f 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -23,25 +23,6 @@ #endif /* - * On almost all architectures and configurations, 0 can be used as the - * upper ceiling to free_pgtables(): on many architectures it has the same - * effect as using TASK_SIZE. However, there is one configuration which - * must impose a more careful limit, to avoid freeing kernel pgtables. 
- */ -#ifndef USER_PGTABLES_CEILING -#define USER_PGTABLES_CEILING 0UL -#endif - -/* - * This defines the first usable user address. Platforms - * can override its value with custom FIRST_USER_ADDRESS - * defined in their respective <asm/pgtable.h>. - */ -#ifndef FIRST_USER_ADDRESS -#define FIRST_USER_ADDRESS 0UL -#endif - -/* * This defines the generic helper for accessing PMD page * table page. Although platforms can still override this * via their respective <asm/pgtable.h>. @@ -1087,6 +1068,41 @@ static inline void wrprotect_ptes(struct mm_struct *mm, unsigned long addr, } #endif +#ifndef clear_flush_young_ptes +/** + * clear_flush_young_ptes - Mark PTEs that map consecutive pages of the same + * folio as old and flush the TLB. + * @vma: The virtual memory area the pages are mapped into. + * @addr: Address the first page is mapped at. + * @ptep: Page table pointer for the first entry. + * @nr: Number of entries to clear access bit. + * + * May be overridden by the architecture; otherwise, implemented as a simple + * loop over ptep_clear_flush_young(). + * + * Note that PTE bits in the PTE range besides the PFN can differ. For example, + * some PTEs might be write-protected. + * + * Context: The caller holds the page table lock. The PTEs map consecutive + * pages that belong to the same folio. The PTEs are all in the same PMD. + */ +static inline int clear_flush_young_ptes(struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep, unsigned int nr) +{ + int young = 0; + + for (;;) { + young |= ptep_clear_flush_young(vma, addr, ptep); + if (--nr == 0) + break; + ptep++; + addr += PAGE_SIZE; + } + + return young; +} +#endif + /* * On some architectures hardware does not set page access bit when accessing * memory page, it is responsibility of software setting this bit. 
It brings @@ -1630,6 +1646,25 @@ void arch_sync_kernel_mappings(unsigned long start, unsigned long end); #endif /* CONFIG_MMU */ /* + * On almost all architectures and configurations, 0 can be used as the + * upper ceiling to free_pgtables(): on many architectures it has the same + * effect as using TASK_SIZE. However, there is one configuration which + * must impose a more careful limit, to avoid freeing kernel pgtables. + */ +#ifndef USER_PGTABLES_CEILING +#define USER_PGTABLES_CEILING 0UL +#endif + +/* + * This defines the first usable user address. Platforms + * can override its value with custom FIRST_USER_ADDRESS + * defined in their respective <asm/pgtable.h>. + */ +#ifndef FIRST_USER_ADDRESS +#define FIRST_USER_ADDRESS 0UL +#endif + +/* * No-op macros that just return the current protection value. Defined here * because these macros can be used even if CONFIG_MMU is not defined. */ diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h index e2069b3179c4..a8273b32e041 100644 --- a/include/linux/shmem_fs.h +++ b/include/linux/shmem_fs.h @@ -102,12 +102,10 @@ static inline struct shmem_inode_info *SHMEM_I(struct inode *inode) extern const struct fs_parameter_spec shmem_fs_parameters[]; extern void shmem_init(void); extern int shmem_init_fs_context(struct fs_context *fc); -extern struct file *shmem_file_setup(const char *name, - loff_t size, unsigned long flags); -extern struct file *shmem_kernel_file_setup(const char *name, loff_t size, - unsigned long flags); +struct file *shmem_file_setup(const char *name, loff_t size, vma_flags_t flags); +struct file *shmem_kernel_file_setup(const char *name, loff_t size, vma_flags_t vma_flags); extern struct file *shmem_file_setup_with_mnt(struct vfsmount *mnt, - const char *name, loff_t size, unsigned long flags); + const char *name, loff_t size, vma_flags_t flags); int shmem_zero_setup(struct vm_area_struct *vma); int shmem_zero_setup_desc(struct vm_area_desc *desc); extern unsigned long 
shmem_get_unmapped_area(struct file *, unsigned long addr, diff --git a/ipc/shm.c b/ipc/shm.c index 3db36773dd10..e8c7d1924c50 100644 --- a/ipc/shm.c +++ b/ipc/shm.c @@ -707,9 +707,10 @@ static int newseg(struct ipc_namespace *ns, struct ipc_params *params) int error; struct shmid_kernel *shp; size_t numpages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT; + const bool has_no_reserve = shmflg & SHM_NORESERVE; + vma_flags_t acctflag = EMPTY_VMA_FLAGS; struct file *file; char name[13]; - vm_flags_t acctflag = 0; if (size < SHMMIN || size > ns->shm_ctlmax) return -EINVAL; @@ -749,8 +750,8 @@ static int newseg(struct ipc_namespace *ns, struct ipc_params *params) hugesize = ALIGN(size, huge_page_size(hs)); /* hugetlb_file_setup applies strict accounting */ - if (shmflg & SHM_NORESERVE) - acctflag = VM_NORESERVE; + if (has_no_reserve) + vma_flags_set(&acctflag, VMA_NORESERVE_BIT); file = hugetlb_file_setup(name, hugesize, acctflag, HUGETLB_SHMFS_INODE, (shmflg >> SHM_HUGE_SHIFT) & SHM_HUGE_MASK); } else { @@ -758,9 +759,8 @@ static int newseg(struct ipc_namespace *ns, struct ipc_params *params) * Do not allow no accounting for OVERCOMMIT_NEVER, even * if it's asked for. */ - if ((shmflg & SHM_NORESERVE) && - sysctl_overcommit_memory != OVERCOMMIT_NEVER) - acctflag = VM_NORESERVE; + if (has_no_reserve && sysctl_overcommit_memory != OVERCOMMIT_NEVER) + vma_flags_set(&acctflag, VMA_NORESERVE_BIT); file = shmem_kernel_file_setup(name, size, acctflag); } error = PTR_ERR(file); diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 832179236529..7607dfe516e6 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -4145,40 +4145,58 @@ bool cpuset_current_node_allowed(int node, gfp_t gfp_mask) return allowed; } -bool cpuset_node_allowed(struct cgroup *cgroup, int nid) +/** + * cpuset_nodes_allowed - return effective_mems mask from a cgroup cpuset. + * @cgroup: pointer to struct cgroup. + * @mask: pointer to struct nodemask_t to be returned. 
+ * + * Returns effective_mems mask from a cgroup cpuset if it is cgroup v2 and + * has cpuset subsys. Otherwise, returns node_states[N_MEMORY]. + * + * This function intentionally avoids taking the cpuset_mutex or callback_lock + * when accessing effective_mems. This is because the obtained effective_mems + * is stale immediately after the query anyway (e.g., effective_mems is updated + * immediately after releasing the lock but before returning). + * + * As a result, returned @mask may be empty because cs->effective_mems can be + * rebound during this call. Besides, nodes in @mask are not guaranteed to be + * online due to hotplug events. Callers should check the mask for validity on + * return based on its subsequent use. + **/ +void cpuset_nodes_allowed(struct cgroup *cgroup, nodemask_t *mask) { struct cgroup_subsys_state *css; struct cpuset *cs; - bool allowed; /* * In v1, mem_cgroup and cpuset are unlikely in the same hierarchy * and mems_allowed is likely to be empty even if we could get to it, - * so return true to avoid taking a global lock on the empty check. + * so return directly to avoid taking a global lock on the empty check. */ - if (!cpuset_v2()) - return true; + if (!cgroup || !cpuset_v2()) { + nodes_copy(*mask, node_states[N_MEMORY]); + return; + } css = cgroup_get_e_css(cgroup, &cpuset_cgrp_subsys); - if (!css) - return true; + if (!css) { + nodes_copy(*mask, node_states[N_MEMORY]); + return; + } /* - * Normally, accessing effective_mems would require the cpuset_mutex - * or callback_lock - but node_isset is atomic and the reference - * taken via cgroup_get_e_css is sufficient to protect css. - * - * Since this interface is intended for use by migration paths, we - * relax locking here to avoid taking global locks - while accepting - * there may be rare scenarios where the result may be innaccurate. + * The reference taken via cgroup_get_e_css is sufficient to + * protect css, but it does not imply safe accesses to effective_mems. 
* - * Reclaim and migration are subject to these same race conditions, and - * cannot make strong isolation guarantees, so this is acceptable. + * Normally, accessing effective_mems would require the cpuset_mutex + * or callback_lock - but the correctness of this information is stale + * immediately after the query anyway. We do not acquire the lock + * during this process to save lock contention in exchange for racing + * against mems_allowed rebinds. */ cs = container_of(css, struct cpuset, css); - allowed = node_isset(nid, cs->effective_mems); + nodes_copy(*mask, cs->effective_mems); css_put(css); - return allowed; } /** diff --git a/kernel/relay.c b/kernel/relay.c index 6ed6bc929bf9..5c665b729132 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -91,7 +91,7 @@ static int relay_mmap_prepare_buf(struct rchan_buf *buf, return -EINVAL; desc->vm_ops = &relay_file_mmap_ops; - desc->vm_flags |= VM_DONTEXPAND; + vma_desc_set_flags(desc, VMA_DONTEXPAND_BIT); desc->private_data = buf; return 0; diff --git a/mm/filemap.c b/mm/filemap.c index ebd75684cb0a..6cd7974d4ada 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -4012,7 +4012,7 @@ int generic_file_readonly_mmap(struct file *file, struct vm_area_struct *vma) int generic_file_readonly_mmap_prepare(struct vm_area_desc *desc) { - if (is_shared_maywrite(desc->vm_flags)) + if (is_shared_maywrite(&desc->vma_flags)) return -EINVAL; return generic_file_mmap_prepare(desc); } diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 90182724d4cf..6e855a32de3d 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1193,16 +1193,16 @@ static void set_vma_resv_flags(struct vm_area_struct *vma, unsigned long flags) static void set_vma_desc_resv_map(struct vm_area_desc *desc, struct resv_map *map) { - VM_WARN_ON_ONCE(!is_vm_hugetlb_flags(desc->vm_flags)); - VM_WARN_ON_ONCE(desc->vm_flags & VM_MAYSHARE); + VM_WARN_ON_ONCE(!is_vma_hugetlb_flags(&desc->vma_flags)); + VM_WARN_ON_ONCE(vma_desc_test_flags(desc, VMA_MAYSHARE_BIT)); desc->private_data = 
map; } static void set_vma_desc_resv_flags(struct vm_area_desc *desc, unsigned long flags) { - VM_WARN_ON_ONCE(!is_vm_hugetlb_flags(desc->vm_flags)); - VM_WARN_ON_ONCE(desc->vm_flags & VM_MAYSHARE); + VM_WARN_ON_ONCE(!is_vma_hugetlb_flags(&desc->vma_flags)); + VM_WARN_ON_ONCE(vma_desc_test_flags(desc, VMA_MAYSHARE_BIT)); desc->private_data = (void *)((unsigned long)desc->private_data | flags); } @@ -1216,7 +1216,7 @@ static int is_vma_resv_set(struct vm_area_struct *vma, unsigned long flag) static bool is_vma_desc_resv_set(struct vm_area_desc *desc, unsigned long flag) { - VM_WARN_ON_ONCE(!is_vm_hugetlb_flags(desc->vm_flags)); + VM_WARN_ON_ONCE(!is_vma_hugetlb_flags(&desc->vma_flags)); return ((unsigned long)desc->private_data) & flag; } @@ -6571,7 +6571,7 @@ next: long hugetlb_reserve_pages(struct inode *inode, long from, long to, struct vm_area_desc *desc, - vm_flags_t vm_flags) + vma_flags_t vma_flags) { long chg = -1, add = -1, spool_resv, gbl_resv; struct hstate *h = hstate_inode(inode); @@ -6592,7 +6592,7 @@ long hugetlb_reserve_pages(struct inode *inode, * attempt will be made for VM_NORESERVE to allocate a page * without using reserves */ - if (vm_flags & VM_NORESERVE) + if (vma_flags_test(&vma_flags, VMA_NORESERVE_BIT)) return 0; /* @@ -6601,7 +6601,7 @@ long hugetlb_reserve_pages(struct inode *inode, * to reserve the full area even if read-only as mprotect() may be * called to make the mapping read-write. 
Assume !desc is a shm mapping */ - if (!desc || desc->vm_flags & VM_MAYSHARE) { + if (!desc || vma_desc_test_flags(desc, VMA_MAYSHARE_BIT)) { /* * resv_map can not be NULL as hugetlb_reserve_pages is only * called for inodes for which resv_maps were created (see @@ -6635,7 +6635,7 @@ long hugetlb_reserve_pages(struct inode *inode, if (err < 0) goto out_err; - if (desc && !(desc->vm_flags & VM_MAYSHARE) && h_cg) { + if (desc && !vma_desc_test_flags(desc, VMA_MAYSHARE_BIT) && h_cg) { /* For private mappings, the hugetlb_cgroup uncharge info hangs * of the resv_map. */ @@ -6672,7 +6672,7 @@ long hugetlb_reserve_pages(struct inode *inode, * consumed reservations are stored in the map. Hence, nothing * else has to be done for private mappings here */ - if (!desc || desc->vm_flags & VM_MAYSHARE) { + if (!desc || vma_desc_test_flags(desc, VMA_MAYSHARE_BIT)) { add = region_add(resv_map, from, to, regions_needed, h, h_cg); if (unlikely(add < 0)) { @@ -6736,7 +6736,7 @@ out_uncharge_cgroup: hugetlb_cgroup_uncharge_cgroup_rsvd(hstate_index(h), chg * pages_per_huge_page(h), h_cg); out_err: - if (!desc || desc->vm_flags & VM_MAYSHARE) + if (!desc || vma_desc_test_flags(desc, VMA_MAYSHARE_BIT)) /* Only call region_abort if the region_chg succeeded but the * region_add failed or didn't run. 
*/ diff --git a/mm/internal.h b/mm/internal.h index 7493d2b2743c..cb0af847d7d9 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -197,6 +197,9 @@ static inline void vma_close(struct vm_area_struct *vma) } } +/* unmap_vmas is in mm/memory.c */ +void unmap_vmas(struct mmu_gather *tlb, struct unmap_desc *unmap); + #ifdef CONFIG_MMU static inline void get_anon_vma(struct anon_vma *anon_vma) @@ -509,9 +512,8 @@ bool __folio_end_writeback(struct folio *folio); void deactivate_file_folio(struct folio *folio); void folio_activate(struct folio *folio); -void free_pgtables(struct mmu_gather *tlb, struct ma_state *mas, - struct vm_area_struct *start_vma, unsigned long floor, - unsigned long ceiling, bool mm_wr_locked); +void free_pgtables(struct mmu_gather *tlb, struct unmap_desc *desc); + void pmd_install(struct mm_struct *mm, pmd_t *pmd, pgtable_t *pte); struct zap_details; @@ -1044,7 +1046,7 @@ extern long populate_vma_page_range(struct vm_area_struct *vma, unsigned long start, unsigned long end, int *locked); extern long faultin_page_range(struct mm_struct *mm, unsigned long start, unsigned long end, bool write, int *locked); -bool mlock_future_ok(const struct mm_struct *mm, vm_flags_t vm_flags, +bool mlock_future_ok(const struct mm_struct *mm, bool is_vma_locked, unsigned long bytes); /* diff --git a/mm/khugepaged.c b/mm/khugepaged.c index ea7a3ad2a2c2..eff9e3061925 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -1732,7 +1732,7 @@ static bool file_backed_vma_is_retractable(struct vm_area_struct *vma) * obtained on guard region installation after the flag is set, so this * check being performed under this lock excludes races. 
*/ - if (vma_flag_test_atomic(vma, VMA_MAYBE_GUARD_BIT)) + if (vma_test_atomic_flag(vma, VMA_MAYBE_GUARD_BIT)) return false; return true; diff --git a/mm/madvise.c b/mm/madvise.c index 1f3040688f04..8debb2d434aa 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -1140,7 +1140,7 @@ static long madvise_guard_install(struct madvise_behavior *madv_behavior) * acquire an mmap/VMA write lock to read it. All remaining readers may * or may not see the flag set, but we don't care. */ - vma_flag_set_atomic(vma, VMA_MAYBE_GUARD_BIT); + vma_set_atomic_flag(vma, VMA_MAYBE_GUARD_BIT); /* * If anonymous and we are establishing page tables the VMA ought to diff --git a/mm/memcontrol.c b/mm/memcontrol.c index b730233a481d..f2b87e02574e 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -5649,9 +5649,21 @@ subsys_initcall(mem_cgroup_swap_init); #endif /* CONFIG_SWAP */ -bool mem_cgroup_node_allowed(struct mem_cgroup *memcg, int nid) +void mem_cgroup_node_filter_allowed(struct mem_cgroup *memcg, nodemask_t *mask) { - return memcg ? cpuset_node_allowed(memcg->css.cgroup, nid) : true; + nodemask_t allowed; + + if (!memcg) + return; + + /* + * Since this interface is intended for use by migration paths, and + * reclaim and migration are subject to race conditions such as changes + * in effective_mems and hot-unplugging of nodes, inaccurate allowed + * mask is acceptable. 
+ */ + cpuset_nodes_allowed(memcg->css.cgroup, &allowed); + nodes_and(*mask, *mask, allowed); } void mem_cgroup_show_protected_memory(struct mem_cgroup *memcg) diff --git a/mm/memfd.c b/mm/memfd.c index 82a3f38aa30a..919c2a53eb96 100644 --- a/mm/memfd.c +++ b/mm/memfd.c @@ -86,7 +86,7 @@ struct folio *memfd_alloc_folio(struct file *memfd, pgoff_t idx) gfp_mask &= ~(__GFP_HIGHMEM | __GFP_MOVABLE); idx >>= huge_page_order(h); - nr_resv = hugetlb_reserve_pages(inode, idx, idx + 1, NULL, 0); + nr_resv = hugetlb_reserve_pages(inode, idx, idx + 1, NULL, EMPTY_VMA_FLAGS); if (nr_resv < 0) return ERR_PTR(nr_resv); @@ -463,12 +463,12 @@ struct file *memfd_alloc_file(const char *name, unsigned int flags) int err = 0; if (flags & MFD_HUGETLB) { - file = hugetlb_file_setup(name, 0, VM_NORESERVE, + file = hugetlb_file_setup(name, 0, mk_vma_flags(VMA_NORESERVE_BIT), HUGETLB_ANONHUGE_INODE, (flags >> MFD_HUGE_SHIFT) & MFD_HUGE_MASK); } else { - file = shmem_file_setup(name, 0, VM_NORESERVE); + file = shmem_file_setup(name, 0, mk_vma_flags(VMA_NORESERVE_BIT)); } if (IS_ERR(file)) return file; diff --git a/mm/memory-tiers.c b/mm/memory-tiers.c index 0ae8bec86346..545e34626df7 100644 --- a/mm/memory-tiers.c +++ b/mm/memory-tiers.c @@ -320,16 +320,17 @@ void node_get_allowed_targets(pg_data_t *pgdat, nodemask_t *targets) /** * next_demotion_node() - Get the next node in the demotion path * @node: The starting node to lookup the next node + * @allowed_mask: The pointer to allowed node mask * * Return: node id for next memory node in the demotion path hierarchy * from @node; NUMA_NO_NODE if @node is terminal. This does not keep * @node online or guarantee that it *continues* to be the next demotion * target. 
*/ -int next_demotion_node(int node) +int next_demotion_node(int node, const nodemask_t *allowed_mask) { struct demotion_nodes *nd; - int target; + nodemask_t mask; if (!node_demotion) return NUMA_NO_NODE; @@ -344,6 +345,10 @@ int next_demotion_node(int node) * node_demotion[] reads need to be consistent. */ rcu_read_lock(); + /* Filter out nodes that are not in allowed_mask. */ + nodes_and(mask, nd->preferred, *allowed_mask); + rcu_read_unlock(); + /* * If there are multiple target nodes, just select one * target node randomly. @@ -356,10 +361,16 @@ int next_demotion_node(int node) * caching issue, which seems more complicated. So selecting * target node randomly seems better until now. */ - target = node_random(&nd->preferred); - rcu_read_unlock(); + if (!nodes_empty(mask)) + return node_random(&mask); - return target; + /* + * Preferred nodes are not in allowed_mask. Flip bits in + * allowed_mask as used node mask. Then, use it to get the + * closest demotion target. + */ + nodes_complement(mask, *allowed_mask); + return find_next_best_node(node, &mask); } static void disable_all_demotion_targets(void) diff --git a/mm/memory.c b/mm/memory.c index b0d487229b2e..876bf73959c6 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -370,11 +370,32 @@ void free_pgd_range(struct mmu_gather *tlb, } while (pgd++, addr = next, addr != end); } -void free_pgtables(struct mmu_gather *tlb, struct ma_state *mas, - struct vm_area_struct *vma, unsigned long floor, - unsigned long ceiling, bool mm_wr_locked) +/** + * free_pgtables() - Free a range of page tables + * @tlb: The mmu gather + * @unmap: The unmap_desc + * + * Note: pg_start and pg_end are provided to indicate the absolute range of the + * page tables that should be removed. This can differ from the vma mappings on + * some archs that may have mappings that need to be removed outside the vmas. + * Note that the prev->vm_end and next->vm_start are often used. 
+ * + * The vma_end differs from the pg_end when a dup_mmap() failed and the tree has + * unrelated data to the mm_struct being torn down. + */ +void free_pgtables(struct mmu_gather *tlb, struct unmap_desc *unmap) { struct unlink_vma_file_batch vb; + struct ma_state *mas = unmap->mas; + struct vm_area_struct *vma = unmap->first; + + /* + * Note: USER_PGTABLES_CEILING may be passed as the value of pg_end and + * may be 0. Underflow is expected in this case. Otherwise the + * pagetable end is exclusive. vma_end is exclusive. The last vma + * address should never be larger than the pagetable end. + */ + WARN_ON_ONCE(unmap->vma_end - 1 > unmap->pg_end - 1); tlb_free_vmas(tlb); @@ -382,19 +403,13 @@ void free_pgtables(struct mmu_gather *tlb, struct ma_state *mas, unsigned long addr = vma->vm_start; struct vm_area_struct *next; - /* - * Note: USER_PGTABLES_CEILING may be passed as ceiling and may - * be 0. This will underflow and is okay. - */ - next = mas_find(mas, ceiling - 1); - if (unlikely(xa_is_zero(next))) - next = NULL; + next = mas_find(mas, unmap->tree_end - 1); /* * Hide vma from rmap and truncate_pagecache before freeing * pgtables */ - if (mm_wr_locked) + if (unmap->mm_wr_locked) vma_start_write(vma); unlink_anon_vmas(vma); @@ -406,18 +421,16 @@ void free_pgtables(struct mmu_gather *tlb, struct ma_state *mas, */ while (next && next->vm_start <= vma->vm_end + PMD_SIZE) { vma = next; - next = mas_find(mas, ceiling - 1); - if (unlikely(xa_is_zero(next))) - next = NULL; - if (mm_wr_locked) + next = mas_find(mas, unmap->tree_end - 1); + if (unmap->mm_wr_locked) vma_start_write(vma); unlink_anon_vmas(vma); unlink_file_vma_batch_add(&vb, vma); } unlink_file_vma_batch_final(&vb); - free_pgd_range(tlb, addr, vma->vm_end, - floor, next ? next->vm_start : ceiling); + free_pgd_range(tlb, addr, vma->vm_end, unmap->pg_start, + next ? 
next->vm_start : unmap->pg_end); vma = next; } while (vma); } @@ -2124,11 +2137,7 @@ static void unmap_single_vma(struct mmu_gather *tlb, /** * unmap_vmas - unmap a range of memory covered by a list of vma's * @tlb: address of the caller's struct mmu_gather - * @mas: the maple state - * @vma: the starting vma - * @start_addr: virtual address at which to start unmapping - * @end_addr: virtual address at which to end unmapping - * @tree_end: The maximum index to check + * @unmap: The unmap_desc * * Unmap all pages in the vma list. * @@ -2141,10 +2150,9 @@ static void unmap_single_vma(struct mmu_gather *tlb, * ensure that any thus-far unmapped pages are flushed before unmap_vmas() * drops the lock and schedules. */ -void unmap_vmas(struct mmu_gather *tlb, struct ma_state *mas, - struct vm_area_struct *vma, unsigned long start_addr, - unsigned long end_addr, unsigned long tree_end) +void unmap_vmas(struct mmu_gather *tlb, struct unmap_desc *unmap) { + struct vm_area_struct *vma; struct mmu_notifier_range range; struct zap_details details = { .zap_flags = ZAP_FLAG_DROP_MARKER | ZAP_FLAG_UNMAP, @@ -2152,17 +2160,18 @@ void unmap_vmas(struct mmu_gather *tlb, struct ma_state *mas, .even_cows = true, }; + vma = unmap->first; mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma->vm_mm, - start_addr, end_addr); + unmap->vma_start, unmap->vma_end); mmu_notifier_invalidate_range_start(&range); do { - unsigned long start = start_addr; - unsigned long end = end_addr; + unsigned long start = unmap->vma_start; + unsigned long end = unmap->vma_end; hugetlb_zap_begin(vma, &start, &end); unmap_single_vma(tlb, vma, start, end, &details); hugetlb_zap_end(vma, &details); - vma = mas_find(mas, tree_end - 1); - } while (vma && likely(!xa_is_zero(vma))); + vma = mas_find(unmap->mas, unmap->tree_end - 1); + } while (vma); mmu_notifier_invalidate_range_end(&range); } @@ -2948,7 +2957,7 @@ static inline int remap_p4d_range(struct mm_struct *mm, pgd_t *pgd, return 0; } -static int 
get_remap_pgoff(vm_flags_t vm_flags, unsigned long addr, +static int get_remap_pgoff(bool is_cow, unsigned long addr, unsigned long end, unsigned long vm_start, unsigned long vm_end, unsigned long pfn, pgoff_t *vm_pgoff_p) { @@ -2958,7 +2967,7 @@ static int get_remap_pgoff(vm_flags_t vm_flags, unsigned long addr, * un-COW'ed pages by matching them up with "vma->vm_pgoff". * See vm_normal_page() for details. */ - if (is_cow_mapping(vm_flags)) { + if (is_cow) { if (addr != vm_start || end != vm_end) return -EINVAL; *vm_pgoff_p = pfn; @@ -2979,7 +2988,7 @@ static int remap_pfn_range_internal(struct vm_area_struct *vma, unsigned long ad if (WARN_ON_ONCE(!PAGE_ALIGNED(addr))) return -EINVAL; - VM_WARN_ON_ONCE((vma->vm_flags & VM_REMAP_FLAGS) != VM_REMAP_FLAGS); + VM_WARN_ON_ONCE(!vma_test_all_flags_mask(vma, VMA_REMAP_FLAGS)); BUG_ON(addr >= end); pfn -= addr >> PAGE_SHIFT; @@ -3103,9 +3112,9 @@ void remap_pfn_range_prepare(struct vm_area_desc *desc, unsigned long pfn) * check it again on complete and will fail there if specified addr is * invalid. 
*/ - get_remap_pgoff(desc->vm_flags, desc->start, desc->end, + get_remap_pgoff(vma_desc_is_cow_mapping(desc), desc->start, desc->end, desc->start, desc->end, pfn, &desc->pgoff); - desc->vm_flags |= VM_REMAP_FLAGS; + vma_desc_set_flags_mask(desc, VMA_REMAP_FLAGS); } static int remap_pfn_range_prepare_vma(struct vm_area_struct *vma, unsigned long addr, @@ -3114,13 +3123,12 @@ static int remap_pfn_range_prepare_vma(struct vm_area_struct *vma, unsigned long unsigned long end = addr + PAGE_ALIGN(size); int err; - err = get_remap_pgoff(vma->vm_flags, addr, end, - vma->vm_start, vma->vm_end, - pfn, &vma->vm_pgoff); + err = get_remap_pgoff(is_cow_mapping(vma->vm_flags), addr, end, + vma->vm_start, vma->vm_end, pfn, &vma->vm_pgoff); if (err) return err; - vm_flags_set(vma, VM_REMAP_FLAGS); + vma_set_flags_mask(vma, VMA_REMAP_FLAGS); return 0; } @@ -7316,7 +7324,7 @@ void folio_zero_user(struct folio *folio, unsigned long addr_hint) const unsigned long base_addr = ALIGN_DOWN(addr_hint, folio_size(folio)); const long fault_idx = (addr_hint - base_addr) / PAGE_SIZE; const struct range pg = DEFINE_RANGE(0, folio_nr_pages(folio) - 1); - const int radius = FOLIO_ZERO_LOCALITY_RADIUS; + const long radius = FOLIO_ZERO_LOCALITY_RADIUS; struct range r[3]; int i; @@ -7324,20 +7332,19 @@ void folio_zero_user(struct folio *folio, unsigned long addr_hint) * Faulting page and its immediate neighbourhood. Will be cleared at the * end to keep its cachelines hot. */ - r[2] = DEFINE_RANGE(clamp_t(s64, fault_idx - radius, pg.start, pg.end), - clamp_t(s64, fault_idx + radius, pg.start, pg.end)); + r[2] = DEFINE_RANGE(fault_idx - radius < (long)pg.start ? pg.start : fault_idx - radius, + fault_idx + radius > (long)pg.end ? 
pg.end : fault_idx + radius); + /* Region to the left of the fault */ - r[1] = DEFINE_RANGE(pg.start, - clamp_t(s64, r[2].start - 1, pg.start - 1, r[2].start)); + r[1] = DEFINE_RANGE(pg.start, r[2].start - 1); /* Region to the right of the fault: always valid for the common fault_idx=0 case. */ - r[0] = DEFINE_RANGE(clamp_t(s64, r[2].end + 1, r[2].end, pg.end + 1), - pg.end); + r[0] = DEFINE_RANGE(r[2].end + 1, pg.end); for (i = 0; i < ARRAY_SIZE(r); i++) { const unsigned long addr = base_addr + r[i].start * PAGE_SIZE; - const unsigned int nr_pages = range_len(&r[i]); + const long nr_pages = (long)range_len(&r[i]); struct page *page = folio_page(folio, r[i].start); if (nr_pages > 0) diff --git a/mm/mmap.c b/mm/mmap.c index 4bdb9ffa9e25..843160946aa5 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -108,7 +108,8 @@ static int check_brk_limits(unsigned long addr, unsigned long len) if (IS_ERR_VALUE(mapped_addr)) return mapped_addr; - return mlock_future_ok(current->mm, current->mm->def_flags, len) + return mlock_future_ok(current->mm, + current->mm->def_flags & VM_LOCKED, len) ? 
0 : -EAGAIN; } @@ -225,12 +226,12 @@ static inline unsigned long round_hint_to_min(unsigned long hint) return hint; } -bool mlock_future_ok(const struct mm_struct *mm, vm_flags_t vm_flags, - unsigned long bytes) +bool mlock_future_ok(const struct mm_struct *mm, bool is_vma_locked, + unsigned long bytes) { unsigned long locked_pages, limit_pages; - if (!(vm_flags & VM_LOCKED) || capable(CAP_IPC_LOCK)) + if (!is_vma_locked || capable(CAP_IPC_LOCK)) return true; locked_pages = bytes >> PAGE_SHIFT; @@ -416,7 +417,7 @@ unsigned long do_mmap(struct file *file, unsigned long addr, if (!can_do_mlock()) return -EPERM; - if (!mlock_future_ok(mm, vm_flags, len)) + if (!mlock_future_ok(mm, vm_flags & VM_LOCKED, len)) return -EAGAIN; if (file) { @@ -594,7 +595,7 @@ unsigned long ksys_mmap_pgoff(unsigned long addr, unsigned long len, * taken when vm_ops->mmap() is called */ file = hugetlb_file_setup(HUGETLB_ANON_FILE, len, - VM_NORESERVE, + mk_vma_flags(VMA_NORESERVE_BIT), HUGETLB_ANONHUGE_INODE, (flags >> MAP_HUGE_SHIFT) & MAP_HUGE_MASK); if (IS_ERR(file)) @@ -1247,6 +1248,29 @@ limits_failed: } EXPORT_SYMBOL(vm_brk_flags); +static +unsigned long tear_down_vmas(struct mm_struct *mm, struct vma_iterator *vmi, + struct vm_area_struct *vma, unsigned long end) +{ + unsigned long nr_accounted = 0; + int count = 0; + + mmap_assert_write_locked(mm); + vma_iter_set(vmi, vma->vm_end); + do { + if (vma->vm_flags & VM_ACCOUNT) + nr_accounted += vma_pages(vma); + vma_mark_detached(vma); + remove_vma(vma); + count++; + cond_resched(); + vma = vma_next(vmi); + } while (vma && vma->vm_end <= end); + + VM_WARN_ON_ONCE(count != mm->map_count); + return nr_accounted; +} + /* Release all mmaps. 
*/ void exit_mmap(struct mm_struct *mm) { @@ -1254,7 +1278,7 @@ void exit_mmap(struct mm_struct *mm) struct vm_area_struct *vma; unsigned long nr_accounted = 0; VMA_ITERATOR(vmi, mm, 0); - int count = 0; + struct unmap_desc unmap; /* mm's last user has gone, and its about to be pulled down */ mmu_notifier_release(mm); @@ -1263,18 +1287,19 @@ void exit_mmap(struct mm_struct *mm) arch_exit_mmap(mm); vma = vma_next(&vmi); - if (!vma || unlikely(xa_is_zero(vma))) { + if (!vma) { /* Can happen if dup_mmap() received an OOM */ mmap_read_unlock(mm); mmap_write_lock(mm); goto destroy; } + unmap_all_init(&unmap, &vmi, vma); flush_cache_mm(mm); tlb_gather_mmu_fullmm(&tlb, mm); /* update_hiwater_rss(mm) here? but nobody should be looking */ /* Use ULONG_MAX here to ensure all VMAs in the mm are unmapped */ - unmap_vmas(&tlb, &vmi.mas, vma, 0, ULONG_MAX, ULONG_MAX); + unmap_vmas(&tlb, &unmap); mmap_read_unlock(mm); /* @@ -1283,10 +1308,10 @@ void exit_mmap(struct mm_struct *mm) */ mm_flags_set(MMF_OOM_SKIP, mm); mmap_write_lock(mm); + unmap.mm_wr_locked = true; mt_clear_in_rcu(&mm->mm_mt); - vma_iter_set(&vmi, vma->vm_end); - free_pgtables(&tlb, &vmi.mas, vma, FIRST_USER_ADDRESS, - USER_PGTABLES_CEILING, true); + unmap_pgtable_init(&unmap, &vmi); + free_pgtables(&tlb, &unmap); tlb_finish_mmu(&tlb); /* @@ -1294,22 +1319,11 @@ void exit_mmap(struct mm_struct *mm) * enabled, without holding any MM locks besides the unreachable * mmap_write_lock. 
*/ - vma_iter_set(&vmi, vma->vm_end); - do { - if (vma->vm_flags & VM_ACCOUNT) - nr_accounted += vma_pages(vma); - vma_mark_detached(vma); - remove_vma(vma); - count++; - cond_resched(); - vma = vma_next(&vmi); - } while (vma && likely(!xa_is_zero(vma))); + nr_accounted = tear_down_vmas(mm, &vmi, vma, ULONG_MAX); - BUG_ON(count != mm->map_count); - - trace_exit_mmap(mm); destroy: __mt_destroy(&mm->mm_mt); + trace_exit_mmap(mm); mmap_write_unlock(mm); vm_unacct_memory(nr_accounted); } @@ -1840,20 +1854,46 @@ loop_out: ksm_fork(mm, oldmm); khugepaged_fork(mm, oldmm); } else { + unsigned long end; /* - * The entire maple tree has already been duplicated. If the - * mmap duplication fails, mark the failure point with - * XA_ZERO_ENTRY. In exit_mmap(), if this marker is encountered, - * stop releasing VMAs that have not been duplicated after this - * point. + * The entire maple tree has already been duplicated, but + * replacing the vmas failed at mpnt (which could be NULL if + * all were allocated but the last vma was not fully set up). + * Use the start address of the failure point to clean up the + * partially initialized tree. */ - if (mpnt) { - mas_set_range(&vmi.mas, mpnt->vm_start, mpnt->vm_end - 1); - mas_store(&vmi.mas, XA_ZERO_ENTRY); - /* Avoid OOM iterating a broken tree */ - mm_flags_set(MMF_OOM_SKIP, mm); + if (!mm->map_count) { + /* zero vmas were written to the new tree. */ + end = 0; + } else if (mpnt) { + /* partial tree failure */ + end = mpnt->vm_start; + } else { + /* All vmas were written to the new tree */ + end = ULONG_MAX; + } + + /* Hide mm from oom killer because the memory is being freed */ + mm_flags_set(MMF_OOM_SKIP, mm); + if (end) { + vma_iter_set(&vmi, 0); + tmp = vma_next(&vmi); + UNMAP_STATE(unmap, &vmi, /* first = */ tmp, + /* vma_start = */ 0, /* vma_end = */ end, + /* prev = */ NULL, /* next = */ NULL); + + /* + * Don't iterate over vmas beyond the failure point for + * both unmap_vma() and free_pgtables(). 
+ */ + unmap.tree_end = end; + flush_cache_mm(mm); + unmap_region(&unmap); + charge = tear_down_vmas(mm, &vmi, tmp, end); + vm_unacct_memory(charge); } + __mt_destroy(&mm->mm_mt); /* * The mm_struct is going to exit, but the locks will be dropped * first. Set the mm_struct as unstable is advisable as it is diff --git a/mm/mremap.c b/mm/mremap.c index 8391ae17de64..2be876a70cc0 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -1740,7 +1740,7 @@ static int check_prep_vma(struct vma_remap_struct *vrm) if (vma->vm_flags & (VM_DONTEXPAND | VM_PFNMAP)) return -EFAULT; - if (!mlock_future_ok(mm, vma->vm_flags, vrm->delta)) + if (!mlock_future_ok(mm, vma->vm_flags & VM_LOCKED, vrm->delta)) return -EAGAIN; if (!may_expand_vm(mm, vma->vm_flags, vrm->delta >> PAGE_SHIFT)) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 167d4b710786..fcc32737f451 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1429,6 +1429,7 @@ __always_inline bool __free_pages_prepare(struct page *page, page_cpupid_reset_last(page); page->flags.f &= ~PAGE_FLAGS_CHECK_AT_PREP; + page->private = 0; reset_page_owner(page, order); page_table_check_free(page, order); pgalloc_tag_sub(page, 1 << order); diff --git a/mm/rmap.c b/mm/rmap.c index ab099405151f..0f00570d1b9e 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -913,9 +913,11 @@ static bool folio_referenced_one(struct folio *folio, struct folio_referenced_arg *pra = arg; DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address, 0); int ptes = 0, referenced = 0; + unsigned int nr; while (page_vma_mapped_walk(&pvmw)) { address = pvmw.address; + nr = 1; if (vma->vm_flags & VM_LOCKED) { ptes++; @@ -960,9 +962,21 @@ static bool folio_referenced_one(struct folio *folio, if (lru_gen_look_around(&pvmw)) referenced++; } else if (pvmw.pte) { - if (ptep_clear_flush_young_notify(vma, address, - pvmw.pte)) + if (folio_test_large(folio)) { + unsigned long end_addr = pmd_addr_end(address, vma->vm_end); + unsigned int max_nr = (end_addr - address) >> PAGE_SHIFT; + pte_t pteval = 
ptep_get(pvmw.pte); + + nr = folio_pte_batch(folio, pvmw.pte, + pteval, max_nr); + } + + ptes += nr; + if (clear_flush_young_ptes_notify(vma, address, pvmw.pte, nr)) referenced++; + /* Skip the batched PTEs */ + pvmw.pte += nr - 1; + pvmw.address += (nr - 1) * PAGE_SIZE; } else if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) { if (pmdp_clear_flush_young_notify(vma, address, pvmw.pmd)) @@ -972,7 +986,15 @@ static bool folio_referenced_one(struct folio *folio, WARN_ON_ONCE(1); } - pra->mapcount--; + pra->mapcount -= nr; + /* + * If we are sure that we batched the entire folio, + * we can just optimize and stop right here. + */ + if (ptes == pvmw.nr_pages) { + page_vma_mapped_walk_done(&pvmw); + break; + } } if (referenced) @@ -1923,12 +1945,16 @@ static inline unsigned int folio_unmap_pte_batch(struct folio *folio, end_addr = pmd_addr_end(addr, vma->vm_end); max_nr = (end_addr - addr) >> PAGE_SHIFT; - /* We only support lazyfree batching for now ... */ - if (!folio_test_anon(folio) || folio_test_swapbacked(folio)) + /* We only support lazyfree or file folios batching for now ... 
*/ + if (folio_test_anon(folio) && folio_test_swapbacked(folio)) return 1; + if (pte_unused(pte)) return 1; + if (userfaultfd_wp(vma)) + return 1; + return folio_pte_batch(folio, pvmw->pte, pte, max_nr); } @@ -2291,7 +2317,7 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma, * * See Documentation/mm/mmu_notifier.rst */ - dec_mm_counter(mm, mm_counter_file(folio)); + add_mm_counter(mm, mm_counter_file(folio), -nr_pages); } discard: if (unlikely(folio_test_hugetlb(folio))) { diff --git a/mm/secretmem.c b/mm/secretmem.c index edf111e0a1bb..11a779c812a7 100644 --- a/mm/secretmem.c +++ b/mm/secretmem.c @@ -122,13 +122,12 @@ static int secretmem_mmap_prepare(struct vm_area_desc *desc) { const unsigned long len = vma_desc_size(desc); - if ((desc->vm_flags & (VM_SHARED | VM_MAYSHARE)) == 0) + if (!vma_desc_test_flags(desc, VMA_SHARED_BIT, VMA_MAYSHARE_BIT)) return -EINVAL; - if (!mlock_future_ok(desc->mm, desc->vm_flags | VM_LOCKED, len)) + vma_desc_set_flags(desc, VMA_LOCKED_BIT, VMA_DONTDUMP_BIT); + if (!mlock_future_ok(desc->mm, /*is_vma_locked=*/ true, len)) return -EAGAIN; - - desc->vm_flags |= VM_LOCKED | VM_DONTDUMP; desc->vm_ops = &secretmem_vm_ops; return 0; diff --git a/mm/shmem.c b/mm/shmem.c index c40d786a21c6..d129f4eb5ca9 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -3062,9 +3062,9 @@ static struct offset_ctx *shmem_get_offset_ctx(struct inode *inode) } static struct inode *__shmem_get_inode(struct mnt_idmap *idmap, - struct super_block *sb, - struct inode *dir, umode_t mode, - dev_t dev, unsigned long flags) + struct super_block *sb, + struct inode *dir, umode_t mode, + dev_t dev, vma_flags_t flags) { struct inode *inode; struct shmem_inode_info *info; @@ -3092,7 +3092,8 @@ static struct inode *__shmem_get_inode(struct mnt_idmap *idmap, spin_lock_init(&info->lock); atomic_set(&info->stop_eviction, 0); info->seals = F_SEAL_SEAL; - info->flags = (flags & VM_NORESERVE) ? 
SHMEM_F_NORESERVE : 0; + info->flags = vma_flags_test(&flags, VMA_NORESERVE_BIT) + ? SHMEM_F_NORESERVE : 0; info->i_crtime = inode_get_mtime(inode); info->fsflags = (dir == NULL) ? 0 : SHMEM_I(dir)->fsflags & SHMEM_FL_INHERITED; @@ -3145,7 +3146,7 @@ static struct inode *__shmem_get_inode(struct mnt_idmap *idmap, #ifdef CONFIG_TMPFS_QUOTA static struct inode *shmem_get_inode(struct mnt_idmap *idmap, struct super_block *sb, struct inode *dir, - umode_t mode, dev_t dev, unsigned long flags) + umode_t mode, dev_t dev, vma_flags_t flags) { int err; struct inode *inode; @@ -3171,9 +3172,9 @@ errout: return ERR_PTR(err); } #else -static inline struct inode *shmem_get_inode(struct mnt_idmap *idmap, +static struct inode *shmem_get_inode(struct mnt_idmap *idmap, struct super_block *sb, struct inode *dir, - umode_t mode, dev_t dev, unsigned long flags) + umode_t mode, dev_t dev, vma_flags_t flags) { return __shmem_get_inode(idmap, sb, dir, mode, dev, flags); } @@ -3880,7 +3881,8 @@ shmem_mknod(struct mnt_idmap *idmap, struct inode *dir, if (!generic_ci_validate_strict_name(dir, &dentry->d_name)) return -EINVAL; - inode = shmem_get_inode(idmap, dir->i_sb, dir, mode, dev, VM_NORESERVE); + inode = shmem_get_inode(idmap, dir->i_sb, dir, mode, dev, + mk_vma_flags(VMA_NORESERVE_BIT)); if (IS_ERR(inode)) return PTR_ERR(inode); @@ -3915,7 +3917,8 @@ shmem_tmpfile(struct mnt_idmap *idmap, struct inode *dir, struct inode *inode; int error; - inode = shmem_get_inode(idmap, dir->i_sb, dir, mode, 0, VM_NORESERVE); + inode = shmem_get_inode(idmap, dir->i_sb, dir, mode, 0, + mk_vma_flags(VMA_NORESERVE_BIT)); if (IS_ERR(inode)) { error = PTR_ERR(inode); goto err_out; @@ -4112,7 +4115,7 @@ static int shmem_symlink(struct mnt_idmap *idmap, struct inode *dir, return -ENAMETOOLONG; inode = shmem_get_inode(idmap, dir->i_sb, dir, S_IFLNK | 0777, 0, - VM_NORESERVE); + mk_vma_flags(VMA_NORESERVE_BIT)); if (IS_ERR(inode)) return PTR_ERR(inode); @@ -5113,7 +5116,8 @@ static int 
shmem_fill_super(struct super_block *sb, struct fs_context *fc) #endif /* CONFIG_TMPFS_QUOTA */ inode = shmem_get_inode(&nop_mnt_idmap, sb, NULL, - S_IFDIR | sbinfo->mode, 0, VM_NORESERVE); + S_IFDIR | sbinfo->mode, 0, + mk_vma_flags(VMA_NORESERVE_BIT)); if (IS_ERR(inode)) { error = PTR_ERR(inode); goto failed; @@ -5814,7 +5818,7 @@ static inline void shmem_unacct_size(unsigned long flags, loff_t size) static inline struct inode *shmem_get_inode(struct mnt_idmap *idmap, struct super_block *sb, struct inode *dir, - umode_t mode, dev_t dev, unsigned long flags) + umode_t mode, dev_t dev, vma_flags_t flags) { struct inode *inode = ramfs_get_inode(sb, dir, mode, dev); return inode ? inode : ERR_PTR(-ENOSPC); @@ -5825,10 +5829,11 @@ static inline struct inode *shmem_get_inode(struct mnt_idmap *idmap, /* common code */ static struct file *__shmem_file_setup(struct vfsmount *mnt, const char *name, - loff_t size, unsigned long vm_flags, + loff_t size, vma_flags_t flags, unsigned int i_flags) { - unsigned long flags = (vm_flags & VM_NORESERVE) ? SHMEM_F_NORESERVE : 0; + const unsigned long shmem_flags = + vma_flags_test(&flags, VMA_NORESERVE_BIT) ? SHMEM_F_NORESERVE : 0; struct inode *inode; struct file *res; @@ -5841,13 +5846,13 @@ static struct file *__shmem_file_setup(struct vfsmount *mnt, const char *name, if (is_idmapped_mnt(mnt)) return ERR_PTR(-EINVAL); - if (shmem_acct_size(flags, size)) + if (shmem_acct_size(shmem_flags, size)) return ERR_PTR(-ENOMEM); inode = shmem_get_inode(&nop_mnt_idmap, mnt->mnt_sb, NULL, - S_IFREG | S_IRWXUGO, 0, vm_flags); + S_IFREG | S_IRWXUGO, 0, flags); if (IS_ERR(inode)) { - shmem_unacct_size(flags, size); + shmem_unacct_size(shmem_flags, size); return ERR_CAST(inode); } inode->i_flags |= i_flags; @@ -5870,9 +5875,10 @@ static struct file *__shmem_file_setup(struct vfsmount *mnt, const char *name, * checks are provided at the key or shm level rather than the inode. 
* @name: name for dentry (to be seen in /proc/<pid>/maps) * @size: size to be set for the file - * @flags: VM_NORESERVE suppresses pre-accounting of the entire object size + * @flags: VMA_NORESERVE_BIT suppresses pre-accounting of the entire object size */ -struct file *shmem_kernel_file_setup(const char *name, loff_t size, unsigned long flags) +struct file *shmem_kernel_file_setup(const char *name, loff_t size, + vma_flags_t flags) { return __shmem_file_setup(shm_mnt, name, size, flags, S_PRIVATE); } @@ -5882,9 +5888,9 @@ EXPORT_SYMBOL_GPL(shmem_kernel_file_setup); * shmem_file_setup - get an unlinked file living in tmpfs * @name: name for dentry (to be seen in /proc/<pid>/maps) * @size: size to be set for the file - * @flags: VM_NORESERVE suppresses pre-accounting of the entire object size + * @flags: VMA_NORESERVE_BIT suppresses pre-accounting of the entire object size */ -struct file *shmem_file_setup(const char *name, loff_t size, unsigned long flags) +struct file *shmem_file_setup(const char *name, loff_t size, vma_flags_t flags) { return __shmem_file_setup(shm_mnt, name, size, flags, 0); } @@ -5895,16 +5901,17 @@ EXPORT_SYMBOL_GPL(shmem_file_setup); * @mnt: the tmpfs mount where the file will be created * @name: name for dentry (to be seen in /proc/<pid>/maps) * @size: size to be set for the file - * @flags: VM_NORESERVE suppresses pre-accounting of the entire object size + * @flags: VMA_NORESERVE_BIT suppresses pre-accounting of the entire object size */ struct file *shmem_file_setup_with_mnt(struct vfsmount *mnt, const char *name, - loff_t size, unsigned long flags) + loff_t size, vma_flags_t flags) { return __shmem_file_setup(mnt, name, size, flags, 0); } EXPORT_SYMBOL_GPL(shmem_file_setup_with_mnt); -static struct file *__shmem_zero_setup(unsigned long start, unsigned long end, vm_flags_t vm_flags) +static struct file *__shmem_zero_setup(unsigned long start, unsigned long end, + vma_flags_t flags) { loff_t size = end - start; @@ -5914,7 +5921,7 @@ static 
struct file *__shmem_zero_setup(unsigned long start, unsigned long end, v * accessible to the user through its mapping, use S_PRIVATE flag to * bypass file security, in the same way as shmem_kernel_file_setup(). */ - return shmem_kernel_file_setup("dev/zero", size, vm_flags); + return shmem_kernel_file_setup("dev/zero", size, flags); } /** @@ -5924,7 +5931,7 @@ static struct file *__shmem_zero_setup(unsigned long start, unsigned long end, v */ int shmem_zero_setup(struct vm_area_struct *vma) { - struct file *file = __shmem_zero_setup(vma->vm_start, vma->vm_end, vma->vm_flags); + struct file *file = __shmem_zero_setup(vma->vm_start, vma->vm_end, vma->flags); if (IS_ERR(file)) return PTR_ERR(file); @@ -5945,7 +5952,7 @@ int shmem_zero_setup(struct vm_area_struct *vma) */ int shmem_zero_setup_desc(struct vm_area_desc *desc) { - struct file *file = __shmem_zero_setup(desc->start, desc->end, desc->vm_flags); + struct file *file = __shmem_zero_setup(desc->start, desc->end, desc->vma_flags); if (IS_ERR(file)) return PTR_ERR(file); diff --git a/mm/util.c b/mm/util.c index 97cae40c0209..b05ab6f97e11 100644 --- a/mm/util.c +++ b/mm/util.c @@ -1154,7 +1154,7 @@ int __compat_vma_mmap(const struct file_operations *f_op, .pgoff = vma->vm_pgoff, .vm_file = vma->vm_file, - .vm_flags = vma->vm_flags, + .vma_flags = vma->flags, .page_prot = vma->vm_page_prot, .action.type = MMAP_NOTHING, /* Default */ @@ -15,7 +15,10 @@ struct mmap_state { unsigned long end; pgoff_t pgoff; unsigned long pglen; - vm_flags_t vm_flags; + union { + vm_flags_t vm_flags; + vma_flags_t vma_flags; + }; struct file *file; pgprot_t page_prot; @@ -472,19 +475,16 @@ void remove_vma(struct vm_area_struct *vma) * * Called with the mm semaphore held. 
*/ -void unmap_region(struct ma_state *mas, struct vm_area_struct *vma, - struct vm_area_struct *prev, struct vm_area_struct *next) +void unmap_region(struct unmap_desc *unmap) { - struct mm_struct *mm = vma->vm_mm; + struct mm_struct *mm = unmap->first->vm_mm; struct mmu_gather tlb; tlb_gather_mmu(&tlb, mm); update_hiwater_rss(mm); - unmap_vmas(&tlb, mas, vma, vma->vm_start, vma->vm_end, vma->vm_end); - mas_set(mas, vma->vm_end); - free_pgtables(&tlb, mas, vma, prev ? prev->vm_end : FIRST_USER_ADDRESS, - next ? next->vm_start : USER_PGTABLES_CEILING, - /* mm_wr_locked = */ true); + unmap_vmas(&tlb, unmap); + mas_set(unmap->mas, unmap->tree_reset); + free_pgtables(&tlb, unmap); tlb_finish_mmu(&tlb); } @@ -1256,26 +1256,32 @@ int vma_shrink(struct vma_iterator *vmi, struct vm_area_struct *vma, static inline void vms_clear_ptes(struct vma_munmap_struct *vms, struct ma_state *mas_detach, bool mm_wr_locked) { - struct mmu_gather tlb; + struct unmap_desc unmap = { + .mas = mas_detach, + .first = vms->vma, + /* start and end may be different if there is no prev or next vma. */ + .pg_start = vms->unmap_start, + .pg_end = vms->unmap_end, + .vma_start = vms->start, + .vma_end = vms->end, + /* + * The tree limits and reset differ from the normal case since it's a + * side-tree + */ + .tree_reset = 1, + .tree_end = vms->vma_count, + /* + * We can free page tables without write-locking mmap_lock because VMAs + * were isolated before we downgraded mmap_lock. + */ + .mm_wr_locked = mm_wr_locked, + }; if (!vms->clear_ptes) /* Nothing to do */ return; - /* - * We can free page tables without write-locking mmap_lock because VMAs - * were isolated before we downgraded mmap_lock. - */ mas_set(mas_detach, 1); - tlb_gather_mmu(&tlb, vms->vma->vm_mm); - update_hiwater_rss(vms->vma->vm_mm); - unmap_vmas(&tlb, mas_detach, vms->vma, vms->start, vms->end, - vms->vma_count); - - mas_set(mas_detach, 1); - /* start and end may be different if there is no prev or next vma. 
*/ - free_pgtables(&tlb, mas_detach, vms->vma, vms->unmap_start, - vms->unmap_end, mm_wr_locked); - tlb_finish_mmu(&tlb); + unmap_region(&unmap); vms->clear_ptes = false; } @@ -2366,7 +2372,7 @@ static void set_desc_from_map(struct vm_area_desc *desc, desc->pgoff = map->pgoff; desc->vm_file = map->file; - desc->vm_flags = map->vm_flags; + desc->vma_flags = map->vma_flags; desc->page_prot = map->page_prot; } @@ -2461,13 +2467,14 @@ static int __mmap_new_file_vma(struct mmap_state *map, error = mmap_file(vma->vm_file, vma); if (error) { + UNMAP_STATE(unmap, vmi, vma, vma->vm_start, vma->vm_end, + map->prev, map->next); fput(vma->vm_file); vma->vm_file = NULL; vma_iter_set(vmi, vma->vm_end); /* Undo any partial mapping done by a device driver. */ - unmap_region(&vmi->mas, vma, map->prev, map->next); - + unmap_region(&unmap); return error; } @@ -2646,7 +2653,7 @@ static int call_mmap_prepare(struct mmap_state *map, map->file_doesnt_need_get = true; map->file = desc->vm_file; } - map->vm_flags = desc->vm_flags; + map->vma_flags = desc->vma_flags; map->page_prot = desc->page_prot; /* User-defined fields. */ map->vm_ops = desc->vm_ops; @@ -2819,7 +2826,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr, return -EINVAL; /* Map writable and ensure this isn't a sealed memfd. 
*/ - if (file && is_shared_maywrite(vm_flags)) { + if (file && is_shared_maywrite_vm_flags(vm_flags)) { int error = mapping_map_writable(file->f_mapping); if (error) @@ -3049,7 +3056,7 @@ static int acct_stack_growth(struct vm_area_struct *vma, return -ENOMEM; /* mlock limit tests */ - if (!mlock_future_ok(mm, vma->vm_flags, grow << PAGE_SHIFT)) + if (!mlock_future_ok(mm, vma->vm_flags & VM_LOCKED, grow << PAGE_SHIFT)) return -ENOMEM; /* Check to ensure the stack will not grow into a hugetlb-only region */ @@ -155,6 +155,72 @@ struct vma_merge_struct { }; +struct unmap_desc { + struct ma_state *mas; /* the maple state point to the first vma */ + struct vm_area_struct *first; /* The first vma */ + unsigned long pg_start; /* The first pagetable address to free (floor) */ + unsigned long pg_end; /* The last pagetable address to free (ceiling) */ + unsigned long vma_start; /* The min vma address */ + unsigned long vma_end; /* The max vma address */ + unsigned long tree_end; /* Maximum for the vma tree search */ + unsigned long tree_reset; /* Where to reset the vma tree walk */ + bool mm_wr_locked; /* If the mmap write lock is held */ +}; + +/* + * unmap_all_init() - Initialize unmap_desc to remove all vmas, point the + * pg_start and pg_end to a safe location. + */ +static inline void unmap_all_init(struct unmap_desc *unmap, + struct vma_iterator *vmi, struct vm_area_struct *vma) +{ + unmap->mas = &vmi->mas; + unmap->first = vma; + unmap->pg_start = FIRST_USER_ADDRESS; + unmap->pg_end = USER_PGTABLES_CEILING; + unmap->vma_start = 0; + unmap->vma_end = ULONG_MAX; + unmap->tree_end = ULONG_MAX; + unmap->tree_reset = vma->vm_end; + unmap->mm_wr_locked = false; +} + +/* + * unmap_pgtable_init() - Initialize unmap_desc to remove all page tables within + * the user range. + * + * ARM can have mappings outside of vmas. 
+ * See: e2cdef8c847b4 ("[PATCH] freepgt: free_pgtables from FIRST_USER_ADDRESS") + * + * ARM LPAE uses page table mappings beyond the USER_PGTABLES_CEILING + * See: CONFIG_ARM_LPAE in arch/arm/include/asm/pgtable.h + */ +static inline void unmap_pgtable_init(struct unmap_desc *unmap, + struct vma_iterator *vmi) +{ + vma_iter_set(vmi, unmap->tree_reset); + unmap->vma_start = FIRST_USER_ADDRESS; + unmap->vma_end = USER_PGTABLES_CEILING; + unmap->tree_end = USER_PGTABLES_CEILING; +} + +#define UNMAP_STATE(name, _vmi, _vma, _vma_start, _vma_end, _prev, _next) \ + struct unmap_desc name = { \ + .mas = &(_vmi)->mas, \ + .first = _vma, \ + .pg_start = _prev ? ((struct vm_area_struct *)_prev)->vm_end : \ + FIRST_USER_ADDRESS, \ + .pg_end = _next ? ((struct vm_area_struct *)_next)->vm_start : \ + USER_PGTABLES_CEILING, \ + .vma_start = _vma_start, \ + .vma_end = _vma_end, \ + .tree_end = _next ? \ + ((struct vm_area_struct *)_next)->vm_start : \ + USER_PGTABLES_CEILING, \ + .tree_reset = _vma->vm_end, \ + .mm_wr_locked = true, \ + } + static inline bool vmg_nomem(struct vma_merge_struct *vmg) { return vmg->state == VMA_MERGE_ERROR_NOMEM; @@ -243,8 +309,7 @@ static inline void set_vma_from_desc(struct vm_area_struct *vma, vma->vm_pgoff = desc->pgoff; if (desc->vm_file != vma->vm_file) vma_set_file(vma, desc->vm_file); - if (desc->vm_flags != vma->vm_flags) - vm_flags_set(vma, desc->vm_flags); + vma->flags = desc->vma_flags; vma->vm_page_prot = desc->page_prot; /* User-defined fields. 
*/ @@ -262,9 +327,7 @@ int do_vmi_munmap(struct vma_iterator *vmi, struct mm_struct *mm, bool unlock); void remove_vma(struct vm_area_struct *vma); - -void unmap_region(struct ma_state *mas, struct vm_area_struct *vma, - struct vm_area_struct *prev, struct vm_area_struct *next); +void unmap_region(struct unmap_desc *unmap); /** * vma_modify_flags() - Perform any necessary split/merge in preparation for diff --git a/mm/vma_internal.h b/mm/vma_internal.h index 2f05735ff190..2da6d224c1a8 100644 --- a/mm/vma_internal.h +++ b/mm/vma_internal.h @@ -46,6 +46,7 @@ #include <linux/swap.h> #include <linux/uprobes.h> #include <linux/userfaultfd_k.h> +#include <linux/pgtable.h> #include <asm/current.h> #include <asm/tlb.h> diff --git a/mm/vmscan.c b/mm/vmscan.c index 3fc4a4461927..44e4fcd6463c 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -343,19 +343,21 @@ static void flush_reclaim_state(struct scan_control *sc) static bool can_demote(int nid, struct scan_control *sc, struct mem_cgroup *memcg) { - int demotion_nid; + struct pglist_data *pgdat = NODE_DATA(nid); + nodemask_t allowed_mask; - if (!numa_demotion_enabled) + if (!pgdat || !numa_demotion_enabled) return false; if (sc && sc->no_demotion) return false; - demotion_nid = next_demotion_node(nid); - if (demotion_nid == NUMA_NO_NODE) + node_get_allowed_targets(pgdat, &allowed_mask); + if (nodes_empty(allowed_mask)) return false; - /* If demotion node isn't in the cgroup's mems_allowed, fall back */ - return mem_cgroup_node_allowed(memcg, demotion_nid); + /* Filter out nodes that are not in cgroup's mems_allowed. */ + mem_cgroup_node_filter_allowed(memcg, &allowed_mask); + return !nodes_empty(allowed_mask); } static inline bool can_reclaim_anon_pages(struct mem_cgroup *memcg, @@ -1017,9 +1019,10 @@ static struct folio *alloc_demote_folio(struct folio *src, * Folios which are not demoted are left on @demote_folios. 
*/ static unsigned int demote_folio_list(struct list_head *demote_folios, - struct pglist_data *pgdat) + struct pglist_data *pgdat, + struct mem_cgroup *memcg) { - int target_nid = next_demotion_node(pgdat->node_id); + int target_nid; unsigned int nr_succeeded; nodemask_t allowed_mask; @@ -1031,7 +1034,6 @@ static unsigned int demote_folio_list(struct list_head *demote_folios, */ .gfp_mask = (GFP_HIGHUSER_MOVABLE & ~__GFP_RECLAIM) | __GFP_NOMEMALLOC | GFP_NOWAIT, - .nid = target_nid, .nmask = &allowed_mask, .reason = MR_DEMOTION, }; @@ -1039,10 +1041,17 @@ static unsigned int demote_folio_list(struct list_head *demote_folios, if (list_empty(demote_folios)) return 0; + node_get_allowed_targets(pgdat, &allowed_mask); + mem_cgroup_node_filter_allowed(memcg, &allowed_mask); + if (nodes_empty(allowed_mask)) + return 0; + + target_nid = next_demotion_node(pgdat->node_id, &allowed_mask); if (target_nid == NUMA_NO_NODE) + /* No lower-tier nodes or nodes were hot-unplugged. */ return 0; - node_get_allowed_targets(pgdat, &allowed_mask); + mtc.nid = target_nid; /* Demotion ignores all cpuset and mempolicy settings */ migrate_pages(demote_folios, alloc_demote_folio, NULL, @@ -1564,7 +1573,7 @@ keep: /* 'folio_list' is always empty here */ /* Migrate folios selected for demotion */ - nr_demoted = demote_folio_list(&demote_folios, pgdat); + nr_demoted = demote_folio_list(&demote_folios, pgdat, memcg); nr_reclaimed += nr_demoted; stat->nr_demoted += nr_demoted; /* Folios that could not be demoted are still in @demote_folios */ diff --git a/security/keys/big_key.c b/security/keys/big_key.c index d46862ab90d6..268f702df380 100644 --- a/security/keys/big_key.c +++ b/security/keys/big_key.c @@ -103,7 +103,7 @@ int big_key_preparse(struct key_preparsed_payload *prep) 0, enckey); /* save aligned data to file */ - file = shmem_kernel_file_setup("", enclen, 0); + file = shmem_kernel_file_setup("", enclen, EMPTY_VMA_FLAGS); if (IS_ERR(file)) { ret = PTR_ERR(file); goto err_enckey; diff 
--git a/tools/include/linux/bitmap.h b/tools/include/linux/bitmap.h index 0d992245c600..250883090a5d 100644 --- a/tools/include/linux/bitmap.h +++ b/tools/include/linux/bitmap.h @@ -24,6 +24,10 @@ void __bitmap_set(unsigned long *map, unsigned int start, int len); void __bitmap_clear(unsigned long *map, unsigned int start, int len); bool __bitmap_intersects(const unsigned long *bitmap1, const unsigned long *bitmap2, unsigned int bits); +bool __bitmap_subset(const unsigned long *bitmap1, + const unsigned long *bitmap2, unsigned int nbits); +bool __bitmap_andnot(unsigned long *dst, const unsigned long *bitmap1, + const unsigned long *bitmap2, unsigned int nbits); #define BITMAP_FIRST_WORD_MASK(start) (~0UL << ((start) & (BITS_PER_LONG - 1))) #define BITMAP_LAST_WORD_MASK(nbits) (~0UL >> (-(nbits) & (BITS_PER_LONG - 1))) @@ -81,6 +85,15 @@ static inline void bitmap_or(unsigned long *dst, const unsigned long *src1, __bitmap_or(dst, src1, src2, nbits); } +static __always_inline +bool bitmap_andnot(unsigned long *dst, const unsigned long *src1, + const unsigned long *src2, unsigned int nbits) +{ + if (small_const_nbits(nbits)) + return (*dst = *src1 & ~(*src2) & BITMAP_LAST_WORD_MASK(nbits)) != 0; + return __bitmap_andnot(dst, src1, src2, nbits); +} + static inline unsigned long *bitmap_alloc(unsigned int nbits, gfp_t flags __maybe_unused) { return malloc(bitmap_size(nbits)); @@ -157,6 +170,15 @@ static inline bool bitmap_intersects(const unsigned long *src1, return __bitmap_intersects(src1, src2, nbits); } +static __always_inline +bool bitmap_subset(const unsigned long *src1, const unsigned long *src2, unsigned int nbits) +{ + if (small_const_nbits(nbits)) + return ! 
((*src1 & ~(*src2)) & BITMAP_LAST_WORD_MASK(nbits)); + else + return __bitmap_subset(src1, src2, nbits); +} + static inline void bitmap_set(unsigned long *map, unsigned int start, unsigned int nbits) { if (__builtin_constant_p(nbits) && nbits == 1) diff --git a/tools/lib/bitmap.c b/tools/lib/bitmap.c index 51255c69754d..aa83d22c45e3 100644 --- a/tools/lib/bitmap.c +++ b/tools/lib/bitmap.c @@ -140,3 +140,32 @@ void __bitmap_clear(unsigned long *map, unsigned int start, int len) *p &= ~mask_to_clear; } } + +bool __bitmap_andnot(unsigned long *dst, const unsigned long *bitmap1, + const unsigned long *bitmap2, unsigned int bits) +{ + unsigned int k; + unsigned int lim = bits/BITS_PER_LONG; + unsigned long result = 0; + + for (k = 0; k < lim; k++) + result |= (dst[k] = bitmap1[k] & ~bitmap2[k]); + if (bits % BITS_PER_LONG) + result |= (dst[k] = bitmap1[k] & ~bitmap2[k] & + BITMAP_LAST_WORD_MASK(bits)); + return result != 0; +} + +bool __bitmap_subset(const unsigned long *bitmap1, + const unsigned long *bitmap2, unsigned int bits) +{ + unsigned int k, lim = bits/BITS_PER_LONG; + for (k = 0; k < lim; ++k) + if (bitmap1[k] & ~bitmap2[k]) + return false; + + if (bits % BITS_PER_LONG) + if ((bitmap1[k] & ~bitmap2[k]) & BITMAP_LAST_WORD_MASK(bits)) + return false; + return true; +} diff --git a/tools/testing/selftests/mm/.gitignore b/tools/testing/selftests/mm/.gitignore index c2a8586e51a1..83ad9454dd9d 100644 --- a/tools/testing/selftests/mm/.gitignore +++ b/tools/testing/selftests/mm/.gitignore @@ -12,6 +12,7 @@ map_hugetlb map_populate thuge-gen compaction_test +memory-failure migration mlock2-tests mrelease_test diff --git a/tools/testing/selftests/mm/Makefile b/tools/testing/selftests/mm/Makefile index dca8f590c1e6..7a5de4e9bf52 100644 --- a/tools/testing/selftests/mm/Makefile +++ b/tools/testing/selftests/mm/Makefile @@ -75,6 +75,7 @@ TEST_GEN_FILES += map_populate ifneq (,$(filter $(ARCH),arm64 riscv riscv64 x86 x86_64 loongarch32 loongarch64)) TEST_GEN_FILES += 
memfd_secret endif +TEST_GEN_FILES += memory-failure TEST_GEN_FILES += migration TEST_GEN_FILES += mkdirty TEST_GEN_FILES += mlock-random-test @@ -154,6 +155,7 @@ TEST_PROGS += ksft_ksm_numa.sh TEST_PROGS += ksft_madv_guard.sh TEST_PROGS += ksft_madv_populate.sh TEST_PROGS += ksft_memfd_secret.sh +TEST_PROGS += ksft_memory_failure.sh TEST_PROGS += ksft_migration.sh TEST_PROGS += ksft_mkdirty.sh TEST_PROGS += ksft_mlock.sh diff --git a/tools/testing/selftests/mm/config b/tools/testing/selftests/mm/config index deba93379c80..1dbe2b4558ab 100644 --- a/tools/testing/selftests/mm/config +++ b/tools/testing/selftests/mm/config @@ -11,3 +11,5 @@ CONFIG_ANON_VMA_NAME=y CONFIG_FTRACE=y CONFIG_PROFILING=y CONFIG_UPROBES=y +CONFIG_MEMORY_FAILURE=y +CONFIG_HWPOISON_INJECT=m diff --git a/tools/testing/selftests/mm/ksft_memory_failure.sh b/tools/testing/selftests/mm/ksft_memory_failure.sh new file mode 100755 index 000000000000..ae1614d4d49b --- /dev/null +++ b/tools/testing/selftests/mm/ksft_memory_failure.sh @@ -0,0 +1,4 @@ +#!/bin/sh -e +# SPDX-License-Identifier: GPL-2.0 + +./run_vmtests.sh -t memory-failure diff --git a/tools/testing/selftests/mm/memory-failure.c b/tools/testing/selftests/mm/memory-failure.c new file mode 100644 index 000000000000..3d9e0b9ffb41 --- /dev/null +++ b/tools/testing/selftests/mm/memory-failure.c @@ -0,0 +1,359 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Memory-failure functional tests. 
+ * + * Author(s): Miaohe Lin <linmiaohe@huawei.com> + */ + +#include "../kselftest_harness.h" + +#include <sys/mman.h> +#include <linux/mman.h> +#include <linux/string.h> +#include <unistd.h> +#include <signal.h> +#include <setjmp.h> +#include <unistd.h> +#include <fcntl.h> +#include <sys/vfs.h> +#include <linux/magic.h> +#include <errno.h> + +#include "vm_util.h" + +enum inject_type { + MADV_HARD, + MADV_SOFT, +}; + +enum result_type { + MADV_HARD_ANON, + MADV_HARD_CLEAN_PAGECACHE, + MADV_HARD_DIRTY_PAGECACHE, + MADV_SOFT_ANON, + MADV_SOFT_CLEAN_PAGECACHE, + MADV_SOFT_DIRTY_PAGECACHE, +}; + +static jmp_buf signal_jmp_buf; +static siginfo_t siginfo; +const char *pagemap_proc = "/proc/self/pagemap"; +const char *kpageflags_proc = "/proc/kpageflags"; + +FIXTURE(memory_failure) +{ + unsigned long page_size; + unsigned long corrupted_size; + unsigned long pfn; + int pagemap_fd; + int kpageflags_fd; + bool triggered; +}; + +FIXTURE_VARIANT(memory_failure) +{ + enum inject_type type; + int (*inject)(FIXTURE_DATA(memory_failure) * self, void *vaddr); +}; + +static int madv_hard_inject(FIXTURE_DATA(memory_failure) * self, void *vaddr) +{ + return madvise(vaddr, self->page_size, MADV_HWPOISON); +} + +FIXTURE_VARIANT_ADD(memory_failure, madv_hard) +{ + .type = MADV_HARD, + .inject = madv_hard_inject, +}; + +static int madv_soft_inject(FIXTURE_DATA(memory_failure) * self, void *vaddr) +{ + return madvise(vaddr, self->page_size, MADV_SOFT_OFFLINE); +} + +FIXTURE_VARIANT_ADD(memory_failure, madv_soft) +{ + .type = MADV_SOFT, + .inject = madv_soft_inject, +}; + +static void sigbus_action(int signo, siginfo_t *si, void *args) +{ + memcpy(&siginfo, si, sizeof(siginfo_t)); + siglongjmp(signal_jmp_buf, 1); +} + +static int setup_sighandler(void) +{ + struct sigaction sa = { + .sa_sigaction = sigbus_action, + .sa_flags = SA_SIGINFO, + }; + + return sigaction(SIGBUS, &sa, NULL); +} + +FIXTURE_SETUP(memory_failure) +{ + memset(self, 0, sizeof(*self)); + + self->page_size = (unsigned 
long)sysconf(_SC_PAGESIZE); + + memset(&siginfo, 0, sizeof(siginfo)); + if (setup_sighandler()) + SKIP(return, "setup sighandler failed.\n"); + + self->pagemap_fd = open(pagemap_proc, O_RDONLY); + if (self->pagemap_fd == -1) + SKIP(return, "open %s failed.\n", pagemap_proc); + + self->kpageflags_fd = open(kpageflags_proc, O_RDONLY); + if (self->kpageflags_fd == -1) + SKIP(return, "open %s failed.\n", kpageflags_proc); +} + +static void teardown_sighandler(void) +{ + struct sigaction sa = { + .sa_handler = SIG_DFL, + .sa_flags = SA_SIGINFO, + }; + + sigaction(SIGBUS, &sa, NULL); +} + +FIXTURE_TEARDOWN(memory_failure) +{ + close(self->kpageflags_fd); + close(self->pagemap_fd); + teardown_sighandler(); +} + +static void prepare(struct __test_metadata *_metadata, FIXTURE_DATA(memory_failure) * self, + void *vaddr) +{ + self->pfn = pagemap_get_pfn(self->pagemap_fd, vaddr); + ASSERT_NE(self->pfn, -1UL); + + ASSERT_EQ(get_hardware_corrupted_size(&self->corrupted_size), 0); +} + +static bool check_memory(void *vaddr, unsigned long size) +{ + char buf[64]; + + memset(buf, 0xce, sizeof(buf)); + while (size >= sizeof(buf)) { + if (memcmp(vaddr, buf, sizeof(buf))) + return false; + size -= sizeof(buf); + vaddr += sizeof(buf); + } + + return true; +} + +static void check(struct __test_metadata *_metadata, FIXTURE_DATA(memory_failure) * self, + void *vaddr, enum result_type type, int setjmp) +{ + unsigned long size; + uint64_t pfn_flags; + + switch (type) { + case MADV_SOFT_ANON: + case MADV_HARD_CLEAN_PAGECACHE: + case MADV_SOFT_CLEAN_PAGECACHE: + case MADV_SOFT_DIRTY_PAGECACHE: + /* It is not expected to receive a SIGBUS signal. */ + ASSERT_EQ(setjmp, 0); + + /* The page content should remain unchanged. */ + ASSERT_TRUE(check_memory(vaddr, self->page_size)); + + /* The backing pfn of addr should have changed. 
*/ + ASSERT_NE(pagemap_get_pfn(self->pagemap_fd, vaddr), self->pfn); + break; + case MADV_HARD_ANON: + case MADV_HARD_DIRTY_PAGECACHE: + /* The SIGBUS signal should have been received. */ + ASSERT_EQ(setjmp, 1); + + /* Check if siginfo contains correct SIGBUS context. */ + ASSERT_EQ(siginfo.si_signo, SIGBUS); + ASSERT_EQ(siginfo.si_code, BUS_MCEERR_AR); + ASSERT_EQ(1UL << siginfo.si_addr_lsb, self->page_size); + ASSERT_EQ(siginfo.si_addr, vaddr); + + /* XXX Check backing pte is hwpoison entry when supported. */ + ASSERT_TRUE(pagemap_is_swapped(self->pagemap_fd, vaddr)); + break; + default: + SKIP(return, "unexpected inject type %d.\n", type); + } + + /* Check if the value of HardwareCorrupted has increased. */ + ASSERT_EQ(get_hardware_corrupted_size(&size), 0); + ASSERT_EQ(size, self->corrupted_size + self->page_size / 1024); + + /* Check if HWPoison flag is set. */ + ASSERT_EQ(pageflags_get(self->pfn, self->kpageflags_fd, &pfn_flags), 0); + ASSERT_EQ(pfn_flags & KPF_HWPOISON, KPF_HWPOISON); +} + +static void cleanup(struct __test_metadata *_metadata, FIXTURE_DATA(memory_failure) * self, + void *vaddr) +{ + unsigned long size; + uint64_t pfn_flags; + + ASSERT_EQ(unpoison_memory(self->pfn), 0); + + /* Check if HWPoison flag is cleared. */ + ASSERT_EQ(pageflags_get(self->pfn, self->kpageflags_fd, &pfn_flags), 0); + ASSERT_NE(pfn_flags & KPF_HWPOISON, KPF_HWPOISON); + + /* Check if the value of HardwareCorrupted has decreased. 
*/ + ASSERT_EQ(get_hardware_corrupted_size(&size), 0); + ASSERT_EQ(size, self->corrupted_size); +} + +TEST_F(memory_failure, anon) +{ + char *addr; + int ret; + + addr = mmap(0, self->page_size, PROT_READ | PROT_WRITE, + MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); + if (addr == MAP_FAILED) + SKIP(return, "mmap failed, not enough memory.\n"); + memset(addr, 0xce, self->page_size); + + prepare(_metadata, self, addr); + + ret = sigsetjmp(signal_jmp_buf, 1); + if (!self->triggered) { + self->triggered = true; + ASSERT_EQ(variant->inject(self, addr), 0); + FORCE_READ(*addr); + } + + if (variant->type == MADV_HARD) + check(_metadata, self, addr, MADV_HARD_ANON, ret); + else + check(_metadata, self, addr, MADV_SOFT_ANON, ret); + + cleanup(_metadata, self, addr); + + ASSERT_EQ(munmap(addr, self->page_size), 0); +} + +static int prepare_file(const char *fname, unsigned long size) +{ + int fd; + + fd = open(fname, O_RDWR | O_CREAT, 0664); + if (fd >= 0) { + unlink(fname); + ftruncate(fd, size); + } + return fd; +} + +/* Borrowed from mm/gup_longterm.c. */ +static int get_fs_type(int fd) +{ + struct statfs fs; + int ret; + + do { + ret = fstatfs(fd, &fs); + } while (ret && errno == EINTR); + + return ret ? 
0 : (int)fs.f_type; +} + +TEST_F(memory_failure, clean_pagecache) +{ + int fd; + char *addr; + int ret; + int fs_type; + + fd = prepare_file("./clean-page-cache-test-file", self->page_size); + if (fd < 0) + SKIP(return, "failed to open test file.\n"); + fs_type = get_fs_type(fd); + if (!fs_type || fs_type == TMPFS_MAGIC) + SKIP(return, "unsupported filesystem :%x\n", fs_type); + + addr = mmap(0, self->page_size, PROT_READ | PROT_WRITE, + MAP_SHARED, fd, 0); + if (addr == MAP_FAILED) + SKIP(return, "mmap failed, not enough memory.\n"); + memset(addr, 0xce, self->page_size); + fsync(fd); + + prepare(_metadata, self, addr); + + ret = sigsetjmp(signal_jmp_buf, 1); + if (!self->triggered) { + self->triggered = true; + ASSERT_EQ(variant->inject(self, addr), 0); + FORCE_READ(*addr); + } + + if (variant->type == MADV_HARD) + check(_metadata, self, addr, MADV_HARD_CLEAN_PAGECACHE, ret); + else + check(_metadata, self, addr, MADV_SOFT_CLEAN_PAGECACHE, ret); + + cleanup(_metadata, self, addr); + + ASSERT_EQ(munmap(addr, self->page_size), 0); + + ASSERT_EQ(close(fd), 0); +} + +TEST_F(memory_failure, dirty_pagecache) +{ + int fd; + char *addr; + int ret; + int fs_type; + + fd = prepare_file("./dirty-page-cache-test-file", self->page_size); + if (fd < 0) + SKIP(return, "failed to open test file.\n"); + fs_type = get_fs_type(fd); + if (!fs_type || fs_type == TMPFS_MAGIC) + SKIP(return, "unsupported filesystem :%x\n", fs_type); + + addr = mmap(0, self->page_size, PROT_READ | PROT_WRITE, + MAP_SHARED, fd, 0); + if (addr == MAP_FAILED) + SKIP(return, "mmap failed, not enough memory.\n"); + memset(addr, 0xce, self->page_size); + + prepare(_metadata, self, addr); + + ret = sigsetjmp(signal_jmp_buf, 1); + if (!self->triggered) { + self->triggered = true; + ASSERT_EQ(variant->inject(self, addr), 0); + FORCE_READ(*addr); + } + + if (variant->type == MADV_HARD) + check(_metadata, self, addr, MADV_HARD_DIRTY_PAGECACHE, ret); + else + check(_metadata, self, addr, MADV_SOFT_DIRTY_PAGECACHE, 
ret); + + cleanup(_metadata, self, addr); + + ASSERT_EQ(munmap(addr, self->page_size), 0); + + ASSERT_EQ(close(fd), 0); +} + +TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/mm/run_vmtests.sh b/tools/testing/selftests/mm/run_vmtests.sh index 29be9038bfb0..afdcfd0d7cef 100755 --- a/tools/testing/selftests/mm/run_vmtests.sh +++ b/tools/testing/selftests/mm/run_vmtests.sh @@ -91,6 +91,8 @@ separated by spaces: test VMA merge cases behave as expected - rmap test rmap behaves as expected +- memory-failure + test memory-failure behaves as expected example: ./run_vmtests.sh -t "hmm mmap ksm" EOF @@ -527,6 +529,25 @@ CATEGORY="page_frag" run_test ./test_page_frag.sh nonaligned CATEGORY="rmap" run_test ./rmap +# Try to load hwpoison_inject if not present. +HWPOISON_DIR=/sys/kernel/debug/hwpoison/ +if [ ! -d "$HWPOISON_DIR" ]; then + if ! modprobe -q -R hwpoison_inject; then + echo "Module hwpoison_inject not found, skipping..." + else + modprobe hwpoison_inject > /dev/null 2>&1 + LOADED_MOD=1 + fi +fi + +if [ -d "$HWPOISON_DIR" ]; then + CATEGORY="memory-failure" run_test ./memory-failure +fi + +if [ -n "${LOADED_MOD}" ]; then + modprobe -r hwpoison_inject > /dev/null 2>&1 +fi + if [ "${HAVE_HUGEPAGES}" = 1 ]; then echo "$orig_nr_hugepgs" > /proc/sys/vm/nr_hugepages fi diff --git a/tools/testing/selftests/mm/vm_util.c b/tools/testing/selftests/mm/vm_util.c index d954bf91afd5..a6d4ff7dfdc0 100644 --- a/tools/testing/selftests/mm/vm_util.c +++ b/tools/testing/selftests/mm/vm_util.c @@ -723,3 +723,44 @@ int ksm_stop(void) close(ksm_fd); return ret == 1 ? 
0 : -errno; } + +int get_hardware_corrupted_size(unsigned long *val) +{ + unsigned long size; + char *line = NULL; + size_t linelen = 0; + FILE *f = fopen("/proc/meminfo", "r"); + int ret = -1; + + if (!f) + return ret; + + while (getline(&line, &linelen, f) > 0) { + if (sscanf(line, "HardwareCorrupted: %12lu kB", &size) == 1) { + *val = size; + ret = 0; + break; + } + } + + free(line); + fclose(f); + return ret; +} + +int unpoison_memory(unsigned long pfn) +{ + int unpoison_fd, len; + char buf[32]; + ssize_t ret; + + unpoison_fd = open("/sys/kernel/debug/hwpoison/unpoison-pfn", O_WRONLY); + if (unpoison_fd < 0) + return -errno; + + len = sprintf(buf, "0x%lx\n", pfn); + ret = write(unpoison_fd, buf, len); + close(unpoison_fd); + + return ret > 0 ? 0 : -errno; +} diff --git a/tools/testing/selftests/mm/vm_util.h b/tools/testing/selftests/mm/vm_util.h index 522f7f9050f5..e9c4e24769c1 100644 --- a/tools/testing/selftests/mm/vm_util.h +++ b/tools/testing/selftests/mm/vm_util.h @@ -20,6 +20,7 @@ #define KPF_COMPOUND_HEAD BIT_ULL(15) #define KPF_COMPOUND_TAIL BIT_ULL(16) +#define KPF_HWPOISON BIT_ULL(19) #define KPF_THP BIT_ULL(22) /* * Ignore the checkpatch warning, we must read from x but don't want to do @@ -154,6 +155,8 @@ long ksm_get_full_scans(void); int ksm_use_zero_pages(void); int ksm_start(void); int ksm_stop(void); +int get_hardware_corrupted_size(unsigned long *val); +int unpoison_memory(unsigned long pfn); /* * On ppc64 this will only work with radix 2M hugepage size diff --git a/tools/testing/vma/Makefile b/tools/testing/vma/Makefile index 66f3831a668f..e72b45dedda5 100644 --- a/tools/testing/vma/Makefile +++ b/tools/testing/vma/Makefile @@ -6,10 +6,13 @@ default: vma include ../shared/shared.mk -OFILES = $(SHARED_OFILES) vma.o maple-shim.o +OFILES = $(SHARED_OFILES) main.o shared.o maple-shim.o TARGETS = vma -vma.o: vma.c vma_internal.h ../../../mm/vma.c ../../../mm/vma_init.c ../../../mm/vma_exec.c ../../../mm/vma.h +# These can be varied to test 
different sizes. +CFLAGS += -DNUM_VMA_FLAG_BITS=128 -DNUM_MM_FLAG_BITS=128 + +main.o: main.c shared.c shared.h vma_internal.h tests/merge.c tests/mmap.c tests/vma.c ../../../mm/vma.c ../../../mm/vma_init.c ../../../mm/vma_exec.c ../../../mm/vma.h include/custom.h include/dup.h include/stubs.h vma: $(OFILES) $(CC) $(CFLAGS) -o $@ $(OFILES) $(LDLIBS) diff --git a/tools/testing/vma/include/custom.h b/tools/testing/vma/include/custom.h new file mode 100644 index 000000000000..802a76317245 --- /dev/null +++ b/tools/testing/vma/include/custom.h @@ -0,0 +1,119 @@ +/* SPDX-License-Identifier: GPL-2.0+ */ + +#pragma once + +/* + * Contains declarations that exist in the kernel which have been CUSTOMISED for + * testing purposes to faciliate userland VMA testing. + */ + +#ifdef CONFIG_MMU +extern unsigned long mmap_min_addr; +extern unsigned long dac_mmap_min_addr; +#else +#define mmap_min_addr 0UL +#define dac_mmap_min_addr 0UL +#endif + +#define VM_WARN_ON(_expr) (WARN_ON(_expr)) +#define VM_WARN_ON_ONCE(_expr) (WARN_ON_ONCE(_expr)) +#define VM_WARN_ON_VMG(_expr, _vmg) (WARN_ON(_expr)) +#define VM_BUG_ON(_expr) (BUG_ON(_expr)) +#define VM_BUG_ON_VMA(_expr, _vma) (BUG_ON(_expr)) + +/* We hardcode this for now. */ +#define sysctl_max_map_count 0x1000000UL + +#define TASK_SIZE ((1ul << 47)-PAGE_SIZE) + +/* + * The shared stubs do not implement this, it amounts to an fprintf(STDERR,...) + * either way :) + */ +#define pr_warn_once pr_err + +#define pgtable_supports_soft_dirty() 1 + +struct anon_vma { + struct anon_vma *root; + struct rb_root_cached rb_root; + + /* Test fields. */ + bool was_cloned; + bool was_unlinked; +}; + +static inline void unlink_anon_vmas(struct vm_area_struct *vma) +{ + /* For testing purposes, indicate that the anon_vma was unlinked. */ + vma->anon_vma->was_unlinked = true; +} + +static inline void vma_start_write(struct vm_area_struct *vma) +{ + /* Used to indicate to tests that a write operation has begun. 
*/ + vma->vm_lock_seq++; +} + +static inline __must_check +int vma_start_write_killable(struct vm_area_struct *vma) +{ + /* Used to indicate to tests that a write operation has begun. */ + vma->vm_lock_seq++; + return 0; +} + +static inline int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src, + enum vma_operation operation) +{ + /* For testing purposes. We indicate that an anon_vma has been cloned. */ + if (src->anon_vma != NULL) { + dst->anon_vma = src->anon_vma; + dst->anon_vma->was_cloned = true; + } + + return 0; +} + +static inline int __anon_vma_prepare(struct vm_area_struct *vma) +{ + struct anon_vma *anon_vma = calloc(1, sizeof(struct anon_vma)); + + if (!anon_vma) + return -ENOMEM; + + anon_vma->root = anon_vma; + vma->anon_vma = anon_vma; + + return 0; +} + +static inline int anon_vma_prepare(struct vm_area_struct *vma) +{ + if (likely(vma->anon_vma)) + return 0; + + return __anon_vma_prepare(vma); +} + +static inline void vma_lock_init(struct vm_area_struct *vma, bool reset_refcnt) +{ + if (reset_refcnt) + refcount_set(&vma->vm_refcnt, 0); +} + +static inline vma_flags_t __mk_vma_flags(size_t count, const vma_flag_t *bits) +{ + vma_flags_t flags; + int i; + + /* + * For testing purposes: allow invalid bit specification so we can + * easily test. + */ + vma_flags_clear_all(&flags); + for (i = 0; i < count; i++) + if (bits[i] < NUM_VMA_FLAG_BITS) + vma_flag_set(&flags, bits[i]); + return flags; +} diff --git a/tools/testing/vma/include/dup.h b/tools/testing/vma/include/dup.h new file mode 100644 index 000000000000..3078ff1487d3 --- /dev/null +++ b/tools/testing/vma/include/dup.h @@ -0,0 +1,1320 @@ +/* SPDX-License-Identifier: GPL-2.0+ */ + +#pragma once + +/* Forward declarations to avoid header cycle. 
*/ +struct vm_area_struct; +static inline void vma_start_write(struct vm_area_struct *vma); + +extern const struct vm_operations_struct vma_dummy_vm_ops; +extern unsigned long stack_guard_gap; +extern const struct vm_operations_struct vma_dummy_vm_ops; +extern unsigned long rlimit(unsigned int limit); +struct task_struct *get_current(void); + +#define MMF_HAS_MDWE 28 +#define current get_current() + +/* + * Define the task command name length as enum, then it can be visible to + * BPF programs. + */ +enum { + TASK_COMM_LEN = 16, +}; + +/* PARTIALLY implemented types. */ +struct mm_struct { + struct maple_tree mm_mt; + int map_count; /* number of VMAs */ + unsigned long total_vm; /* Total pages mapped */ + unsigned long locked_vm; /* Pages that have PG_mlocked set */ + unsigned long data_vm; /* VM_WRITE & ~VM_SHARED & ~VM_STACK */ + unsigned long exec_vm; /* VM_EXEC & ~VM_WRITE & ~VM_STACK */ + unsigned long stack_vm; /* VM_STACK */ + + unsigned long def_flags; + + mm_flags_t flags; /* Must use mm_flags_* helpers to access */ +}; +struct address_space { + struct rb_root_cached i_mmap; + unsigned long flags; + atomic_t i_mmap_writable; +}; +struct file_operations { + int (*mmap)(struct file *, struct vm_area_struct *); + int (*mmap_prepare)(struct vm_area_desc *); +}; +struct file { + struct address_space *f_mapping; + const struct file_operations *f_op; +}; +struct anon_vma_chain { + struct anon_vma *anon_vma; + struct list_head same_vma; +}; +struct task_struct { + char comm[TASK_COMM_LEN]; + pid_t pid; + struct mm_struct *mm; + + /* Used for emulating ABI behavior of previous Linux versions: */ + unsigned int personality; +}; + +struct kref { + refcount_t refcount; +}; + +struct anon_vma_name { + struct kref kref; + /* The name needs to be at the end because it is dynamically sized. */ + char name[]; +}; + +/* + * Contains declarations that are DUPLICATED from kernel source in order to + * faciliate userland VMA testing. 
+ * + * These must be kept in sync with kernel source. + */ + +#define VMA_LOCK_OFFSET 0x40000000 + +typedef struct { unsigned long v; } freeptr_t; + +#define VM_NONE 0x00000000 + +typedef int __bitwise vma_flag_t; + +#define ACCESS_PRIVATE(p, member) ((p)->member) + +#define DECLARE_VMA_BIT(name, bitnum) \ + VMA_ ## name ## _BIT = ((__force vma_flag_t)bitnum) +#define DECLARE_VMA_BIT_ALIAS(name, aliased) \ + VMA_ ## name ## _BIT = VMA_ ## aliased ## _BIT +enum { + DECLARE_VMA_BIT(READ, 0), + DECLARE_VMA_BIT(WRITE, 1), + DECLARE_VMA_BIT(EXEC, 2), + DECLARE_VMA_BIT(SHARED, 3), + /* mprotect() hardcodes VM_MAYREAD >> 4 == VM_READ, and so for r/w/x bits. */ + DECLARE_VMA_BIT(MAYREAD, 4), /* limits for mprotect() etc. */ + DECLARE_VMA_BIT(MAYWRITE, 5), + DECLARE_VMA_BIT(MAYEXEC, 6), + DECLARE_VMA_BIT(MAYSHARE, 7), + DECLARE_VMA_BIT(GROWSDOWN, 8), /* general info on the segment */ +#ifdef CONFIG_MMU + DECLARE_VMA_BIT(UFFD_MISSING, 9),/* missing pages tracking */ +#else + /* nommu: R/O MAP_PRIVATE mapping that might overlay a file mapping */ + DECLARE_VMA_BIT(MAYOVERLAY, 9), +#endif /* CONFIG_MMU */ + /* Page-ranges managed without "struct page", just pure PFN */ + DECLARE_VMA_BIT(PFNMAP, 10), + DECLARE_VMA_BIT(MAYBE_GUARD, 11), + DECLARE_VMA_BIT(UFFD_WP, 12), /* wrprotect pages tracking */ + DECLARE_VMA_BIT(LOCKED, 13), + DECLARE_VMA_BIT(IO, 14), /* Memory mapped I/O or similar */ + DECLARE_VMA_BIT(SEQ_READ, 15), /* App will access data sequentially */ + DECLARE_VMA_BIT(RAND_READ, 16), /* App will not benefit from clustered reads */ + DECLARE_VMA_BIT(DONTCOPY, 17), /* Do not copy this vma on fork */ + DECLARE_VMA_BIT(DONTEXPAND, 18),/* Cannot expand with mremap() */ + DECLARE_VMA_BIT(LOCKONFAULT, 19),/* Lock pages covered when faulted in */ + DECLARE_VMA_BIT(ACCOUNT, 20), /* Is a VM accounted object */ + DECLARE_VMA_BIT(NORESERVE, 21), /* should the VM suppress accounting */ + DECLARE_VMA_BIT(HUGETLB, 22), /* Huge TLB Page VM */ + DECLARE_VMA_BIT(SYNC, 23), /* 
Synchronous page faults */ + DECLARE_VMA_BIT(ARCH_1, 24), /* Architecture-specific flag */ + DECLARE_VMA_BIT(WIPEONFORK, 25),/* Wipe VMA contents in child. */ + DECLARE_VMA_BIT(DONTDUMP, 26), /* Do not include in the core dump */ + DECLARE_VMA_BIT(SOFTDIRTY, 27), /* NOT soft dirty clean area */ + DECLARE_VMA_BIT(MIXEDMAP, 28), /* Can contain struct page and pure PFN pages */ + DECLARE_VMA_BIT(HUGEPAGE, 29), /* MADV_HUGEPAGE marked this vma */ + DECLARE_VMA_BIT(NOHUGEPAGE, 30),/* MADV_NOHUGEPAGE marked this vma */ + DECLARE_VMA_BIT(MERGEABLE, 31), /* KSM may merge identical pages */ + /* These bits are reused, we define specific uses below. */ + DECLARE_VMA_BIT(HIGH_ARCH_0, 32), + DECLARE_VMA_BIT(HIGH_ARCH_1, 33), + DECLARE_VMA_BIT(HIGH_ARCH_2, 34), + DECLARE_VMA_BIT(HIGH_ARCH_3, 35), + DECLARE_VMA_BIT(HIGH_ARCH_4, 36), + DECLARE_VMA_BIT(HIGH_ARCH_5, 37), + DECLARE_VMA_BIT(HIGH_ARCH_6, 38), + /* + * This flag is used to connect VFIO to arch specific KVM code. It + * indicates that the memory under this VMA is safe for use with any + * non-cachable memory type inside KVM. Some VFIO devices, on some + * platforms, are thought to be unsafe and can cause machine crashes + * if KVM does not lock down the memory type. + */ + DECLARE_VMA_BIT(ALLOW_ANY_UNCACHED, 39), +#ifdef CONFIG_PPC32 + DECLARE_VMA_BIT_ALIAS(DROPPABLE, ARCH_1), +#else + DECLARE_VMA_BIT(DROPPABLE, 40), +#endif + DECLARE_VMA_BIT(UFFD_MINOR, 41), + DECLARE_VMA_BIT(SEALED, 42), + /* Flags that reuse flags above. */ + DECLARE_VMA_BIT_ALIAS(PKEY_BIT0, HIGH_ARCH_0), + DECLARE_VMA_BIT_ALIAS(PKEY_BIT1, HIGH_ARCH_1), + DECLARE_VMA_BIT_ALIAS(PKEY_BIT2, HIGH_ARCH_2), + DECLARE_VMA_BIT_ALIAS(PKEY_BIT3, HIGH_ARCH_3), + DECLARE_VMA_BIT_ALIAS(PKEY_BIT4, HIGH_ARCH_4), +#if defined(CONFIG_X86_USER_SHADOW_STACK) + /* + * VM_SHADOW_STACK should not be set with VM_SHARED because of lack of + * support core mm. + * + * These VMAs will get a single end guard page. This helps userspace + * protect itself from attacks. 
A single page is enough for current + * shadow stack archs (x86). See the comments near alloc_shstk() in + * arch/x86/kernel/shstk.c for more details on the guard size. + */ + DECLARE_VMA_BIT_ALIAS(SHADOW_STACK, HIGH_ARCH_5), +#elif defined(CONFIG_ARM64_GCS) + /* + * arm64's Guarded Control Stack implements similar functionality and + * has similar constraints to shadow stacks. + */ + DECLARE_VMA_BIT_ALIAS(SHADOW_STACK, HIGH_ARCH_6), +#endif + DECLARE_VMA_BIT_ALIAS(SAO, ARCH_1), /* Strong Access Ordering (powerpc) */ + DECLARE_VMA_BIT_ALIAS(GROWSUP, ARCH_1), /* parisc */ + DECLARE_VMA_BIT_ALIAS(SPARC_ADI, ARCH_1), /* sparc64 */ + DECLARE_VMA_BIT_ALIAS(ARM64_BTI, ARCH_1), /* arm64 */ + DECLARE_VMA_BIT_ALIAS(ARCH_CLEAR, ARCH_1), /* sparc64, arm64 */ + DECLARE_VMA_BIT_ALIAS(MAPPED_COPY, ARCH_1), /* !CONFIG_MMU */ + DECLARE_VMA_BIT_ALIAS(MTE, HIGH_ARCH_4), /* arm64 */ + DECLARE_VMA_BIT_ALIAS(MTE_ALLOWED, HIGH_ARCH_5),/* arm64 */ +#ifdef CONFIG_STACK_GROWSUP + DECLARE_VMA_BIT_ALIAS(STACK, GROWSUP), + DECLARE_VMA_BIT_ALIAS(STACK_EARLY, GROWSDOWN), +#else + DECLARE_VMA_BIT_ALIAS(STACK, GROWSDOWN), +#endif +}; + +#define INIT_VM_FLAG(name) BIT((__force int) VMA_ ## name ## _BIT) +#define VM_READ INIT_VM_FLAG(READ) +#define VM_WRITE INIT_VM_FLAG(WRITE) +#define VM_EXEC INIT_VM_FLAG(EXEC) +#define VM_SHARED INIT_VM_FLAG(SHARED) +#define VM_MAYREAD INIT_VM_FLAG(MAYREAD) +#define VM_MAYWRITE INIT_VM_FLAG(MAYWRITE) +#define VM_MAYEXEC INIT_VM_FLAG(MAYEXEC) +#define VM_MAYSHARE INIT_VM_FLAG(MAYSHARE) +#define VM_GROWSDOWN INIT_VM_FLAG(GROWSDOWN) +#ifdef CONFIG_MMU +#define VM_UFFD_MISSING INIT_VM_FLAG(UFFD_MISSING) +#else +#define VM_UFFD_MISSING VM_NONE +#define VM_MAYOVERLAY INIT_VM_FLAG(MAYOVERLAY) +#endif +#define VM_PFNMAP INIT_VM_FLAG(PFNMAP) +#define VM_MAYBE_GUARD INIT_VM_FLAG(MAYBE_GUARD) +#define VM_UFFD_WP INIT_VM_FLAG(UFFD_WP) +#define VM_LOCKED INIT_VM_FLAG(LOCKED) +#define VM_IO INIT_VM_FLAG(IO) +#define VM_SEQ_READ INIT_VM_FLAG(SEQ_READ) +#define VM_RAND_READ 
INIT_VM_FLAG(RAND_READ) +#define VM_DONTCOPY INIT_VM_FLAG(DONTCOPY) +#define VM_DONTEXPAND INIT_VM_FLAG(DONTEXPAND) +#define VM_LOCKONFAULT INIT_VM_FLAG(LOCKONFAULT) +#define VM_ACCOUNT INIT_VM_FLAG(ACCOUNT) +#define VM_NORESERVE INIT_VM_FLAG(NORESERVE) +#define VM_HUGETLB INIT_VM_FLAG(HUGETLB) +#define VM_SYNC INIT_VM_FLAG(SYNC) +#define VM_ARCH_1 INIT_VM_FLAG(ARCH_1) +#define VM_WIPEONFORK INIT_VM_FLAG(WIPEONFORK) +#define VM_DONTDUMP INIT_VM_FLAG(DONTDUMP) +#ifdef CONFIG_MEM_SOFT_DIRTY +#define VM_SOFTDIRTY INIT_VM_FLAG(SOFTDIRTY) +#else +#define VM_SOFTDIRTY VM_NONE +#endif +#define VM_MIXEDMAP INIT_VM_FLAG(MIXEDMAP) +#define VM_HUGEPAGE INIT_VM_FLAG(HUGEPAGE) +#define VM_NOHUGEPAGE INIT_VM_FLAG(NOHUGEPAGE) +#define VM_MERGEABLE INIT_VM_FLAG(MERGEABLE) +#define VM_STACK INIT_VM_FLAG(STACK) +#ifdef CONFIG_STACK_GROWS_UP +#define VM_STACK_EARLY INIT_VM_FLAG(STACK_EARLY) +#else +#define VM_STACK_EARLY VM_NONE +#endif +#ifdef CONFIG_ARCH_HAS_PKEYS +#define VM_PKEY_SHIFT ((__force int)VMA_HIGH_ARCH_0_BIT) +/* Despite the naming, these are FLAGS not bits. 
*/ +#define VM_PKEY_BIT0 INIT_VM_FLAG(PKEY_BIT0) +#define VM_PKEY_BIT1 INIT_VM_FLAG(PKEY_BIT1) +#define VM_PKEY_BIT2 INIT_VM_FLAG(PKEY_BIT2) +#if CONFIG_ARCH_PKEY_BITS > 3 +#define VM_PKEY_BIT3 INIT_VM_FLAG(PKEY_BIT3) +#else +#define VM_PKEY_BIT3 VM_NONE +#endif /* CONFIG_ARCH_PKEY_BITS > 3 */ +#if CONFIG_ARCH_PKEY_BITS > 4 +#define VM_PKEY_BIT4 INIT_VM_FLAG(PKEY_BIT4) +#else +#define VM_PKEY_BIT4 VM_NONE +#endif /* CONFIG_ARCH_PKEY_BITS > 4 */ +#endif /* CONFIG_ARCH_HAS_PKEYS */ +#if defined(CONFIG_X86_USER_SHADOW_STACK) || defined(CONFIG_ARM64_GCS) +#define VM_SHADOW_STACK INIT_VM_FLAG(SHADOW_STACK) +#else +#define VM_SHADOW_STACK VM_NONE +#endif +#if defined(CONFIG_PPC64) +#define VM_SAO INIT_VM_FLAG(SAO) +#elif defined(CONFIG_PARISC) +#define VM_GROWSUP INIT_VM_FLAG(GROWSUP) +#elif defined(CONFIG_SPARC64) +#define VM_SPARC_ADI INIT_VM_FLAG(SPARC_ADI) +#define VM_ARCH_CLEAR INIT_VM_FLAG(ARCH_CLEAR) +#elif defined(CONFIG_ARM64) +#define VM_ARM64_BTI INIT_VM_FLAG(ARM64_BTI) +#define VM_ARCH_CLEAR INIT_VM_FLAG(ARCH_CLEAR) +#elif !defined(CONFIG_MMU) +#define VM_MAPPED_COPY INIT_VM_FLAG(MAPPED_COPY) +#endif +#ifndef VM_GROWSUP +#define VM_GROWSUP VM_NONE +#endif +#ifdef CONFIG_ARM64_MTE +#define VM_MTE INIT_VM_FLAG(MTE) +#define VM_MTE_ALLOWED INIT_VM_FLAG(MTE_ALLOWED) +#else +#define VM_MTE VM_NONE +#define VM_MTE_ALLOWED VM_NONE +#endif +#ifdef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR +#define VM_UFFD_MINOR INIT_VM_FLAG(UFFD_MINOR) +#else +#define VM_UFFD_MINOR VM_NONE +#endif +#ifdef CONFIG_64BIT +#define VM_ALLOW_ANY_UNCACHED INIT_VM_FLAG(ALLOW_ANY_UNCACHED) +#define VM_SEALED INIT_VM_FLAG(SEALED) +#else +#define VM_ALLOW_ANY_UNCACHED VM_NONE +#define VM_SEALED VM_NONE +#endif +#if defined(CONFIG_64BIT) || defined(CONFIG_PPC32) +#define VM_DROPPABLE INIT_VM_FLAG(DROPPABLE) +#else +#define VM_DROPPABLE VM_NONE +#endif + +/* Bits set in the VMA until the stack is in its final location */ +#define VM_STACK_INCOMPLETE_SETUP (VM_RAND_READ | VM_SEQ_READ | VM_STACK_EARLY) + 
+#define TASK_EXEC ((current->personality & READ_IMPLIES_EXEC) ? VM_EXEC : 0) + +/* Common data flag combinations */ +#define VM_DATA_FLAGS_TSK_EXEC (VM_READ | VM_WRITE | TASK_EXEC | \ + VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC) +#define VM_DATA_FLAGS_NON_EXEC (VM_READ | VM_WRITE | VM_MAYREAD | \ + VM_MAYWRITE | VM_MAYEXEC) +#define VM_DATA_FLAGS_EXEC (VM_READ | VM_WRITE | VM_EXEC | \ + VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC) + +#ifndef VM_DATA_DEFAULT_FLAGS /* arch can override this */ +#define VM_DATA_DEFAULT_FLAGS VM_DATA_FLAGS_EXEC +#endif + +#ifndef VM_STACK_DEFAULT_FLAGS /* arch can override this */ +#define VM_STACK_DEFAULT_FLAGS VM_DATA_DEFAULT_FLAGS +#endif + +#define VM_STARTGAP_FLAGS (VM_GROWSDOWN | VM_SHADOW_STACK) + +#define VM_STACK_FLAGS (VM_STACK | VM_STACK_DEFAULT_FLAGS | VM_ACCOUNT) + +/* VMA basic access permission flags */ +#define VM_ACCESS_FLAGS (VM_READ | VM_WRITE | VM_EXEC) + +/* + * Special vmas that are non-mergable, non-mlock()able. + */ +#define VM_SPECIAL (VM_IO | VM_DONTEXPAND | VM_PFNMAP | VM_MIXEDMAP) + +#define DEFAULT_MAP_WINDOW ((1UL << 47) - PAGE_SIZE) +#define TASK_SIZE_LOW DEFAULT_MAP_WINDOW +#define TASK_SIZE_MAX DEFAULT_MAP_WINDOW +#define STACK_TOP TASK_SIZE_LOW +#define STACK_TOP_MAX TASK_SIZE_MAX + +/* This mask represents all the VMA flag bits used by mlock */ +#define VM_LOCKED_MASK (VM_LOCKED | VM_LOCKONFAULT) + +#define TASK_EXEC ((current->personality & READ_IMPLIES_EXEC) ? 
VM_EXEC : 0) + +#define VM_DATA_FLAGS_TSK_EXEC (VM_READ | VM_WRITE | TASK_EXEC | \ + VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC) + +#define RLIMIT_STACK 3 /* max stack size */ +#define RLIMIT_MEMLOCK 8 /* max locked-in-memory address space */ + +#define CAP_IPC_LOCK 14 + +#define VM_STICKY (VM_SOFTDIRTY | VM_MAYBE_GUARD) + +#define VM_IGNORE_MERGE VM_STICKY + +#define VM_COPY_ON_FORK (VM_PFNMAP | VM_MIXEDMAP | VM_UFFD_WP | VM_MAYBE_GUARD) + +#define pgprot_val(x) ((x).pgprot) +#define __pgprot(x) ((pgprot_t) { (x) } ) + +#define for_each_vma(__vmi, __vma) \ + while (((__vma) = vma_next(&(__vmi))) != NULL) + +/* The MM code likes to work with exclusive end addresses */ +#define for_each_vma_range(__vmi, __vma, __end) \ + while (((__vma) = vma_find(&(__vmi), (__end))) != NULL) + +#define offset_in_page(p) ((unsigned long)(p) & ~PAGE_MASK) + +#define PHYS_PFN(x) ((unsigned long)((x) >> PAGE_SHIFT)) + +#define test_and_set_bit(nr, addr) __test_and_set_bit(nr, addr) +#define test_and_clear_bit(nr, addr) __test_and_clear_bit(nr, addr) + +#define AS_MM_ALL_LOCKS 2 + +#define swap(a, b) \ + do { typeof(a) __tmp = (a); (a) = (b); (b) = __tmp; } while (0) + +/* + * Flags for bug emulation. + * + * These occupy the top three bytes. + */ +enum { + READ_IMPLIES_EXEC = 0x0400000, +}; + +struct vma_iterator { + struct ma_state mas; +}; + +#define VMA_ITERATOR(name, __mm, __addr) \ + struct vma_iterator name = { \ + .mas = { \ + .tree = &(__mm)->mm_mt, \ + .index = __addr, \ + .node = NULL, \ + .status = ma_start, \ + }, \ + } + +#define DEFINE_MUTEX(mutexname) \ + struct mutex mutexname = {} + +#define DECLARE_BITMAP(name, bits) \ + unsigned long name[BITS_TO_LONGS(bits)] + +#define EMPTY_VMA_FLAGS ((vma_flags_t){ }) + +/* What action should be taken after an .mmap_prepare call is complete? */ +enum mmap_action_type { + MMAP_NOTHING, /* Mapping is complete, no further action. */ + MMAP_REMAP_PFN, /* Remap PFN range. */ + MMAP_IO_REMAP_PFN, /* I/O remap PFN range. 
*/ +}; + +/* + * Describes an action an mmap_prepare hook can instruct to be taken to complete + * the mapping of a VMA. Specified in vm_area_desc. + */ +struct mmap_action { + union { + /* Remap range. */ + struct { + unsigned long start; + unsigned long start_pfn; + unsigned long size; + pgprot_t pgprot; + } remap; + }; + enum mmap_action_type type; + + /* + * If specified, this hook is invoked after the selected action has been + * successfully completed. Note that the VMA write lock still held. + * + * The absolute minimum ought to be done here. + * + * Returns 0 on success, or an error code. + */ + int (*success_hook)(const struct vm_area_struct *vma); + + /* + * If specified, this hook is invoked when an error occurred when + * attempting the selection action. + * + * The hook can return an error code in order to filter the error, but + * it is not valid to clear the error here. + */ + int (*error_hook)(int err); + + /* + * This should be set in rare instances where the operation required + * that the rmap should not be able to access the VMA until + * completely set up. + */ + bool hide_from_rmap_until_complete :1; +}; + +/* Operations which modify VMAs. */ +enum vma_operation { + VMA_OP_SPLIT, + VMA_OP_MERGE_UNFAULTED, + VMA_OP_REMAP, + VMA_OP_FORK, +}; + +/* + * Describes a VMA that is about to be mmap()'ed. Drivers may choose to + * manipulate mutable fields which will cause those fields to be updated in the + * resultant VMA. + * + * Helper functions are not required for manipulating any field. + */ +struct vm_area_desc { + /* Immutable state. */ + const struct mm_struct *const mm; + struct file *const file; /* May vary from vm_file in stacked callers. */ + unsigned long start; + unsigned long end; + + /* Mutable fields. Populated with initial state. */ + pgoff_t pgoff; + struct file *vm_file; + union { + vm_flags_t vm_flags; + vma_flags_t vma_flags; + }; + pgprot_t page_prot; + + /* Write-only fields. 
*/ + const struct vm_operations_struct *vm_ops; + void *private_data; + + /* Take further action? */ + struct mmap_action action; +}; + +struct vm_area_struct { + /* The first cache line has the info for VMA tree walking. */ + + union { + struct { + /* VMA covers [vm_start; vm_end) addresses within mm */ + unsigned long vm_start; + unsigned long vm_end; + }; + freeptr_t vm_freeptr; /* Pointer used by SLAB_TYPESAFE_BY_RCU */ + }; + + struct mm_struct *vm_mm; /* The address space we belong to. */ + pgprot_t vm_page_prot; /* Access permissions of this VMA. */ + + /* + * Flags, see mm.h. + * To modify use vm_flags_{init|reset|set|clear|mod} functions. + */ + union { + const vm_flags_t vm_flags; + vma_flags_t flags; + }; + +#ifdef CONFIG_PER_VMA_LOCK + /* + * Can only be written (using WRITE_ONCE()) while holding both: + * - mmap_lock (in write mode) + * - vm_refcnt bit at VMA_LOCK_OFFSET is set + * Can be read reliably while holding one of: + * - mmap_lock (in read or write mode) + * - vm_refcnt bit at VMA_LOCK_OFFSET is set or vm_refcnt > 1 + * Can be read unreliably (using READ_ONCE()) for pessimistic bailout + * while holding nothing (except RCU to keep the VMA struct allocated). + * + * This sequence counter is explicitly allowed to overflow; sequence + * counter reuse can only lead to occasional unnecessary use of the + * slowpath. + */ + unsigned int vm_lock_seq; +#endif + + /* + * A file's MAP_PRIVATE vma can be in both i_mmap tree and anon_vma + * list, after a COW of one of the file pages. A MAP_SHARED vma + * can only be in the i_mmap tree. An anonymous MAP_PRIVATE, stack + * or brk vma (with NULL file) can only be in an anon_vma list. + */ + struct list_head anon_vma_chain; /* Serialized by mmap_lock & + * page_table_lock */ + struct anon_vma *anon_vma; /* Serialized by page_table_lock */ + + /* Function pointers to deal with this struct. 
*/ + const struct vm_operations_struct *vm_ops; + + /* Information about our backing store: */ + unsigned long vm_pgoff; /* Offset (within vm_file) in PAGE_SIZE + units */ + struct file * vm_file; /* File we map to (can be NULL). */ + void * vm_private_data; /* was vm_pte (shared mem) */ + +#ifdef CONFIG_SWAP + atomic_long_t swap_readahead_info; +#endif +#ifndef CONFIG_MMU + struct vm_region *vm_region; /* NOMMU mapping region */ +#endif +#ifdef CONFIG_NUMA + struct mempolicy *vm_policy; /* NUMA policy for the VMA */ +#endif +#ifdef CONFIG_NUMA_BALANCING + struct vma_numab_state *numab_state; /* NUMA Balancing state */ +#endif +#ifdef CONFIG_PER_VMA_LOCK + /* Unstable RCU readers are allowed to read this. */ + refcount_t vm_refcnt; +#endif + /* + * For areas with an address space and backing store, + * linkage into the address_space->i_mmap interval tree. + * + */ + struct { + struct rb_node rb; + unsigned long rb_subtree_last; + } shared; +#ifdef CONFIG_ANON_VMA_NAME + /* + * For private and shared anonymous mappings, a pointer to a null + * terminated string containing the name given to the vma, or NULL if + * unnamed. Serialized by mmap_lock. Use anon_vma_name to access. + */ + struct anon_vma_name *anon_name; +#endif + struct vm_userfaultfd_ctx vm_userfaultfd_ctx; +} __randomize_layout; + +struct vm_operations_struct { + void (*open)(struct vm_area_struct * area); + /** + * @close: Called when the VMA is being removed from the MM. + * Context: User context. May sleep. Caller holds mmap_lock. + */ + void (*close)(struct vm_area_struct * area); + /* Called any time before splitting to check if it's allowed */ + int (*may_split)(struct vm_area_struct *area, unsigned long addr); + int (*mremap)(struct vm_area_struct *area); + /* + * Called by mprotect() to make driver-specific permission + * checks before mprotect() is finalised. The VMA must not + * be modified. Returns 0 if mprotect() can proceed. 
+ */ + int (*mprotect)(struct vm_area_struct *vma, unsigned long start, + unsigned long end, unsigned long newflags); + vm_fault_t (*fault)(struct vm_fault *vmf); + vm_fault_t (*huge_fault)(struct vm_fault *vmf, unsigned int order); + vm_fault_t (*map_pages)(struct vm_fault *vmf, + pgoff_t start_pgoff, pgoff_t end_pgoff); + unsigned long (*pagesize)(struct vm_area_struct * area); + + /* notification that a previously read-only page is about to become + * writable, if an error is returned it will cause a SIGBUS */ + vm_fault_t (*page_mkwrite)(struct vm_fault *vmf); + + /* same as page_mkwrite when using VM_PFNMAP|VM_MIXEDMAP */ + vm_fault_t (*pfn_mkwrite)(struct vm_fault *vmf); + + /* called by access_process_vm when get_user_pages() fails, typically + * for use by special VMAs. See also generic_access_phys() for a generic + * implementation useful for any iomem mapping. + */ + int (*access)(struct vm_area_struct *vma, unsigned long addr, + void *buf, int len, int write); + + /* Called by the /proc/PID/maps code to ask the vma whether it + * has a special name. Returning non-NULL will also cause this + * vma to be dumped unconditionally. */ + const char *(*name)(struct vm_area_struct *vma); + +#ifdef CONFIG_NUMA + /* + * set_policy() op must add a reference to any non-NULL @new mempolicy + * to hold the policy upon return. Caller should pass NULL @new to + * remove a policy and fall back to surrounding context--i.e. do not + * install a MPOL_DEFAULT policy, nor the task or system default + * mempolicy. + */ + int (*set_policy)(struct vm_area_struct *vma, struct mempolicy *new); + + /* + * get_policy() op must add reference [mpol_get()] to any policy at + * (vma,addr) marked as MPOL_SHARED. The shared policy infrastructure + * in mm/mempolicy.c will do this automatically. + * get_policy() must NOT add a ref if the policy at (vma,addr) is not + * marked as MPOL_SHARED. vma policies are protected by the mmap_lock. 
+ * If no [shared/vma] mempolicy exists at the addr, get_policy() op + * must return NULL--i.e., do not "fallback" to task or system default + * policy. + */ + struct mempolicy *(*get_policy)(struct vm_area_struct *vma, + unsigned long addr, pgoff_t *ilx); +#endif +#ifdef CONFIG_FIND_NORMAL_PAGE + /* + * Called by vm_normal_page() for special PTEs in @vma at @addr. This + * allows for returning a "normal" page from vm_normal_page() even + * though the PTE indicates that the "struct page" either does not exist + * or should not be touched: "special". + * + * Do not add new users: this really only works when a "normal" page + * was mapped, but then the PTE got changed to something weird (+ + * marked special) that would not make pte_pfn() identify the originally + * inserted page. + */ + struct page *(*find_normal_page)(struct vm_area_struct *vma, + unsigned long addr); +#endif /* CONFIG_FIND_NORMAL_PAGE */ +}; + +struct vm_unmapped_area_info { +#define VM_UNMAPPED_AREA_TOPDOWN 1 + unsigned long flags; + unsigned long length; + unsigned long low_limit; + unsigned long high_limit; + unsigned long align_mask; + unsigned long align_offset; + unsigned long start_gap; +}; + +struct pagetable_move_control { + struct vm_area_struct *old; /* Source VMA. */ + struct vm_area_struct *new; /* Destination VMA. */ + unsigned long old_addr; /* Address from which the move begins. */ + unsigned long old_end; /* Exclusive address at which old range ends. */ + unsigned long new_addr; /* Address to move page tables to. */ + unsigned long len_in; /* Bytes to remap specified by user. */ + + bool need_rmap_locks; /* Do rmap locks need to be taken? */ + bool for_stack; /* Is this an early temp stack being moved? 
*/ +}; + +#define PAGETABLE_MOVE(name, old_, new_, old_addr_, new_addr_, len_) \ + struct pagetable_move_control name = { \ + .old = old_, \ + .new = new_, \ + .old_addr = old_addr_, \ + .old_end = (old_addr_) + (len_), \ + .new_addr = new_addr_, \ + .len_in = len_, \ + } + +static inline void vma_iter_invalidate(struct vma_iterator *vmi) +{ + mas_pause(&vmi->mas); +} + +static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot) +{ + return __pgprot(pgprot_val(oldprot) | pgprot_val(newprot)); +} + +static inline pgprot_t vm_get_page_prot(vm_flags_t vm_flags) +{ + return __pgprot(vm_flags); +} + +static inline bool mm_flags_test(int flag, const struct mm_struct *mm) +{ + return test_bit(flag, ACCESS_PRIVATE(&mm->flags, __mm_flags)); +} + +/* + * Copy value to the first system word of VMA flags, non-atomically. + * + * IMPORTANT: This does not overwrite bytes past the first system word. The + * caller must account for this. + */ +static inline void vma_flags_overwrite_word(vma_flags_t *flags, unsigned long value) +{ + *ACCESS_PRIVATE(flags, __vma_flags) = value; +} + +/* + * Copy value to the first system word of VMA flags ONCE, non-atomically. + * + * IMPORTANT: This does not overwrite bytes past the first system word. The + * caller must account for this. + */ +static inline void vma_flags_overwrite_word_once(vma_flags_t *flags, unsigned long value) +{ + unsigned long *bitmap = ACCESS_PRIVATE(flags, __vma_flags); + + WRITE_ONCE(*bitmap, value); +} + +/* Update the first system word of VMA flags setting bits, non-atomically. */ +static inline void vma_flags_set_word(vma_flags_t *flags, unsigned long value) +{ + unsigned long *bitmap = ACCESS_PRIVATE(flags, __vma_flags); + + *bitmap |= value; +} + +/* Update the first system word of VMA flags clearing bits, non-atomically. 
*/ +static inline void vma_flags_clear_word(vma_flags_t *flags, unsigned long value) +{ + unsigned long *bitmap = ACCESS_PRIVATE(flags, __vma_flags); + + *bitmap &= ~value; +} + +static inline void vma_flags_clear_all(vma_flags_t *flags) +{ + bitmap_zero(ACCESS_PRIVATE(flags, __vma_flags), NUM_VMA_FLAG_BITS); +} + +static inline void vma_flag_set(vma_flags_t *flags, vma_flag_t bit) +{ + unsigned long *bitmap = ACCESS_PRIVATE(flags, __vma_flags); + + __set_bit((__force int)bit, bitmap); +} + +/* Use when VMA is not part of the VMA tree and needs no locking */ +static inline void vm_flags_init(struct vm_area_struct *vma, + vm_flags_t flags) +{ + vma_flags_clear_all(&vma->flags); + vma_flags_overwrite_word(&vma->flags, flags); +} + +/* + * Use when VMA is part of the VMA tree and modifications need coordination + * Note: vm_flags_reset and vm_flags_reset_once do not lock the vma and + * it should be locked explicitly beforehand. + */ +static inline void vm_flags_reset(struct vm_area_struct *vma, + vm_flags_t flags) +{ + vma_assert_write_locked(vma); + vm_flags_init(vma, flags); +} + +static inline void vm_flags_reset_once(struct vm_area_struct *vma, + vm_flags_t flags) +{ + vma_assert_write_locked(vma); + /* + * The user should only be interested in avoiding reordering of + * assignment to the first word. + */ + vma_flags_clear_all(&vma->flags); + vma_flags_overwrite_word_once(&vma->flags, flags); +} + +static inline void vm_flags_set(struct vm_area_struct *vma, + vm_flags_t flags) +{ + vma_start_write(vma); + vma_flags_set_word(&vma->flags, flags); +} + +static inline void vm_flags_clear(struct vm_area_struct *vma, + vm_flags_t flags) +{ + vma_start_write(vma); + vma_flags_clear_word(&vma->flags, flags); +} + +static inline vma_flags_t __mk_vma_flags(size_t count, const vma_flag_t *bits); + +#define mk_vma_flags(...) 
__mk_vma_flags(COUNT_ARGS(__VA_ARGS__), \ + (const vma_flag_t []){__VA_ARGS__}) + +static __always_inline bool vma_flags_test_mask(const vma_flags_t *flags, + vma_flags_t to_test) +{ + const unsigned long *bitmap = flags->__vma_flags; + const unsigned long *bitmap_to_test = to_test.__vma_flags; + + return bitmap_intersects(bitmap_to_test, bitmap, NUM_VMA_FLAG_BITS); +} + +#define vma_flags_test(flags, ...) \ + vma_flags_test_mask(flags, mk_vma_flags(__VA_ARGS__)) + +static __always_inline bool vma_flags_test_all_mask(const vma_flags_t *flags, + vma_flags_t to_test) +{ + const unsigned long *bitmap = flags->__vma_flags; + const unsigned long *bitmap_to_test = to_test.__vma_flags; + + return bitmap_subset(bitmap_to_test, bitmap, NUM_VMA_FLAG_BITS); +} + +#define vma_flags_test_all(flags, ...) \ + vma_flags_test_all_mask(flags, mk_vma_flags(__VA_ARGS__)) + +static __always_inline void vma_flags_set_mask(vma_flags_t *flags, vma_flags_t to_set) +{ + unsigned long *bitmap = flags->__vma_flags; + const unsigned long *bitmap_to_set = to_set.__vma_flags; + + bitmap_or(bitmap, bitmap, bitmap_to_set, NUM_VMA_FLAG_BITS); +} + +#define vma_flags_set(flags, ...) \ + vma_flags_set_mask(flags, mk_vma_flags(__VA_ARGS__)) + +static __always_inline void vma_flags_clear_mask(vma_flags_t *flags, vma_flags_t to_clear) +{ + unsigned long *bitmap = flags->__vma_flags; + const unsigned long *bitmap_to_clear = to_clear.__vma_flags; + + bitmap_andnot(bitmap, bitmap, bitmap_to_clear, NUM_VMA_FLAG_BITS); +} + +#define vma_flags_clear(flags, ...) \ + vma_flags_clear_mask(flags, mk_vma_flags(__VA_ARGS__)) + +static inline bool vma_test_all_flags_mask(const struct vm_area_struct *vma, + vma_flags_t flags) +{ + return vma_flags_test_all_mask(&vma->flags, flags); +} + +#define vma_test_all_flags(vma, ...) 
\ + vma_test_all_flags_mask(vma, mk_vma_flags(__VA_ARGS__)) + +static inline bool is_shared_maywrite_vm_flags(vm_flags_t vm_flags) +{ + return (vm_flags & (VM_SHARED | VM_MAYWRITE)) == + (VM_SHARED | VM_MAYWRITE); +} + +static inline void vma_set_flags_mask(struct vm_area_struct *vma, + vma_flags_t flags) +{ + vma_flags_set_mask(&vma->flags, flags); +} + +#define vma_set_flags(vma, ...) \ + vma_set_flags_mask(vma, mk_vma_flags(__VA_ARGS__)) + +static inline bool vma_desc_test_flags_mask(const struct vm_area_desc *desc, + vma_flags_t flags) +{ + return vma_flags_test_mask(&desc->vma_flags, flags); +} + +#define vma_desc_test_flags(desc, ...) \ + vma_desc_test_flags_mask(desc, mk_vma_flags(__VA_ARGS__)) + +static inline void vma_desc_set_flags_mask(struct vm_area_desc *desc, + vma_flags_t flags) +{ + vma_flags_set_mask(&desc->vma_flags, flags); +} + +#define vma_desc_set_flags(desc, ...) \ + vma_desc_set_flags_mask(desc, mk_vma_flags(__VA_ARGS__)) + +static inline void vma_desc_clear_flags_mask(struct vm_area_desc *desc, + vma_flags_t flags) +{ + vma_flags_clear_mask(&desc->vma_flags, flags); +} + +#define vma_desc_clear_flags(desc, ...) \ + vma_desc_clear_flags_mask(desc, mk_vma_flags(__VA_ARGS__)) + +static inline bool is_shared_maywrite(const vma_flags_t *flags) +{ + return vma_flags_test_all(flags, VMA_SHARED_BIT, VMA_MAYWRITE_BIT); +} + +static inline bool vma_is_shared_maywrite(struct vm_area_struct *vma) +{ + return is_shared_maywrite(&vma->flags); +} + +static inline struct vm_area_struct *vma_next(struct vma_iterator *vmi) +{ + /* + * Uses mas_find() to get the first VMA when the iterator starts. + * Calling mas_next() could skip the first entry. + */ + return mas_find(&vmi->mas, ULONG_MAX); +} + +/* + * WARNING: to avoid racing with vma_mark_attached()/vma_mark_detached(), these + * assertions should be made either under mmap_write_lock or when the object + * has been isolated under mmap_write_lock, ensuring no competing writers. 
+ */
+static inline void vma_assert_attached(struct vm_area_struct *vma)
+{
+	WARN_ON_ONCE(!refcount_read(&vma->vm_refcnt));
+}
+
+static inline void vma_assert_detached(struct vm_area_struct *vma)
+{
+	WARN_ON_ONCE(refcount_read(&vma->vm_refcnt));
+}
+
+/* Forward declaration: defined later in this header. */
+static inline void vma_assert_write_locked(struct vm_area_struct *);
+static inline void vma_mark_attached(struct vm_area_struct *vma)
+{
+	vma_assert_write_locked(vma);
+	vma_assert_detached(vma);
+	refcount_set_release(&vma->vm_refcnt, 1);
+}
+
+static inline void vma_mark_detached(struct vm_area_struct *vma)
+{
+	vma_assert_write_locked(vma);
+	vma_assert_attached(vma);
+	/* We are the only writer, so no need to use vma_refcount_put(). */
+	if (unlikely(!refcount_dec_and_test(&vma->vm_refcnt))) {
+		/*
+		 * Reader must have temporarily raised vm_refcnt but it will
+		 * drop it without using the vma since vma is write-locked.
+		 */
+	}
+}
+
+static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm)
+{
+	memset(vma, 0, sizeof(*vma));
+	vma->vm_mm = mm;
+	vma->vm_ops = &vma_dummy_vm_ops;
+	INIT_LIST_HEAD(&vma->anon_vma_chain);
+	vma->vm_lock_seq = UINT_MAX;
+}
+
+/*
+ * These are defined in vma.h, but sadly vm_stat_account() is referenced by
+ * kernel/fork.c, so we have to make these broadly available there, and
+ * temporarily define them here to resolve the dependency cycle.
+ */ +#define is_exec_mapping(flags) \ + ((flags & (VM_EXEC | VM_WRITE | VM_STACK)) == VM_EXEC) + +#define is_stack_mapping(flags) \ + (((flags & VM_STACK) == VM_STACK) || (flags & VM_SHADOW_STACK)) + +#define is_data_mapping(flags) \ + ((flags & (VM_WRITE | VM_SHARED | VM_STACK)) == VM_WRITE) + +static inline void vm_stat_account(struct mm_struct *mm, vm_flags_t flags, + long npages) +{ + WRITE_ONCE(mm->total_vm, READ_ONCE(mm->total_vm)+npages); + + if (is_exec_mapping(flags)) + mm->exec_vm += npages; + else if (is_stack_mapping(flags)) + mm->stack_vm += npages; + else if (is_data_mapping(flags)) + mm->data_vm += npages; +} + +#undef is_exec_mapping +#undef is_stack_mapping +#undef is_data_mapping + +static inline void vm_unacct_memory(long pages) +{ + vm_acct_memory(-pages); +} + +static inline void mapping_allow_writable(struct address_space *mapping) +{ + atomic_inc(&mapping->i_mmap_writable); +} + +static inline +struct vm_area_struct *vma_find(struct vma_iterator *vmi, unsigned long max) +{ + return mas_find(&vmi->mas, max - 1); +} + +static inline int vma_iter_clear_gfp(struct vma_iterator *vmi, + unsigned long start, unsigned long end, gfp_t gfp) +{ + __mas_set_range(&vmi->mas, start, end - 1); + mas_store_gfp(&vmi->mas, NULL, gfp); + if (unlikely(mas_is_err(&vmi->mas))) + return -ENOMEM; + + return 0; +} + +static inline void vma_set_anonymous(struct vm_area_struct *vma) +{ + vma->vm_ops = NULL; +} + +/* Declared in vma.h. 
*/ +static inline void set_vma_from_desc(struct vm_area_struct *vma, + struct vm_area_desc *desc); + +static inline int __compat_vma_mmap(const struct file_operations *f_op, + struct file *file, struct vm_area_struct *vma) +{ + struct vm_area_desc desc = { + .mm = vma->vm_mm, + .file = file, + .start = vma->vm_start, + .end = vma->vm_end, + + .pgoff = vma->vm_pgoff, + .vm_file = vma->vm_file, + .vm_flags = vma->vm_flags, + .page_prot = vma->vm_page_prot, + + .action.type = MMAP_NOTHING, /* Default */ + }; + int err; + + err = f_op->mmap_prepare(&desc); + if (err) + return err; + + mmap_action_prepare(&desc.action, &desc); + set_vma_from_desc(vma, &desc); + return mmap_action_complete(&desc.action, vma); +} + +static inline int compat_vma_mmap(struct file *file, + struct vm_area_struct *vma) +{ + return __compat_vma_mmap(file->f_op, file, vma); +} + + +static inline void vma_iter_init(struct vma_iterator *vmi, + struct mm_struct *mm, unsigned long addr) +{ + mas_init(&vmi->mas, &mm->mm_mt, addr); +} + +static inline unsigned long vma_pages(struct vm_area_struct *vma) +{ + return (vma->vm_end - vma->vm_start) >> PAGE_SHIFT; +} + +static inline void mmap_assert_locked(struct mm_struct *); +static inline struct vm_area_struct *find_vma_intersection(struct mm_struct *mm, + unsigned long start_addr, + unsigned long end_addr) +{ + unsigned long index = start_addr; + + mmap_assert_locked(mm); + return mt_find(&mm->mm_mt, &index, end_addr - 1); +} + +static inline +struct vm_area_struct *vma_lookup(struct mm_struct *mm, unsigned long addr) +{ + return mtree_load(&mm->mm_mt, addr); +} + +static inline struct vm_area_struct *vma_prev(struct vma_iterator *vmi) +{ + return mas_prev(&vmi->mas, 0); +} + +static inline void vma_iter_set(struct vma_iterator *vmi, unsigned long addr) +{ + mas_set(&vmi->mas, addr); +} + +static inline bool vma_is_anonymous(struct vm_area_struct *vma) +{ + return !vma->vm_ops; +} + +/* Defined in vma.h, so temporarily define here to avoid circular 
dependency. */ +#define vma_iter_load(vmi) \ + mas_walk(&(vmi)->mas) + +static inline struct vm_area_struct * +find_vma_prev(struct mm_struct *mm, unsigned long addr, + struct vm_area_struct **pprev) +{ + struct vm_area_struct *vma; + VMA_ITERATOR(vmi, mm, addr); + + vma = vma_iter_load(&vmi); + *pprev = vma_prev(&vmi); + if (!vma) + vma = vma_next(&vmi); + return vma; +} + +#undef vma_iter_load + +static inline void vma_iter_free(struct vma_iterator *vmi) +{ + mas_destroy(&vmi->mas); +} + +static inline +struct vm_area_struct *vma_iter_next_range(struct vma_iterator *vmi) +{ + return mas_next_range(&vmi->mas, ULONG_MAX); +} + +bool vma_wants_writenotify(struct vm_area_struct *vma, pgprot_t vm_page_prot); + +/* Update vma->vm_page_prot to reflect vma->vm_flags. */ +static inline void vma_set_page_prot(struct vm_area_struct *vma) +{ + vm_flags_t vm_flags = vma->vm_flags; + pgprot_t vm_page_prot; + + /* testing: we inline vm_pgprot_modify() to avoid clash with vma.h. */ + vm_page_prot = pgprot_modify(vma->vm_page_prot, vm_get_page_prot(vm_flags)); + + if (vma_wants_writenotify(vma, vm_page_prot)) { + vm_flags &= ~VM_SHARED; + /* testing: we inline vm_pgprot_modify() to avoid clash with vma.h. 
*/ + vm_page_prot = pgprot_modify(vm_page_prot, vm_get_page_prot(vm_flags)); + } + /* remove_protection_ptes reads vma->vm_page_prot without mmap_lock */ + WRITE_ONCE(vma->vm_page_prot, vm_page_prot); +} + +static inline unsigned long stack_guard_start_gap(struct vm_area_struct *vma) +{ + if (vma->vm_flags & VM_GROWSDOWN) + return stack_guard_gap; + + /* See reasoning around the VM_SHADOW_STACK definition */ + if (vma->vm_flags & VM_SHADOW_STACK) + return PAGE_SIZE; + + return 0; +} + +static inline unsigned long vm_start_gap(struct vm_area_struct *vma) +{ + unsigned long gap = stack_guard_start_gap(vma); + unsigned long vm_start = vma->vm_start; + + vm_start -= gap; + if (vm_start > vma->vm_start) + vm_start = 0; + return vm_start; +} + +static inline unsigned long vm_end_gap(struct vm_area_struct *vma) +{ + unsigned long vm_end = vma->vm_end; + + if (vma->vm_flags & VM_GROWSUP) { + vm_end += stack_guard_gap; + if (vm_end < vma->vm_end) + vm_end = -PAGE_SIZE; + } + return vm_end; +} + +static inline bool vma_is_accessible(struct vm_area_struct *vma) +{ + return vma->vm_flags & VM_ACCESS_FLAGS; +} + +static inline bool mlock_future_ok(const struct mm_struct *mm, + vm_flags_t vm_flags, unsigned long bytes) +{ + unsigned long locked_pages, limit_pages; + + if (!(vm_flags & VM_LOCKED) || capable(CAP_IPC_LOCK)) + return true; + + locked_pages = bytes >> PAGE_SHIFT; + locked_pages += mm->locked_vm; + + limit_pages = rlimit(RLIMIT_MEMLOCK); + limit_pages >>= PAGE_SHIFT; + + return locked_pages <= limit_pages; +} + +static inline bool map_deny_write_exec(unsigned long old, unsigned long new) +{ + /* If MDWE is disabled, we have nothing to deny. */ + if (mm_flags_test(MMF_HAS_MDWE, current->mm)) + return false; + + /* If the new VMA is not executable, we have nothing to deny. */ + if (!(new & VM_EXEC)) + return false; + + /* Under MDWE we do not accept newly writably executable VMAs... 
*/ + if (new & VM_WRITE) + return true; + + /* ...nor previously non-executable VMAs becoming executable. */ + if (!(old & VM_EXEC)) + return true; + + return false; +} + +static inline int mapping_map_writable(struct address_space *mapping) +{ + return atomic_inc_unless_negative(&mapping->i_mmap_writable) ? + 0 : -EPERM; +} + +/* Did the driver provide valid mmap hook configuration? */ +static inline bool can_mmap_file(struct file *file) +{ + bool has_mmap = file->f_op->mmap; + bool has_mmap_prepare = file->f_op->mmap_prepare; + + /* Hooks are mutually exclusive. */ + if (WARN_ON_ONCE(has_mmap && has_mmap_prepare)) + return false; + if (!has_mmap && !has_mmap_prepare) + return false; + + return true; +} + +static inline int vfs_mmap(struct file *file, struct vm_area_struct *vma) +{ + if (file->f_op->mmap_prepare) + return compat_vma_mmap(file, vma); + + return file->f_op->mmap(file, vma); +} + +static inline int vfs_mmap_prepare(struct file *file, struct vm_area_desc *desc) +{ + return file->f_op->mmap_prepare(desc); +} + +static inline void vma_set_file(struct vm_area_struct *vma, struct file *file) +{ + /* Changing an anonymous vma with this is illegal */ + get_file(file); + swap(vma->vm_file, file); + fput(file); +} diff --git a/tools/testing/vma/include/stubs.h b/tools/testing/vma/include/stubs.h new file mode 100644 index 000000000000..947a3a0c2566 --- /dev/null +++ b/tools/testing/vma/include/stubs.h @@ -0,0 +1,428 @@ +/* SPDX-License-Identifier: GPL-2.0+ */ + +#pragma once + +/* + * Contains declarations that are STUBBED, that is that are rendered no-ops, in + * order to faciliate userland VMA testing. + */ + +/* Forward declarations. 
*/ +struct mm_struct; +struct vm_area_struct; +struct vm_area_desc; +struct pagetable_move_control; +struct mmap_action; +struct file; +struct anon_vma; +struct anon_vma_chain; +struct address_space; +struct unmap_desc; + +#define __bitwise +#define __randomize_layout + +#define FIRST_USER_ADDRESS 0UL +#define USER_PGTABLES_CEILING 0UL + +#define vma_policy(vma) NULL + +#define down_write_nest_lock(sem, nest_lock) + +#define data_race(expr) expr + +#define ASSERT_EXCLUSIVE_WRITER(x) + +struct vm_userfaultfd_ctx {}; +struct mempolicy {}; +struct mmu_gather {}; +struct mutex {}; +struct vm_fault {}; + +static inline void userfaultfd_unmap_complete(struct mm_struct *mm, + struct list_head *uf) +{ +} + +static inline unsigned long move_page_tables(struct pagetable_move_control *pmc) +{ + return 0; +} + +static inline void free_pgd_range(struct mmu_gather *tlb, + unsigned long addr, unsigned long end, + unsigned long floor, unsigned long ceiling) +{ +} + +static inline int ksm_execve(struct mm_struct *mm) +{ + return 0; +} + +static inline void ksm_exit(struct mm_struct *mm) +{ +} + +static inline void vma_numab_state_init(struct vm_area_struct *vma) +{ +} + +static inline void vma_numab_state_free(struct vm_area_struct *vma) +{ +} + +static inline void dup_anon_vma_name(struct vm_area_struct *orig_vma, + struct vm_area_struct *new_vma) +{ +} + +static inline void free_anon_vma_name(struct vm_area_struct *vma) +{ +} + +static inline void mmap_action_prepare(struct mmap_action *action, + struct vm_area_desc *desc) +{ +} + +static inline int mmap_action_complete(struct mmap_action *action, + struct vm_area_struct *vma) +{ + return 0; +} + +static inline void fixup_hugetlb_reservations(struct vm_area_struct *vma) +{ +} + +static inline bool shmem_file(struct file *file) +{ + return false; +} + +static inline vm_flags_t ksm_vma_flags(const struct mm_struct *mm, + const struct file *file, vm_flags_t vm_flags) +{ + return vm_flags; +} + +static inline void 
remap_pfn_range_prepare(struct vm_area_desc *desc, unsigned long pfn) +{ +} + +static inline int remap_pfn_range_complete(struct vm_area_struct *vma, unsigned long addr, + unsigned long pfn, unsigned long size, pgprot_t pgprot) +{ + return 0; +} + +static inline int do_munmap(struct mm_struct *, unsigned long, size_t, + struct list_head *uf) +{ + return 0; +} + +/* Currently stubbed but we may later wish to un-stub. */ +static inline void vm_acct_memory(long pages); + +static inline void mmap_assert_locked(struct mm_struct *mm) +{ +} + + +static inline void anon_vma_unlock_write(struct anon_vma *anon_vma) +{ +} + +static inline void i_mmap_unlock_write(struct address_space *mapping) +{ +} + +static inline int userfaultfd_unmap_prep(struct vm_area_struct *vma, + unsigned long start, + unsigned long end, + struct list_head *unmaps) +{ + return 0; +} + +static inline void mmap_write_downgrade(struct mm_struct *mm) +{ +} + +static inline void mmap_read_unlock(struct mm_struct *mm) +{ +} + +static inline void mmap_write_unlock(struct mm_struct *mm) +{ +} + +static inline int mmap_write_lock_killable(struct mm_struct *mm) +{ + return 0; +} + +static inline bool can_modify_mm(struct mm_struct *mm, + unsigned long start, + unsigned long end) +{ + return true; +} + +static inline void arch_unmap(struct mm_struct *mm, + unsigned long start, + unsigned long end) +{ +} + +static inline bool mpol_equal(struct mempolicy *a, struct mempolicy *b) +{ + return true; +} + +static inline void khugepaged_enter_vma(struct vm_area_struct *vma, + vm_flags_t vm_flags) +{ +} + +static inline bool mapping_can_writeback(struct address_space *mapping) +{ + return true; +} + +static inline bool is_vm_hugetlb_page(struct vm_area_struct *vma) +{ + return false; +} + +static inline bool vma_soft_dirty_enabled(struct vm_area_struct *vma) +{ + return false; +} + +static inline bool userfaultfd_wp(struct vm_area_struct *vma) +{ + return false; +} + +static inline void mmap_assert_write_locked(struct 
mm_struct *mm) +{ +} + +static inline void mutex_lock(struct mutex *lock) +{ +} + +static inline void mutex_unlock(struct mutex *lock) +{ +} + +static inline bool mutex_is_locked(struct mutex *lock) +{ + return true; +} + +static inline bool signal_pending(void *p) +{ + return false; +} + +static inline bool is_file_hugepages(struct file *file) +{ + return false; +} + +static inline int security_vm_enough_memory_mm(struct mm_struct *mm, long pages) +{ + return 0; +} + +static inline bool may_expand_vm(struct mm_struct *mm, vm_flags_t flags, + unsigned long npages) +{ + return true; +} + +static inline int shmem_zero_setup(struct vm_area_struct *vma) +{ + return 0; +} + + +static inline void vm_acct_memory(long pages) +{ +} + +static inline void vma_interval_tree_insert(struct vm_area_struct *vma, + struct rb_root_cached *rb) +{ +} + +static inline void vma_interval_tree_remove(struct vm_area_struct *vma, + struct rb_root_cached *rb) +{ +} + +static inline void flush_dcache_mmap_unlock(struct address_space *mapping) +{ +} + +static inline void anon_vma_interval_tree_insert(struct anon_vma_chain *avc, + struct rb_root_cached *rb) +{ +} + +static inline void anon_vma_interval_tree_remove(struct anon_vma_chain *avc, + struct rb_root_cached *rb) +{ +} + +static inline void uprobe_mmap(struct vm_area_struct *vma) +{ +} + +static inline void uprobe_munmap(struct vm_area_struct *vma, + unsigned long start, unsigned long end) +{ +} + +static inline void i_mmap_lock_write(struct address_space *mapping) +{ +} + +static inline void anon_vma_lock_write(struct anon_vma *anon_vma) +{ +} + +static inline void vma_assert_write_locked(struct vm_area_struct *vma) +{ +} + +static inline void ksm_add_vma(struct vm_area_struct *vma) +{ +} + +static inline void perf_event_mmap(struct vm_area_struct *vma) +{ +} + +static inline bool vma_is_dax(struct vm_area_struct *vma) +{ + return false; +} + +static inline struct vm_area_struct *get_gate_vma(struct mm_struct *mm) +{ + return NULL; +} + 
+static inline bool arch_validate_flags(vm_flags_t flags) +{ + return true; +} + +static inline void vma_close(struct vm_area_struct *vma) +{ +} + +static inline int mmap_file(struct file *file, struct vm_area_struct *vma) +{ + return 0; +} + +static inline int is_hugepage_only_range(struct mm_struct *mm, + unsigned long addr, unsigned long len) +{ + return 0; +} + +static inline bool capable(int cap) +{ + return true; +} + +static inline struct anon_vma_name *anon_vma_name(struct vm_area_struct *vma) +{ + return NULL; +} + +static inline bool is_mergeable_vm_userfaultfd_ctx(struct vm_area_struct *vma, + struct vm_userfaultfd_ctx vm_ctx) +{ + return true; +} + +static inline bool anon_vma_name_eq(struct anon_vma_name *anon_name1, + struct anon_vma_name *anon_name2) +{ + return true; +} + +static inline void might_sleep(void) +{ +} + +static inline void fput(struct file *file) +{ +} + +static inline void mpol_put(struct mempolicy *pol) +{ +} + +static inline void lru_add_drain(void) +{ +} + +static inline void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm) +{ +} + +static inline void update_hiwater_rss(struct mm_struct *mm) +{ +} + +static inline void update_hiwater_vm(struct mm_struct *mm) +{ +} + +static inline void unmap_vmas(struct mmu_gather *tlb, struct unmap_desc *unmap) +{ +} + +static inline void free_pgtables(struct mmu_gather *tlb, struct unmap_desc *unmap) +{ +} + +static inline void mapping_unmap_writable(struct address_space *mapping) +{ +} + +static inline void flush_dcache_mmap_lock(struct address_space *mapping) +{ +} + +static inline void tlb_finish_mmu(struct mmu_gather *tlb) +{ +} + +static inline struct file *get_file(struct file *f) +{ + return f; +} + +static inline int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst) +{ + return 0; +} + +static inline void vma_adjust_trans_huge(struct vm_area_struct *vma, + unsigned long start, + unsigned long end, + struct vm_area_struct *next) +{ +} + +static inline void 
hugetlb_split(struct vm_area_struct *, unsigned long) {} diff --git a/tools/testing/vma/main.c b/tools/testing/vma/main.c new file mode 100644 index 000000000000..49b09e97a51f --- /dev/null +++ b/tools/testing/vma/main.c @@ -0,0 +1,55 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +#include "shared.h" +/* + * Directly import the VMA implementation here. Our vma_internal.h wrapper + * provides userland-equivalent functionality for everything vma.c uses. + */ +#include "../../../mm/vma_init.c" +#include "../../../mm/vma_exec.c" +#include "../../../mm/vma.c" + +/* Tests are included directly so they can test static functions in mm/vma.c. */ +#include "tests/merge.c" +#include "tests/mmap.c" +#include "tests/vma.c" + +/* Helper functions which utilise static kernel functions. */ + +struct vm_area_struct *merge_existing(struct vma_merge_struct *vmg) +{ + struct vm_area_struct *vma; + + vma = vma_merge_existing_range(vmg); + if (vma) + vma_assert_attached(vma); + return vma; +} + +int attach_vma(struct mm_struct *mm, struct vm_area_struct *vma) +{ + int res; + + res = vma_link(mm, vma); + if (!res) + vma_assert_attached(vma); + return res; +} + +/* Main test running which invokes tests/ *.c runners. */ +int main(void) +{ + int num_tests = 0, num_fail = 0; + + maple_tree_init(); + vma_state_init(); + + run_merge_tests(&num_tests, &num_fail); + run_mmap_tests(&num_tests, &num_fail); + run_vma_tests(&num_tests, &num_fail); + + printf("%d tests run, %d passed, %d failed.\n", + num_tests, num_tests - num_fail, num_fail); + + return num_fail == 0 ? 
EXIT_SUCCESS : EXIT_FAILURE; +} diff --git a/tools/testing/vma/shared.c b/tools/testing/vma/shared.c new file mode 100644 index 000000000000..bda578cc3304 --- /dev/null +++ b/tools/testing/vma/shared.c @@ -0,0 +1,131 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +#include "shared.h" + + +bool fail_prealloc; +unsigned long mmap_min_addr = CONFIG_DEFAULT_MMAP_MIN_ADDR; +unsigned long dac_mmap_min_addr = CONFIG_DEFAULT_MMAP_MIN_ADDR; +unsigned long stack_guard_gap = 256UL<<PAGE_SHIFT; + +const struct vm_operations_struct vma_dummy_vm_ops; +struct anon_vma dummy_anon_vma; +struct task_struct __current; + +struct vm_area_struct *alloc_vma(struct mm_struct *mm, + unsigned long start, unsigned long end, + pgoff_t pgoff, vm_flags_t vm_flags) +{ + struct vm_area_struct *vma = vm_area_alloc(mm); + + if (vma == NULL) + return NULL; + + vma->vm_start = start; + vma->vm_end = end; + vma->vm_pgoff = pgoff; + vm_flags_reset(vma, vm_flags); + vma_assert_detached(vma); + + return vma; +} + +void detach_free_vma(struct vm_area_struct *vma) +{ + vma_mark_detached(vma); + vm_area_free(vma); +} + +struct vm_area_struct *alloc_and_link_vma(struct mm_struct *mm, + unsigned long start, unsigned long end, + pgoff_t pgoff, vm_flags_t vm_flags) +{ + struct vm_area_struct *vma = alloc_vma(mm, start, end, pgoff, vm_flags); + + if (vma == NULL) + return NULL; + + if (attach_vma(mm, vma)) { + detach_free_vma(vma); + return NULL; + } + + /* + * Reset this counter which we use to track whether writes have + * begun. Linking to the tree will have caused this to be incremented, + * which means we will get a false positive otherwise. 
+ */ + vma->vm_lock_seq = UINT_MAX; + + return vma; +} + +void reset_dummy_anon_vma(void) +{ + dummy_anon_vma.was_cloned = false; + dummy_anon_vma.was_unlinked = false; +} + +int cleanup_mm(struct mm_struct *mm, struct vma_iterator *vmi) +{ + struct vm_area_struct *vma; + int count = 0; + + fail_prealloc = false; + reset_dummy_anon_vma(); + + vma_iter_set(vmi, 0); + for_each_vma(*vmi, vma) { + detach_free_vma(vma); + count++; + } + + mtree_destroy(&mm->mm_mt); + mm->map_count = 0; + return count; +} + +bool vma_write_started(struct vm_area_struct *vma) +{ + int seq = vma->vm_lock_seq; + + /* We reset after each check. */ + vma->vm_lock_seq = UINT_MAX; + + /* The vma_start_write() stub simply increments this value. */ + return seq > -1; +} + +void __vma_set_dummy_anon_vma(struct vm_area_struct *vma, + struct anon_vma_chain *avc, struct anon_vma *anon_vma) +{ + vma->anon_vma = anon_vma; + INIT_LIST_HEAD(&vma->anon_vma_chain); + list_add(&avc->same_vma, &vma->anon_vma_chain); + avc->anon_vma = vma->anon_vma; +} + +void vma_set_dummy_anon_vma(struct vm_area_struct *vma, + struct anon_vma_chain *avc) +{ + __vma_set_dummy_anon_vma(vma, avc, &dummy_anon_vma); +} + +struct task_struct *get_current(void) +{ + return &__current; +} + +unsigned long rlimit(unsigned int limit) +{ + return (unsigned long)-1; +} + +void vma_set_range(struct vm_area_struct *vma, + unsigned long start, unsigned long end, + pgoff_t pgoff) +{ + vma->vm_start = start; + vma->vm_end = end; + vma->vm_pgoff = pgoff; +} diff --git a/tools/testing/vma/shared.h b/tools/testing/vma/shared.h new file mode 100644 index 000000000000..6c64211cfa22 --- /dev/null +++ b/tools/testing/vma/shared.h @@ -0,0 +1,114 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +#pragma once + +#include <stdbool.h> +#include <stdio.h> +#include <stdlib.h> + +#include "generated/bit-length.h" +#include "maple-shared.h" +#include "vma_internal.h" +#include "../../../mm/vma.h" + +/* Simple test runner. 
Assumes local num_[fail, tests] counters. */ +#define TEST(name) \ + do { \ + (*num_tests)++; \ + if (!test_##name()) { \ + (*num_fail)++; \ + fprintf(stderr, "Test " #name " FAILED\n"); \ + } \ + } while (0) + +#define ASSERT_TRUE(_expr) \ + do { \ + if (!(_expr)) { \ + fprintf(stderr, \ + "Assert FAILED at %s:%d:%s(): %s is FALSE.\n", \ + __FILE__, __LINE__, __FUNCTION__, #_expr); \ + return false; \ + } \ + } while (0) + +#define ASSERT_FALSE(_expr) ASSERT_TRUE(!(_expr)) +#define ASSERT_EQ(_val1, _val2) ASSERT_TRUE((_val1) == (_val2)) +#define ASSERT_NE(_val1, _val2) ASSERT_TRUE((_val1) != (_val2)) + +#define IS_SET(_val, _flags) ((_val & _flags) == _flags) + +extern bool fail_prealloc; + +/* Override vma_iter_prealloc() so we can choose to fail it. */ +#define vma_iter_prealloc(vmi, vma) \ + (fail_prealloc ? -ENOMEM : mas_preallocate(&(vmi)->mas, (vma), GFP_KERNEL)) + +#define CONFIG_DEFAULT_MMAP_MIN_ADDR 65536 + +extern unsigned long mmap_min_addr; +extern unsigned long dac_mmap_min_addr; +extern unsigned long stack_guard_gap; + +extern const struct vm_operations_struct vma_dummy_vm_ops; +extern struct anon_vma dummy_anon_vma; +extern struct task_struct __current; + +/* + * Helper function which provides a wrapper around a merge existing VMA + * operation. + * + * Declared in main.c as uses static VMA function. + */ +struct vm_area_struct *merge_existing(struct vma_merge_struct *vmg); + +/* + * Helper function to allocate a VMA and link it to the tree. + * + * Declared in main.c as uses static VMA function. + */ +int attach_vma(struct mm_struct *mm, struct vm_area_struct *vma); + +/* Helper function providing a dummy vm_ops->close() method.*/ +static inline void dummy_close(struct vm_area_struct *) +{ +} + +/* Helper function to simply allocate a VMA. */ +struct vm_area_struct *alloc_vma(struct mm_struct *mm, + unsigned long start, unsigned long end, + pgoff_t pgoff, vm_flags_t vm_flags); + +/* Helper function to detach and free a VMA. 
*/ +void detach_free_vma(struct vm_area_struct *vma); + +/* Helper function to allocate a VMA and link it to the tree. */ +struct vm_area_struct *alloc_and_link_vma(struct mm_struct *mm, + unsigned long start, unsigned long end, + pgoff_t pgoff, vm_flags_t vm_flags); + +/* + * Helper function to reset the dummy anon_vma to indicate it has not been + * duplicated. + */ +void reset_dummy_anon_vma(void); + +/* + * Helper function to remove all VMAs and destroy the maple tree associated with + * a virtual address space. Returns a count of VMAs in the tree. + */ +int cleanup_mm(struct mm_struct *mm, struct vma_iterator *vmi); + +/* Helper function to determine if VMA has had vma_start_write() performed. */ +bool vma_write_started(struct vm_area_struct *vma); + +void __vma_set_dummy_anon_vma(struct vm_area_struct *vma, + struct anon_vma_chain *avc, struct anon_vma *anon_vma); + +/* Provide a simple dummy VMA/anon_vma dummy setup for testing. */ +void vma_set_dummy_anon_vma(struct vm_area_struct *vma, + struct anon_vma_chain *avc); + +/* Helper function to specify a VMA's range. */ +void vma_set_range(struct vm_area_struct *vma, + unsigned long start, unsigned long end, + pgoff_t pgoff); diff --git a/tools/testing/vma/vma.c b/tools/testing/vma/tests/merge.c index 93d21bc7e112..3708dc6945b0 100644 --- a/tools/testing/vma/vma.c +++ b/tools/testing/vma/tests/merge.c @@ -1,132 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-or-later -#include <stdbool.h> -#include <stdio.h> -#include <stdlib.h> - -#include "generated/bit-length.h" - -#include "maple-shared.h" -#include "vma_internal.h" - -/* Include so header guard set. */ -#include "../../../mm/vma.h" - -static bool fail_prealloc; - -/* Then override vma_iter_prealloc() so we can choose to fail it. */ -#define vma_iter_prealloc(vmi, vma) \ - (fail_prealloc ? 
-ENOMEM : mas_preallocate(&(vmi)->mas, (vma), GFP_KERNEL)) - -#define CONFIG_DEFAULT_MMAP_MIN_ADDR 65536 - -unsigned long mmap_min_addr = CONFIG_DEFAULT_MMAP_MIN_ADDR; -unsigned long dac_mmap_min_addr = CONFIG_DEFAULT_MMAP_MIN_ADDR; -unsigned long stack_guard_gap = 256UL<<PAGE_SHIFT; - -/* - * Directly import the VMA implementation here. Our vma_internal.h wrapper - * provides userland-equivalent functionality for everything vma.c uses. - */ -#include "../../../mm/vma_init.c" -#include "../../../mm/vma_exec.c" -#include "../../../mm/vma.c" - -const struct vm_operations_struct vma_dummy_vm_ops; -static struct anon_vma dummy_anon_vma; - -#define ASSERT_TRUE(_expr) \ - do { \ - if (!(_expr)) { \ - fprintf(stderr, \ - "Assert FAILED at %s:%d:%s(): %s is FALSE.\n", \ - __FILE__, __LINE__, __FUNCTION__, #_expr); \ - return false; \ - } \ - } while (0) -#define ASSERT_FALSE(_expr) ASSERT_TRUE(!(_expr)) -#define ASSERT_EQ(_val1, _val2) ASSERT_TRUE((_val1) == (_val2)) -#define ASSERT_NE(_val1, _val2) ASSERT_TRUE((_val1) != (_val2)) - -#define IS_SET(_val, _flags) ((_val & _flags) == _flags) - -static struct task_struct __current; - -struct task_struct *get_current(void) -{ - return &__current; -} - -unsigned long rlimit(unsigned int limit) -{ - return (unsigned long)-1; -} - -/* Helper function to simply allocate a VMA. */ -static struct vm_area_struct *alloc_vma(struct mm_struct *mm, - unsigned long start, - unsigned long end, - pgoff_t pgoff, - vm_flags_t vm_flags) -{ - struct vm_area_struct *vma = vm_area_alloc(mm); - - if (vma == NULL) - return NULL; - - vma->vm_start = start; - vma->vm_end = end; - vma->vm_pgoff = pgoff; - vm_flags_reset(vma, vm_flags); - vma_assert_detached(vma); - - return vma; -} - -/* Helper function to allocate a VMA and link it to the tree. 
*/ -static int attach_vma(struct mm_struct *mm, struct vm_area_struct *vma) -{ - int res; - - res = vma_link(mm, vma); - if (!res) - vma_assert_attached(vma); - return res; -} - -static void detach_free_vma(struct vm_area_struct *vma) -{ - vma_mark_detached(vma); - vm_area_free(vma); -} - -/* Helper function to allocate a VMA and link it to the tree. */ -static struct vm_area_struct *alloc_and_link_vma(struct mm_struct *mm, - unsigned long start, - unsigned long end, - pgoff_t pgoff, - vm_flags_t vm_flags) -{ - struct vm_area_struct *vma = alloc_vma(mm, start, end, pgoff, vm_flags); - - if (vma == NULL) - return NULL; - - if (attach_vma(mm, vma)) { - detach_free_vma(vma); - return NULL; - } - - /* - * Reset this counter which we use to track whether writes have - * begun. Linking to the tree will have caused this to be incremented, - * which means we will get a false positive otherwise. - */ - vma->vm_lock_seq = UINT_MAX; - - return vma; -} - /* Helper function which provides a wrapper around a merge new VMA operation. */ static struct vm_area_struct *merge_new(struct vma_merge_struct *vmg) { @@ -147,20 +20,6 @@ static struct vm_area_struct *merge_new(struct vma_merge_struct *vmg) } /* - * Helper function which provides a wrapper around a merge existing VMA - * operation. - */ -static struct vm_area_struct *merge_existing(struct vma_merge_struct *vmg) -{ - struct vm_area_struct *vma; - - vma = vma_merge_existing_range(vmg); - if (vma) - vma_assert_attached(vma); - return vma; -} - -/* * Helper function which provides a wrapper around the expansion of an existing * VMA. */ @@ -173,8 +32,8 @@ static int expand_existing(struct vma_merge_struct *vmg) * Helper function to reset merge state the associated VMA iterator to a * specified new range. 
*/ -static void vmg_set_range(struct vma_merge_struct *vmg, unsigned long start, - unsigned long end, pgoff_t pgoff, vm_flags_t vm_flags) +void vmg_set_range(struct vma_merge_struct *vmg, unsigned long start, + unsigned long end, pgoff_t pgoff, vm_flags_t vm_flags) { vma_iter_set(vmg->vmi, start); @@ -197,8 +56,8 @@ static void vmg_set_range(struct vma_merge_struct *vmg, unsigned long start, /* Helper function to set both the VMG range and its anon_vma. */ static void vmg_set_range_anon_vma(struct vma_merge_struct *vmg, unsigned long start, - unsigned long end, pgoff_t pgoff, vm_flags_t vm_flags, - struct anon_vma *anon_vma) + unsigned long end, pgoff_t pgoff, vm_flags_t vm_flags, + struct anon_vma *anon_vma) { vmg_set_range(vmg, start, end, pgoff, vm_flags); vmg->anon_vma = anon_vma; @@ -211,10 +70,9 @@ static void vmg_set_range_anon_vma(struct vma_merge_struct *vmg, unsigned long s * VMA, link it to the maple tree and return it. */ static struct vm_area_struct *try_merge_new_vma(struct mm_struct *mm, - struct vma_merge_struct *vmg, - unsigned long start, unsigned long end, - pgoff_t pgoff, vm_flags_t vm_flags, - bool *was_merged) + struct vma_merge_struct *vmg, unsigned long start, + unsigned long end, pgoff_t pgoff, vm_flags_t vm_flags, + bool *was_merged) { struct vm_area_struct *merged; @@ -234,72 +92,6 @@ static struct vm_area_struct *try_merge_new_vma(struct mm_struct *mm, return alloc_and_link_vma(mm, start, end, pgoff, vm_flags); } -/* - * Helper function to reset the dummy anon_vma to indicate it has not been - * duplicated. - */ -static void reset_dummy_anon_vma(void) -{ - dummy_anon_vma.was_cloned = false; - dummy_anon_vma.was_unlinked = false; -} - -/* - * Helper function to remove all VMAs and destroy the maple tree associated with - * a virtual address space. Returns a count of VMAs in the tree. 
- */ -static int cleanup_mm(struct mm_struct *mm, struct vma_iterator *vmi) -{ - struct vm_area_struct *vma; - int count = 0; - - fail_prealloc = false; - reset_dummy_anon_vma(); - - vma_iter_set(vmi, 0); - for_each_vma(*vmi, vma) { - detach_free_vma(vma); - count++; - } - - mtree_destroy(&mm->mm_mt); - mm->map_count = 0; - return count; -} - -/* Helper function to determine if VMA has had vma_start_write() performed. */ -static bool vma_write_started(struct vm_area_struct *vma) -{ - int seq = vma->vm_lock_seq; - - /* We reset after each check. */ - vma->vm_lock_seq = UINT_MAX; - - /* The vma_start_write() stub simply increments this value. */ - return seq > -1; -} - -/* Helper function providing a dummy vm_ops->close() method.*/ -static void dummy_close(struct vm_area_struct *) -{ -} - -static void __vma_set_dummy_anon_vma(struct vm_area_struct *vma, - struct anon_vma_chain *avc, - struct anon_vma *anon_vma) -{ - vma->anon_vma = anon_vma; - INIT_LIST_HEAD(&vma->anon_vma_chain); - list_add(&avc->same_vma, &vma->anon_vma_chain); - avc->anon_vma = vma->anon_vma; -} - -static void vma_set_dummy_anon_vma(struct vm_area_struct *vma, - struct anon_vma_chain *avc) -{ - __vma_set_dummy_anon_vma(vma, avc, &dummy_anon_vma); -} - static bool test_simple_merge(void) { struct vm_area_struct *vma; @@ -1616,39 +1408,6 @@ static bool test_merge_extend(void) return true; } -static bool test_copy_vma(void) -{ - vm_flags_t vm_flags = VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE; - struct mm_struct mm = {}; - bool need_locks = false; - VMA_ITERATOR(vmi, &mm, 0); - struct vm_area_struct *vma, *vma_new, *vma_next; - - /* Move backwards and do not merge. 
*/ - - vma = alloc_and_link_vma(&mm, 0x3000, 0x5000, 3, vm_flags); - vma_new = copy_vma(&vma, 0, 0x2000, 0, &need_locks); - ASSERT_NE(vma_new, vma); - ASSERT_EQ(vma_new->vm_start, 0); - ASSERT_EQ(vma_new->vm_end, 0x2000); - ASSERT_EQ(vma_new->vm_pgoff, 0); - vma_assert_attached(vma_new); - - cleanup_mm(&mm, &vmi); - - /* Move a VMA into position next to another and merge the two. */ - - vma = alloc_and_link_vma(&mm, 0, 0x2000, 0, vm_flags); - vma_next = alloc_and_link_vma(&mm, 0x6000, 0x8000, 6, vm_flags); - vma_new = copy_vma(&vma, 0x4000, 0x2000, 4, &need_locks); - vma_assert_attached(vma_new); - - ASSERT_EQ(vma_new, vma_next); - - cleanup_mm(&mm, &vmi); - return true; -} - static bool test_expand_only_mode(void) { vm_flags_t vm_flags = VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE; @@ -1689,73 +1448,8 @@ static bool test_expand_only_mode(void) return true; } -static bool test_mmap_region_basic(void) -{ - struct mm_struct mm = {}; - unsigned long addr; - struct vm_area_struct *vma; - VMA_ITERATOR(vmi, &mm, 0); - - current->mm = &mm; - - /* Map at 0x300000, length 0x3000. */ - addr = __mmap_region(NULL, 0x300000, 0x3000, - VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE, - 0x300, NULL); - ASSERT_EQ(addr, 0x300000); - - /* Map at 0x250000, length 0x3000. */ - addr = __mmap_region(NULL, 0x250000, 0x3000, - VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE, - 0x250, NULL); - ASSERT_EQ(addr, 0x250000); - - /* Map at 0x303000, merging to 0x300000 of length 0x6000. */ - addr = __mmap_region(NULL, 0x303000, 0x3000, - VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE, - 0x303, NULL); - ASSERT_EQ(addr, 0x303000); - - /* Map at 0x24d000, merging to 0x250000 of length 0x6000. 
*/ - addr = __mmap_region(NULL, 0x24d000, 0x3000, - VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE, - 0x24d, NULL); - ASSERT_EQ(addr, 0x24d000); - - ASSERT_EQ(mm.map_count, 2); - - for_each_vma(vmi, vma) { - if (vma->vm_start == 0x300000) { - ASSERT_EQ(vma->vm_end, 0x306000); - ASSERT_EQ(vma->vm_pgoff, 0x300); - } else if (vma->vm_start == 0x24d000) { - ASSERT_EQ(vma->vm_end, 0x253000); - ASSERT_EQ(vma->vm_pgoff, 0x24d); - } else { - ASSERT_FALSE(true); - } - } - - cleanup_mm(&mm, &vmi); - return true; -} - -int main(void) +static void run_merge_tests(int *num_tests, int *num_fail) { - int num_tests = 0, num_fail = 0; - - maple_tree_init(); - vma_state_init(); - -#define TEST(name) \ - do { \ - num_tests++; \ - if (!test_##name()) { \ - num_fail++; \ - fprintf(stderr, "Test " #name " FAILED\n"); \ - } \ - } while (0) - /* Very simple tests to kick the tyres. */ TEST(simple_merge); TEST(simple_modify); @@ -1771,15 +1465,5 @@ int main(void) TEST(dup_anon_vma); TEST(vmi_prealloc_fail); TEST(merge_extend); - TEST(copy_vma); TEST(expand_only_mode); - - TEST(mmap_region_basic); - -#undef TEST - - printf("%d tests run, %d passed, %d failed.\n", - num_tests, num_tests - num_fail, num_fail); - - return num_fail == 0 ? EXIT_SUCCESS : EXIT_FAILURE; } diff --git a/tools/testing/vma/tests/mmap.c b/tools/testing/vma/tests/mmap.c new file mode 100644 index 000000000000..bded4ecbe5db --- /dev/null +++ b/tools/testing/vma/tests/mmap.c @@ -0,0 +1,57 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +static bool test_mmap_region_basic(void) +{ + struct mm_struct mm = {}; + unsigned long addr; + struct vm_area_struct *vma; + VMA_ITERATOR(vmi, &mm, 0); + + current->mm = &mm; + + /* Map at 0x300000, length 0x3000. */ + addr = __mmap_region(NULL, 0x300000, 0x3000, + VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE, + 0x300, NULL); + ASSERT_EQ(addr, 0x300000); + + /* Map at 0x250000, length 0x3000. 
*/ + addr = __mmap_region(NULL, 0x250000, 0x3000, + VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE, + 0x250, NULL); + ASSERT_EQ(addr, 0x250000); + + /* Map at 0x303000, merging to 0x300000 of length 0x6000. */ + addr = __mmap_region(NULL, 0x303000, 0x3000, + VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE, + 0x303, NULL); + ASSERT_EQ(addr, 0x303000); + + /* Map at 0x24d000, merging to 0x250000 of length 0x6000. */ + addr = __mmap_region(NULL, 0x24d000, 0x3000, + VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE, + 0x24d, NULL); + ASSERT_EQ(addr, 0x24d000); + + ASSERT_EQ(mm.map_count, 2); + + for_each_vma(vmi, vma) { + if (vma->vm_start == 0x300000) { + ASSERT_EQ(vma->vm_end, 0x306000); + ASSERT_EQ(vma->vm_pgoff, 0x300); + } else if (vma->vm_start == 0x24d000) { + ASSERT_EQ(vma->vm_end, 0x253000); + ASSERT_EQ(vma->vm_pgoff, 0x24d); + } else { + ASSERT_FALSE(true); + } + } + + cleanup_mm(&mm, &vmi); + return true; +} + +static void run_mmap_tests(int *num_tests, int *num_fail) +{ + TEST(mmap_region_basic); +} diff --git a/tools/testing/vma/tests/vma.c b/tools/testing/vma/tests/vma.c new file mode 100644 index 000000000000..c54ffc954f11 --- /dev/null +++ b/tools/testing/vma/tests/vma.c @@ -0,0 +1,339 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +static bool compare_legacy_flags(vm_flags_t legacy_flags, vma_flags_t flags) +{ + const unsigned long legacy_val = legacy_flags; + /* The lower word should contain the precise same value. */ + const unsigned long flags_lower = flags.__vma_flags[0]; +#if NUM_VMA_FLAGS > BITS_PER_LONG + int i; + + /* All bits in higher flag values should be zero. 
*/ + for (i = 1; i < NUM_VMA_FLAGS / BITS_PER_LONG; i++) { + if (flags.__vma_flags[i] != 0) + return false; + } +#endif + + static_assert(sizeof(legacy_flags) == sizeof(unsigned long)); + + return legacy_val == flags_lower; +} + +static bool test_copy_vma(void) +{ + vm_flags_t vm_flags = VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE; + struct mm_struct mm = {}; + bool need_locks = false; + VMA_ITERATOR(vmi, &mm, 0); + struct vm_area_struct *vma, *vma_new, *vma_next; + + /* Move backwards and do not merge. */ + + vma = alloc_and_link_vma(&mm, 0x3000, 0x5000, 3, vm_flags); + vma_new = copy_vma(&vma, 0, 0x2000, 0, &need_locks); + ASSERT_NE(vma_new, vma); + ASSERT_EQ(vma_new->vm_start, 0); + ASSERT_EQ(vma_new->vm_end, 0x2000); + ASSERT_EQ(vma_new->vm_pgoff, 0); + vma_assert_attached(vma_new); + + cleanup_mm(&mm, &vmi); + + /* Move a VMA into position next to another and merge the two. */ + + vma = alloc_and_link_vma(&mm, 0, 0x2000, 0, vm_flags); + vma_next = alloc_and_link_vma(&mm, 0x6000, 0x8000, 6, vm_flags); + vma_new = copy_vma(&vma, 0x4000, 0x2000, 4, &need_locks); + vma_assert_attached(vma_new); + + ASSERT_EQ(vma_new, vma_next); + + cleanup_mm(&mm, &vmi); + return true; +} + +static bool test_vma_flags_unchanged(void) +{ + vma_flags_t flags = EMPTY_VMA_FLAGS; + vm_flags_t legacy_flags = 0; + int bit; + struct vm_area_struct vma; + struct vm_area_desc desc; + + + vma.flags = EMPTY_VMA_FLAGS; + desc.vma_flags = EMPTY_VMA_FLAGS; + + for (bit = 0; bit < BITS_PER_LONG; bit++) { + vma_flags_t mask = mk_vma_flags(bit); + + legacy_flags |= (1UL << bit); + + /* Individual flags. */ + vma_flags_set(&flags, bit); + ASSERT_TRUE(compare_legacy_flags(legacy_flags, flags)); + + /* Via mask. */ + vma_flags_set_mask(&flags, mask); + ASSERT_TRUE(compare_legacy_flags(legacy_flags, flags)); + + /* Same for VMA. 
*/ + vma_set_flags(&vma, bit); + ASSERT_TRUE(compare_legacy_flags(legacy_flags, vma.flags)); + vma_set_flags_mask(&vma, mask); + ASSERT_TRUE(compare_legacy_flags(legacy_flags, vma.flags)); + + /* Same for VMA descriptor. */ + vma_desc_set_flags(&desc, bit); + ASSERT_TRUE(compare_legacy_flags(legacy_flags, desc.vma_flags)); + vma_desc_set_flags_mask(&desc, mask); + ASSERT_TRUE(compare_legacy_flags(legacy_flags, desc.vma_flags)); + } + + return true; +} + +static bool test_vma_flags_cleared(void) +{ + const vma_flags_t empty = EMPTY_VMA_FLAGS; + vma_flags_t flags; + int i; + + /* Set all bits high. */ + memset(&flags, 1, sizeof(flags)); + /* Try to clear. */ + vma_flags_clear_all(&flags); + /* Equal to EMPTY_VMA_FLAGS? */ + ASSERT_EQ(memcmp(&empty, &flags, sizeof(flags)), 0); + /* Make sure every unsigned long entry in bitmap array zero. */ + for (i = 0; i < sizeof(flags) / BITS_PER_LONG; i++) { + const unsigned long val = flags.__vma_flags[i]; + + ASSERT_EQ(val, 0); + } + + return true; +} + +/* + * Assert that VMA flag functions that operate at the system word level function + * correctly. + */ +static bool test_vma_flags_word(void) +{ + vma_flags_t flags = EMPTY_VMA_FLAGS; + const vma_flags_t comparison = + mk_vma_flags(VMA_READ_BIT, VMA_WRITE_BIT, 64, 65); + + /* Set some custom high flags. */ + vma_flags_set(&flags, 64, 65); + /* Now overwrite the first word. */ + vma_flags_overwrite_word(&flags, VM_READ | VM_WRITE); + /* Ensure they are equal. */ + ASSERT_EQ(memcmp(&flags, &comparison, sizeof(flags)), 0); + + flags = EMPTY_VMA_FLAGS; + vma_flags_set(&flags, 64, 65); + + /* Do the same with the _once() equivalent. */ + vma_flags_overwrite_word_once(&flags, VM_READ | VM_WRITE); + ASSERT_EQ(memcmp(&flags, &comparison, sizeof(flags)), 0); + + flags = EMPTY_VMA_FLAGS; + vma_flags_set(&flags, 64, 65); + + /* Make sure we can set a word without disturbing other bits. 
*/ + vma_flags_set(&flags, VMA_WRITE_BIT); + vma_flags_set_word(&flags, VM_READ); + ASSERT_EQ(memcmp(&flags, &comparison, sizeof(flags)), 0); + + flags = EMPTY_VMA_FLAGS; + vma_flags_set(&flags, 64, 65); + + /* Make sure we can clear a word without disturbing other bits. */ + vma_flags_set(&flags, VMA_READ_BIT, VMA_WRITE_BIT, VMA_EXEC_BIT); + vma_flags_clear_word(&flags, VM_EXEC); + ASSERT_EQ(memcmp(&flags, &comparison, sizeof(flags)), 0); + + return true; +} + +/* Ensure that vma_flags_test() and friends works correctly. */ +static bool test_vma_flags_test(void) +{ + const vma_flags_t flags = mk_vma_flags(VMA_READ_BIT, VMA_WRITE_BIT, + VMA_EXEC_BIT, 64, 65); + struct vm_area_struct vma; + struct vm_area_desc desc; + + vma.flags = flags; + desc.vma_flags = flags; + +#define do_test(...) \ + ASSERT_TRUE(vma_flags_test(&flags, __VA_ARGS__)); \ + ASSERT_TRUE(vma_desc_test_flags(&desc, __VA_ARGS__)) + +#define do_test_all_true(...) \ + ASSERT_TRUE(vma_flags_test_all(&flags, __VA_ARGS__)); \ + ASSERT_TRUE(vma_test_all_flags(&vma, __VA_ARGS__)) + +#define do_test_all_false(...) \ + ASSERT_FALSE(vma_flags_test_all(&flags, __VA_ARGS__)); \ + ASSERT_FALSE(vma_test_all_flags(&vma, __VA_ARGS__)) + + /* + * Testing for some flags that are present, some that are not - should + * pass. ANY flags matching should work. + */ + do_test(VMA_READ_BIT, VMA_MAYREAD_BIT, VMA_SEQ_READ_BIT); + /* However, the ...test_all() variant should NOT pass. */ + do_test_all_false(VMA_READ_BIT, VMA_MAYREAD_BIT, VMA_SEQ_READ_BIT); + /* But should pass for flags present. */ + do_test_all_true(VMA_READ_BIT, VMA_WRITE_BIT, VMA_EXEC_BIT, 64, 65); + /* Also subsets... */ + do_test_all_true(VMA_READ_BIT, VMA_WRITE_BIT, VMA_EXEC_BIT, 64); + do_test_all_true(VMA_READ_BIT, VMA_WRITE_BIT, VMA_EXEC_BIT); + do_test_all_true(VMA_READ_BIT, VMA_WRITE_BIT); + do_test_all_true(VMA_READ_BIT); + /* + * Check _mask variant. We don't need to test extensively as macro + * helper is the equivalent. 
+ */ + ASSERT_TRUE(vma_flags_test_mask(&flags, flags)); + ASSERT_TRUE(vma_flags_test_all_mask(&flags, flags)); + + /* Single bits. */ + do_test(VMA_READ_BIT); + do_test(VMA_WRITE_BIT); + do_test(VMA_EXEC_BIT); +#if NUM_VMA_FLAG_BITS > 64 + do_test(64); + do_test(65); +#endif + + /* Two bits. */ + do_test(VMA_READ_BIT, VMA_WRITE_BIT); + do_test(VMA_READ_BIT, VMA_EXEC_BIT); + do_test(VMA_WRITE_BIT, VMA_EXEC_BIT); + /* Ordering shouldn't matter. */ + do_test(VMA_WRITE_BIT, VMA_READ_BIT); + do_test(VMA_EXEC_BIT, VMA_READ_BIT); + do_test(VMA_EXEC_BIT, VMA_WRITE_BIT); +#if NUM_VMA_FLAG_BITS > 64 + do_test(VMA_READ_BIT, 64); + do_test(VMA_WRITE_BIT, 64); + do_test(64, VMA_READ_BIT); + do_test(64, VMA_WRITE_BIT); + do_test(VMA_READ_BIT, 65); + do_test(VMA_WRITE_BIT, 65); + do_test(65, VMA_READ_BIT); + do_test(65, VMA_WRITE_BIT); +#endif + /* Three bits. */ + do_test(VMA_READ_BIT, VMA_WRITE_BIT, VMA_EXEC_BIT); +#if NUM_VMA_FLAG_BITS > 64 + /* No need to consider every single permutation. */ + do_test(VMA_READ_BIT, VMA_WRITE_BIT, 64); + do_test(VMA_READ_BIT, VMA_WRITE_BIT, 65); + + /* Four bits. */ + do_test(VMA_READ_BIT, VMA_WRITE_BIT, VMA_EXEC_BIT, 64); + do_test(VMA_READ_BIT, VMA_WRITE_BIT, VMA_EXEC_BIT, 65); + + /* Five bits. */ + do_test(VMA_READ_BIT, VMA_WRITE_BIT, VMA_EXEC_BIT, 64, 65); +#endif + +#undef do_test +#undef do_test_all_true +#undef do_test_all_false + + return true; +} + +/* Ensure that vma_flags_clear() and friends works correctly. */ +static bool test_vma_flags_clear(void) +{ + vma_flags_t flags = mk_vma_flags(VMA_READ_BIT, VMA_WRITE_BIT, + VMA_EXEC_BIT, 64, 65); + vma_flags_t mask = mk_vma_flags(VMA_EXEC_BIT, 64); + struct vm_area_struct vma; + struct vm_area_desc desc; + + vma.flags = flags; + desc.vma_flags = flags; + + /* Cursory check of _mask() variant, as the helper macros imply. 
*/ + vma_flags_clear_mask(&flags, mask); + vma_flags_clear_mask(&vma.flags, mask); + vma_desc_clear_flags_mask(&desc, mask); + ASSERT_FALSE(vma_flags_test(&flags, VMA_EXEC_BIT, 64)); + ASSERT_FALSE(vma_flags_test(&vma.flags, VMA_EXEC_BIT, 64)); + ASSERT_FALSE(vma_desc_test_flags(&desc, VMA_EXEC_BIT, 64)); + /* Reset. */ + vma_flags_set(&flags, VMA_EXEC_BIT, 64); + vma_set_flags(&vma, VMA_EXEC_BIT, 64); + vma_desc_set_flags(&desc, VMA_EXEC_BIT, 64); + + /* + * Clear the flags and assert clear worked, then reset flags back to + * include specified flags. + */ +#define do_test_and_reset(...) \ + vma_flags_clear(&flags, __VA_ARGS__); \ + vma_flags_clear(&vma.flags, __VA_ARGS__); \ + vma_desc_clear_flags(&desc, __VA_ARGS__); \ + ASSERT_FALSE(vma_flags_test(&flags, __VA_ARGS__)); \ + ASSERT_FALSE(vma_flags_test(&vma.flags, __VA_ARGS__)); \ + ASSERT_FALSE(vma_desc_test_flags(&desc, __VA_ARGS__)); \ + vma_flags_set(&flags, __VA_ARGS__); \ + vma_set_flags(&vma, __VA_ARGS__); \ + vma_desc_set_flags(&desc, __VA_ARGS__) + + /* Single flags. */ + do_test_and_reset(VMA_READ_BIT); + do_test_and_reset(VMA_WRITE_BIT); + do_test_and_reset(VMA_EXEC_BIT); + do_test_and_reset(64); + do_test_and_reset(65); + + /* Two flags, in different orders. 
*/ + do_test_and_reset(VMA_READ_BIT, VMA_WRITE_BIT); + do_test_and_reset(VMA_READ_BIT, VMA_EXEC_BIT); + do_test_and_reset(VMA_READ_BIT, 64); + do_test_and_reset(VMA_READ_BIT, 65); + do_test_and_reset(VMA_WRITE_BIT, VMA_READ_BIT); + do_test_and_reset(VMA_WRITE_BIT, VMA_EXEC_BIT); + do_test_and_reset(VMA_WRITE_BIT, 64); + do_test_and_reset(VMA_WRITE_BIT, 65); + do_test_and_reset(VMA_EXEC_BIT, VMA_READ_BIT); + do_test_and_reset(VMA_EXEC_BIT, VMA_WRITE_BIT); + do_test_and_reset(VMA_EXEC_BIT, 64); + do_test_and_reset(VMA_EXEC_BIT, 65); + do_test_and_reset(64, VMA_READ_BIT); + do_test_and_reset(64, VMA_WRITE_BIT); + do_test_and_reset(64, VMA_EXEC_BIT); + do_test_and_reset(64, 65); + do_test_and_reset(65, VMA_READ_BIT); + do_test_and_reset(65, VMA_WRITE_BIT); + do_test_and_reset(65, VMA_EXEC_BIT); + do_test_and_reset(65, 64); + + /* Three flags. */ + +#undef do_test_some_missing +#undef do_test_and_reset + + return true; +} + +static void run_vma_tests(int *num_tests, int *num_fail) +{ + TEST(copy_vma); + TEST(vma_flags_unchanged); + TEST(vma_flags_cleared); + TEST(vma_flags_word); + TEST(vma_flags_test); + TEST(vma_flags_clear); +} diff --git a/tools/testing/vma/vma_internal.h b/tools/testing/vma/vma_internal.h index 7fa56dcc53a6..0e1121e2ef23 100644 --- a/tools/testing/vma/vma_internal.h +++ b/tools/testing/vma/vma_internal.h @@ -12,16 +12,18 @@ #ifndef __MM_VMA_INTERNAL_H #define __MM_VMA_INTERNAL_H -#define __private -#define __bitwise -#define __randomize_layout +#include <stdlib.h> #define CONFIG_MMU #define CONFIG_PER_VMA_LOCK -#include <stdlib.h> +#ifdef __CONCAT +#undef __CONCAT +#endif +#include <linux/args.h> #include <linux/atomic.h> +#include <linux/bitmap.h> #include <linux/list.h> #include <linux/maple_tree.h> #include <linux/mm.h> @@ -29,1839 +31,28 @@ #include <linux/refcount.h> #include <linux/slab.h> -extern unsigned long stack_guard_gap; -#ifdef CONFIG_MMU -extern unsigned long mmap_min_addr; -extern unsigned long dac_mmap_min_addr; -#else -#define 
mmap_min_addr 0UL -#define dac_mmap_min_addr 0UL -#endif - -#define VM_WARN_ON(_expr) (WARN_ON(_expr)) -#define VM_WARN_ON_ONCE(_expr) (WARN_ON_ONCE(_expr)) -#define VM_WARN_ON_VMG(_expr, _vmg) (WARN_ON(_expr)) -#define VM_BUG_ON(_expr) (BUG_ON(_expr)) -#define VM_BUG_ON_VMA(_expr, _vma) (BUG_ON(_expr)) - -#define MMF_HAS_MDWE 28 - -/* - * vm_flags in vm_area_struct, see mm_types.h. - * When changing, update also include/trace/events/mmflags.h - */ - -#define VM_NONE 0x00000000 - -/** - * typedef vma_flag_t - specifies an individual VMA flag by bit number. - * - * This value is made type safe by sparse to avoid passing invalid flag values - * around. - */ -typedef int __bitwise vma_flag_t; - -#define DECLARE_VMA_BIT(name, bitnum) \ - VMA_ ## name ## _BIT = ((__force vma_flag_t)bitnum) -#define DECLARE_VMA_BIT_ALIAS(name, aliased) \ - VMA_ ## name ## _BIT = VMA_ ## aliased ## _BIT -enum { - DECLARE_VMA_BIT(READ, 0), - DECLARE_VMA_BIT(WRITE, 1), - DECLARE_VMA_BIT(EXEC, 2), - DECLARE_VMA_BIT(SHARED, 3), - /* mprotect() hardcodes VM_MAYREAD >> 4 == VM_READ, and so for r/w/x bits. */ - DECLARE_VMA_BIT(MAYREAD, 4), /* limits for mprotect() etc. 
*/ - DECLARE_VMA_BIT(MAYWRITE, 5), - DECLARE_VMA_BIT(MAYEXEC, 6), - DECLARE_VMA_BIT(MAYSHARE, 7), - DECLARE_VMA_BIT(GROWSDOWN, 8), /* general info on the segment */ -#ifdef CONFIG_MMU - DECLARE_VMA_BIT(UFFD_MISSING, 9),/* missing pages tracking */ -#else - /* nommu: R/O MAP_PRIVATE mapping that might overlay a file mapping */ - DECLARE_VMA_BIT(MAYOVERLAY, 9), -#endif /* CONFIG_MMU */ - /* Page-ranges managed without "struct page", just pure PFN */ - DECLARE_VMA_BIT(PFNMAP, 10), - DECLARE_VMA_BIT(MAYBE_GUARD, 11), - DECLARE_VMA_BIT(UFFD_WP, 12), /* wrprotect pages tracking */ - DECLARE_VMA_BIT(LOCKED, 13), - DECLARE_VMA_BIT(IO, 14), /* Memory mapped I/O or similar */ - DECLARE_VMA_BIT(SEQ_READ, 15), /* App will access data sequentially */ - DECLARE_VMA_BIT(RAND_READ, 16), /* App will not benefit from clustered reads */ - DECLARE_VMA_BIT(DONTCOPY, 17), /* Do not copy this vma on fork */ - DECLARE_VMA_BIT(DONTEXPAND, 18),/* Cannot expand with mremap() */ - DECLARE_VMA_BIT(LOCKONFAULT, 19),/* Lock pages covered when faulted in */ - DECLARE_VMA_BIT(ACCOUNT, 20), /* Is a VM accounted object */ - DECLARE_VMA_BIT(NORESERVE, 21), /* should the VM suppress accounting */ - DECLARE_VMA_BIT(HUGETLB, 22), /* Huge TLB Page VM */ - DECLARE_VMA_BIT(SYNC, 23), /* Synchronous page faults */ - DECLARE_VMA_BIT(ARCH_1, 24), /* Architecture-specific flag */ - DECLARE_VMA_BIT(WIPEONFORK, 25),/* Wipe VMA contents in child. */ - DECLARE_VMA_BIT(DONTDUMP, 26), /* Do not include in the core dump */ - DECLARE_VMA_BIT(SOFTDIRTY, 27), /* NOT soft dirty clean area */ - DECLARE_VMA_BIT(MIXEDMAP, 28), /* Can contain struct page and pure PFN pages */ - DECLARE_VMA_BIT(HUGEPAGE, 29), /* MADV_HUGEPAGE marked this vma */ - DECLARE_VMA_BIT(NOHUGEPAGE, 30),/* MADV_NOHUGEPAGE marked this vma */ - DECLARE_VMA_BIT(MERGEABLE, 31), /* KSM may merge identical pages */ - /* These bits are reused, we define specific uses below. 
*/ - DECLARE_VMA_BIT(HIGH_ARCH_0, 32), - DECLARE_VMA_BIT(HIGH_ARCH_1, 33), - DECLARE_VMA_BIT(HIGH_ARCH_2, 34), - DECLARE_VMA_BIT(HIGH_ARCH_3, 35), - DECLARE_VMA_BIT(HIGH_ARCH_4, 36), - DECLARE_VMA_BIT(HIGH_ARCH_5, 37), - DECLARE_VMA_BIT(HIGH_ARCH_6, 38), - /* - * This flag is used to connect VFIO to arch specific KVM code. It - * indicates that the memory under this VMA is safe for use with any - * non-cachable memory type inside KVM. Some VFIO devices, on some - * platforms, are thought to be unsafe and can cause machine crashes - * if KVM does not lock down the memory type. - */ - DECLARE_VMA_BIT(ALLOW_ANY_UNCACHED, 39), -#ifdef CONFIG_PPC32 - DECLARE_VMA_BIT_ALIAS(DROPPABLE, ARCH_1), -#else - DECLARE_VMA_BIT(DROPPABLE, 40), -#endif - DECLARE_VMA_BIT(UFFD_MINOR, 41), - DECLARE_VMA_BIT(SEALED, 42), - /* Flags that reuse flags above. */ - DECLARE_VMA_BIT_ALIAS(PKEY_BIT0, HIGH_ARCH_0), - DECLARE_VMA_BIT_ALIAS(PKEY_BIT1, HIGH_ARCH_1), - DECLARE_VMA_BIT_ALIAS(PKEY_BIT2, HIGH_ARCH_2), - DECLARE_VMA_BIT_ALIAS(PKEY_BIT3, HIGH_ARCH_3), - DECLARE_VMA_BIT_ALIAS(PKEY_BIT4, HIGH_ARCH_4), -#if defined(CONFIG_X86_USER_SHADOW_STACK) - /* - * VM_SHADOW_STACK should not be set with VM_SHARED because of lack of - * support core mm. - * - * These VMAs will get a single end guard page. This helps userspace - * protect itself from attacks. A single page is enough for current - * shadow stack archs (x86). See the comments near alloc_shstk() in - * arch/x86/kernel/shstk.c for more details on the guard size. - */ - DECLARE_VMA_BIT_ALIAS(SHADOW_STACK, HIGH_ARCH_5), -#elif defined(CONFIG_ARM64_GCS) - /* - * arm64's Guarded Control Stack implements similar functionality and - * has similar constraints to shadow stacks. 
- */ - DECLARE_VMA_BIT_ALIAS(SHADOW_STACK, HIGH_ARCH_6), -#endif - DECLARE_VMA_BIT_ALIAS(SAO, ARCH_1), /* Strong Access Ordering (powerpc) */ - DECLARE_VMA_BIT_ALIAS(GROWSUP, ARCH_1), /* parisc */ - DECLARE_VMA_BIT_ALIAS(SPARC_ADI, ARCH_1), /* sparc64 */ - DECLARE_VMA_BIT_ALIAS(ARM64_BTI, ARCH_1), /* arm64 */ - DECLARE_VMA_BIT_ALIAS(ARCH_CLEAR, ARCH_1), /* sparc64, arm64 */ - DECLARE_VMA_BIT_ALIAS(MAPPED_COPY, ARCH_1), /* !CONFIG_MMU */ - DECLARE_VMA_BIT_ALIAS(MTE, HIGH_ARCH_4), /* arm64 */ - DECLARE_VMA_BIT_ALIAS(MTE_ALLOWED, HIGH_ARCH_5),/* arm64 */ -#ifdef CONFIG_STACK_GROWSUP - DECLARE_VMA_BIT_ALIAS(STACK, GROWSUP), - DECLARE_VMA_BIT_ALIAS(STACK_EARLY, GROWSDOWN), -#else - DECLARE_VMA_BIT_ALIAS(STACK, GROWSDOWN), -#endif -}; - -#define INIT_VM_FLAG(name) BIT((__force int) VMA_ ## name ## _BIT) -#define VM_READ INIT_VM_FLAG(READ) -#define VM_WRITE INIT_VM_FLAG(WRITE) -#define VM_EXEC INIT_VM_FLAG(EXEC) -#define VM_SHARED INIT_VM_FLAG(SHARED) -#define VM_MAYREAD INIT_VM_FLAG(MAYREAD) -#define VM_MAYWRITE INIT_VM_FLAG(MAYWRITE) -#define VM_MAYEXEC INIT_VM_FLAG(MAYEXEC) -#define VM_MAYSHARE INIT_VM_FLAG(MAYSHARE) -#define VM_GROWSDOWN INIT_VM_FLAG(GROWSDOWN) -#ifdef CONFIG_MMU -#define VM_UFFD_MISSING INIT_VM_FLAG(UFFD_MISSING) -#else -#define VM_UFFD_MISSING VM_NONE -#define VM_MAYOVERLAY INIT_VM_FLAG(MAYOVERLAY) -#endif -#define VM_PFNMAP INIT_VM_FLAG(PFNMAP) -#define VM_MAYBE_GUARD INIT_VM_FLAG(MAYBE_GUARD) -#define VM_UFFD_WP INIT_VM_FLAG(UFFD_WP) -#define VM_LOCKED INIT_VM_FLAG(LOCKED) -#define VM_IO INIT_VM_FLAG(IO) -#define VM_SEQ_READ INIT_VM_FLAG(SEQ_READ) -#define VM_RAND_READ INIT_VM_FLAG(RAND_READ) -#define VM_DONTCOPY INIT_VM_FLAG(DONTCOPY) -#define VM_DONTEXPAND INIT_VM_FLAG(DONTEXPAND) -#define VM_LOCKONFAULT INIT_VM_FLAG(LOCKONFAULT) -#define VM_ACCOUNT INIT_VM_FLAG(ACCOUNT) -#define VM_NORESERVE INIT_VM_FLAG(NORESERVE) -#define VM_HUGETLB INIT_VM_FLAG(HUGETLB) -#define VM_SYNC INIT_VM_FLAG(SYNC) -#define VM_ARCH_1 INIT_VM_FLAG(ARCH_1) -#define 
VM_WIPEONFORK INIT_VM_FLAG(WIPEONFORK) -#define VM_DONTDUMP INIT_VM_FLAG(DONTDUMP) -#ifdef CONFIG_MEM_SOFT_DIRTY -#define VM_SOFTDIRTY INIT_VM_FLAG(SOFTDIRTY) -#else -#define VM_SOFTDIRTY VM_NONE -#endif -#define VM_MIXEDMAP INIT_VM_FLAG(MIXEDMAP) -#define VM_HUGEPAGE INIT_VM_FLAG(HUGEPAGE) -#define VM_NOHUGEPAGE INIT_VM_FLAG(NOHUGEPAGE) -#define VM_MERGEABLE INIT_VM_FLAG(MERGEABLE) -#define VM_STACK INIT_VM_FLAG(STACK) -#ifdef CONFIG_STACK_GROWS_UP -#define VM_STACK_EARLY INIT_VM_FLAG(STACK_EARLY) -#else -#define VM_STACK_EARLY VM_NONE -#endif -#ifdef CONFIG_ARCH_HAS_PKEYS -#define VM_PKEY_SHIFT ((__force int)VMA_HIGH_ARCH_0_BIT) -/* Despite the naming, these are FLAGS not bits. */ -#define VM_PKEY_BIT0 INIT_VM_FLAG(PKEY_BIT0) -#define VM_PKEY_BIT1 INIT_VM_FLAG(PKEY_BIT1) -#define VM_PKEY_BIT2 INIT_VM_FLAG(PKEY_BIT2) -#if CONFIG_ARCH_PKEY_BITS > 3 -#define VM_PKEY_BIT3 INIT_VM_FLAG(PKEY_BIT3) -#else -#define VM_PKEY_BIT3 VM_NONE -#endif /* CONFIG_ARCH_PKEY_BITS > 3 */ -#if CONFIG_ARCH_PKEY_BITS > 4 -#define VM_PKEY_BIT4 INIT_VM_FLAG(PKEY_BIT4) -#else -#define VM_PKEY_BIT4 VM_NONE -#endif /* CONFIG_ARCH_PKEY_BITS > 4 */ -#endif /* CONFIG_ARCH_HAS_PKEYS */ -#if defined(CONFIG_X86_USER_SHADOW_STACK) || defined(CONFIG_ARM64_GCS) -#define VM_SHADOW_STACK INIT_VM_FLAG(SHADOW_STACK) -#else -#define VM_SHADOW_STACK VM_NONE -#endif -#if defined(CONFIG_PPC64) -#define VM_SAO INIT_VM_FLAG(SAO) -#elif defined(CONFIG_PARISC) -#define VM_GROWSUP INIT_VM_FLAG(GROWSUP) -#elif defined(CONFIG_SPARC64) -#define VM_SPARC_ADI INIT_VM_FLAG(SPARC_ADI) -#define VM_ARCH_CLEAR INIT_VM_FLAG(ARCH_CLEAR) -#elif defined(CONFIG_ARM64) -#define VM_ARM64_BTI INIT_VM_FLAG(ARM64_BTI) -#define VM_ARCH_CLEAR INIT_VM_FLAG(ARCH_CLEAR) -#elif !defined(CONFIG_MMU) -#define VM_MAPPED_COPY INIT_VM_FLAG(MAPPED_COPY) -#endif -#ifndef VM_GROWSUP -#define VM_GROWSUP VM_NONE -#endif -#ifdef CONFIG_ARM64_MTE -#define VM_MTE INIT_VM_FLAG(MTE) -#define VM_MTE_ALLOWED INIT_VM_FLAG(MTE_ALLOWED) -#else -#define 
VM_MTE VM_NONE -#define VM_MTE_ALLOWED VM_NONE -#endif -#ifdef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR -#define VM_UFFD_MINOR INIT_VM_FLAG(UFFD_MINOR) -#else -#define VM_UFFD_MINOR VM_NONE -#endif -#ifdef CONFIG_64BIT -#define VM_ALLOW_ANY_UNCACHED INIT_VM_FLAG(ALLOW_ANY_UNCACHED) -#define VM_SEALED INIT_VM_FLAG(SEALED) -#else -#define VM_ALLOW_ANY_UNCACHED VM_NONE -#define VM_SEALED VM_NONE -#endif -#if defined(CONFIG_64BIT) || defined(CONFIG_PPC32) -#define VM_DROPPABLE INIT_VM_FLAG(DROPPABLE) -#else -#define VM_DROPPABLE VM_NONE -#endif - -/* Bits set in the VMA until the stack is in its final location */ -#define VM_STACK_INCOMPLETE_SETUP (VM_RAND_READ | VM_SEQ_READ | VM_STACK_EARLY) - -#define TASK_EXEC ((current->personality & READ_IMPLIES_EXEC) ? VM_EXEC : 0) - -/* Common data flag combinations */ -#define VM_DATA_FLAGS_TSK_EXEC (VM_READ | VM_WRITE | TASK_EXEC | \ - VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC) -#define VM_DATA_FLAGS_NON_EXEC (VM_READ | VM_WRITE | VM_MAYREAD | \ - VM_MAYWRITE | VM_MAYEXEC) -#define VM_DATA_FLAGS_EXEC (VM_READ | VM_WRITE | VM_EXEC | \ - VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC) - -#ifndef VM_DATA_DEFAULT_FLAGS /* arch can override this */ -#define VM_DATA_DEFAULT_FLAGS VM_DATA_FLAGS_EXEC -#endif - -#ifndef VM_STACK_DEFAULT_FLAGS /* arch can override this */ -#define VM_STACK_DEFAULT_FLAGS VM_DATA_DEFAULT_FLAGS -#endif - -#define VM_STARTGAP_FLAGS (VM_GROWSDOWN | VM_SHADOW_STACK) - -#define VM_STACK_FLAGS (VM_STACK | VM_STACK_DEFAULT_FLAGS | VM_ACCOUNT) - -/* VMA basic access permission flags */ -#define VM_ACCESS_FLAGS (VM_READ | VM_WRITE | VM_EXEC) - -/* - * Special vmas that are non-mergable, non-mlock()able. 
- */ -#define VM_SPECIAL (VM_IO | VM_DONTEXPAND | VM_PFNMAP | VM_MIXEDMAP) - -#define DEFAULT_MAP_WINDOW ((1UL << 47) - PAGE_SIZE) -#define TASK_SIZE_LOW DEFAULT_MAP_WINDOW -#define TASK_SIZE_MAX DEFAULT_MAP_WINDOW -#define STACK_TOP TASK_SIZE_LOW -#define STACK_TOP_MAX TASK_SIZE_MAX - -/* This mask represents all the VMA flag bits used by mlock */ -#define VM_LOCKED_MASK (VM_LOCKED | VM_LOCKONFAULT) - -#define TASK_EXEC ((current->personality & READ_IMPLIES_EXEC) ? VM_EXEC : 0) - -#define VM_DATA_FLAGS_TSK_EXEC (VM_READ | VM_WRITE | TASK_EXEC | \ - VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC) - -#define RLIMIT_STACK 3 /* max stack size */ -#define RLIMIT_MEMLOCK 8 /* max locked-in-memory address space */ - -#define CAP_IPC_LOCK 14 - -/* - * Flags which should be 'sticky' on merge - that is, flags which, when one VMA - * possesses it but the other does not, the merged VMA should nonetheless have - * applied to it: - * - * VM_SOFTDIRTY - if a VMA is marked soft-dirty, that is has not had its - * references cleared via /proc/$pid/clear_refs, any merged VMA - * should be considered soft-dirty also as it operates at a VMA - * granularity. - */ -#define VM_STICKY (VM_SOFTDIRTY | VM_MAYBE_GUARD) - -/* - * VMA flags we ignore for the purposes of merge, i.e. one VMA possessing one - * of these flags and the other not does not preclude a merge. - * - * VM_STICKY - When merging VMAs, VMA flags must match, unless they are - * 'sticky'. If any sticky flags exist in either VMA, we simply - * set all of them on the merged VMA. - */ -#define VM_IGNORE_MERGE VM_STICKY - -/* - * Flags which should result in page tables being copied on fork. These are - * flags which indicate that the VMA maps page tables which cannot be - * reconsistuted upon page fault, so necessitate page table copying upon - * - * VM_PFNMAP / VM_MIXEDMAP - These contain kernel-mapped data which cannot be - * reasonably reconstructed on page fault. 
- * - * VM_UFFD_WP - Encodes metadata about an installed uffd - * write protect handler, which cannot be - * reconstructed on page fault. - * - * We always copy pgtables when dst_vma has uffd-wp - * enabled even if it's file-backed - * (e.g. shmem). Because when uffd-wp is enabled, - * pgtable contains uffd-wp protection information, - * that's something we can't retrieve from page cache, - * and skip copying will lose those info. - * - * VM_MAYBE_GUARD - Could contain page guard region markers which - * by design are a property of the page tables - * only and thus cannot be reconstructed on page - * fault. - */ -#define VM_COPY_ON_FORK (VM_PFNMAP | VM_MIXEDMAP | VM_UFFD_WP | VM_MAYBE_GUARD) - -#define FIRST_USER_ADDRESS 0UL -#define USER_PGTABLES_CEILING 0UL - -#define vma_policy(vma) NULL - -#define down_write_nest_lock(sem, nest_lock) - -#define pgprot_val(x) ((x).pgprot) -#define __pgprot(x) ((pgprot_t) { (x) } ) - -#define for_each_vma(__vmi, __vma) \ - while (((__vma) = vma_next(&(__vmi))) != NULL) - -/* The MM code likes to work with exclusive end addresses */ -#define for_each_vma_range(__vmi, __vma, __end) \ - while (((__vma) = vma_find(&(__vmi), (__end))) != NULL) - -#define offset_in_page(p) ((unsigned long)(p) & ~PAGE_MASK) - -#define PHYS_PFN(x) ((unsigned long)((x) >> PAGE_SHIFT)) - -#define test_and_set_bit(nr, addr) __test_and_set_bit(nr, addr) -#define test_and_clear_bit(nr, addr) __test_and_clear_bit(nr, addr) - -#define TASK_SIZE ((1ul << 47)-PAGE_SIZE) - -#define AS_MM_ALL_LOCKS 2 - -/* We hardcode this for now. */ -#define sysctl_max_map_count 0x1000000UL - -#define pgoff_t unsigned long -typedef unsigned long pgprotval_t; -typedef struct pgprot { pgprotval_t pgprot; } pgprot_t; -typedef unsigned long vm_flags_t; -typedef __bitwise unsigned int vm_fault_t; - -/* - * The shared stubs do not implement this, it amounts to an fprintf(STDERR,...) 
- * either way :) - */ -#define pr_warn_once pr_err - -#define data_race(expr) expr - -#define ASSERT_EXCLUSIVE_WRITER(x) - -#define pgtable_supports_soft_dirty() 1 - -/** - * swap - swap values of @a and @b - * @a: first value - * @b: second value - */ -#define swap(a, b) \ - do { typeof(a) __tmp = (a); (a) = (b); (b) = __tmp; } while (0) - -struct kref { - refcount_t refcount; -}; - -/* - * Define the task command name length as enum, then it can be visible to - * BPF programs. - */ -enum { - TASK_COMM_LEN = 16, -}; - /* - * Flags for bug emulation. - * - * These occupy the top three bytes. + * DUPLICATE typedef definitions from kernel source that have to be declared + * ahead of all other headers. */ -enum { - READ_IMPLIES_EXEC = 0x0400000, -}; - -struct task_struct { - char comm[TASK_COMM_LEN]; - pid_t pid; - struct mm_struct *mm; - - /* Used for emulating ABI behavior of previous Linux versions: */ - unsigned int personality; -}; - -struct task_struct *get_current(void); -#define current get_current() - -struct anon_vma { - struct anon_vma *root; - struct rb_root_cached rb_root; - - /* Test fields. */ - bool was_cloned; - bool was_unlinked; -}; - -struct anon_vma_chain { - struct anon_vma *anon_vma; - struct list_head same_vma; -}; - -struct anon_vma_name { - struct kref kref; - /* The name needs to be at the end because it is dynamically sized. 
*/ - char name[]; -}; - -struct vma_iterator { - struct ma_state mas; -}; - -#define VMA_ITERATOR(name, __mm, __addr) \ - struct vma_iterator name = { \ - .mas = { \ - .tree = &(__mm)->mm_mt, \ - .index = __addr, \ - .node = NULL, \ - .status = ma_start, \ - }, \ - } - -struct address_space { - struct rb_root_cached i_mmap; - unsigned long flags; - atomic_t i_mmap_writable; -}; - -struct vm_userfaultfd_ctx {}; -struct mempolicy {}; -struct mmu_gather {}; -struct mutex {}; -#define DEFINE_MUTEX(mutexname) \ - struct mutex mutexname = {} - -#define DECLARE_BITMAP(name, bits) \ - unsigned long name[BITS_TO_LONGS(bits)] - -#define NUM_MM_FLAG_BITS (64) +#define __private +/* NUM_MM_FLAG_BITS defined by test code. */ typedef struct { __private DECLARE_BITMAP(__mm_flags, NUM_MM_FLAG_BITS); } mm_flags_t; - -/* - * Opaque type representing current VMA (vm_area_struct) flag state. Must be - * accessed via vma_flags_xxx() helper functions. - */ -#define NUM_VMA_FLAG_BITS BITS_PER_LONG +/* NUM_VMA_FLAG_BITS defined by test code. */ typedef struct { DECLARE_BITMAP(__vma_flags, NUM_VMA_FLAG_BITS); } __private vma_flags_t; -struct mm_struct { - struct maple_tree mm_mt; - int map_count; /* number of VMAs */ - unsigned long total_vm; /* Total pages mapped */ - unsigned long locked_vm; /* Pages that have PG_mlocked set */ - unsigned long data_vm; /* VM_WRITE & ~VM_SHARED & ~VM_STACK */ - unsigned long exec_vm; /* VM_EXEC & ~VM_WRITE & ~VM_STACK */ - unsigned long stack_vm; /* VM_STACK */ - - unsigned long def_flags; - - mm_flags_t flags; /* Must use mm_flags_* helpers to access */ -}; - -struct vm_area_struct; - - -/* What action should be taken after an .mmap_prepare call is complete? */ -enum mmap_action_type { - MMAP_NOTHING, /* Mapping is complete, no further action. */ - MMAP_REMAP_PFN, /* Remap PFN range. */ - MMAP_IO_REMAP_PFN, /* I/O remap PFN range. */ -}; - -/* - * Describes an action an mmap_prepare hook can instruct to be taken to complete - * the mapping of a VMA. 
Specified in vm_area_desc. - */ -struct mmap_action { - union { - /* Remap range. */ - struct { - unsigned long start; - unsigned long start_pfn; - unsigned long size; - pgprot_t pgprot; - } remap; - }; - enum mmap_action_type type; - - /* - * If specified, this hook is invoked after the selected action has been - * successfully completed. Note that the VMA write lock still held. - * - * The absolute minimum ought to be done here. - * - * Returns 0 on success, or an error code. - */ - int (*success_hook)(const struct vm_area_struct *vma); - - /* - * If specified, this hook is invoked when an error occurred when - * attempting the selection action. - * - * The hook can return an error code in order to filter the error, but - * it is not valid to clear the error here. - */ - int (*error_hook)(int err); - - /* - * This should be set in rare instances where the operation required - * that the rmap should not be able to access the VMA until - * completely set up. - */ - bool hide_from_rmap_until_complete :1; -}; - -/* Operations which modify VMAs. */ -enum vma_operation { - VMA_OP_SPLIT, - VMA_OP_MERGE_UNFAULTED, - VMA_OP_REMAP, - VMA_OP_FORK, -}; - -/* - * Describes a VMA that is about to be mmap()'ed. Drivers may choose to - * manipulate mutable fields which will cause those fields to be updated in the - * resultant VMA. - * - * Helper functions are not required for manipulating any field. - */ -struct vm_area_desc { - /* Immutable state. */ - const struct mm_struct *const mm; - struct file *const file; /* May vary from vm_file in stacked callers. */ - unsigned long start; - unsigned long end; - - /* Mutable fields. Populated with initial state. */ - pgoff_t pgoff; - struct file *vm_file; - union { - vm_flags_t vm_flags; - vma_flags_t vma_flags; - }; - pgprot_t page_prot; - - /* Write-only fields. */ - const struct vm_operations_struct *vm_ops; - void *private_data; - - /* Take further action? 
*/ - struct mmap_action action; -}; - -struct file_operations { - int (*mmap)(struct file *, struct vm_area_struct *); - int (*mmap_prepare)(struct vm_area_desc *); -}; - -struct file { - struct address_space *f_mapping; - const struct file_operations *f_op; -}; - -#define VMA_LOCK_OFFSET 0x40000000 - -typedef struct { unsigned long v; } freeptr_t; - -struct vm_area_struct { - /* The first cache line has the info for VMA tree walking. */ - - union { - struct { - /* VMA covers [vm_start; vm_end) addresses within mm */ - unsigned long vm_start; - unsigned long vm_end; - }; - freeptr_t vm_freeptr; /* Pointer used by SLAB_TYPESAFE_BY_RCU */ - }; - - struct mm_struct *vm_mm; /* The address space we belong to. */ - pgprot_t vm_page_prot; /* Access permissions of this VMA. */ - - /* - * Flags, see mm.h. - * To modify use vm_flags_{init|reset|set|clear|mod} functions. - */ - union { - const vm_flags_t vm_flags; - vma_flags_t flags; - }; - -#ifdef CONFIG_PER_VMA_LOCK - /* - * Can only be written (using WRITE_ONCE()) while holding both: - * - mmap_lock (in write mode) - * - vm_refcnt bit at VMA_LOCK_OFFSET is set - * Can be read reliably while holding one of: - * - mmap_lock (in read or write mode) - * - vm_refcnt bit at VMA_LOCK_OFFSET is set or vm_refcnt > 1 - * Can be read unreliably (using READ_ONCE()) for pessimistic bailout - * while holding nothing (except RCU to keep the VMA struct allocated). - * - * This sequence counter is explicitly allowed to overflow; sequence - * counter reuse can only lead to occasional unnecessary use of the - * slowpath. - */ - unsigned int vm_lock_seq; -#endif - - /* - * A file's MAP_PRIVATE vma can be in both i_mmap tree and anon_vma - * list, after a COW of one of the file pages. A MAP_SHARED vma - * can only be in the i_mmap tree. An anonymous MAP_PRIVATE, stack - * or brk vma (with NULL file) can only be in an anon_vma list. 
- */ - struct list_head anon_vma_chain; /* Serialized by mmap_lock & - * page_table_lock */ - struct anon_vma *anon_vma; /* Serialized by page_table_lock */ - - /* Function pointers to deal with this struct. */ - const struct vm_operations_struct *vm_ops; - - /* Information about our backing store: */ - unsigned long vm_pgoff; /* Offset (within vm_file) in PAGE_SIZE - units */ - struct file * vm_file; /* File we map to (can be NULL). */ - void * vm_private_data; /* was vm_pte (shared mem) */ - -#ifdef CONFIG_SWAP - atomic_long_t swap_readahead_info; -#endif -#ifndef CONFIG_MMU - struct vm_region *vm_region; /* NOMMU mapping region */ -#endif -#ifdef CONFIG_NUMA - struct mempolicy *vm_policy; /* NUMA policy for the VMA */ -#endif -#ifdef CONFIG_NUMA_BALANCING - struct vma_numab_state *numab_state; /* NUMA Balancing state */ -#endif -#ifdef CONFIG_PER_VMA_LOCK - /* Unstable RCU readers are allowed to read this. */ - refcount_t vm_refcnt; -#endif - /* - * For areas with an address space and backing store, - * linkage into the address_space->i_mmap interval tree. - * - */ - struct { - struct rb_node rb; - unsigned long rb_subtree_last; - } shared; -#ifdef CONFIG_ANON_VMA_NAME - /* - * For private and shared anonymous mappings, a pointer to a null - * terminated string containing the name given to the vma, or NULL if - * unnamed. Serialized by mmap_lock. Use anon_vma_name to access. - */ - struct anon_vma_name *anon_name; -#endif - struct vm_userfaultfd_ctx vm_userfaultfd_ctx; -} __randomize_layout; - -struct vm_fault {}; - -struct vm_operations_struct { - void (*open)(struct vm_area_struct * area); - /** - * @close: Called when the VMA is being removed from the MM. - * Context: User context. May sleep. Caller holds mmap_lock. 
- */ - void (*close)(struct vm_area_struct * area); - /* Called any time before splitting to check if it's allowed */ - int (*may_split)(struct vm_area_struct *area, unsigned long addr); - int (*mremap)(struct vm_area_struct *area); - /* - * Called by mprotect() to make driver-specific permission - * checks before mprotect() is finalised. The VMA must not - * be modified. Returns 0 if mprotect() can proceed. - */ - int (*mprotect)(struct vm_area_struct *vma, unsigned long start, - unsigned long end, unsigned long newflags); - vm_fault_t (*fault)(struct vm_fault *vmf); - vm_fault_t (*huge_fault)(struct vm_fault *vmf, unsigned int order); - vm_fault_t (*map_pages)(struct vm_fault *vmf, - pgoff_t start_pgoff, pgoff_t end_pgoff); - unsigned long (*pagesize)(struct vm_area_struct * area); - - /* notification that a previously read-only page is about to become - * writable, if an error is returned it will cause a SIGBUS */ - vm_fault_t (*page_mkwrite)(struct vm_fault *vmf); - - /* same as page_mkwrite when using VM_PFNMAP|VM_MIXEDMAP */ - vm_fault_t (*pfn_mkwrite)(struct vm_fault *vmf); - - /* called by access_process_vm when get_user_pages() fails, typically - * for use by special VMAs. See also generic_access_phys() for a generic - * implementation useful for any iomem mapping. - */ - int (*access)(struct vm_area_struct *vma, unsigned long addr, - void *buf, int len, int write); - - /* Called by the /proc/PID/maps code to ask the vma whether it - * has a special name. Returning non-NULL will also cause this - * vma to be dumped unconditionally. */ - const char *(*name)(struct vm_area_struct *vma); - -#ifdef CONFIG_NUMA - /* - * set_policy() op must add a reference to any non-NULL @new mempolicy - * to hold the policy upon return. Caller should pass NULL @new to - * remove a policy and fall back to surrounding context--i.e. do not - * install a MPOL_DEFAULT policy, nor the task or system default - * mempolicy. 
- */ - int (*set_policy)(struct vm_area_struct *vma, struct mempolicy *new); - - /* - * get_policy() op must add reference [mpol_get()] to any policy at - * (vma,addr) marked as MPOL_SHARED. The shared policy infrastructure - * in mm/mempolicy.c will do this automatically. - * get_policy() must NOT add a ref if the policy at (vma,addr) is not - * marked as MPOL_SHARED. vma policies are protected by the mmap_lock. - * If no [shared/vma] mempolicy exists at the addr, get_policy() op - * must return NULL--i.e., do not "fallback" to task or system default - * policy. - */ - struct mempolicy *(*get_policy)(struct vm_area_struct *vma, - unsigned long addr, pgoff_t *ilx); -#endif -#ifdef CONFIG_FIND_NORMAL_PAGE - /* - * Called by vm_normal_page() for special PTEs in @vma at @addr. This - * allows for returning a "normal" page from vm_normal_page() even - * though the PTE indicates that the "struct page" either does not exist - * or should not be touched: "special". - * - * Do not add new users: this really only works when a "normal" page - * was mapped, but then the PTE got changed to something weird (+ - * marked special) that would not make pte_pfn() identify the originally - * inserted page. - */ - struct page *(*find_normal_page)(struct vm_area_struct *vma, - unsigned long addr); -#endif /* CONFIG_FIND_NORMAL_PAGE */ -}; - -struct vm_unmapped_area_info { -#define VM_UNMAPPED_AREA_TOPDOWN 1 - unsigned long flags; - unsigned long length; - unsigned long low_limit; - unsigned long high_limit; - unsigned long align_mask; - unsigned long align_offset; - unsigned long start_gap; -}; - -struct pagetable_move_control { - struct vm_area_struct *old; /* Source VMA. */ - struct vm_area_struct *new; /* Destination VMA. */ - unsigned long old_addr; /* Address from which the move begins. */ - unsigned long old_end; /* Exclusive address at which old range ends. */ - unsigned long new_addr; /* Address to move page tables to. 
*/ - unsigned long len_in; /* Bytes to remap specified by user. */ - - bool need_rmap_locks; /* Do rmap locks need to be taken? */ - bool for_stack; /* Is this an early temp stack being moved? */ -}; - -#define PAGETABLE_MOVE(name, old_, new_, old_addr_, new_addr_, len_) \ - struct pagetable_move_control name = { \ - .old = old_, \ - .new = new_, \ - .old_addr = old_addr_, \ - .old_end = (old_addr_) + (len_), \ - .new_addr = new_addr_, \ - .len_in = len_, \ - } - -static inline void vma_iter_invalidate(struct vma_iterator *vmi) -{ - mas_pause(&vmi->mas); -} - -static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot) -{ - return __pgprot(pgprot_val(oldprot) | pgprot_val(newprot)); -} - -static inline pgprot_t vm_get_page_prot(vm_flags_t vm_flags) -{ - return __pgprot(vm_flags); -} - -static inline bool is_shared_maywrite(vm_flags_t vm_flags) -{ - return (vm_flags & (VM_SHARED | VM_MAYWRITE)) == - (VM_SHARED | VM_MAYWRITE); -} - -static inline bool vma_is_shared_maywrite(struct vm_area_struct *vma) -{ - return is_shared_maywrite(vma->vm_flags); -} - -static inline struct vm_area_struct *vma_next(struct vma_iterator *vmi) -{ - /* - * Uses mas_find() to get the first VMA when the iterator starts. - * Calling mas_next() could skip the first entry. - */ - return mas_find(&vmi->mas, ULONG_MAX); -} - -/* - * WARNING: to avoid racing with vma_mark_attached()/vma_mark_detached(), these - * assertions should be made either under mmap_write_lock or when the object - * has been isolated under mmap_write_lock, ensuring no competing writers. 
- */ -static inline void vma_assert_attached(struct vm_area_struct *vma) -{ - WARN_ON_ONCE(!refcount_read(&vma->vm_refcnt)); -} - -static inline void vma_assert_detached(struct vm_area_struct *vma) -{ - WARN_ON_ONCE(refcount_read(&vma->vm_refcnt)); -} - -static inline void vma_assert_write_locked(struct vm_area_struct *); -static inline void vma_mark_attached(struct vm_area_struct *vma) -{ - vma_assert_write_locked(vma); - vma_assert_detached(vma); - refcount_set_release(&vma->vm_refcnt, 1); -} - -static inline void vma_mark_detached(struct vm_area_struct *vma) -{ - vma_assert_write_locked(vma); - vma_assert_attached(vma); - /* We are the only writer, so no need to use vma_refcount_put(). */ - if (unlikely(!refcount_dec_and_test(&vma->vm_refcnt))) { - /* - * Reader must have temporarily raised vm_refcnt but it will - * drop it without using the vma since vma is write-locked. - */ - } -} - -extern const struct vm_operations_struct vma_dummy_vm_ops; - -extern unsigned long rlimit(unsigned int limit); - -static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm) -{ - memset(vma, 0, sizeof(*vma)); - vma->vm_mm = mm; - vma->vm_ops = &vma_dummy_vm_ops; - INIT_LIST_HEAD(&vma->anon_vma_chain); - vma->vm_lock_seq = UINT_MAX; -} - -/* - * These are defined in vma.h, but sadly vm_stat_account() is referenced by - * kernel/fork.c, so we have to these broadly available there, and temporarily - * define them here to resolve the dependency cycle. 
- */ - -#define is_exec_mapping(flags) \ - ((flags & (VM_EXEC | VM_WRITE | VM_STACK)) == VM_EXEC) - -#define is_stack_mapping(flags) \ - (((flags & VM_STACK) == VM_STACK) || (flags & VM_SHADOW_STACK)) - -#define is_data_mapping(flags) \ - ((flags & (VM_WRITE | VM_SHARED | VM_STACK)) == VM_WRITE) - -static inline void vm_stat_account(struct mm_struct *mm, vm_flags_t flags, - long npages) -{ - WRITE_ONCE(mm->total_vm, READ_ONCE(mm->total_vm)+npages); - - if (is_exec_mapping(flags)) - mm->exec_vm += npages; - else if (is_stack_mapping(flags)) - mm->stack_vm += npages; - else if (is_data_mapping(flags)) - mm->data_vm += npages; -} - -#undef is_exec_mapping -#undef is_stack_mapping -#undef is_data_mapping - -/* Currently stubbed but we may later wish to un-stub. */ -static inline void vm_acct_memory(long pages); -static inline void vm_unacct_memory(long pages) -{ - vm_acct_memory(-pages); -} - -static inline void mapping_allow_writable(struct address_space *mapping) -{ - atomic_inc(&mapping->i_mmap_writable); -} - -static inline void vma_set_range(struct vm_area_struct *vma, - unsigned long start, unsigned long end, - pgoff_t pgoff) -{ - vma->vm_start = start; - vma->vm_end = end; - vma->vm_pgoff = pgoff; -} - -static inline -struct vm_area_struct *vma_find(struct vma_iterator *vmi, unsigned long max) -{ - return mas_find(&vmi->mas, max - 1); -} - -static inline int vma_iter_clear_gfp(struct vma_iterator *vmi, - unsigned long start, unsigned long end, gfp_t gfp) -{ - __mas_set_range(&vmi->mas, start, end - 1); - mas_store_gfp(&vmi->mas, NULL, gfp); - if (unlikely(mas_is_err(&vmi->mas))) - return -ENOMEM; - - return 0; -} - -static inline void mmap_assert_locked(struct mm_struct *); -static inline struct vm_area_struct *find_vma_intersection(struct mm_struct *mm, - unsigned long start_addr, - unsigned long end_addr) -{ - unsigned long index = start_addr; - - mmap_assert_locked(mm); - return mt_find(&mm->mm_mt, &index, end_addr - 1); -} - -static inline -struct 
vm_area_struct *vma_lookup(struct mm_struct *mm, unsigned long addr) -{ - return mtree_load(&mm->mm_mt, addr); -} - -static inline struct vm_area_struct *vma_prev(struct vma_iterator *vmi) -{ - return mas_prev(&vmi->mas, 0); -} - -static inline void vma_iter_set(struct vma_iterator *vmi, unsigned long addr) -{ - mas_set(&vmi->mas, addr); -} - -static inline bool vma_is_anonymous(struct vm_area_struct *vma) -{ - return !vma->vm_ops; -} - -/* Defined in vma.h, so temporarily define here to avoid circular dependency. */ -#define vma_iter_load(vmi) \ - mas_walk(&(vmi)->mas) - -static inline struct vm_area_struct * -find_vma_prev(struct mm_struct *mm, unsigned long addr, - struct vm_area_struct **pprev) -{ - struct vm_area_struct *vma; - VMA_ITERATOR(vmi, mm, addr); - - vma = vma_iter_load(&vmi); - *pprev = vma_prev(&vmi); - if (!vma) - vma = vma_next(&vmi); - return vma; -} - -#undef vma_iter_load - -static inline void vma_iter_init(struct vma_iterator *vmi, - struct mm_struct *mm, unsigned long addr) -{ - mas_init(&vmi->mas, &mm->mm_mt, addr); -} - -/* Stubbed functions. 
*/ - -static inline struct anon_vma_name *anon_vma_name(struct vm_area_struct *vma) -{ - return NULL; -} - -static inline bool is_mergeable_vm_userfaultfd_ctx(struct vm_area_struct *vma, - struct vm_userfaultfd_ctx vm_ctx) -{ - return true; -} - -static inline bool anon_vma_name_eq(struct anon_vma_name *anon_name1, - struct anon_vma_name *anon_name2) -{ - return true; -} - -static inline void might_sleep(void) -{ -} - -static inline unsigned long vma_pages(struct vm_area_struct *vma) -{ - return (vma->vm_end - vma->vm_start) >> PAGE_SHIFT; -} - -static inline void fput(struct file *file) -{ -} - -static inline void mpol_put(struct mempolicy *pol) -{ -} - -static inline void lru_add_drain(void) -{ -} - -static inline void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm) -{ -} - -static inline void update_hiwater_rss(struct mm_struct *mm) -{ -} - -static inline void update_hiwater_vm(struct mm_struct *mm) -{ -} - -static inline void unmap_vmas(struct mmu_gather *tlb, struct ma_state *mas, - struct vm_area_struct *vma, unsigned long start_addr, - unsigned long end_addr, unsigned long tree_end) -{ -} - -static inline void free_pgtables(struct mmu_gather *tlb, struct ma_state *mas, - struct vm_area_struct *vma, unsigned long floor, - unsigned long ceiling, bool mm_wr_locked) -{ -} - -static inline void mapping_unmap_writable(struct address_space *mapping) -{ -} - -static inline void flush_dcache_mmap_lock(struct address_space *mapping) -{ -} - -static inline void tlb_finish_mmu(struct mmu_gather *tlb) -{ -} - -static inline struct file *get_file(struct file *f) -{ - return f; -} - -static inline int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst) -{ - return 0; -} - -static inline int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src, - enum vma_operation operation) -{ - /* For testing purposes. We indicate that an anon_vma has been cloned. 
*/ - if (src->anon_vma != NULL) { - dst->anon_vma = src->anon_vma; - dst->anon_vma->was_cloned = true; - } - - return 0; -} - -static inline void vma_start_write(struct vm_area_struct *vma) -{ - /* Used to indicate to tests that a write operation has begun. */ - vma->vm_lock_seq++; -} - -static inline __must_check -int vma_start_write_killable(struct vm_area_struct *vma) -{ - /* Used to indicate to tests that a write operation has begun. */ - vma->vm_lock_seq++; - return 0; -} - -static inline void vma_adjust_trans_huge(struct vm_area_struct *vma, - unsigned long start, - unsigned long end, - struct vm_area_struct *next) -{ -} - -static inline void hugetlb_split(struct vm_area_struct *, unsigned long) {} - -static inline void vma_iter_free(struct vma_iterator *vmi) -{ - mas_destroy(&vmi->mas); -} - -static inline -struct vm_area_struct *vma_iter_next_range(struct vma_iterator *vmi) -{ - return mas_next_range(&vmi->mas, ULONG_MAX); -} - -static inline void vm_acct_memory(long pages) -{ -} - -static inline void vma_interval_tree_insert(struct vm_area_struct *vma, - struct rb_root_cached *rb) -{ -} - -static inline void vma_interval_tree_remove(struct vm_area_struct *vma, - struct rb_root_cached *rb) -{ -} - -static inline void flush_dcache_mmap_unlock(struct address_space *mapping) -{ -} - -static inline void anon_vma_interval_tree_insert(struct anon_vma_chain *avc, - struct rb_root_cached *rb) -{ -} - -static inline void anon_vma_interval_tree_remove(struct anon_vma_chain *avc, - struct rb_root_cached *rb) -{ -} - -static inline void uprobe_mmap(struct vm_area_struct *vma) -{ -} - -static inline void uprobe_munmap(struct vm_area_struct *vma, - unsigned long start, unsigned long end) -{ -} - -static inline void i_mmap_lock_write(struct address_space *mapping) -{ -} - -static inline void anon_vma_lock_write(struct anon_vma *anon_vma) -{ -} - -static inline void vma_assert_write_locked(struct vm_area_struct *vma) -{ -} - -static inline void unlink_anon_vmas(struct 
vm_area_struct *vma) -{ - /* For testing purposes, indicate that the anon_vma was unlinked. */ - vma->anon_vma->was_unlinked = true; -} - -static inline void anon_vma_unlock_write(struct anon_vma *anon_vma) -{ -} - -static inline void i_mmap_unlock_write(struct address_space *mapping) -{ -} - -static inline int userfaultfd_unmap_prep(struct vm_area_struct *vma, - unsigned long start, - unsigned long end, - struct list_head *unmaps) -{ - return 0; -} - -static inline void mmap_write_downgrade(struct mm_struct *mm) -{ -} - -static inline void mmap_read_unlock(struct mm_struct *mm) -{ -} - -static inline void mmap_write_unlock(struct mm_struct *mm) -{ -} - -static inline int mmap_write_lock_killable(struct mm_struct *mm) -{ - return 0; -} - -static inline bool can_modify_mm(struct mm_struct *mm, - unsigned long start, - unsigned long end) -{ - return true; -} - -static inline void arch_unmap(struct mm_struct *mm, - unsigned long start, - unsigned long end) -{ -} - -static inline void mmap_assert_locked(struct mm_struct *mm) -{ -} - -static inline bool mpol_equal(struct mempolicy *a, struct mempolicy *b) -{ - return true; -} - -static inline void khugepaged_enter_vma(struct vm_area_struct *vma, - vm_flags_t vm_flags) -{ -} - -static inline bool mapping_can_writeback(struct address_space *mapping) -{ - return true; -} - -static inline bool is_vm_hugetlb_page(struct vm_area_struct *vma) -{ - return false; -} - -static inline bool vma_soft_dirty_enabled(struct vm_area_struct *vma) -{ - return false; -} - -static inline bool userfaultfd_wp(struct vm_area_struct *vma) -{ - return false; -} - -static inline void mmap_assert_write_locked(struct mm_struct *mm) -{ -} - -static inline void mutex_lock(struct mutex *lock) -{ -} - -static inline void mutex_unlock(struct mutex *lock) -{ -} - -static inline bool mutex_is_locked(struct mutex *lock) -{ - return true; -} - -static inline bool signal_pending(void *p) -{ - return false; -} - -static inline bool is_file_hugepages(struct 
file *file) -{ - return false; -} - -static inline int security_vm_enough_memory_mm(struct mm_struct *mm, long pages) -{ - return 0; -} - -static inline bool may_expand_vm(struct mm_struct *mm, vm_flags_t flags, - unsigned long npages) -{ - return true; -} - -static inline int shmem_zero_setup(struct vm_area_struct *vma) -{ - return 0; -} - -static inline void vma_set_anonymous(struct vm_area_struct *vma) -{ - vma->vm_ops = NULL; -} - -static inline void ksm_add_vma(struct vm_area_struct *vma) -{ -} - -static inline void perf_event_mmap(struct vm_area_struct *vma) -{ -} - -static inline bool vma_is_dax(struct vm_area_struct *vma) -{ - return false; -} - -static inline struct vm_area_struct *get_gate_vma(struct mm_struct *mm) -{ - return NULL; -} - -bool vma_wants_writenotify(struct vm_area_struct *vma, pgprot_t vm_page_prot); - -/* Update vma->vm_page_prot to reflect vma->vm_flags. */ -static inline void vma_set_page_prot(struct vm_area_struct *vma) -{ - vm_flags_t vm_flags = vma->vm_flags; - pgprot_t vm_page_prot; - - /* testing: we inline vm_pgprot_modify() to avoid clash with vma.h. */ - vm_page_prot = pgprot_modify(vma->vm_page_prot, vm_get_page_prot(vm_flags)); - - if (vma_wants_writenotify(vma, vm_page_prot)) { - vm_flags &= ~VM_SHARED; - /* testing: we inline vm_pgprot_modify() to avoid clash with vma.h. 
*/ - vm_page_prot = pgprot_modify(vm_page_prot, vm_get_page_prot(vm_flags)); - } - /* remove_protection_ptes reads vma->vm_page_prot without mmap_lock */ - WRITE_ONCE(vma->vm_page_prot, vm_page_prot); -} - -static inline bool arch_validate_flags(vm_flags_t flags) -{ - return true; -} - -static inline void vma_close(struct vm_area_struct *vma) -{ -} - -static inline int mmap_file(struct file *file, struct vm_area_struct *vma) -{ - return 0; -} - -static inline unsigned long stack_guard_start_gap(struct vm_area_struct *vma) -{ - if (vma->vm_flags & VM_GROWSDOWN) - return stack_guard_gap; - - /* See reasoning around the VM_SHADOW_STACK definition */ - if (vma->vm_flags & VM_SHADOW_STACK) - return PAGE_SIZE; - - return 0; -} - -static inline unsigned long vm_start_gap(struct vm_area_struct *vma) -{ - unsigned long gap = stack_guard_start_gap(vma); - unsigned long vm_start = vma->vm_start; - - vm_start -= gap; - if (vm_start > vma->vm_start) - vm_start = 0; - return vm_start; -} - -static inline unsigned long vm_end_gap(struct vm_area_struct *vma) -{ - unsigned long vm_end = vma->vm_end; - - if (vma->vm_flags & VM_GROWSUP) { - vm_end += stack_guard_gap; - if (vm_end < vma->vm_end) - vm_end = -PAGE_SIZE; - } - return vm_end; -} - -static inline int is_hugepage_only_range(struct mm_struct *mm, - unsigned long addr, unsigned long len) -{ - return 0; -} - -static inline bool vma_is_accessible(struct vm_area_struct *vma) -{ - return vma->vm_flags & VM_ACCESS_FLAGS; -} - -static inline bool capable(int cap) -{ - return true; -} - -static inline bool mlock_future_ok(const struct mm_struct *mm, - vm_flags_t vm_flags, unsigned long bytes) -{ - unsigned long locked_pages, limit_pages; - - if (!(vm_flags & VM_LOCKED) || capable(CAP_IPC_LOCK)) - return true; - - locked_pages = bytes >> PAGE_SHIFT; - locked_pages += mm->locked_vm; - - limit_pages = rlimit(RLIMIT_MEMLOCK); - limit_pages >>= PAGE_SHIFT; - - return locked_pages <= limit_pages; -} - -static inline int 
__anon_vma_prepare(struct vm_area_struct *vma) -{ - struct anon_vma *anon_vma = calloc(1, sizeof(struct anon_vma)); - - if (!anon_vma) - return -ENOMEM; - - anon_vma->root = anon_vma; - vma->anon_vma = anon_vma; - - return 0; -} - -static inline int anon_vma_prepare(struct vm_area_struct *vma) -{ - if (likely(vma->anon_vma)) - return 0; - - return __anon_vma_prepare(vma); -} - -static inline void userfaultfd_unmap_complete(struct mm_struct *mm, - struct list_head *uf) -{ -} - -#define ACCESS_PRIVATE(p, member) ((p)->member) - -#define bitmap_size(nbits) (ALIGN(nbits, BITS_PER_LONG) / BITS_PER_BYTE) - -static __always_inline void bitmap_zero(unsigned long *dst, unsigned int nbits) -{ - unsigned int len = bitmap_size(nbits); - - if (small_const_nbits(nbits)) - *dst = 0; - else - memset(dst, 0, len); -} - -static inline bool mm_flags_test(int flag, const struct mm_struct *mm) -{ - return test_bit(flag, ACCESS_PRIVATE(&mm->flags, __mm_flags)); -} - -/* Clears all bits in the VMA flags bitmap, non-atomically. */ -static inline void vma_flags_clear_all(vma_flags_t *flags) -{ - bitmap_zero(ACCESS_PRIVATE(flags, __vma_flags), NUM_VMA_FLAG_BITS); -} - -/* - * Copy value to the first system word of VMA flags, non-atomically. - * - * IMPORTANT: This does not overwrite bytes past the first system word. The - * caller must account for this. - */ -static inline void vma_flags_overwrite_word(vma_flags_t *flags, unsigned long value) -{ - *ACCESS_PRIVATE(flags, __vma_flags) = value; -} - -/* - * Copy value to the first system word of VMA flags ONCE, non-atomically. - * - * IMPORTANT: This does not overwrite bytes past the first system word. The - * caller must account for this. - */ -static inline void vma_flags_overwrite_word_once(vma_flags_t *flags, unsigned long value) -{ - unsigned long *bitmap = ACCESS_PRIVATE(flags, __vma_flags); - - WRITE_ONCE(*bitmap, value); -} - -/* Update the first system word of VMA flags setting bits, non-atomically. 
*/ -static inline void vma_flags_set_word(vma_flags_t *flags, unsigned long value) -{ - unsigned long *bitmap = ACCESS_PRIVATE(flags, __vma_flags); - - *bitmap |= value; -} - -/* Update the first system word of VMA flags clearing bits, non-atomically. */ -static inline void vma_flags_clear_word(vma_flags_t *flags, unsigned long value) -{ - unsigned long *bitmap = ACCESS_PRIVATE(flags, __vma_flags); - - *bitmap &= ~value; -} - - -/* Use when VMA is not part of the VMA tree and needs no locking */ -static inline void vm_flags_init(struct vm_area_struct *vma, - vm_flags_t flags) -{ - vma_flags_clear_all(&vma->flags); - vma_flags_overwrite_word(&vma->flags, flags); -} - -/* - * Use when VMA is part of the VMA tree and modifications need coordination - * Note: vm_flags_reset and vm_flags_reset_once do not lock the vma and - * it should be locked explicitly beforehand. - */ -static inline void vm_flags_reset(struct vm_area_struct *vma, - vm_flags_t flags) -{ - vma_assert_write_locked(vma); - vm_flags_init(vma, flags); -} - -static inline void vm_flags_reset_once(struct vm_area_struct *vma, - vm_flags_t flags) -{ - vma_assert_write_locked(vma); - /* - * The user should only be interested in avoiding reordering of - * assignment to the first word. - */ - vma_flags_clear_all(&vma->flags); - vma_flags_overwrite_word_once(&vma->flags, flags); -} - -static inline void vm_flags_set(struct vm_area_struct *vma, - vm_flags_t flags) -{ - vma_start_write(vma); - vma_flags_set_word(&vma->flags, flags); -} - -static inline void vm_flags_clear(struct vm_area_struct *vma, - vm_flags_t flags) -{ - vma_start_write(vma); - vma_flags_clear_word(&vma->flags, flags); -} - -/* - * Denies creating a writable executable mapping or gaining executable permissions. 
- * - * This denies the following: - * - * a) mmap(PROT_WRITE | PROT_EXEC) - * - * b) mmap(PROT_WRITE) - * mprotect(PROT_EXEC) - * - * c) mmap(PROT_WRITE) - * mprotect(PROT_READ) - * mprotect(PROT_EXEC) - * - * But allows the following: - * - * d) mmap(PROT_READ | PROT_EXEC) - * mmap(PROT_READ | PROT_EXEC | PROT_BTI) - * - * This is only applicable if the user has set the Memory-Deny-Write-Execute - * (MDWE) protection mask for the current process. - * - * @old specifies the VMA flags the VMA originally possessed, and @new the ones - * we propose to set. - * - * Return: false if proposed change is OK, true if not ok and should be denied. - */ -static inline bool map_deny_write_exec(unsigned long old, unsigned long new) -{ - /* If MDWE is disabled, we have nothing to deny. */ - if (mm_flags_test(MMF_HAS_MDWE, current->mm)) - return false; - - /* If the new VMA is not executable, we have nothing to deny. */ - if (!(new & VM_EXEC)) - return false; - - /* Under MDWE we do not accept newly writably executable VMAs... */ - if (new & VM_WRITE) - return true; - - /* ...nor previously non-executable VMAs becoming executable. */ - if (!(old & VM_EXEC)) - return true; - - return false; -} - -static inline int mapping_map_writable(struct address_space *mapping) -{ - return atomic_inc_unless_negative(&mapping->i_mmap_writable) ? 
- 0 : -EPERM; -} - -static inline unsigned long move_page_tables(struct pagetable_move_control *pmc) -{ - return 0; -} - -static inline void free_pgd_range(struct mmu_gather *tlb, - unsigned long addr, unsigned long end, - unsigned long floor, unsigned long ceiling) -{ -} - -static inline int ksm_execve(struct mm_struct *mm) -{ - return 0; -} - -static inline void ksm_exit(struct mm_struct *mm) -{ -} - -static inline void vma_lock_init(struct vm_area_struct *vma, bool reset_refcnt) -{ - if (reset_refcnt) - refcount_set(&vma->vm_refcnt, 0); -} - -static inline void vma_numab_state_init(struct vm_area_struct *vma) -{ -} - -static inline void vma_numab_state_free(struct vm_area_struct *vma) -{ -} - -static inline void dup_anon_vma_name(struct vm_area_struct *orig_vma, - struct vm_area_struct *new_vma) -{ -} - -static inline void free_anon_vma_name(struct vm_area_struct *vma) -{ -} - -/* Declared in vma.h. */ -static inline void set_vma_from_desc(struct vm_area_struct *vma, - struct vm_area_desc *desc); - -static inline void mmap_action_prepare(struct mmap_action *action, - struct vm_area_desc *desc) -{ -} - -static inline int mmap_action_complete(struct mmap_action *action, - struct vm_area_struct *vma) -{ - return 0; -} - -static inline int __compat_vma_mmap(const struct file_operations *f_op, - struct file *file, struct vm_area_struct *vma) -{ - struct vm_area_desc desc = { - .mm = vma->vm_mm, - .file = file, - .start = vma->vm_start, - .end = vma->vm_end, - - .pgoff = vma->vm_pgoff, - .vm_file = vma->vm_file, - .vm_flags = vma->vm_flags, - .page_prot = vma->vm_page_prot, - - .action.type = MMAP_NOTHING, /* Default */ - }; - int err; - - err = f_op->mmap_prepare(&desc); - if (err) - return err; - - mmap_action_prepare(&desc.action, &desc); - set_vma_from_desc(vma, &desc); - return mmap_action_complete(&desc.action, vma); -} - -static inline int compat_vma_mmap(struct file *file, - struct vm_area_struct *vma) -{ - return __compat_vma_mmap(file->f_op, file, vma); -} - 
-/* Did the driver provide valid mmap hook configuration? */ -static inline bool can_mmap_file(struct file *file) -{ - bool has_mmap = file->f_op->mmap; - bool has_mmap_prepare = file->f_op->mmap_prepare; - - /* Hooks are mutually exclusive. */ - if (WARN_ON_ONCE(has_mmap && has_mmap_prepare)) - return false; - if (!has_mmap && !has_mmap_prepare) - return false; - - return true; -} - -static inline int vfs_mmap(struct file *file, struct vm_area_struct *vma) -{ - if (file->f_op->mmap_prepare) - return compat_vma_mmap(file, vma); - - return file->f_op->mmap(file, vma); -} - -static inline int vfs_mmap_prepare(struct file *file, struct vm_area_desc *desc) -{ - return file->f_op->mmap_prepare(desc); -} - -static inline void fixup_hugetlb_reservations(struct vm_area_struct *vma) -{ -} - -static inline void vma_set_file(struct vm_area_struct *vma, struct file *file) -{ - /* Changing an anonymous vma with this is illegal */ - get_file(file); - swap(vma->vm_file, file); - fput(file); -} - -static inline bool shmem_file(struct file *file) -{ - return false; -} - -static inline vm_flags_t ksm_vma_flags(const struct mm_struct *mm, - const struct file *file, vm_flags_t vm_flags) -{ - return vm_flags; -} - -static inline void remap_pfn_range_prepare(struct vm_area_desc *desc, unsigned long pfn) -{ -} - -static inline int remap_pfn_range_complete(struct vm_area_struct *vma, unsigned long addr, - unsigned long pfn, unsigned long size, pgprot_t pgprot) -{ - return 0; -} +typedef unsigned long vm_flags_t; +#define pgoff_t unsigned long +typedef unsigned long pgprotval_t; +typedef struct pgprot { pgprotval_t pgprot; } pgprot_t; +typedef __bitwise unsigned int vm_fault_t; -static inline int do_munmap(struct mm_struct *, unsigned long, size_t, - struct list_head *uf) -{ - return 0; -} +#include "include/stubs.h" +#include "include/dup.h" +#include "include/custom.h" #endif /* __MM_VMA_INTERNAL_H */ |
