summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2026-02-19 07:50:32 +0300
committerLinus Torvalds <torvalds@linux-foundation.org>2026-02-19 07:50:32 +0300
commiteeccf287a2a517954b57cf9d733b3cf5d47afa34 (patch)
tree46b5cd55d8da25cbc9aa96b38470506958851005
parent956b9cbd7f156c8672dac94a00de3c6a0939c692 (diff)
parentac1ea219590c09572ed5992dc233bbf7bb70fef9 (diff)
downloadlinux-eeccf287a2a517954b57cf9d733b3cf5d47afa34.tar.xz
Merge tag 'mm-stable-2026-02-18-19-48' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
Pull more MM updates from Andrew Morton: - "mm/vmscan: fix demotion targets checks in reclaim/demotion" fixes a couple of issues in the demotion code - pages were failing demotion and were finding themselves demoted into disallowed nodes (Bing Jiao) - "Remove XA_ZERO from error recovery of dup_mmap()" fixes a rare maple tree race and performs a number of cleanups (Liam Howlett) - "mm: add bitmap VMA flag helpers and convert all mmap_prepare to use them" implements a lot of cleanups following on from the conversion of the VMA flags into a bitmap (Lorenzo Stoakes) - "support batch checking of references and unmapping for large folios" implements batching to greatly improve the performance of reclaiming clean file-backed large folios (Baolin Wang) - "selftests/mm: add memory failure selftests" does as claimed (Miaohe Lin) * tag 'mm-stable-2026-02-18-19-48' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm: (36 commits) mm/page_alloc: clear page->private in free_pages_prepare() selftests/mm: add memory failure dirty pagecache test selftests/mm: add memory failure clean pagecache test selftests/mm: add memory failure anonymous page test mm: rmap: support batched unmapping for file large folios arm64: mm: implement the architecture-specific clear_flush_young_ptes() arm64: mm: support batch clearing of the young flag for large folios arm64: mm: factor out the address and ptep alignment into a new helper mm: rmap: support batched checks of the references for large folios tools/testing/vma: add VMA userland tests for VMA flag functions tools/testing/vma: separate out vma_internal.h into logical headers tools/testing/vma: separate VMA userland tests into separate files mm: make vm_area_desc utilise vma_flags_t only mm: update all remaining mmap_prepare users to use vma_flags_t mm: update shmem_[kernel]_file_*() functions to use vma_flags_t mm: update secretmem to use VMA flags on mmap_prepare mm: update hugetlbfs to use VMA flags on mmap_prepare mm: add basic VMA flag 
operation helper functions tools: bitmap: add missing bitmap_[subset(), andnot()] mm: add mk_vma_flags() bitmap flag macro helper ...
-rw-r--r--MAINTAINERS1
-rw-r--r--arch/arm64/include/asm/pgtable.h23
-rw-r--r--arch/arm64/mm/contpte.c62
-rw-r--r--arch/x86/kernel/cpu/sgx/ioctl.c2
-rw-r--r--drivers/char/mem.c6
-rw-r--r--drivers/dax/device.c10
-rw-r--r--drivers/gpu/drm/drm_gem.c5
-rw-r--r--drivers/gpu/drm/i915/gem/i915_gem_shmem.c2
-rw-r--r--drivers/gpu/drm/i915/gem/i915_gem_ttm.c3
-rw-r--r--drivers/gpu/drm/i915/gt/shmem_utils.c3
-rw-r--r--drivers/gpu/drm/ttm/tests/ttm_tt_test.c2
-rw-r--r--drivers/gpu/drm/ttm/ttm_backup.c3
-rw-r--r--drivers/gpu/drm/ttm/ttm_tt.c2
-rw-r--r--fs/aio.c2
-rw-r--r--fs/erofs/data.c5
-rw-r--r--fs/ext4/file.c4
-rw-r--r--fs/hugetlbfs/inode.c14
-rw-r--r--fs/ntfs3/file.c2
-rw-r--r--fs/orangefs/file.c4
-rw-r--r--fs/ramfs/file-nommu.c2
-rw-r--r--fs/resctrl/pseudo_lock.c2
-rw-r--r--fs/romfs/mmap-nommu.c2
-rw-r--r--fs/xfs/scrub/xfile.c3
-rw-r--r--fs/xfs/xfs_buf_mem.c2
-rw-r--r--fs/xfs/xfs_file.c4
-rw-r--r--fs/zonefs/file.c3
-rw-r--r--include/linux/cpuset.h6
-rw-r--r--include/linux/dax.h8
-rw-r--r--include/linux/hugetlb.h6
-rw-r--r--include/linux/hugetlb_inline.h10
-rw-r--r--include/linux/memcontrol.h6
-rw-r--r--include/linux/memory-tiers.h6
-rw-r--r--include/linux/mm.h252
-rw-r--r--include/linux/mm_types.h23
-rw-r--r--include/linux/mmu_notifier.h9
-rw-r--r--include/linux/pgtable.h73
-rw-r--r--include/linux/shmem_fs.h8
-rw-r--r--ipc/shm.c12
-rw-r--r--kernel/cgroup/cpuset.c54
-rw-r--r--kernel/relay.c2
-rw-r--r--mm/filemap.c2
-rw-r--r--mm/hugetlb.c22
-rw-r--r--mm/internal.h10
-rw-r--r--mm/khugepaged.c2
-rw-r--r--mm/madvise.c2
-rw-r--r--mm/memcontrol.c16
-rw-r--r--mm/memfd.c6
-rw-r--r--mm/memory-tiers.c21
-rw-r--r--mm/memory.c101
-rw-r--r--mm/mmap.c110
-rw-r--r--mm/mremap.c2
-rw-r--r--mm/page_alloc.c1
-rw-r--r--mm/rmap.c38
-rw-r--r--mm/secretmem.c7
-rw-r--r--mm/shmem.c61
-rw-r--r--mm/util.c2
-rw-r--r--mm/vma.c67
-rw-r--r--mm/vma.h73
-rw-r--r--mm/vma_internal.h1
-rw-r--r--mm/vmscan.c31
-rw-r--r--security/keys/big_key.c2
-rw-r--r--tools/include/linux/bitmap.h22
-rw-r--r--tools/lib/bitmap.c29
-rw-r--r--tools/testing/selftests/mm/.gitignore1
-rw-r--r--tools/testing/selftests/mm/Makefile2
-rw-r--r--tools/testing/selftests/mm/config2
-rwxr-xr-xtools/testing/selftests/mm/ksft_memory_failure.sh4
-rw-r--r--tools/testing/selftests/mm/memory-failure.c359
-rwxr-xr-xtools/testing/selftests/mm/run_vmtests.sh21
-rw-r--r--tools/testing/selftests/mm/vm_util.c41
-rw-r--r--tools/testing/selftests/mm/vm_util.h3
-rw-r--r--tools/testing/vma/Makefile7
-rw-r--r--tools/testing/vma/include/custom.h119
-rw-r--r--tools/testing/vma/include/dup.h1320
-rw-r--r--tools/testing/vma/include/stubs.h428
-rw-r--r--tools/testing/vma/main.c55
-rw-r--r--tools/testing/vma/shared.c131
-rw-r--r--tools/testing/vma/shared.h114
-rw-r--r--tools/testing/vma/tests/merge.c (renamed from tools/testing/vma/vma.c)332
-rw-r--r--tools/testing/vma/tests/mmap.c57
-rw-r--r--tools/testing/vma/tests/vma.c339
-rw-r--r--tools/testing/vma/vma_internal.h1847
82 files changed, 3939 insertions, 2519 deletions
diff --git a/MAINTAINERS b/MAINTAINERS
index dc82a6bd1a61..b8d8a5c41597 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -11845,6 +11845,7 @@ F: include/linux/memory-failure.h
F: include/trace/events/memory-failure.h
F: mm/hwpoison-inject.c
F: mm/memory-failure.c
+F: tools/testing/selftests/mm/memory-failure.c
HYCON HY46XX TOUCHSCREEN SUPPORT
M: Giulio Benetti <giulio.benetti@benettiengineering.com>
diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
index d94445b4f3df..a17eb8a76788 100644
--- a/arch/arm64/include/asm/pgtable.h
+++ b/arch/arm64/include/asm/pgtable.h
@@ -1648,10 +1648,10 @@ extern void contpte_clear_full_ptes(struct mm_struct *mm, unsigned long addr,
extern pte_t contpte_get_and_clear_full_ptes(struct mm_struct *mm,
unsigned long addr, pte_t *ptep,
unsigned int nr, int full);
-extern int contpte_ptep_test_and_clear_young(struct vm_area_struct *vma,
- unsigned long addr, pte_t *ptep);
-extern int contpte_ptep_clear_flush_young(struct vm_area_struct *vma,
- unsigned long addr, pte_t *ptep);
+int contpte_test_and_clear_young_ptes(struct vm_area_struct *vma,
+ unsigned long addr, pte_t *ptep, unsigned int nr);
+int contpte_clear_flush_young_ptes(struct vm_area_struct *vma,
+ unsigned long addr, pte_t *ptep, unsigned int nr);
extern void contpte_wrprotect_ptes(struct mm_struct *mm, unsigned long addr,
pte_t *ptep, unsigned int nr);
extern int contpte_ptep_set_access_flags(struct vm_area_struct *vma,
@@ -1823,7 +1823,7 @@ static inline int ptep_test_and_clear_young(struct vm_area_struct *vma,
if (likely(!pte_valid_cont(orig_pte)))
return __ptep_test_and_clear_young(vma, addr, ptep);
- return contpte_ptep_test_and_clear_young(vma, addr, ptep);
+ return contpte_test_and_clear_young_ptes(vma, addr, ptep, 1);
}
#define __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH
@@ -1835,7 +1835,18 @@ static inline int ptep_clear_flush_young(struct vm_area_struct *vma,
if (likely(!pte_valid_cont(orig_pte)))
return __ptep_clear_flush_young(vma, addr, ptep);
- return contpte_ptep_clear_flush_young(vma, addr, ptep);
+ return contpte_clear_flush_young_ptes(vma, addr, ptep, 1);
+}
+
+#define clear_flush_young_ptes clear_flush_young_ptes
+static inline int clear_flush_young_ptes(struct vm_area_struct *vma,
+ unsigned long addr, pte_t *ptep,
+ unsigned int nr)
+{
+ if (likely(nr == 1 && !pte_cont(__ptep_get(ptep))))
+ return __ptep_clear_flush_young(vma, addr, ptep);
+
+ return contpte_clear_flush_young_ptes(vma, addr, ptep, nr);
}
#define wrprotect_ptes wrprotect_ptes
diff --git a/arch/arm64/mm/contpte.c b/arch/arm64/mm/contpte.c
index 589bcf878938..b929a455103f 100644
--- a/arch/arm64/mm/contpte.c
+++ b/arch/arm64/mm/contpte.c
@@ -26,6 +26,26 @@ static inline pte_t *contpte_align_down(pte_t *ptep)
return PTR_ALIGN_DOWN(ptep, sizeof(*ptep) * CONT_PTES);
}
+static inline pte_t *contpte_align_addr_ptep(unsigned long *start,
+ unsigned long *end, pte_t *ptep,
+ unsigned int nr)
+{
+ /*
+ * Note: caller must ensure these nr PTEs are consecutive (present)
+ * PTEs that map consecutive pages of the same large folio within a
+ * single VMA and a single page table.
+ */
+ if (pte_cont(__ptep_get(ptep + nr - 1)))
+ *end = ALIGN(*end, CONT_PTE_SIZE);
+
+ if (pte_cont(__ptep_get(ptep))) {
+ *start = ALIGN_DOWN(*start, CONT_PTE_SIZE);
+ ptep = contpte_align_down(ptep);
+ }
+
+ return ptep;
+}
+
static void contpte_try_unfold_partial(struct mm_struct *mm, unsigned long addr,
pte_t *ptep, unsigned int nr)
{
@@ -488,8 +508,9 @@ pte_t contpte_get_and_clear_full_ptes(struct mm_struct *mm,
}
EXPORT_SYMBOL_GPL(contpte_get_and_clear_full_ptes);
-int contpte_ptep_test_and_clear_young(struct vm_area_struct *vma,
- unsigned long addr, pte_t *ptep)
+int contpte_test_and_clear_young_ptes(struct vm_area_struct *vma,
+ unsigned long addr, pte_t *ptep,
+ unsigned int nr)
{
/*
* ptep_clear_flush_young() technically requires us to clear the access
@@ -498,41 +519,45 @@ int contpte_ptep_test_and_clear_young(struct vm_area_struct *vma,
* contig range when the range is covered by a single folio, we can get
* away with clearing young for the whole contig range here, so we avoid
* having to unfold.
+ *
+ * The 'nr' means consecutive (present) PTEs that map consecutive pages
+ * of the same large folio in a single VMA and a single page table.
*/
+ unsigned long end = addr + nr * PAGE_SIZE;
int young = 0;
- int i;
-
- ptep = contpte_align_down(ptep);
- addr = ALIGN_DOWN(addr, CONT_PTE_SIZE);
- for (i = 0; i < CONT_PTES; i++, ptep++, addr += PAGE_SIZE)
+ ptep = contpte_align_addr_ptep(&addr, &end, ptep, nr);
+ for (; addr != end; ptep++, addr += PAGE_SIZE)
young |= __ptep_test_and_clear_young(vma, addr, ptep);
return young;
}
-EXPORT_SYMBOL_GPL(contpte_ptep_test_and_clear_young);
+EXPORT_SYMBOL_GPL(contpte_test_and_clear_young_ptes);
-int contpte_ptep_clear_flush_young(struct vm_area_struct *vma,
- unsigned long addr, pte_t *ptep)
+int contpte_clear_flush_young_ptes(struct vm_area_struct *vma,
+ unsigned long addr, pte_t *ptep,
+ unsigned int nr)
{
int young;
- young = contpte_ptep_test_and_clear_young(vma, addr, ptep);
+ young = contpte_test_and_clear_young_ptes(vma, addr, ptep, nr);
if (young) {
+ unsigned long end = addr + nr * PAGE_SIZE;
+
+ contpte_align_addr_ptep(&addr, &end, ptep, nr);
/*
* See comment in __ptep_clear_flush_young(); same rationale for
* eliding the trailing DSB applies here.
*/
- addr = ALIGN_DOWN(addr, CONT_PTE_SIZE);
- __flush_tlb_range_nosync(vma->vm_mm, addr, addr + CONT_PTE_SIZE,
+ __flush_tlb_range_nosync(vma->vm_mm, addr, end,
PAGE_SIZE, true, 3);
}
return young;
}
-EXPORT_SYMBOL_GPL(contpte_ptep_clear_flush_young);
+EXPORT_SYMBOL_GPL(contpte_clear_flush_young_ptes);
void contpte_wrprotect_ptes(struct mm_struct *mm, unsigned long addr,
pte_t *ptep, unsigned int nr)
@@ -569,14 +594,7 @@ void contpte_clear_young_dirty_ptes(struct vm_area_struct *vma,
unsigned long start = addr;
unsigned long end = start + nr * PAGE_SIZE;
- if (pte_cont(__ptep_get(ptep + nr - 1)))
- end = ALIGN(end, CONT_PTE_SIZE);
-
- if (pte_cont(__ptep_get(ptep))) {
- start = ALIGN_DOWN(start, CONT_PTE_SIZE);
- ptep = contpte_align_down(ptep);
- }
-
+ ptep = contpte_align_addr_ptep(&start, &end, ptep, nr);
__clear_young_dirty_ptes(vma, start, ptep, (end - start) / PAGE_SIZE, flags);
}
EXPORT_SYMBOL_GPL(contpte_clear_young_dirty_ptes);
diff --git a/arch/x86/kernel/cpu/sgx/ioctl.c b/arch/x86/kernel/cpu/sgx/ioctl.c
index 9322a9287dc7..0bc36957979d 100644
--- a/arch/x86/kernel/cpu/sgx/ioctl.c
+++ b/arch/x86/kernel/cpu/sgx/ioctl.c
@@ -83,7 +83,7 @@ static int sgx_encl_create(struct sgx_encl *encl, struct sgx_secs *secs)
encl_size = secs->size + PAGE_SIZE;
backing = shmem_file_setup("SGX backing", encl_size + (encl_size >> 5),
- VM_NORESERVE);
+ mk_vma_flags(VMA_NORESERVE_BIT));
if (IS_ERR(backing)) {
ret = PTR_ERR(backing);
goto err_out_shrink;
diff --git a/drivers/char/mem.c b/drivers/char/mem.c
index 52039fae1594..cca4529431f8 100644
--- a/drivers/char/mem.c
+++ b/drivers/char/mem.c
@@ -306,7 +306,7 @@ static unsigned zero_mmap_capabilities(struct file *file)
/* can't do an in-place private mapping if there's no MMU */
static inline int private_mapping_ok(struct vm_area_desc *desc)
{
- return is_nommu_shared_mapping(desc->vm_flags);
+ return is_nommu_shared_vma_flags(&desc->vma_flags);
}
#else
@@ -360,7 +360,7 @@ static int mmap_mem_prepare(struct vm_area_desc *desc)
desc->vm_ops = &mmap_mem_ops;
- /* Remap-pfn-range will mark the range VM_IO. */
+ /* Remap-pfn-range will mark the range with the I/O flag. */
mmap_action_remap_full(desc, desc->pgoff);
/* We filter remap errors to -EAGAIN. */
desc->action.error_hook = mmap_filter_error;
@@ -520,7 +520,7 @@ static int mmap_zero_prepare(struct vm_area_desc *desc)
#ifndef CONFIG_MMU
return -ENOSYS;
#endif
- if (desc->vm_flags & VM_SHARED)
+ if (vma_desc_test_flags(desc, VMA_SHARED_BIT))
return shmem_zero_setup_desc(desc);
desc->action.success_hook = mmap_zero_private_success;
diff --git a/drivers/dax/device.c b/drivers/dax/device.c
index 22999a402e02..528e81240c4d 100644
--- a/drivers/dax/device.c
+++ b/drivers/dax/device.c
@@ -13,7 +13,7 @@
#include "dax-private.h"
#include "bus.h"
-static int __check_vma(struct dev_dax *dev_dax, vm_flags_t vm_flags,
+static int __check_vma(struct dev_dax *dev_dax, vma_flags_t flags,
unsigned long start, unsigned long end, struct file *file,
const char *func)
{
@@ -24,7 +24,7 @@ static int __check_vma(struct dev_dax *dev_dax, vm_flags_t vm_flags,
return -ENXIO;
/* prevent private mappings from being established */
- if ((vm_flags & VM_MAYSHARE) != VM_MAYSHARE) {
+ if (!vma_flags_test(&flags, VMA_MAYSHARE_BIT)) {
dev_info_ratelimited(dev,
"%s: %s: fail, attempted private mapping\n",
current->comm, func);
@@ -53,7 +53,7 @@ static int __check_vma(struct dev_dax *dev_dax, vm_flags_t vm_flags,
static int check_vma(struct dev_dax *dev_dax, struct vm_area_struct *vma,
const char *func)
{
- return __check_vma(dev_dax, vma->vm_flags, vma->vm_start, vma->vm_end,
+ return __check_vma(dev_dax, vma->flags, vma->vm_start, vma->vm_end,
vma->vm_file, func);
}
@@ -306,14 +306,14 @@ static int dax_mmap_prepare(struct vm_area_desc *desc)
* fault time.
*/
id = dax_read_lock();
- rc = __check_vma(dev_dax, desc->vm_flags, desc->start, desc->end, filp,
+ rc = __check_vma(dev_dax, desc->vma_flags, desc->start, desc->end, filp,
__func__);
dax_read_unlock(id);
if (rc)
return rc;
desc->vm_ops = &dax_vm_ops;
- desc->vm_flags |= VM_HUGEPAGE;
+ vma_desc_set_flags(desc, VMA_HUGEPAGE_BIT);
return 0;
}
diff --git a/drivers/gpu/drm/drm_gem.c b/drivers/gpu/drm/drm_gem.c
index ffa7852c8f6c..f7094c4aa97a 100644
--- a/drivers/gpu/drm/drm_gem.c
+++ b/drivers/gpu/drm/drm_gem.c
@@ -186,15 +186,16 @@ int drm_gem_object_init(struct drm_device *dev, struct drm_gem_object *obj,
{
struct vfsmount *huge_mnt;
struct file *filp;
+ const vma_flags_t flags = mk_vma_flags(VMA_NORESERVE_BIT);
drm_gem_private_object_init(dev, obj, size);
huge_mnt = drm_gem_get_huge_mnt(dev);
if (huge_mnt)
filp = shmem_file_setup_with_mnt(huge_mnt, "drm mm object",
- size, VM_NORESERVE);
+ size, flags);
else
- filp = shmem_file_setup("drm mm object", size, VM_NORESERVE);
+ filp = shmem_file_setup("drm mm object", size, flags);
if (IS_ERR(filp))
return PTR_ERR(filp);
diff --git a/drivers/gpu/drm/i915/gem/i915_gem_shmem.c b/drivers/gpu/drm/i915/gem/i915_gem_shmem.c
index 6ad1d6f99363..95b13d172913 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_shmem.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_shmem.c
@@ -499,7 +499,7 @@ static int __create_shmem(struct drm_i915_private *i915,
resource_size_t size,
unsigned int flags)
{
- unsigned long shmem_flags = VM_NORESERVE;
+ const vma_flags_t shmem_flags = mk_vma_flags(VMA_NORESERVE_BIT);
struct vfsmount *huge_mnt;
struct file *filp;
diff --git a/drivers/gpu/drm/i915/gem/i915_gem_ttm.c b/drivers/gpu/drm/i915/gem/i915_gem_ttm.c
index f65fe86c02b5..7b1a7d01db2b 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_ttm.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_ttm.c
@@ -200,7 +200,8 @@ static int i915_ttm_tt_shmem_populate(struct ttm_device *bdev,
struct address_space *mapping;
gfp_t mask;
- filp = shmem_file_setup("i915-shmem-tt", size, VM_NORESERVE);
+ filp = shmem_file_setup("i915-shmem-tt", size,
+ mk_vma_flags(VMA_NORESERVE_BIT));
if (IS_ERR(filp))
return PTR_ERR(filp);
diff --git a/drivers/gpu/drm/i915/gt/shmem_utils.c b/drivers/gpu/drm/i915/gt/shmem_utils.c
index 365c4b8b04f4..5f37c699a320 100644
--- a/drivers/gpu/drm/i915/gt/shmem_utils.c
+++ b/drivers/gpu/drm/i915/gt/shmem_utils.c
@@ -19,7 +19,8 @@ struct file *shmem_create_from_data(const char *name, void *data, size_t len)
struct file *file;
int err;
- file = shmem_file_setup(name, PAGE_ALIGN(len), VM_NORESERVE);
+ file = shmem_file_setup(name, PAGE_ALIGN(len),
+ mk_vma_flags(VMA_NORESERVE_BIT));
if (IS_ERR(file))
return file;
diff --git a/drivers/gpu/drm/ttm/tests/ttm_tt_test.c b/drivers/gpu/drm/ttm/tests/ttm_tt_test.c
index 61ec6f580b62..bd5f7d0b9b62 100644
--- a/drivers/gpu/drm/ttm/tests/ttm_tt_test.c
+++ b/drivers/gpu/drm/ttm/tests/ttm_tt_test.c
@@ -143,7 +143,7 @@ static void ttm_tt_fini_shmem(struct kunit *test)
err = ttm_tt_init(tt, bo, 0, caching, 0);
KUNIT_ASSERT_EQ(test, err, 0);
- shmem = shmem_file_setup("ttm swap", BO_SIZE, 0);
+ shmem = shmem_file_setup("ttm swap", BO_SIZE, EMPTY_VMA_FLAGS);
tt->swap_storage = shmem;
ttm_tt_fini(tt);
diff --git a/drivers/gpu/drm/ttm/ttm_backup.c b/drivers/gpu/drm/ttm/ttm_backup.c
index 32530c75f038..6bd4c123d94c 100644
--- a/drivers/gpu/drm/ttm/ttm_backup.c
+++ b/drivers/gpu/drm/ttm/ttm_backup.c
@@ -178,5 +178,6 @@ EXPORT_SYMBOL_GPL(ttm_backup_bytes_avail);
*/
struct file *ttm_backup_shmem_create(loff_t size)
{
- return shmem_file_setup("ttm shmem backup", size, 0);
+ return shmem_file_setup("ttm shmem backup", size,
+ EMPTY_VMA_FLAGS);
}
diff --git a/drivers/gpu/drm/ttm/ttm_tt.c b/drivers/gpu/drm/ttm/ttm_tt.c
index af33fa020249..fbf713abd547 100644
--- a/drivers/gpu/drm/ttm/ttm_tt.c
+++ b/drivers/gpu/drm/ttm/ttm_tt.c
@@ -330,7 +330,7 @@ int ttm_tt_swapout(struct ttm_device *bdev, struct ttm_tt *ttm,
struct page *to_page;
int i, ret;
- swap_storage = shmem_file_setup("ttm swap", size, 0);
+ swap_storage = shmem_file_setup("ttm swap", size, EMPTY_VMA_FLAGS);
if (IS_ERR(swap_storage)) {
pr_err("Failed allocating swap storage\n");
return PTR_ERR(swap_storage);
diff --git a/fs/aio.c b/fs/aio.c
index 0a23a8c0717f..59b67b8da1b2 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -394,7 +394,7 @@ static const struct vm_operations_struct aio_ring_vm_ops = {
static int aio_ring_mmap_prepare(struct vm_area_desc *desc)
{
- desc->vm_flags |= VM_DONTEXPAND;
+ vma_desc_set_flags(desc, VMA_DONTEXPAND_BIT);
desc->vm_ops = &aio_ring_vm_ops;
return 0;
}
diff --git a/fs/erofs/data.c b/fs/erofs/data.c
index a28084ef796b..f79ee80627d9 100644
--- a/fs/erofs/data.c
+++ b/fs/erofs/data.c
@@ -473,11 +473,12 @@ static int erofs_file_mmap_prepare(struct vm_area_desc *desc)
if (!IS_DAX(file_inode(desc->file)))
return generic_file_readonly_mmap_prepare(desc);
- if ((desc->vm_flags & VM_SHARED) && (desc->vm_flags & VM_MAYWRITE))
+ if (vma_desc_test_flags(desc, VMA_SHARED_BIT) &&
+ vma_desc_test_flags(desc, VMA_MAYWRITE_BIT))
return -EINVAL;
desc->vm_ops = &erofs_dax_vm_ops;
- desc->vm_flags |= VM_HUGEPAGE;
+ vma_desc_set_flags(desc, VMA_HUGEPAGE_BIT);
return 0;
}
#else
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 4320ebff74f3..f1dc5ce791a7 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -818,13 +818,13 @@ static int ext4_file_mmap_prepare(struct vm_area_desc *desc)
* We don't support synchronous mappings for non-DAX files and
* for DAX files if underneath dax_device is not synchronous.
*/
- if (!daxdev_mapping_supported(desc->vm_flags, file_inode(file), dax_dev))
+ if (!daxdev_mapping_supported(desc, file_inode(file), dax_dev))
return -EOPNOTSUPP;
file_accessed(file);
if (IS_DAX(file_inode(file))) {
desc->vm_ops = &ext4_dax_vm_ops;
- desc->vm_flags |= VM_HUGEPAGE;
+ vma_desc_set_flags(desc, VMA_HUGEPAGE_BIT);
} else {
desc->vm_ops = &ext4_file_vm_ops;
}
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 3b4c152c5c73..95a5b23b4808 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -109,7 +109,7 @@ static int hugetlbfs_file_mmap_prepare(struct vm_area_desc *desc)
loff_t len, vma_len;
int ret;
struct hstate *h = hstate_file(file);
- vm_flags_t vm_flags;
+ vma_flags_t vma_flags;
/*
* vma address alignment (but not the pgoff alignment) has
@@ -119,7 +119,7 @@ static int hugetlbfs_file_mmap_prepare(struct vm_area_desc *desc)
* way when do_mmap unwinds (may be important on powerpc
* and ia64).
*/
- desc->vm_flags |= VM_HUGETLB | VM_DONTEXPAND;
+ vma_desc_set_flags(desc, VMA_HUGETLB_BIT, VMA_DONTEXPAND_BIT);
desc->vm_ops = &hugetlb_vm_ops;
/*
@@ -148,23 +148,23 @@ static int hugetlbfs_file_mmap_prepare(struct vm_area_desc *desc)
ret = -ENOMEM;
- vm_flags = desc->vm_flags;
+ vma_flags = desc->vma_flags;
/*
* for SHM_HUGETLB, the pages are reserved in the shmget() call so skip
* reserving here. Note: only for SHM hugetlbfs file, the inode
* flag S_PRIVATE is set.
*/
if (inode->i_flags & S_PRIVATE)
- vm_flags |= VM_NORESERVE;
+ vma_flags_set(&vma_flags, VMA_NORESERVE_BIT);
if (hugetlb_reserve_pages(inode,
desc->pgoff >> huge_page_order(h),
len >> huge_page_shift(h), desc,
- vm_flags) < 0)
+ vma_flags) < 0)
goto out;
ret = 0;
- if ((desc->vm_flags & VM_WRITE) && inode->i_size < len)
+ if (vma_desc_test_flags(desc, VMA_WRITE_BIT) && inode->i_size < len)
i_size_write(inode, len);
out:
inode_unlock(inode);
@@ -1527,7 +1527,7 @@ static int get_hstate_idx(int page_size_log)
* otherwise hugetlb_reserve_pages reserves one less hugepages than intended.
*/
struct file *hugetlb_file_setup(const char *name, size_t size,
- vm_flags_t acctflag, int creat_flags,
+ vma_flags_t acctflag, int creat_flags,
int page_size_log)
{
struct inode *inode;
diff --git a/fs/ntfs3/file.c b/fs/ntfs3/file.c
index ae8c47cac406..f53037e0ecb6 100644
--- a/fs/ntfs3/file.c
+++ b/fs/ntfs3/file.c
@@ -276,7 +276,7 @@ static int ntfs_file_mmap_prepare(struct vm_area_desc *desc)
struct file *file = desc->file;
struct inode *inode = file_inode(file);
struct ntfs_inode *ni = ntfs_i(inode);
- bool rw = desc->vm_flags & VM_WRITE;
+ const bool rw = vma_desc_test_flags(desc, VMA_WRITE_BIT);
int err;
/* Avoid any operation if inode is bad. */
diff --git a/fs/orangefs/file.c b/fs/orangefs/file.c
index afd610a3fc68..42591252e239 100644
--- a/fs/orangefs/file.c
+++ b/fs/orangefs/file.c
@@ -411,8 +411,8 @@ static int orangefs_file_mmap_prepare(struct vm_area_desc *desc)
"orangefs_file_mmap: called on %pD\n", file);
/* set the sequential readahead hint */
- desc->vm_flags |= VM_SEQ_READ;
- desc->vm_flags &= ~VM_RAND_READ;
+ vma_desc_set_flags(desc, VMA_SEQ_READ_BIT);
+ vma_desc_clear_flags(desc, VMA_RAND_READ_BIT);
file_accessed(file);
desc->vm_ops = &orangefs_file_vm_ops;
diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c
index 77b8ca2757e0..0f8e838ece07 100644
--- a/fs/ramfs/file-nommu.c
+++ b/fs/ramfs/file-nommu.c
@@ -264,7 +264,7 @@ out:
*/
static int ramfs_nommu_mmap_prepare(struct vm_area_desc *desc)
{
- if (!is_nommu_shared_mapping(desc->vm_flags))
+ if (!is_nommu_shared_vma_flags(&desc->vma_flags))
return -ENOSYS;
file_accessed(desc->file);
diff --git a/fs/resctrl/pseudo_lock.c b/fs/resctrl/pseudo_lock.c
index 0bfc13c5b96d..e81d71abfe54 100644
--- a/fs/resctrl/pseudo_lock.c
+++ b/fs/resctrl/pseudo_lock.c
@@ -1044,7 +1044,7 @@ static int pseudo_lock_dev_mmap_prepare(struct vm_area_desc *desc)
* Ensure changes are carried directly to the memory being mapped,
* do not allow copy-on-write mapping.
*/
- if (!(desc->vm_flags & VM_SHARED)) {
+ if (!vma_desc_test_flags(desc, VMA_SHARED_BIT)) {
mutex_unlock(&rdtgroup_mutex);
return -EINVAL;
}
diff --git a/fs/romfs/mmap-nommu.c b/fs/romfs/mmap-nommu.c
index 4b77c6dc4418..7c3a1a7fecee 100644
--- a/fs/romfs/mmap-nommu.c
+++ b/fs/romfs/mmap-nommu.c
@@ -63,7 +63,7 @@ static unsigned long romfs_get_unmapped_area(struct file *file,
*/
static int romfs_mmap_prepare(struct vm_area_desc *desc)
{
- return is_nommu_shared_mapping(desc->vm_flags) ? 0 : -ENOSYS;
+ return is_nommu_shared_vma_flags(&desc->vma_flags) ? 0 : -ENOSYS;
}
static unsigned romfs_mmap_capabilities(struct file *file)
diff --git a/fs/xfs/scrub/xfile.c b/fs/xfs/scrub/xfile.c
index 2998c9b62f4b..bee0662fbdb6 100644
--- a/fs/xfs/scrub/xfile.c
+++ b/fs/xfs/scrub/xfile.c
@@ -61,7 +61,8 @@ xfile_create(
if (!xf)
return -ENOMEM;
- xf->file = shmem_kernel_file_setup(description, isize, VM_NORESERVE);
+ xf->file = shmem_kernel_file_setup(description, isize,
+ mk_vma_flags(VMA_NORESERVE_BIT));
if (IS_ERR(xf->file)) {
error = PTR_ERR(xf->file);
goto out_xfile;
diff --git a/fs/xfs/xfs_buf_mem.c b/fs/xfs/xfs_buf_mem.c
index 0106da0a9f44..f1f23623e4a4 100644
--- a/fs/xfs/xfs_buf_mem.c
+++ b/fs/xfs/xfs_buf_mem.c
@@ -62,7 +62,7 @@ xmbuf_alloc(
if (!btp)
return -ENOMEM;
- file = shmem_kernel_file_setup(descr, 0, 0);
+ file = shmem_kernel_file_setup(descr, 0, EMPTY_VMA_FLAGS);
if (IS_ERR(file)) {
error = PTR_ERR(file);
goto out_free_btp;
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 43d088a3bceb..6246f34df9fd 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -2010,14 +2010,14 @@ xfs_file_mmap_prepare(
* We don't support synchronous mappings for non-DAX files and
* for DAX files if underneath dax_device is not synchronous.
*/
- if (!daxdev_mapping_supported(desc->vm_flags, file_inode(file),
+ if (!daxdev_mapping_supported(desc, file_inode(file),
target->bt_daxdev))
return -EOPNOTSUPP;
file_accessed(file);
desc->vm_ops = &xfs_file_vm_ops;
if (IS_DAX(inode))
- desc->vm_flags |= VM_HUGEPAGE;
+ vma_desc_set_flags(desc, VMA_HUGEPAGE_BIT);
return 0;
}
diff --git a/fs/zonefs/file.c b/fs/zonefs/file.c
index c1e5e30e90a0..8a7161fc49e5 100644
--- a/fs/zonefs/file.c
+++ b/fs/zonefs/file.c
@@ -333,7 +333,8 @@ static int zonefs_file_mmap_prepare(struct vm_area_desc *desc)
* ordering between msync() and page cache writeback.
*/
if (zonefs_inode_is_seq(file_inode(file)) &&
- (desc->vm_flags & VM_SHARED) && (desc->vm_flags & VM_MAYWRITE))
+ vma_desc_test_flags(desc, VMA_SHARED_BIT) &&
+ vma_desc_test_flags(desc, VMA_MAYWRITE_BIT))
return -EINVAL;
file_accessed(file);
diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h
index cbd402b4f974..65d76a38974b 100644
--- a/include/linux/cpuset.h
+++ b/include/linux/cpuset.h
@@ -176,7 +176,7 @@ static inline void set_mems_allowed(nodemask_t nodemask)
task_unlock(current);
}
-extern bool cpuset_node_allowed(struct cgroup *cgroup, int nid);
+extern void cpuset_nodes_allowed(struct cgroup *cgroup, nodemask_t *mask);
#else /* !CONFIG_CPUSETS */
static inline bool cpusets_enabled(void) { return false; }
@@ -299,9 +299,9 @@ static inline bool read_mems_allowed_retry(unsigned int seq)
return false;
}
-static inline bool cpuset_node_allowed(struct cgroup *cgroup, int nid)
+static inline void cpuset_nodes_allowed(struct cgroup *cgroup, nodemask_t *mask)
{
- return true;
+ nodes_copy(*mask, node_states[N_MEMORY]);
}
#endif /* !CONFIG_CPUSETS */
diff --git a/include/linux/dax.h b/include/linux/dax.h
index 9d624f4d9df6..bf103f317cac 100644
--- a/include/linux/dax.h
+++ b/include/linux/dax.h
@@ -65,11 +65,11 @@ size_t dax_recovery_write(struct dax_device *dax_dev, pgoff_t pgoff,
/*
* Check if given mapping is supported by the file / underlying device.
*/
-static inline bool daxdev_mapping_supported(vm_flags_t vm_flags,
+static inline bool daxdev_mapping_supported(const struct vm_area_desc *desc,
const struct inode *inode,
struct dax_device *dax_dev)
{
- if (!(vm_flags & VM_SYNC))
+ if (!vma_desc_test_flags(desc, VMA_SYNC_BIT))
return true;
if (!IS_DAX(inode))
return false;
@@ -111,11 +111,11 @@ static inline void set_dax_nomc(struct dax_device *dax_dev)
static inline void set_dax_synchronous(struct dax_device *dax_dev)
{
}
-static inline bool daxdev_mapping_supported(vm_flags_t vm_flags,
+static inline bool daxdev_mapping_supported(const struct vm_area_desc *desc,
const struct inode *inode,
struct dax_device *dax_dev)
{
- return !(vm_flags & VM_SYNC);
+ return !vma_desc_test_flags(desc, VMA_SYNC_BIT);
}
static inline size_t dax_recovery_write(struct dax_device *dax_dev,
pgoff_t pgoff, void *addr, size_t bytes, struct iov_iter *i)
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index d2e48fa5f72e..65910437be1c 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -148,7 +148,7 @@ int hugetlb_mfill_atomic_pte(pte_t *dst_pte,
struct folio **foliop);
#endif /* CONFIG_USERFAULTFD */
long hugetlb_reserve_pages(struct inode *inode, long from, long to,
- struct vm_area_desc *desc, vm_flags_t vm_flags);
+ struct vm_area_desc *desc, vma_flags_t vma_flags);
long hugetlb_unreserve_pages(struct inode *inode, long start, long end,
long freed);
bool folio_isolate_hugetlb(struct folio *folio, struct list_head *list);
@@ -527,7 +527,7 @@ static inline struct hugetlbfs_inode_info *HUGETLBFS_I(struct inode *inode)
}
extern const struct vm_operations_struct hugetlb_vm_ops;
-struct file *hugetlb_file_setup(const char *name, size_t size, vm_flags_t acct,
+struct file *hugetlb_file_setup(const char *name, size_t size, vma_flags_t acct,
int creat_flags, int page_size_log);
static inline bool is_file_hugepages(const struct file *file)
@@ -543,7 +543,7 @@ static inline struct hstate *hstate_inode(struct inode *i)
#define is_file_hugepages(file) false
static inline struct file *
-hugetlb_file_setup(const char *name, size_t size, vm_flags_t acctflag,
+hugetlb_file_setup(const char *name, size_t size, vma_flags_t acctflag,
int creat_flags, int page_size_log)
{
return ERR_PTR(-ENOSYS);
diff --git a/include/linux/hugetlb_inline.h b/include/linux/hugetlb_inline.h
index a27aa0162918..593f5d4e108b 100644
--- a/include/linux/hugetlb_inline.h
+++ b/include/linux/hugetlb_inline.h
@@ -11,6 +11,11 @@ static inline bool is_vm_hugetlb_flags(vm_flags_t vm_flags)
return !!(vm_flags & VM_HUGETLB);
}
+static inline bool is_vma_hugetlb_flags(const vma_flags_t *flags)
+{
+ return vma_flags_test(flags, VMA_HUGETLB_BIT);
+}
+
#else
static inline bool is_vm_hugetlb_flags(vm_flags_t vm_flags)
@@ -18,6 +23,11 @@ static inline bool is_vm_hugetlb_flags(vm_flags_t vm_flags)
return false;
}
+static inline bool is_vma_hugetlb_flags(const vma_flags_t *flags)
+{
+ return false;
+}
+
#endif
static inline bool is_vm_hugetlb_page(struct vm_area_struct *vma)
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 52bfe4157623..70b685a85bf4 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -1758,7 +1758,7 @@ static inline void count_objcg_events(struct obj_cgroup *objcg,
rcu_read_unlock();
}
-bool mem_cgroup_node_allowed(struct mem_cgroup *memcg, int nid);
+void mem_cgroup_node_filter_allowed(struct mem_cgroup *memcg, nodemask_t *mask);
void mem_cgroup_show_protected_memory(struct mem_cgroup *memcg);
@@ -1829,9 +1829,9 @@ static inline ino_t page_cgroup_ino(struct page *page)
return 0;
}
-static inline bool mem_cgroup_node_allowed(struct mem_cgroup *memcg, int nid)
+static inline void mem_cgroup_node_filter_allowed(struct mem_cgroup *memcg,
+ nodemask_t *mask)
{
- return true;
}
static inline void mem_cgroup_show_protected_memory(struct mem_cgroup *memcg)
diff --git a/include/linux/memory-tiers.h b/include/linux/memory-tiers.h
index 7a805796fcfd..96987d9d95a8 100644
--- a/include/linux/memory-tiers.h
+++ b/include/linux/memory-tiers.h
@@ -53,11 +53,11 @@ struct memory_dev_type *mt_find_alloc_memory_type(int adist,
struct list_head *memory_types);
void mt_put_memory_types(struct list_head *memory_types);
#ifdef CONFIG_MIGRATION
-int next_demotion_node(int node);
+int next_demotion_node(int node, const nodemask_t *allowed_mask);
void node_get_allowed_targets(pg_data_t *pgdat, nodemask_t *targets);
bool node_is_toptier(int node);
#else
-static inline int next_demotion_node(int node)
+static inline int next_demotion_node(int node, const nodemask_t *allowed_mask)
{
return NUMA_NO_NODE;
}
@@ -101,7 +101,7 @@ static inline void clear_node_memory_type(int node, struct memory_dev_type *memt
}
-static inline int next_demotion_node(int node)
+static inline int next_demotion_node(int node, const nodemask_t *allowed_mask)
{
return NUMA_NO_NODE;
}
diff --git a/include/linux/mm.h b/include/linux/mm.h
index dc1ad71a2a70..5be3d8a8f806 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2,6 +2,7 @@
#ifndef _LINUX_MM_H
#define _LINUX_MM_H
+#include <linux/args.h>
#include <linux/errno.h>
#include <linux/mmdebug.h>
#include <linux/gfp.h>
@@ -551,17 +552,18 @@ enum {
/*
* Physically remapped pages are special. Tell the
* rest of the world about it:
- * VM_IO tells people not to look at these pages
+ * IO tells people not to look at these pages
* (accesses can have side effects).
- * VM_PFNMAP tells the core MM that the base pages are just
+ * PFNMAP tells the core MM that the base pages are just
* raw PFN mappings, and do not have a "struct page" associated
* with them.
- * VM_DONTEXPAND
+ * DONTEXPAND
* Disable vma merging and expanding with mremap().
- * VM_DONTDUMP
+ * DONTDUMP
* Omit vma from core dump, even when VM_IO turned off.
*/
-#define VM_REMAP_FLAGS (VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP)
+#define VMA_REMAP_FLAGS mk_vma_flags(VMA_IO_BIT, VMA_PFNMAP_BIT, \
+ VMA_DONTEXPAND_BIT, VMA_DONTDUMP_BIT)
/* This mask prevents VMA from being scanned with khugepaged */
#define VM_NO_KHUGEPAGED (VM_SPECIAL | VM_HUGETLB)
@@ -945,7 +947,7 @@ static inline void vm_flags_reset_once(struct vm_area_struct *vma,
* system word.
*/
if (NUM_VMA_FLAG_BITS > BITS_PER_LONG) {
- unsigned long *bitmap = ACCESS_PRIVATE(&vma->flags, __vma_flags);
+ unsigned long *bitmap = vma->flags.__vma_flags;
bitmap_zero(&bitmap[1], NUM_VMA_FLAG_BITS - BITS_PER_LONG);
}
@@ -989,8 +991,7 @@ static inline void vm_flags_mod(struct vm_area_struct *vma,
__vm_flags_mod(vma, set, clear);
}
-static inline bool __vma_flag_atomic_valid(struct vm_area_struct *vma,
- vma_flag_t bit)
+static inline bool __vma_atomic_valid_flag(struct vm_area_struct *vma, vma_flag_t bit)
{
const vm_flags_t mask = BIT((__force int)bit);
@@ -1005,13 +1006,12 @@ static inline bool __vma_flag_atomic_valid(struct vm_area_struct *vma,
* Set VMA flag atomically. Requires only VMA/mmap read lock. Only specific
* valid flags are allowed to do this.
*/
-static inline void vma_flag_set_atomic(struct vm_area_struct *vma,
- vma_flag_t bit)
+static inline void vma_set_atomic_flag(struct vm_area_struct *vma, vma_flag_t bit)
{
- unsigned long *bitmap = ACCESS_PRIVATE(&vma->flags, __vma_flags);
+ unsigned long *bitmap = vma->flags.__vma_flags;
vma_assert_stabilised(vma);
- if (__vma_flag_atomic_valid(vma, bit))
+ if (__vma_atomic_valid_flag(vma, bit))
set_bit((__force int)bit, bitmap);
}
@@ -1022,15 +1022,211 @@ static inline void vma_flag_set_atomic(struct vm_area_struct *vma,
* This is necessarily racey, so callers must ensure that serialisation is
* achieved through some other means, or that races are permissible.
*/
-static inline bool vma_flag_test_atomic(struct vm_area_struct *vma,
- vma_flag_t bit)
+static inline bool vma_test_atomic_flag(struct vm_area_struct *vma, vma_flag_t bit)
{
- if (__vma_flag_atomic_valid(vma, bit))
+ if (__vma_atomic_valid_flag(vma, bit))
return test_bit((__force int)bit, &vma->vm_flags);
return false;
}
+/* Set an individual VMA flag in flags, non-atomically. */
+static inline void vma_flag_set(vma_flags_t *flags, vma_flag_t bit)
+{
+ unsigned long *bitmap = flags->__vma_flags;
+
+ __set_bit((__force int)bit, bitmap);
+}
+
+/*
+ * Build a vma_flags_t value in which each of the @count bit numbers listed in
+ * @bits is set and every other bit is clear. Normally invoked through the
+ * mk_vma_flags() macro, which supplies @count and @bits from its arguments.
+ */
+static inline vma_flags_t __mk_vma_flags(size_t count, const vma_flag_t *bits)
+{
+ vma_flags_t flags;
+ size_t i; /* same type as @count, avoiding a signed/unsigned comparison */
+
+ vma_flags_clear_all(&flags);
+ for (i = 0; i < count; i++)
+ vma_flag_set(&flags, bits[i]);
+ return flags;
+}
+
+/*
+ * Helper macro which bitwise-or combines the specified input flags into a
+ * vma_flags_t bitmap value. E.g.:
+ *
+ * vma_flags_t flags = mk_vma_flags(VMA_IO_BIT, VMA_PFNMAP_BIT,
+ * VMA_DONTEXPAND_BIT, VMA_DONTDUMP_BIT);
+ *
+ * The compiler cleverly optimises away all of the work and this ends up being
+ * equivalent to aggregating the values manually.
+ */
+#define mk_vma_flags(...) __mk_vma_flags(COUNT_ARGS(__VA_ARGS__), \
+ (const vma_flag_t []){__VA_ARGS__})
+
+/* Test whether ANY of the to_test flags are set in flags, non-atomically. */
+static __always_inline bool vma_flags_test_mask(const vma_flags_t *flags,
+ vma_flags_t to_test)
+{
+ const unsigned long *bitmap = flags->__vma_flags;
+ const unsigned long *bitmap_to_test = to_test.__vma_flags;
+
+ return bitmap_intersects(bitmap_to_test, bitmap, NUM_VMA_FLAG_BITS);
+}
+
+/*
+ * Test whether any specified VMA flag is set, e.g.:
+ *
+ * if (vma_flags_test(flags, VMA_READ_BIT, VMA_MAYREAD_BIT)) { ... }
+ */
+#define vma_flags_test(flags, ...) \
+ vma_flags_test_mask(flags, mk_vma_flags(__VA_ARGS__))
+
+/* Test that ALL of the to_test flags are set, non-atomically. */
+static __always_inline bool vma_flags_test_all_mask(const vma_flags_t *flags,
+ vma_flags_t to_test)
+{
+ const unsigned long *bitmap = flags->__vma_flags;
+ const unsigned long *bitmap_to_test = to_test.__vma_flags;
+
+ return bitmap_subset(bitmap_to_test, bitmap, NUM_VMA_FLAG_BITS);
+}
+
+/*
+ * Test whether ALL specified VMA flags are set, e.g.:
+ *
+ * if (vma_flags_test_all(flags, VMA_READ_BIT, VMA_MAYREAD_BIT)) { ... }
+ */
+#define vma_flags_test_all(flags, ...) \
+ vma_flags_test_all_mask(flags, mk_vma_flags(__VA_ARGS__))
+
+/* Set each of the to_set flags in flags, non-atomically. */
+static __always_inline void vma_flags_set_mask(vma_flags_t *flags, vma_flags_t to_set)
+{
+ unsigned long *bitmap = flags->__vma_flags;
+ const unsigned long *bitmap_to_set = to_set.__vma_flags;
+
+ bitmap_or(bitmap, bitmap, bitmap_to_set, NUM_VMA_FLAG_BITS);
+}
+
+/*
+ * Set all specified VMA flags, e.g.:
+ *
+ * vma_flags_set(&flags, VMA_READ_BIT, VMA_WRITE_BIT, VMA_EXEC_BIT);
+ */
+#define vma_flags_set(flags, ...) \
+ vma_flags_set_mask(flags, mk_vma_flags(__VA_ARGS__))
+
+/* Clear all of the to-clear flags in flags, non-atomically. */
+static __always_inline void vma_flags_clear_mask(vma_flags_t *flags, vma_flags_t to_clear)
+{
+ unsigned long *bitmap = flags->__vma_flags;
+ const unsigned long *bitmap_to_clear = to_clear.__vma_flags;
+
+ bitmap_andnot(bitmap, bitmap, bitmap_to_clear, NUM_VMA_FLAG_BITS);
+}
+
+/*
+ * Clear all specified individual flags, e.g.:
+ *
+ * vma_flags_clear(&flags, VMA_READ_BIT, VMA_WRITE_BIT, VMA_EXEC_BIT);
+ */
+#define vma_flags_clear(flags, ...) \
+ vma_flags_clear_mask(flags, mk_vma_flags(__VA_ARGS__))
+
+/*
+ * Helper to test that ALL specified flags are set in a VMA.
+ *
+ * Note: appropriate locks must be held, this function does not acquire them for
+ * you.
+ */
+static inline bool vma_test_all_flags_mask(const struct vm_area_struct *vma,
+ vma_flags_t flags)
+{
+ return vma_flags_test_all_mask(&vma->flags, flags);
+}
+
+/*
+ * Helper macro for checking that ALL specified flags are set in a VMA, e.g.:
+ *
+ * if (vma_test_all_flags(vma, VMA_READ_BIT, VMA_MAYREAD_BIT)) { ... }
+ */
+#define vma_test_all_flags(vma, ...) \
+ vma_test_all_flags_mask(vma, mk_vma_flags(__VA_ARGS__))
+
+/*
+ * Helper to set all of the specified VMA flags in a VMA.
+ *
+ * Note: appropriate locks must be held, this function does not acquire them for
+ * you.
+ */
+static inline void vma_set_flags_mask(struct vm_area_struct *vma,
+ vma_flags_t flags)
+{
+ vma_flags_set_mask(&vma->flags, flags);
+}
+
+/*
+ * Helper macro for specifying VMA flags in a VMA, e.g.:
+ *
+ * vma_set_flags(vma, VMA_IO_BIT, VMA_PFNMAP_BIT, VMA_DONTEXPAND_BIT,
+ * VMA_DONTDUMP_BIT);
+ *
+ * Note: appropriate locks must be held, this function does not acquire them for
+ * you.
+ */
+#define vma_set_flags(vma, ...) \
+ vma_set_flags_mask(vma, mk_vma_flags(__VA_ARGS__))
+
+/* Helper to test whether ANY of the specified VMA flags are set in a VMA descriptor. */
+static inline bool vma_desc_test_flags_mask(const struct vm_area_desc *desc,
+ vma_flags_t flags)
+{
+ return vma_flags_test_mask(&desc->vma_flags, flags);
+}
+
+/*
+ * Helper macro for testing whether ANY of the given VMA flags are set in a
+ * struct vm_area_desc object describing a proposed VMA, e.g.:
+ *
+ * if (vma_desc_test_flags(desc, VMA_IO_BIT, VMA_PFNMAP_BIT,
+ * VMA_DONTEXPAND_BIT, VMA_DONTDUMP_BIT)) { ... }
+ */
+#define vma_desc_test_flags(desc, ...) \
+ vma_desc_test_flags_mask(desc, mk_vma_flags(__VA_ARGS__))
+
+/* Helper to set all of the specified VMA flags in a VMA descriptor. */
+static inline void vma_desc_set_flags_mask(struct vm_area_desc *desc,
+ vma_flags_t flags)
+{
+ vma_flags_set_mask(&desc->vma_flags, flags);
+}
+
+/*
+ * Helper macro for specifying VMA flags for an input pointer to a struct
+ * vm_area_desc object describing a proposed VMA, e.g.:
+ *
+ * vma_desc_set_flags(desc, VMA_IO_BIT, VMA_PFNMAP_BIT, VMA_DONTEXPAND_BIT,
+ * VMA_DONTDUMP_BIT);
+ */
+#define vma_desc_set_flags(desc, ...) \
+ vma_desc_set_flags_mask(desc, mk_vma_flags(__VA_ARGS__))
+
+/* Helper to clear all of the specified VMA flags in a VMA descriptor. */
+static inline void vma_desc_clear_flags_mask(struct vm_area_desc *desc,
+ vma_flags_t flags)
+{
+ vma_flags_clear_mask(&desc->vma_flags, flags);
+}
+
+/*
+ * Helper macro for clearing VMA flags for an input pointer to a struct
+ * vm_area_desc object describing a proposed VMA, e.g.:
+ *
+ * vma_desc_clear_flags(desc, VMA_IO_BIT, VMA_PFNMAP_BIT, VMA_DONTEXPAND_BIT,
+ * VMA_DONTDUMP_BIT);
+ */
+#define vma_desc_clear_flags(desc, ...) \
+ vma_desc_clear_flags_mask(desc, mk_vma_flags(__VA_ARGS__))
+
static inline void vma_set_anonymous(struct vm_area_struct *vma)
{
vma->vm_ops = NULL;
@@ -1096,15 +1292,20 @@ static inline bool vma_is_accessible(const struct vm_area_struct *vma)
return vma->vm_flags & VM_ACCESS_FLAGS;
}
-static inline bool is_shared_maywrite(vm_flags_t vm_flags)
+static inline bool is_shared_maywrite_vm_flags(vm_flags_t vm_flags)
{
return (vm_flags & (VM_SHARED | VM_MAYWRITE)) ==
(VM_SHARED | VM_MAYWRITE);
}
+static inline bool is_shared_maywrite(const vma_flags_t *flags)
+{
+ return vma_flags_test_all(flags, VMA_SHARED_BIT, VMA_MAYWRITE_BIT);
+}
+
static inline bool vma_is_shared_maywrite(const struct vm_area_struct *vma)
{
- return is_shared_maywrite(vma->vm_flags);
+ return is_shared_maywrite(&vma->flags);
}
static inline
@@ -1732,6 +1933,14 @@ static inline bool is_cow_mapping(vm_flags_t flags)
return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
}
+static inline bool vma_desc_is_cow_mapping(struct vm_area_desc *desc)
+{
+ const vma_flags_t *flags = &desc->vma_flags;
+
+ return vma_flags_test(flags, VMA_MAYWRITE_BIT) &&
+ !vma_flags_test(flags, VMA_SHARED_BIT);
+}
+
#ifndef CONFIG_MMU
static inline bool is_nommu_shared_mapping(vm_flags_t flags)
{
@@ -1745,6 +1954,11 @@ static inline bool is_nommu_shared_mapping(vm_flags_t flags)
*/
return flags & (VM_MAYSHARE | VM_MAYOVERLAY);
}
+
+static inline bool is_nommu_shared_vma_flags(const vma_flags_t *flags)
+{
+ return vma_flags_test(flags, VMA_MAYSHARE_BIT, VMA_MAYOVERLAY_BIT);
+}
#endif
#if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP)
@@ -2627,10 +2841,6 @@ static inline void zap_vma_pages(struct vm_area_struct *vma)
zap_page_range_single(vma, vma->vm_start,
vma->vm_end - vma->vm_start, NULL);
}
-void unmap_vmas(struct mmu_gather *tlb, struct ma_state *mas,
- struct vm_area_struct *start_vma, unsigned long start,
- unsigned long end, unsigned long tree_end);
-
struct mmu_notifier_range;
void free_pgd_range(struct mmu_gather *tlb, unsigned long addr,
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 8731606d8d36..3cc8ae722886 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -844,7 +844,7 @@ struct mmap_action {
/*
* If specified, this hook is invoked when an error occurred when
- * attempting the selection action.
+ * attempting the selected action.
*
* The hook can return an error code in order to filter the error, but
* it is not valid to clear the error here.
@@ -866,7 +866,9 @@ struct mmap_action {
#define NUM_VMA_FLAG_BITS BITS_PER_LONG
typedef struct {
DECLARE_BITMAP(__vma_flags, NUM_VMA_FLAG_BITS);
-} __private vma_flags_t;
+} vma_flags_t;
+
+#define EMPTY_VMA_FLAGS ((vma_flags_t){ })
/*
* Describes a VMA that is about to be mmap()'ed. Drivers may choose to
@@ -885,10 +887,7 @@ struct vm_area_desc {
/* Mutable fields. Populated with initial state. */
pgoff_t pgoff;
struct file *vm_file;
- union {
- vm_flags_t vm_flags;
- vma_flags_t vma_flags;
- };
+ vma_flags_t vma_flags;
pgprot_t page_prot;
/* Write-only fields. */
@@ -1059,7 +1058,7 @@ struct vm_area_struct {
/* Clears all bits in the VMA flags bitmap, non-atomically. */
static inline void vma_flags_clear_all(vma_flags_t *flags)
{
- bitmap_zero(ACCESS_PRIVATE(flags, __vma_flags), NUM_VMA_FLAG_BITS);
+ bitmap_zero(flags->__vma_flags, NUM_VMA_FLAG_BITS);
}
/*
@@ -1070,7 +1069,9 @@ static inline void vma_flags_clear_all(vma_flags_t *flags)
*/
static inline void vma_flags_overwrite_word(vma_flags_t *flags, unsigned long value)
{
- *ACCESS_PRIVATE(flags, __vma_flags) = value;
+ unsigned long *bitmap = flags->__vma_flags;
+
+ bitmap[0] = value;
}
/*
@@ -1081,7 +1082,7 @@ static inline void vma_flags_overwrite_word(vma_flags_t *flags, unsigned long va
*/
static inline void vma_flags_overwrite_word_once(vma_flags_t *flags, unsigned long value)
{
- unsigned long *bitmap = ACCESS_PRIVATE(flags, __vma_flags);
+ unsigned long *bitmap = flags->__vma_flags;
WRITE_ONCE(*bitmap, value);
}
@@ -1089,7 +1090,7 @@ static inline void vma_flags_overwrite_word_once(vma_flags_t *flags, unsigned lo
/* Update the first system word of VMA flags setting bits, non-atomically. */
static inline void vma_flags_set_word(vma_flags_t *flags, unsigned long value)
{
- unsigned long *bitmap = ACCESS_PRIVATE(flags, __vma_flags);
+ unsigned long *bitmap = flags->__vma_flags;
*bitmap |= value;
}
@@ -1097,7 +1098,7 @@ static inline void vma_flags_set_word(vma_flags_t *flags, unsigned long value)
/* Update the first system word of VMA flags clearing bits, non-atomically. */
static inline void vma_flags_clear_word(vma_flags_t *flags, unsigned long value)
{
- unsigned long *bitmap = ACCESS_PRIVATE(flags, __vma_flags);
+ unsigned long *bitmap = flags->__vma_flags;
*bitmap &= ~value;
}
diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h
index d1094c2d5fb6..07a2bbaf86e9 100644
--- a/include/linux/mmu_notifier.h
+++ b/include/linux/mmu_notifier.h
@@ -515,16 +515,17 @@ static inline void mmu_notifier_range_init_owner(
range->owner = owner;
}
-#define ptep_clear_flush_young_notify(__vma, __address, __ptep) \
+#define clear_flush_young_ptes_notify(__vma, __address, __ptep, __nr) \
({ \
int __young; \
struct vm_area_struct *___vma = __vma; \
unsigned long ___address = __address; \
- __young = ptep_clear_flush_young(___vma, ___address, __ptep); \
+ unsigned int ___nr = __nr; \
+ __young = clear_flush_young_ptes(___vma, ___address, __ptep, ___nr); \
__young |= mmu_notifier_clear_flush_young(___vma->vm_mm, \
___address, \
___address + \
- PAGE_SIZE); \
+ ___nr * PAGE_SIZE); \
__young; \
})
@@ -650,7 +651,7 @@ static inline void mmu_notifier_subscriptions_destroy(struct mm_struct *mm)
#define mmu_notifier_range_update_to_read_only(r) false
-#define ptep_clear_flush_young_notify ptep_clear_flush_young
+#define clear_flush_young_ptes_notify clear_flush_young_ptes
#define pmdp_clear_flush_young_notify pmdp_clear_flush_young
#define ptep_clear_young_notify ptep_test_and_clear_young
#define pmdp_clear_young_notify pmdp_test_and_clear_young
diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
index 827dca25c0bc..a50df42a893f 100644
--- a/include/linux/pgtable.h
+++ b/include/linux/pgtable.h
@@ -23,25 +23,6 @@
#endif
/*
- * On almost all architectures and configurations, 0 can be used as the
- * upper ceiling to free_pgtables(): on many architectures it has the same
- * effect as using TASK_SIZE. However, there is one configuration which
- * must impose a more careful limit, to avoid freeing kernel pgtables.
- */
-#ifndef USER_PGTABLES_CEILING
-#define USER_PGTABLES_CEILING 0UL
-#endif
-
-/*
- * This defines the first usable user address. Platforms
- * can override its value with custom FIRST_USER_ADDRESS
- * defined in their respective <asm/pgtable.h>.
- */
-#ifndef FIRST_USER_ADDRESS
-#define FIRST_USER_ADDRESS 0UL
-#endif
-
-/*
* This defines the generic helper for accessing PMD page
* table page. Although platforms can still override this
* via their respective <asm/pgtable.h>.
@@ -1087,6 +1068,41 @@ static inline void wrprotect_ptes(struct mm_struct *mm, unsigned long addr,
}
#endif
+#ifndef clear_flush_young_ptes
+/**
+ * clear_flush_young_ptes - Mark PTEs that map consecutive pages of the same
+ * folio as old and flush the TLB.
+ * @vma: The virtual memory area the pages are mapped into.
+ * @addr: Address the first page is mapped at.
+ * @ptep: Page table pointer for the first entry.
+ * @nr: Number of entries to clear access bit. Must be at least 1: the first
+ * entry is processed before the count is decremented and tested.
+ *
+ * May be overridden by the architecture; otherwise, implemented as a simple
+ * loop over ptep_clear_flush_young().
+ *
+ * Note that PTE bits in the PTE range besides the PFN can differ. For example,
+ * some PTEs might be write-protected.
+ *
+ * Context: The caller holds the page table lock. The PTEs map consecutive
+ * pages that belong to the same folio. The PTEs are all in the same PMD.
+ *
+ * Return: The bitwise OR of the ptep_clear_flush_young() results for all @nr
+ * entries.
+ */
+static inline int clear_flush_young_ptes(struct vm_area_struct *vma,
+ unsigned long addr, pte_t *ptep, unsigned int nr)
+{
+ int young = 0;
+
+ for (;;) {
+ young |= ptep_clear_flush_young(vma, addr, ptep);
+ if (--nr == 0)
+ break;
+ ptep++;
+ addr += PAGE_SIZE;
+ }
+
+ return young;
+}
+#endif
+
/*
* On some architectures hardware does not set page access bit when accessing
* memory page, it is responsibility of software setting this bit. It brings
@@ -1630,6 +1646,25 @@ void arch_sync_kernel_mappings(unsigned long start, unsigned long end);
#endif /* CONFIG_MMU */
/*
+ * On almost all architectures and configurations, 0 can be used as the
+ * upper ceiling to free_pgtables(): on many architectures it has the same
+ * effect as using TASK_SIZE. However, there is one configuration which
+ * must impose a more careful limit, to avoid freeing kernel pgtables.
+ */
+#ifndef USER_PGTABLES_CEILING
+#define USER_PGTABLES_CEILING 0UL
+#endif
+
+/*
+ * This defines the first usable user address. Platforms
+ * can override its value with custom FIRST_USER_ADDRESS
+ * defined in their respective <asm/pgtable.h>.
+ */
+#ifndef FIRST_USER_ADDRESS
+#define FIRST_USER_ADDRESS 0UL
+#endif
+
+/*
* No-op macros that just return the current protection value. Defined here
* because these macros can be used even if CONFIG_MMU is not defined.
*/
diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h
index e2069b3179c4..a8273b32e041 100644
--- a/include/linux/shmem_fs.h
+++ b/include/linux/shmem_fs.h
@@ -102,12 +102,10 @@ static inline struct shmem_inode_info *SHMEM_I(struct inode *inode)
extern const struct fs_parameter_spec shmem_fs_parameters[];
extern void shmem_init(void);
extern int shmem_init_fs_context(struct fs_context *fc);
-extern struct file *shmem_file_setup(const char *name,
- loff_t size, unsigned long flags);
-extern struct file *shmem_kernel_file_setup(const char *name, loff_t size,
- unsigned long flags);
+struct file *shmem_file_setup(const char *name, loff_t size, vma_flags_t flags);
+struct file *shmem_kernel_file_setup(const char *name, loff_t size, vma_flags_t vma_flags);
extern struct file *shmem_file_setup_with_mnt(struct vfsmount *mnt,
- const char *name, loff_t size, unsigned long flags);
+ const char *name, loff_t size, vma_flags_t flags);
int shmem_zero_setup(struct vm_area_struct *vma);
int shmem_zero_setup_desc(struct vm_area_desc *desc);
extern unsigned long shmem_get_unmapped_area(struct file *, unsigned long addr,
diff --git a/ipc/shm.c b/ipc/shm.c
index 3db36773dd10..e8c7d1924c50 100644
--- a/ipc/shm.c
+++ b/ipc/shm.c
@@ -707,9 +707,10 @@ static int newseg(struct ipc_namespace *ns, struct ipc_params *params)
int error;
struct shmid_kernel *shp;
size_t numpages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
+ const bool has_no_reserve = shmflg & SHM_NORESERVE;
+ vma_flags_t acctflag = EMPTY_VMA_FLAGS;
struct file *file;
char name[13];
- vm_flags_t acctflag = 0;
if (size < SHMMIN || size > ns->shm_ctlmax)
return -EINVAL;
@@ -749,8 +750,8 @@ static int newseg(struct ipc_namespace *ns, struct ipc_params *params)
hugesize = ALIGN(size, huge_page_size(hs));
/* hugetlb_file_setup applies strict accounting */
- if (shmflg & SHM_NORESERVE)
- acctflag = VM_NORESERVE;
+ if (has_no_reserve)
+ vma_flags_set(&acctflag, VMA_NORESERVE_BIT);
file = hugetlb_file_setup(name, hugesize, acctflag,
HUGETLB_SHMFS_INODE, (shmflg >> SHM_HUGE_SHIFT) & SHM_HUGE_MASK);
} else {
@@ -758,9 +759,8 @@ static int newseg(struct ipc_namespace *ns, struct ipc_params *params)
* Do not allow no accounting for OVERCOMMIT_NEVER, even
* if it's asked for.
*/
- if ((shmflg & SHM_NORESERVE) &&
- sysctl_overcommit_memory != OVERCOMMIT_NEVER)
- acctflag = VM_NORESERVE;
+ if (has_no_reserve && sysctl_overcommit_memory != OVERCOMMIT_NEVER)
+ vma_flags_set(&acctflag, VMA_NORESERVE_BIT);
file = shmem_kernel_file_setup(name, size, acctflag);
}
error = PTR_ERR(file);
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 832179236529..7607dfe516e6 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -4145,40 +4145,58 @@ bool cpuset_current_node_allowed(int node, gfp_t gfp_mask)
return allowed;
}
-bool cpuset_node_allowed(struct cgroup *cgroup, int nid)
+/**
+ * cpuset_nodes_allowed - return effective_mems mask from a cgroup cpuset.
+ * @cgroup: pointer to struct cgroup.
+ * @mask: pointer to struct nodemask_t to be returned.
+ *
+ * Copies the effective_mems mask of the cgroup's cpuset into @mask if it is
+ * cgroup v2 and has cpuset subsys. Otherwise, copies node_states[N_MEMORY].
+ *
+ * This function intentionally avoids taking the cpuset_mutex or callback_lock
+ * when accessing effective_mems. This is because the obtained effective_mems
+ * is stale immediately after the query anyway (e.g., effective_mems is updated
+ * immediately after releasing the lock but before returning).
+ *
+ * As a result, returned @mask may be empty because cs->effective_mems can be
+ * rebound during this call. Besides, nodes in @mask are not guaranteed to be
+ * online due to node hotplug. Callers should check the mask for validity on
+ * return based on its subsequent use.
+ **/
+void cpuset_nodes_allowed(struct cgroup *cgroup, nodemask_t *mask)
{
struct cgroup_subsys_state *css;
struct cpuset *cs;
- bool allowed;
/*
* In v1, mem_cgroup and cpuset are unlikely in the same hierarchy
* and mems_allowed is likely to be empty even if we could get to it,
- * so return true to avoid taking a global lock on the empty check.
+ * so return directly to avoid taking a global lock on the empty check.
*/
- if (!cpuset_v2())
- return true;
+ if (!cgroup || !cpuset_v2()) {
+ nodes_copy(*mask, node_states[N_MEMORY]);
+ return;
+ }
css = cgroup_get_e_css(cgroup, &cpuset_cgrp_subsys);
- if (!css)
- return true;
+ if (!css) {
+ nodes_copy(*mask, node_states[N_MEMORY]);
+ return;
+ }
/*
- * Normally, accessing effective_mems would require the cpuset_mutex
- * or callback_lock - but node_isset is atomic and the reference
- * taken via cgroup_get_e_css is sufficient to protect css.
- *
- * Since this interface is intended for use by migration paths, we
- * relax locking here to avoid taking global locks - while accepting
- * there may be rare scenarios where the result may be innaccurate.
+ * The reference taken via cgroup_get_e_css is sufficient to
+ * protect css, but it does not imply safe accesses to effective_mems.
*
- * Reclaim and migration are subject to these same race conditions, and
- * cannot make strong isolation guarantees, so this is acceptable.
+ * Normally, accessing effective_mems would require the cpuset_mutex
+ * or callback_lock - but the information is stale immediately after
+ * the query anyway. We do not acquire the lock during this process,
+ * accepting races against mems_allowed rebinds in exchange for
+ * reduced lock contention.
*/
cs = container_of(css, struct cpuset, css);
- allowed = node_isset(nid, cs->effective_mems);
+ nodes_copy(*mask, cs->effective_mems);
css_put(css);
- return allowed;
}
/**
diff --git a/kernel/relay.c b/kernel/relay.c
index 6ed6bc929bf9..5c665b729132 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -91,7 +91,7 @@ static int relay_mmap_prepare_buf(struct rchan_buf *buf,
return -EINVAL;
desc->vm_ops = &relay_file_mmap_ops;
- desc->vm_flags |= VM_DONTEXPAND;
+ vma_desc_set_flags(desc, VMA_DONTEXPAND_BIT);
desc->private_data = buf;
return 0;
diff --git a/mm/filemap.c b/mm/filemap.c
index ebd75684cb0a..6cd7974d4ada 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -4012,7 +4012,7 @@ int generic_file_readonly_mmap(struct file *file, struct vm_area_struct *vma)
int generic_file_readonly_mmap_prepare(struct vm_area_desc *desc)
{
- if (is_shared_maywrite(desc->vm_flags))
+ if (is_shared_maywrite(&desc->vma_flags))
return -EINVAL;
return generic_file_mmap_prepare(desc);
}
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 90182724d4cf..6e855a32de3d 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1193,16 +1193,16 @@ static void set_vma_resv_flags(struct vm_area_struct *vma, unsigned long flags)
static void set_vma_desc_resv_map(struct vm_area_desc *desc, struct resv_map *map)
{
- VM_WARN_ON_ONCE(!is_vm_hugetlb_flags(desc->vm_flags));
- VM_WARN_ON_ONCE(desc->vm_flags & VM_MAYSHARE);
+ VM_WARN_ON_ONCE(!is_vma_hugetlb_flags(&desc->vma_flags));
+ VM_WARN_ON_ONCE(vma_desc_test_flags(desc, VMA_MAYSHARE_BIT));
desc->private_data = map;
}
static void set_vma_desc_resv_flags(struct vm_area_desc *desc, unsigned long flags)
{
- VM_WARN_ON_ONCE(!is_vm_hugetlb_flags(desc->vm_flags));
- VM_WARN_ON_ONCE(desc->vm_flags & VM_MAYSHARE);
+ VM_WARN_ON_ONCE(!is_vma_hugetlb_flags(&desc->vma_flags));
+ VM_WARN_ON_ONCE(vma_desc_test_flags(desc, VMA_MAYSHARE_BIT));
desc->private_data = (void *)((unsigned long)desc->private_data | flags);
}
@@ -1216,7 +1216,7 @@ static int is_vma_resv_set(struct vm_area_struct *vma, unsigned long flag)
static bool is_vma_desc_resv_set(struct vm_area_desc *desc, unsigned long flag)
{
- VM_WARN_ON_ONCE(!is_vm_hugetlb_flags(desc->vm_flags));
+ VM_WARN_ON_ONCE(!is_vma_hugetlb_flags(&desc->vma_flags));
return ((unsigned long)desc->private_data) & flag;
}
@@ -6571,7 +6571,7 @@ next:
long hugetlb_reserve_pages(struct inode *inode,
long from, long to,
struct vm_area_desc *desc,
- vm_flags_t vm_flags)
+ vma_flags_t vma_flags)
{
long chg = -1, add = -1, spool_resv, gbl_resv;
struct hstate *h = hstate_inode(inode);
@@ -6592,7 +6592,7 @@ long hugetlb_reserve_pages(struct inode *inode,
* attempt will be made for VM_NORESERVE to allocate a page
* without using reserves
*/
- if (vm_flags & VM_NORESERVE)
+ if (vma_flags_test(&vma_flags, VMA_NORESERVE_BIT))
return 0;
/*
@@ -6601,7 +6601,7 @@ long hugetlb_reserve_pages(struct inode *inode,
* to reserve the full area even if read-only as mprotect() may be
* called to make the mapping read-write. Assume !desc is a shm mapping
*/
- if (!desc || desc->vm_flags & VM_MAYSHARE) {
+ if (!desc || vma_desc_test_flags(desc, VMA_MAYSHARE_BIT)) {
/*
* resv_map can not be NULL as hugetlb_reserve_pages is only
* called for inodes for which resv_maps were created (see
@@ -6635,7 +6635,7 @@ long hugetlb_reserve_pages(struct inode *inode,
if (err < 0)
goto out_err;
- if (desc && !(desc->vm_flags & VM_MAYSHARE) && h_cg) {
+ if (desc && !vma_desc_test_flags(desc, VMA_MAYSHARE_BIT) && h_cg) {
/* For private mappings, the hugetlb_cgroup uncharge info hangs
* of the resv_map.
*/
@@ -6672,7 +6672,7 @@ long hugetlb_reserve_pages(struct inode *inode,
* consumed reservations are stored in the map. Hence, nothing
* else has to be done for private mappings here
*/
- if (!desc || desc->vm_flags & VM_MAYSHARE) {
+ if (!desc || vma_desc_test_flags(desc, VMA_MAYSHARE_BIT)) {
add = region_add(resv_map, from, to, regions_needed, h, h_cg);
if (unlikely(add < 0)) {
@@ -6736,7 +6736,7 @@ out_uncharge_cgroup:
hugetlb_cgroup_uncharge_cgroup_rsvd(hstate_index(h),
chg * pages_per_huge_page(h), h_cg);
out_err:
- if (!desc || desc->vm_flags & VM_MAYSHARE)
+ if (!desc || vma_desc_test_flags(desc, VMA_MAYSHARE_BIT))
/* Only call region_abort if the region_chg succeeded but the
* region_add failed or didn't run.
*/
diff --git a/mm/internal.h b/mm/internal.h
index 7493d2b2743c..cb0af847d7d9 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -197,6 +197,9 @@ static inline void vma_close(struct vm_area_struct *vma)
}
}
+/* unmap_vmas is in mm/memory.c */
+void unmap_vmas(struct mmu_gather *tlb, struct unmap_desc *unmap);
+
#ifdef CONFIG_MMU
static inline void get_anon_vma(struct anon_vma *anon_vma)
@@ -509,9 +512,8 @@ bool __folio_end_writeback(struct folio *folio);
void deactivate_file_folio(struct folio *folio);
void folio_activate(struct folio *folio);
-void free_pgtables(struct mmu_gather *tlb, struct ma_state *mas,
- struct vm_area_struct *start_vma, unsigned long floor,
- unsigned long ceiling, bool mm_wr_locked);
+void free_pgtables(struct mmu_gather *tlb, struct unmap_desc *desc);
+
void pmd_install(struct mm_struct *mm, pmd_t *pmd, pgtable_t *pte);
struct zap_details;
@@ -1044,7 +1046,7 @@ extern long populate_vma_page_range(struct vm_area_struct *vma,
unsigned long start, unsigned long end, int *locked);
extern long faultin_page_range(struct mm_struct *mm, unsigned long start,
unsigned long end, bool write, int *locked);
-bool mlock_future_ok(const struct mm_struct *mm, vm_flags_t vm_flags,
+bool mlock_future_ok(const struct mm_struct *mm, bool is_vma_locked,
unsigned long bytes);
/*
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index ea7a3ad2a2c2..eff9e3061925 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -1732,7 +1732,7 @@ static bool file_backed_vma_is_retractable(struct vm_area_struct *vma)
* obtained on guard region installation after the flag is set, so this
* check being performed under this lock excludes races.
*/
- if (vma_flag_test_atomic(vma, VMA_MAYBE_GUARD_BIT))
+ if (vma_test_atomic_flag(vma, VMA_MAYBE_GUARD_BIT))
return false;
return true;
diff --git a/mm/madvise.c b/mm/madvise.c
index 1f3040688f04..8debb2d434aa 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -1140,7 +1140,7 @@ static long madvise_guard_install(struct madvise_behavior *madv_behavior)
* acquire an mmap/VMA write lock to read it. All remaining readers may
* or may not see the flag set, but we don't care.
*/
- vma_flag_set_atomic(vma, VMA_MAYBE_GUARD_BIT);
+ vma_set_atomic_flag(vma, VMA_MAYBE_GUARD_BIT);
/*
* If anonymous and we are establishing page tables the VMA ought to
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index b730233a481d..f2b87e02574e 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -5649,9 +5649,21 @@ subsys_initcall(mem_cgroup_swap_init);
#endif /* CONFIG_SWAP */
-bool mem_cgroup_node_allowed(struct mem_cgroup *memcg, int nid)
+void mem_cgroup_node_filter_allowed(struct mem_cgroup *memcg, nodemask_t *mask)
{
- return memcg ? cpuset_node_allowed(memcg->css.cgroup, nid) : true;
+ nodemask_t allowed;
+
+ if (!memcg)
+ return;
+
+ /*
+ * Since this interface is intended for use by migration paths, and
+ * reclaim and migration are subject to race conditions such as changes
+ * in effective_mems and hot-unplugging of nodes, an inaccurate allowed
+ * mask is acceptable.
+ */
+ cpuset_nodes_allowed(memcg->css.cgroup, &allowed);
+ nodes_and(*mask, *mask, allowed);
}
void mem_cgroup_show_protected_memory(struct mem_cgroup *memcg)
diff --git a/mm/memfd.c b/mm/memfd.c
index 82a3f38aa30a..919c2a53eb96 100644
--- a/mm/memfd.c
+++ b/mm/memfd.c
@@ -86,7 +86,7 @@ struct folio *memfd_alloc_folio(struct file *memfd, pgoff_t idx)
gfp_mask &= ~(__GFP_HIGHMEM | __GFP_MOVABLE);
idx >>= huge_page_order(h);
- nr_resv = hugetlb_reserve_pages(inode, idx, idx + 1, NULL, 0);
+ nr_resv = hugetlb_reserve_pages(inode, idx, idx + 1, NULL, EMPTY_VMA_FLAGS);
if (nr_resv < 0)
return ERR_PTR(nr_resv);
@@ -463,12 +463,12 @@ struct file *memfd_alloc_file(const char *name, unsigned int flags)
int err = 0;
if (flags & MFD_HUGETLB) {
- file = hugetlb_file_setup(name, 0, VM_NORESERVE,
+ file = hugetlb_file_setup(name, 0, mk_vma_flags(VMA_NORESERVE_BIT),
HUGETLB_ANONHUGE_INODE,
(flags >> MFD_HUGE_SHIFT) &
MFD_HUGE_MASK);
} else {
- file = shmem_file_setup(name, 0, VM_NORESERVE);
+ file = shmem_file_setup(name, 0, mk_vma_flags(VMA_NORESERVE_BIT));
}
if (IS_ERR(file))
return file;
diff --git a/mm/memory-tiers.c b/mm/memory-tiers.c
index 0ae8bec86346..545e34626df7 100644
--- a/mm/memory-tiers.c
+++ b/mm/memory-tiers.c
@@ -320,16 +320,17 @@ void node_get_allowed_targets(pg_data_t *pgdat, nodemask_t *targets)
/**
* next_demotion_node() - Get the next node in the demotion path
* @node: The starting node to lookup the next node
+ * @allowed_mask: The pointer to allowed node mask
*
* Return: node id for next memory node in the demotion path hierarchy
* from @node; NUMA_NO_NODE if @node is terminal. This does not keep
* @node online or guarantee that it *continues* to be the next demotion
* target.
*/
-int next_demotion_node(int node)
+int next_demotion_node(int node, const nodemask_t *allowed_mask)
{
struct demotion_nodes *nd;
- int target;
+ nodemask_t mask;
if (!node_demotion)
return NUMA_NO_NODE;
@@ -344,6 +345,10 @@ int next_demotion_node(int node)
* node_demotion[] reads need to be consistent.
*/
rcu_read_lock();
+ /* Filter out nodes that are not in allowed_mask. */
+ nodes_and(mask, nd->preferred, *allowed_mask);
+ rcu_read_unlock();
+
/*
* If there are multiple target nodes, just select one
* target node randomly.
@@ -356,10 +361,16 @@ int next_demotion_node(int node)
* caching issue, which seems more complicated. So selecting
* target node randomly seems better until now.
*/
- target = node_random(&nd->preferred);
- rcu_read_unlock();
+ if (!nodes_empty(mask))
+ return node_random(&mask);
- return target;
+ /*
+ * Preferred nodes are not in allowed_mask. Flip the bits in
+ * allowed_mask to build the used-node mask, then use it to find
+ * the closest demotion target.
+ */
+ nodes_complement(mask, *allowed_mask);
+ return find_next_best_node(node, &mask);
}
static void disable_all_demotion_targets(void)
diff --git a/mm/memory.c b/mm/memory.c
index b0d487229b2e..876bf73959c6 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -370,11 +370,32 @@ void free_pgd_range(struct mmu_gather *tlb,
} while (pgd++, addr = next, addr != end);
}
-void free_pgtables(struct mmu_gather *tlb, struct ma_state *mas,
- struct vm_area_struct *vma, unsigned long floor,
- unsigned long ceiling, bool mm_wr_locked)
+/**
+ * free_pgtables() - Free a range of page tables
+ * @tlb: The mmu gather
+ * @unmap: The unmap_desc
+ *
+ * Note: pg_start and pg_end are provided to indicate the absolute range of the
+ * page tables that should be removed. This can differ from the vma mappings on
+ * some archs that may have mappings that need to be removed outside the vmas.
+ * Note that the prev->vm_end and next->vm_start are often used.
+ *
+ * The vma_end differs from the pg_end when a dup_mmap() failed and the tree has
+ * data unrelated to the mm_struct being torn down.
+ */
+void free_pgtables(struct mmu_gather *tlb, struct unmap_desc *unmap)
{
struct unlink_vma_file_batch vb;
+ struct ma_state *mas = unmap->mas;
+ struct vm_area_struct *vma = unmap->first;
+
+ /*
+ * Note: USER_PGTABLES_CEILING may be passed as the value of pg_end and
+ * may be 0. Underflow is expected in this case. Otherwise the
+ * pagetable end is exclusive. vma_end is exclusive. The last vma
+ * address should never be larger than the pagetable end.
+ */
+ WARN_ON_ONCE(unmap->vma_end - 1 > unmap->pg_end - 1);
tlb_free_vmas(tlb);
@@ -382,19 +403,13 @@ void free_pgtables(struct mmu_gather *tlb, struct ma_state *mas,
unsigned long addr = vma->vm_start;
struct vm_area_struct *next;
- /*
- * Note: USER_PGTABLES_CEILING may be passed as ceiling and may
- * be 0. This will underflow and is okay.
- */
- next = mas_find(mas, ceiling - 1);
- if (unlikely(xa_is_zero(next)))
- next = NULL;
+ next = mas_find(mas, unmap->tree_end - 1);
/*
* Hide vma from rmap and truncate_pagecache before freeing
* pgtables
*/
- if (mm_wr_locked)
+ if (unmap->mm_wr_locked)
vma_start_write(vma);
unlink_anon_vmas(vma);
@@ -406,18 +421,16 @@ void free_pgtables(struct mmu_gather *tlb, struct ma_state *mas,
*/
while (next && next->vm_start <= vma->vm_end + PMD_SIZE) {
vma = next;
- next = mas_find(mas, ceiling - 1);
- if (unlikely(xa_is_zero(next)))
- next = NULL;
- if (mm_wr_locked)
+ next = mas_find(mas, unmap->tree_end - 1);
+ if (unmap->mm_wr_locked)
vma_start_write(vma);
unlink_anon_vmas(vma);
unlink_file_vma_batch_add(&vb, vma);
}
unlink_file_vma_batch_final(&vb);
- free_pgd_range(tlb, addr, vma->vm_end,
- floor, next ? next->vm_start : ceiling);
+ free_pgd_range(tlb, addr, vma->vm_end, unmap->pg_start,
+ next ? next->vm_start : unmap->pg_end);
vma = next;
} while (vma);
}
@@ -2124,11 +2137,7 @@ static void unmap_single_vma(struct mmu_gather *tlb,
/**
* unmap_vmas - unmap a range of memory covered by a list of vma's
* @tlb: address of the caller's struct mmu_gather
- * @mas: the maple state
- * @vma: the starting vma
- * @start_addr: virtual address at which to start unmapping
- * @end_addr: virtual address at which to end unmapping
- * @tree_end: The maximum index to check
+ * @unmap: The unmap_desc
*
* Unmap all pages in the vma list.
*
@@ -2141,10 +2150,9 @@ static void unmap_single_vma(struct mmu_gather *tlb,
* ensure that any thus-far unmapped pages are flushed before unmap_vmas()
* drops the lock and schedules.
*/
-void unmap_vmas(struct mmu_gather *tlb, struct ma_state *mas,
- struct vm_area_struct *vma, unsigned long start_addr,
- unsigned long end_addr, unsigned long tree_end)
+void unmap_vmas(struct mmu_gather *tlb, struct unmap_desc *unmap)
{
+ struct vm_area_struct *vma;
struct mmu_notifier_range range;
struct zap_details details = {
.zap_flags = ZAP_FLAG_DROP_MARKER | ZAP_FLAG_UNMAP,
@@ -2152,17 +2160,18 @@ void unmap_vmas(struct mmu_gather *tlb, struct ma_state *mas,
.even_cows = true,
};
+ vma = unmap->first;
mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma->vm_mm,
- start_addr, end_addr);
+ unmap->vma_start, unmap->vma_end);
mmu_notifier_invalidate_range_start(&range);
do {
- unsigned long start = start_addr;
- unsigned long end = end_addr;
+ unsigned long start = unmap->vma_start;
+ unsigned long end = unmap->vma_end;
hugetlb_zap_begin(vma, &start, &end);
unmap_single_vma(tlb, vma, start, end, &details);
hugetlb_zap_end(vma, &details);
- vma = mas_find(mas, tree_end - 1);
- } while (vma && likely(!xa_is_zero(vma)));
+ vma = mas_find(unmap->mas, unmap->tree_end - 1);
+ } while (vma);
mmu_notifier_invalidate_range_end(&range);
}
@@ -2948,7 +2957,7 @@ static inline int remap_p4d_range(struct mm_struct *mm, pgd_t *pgd,
return 0;
}
-static int get_remap_pgoff(vm_flags_t vm_flags, unsigned long addr,
+static int get_remap_pgoff(bool is_cow, unsigned long addr,
unsigned long end, unsigned long vm_start, unsigned long vm_end,
unsigned long pfn, pgoff_t *vm_pgoff_p)
{
@@ -2958,7 +2967,7 @@ static int get_remap_pgoff(vm_flags_t vm_flags, unsigned long addr,
* un-COW'ed pages by matching them up with "vma->vm_pgoff".
* See vm_normal_page() for details.
*/
- if (is_cow_mapping(vm_flags)) {
+ if (is_cow) {
if (addr != vm_start || end != vm_end)
return -EINVAL;
*vm_pgoff_p = pfn;
@@ -2979,7 +2988,7 @@ static int remap_pfn_range_internal(struct vm_area_struct *vma, unsigned long ad
if (WARN_ON_ONCE(!PAGE_ALIGNED(addr)))
return -EINVAL;
- VM_WARN_ON_ONCE((vma->vm_flags & VM_REMAP_FLAGS) != VM_REMAP_FLAGS);
+ VM_WARN_ON_ONCE(!vma_test_all_flags_mask(vma, VMA_REMAP_FLAGS));
BUG_ON(addr >= end);
pfn -= addr >> PAGE_SHIFT;
@@ -3103,9 +3112,9 @@ void remap_pfn_range_prepare(struct vm_area_desc *desc, unsigned long pfn)
* check it again on complete and will fail there if specified addr is
* invalid.
*/
- get_remap_pgoff(desc->vm_flags, desc->start, desc->end,
+ get_remap_pgoff(vma_desc_is_cow_mapping(desc), desc->start, desc->end,
desc->start, desc->end, pfn, &desc->pgoff);
- desc->vm_flags |= VM_REMAP_FLAGS;
+ vma_desc_set_flags_mask(desc, VMA_REMAP_FLAGS);
}
static int remap_pfn_range_prepare_vma(struct vm_area_struct *vma, unsigned long addr,
@@ -3114,13 +3123,12 @@ static int remap_pfn_range_prepare_vma(struct vm_area_struct *vma, unsigned long
unsigned long end = addr + PAGE_ALIGN(size);
int err;
- err = get_remap_pgoff(vma->vm_flags, addr, end,
- vma->vm_start, vma->vm_end,
- pfn, &vma->vm_pgoff);
+ err = get_remap_pgoff(is_cow_mapping(vma->vm_flags), addr, end,
+ vma->vm_start, vma->vm_end, pfn, &vma->vm_pgoff);
if (err)
return err;
- vm_flags_set(vma, VM_REMAP_FLAGS);
+ vma_set_flags_mask(vma, VMA_REMAP_FLAGS);
return 0;
}
@@ -7316,7 +7324,7 @@ void folio_zero_user(struct folio *folio, unsigned long addr_hint)
const unsigned long base_addr = ALIGN_DOWN(addr_hint, folio_size(folio));
const long fault_idx = (addr_hint - base_addr) / PAGE_SIZE;
const struct range pg = DEFINE_RANGE(0, folio_nr_pages(folio) - 1);
- const int radius = FOLIO_ZERO_LOCALITY_RADIUS;
+ const long radius = FOLIO_ZERO_LOCALITY_RADIUS;
struct range r[3];
int i;
@@ -7324,20 +7332,19 @@ void folio_zero_user(struct folio *folio, unsigned long addr_hint)
* Faulting page and its immediate neighbourhood. Will be cleared at the
* end to keep its cachelines hot.
*/
- r[2] = DEFINE_RANGE(clamp_t(s64, fault_idx - radius, pg.start, pg.end),
- clamp_t(s64, fault_idx + radius, pg.start, pg.end));
+ r[2] = DEFINE_RANGE(fault_idx - radius < (long)pg.start ? pg.start : fault_idx - radius,
+ fault_idx + radius > (long)pg.end ? pg.end : fault_idx + radius);
+
/* Region to the left of the fault */
- r[1] = DEFINE_RANGE(pg.start,
- clamp_t(s64, r[2].start - 1, pg.start - 1, r[2].start));
+ r[1] = DEFINE_RANGE(pg.start, r[2].start - 1);
/* Region to the right of the fault: always valid for the common fault_idx=0 case. */
- r[0] = DEFINE_RANGE(clamp_t(s64, r[2].end + 1, r[2].end, pg.end + 1),
- pg.end);
+ r[0] = DEFINE_RANGE(r[2].end + 1, pg.end);
for (i = 0; i < ARRAY_SIZE(r); i++) {
const unsigned long addr = base_addr + r[i].start * PAGE_SIZE;
- const unsigned int nr_pages = range_len(&r[i]);
+ const long nr_pages = (long)range_len(&r[i]);
struct page *page = folio_page(folio, r[i].start);
if (nr_pages > 0)
diff --git a/mm/mmap.c b/mm/mmap.c
index 4bdb9ffa9e25..843160946aa5 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -108,7 +108,8 @@ static int check_brk_limits(unsigned long addr, unsigned long len)
if (IS_ERR_VALUE(mapped_addr))
return mapped_addr;
- return mlock_future_ok(current->mm, current->mm->def_flags, len)
+ return mlock_future_ok(current->mm,
+ current->mm->def_flags & VM_LOCKED, len)
? 0 : -EAGAIN;
}
@@ -225,12 +226,12 @@ static inline unsigned long round_hint_to_min(unsigned long hint)
return hint;
}
-bool mlock_future_ok(const struct mm_struct *mm, vm_flags_t vm_flags,
- unsigned long bytes)
+bool mlock_future_ok(const struct mm_struct *mm, bool is_vma_locked,
+ unsigned long bytes)
{
unsigned long locked_pages, limit_pages;
- if (!(vm_flags & VM_LOCKED) || capable(CAP_IPC_LOCK))
+ if (!is_vma_locked || capable(CAP_IPC_LOCK))
return true;
locked_pages = bytes >> PAGE_SHIFT;
@@ -416,7 +417,7 @@ unsigned long do_mmap(struct file *file, unsigned long addr,
if (!can_do_mlock())
return -EPERM;
- if (!mlock_future_ok(mm, vm_flags, len))
+ if (!mlock_future_ok(mm, vm_flags & VM_LOCKED, len))
return -EAGAIN;
if (file) {
@@ -594,7 +595,7 @@ unsigned long ksys_mmap_pgoff(unsigned long addr, unsigned long len,
* taken when vm_ops->mmap() is called
*/
file = hugetlb_file_setup(HUGETLB_ANON_FILE, len,
- VM_NORESERVE,
+ mk_vma_flags(VMA_NORESERVE_BIT),
HUGETLB_ANONHUGE_INODE,
(flags >> MAP_HUGE_SHIFT) & MAP_HUGE_MASK);
if (IS_ERR(file))
@@ -1247,6 +1248,29 @@ limits_failed:
}
EXPORT_SYMBOL(vm_brk_flags);
+static
+unsigned long tear_down_vmas(struct mm_struct *mm, struct vma_iterator *vmi,
+ struct vm_area_struct *vma, unsigned long end)
+{
+ unsigned long nr_accounted = 0;
+ int count = 0;
+
+ mmap_assert_write_locked(mm);
+ vma_iter_set(vmi, vma->vm_end);
+ do {
+ if (vma->vm_flags & VM_ACCOUNT)
+ nr_accounted += vma_pages(vma);
+ vma_mark_detached(vma);
+ remove_vma(vma);
+ count++;
+ cond_resched();
+ vma = vma_next(vmi);
+ } while (vma && vma->vm_end <= end);
+
+ VM_WARN_ON_ONCE(count != mm->map_count);
+ return nr_accounted;
+}
+
/* Release all mmaps. */
void exit_mmap(struct mm_struct *mm)
{
@@ -1254,7 +1278,7 @@ void exit_mmap(struct mm_struct *mm)
struct vm_area_struct *vma;
unsigned long nr_accounted = 0;
VMA_ITERATOR(vmi, mm, 0);
- int count = 0;
+ struct unmap_desc unmap;
/* mm's last user has gone, and its about to be pulled down */
mmu_notifier_release(mm);
@@ -1263,18 +1287,19 @@ void exit_mmap(struct mm_struct *mm)
arch_exit_mmap(mm);
vma = vma_next(&vmi);
- if (!vma || unlikely(xa_is_zero(vma))) {
+ if (!vma) {
/* Can happen if dup_mmap() received an OOM */
mmap_read_unlock(mm);
mmap_write_lock(mm);
goto destroy;
}
+ unmap_all_init(&unmap, &vmi, vma);
flush_cache_mm(mm);
tlb_gather_mmu_fullmm(&tlb, mm);
/* update_hiwater_rss(mm) here? but nobody should be looking */
/* Use ULONG_MAX here to ensure all VMAs in the mm are unmapped */
- unmap_vmas(&tlb, &vmi.mas, vma, 0, ULONG_MAX, ULONG_MAX);
+ unmap_vmas(&tlb, &unmap);
mmap_read_unlock(mm);
/*
@@ -1283,10 +1308,10 @@ void exit_mmap(struct mm_struct *mm)
*/
mm_flags_set(MMF_OOM_SKIP, mm);
mmap_write_lock(mm);
+ unmap.mm_wr_locked = true;
mt_clear_in_rcu(&mm->mm_mt);
- vma_iter_set(&vmi, vma->vm_end);
- free_pgtables(&tlb, &vmi.mas, vma, FIRST_USER_ADDRESS,
- USER_PGTABLES_CEILING, true);
+ unmap_pgtable_init(&unmap, &vmi);
+ free_pgtables(&tlb, &unmap);
tlb_finish_mmu(&tlb);
/*
@@ -1294,22 +1319,11 @@ void exit_mmap(struct mm_struct *mm)
* enabled, without holding any MM locks besides the unreachable
* mmap_write_lock.
*/
- vma_iter_set(&vmi, vma->vm_end);
- do {
- if (vma->vm_flags & VM_ACCOUNT)
- nr_accounted += vma_pages(vma);
- vma_mark_detached(vma);
- remove_vma(vma);
- count++;
- cond_resched();
- vma = vma_next(&vmi);
- } while (vma && likely(!xa_is_zero(vma)));
+ nr_accounted = tear_down_vmas(mm, &vmi, vma, ULONG_MAX);
- BUG_ON(count != mm->map_count);
-
- trace_exit_mmap(mm);
destroy:
__mt_destroy(&mm->mm_mt);
+ trace_exit_mmap(mm);
mmap_write_unlock(mm);
vm_unacct_memory(nr_accounted);
}
@@ -1840,20 +1854,46 @@ loop_out:
ksm_fork(mm, oldmm);
khugepaged_fork(mm, oldmm);
} else {
+ unsigned long end;
/*
- * The entire maple tree has already been duplicated. If the
- * mmap duplication fails, mark the failure point with
- * XA_ZERO_ENTRY. In exit_mmap(), if this marker is encountered,
- * stop releasing VMAs that have not been duplicated after this
- * point.
+ * The entire maple tree has already been duplicated, but
+ * replacing the vmas failed at mpnt (which could be NULL if
+ * all were allocated but the last vma was not fully set up).
+ * Use the start address of the failure point to clean up the
+ * partially initialized tree.
*/
- if (mpnt) {
- mas_set_range(&vmi.mas, mpnt->vm_start, mpnt->vm_end - 1);
- mas_store(&vmi.mas, XA_ZERO_ENTRY);
- /* Avoid OOM iterating a broken tree */
- mm_flags_set(MMF_OOM_SKIP, mm);
+ if (!mm->map_count) {
+ /* zero vmas were written to the new tree. */
+ end = 0;
+ } else if (mpnt) {
+ /* partial tree failure */
+ end = mpnt->vm_start;
+ } else {
+ /* All vmas were written to the new tree */
+ end = ULONG_MAX;
+ }
+
+ /* Hide mm from oom killer because the memory is being freed */
+ mm_flags_set(MMF_OOM_SKIP, mm);
+ if (end) {
+ vma_iter_set(&vmi, 0);
+ tmp = vma_next(&vmi);
+ UNMAP_STATE(unmap, &vmi, /* first = */ tmp,
+ /* vma_start = */ 0, /* vma_end = */ end,
+ /* prev = */ NULL, /* next = */ NULL);
+
+ /*
+ * Don't iterate over vmas beyond the failure point for
+ * both unmap_vmas() and free_pgtables().
+ */
+ unmap.tree_end = end;
+ flush_cache_mm(mm);
+ unmap_region(&unmap);
+ charge = tear_down_vmas(mm, &vmi, tmp, end);
+ vm_unacct_memory(charge);
}
+ __mt_destroy(&mm->mm_mt);
/*
* The mm_struct is going to exit, but the locks will be dropped
* first. Set the mm_struct as unstable is advisable as it is
diff --git a/mm/mremap.c b/mm/mremap.c
index 8391ae17de64..2be876a70cc0 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -1740,7 +1740,7 @@ static int check_prep_vma(struct vma_remap_struct *vrm)
if (vma->vm_flags & (VM_DONTEXPAND | VM_PFNMAP))
return -EFAULT;
- if (!mlock_future_ok(mm, vma->vm_flags, vrm->delta))
+ if (!mlock_future_ok(mm, vma->vm_flags & VM_LOCKED, vrm->delta))
return -EAGAIN;
if (!may_expand_vm(mm, vma->vm_flags, vrm->delta >> PAGE_SHIFT))
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 167d4b710786..fcc32737f451 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1429,6 +1429,7 @@ __always_inline bool __free_pages_prepare(struct page *page,
page_cpupid_reset_last(page);
page->flags.f &= ~PAGE_FLAGS_CHECK_AT_PREP;
+ page->private = 0;
reset_page_owner(page, order);
page_table_check_free(page, order);
pgalloc_tag_sub(page, 1 << order);
diff --git a/mm/rmap.c b/mm/rmap.c
index ab099405151f..0f00570d1b9e 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -913,9 +913,11 @@ static bool folio_referenced_one(struct folio *folio,
struct folio_referenced_arg *pra = arg;
DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address, 0);
int ptes = 0, referenced = 0;
+ unsigned int nr;
while (page_vma_mapped_walk(&pvmw)) {
address = pvmw.address;
+ nr = 1;
if (vma->vm_flags & VM_LOCKED) {
ptes++;
@@ -960,9 +962,21 @@ static bool folio_referenced_one(struct folio *folio,
if (lru_gen_look_around(&pvmw))
referenced++;
} else if (pvmw.pte) {
- if (ptep_clear_flush_young_notify(vma, address,
- pvmw.pte))
+ if (folio_test_large(folio)) {
+ unsigned long end_addr = pmd_addr_end(address, vma->vm_end);
+ unsigned int max_nr = (end_addr - address) >> PAGE_SHIFT;
+ pte_t pteval = ptep_get(pvmw.pte);
+
+ nr = folio_pte_batch(folio, pvmw.pte,
+ pteval, max_nr);
+ }
+
+ ptes += nr;
+ if (clear_flush_young_ptes_notify(vma, address, pvmw.pte, nr))
referenced++;
+ /* Skip the batched PTEs */
+ pvmw.pte += nr - 1;
+ pvmw.address += (nr - 1) * PAGE_SIZE;
} else if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) {
if (pmdp_clear_flush_young_notify(vma, address,
pvmw.pmd))
@@ -972,7 +986,15 @@ static bool folio_referenced_one(struct folio *folio,
WARN_ON_ONCE(1);
}
- pra->mapcount--;
+ pra->mapcount -= nr;
+ /*
+ * If we are sure that we batched the entire folio,
+ * we can just optimize and stop right here.
+ */
+ if (ptes == pvmw.nr_pages) {
+ page_vma_mapped_walk_done(&pvmw);
+ break;
+ }
}
if (referenced)
@@ -1923,12 +1945,16 @@ static inline unsigned int folio_unmap_pte_batch(struct folio *folio,
end_addr = pmd_addr_end(addr, vma->vm_end);
max_nr = (end_addr - addr) >> PAGE_SHIFT;
- /* We only support lazyfree batching for now ... */
- if (!folio_test_anon(folio) || folio_test_swapbacked(folio))
+ /* We only support lazyfree or file folios batching for now ... */
+ if (folio_test_anon(folio) && folio_test_swapbacked(folio))
return 1;
+
if (pte_unused(pte))
return 1;
+ if (userfaultfd_wp(vma))
+ return 1;
+
return folio_pte_batch(folio, pvmw->pte, pte, max_nr);
}
@@ -2291,7 +2317,7 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
*
* See Documentation/mm/mmu_notifier.rst
*/
- dec_mm_counter(mm, mm_counter_file(folio));
+ add_mm_counter(mm, mm_counter_file(folio), -nr_pages);
}
discard:
if (unlikely(folio_test_hugetlb(folio))) {
diff --git a/mm/secretmem.c b/mm/secretmem.c
index edf111e0a1bb..11a779c812a7 100644
--- a/mm/secretmem.c
+++ b/mm/secretmem.c
@@ -122,13 +122,12 @@ static int secretmem_mmap_prepare(struct vm_area_desc *desc)
{
const unsigned long len = vma_desc_size(desc);
- if ((desc->vm_flags & (VM_SHARED | VM_MAYSHARE)) == 0)
+ if (!vma_desc_test_flags(desc, VMA_SHARED_BIT, VMA_MAYSHARE_BIT))
return -EINVAL;
- if (!mlock_future_ok(desc->mm, desc->vm_flags | VM_LOCKED, len))
+ vma_desc_set_flags(desc, VMA_LOCKED_BIT, VMA_DONTDUMP_BIT);
+ if (!mlock_future_ok(desc->mm, /*is_vma_locked=*/ true, len))
return -EAGAIN;
-
- desc->vm_flags |= VM_LOCKED | VM_DONTDUMP;
desc->vm_ops = &secretmem_vm_ops;
return 0;
diff --git a/mm/shmem.c b/mm/shmem.c
index c40d786a21c6..d129f4eb5ca9 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -3062,9 +3062,9 @@ static struct offset_ctx *shmem_get_offset_ctx(struct inode *inode)
}
static struct inode *__shmem_get_inode(struct mnt_idmap *idmap,
- struct super_block *sb,
- struct inode *dir, umode_t mode,
- dev_t dev, unsigned long flags)
+ struct super_block *sb,
+ struct inode *dir, umode_t mode,
+ dev_t dev, vma_flags_t flags)
{
struct inode *inode;
struct shmem_inode_info *info;
@@ -3092,7 +3092,8 @@ static struct inode *__shmem_get_inode(struct mnt_idmap *idmap,
spin_lock_init(&info->lock);
atomic_set(&info->stop_eviction, 0);
info->seals = F_SEAL_SEAL;
- info->flags = (flags & VM_NORESERVE) ? SHMEM_F_NORESERVE : 0;
+ info->flags = vma_flags_test(&flags, VMA_NORESERVE_BIT)
+ ? SHMEM_F_NORESERVE : 0;
info->i_crtime = inode_get_mtime(inode);
info->fsflags = (dir == NULL) ? 0 :
SHMEM_I(dir)->fsflags & SHMEM_FL_INHERITED;
@@ -3145,7 +3146,7 @@ static struct inode *__shmem_get_inode(struct mnt_idmap *idmap,
#ifdef CONFIG_TMPFS_QUOTA
static struct inode *shmem_get_inode(struct mnt_idmap *idmap,
struct super_block *sb, struct inode *dir,
- umode_t mode, dev_t dev, unsigned long flags)
+ umode_t mode, dev_t dev, vma_flags_t flags)
{
int err;
struct inode *inode;
@@ -3171,9 +3172,9 @@ errout:
return ERR_PTR(err);
}
#else
-static inline struct inode *shmem_get_inode(struct mnt_idmap *idmap,
+static struct inode *shmem_get_inode(struct mnt_idmap *idmap,
struct super_block *sb, struct inode *dir,
- umode_t mode, dev_t dev, unsigned long flags)
+ umode_t mode, dev_t dev, vma_flags_t flags)
{
return __shmem_get_inode(idmap, sb, dir, mode, dev, flags);
}
@@ -3880,7 +3881,8 @@ shmem_mknod(struct mnt_idmap *idmap, struct inode *dir,
if (!generic_ci_validate_strict_name(dir, &dentry->d_name))
return -EINVAL;
- inode = shmem_get_inode(idmap, dir->i_sb, dir, mode, dev, VM_NORESERVE);
+ inode = shmem_get_inode(idmap, dir->i_sb, dir, mode, dev,
+ mk_vma_flags(VMA_NORESERVE_BIT));
if (IS_ERR(inode))
return PTR_ERR(inode);
@@ -3915,7 +3917,8 @@ shmem_tmpfile(struct mnt_idmap *idmap, struct inode *dir,
struct inode *inode;
int error;
- inode = shmem_get_inode(idmap, dir->i_sb, dir, mode, 0, VM_NORESERVE);
+ inode = shmem_get_inode(idmap, dir->i_sb, dir, mode, 0,
+ mk_vma_flags(VMA_NORESERVE_BIT));
if (IS_ERR(inode)) {
error = PTR_ERR(inode);
goto err_out;
@@ -4112,7 +4115,7 @@ static int shmem_symlink(struct mnt_idmap *idmap, struct inode *dir,
return -ENAMETOOLONG;
inode = shmem_get_inode(idmap, dir->i_sb, dir, S_IFLNK | 0777, 0,
- VM_NORESERVE);
+ mk_vma_flags(VMA_NORESERVE_BIT));
if (IS_ERR(inode))
return PTR_ERR(inode);
@@ -5113,7 +5116,8 @@ static int shmem_fill_super(struct super_block *sb, struct fs_context *fc)
#endif /* CONFIG_TMPFS_QUOTA */
inode = shmem_get_inode(&nop_mnt_idmap, sb, NULL,
- S_IFDIR | sbinfo->mode, 0, VM_NORESERVE);
+ S_IFDIR | sbinfo->mode, 0,
+ mk_vma_flags(VMA_NORESERVE_BIT));
if (IS_ERR(inode)) {
error = PTR_ERR(inode);
goto failed;
@@ -5814,7 +5818,7 @@ static inline void shmem_unacct_size(unsigned long flags, loff_t size)
static inline struct inode *shmem_get_inode(struct mnt_idmap *idmap,
struct super_block *sb, struct inode *dir,
- umode_t mode, dev_t dev, unsigned long flags)
+ umode_t mode, dev_t dev, vma_flags_t flags)
{
struct inode *inode = ramfs_get_inode(sb, dir, mode, dev);
return inode ? inode : ERR_PTR(-ENOSPC);
@@ -5825,10 +5829,11 @@ static inline struct inode *shmem_get_inode(struct mnt_idmap *idmap,
/* common code */
static struct file *__shmem_file_setup(struct vfsmount *mnt, const char *name,
- loff_t size, unsigned long vm_flags,
+ loff_t size, vma_flags_t flags,
unsigned int i_flags)
{
- unsigned long flags = (vm_flags & VM_NORESERVE) ? SHMEM_F_NORESERVE : 0;
+ const unsigned long shmem_flags =
+ vma_flags_test(&flags, VMA_NORESERVE_BIT) ? SHMEM_F_NORESERVE : 0;
struct inode *inode;
struct file *res;
@@ -5841,13 +5846,13 @@ static struct file *__shmem_file_setup(struct vfsmount *mnt, const char *name,
if (is_idmapped_mnt(mnt))
return ERR_PTR(-EINVAL);
- if (shmem_acct_size(flags, size))
+ if (shmem_acct_size(shmem_flags, size))
return ERR_PTR(-ENOMEM);
inode = shmem_get_inode(&nop_mnt_idmap, mnt->mnt_sb, NULL,
- S_IFREG | S_IRWXUGO, 0, vm_flags);
+ S_IFREG | S_IRWXUGO, 0, flags);
if (IS_ERR(inode)) {
- shmem_unacct_size(flags, size);
+ shmem_unacct_size(shmem_flags, size);
return ERR_CAST(inode);
}
inode->i_flags |= i_flags;
@@ -5870,9 +5875,10 @@ static struct file *__shmem_file_setup(struct vfsmount *mnt, const char *name,
* checks are provided at the key or shm level rather than the inode.
* @name: name for dentry (to be seen in /proc/<pid>/maps)
* @size: size to be set for the file
- * @flags: VM_NORESERVE suppresses pre-accounting of the entire object size
+ * @flags: VMA_NORESERVE_BIT suppresses pre-accounting of the entire object size
*/
-struct file *shmem_kernel_file_setup(const char *name, loff_t size, unsigned long flags)
+struct file *shmem_kernel_file_setup(const char *name, loff_t size,
+ vma_flags_t flags)
{
return __shmem_file_setup(shm_mnt, name, size, flags, S_PRIVATE);
}
@@ -5882,9 +5888,9 @@ EXPORT_SYMBOL_GPL(shmem_kernel_file_setup);
* shmem_file_setup - get an unlinked file living in tmpfs
* @name: name for dentry (to be seen in /proc/<pid>/maps)
* @size: size to be set for the file
- * @flags: VM_NORESERVE suppresses pre-accounting of the entire object size
+ * @flags: VMA_NORESERVE_BIT suppresses pre-accounting of the entire object size
*/
-struct file *shmem_file_setup(const char *name, loff_t size, unsigned long flags)
+struct file *shmem_file_setup(const char *name, loff_t size, vma_flags_t flags)
{
return __shmem_file_setup(shm_mnt, name, size, flags, 0);
}
@@ -5895,16 +5901,17 @@ EXPORT_SYMBOL_GPL(shmem_file_setup);
* @mnt: the tmpfs mount where the file will be created
* @name: name for dentry (to be seen in /proc/<pid>/maps)
* @size: size to be set for the file
- * @flags: VM_NORESERVE suppresses pre-accounting of the entire object size
+ * @flags: VMA_NORESERVE_BIT suppresses pre-accounting of the entire object size
*/
struct file *shmem_file_setup_with_mnt(struct vfsmount *mnt, const char *name,
- loff_t size, unsigned long flags)
+ loff_t size, vma_flags_t flags)
{
return __shmem_file_setup(mnt, name, size, flags, 0);
}
EXPORT_SYMBOL_GPL(shmem_file_setup_with_mnt);
-static struct file *__shmem_zero_setup(unsigned long start, unsigned long end, vm_flags_t vm_flags)
+static struct file *__shmem_zero_setup(unsigned long start, unsigned long end,
+ vma_flags_t flags)
{
loff_t size = end - start;
@@ -5914,7 +5921,7 @@ static struct file *__shmem_zero_setup(unsigned long start, unsigned long end, v
* accessible to the user through its mapping, use S_PRIVATE flag to
* bypass file security, in the same way as shmem_kernel_file_setup().
*/
- return shmem_kernel_file_setup("dev/zero", size, vm_flags);
+ return shmem_kernel_file_setup("dev/zero", size, flags);
}
/**
@@ -5924,7 +5931,7 @@ static struct file *__shmem_zero_setup(unsigned long start, unsigned long end, v
*/
int shmem_zero_setup(struct vm_area_struct *vma)
{
- struct file *file = __shmem_zero_setup(vma->vm_start, vma->vm_end, vma->vm_flags);
+ struct file *file = __shmem_zero_setup(vma->vm_start, vma->vm_end, vma->flags);
if (IS_ERR(file))
return PTR_ERR(file);
@@ -5945,7 +5952,7 @@ int shmem_zero_setup(struct vm_area_struct *vma)
*/
int shmem_zero_setup_desc(struct vm_area_desc *desc)
{
- struct file *file = __shmem_zero_setup(desc->start, desc->end, desc->vm_flags);
+ struct file *file = __shmem_zero_setup(desc->start, desc->end, desc->vma_flags);
if (IS_ERR(file))
return PTR_ERR(file);
diff --git a/mm/util.c b/mm/util.c
index 97cae40c0209..b05ab6f97e11 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -1154,7 +1154,7 @@ int __compat_vma_mmap(const struct file_operations *f_op,
.pgoff = vma->vm_pgoff,
.vm_file = vma->vm_file,
- .vm_flags = vma->vm_flags,
+ .vma_flags = vma->flags,
.page_prot = vma->vm_page_prot,
.action.type = MMAP_NOTHING, /* Default */
diff --git a/mm/vma.c b/mm/vma.c
index 3dbe414eff89..be64f781a3aa 100644
--- a/mm/vma.c
+++ b/mm/vma.c
@@ -15,7 +15,10 @@ struct mmap_state {
unsigned long end;
pgoff_t pgoff;
unsigned long pglen;
- vm_flags_t vm_flags;
+ union {
+ vm_flags_t vm_flags;
+ vma_flags_t vma_flags;
+ };
struct file *file;
pgprot_t page_prot;
@@ -472,19 +475,16 @@ void remove_vma(struct vm_area_struct *vma)
*
* Called with the mm semaphore held.
*/
-void unmap_region(struct ma_state *mas, struct vm_area_struct *vma,
- struct vm_area_struct *prev, struct vm_area_struct *next)
+void unmap_region(struct unmap_desc *unmap)
{
- struct mm_struct *mm = vma->vm_mm;
+ struct mm_struct *mm = unmap->first->vm_mm;
struct mmu_gather tlb;
tlb_gather_mmu(&tlb, mm);
update_hiwater_rss(mm);
- unmap_vmas(&tlb, mas, vma, vma->vm_start, vma->vm_end, vma->vm_end);
- mas_set(mas, vma->vm_end);
- free_pgtables(&tlb, mas, vma, prev ? prev->vm_end : FIRST_USER_ADDRESS,
- next ? next->vm_start : USER_PGTABLES_CEILING,
- /* mm_wr_locked = */ true);
+ unmap_vmas(&tlb, unmap);
+ mas_set(unmap->mas, unmap->tree_reset);
+ free_pgtables(&tlb, unmap);
tlb_finish_mmu(&tlb);
}
@@ -1256,26 +1256,32 @@ int vma_shrink(struct vma_iterator *vmi, struct vm_area_struct *vma,
static inline void vms_clear_ptes(struct vma_munmap_struct *vms,
struct ma_state *mas_detach, bool mm_wr_locked)
{
- struct mmu_gather tlb;
+ struct unmap_desc unmap = {
+ .mas = mas_detach,
+ .first = vms->vma,
+ /* start and end may be different if there is no prev or next vma. */
+ .pg_start = vms->unmap_start,
+ .pg_end = vms->unmap_end,
+ .vma_start = vms->start,
+ .vma_end = vms->end,
+ /*
+ * The tree limits and reset differ from the normal case since it's a
+ * side-tree
+ */
+ .tree_reset = 1,
+ .tree_end = vms->vma_count,
+ /*
+ * We can free page tables without write-locking mmap_lock because VMAs
+ * were isolated before we downgraded mmap_lock.
+ */
+ .mm_wr_locked = mm_wr_locked,
+ };
if (!vms->clear_ptes) /* Nothing to do */
return;
- /*
- * We can free page tables without write-locking mmap_lock because VMAs
- * were isolated before we downgraded mmap_lock.
- */
mas_set(mas_detach, 1);
- tlb_gather_mmu(&tlb, vms->vma->vm_mm);
- update_hiwater_rss(vms->vma->vm_mm);
- unmap_vmas(&tlb, mas_detach, vms->vma, vms->start, vms->end,
- vms->vma_count);
-
- mas_set(mas_detach, 1);
- /* start and end may be different if there is no prev or next vma. */
- free_pgtables(&tlb, mas_detach, vms->vma, vms->unmap_start,
- vms->unmap_end, mm_wr_locked);
- tlb_finish_mmu(&tlb);
+ unmap_region(&unmap);
vms->clear_ptes = false;
}
@@ -2366,7 +2372,7 @@ static void set_desc_from_map(struct vm_area_desc *desc,
desc->pgoff = map->pgoff;
desc->vm_file = map->file;
- desc->vm_flags = map->vm_flags;
+ desc->vma_flags = map->vma_flags;
desc->page_prot = map->page_prot;
}
@@ -2461,13 +2467,14 @@ static int __mmap_new_file_vma(struct mmap_state *map,
error = mmap_file(vma->vm_file, vma);
if (error) {
+ UNMAP_STATE(unmap, vmi, vma, vma->vm_start, vma->vm_end,
+ map->prev, map->next);
fput(vma->vm_file);
vma->vm_file = NULL;
vma_iter_set(vmi, vma->vm_end);
/* Undo any partial mapping done by a device driver. */
- unmap_region(&vmi->mas, vma, map->prev, map->next);
-
+ unmap_region(&unmap);
return error;
}
@@ -2646,7 +2653,7 @@ static int call_mmap_prepare(struct mmap_state *map,
map->file_doesnt_need_get = true;
map->file = desc->vm_file;
}
- map->vm_flags = desc->vm_flags;
+ map->vma_flags = desc->vma_flags;
map->page_prot = desc->page_prot;
/* User-defined fields. */
map->vm_ops = desc->vm_ops;
@@ -2819,7 +2826,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
return -EINVAL;
/* Map writable and ensure this isn't a sealed memfd. */
- if (file && is_shared_maywrite(vm_flags)) {
+ if (file && is_shared_maywrite_vm_flags(vm_flags)) {
int error = mapping_map_writable(file->f_mapping);
if (error)
@@ -3049,7 +3056,7 @@ static int acct_stack_growth(struct vm_area_struct *vma,
return -ENOMEM;
/* mlock limit tests */
- if (!mlock_future_ok(mm, vma->vm_flags, grow << PAGE_SHIFT))
+ if (!mlock_future_ok(mm, vma->vm_flags & VM_LOCKED, grow << PAGE_SHIFT))
return -ENOMEM;
/* Check to ensure the stack will not grow into a hugetlb-only region */
diff --git a/mm/vma.h b/mm/vma.h
index d51efd9da113..eba388c61ef4 100644
--- a/mm/vma.h
+++ b/mm/vma.h
@@ -155,6 +155,72 @@ struct vma_merge_struct {
};
+/*
+ * Parameters for a single unmap_region() call: the vmas to unmap and the
+ * page table range that may be freed once they are gone.
+ */
+struct unmap_desc {
+ struct ma_state *mas; /* The maple state pointing to the first vma */
+ struct vm_area_struct *first; /* The first vma */
+ unsigned long pg_start; /* The first pagetable address to free (floor) */
+ unsigned long pg_end; /* The last pagetable address to free (ceiling) */
+ unsigned long vma_start; /* The min vma address */
+ unsigned long vma_end; /* The max vma address */
+ unsigned long tree_end; /* Maximum for the vma tree search */
+ unsigned long tree_reset; /* Where to reset the vma tree walk */
+ bool mm_wr_locked; /* If the mmap write lock is held */
+};
+
+/*
+ * unmap_all_init() - Initialize unmap_desc to remove all vmas, point the
+ * pg_start and pg_end to a safe location.
+ */
+static inline void unmap_all_init(struct unmap_desc *unmap,
+ struct vma_iterator *vmi, struct vm_area_struct *vma)
+{
+ unmap->mas = &vmi->mas;
+ unmap->first = vma;
+ unmap->pg_start = FIRST_USER_ADDRESS;
+ unmap->pg_end = USER_PGTABLES_CEILING;
+ unmap->vma_start = 0;
+ unmap->vma_end = ULONG_MAX;
+ unmap->tree_end = ULONG_MAX;
+ unmap->tree_reset = vma->vm_end;
+ unmap->mm_wr_locked = false;
+}
+
+/*
+ * unmap_pgtable_init() - Initialize unmap_desc to remove all page tables within
+ * the user range.
+ *
+ * ARM can have mappings outside of vmas.
+ * See: e2cdef8c847b4 ("[PATCH] freepgt: free_pgtables from FIRST_USER_ADDRESS")
+ *
+ * ARM LPAE uses page table mappings beyond the USER_PGTABLES_CEILING
+ * See: CONFIG_ARM_LPAE in arch/arm/include/asm/pgtable.h
+ */
+static inline void unmap_pgtable_init(struct unmap_desc *unmap,
+ struct vma_iterator *vmi)
+{
+ vma_iter_set(vmi, unmap->tree_reset);
+ unmap->vma_start = FIRST_USER_ADDRESS;
+ unmap->vma_end = USER_PGTABLES_CEILING;
+ unmap->tree_end = USER_PGTABLES_CEILING;
+}
+
+/*
+ * UNMAP_STATE() - Declare and initialise a struct unmap_desc called @name for
+ * unmapping @_vma; mm_wr_locked is always set to true.
+ *
+ * Page tables may be freed from the end of @_prev (FIRST_USER_ADDRESS if
+ * @_prev is NULL) up to the start of @_next (USER_PGTABLES_CEILING if @_next
+ * is NULL). The tree search is bounded by that same upper limit and the tree
+ * walk is reset to the end of @_vma.
+ *
+ * Note: @_vma, @_prev and @_next are evaluated more than once, so pass
+ * expressions without side effects.
+ */
+#define UNMAP_STATE(name, _vmi, _vma, _vma_start, _vma_end, _prev, _next)	\
+	struct unmap_desc name = {						\
+		.mas = &(_vmi)->mas,						\
+		.first = (_vma),						\
+		.pg_start = (_prev) ?						\
+			((struct vm_area_struct *)(_prev))->vm_end :		\
+			FIRST_USER_ADDRESS,					\
+		.pg_end = (_next) ?						\
+			((struct vm_area_struct *)(_next))->vm_start :		\
+			USER_PGTABLES_CEILING,					\
+		.vma_start = (_vma_start),					\
+		.vma_end = (_vma_end),						\
+		.tree_end = (_next) ?						\
+			((struct vm_area_struct *)(_next))->vm_start :		\
+			USER_PGTABLES_CEILING,					\
+		.tree_reset = (_vma)->vm_end,					\
+		.mm_wr_locked = true,						\
+	}
+
static inline bool vmg_nomem(struct vma_merge_struct *vmg)
{
return vmg->state == VMA_MERGE_ERROR_NOMEM;
@@ -243,8 +309,7 @@ static inline void set_vma_from_desc(struct vm_area_struct *vma,
vma->vm_pgoff = desc->pgoff;
if (desc->vm_file != vma->vm_file)
vma_set_file(vma, desc->vm_file);
- if (desc->vm_flags != vma->vm_flags)
- vm_flags_set(vma, desc->vm_flags);
+ vma->flags = desc->vma_flags;
vma->vm_page_prot = desc->page_prot;
/* User-defined fields. */
@@ -262,9 +327,7 @@ int do_vmi_munmap(struct vma_iterator *vmi, struct mm_struct *mm,
bool unlock);
void remove_vma(struct vm_area_struct *vma);
-
-void unmap_region(struct ma_state *mas, struct vm_area_struct *vma,
- struct vm_area_struct *prev, struct vm_area_struct *next);
+void unmap_region(struct unmap_desc *unmap);
/**
* vma_modify_flags() - Perform any necessary split/merge in preparation for
diff --git a/mm/vma_internal.h b/mm/vma_internal.h
index 2f05735ff190..2da6d224c1a8 100644
--- a/mm/vma_internal.h
+++ b/mm/vma_internal.h
@@ -46,6 +46,7 @@
#include <linux/swap.h>
#include <linux/uprobes.h>
#include <linux/userfaultfd_k.h>
+#include <linux/pgtable.h>
#include <asm/current.h>
#include <asm/tlb.h>
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 3fc4a4461927..44e4fcd6463c 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -343,19 +343,21 @@ static void flush_reclaim_state(struct scan_control *sc)
static bool can_demote(int nid, struct scan_control *sc,
struct mem_cgroup *memcg)
{
- int demotion_nid;
+ struct pglist_data *pgdat = NODE_DATA(nid);
+ nodemask_t allowed_mask;
- if (!numa_demotion_enabled)
+ if (!pgdat || !numa_demotion_enabled)
return false;
if (sc && sc->no_demotion)
return false;
- demotion_nid = next_demotion_node(nid);
- if (demotion_nid == NUMA_NO_NODE)
+ node_get_allowed_targets(pgdat, &allowed_mask);
+ if (nodes_empty(allowed_mask))
return false;
- /* If demotion node isn't in the cgroup's mems_allowed, fall back */
- return mem_cgroup_node_allowed(memcg, demotion_nid);
+ /* Filter out nodes that are not in cgroup's mems_allowed. */
+ mem_cgroup_node_filter_allowed(memcg, &allowed_mask);
+ return !nodes_empty(allowed_mask);
}
static inline bool can_reclaim_anon_pages(struct mem_cgroup *memcg,
@@ -1017,9 +1019,10 @@ static struct folio *alloc_demote_folio(struct folio *src,
* Folios which are not demoted are left on @demote_folios.
*/
static unsigned int demote_folio_list(struct list_head *demote_folios,
- struct pglist_data *pgdat)
+ struct pglist_data *pgdat,
+ struct mem_cgroup *memcg)
{
- int target_nid = next_demotion_node(pgdat->node_id);
+ int target_nid;
unsigned int nr_succeeded;
nodemask_t allowed_mask;
@@ -1031,7 +1034,6 @@ static unsigned int demote_folio_list(struct list_head *demote_folios,
*/
.gfp_mask = (GFP_HIGHUSER_MOVABLE & ~__GFP_RECLAIM) |
__GFP_NOMEMALLOC | GFP_NOWAIT,
- .nid = target_nid,
.nmask = &allowed_mask,
.reason = MR_DEMOTION,
};
@@ -1039,10 +1041,17 @@ static unsigned int demote_folio_list(struct list_head *demote_folios,
if (list_empty(demote_folios))
return 0;
+ node_get_allowed_targets(pgdat, &allowed_mask);
+ mem_cgroup_node_filter_allowed(memcg, &allowed_mask);
+ if (nodes_empty(allowed_mask))
+ return 0;
+
+ target_nid = next_demotion_node(pgdat->node_id, &allowed_mask);
if (target_nid == NUMA_NO_NODE)
+ /* No lower-tier nodes or nodes were hot-unplugged. */
return 0;
- node_get_allowed_targets(pgdat, &allowed_mask);
+ mtc.nid = target_nid;
/* Demotion ignores all cpuset and mempolicy settings */
migrate_pages(demote_folios, alloc_demote_folio, NULL,
@@ -1564,7 +1573,7 @@ keep:
/* 'folio_list' is always empty here */
/* Migrate folios selected for demotion */
- nr_demoted = demote_folio_list(&demote_folios, pgdat);
+ nr_demoted = demote_folio_list(&demote_folios, pgdat, memcg);
nr_reclaimed += nr_demoted;
stat->nr_demoted += nr_demoted;
/* Folios that could not be demoted are still in @demote_folios */
diff --git a/security/keys/big_key.c b/security/keys/big_key.c
index d46862ab90d6..268f702df380 100644
--- a/security/keys/big_key.c
+++ b/security/keys/big_key.c
@@ -103,7 +103,7 @@ int big_key_preparse(struct key_preparsed_payload *prep)
0, enckey);
/* save aligned data to file */
- file = shmem_kernel_file_setup("", enclen, 0);
+ file = shmem_kernel_file_setup("", enclen, EMPTY_VMA_FLAGS);
if (IS_ERR(file)) {
ret = PTR_ERR(file);
goto err_enckey;
diff --git a/tools/include/linux/bitmap.h b/tools/include/linux/bitmap.h
index 0d992245c600..250883090a5d 100644
--- a/tools/include/linux/bitmap.h
+++ b/tools/include/linux/bitmap.h
@@ -24,6 +24,10 @@ void __bitmap_set(unsigned long *map, unsigned int start, int len);
void __bitmap_clear(unsigned long *map, unsigned int start, int len);
bool __bitmap_intersects(const unsigned long *bitmap1,
const unsigned long *bitmap2, unsigned int bits);
+bool __bitmap_subset(const unsigned long *bitmap1,
+ const unsigned long *bitmap2, unsigned int nbits);
+bool __bitmap_andnot(unsigned long *dst, const unsigned long *bitmap1,
+ const unsigned long *bitmap2, unsigned int nbits);
#define BITMAP_FIRST_WORD_MASK(start) (~0UL << ((start) & (BITS_PER_LONG - 1)))
#define BITMAP_LAST_WORD_MASK(nbits) (~0UL >> (-(nbits) & (BITS_PER_LONG - 1)))
@@ -81,6 +85,15 @@ static inline void bitmap_or(unsigned long *dst, const unsigned long *src1,
__bitmap_or(dst, src1, src2, nbits);
}
+static __always_inline
+bool bitmap_andnot(unsigned long *dst, const unsigned long *src1,
+ const unsigned long *src2, unsigned int nbits)
+{
+ if (small_const_nbits(nbits))
+ return (*dst = *src1 & ~(*src2) & BITMAP_LAST_WORD_MASK(nbits)) != 0;
+ return __bitmap_andnot(dst, src1, src2, nbits);
+}
+
static inline unsigned long *bitmap_alloc(unsigned int nbits, gfp_t flags __maybe_unused)
{
return malloc(bitmap_size(nbits));
@@ -157,6 +170,15 @@ static inline bool bitmap_intersects(const unsigned long *src1,
return __bitmap_intersects(src1, src2, nbits);
}
+static __always_inline
+bool bitmap_subset(const unsigned long *src1, const unsigned long *src2, unsigned int nbits)
+{
+ if (small_const_nbits(nbits))
+ return ! ((*src1 & ~(*src2)) & BITMAP_LAST_WORD_MASK(nbits));
+ else
+ return __bitmap_subset(src1, src2, nbits);
+}
+
static inline void bitmap_set(unsigned long *map, unsigned int start, unsigned int nbits)
{
if (__builtin_constant_p(nbits) && nbits == 1)
diff --git a/tools/lib/bitmap.c b/tools/lib/bitmap.c
index 51255c69754d..aa83d22c45e3 100644
--- a/tools/lib/bitmap.c
+++ b/tools/lib/bitmap.c
@@ -140,3 +140,32 @@ void __bitmap_clear(unsigned long *map, unsigned int start, int len)
*p &= ~mask_to_clear;
}
}
+
+bool __bitmap_andnot(unsigned long *dst, const unsigned long *bitmap1,
+ const unsigned long *bitmap2, unsigned int bits)
+{
+ unsigned int k;
+ unsigned int lim = bits/BITS_PER_LONG;
+ unsigned long result = 0;
+
+ for (k = 0; k < lim; k++)
+ result |= (dst[k] = bitmap1[k] & ~bitmap2[k]);
+ if (bits % BITS_PER_LONG)
+ result |= (dst[k] = bitmap1[k] & ~bitmap2[k] &
+ BITMAP_LAST_WORD_MASK(bits));
+ return result != 0;
+}
+
+bool __bitmap_subset(const unsigned long *bitmap1,
+ const unsigned long *bitmap2, unsigned int bits)
+{
+ unsigned int k, lim = bits/BITS_PER_LONG;
+ for (k = 0; k < lim; ++k)
+ if (bitmap1[k] & ~bitmap2[k])
+ return false;
+
+ if (bits % BITS_PER_LONG)
+ if ((bitmap1[k] & ~bitmap2[k]) & BITMAP_LAST_WORD_MASK(bits))
+ return false;
+ return true;
+}
diff --git a/tools/testing/selftests/mm/.gitignore b/tools/testing/selftests/mm/.gitignore
index c2a8586e51a1..83ad9454dd9d 100644
--- a/tools/testing/selftests/mm/.gitignore
+++ b/tools/testing/selftests/mm/.gitignore
@@ -12,6 +12,7 @@ map_hugetlb
map_populate
thuge-gen
compaction_test
+memory-failure
migration
mlock2-tests
mrelease_test
diff --git a/tools/testing/selftests/mm/Makefile b/tools/testing/selftests/mm/Makefile
index dca8f590c1e6..7a5de4e9bf52 100644
--- a/tools/testing/selftests/mm/Makefile
+++ b/tools/testing/selftests/mm/Makefile
@@ -75,6 +75,7 @@ TEST_GEN_FILES += map_populate
ifneq (,$(filter $(ARCH),arm64 riscv riscv64 x86 x86_64 loongarch32 loongarch64))
TEST_GEN_FILES += memfd_secret
endif
+TEST_GEN_FILES += memory-failure
TEST_GEN_FILES += migration
TEST_GEN_FILES += mkdirty
TEST_GEN_FILES += mlock-random-test
@@ -154,6 +155,7 @@ TEST_PROGS += ksft_ksm_numa.sh
TEST_PROGS += ksft_madv_guard.sh
TEST_PROGS += ksft_madv_populate.sh
TEST_PROGS += ksft_memfd_secret.sh
+TEST_PROGS += ksft_memory_failure.sh
TEST_PROGS += ksft_migration.sh
TEST_PROGS += ksft_mkdirty.sh
TEST_PROGS += ksft_mlock.sh
diff --git a/tools/testing/selftests/mm/config b/tools/testing/selftests/mm/config
index deba93379c80..1dbe2b4558ab 100644
--- a/tools/testing/selftests/mm/config
+++ b/tools/testing/selftests/mm/config
@@ -11,3 +11,5 @@ CONFIG_ANON_VMA_NAME=y
CONFIG_FTRACE=y
CONFIG_PROFILING=y
CONFIG_UPROBES=y
+CONFIG_MEMORY_FAILURE=y
+CONFIG_HWPOISON_INJECT=m
diff --git a/tools/testing/selftests/mm/ksft_memory_failure.sh b/tools/testing/selftests/mm/ksft_memory_failure.sh
new file mode 100755
index 000000000000..ae1614d4d49b
--- /dev/null
+++ b/tools/testing/selftests/mm/ksft_memory_failure.sh
@@ -0,0 +1,4 @@
+#!/bin/sh -e
+# SPDX-License-Identifier: GPL-2.0
+
+./run_vmtests.sh -t memory-failure
diff --git a/tools/testing/selftests/mm/memory-failure.c b/tools/testing/selftests/mm/memory-failure.c
new file mode 100644
index 000000000000..3d9e0b9ffb41
--- /dev/null
+++ b/tools/testing/selftests/mm/memory-failure.c
@@ -0,0 +1,359 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Memory-failure functional tests.
+ *
+ * Author(s): Miaohe Lin <linmiaohe@huawei.com>
+ */
+
+#include "../kselftest_harness.h"
+
+#include <sys/mman.h>
+#include <linux/mman.h>
+#include <linux/string.h>
+#include <unistd.h>
+#include <signal.h>
+#include <setjmp.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <sys/vfs.h>
+#include <linux/magic.h>
+#include <errno.h>
+
+#include "vm_util.h"
+
+enum inject_type {
+ MADV_HARD,
+ MADV_SOFT,
+};
+
+enum result_type {
+ MADV_HARD_ANON,
+ MADV_HARD_CLEAN_PAGECACHE,
+ MADV_HARD_DIRTY_PAGECACHE,
+ MADV_SOFT_ANON,
+ MADV_SOFT_CLEAN_PAGECACHE,
+ MADV_SOFT_DIRTY_PAGECACHE,
+};
+
+static jmp_buf signal_jmp_buf;
+static siginfo_t siginfo;
+const char *pagemap_proc = "/proc/self/pagemap";
+const char *kpageflags_proc = "/proc/kpageflags";
+
+FIXTURE(memory_failure)
+{
+ unsigned long page_size;
+ unsigned long corrupted_size;
+ unsigned long pfn;
+ int pagemap_fd;
+ int kpageflags_fd;
+ bool triggered;
+};
+
+FIXTURE_VARIANT(memory_failure)
+{
+ enum inject_type type;
+ int (*inject)(FIXTURE_DATA(memory_failure) * self, void *vaddr);
+};
+
+static int madv_hard_inject(FIXTURE_DATA(memory_failure) * self, void *vaddr)
+{
+ return madvise(vaddr, self->page_size, MADV_HWPOISON);
+}
+
+FIXTURE_VARIANT_ADD(memory_failure, madv_hard)
+{
+ .type = MADV_HARD,
+ .inject = madv_hard_inject,
+};
+
+static int madv_soft_inject(FIXTURE_DATA(memory_failure) * self, void *vaddr)
+{
+ return madvise(vaddr, self->page_size, MADV_SOFT_OFFLINE);
+}
+
+FIXTURE_VARIANT_ADD(memory_failure, madv_soft)
+{
+ .type = MADV_SOFT,
+ .inject = madv_soft_inject,
+};
+
+static void sigbus_action(int signo, siginfo_t *si, void *args)
+{
+ memcpy(&siginfo, si, sizeof(siginfo_t));
+ siglongjmp(signal_jmp_buf, 1);
+}
+
+static int setup_sighandler(void)
+{
+ struct sigaction sa = {
+ .sa_sigaction = sigbus_action,
+ .sa_flags = SA_SIGINFO,
+ };
+
+ return sigaction(SIGBUS, &sa, NULL);
+}
+
+FIXTURE_SETUP(memory_failure)
+{
+ memset(self, 0, sizeof(*self));
+
+ self->page_size = (unsigned long)sysconf(_SC_PAGESIZE);
+
+ memset(&siginfo, 0, sizeof(siginfo));
+ if (setup_sighandler())
+ SKIP(return, "setup sighandler failed.\n");
+
+ self->pagemap_fd = open(pagemap_proc, O_RDONLY);
+ if (self->pagemap_fd == -1)
+ SKIP(return, "open %s failed.\n", pagemap_proc);
+
+ self->kpageflags_fd = open(kpageflags_proc, O_RDONLY);
+ if (self->kpageflags_fd == -1)
+ SKIP(return, "open %s failed.\n", kpageflags_proc);
+}
+
+/* Put SIGBUS back to its default disposition. */
+static void teardown_sighandler(void)
+{
+	/*
+	 * No SA_SIGINFO here: that flag tells sigaction() to use sa_sigaction
+	 * rather than sa_handler, so it must not be combined with
+	 * sa_handler = SIG_DFL.
+	 */
+	struct sigaction sa = {
+		.sa_handler = SIG_DFL,
+	};
+
+	sigaction(SIGBUS, &sa, NULL);
+}
+
+FIXTURE_TEARDOWN(memory_failure)
+{
+ close(self->kpageflags_fd);
+ close(self->pagemap_fd);
+ teardown_sighandler();
+}
+
+static void prepare(struct __test_metadata *_metadata, FIXTURE_DATA(memory_failure) * self,
+ void *vaddr)
+{
+ self->pfn = pagemap_get_pfn(self->pagemap_fd, vaddr);
+ ASSERT_NE(self->pfn, -1UL);
+
+ ASSERT_EQ(get_hardware_corrupted_size(&self->corrupted_size), 0);
+}
+
+/*
+ * Return true if all @size bytes at @vaddr hold the 0xce fill pattern,
+ * false as soon as any byte differs.
+ */
+static bool check_memory(void *vaddr, unsigned long size)
+{
+	const unsigned char *p = vaddr;	/* byte pointer: no arithmetic on void * */
+	char buf[64];
+
+	memset(buf, 0xce, sizeof(buf));
+	while (size) {
+		/* The final chunk may be shorter than buf; compare it too. */
+		unsigned long chunk = size < sizeof(buf) ? size : sizeof(buf);
+
+		if (memcmp(p, buf, chunk))
+			return false;
+		size -= chunk;
+		p += chunk;
+	}
+
+	return true;
+}
+
+static void check(struct __test_metadata *_metadata, FIXTURE_DATA(memory_failure) * self,
+ void *vaddr, enum result_type type, int setjmp)
+{
+ unsigned long size;
+ uint64_t pfn_flags;
+
+ switch (type) {
+ case MADV_SOFT_ANON:
+ case MADV_HARD_CLEAN_PAGECACHE:
+ case MADV_SOFT_CLEAN_PAGECACHE:
+ case MADV_SOFT_DIRTY_PAGECACHE:
+ /* It is not expected to receive a SIGBUS signal. */
+ ASSERT_EQ(setjmp, 0);
+
+ /* The page content should remain unchanged. */
+ ASSERT_TRUE(check_memory(vaddr, self->page_size));
+
+ /* The backing pfn of addr should have changed. */
+ ASSERT_NE(pagemap_get_pfn(self->pagemap_fd, vaddr), self->pfn);
+ break;
+ case MADV_HARD_ANON:
+ case MADV_HARD_DIRTY_PAGECACHE:
+ /* The SIGBUS signal should have been received. */
+ ASSERT_EQ(setjmp, 1);
+
+ /* Check if siginfo contains correct SIGBUS context. */
+ ASSERT_EQ(siginfo.si_signo, SIGBUS);
+ ASSERT_EQ(siginfo.si_code, BUS_MCEERR_AR);
+ ASSERT_EQ(1UL << siginfo.si_addr_lsb, self->page_size);
+ ASSERT_EQ(siginfo.si_addr, vaddr);
+
+ /* XXX Check backing pte is hwpoison entry when supported. */
+ ASSERT_TRUE(pagemap_is_swapped(self->pagemap_fd, vaddr));
+ break;
+ default:
+ SKIP(return, "unexpected inject type %d.\n", type);
+ }
+
+ /* Check if the value of HardwareCorrupted has increased. */
+ ASSERT_EQ(get_hardware_corrupted_size(&size), 0);
+ ASSERT_EQ(size, self->corrupted_size + self->page_size / 1024);
+
+ /* Check if HWPoison flag is set. */
+ ASSERT_EQ(pageflags_get(self->pfn, self->kpageflags_fd, &pfn_flags), 0);
+ ASSERT_EQ(pfn_flags & KPF_HWPOISON, KPF_HWPOISON);
+}
+
+static void cleanup(struct __test_metadata *_metadata, FIXTURE_DATA(memory_failure) * self,
+ void *vaddr)
+{
+ unsigned long size;
+ uint64_t pfn_flags;
+
+ ASSERT_EQ(unpoison_memory(self->pfn), 0);
+
+ /* Check if HWPoison flag is cleared. */
+ ASSERT_EQ(pageflags_get(self->pfn, self->kpageflags_fd, &pfn_flags), 0);
+ ASSERT_NE(pfn_flags & KPF_HWPOISON, KPF_HWPOISON);
+
+ /* Check if the value of HardwareCorrupted has decreased. */
+ ASSERT_EQ(get_hardware_corrupted_size(&size), 0);
+ ASSERT_EQ(size, self->corrupted_size);
+}
+
+TEST_F(memory_failure, anon)
+{
+ char *addr;
+ int ret;
+
+ addr = mmap(0, self->page_size, PROT_READ | PROT_WRITE,
+ MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
+ if (addr == MAP_FAILED)
+ SKIP(return, "mmap failed, not enough memory.\n");
+ memset(addr, 0xce, self->page_size);
+
+ prepare(_metadata, self, addr);
+
+ ret = sigsetjmp(signal_jmp_buf, 1);
+ if (!self->triggered) {
+ self->triggered = true;
+ ASSERT_EQ(variant->inject(self, addr), 0);
+ FORCE_READ(*addr);
+ }
+
+ if (variant->type == MADV_HARD)
+ check(_metadata, self, addr, MADV_HARD_ANON, ret);
+ else
+ check(_metadata, self, addr, MADV_SOFT_ANON, ret);
+
+ cleanup(_metadata, self, addr);
+
+ ASSERT_EQ(munmap(addr, self->page_size), 0);
+}
+
+/*
+ * Create @fname, unlink it straight away and size it to @size bytes.
+ *
+ * Returns the open file descriptor, or -1 if the file could not be created
+ * or resized.
+ */
+static int prepare_file(const char *fname, unsigned long size)
+{
+	int fd = open(fname, O_RDWR | O_CREAT, 0664);
+
+	if (fd < 0)
+		return -1;
+
+	/* Only the descriptor is needed from here on. */
+	unlink(fname);
+
+	/*
+	 * Callers map and write @size bytes of this file, so do not hand back
+	 * a descriptor whose file was never extended.
+	 */
+	if (ftruncate(fd, size)) {
+		close(fd);
+		return -1;
+	}
+	return fd;
+}
+
+/* Borrowed from mm/gup_longterm.c. */
+static int get_fs_type(int fd)
+{
+ struct statfs fs;
+ int ret;
+
+ do {
+ ret = fstatfs(fd, &fs);
+ } while (ret && errno == EINTR);
+
+ return ret ? 0 : (int)fs.f_type;
+}
+
+TEST_F(memory_failure, clean_pagecache)
+{
+ int fd;
+ char *addr;
+ int ret;
+ int fs_type;
+
+ fd = prepare_file("./clean-page-cache-test-file", self->page_size);
+ if (fd < 0)
+ SKIP(return, "failed to open test file.\n");
+ fs_type = get_fs_type(fd);
+ if (!fs_type || fs_type == TMPFS_MAGIC)
+ SKIP(return, "unsupported filesystem :%x\n", fs_type);
+
+ addr = mmap(0, self->page_size, PROT_READ | PROT_WRITE,
+ MAP_SHARED, fd, 0);
+ if (addr == MAP_FAILED)
+ SKIP(return, "mmap failed, not enough memory.\n");
+ memset(addr, 0xce, self->page_size);
+ fsync(fd);
+
+ prepare(_metadata, self, addr);
+
+ ret = sigsetjmp(signal_jmp_buf, 1);
+ if (!self->triggered) {
+ self->triggered = true;
+ ASSERT_EQ(variant->inject(self, addr), 0);
+ FORCE_READ(*addr);
+ }
+
+ if (variant->type == MADV_HARD)
+ check(_metadata, self, addr, MADV_HARD_CLEAN_PAGECACHE, ret);
+ else
+ check(_metadata, self, addr, MADV_SOFT_CLEAN_PAGECACHE, ret);
+
+ cleanup(_metadata, self, addr);
+
+ ASSERT_EQ(munmap(addr, self->page_size), 0);
+
+ ASSERT_EQ(close(fd), 0);
+}
+
+TEST_F(memory_failure, dirty_pagecache)
+{
+ int fd;
+ char *addr;
+ int ret;
+ int fs_type;
+
+ fd = prepare_file("./dirty-page-cache-test-file", self->page_size);
+ if (fd < 0)
+ SKIP(return, "failed to open test file.\n");
+ fs_type = get_fs_type(fd);
+ if (!fs_type || fs_type == TMPFS_MAGIC)
+ SKIP(return, "unsupported filesystem :%x\n", fs_type);
+
+ addr = mmap(0, self->page_size, PROT_READ | PROT_WRITE,
+ MAP_SHARED, fd, 0);
+ if (addr == MAP_FAILED)
+ SKIP(return, "mmap failed, not enough memory.\n");
+ memset(addr, 0xce, self->page_size);
+
+ prepare(_metadata, self, addr);
+
+ ret = sigsetjmp(signal_jmp_buf, 1);
+ if (!self->triggered) {
+ self->triggered = true;
+ ASSERT_EQ(variant->inject(self, addr), 0);
+ FORCE_READ(*addr);
+ }
+
+ if (variant->type == MADV_HARD)
+ check(_metadata, self, addr, MADV_HARD_DIRTY_PAGECACHE, ret);
+ else
+ check(_metadata, self, addr, MADV_SOFT_DIRTY_PAGECACHE, ret);
+
+ cleanup(_metadata, self, addr);
+
+ ASSERT_EQ(munmap(addr, self->page_size), 0);
+
+ ASSERT_EQ(close(fd), 0);
+}
+
+TEST_HARNESS_MAIN
diff --git a/tools/testing/selftests/mm/run_vmtests.sh b/tools/testing/selftests/mm/run_vmtests.sh
index 29be9038bfb0..afdcfd0d7cef 100755
--- a/tools/testing/selftests/mm/run_vmtests.sh
+++ b/tools/testing/selftests/mm/run_vmtests.sh
@@ -91,6 +91,8 @@ separated by spaces:
test VMA merge cases behave as expected
- rmap
test rmap behaves as expected
+- memory-failure
+ test memory-failure behaves as expected
example: ./run_vmtests.sh -t "hmm mmap ksm"
EOF
@@ -527,6 +529,25 @@ CATEGORY="page_frag" run_test ./test_page_frag.sh nonaligned
CATEGORY="rmap" run_test ./rmap
+# Try to load hwpoison_inject if not present.
+HWPOISON_DIR=/sys/kernel/debug/hwpoison/
+if [ ! -d "$HWPOISON_DIR" ]; then
+ if ! modprobe -q -R hwpoison_inject; then
+ echo "Module hwpoison_inject not found, skipping..."
+ else
+ modprobe hwpoison_inject > /dev/null 2>&1
+ LOADED_MOD=1
+ fi
+fi
+
+if [ -d "$HWPOISON_DIR" ]; then
+ CATEGORY="memory-failure" run_test ./memory-failure
+fi
+
+if [ -n "${LOADED_MOD}" ]; then
+ modprobe -r hwpoison_inject > /dev/null 2>&1
+fi
+
if [ "${HAVE_HUGEPAGES}" = 1 ]; then
echo "$orig_nr_hugepgs" > /proc/sys/vm/nr_hugepages
fi
diff --git a/tools/testing/selftests/mm/vm_util.c b/tools/testing/selftests/mm/vm_util.c
index d954bf91afd5..a6d4ff7dfdc0 100644
--- a/tools/testing/selftests/mm/vm_util.c
+++ b/tools/testing/selftests/mm/vm_util.c
@@ -723,3 +723,44 @@ int ksm_stop(void)
close(ksm_fd);
return ret == 1 ? 0 : -errno;
}
+
+/*
+ * Read the "HardwareCorrupted:" value (the number preceding "kB") from
+ * /proc/meminfo into *@val.
+ *
+ * Returns 0 on success, or -1 if the file cannot be opened or no matching
+ * line is found; *@val is left untouched on failure.
+ */
+int get_hardware_corrupted_size(unsigned long *val)
+{
+ unsigned long size;
+ char *line = NULL;
+ size_t linelen = 0;
+ FILE *f = fopen("/proc/meminfo", "r");
+ int ret = -1;
+
+ if (!f)
+ return ret;
+
+ /* getline() (re)allocates @line as needed; it is freed once below. */
+ while (getline(&line, &linelen, f) > 0) {
+ if (sscanf(line, "HardwareCorrupted: %12lu kB", &size) == 1) {
+ *val = size;
+ ret = 0;
+ break;
+ }
+ }
+
+ free(line);
+ fclose(f);
+ return ret;
+}
+
+/*
+ * Ask the kernel to unpoison page frame @pfn by writing it, as a hex string,
+ * to /sys/kernel/debug/hwpoison/unpoison-pfn.
+ *
+ * Returns 0 on success or -errno if the file cannot be opened or written.
+ */
+int unpoison_memory(unsigned long pfn)
+{
+	int unpoison_fd, len, err = 0;
+	char buf[32];
+
+	unpoison_fd = open("/sys/kernel/debug/hwpoison/unpoison-pfn", O_WRONLY);
+	if (unpoison_fd < 0)
+		return -errno;
+
+	len = snprintf(buf, sizeof(buf), "0x%lx\n", pfn);
+	/* Record the write error before close() gets a chance to overwrite errno. */
+	if (write(unpoison_fd, buf, len) <= 0)
+		err = -errno;
+	close(unpoison_fd);
+
+	return err;
+}
diff --git a/tools/testing/selftests/mm/vm_util.h b/tools/testing/selftests/mm/vm_util.h
index 522f7f9050f5..e9c4e24769c1 100644
--- a/tools/testing/selftests/mm/vm_util.h
+++ b/tools/testing/selftests/mm/vm_util.h
@@ -20,6 +20,7 @@
#define KPF_COMPOUND_HEAD BIT_ULL(15)
#define KPF_COMPOUND_TAIL BIT_ULL(16)
+#define KPF_HWPOISON BIT_ULL(19)
#define KPF_THP BIT_ULL(22)
/*
* Ignore the checkpatch warning, we must read from x but don't want to do
@@ -154,6 +155,8 @@ long ksm_get_full_scans(void);
int ksm_use_zero_pages(void);
int ksm_start(void);
int ksm_stop(void);
+int get_hardware_corrupted_size(unsigned long *val);
+int unpoison_memory(unsigned long pfn);
/*
* On ppc64 this will only work with radix 2M hugepage size
diff --git a/tools/testing/vma/Makefile b/tools/testing/vma/Makefile
index 66f3831a668f..e72b45dedda5 100644
--- a/tools/testing/vma/Makefile
+++ b/tools/testing/vma/Makefile
@@ -6,10 +6,13 @@ default: vma
include ../shared/shared.mk
-OFILES = $(SHARED_OFILES) vma.o maple-shim.o
+OFILES = $(SHARED_OFILES) main.o shared.o maple-shim.o
TARGETS = vma
-vma.o: vma.c vma_internal.h ../../../mm/vma.c ../../../mm/vma_init.c ../../../mm/vma_exec.c ../../../mm/vma.h
+# These can be varied to test different sizes.
+CFLAGS += -DNUM_VMA_FLAG_BITS=128 -DNUM_MM_FLAG_BITS=128
+
+main.o: main.c shared.c shared.h vma_internal.h tests/merge.c tests/mmap.c tests/vma.c ../../../mm/vma.c ../../../mm/vma_init.c ../../../mm/vma_exec.c ../../../mm/vma.h include/custom.h include/dup.h include/stubs.h
vma: $(OFILES)
$(CC) $(CFLAGS) -o $@ $(OFILES) $(LDLIBS)
diff --git a/tools/testing/vma/include/custom.h b/tools/testing/vma/include/custom.h
new file mode 100644
index 000000000000..802a76317245
--- /dev/null
+++ b/tools/testing/vma/include/custom.h
@@ -0,0 +1,119 @@
+/* SPDX-License-Identifier: GPL-2.0+ */
+
+#pragma once
+
+/*
+ * Contains declarations that exist in the kernel which have been CUSTOMISED for
+ * testing purposes to facilitate userland VMA testing.
+ */
+
+#ifdef CONFIG_MMU
+extern unsigned long mmap_min_addr;
+extern unsigned long dac_mmap_min_addr;
+#else
+#define mmap_min_addr 0UL
+#define dac_mmap_min_addr 0UL
+#endif
+
+#define VM_WARN_ON(_expr) (WARN_ON(_expr))
+#define VM_WARN_ON_ONCE(_expr) (WARN_ON_ONCE(_expr))
+#define VM_WARN_ON_VMG(_expr, _vmg) (WARN_ON(_expr))
+#define VM_BUG_ON(_expr) (BUG_ON(_expr))
+#define VM_BUG_ON_VMA(_expr, _vma) (BUG_ON(_expr))
+
+/* We hardcode this for now. */
+#define sysctl_max_map_count 0x1000000UL
+
+#define TASK_SIZE ((1ul << 47)-PAGE_SIZE)
+
+/*
+ * The shared stubs do not implement this; it amounts to an
+ * fprintf(stderr, ...) either way :)
+ */
+#define pr_warn_once pr_err
+
+#define pgtable_supports_soft_dirty() 1
+
+struct anon_vma {
+ struct anon_vma *root;
+ struct rb_root_cached rb_root;
+
+ /* Test fields. */
+ bool was_cloned;
+ bool was_unlinked;
+};
+
+static inline void unlink_anon_vmas(struct vm_area_struct *vma)
+{
+ /* For testing purposes, indicate that the anon_vma was unlinked. */
+ vma->anon_vma->was_unlinked = true;
+}
+
+static inline void vma_start_write(struct vm_area_struct *vma)
+{
+ /* Used to indicate to tests that a write operation has begun. */
+ vma->vm_lock_seq++;
+}
+
+static inline __must_check
+int vma_start_write_killable(struct vm_area_struct *vma)
+{
+ /* Used to indicate to tests that a write operation has begun. */
+ vma->vm_lock_seq++;
+ return 0;
+}
+
+static inline int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src,
+ enum vma_operation operation)
+{
+ /* For testing purposes. We indicate that an anon_vma has been cloned. */
+ if (src->anon_vma != NULL) {
+ dst->anon_vma = src->anon_vma;
+ dst->anon_vma->was_cloned = true;
+ }
+
+ return 0;
+}
+
+static inline int __anon_vma_prepare(struct vm_area_struct *vma)
+{
+ struct anon_vma *anon_vma = calloc(1, sizeof(struct anon_vma));
+
+ if (!anon_vma)
+ return -ENOMEM;
+
+ anon_vma->root = anon_vma;
+ vma->anon_vma = anon_vma;
+
+ return 0;
+}
+
+static inline int anon_vma_prepare(struct vm_area_struct *vma)
+{
+ if (likely(vma->anon_vma))
+ return 0;
+
+ return __anon_vma_prepare(vma);
+}
+
+static inline void vma_lock_init(struct vm_area_struct *vma, bool reset_refcnt)
+{
+ if (reset_refcnt)
+ refcount_set(&vma->vm_refcnt, 0);
+}
+
+/*
+ * Build a vma_flags_t with the @count bit numbers listed in @bits set.
+ * Bit numbers at or above NUM_VMA_FLAG_BITS are skipped rather than set.
+ */
+static inline vma_flags_t __mk_vma_flags(size_t count, const vma_flag_t *bits)
+{
+	vma_flags_t flags;
+	size_t i;	/* same type as @count: no signed/unsigned comparison */
+
+	/*
+	 * For testing purposes: allow invalid bit specification so we can
+	 * easily test.
+	 */
+	vma_flags_clear_all(&flags);
+	for (i = 0; i < count; i++)
+		if (bits[i] < NUM_VMA_FLAG_BITS)
+			vma_flag_set(&flags, bits[i]);
+	return flags;
+}
diff --git a/tools/testing/vma/include/dup.h b/tools/testing/vma/include/dup.h
new file mode 100644
index 000000000000..3078ff1487d3
--- /dev/null
+++ b/tools/testing/vma/include/dup.h
@@ -0,0 +1,1320 @@
+/* SPDX-License-Identifier: GPL-2.0+ */
+
+#pragma once
+
+/* Forward declarations to avoid header cycle. */
+struct vm_area_struct;
+static inline void vma_start_write(struct vm_area_struct *vma);
+
+/* Defined elsewhere; declared here for the code in this header. */
+extern const struct vm_operations_struct vma_dummy_vm_ops;
+extern unsigned long stack_guard_gap;
+extern unsigned long rlimit(unsigned int limit);
+struct task_struct *get_current(void);
+
+#define MMF_HAS_MDWE 28
+#define current get_current()
+
+/*
+ * Define the task command name length as enum, then it can be visible to
+ * BPF programs.
+ */
+enum {
+ TASK_COMM_LEN = 16, /* Size of task_struct::comm[] */
+};
+
+/* PARTIALLY implemented types. */
+/* Partial mm_struct: the VMA tree plus the accounting and flag fields below. */
+struct mm_struct {
+ struct maple_tree mm_mt; /* Tree that VMA_ITERATOR() walks */
+ int map_count; /* number of VMAs */
+ unsigned long total_vm; /* Total pages mapped */
+ unsigned long locked_vm; /* Pages that have PG_mlocked set */
+ unsigned long data_vm; /* VM_WRITE & ~VM_SHARED & ~VM_STACK */
+ unsigned long exec_vm; /* VM_EXEC & ~VM_WRITE & ~VM_STACK */
+ unsigned long stack_vm; /* VM_STACK */
+
+ unsigned long def_flags;
+
+ mm_flags_t flags; /* Must use mm_flags_* helpers to access */
+};
+/* Partial address_space: only the three fields declared here. */
+struct address_space {
+ struct rb_root_cached i_mmap;
+ unsigned long flags;
+ atomic_t i_mmap_writable; /* Incremented by mapping_allow_writable() */
+};
+/* Partial file_operations: only the two mmap hooks. */
+struct file_operations {
+ int (*mmap)(struct file *, struct vm_area_struct *);
+ int (*mmap_prepare)(struct vm_area_desc *);
+};
+/* Partial file: its address_space mapping and its operations table. */
+struct file {
+ struct address_space *f_mapping;
+ const struct file_operations *f_op;
+};
+/* Partial anon_vma_chain: an anon_vma pointer plus a list linkage. */
+struct anon_vma_chain {
+ struct anon_vma *anon_vma;
+ struct list_head same_vma;
+};
+/* Partial task_struct; get_current() returns a pointer to one. */
+struct task_struct {
+ char comm[TASK_COMM_LEN];
+ pid_t pid;
+ struct mm_struct *mm;
+
+ /* Used for emulating ABI behavior of previous Linux versions: */
+ unsigned int personality; /* Tested against READ_IMPLIES_EXEC by TASK_EXEC */
+};
+
+/* Wrapper around a single refcount_t. */
+struct kref {
+ refcount_t refcount;
+};
+
+/* A kref header followed by the name bytes in a flexible array member. */
+struct anon_vma_name {
+ struct kref kref;
+ /* The name needs to be at the end because it is dynamically sized. */
+ char name[];
+};
+
+/*
+ * Contains declarations that are DUPLICATED from kernel source in order to
+ * facilitate userland VMA testing.
+ *
+ * These must be kept in sync with kernel source.
+ */
+
+#define VMA_LOCK_OFFSET 0x40000000
+
+typedef struct { unsigned long v; } freeptr_t;
+
+#define VM_NONE 0x00000000
+
+typedef int __bitwise vma_flag_t;
+
+#define ACCESS_PRIVATE(p, member) ((p)->member)
+
+/*
+ * Bit numbers for every VMA flag. DECLARE_VMA_BIT() names a bit number as
+ * VMA_<name>_BIT and DECLARE_VMA_BIT_ALIAS() gives an already-declared bit
+ * number an additional name.
+ */
+#define DECLARE_VMA_BIT(name, bitnum) \
+ VMA_ ## name ## _BIT = ((__force vma_flag_t)bitnum)
+#define DECLARE_VMA_BIT_ALIAS(name, aliased) \
+ VMA_ ## name ## _BIT = VMA_ ## aliased ## _BIT
+enum {
+ DECLARE_VMA_BIT(READ, 0),
+ DECLARE_VMA_BIT(WRITE, 1),
+ DECLARE_VMA_BIT(EXEC, 2),
+ DECLARE_VMA_BIT(SHARED, 3),
+ /* mprotect() hardcodes VM_MAYREAD >> 4 == VM_READ, and so for r/w/x bits. */
+ DECLARE_VMA_BIT(MAYREAD, 4), /* limits for mprotect() etc. */
+ DECLARE_VMA_BIT(MAYWRITE, 5),
+ DECLARE_VMA_BIT(MAYEXEC, 6),
+ DECLARE_VMA_BIT(MAYSHARE, 7),
+ DECLARE_VMA_BIT(GROWSDOWN, 8), /* general info on the segment */
+#ifdef CONFIG_MMU
+ DECLARE_VMA_BIT(UFFD_MISSING, 9),/* missing pages tracking */
+#else
+ /* nommu: R/O MAP_PRIVATE mapping that might overlay a file mapping */
+ DECLARE_VMA_BIT(MAYOVERLAY, 9),
+#endif /* CONFIG_MMU */
+ /* Page-ranges managed without "struct page", just pure PFN */
+ DECLARE_VMA_BIT(PFNMAP, 10),
+ DECLARE_VMA_BIT(MAYBE_GUARD, 11),
+ DECLARE_VMA_BIT(UFFD_WP, 12), /* wrprotect pages tracking */
+ DECLARE_VMA_BIT(LOCKED, 13),
+ DECLARE_VMA_BIT(IO, 14), /* Memory mapped I/O or similar */
+ DECLARE_VMA_BIT(SEQ_READ, 15), /* App will access data sequentially */
+ DECLARE_VMA_BIT(RAND_READ, 16), /* App will not benefit from clustered reads */
+ DECLARE_VMA_BIT(DONTCOPY, 17), /* Do not copy this vma on fork */
+ DECLARE_VMA_BIT(DONTEXPAND, 18),/* Cannot expand with mremap() */
+ DECLARE_VMA_BIT(LOCKONFAULT, 19),/* Lock pages covered when faulted in */
+ DECLARE_VMA_BIT(ACCOUNT, 20), /* Is a VM accounted object */
+ DECLARE_VMA_BIT(NORESERVE, 21), /* should the VM suppress accounting */
+ DECLARE_VMA_BIT(HUGETLB, 22), /* Huge TLB Page VM */
+ DECLARE_VMA_BIT(SYNC, 23), /* Synchronous page faults */
+ DECLARE_VMA_BIT(ARCH_1, 24), /* Architecture-specific flag */
+ DECLARE_VMA_BIT(WIPEONFORK, 25),/* Wipe VMA contents in child. */
+ DECLARE_VMA_BIT(DONTDUMP, 26), /* Do not include in the core dump */
+ DECLARE_VMA_BIT(SOFTDIRTY, 27), /* NOT soft dirty clean area */
+ DECLARE_VMA_BIT(MIXEDMAP, 28), /* Can contain struct page and pure PFN pages */
+ DECLARE_VMA_BIT(HUGEPAGE, 29), /* MADV_HUGEPAGE marked this vma */
+ DECLARE_VMA_BIT(NOHUGEPAGE, 30),/* MADV_NOHUGEPAGE marked this vma */
+ DECLARE_VMA_BIT(MERGEABLE, 31), /* KSM may merge identical pages */
+ /* These bits are reused, we define specific uses below. */
+ DECLARE_VMA_BIT(HIGH_ARCH_0, 32),
+ DECLARE_VMA_BIT(HIGH_ARCH_1, 33),
+ DECLARE_VMA_BIT(HIGH_ARCH_2, 34),
+ DECLARE_VMA_BIT(HIGH_ARCH_3, 35),
+ DECLARE_VMA_BIT(HIGH_ARCH_4, 36),
+ DECLARE_VMA_BIT(HIGH_ARCH_5, 37),
+ DECLARE_VMA_BIT(HIGH_ARCH_6, 38),
+ /*
+ * This flag is used to connect VFIO to arch specific KVM code. It
+ * indicates that the memory under this VMA is safe for use with any
+ * non-cachable memory type inside KVM. Some VFIO devices, on some
+ * platforms, are thought to be unsafe and can cause machine crashes
+ * if KVM does not lock down the memory type.
+ */
+ DECLARE_VMA_BIT(ALLOW_ANY_UNCACHED, 39),
+#ifdef CONFIG_PPC32
+ DECLARE_VMA_BIT_ALIAS(DROPPABLE, ARCH_1),
+#else
+ DECLARE_VMA_BIT(DROPPABLE, 40),
+#endif
+ DECLARE_VMA_BIT(UFFD_MINOR, 41),
+ DECLARE_VMA_BIT(SEALED, 42),
+ /* Flags that reuse flags above. */
+ DECLARE_VMA_BIT_ALIAS(PKEY_BIT0, HIGH_ARCH_0),
+ DECLARE_VMA_BIT_ALIAS(PKEY_BIT1, HIGH_ARCH_1),
+ DECLARE_VMA_BIT_ALIAS(PKEY_BIT2, HIGH_ARCH_2),
+ DECLARE_VMA_BIT_ALIAS(PKEY_BIT3, HIGH_ARCH_3),
+ DECLARE_VMA_BIT_ALIAS(PKEY_BIT4, HIGH_ARCH_4),
+#if defined(CONFIG_X86_USER_SHADOW_STACK)
+ /*
+ * VM_SHADOW_STACK should not be set with VM_SHARED because of lack of
+ * support in core mm.
+ *
+ * These VMAs will get a single end guard page. This helps userspace
+ * protect itself from attacks. A single page is enough for current
+ * shadow stack archs (x86). See the comments near alloc_shstk() in
+ * arch/x86/kernel/shstk.c for more details on the guard size.
+ */
+ DECLARE_VMA_BIT_ALIAS(SHADOW_STACK, HIGH_ARCH_5),
+#elif defined(CONFIG_ARM64_GCS)
+ /*
+ * arm64's Guarded Control Stack implements similar functionality and
+ * has similar constraints to shadow stacks.
+ */
+ DECLARE_VMA_BIT_ALIAS(SHADOW_STACK, HIGH_ARCH_6),
+#endif
+ DECLARE_VMA_BIT_ALIAS(SAO, ARCH_1), /* Strong Access Ordering (powerpc) */
+ DECLARE_VMA_BIT_ALIAS(GROWSUP, ARCH_1), /* parisc */
+ DECLARE_VMA_BIT_ALIAS(SPARC_ADI, ARCH_1), /* sparc64 */
+ DECLARE_VMA_BIT_ALIAS(ARM64_BTI, ARCH_1), /* arm64 */
+ DECLARE_VMA_BIT_ALIAS(ARCH_CLEAR, ARCH_1), /* sparc64, arm64 */
+ DECLARE_VMA_BIT_ALIAS(MAPPED_COPY, ARCH_1), /* !CONFIG_MMU */
+ DECLARE_VMA_BIT_ALIAS(MTE, HIGH_ARCH_4), /* arm64 */
+ DECLARE_VMA_BIT_ALIAS(MTE_ALLOWED, HIGH_ARCH_5),/* arm64 */
+#ifdef CONFIG_STACK_GROWSUP
+ DECLARE_VMA_BIT_ALIAS(STACK, GROWSUP),
+ DECLARE_VMA_BIT_ALIAS(STACK_EARLY, GROWSDOWN),
+#else
+ DECLARE_VMA_BIT_ALIAS(STACK, GROWSDOWN),
+#endif
+};
+
+#define INIT_VM_FLAG(name) BIT((__force int) VMA_ ## name ## _BIT)
+#define VM_READ INIT_VM_FLAG(READ)
+#define VM_WRITE INIT_VM_FLAG(WRITE)
+#define VM_EXEC INIT_VM_FLAG(EXEC)
+#define VM_SHARED INIT_VM_FLAG(SHARED)
+#define VM_MAYREAD INIT_VM_FLAG(MAYREAD)
+#define VM_MAYWRITE INIT_VM_FLAG(MAYWRITE)
+#define VM_MAYEXEC INIT_VM_FLAG(MAYEXEC)
+#define VM_MAYSHARE INIT_VM_FLAG(MAYSHARE)
+#define VM_GROWSDOWN INIT_VM_FLAG(GROWSDOWN)
+#ifdef CONFIG_MMU
+#define VM_UFFD_MISSING INIT_VM_FLAG(UFFD_MISSING)
+#else
+#define VM_UFFD_MISSING VM_NONE
+#define VM_MAYOVERLAY INIT_VM_FLAG(MAYOVERLAY)
+#endif
+#define VM_PFNMAP INIT_VM_FLAG(PFNMAP)
+#define VM_MAYBE_GUARD INIT_VM_FLAG(MAYBE_GUARD)
+#define VM_UFFD_WP INIT_VM_FLAG(UFFD_WP)
+#define VM_LOCKED INIT_VM_FLAG(LOCKED)
+#define VM_IO INIT_VM_FLAG(IO)
+#define VM_SEQ_READ INIT_VM_FLAG(SEQ_READ)
+#define VM_RAND_READ INIT_VM_FLAG(RAND_READ)
+#define VM_DONTCOPY INIT_VM_FLAG(DONTCOPY)
+#define VM_DONTEXPAND INIT_VM_FLAG(DONTEXPAND)
+#define VM_LOCKONFAULT INIT_VM_FLAG(LOCKONFAULT)
+#define VM_ACCOUNT INIT_VM_FLAG(ACCOUNT)
+#define VM_NORESERVE INIT_VM_FLAG(NORESERVE)
+#define VM_HUGETLB INIT_VM_FLAG(HUGETLB)
+#define VM_SYNC INIT_VM_FLAG(SYNC)
+#define VM_ARCH_1 INIT_VM_FLAG(ARCH_1)
+#define VM_WIPEONFORK INIT_VM_FLAG(WIPEONFORK)
+#define VM_DONTDUMP INIT_VM_FLAG(DONTDUMP)
+#ifdef CONFIG_MEM_SOFT_DIRTY
+#define VM_SOFTDIRTY INIT_VM_FLAG(SOFTDIRTY)
+#else
+#define VM_SOFTDIRTY VM_NONE
+#endif
+#define VM_MIXEDMAP INIT_VM_FLAG(MIXEDMAP)
+#define VM_HUGEPAGE INIT_VM_FLAG(HUGEPAGE)
+#define VM_NOHUGEPAGE INIT_VM_FLAG(NOHUGEPAGE)
+#define VM_MERGEABLE INIT_VM_FLAG(MERGEABLE)
+#define VM_STACK INIT_VM_FLAG(STACK)
+/* Must use the same CONFIG_STACK_GROWSUP guard as VMA_STACK_EARLY_BIT. */
+#ifdef CONFIG_STACK_GROWSUP
+#define VM_STACK_EARLY INIT_VM_FLAG(STACK_EARLY)
+#else
+#define VM_STACK_EARLY VM_NONE
+#endif
+#ifdef CONFIG_ARCH_HAS_PKEYS
+#define VM_PKEY_SHIFT ((__force int)VMA_HIGH_ARCH_0_BIT)
+/* Despite the naming, these are FLAGS not bits. */
+#define VM_PKEY_BIT0 INIT_VM_FLAG(PKEY_BIT0)
+#define VM_PKEY_BIT1 INIT_VM_FLAG(PKEY_BIT1)
+#define VM_PKEY_BIT2 INIT_VM_FLAG(PKEY_BIT2)
+#if CONFIG_ARCH_PKEY_BITS > 3
+#define VM_PKEY_BIT3 INIT_VM_FLAG(PKEY_BIT3)
+#else
+#define VM_PKEY_BIT3 VM_NONE
+#endif /* CONFIG_ARCH_PKEY_BITS > 3 */
+#if CONFIG_ARCH_PKEY_BITS > 4
+#define VM_PKEY_BIT4 INIT_VM_FLAG(PKEY_BIT4)
+#else
+#define VM_PKEY_BIT4 VM_NONE
+#endif /* CONFIG_ARCH_PKEY_BITS > 4 */
+#endif /* CONFIG_ARCH_HAS_PKEYS */
+#if defined(CONFIG_X86_USER_SHADOW_STACK) || defined(CONFIG_ARM64_GCS)
+#define VM_SHADOW_STACK INIT_VM_FLAG(SHADOW_STACK)
+#else
+#define VM_SHADOW_STACK VM_NONE
+#endif
+#if defined(CONFIG_PPC64)
+#define VM_SAO INIT_VM_FLAG(SAO)
+#elif defined(CONFIG_PARISC)
+#define VM_GROWSUP INIT_VM_FLAG(GROWSUP)
+#elif defined(CONFIG_SPARC64)
+#define VM_SPARC_ADI INIT_VM_FLAG(SPARC_ADI)
+#define VM_ARCH_CLEAR INIT_VM_FLAG(ARCH_CLEAR)
+#elif defined(CONFIG_ARM64)
+#define VM_ARM64_BTI INIT_VM_FLAG(ARM64_BTI)
+#define VM_ARCH_CLEAR INIT_VM_FLAG(ARCH_CLEAR)
+#elif !defined(CONFIG_MMU)
+#define VM_MAPPED_COPY INIT_VM_FLAG(MAPPED_COPY)
+#endif
+#ifndef VM_GROWSUP
+#define VM_GROWSUP VM_NONE
+#endif
+#ifdef CONFIG_ARM64_MTE
+#define VM_MTE INIT_VM_FLAG(MTE)
+#define VM_MTE_ALLOWED INIT_VM_FLAG(MTE_ALLOWED)
+#else
+#define VM_MTE VM_NONE
+#define VM_MTE_ALLOWED VM_NONE
+#endif
+#ifdef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR
+#define VM_UFFD_MINOR INIT_VM_FLAG(UFFD_MINOR)
+#else
+#define VM_UFFD_MINOR VM_NONE
+#endif
+#ifdef CONFIG_64BIT
+#define VM_ALLOW_ANY_UNCACHED INIT_VM_FLAG(ALLOW_ANY_UNCACHED)
+#define VM_SEALED INIT_VM_FLAG(SEALED)
+#else
+#define VM_ALLOW_ANY_UNCACHED VM_NONE
+#define VM_SEALED VM_NONE
+#endif
+#if defined(CONFIG_64BIT) || defined(CONFIG_PPC32)
+#define VM_DROPPABLE INIT_VM_FLAG(DROPPABLE)
+#else
+#define VM_DROPPABLE VM_NONE
+#endif
+
+/* Bits set in the VMA until the stack is in its final location */
+#define VM_STACK_INCOMPLETE_SETUP (VM_RAND_READ | VM_SEQ_READ | VM_STACK_EARLY)
+
+#define TASK_EXEC ((current->personality & READ_IMPLIES_EXEC) ? VM_EXEC : 0)
+
+/* Common data flag combinations */
+#define VM_DATA_FLAGS_TSK_EXEC (VM_READ | VM_WRITE | TASK_EXEC | \
+ VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC)
+#define VM_DATA_FLAGS_NON_EXEC (VM_READ | VM_WRITE | VM_MAYREAD | \
+ VM_MAYWRITE | VM_MAYEXEC)
+#define VM_DATA_FLAGS_EXEC (VM_READ | VM_WRITE | VM_EXEC | \
+ VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC)
+
+#ifndef VM_DATA_DEFAULT_FLAGS /* arch can override this */
+#define VM_DATA_DEFAULT_FLAGS VM_DATA_FLAGS_EXEC
+#endif
+
+#ifndef VM_STACK_DEFAULT_FLAGS /* arch can override this */
+#define VM_STACK_DEFAULT_FLAGS VM_DATA_DEFAULT_FLAGS
+#endif
+
+#define VM_STARTGAP_FLAGS (VM_GROWSDOWN | VM_SHADOW_STACK)
+
+#define VM_STACK_FLAGS (VM_STACK | VM_STACK_DEFAULT_FLAGS | VM_ACCOUNT)
+
+/* VMA basic access permission flags */
+#define VM_ACCESS_FLAGS (VM_READ | VM_WRITE | VM_EXEC)
+
+/*
+ * Special vmas that are non-mergable, non-mlock()able.
+ */
+#define VM_SPECIAL (VM_IO | VM_DONTEXPAND | VM_PFNMAP | VM_MIXEDMAP)
+
+#define DEFAULT_MAP_WINDOW ((1UL << 47) - PAGE_SIZE)
+#define TASK_SIZE_LOW DEFAULT_MAP_WINDOW
+#define TASK_SIZE_MAX DEFAULT_MAP_WINDOW
+#define STACK_TOP TASK_SIZE_LOW
+#define STACK_TOP_MAX TASK_SIZE_MAX
+
+/* This mask represents all the VMA flag bits used by mlock */
+#define VM_LOCKED_MASK (VM_LOCKED | VM_LOCKONFAULT)
+
+#define RLIMIT_STACK 3 /* max stack size */
+#define RLIMIT_MEMLOCK 8 /* max locked-in-memory address space */
+
+#define CAP_IPC_LOCK 14
+
+#define VM_STICKY (VM_SOFTDIRTY | VM_MAYBE_GUARD)
+
+#define VM_IGNORE_MERGE VM_STICKY
+
+#define VM_COPY_ON_FORK (VM_PFNMAP | VM_MIXEDMAP | VM_UFFD_WP | VM_MAYBE_GUARD)
+
+#define pgprot_val(x) ((x).pgprot)
+#define __pgprot(x) ((pgprot_t) { (x) } )
+
+#define for_each_vma(__vmi, __vma) \
+ while (((__vma) = vma_next(&(__vmi))) != NULL)
+
+/* The MM code likes to work with exclusive end addresses */
+#define for_each_vma_range(__vmi, __vma, __end) \
+ while (((__vma) = vma_find(&(__vmi), (__end))) != NULL)
+
+#define offset_in_page(p) ((unsigned long)(p) & ~PAGE_MASK)
+
+#define PHYS_PFN(x) ((unsigned long)((x) >> PAGE_SHIFT))
+
+#define test_and_set_bit(nr, addr) __test_and_set_bit(nr, addr)
+#define test_and_clear_bit(nr, addr) __test_and_clear_bit(nr, addr)
+
+#define AS_MM_ALL_LOCKS 2
+
+#define swap(a, b) \
+ do { typeof(a) __tmp = (a); (a) = (b); (b) = __tmp; } while (0)
+
+/*
+ * Flags for bug emulation.
+ *
+ * These occupy the top three bytes.
+ */
+enum {
+ /* Tested against task_struct::personality by TASK_EXEC. */
+ READ_IMPLIES_EXEC = 0x0400000,
+};
+
+/* VMA iterator: a wrapper around a single maple tree state. */
+struct vma_iterator {
+ struct ma_state mas;
+};
+
+#define VMA_ITERATOR(name, __mm, __addr) \
+ struct vma_iterator name = { \
+ .mas = { \
+ .tree = &(__mm)->mm_mt, \
+ .index = __addr, \
+ .node = NULL, \
+ .status = ma_start, \
+ }, \
+ }
+
+#define DEFINE_MUTEX(mutexname) \
+ struct mutex mutexname = {}
+
+#define DECLARE_BITMAP(name, bits) \
+ unsigned long name[BITS_TO_LONGS(bits)]
+
+#define EMPTY_VMA_FLAGS ((vma_flags_t){ })
+
+/*
+ * What action should be taken after an .mmap_prepare call is complete?
+ * Stored in mmap_action::type.
+ */
+enum mmap_action_type {
+ MMAP_NOTHING, /* Mapping is complete, no further action. */
+ MMAP_REMAP_PFN, /* Remap PFN range. */
+ MMAP_IO_REMAP_PFN, /* I/O remap PFN range. */
+};
+
+/*
+ * Describes an action an mmap_prepare hook can instruct to be taken to complete
+ * the mapping of a VMA. Specified in vm_area_desc.
+ */
+struct mmap_action {
+ union {
+ /* Remap range. */
+ struct {
+ unsigned long start;
+ unsigned long start_pfn;
+ unsigned long size;
+ pgprot_t pgprot;
+ } remap;
+ };
+ /* Which action to take; see enum mmap_action_type. */
+ enum mmap_action_type type;
+
+ /*
+ * If specified, this hook is invoked after the selected action has been
+ * successfully completed. Note that the VMA write lock is still held.
+ *
+ * The absolute minimum ought to be done here.
+ *
+ * Returns 0 on success, or an error code.
+ */
+ int (*success_hook)(const struct vm_area_struct *vma);
+
+ /*
+ * If specified, this hook is invoked when an error occurred when
+ * attempting the selected action.
+ *
+ * The hook can return an error code in order to filter the error, but
+ * it is not valid to clear the error here.
+ */
+ int (*error_hook)(int err);
+
+ /*
+ * This should be set in rare instances where the operation required
+ * that the rmap should not be able to access the VMA until
+ * completely set up.
+ */
+ bool hide_from_rmap_until_complete :1;
+};
+
+/* Operations which modify VMAs. Passed to anon_vma_clone() as @operation. */
+enum vma_operation {
+ VMA_OP_SPLIT,
+ VMA_OP_MERGE_UNFAULTED,
+ VMA_OP_REMAP,
+ VMA_OP_FORK,
+};
+
+/*
+ * Describes a VMA that is about to be mmap()'ed. Drivers may choose to
+ * manipulate mutable fields which will cause those fields to be updated in the
+ * resultant VMA.
+ *
+ * Helper functions are not required for manipulating any field.
+ */
+struct vm_area_desc {
+ /* Immutable state. */
+ const struct mm_struct *const mm;
+ struct file *const file; /* May vary from vm_file in stacked callers. */
+ unsigned long start;
+ unsigned long end;
+
+ /* Mutable fields. Populated with initial state. */
+ pgoff_t pgoff;
+ struct file *vm_file;
+ /* Two views of the same storage: flags word and flags bitmap. */
+ union {
+ vm_flags_t vm_flags;
+ vma_flags_t vma_flags;
+ };
+ pgprot_t page_prot;
+
+ /* Write-only fields. */
+ const struct vm_operations_struct *vm_ops;
+ void *private_data;
+
+ /* Take further action? */
+ struct mmap_action action;
+};
+
+/*
+ * Describes one VMA: the [vm_start, vm_end) range it covers within vm_mm,
+ * its protection and flags, and its linkage to anon_vma and file state.
+ */
+struct vm_area_struct {
+ /* The first cache line has the info for VMA tree walking. */
+
+ union {
+ struct {
+ /* VMA covers [vm_start; vm_end) addresses within mm */
+ unsigned long vm_start;
+ unsigned long vm_end;
+ };
+ freeptr_t vm_freeptr; /* Pointer used by SLAB_TYPESAFE_BY_RCU */
+ };
+
+ struct mm_struct *vm_mm; /* The address space we belong to. */
+ pgprot_t vm_page_prot; /* Access permissions of this VMA. */
+
+ /*
+ * Flags, see mm.h.
+ * To modify use vm_flags_{init|reset|set|clear|mod} functions.
+ *
+ * vm_flags is a const word view and flags a bitmap view of the same
+ * storage.
+ */
+ union {
+ const vm_flags_t vm_flags;
+ vma_flags_t flags;
+ };
+
+#ifdef CONFIG_PER_VMA_LOCK
+ /*
+ * Can only be written (using WRITE_ONCE()) while holding both:
+ * - mmap_lock (in write mode)
+ * - vm_refcnt bit at VMA_LOCK_OFFSET is set
+ * Can be read reliably while holding one of:
+ * - mmap_lock (in read or write mode)
+ * - vm_refcnt bit at VMA_LOCK_OFFSET is set or vm_refcnt > 1
+ * Can be read unreliably (using READ_ONCE()) for pessimistic bailout
+ * while holding nothing (except RCU to keep the VMA struct allocated).
+ *
+ * This sequence counter is explicitly allowed to overflow; sequence
+ * counter reuse can only lead to occasional unnecessary use of the
+ * slowpath.
+ */
+ unsigned int vm_lock_seq;
+#endif
+
+ /*
+ * A file's MAP_PRIVATE vma can be in both i_mmap tree and anon_vma
+ * list, after a COW of one of the file pages. A MAP_SHARED vma
+ * can only be in the i_mmap tree. An anonymous MAP_PRIVATE, stack
+ * or brk vma (with NULL file) can only be in an anon_vma list.
+ */
+ struct list_head anon_vma_chain; /* Serialized by mmap_lock &
+ * page_table_lock */
+ struct anon_vma *anon_vma; /* Serialized by page_table_lock */
+
+ /* Function pointers to deal with this struct. */
+ const struct vm_operations_struct *vm_ops;
+
+ /* Information about our backing store: */
+ unsigned long vm_pgoff; /* Offset (within vm_file) in PAGE_SIZE
+ units */
+ struct file * vm_file; /* File we map to (can be NULL). */
+ void * vm_private_data; /* was vm_pte (shared mem) */
+
+#ifdef CONFIG_SWAP
+ atomic_long_t swap_readahead_info;
+#endif
+#ifndef CONFIG_MMU
+ struct vm_region *vm_region; /* NOMMU mapping region */
+#endif
+#ifdef CONFIG_NUMA
+ struct mempolicy *vm_policy; /* NUMA policy for the VMA */
+#endif
+#ifdef CONFIG_NUMA_BALANCING
+ struct vma_numab_state *numab_state; /* NUMA Balancing state */
+#endif
+#ifdef CONFIG_PER_VMA_LOCK
+ /* Unstable RCU readers are allowed to read this. */
+ refcount_t vm_refcnt;
+#endif
+ /*
+ * For areas with an address space and backing store,
+ * linkage into the address_space->i_mmap interval tree.
+ *
+ */
+ struct {
+ struct rb_node rb;
+ unsigned long rb_subtree_last;
+ } shared;
+#ifdef CONFIG_ANON_VMA_NAME
+ /*
+ * For private and shared anonymous mappings, a pointer to a null
+ * terminated string containing the name given to the vma, or NULL if
+ * unnamed. Serialized by mmap_lock. Use anon_vma_name to access.
+ */
+ struct anon_vma_name *anon_name;
+#endif
+ struct vm_userfaultfd_ctx vm_userfaultfd_ctx;
+} __randomize_layout;
+
+/* Callback table for a VMA; a VMA refers to one via vm_area_struct::vm_ops. */
+struct vm_operations_struct {
+ void (*open)(struct vm_area_struct * area);
+ /**
+ * @close: Called when the VMA is being removed from the MM.
+ * Context: User context. May sleep. Caller holds mmap_lock.
+ */
+ void (*close)(struct vm_area_struct * area);
+ /* Called any time before splitting to check if it's allowed */
+ int (*may_split)(struct vm_area_struct *area, unsigned long addr);
+ int (*mremap)(struct vm_area_struct *area);
+ /*
+ * Called by mprotect() to make driver-specific permission
+ * checks before mprotect() is finalised. The VMA must not
+ * be modified. Returns 0 if mprotect() can proceed.
+ */
+ int (*mprotect)(struct vm_area_struct *vma, unsigned long start,
+ unsigned long end, unsigned long newflags);
+ vm_fault_t (*fault)(struct vm_fault *vmf);
+ vm_fault_t (*huge_fault)(struct vm_fault *vmf, unsigned int order);
+ vm_fault_t (*map_pages)(struct vm_fault *vmf,
+ pgoff_t start_pgoff, pgoff_t end_pgoff);
+ unsigned long (*pagesize)(struct vm_area_struct * area);
+
+ /* notification that a previously read-only page is about to become
+ * writable, if an error is returned it will cause a SIGBUS */
+ vm_fault_t (*page_mkwrite)(struct vm_fault *vmf);
+
+ /* same as page_mkwrite when using VM_PFNMAP|VM_MIXEDMAP */
+ vm_fault_t (*pfn_mkwrite)(struct vm_fault *vmf);
+
+ /* called by access_process_vm when get_user_pages() fails, typically
+ * for use by special VMAs. See also generic_access_phys() for a generic
+ * implementation useful for any iomem mapping.
+ */
+ int (*access)(struct vm_area_struct *vma, unsigned long addr,
+ void *buf, int len, int write);
+
+ /* Called by the /proc/PID/maps code to ask the vma whether it
+ * has a special name. Returning non-NULL will also cause this
+ * vma to be dumped unconditionally. */
+ const char *(*name)(struct vm_area_struct *vma);
+
+#ifdef CONFIG_NUMA
+ /*
+ * set_policy() op must add a reference to any non-NULL @new mempolicy
+ * to hold the policy upon return. Caller should pass NULL @new to
+ * remove a policy and fall back to surrounding context--i.e. do not
+ * install a MPOL_DEFAULT policy, nor the task or system default
+ * mempolicy.
+ */
+ int (*set_policy)(struct vm_area_struct *vma, struct mempolicy *new);
+
+ /*
+ * get_policy() op must add reference [mpol_get()] to any policy at
+ * (vma,addr) marked as MPOL_SHARED. The shared policy infrastructure
+ * in mm/mempolicy.c will do this automatically.
+ * get_policy() must NOT add a ref if the policy at (vma,addr) is not
+ * marked as MPOL_SHARED. vma policies are protected by the mmap_lock.
+ * If no [shared/vma] mempolicy exists at the addr, get_policy() op
+ * must return NULL--i.e., do not "fallback" to task or system default
+ * policy.
+ */
+ struct mempolicy *(*get_policy)(struct vm_area_struct *vma,
+ unsigned long addr, pgoff_t *ilx);
+#endif
+#ifdef CONFIG_FIND_NORMAL_PAGE
+ /*
+ * Called by vm_normal_page() for special PTEs in @vma at @addr. This
+ * allows for returning a "normal" page from vm_normal_page() even
+ * though the PTE indicates that the "struct page" either does not exist
+ * or should not be touched: "special".
+ *
+ * Do not add new users: this really only works when a "normal" page
+ * was mapped, but then the PTE got changed to something weird (+
+ * marked special) that would not make pte_pfn() identify the originally
+ * inserted page.
+ */
+ struct page *(*find_normal_page)(struct vm_area_struct *vma,
+ unsigned long addr);
+#endif /* CONFIG_FIND_NORMAL_PAGE */
+};
+
+/*
+ * Parameters for an unmapped-area search: flags, length, address limits,
+ * alignment and start gap. NOTE(review): the consumers of this struct are
+ * not visible in this header - confirm field semantics against them.
+ */
+struct vm_unmapped_area_info {
+#define VM_UNMAPPED_AREA_TOPDOWN 1
+ unsigned long flags;
+ unsigned long length;
+ unsigned long low_limit;
+ unsigned long high_limit;
+ unsigned long align_mask;
+ unsigned long align_offset;
+ unsigned long start_gap;
+};
+
+/* State describing a move of page tables from one VMA to another. */
+struct pagetable_move_control {
+ struct vm_area_struct *old; /* Source VMA. */
+ struct vm_area_struct *new; /* Destination VMA. */
+ unsigned long old_addr; /* Address from which the move begins. */
+ unsigned long old_end; /* Exclusive address at which old range ends. */
+ unsigned long new_addr; /* Address to move page tables to. */
+ unsigned long len_in; /* Bytes to remap specified by user. */
+
+ bool need_rmap_locks; /* Do rmap locks need to be taken? */
+ bool for_stack; /* Is this an early temp stack being moved? */
+};
+
+/*
+ * Declare and initialise a struct pagetable_move_control called @name.
+ * old_end is derived as old_addr_ + len_; the fields not named here
+ * (need_rmap_locks, for_stack) are left zero-initialised.
+ */
+#define PAGETABLE_MOVE(name, old_, new_, old_addr_, new_addr_, len_) \
+ struct pagetable_move_control name = { \
+ .old = old_, \
+ .new = new_, \
+ .old_addr = old_addr_, \
+ .old_end = (old_addr_) + (len_), \
+ .new_addr = new_addr_, \
+ .len_in = len_, \
+ }
+
+/* Pause the iterator's underlying maple state via mas_pause(). */
+static inline void vma_iter_invalidate(struct vma_iterator *vmi)
+{
+ mas_pause(&vmi->mas);
+}
+
+/* Returns a pgprot_t whose value is the bitwise OR of @oldprot and @newprot. */
+static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot)
+{
+ return __pgprot(pgprot_val(oldprot) | pgprot_val(newprot));
+}
+
+/* Wraps the raw @vm_flags value in a pgprot_t without translating it. */
+static inline pgprot_t vm_get_page_prot(vm_flags_t vm_flags)
+{
+ return __pgprot(vm_flags);
+}
+
+/* Returns the result of test_bit() for bit number @flag in @mm's flags bitmap. */
+static inline bool mm_flags_test(int flag, const struct mm_struct *mm)
+{
+ return test_bit(flag, ACCESS_PRIVATE(&mm->flags, __mm_flags));
+}
+
+/*
+ * Copy value to the first system word of VMA flags, non-atomically.
+ *
+ * IMPORTANT: This does not overwrite bytes past the first system word. The
+ * caller must account for this.
+ *
+ * @flags: VMA flags to modify.
+ * @value: New contents of the first word.
+ */
+static inline void vma_flags_overwrite_word(vma_flags_t *flags, unsigned long value)
+{
+ *ACCESS_PRIVATE(flags, __vma_flags) = value;
+}
+
+/*
+ * Copy value to the first system word of VMA flags ONCE, non-atomically.
+ *
+ * IMPORTANT: This does not overwrite bytes past the first system word. The
+ * caller must account for this.
+ *
+ * @flags: VMA flags to modify.
+ * @value: New contents of the first word, stored with WRITE_ONCE().
+ */
+static inline void vma_flags_overwrite_word_once(vma_flags_t *flags, unsigned long value)
+{
+ unsigned long *bitmap = ACCESS_PRIVATE(flags, __vma_flags);
+
+ WRITE_ONCE(*bitmap, value);
+}
+
+/*
+ * Update the first system word of VMA flags setting bits, non-atomically.
+ * Bits set in @value are ORed in; words past the first are untouched.
+ */
+static inline void vma_flags_set_word(vma_flags_t *flags, unsigned long value)
+{
+ unsigned long *bitmap = ACCESS_PRIVATE(flags, __vma_flags);
+
+ *bitmap |= value;
+}
+
+/*
+ * Update the first system word of VMA flags clearing bits, non-atomically.
+ * Bits set in @value are cleared; words past the first are untouched.
+ */
+static inline void vma_flags_clear_word(vma_flags_t *flags, unsigned long value)
+{
+ unsigned long *bitmap = ACCESS_PRIVATE(flags, __vma_flags);
+
+ *bitmap &= ~value;
+}
+
+/* Zero all NUM_VMA_FLAG_BITS bits of @flags. */
+static inline void vma_flags_clear_all(vma_flags_t *flags)
+{
+ bitmap_zero(ACCESS_PRIVATE(flags, __vma_flags), NUM_VMA_FLAG_BITS);
+}
+
+/* Set the single bit number @bit in @flags using __set_bit(). */
+static inline void vma_flag_set(vma_flags_t *flags, vma_flag_t bit)
+{
+ unsigned long *bitmap = ACCESS_PRIVATE(flags, __vma_flags);
+
+ __set_bit((__force int)bit, bitmap);
+}
+
+/*
+ * Use when VMA is not part of the VMA tree and needs no locking.
+ *
+ * Clears the whole flags bitmap, then stores @flags in its first word.
+ */
+static inline void vm_flags_init(struct vm_area_struct *vma,
+ vm_flags_t flags)
+{
+ vma_flags_clear_all(&vma->flags);
+ vma_flags_overwrite_word(&vma->flags, flags);
+}
+
+/*
+ * Use when VMA is part of the VMA tree and modifications need coordination
+ * Note: vm_flags_reset and vm_flags_reset_once do not lock the vma and
+ * it should be locked explicitly beforehand.
+ *
+ * Asserts the VMA is write-locked, then replaces its flags with @flags.
+ */
+static inline void vm_flags_reset(struct vm_area_struct *vma,
+ vm_flags_t flags)
+{
+ vma_assert_write_locked(vma);
+ vm_flags_init(vma, flags);
+}
+
+/*
+ * As vm_flags_reset(), but the store to the first word of the flags is made
+ * with WRITE_ONCE(). The VMA must already be write-locked.
+ */
+static inline void vm_flags_reset_once(struct vm_area_struct *vma,
+ vm_flags_t flags)
+{
+ vma_assert_write_locked(vma);
+ /*
+ * The user should only be interested in avoiding reordering of
+ * assignment to the first word.
+ */
+ vma_flags_clear_all(&vma->flags);
+ vma_flags_overwrite_word_once(&vma->flags, flags);
+}
+
+/* Call vma_start_write() on @vma, then OR @flags into its first flags word. */
+static inline void vm_flags_set(struct vm_area_struct *vma,
+ vm_flags_t flags)
+{
+ vma_start_write(vma);
+ vma_flags_set_word(&vma->flags, flags);
+}
+
+/* Call vma_start_write() on @vma, then clear @flags from its first flags word. */
+static inline void vm_flags_clear(struct vm_area_struct *vma,
+ vm_flags_t flags)
+{
+ vma_start_write(vma);
+ vma_flags_clear_word(&vma->flags, flags);
+}
+
+/* Forward declaration; the definition is not in this header. */
+static inline vma_flags_t __mk_vma_flags(size_t count, const vma_flag_t *bits);
+
+/*
+ * Build a vma_flags_t from a list of bit numbers, e.g.
+ * mk_vma_flags(VMA_SHARED_BIT, VMA_MAYWRITE_BIT). The arguments are packed
+ * into a compound-literal array and counted with COUNT_ARGS().
+ */
+#define mk_vma_flags(...) __mk_vma_flags(COUNT_ARGS(__VA_ARGS__), \
+ (const vma_flag_t []){__VA_ARGS__})
+
+/*
+ * Returns the result of bitmap_intersects() on @to_test and @flags, i.e.
+ * whether they have at least one set bit in common.
+ */
+static __always_inline bool vma_flags_test_mask(const vma_flags_t *flags,
+ vma_flags_t to_test)
+{
+ const unsigned long *bitmap = flags->__vma_flags;
+ const unsigned long *bitmap_to_test = to_test.__vma_flags;
+
+ return bitmap_intersects(bitmap_to_test, bitmap, NUM_VMA_FLAG_BITS);
+}
+
+/* As vma_flags_test_mask(), taking a list of bit numbers instead of a mask. */
+#define vma_flags_test(flags, ...) \
+ vma_flags_test_mask(flags, mk_vma_flags(__VA_ARGS__))
+
+/*
+ * Returns the result of bitmap_subset() with @to_test as the candidate
+ * subset of @flags, i.e. whether every bit set in @to_test is set in @flags.
+ */
+static __always_inline bool vma_flags_test_all_mask(const vma_flags_t *flags,
+ vma_flags_t to_test)
+{
+ const unsigned long *bitmap = flags->__vma_flags;
+ const unsigned long *bitmap_to_test = to_test.__vma_flags;
+
+ return bitmap_subset(bitmap_to_test, bitmap, NUM_VMA_FLAG_BITS);
+}
+
+/* As vma_flags_test_all_mask(), taking a list of bit numbers instead of a mask. */
+#define vma_flags_test_all(flags, ...) \
+ vma_flags_test_all_mask(flags, mk_vma_flags(__VA_ARGS__))
+
+/* OR every bit of @to_set into @flags, in place. */
+static __always_inline void vma_flags_set_mask(vma_flags_t *flags, vma_flags_t to_set)
+{
+ unsigned long *bitmap = flags->__vma_flags;
+ const unsigned long *bitmap_to_set = to_set.__vma_flags;
+
+ bitmap_or(bitmap, bitmap, bitmap_to_set, NUM_VMA_FLAG_BITS);
+}
+
+/* As vma_flags_set_mask(), taking a list of bit numbers instead of a mask. */
+#define vma_flags_set(flags, ...) \
+ vma_flags_set_mask(flags, mk_vma_flags(__VA_ARGS__))
+
+/* Clear from @flags, in place, every bit that is set in @to_clear. */
+static __always_inline void vma_flags_clear_mask(vma_flags_t *flags, vma_flags_t to_clear)
+{
+ unsigned long *bitmap = flags->__vma_flags;
+ const unsigned long *bitmap_to_clear = to_clear.__vma_flags;
+
+ bitmap_andnot(bitmap, bitmap, bitmap_to_clear, NUM_VMA_FLAG_BITS);
+}
+
+/* As vma_flags_clear_mask(), taking a list of bit numbers instead of a mask. */
+#define vma_flags_clear(flags, ...) \
+ vma_flags_clear_mask(flags, mk_vma_flags(__VA_ARGS__))
+
+/* Applies vma_flags_test_all_mask() to @vma's flags. */
+static inline bool vma_test_all_flags_mask(const struct vm_area_struct *vma,
+ vma_flags_t flags)
+{
+ return vma_flags_test_all_mask(&vma->flags, flags);
+}
+
+/* As vma_test_all_flags_mask(), taking a list of bit numbers instead of a mask. */
+#define vma_test_all_flags(vma, ...) \
+ vma_test_all_flags_mask(vma, mk_vma_flags(__VA_ARGS__))
+
+/* True only when both VM_SHARED and VM_MAYWRITE are set in @vm_flags. */
+static inline bool is_shared_maywrite_vm_flags(vm_flags_t vm_flags)
+{
+ return (vm_flags & (VM_SHARED | VM_MAYWRITE)) ==
+ (VM_SHARED | VM_MAYWRITE);
+}
+
+/* Applies vma_flags_set_mask() to @vma's flags; does not call vma_start_write(). */
+static inline void vma_set_flags_mask(struct vm_area_struct *vma,
+ vma_flags_t flags)
+{
+ vma_flags_set_mask(&vma->flags, flags);
+}
+
+/* As vma_set_flags_mask(), taking a list of bit numbers instead of a mask. */
+#define vma_set_flags(vma, ...) \
+ vma_set_flags_mask(vma, mk_vma_flags(__VA_ARGS__))
+
+/* Applies vma_flags_test_mask() to @desc->vma_flags. */
+static inline bool vma_desc_test_flags_mask(const struct vm_area_desc *desc,
+ vma_flags_t flags)
+{
+ return vma_flags_test_mask(&desc->vma_flags, flags);
+}
+
+/* As vma_desc_test_flags_mask(), taking a list of bit numbers instead of a mask. */
+#define vma_desc_test_flags(desc, ...) \
+ vma_desc_test_flags_mask(desc, mk_vma_flags(__VA_ARGS__))
+
+/* Applies vma_flags_set_mask() to @desc->vma_flags. */
+static inline void vma_desc_set_flags_mask(struct vm_area_desc *desc,
+ vma_flags_t flags)
+{
+ vma_flags_set_mask(&desc->vma_flags, flags);
+}
+
+/* As vma_desc_set_flags_mask(), taking a list of bit numbers instead of a mask. */
+#define vma_desc_set_flags(desc, ...) \
+ vma_desc_set_flags_mask(desc, mk_vma_flags(__VA_ARGS__))
+
+/* Applies vma_flags_clear_mask() to @desc->vma_flags. */
+static inline void vma_desc_clear_flags_mask(struct vm_area_desc *desc,
+ vma_flags_t flags)
+{
+ vma_flags_clear_mask(&desc->vma_flags, flags);
+}
+
+/* As vma_desc_clear_flags_mask(), taking a list of bit numbers instead of a mask. */
+#define vma_desc_clear_flags(desc, ...) \
+ vma_desc_clear_flags_mask(desc, mk_vma_flags(__VA_ARGS__))
+
+/* Tests @flags for both VMA_SHARED_BIT and VMA_MAYWRITE_BIT via vma_flags_test_all(). */
+static inline bool is_shared_maywrite(const vma_flags_t *flags)
+{
+ return vma_flags_test_all(flags, VMA_SHARED_BIT, VMA_MAYWRITE_BIT);
+}
+
+/* Applies is_shared_maywrite() to @vma's flags. */
+static inline bool vma_is_shared_maywrite(struct vm_area_struct *vma)
+{
+ return is_shared_maywrite(&vma->flags);
+}
+
+/* Returns what mas_find() yields for @vmi with an upper limit of ULONG_MAX. */
+static inline struct vm_area_struct *vma_next(struct vma_iterator *vmi)
+{
+ /*
+ * Uses mas_find() to get the first VMA when the iterator starts.
+ * Calling mas_next() could skip the first entry.
+ */
+ return mas_find(&vmi->mas, ULONG_MAX);
+}
+
+/*
+ * WARNING: to avoid racing with vma_mark_attached()/vma_mark_detached(), these
+ * assertions should be made either under mmap_write_lock or when the object
+ * has been isolated under mmap_write_lock, ensuring no competing writers.
+ */
+/* Warns once if @vma's vm_refcnt reads as zero. */
+static inline void vma_assert_attached(struct vm_area_struct *vma)
+{
+ WARN_ON_ONCE(!refcount_read(&vma->vm_refcnt));
+}
+
+/* Warns once if @vma's vm_refcnt reads as non-zero. */
+static inline void vma_assert_detached(struct vm_area_struct *vma)
+{
+ WARN_ON_ONCE(refcount_read(&vma->vm_refcnt));
+}
+
+/* Forward declaration; the definition is not in this part of the header. */
+static inline void vma_assert_write_locked(struct vm_area_struct *);
+/*
+ * Mark @vma attached by setting vm_refcnt to 1 (with release semantics).
+ * Asserts first that the VMA is write-locked and currently detached.
+ */
+static inline void vma_mark_attached(struct vm_area_struct *vma)
+{
+ vma_assert_write_locked(vma);
+ vma_assert_detached(vma);
+ refcount_set_release(&vma->vm_refcnt, 1);
+}
+
+/*
+ * Mark @vma detached by dropping one reference from vm_refcnt. Asserts first
+ * that the VMA is write-locked and currently attached. Nothing further is
+ * done here if the count does not reach zero.
+ */
+static inline void vma_mark_detached(struct vm_area_struct *vma)
+{
+ vma_assert_write_locked(vma);
+ vma_assert_attached(vma);
+ /* We are the only writer, so no need to use vma_refcount_put(). */
+ if (unlikely(!refcount_dec_and_test(&vma->vm_refcnt))) {
+ /*
+ * Reader must have temporarily raised vm_refcnt but it will
+ * drop it without using the vma since vma is write-locked.
+ */
+ }
+}
+
+static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm)
+{
+ memset(vma, 0, sizeof(*vma));
+ vma->vm_mm = mm;
+ vma->vm_ops = &vma_dummy_vm_ops;
+ INIT_LIST_HEAD(&vma->anon_vma_chain);
+ vma->vm_lock_seq = UINT_MAX;
+}
+
+/*
+ * These are defined in vma.h, but sadly vm_stat_account() is referenced by
+ * kernel/fork.c, so we have to make these broadly available there, and
+ * temporarily define them here to resolve the dependency cycle.
+ */
+#define is_exec_mapping(flags) \
+ ((flags & (VM_EXEC | VM_WRITE | VM_STACK)) == VM_EXEC)
+
+#define is_stack_mapping(flags) \
+ (((flags & VM_STACK) == VM_STACK) || (flags & VM_SHADOW_STACK))
+
+#define is_data_mapping(flags) \
+ ((flags & (VM_WRITE | VM_SHARED | VM_STACK)) == VM_WRITE)
+
+static inline void vm_stat_account(struct mm_struct *mm, vm_flags_t flags,
+ long npages)
+{
+ WRITE_ONCE(mm->total_vm, READ_ONCE(mm->total_vm)+npages);
+
+ if (is_exec_mapping(flags))
+ mm->exec_vm += npages;
+ else if (is_stack_mapping(flags))
+ mm->stack_vm += npages;
+ else if (is_data_mapping(flags))
+ mm->data_vm += npages;
+}
+
+#undef is_exec_mapping
+#undef is_stack_mapping
+#undef is_data_mapping
+
+static inline void vm_unacct_memory(long pages)
+{
+ vm_acct_memory(-pages);
+}
+
+static inline void mapping_allow_writable(struct address_space *mapping)
+{
+ atomic_inc(&mapping->i_mmap_writable);
+}
+
+static inline
+struct vm_area_struct *vma_find(struct vma_iterator *vmi, unsigned long max)
+{
+ return mas_find(&vmi->mas, max - 1);
+}
+
+static inline int vma_iter_clear_gfp(struct vma_iterator *vmi,
+ unsigned long start, unsigned long end, gfp_t gfp)
+{
+ __mas_set_range(&vmi->mas, start, end - 1);
+ mas_store_gfp(&vmi->mas, NULL, gfp);
+ if (unlikely(mas_is_err(&vmi->mas)))
+ return -ENOMEM;
+
+ return 0;
+}
+
+static inline void vma_set_anonymous(struct vm_area_struct *vma)
+{
+ vma->vm_ops = NULL;
+}
+
+/* Declared in vma.h. */
+static inline void set_vma_from_desc(struct vm_area_struct *vma,
+ struct vm_area_desc *desc);
+
+static inline int __compat_vma_mmap(const struct file_operations *f_op,
+ struct file *file, struct vm_area_struct *vma)
+{
+ struct vm_area_desc desc = {
+ .mm = vma->vm_mm,
+ .file = file,
+ .start = vma->vm_start,
+ .end = vma->vm_end,
+
+ .pgoff = vma->vm_pgoff,
+ .vm_file = vma->vm_file,
+ .vm_flags = vma->vm_flags,
+ .page_prot = vma->vm_page_prot,
+
+ .action.type = MMAP_NOTHING, /* Default */
+ };
+ int err;
+
+ err = f_op->mmap_prepare(&desc);
+ if (err)
+ return err;
+
+ mmap_action_prepare(&desc.action, &desc);
+ set_vma_from_desc(vma, &desc);
+ return mmap_action_complete(&desc.action, vma);
+}
+
+static inline int compat_vma_mmap(struct file *file,
+ struct vm_area_struct *vma)
+{
+ return __compat_vma_mmap(file->f_op, file, vma);
+}
+
+
+static inline void vma_iter_init(struct vma_iterator *vmi,
+ struct mm_struct *mm, unsigned long addr)
+{
+ mas_init(&vmi->mas, &mm->mm_mt, addr);
+}
+
+static inline unsigned long vma_pages(struct vm_area_struct *vma)
+{
+ return (vma->vm_end - vma->vm_start) >> PAGE_SHIFT;
+}
+
+static inline void mmap_assert_locked(struct mm_struct *);
+static inline struct vm_area_struct *find_vma_intersection(struct mm_struct *mm,
+ unsigned long start_addr,
+ unsigned long end_addr)
+{
+ unsigned long index = start_addr;
+
+ mmap_assert_locked(mm);
+ return mt_find(&mm->mm_mt, &index, end_addr - 1);
+}
+
+static inline
+struct vm_area_struct *vma_lookup(struct mm_struct *mm, unsigned long addr)
+{
+ return mtree_load(&mm->mm_mt, addr);
+}
+
+static inline struct vm_area_struct *vma_prev(struct vma_iterator *vmi)
+{
+ return mas_prev(&vmi->mas, 0);
+}
+
+static inline void vma_iter_set(struct vma_iterator *vmi, unsigned long addr)
+{
+ mas_set(&vmi->mas, addr);
+}
+
+static inline bool vma_is_anonymous(struct vm_area_struct *vma)
+{
+ return !vma->vm_ops;
+}
+
+/* Defined in vma.h, so temporarily define here to avoid circular dependency. */
+#define vma_iter_load(vmi) \
+ mas_walk(&(vmi)->mas)
+
+static inline struct vm_area_struct *
+find_vma_prev(struct mm_struct *mm, unsigned long addr,
+ struct vm_area_struct **pprev)
+{
+ struct vm_area_struct *vma;
+ VMA_ITERATOR(vmi, mm, addr);
+
+ vma = vma_iter_load(&vmi);
+ *pprev = vma_prev(&vmi);
+ if (!vma)
+ vma = vma_next(&vmi);
+ return vma;
+}
+
+#undef vma_iter_load
+
+static inline void vma_iter_free(struct vma_iterator *vmi)
+{
+ mas_destroy(&vmi->mas);
+}
+
+static inline
+struct vm_area_struct *vma_iter_next_range(struct vma_iterator *vmi)
+{
+ return mas_next_range(&vmi->mas, ULONG_MAX);
+}
+
+bool vma_wants_writenotify(struct vm_area_struct *vma, pgprot_t vm_page_prot);
+
+/* Update vma->vm_page_prot to reflect vma->vm_flags. */
+static inline void vma_set_page_prot(struct vm_area_struct *vma)
+{
+ vm_flags_t vm_flags = vma->vm_flags;
+ pgprot_t vm_page_prot;
+
+ /* testing: we inline vm_pgprot_modify() to avoid clash with vma.h. */
+ vm_page_prot = pgprot_modify(vma->vm_page_prot, vm_get_page_prot(vm_flags));
+
+ if (vma_wants_writenotify(vma, vm_page_prot)) {
+ vm_flags &= ~VM_SHARED;
+ /* testing: we inline vm_pgprot_modify() to avoid clash with vma.h. */
+ vm_page_prot = pgprot_modify(vm_page_prot, vm_get_page_prot(vm_flags));
+ }
+ /* remove_protection_ptes reads vma->vm_page_prot without mmap_lock */
+ WRITE_ONCE(vma->vm_page_prot, vm_page_prot);
+}
+
+static inline unsigned long stack_guard_start_gap(struct vm_area_struct *vma)
+{
+ if (vma->vm_flags & VM_GROWSDOWN)
+ return stack_guard_gap;
+
+ /* See reasoning around the VM_SHADOW_STACK definition */
+ if (vma->vm_flags & VM_SHADOW_STACK)
+ return PAGE_SIZE;
+
+ return 0;
+}
+
+static inline unsigned long vm_start_gap(struct vm_area_struct *vma)
+{
+ unsigned long gap = stack_guard_start_gap(vma);
+ unsigned long vm_start = vma->vm_start;
+
+ vm_start -= gap;
+ if (vm_start > vma->vm_start)
+ vm_start = 0;
+ return vm_start;
+}
+
+static inline unsigned long vm_end_gap(struct vm_area_struct *vma)
+{
+ unsigned long vm_end = vma->vm_end;
+
+ if (vma->vm_flags & VM_GROWSUP) {
+ vm_end += stack_guard_gap;
+ if (vm_end < vma->vm_end)
+ vm_end = -PAGE_SIZE;
+ }
+ return vm_end;
+}
+
+static inline bool vma_is_accessible(struct vm_area_struct *vma)
+{
+ return vma->vm_flags & VM_ACCESS_FLAGS;
+}
+
+static inline bool mlock_future_ok(const struct mm_struct *mm,
+ vm_flags_t vm_flags, unsigned long bytes)
+{
+ unsigned long locked_pages, limit_pages;
+
+ if (!(vm_flags & VM_LOCKED) || capable(CAP_IPC_LOCK))
+ return true;
+
+ locked_pages = bytes >> PAGE_SHIFT;
+ locked_pages += mm->locked_vm;
+
+ limit_pages = rlimit(RLIMIT_MEMLOCK);
+ limit_pages >>= PAGE_SHIFT;
+
+ return locked_pages <= limit_pages;
+}
+
+static inline bool map_deny_write_exec(unsigned long old, unsigned long new)
+{
+ /* If MDWE is disabled, we have nothing to deny. */
+ if (!mm_flags_test(MMF_HAS_MDWE, current->mm))
+ return false;
+
+ /* If the new VMA is not executable, we have nothing to deny. */
+ if (!(new & VM_EXEC))
+ return false;
+
+ /* Under MDWE we do not accept newly writably executable VMAs... */
+ if (new & VM_WRITE)
+ return true;
+
+ /* ...nor previously non-executable VMAs becoming executable. */
+ if (!(old & VM_EXEC))
+ return true;
+
+ return false;
+}
+
+static inline int mapping_map_writable(struct address_space *mapping)
+{
+ return atomic_inc_unless_negative(&mapping->i_mmap_writable) ?
+ 0 : -EPERM;
+}
+
+/* Did the driver provide valid mmap hook configuration? */
+static inline bool can_mmap_file(struct file *file)
+{
+ bool has_mmap = file->f_op->mmap;
+ bool has_mmap_prepare = file->f_op->mmap_prepare;
+
+ /* Hooks are mutually exclusive. */
+ if (WARN_ON_ONCE(has_mmap && has_mmap_prepare))
+ return false;
+ if (!has_mmap && !has_mmap_prepare)
+ return false;
+
+ return true;
+}
+
+static inline int vfs_mmap(struct file *file, struct vm_area_struct *vma)
+{
+ if (file->f_op->mmap_prepare)
+ return compat_vma_mmap(file, vma);
+
+ return file->f_op->mmap(file, vma);
+}
+
+static inline int vfs_mmap_prepare(struct file *file, struct vm_area_desc *desc)
+{
+ return file->f_op->mmap_prepare(desc);
+}
+
+static inline void vma_set_file(struct vm_area_struct *vma, struct file *file)
+{
+ /* Changing an anonymous vma with this is illegal */
+ get_file(file);
+ swap(vma->vm_file, file);
+ fput(file);
+}
diff --git a/tools/testing/vma/include/stubs.h b/tools/testing/vma/include/stubs.h
new file mode 100644
index 000000000000..947a3a0c2566
--- /dev/null
+++ b/tools/testing/vma/include/stubs.h
@@ -0,0 +1,428 @@
+/* SPDX-License-Identifier: GPL-2.0+ */
+
+#pragma once
+
+/*
+ * Contains declarations that are STUBBED, that is that are rendered no-ops, in
+ * order to facilitate userland VMA testing.
+ */
+
+/* Forward declarations. */
+struct mm_struct;
+struct vm_area_struct;
+struct vm_area_desc;
+struct pagetable_move_control;
+struct mmap_action;
+struct file;
+struct anon_vma;
+struct anon_vma_chain;
+struct address_space;
+struct unmap_desc;
+
+#define __bitwise
+#define __randomize_layout
+
+#define FIRST_USER_ADDRESS 0UL
+#define USER_PGTABLES_CEILING 0UL
+
+#define vma_policy(vma) NULL
+
+#define down_write_nest_lock(sem, nest_lock)
+
+#define data_race(expr) expr
+
+#define ASSERT_EXCLUSIVE_WRITER(x)
+
+struct vm_userfaultfd_ctx {};
+struct mempolicy {};
+struct mmu_gather {};
+struct mutex {};
+struct vm_fault {};
+
+static inline void userfaultfd_unmap_complete(struct mm_struct *mm,
+ struct list_head *uf)
+{
+}
+
+static inline unsigned long move_page_tables(struct pagetable_move_control *pmc)
+{
+ return 0;
+}
+
+static inline void free_pgd_range(struct mmu_gather *tlb,
+ unsigned long addr, unsigned long end,
+ unsigned long floor, unsigned long ceiling)
+{
+}
+
+static inline int ksm_execve(struct mm_struct *mm)
+{
+ return 0;
+}
+
+static inline void ksm_exit(struct mm_struct *mm)
+{
+}
+
+static inline void vma_numab_state_init(struct vm_area_struct *vma)
+{
+}
+
+static inline void vma_numab_state_free(struct vm_area_struct *vma)
+{
+}
+
+static inline void dup_anon_vma_name(struct vm_area_struct *orig_vma,
+ struct vm_area_struct *new_vma)
+{
+}
+
+static inline void free_anon_vma_name(struct vm_area_struct *vma)
+{
+}
+
+static inline void mmap_action_prepare(struct mmap_action *action,
+ struct vm_area_desc *desc)
+{
+}
+
+static inline int mmap_action_complete(struct mmap_action *action,
+ struct vm_area_struct *vma)
+{
+ return 0;
+}
+
+static inline void fixup_hugetlb_reservations(struct vm_area_struct *vma)
+{
+}
+
+static inline bool shmem_file(struct file *file)
+{
+ return false;
+}
+
+static inline vm_flags_t ksm_vma_flags(const struct mm_struct *mm,
+ const struct file *file, vm_flags_t vm_flags)
+{
+ return vm_flags;
+}
+
+static inline void remap_pfn_range_prepare(struct vm_area_desc *desc, unsigned long pfn)
+{
+}
+
+static inline int remap_pfn_range_complete(struct vm_area_struct *vma, unsigned long addr,
+ unsigned long pfn, unsigned long size, pgprot_t pgprot)
+{
+ return 0;
+}
+
+static inline int do_munmap(struct mm_struct *, unsigned long, size_t,
+ struct list_head *uf)
+{
+ return 0;
+}
+
+/* Currently stubbed but we may later wish to un-stub. */
+static inline void vm_acct_memory(long pages);
+
+static inline void mmap_assert_locked(struct mm_struct *mm)
+{
+}
+
+
+static inline void anon_vma_unlock_write(struct anon_vma *anon_vma)
+{
+}
+
+static inline void i_mmap_unlock_write(struct address_space *mapping)
+{
+}
+
+static inline int userfaultfd_unmap_prep(struct vm_area_struct *vma,
+ unsigned long start,
+ unsigned long end,
+ struct list_head *unmaps)
+{
+ return 0;
+}
+
+static inline void mmap_write_downgrade(struct mm_struct *mm)
+{
+}
+
+static inline void mmap_read_unlock(struct mm_struct *mm)
+{
+}
+
+static inline void mmap_write_unlock(struct mm_struct *mm)
+{
+}
+
+static inline int mmap_write_lock_killable(struct mm_struct *mm)
+{
+ return 0;
+}
+
+static inline bool can_modify_mm(struct mm_struct *mm,
+ unsigned long start,
+ unsigned long end)
+{
+ return true;
+}
+
+static inline void arch_unmap(struct mm_struct *mm,
+ unsigned long start,
+ unsigned long end)
+{
+}
+
+static inline bool mpol_equal(struct mempolicy *a, struct mempolicy *b)
+{
+ return true;
+}
+
+static inline void khugepaged_enter_vma(struct vm_area_struct *vma,
+ vm_flags_t vm_flags)
+{
+}
+
+static inline bool mapping_can_writeback(struct address_space *mapping)
+{
+ return true;
+}
+
+static inline bool is_vm_hugetlb_page(struct vm_area_struct *vma)
+{
+ return false;
+}
+
+static inline bool vma_soft_dirty_enabled(struct vm_area_struct *vma)
+{
+ return false;
+}
+
+static inline bool userfaultfd_wp(struct vm_area_struct *vma)
+{
+ return false;
+}
+
+static inline void mmap_assert_write_locked(struct mm_struct *mm)
+{
+}
+
+static inline void mutex_lock(struct mutex *lock)
+{
+}
+
+static inline void mutex_unlock(struct mutex *lock)
+{
+}
+
+static inline bool mutex_is_locked(struct mutex *lock)
+{
+ return true;
+}
+
+static inline bool signal_pending(void *p)
+{
+ return false;
+}
+
+static inline bool is_file_hugepages(struct file *file)
+{
+ return false;
+}
+
+static inline int security_vm_enough_memory_mm(struct mm_struct *mm, long pages)
+{
+ return 0;
+}
+
+static inline bool may_expand_vm(struct mm_struct *mm, vm_flags_t flags,
+ unsigned long npages)
+{
+ return true;
+}
+
+static inline int shmem_zero_setup(struct vm_area_struct *vma)
+{
+ return 0;
+}
+
+
+static inline void vm_acct_memory(long pages)
+{
+}
+
+static inline void vma_interval_tree_insert(struct vm_area_struct *vma,
+ struct rb_root_cached *rb)
+{
+}
+
+static inline void vma_interval_tree_remove(struct vm_area_struct *vma,
+ struct rb_root_cached *rb)
+{
+}
+
+static inline void flush_dcache_mmap_unlock(struct address_space *mapping)
+{
+}
+
+static inline void anon_vma_interval_tree_insert(struct anon_vma_chain *avc,
+ struct rb_root_cached *rb)
+{
+}
+
+static inline void anon_vma_interval_tree_remove(struct anon_vma_chain *avc,
+ struct rb_root_cached *rb)
+{
+}
+
+static inline void uprobe_mmap(struct vm_area_struct *vma)
+{
+}
+
+static inline void uprobe_munmap(struct vm_area_struct *vma,
+ unsigned long start, unsigned long end)
+{
+}
+
+static inline void i_mmap_lock_write(struct address_space *mapping)
+{
+}
+
+static inline void anon_vma_lock_write(struct anon_vma *anon_vma)
+{
+}
+
+static inline void vma_assert_write_locked(struct vm_area_struct *vma)
+{
+}
+
+static inline void ksm_add_vma(struct vm_area_struct *vma)
+{
+}
+
+static inline void perf_event_mmap(struct vm_area_struct *vma)
+{
+}
+
+static inline bool vma_is_dax(struct vm_area_struct *vma)
+{
+ return false;
+}
+
+static inline struct vm_area_struct *get_gate_vma(struct mm_struct *mm)
+{
+ return NULL;
+}
+
+static inline bool arch_validate_flags(vm_flags_t flags)
+{
+ return true;
+}
+
+static inline void vma_close(struct vm_area_struct *vma)
+{
+}
+
+static inline int mmap_file(struct file *file, struct vm_area_struct *vma)
+{
+ return 0;
+}
+
+static inline int is_hugepage_only_range(struct mm_struct *mm,
+ unsigned long addr, unsigned long len)
+{
+ return 0;
+}
+
+static inline bool capable(int cap)
+{
+ return true;
+}
+
+static inline struct anon_vma_name *anon_vma_name(struct vm_area_struct *vma)
+{
+ return NULL;
+}
+
+static inline bool is_mergeable_vm_userfaultfd_ctx(struct vm_area_struct *vma,
+ struct vm_userfaultfd_ctx vm_ctx)
+{
+ return true;
+}
+
+static inline bool anon_vma_name_eq(struct anon_vma_name *anon_name1,
+ struct anon_vma_name *anon_name2)
+{
+ return true;
+}
+
+static inline void might_sleep(void)
+{
+}
+
+static inline void fput(struct file *file)
+{
+}
+
+static inline void mpol_put(struct mempolicy *pol)
+{
+}
+
+static inline void lru_add_drain(void)
+{
+}
+
+static inline void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm)
+{
+}
+
+static inline void update_hiwater_rss(struct mm_struct *mm)
+{
+}
+
+static inline void update_hiwater_vm(struct mm_struct *mm)
+{
+}
+
+static inline void unmap_vmas(struct mmu_gather *tlb, struct unmap_desc *unmap)
+{
+}
+
+static inline void free_pgtables(struct mmu_gather *tlb, struct unmap_desc *unmap)
+{
+}
+
+static inline void mapping_unmap_writable(struct address_space *mapping)
+{
+}
+
+static inline void flush_dcache_mmap_lock(struct address_space *mapping)
+{
+}
+
+static inline void tlb_finish_mmu(struct mmu_gather *tlb)
+{
+}
+
+static inline struct file *get_file(struct file *f)
+{
+ return f;
+}
+
+static inline int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst)
+{
+ return 0;
+}
+
+static inline void vma_adjust_trans_huge(struct vm_area_struct *vma,
+ unsigned long start,
+ unsigned long end,
+ struct vm_area_struct *next)
+{
+}
+
+static inline void hugetlb_split(struct vm_area_struct *, unsigned long) {}
diff --git a/tools/testing/vma/main.c b/tools/testing/vma/main.c
new file mode 100644
index 000000000000..49b09e97a51f
--- /dev/null
+++ b/tools/testing/vma/main.c
@@ -0,0 +1,55 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+#include "shared.h"
+/*
+ * Directly import the VMA implementation here. Our vma_internal.h wrapper
+ * provides userland-equivalent functionality for everything vma.c uses.
+ */
+#include "../../../mm/vma_init.c"
+#include "../../../mm/vma_exec.c"
+#include "../../../mm/vma.c"
+
+/* Tests are included directly so they can test static functions in mm/vma.c. */
+#include "tests/merge.c"
+#include "tests/mmap.c"
+#include "tests/vma.c"
+
+/* Helper functions which utilise static kernel functions. */
+
+struct vm_area_struct *merge_existing(struct vma_merge_struct *vmg)
+{
+ struct vm_area_struct *vma;
+
+ vma = vma_merge_existing_range(vmg);
+ if (vma)
+ vma_assert_attached(vma);
+ return vma;
+}
+
+int attach_vma(struct mm_struct *mm, struct vm_area_struct *vma)
+{
+ int res;
+
+ res = vma_link(mm, vma);
+ if (!res)
+ vma_assert_attached(vma);
+ return res;
+}
+
+/* Main test runner which invokes tests/ *.c runners. */
+int main(void)
+{
+ int num_tests = 0, num_fail = 0;
+
+ maple_tree_init();
+ vma_state_init();
+
+ run_merge_tests(&num_tests, &num_fail);
+ run_mmap_tests(&num_tests, &num_fail);
+ run_vma_tests(&num_tests, &num_fail);
+
+ printf("%d tests run, %d passed, %d failed.\n",
+ num_tests, num_tests - num_fail, num_fail);
+
+ return num_fail == 0 ? EXIT_SUCCESS : EXIT_FAILURE;
+}
diff --git a/tools/testing/vma/shared.c b/tools/testing/vma/shared.c
new file mode 100644
index 000000000000..bda578cc3304
--- /dev/null
+++ b/tools/testing/vma/shared.c
@@ -0,0 +1,131 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+#include "shared.h"
+
+
+bool fail_prealloc;
+unsigned long mmap_min_addr = CONFIG_DEFAULT_MMAP_MIN_ADDR;
+unsigned long dac_mmap_min_addr = CONFIG_DEFAULT_MMAP_MIN_ADDR;
+unsigned long stack_guard_gap = 256UL<<PAGE_SHIFT;
+
+const struct vm_operations_struct vma_dummy_vm_ops;
+struct anon_vma dummy_anon_vma;
+struct task_struct __current;
+
+struct vm_area_struct *alloc_vma(struct mm_struct *mm,
+ unsigned long start, unsigned long end,
+ pgoff_t pgoff, vm_flags_t vm_flags)
+{
+ struct vm_area_struct *vma = vm_area_alloc(mm);
+
+ if (vma == NULL)
+ return NULL;
+
+ vma->vm_start = start;
+ vma->vm_end = end;
+ vma->vm_pgoff = pgoff;
+ vm_flags_reset(vma, vm_flags);
+ vma_assert_detached(vma);
+
+ return vma;
+}
+
+void detach_free_vma(struct vm_area_struct *vma)
+{
+ vma_mark_detached(vma);
+ vm_area_free(vma);
+}
+
+struct vm_area_struct *alloc_and_link_vma(struct mm_struct *mm,
+ unsigned long start, unsigned long end,
+ pgoff_t pgoff, vm_flags_t vm_flags)
+{
+ struct vm_area_struct *vma = alloc_vma(mm, start, end, pgoff, vm_flags);
+
+ if (vma == NULL)
+ return NULL;
+
+ if (attach_vma(mm, vma)) {
+ detach_free_vma(vma);
+ return NULL;
+ }
+
+ /*
+ * Reset this counter which we use to track whether writes have
+ * begun. Linking to the tree will have caused this to be incremented,
+ * which means we will get a false positive otherwise.
+ */
+ vma->vm_lock_seq = UINT_MAX;
+
+ return vma;
+}
+
+void reset_dummy_anon_vma(void)
+{
+ dummy_anon_vma.was_cloned = false;
+ dummy_anon_vma.was_unlinked = false;
+}
+
+int cleanup_mm(struct mm_struct *mm, struct vma_iterator *vmi)
+{
+ struct vm_area_struct *vma;
+ int count = 0;
+
+ fail_prealloc = false;
+ reset_dummy_anon_vma();
+
+ vma_iter_set(vmi, 0);
+ for_each_vma(*vmi, vma) {
+ detach_free_vma(vma);
+ count++;
+ }
+
+ mtree_destroy(&mm->mm_mt);
+ mm->map_count = 0;
+ return count;
+}
+
+bool vma_write_started(struct vm_area_struct *vma)
+{
+ int seq = vma->vm_lock_seq;
+
+ /* We reset after each check. */
+ vma->vm_lock_seq = UINT_MAX;
+
+ /* The vma_start_write() stub simply increments this value. */
+ return seq > -1;
+}
+
+void __vma_set_dummy_anon_vma(struct vm_area_struct *vma,
+ struct anon_vma_chain *avc, struct anon_vma *anon_vma)
+{
+ vma->anon_vma = anon_vma;
+ INIT_LIST_HEAD(&vma->anon_vma_chain);
+ list_add(&avc->same_vma, &vma->anon_vma_chain);
+ avc->anon_vma = vma->anon_vma;
+}
+
+void vma_set_dummy_anon_vma(struct vm_area_struct *vma,
+ struct anon_vma_chain *avc)
+{
+ __vma_set_dummy_anon_vma(vma, avc, &dummy_anon_vma);
+}
+
+struct task_struct *get_current(void)
+{
+ return &__current;
+}
+
+unsigned long rlimit(unsigned int limit)
+{
+ return (unsigned long)-1;
+}
+
+void vma_set_range(struct vm_area_struct *vma,
+ unsigned long start, unsigned long end,
+ pgoff_t pgoff)
+{
+ vma->vm_start = start;
+ vma->vm_end = end;
+ vma->vm_pgoff = pgoff;
+}
diff --git a/tools/testing/vma/shared.h b/tools/testing/vma/shared.h
new file mode 100644
index 000000000000..6c64211cfa22
--- /dev/null
+++ b/tools/testing/vma/shared.h
@@ -0,0 +1,114 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+#pragma once
+
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "generated/bit-length.h"
+#include "maple-shared.h"
+#include "vma_internal.h"
+#include "../../../mm/vma.h"
+
+/* Simple test runner. Expects in-scope counter pointers num_tests, num_fail. */
+#define TEST(name) \
+ do { \
+ (*num_tests)++; \
+ if (!test_##name()) { \
+ (*num_fail)++; \
+ fprintf(stderr, "Test " #name " FAILED\n"); \
+ } \
+ } while (0)
+
+#define ASSERT_TRUE(_expr) \
+ do { \
+ if (!(_expr)) { \
+ fprintf(stderr, \
+ "Assert FAILED at %s:%d:%s(): %s is FALSE.\n", \
+ __FILE__, __LINE__, __FUNCTION__, #_expr); \
+ return false; \
+ } \
+ } while (0)
+
+#define ASSERT_FALSE(_expr) ASSERT_TRUE(!(_expr))
+#define ASSERT_EQ(_val1, _val2) ASSERT_TRUE((_val1) == (_val2))
+#define ASSERT_NE(_val1, _val2) ASSERT_TRUE((_val1) != (_val2))
+/* True iff every bit of _flags is set in _val (_flags is evaluated twice). */
+#define IS_SET(_val, _flags) (((_val) & (_flags)) == (_flags))
+
+extern bool fail_prealloc;
+
+/* Override vma_iter_prealloc() so we can choose to fail it. */
+#define vma_iter_prealloc(vmi, vma) \
+ (fail_prealloc ? -ENOMEM : mas_preallocate(&(vmi)->mas, (vma), GFP_KERNEL))
+
+#define CONFIG_DEFAULT_MMAP_MIN_ADDR 65536
+
+extern unsigned long mmap_min_addr;
+extern unsigned long dac_mmap_min_addr;
+extern unsigned long stack_guard_gap;
+
+extern const struct vm_operations_struct vma_dummy_vm_ops;
+extern struct anon_vma dummy_anon_vma;
+extern struct task_struct __current;
+
+/*
+ * Helper function which provides a wrapper around a merge existing VMA
+ * operation.
+ *
+ * Defined in main.c as it uses a static VMA function.
+ */
+struct vm_area_struct *merge_existing(struct vma_merge_struct *vmg);
+
+/*
+ * Helper function to link an already-allocated VMA to the tree.
+ *
+ * Defined in main.c as it uses a static VMA function.
+ */
+int attach_vma(struct mm_struct *mm, struct vm_area_struct *vma);
+
+/* Helper function providing a dummy vm_ops->close() method.*/
+static inline void dummy_close(struct vm_area_struct *)
+{
+}
+
+/* Helper function to simply allocate a VMA. */
+struct vm_area_struct *alloc_vma(struct mm_struct *mm,
+ unsigned long start, unsigned long end,
+ pgoff_t pgoff, vm_flags_t vm_flags);
+
+/* Helper function to detach and free a VMA. */
+void detach_free_vma(struct vm_area_struct *vma);
+
+/* Helper function to allocate a VMA and link it to the tree. */
+struct vm_area_struct *alloc_and_link_vma(struct mm_struct *mm,
+ unsigned long start, unsigned long end,
+ pgoff_t pgoff, vm_flags_t vm_flags);
+
+/*
+ * Helper function to reset the dummy anon_vma to indicate it has not been
+ * duplicated.
+ */
+void reset_dummy_anon_vma(void);
+
+/*
+ * Helper function to remove all VMAs and destroy the maple tree associated with
+ * a virtual address space. Returns a count of VMAs in the tree.
+ */
+int cleanup_mm(struct mm_struct *mm, struct vma_iterator *vmi);
+
+/* Helper function to determine if VMA has had vma_start_write() performed. */
+bool vma_write_started(struct vm_area_struct *vma);
+
+void __vma_set_dummy_anon_vma(struct vm_area_struct *vma,
+ struct anon_vma_chain *avc, struct anon_vma *anon_vma);
+
+/* Provide a simple dummy VMA/anon_vma setup for testing. */
+void vma_set_dummy_anon_vma(struct vm_area_struct *vma,
+ struct anon_vma_chain *avc);
+
+/* Helper function to specify a VMA's range. */
+void vma_set_range(struct vm_area_struct *vma,
+ unsigned long start, unsigned long end,
+ pgoff_t pgoff);
diff --git a/tools/testing/vma/vma.c b/tools/testing/vma/tests/merge.c
index 93d21bc7e112..3708dc6945b0 100644
--- a/tools/testing/vma/vma.c
+++ b/tools/testing/vma/tests/merge.c
@@ -1,132 +1,5 @@
// SPDX-License-Identifier: GPL-2.0-or-later
-#include <stdbool.h>
-#include <stdio.h>
-#include <stdlib.h>
-
-#include "generated/bit-length.h"
-
-#include "maple-shared.h"
-#include "vma_internal.h"
-
-/* Include so header guard set. */
-#include "../../../mm/vma.h"
-
-static bool fail_prealloc;
-
-/* Then override vma_iter_prealloc() so we can choose to fail it. */
-#define vma_iter_prealloc(vmi, vma) \
- (fail_prealloc ? -ENOMEM : mas_preallocate(&(vmi)->mas, (vma), GFP_KERNEL))
-
-#define CONFIG_DEFAULT_MMAP_MIN_ADDR 65536
-
-unsigned long mmap_min_addr = CONFIG_DEFAULT_MMAP_MIN_ADDR;
-unsigned long dac_mmap_min_addr = CONFIG_DEFAULT_MMAP_MIN_ADDR;
-unsigned long stack_guard_gap = 256UL<<PAGE_SHIFT;
-
-/*
- * Directly import the VMA implementation here. Our vma_internal.h wrapper
- * provides userland-equivalent functionality for everything vma.c uses.
- */
-#include "../../../mm/vma_init.c"
-#include "../../../mm/vma_exec.c"
-#include "../../../mm/vma.c"
-
-const struct vm_operations_struct vma_dummy_vm_ops;
-static struct anon_vma dummy_anon_vma;
-
-#define ASSERT_TRUE(_expr) \
- do { \
- if (!(_expr)) { \
- fprintf(stderr, \
- "Assert FAILED at %s:%d:%s(): %s is FALSE.\n", \
- __FILE__, __LINE__, __FUNCTION__, #_expr); \
- return false; \
- } \
- } while (0)
-#define ASSERT_FALSE(_expr) ASSERT_TRUE(!(_expr))
-#define ASSERT_EQ(_val1, _val2) ASSERT_TRUE((_val1) == (_val2))
-#define ASSERT_NE(_val1, _val2) ASSERT_TRUE((_val1) != (_val2))
-
-#define IS_SET(_val, _flags) ((_val & _flags) == _flags)
-
-static struct task_struct __current;
-
-struct task_struct *get_current(void)
-{
- return &__current;
-}
-
-unsigned long rlimit(unsigned int limit)
-{
- return (unsigned long)-1;
-}
-
-/* Helper function to simply allocate a VMA. */
-static struct vm_area_struct *alloc_vma(struct mm_struct *mm,
- unsigned long start,
- unsigned long end,
- pgoff_t pgoff,
- vm_flags_t vm_flags)
-{
- struct vm_area_struct *vma = vm_area_alloc(mm);
-
- if (vma == NULL)
- return NULL;
-
- vma->vm_start = start;
- vma->vm_end = end;
- vma->vm_pgoff = pgoff;
- vm_flags_reset(vma, vm_flags);
- vma_assert_detached(vma);
-
- return vma;
-}
-
-/* Helper function to allocate a VMA and link it to the tree. */
-static int attach_vma(struct mm_struct *mm, struct vm_area_struct *vma)
-{
- int res;
-
- res = vma_link(mm, vma);
- if (!res)
- vma_assert_attached(vma);
- return res;
-}
-
-static void detach_free_vma(struct vm_area_struct *vma)
-{
- vma_mark_detached(vma);
- vm_area_free(vma);
-}
-
-/* Helper function to allocate a VMA and link it to the tree. */
-static struct vm_area_struct *alloc_and_link_vma(struct mm_struct *mm,
- unsigned long start,
- unsigned long end,
- pgoff_t pgoff,
- vm_flags_t vm_flags)
-{
- struct vm_area_struct *vma = alloc_vma(mm, start, end, pgoff, vm_flags);
-
- if (vma == NULL)
- return NULL;
-
- if (attach_vma(mm, vma)) {
- detach_free_vma(vma);
- return NULL;
- }
-
- /*
- * Reset this counter which we use to track whether writes have
- * begun. Linking to the tree will have caused this to be incremented,
- * which means we will get a false positive otherwise.
- */
- vma->vm_lock_seq = UINT_MAX;
-
- return vma;
-}
-
/* Helper function which provides a wrapper around a merge new VMA operation. */
static struct vm_area_struct *merge_new(struct vma_merge_struct *vmg)
{
@@ -147,20 +20,6 @@ static struct vm_area_struct *merge_new(struct vma_merge_struct *vmg)
}
/*
- * Helper function which provides a wrapper around a merge existing VMA
- * operation.
- */
-static struct vm_area_struct *merge_existing(struct vma_merge_struct *vmg)
-{
- struct vm_area_struct *vma;
-
- vma = vma_merge_existing_range(vmg);
- if (vma)
- vma_assert_attached(vma);
- return vma;
-}
-
-/*
* Helper function which provides a wrapper around the expansion of an existing
* VMA.
*/
@@ -173,8 +32,8 @@ static int expand_existing(struct vma_merge_struct *vmg)
* Helper function to reset merge state the associated VMA iterator to a
* specified new range.
*/
-static void vmg_set_range(struct vma_merge_struct *vmg, unsigned long start,
- unsigned long end, pgoff_t pgoff, vm_flags_t vm_flags)
+void vmg_set_range(struct vma_merge_struct *vmg, unsigned long start,
+ unsigned long end, pgoff_t pgoff, vm_flags_t vm_flags)
{
vma_iter_set(vmg->vmi, start);
@@ -197,8 +56,8 @@ static void vmg_set_range(struct vma_merge_struct *vmg, unsigned long start,
/* Helper function to set both the VMG range and its anon_vma. */
static void vmg_set_range_anon_vma(struct vma_merge_struct *vmg, unsigned long start,
- unsigned long end, pgoff_t pgoff, vm_flags_t vm_flags,
- struct anon_vma *anon_vma)
+ unsigned long end, pgoff_t pgoff, vm_flags_t vm_flags,
+ struct anon_vma *anon_vma)
{
vmg_set_range(vmg, start, end, pgoff, vm_flags);
vmg->anon_vma = anon_vma;
@@ -211,10 +70,9 @@ static void vmg_set_range_anon_vma(struct vma_merge_struct *vmg, unsigned long s
* VMA, link it to the maple tree and return it.
*/
static struct vm_area_struct *try_merge_new_vma(struct mm_struct *mm,
- struct vma_merge_struct *vmg,
- unsigned long start, unsigned long end,
- pgoff_t pgoff, vm_flags_t vm_flags,
- bool *was_merged)
+ struct vma_merge_struct *vmg, unsigned long start,
+ unsigned long end, pgoff_t pgoff, vm_flags_t vm_flags,
+ bool *was_merged)
{
struct vm_area_struct *merged;
@@ -234,72 +92,6 @@ static struct vm_area_struct *try_merge_new_vma(struct mm_struct *mm,
return alloc_and_link_vma(mm, start, end, pgoff, vm_flags);
}
-/*
- * Helper function to reset the dummy anon_vma to indicate it has not been
- * duplicated.
- */
-static void reset_dummy_anon_vma(void)
-{
- dummy_anon_vma.was_cloned = false;
- dummy_anon_vma.was_unlinked = false;
-}
-
-/*
- * Helper function to remove all VMAs and destroy the maple tree associated with
- * a virtual address space. Returns a count of VMAs in the tree.
- */
-static int cleanup_mm(struct mm_struct *mm, struct vma_iterator *vmi)
-{
- struct vm_area_struct *vma;
- int count = 0;
-
- fail_prealloc = false;
- reset_dummy_anon_vma();
-
- vma_iter_set(vmi, 0);
- for_each_vma(*vmi, vma) {
- detach_free_vma(vma);
- count++;
- }
-
- mtree_destroy(&mm->mm_mt);
- mm->map_count = 0;
- return count;
-}
-
-/* Helper function to determine if VMA has had vma_start_write() performed. */
-static bool vma_write_started(struct vm_area_struct *vma)
-{
- int seq = vma->vm_lock_seq;
-
- /* We reset after each check. */
- vma->vm_lock_seq = UINT_MAX;
-
- /* The vma_start_write() stub simply increments this value. */
- return seq > -1;
-}
-
-/* Helper function providing a dummy vm_ops->close() method.*/
-static void dummy_close(struct vm_area_struct *)
-{
-}
-
-static void __vma_set_dummy_anon_vma(struct vm_area_struct *vma,
- struct anon_vma_chain *avc,
- struct anon_vma *anon_vma)
-{
- vma->anon_vma = anon_vma;
- INIT_LIST_HEAD(&vma->anon_vma_chain);
- list_add(&avc->same_vma, &vma->anon_vma_chain);
- avc->anon_vma = vma->anon_vma;
-}
-
-static void vma_set_dummy_anon_vma(struct vm_area_struct *vma,
- struct anon_vma_chain *avc)
-{
- __vma_set_dummy_anon_vma(vma, avc, &dummy_anon_vma);
-}
-
static bool test_simple_merge(void)
{
struct vm_area_struct *vma;
@@ -1616,39 +1408,6 @@ static bool test_merge_extend(void)
return true;
}
-static bool test_copy_vma(void)
-{
- vm_flags_t vm_flags = VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE;
- struct mm_struct mm = {};
- bool need_locks = false;
- VMA_ITERATOR(vmi, &mm, 0);
- struct vm_area_struct *vma, *vma_new, *vma_next;
-
- /* Move backwards and do not merge. */
-
- vma = alloc_and_link_vma(&mm, 0x3000, 0x5000, 3, vm_flags);
- vma_new = copy_vma(&vma, 0, 0x2000, 0, &need_locks);
- ASSERT_NE(vma_new, vma);
- ASSERT_EQ(vma_new->vm_start, 0);
- ASSERT_EQ(vma_new->vm_end, 0x2000);
- ASSERT_EQ(vma_new->vm_pgoff, 0);
- vma_assert_attached(vma_new);
-
- cleanup_mm(&mm, &vmi);
-
- /* Move a VMA into position next to another and merge the two. */
-
- vma = alloc_and_link_vma(&mm, 0, 0x2000, 0, vm_flags);
- vma_next = alloc_and_link_vma(&mm, 0x6000, 0x8000, 6, vm_flags);
- vma_new = copy_vma(&vma, 0x4000, 0x2000, 4, &need_locks);
- vma_assert_attached(vma_new);
-
- ASSERT_EQ(vma_new, vma_next);
-
- cleanup_mm(&mm, &vmi);
- return true;
-}
-
static bool test_expand_only_mode(void)
{
vm_flags_t vm_flags = VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE;
@@ -1689,73 +1448,8 @@ static bool test_expand_only_mode(void)
return true;
}
-static bool test_mmap_region_basic(void)
-{
- struct mm_struct mm = {};
- unsigned long addr;
- struct vm_area_struct *vma;
- VMA_ITERATOR(vmi, &mm, 0);
-
- current->mm = &mm;
-
- /* Map at 0x300000, length 0x3000. */
- addr = __mmap_region(NULL, 0x300000, 0x3000,
- VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE,
- 0x300, NULL);
- ASSERT_EQ(addr, 0x300000);
-
- /* Map at 0x250000, length 0x3000. */
- addr = __mmap_region(NULL, 0x250000, 0x3000,
- VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE,
- 0x250, NULL);
- ASSERT_EQ(addr, 0x250000);
-
- /* Map at 0x303000, merging to 0x300000 of length 0x6000. */
- addr = __mmap_region(NULL, 0x303000, 0x3000,
- VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE,
- 0x303, NULL);
- ASSERT_EQ(addr, 0x303000);
-
- /* Map at 0x24d000, merging to 0x250000 of length 0x6000. */
- addr = __mmap_region(NULL, 0x24d000, 0x3000,
- VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE,
- 0x24d, NULL);
- ASSERT_EQ(addr, 0x24d000);
-
- ASSERT_EQ(mm.map_count, 2);
-
- for_each_vma(vmi, vma) {
- if (vma->vm_start == 0x300000) {
- ASSERT_EQ(vma->vm_end, 0x306000);
- ASSERT_EQ(vma->vm_pgoff, 0x300);
- } else if (vma->vm_start == 0x24d000) {
- ASSERT_EQ(vma->vm_end, 0x253000);
- ASSERT_EQ(vma->vm_pgoff, 0x24d);
- } else {
- ASSERT_FALSE(true);
- }
- }
-
- cleanup_mm(&mm, &vmi);
- return true;
-}
-
-int main(void)
+static void run_merge_tests(int *num_tests, int *num_fail)
{
- int num_tests = 0, num_fail = 0;
-
- maple_tree_init();
- vma_state_init();
-
-#define TEST(name) \
- do { \
- num_tests++; \
- if (!test_##name()) { \
- num_fail++; \
- fprintf(stderr, "Test " #name " FAILED\n"); \
- } \
- } while (0)
-
/* Very simple tests to kick the tyres. */
TEST(simple_merge);
TEST(simple_modify);
@@ -1771,15 +1465,5 @@ int main(void)
TEST(dup_anon_vma);
TEST(vmi_prealloc_fail);
TEST(merge_extend);
- TEST(copy_vma);
TEST(expand_only_mode);
-
- TEST(mmap_region_basic);
-
-#undef TEST
-
- printf("%d tests run, %d passed, %d failed.\n",
- num_tests, num_tests - num_fail, num_fail);
-
- return num_fail == 0 ? EXIT_SUCCESS : EXIT_FAILURE;
}
diff --git a/tools/testing/vma/tests/mmap.c b/tools/testing/vma/tests/mmap.c
new file mode 100644
index 000000000000..bded4ecbe5db
--- /dev/null
+++ b/tools/testing/vma/tests/mmap.c
@@ -0,0 +1,57 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+static bool test_mmap_region_basic(void)
+{
+ struct mm_struct mm = {};
+ unsigned long addr;
+ struct vm_area_struct *vma;
+ VMA_ITERATOR(vmi, &mm, 0);
+
+ current->mm = &mm;
+
+ /* Map at 0x300000, length 0x3000. */
+ addr = __mmap_region(NULL, 0x300000, 0x3000,
+ VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE,
+ 0x300, NULL);
+ ASSERT_EQ(addr, 0x300000);
+
+ /* Map at 0x250000, length 0x3000. */
+ addr = __mmap_region(NULL, 0x250000, 0x3000,
+ VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE,
+ 0x250, NULL);
+ ASSERT_EQ(addr, 0x250000);
+
+ /* Map at 0x303000, merging to 0x300000 of length 0x6000. */
+ addr = __mmap_region(NULL, 0x303000, 0x3000,
+ VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE,
+ 0x303, NULL);
+ ASSERT_EQ(addr, 0x303000);
+
+ /* Map at 0x24d000, merging to 0x250000 of length 0x6000. */
+ addr = __mmap_region(NULL, 0x24d000, 0x3000,
+ VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE,
+ 0x24d, NULL);
+ ASSERT_EQ(addr, 0x24d000);
+
+ ASSERT_EQ(mm.map_count, 2);
+
+ for_each_vma(vmi, vma) {
+ if (vma->vm_start == 0x300000) {
+ ASSERT_EQ(vma->vm_end, 0x306000);
+ ASSERT_EQ(vma->vm_pgoff, 0x300);
+ } else if (vma->vm_start == 0x24d000) {
+ ASSERT_EQ(vma->vm_end, 0x253000);
+ ASSERT_EQ(vma->vm_pgoff, 0x24d);
+ } else {
+ ASSERT_FALSE(true);
+ }
+ }
+
+ cleanup_mm(&mm, &vmi);
+ return true;
+}
+
+static void run_mmap_tests(int *num_tests, int *num_fail)
+{
+ TEST(mmap_region_basic);
+}
diff --git a/tools/testing/vma/tests/vma.c b/tools/testing/vma/tests/vma.c
new file mode 100644
index 000000000000..c54ffc954f11
--- /dev/null
+++ b/tools/testing/vma/tests/vma.c
@@ -0,0 +1,339 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+static bool compare_legacy_flags(vm_flags_t legacy_flags, vma_flags_t flags)
+{
+	const unsigned long legacy_val = legacy_flags;
+	/* The lower word should contain the precise same value. */
+	const unsigned long flags_lower = flags.__vma_flags[0];
+#if NUM_VMA_FLAG_BITS > BITS_PER_LONG
+	int i;
+
+	/* Legacy flags fit in one word, so every higher word must be zero. */
+	for (i = 1; i < BITS_TO_LONGS(NUM_VMA_FLAG_BITS); i++) {
+		if (flags.__vma_flags[i] != 0)
+			return false;
+	}
+#endif
+
+	static_assert(sizeof(legacy_flags) == sizeof(unsigned long));
+
+	return legacy_val == flags_lower;
+}
+
+static bool test_copy_vma(void)
+{
+ vm_flags_t vm_flags = VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE;
+ struct mm_struct mm = {};
+ bool need_locks = false;
+ VMA_ITERATOR(vmi, &mm, 0);
+ struct vm_area_struct *vma, *vma_new, *vma_next;
+
+ /* Move backwards and do not merge. */
+
+ vma = alloc_and_link_vma(&mm, 0x3000, 0x5000, 3, vm_flags);
+ vma_new = copy_vma(&vma, 0, 0x2000, 0, &need_locks);
+ ASSERT_NE(vma_new, vma);
+ ASSERT_EQ(vma_new->vm_start, 0);
+ ASSERT_EQ(vma_new->vm_end, 0x2000);
+ ASSERT_EQ(vma_new->vm_pgoff, 0);
+ vma_assert_attached(vma_new);
+
+ cleanup_mm(&mm, &vmi);
+
+ /* Move a VMA into position next to another and merge the two. */
+
+ vma = alloc_and_link_vma(&mm, 0, 0x2000, 0, vm_flags);
+ vma_next = alloc_and_link_vma(&mm, 0x6000, 0x8000, 6, vm_flags);
+ vma_new = copy_vma(&vma, 0x4000, 0x2000, 4, &need_locks);
+ vma_assert_attached(vma_new);
+
+ ASSERT_EQ(vma_new, vma_next);
+
+ cleanup_mm(&mm, &vmi);
+ return true;
+}
+
+static bool test_vma_flags_unchanged(void)
+{
+ vma_flags_t flags = EMPTY_VMA_FLAGS;
+ vm_flags_t legacy_flags = 0;
+ int bit;
+ struct vm_area_struct vma;
+ struct vm_area_desc desc;
+
+
+ vma.flags = EMPTY_VMA_FLAGS;
+ desc.vma_flags = EMPTY_VMA_FLAGS;
+
+ for (bit = 0; bit < BITS_PER_LONG; bit++) {
+ vma_flags_t mask = mk_vma_flags(bit);
+
+ legacy_flags |= (1UL << bit);
+
+ /* Individual flags. */
+ vma_flags_set(&flags, bit);
+ ASSERT_TRUE(compare_legacy_flags(legacy_flags, flags));
+
+ /* Via mask. */
+ vma_flags_set_mask(&flags, mask);
+ ASSERT_TRUE(compare_legacy_flags(legacy_flags, flags));
+
+ /* Same for VMA. */
+ vma_set_flags(&vma, bit);
+ ASSERT_TRUE(compare_legacy_flags(legacy_flags, vma.flags));
+ vma_set_flags_mask(&vma, mask);
+ ASSERT_TRUE(compare_legacy_flags(legacy_flags, vma.flags));
+
+ /* Same for VMA descriptor. */
+ vma_desc_set_flags(&desc, bit);
+ ASSERT_TRUE(compare_legacy_flags(legacy_flags, desc.vma_flags));
+ vma_desc_set_flags_mask(&desc, mask);
+ ASSERT_TRUE(compare_legacy_flags(legacy_flags, desc.vma_flags));
+ }
+
+ return true;
+}
+
+static bool test_vma_flags_cleared(void)
+{
+	const vma_flags_t empty = EMPTY_VMA_FLAGS;
+	vma_flags_t flags;
+	int i;
+
+	/* Set all bits high (0xff per byte; 1 would only set bit 0 of each). */
+	memset(&flags, 0xff, sizeof(flags));
+	/* Try to clear. */
+	vma_flags_clear_all(&flags);
+	/* Equal to EMPTY_VMA_FLAGS? */
+	ASSERT_EQ(memcmp(&empty, &flags, sizeof(flags)), 0);
+	/* Make sure every unsigned long entry in the bitmap array is zero. */
+	for (i = 0; i < sizeof(flags.__vma_flags) / sizeof(flags.__vma_flags[0]); i++) {
+		const unsigned long val = flags.__vma_flags[i];
+
+		ASSERT_EQ(val, 0);
+	}
+
+	return true;
+}
+
+/*
+ * Assert that VMA flag functions that operate at the system word level function
+ * correctly.
+ */
+static bool test_vma_flags_word(void)
+{
+ vma_flags_t flags = EMPTY_VMA_FLAGS;
+ const vma_flags_t comparison =
+ mk_vma_flags(VMA_READ_BIT, VMA_WRITE_BIT, 64, 65);
+
+ /* Set some custom high flags. */
+ vma_flags_set(&flags, 64, 65);
+ /* Now overwrite the first word. */
+ vma_flags_overwrite_word(&flags, VM_READ | VM_WRITE);
+ /* Ensure they are equal. */
+ ASSERT_EQ(memcmp(&flags, &comparison, sizeof(flags)), 0);
+
+ flags = EMPTY_VMA_FLAGS;
+ vma_flags_set(&flags, 64, 65);
+
+ /* Do the same with the _once() equivalent. */
+ vma_flags_overwrite_word_once(&flags, VM_READ | VM_WRITE);
+ ASSERT_EQ(memcmp(&flags, &comparison, sizeof(flags)), 0);
+
+ flags = EMPTY_VMA_FLAGS;
+ vma_flags_set(&flags, 64, 65);
+
+ /* Make sure we can set a word without disturbing other bits. */
+ vma_flags_set(&flags, VMA_WRITE_BIT);
+ vma_flags_set_word(&flags, VM_READ);
+ ASSERT_EQ(memcmp(&flags, &comparison, sizeof(flags)), 0);
+
+ flags = EMPTY_VMA_FLAGS;
+ vma_flags_set(&flags, 64, 65);
+
+ /* Make sure we can clear a word without disturbing other bits. */
+ vma_flags_set(&flags, VMA_READ_BIT, VMA_WRITE_BIT, VMA_EXEC_BIT);
+ vma_flags_clear_word(&flags, VM_EXEC);
+ ASSERT_EQ(memcmp(&flags, &comparison, sizeof(flags)), 0);
+
+ return true;
+}
+
+/* Ensure that vma_flags_test() and friends work correctly. */
+static bool test_vma_flags_test(void)
+{
+ const vma_flags_t flags = mk_vma_flags(VMA_READ_BIT, VMA_WRITE_BIT,
+ VMA_EXEC_BIT, 64, 65);
+ struct vm_area_struct vma;
+ struct vm_area_desc desc;
+
+ vma.flags = flags;
+ desc.vma_flags = flags;
+
+#define do_test(...) \
+ ASSERT_TRUE(vma_flags_test(&flags, __VA_ARGS__)); \
+ ASSERT_TRUE(vma_desc_test_flags(&desc, __VA_ARGS__))
+
+#define do_test_all_true(...) \
+ ASSERT_TRUE(vma_flags_test_all(&flags, __VA_ARGS__)); \
+ ASSERT_TRUE(vma_test_all_flags(&vma, __VA_ARGS__))
+
+#define do_test_all_false(...) \
+ ASSERT_FALSE(vma_flags_test_all(&flags, __VA_ARGS__)); \
+ ASSERT_FALSE(vma_test_all_flags(&vma, __VA_ARGS__))
+
+ /*
+ * Testing for some flags that are present, some that are not - should
+ * pass. ANY flags matching should work.
+ */
+ do_test(VMA_READ_BIT, VMA_MAYREAD_BIT, VMA_SEQ_READ_BIT);
+ /* However, the ...test_all() variant should NOT pass. */
+ do_test_all_false(VMA_READ_BIT, VMA_MAYREAD_BIT, VMA_SEQ_READ_BIT);
+ /* But should pass for flags present. */
+ do_test_all_true(VMA_READ_BIT, VMA_WRITE_BIT, VMA_EXEC_BIT, 64, 65);
+ /* Also subsets... */
+ do_test_all_true(VMA_READ_BIT, VMA_WRITE_BIT, VMA_EXEC_BIT, 64);
+ do_test_all_true(VMA_READ_BIT, VMA_WRITE_BIT, VMA_EXEC_BIT);
+ do_test_all_true(VMA_READ_BIT, VMA_WRITE_BIT);
+ do_test_all_true(VMA_READ_BIT);
+ /*
+ * Check _mask variant. We don't need to test extensively as macro
+ * helper is the equivalent.
+ */
+ ASSERT_TRUE(vma_flags_test_mask(&flags, flags));
+ ASSERT_TRUE(vma_flags_test_all_mask(&flags, flags));
+
+ /* Single bits. */
+ do_test(VMA_READ_BIT);
+ do_test(VMA_WRITE_BIT);
+ do_test(VMA_EXEC_BIT);
+#if NUM_VMA_FLAG_BITS > 64
+ do_test(64);
+ do_test(65);
+#endif
+
+ /* Two bits. */
+ do_test(VMA_READ_BIT, VMA_WRITE_BIT);
+ do_test(VMA_READ_BIT, VMA_EXEC_BIT);
+ do_test(VMA_WRITE_BIT, VMA_EXEC_BIT);
+ /* Ordering shouldn't matter. */
+ do_test(VMA_WRITE_BIT, VMA_READ_BIT);
+ do_test(VMA_EXEC_BIT, VMA_READ_BIT);
+ do_test(VMA_EXEC_BIT, VMA_WRITE_BIT);
+#if NUM_VMA_FLAG_BITS > 64
+ do_test(VMA_READ_BIT, 64);
+ do_test(VMA_WRITE_BIT, 64);
+ do_test(64, VMA_READ_BIT);
+ do_test(64, VMA_WRITE_BIT);
+ do_test(VMA_READ_BIT, 65);
+ do_test(VMA_WRITE_BIT, 65);
+ do_test(65, VMA_READ_BIT);
+ do_test(65, VMA_WRITE_BIT);
+#endif
+ /* Three bits. */
+ do_test(VMA_READ_BIT, VMA_WRITE_BIT, VMA_EXEC_BIT);
+#if NUM_VMA_FLAG_BITS > 64
+ /* No need to consider every single permutation. */
+ do_test(VMA_READ_BIT, VMA_WRITE_BIT, 64);
+ do_test(VMA_READ_BIT, VMA_WRITE_BIT, 65);
+
+ /* Four bits. */
+ do_test(VMA_READ_BIT, VMA_WRITE_BIT, VMA_EXEC_BIT, 64);
+ do_test(VMA_READ_BIT, VMA_WRITE_BIT, VMA_EXEC_BIT, 65);
+
+ /* Five bits. */
+ do_test(VMA_READ_BIT, VMA_WRITE_BIT, VMA_EXEC_BIT, 64, 65);
+#endif
+
+#undef do_test
+#undef do_test_all_true
+#undef do_test_all_false
+
+ return true;
+}
+
+/* Ensure that vma_flags_clear() and friends work correctly. */
+static bool test_vma_flags_clear(void)
+{
+	vma_flags_t flags = mk_vma_flags(VMA_READ_BIT, VMA_WRITE_BIT,
+					 VMA_EXEC_BIT, 64, 65);
+	vma_flags_t mask = mk_vma_flags(VMA_EXEC_BIT, 64);
+	struct vm_area_struct vma;
+	struct vm_area_desc desc;
+
+	vma.flags = flags;
+	desc.vma_flags = flags;
+
+	/* Cursory check of _mask() variant, as the helper macros imply. */
+	vma_flags_clear_mask(&flags, mask);
+	vma_flags_clear_mask(&vma.flags, mask);
+	vma_desc_clear_flags_mask(&desc, mask);
+	ASSERT_FALSE(vma_flags_test(&flags, VMA_EXEC_BIT, 64));
+	ASSERT_FALSE(vma_flags_test(&vma.flags, VMA_EXEC_BIT, 64));
+	ASSERT_FALSE(vma_desc_test_flags(&desc, VMA_EXEC_BIT, 64));
+	/* Reset. */
+	vma_flags_set(&flags, VMA_EXEC_BIT, 64);
+	vma_set_flags(&vma, VMA_EXEC_BIT, 64);
+	vma_desc_set_flags(&desc, VMA_EXEC_BIT, 64);
+
+	/*
+	 * Clear the flags and assert clear worked, then reset flags back to
+	 * include specified flags.
+	 */
+#define do_test_and_reset(...)					\
+	vma_flags_clear(&flags, __VA_ARGS__);			\
+	vma_flags_clear(&vma.flags, __VA_ARGS__);		\
+	vma_desc_clear_flags(&desc, __VA_ARGS__);		\
+	ASSERT_FALSE(vma_flags_test(&flags, __VA_ARGS__));	\
+	ASSERT_FALSE(vma_flags_test(&vma.flags, __VA_ARGS__));	\
+	ASSERT_FALSE(vma_desc_test_flags(&desc, __VA_ARGS__));	\
+	vma_flags_set(&flags, __VA_ARGS__);			\
+	vma_set_flags(&vma, __VA_ARGS__);			\
+	vma_desc_set_flags(&desc, __VA_ARGS__)
+
+	/* Single flags. */
+	do_test_and_reset(VMA_READ_BIT);
+	do_test_and_reset(VMA_WRITE_BIT);
+	do_test_and_reset(VMA_EXEC_BIT);
+	do_test_and_reset(64);
+	do_test_and_reset(65);
+
+	/* Two flags, in different orders. */
+	do_test_and_reset(VMA_READ_BIT, VMA_WRITE_BIT);
+	do_test_and_reset(VMA_READ_BIT, VMA_EXEC_BIT);
+	do_test_and_reset(VMA_READ_BIT, 64);
+	do_test_and_reset(VMA_READ_BIT, 65);
+	do_test_and_reset(VMA_WRITE_BIT, VMA_READ_BIT);
+	do_test_and_reset(VMA_WRITE_BIT, VMA_EXEC_BIT);
+	do_test_and_reset(VMA_WRITE_BIT, 64);
+	do_test_and_reset(VMA_WRITE_BIT, 65);
+	do_test_and_reset(VMA_EXEC_BIT, VMA_READ_BIT);
+	do_test_and_reset(VMA_EXEC_BIT, VMA_WRITE_BIT);
+	do_test_and_reset(VMA_EXEC_BIT, 64);
+	do_test_and_reset(VMA_EXEC_BIT, 65);
+	do_test_and_reset(64, VMA_READ_BIT);
+	do_test_and_reset(64, VMA_WRITE_BIT);
+	do_test_and_reset(64, VMA_EXEC_BIT);
+	do_test_and_reset(64, 65);
+	do_test_and_reset(65, VMA_READ_BIT);
+	do_test_and_reset(65, VMA_WRITE_BIT);
+	do_test_and_reset(65, VMA_EXEC_BIT);
+	do_test_and_reset(65, 64);
+
+	/* Three flags. */
+	do_test_and_reset(VMA_READ_BIT, VMA_WRITE_BIT, VMA_EXEC_BIT);
+
+#undef do_test_and_reset
+
+	return true;
+}
+
+static void run_vma_tests(int *num_tests, int *num_fail)
+{
+ TEST(copy_vma);
+ TEST(vma_flags_unchanged);
+ TEST(vma_flags_cleared);
+ TEST(vma_flags_word);
+ TEST(vma_flags_test);
+ TEST(vma_flags_clear);
+}
diff --git a/tools/testing/vma/vma_internal.h b/tools/testing/vma/vma_internal.h
index 7fa56dcc53a6..0e1121e2ef23 100644
--- a/tools/testing/vma/vma_internal.h
+++ b/tools/testing/vma/vma_internal.h
@@ -12,16 +12,18 @@
#ifndef __MM_VMA_INTERNAL_H
#define __MM_VMA_INTERNAL_H
-#define __private
-#define __bitwise
-#define __randomize_layout
+#include <stdlib.h>
#define CONFIG_MMU
#define CONFIG_PER_VMA_LOCK
-#include <stdlib.h>
+#ifdef __CONCAT
+#undef __CONCAT
+#endif
+#include <linux/args.h>
#include <linux/atomic.h>
+#include <linux/bitmap.h>
#include <linux/list.h>
#include <linux/maple_tree.h>
#include <linux/mm.h>
@@ -29,1839 +31,28 @@
#include <linux/refcount.h>
#include <linux/slab.h>
-extern unsigned long stack_guard_gap;
-#ifdef CONFIG_MMU
-extern unsigned long mmap_min_addr;
-extern unsigned long dac_mmap_min_addr;
-#else
-#define mmap_min_addr 0UL
-#define dac_mmap_min_addr 0UL
-#endif
-
-#define VM_WARN_ON(_expr) (WARN_ON(_expr))
-#define VM_WARN_ON_ONCE(_expr) (WARN_ON_ONCE(_expr))
-#define VM_WARN_ON_VMG(_expr, _vmg) (WARN_ON(_expr))
-#define VM_BUG_ON(_expr) (BUG_ON(_expr))
-#define VM_BUG_ON_VMA(_expr, _vma) (BUG_ON(_expr))
-
-#define MMF_HAS_MDWE 28
-
-/*
- * vm_flags in vm_area_struct, see mm_types.h.
- * When changing, update also include/trace/events/mmflags.h
- */
-
-#define VM_NONE 0x00000000
-
-/**
- * typedef vma_flag_t - specifies an individual VMA flag by bit number.
- *
- * This value is made type safe by sparse to avoid passing invalid flag values
- * around.
- */
-typedef int __bitwise vma_flag_t;
-
-#define DECLARE_VMA_BIT(name, bitnum) \
- VMA_ ## name ## _BIT = ((__force vma_flag_t)bitnum)
-#define DECLARE_VMA_BIT_ALIAS(name, aliased) \
- VMA_ ## name ## _BIT = VMA_ ## aliased ## _BIT
-enum {
- DECLARE_VMA_BIT(READ, 0),
- DECLARE_VMA_BIT(WRITE, 1),
- DECLARE_VMA_BIT(EXEC, 2),
- DECLARE_VMA_BIT(SHARED, 3),
- /* mprotect() hardcodes VM_MAYREAD >> 4 == VM_READ, and so for r/w/x bits. */
- DECLARE_VMA_BIT(MAYREAD, 4), /* limits for mprotect() etc. */
- DECLARE_VMA_BIT(MAYWRITE, 5),
- DECLARE_VMA_BIT(MAYEXEC, 6),
- DECLARE_VMA_BIT(MAYSHARE, 7),
- DECLARE_VMA_BIT(GROWSDOWN, 8), /* general info on the segment */
-#ifdef CONFIG_MMU
- DECLARE_VMA_BIT(UFFD_MISSING, 9),/* missing pages tracking */
-#else
- /* nommu: R/O MAP_PRIVATE mapping that might overlay a file mapping */
- DECLARE_VMA_BIT(MAYOVERLAY, 9),
-#endif /* CONFIG_MMU */
- /* Page-ranges managed without "struct page", just pure PFN */
- DECLARE_VMA_BIT(PFNMAP, 10),
- DECLARE_VMA_BIT(MAYBE_GUARD, 11),
- DECLARE_VMA_BIT(UFFD_WP, 12), /* wrprotect pages tracking */
- DECLARE_VMA_BIT(LOCKED, 13),
- DECLARE_VMA_BIT(IO, 14), /* Memory mapped I/O or similar */
- DECLARE_VMA_BIT(SEQ_READ, 15), /* App will access data sequentially */
- DECLARE_VMA_BIT(RAND_READ, 16), /* App will not benefit from clustered reads */
- DECLARE_VMA_BIT(DONTCOPY, 17), /* Do not copy this vma on fork */
- DECLARE_VMA_BIT(DONTEXPAND, 18),/* Cannot expand with mremap() */
- DECLARE_VMA_BIT(LOCKONFAULT, 19),/* Lock pages covered when faulted in */
- DECLARE_VMA_BIT(ACCOUNT, 20), /* Is a VM accounted object */
- DECLARE_VMA_BIT(NORESERVE, 21), /* should the VM suppress accounting */
- DECLARE_VMA_BIT(HUGETLB, 22), /* Huge TLB Page VM */
- DECLARE_VMA_BIT(SYNC, 23), /* Synchronous page faults */
- DECLARE_VMA_BIT(ARCH_1, 24), /* Architecture-specific flag */
- DECLARE_VMA_BIT(WIPEONFORK, 25),/* Wipe VMA contents in child. */
- DECLARE_VMA_BIT(DONTDUMP, 26), /* Do not include in the core dump */
- DECLARE_VMA_BIT(SOFTDIRTY, 27), /* NOT soft dirty clean area */
- DECLARE_VMA_BIT(MIXEDMAP, 28), /* Can contain struct page and pure PFN pages */
- DECLARE_VMA_BIT(HUGEPAGE, 29), /* MADV_HUGEPAGE marked this vma */
- DECLARE_VMA_BIT(NOHUGEPAGE, 30),/* MADV_NOHUGEPAGE marked this vma */
- DECLARE_VMA_BIT(MERGEABLE, 31), /* KSM may merge identical pages */
- /* These bits are reused, we define specific uses below. */
- DECLARE_VMA_BIT(HIGH_ARCH_0, 32),
- DECLARE_VMA_BIT(HIGH_ARCH_1, 33),
- DECLARE_VMA_BIT(HIGH_ARCH_2, 34),
- DECLARE_VMA_BIT(HIGH_ARCH_3, 35),
- DECLARE_VMA_BIT(HIGH_ARCH_4, 36),
- DECLARE_VMA_BIT(HIGH_ARCH_5, 37),
- DECLARE_VMA_BIT(HIGH_ARCH_6, 38),
- /*
- * This flag is used to connect VFIO to arch specific KVM code. It
- * indicates that the memory under this VMA is safe for use with any
- * non-cachable memory type inside KVM. Some VFIO devices, on some
- * platforms, are thought to be unsafe and can cause machine crashes
- * if KVM does not lock down the memory type.
- */
- DECLARE_VMA_BIT(ALLOW_ANY_UNCACHED, 39),
-#ifdef CONFIG_PPC32
- DECLARE_VMA_BIT_ALIAS(DROPPABLE, ARCH_1),
-#else
- DECLARE_VMA_BIT(DROPPABLE, 40),
-#endif
- DECLARE_VMA_BIT(UFFD_MINOR, 41),
- DECLARE_VMA_BIT(SEALED, 42),
- /* Flags that reuse flags above. */
- DECLARE_VMA_BIT_ALIAS(PKEY_BIT0, HIGH_ARCH_0),
- DECLARE_VMA_BIT_ALIAS(PKEY_BIT1, HIGH_ARCH_1),
- DECLARE_VMA_BIT_ALIAS(PKEY_BIT2, HIGH_ARCH_2),
- DECLARE_VMA_BIT_ALIAS(PKEY_BIT3, HIGH_ARCH_3),
- DECLARE_VMA_BIT_ALIAS(PKEY_BIT4, HIGH_ARCH_4),
-#if defined(CONFIG_X86_USER_SHADOW_STACK)
- /*
- * VM_SHADOW_STACK should not be set with VM_SHARED because of lack of
- * support core mm.
- *
- * These VMAs will get a single end guard page. This helps userspace
- * protect itself from attacks. A single page is enough for current
- * shadow stack archs (x86). See the comments near alloc_shstk() in
- * arch/x86/kernel/shstk.c for more details on the guard size.
- */
- DECLARE_VMA_BIT_ALIAS(SHADOW_STACK, HIGH_ARCH_5),
-#elif defined(CONFIG_ARM64_GCS)
- /*
- * arm64's Guarded Control Stack implements similar functionality and
- * has similar constraints to shadow stacks.
- */
- DECLARE_VMA_BIT_ALIAS(SHADOW_STACK, HIGH_ARCH_6),
-#endif
- DECLARE_VMA_BIT_ALIAS(SAO, ARCH_1), /* Strong Access Ordering (powerpc) */
- DECLARE_VMA_BIT_ALIAS(GROWSUP, ARCH_1), /* parisc */
- DECLARE_VMA_BIT_ALIAS(SPARC_ADI, ARCH_1), /* sparc64 */
- DECLARE_VMA_BIT_ALIAS(ARM64_BTI, ARCH_1), /* arm64 */
- DECLARE_VMA_BIT_ALIAS(ARCH_CLEAR, ARCH_1), /* sparc64, arm64 */
- DECLARE_VMA_BIT_ALIAS(MAPPED_COPY, ARCH_1), /* !CONFIG_MMU */
- DECLARE_VMA_BIT_ALIAS(MTE, HIGH_ARCH_4), /* arm64 */
- DECLARE_VMA_BIT_ALIAS(MTE_ALLOWED, HIGH_ARCH_5),/* arm64 */
-#ifdef CONFIG_STACK_GROWSUP
- DECLARE_VMA_BIT_ALIAS(STACK, GROWSUP),
- DECLARE_VMA_BIT_ALIAS(STACK_EARLY, GROWSDOWN),
-#else
- DECLARE_VMA_BIT_ALIAS(STACK, GROWSDOWN),
-#endif
-};
-
-#define INIT_VM_FLAG(name) BIT((__force int) VMA_ ## name ## _BIT)
-#define VM_READ INIT_VM_FLAG(READ)
-#define VM_WRITE INIT_VM_FLAG(WRITE)
-#define VM_EXEC INIT_VM_FLAG(EXEC)
-#define VM_SHARED INIT_VM_FLAG(SHARED)
-#define VM_MAYREAD INIT_VM_FLAG(MAYREAD)
-#define VM_MAYWRITE INIT_VM_FLAG(MAYWRITE)
-#define VM_MAYEXEC INIT_VM_FLAG(MAYEXEC)
-#define VM_MAYSHARE INIT_VM_FLAG(MAYSHARE)
-#define VM_GROWSDOWN INIT_VM_FLAG(GROWSDOWN)
-#ifdef CONFIG_MMU
-#define VM_UFFD_MISSING INIT_VM_FLAG(UFFD_MISSING)
-#else
-#define VM_UFFD_MISSING VM_NONE
-#define VM_MAYOVERLAY INIT_VM_FLAG(MAYOVERLAY)
-#endif
-#define VM_PFNMAP INIT_VM_FLAG(PFNMAP)
-#define VM_MAYBE_GUARD INIT_VM_FLAG(MAYBE_GUARD)
-#define VM_UFFD_WP INIT_VM_FLAG(UFFD_WP)
-#define VM_LOCKED INIT_VM_FLAG(LOCKED)
-#define VM_IO INIT_VM_FLAG(IO)
-#define VM_SEQ_READ INIT_VM_FLAG(SEQ_READ)
-#define VM_RAND_READ INIT_VM_FLAG(RAND_READ)
-#define VM_DONTCOPY INIT_VM_FLAG(DONTCOPY)
-#define VM_DONTEXPAND INIT_VM_FLAG(DONTEXPAND)
-#define VM_LOCKONFAULT INIT_VM_FLAG(LOCKONFAULT)
-#define VM_ACCOUNT INIT_VM_FLAG(ACCOUNT)
-#define VM_NORESERVE INIT_VM_FLAG(NORESERVE)
-#define VM_HUGETLB INIT_VM_FLAG(HUGETLB)
-#define VM_SYNC INIT_VM_FLAG(SYNC)
-#define VM_ARCH_1 INIT_VM_FLAG(ARCH_1)
-#define VM_WIPEONFORK INIT_VM_FLAG(WIPEONFORK)
-#define VM_DONTDUMP INIT_VM_FLAG(DONTDUMP)
-#ifdef CONFIG_MEM_SOFT_DIRTY
-#define VM_SOFTDIRTY INIT_VM_FLAG(SOFTDIRTY)
-#else
-#define VM_SOFTDIRTY VM_NONE
-#endif
-#define VM_MIXEDMAP INIT_VM_FLAG(MIXEDMAP)
-#define VM_HUGEPAGE INIT_VM_FLAG(HUGEPAGE)
-#define VM_NOHUGEPAGE INIT_VM_FLAG(NOHUGEPAGE)
-#define VM_MERGEABLE INIT_VM_FLAG(MERGEABLE)
-#define VM_STACK INIT_VM_FLAG(STACK)
-#ifdef CONFIG_STACK_GROWS_UP
-#define VM_STACK_EARLY INIT_VM_FLAG(STACK_EARLY)
-#else
-#define VM_STACK_EARLY VM_NONE
-#endif
-#ifdef CONFIG_ARCH_HAS_PKEYS
-#define VM_PKEY_SHIFT ((__force int)VMA_HIGH_ARCH_0_BIT)
-/* Despite the naming, these are FLAGS not bits. */
-#define VM_PKEY_BIT0 INIT_VM_FLAG(PKEY_BIT0)
-#define VM_PKEY_BIT1 INIT_VM_FLAG(PKEY_BIT1)
-#define VM_PKEY_BIT2 INIT_VM_FLAG(PKEY_BIT2)
-#if CONFIG_ARCH_PKEY_BITS > 3
-#define VM_PKEY_BIT3 INIT_VM_FLAG(PKEY_BIT3)
-#else
-#define VM_PKEY_BIT3 VM_NONE
-#endif /* CONFIG_ARCH_PKEY_BITS > 3 */
-#if CONFIG_ARCH_PKEY_BITS > 4
-#define VM_PKEY_BIT4 INIT_VM_FLAG(PKEY_BIT4)
-#else
-#define VM_PKEY_BIT4 VM_NONE
-#endif /* CONFIG_ARCH_PKEY_BITS > 4 */
-#endif /* CONFIG_ARCH_HAS_PKEYS */
-#if defined(CONFIG_X86_USER_SHADOW_STACK) || defined(CONFIG_ARM64_GCS)
-#define VM_SHADOW_STACK INIT_VM_FLAG(SHADOW_STACK)
-#else
-#define VM_SHADOW_STACK VM_NONE
-#endif
-#if defined(CONFIG_PPC64)
-#define VM_SAO INIT_VM_FLAG(SAO)
-#elif defined(CONFIG_PARISC)
-#define VM_GROWSUP INIT_VM_FLAG(GROWSUP)
-#elif defined(CONFIG_SPARC64)
-#define VM_SPARC_ADI INIT_VM_FLAG(SPARC_ADI)
-#define VM_ARCH_CLEAR INIT_VM_FLAG(ARCH_CLEAR)
-#elif defined(CONFIG_ARM64)
-#define VM_ARM64_BTI INIT_VM_FLAG(ARM64_BTI)
-#define VM_ARCH_CLEAR INIT_VM_FLAG(ARCH_CLEAR)
-#elif !defined(CONFIG_MMU)
-#define VM_MAPPED_COPY INIT_VM_FLAG(MAPPED_COPY)
-#endif
-#ifndef VM_GROWSUP
-#define VM_GROWSUP VM_NONE
-#endif
-#ifdef CONFIG_ARM64_MTE
-#define VM_MTE INIT_VM_FLAG(MTE)
-#define VM_MTE_ALLOWED INIT_VM_FLAG(MTE_ALLOWED)
-#else
-#define VM_MTE VM_NONE
-#define VM_MTE_ALLOWED VM_NONE
-#endif
-#ifdef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR
-#define VM_UFFD_MINOR INIT_VM_FLAG(UFFD_MINOR)
-#else
-#define VM_UFFD_MINOR VM_NONE
-#endif
-#ifdef CONFIG_64BIT
-#define VM_ALLOW_ANY_UNCACHED INIT_VM_FLAG(ALLOW_ANY_UNCACHED)
-#define VM_SEALED INIT_VM_FLAG(SEALED)
-#else
-#define VM_ALLOW_ANY_UNCACHED VM_NONE
-#define VM_SEALED VM_NONE
-#endif
-#if defined(CONFIG_64BIT) || defined(CONFIG_PPC32)
-#define VM_DROPPABLE INIT_VM_FLAG(DROPPABLE)
-#else
-#define VM_DROPPABLE VM_NONE
-#endif
-
-/* Bits set in the VMA until the stack is in its final location */
-#define VM_STACK_INCOMPLETE_SETUP (VM_RAND_READ | VM_SEQ_READ | VM_STACK_EARLY)
-
-#define TASK_EXEC ((current->personality & READ_IMPLIES_EXEC) ? VM_EXEC : 0)
-
-/* Common data flag combinations */
-#define VM_DATA_FLAGS_TSK_EXEC (VM_READ | VM_WRITE | TASK_EXEC | \
- VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC)
-#define VM_DATA_FLAGS_NON_EXEC (VM_READ | VM_WRITE | VM_MAYREAD | \
- VM_MAYWRITE | VM_MAYEXEC)
-#define VM_DATA_FLAGS_EXEC (VM_READ | VM_WRITE | VM_EXEC | \
- VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC)
-
-#ifndef VM_DATA_DEFAULT_FLAGS /* arch can override this */
-#define VM_DATA_DEFAULT_FLAGS VM_DATA_FLAGS_EXEC
-#endif
-
-#ifndef VM_STACK_DEFAULT_FLAGS /* arch can override this */
-#define VM_STACK_DEFAULT_FLAGS VM_DATA_DEFAULT_FLAGS
-#endif
-
-#define VM_STARTGAP_FLAGS (VM_GROWSDOWN | VM_SHADOW_STACK)
-
-#define VM_STACK_FLAGS (VM_STACK | VM_STACK_DEFAULT_FLAGS | VM_ACCOUNT)
-
-/* VMA basic access permission flags */
-#define VM_ACCESS_FLAGS (VM_READ | VM_WRITE | VM_EXEC)
-
-/*
- * Special vmas that are non-mergable, non-mlock()able.
- */
-#define VM_SPECIAL (VM_IO | VM_DONTEXPAND | VM_PFNMAP | VM_MIXEDMAP)
-
-#define DEFAULT_MAP_WINDOW ((1UL << 47) - PAGE_SIZE)
-#define TASK_SIZE_LOW DEFAULT_MAP_WINDOW
-#define TASK_SIZE_MAX DEFAULT_MAP_WINDOW
-#define STACK_TOP TASK_SIZE_LOW
-#define STACK_TOP_MAX TASK_SIZE_MAX
-
-/* This mask represents all the VMA flag bits used by mlock */
-#define VM_LOCKED_MASK (VM_LOCKED | VM_LOCKONFAULT)
-
-#define TASK_EXEC ((current->personality & READ_IMPLIES_EXEC) ? VM_EXEC : 0)
-
-#define VM_DATA_FLAGS_TSK_EXEC (VM_READ | VM_WRITE | TASK_EXEC | \
- VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC)
-
-#define RLIMIT_STACK 3 /* max stack size */
-#define RLIMIT_MEMLOCK 8 /* max locked-in-memory address space */
-
-#define CAP_IPC_LOCK 14
-
-/*
- * Flags which should be 'sticky' on merge - that is, flags which, when one VMA
- * possesses it but the other does not, the merged VMA should nonetheless have
- * applied to it:
- *
- * VM_SOFTDIRTY - if a VMA is marked soft-dirty, that is has not had its
- * references cleared via /proc/$pid/clear_refs, any merged VMA
- * should be considered soft-dirty also as it operates at a VMA
- * granularity.
- */
-#define VM_STICKY (VM_SOFTDIRTY | VM_MAYBE_GUARD)
-
-/*
- * VMA flags we ignore for the purposes of merge, i.e. one VMA possessing one
- * of these flags and the other not does not preclude a merge.
- *
- * VM_STICKY - When merging VMAs, VMA flags must match, unless they are
- * 'sticky'. If any sticky flags exist in either VMA, we simply
- * set all of them on the merged VMA.
- */
-#define VM_IGNORE_MERGE VM_STICKY
-
-/*
- * Flags which should result in page tables being copied on fork. These are
- * flags which indicate that the VMA maps page tables which cannot be
- * reconsistuted upon page fault, so necessitate page table copying upon
- *
- * VM_PFNMAP / VM_MIXEDMAP - These contain kernel-mapped data which cannot be
- * reasonably reconstructed on page fault.
- *
- * VM_UFFD_WP - Encodes metadata about an installed uffd
- * write protect handler, which cannot be
- * reconstructed on page fault.
- *
- * We always copy pgtables when dst_vma has uffd-wp
- * enabled even if it's file-backed
- * (e.g. shmem). Because when uffd-wp is enabled,
- * pgtable contains uffd-wp protection information,
- * that's something we can't retrieve from page cache,
- * and skip copying will lose those info.
- *
- * VM_MAYBE_GUARD - Could contain page guard region markers which
- * by design are a property of the page tables
- * only and thus cannot be reconstructed on page
- * fault.
- */
-#define VM_COPY_ON_FORK (VM_PFNMAP | VM_MIXEDMAP | VM_UFFD_WP | VM_MAYBE_GUARD)
-
-#define FIRST_USER_ADDRESS 0UL
-#define USER_PGTABLES_CEILING 0UL
-
-#define vma_policy(vma) NULL
-
-#define down_write_nest_lock(sem, nest_lock)
-
-#define pgprot_val(x) ((x).pgprot)
-#define __pgprot(x) ((pgprot_t) { (x) } )
-
-#define for_each_vma(__vmi, __vma) \
- while (((__vma) = vma_next(&(__vmi))) != NULL)
-
-/* The MM code likes to work with exclusive end addresses */
-#define for_each_vma_range(__vmi, __vma, __end) \
- while (((__vma) = vma_find(&(__vmi), (__end))) != NULL)
-
-#define offset_in_page(p) ((unsigned long)(p) & ~PAGE_MASK)
-
-#define PHYS_PFN(x) ((unsigned long)((x) >> PAGE_SHIFT))
-
-#define test_and_set_bit(nr, addr) __test_and_set_bit(nr, addr)
-#define test_and_clear_bit(nr, addr) __test_and_clear_bit(nr, addr)
-
-#define TASK_SIZE ((1ul << 47)-PAGE_SIZE)
-
-#define AS_MM_ALL_LOCKS 2
-
-/* We hardcode this for now. */
-#define sysctl_max_map_count 0x1000000UL
-
-#define pgoff_t unsigned long
-typedef unsigned long pgprotval_t;
-typedef struct pgprot { pgprotval_t pgprot; } pgprot_t;
-typedef unsigned long vm_flags_t;
-typedef __bitwise unsigned int vm_fault_t;
-
-/*
- * The shared stubs do not implement this, it amounts to an fprintf(STDERR,...)
- * either way :)
- */
-#define pr_warn_once pr_err
-
-#define data_race(expr) expr
-
-#define ASSERT_EXCLUSIVE_WRITER(x)
-
-#define pgtable_supports_soft_dirty() 1
-
-/**
- * swap - swap values of @a and @b
- * @a: first value
- * @b: second value
- */
-#define swap(a, b) \
- do { typeof(a) __tmp = (a); (a) = (b); (b) = __tmp; } while (0)
-
-struct kref {
- refcount_t refcount;
-};
-
-/*
- * Define the task command name length as enum, then it can be visible to
- * BPF programs.
- */
-enum {
- TASK_COMM_LEN = 16,
-};
-
/*
- * Flags for bug emulation.
- *
- * These occupy the top three bytes.
+ * DUPLICATE typedef definitions from kernel source that have to be declared
+ * ahead of all other headers.
*/
-enum {
- READ_IMPLIES_EXEC = 0x0400000,
-};
-
-struct task_struct {
- char comm[TASK_COMM_LEN];
- pid_t pid;
- struct mm_struct *mm;
-
- /* Used for emulating ABI behavior of previous Linux versions: */
- unsigned int personality;
-};
-
-struct task_struct *get_current(void);
-#define current get_current()
-
-struct anon_vma {
- struct anon_vma *root;
- struct rb_root_cached rb_root;
-
- /* Test fields. */
- bool was_cloned;
- bool was_unlinked;
-};
-
-struct anon_vma_chain {
- struct anon_vma *anon_vma;
- struct list_head same_vma;
-};
-
-struct anon_vma_name {
- struct kref kref;
- /* The name needs to be at the end because it is dynamically sized. */
- char name[];
-};
-
-struct vma_iterator {
- struct ma_state mas;
-};
-
-#define VMA_ITERATOR(name, __mm, __addr) \
- struct vma_iterator name = { \
- .mas = { \
- .tree = &(__mm)->mm_mt, \
- .index = __addr, \
- .node = NULL, \
- .status = ma_start, \
- }, \
- }
-
-struct address_space {
- struct rb_root_cached i_mmap;
- unsigned long flags;
- atomic_t i_mmap_writable;
-};
-
-struct vm_userfaultfd_ctx {};
-struct mempolicy {};
-struct mmu_gather {};
-struct mutex {};
-#define DEFINE_MUTEX(mutexname) \
- struct mutex mutexname = {}
-
-#define DECLARE_BITMAP(name, bits) \
- unsigned long name[BITS_TO_LONGS(bits)]
-
-#define NUM_MM_FLAG_BITS (64)
+#define __private
+/* NUM_MM_FLAG_BITS defined by test code. */
typedef struct {
__private DECLARE_BITMAP(__mm_flags, NUM_MM_FLAG_BITS);
} mm_flags_t;
-
-/*
- * Opaque type representing current VMA (vm_area_struct) flag state. Must be
- * accessed via vma_flags_xxx() helper functions.
- */
-#define NUM_VMA_FLAG_BITS BITS_PER_LONG
+/* NUM_VMA_FLAG_BITS defined by test code. */
typedef struct {
DECLARE_BITMAP(__vma_flags, NUM_VMA_FLAG_BITS);
} __private vma_flags_t;
-struct mm_struct {
- struct maple_tree mm_mt;
- int map_count; /* number of VMAs */
- unsigned long total_vm; /* Total pages mapped */
- unsigned long locked_vm; /* Pages that have PG_mlocked set */
- unsigned long data_vm; /* VM_WRITE & ~VM_SHARED & ~VM_STACK */
- unsigned long exec_vm; /* VM_EXEC & ~VM_WRITE & ~VM_STACK */
- unsigned long stack_vm; /* VM_STACK */
-
- unsigned long def_flags;
-
- mm_flags_t flags; /* Must use mm_flags_* helpers to access */
-};
-
-struct vm_area_struct;
-
-
-/* What action should be taken after an .mmap_prepare call is complete? */
-enum mmap_action_type {
- MMAP_NOTHING, /* Mapping is complete, no further action. */
- MMAP_REMAP_PFN, /* Remap PFN range. */
- MMAP_IO_REMAP_PFN, /* I/O remap PFN range. */
-};
-
-/*
- * Describes an action an mmap_prepare hook can instruct to be taken to complete
- * the mapping of a VMA. Specified in vm_area_desc.
- */
-struct mmap_action {
- union {
- /* Remap range. */
- struct {
- unsigned long start;
- unsigned long start_pfn;
- unsigned long size;
- pgprot_t pgprot;
- } remap;
- };
- enum mmap_action_type type;
-
- /*
- * If specified, this hook is invoked after the selected action has been
- * successfully completed. Note that the VMA write lock still held.
- *
- * The absolute minimum ought to be done here.
- *
- * Returns 0 on success, or an error code.
- */
- int (*success_hook)(const struct vm_area_struct *vma);
-
- /*
- * If specified, this hook is invoked when an error occurred when
- * attempting the selection action.
- *
- * The hook can return an error code in order to filter the error, but
- * it is not valid to clear the error here.
- */
- int (*error_hook)(int err);
-
- /*
- * This should be set in rare instances where the operation required
- * that the rmap should not be able to access the VMA until
- * completely set up.
- */
- bool hide_from_rmap_until_complete :1;
-};
-
-/* Operations which modify VMAs. */
-enum vma_operation {
- VMA_OP_SPLIT,
- VMA_OP_MERGE_UNFAULTED,
- VMA_OP_REMAP,
- VMA_OP_FORK,
-};
-
-/*
- * Describes a VMA that is about to be mmap()'ed. Drivers may choose to
- * manipulate mutable fields which will cause those fields to be updated in the
- * resultant VMA.
- *
- * Helper functions are not required for manipulating any field.
- */
-struct vm_area_desc {
- /* Immutable state. */
- const struct mm_struct *const mm;
- struct file *const file; /* May vary from vm_file in stacked callers. */
- unsigned long start;
- unsigned long end;
-
- /* Mutable fields. Populated with initial state. */
- pgoff_t pgoff;
- struct file *vm_file;
- union {
- vm_flags_t vm_flags;
- vma_flags_t vma_flags;
- };
- pgprot_t page_prot;
-
- /* Write-only fields. */
- const struct vm_operations_struct *vm_ops;
- void *private_data;
-
- /* Take further action? */
- struct mmap_action action;
-};
-
-struct file_operations {
- int (*mmap)(struct file *, struct vm_area_struct *);
- int (*mmap_prepare)(struct vm_area_desc *);
-};
-
-struct file {
- struct address_space *f_mapping;
- const struct file_operations *f_op;
-};
-
-#define VMA_LOCK_OFFSET 0x40000000
-
-typedef struct { unsigned long v; } freeptr_t;
-
-struct vm_area_struct {
- /* The first cache line has the info for VMA tree walking. */
-
- union {
- struct {
- /* VMA covers [vm_start; vm_end) addresses within mm */
- unsigned long vm_start;
- unsigned long vm_end;
- };
- freeptr_t vm_freeptr; /* Pointer used by SLAB_TYPESAFE_BY_RCU */
- };
-
- struct mm_struct *vm_mm; /* The address space we belong to. */
- pgprot_t vm_page_prot; /* Access permissions of this VMA. */
-
- /*
- * Flags, see mm.h.
- * To modify use vm_flags_{init|reset|set|clear|mod} functions.
- */
- union {
- const vm_flags_t vm_flags;
- vma_flags_t flags;
- };
-
-#ifdef CONFIG_PER_VMA_LOCK
- /*
- * Can only be written (using WRITE_ONCE()) while holding both:
- * - mmap_lock (in write mode)
- * - vm_refcnt bit at VMA_LOCK_OFFSET is set
- * Can be read reliably while holding one of:
- * - mmap_lock (in read or write mode)
- * - vm_refcnt bit at VMA_LOCK_OFFSET is set or vm_refcnt > 1
- * Can be read unreliably (using READ_ONCE()) for pessimistic bailout
- * while holding nothing (except RCU to keep the VMA struct allocated).
- *
- * This sequence counter is explicitly allowed to overflow; sequence
- * counter reuse can only lead to occasional unnecessary use of the
- * slowpath.
- */
- unsigned int vm_lock_seq;
-#endif
-
- /*
- * A file's MAP_PRIVATE vma can be in both i_mmap tree and anon_vma
- * list, after a COW of one of the file pages. A MAP_SHARED vma
- * can only be in the i_mmap tree. An anonymous MAP_PRIVATE, stack
- * or brk vma (with NULL file) can only be in an anon_vma list.
- */
- struct list_head anon_vma_chain; /* Serialized by mmap_lock &
- * page_table_lock */
- struct anon_vma *anon_vma; /* Serialized by page_table_lock */
-
- /* Function pointers to deal with this struct. */
- const struct vm_operations_struct *vm_ops;
-
- /* Information about our backing store: */
- unsigned long vm_pgoff; /* Offset (within vm_file) in PAGE_SIZE
- units */
- struct file * vm_file; /* File we map to (can be NULL). */
- void * vm_private_data; /* was vm_pte (shared mem) */
-
-#ifdef CONFIG_SWAP
- atomic_long_t swap_readahead_info;
-#endif
-#ifndef CONFIG_MMU
- struct vm_region *vm_region; /* NOMMU mapping region */
-#endif
-#ifdef CONFIG_NUMA
- struct mempolicy *vm_policy; /* NUMA policy for the VMA */
-#endif
-#ifdef CONFIG_NUMA_BALANCING
- struct vma_numab_state *numab_state; /* NUMA Balancing state */
-#endif
-#ifdef CONFIG_PER_VMA_LOCK
- /* Unstable RCU readers are allowed to read this. */
- refcount_t vm_refcnt;
-#endif
- /*
- * For areas with an address space and backing store,
- * linkage into the address_space->i_mmap interval tree.
- *
- */
- struct {
- struct rb_node rb;
- unsigned long rb_subtree_last;
- } shared;
-#ifdef CONFIG_ANON_VMA_NAME
- /*
- * For private and shared anonymous mappings, a pointer to a null
- * terminated string containing the name given to the vma, or NULL if
- * unnamed. Serialized by mmap_lock. Use anon_vma_name to access.
- */
- struct anon_vma_name *anon_name;
-#endif
- struct vm_userfaultfd_ctx vm_userfaultfd_ctx;
-} __randomize_layout;
-
-struct vm_fault {};
-
-struct vm_operations_struct {
- void (*open)(struct vm_area_struct * area);
- /**
- * @close: Called when the VMA is being removed from the MM.
- * Context: User context. May sleep. Caller holds mmap_lock.
- */
- void (*close)(struct vm_area_struct * area);
- /* Called any time before splitting to check if it's allowed */
- int (*may_split)(struct vm_area_struct *area, unsigned long addr);
- int (*mremap)(struct vm_area_struct *area);
- /*
- * Called by mprotect() to make driver-specific permission
- * checks before mprotect() is finalised. The VMA must not
- * be modified. Returns 0 if mprotect() can proceed.
- */
- int (*mprotect)(struct vm_area_struct *vma, unsigned long start,
- unsigned long end, unsigned long newflags);
- vm_fault_t (*fault)(struct vm_fault *vmf);
- vm_fault_t (*huge_fault)(struct vm_fault *vmf, unsigned int order);
- vm_fault_t (*map_pages)(struct vm_fault *vmf,
- pgoff_t start_pgoff, pgoff_t end_pgoff);
- unsigned long (*pagesize)(struct vm_area_struct * area);
-
- /* notification that a previously read-only page is about to become
- * writable, if an error is returned it will cause a SIGBUS */
- vm_fault_t (*page_mkwrite)(struct vm_fault *vmf);
-
- /* same as page_mkwrite when using VM_PFNMAP|VM_MIXEDMAP */
- vm_fault_t (*pfn_mkwrite)(struct vm_fault *vmf);
-
- /* called by access_process_vm when get_user_pages() fails, typically
- * for use by special VMAs. See also generic_access_phys() for a generic
- * implementation useful for any iomem mapping.
- */
- int (*access)(struct vm_area_struct *vma, unsigned long addr,
- void *buf, int len, int write);
-
- /* Called by the /proc/PID/maps code to ask the vma whether it
- * has a special name. Returning non-NULL will also cause this
- * vma to be dumped unconditionally. */
- const char *(*name)(struct vm_area_struct *vma);
-
-#ifdef CONFIG_NUMA
- /*
- * set_policy() op must add a reference to any non-NULL @new mempolicy
- * to hold the policy upon return. Caller should pass NULL @new to
- * remove a policy and fall back to surrounding context--i.e. do not
- * install a MPOL_DEFAULT policy, nor the task or system default
- * mempolicy.
- */
- int (*set_policy)(struct vm_area_struct *vma, struct mempolicy *new);
-
- /*
- * get_policy() op must add reference [mpol_get()] to any policy at
- * (vma,addr) marked as MPOL_SHARED. The shared policy infrastructure
- * in mm/mempolicy.c will do this automatically.
- * get_policy() must NOT add a ref if the policy at (vma,addr) is not
- * marked as MPOL_SHARED. vma policies are protected by the mmap_lock.
- * If no [shared/vma] mempolicy exists at the addr, get_policy() op
- * must return NULL--i.e., do not "fallback" to task or system default
- * policy.
- */
- struct mempolicy *(*get_policy)(struct vm_area_struct *vma,
- unsigned long addr, pgoff_t *ilx);
-#endif
-#ifdef CONFIG_FIND_NORMAL_PAGE
- /*
- * Called by vm_normal_page() for special PTEs in @vma at @addr. This
- * allows for returning a "normal" page from vm_normal_page() even
- * though the PTE indicates that the "struct page" either does not exist
- * or should not be touched: "special".
- *
- * Do not add new users: this really only works when a "normal" page
- * was mapped, but then the PTE got changed to something weird (+
- * marked special) that would not make pte_pfn() identify the originally
- * inserted page.
- */
- struct page *(*find_normal_page)(struct vm_area_struct *vma,
- unsigned long addr);
-#endif /* CONFIG_FIND_NORMAL_PAGE */
-};
-
-struct vm_unmapped_area_info {
-#define VM_UNMAPPED_AREA_TOPDOWN 1
- unsigned long flags;
- unsigned long length;
- unsigned long low_limit;
- unsigned long high_limit;
- unsigned long align_mask;
- unsigned long align_offset;
- unsigned long start_gap;
-};
-
-struct pagetable_move_control {
- struct vm_area_struct *old; /* Source VMA. */
- struct vm_area_struct *new; /* Destination VMA. */
- unsigned long old_addr; /* Address from which the move begins. */
- unsigned long old_end; /* Exclusive address at which old range ends. */
- unsigned long new_addr; /* Address to move page tables to. */
- unsigned long len_in; /* Bytes to remap specified by user. */
-
- bool need_rmap_locks; /* Do rmap locks need to be taken? */
- bool for_stack; /* Is this an early temp stack being moved? */
-};
-
-#define PAGETABLE_MOVE(name, old_, new_, old_addr_, new_addr_, len_) \
- struct pagetable_move_control name = { \
- .old = old_, \
- .new = new_, \
- .old_addr = old_addr_, \
- .old_end = (old_addr_) + (len_), \
- .new_addr = new_addr_, \
- .len_in = len_, \
- }
-
-static inline void vma_iter_invalidate(struct vma_iterator *vmi)
-{
- mas_pause(&vmi->mas);
-}
-
-static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot)
-{
- return __pgprot(pgprot_val(oldprot) | pgprot_val(newprot));
-}
-
-static inline pgprot_t vm_get_page_prot(vm_flags_t vm_flags)
-{
- return __pgprot(vm_flags);
-}
-
-static inline bool is_shared_maywrite(vm_flags_t vm_flags)
-{
- return (vm_flags & (VM_SHARED | VM_MAYWRITE)) ==
- (VM_SHARED | VM_MAYWRITE);
-}
-
-static inline bool vma_is_shared_maywrite(struct vm_area_struct *vma)
-{
- return is_shared_maywrite(vma->vm_flags);
-}
-
-static inline struct vm_area_struct *vma_next(struct vma_iterator *vmi)
-{
- /*
- * Uses mas_find() to get the first VMA when the iterator starts.
- * Calling mas_next() could skip the first entry.
- */
- return mas_find(&vmi->mas, ULONG_MAX);
-}
-
-/*
- * WARNING: to avoid racing with vma_mark_attached()/vma_mark_detached(), these
- * assertions should be made either under mmap_write_lock or when the object
- * has been isolated under mmap_write_lock, ensuring no competing writers.
- */
-static inline void vma_assert_attached(struct vm_area_struct *vma)
-{
- WARN_ON_ONCE(!refcount_read(&vma->vm_refcnt));
-}
-
-static inline void vma_assert_detached(struct vm_area_struct *vma)
-{
- WARN_ON_ONCE(refcount_read(&vma->vm_refcnt));
-}
-
-static inline void vma_assert_write_locked(struct vm_area_struct *);
-static inline void vma_mark_attached(struct vm_area_struct *vma)
-{
- vma_assert_write_locked(vma);
- vma_assert_detached(vma);
- refcount_set_release(&vma->vm_refcnt, 1);
-}
-
-static inline void vma_mark_detached(struct vm_area_struct *vma)
-{
- vma_assert_write_locked(vma);
- vma_assert_attached(vma);
- /* We are the only writer, so no need to use vma_refcount_put(). */
- if (unlikely(!refcount_dec_and_test(&vma->vm_refcnt))) {
- /*
- * Reader must have temporarily raised vm_refcnt but it will
- * drop it without using the vma since vma is write-locked.
- */
- }
-}
-
-extern const struct vm_operations_struct vma_dummy_vm_ops;
-
-extern unsigned long rlimit(unsigned int limit);
-
-static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm)
-{
- memset(vma, 0, sizeof(*vma));
- vma->vm_mm = mm;
- vma->vm_ops = &vma_dummy_vm_ops;
- INIT_LIST_HEAD(&vma->anon_vma_chain);
- vma->vm_lock_seq = UINT_MAX;
-}
-
-/*
- * These are defined in vma.h, but sadly vm_stat_account() is referenced by
- * kernel/fork.c, so we have to these broadly available there, and temporarily
- * define them here to resolve the dependency cycle.
- */
-
-#define is_exec_mapping(flags) \
- ((flags & (VM_EXEC | VM_WRITE | VM_STACK)) == VM_EXEC)
-
-#define is_stack_mapping(flags) \
- (((flags & VM_STACK) == VM_STACK) || (flags & VM_SHADOW_STACK))
-
-#define is_data_mapping(flags) \
- ((flags & (VM_WRITE | VM_SHARED | VM_STACK)) == VM_WRITE)
-
-static inline void vm_stat_account(struct mm_struct *mm, vm_flags_t flags,
- long npages)
-{
- WRITE_ONCE(mm->total_vm, READ_ONCE(mm->total_vm)+npages);
-
- if (is_exec_mapping(flags))
- mm->exec_vm += npages;
- else if (is_stack_mapping(flags))
- mm->stack_vm += npages;
- else if (is_data_mapping(flags))
- mm->data_vm += npages;
-}
-
-#undef is_exec_mapping
-#undef is_stack_mapping
-#undef is_data_mapping
-
-/* Currently stubbed but we may later wish to un-stub. */
-static inline void vm_acct_memory(long pages);
-static inline void vm_unacct_memory(long pages)
-{
- vm_acct_memory(-pages);
-}
-
-static inline void mapping_allow_writable(struct address_space *mapping)
-{
- atomic_inc(&mapping->i_mmap_writable);
-}
-
-static inline void vma_set_range(struct vm_area_struct *vma,
- unsigned long start, unsigned long end,
- pgoff_t pgoff)
-{
- vma->vm_start = start;
- vma->vm_end = end;
- vma->vm_pgoff = pgoff;
-}
-
-static inline
-struct vm_area_struct *vma_find(struct vma_iterator *vmi, unsigned long max)
-{
- return mas_find(&vmi->mas, max - 1);
-}
-
-static inline int vma_iter_clear_gfp(struct vma_iterator *vmi,
- unsigned long start, unsigned long end, gfp_t gfp)
-{
- __mas_set_range(&vmi->mas, start, end - 1);
- mas_store_gfp(&vmi->mas, NULL, gfp);
- if (unlikely(mas_is_err(&vmi->mas)))
- return -ENOMEM;
-
- return 0;
-}
-
-static inline void mmap_assert_locked(struct mm_struct *);
-static inline struct vm_area_struct *find_vma_intersection(struct mm_struct *mm,
- unsigned long start_addr,
- unsigned long end_addr)
-{
- unsigned long index = start_addr;
-
- mmap_assert_locked(mm);
- return mt_find(&mm->mm_mt, &index, end_addr - 1);
-}
-
-static inline
-struct vm_area_struct *vma_lookup(struct mm_struct *mm, unsigned long addr)
-{
- return mtree_load(&mm->mm_mt, addr);
-}
-
-static inline struct vm_area_struct *vma_prev(struct vma_iterator *vmi)
-{
- return mas_prev(&vmi->mas, 0);
-}
-
-static inline void vma_iter_set(struct vma_iterator *vmi, unsigned long addr)
-{
- mas_set(&vmi->mas, addr);
-}
-
-static inline bool vma_is_anonymous(struct vm_area_struct *vma)
-{
- return !vma->vm_ops;
-}
-
-/* Defined in vma.h, so temporarily define here to avoid circular dependency. */
-#define vma_iter_load(vmi) \
- mas_walk(&(vmi)->mas)
-
-static inline struct vm_area_struct *
-find_vma_prev(struct mm_struct *mm, unsigned long addr,
- struct vm_area_struct **pprev)
-{
- struct vm_area_struct *vma;
- VMA_ITERATOR(vmi, mm, addr);
-
- vma = vma_iter_load(&vmi);
- *pprev = vma_prev(&vmi);
- if (!vma)
- vma = vma_next(&vmi);
- return vma;
-}
-
-#undef vma_iter_load
-
-static inline void vma_iter_init(struct vma_iterator *vmi,
- struct mm_struct *mm, unsigned long addr)
-{
- mas_init(&vmi->mas, &mm->mm_mt, addr);
-}
-
-/* Stubbed functions. */
-
-static inline struct anon_vma_name *anon_vma_name(struct vm_area_struct *vma)
-{
- return NULL;
-}
-
-static inline bool is_mergeable_vm_userfaultfd_ctx(struct vm_area_struct *vma,
- struct vm_userfaultfd_ctx vm_ctx)
-{
- return true;
-}
-
-static inline bool anon_vma_name_eq(struct anon_vma_name *anon_name1,
- struct anon_vma_name *anon_name2)
-{
- return true;
-}
-
-static inline void might_sleep(void)
-{
-}
-
-static inline unsigned long vma_pages(struct vm_area_struct *vma)
-{
- return (vma->vm_end - vma->vm_start) >> PAGE_SHIFT;
-}
-
-static inline void fput(struct file *file)
-{
-}
-
-static inline void mpol_put(struct mempolicy *pol)
-{
-}
-
-static inline void lru_add_drain(void)
-{
-}
-
-static inline void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm)
-{
-}
-
-static inline void update_hiwater_rss(struct mm_struct *mm)
-{
-}
-
-static inline void update_hiwater_vm(struct mm_struct *mm)
-{
-}
-
-static inline void unmap_vmas(struct mmu_gather *tlb, struct ma_state *mas,
- struct vm_area_struct *vma, unsigned long start_addr,
- unsigned long end_addr, unsigned long tree_end)
-{
-}
-
-static inline void free_pgtables(struct mmu_gather *tlb, struct ma_state *mas,
- struct vm_area_struct *vma, unsigned long floor,
- unsigned long ceiling, bool mm_wr_locked)
-{
-}
-
-static inline void mapping_unmap_writable(struct address_space *mapping)
-{
-}
-
-static inline void flush_dcache_mmap_lock(struct address_space *mapping)
-{
-}
-
-static inline void tlb_finish_mmu(struct mmu_gather *tlb)
-{
-}
-
-static inline struct file *get_file(struct file *f)
-{
- return f;
-}
-
-static inline int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst)
-{
- return 0;
-}
-
-static inline int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src,
- enum vma_operation operation)
-{
- /* For testing purposes. We indicate that an anon_vma has been cloned. */
- if (src->anon_vma != NULL) {
- dst->anon_vma = src->anon_vma;
- dst->anon_vma->was_cloned = true;
- }
-
- return 0;
-}
-
-static inline void vma_start_write(struct vm_area_struct *vma)
-{
- /* Used to indicate to tests that a write operation has begun. */
- vma->vm_lock_seq++;
-}
-
-static inline __must_check
-int vma_start_write_killable(struct vm_area_struct *vma)
-{
- /* Used to indicate to tests that a write operation has begun. */
- vma->vm_lock_seq++;
- return 0;
-}
-
-static inline void vma_adjust_trans_huge(struct vm_area_struct *vma,
- unsigned long start,
- unsigned long end,
- struct vm_area_struct *next)
-{
-}
-
-static inline void hugetlb_split(struct vm_area_struct *, unsigned long) {}
-
-static inline void vma_iter_free(struct vma_iterator *vmi)
-{
- mas_destroy(&vmi->mas);
-}
-
-static inline
-struct vm_area_struct *vma_iter_next_range(struct vma_iterator *vmi)
-{
- return mas_next_range(&vmi->mas, ULONG_MAX);
-}
-
-static inline void vm_acct_memory(long pages)
-{
-}
-
-static inline void vma_interval_tree_insert(struct vm_area_struct *vma,
- struct rb_root_cached *rb)
-{
-}
-
-static inline void vma_interval_tree_remove(struct vm_area_struct *vma,
- struct rb_root_cached *rb)
-{
-}
-
-static inline void flush_dcache_mmap_unlock(struct address_space *mapping)
-{
-}
-
-static inline void anon_vma_interval_tree_insert(struct anon_vma_chain *avc,
- struct rb_root_cached *rb)
-{
-}
-
-static inline void anon_vma_interval_tree_remove(struct anon_vma_chain *avc,
- struct rb_root_cached *rb)
-{
-}
-
-static inline void uprobe_mmap(struct vm_area_struct *vma)
-{
-}
-
-static inline void uprobe_munmap(struct vm_area_struct *vma,
- unsigned long start, unsigned long end)
-{
-}
-
-static inline void i_mmap_lock_write(struct address_space *mapping)
-{
-}
-
-static inline void anon_vma_lock_write(struct anon_vma *anon_vma)
-{
-}
-
-static inline void vma_assert_write_locked(struct vm_area_struct *vma)
-{
-}
-
-static inline void unlink_anon_vmas(struct vm_area_struct *vma)
-{
- /* For testing purposes, indicate that the anon_vma was unlinked. */
- vma->anon_vma->was_unlinked = true;
-}
-
-static inline void anon_vma_unlock_write(struct anon_vma *anon_vma)
-{
-}
-
-static inline void i_mmap_unlock_write(struct address_space *mapping)
-{
-}
-
-static inline int userfaultfd_unmap_prep(struct vm_area_struct *vma,
- unsigned long start,
- unsigned long end,
- struct list_head *unmaps)
-{
- return 0;
-}
-
-static inline void mmap_write_downgrade(struct mm_struct *mm)
-{
-}
-
-static inline void mmap_read_unlock(struct mm_struct *mm)
-{
-}
-
-static inline void mmap_write_unlock(struct mm_struct *mm)
-{
-}
-
-static inline int mmap_write_lock_killable(struct mm_struct *mm)
-{
- return 0;
-}
-
-static inline bool can_modify_mm(struct mm_struct *mm,
- unsigned long start,
- unsigned long end)
-{
- return true;
-}
-
-static inline void arch_unmap(struct mm_struct *mm,
- unsigned long start,
- unsigned long end)
-{
-}
-
-static inline void mmap_assert_locked(struct mm_struct *mm)
-{
-}
-
-static inline bool mpol_equal(struct mempolicy *a, struct mempolicy *b)
-{
- return true;
-}
-
-static inline void khugepaged_enter_vma(struct vm_area_struct *vma,
- vm_flags_t vm_flags)
-{
-}
-
-static inline bool mapping_can_writeback(struct address_space *mapping)
-{
- return true;
-}
-
-static inline bool is_vm_hugetlb_page(struct vm_area_struct *vma)
-{
- return false;
-}
-
-static inline bool vma_soft_dirty_enabled(struct vm_area_struct *vma)
-{
- return false;
-}
-
-static inline bool userfaultfd_wp(struct vm_area_struct *vma)
-{
- return false;
-}
-
-static inline void mmap_assert_write_locked(struct mm_struct *mm)
-{
-}
-
-static inline void mutex_lock(struct mutex *lock)
-{
-}
-
-static inline void mutex_unlock(struct mutex *lock)
-{
-}
-
-static inline bool mutex_is_locked(struct mutex *lock)
-{
- return true;
-}
-
-static inline bool signal_pending(void *p)
-{
- return false;
-}
-
-static inline bool is_file_hugepages(struct file *file)
-{
- return false;
-}
-
-static inline int security_vm_enough_memory_mm(struct mm_struct *mm, long pages)
-{
- return 0;
-}
-
-static inline bool may_expand_vm(struct mm_struct *mm, vm_flags_t flags,
- unsigned long npages)
-{
- return true;
-}
-
-static inline int shmem_zero_setup(struct vm_area_struct *vma)
-{
- return 0;
-}
-
-static inline void vma_set_anonymous(struct vm_area_struct *vma)
-{
- vma->vm_ops = NULL;
-}
-
-static inline void ksm_add_vma(struct vm_area_struct *vma)
-{
-}
-
-static inline void perf_event_mmap(struct vm_area_struct *vma)
-{
-}
-
-static inline bool vma_is_dax(struct vm_area_struct *vma)
-{
- return false;
-}
-
-static inline struct vm_area_struct *get_gate_vma(struct mm_struct *mm)
-{
- return NULL;
-}
-
-bool vma_wants_writenotify(struct vm_area_struct *vma, pgprot_t vm_page_prot);
-
-/* Update vma->vm_page_prot to reflect vma->vm_flags. */
-static inline void vma_set_page_prot(struct vm_area_struct *vma)
-{
- vm_flags_t vm_flags = vma->vm_flags;
- pgprot_t vm_page_prot;
-
- /* testing: we inline vm_pgprot_modify() to avoid clash with vma.h. */
- vm_page_prot = pgprot_modify(vma->vm_page_prot, vm_get_page_prot(vm_flags));
-
- if (vma_wants_writenotify(vma, vm_page_prot)) {
- vm_flags &= ~VM_SHARED;
- /* testing: we inline vm_pgprot_modify() to avoid clash with vma.h. */
- vm_page_prot = pgprot_modify(vm_page_prot, vm_get_page_prot(vm_flags));
- }
- /* remove_protection_ptes reads vma->vm_page_prot without mmap_lock */
- WRITE_ONCE(vma->vm_page_prot, vm_page_prot);
-}
-
-static inline bool arch_validate_flags(vm_flags_t flags)
-{
- return true;
-}
-
-static inline void vma_close(struct vm_area_struct *vma)
-{
-}
-
-static inline int mmap_file(struct file *file, struct vm_area_struct *vma)
-{
- return 0;
-}
-
-static inline unsigned long stack_guard_start_gap(struct vm_area_struct *vma)
-{
- if (vma->vm_flags & VM_GROWSDOWN)
- return stack_guard_gap;
-
- /* See reasoning around the VM_SHADOW_STACK definition */
- if (vma->vm_flags & VM_SHADOW_STACK)
- return PAGE_SIZE;
-
- return 0;
-}
-
-static inline unsigned long vm_start_gap(struct vm_area_struct *vma)
-{
- unsigned long gap = stack_guard_start_gap(vma);
- unsigned long vm_start = vma->vm_start;
-
- vm_start -= gap;
- if (vm_start > vma->vm_start)
- vm_start = 0;
- return vm_start;
-}
-
-static inline unsigned long vm_end_gap(struct vm_area_struct *vma)
-{
- unsigned long vm_end = vma->vm_end;
-
- if (vma->vm_flags & VM_GROWSUP) {
- vm_end += stack_guard_gap;
- if (vm_end < vma->vm_end)
- vm_end = -PAGE_SIZE;
- }
- return vm_end;
-}
-
-static inline int is_hugepage_only_range(struct mm_struct *mm,
- unsigned long addr, unsigned long len)
-{
- return 0;
-}
-
-static inline bool vma_is_accessible(struct vm_area_struct *vma)
-{
- return vma->vm_flags & VM_ACCESS_FLAGS;
-}
-
-static inline bool capable(int cap)
-{
- return true;
-}
-
-static inline bool mlock_future_ok(const struct mm_struct *mm,
- vm_flags_t vm_flags, unsigned long bytes)
-{
- unsigned long locked_pages, limit_pages;
-
- if (!(vm_flags & VM_LOCKED) || capable(CAP_IPC_LOCK))
- return true;
-
- locked_pages = bytes >> PAGE_SHIFT;
- locked_pages += mm->locked_vm;
-
- limit_pages = rlimit(RLIMIT_MEMLOCK);
- limit_pages >>= PAGE_SHIFT;
-
- return locked_pages <= limit_pages;
-}
-
-static inline int __anon_vma_prepare(struct vm_area_struct *vma)
-{
- struct anon_vma *anon_vma = calloc(1, sizeof(struct anon_vma));
-
- if (!anon_vma)
- return -ENOMEM;
-
- anon_vma->root = anon_vma;
- vma->anon_vma = anon_vma;
-
- return 0;
-}
-
-static inline int anon_vma_prepare(struct vm_area_struct *vma)
-{
- if (likely(vma->anon_vma))
- return 0;
-
- return __anon_vma_prepare(vma);
-}
-
-static inline void userfaultfd_unmap_complete(struct mm_struct *mm,
- struct list_head *uf)
-{
-}
-
-#define ACCESS_PRIVATE(p, member) ((p)->member)
-
-#define bitmap_size(nbits) (ALIGN(nbits, BITS_PER_LONG) / BITS_PER_BYTE)
-
-static __always_inline void bitmap_zero(unsigned long *dst, unsigned int nbits)
-{
- unsigned int len = bitmap_size(nbits);
-
- if (small_const_nbits(nbits))
- *dst = 0;
- else
- memset(dst, 0, len);
-}
-
-static inline bool mm_flags_test(int flag, const struct mm_struct *mm)
-{
- return test_bit(flag, ACCESS_PRIVATE(&mm->flags, __mm_flags));
-}
-
-/* Clears all bits in the VMA flags bitmap, non-atomically. */
-static inline void vma_flags_clear_all(vma_flags_t *flags)
-{
- bitmap_zero(ACCESS_PRIVATE(flags, __vma_flags), NUM_VMA_FLAG_BITS);
-}
-
-/*
- * Copy value to the first system word of VMA flags, non-atomically.
- *
- * IMPORTANT: This does not overwrite bytes past the first system word. The
- * caller must account for this.
- */
-static inline void vma_flags_overwrite_word(vma_flags_t *flags, unsigned long value)
-{
- *ACCESS_PRIVATE(flags, __vma_flags) = value;
-}
-
-/*
- * Copy value to the first system word of VMA flags ONCE, non-atomically.
- *
- * IMPORTANT: This does not overwrite bytes past the first system word. The
- * caller must account for this.
- */
-static inline void vma_flags_overwrite_word_once(vma_flags_t *flags, unsigned long value)
-{
- unsigned long *bitmap = ACCESS_PRIVATE(flags, __vma_flags);
-
- WRITE_ONCE(*bitmap, value);
-}
-
-/* Update the first system word of VMA flags setting bits, non-atomically. */
-static inline void vma_flags_set_word(vma_flags_t *flags, unsigned long value)
-{
- unsigned long *bitmap = ACCESS_PRIVATE(flags, __vma_flags);
-
- *bitmap |= value;
-}
-
-/* Update the first system word of VMA flags clearing bits, non-atomically. */
-static inline void vma_flags_clear_word(vma_flags_t *flags, unsigned long value)
-{
- unsigned long *bitmap = ACCESS_PRIVATE(flags, __vma_flags);
-
- *bitmap &= ~value;
-}
-
-
-/* Use when VMA is not part of the VMA tree and needs no locking */
-static inline void vm_flags_init(struct vm_area_struct *vma,
- vm_flags_t flags)
-{
- vma_flags_clear_all(&vma->flags);
- vma_flags_overwrite_word(&vma->flags, flags);
-}
-
-/*
- * Use when VMA is part of the VMA tree and modifications need coordination
- * Note: vm_flags_reset and vm_flags_reset_once do not lock the vma and
- * it should be locked explicitly beforehand.
- */
-static inline void vm_flags_reset(struct vm_area_struct *vma,
- vm_flags_t flags)
-{
- vma_assert_write_locked(vma);
- vm_flags_init(vma, flags);
-}
-
-static inline void vm_flags_reset_once(struct vm_area_struct *vma,
- vm_flags_t flags)
-{
- vma_assert_write_locked(vma);
- /*
- * The user should only be interested in avoiding reordering of
- * assignment to the first word.
- */
- vma_flags_clear_all(&vma->flags);
- vma_flags_overwrite_word_once(&vma->flags, flags);
-}
-
-static inline void vm_flags_set(struct vm_area_struct *vma,
- vm_flags_t flags)
-{
- vma_start_write(vma);
- vma_flags_set_word(&vma->flags, flags);
-}
-
-static inline void vm_flags_clear(struct vm_area_struct *vma,
- vm_flags_t flags)
-{
- vma_start_write(vma);
- vma_flags_clear_word(&vma->flags, flags);
-}
-
-/*
- * Denies creating a writable executable mapping or gaining executable permissions.
- *
- * This denies the following:
- *
- * a) mmap(PROT_WRITE | PROT_EXEC)
- *
- * b) mmap(PROT_WRITE)
- * mprotect(PROT_EXEC)
- *
- * c) mmap(PROT_WRITE)
- * mprotect(PROT_READ)
- * mprotect(PROT_EXEC)
- *
- * But allows the following:
- *
- * d) mmap(PROT_READ | PROT_EXEC)
- * mmap(PROT_READ | PROT_EXEC | PROT_BTI)
- *
- * This is only applicable if the user has set the Memory-Deny-Write-Execute
- * (MDWE) protection mask for the current process.
- *
- * @old specifies the VMA flags the VMA originally possessed, and @new the ones
- * we propose to set.
- *
- * Return: false if proposed change is OK, true if not ok and should be denied.
- */
-static inline bool map_deny_write_exec(unsigned long old, unsigned long new)
-{
- /* If MDWE is disabled, we have nothing to deny. */
- if (mm_flags_test(MMF_HAS_MDWE, current->mm))
- return false;
-
- /* If the new VMA is not executable, we have nothing to deny. */
- if (!(new & VM_EXEC))
- return false;
-
- /* Under MDWE we do not accept newly writably executable VMAs... */
- if (new & VM_WRITE)
- return true;
-
- /* ...nor previously non-executable VMAs becoming executable. */
- if (!(old & VM_EXEC))
- return true;
-
- return false;
-}
-
-static inline int mapping_map_writable(struct address_space *mapping)
-{
- return atomic_inc_unless_negative(&mapping->i_mmap_writable) ?
- 0 : -EPERM;
-}
-
-static inline unsigned long move_page_tables(struct pagetable_move_control *pmc)
-{
- return 0;
-}
-
-static inline void free_pgd_range(struct mmu_gather *tlb,
- unsigned long addr, unsigned long end,
- unsigned long floor, unsigned long ceiling)
-{
-}
-
-static inline int ksm_execve(struct mm_struct *mm)
-{
- return 0;
-}
-
-static inline void ksm_exit(struct mm_struct *mm)
-{
-}
-
-static inline void vma_lock_init(struct vm_area_struct *vma, bool reset_refcnt)
-{
- if (reset_refcnt)
- refcount_set(&vma->vm_refcnt, 0);
-}
-
-static inline void vma_numab_state_init(struct vm_area_struct *vma)
-{
-}
-
-static inline void vma_numab_state_free(struct vm_area_struct *vma)
-{
-}
-
-static inline void dup_anon_vma_name(struct vm_area_struct *orig_vma,
- struct vm_area_struct *new_vma)
-{
-}
-
-static inline void free_anon_vma_name(struct vm_area_struct *vma)
-{
-}
-
-/* Declared in vma.h. */
-static inline void set_vma_from_desc(struct vm_area_struct *vma,
- struct vm_area_desc *desc);
-
-static inline void mmap_action_prepare(struct mmap_action *action,
- struct vm_area_desc *desc)
-{
-}
-
-static inline int mmap_action_complete(struct mmap_action *action,
- struct vm_area_struct *vma)
-{
- return 0;
-}
-
-static inline int __compat_vma_mmap(const struct file_operations *f_op,
- struct file *file, struct vm_area_struct *vma)
-{
- struct vm_area_desc desc = {
- .mm = vma->vm_mm,
- .file = file,
- .start = vma->vm_start,
- .end = vma->vm_end,
-
- .pgoff = vma->vm_pgoff,
- .vm_file = vma->vm_file,
- .vm_flags = vma->vm_flags,
- .page_prot = vma->vm_page_prot,
-
- .action.type = MMAP_NOTHING, /* Default */
- };
- int err;
-
- err = f_op->mmap_prepare(&desc);
- if (err)
- return err;
-
- mmap_action_prepare(&desc.action, &desc);
- set_vma_from_desc(vma, &desc);
- return mmap_action_complete(&desc.action, vma);
-}
-
-static inline int compat_vma_mmap(struct file *file,
- struct vm_area_struct *vma)
-{
- return __compat_vma_mmap(file->f_op, file, vma);
-}
-
-/* Did the driver provide valid mmap hook configuration? */
-static inline bool can_mmap_file(struct file *file)
-{
- bool has_mmap = file->f_op->mmap;
- bool has_mmap_prepare = file->f_op->mmap_prepare;
-
- /* Hooks are mutually exclusive. */
- if (WARN_ON_ONCE(has_mmap && has_mmap_prepare))
- return false;
- if (!has_mmap && !has_mmap_prepare)
- return false;
-
- return true;
-}
-
-static inline int vfs_mmap(struct file *file, struct vm_area_struct *vma)
-{
- if (file->f_op->mmap_prepare)
- return compat_vma_mmap(file, vma);
-
- return file->f_op->mmap(file, vma);
-}
-
-static inline int vfs_mmap_prepare(struct file *file, struct vm_area_desc *desc)
-{
- return file->f_op->mmap_prepare(desc);
-}
-
-static inline void fixup_hugetlb_reservations(struct vm_area_struct *vma)
-{
-}
-
-static inline void vma_set_file(struct vm_area_struct *vma, struct file *file)
-{
- /* Changing an anonymous vma with this is illegal */
- get_file(file);
- swap(vma->vm_file, file);
- fput(file);
-}
-
-static inline bool shmem_file(struct file *file)
-{
- return false;
-}
-
-static inline vm_flags_t ksm_vma_flags(const struct mm_struct *mm,
- const struct file *file, vm_flags_t vm_flags)
-{
- return vm_flags;
-}
-
-static inline void remap_pfn_range_prepare(struct vm_area_desc *desc, unsigned long pfn)
-{
-}
-
-static inline int remap_pfn_range_complete(struct vm_area_struct *vma, unsigned long addr,
- unsigned long pfn, unsigned long size, pgprot_t pgprot)
-{
- return 0;
-}
+typedef unsigned long vm_flags_t;
+#define pgoff_t unsigned long
+typedef unsigned long pgprotval_t;
+typedef struct pgprot { pgprotval_t pgprot; } pgprot_t;
+typedef __bitwise unsigned int vm_fault_t;
-static inline int do_munmap(struct mm_struct *, unsigned long, size_t,
- struct list_head *uf)
-{
- return 0;
-}
+#include "include/stubs.h"
+#include "include/dup.h"
+#include "include/custom.h"
#endif /* __MM_VMA_INTERNAL_H */