Diffstat (limited to 'fs/userfaultfd.c')
-rw-r--r--  fs/userfaultfd.c  244
1 file changed, 130 insertions(+), 114 deletions(-)
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index 22f4bf956ba1..c5ba1f4487bd 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -29,7 +29,7 @@
#include <linux/ioctl.h>
#include <linux/security.h>
#include <linux/hugetlb.h>
-#include <linux/swapops.h>
+#include <linux/leafops.h>
#include <linux/miscdevice.h>
#include <linux/uio.h>
@@ -165,14 +165,14 @@ static void userfaultfd_ctx_get(struct userfaultfd_ctx *ctx)
static void userfaultfd_ctx_put(struct userfaultfd_ctx *ctx)
{
if (refcount_dec_and_test(&ctx->refcount)) {
- VM_BUG_ON(spin_is_locked(&ctx->fault_pending_wqh.lock));
- VM_BUG_ON(waitqueue_active(&ctx->fault_pending_wqh));
- VM_BUG_ON(spin_is_locked(&ctx->fault_wqh.lock));
- VM_BUG_ON(waitqueue_active(&ctx->fault_wqh));
- VM_BUG_ON(spin_is_locked(&ctx->event_wqh.lock));
- VM_BUG_ON(waitqueue_active(&ctx->event_wqh));
- VM_BUG_ON(spin_is_locked(&ctx->fd_wqh.lock));
- VM_BUG_ON(waitqueue_active(&ctx->fd_wqh));
+ VM_WARN_ON_ONCE(spin_is_locked(&ctx->fault_pending_wqh.lock));
+ VM_WARN_ON_ONCE(waitqueue_active(&ctx->fault_pending_wqh));
+ VM_WARN_ON_ONCE(spin_is_locked(&ctx->fault_wqh.lock));
+ VM_WARN_ON_ONCE(waitqueue_active(&ctx->fault_wqh));
+ VM_WARN_ON_ONCE(spin_is_locked(&ctx->event_wqh.lock));
+ VM_WARN_ON_ONCE(waitqueue_active(&ctx->event_wqh));
+ VM_WARN_ON_ONCE(spin_is_locked(&ctx->fd_wqh.lock));
+ VM_WARN_ON_ONCE(waitqueue_active(&ctx->fd_wqh));
mmdrop(ctx->mm);
kmem_cache_free(userfaultfd_ctx_cachep, ctx);
}
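The conversions in this hunk trade VM_BUG_ON()'s hard stop for VM_WARN_ON_ONCE(), which compiles away entirely without CONFIG_DEBUG_VM and otherwise warns a single time instead of panicking. A minimal sketch of the same pattern on a hypothetical refcounted object (illustrative names, not from this patch):

#include <linux/list.h>
#include <linux/mmdebug.h>
#include <linux/refcount.h>
#include <linux/slab.h>

struct obj {
	refcount_t refcount;
	struct list_head waiters;
};

static void obj_put(struct obj *obj)
{
	if (refcount_dec_and_test(&obj->refcount)) {
		/* Debug-only invariant check: warns once, never panics. */
		VM_WARN_ON_ONCE(!list_empty(&obj->waiters));
		kfree(obj);
	}
}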
@@ -233,40 +233,48 @@ static inline bool userfaultfd_huge_must_wait(struct userfaultfd_ctx *ctx,
{
struct vm_area_struct *vma = vmf->vma;
pte_t *ptep, pte;
- bool ret = true;
assert_fault_locked(vmf);
ptep = hugetlb_walk(vma, vmf->address, vma_mmu_pagesize(vma));
if (!ptep)
- goto out;
+ return true;
- ret = false;
pte = huge_ptep_get(vma->vm_mm, vmf->address, ptep);
/*
* Lockless access: we're in a wait_event so it's ok if it
- * changes under us. PTE markers should be handled the same as none
- * ptes here.
+ * changes under us.
+ */
+
+ /* Entry is still missing; wait for userspace to resolve the fault. */
+ if (huge_pte_none(pte))
+ return true;
+ /* UFFD PTE markers require userspace to resolve the fault. */
+ if (pte_is_uffd_marker(pte))
+ return true;
+ /*
+ * If the VMA has UFFD WP faults enabled and this is a WP fault, wait
+ * for userspace to resolve the fault.
+ */
- if (huge_pte_none_mostly(pte))
- ret = true;
if (!huge_pte_write(pte) && (reason & VM_UFFD_WP))
- ret = true;
-out:
- return ret;
+ return true;
+
+ return false;
}
#else
static inline bool userfaultfd_huge_must_wait(struct userfaultfd_ctx *ctx,
struct vm_fault *vmf,
unsigned long reason)
{
- return false; /* should never get here */
+ /* Should never get here. */
+ VM_WARN_ON_ONCE(1);
+ return false;
}
#endif /* CONFIG_HUGETLB_PAGE */
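For context on the split above: the huge_pte_none_mostly()/pte_none_mostly() helpers being removed folded "none" and PTE-marker entries into a single test, roughly as below (an approximation for illustration; the authoritative replacements live behind <linux/leafops.h>):

/* Approximate shape of the helper this patch moves away from. */
static inline bool pte_none_mostly(pte_t pte)
{
	return pte_none(pte) || is_pte_marker(pte);
}

Splitting the test lets each case carry its own comment and makes the userfaultfd-specific intent of the marker check explicit via pte_is_uffd_marker().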
/*
- * Verify the pagetables are still not ok after having reigstered into
+ * Verify the pagetables are still not ok after having registered into
* the fault_pending_wqh to avoid userland having to UFFDIO_WAKE any
* userfault that has already been resolved, if userfaultfd_read_iter and
* UFFDIO_COPY|ZEROPAGE are being run simultaneously on two different
@@ -284,53 +292,63 @@ static inline bool userfaultfd_must_wait(struct userfaultfd_ctx *ctx,
pmd_t *pmd, _pmd;
pte_t *pte;
pte_t ptent;
- bool ret = true;
+ bool ret;
assert_fault_locked(vmf);
pgd = pgd_offset(mm, address);
if (!pgd_present(*pgd))
- goto out;
+ return true;
p4d = p4d_offset(pgd, address);
if (!p4d_present(*p4d))
- goto out;
+ return true;
pud = pud_offset(p4d, address);
if (!pud_present(*pud))
- goto out;
+ return true;
pmd = pmd_offset(pud, address);
again:
_pmd = pmdp_get_lockless(pmd);
if (pmd_none(_pmd))
- goto out;
+ return true;
- ret = false;
- if (!pmd_present(_pmd) || pmd_devmap(_pmd))
- goto out;
+ /*
+ * A race could arise which would result in a softleaf entry, such as a
+ * migration entry, unexpectedly being present in the PMD, so explicitly
+ * check for this and bail out if so.
+ */
+ if (!pmd_present(_pmd))
+ return false;
- if (pmd_trans_huge(_pmd)) {
- if (!pmd_write(_pmd) && (reason & VM_UFFD_WP))
- ret = true;
- goto out;
- }
+ if (pmd_trans_huge(_pmd))
+ return !pmd_write(_pmd) && (reason & VM_UFFD_WP);
pte = pte_offset_map(pmd, address);
- if (!pte) {
- ret = true;
+ if (!pte)
goto again;
- }
+
/*
* Lockless access: we're in a wait_event so it's ok if it
- * changes under us. PTE markers should be handled the same as none
- * ptes here.
+ * changes under us.
*/
ptent = ptep_get(pte);
- if (pte_none_mostly(ptent))
- ret = true;
+
+ ret = true;
+ /* Entry is still missing; wait for userspace to resolve the fault. */
+ if (pte_none(ptent))
+ goto out;
+ /* UFFD PTE markers require userspace to resolve the fault. */
+ if (pte_is_uffd_marker(ptent))
+ goto out;
+ /*
+ * If the VMA has UFFD WP faults enabled and this is a WP fault, wait
+ * for userspace to resolve the fault.
+ */
if (!pte_write(ptent) && (reason & VM_UFFD_WP))
- ret = true;
- pte_unmap(pte);
+ goto out;
+ ret = false;
out:
+ pte_unmap(pte);
return ret;
}
@@ -383,12 +401,12 @@ vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason)
if (!ctx)
goto out;
- BUG_ON(ctx->mm != mm);
+ VM_WARN_ON_ONCE(ctx->mm != mm);
/* Any unrecognized flag is a bug. */
- VM_BUG_ON(reason & ~__VM_UFFD_FLAGS);
+ VM_WARN_ON_ONCE(reason & ~__VM_UFFD_FLAGS);
/* 0 or > 1 flags set is a bug; we expect exactly 1. */
- VM_BUG_ON(!reason || (reason & (reason - 1)));
+ VM_WARN_ON_ONCE(!reason || (reason & (reason - 1)));
if (ctx->features & UFFD_FEATURE_SIGBUS)
goto out;
@@ -411,12 +429,11 @@ vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason)
* to be sure not to return SIGBUS erroneously on
* nowait invocations.
*/
- BUG_ON(vmf->flags & FAULT_FLAG_RETRY_NOWAIT);
+ VM_WARN_ON_ONCE(vmf->flags & FAULT_FLAG_RETRY_NOWAIT);
#ifdef CONFIG_DEBUG_VM
if (printk_ratelimit()) {
- printk(KERN_WARNING
- "FAULT_FLAG_ALLOW_RETRY missing %x\n",
- vmf->flags);
+ pr_warn("FAULT_FLAG_ALLOW_RETRY missing %x\n",
+ vmf->flags);
dump_stack();
}
#endif
@@ -491,12 +508,13 @@ vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason)
set_current_state(blocking_state);
spin_unlock_irq(&ctx->fault_pending_wqh.lock);
- if (!is_vm_hugetlb_page(vma))
- must_wait = userfaultfd_must_wait(ctx, vmf, reason);
- else
+ if (is_vm_hugetlb_page(vma)) {
must_wait = userfaultfd_huge_must_wait(ctx, vmf, reason);
- if (is_vm_hugetlb_page(vma))
hugetlb_vma_unlock_read(vma);
+ } else {
+ must_wait = userfaultfd_must_wait(ctx, vmf, reason);
+ }
+
release_fault_lock(vmf);
if (likely(must_wait && !READ_ONCE(ctx->released))) {
@@ -602,7 +620,7 @@ static void userfaultfd_event_wait_completion(struct userfaultfd_ctx *ctx,
*/
out:
atomic_dec(&ctx->mmap_changing);
- VM_BUG_ON(atomic_read(&ctx->mmap_changing) < 0);
+ VM_WARN_ON_ONCE(atomic_read(&ctx->mmap_changing) < 0);
userfaultfd_ctx_put(ctx);
}
@@ -710,7 +728,7 @@ void dup_userfaultfd_fail(struct list_head *fcs)
struct userfaultfd_ctx *ctx = fctx->new;
atomic_dec(&octx->mmap_changing);
- VM_BUG_ON(atomic_read(&octx->mmap_changing) < 0);
+ VM_WARN_ON_ONCE(atomic_read(&octx->mmap_changing) < 0);
userfaultfd_ctx_put(octx);
userfaultfd_ctx_put(ctx);
@@ -751,11 +769,6 @@ void mremap_userfaultfd_complete(struct vm_userfaultfd_ctx *vm_ctx,
if (!ctx)
return;
- if (to & ~PAGE_MASK) {
- userfaultfd_ctx_put(ctx);
- return;
- }
-
msg_init(&ewq.msg);
ewq.msg.event = UFFD_EVENT_REMAP;
@@ -766,6 +779,16 @@ void mremap_userfaultfd_complete(struct vm_userfaultfd_ctx *vm_ctx,
userfaultfd_event_wait_completion(ctx, &ewq);
}
+void mremap_userfaultfd_fail(struct vm_userfaultfd_ctx *vm_ctx)
+{
+ struct userfaultfd_ctx *ctx = vm_ctx->ctx;
+
+ if (!ctx)
+ return;
+
+ userfaultfd_ctx_put(ctx);
+}
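The new helper lets the mremap path drop the context reference it took earlier when the move itself fails. The caller is not part of this diff; a plausible pairing looks roughly like this (illustrative sketch only, not the in-tree call site):

#include <linux/mm_types.h>
#include <linux/userfaultfd_k.h>

/* Illustrative only: the real caller lives in mm/mremap.c. */
static void example_mremap_uffd(struct vm_area_struct *new_vma,
				struct vm_userfaultfd_ctx *vm_ctx,
				unsigned long old_addr, unsigned long new_addr,
				unsigned long len, bool move_failed)
{
	mremap_userfaultfd_prep(new_vma, vm_ctx);	/* takes a ctx reference */
	if (move_failed)
		mremap_userfaultfd_fail(vm_ctx);	/* drop it, emit no event */
	else
		mremap_userfaultfd_complete(vm_ctx, old_addr, new_addr, len);
}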
+
bool userfaultfd_remove(struct vm_area_struct *vma,
unsigned long start, unsigned long end)
{
@@ -1243,7 +1266,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
int ret;
struct uffdio_register uffdio_register;
struct uffdio_register __user *user_uffdio_register;
- unsigned long vm_flags;
+ vm_flags_t vm_flags;
bool found;
bool basic_ioctls;
unsigned long start, end;
@@ -1266,9 +1289,9 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
if (uffdio_register.mode & UFFDIO_REGISTER_MODE_MISSING)
vm_flags |= VM_UFFD_MISSING;
if (uffdio_register.mode & UFFDIO_REGISTER_MODE_WP) {
-#ifndef CONFIG_HAVE_ARCH_USERFAULTFD_WP
- goto out;
-#endif
+ if (!pgtable_supports_uffd_wp())
+ goto out;
+
vm_flags |= VM_UFFD_WP;
}
if (uffdio_register.mode & UFFDIO_REGISTER_MODE_MINOR) {
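Both this hunk and the matching one in userfaultfd_api() below replace an #ifndef guard with a runtime predicate, which keeps the WP branch visible to the compiler on every architecture. A minimal sketch of how such a predicate can be defined, assuming it only mirrors the Kconfig option (the in-tree pgtable_supports_uffd_wp() may also consult runtime state):

#include <linux/kconfig.h>

/* Hedged sketch; the real definition may differ. */
static inline bool pgtable_supports_uffd_wp(void)
{
	return IS_ENABLED(CONFIG_HAVE_ARCH_USERFAULTFD_WP);
}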
@@ -1317,8 +1340,8 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
do {
cond_resched();
- BUG_ON(!!cur->vm_userfaultfd_ctx.ctx ^
- !!(cur->vm_flags & __VM_UFFD_FLAGS));
+ VM_WARN_ON_ONCE(!!cur->vm_userfaultfd_ctx.ctx ^
+ !!(cur->vm_flags & __VM_UFFD_FLAGS));
/* check not compatible vmas */
ret = -EINVAL;
@@ -1372,7 +1395,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
found = true;
} for_each_vma_range(vmi, cur, end);
- BUG_ON(!found);
+ VM_WARN_ON_ONCE(!found);
ret = userfaultfd_register_range(ctx, vma, vm_flags, start, end,
wp_async);
@@ -1464,8 +1487,16 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
do {
cond_resched();
- BUG_ON(!!cur->vm_userfaultfd_ctx.ctx ^
- !!(cur->vm_flags & __VM_UFFD_FLAGS));
+ VM_WARN_ON_ONCE(!!cur->vm_userfaultfd_ctx.ctx ^
+ !!(cur->vm_flags & __VM_UFFD_FLAGS));
+
+ /*
+ * Prevent unregistering through a different userfaultfd than
+ * the one used for registration.
+ */
+ if (cur->vm_userfaultfd_ctx.ctx &&
+ cur->vm_userfaultfd_ctx.ctx != ctx)
+ goto out_unlock;
/*
* Check not compatible vmas, not strictly required
@@ -1479,7 +1510,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
found = true;
} for_each_vma_range(vmi, cur, end);
- BUG_ON(!found);
+ VM_WARN_ON_ONCE(!found);
vma_iter_set(&vmi, start);
prev = vma_prev(&vmi);
@@ -1490,16 +1521,13 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
for_each_vma_range(vmi, vma, end) {
cond_resched();
- BUG_ON(!vma_can_userfault(vma, vma->vm_flags, wp_async));
-
- /*
- * Nothing to do: this vma is already registered into this
- * userfaultfd and with the right tracking mode too.
- */
+ /* VMA not registered with userfaultfd. */
if (!vma->vm_userfaultfd_ctx.ctx)
goto skip;
- WARN_ON(!(vma->vm_flags & VM_MAYWRITE));
+ VM_WARN_ON_ONCE(vma->vm_userfaultfd_ctx.ctx != ctx);
+ VM_WARN_ON_ONCE(!vma_can_userfault(vma, vma->vm_flags, wp_async));
+ VM_WARN_ON_ONCE(!(vma->vm_flags & VM_MAYWRITE));
if (vma->vm_start > start)
start = vma->vm_start;
@@ -1564,7 +1592,7 @@ static int userfaultfd_wake(struct userfaultfd_ctx *ctx,
* len == 0 means wake all and we don't want to wake all here,
* so check it again to be sure.
*/
- VM_BUG_ON(!range.len);
+ VM_WARN_ON_ONCE(!range.len);
wake_userfault(ctx, &range);
ret = 0;
@@ -1621,7 +1649,7 @@ static int userfaultfd_copy(struct userfaultfd_ctx *ctx,
return -EFAULT;
if (ret < 0)
goto out;
- BUG_ON(!ret);
+ VM_WARN_ON_ONCE(!ret);
/* len == 0 would wake all */
range.len = ret;
if (!(uffdio_copy.mode & UFFDIO_COPY_MODE_DONTWAKE)) {
@@ -1676,7 +1704,7 @@ static int userfaultfd_zeropage(struct userfaultfd_ctx *ctx,
if (ret < 0)
goto out;
/* len == 0 would wake all */
- BUG_ON(!ret);
+ VM_WARN_ON_ONCE(!ret);
range.len = ret;
if (!(uffdio_zeropage.mode & UFFDIO_ZEROPAGE_MODE_DONTWAKE)) {
range.start = uffdio_zeropage.range.start;
@@ -1788,7 +1816,7 @@ static int userfaultfd_continue(struct userfaultfd_ctx *ctx, unsigned long arg)
goto out;
/* len == 0 would wake all */
- BUG_ON(!ret);
+ VM_WARN_ON_ONCE(!ret);
range.len = ret;
if (!(uffdio_continue.mode & UFFDIO_CONTINUE_MODE_DONTWAKE)) {
range.start = uffdio_continue.range.start;
@@ -1845,7 +1873,7 @@ static inline int userfaultfd_poison(struct userfaultfd_ctx *ctx, unsigned long
goto out;
/* len == 0 would wake all */
- BUG_ON(!ret);
+ VM_WARN_ON_ONCE(!ret);
range.len = ret;
if (!(uffdio_poison.mode & UFFDIO_POISON_MODE_DONTWAKE)) {
range.start = uffdio_poison.range.start;
@@ -1971,14 +1999,14 @@ static int userfaultfd_api(struct userfaultfd_ctx *ctx,
uffdio_api.features &=
~(UFFD_FEATURE_MINOR_HUGETLBFS | UFFD_FEATURE_MINOR_SHMEM);
#endif
-#ifndef CONFIG_HAVE_ARCH_USERFAULTFD_WP
- uffdio_api.features &= ~UFFD_FEATURE_PAGEFAULT_FLAG_WP;
-#endif
-#ifndef CONFIG_PTE_MARKER_UFFD_WP
- uffdio_api.features &= ~UFFD_FEATURE_WP_HUGETLBFS_SHMEM;
- uffdio_api.features &= ~UFFD_FEATURE_WP_UNPOPULATED;
- uffdio_api.features &= ~UFFD_FEATURE_WP_ASYNC;
-#endif
+ if (!pgtable_supports_uffd_wp())
+ uffdio_api.features &= ~UFFD_FEATURE_PAGEFAULT_FLAG_WP;
+
+ if (!uffd_supports_wp_marker()) {
+ uffdio_api.features &= ~UFFD_FEATURE_WP_HUGETLBFS_SHMEM;
+ uffdio_api.features &= ~UFFD_FEATURE_WP_UNPOPULATED;
+ uffdio_api.features &= ~UFFD_FEATURE_WP_ASYNC;
+ }
ret = -EINVAL;
if (features & ~uffdio_api.features)
@@ -2102,16 +2130,12 @@ static void init_once_userfaultfd_ctx(void *mem)
static int new_userfaultfd(int flags)
{
- struct userfaultfd_ctx *ctx;
- struct file *file;
- int fd;
+ struct userfaultfd_ctx *ctx __free(kfree) = NULL;
- BUG_ON(!current->mm);
+ VM_WARN_ON_ONCE(!current->mm);
/* Check the UFFD_* constants for consistency. */
BUILD_BUG_ON(UFFD_USER_MODE_ONLY & UFFD_SHARED_FCNTL_FLAGS);
- BUILD_BUG_ON(UFFD_CLOEXEC != O_CLOEXEC);
- BUILD_BUG_ON(UFFD_NONBLOCK != O_NONBLOCK);
if (flags & ~(UFFD_SHARED_FCNTL_FLAGS | UFFD_USER_MODE_ONLY))
return -EINVAL;
@@ -2128,26 +2152,18 @@ static int new_userfaultfd(int flags)
atomic_set(&ctx->mmap_changing, 0);
ctx->mm = current->mm;
- fd = get_unused_fd_flags(flags & UFFD_SHARED_FCNTL_FLAGS);
- if (fd < 0)
- goto err_out;
+ FD_PREPARE(fdf, flags & UFFD_SHARED_FCNTL_FLAGS,
+ anon_inode_create_getfile("[userfaultfd]", &userfaultfd_fops, ctx,
+ O_RDONLY | (flags & UFFD_SHARED_FCNTL_FLAGS),
+ NULL));
+ if (fdf.err)
+ return fdf.err;
- /* Create a new inode so that the LSM can block the creation. */
- file = anon_inode_create_getfile("[userfaultfd]", &userfaultfd_fops, ctx,
- O_RDONLY | (flags & UFFD_SHARED_FCNTL_FLAGS), NULL);
- if (IS_ERR(file)) {
- put_unused_fd(fd);
- fd = PTR_ERR(file);
- goto err_out;
- }
/* Prevent the mm struct from being freed. */
mmgrab(ctx->mm);
- file->f_mode |= FMODE_NOWAIT;
- fd_install(fd, file);
- return fd;
-err_out:
- kmem_cache_free(userfaultfd_ctx_cachep, ctx);
- return fd;
+ fd_prepare_file(fdf)->f_mode |= FMODE_NOWAIT;
+ retain_and_null_ptr(ctx);
+ return fd_publish(fdf);
}
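The rewritten function leans on two scope-based cleanup idioms from <linux/cleanup.h>: __free(kfree) ties the allocation's lifetime to the variable's scope, and retain_and_null_ptr() disarms that cleanup on the success path once ownership has moved elsewhere. A standalone sketch of the pattern with hypothetical names (the FD_PREPARE()/fd_publish() fd-reservation helpers appear in the hunk itself and are newer API, so they are not re-derived here):

#include <linux/cleanup.h>
#include <linux/errno.h>
#include <linux/slab.h>

struct foo {
	int id;
};

/* Hypothetical consumer that takes ownership of @p on success. */
int register_foo(struct foo *p);

static int foo_create(void)
{
	struct foo *p __free(kfree) = kzalloc(sizeof(*p), GFP_KERNEL);

	if (!p)
		return -ENOMEM;
	if (register_foo(p))
		return -EBUSY;		/* p is kfree()d automatically here */

	retain_and_null_ptr(p);		/* success: suppress the auto-free */
	return 0;
}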
static inline bool userfaultfd_syscall_allowed(int flags)