| author | Linus Torvalds <torvalds@linux-foundation.org> | 2019-11-30 21:33:14 +0300 |
|---|---|---|
| committer | Linus Torvalds <torvalds@linux-foundation.org> | 2019-11-30 21:33:14 +0300 |
| commit | aa32f1169148beb90d71494e2f2a1999ba7b5366 (patch) | |
| tree | ec8c434bff07bf0beb2df08629089824927f62f9 /mm | |
| parent | d5bb349dbbe27537e90a03b9597deeb07723a86d (diff) | |
| parent | 93f4e735b6d98ee4b7a1252d81e815a983e359f2 (diff) | |
| download | linux-aa32f1169148beb90d71494e2f2a1999ba7b5366.tar.xz | |
Merge tag 'for-linus-hmm' of git://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma
Pull hmm updates from Jason Gunthorpe:
"This is another round of bug fixing and cleanup. This time the focus
is on the driver pattern to use mmu notifiers to monitor a VA range.
This code is lifted out of many drivers and hmm_mirror directly into
the mmu_notifier core and written using the best ideas from all the
driver implementations.
This removes many bugs from the drivers and has a very pleasing
diffstat. More drivers can still be converted, but that is for another
cycle.
- A shared branch with RDMA reworking the RDMA ODP implementation
- New mmu_interval_notifier API. This is focused on the use case of
monitoring a VA and simplifies the process for drivers
- A common seq-count locking scheme built into the
mmu_interval_notifier API usable by drivers that call
get_user_pages() or hmm_range_fault() with the VA range
- Conversion of mlx5 ODP, hfi1, radeon, nouveau, AMD GPU, and Xen
GntDev drivers to the new API. This deletes a lot of wonky driver
code.
- Two improvements for hmm_range_fault(), from testing done by Ralph"
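The seq-count locking scheme mentioned in the pull message pairs mmu_interval_read_begin()/mmu_interval_read_retry() with a driver-side lock around hmm_range_fault(). Below is a minimal sketch of that read side; struct dmirror, its mutex, and dmirror_update_device_ptes() are hypothetical driver names used only for illustration, while the mmu_interval_* and hmm_range_fault() calls are the ones added or reworked in this merge.

```c
#include <linux/hmm.h>
#include <linux/mmu_notifier.h>
#include <linux/mutex.h>

/* Hypothetical driver state: a lock plus an interval notifier. */
struct dmirror {
	struct mutex mutex;
	struct mmu_interval_notifier notifier;
};

/* Hypothetical helper that programs device page tables from range->pfns. */
static void dmirror_update_device_ptes(struct dmirror *dmirror,
				       struct hmm_range *range);

static int dmirror_populate(struct dmirror *dmirror, struct hmm_range *range)
{
	struct mmu_interval_notifier *mni = range->notifier;
	struct mm_struct *mm = mni->mm;
	long ret;

again:
	/* Open the collision-retry critical section. */
	range->notifier_seq = mmu_interval_read_begin(mni);

	down_read(&mm->mmap_sem);
	ret = hmm_range_fault(range, 0);
	up_read(&mm->mmap_sem);
	if (ret < 0) {
		if (ret == -EBUSY)
			goto again;
		return ret;
	}

	mutex_lock(&dmirror->mutex);
	/* An invalidation ran in parallel; range->pfns is stale, retry. */
	if (mmu_interval_read_retry(mni, range->notifier_seq)) {
		mutex_unlock(&dmirror->mutex);
		goto again;
	}

	/* Safe to establish device mappings from range->pfns here. */
	dmirror_update_device_ptes(dmirror, range);
	mutex_unlock(&dmirror->mutex);
	return 0;
}
```

The key design point carried by this merge is that the retry check and the device page-table update happen under the same driver lock that the invalidate callback takes, so a collision can never be missed.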
* tag 'for-linus-hmm' of git://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma:
mm/hmm: remove hmm_range_dma_map and hmm_range_dma_unmap
mm/hmm: make full use of walk_page_range()
xen/gntdev: use mmu_interval_notifier_insert
mm/hmm: remove hmm_mirror and related
drm/amdgpu: Use mmu_interval_notifier instead of hmm_mirror
drm/amdgpu: Use mmu_interval_insert instead of hmm_mirror
drm/amdgpu: Call find_vma under mmap_sem
nouveau: use mmu_interval_notifier instead of hmm_mirror
nouveau: use mmu_notifier directly for invalidate_range_start
drm/radeon: use mmu_interval_notifier_insert
RDMA/hfi1: Use mmu_interval_notifier_insert for user_exp_rcv
RDMA/odp: Use mmu_interval_notifier_insert()
mm/hmm: define the pre-processor related parts of hmm.h even if disabled
mm/hmm: allow hmm_range to be used with a mmu_interval_notifier or hmm_mirror
mm/mmu_notifier: add an interval tree notifier
mm/mmu_notifier: define the header pre-processor parts even if disabled
mm/hmm: allow snapshot of the special zero page
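Each driver conversion in the list above ends up with the same small ops table on the write side. A hedged sketch of such an invalidate callback is below, reusing the hypothetical struct dmirror from the earlier sketch; dmirror_zap_device_ptes() is likewise a placeholder, while mmu_interval_notifier_ops, mmu_interval_set_seq() and mmu_notifier_range_blockable() come from this series.

```c
#include <linux/mmu_notifier.h>

/* Hypothetical helper that tears down device mappings for a VA span. */
static void dmirror_zap_device_ptes(struct dmirror *dmirror,
				    unsigned long start, unsigned long end);

static bool dmirror_interval_invalidate(struct mmu_interval_notifier *mni,
					const struct mmu_notifier_range *range,
					unsigned long cur_seq)
{
	struct dmirror *dmirror = container_of(mni, struct dmirror, notifier);

	/* Non-blocking invalidations may be asked to retry instead of sleeping. */
	if (mmu_notifier_range_blockable(range))
		mutex_lock(&dmirror->mutex);
	else if (!mutex_trylock(&dmirror->mutex))
		return false;

	/*
	 * Publish the invalidation sequence under the driver lock so a
	 * concurrent mmu_interval_read_retry() observes the collision.
	 */
	mmu_interval_set_seq(mni, cur_seq);

	/* Tear down device mappings covering range->start..range->end. */
	dmirror_zap_device_ptes(dmirror, range->start, range->end);

	mutex_unlock(&dmirror->mutex);
	return true;
}

static const struct mmu_interval_notifier_ops dmirror_mni_ops = {
	.invalidate = dmirror_interval_invalidate,
};
```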
Diffstat (limited to 'mm')
-rw-r--r-- | mm/Kconfig | 2
-rw-r--r-- | mm/hmm.c | 523
-rw-r--r-- | mm/mmu_notifier.c | 557

3 files changed, 591 insertions, 491 deletions
diff --git a/mm/Kconfig b/mm/Kconfig index 550f7aceb679..f332efe751dd 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -284,6 +284,7 @@ config VIRT_TO_BUS config MMU_NOTIFIER bool select SRCU + select INTERVAL_TREE config KSM bool "Enable KSM for page merging" @@ -674,7 +675,6 @@ config DEV_PAGEMAP_OPS config HMM_MIRROR bool depends on MMU - depends on MMU_NOTIFIER config DEVICE_PRIVATE bool "Unaddressable device memory (GPU memory, ...)" @@ -26,193 +26,6 @@ #include <linux/mmu_notifier.h> #include <linux/memory_hotplug.h> -static struct mmu_notifier *hmm_alloc_notifier(struct mm_struct *mm) -{ - struct hmm *hmm; - - hmm = kzalloc(sizeof(*hmm), GFP_KERNEL); - if (!hmm) - return ERR_PTR(-ENOMEM); - - init_waitqueue_head(&hmm->wq); - INIT_LIST_HEAD(&hmm->mirrors); - init_rwsem(&hmm->mirrors_sem); - INIT_LIST_HEAD(&hmm->ranges); - spin_lock_init(&hmm->ranges_lock); - hmm->notifiers = 0; - return &hmm->mmu_notifier; -} - -static void hmm_free_notifier(struct mmu_notifier *mn) -{ - struct hmm *hmm = container_of(mn, struct hmm, mmu_notifier); - - WARN_ON(!list_empty(&hmm->ranges)); - WARN_ON(!list_empty(&hmm->mirrors)); - kfree(hmm); -} - -static void hmm_release(struct mmu_notifier *mn, struct mm_struct *mm) -{ - struct hmm *hmm = container_of(mn, struct hmm, mmu_notifier); - struct hmm_mirror *mirror; - - /* - * Since hmm_range_register() holds the mmget() lock hmm_release() is - * prevented as long as a range exists. - */ - WARN_ON(!list_empty_careful(&hmm->ranges)); - - down_read(&hmm->mirrors_sem); - list_for_each_entry(mirror, &hmm->mirrors, list) { - /* - * Note: The driver is not allowed to trigger - * hmm_mirror_unregister() from this thread. - */ - if (mirror->ops->release) - mirror->ops->release(mirror); - } - up_read(&hmm->mirrors_sem); -} - -static void notifiers_decrement(struct hmm *hmm) -{ - unsigned long flags; - - spin_lock_irqsave(&hmm->ranges_lock, flags); - hmm->notifiers--; - if (!hmm->notifiers) { - struct hmm_range *range; - - list_for_each_entry(range, &hmm->ranges, list) { - if (range->valid) - continue; - range->valid = true; - } - wake_up_all(&hmm->wq); - } - spin_unlock_irqrestore(&hmm->ranges_lock, flags); -} - -static int hmm_invalidate_range_start(struct mmu_notifier *mn, - const struct mmu_notifier_range *nrange) -{ - struct hmm *hmm = container_of(mn, struct hmm, mmu_notifier); - struct hmm_mirror *mirror; - struct hmm_range *range; - unsigned long flags; - int ret = 0; - - spin_lock_irqsave(&hmm->ranges_lock, flags); - hmm->notifiers++; - list_for_each_entry(range, &hmm->ranges, list) { - if (nrange->end < range->start || nrange->start >= range->end) - continue; - - range->valid = false; - } - spin_unlock_irqrestore(&hmm->ranges_lock, flags); - - if (mmu_notifier_range_blockable(nrange)) - down_read(&hmm->mirrors_sem); - else if (!down_read_trylock(&hmm->mirrors_sem)) { - ret = -EAGAIN; - goto out; - } - - list_for_each_entry(mirror, &hmm->mirrors, list) { - int rc; - - rc = mirror->ops->sync_cpu_device_pagetables(mirror, nrange); - if (rc) { - if (WARN_ON(mmu_notifier_range_blockable(nrange) || - rc != -EAGAIN)) - continue; - ret = -EAGAIN; - break; - } - } - up_read(&hmm->mirrors_sem); - -out: - if (ret) - notifiers_decrement(hmm); - return ret; -} - -static void hmm_invalidate_range_end(struct mmu_notifier *mn, - const struct mmu_notifier_range *nrange) -{ - struct hmm *hmm = container_of(mn, struct hmm, mmu_notifier); - - notifiers_decrement(hmm); -} - -static const struct mmu_notifier_ops hmm_mmu_notifier_ops = { - .release = hmm_release, - 
.invalidate_range_start = hmm_invalidate_range_start, - .invalidate_range_end = hmm_invalidate_range_end, - .alloc_notifier = hmm_alloc_notifier, - .free_notifier = hmm_free_notifier, -}; - -/* - * hmm_mirror_register() - register a mirror against an mm - * - * @mirror: new mirror struct to register - * @mm: mm to register against - * Return: 0 on success, -ENOMEM if no memory, -EINVAL if invalid arguments - * - * To start mirroring a process address space, the device driver must register - * an HMM mirror struct. - * - * The caller cannot unregister the hmm_mirror while any ranges are - * registered. - * - * Callers using this function must put a call to mmu_notifier_synchronize() - * in their module exit functions. - */ -int hmm_mirror_register(struct hmm_mirror *mirror, struct mm_struct *mm) -{ - struct mmu_notifier *mn; - - lockdep_assert_held_write(&mm->mmap_sem); - - /* Sanity check */ - if (!mm || !mirror || !mirror->ops) - return -EINVAL; - - mn = mmu_notifier_get_locked(&hmm_mmu_notifier_ops, mm); - if (IS_ERR(mn)) - return PTR_ERR(mn); - mirror->hmm = container_of(mn, struct hmm, mmu_notifier); - - down_write(&mirror->hmm->mirrors_sem); - list_add(&mirror->list, &mirror->hmm->mirrors); - up_write(&mirror->hmm->mirrors_sem); - - return 0; -} -EXPORT_SYMBOL(hmm_mirror_register); - -/* - * hmm_mirror_unregister() - unregister a mirror - * - * @mirror: mirror struct to unregister - * - * Stop mirroring a process address space, and cleanup. - */ -void hmm_mirror_unregister(struct hmm_mirror *mirror) -{ - struct hmm *hmm = mirror->hmm; - - down_write(&hmm->mirrors_sem); - list_del(&mirror->list); - up_write(&hmm->mirrors_sem); - mmu_notifier_put(&hmm->mmu_notifier); -} -EXPORT_SYMBOL(hmm_mirror_unregister); - struct hmm_vma_walk { struct hmm_range *range; struct dev_pagemap *pgmap; @@ -252,18 +65,15 @@ err: return -EFAULT; } -static int hmm_pfns_bad(unsigned long addr, - unsigned long end, - struct mm_walk *walk) +static int hmm_pfns_fill(unsigned long addr, unsigned long end, + struct hmm_range *range, enum hmm_pfn_value_e value) { - struct hmm_vma_walk *hmm_vma_walk = walk->private; - struct hmm_range *range = hmm_vma_walk->range; uint64_t *pfns = range->pfns; unsigned long i; i = (addr - range->start) >> PAGE_SHIFT; for (; addr < end; addr += PAGE_SIZE, i++) - pfns[i] = range->values[HMM_PFN_ERROR]; + pfns[i] = range->values[value]; return 0; } @@ -532,8 +342,14 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr, if (unlikely(!hmm_vma_walk->pgmap)) return -EBUSY; } else if (IS_ENABLED(CONFIG_ARCH_HAS_PTE_SPECIAL) && pte_special(pte)) { - *pfn = range->values[HMM_PFN_SPECIAL]; - return -EFAULT; + if (!is_zero_pfn(pte_pfn(pte))) { + *pfn = range->values[HMM_PFN_SPECIAL]; + return -EFAULT; + } + /* + * Since each architecture defines a struct page for the zero + * page, just fall through and treat it like a normal page. + */ } *pfn = hmm_device_entry_from_pfn(range, pte_pfn(pte)) | cpu_flags; @@ -584,7 +400,7 @@ again: } return 0; } else if (!pmd_present(pmd)) - return hmm_pfns_bad(start, end, walk); + return hmm_pfns_fill(start, end, range, HMM_PFN_ERROR); if (pmd_devmap(pmd) || pmd_trans_huge(pmd)) { /* @@ -612,7 +428,7 @@ again: * recover. 
*/ if (pmd_bad(pmd)) - return hmm_pfns_bad(start, end, walk); + return hmm_pfns_fill(start, end, range, HMM_PFN_ERROR); ptep = pte_offset_map(pmdp, addr); i = (addr - range->start) >> PAGE_SHIFT; @@ -770,93 +586,55 @@ unlock: #define hmm_vma_walk_hugetlb_entry NULL #endif /* CONFIG_HUGETLB_PAGE */ -static void hmm_pfns_clear(struct hmm_range *range, - uint64_t *pfns, - unsigned long addr, - unsigned long end) -{ - for (; addr < end; addr += PAGE_SIZE, pfns++) - *pfns = range->values[HMM_PFN_NONE]; -} - -/* - * hmm_range_register() - start tracking change to CPU page table over a range - * @range: range - * @mm: the mm struct for the range of virtual address - * - * Return: 0 on success, -EFAULT if the address space is no longer valid - * - * Track updates to the CPU page table see include/linux/hmm.h - */ -int hmm_range_register(struct hmm_range *range, struct hmm_mirror *mirror) +static int hmm_vma_walk_test(unsigned long start, unsigned long end, + struct mm_walk *walk) { - struct hmm *hmm = mirror->hmm; - unsigned long flags; - - range->valid = false; - range->hmm = NULL; - - if ((range->start & (PAGE_SIZE - 1)) || (range->end & (PAGE_SIZE - 1))) - return -EINVAL; - if (range->start >= range->end) - return -EINVAL; + struct hmm_vma_walk *hmm_vma_walk = walk->private; + struct hmm_range *range = hmm_vma_walk->range; + struct vm_area_struct *vma = walk->vma; - /* Prevent hmm_release() from running while the range is valid */ - if (!mmget_not_zero(hmm->mmu_notifier.mm)) + /* + * Skip vma ranges that don't have struct page backing them or + * map I/O devices directly. + */ + if (vma->vm_flags & (VM_IO | VM_PFNMAP | VM_MIXEDMAP)) return -EFAULT; - /* Initialize range to track CPU page table updates. */ - spin_lock_irqsave(&hmm->ranges_lock, flags); - - range->hmm = hmm; - list_add(&range->list, &hmm->ranges); - /* - * If there are any concurrent notifiers we have to wait for them for - * the range to be valid (see hmm_range_wait_until_valid()). + * If the vma does not allow read access, then assume that it does not + * allow write access either. HMM does not support architectures + * that allow write without read. */ - if (!hmm->notifiers) - range->valid = true; - spin_unlock_irqrestore(&hmm->ranges_lock, flags); - - return 0; -} -EXPORT_SYMBOL(hmm_range_register); + if (!(vma->vm_flags & VM_READ)) { + bool fault, write_fault; -/* - * hmm_range_unregister() - stop tracking change to CPU page table over a range - * @range: range - * - * Range struct is used to track updates to the CPU page table after a call to - * hmm_range_register(). See include/linux/hmm.h for how to use it. - */ -void hmm_range_unregister(struct hmm_range *range) -{ - struct hmm *hmm = range->hmm; - unsigned long flags; + /* + * Check to see if a fault is requested for any page in the + * range. + */ + hmm_range_need_fault(hmm_vma_walk, range->pfns + + ((start - range->start) >> PAGE_SHIFT), + (end - start) >> PAGE_SHIFT, + 0, &fault, &write_fault); + if (fault || write_fault) + return -EFAULT; - spin_lock_irqsave(&hmm->ranges_lock, flags); - list_del_init(&range->list); - spin_unlock_irqrestore(&hmm->ranges_lock, flags); + hmm_pfns_fill(start, end, range, HMM_PFN_NONE); + hmm_vma_walk->last = end; - /* Drop reference taken by hmm_range_register() */ - mmput(hmm->mmu_notifier.mm); + /* Skip this vma and continue processing the next vma. */ + return 1; + } - /* - * The range is now invalid and the ref on the hmm is dropped, so - * poison the pointer. Leave other fields in place, for the caller's - * use. 
- */ - range->valid = false; - memset(&range->hmm, POISON_INUSE, sizeof(range->hmm)); + return 0; } -EXPORT_SYMBOL(hmm_range_unregister); static const struct mm_walk_ops hmm_walk_ops = { .pud_entry = hmm_vma_walk_pud, .pmd_entry = hmm_vma_walk_pmd, .pte_hole = hmm_vma_walk_hole, .hugetlb_entry = hmm_vma_walk_hugetlb_entry, + .test_walk = hmm_vma_walk_test, }; /** @@ -889,210 +667,27 @@ static const struct mm_walk_ops hmm_walk_ops = { */ long hmm_range_fault(struct hmm_range *range, unsigned int flags) { - const unsigned long device_vma = VM_IO | VM_PFNMAP | VM_MIXEDMAP; - unsigned long start = range->start, end; - struct hmm_vma_walk hmm_vma_walk; - struct hmm *hmm = range->hmm; - struct vm_area_struct *vma; + struct hmm_vma_walk hmm_vma_walk = { + .range = range, + .last = range->start, + .flags = flags, + }; + struct mm_struct *mm = range->notifier->mm; int ret; - lockdep_assert_held(&hmm->mmu_notifier.mm->mmap_sem); + lockdep_assert_held(&mm->mmap_sem); do { /* If range is no longer valid force retry. */ - if (!range->valid) + if (mmu_interval_check_retry(range->notifier, + range->notifier_seq)) return -EBUSY; + ret = walk_page_range(mm, hmm_vma_walk.last, range->end, + &hmm_walk_ops, &hmm_vma_walk); + } while (ret == -EBUSY); - vma = find_vma(hmm->mmu_notifier.mm, start); - if (vma == NULL || (vma->vm_flags & device_vma)) - return -EFAULT; - - if (!(vma->vm_flags & VM_READ)) { - /* - * If vma do not allow read access, then assume that it - * does not allow write access, either. HMM does not - * support architecture that allow write without read. - */ - hmm_pfns_clear(range, range->pfns, - range->start, range->end); - return -EPERM; - } - - hmm_vma_walk.pgmap = NULL; - hmm_vma_walk.last = start; - hmm_vma_walk.flags = flags; - hmm_vma_walk.range = range; - end = min(range->end, vma->vm_end); - - walk_page_range(vma->vm_mm, start, end, &hmm_walk_ops, - &hmm_vma_walk); - - do { - ret = walk_page_range(vma->vm_mm, start, end, - &hmm_walk_ops, &hmm_vma_walk); - start = hmm_vma_walk.last; - - /* Keep trying while the range is valid. */ - } while (ret == -EBUSY && range->valid); - - if (ret) { - unsigned long i; - - i = (hmm_vma_walk.last - range->start) >> PAGE_SHIFT; - hmm_pfns_clear(range, &range->pfns[i], - hmm_vma_walk.last, range->end); - return ret; - } - start = end; - - } while (start < range->end); - + if (ret) + return ret; return (hmm_vma_walk.last - range->start) >> PAGE_SHIFT; } EXPORT_SYMBOL(hmm_range_fault); - -/** - * hmm_range_dma_map - hmm_range_fault() and dma map page all in one. - * @range: range being faulted - * @device: device to map page to - * @daddrs: array of dma addresses for the mapped pages - * @flags: HMM_FAULT_* - * - * Return: the number of pages mapped on success (including zero), or any - * status return from hmm_range_fault() otherwise. - */ -long hmm_range_dma_map(struct hmm_range *range, struct device *device, - dma_addr_t *daddrs, unsigned int flags) -{ - unsigned long i, npages, mapped; - long ret; - - ret = hmm_range_fault(range, flags); - if (ret <= 0) - return ret ? ret : -EBUSY; - - npages = (range->end - range->start) >> PAGE_SHIFT; - for (i = 0, mapped = 0; i < npages; ++i) { - enum dma_data_direction dir = DMA_TO_DEVICE; - struct page *page; - - /* - * FIXME need to update DMA API to provide invalid DMA address - * value instead of a function to test dma address value. This - * would remove lot of dumb code duplicated accross many arch. 
- * - * For now setting it to 0 here is good enough as the pfns[] - * value is what is use to check what is valid and what isn't. - */ - daddrs[i] = 0; - - page = hmm_device_entry_to_page(range, range->pfns[i]); - if (page == NULL) - continue; - - /* Check if range is being invalidated */ - if (!range->valid) { - ret = -EBUSY; - goto unmap; - } - - /* If it is read and write than map bi-directional. */ - if (range->pfns[i] & range->flags[HMM_PFN_WRITE]) - dir = DMA_BIDIRECTIONAL; - - daddrs[i] = dma_map_page(device, page, 0, PAGE_SIZE, dir); - if (dma_mapping_error(device, daddrs[i])) { - ret = -EFAULT; - goto unmap; - } - - mapped++; - } - - return mapped; - -unmap: - for (npages = i, i = 0; (i < npages) && mapped; ++i) { - enum dma_data_direction dir = DMA_TO_DEVICE; - struct page *page; - - page = hmm_device_entry_to_page(range, range->pfns[i]); - if (page == NULL) - continue; - - if (dma_mapping_error(device, daddrs[i])) - continue; - - /* If it is read and write than map bi-directional. */ - if (range->pfns[i] & range->flags[HMM_PFN_WRITE]) - dir = DMA_BIDIRECTIONAL; - - dma_unmap_page(device, daddrs[i], PAGE_SIZE, dir); - mapped--; - } - - return ret; -} -EXPORT_SYMBOL(hmm_range_dma_map); - -/** - * hmm_range_dma_unmap() - unmap range of that was map with hmm_range_dma_map() - * @range: range being unmapped - * @device: device against which dma map was done - * @daddrs: dma address of mapped pages - * @dirty: dirty page if it had the write flag set - * Return: number of page unmapped on success, -EINVAL otherwise - * - * Note that caller MUST abide by mmu notifier or use HMM mirror and abide - * to the sync_cpu_device_pagetables() callback so that it is safe here to - * call set_page_dirty(). Caller must also take appropriate locks to avoid - * concurrent mmu notifier or sync_cpu_device_pagetables() to make progress. - */ -long hmm_range_dma_unmap(struct hmm_range *range, - struct device *device, - dma_addr_t *daddrs, - bool dirty) -{ - unsigned long i, npages; - long cpages = 0; - - /* Sanity check. */ - if (range->end <= range->start) - return -EINVAL; - if (!daddrs) - return -EINVAL; - if (!range->pfns) - return -EINVAL; - - npages = (range->end - range->start) >> PAGE_SHIFT; - for (i = 0; i < npages; ++i) { - enum dma_data_direction dir = DMA_TO_DEVICE; - struct page *page; - - page = hmm_device_entry_to_page(range, range->pfns[i]); - if (page == NULL) - continue; - - /* If it is read and write than map bi-directional. 
*/ - if (range->pfns[i] & range->flags[HMM_PFN_WRITE]) { - dir = DMA_BIDIRECTIONAL; - - /* - * See comments in function description on why it is - * safe here to call set_page_dirty() - */ - if (dirty) - set_page_dirty(page); - } - - /* Unmap and clear pfns/dma address */ - dma_unmap_page(device, daddrs[i], PAGE_SIZE, dir); - range->pfns[i] = range->values[HMM_PFN_NONE]; - /* FIXME see comments in hmm_vma_dma_map() */ - daddrs[i] = 0; - cpages++; - } - - return cpages; -} -EXPORT_SYMBOL(hmm_range_dma_unmap); diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c index 9a889e456168..f76ea05b1cb0 100644 --- a/mm/mmu_notifier.c +++ b/mm/mmu_notifier.c @@ -12,6 +12,7 @@ #include <linux/export.h> #include <linux/mm.h> #include <linux/err.h> +#include <linux/interval_tree.h> #include <linux/srcu.h> #include <linux/rcupdate.h> #include <linux/sched.h> @@ -28,6 +29,254 @@ struct lockdep_map __mmu_notifier_invalidate_range_start_map = { #endif /* + * The mmu notifier_mm structure is allocated and installed in + * mm->mmu_notifier_mm inside the mm_take_all_locks() protected + * critical section and it's released only when mm_count reaches zero + * in mmdrop(). + */ +struct mmu_notifier_mm { + /* all mmu notifiers registered in this mm are queued in this list */ + struct hlist_head list; + bool has_itree; + /* to serialize the list modifications and hlist_unhashed */ + spinlock_t lock; + unsigned long invalidate_seq; + unsigned long active_invalidate_ranges; + struct rb_root_cached itree; + wait_queue_head_t wq; + struct hlist_head deferred_list; +}; + +/* + * This is a collision-retry read-side/write-side 'lock', a lot like a + * seqcount, however this allows multiple write-sides to hold it at + * once. Conceptually the write side is protecting the values of the PTEs in + * this mm, such that PTES cannot be read into SPTEs (shadow PTEs) while any + * writer exists. + * + * Note that the core mm creates nested invalidate_range_start()/end() regions + * within the same thread, and runs invalidate_range_start()/end() in parallel + * on multiple CPUs. This is designed to not reduce concurrency or block + * progress on the mm side. + * + * As a secondary function, holding the full write side also serves to prevent + * writers for the itree, this is an optimization to avoid extra locking + * during invalidate_range_start/end notifiers. + * + * The write side has two states, fully excluded: + * - mm->active_invalidate_ranges != 0 + * - mnn->invalidate_seq & 1 == True (odd) + * - some range on the mm_struct is being invalidated + * - the itree is not allowed to change + * + * And partially excluded: + * - mm->active_invalidate_ranges != 0 + * - mnn->invalidate_seq & 1 == False (even) + * - some range on the mm_struct is being invalidated + * - the itree is allowed to change + * + * Operations on mmu_notifier_mm->invalidate_seq (under spinlock): + * seq |= 1 # Begin writing + * seq++ # Release the writing state + * seq & 1 # True if a writer exists + * + * The later state avoids some expensive work on inv_end in the common case of + * no mni monitoring the VA. 
+ */ +static bool mn_itree_is_invalidating(struct mmu_notifier_mm *mmn_mm) +{ + lockdep_assert_held(&mmn_mm->lock); + return mmn_mm->invalidate_seq & 1; +} + +static struct mmu_interval_notifier * +mn_itree_inv_start_range(struct mmu_notifier_mm *mmn_mm, + const struct mmu_notifier_range *range, + unsigned long *seq) +{ + struct interval_tree_node *node; + struct mmu_interval_notifier *res = NULL; + + spin_lock(&mmn_mm->lock); + mmn_mm->active_invalidate_ranges++; + node = interval_tree_iter_first(&mmn_mm->itree, range->start, + range->end - 1); + if (node) { + mmn_mm->invalidate_seq |= 1; + res = container_of(node, struct mmu_interval_notifier, + interval_tree); + } + + *seq = mmn_mm->invalidate_seq; + spin_unlock(&mmn_mm->lock); + return res; +} + +static struct mmu_interval_notifier * +mn_itree_inv_next(struct mmu_interval_notifier *mni, + const struct mmu_notifier_range *range) +{ + struct interval_tree_node *node; + + node = interval_tree_iter_next(&mni->interval_tree, range->start, + range->end - 1); + if (!node) + return NULL; + return container_of(node, struct mmu_interval_notifier, interval_tree); +} + +static void mn_itree_inv_end(struct mmu_notifier_mm *mmn_mm) +{ + struct mmu_interval_notifier *mni; + struct hlist_node *next; + + spin_lock(&mmn_mm->lock); + if (--mmn_mm->active_invalidate_ranges || + !mn_itree_is_invalidating(mmn_mm)) { + spin_unlock(&mmn_mm->lock); + return; + } + + /* Make invalidate_seq even */ + mmn_mm->invalidate_seq++; + + /* + * The inv_end incorporates a deferred mechanism like rtnl_unlock(). + * Adds and removes are queued until the final inv_end happens then + * they are progressed. This arrangement for tree updates is used to + * avoid using a blocking lock during invalidate_range_start. + */ + hlist_for_each_entry_safe(mni, next, &mmn_mm->deferred_list, + deferred_item) { + if (RB_EMPTY_NODE(&mni->interval_tree.rb)) + interval_tree_insert(&mni->interval_tree, + &mmn_mm->itree); + else + interval_tree_remove(&mni->interval_tree, + &mmn_mm->itree); + hlist_del(&mni->deferred_item); + } + spin_unlock(&mmn_mm->lock); + + wake_up_all(&mmn_mm->wq); +} + +/** + * mmu_interval_read_begin - Begin a read side critical section against a VA + * range + * mni: The range to use + * + * mmu_iterval_read_begin()/mmu_iterval_read_retry() implement a + * collision-retry scheme similar to seqcount for the VA range under mni. If + * the mm invokes invalidation during the critical section then + * mmu_interval_read_retry() will return true. + * + * This is useful to obtain shadow PTEs where teardown or setup of the SPTEs + * require a blocking context. The critical region formed by this can sleep, + * and the required 'user_lock' can also be a sleeping lock. + * + * The caller is required to provide a 'user_lock' to serialize both teardown + * and setup. + * + * The return value should be passed to mmu_interval_read_retry(). + */ +unsigned long mmu_interval_read_begin(struct mmu_interval_notifier *mni) +{ + struct mmu_notifier_mm *mmn_mm = mni->mm->mmu_notifier_mm; + unsigned long seq; + bool is_invalidating; + + /* + * If the mni has a different seq value under the user_lock than we + * started with then it has collided. + * + * If the mni currently has the same seq value as the mmn_mm seq, then + * it is currently between invalidate_start/end and is colliding. 
+ * + * The locking looks broadly like this: + * mn_tree_invalidate_start(): mmu_interval_read_begin(): + * spin_lock + * seq = READ_ONCE(mni->invalidate_seq); + * seq == mmn_mm->invalidate_seq + * spin_unlock + * spin_lock + * seq = ++mmn_mm->invalidate_seq + * spin_unlock + * op->invalidate_range(): + * user_lock + * mmu_interval_set_seq() + * mni->invalidate_seq = seq + * user_unlock + * + * [Required: mmu_interval_read_retry() == true] + * + * mn_itree_inv_end(): + * spin_lock + * seq = ++mmn_mm->invalidate_seq + * spin_unlock + * + * user_lock + * mmu_interval_read_retry(): + * mni->invalidate_seq != seq + * user_unlock + * + * Barriers are not needed here as any races here are closed by an + * eventual mmu_interval_read_retry(), which provides a barrier via the + * user_lock. + */ + spin_lock(&mmn_mm->lock); + /* Pairs with the WRITE_ONCE in mmu_interval_set_seq() */ + seq = READ_ONCE(mni->invalidate_seq); + is_invalidating = seq == mmn_mm->invalidate_seq; + spin_unlock(&mmn_mm->lock); + + /* + * mni->invalidate_seq must always be set to an odd value via + * mmu_interval_set_seq() using the provided cur_seq from + * mn_itree_inv_start_range(). This ensures that if seq does wrap we + * will always clear the below sleep in some reasonable time as + * mmn_mm->invalidate_seq is even in the idle state. + */ + lock_map_acquire(&__mmu_notifier_invalidate_range_start_map); + lock_map_release(&__mmu_notifier_invalidate_range_start_map); + if (is_invalidating) + wait_event(mmn_mm->wq, + READ_ONCE(mmn_mm->invalidate_seq) != seq); + + /* + * Notice that mmu_interval_read_retry() can already be true at this + * point, avoiding loops here allows the caller to provide a global + * time bound. + */ + + return seq; +} +EXPORT_SYMBOL_GPL(mmu_interval_read_begin); + +static void mn_itree_release(struct mmu_notifier_mm *mmn_mm, + struct mm_struct *mm) +{ + struct mmu_notifier_range range = { + .flags = MMU_NOTIFIER_RANGE_BLOCKABLE, + .event = MMU_NOTIFY_RELEASE, + .mm = mm, + .start = 0, + .end = ULONG_MAX, + }; + struct mmu_interval_notifier *mni; + unsigned long cur_seq; + bool ret; + + for (mni = mn_itree_inv_start_range(mmn_mm, &range, &cur_seq); mni; + mni = mn_itree_inv_next(mni, &range)) { + ret = mni->ops->invalidate(mni, &range, cur_seq); + WARN_ON(!ret); + } + + mn_itree_inv_end(mmn_mm); +} + +/* * This function can't run concurrently against mmu_notifier_register * because mm->mm_users > 0 during mmu_notifier_register and exit_mmap * runs with mm_users == 0. Other tasks may still invoke mmu notifiers @@ -39,7 +288,8 @@ struct lockdep_map __mmu_notifier_invalidate_range_start_map = { * can't go away from under us as exit_mmap holds an mm_count pin * itself. */ -void __mmu_notifier_release(struct mm_struct *mm) +static void mn_hlist_release(struct mmu_notifier_mm *mmn_mm, + struct mm_struct *mm) { struct mmu_notifier *mn; int id; @@ -49,7 +299,7 @@ void __mmu_notifier_release(struct mm_struct *mm) * ->release returns. 
*/ id = srcu_read_lock(&srcu); - hlist_for_each_entry_rcu(mn, &mm->mmu_notifier_mm->list, hlist) + hlist_for_each_entry_rcu(mn, &mmn_mm->list, hlist) /* * If ->release runs before mmu_notifier_unregister it must be * handled, as it's the only way for the driver to flush all @@ -59,10 +309,9 @@ void __mmu_notifier_release(struct mm_struct *mm) if (mn->ops->release) mn->ops->release(mn, mm); - spin_lock(&mm->mmu_notifier_mm->lock); - while (unlikely(!hlist_empty(&mm->mmu_notifier_mm->list))) { - mn = hlist_entry(mm->mmu_notifier_mm->list.first, - struct mmu_notifier, + spin_lock(&mmn_mm->lock); + while (unlikely(!hlist_empty(&mmn_mm->list))) { + mn = hlist_entry(mmn_mm->list.first, struct mmu_notifier, hlist); /* * We arrived before mmu_notifier_unregister so @@ -72,7 +321,7 @@ void __mmu_notifier_release(struct mm_struct *mm) */ hlist_del_init_rcu(&mn->hlist); } - spin_unlock(&mm->mmu_notifier_mm->lock); + spin_unlock(&mmn_mm->lock); srcu_read_unlock(&srcu, id); /* @@ -87,6 +336,17 @@ void __mmu_notifier_release(struct mm_struct *mm) synchronize_srcu(&srcu); } +void __mmu_notifier_release(struct mm_struct *mm) +{ + struct mmu_notifier_mm *mmn_mm = mm->mmu_notifier_mm; + + if (mmn_mm->has_itree) + mn_itree_release(mmn_mm, mm); + + if (!hlist_empty(&mmn_mm->list)) + mn_hlist_release(mmn_mm, mm); +} + /* * If no young bitflag is supported by the hardware, ->clear_flush_young can * unmap the address and return 1 or 0 depending if the mapping previously @@ -159,14 +419,43 @@ void __mmu_notifier_change_pte(struct mm_struct *mm, unsigned long address, srcu_read_unlock(&srcu, id); } -int __mmu_notifier_invalidate_range_start(struct mmu_notifier_range *range) +static int mn_itree_invalidate(struct mmu_notifier_mm *mmn_mm, + const struct mmu_notifier_range *range) +{ + struct mmu_interval_notifier *mni; + unsigned long cur_seq; + + for (mni = mn_itree_inv_start_range(mmn_mm, range, &cur_seq); mni; + mni = mn_itree_inv_next(mni, range)) { + bool ret; + + ret = mni->ops->invalidate(mni, range, cur_seq); + if (!ret) { + if (WARN_ON(mmu_notifier_range_blockable(range))) + continue; + goto out_would_block; + } + } + return 0; + +out_would_block: + /* + * On -EAGAIN the non-blocking caller is not allowed to call + * invalidate_range_end() + */ + mn_itree_inv_end(mmn_mm); + return -EAGAIN; +} + +static int mn_hlist_invalidate_range_start(struct mmu_notifier_mm *mmn_mm, + struct mmu_notifier_range *range) { struct mmu_notifier *mn; int ret = 0; int id; id = srcu_read_lock(&srcu); - hlist_for_each_entry_rcu(mn, &range->mm->mmu_notifier_mm->list, hlist) { + hlist_for_each_entry_rcu(mn, &mmn_mm->list, hlist) { if (mn->ops->invalidate_range_start) { int _ret; @@ -190,15 +479,30 @@ int __mmu_notifier_invalidate_range_start(struct mmu_notifier_range *range) return ret; } -void __mmu_notifier_invalidate_range_end(struct mmu_notifier_range *range, - bool only_end) +int __mmu_notifier_invalidate_range_start(struct mmu_notifier_range *range) +{ + struct mmu_notifier_mm *mmn_mm = range->mm->mmu_notifier_mm; + int ret; + + if (mmn_mm->has_itree) { + ret = mn_itree_invalidate(mmn_mm, range); + if (ret) + return ret; + } + if (!hlist_empty(&mmn_mm->list)) + return mn_hlist_invalidate_range_start(mmn_mm, range); + return 0; +} + +static void mn_hlist_invalidate_end(struct mmu_notifier_mm *mmn_mm, + struct mmu_notifier_range *range, + bool only_end) { struct mmu_notifier *mn; int id; - lock_map_acquire(&__mmu_notifier_invalidate_range_start_map); id = srcu_read_lock(&srcu); - hlist_for_each_entry_rcu(mn, 
&range->mm->mmu_notifier_mm->list, hlist) { + hlist_for_each_entry_rcu(mn, &mmn_mm->list, hlist) { /* * Call invalidate_range here too to avoid the need for the * subsystem of having to register an invalidate_range_end @@ -225,6 +529,19 @@ void __mmu_notifier_invalidate_range_end(struct mmu_notifier_range *range, } } srcu_read_unlock(&srcu, id); +} + +void __mmu_notifier_invalidate_range_end(struct mmu_notifier_range *range, + bool only_end) +{ + struct mmu_notifier_mm *mmn_mm = range->mm->mmu_notifier_mm; + + lock_map_acquire(&__mmu_notifier_invalidate_range_start_map); + if (mmn_mm->has_itree) + mn_itree_inv_end(mmn_mm); + + if (!hlist_empty(&mmn_mm->list)) + mn_hlist_invalidate_end(mmn_mm, range, only_end); lock_map_release(&__mmu_notifier_invalidate_range_start_map); } @@ -243,8 +560,9 @@ void __mmu_notifier_invalidate_range(struct mm_struct *mm, } /* - * Same as mmu_notifier_register but here the caller must hold the - * mmap_sem in write mode. + * Same as mmu_notifier_register but here the caller must hold the mmap_sem in + * write mode. A NULL mn signals the notifier is being registered for itree + * mode. */ int __mmu_notifier_register(struct mmu_notifier *mn, struct mm_struct *mm) { @@ -261,9 +579,6 @@ int __mmu_notifier_register(struct mmu_notifier *mn, struct mm_struct *mm) fs_reclaim_release(GFP_KERNEL); } - mn->mm = mm; - mn->users = 1; - if (!mm->mmu_notifier_mm) { /* * kmalloc cannot be called under mm_take_all_locks(), but we @@ -271,21 +586,22 @@ int __mmu_notifier_register(struct mmu_notifier *mn, struct mm_struct *mm) * the write side of the mmap_sem. */ mmu_notifier_mm = - kmalloc(sizeof(struct mmu_notifier_mm), GFP_KERNEL); + kzalloc(sizeof(struct mmu_notifier_mm), GFP_KERNEL); if (!mmu_notifier_mm) return -ENOMEM; INIT_HLIST_HEAD(&mmu_notifier_mm->list); spin_lock_init(&mmu_notifier_mm->lock); + mmu_notifier_mm->invalidate_seq = 2; + mmu_notifier_mm->itree = RB_ROOT_CACHED; + init_waitqueue_head(&mmu_notifier_mm->wq); + INIT_HLIST_HEAD(&mmu_notifier_mm->deferred_list); } ret = mm_take_all_locks(mm); if (unlikely(ret)) goto out_clean; - /* Pairs with the mmdrop in mmu_notifier_unregister_* */ - mmgrab(mm); - /* * Serialize the update against mmu_notifier_unregister. A * side note: mmu_notifier_release can't run concurrently with @@ -293,13 +609,28 @@ int __mmu_notifier_register(struct mmu_notifier *mn, struct mm_struct *mm) * current->mm or explicitly with get_task_mm() or similar). * We can't race against any other mmu notifier method either * thanks to mm_take_all_locks(). + * + * release semantics on the initialization of the mmu_notifier_mm's + * contents are provided for unlocked readers. acquire can only be + * used while holding the mmgrab or mmget, and is safe because once + * created the mmu_notififer_mm is not freed until the mm is + * destroyed. As above, users holding the mmap_sem or one of the + * mm_take_all_locks() do not need to use acquire semantics. 
*/ if (mmu_notifier_mm) - mm->mmu_notifier_mm = mmu_notifier_mm; + smp_store_release(&mm->mmu_notifier_mm, mmu_notifier_mm); - spin_lock(&mm->mmu_notifier_mm->lock); - hlist_add_head_rcu(&mn->hlist, &mm->mmu_notifier_mm->list); - spin_unlock(&mm->mmu_notifier_mm->lock); + if (mn) { + /* Pairs with the mmdrop in mmu_notifier_unregister_* */ + mmgrab(mm); + mn->mm = mm; + mn->users = 1; + + spin_lock(&mm->mmu_notifier_mm->lock); + hlist_add_head_rcu(&mn->hlist, &mm->mmu_notifier_mm->list); + spin_unlock(&mm->mmu_notifier_mm->lock); + } else + mm->mmu_notifier_mm->has_itree = true; mm_drop_all_locks(mm); BUG_ON(atomic_read(&mm->mm_users) <= 0); @@ -516,6 +847,180 @@ out_unlock: } EXPORT_SYMBOL_GPL(mmu_notifier_put); +static int __mmu_interval_notifier_insert( + struct mmu_interval_notifier *mni, struct mm_struct *mm, + struct mmu_notifier_mm *mmn_mm, unsigned long start, + unsigned long length, const struct mmu_interval_notifier_ops *ops) +{ + mni->mm = mm; + mni->ops = ops; + RB_CLEAR_NODE(&mni->interval_tree.rb); + mni->interval_tree.start = start; + /* + * Note that the representation of the intervals in the interval tree + * considers the ending point as contained in the interval. + */ + if (length == 0 || + check_add_overflow(start, length - 1, &mni->interval_tree.last)) + return -EOVERFLOW; + + /* Must call with a mmget() held */ + if (WARN_ON(atomic_read(&mm->mm_count) <= 0)) + return -EINVAL; + + /* pairs with mmdrop in mmu_interval_notifier_remove() */ + mmgrab(mm); + + /* + * If some invalidate_range_start/end region is going on in parallel + * we don't know what VA ranges are affected, so we must assume this + * new range is included. + * + * If the itree is invalidating then we are not allowed to change + * it. Retrying until invalidation is done is tricky due to the + * possibility for live lock, instead defer the add to + * mn_itree_inv_end() so this algorithm is deterministic. + * + * In all cases the value for the mni->invalidate_seq should be + * odd, see mmu_interval_read_begin() + */ + spin_lock(&mmn_mm->lock); + if (mmn_mm->active_invalidate_ranges) { + if (mn_itree_is_invalidating(mmn_mm)) + hlist_add_head(&mni->deferred_item, + &mmn_mm->deferred_list); + else { + mmn_mm->invalidate_seq |= 1; + interval_tree_insert(&mni->interval_tree, + &mmn_mm->itree); + } + mni->invalidate_seq = mmn_mm->invalidate_seq; + } else { + WARN_ON(mn_itree_is_invalidating(mmn_mm)); + /* + * The starting seq for a mni not under invalidation should be + * odd, not equal to the current invalidate_seq and + * invalidate_seq should not 'wrap' to the new seq any time + * soon. + */ + mni->invalidate_seq = mmn_mm->invalidate_seq - 1; + interval_tree_insert(&mni->interval_tree, &mmn_mm->itree); + } + spin_unlock(&mmn_mm->lock); + return 0; +} + +/** + * mmu_interval_notifier_insert - Insert an interval notifier + * @mni: Interval notifier to register + * @start: Starting virtual address to monitor + * @length: Length of the range to monitor + * @mm : mm_struct to attach to + * + * This function subscribes the interval notifier for notifications from the + * mm. Upon return the ops related to mmu_interval_notifier will be called + * whenever an event that intersects with the given range occurs. + * + * Upon return the range_notifier may not be present in the interval tree yet. + * The caller must use the normal interval notifier read flow via + * mmu_interval_read_begin() to establish SPTEs for this range. 
+ */ +int mmu_interval_notifier_insert(struct mmu_interval_notifier *mni, + struct mm_struct *mm, unsigned long start, + unsigned long length, + const struct mmu_interval_notifier_ops *ops) +{ + struct mmu_notifier_mm *mmn_mm; + int ret; + + might_lock(&mm->mmap_sem); + + mmn_mm = smp_load_acquire(&mm->mmu_notifier_mm); + if (!mmn_mm || !mmn_mm->has_itree) { + ret = mmu_notifier_register(NULL, mm); + if (ret) + return ret; + mmn_mm = mm->mmu_notifier_mm; + } + return __mmu_interval_notifier_insert(mni, mm, mmn_mm, start, length, + ops); +} +EXPORT_SYMBOL_GPL(mmu_interval_notifier_insert); + +int mmu_interval_notifier_insert_locked( + struct mmu_interval_notifier *mni, struct mm_struct *mm, + unsigned long start, unsigned long length, + const struct mmu_interval_notifier_ops *ops) +{ + struct mmu_notifier_mm *mmn_mm; + int ret; + + lockdep_assert_held_write(&mm->mmap_sem); + + mmn_mm = mm->mmu_notifier_mm; + if (!mmn_mm || !mmn_mm->has_itree) { + ret = __mmu_notifier_register(NULL, mm); + if (ret) + return ret; + mmn_mm = mm->mmu_notifier_mm; + } + return __mmu_interval_notifier_insert(mni, mm, mmn_mm, start, length, + ops); +} +EXPORT_SYMBOL_GPL(mmu_interval_notifier_insert_locked); + +/** + * mmu_interval_notifier_remove - Remove a interval notifier + * @mni: Interval notifier to unregister + * + * This function must be paired with mmu_interval_notifier_insert(). It cannot + * be called from any ops callback. + * + * Once this returns ops callbacks are no longer running on other CPUs and + * will not be called in future. + */ +void mmu_interval_notifier_remove(struct mmu_interval_notifier *mni) +{ + struct mm_struct *mm = mni->mm; + struct mmu_notifier_mm *mmn_mm = mm->mmu_notifier_mm; + unsigned long seq = 0; + + might_sleep(); + + spin_lock(&mmn_mm->lock); + if (mn_itree_is_invalidating(mmn_mm)) { + /* + * remove is being called after insert put this on the + * deferred list, but before the deferred list was processed. + */ + if (RB_EMPTY_NODE(&mni->interval_tree.rb)) { + hlist_del(&mni->deferred_item); + } else { + hlist_add_head(&mni->deferred_item, + &mmn_mm->deferred_list); + seq = mmn_mm->invalidate_seq; + } + } else { + WARN_ON(RB_EMPTY_NODE(&mni->interval_tree.rb)); + interval_tree_remove(&mni->interval_tree, &mmn_mm->itree); + } + spin_unlock(&mmn_mm->lock); + + /* + * The possible sleep on progress in the invalidation requires the + * caller not hold any locks held by invalidation callbacks. + */ + lock_map_acquire(&__mmu_notifier_invalidate_range_start_map); + lock_map_release(&__mmu_notifier_invalidate_range_start_map); + if (seq) + wait_event(mmn_mm->wq, + READ_ONCE(mmn_mm->invalidate_seq) != seq); + + /* pairs with mmgrab in mmu_interval_notifier_insert() */ + mmdrop(mm); +} +EXPORT_SYMBOL_GPL(mmu_interval_notifier_remove); + /** * mmu_notifier_synchronize - Ensure all mmu_notifiers are freed * |
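For completeness, the registration side added by the mm/mmu_notifier.c hunks above is just insert and remove around the read/invalidate pattern shown earlier. A rough sketch follows, again using the hypothetical struct dmirror and dmirror_mni_ops, and assuming the caller holds a mmget() on the mm as the new code requires.

```c
#include <linux/mmu_notifier.h>

/* Hypothetical attach/detach helpers around the new interval notifier API. */
static int dmirror_attach(struct dmirror *dmirror, struct mm_struct *mm,
			  unsigned long start, unsigned long length)
{
	int ret;

	/* May take mmap_sem internally if the interval tree is not yet set up. */
	ret = mmu_interval_notifier_insert(&dmirror->notifier, mm,
					   start, length, &dmirror_mni_ops);
	if (ret)
		return ret;

	/*
	 * The notifier may still be on the deferred list at this point, so
	 * SPTEs must be established via mmu_interval_read_begin(), never
	 * assumed valid right after insert.
	 */
	return 0;
}

static void dmirror_detach(struct dmirror *dmirror)
{
	/* Must not be called from the ops->invalidate callback itself. */
	mmu_interval_notifier_remove(&dmirror->notifier);
}
```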