diff options
Diffstat (limited to 'drivers/infiniband/hw/mlx5/odp.c')
-rw-r--r-- | drivers/infiniband/hw/mlx5/odp.c | 1025 |
1 files changed, 548 insertions, 477 deletions
diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c index 2e9b43061797..f924250f80c2 100644 --- a/drivers/infiniband/hw/mlx5/odp.c +++ b/drivers/infiniband/hw/mlx5/odp.c @@ -93,152 +93,185 @@ struct mlx5_pagefault { static u64 mlx5_imr_ksm_entries; -static int check_parent(struct ib_umem_odp *odp, - struct mlx5_ib_mr *parent) +void mlx5_odp_populate_klm(struct mlx5_klm *pklm, size_t idx, size_t nentries, + struct mlx5_ib_mr *imr, int flags) { - struct mlx5_ib_mr *mr = odp->private; - - return mr && mr->parent == parent && !odp->dying; -} - -static struct ib_ucontext_per_mm *mr_to_per_mm(struct mlx5_ib_mr *mr) -{ - if (WARN_ON(!mr || !is_odp_mr(mr))) - return NULL; - - return to_ib_umem_odp(mr->umem)->per_mm; -} - -static struct ib_umem_odp *odp_next(struct ib_umem_odp *odp) -{ - struct mlx5_ib_mr *mr = odp->private, *parent = mr->parent; - struct ib_ucontext_per_mm *per_mm = odp->per_mm; - struct rb_node *rb; - - down_read(&per_mm->umem_rwsem); - while (1) { - rb = rb_next(&odp->interval_tree.rb); - if (!rb) - goto not_found; - odp = rb_entry(rb, struct ib_umem_odp, interval_tree.rb); - if (check_parent(odp, parent)) - goto end; - } -not_found: - odp = NULL; -end: - up_read(&per_mm->umem_rwsem); - return odp; -} - -static struct ib_umem_odp *odp_lookup(u64 start, u64 length, - struct mlx5_ib_mr *parent) -{ - struct ib_ucontext_per_mm *per_mm = mr_to_per_mm(parent); - struct ib_umem_odp *odp; - struct rb_node *rb; - - down_read(&per_mm->umem_rwsem); - odp = rbt_ib_umem_lookup(&per_mm->umem_tree, start, length); - if (!odp) - goto end; - - while (1) { - if (check_parent(odp, parent)) - goto end; - rb = rb_next(&odp->interval_tree.rb); - if (!rb) - goto not_found; - odp = rb_entry(rb, struct ib_umem_odp, interval_tree.rb); - if (ib_umem_start(odp) > start + length) - goto not_found; - } -not_found: - odp = NULL; -end: - up_read(&per_mm->umem_rwsem); - return odp; -} - -void mlx5_odp_populate_klm(struct mlx5_klm *pklm, size_t offset, - size_t nentries, struct mlx5_ib_mr *mr, int flags) -{ - struct ib_pd *pd = mr->ibmr.pd; - struct mlx5_ib_dev *dev = to_mdev(pd->device); - struct ib_umem_odp *odp; - unsigned long va; - int i; + struct mlx5_klm *end = pklm + nentries; if (flags & MLX5_IB_UPD_XLT_ZAP) { - for (i = 0; i < nentries; i++, pklm++) { + for (; pklm != end; pklm++, idx++) { pklm->bcount = cpu_to_be32(MLX5_IMR_MTT_SIZE); - pklm->key = cpu_to_be32(dev->null_mkey); + pklm->key = cpu_to_be32(imr->dev->null_mkey); pklm->va = 0; } return; } - odp = odp_lookup(offset * MLX5_IMR_MTT_SIZE, - nentries * MLX5_IMR_MTT_SIZE, mr); + /* + * The locking here is pretty subtle. Ideally the implicit_children + * xarray would be protected by the umem_mutex, however that is not + * possible. Instead this uses a weaker update-then-lock pattern: + * + * srcu_read_lock() + * xa_store() + * mutex_lock(umem_mutex) + * mlx5_ib_update_xlt() + * mutex_unlock(umem_mutex) + * destroy lkey + * + * ie any change the xarray must be followed by the locked update_xlt + * before destroying. + * + * The umem_mutex provides the acquire/release semantic needed to make + * the xa_store() visible to a racing thread. While SRCU is not + * technically required, using it gives consistent use of the SRCU + * locking around the xarray. + */ + lockdep_assert_held(&to_ib_umem_odp(imr->umem)->umem_mutex); + lockdep_assert_held(&imr->dev->odp_srcu); - for (i = 0; i < nentries; i++, pklm++) { - pklm->bcount = cpu_to_be32(MLX5_IMR_MTT_SIZE); - va = (offset + i) * MLX5_IMR_MTT_SIZE; - if (odp && ib_umem_start(odp) == va) { - struct mlx5_ib_mr *mtt = odp->private; + for (; pklm != end; pklm++, idx++) { + struct mlx5_ib_mr *mtt = xa_load(&imr->implicit_children, idx); + pklm->bcount = cpu_to_be32(MLX5_IMR_MTT_SIZE); + if (mtt) { pklm->key = cpu_to_be32(mtt->ibmr.lkey); - odp = odp_next(odp); + pklm->va = cpu_to_be64(idx * MLX5_IMR_MTT_SIZE); } else { - pklm->key = cpu_to_be32(dev->null_mkey); + pklm->key = cpu_to_be32(imr->dev->null_mkey); + pklm->va = 0; } - mlx5_ib_dbg(dev, "[%d] va %lx key %x\n", - i, va, be32_to_cpu(pklm->key)); } } -static void mr_leaf_free_action(struct work_struct *work) +static void dma_fence_odp_mr(struct mlx5_ib_mr *mr) { - struct ib_umem_odp *odp = container_of(work, struct ib_umem_odp, work); - int idx = ib_umem_start(odp) >> MLX5_IMR_MTT_SHIFT; - struct mlx5_ib_mr *mr = odp->private, *imr = mr->parent; + struct ib_umem_odp *odp = to_ib_umem_odp(mr->umem); - mr->parent = NULL; - synchronize_srcu(&mr->dev->mr_srcu); + /* Ensure mlx5_ib_invalidate_range() will not touch the MR any more */ + mutex_lock(&odp->umem_mutex); + if (odp->npages) { + mlx5_mr_cache_invalidate(mr); + ib_umem_odp_unmap_dma_pages(odp, ib_umem_start(odp), + ib_umem_end(odp)); + WARN_ON(odp->npages); + } + odp->private = NULL; + mutex_unlock(&odp->umem_mutex); - ib_umem_odp_release(odp); - if (imr->live) - mlx5_ib_update_xlt(imr, idx, 1, 0, + if (!mr->allocated_from_cache) { + mlx5_core_destroy_mkey(mr->dev->mdev, &mr->mmkey); + WARN_ON(mr->descs); + } +} + +/* + * This must be called after the mr has been removed from implicit_children + * and the SRCU synchronized. NOTE: The MR does not necessarily have to be + * empty here, parallel page faults could have raced with the free process and + * added pages to it. + */ +static void free_implicit_child_mr(struct mlx5_ib_mr *mr, bool need_imr_xlt) +{ + struct mlx5_ib_mr *imr = mr->parent; + struct ib_umem_odp *odp_imr = to_ib_umem_odp(imr->umem); + struct ib_umem_odp *odp = to_ib_umem_odp(mr->umem); + unsigned long idx = ib_umem_start(odp) >> MLX5_IMR_MTT_SHIFT; + int srcu_key; + + /* implicit_child_mr's are not allowed to have deferred work */ + WARN_ON(atomic_read(&mr->num_deferred_work)); + + if (need_imr_xlt) { + srcu_key = srcu_read_lock(&mr->dev->odp_srcu); + mutex_lock(&odp_imr->umem_mutex); + mlx5_ib_update_xlt(mr->parent, idx, 1, 0, MLX5_IB_UPD_XLT_INDIRECT | MLX5_IB_UPD_XLT_ATOMIC); + mutex_unlock(&odp_imr->umem_mutex); + srcu_read_unlock(&mr->dev->odp_srcu, srcu_key); + } + + dma_fence_odp_mr(mr); + + mr->parent = NULL; mlx5_mr_cache_free(mr->dev, mr); + ib_umem_odp_release(odp); + atomic_dec(&imr->num_deferred_work); +} - if (atomic_dec_and_test(&imr->num_leaf_free)) - wake_up(&imr->q_leaf_free); +static void free_implicit_child_mr_work(struct work_struct *work) +{ + struct mlx5_ib_mr *mr = + container_of(work, struct mlx5_ib_mr, odp_destroy.work); + + free_implicit_child_mr(mr, true); } -void mlx5_ib_invalidate_range(struct ib_umem_odp *umem_odp, unsigned long start, - unsigned long end) +static void free_implicit_child_mr_rcu(struct rcu_head *head) { + struct mlx5_ib_mr *mr = + container_of(head, struct mlx5_ib_mr, odp_destroy.rcu); + + /* Freeing a MR is a sleeping operation, so bounce to a work queue */ + INIT_WORK(&mr->odp_destroy.work, free_implicit_child_mr_work); + queue_work(system_unbound_wq, &mr->odp_destroy.work); +} + +static void destroy_unused_implicit_child_mr(struct mlx5_ib_mr *mr) +{ + struct ib_umem_odp *odp = to_ib_umem_odp(mr->umem); + unsigned long idx = ib_umem_start(odp) >> MLX5_IMR_MTT_SHIFT; + struct mlx5_ib_mr *imr = mr->parent; + + xa_lock(&imr->implicit_children); + /* + * This can race with mlx5_ib_free_implicit_mr(), the first one to + * reach the xa lock wins the race and destroys the MR. + */ + if (__xa_cmpxchg(&imr->implicit_children, idx, mr, NULL, GFP_ATOMIC) != + mr) + goto out_unlock; + + atomic_inc(&imr->num_deferred_work); + call_srcu(&mr->dev->odp_srcu, &mr->odp_destroy.rcu, + free_implicit_child_mr_rcu); + +out_unlock: + xa_unlock(&imr->implicit_children); +} + +static bool mlx5_ib_invalidate_range(struct mmu_interval_notifier *mni, + const struct mmu_notifier_range *range, + unsigned long cur_seq) +{ + struct ib_umem_odp *umem_odp = + container_of(mni, struct ib_umem_odp, notifier); struct mlx5_ib_mr *mr; const u64 umr_block_mask = (MLX5_UMR_MTT_ALIGNMENT / sizeof(struct mlx5_mtt)) - 1; u64 idx = 0, blk_start_idx = 0; + u64 invalidations = 0; + unsigned long start; + unsigned long end; int in_block = 0; u64 addr; - if (!umem_odp) { - pr_err("invalidation called on NULL umem or non-ODP umem\n"); - return; - } + if (!mmu_notifier_range_blockable(range)) + return false; + mutex_lock(&umem_odp->umem_mutex); + mmu_interval_set_seq(mni, cur_seq); + /* + * If npages is zero then umem_odp->private may not be setup yet. This + * does not complete until after the first page is mapped for DMA. + */ + if (!umem_odp->npages) + goto out; mr = umem_odp->private; - if (!mr || !mr->ibmr.pd) - return; - - start = max_t(u64, ib_umem_start(umem_odp), start); - end = min_t(u64, ib_umem_end(umem_odp), end); + start = max_t(u64, ib_umem_start(umem_odp), range->start); + end = min_t(u64, ib_umem_end(umem_odp), range->end); /* * Iteration one - zap the HW's MTTs. The notifiers_count ensures that @@ -246,7 +279,6 @@ void mlx5_ib_invalidate_range(struct ib_umem_odp *umem_odp, unsigned long start, * overwrite the same MTTs. Concurent invalidations might race us, * but they will write 0s as well, so no difference in the end result. */ - mutex_lock(&umem_odp->umem_mutex); for (addr = start; addr < end; addr += BIT(umem_odp->page_shift)) { idx = (addr - ib_umem_start(umem_odp)) >> umem_odp->page_shift; /* @@ -261,6 +293,9 @@ void mlx5_ib_invalidate_range(struct ib_umem_odp *umem_odp, unsigned long start, blk_start_idx = idx; in_block = 1; } + + /* Count page invalidations */ + invalidations += idx - blk_start_idx + 1; } else { u64 umr_offset = idx & umr_block_mask; @@ -278,7 +313,9 @@ void mlx5_ib_invalidate_range(struct ib_umem_odp *umem_odp, unsigned long start, idx - blk_start_idx + 1, 0, MLX5_IB_UPD_XLT_ZAP | MLX5_IB_UPD_XLT_ATOMIC); - mutex_unlock(&umem_odp->umem_mutex); + + mlx5_update_odp_stats(mr, invalidations, invalidations); + /* * We are now sure that the device will not access the * memory. We can safely unmap it, and mark it as dirty if @@ -287,14 +324,17 @@ void mlx5_ib_invalidate_range(struct ib_umem_odp *umem_odp, unsigned long start, ib_umem_odp_unmap_dma_pages(umem_odp, start, end); - if (unlikely(!umem_odp->npages && mr->parent && - !umem_odp->dying)) { - WRITE_ONCE(umem_odp->dying, 1); - atomic_inc(&mr->parent->num_leaf_free); - schedule_work(&umem_odp->work); - } + if (unlikely(!umem_odp->npages && mr->parent)) + destroy_unused_implicit_child_mr(mr); +out: + mutex_unlock(&umem_odp->umem_mutex); + return true; } +const struct mmu_interval_notifier_ops mlx5_mn_ops = { + .invalidate = mlx5_ib_invalidate_range, +}; + void mlx5_ib_internal_fill_odp_caps(struct mlx5_ib_dev *dev) { struct ib_odp_caps *caps = &dev->odp_caps; @@ -359,8 +399,6 @@ void mlx5_ib_internal_fill_odp_caps(struct mlx5_ib_dev *dev) MLX5_CAP_GEN(dev->mdev, umr_extended_translation_offset) && !MLX5_CAP_GEN(dev->mdev, umr_indirect_mkey_disabled)) caps->general_caps |= IB_ODP_SUPPORT_IMPLICIT; - - return; } static void mlx5_ib_page_fault_resume(struct mlx5_ib_dev *dev, @@ -385,252 +423,226 @@ static void mlx5_ib_page_fault_resume(struct mlx5_ib_dev *dev, wq_num, err); } -static struct mlx5_ib_mr *implicit_mr_alloc(struct ib_pd *pd, - struct ib_umem_odp *umem_odp, - bool ksm, int access_flags) +static struct mlx5_ib_mr *implicit_get_child_mr(struct mlx5_ib_mr *imr, + unsigned long idx) { - struct mlx5_ib_dev *dev = to_mdev(pd->device); + struct ib_umem_odp *odp; struct mlx5_ib_mr *mr; + struct mlx5_ib_mr *ret; int err; - mr = mlx5_mr_cache_alloc(dev, ksm ? MLX5_IMR_KSM_CACHE_ENTRY : - MLX5_IMR_MTT_CACHE_ENTRY); + odp = ib_umem_odp_alloc_child(to_ib_umem_odp(imr->umem), + idx * MLX5_IMR_MTT_SIZE, + MLX5_IMR_MTT_SIZE, &mlx5_mn_ops); + if (IS_ERR(odp)) + return ERR_CAST(odp); + ret = mr = mlx5_mr_cache_alloc(imr->dev, MLX5_IMR_MTT_CACHE_ENTRY); if (IS_ERR(mr)) - return mr; - - mr->ibmr.pd = pd; - - mr->dev = dev; - mr->access_flags = access_flags; - mr->mmkey.iova = 0; - mr->umem = &umem_odp->umem; - - if (ksm) { - err = mlx5_ib_update_xlt(mr, 0, - mlx5_imr_ksm_entries, - MLX5_KSM_PAGE_SHIFT, - MLX5_IB_UPD_XLT_INDIRECT | - MLX5_IB_UPD_XLT_ZAP | - MLX5_IB_UPD_XLT_ENABLE); - - } else { - err = mlx5_ib_update_xlt(mr, 0, - MLX5_IMR_MTT_ENTRIES, - PAGE_SHIFT, - MLX5_IB_UPD_XLT_ZAP | - MLX5_IB_UPD_XLT_ENABLE | - MLX5_IB_UPD_XLT_ATOMIC); - } - - if (err) - goto fail; + goto out_umem; + mr->ibmr.pd = imr->ibmr.pd; + mr->access_flags = imr->access_flags; + mr->umem = &odp->umem; mr->ibmr.lkey = mr->mmkey.key; mr->ibmr.rkey = mr->mmkey.key; - - mr->live = 1; - - mlx5_ib_dbg(dev, "key %x dev %p mr %p\n", - mr->mmkey.key, dev->mdev, mr); - - return mr; - -fail: - mlx5_ib_err(dev, "Failed to register MKEY %d\n", err); - mlx5_mr_cache_free(dev, mr); - - return ERR_PTR(err); -} - -static struct ib_umem_odp *implicit_mr_get_data(struct mlx5_ib_mr *mr, - u64 io_virt, size_t bcnt) -{ - struct mlx5_ib_dev *dev = to_mdev(mr->ibmr.pd->device); - struct ib_umem_odp *odp, *result = NULL; - struct ib_umem_odp *odp_mr = to_ib_umem_odp(mr->umem); - u64 addr = io_virt & MLX5_IMR_MTT_MASK; - int nentries = 0, start_idx = 0, ret; - struct mlx5_ib_mr *mtt; - - mutex_lock(&odp_mr->umem_mutex); - odp = odp_lookup(addr, 1, mr); - - mlx5_ib_dbg(dev, "io_virt:%llx bcnt:%zx addr:%llx odp:%p\n", - io_virt, bcnt, addr, odp); - -next_mr: - if (likely(odp)) { - if (nentries) - nentries++; - } else { - odp = ib_umem_odp_alloc_child(odp_mr, addr, MLX5_IMR_MTT_SIZE); - if (IS_ERR(odp)) { - mutex_unlock(&odp_mr->umem_mutex); - return ERR_CAST(odp); - } - - mtt = implicit_mr_alloc(mr->ibmr.pd, odp, 0, - mr->access_flags); - if (IS_ERR(mtt)) { - mutex_unlock(&odp_mr->umem_mutex); - ib_umem_odp_release(odp); - return ERR_CAST(mtt); - } - - odp->private = mtt; - mtt->umem = &odp->umem; - mtt->mmkey.iova = addr; - mtt->parent = mr; - INIT_WORK(&odp->work, mr_leaf_free_action); - - if (!nentries) - start_idx = addr >> MLX5_IMR_MTT_SHIFT; - nentries++; - } - - /* Return first odp if region not covered by single one */ - if (likely(!result)) - result = odp; - - addr += MLX5_IMR_MTT_SIZE; - if (unlikely(addr < io_virt + bcnt)) { - odp = odp_next(odp); - if (odp && ib_umem_start(odp) != addr) - odp = NULL; - goto next_mr; + mr->mmkey.iova = idx * MLX5_IMR_MTT_SIZE; + mr->parent = imr; + odp->private = mr; + + err = mlx5_ib_update_xlt(mr, 0, + MLX5_IMR_MTT_ENTRIES, + PAGE_SHIFT, + MLX5_IB_UPD_XLT_ZAP | + MLX5_IB_UPD_XLT_ENABLE); + if (err) { + ret = ERR_PTR(err); + goto out_mr; } - if (unlikely(nentries)) { - ret = mlx5_ib_update_xlt(mr, start_idx, nentries, 0, - MLX5_IB_UPD_XLT_INDIRECT | - MLX5_IB_UPD_XLT_ATOMIC); - if (ret) { - mlx5_ib_err(dev, "Failed to update PAS\n"); - result = ERR_PTR(ret); + /* + * Once the store to either xarray completes any error unwind has to + * use synchronize_srcu(). Avoid this with xa_reserve() + */ + ret = xa_cmpxchg(&imr->implicit_children, idx, NULL, mr, + GFP_KERNEL); + if (unlikely(ret)) { + if (xa_is_err(ret)) { + ret = ERR_PTR(xa_err(ret)); + goto out_mr; } + /* + * Another thread beat us to creating the child mr, use + * theirs. + */ + goto out_mr; } - mutex_unlock(&odp_mr->umem_mutex); - return result; + mlx5_ib_dbg(imr->dev, "key %x mr %p\n", mr->mmkey.key, mr); + return mr; + +out_mr: + mlx5_mr_cache_free(imr->dev, mr); +out_umem: + ib_umem_odp_release(odp); + return ret; } struct mlx5_ib_mr *mlx5_ib_alloc_implicit_mr(struct mlx5_ib_pd *pd, struct ib_udata *udata, int access_flags) { - struct mlx5_ib_mr *imr; + struct mlx5_ib_dev *dev = to_mdev(pd->ibpd.device); struct ib_umem_odp *umem_odp; + struct mlx5_ib_mr *imr; + int err; umem_odp = ib_umem_odp_alloc_implicit(udata, access_flags); if (IS_ERR(umem_odp)) return ERR_CAST(umem_odp); - imr = implicit_mr_alloc(&pd->ibpd, umem_odp, 1, access_flags); + imr = mlx5_mr_cache_alloc(dev, MLX5_IMR_KSM_CACHE_ENTRY); if (IS_ERR(imr)) { - ib_umem_odp_release(umem_odp); - return ERR_CAST(imr); + err = PTR_ERR(imr); + goto out_umem; } + imr->ibmr.pd = &pd->ibpd; + imr->access_flags = access_flags; + imr->mmkey.iova = 0; + imr->umem = &umem_odp->umem; + imr->ibmr.lkey = imr->mmkey.key; + imr->ibmr.rkey = imr->mmkey.key; imr->umem = &umem_odp->umem; - init_waitqueue_head(&imr->q_leaf_free); - atomic_set(&imr->num_leaf_free, 0); - atomic_set(&imr->num_pending_prefetch, 0); + imr->is_odp_implicit = true; + atomic_set(&imr->num_deferred_work, 0); + xa_init(&imr->implicit_children); + + err = mlx5_ib_update_xlt(imr, 0, + mlx5_imr_ksm_entries, + MLX5_KSM_PAGE_SHIFT, + MLX5_IB_UPD_XLT_INDIRECT | + MLX5_IB_UPD_XLT_ZAP | + MLX5_IB_UPD_XLT_ENABLE); + if (err) + goto out_mr; + + err = xa_err(xa_store(&dev->odp_mkeys, mlx5_base_mkey(imr->mmkey.key), + &imr->mmkey, GFP_KERNEL)); + if (err) + goto out_mr; + mlx5_ib_dbg(dev, "key %x mr %p\n", imr->mmkey.key, imr); return imr; +out_mr: + mlx5_ib_err(dev, "Failed to register MKEY %d\n", err); + mlx5_mr_cache_free(dev, imr); +out_umem: + ib_umem_odp_release(umem_odp); + return ERR_PTR(err); } void mlx5_ib_free_implicit_mr(struct mlx5_ib_mr *imr) { - struct ib_ucontext_per_mm *per_mm = mr_to_per_mm(imr); - struct rb_node *node; + struct ib_umem_odp *odp_imr = to_ib_umem_odp(imr->umem); + struct mlx5_ib_dev *dev = imr->dev; + struct list_head destroy_list; + struct mlx5_ib_mr *mtt; + struct mlx5_ib_mr *tmp; + unsigned long idx; - down_read(&per_mm->umem_rwsem); - for (node = rb_first_cached(&per_mm->umem_tree); node; - node = rb_next(node)) { - struct ib_umem_odp *umem_odp = - rb_entry(node, struct ib_umem_odp, interval_tree.rb); - struct mlx5_ib_mr *mr = umem_odp->private; + INIT_LIST_HEAD(&destroy_list); - if (mr->parent != imr) - continue; + xa_erase(&dev->odp_mkeys, mlx5_base_mkey(imr->mmkey.key)); + /* + * This stops the SRCU protected page fault path from touching either + * the imr or any children. The page fault path can only reach the + * children xarray via the imr. + */ + synchronize_srcu(&dev->odp_srcu); - ib_umem_odp_unmap_dma_pages(umem_odp, ib_umem_start(umem_odp), - ib_umem_end(umem_odp)); + xa_lock(&imr->implicit_children); + xa_for_each (&imr->implicit_children, idx, mtt) { + __xa_erase(&imr->implicit_children, idx); + list_add(&mtt->odp_destroy.elm, &destroy_list); + } + xa_unlock(&imr->implicit_children); - if (umem_odp->dying) - continue; + /* + * num_deferred_work can only be incremented inside the odp_srcu, or + * under xa_lock while the child is in the xarray. Thus at this point + * it is only decreasing, and all work holding it is now on the wq. + */ + if (atomic_read(&imr->num_deferred_work)) { + flush_workqueue(system_unbound_wq); + WARN_ON(atomic_read(&imr->num_deferred_work)); + } + + /* + * Fence the imr before we destroy the children. This allows us to + * skip updating the XLT of the imr during destroy of the child mkey + * the imr points to. + */ + mlx5_mr_cache_invalidate(imr); + + list_for_each_entry_safe (mtt, tmp, &destroy_list, odp_destroy.elm) + free_implicit_child_mr(mtt, false); - WRITE_ONCE(umem_odp->dying, 1); - atomic_inc(&imr->num_leaf_free); - schedule_work(&umem_odp->work); + mlx5_mr_cache_free(dev, imr); + ib_umem_odp_release(odp_imr); +} + +/** + * mlx5_ib_fence_odp_mr - Stop all access to the ODP MR + * @mr: to fence + * + * On return no parallel threads will be touching this MR and no DMA will be + * active. + */ +void mlx5_ib_fence_odp_mr(struct mlx5_ib_mr *mr) +{ + /* Prevent new page faults and prefetch requests from succeeding */ + xa_erase(&mr->dev->odp_mkeys, mlx5_base_mkey(mr->mmkey.key)); + + /* Wait for all running page-fault handlers to finish. */ + synchronize_srcu(&mr->dev->odp_srcu); + + if (atomic_read(&mr->num_deferred_work)) { + flush_workqueue(system_unbound_wq); + WARN_ON(atomic_read(&mr->num_deferred_work)); } - up_read(&per_mm->umem_rwsem); - wait_event(imr->q_leaf_free, !atomic_read(&imr->num_leaf_free)); + dma_fence_odp_mr(mr); } -#define MLX5_PF_FLAGS_PREFETCH BIT(0) #define MLX5_PF_FLAGS_DOWNGRADE BIT(1) -static int pagefault_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr, - u64 io_virt, size_t bcnt, u32 *bytes_mapped, - u32 flags) +static int pagefault_real_mr(struct mlx5_ib_mr *mr, struct ib_umem_odp *odp, + u64 user_va, size_t bcnt, u32 *bytes_mapped, + u32 flags) { - int npages = 0, current_seq, page_shift, ret, np; - struct ib_umem_odp *odp_mr = to_ib_umem_odp(mr->umem); + int page_shift, ret, np; bool downgrade = flags & MLX5_PF_FLAGS_DOWNGRADE; - bool prefetch = flags & MLX5_PF_FLAGS_PREFETCH; + unsigned long current_seq; u64 access_mask; u64 start_idx, page_mask; - struct ib_umem_odp *odp; - size_t size; - - if (odp_mr->is_implicit_odp) { - odp = implicit_mr_get_data(mr, io_virt, bcnt); - - if (IS_ERR(odp)) - return PTR_ERR(odp); - mr = odp->private; - } else { - odp = odp_mr; - } - -next_mr: - size = min_t(size_t, bcnt, ib_umem_end(odp) - io_virt); page_shift = odp->page_shift; page_mask = ~(BIT(page_shift) - 1); - start_idx = (io_virt - (mr->mmkey.iova & page_mask)) >> page_shift; + start_idx = (user_va - (mr->mmkey.iova & page_mask)) >> page_shift; access_mask = ODP_READ_ALLOWED_BIT; - if (prefetch && !downgrade && !odp->umem.writable) { - /* prefetch with write-access must - * be supported by the MR - */ - ret = -EINVAL; - goto out; - } - if (odp->umem.writable && !downgrade) access_mask |= ODP_WRITE_ALLOWED_BIT; - current_seq = READ_ONCE(odp->notifiers_seq); - /* - * Ensure the sequence number is valid for some time before we call - * gup. - */ - smp_rmb(); - - ret = ib_umem_odp_map_dma_pages(odp, io_virt, size, access_mask, - current_seq); - - if (ret < 0) - goto out; + current_seq = mmu_interval_read_begin(&odp->notifier); - np = ret; + np = ib_umem_odp_map_dma_pages(odp, user_va, bcnt, access_mask, + current_seq); + if (np < 0) + return np; mutex_lock(&odp->umem_mutex); - if (!ib_umem_mmu_notifier_retry(odp, current_seq)) { + if (!mmu_interval_read_retry(&odp->notifier, current_seq)) { /* * No need to check whether the MTTs really belong to * this MR, since ib_umem_odp_map_dma_pages already @@ -645,53 +657,127 @@ next_mr: if (ret < 0) { if (ret != -EAGAIN) - mlx5_ib_err(dev, "Failed to update mkey page tables\n"); + mlx5_ib_err(mr->dev, + "Failed to update mkey page tables\n"); goto out; } if (bytes_mapped) { u32 new_mappings = (np << page_shift) - - (io_virt - round_down(io_virt, 1 << page_shift)); - *bytes_mapped += min_t(u32, new_mappings, size); + (user_va - round_down(user_va, 1 << page_shift)); + + *bytes_mapped += min_t(u32, new_mappings, bcnt); } - npages += np << (page_shift - PAGE_SHIFT); - bcnt -= size; + return np << (page_shift - PAGE_SHIFT); - if (unlikely(bcnt)) { - struct ib_umem_odp *next; +out: + return ret; +} - io_virt += size; - next = odp_next(odp); - if (unlikely(!next || ib_umem_start(next) != io_virt)) { - mlx5_ib_dbg(dev, "next implicit leaf removed at 0x%llx. got %p\n", - io_virt, next); - return -EAGAIN; +static int pagefault_implicit_mr(struct mlx5_ib_mr *imr, + struct ib_umem_odp *odp_imr, u64 user_va, + size_t bcnt, u32 *bytes_mapped, u32 flags) +{ + unsigned long end_idx = (user_va + bcnt - 1) >> MLX5_IMR_MTT_SHIFT; + unsigned long upd_start_idx = end_idx + 1; + unsigned long upd_len = 0; + unsigned long npages = 0; + int err; + int ret; + + if (unlikely(user_va >= mlx5_imr_ksm_entries * MLX5_IMR_MTT_SIZE || + mlx5_imr_ksm_entries * MLX5_IMR_MTT_SIZE - user_va < bcnt)) + return -EFAULT; + + /* Fault each child mr that intersects with our interval. */ + while (bcnt) { + unsigned long idx = user_va >> MLX5_IMR_MTT_SHIFT; + struct ib_umem_odp *umem_odp; + struct mlx5_ib_mr *mtt; + u64 len; + + mtt = xa_load(&imr->implicit_children, idx); + if (unlikely(!mtt)) { + mtt = implicit_get_child_mr(imr, idx); + if (IS_ERR(mtt)) { + ret = PTR_ERR(mtt); + goto out; + } + upd_start_idx = min(upd_start_idx, idx); + upd_len = idx - upd_start_idx + 1; } - odp = next; - mr = odp->private; - goto next_mr; + + umem_odp = to_ib_umem_odp(mtt->umem); + len = min_t(u64, user_va + bcnt, ib_umem_end(umem_odp)) - + user_va; + + ret = pagefault_real_mr(mtt, umem_odp, user_va, len, + bytes_mapped, flags); + if (ret < 0) + goto out; + user_va += len; + bcnt -= len; + npages += ret; } - return npages; + ret = npages; + /* + * Any time the implicit_children are changed we must perform an + * update of the xlt before exiting to ensure the HW and the + * implicit_children remains synchronized. + */ out: - if (ret == -EAGAIN) { - unsigned long timeout = msecs_to_jiffies(MMU_NOTIFIER_TIMEOUT); - - if (!wait_for_completion_timeout(&odp->notifier_completion, - timeout)) { - mlx5_ib_warn( - dev, - "timeout waiting for mmu notifier. seq %d against %d. notifiers_count=%d\n", - current_seq, odp->notifiers_seq, - odp->notifiers_count); - } - } + if (likely(!upd_len)) + return ret; + /* + * Notice this is not strictly ordered right, the KSM is updated after + * the implicit_children is updated, so a parallel page fault could + * see a MR that is not yet visible in the KSM. This is similar to a + * parallel page fault seeing a MR that is being concurrently removed + * from the KSM. Both of these improbable situations are resolved + * safely by resuming the HW and then taking another page fault. The + * next pagefault handler will see the new information. + */ + mutex_lock(&odp_imr->umem_mutex); + err = mlx5_ib_update_xlt(imr, upd_start_idx, upd_len, 0, + MLX5_IB_UPD_XLT_INDIRECT | + MLX5_IB_UPD_XLT_ATOMIC); + mutex_unlock(&odp_imr->umem_mutex); + if (err) { + mlx5_ib_err(imr->dev, "Failed to update PAS\n"); + return err; + } return ret; } +/* + * Returns: + * -EFAULT: The io_virt->bcnt is not within the MR, it covers pages that are + * not accessible, or the MR is no longer valid. + * -EAGAIN/-ENOMEM: The operation should be retried + * + * -EINVAL/others: General internal malfunction + * >0: Number of pages mapped + */ +static int pagefault_mr(struct mlx5_ib_mr *mr, u64 io_virt, size_t bcnt, + u32 *bytes_mapped, u32 flags) +{ + struct ib_umem_odp *odp = to_ib_umem_odp(mr->umem); + + if (!odp->is_implicit_odp) { + if (unlikely(io_virt < ib_umem_start(odp) || + ib_umem_end(odp) - io_virt < bcnt)) + return -EFAULT; + return pagefault_real_mr(mr, odp, io_virt, bcnt, bytes_mapped, + flags); + } + return pagefault_implicit_mr(mr, odp, io_virt, bcnt, bytes_mapped, + flags); +} + struct pf_frame { struct pf_frame *next; u32 key; @@ -739,10 +825,9 @@ static int pagefault_single_data_segment(struct mlx5_ib_dev *dev, struct ib_pd *pd, u32 key, u64 io_virt, size_t bcnt, u32 *bytes_committed, - u32 *bytes_mapped, u32 flags) + u32 *bytes_mapped) { int npages = 0, srcu_key, ret, i, outlen, cur_outlen = 0, depth = 0; - bool prefetch = flags & MLX5_PF_FLAGS_PREFETCH; struct pf_frame *head = NULL, *frame; struct mlx5_core_mkey *mmkey; struct mlx5_ib_mr *mr; @@ -751,58 +836,49 @@ static int pagefault_single_data_segment(struct mlx5_ib_dev *dev, size_t offset; int ndescs; - srcu_key = srcu_read_lock(&dev->mr_srcu); + srcu_key = srcu_read_lock(&dev->odp_srcu); io_virt += *bytes_committed; bcnt -= *bytes_committed; next_mr: - mmkey = xa_load(&dev->mdev->priv.mkey_table, mlx5_base_mkey(key)); + mmkey = xa_load(&dev->odp_mkeys, mlx5_base_mkey(key)); + if (!mmkey) { + mlx5_ib_dbg( + dev, + "skipping non ODP MR (lkey=0x%06x) in page fault handler.\n", + key); + if (bytes_mapped) + *bytes_mapped += bcnt; + /* + * The user could specify a SGL with multiple lkeys and only + * some of them are ODP. Treat the non-ODP ones as fully + * faulted. + */ + ret = 0; + goto srcu_unlock; + } if (!mkey_is_eq(mmkey, key)) { mlx5_ib_dbg(dev, "failed to find mkey %x\n", key); ret = -EFAULT; goto srcu_unlock; } - if (prefetch && mmkey->type != MLX5_MKEY_MR) { - mlx5_ib_dbg(dev, "prefetch is allowed only for MR\n"); - ret = -EINVAL; - goto srcu_unlock; - } - switch (mmkey->type) { case MLX5_MKEY_MR: mr = container_of(mmkey, struct mlx5_ib_mr, mmkey); - if (!mr->live || !mr->ibmr.pd) { - mlx5_ib_dbg(dev, "got dead MR\n"); - ret = -EFAULT; - goto srcu_unlock; - } - if (prefetch) { - if (!is_odp_mr(mr) || - mr->ibmr.pd != pd) { - mlx5_ib_dbg(dev, "Invalid prefetch request: %s\n", - is_odp_mr(mr) ? "MR is not ODP" : - "PD is not of the MR"); - ret = -EINVAL; - goto srcu_unlock; - } - } - - if (!is_odp_mr(mr)) { - mlx5_ib_dbg(dev, "skipping non ODP MR (lkey=0x%06x) in page fault handler.\n", - key); - if (bytes_mapped) - *bytes_mapped += bcnt; - ret = 0; - goto srcu_unlock; - } - - ret = pagefault_mr(dev, mr, io_virt, bcnt, bytes_mapped, flags); + ret = pagefault_mr(mr, io_virt, bcnt, bytes_mapped, 0); if (ret < 0) goto srcu_unlock; + /* + * When prefetching a page, page fault is generated + * in order to bring the page to the main memory. + * In the current flow, page faults are being counted. + */ + mlx5_update_odp_stats(mr, faults, ret); + npages += ret; ret = 0; break; @@ -892,7 +968,7 @@ srcu_unlock: } kfree(out); - srcu_read_unlock(&dev->mr_srcu, srcu_key); + srcu_read_unlock(&dev->odp_srcu, srcu_key); *bytes_committed = 0; return ret ? ret : npages; } @@ -973,7 +1049,7 @@ static int pagefault_data_segments(struct mlx5_ib_dev *dev, ret = pagefault_single_data_segment(dev, NULL, key, io_virt, bcnt, &pfault->bytes_committed, - bytes_mapped, 0); + bytes_mapped); if (ret < 0) break; npages += ret; @@ -1256,8 +1332,7 @@ static void mlx5_ib_mr_rdma_pfault_handler(struct mlx5_ib_dev *dev, } ret = pagefault_single_data_segment(dev, NULL, rkey, address, length, - &pfault->bytes_committed, NULL, - 0); + &pfault->bytes_committed, NULL); if (ret == -EAGAIN) { /* We're racing with an invalidation, don't prefetch */ prefetch_activated = 0; @@ -1284,8 +1359,7 @@ static void mlx5_ib_mr_rdma_pfault_handler(struct mlx5_ib_dev *dev, ret = pagefault_single_data_segment(dev, NULL, rkey, address, prefetch_len, - &bytes_committed, NULL, - 0); + &bytes_committed, NULL); if (ret < 0 && ret != -EAGAIN) { mlx5_ib_dbg(dev, "Prefetch failed. ret: %d, QP 0x%x, address: 0x%.16llx, length = 0x%.16x\n", ret, pfault->token, address, prefetch_len); @@ -1545,7 +1619,6 @@ void mlx5_odp_init_mr_cache_entry(struct mlx5_cache_ent *ent) static const struct ib_device_ops mlx5_ib_dev_odp_ops = { .advise_mr = mlx5_ib_advise_mr, - .invalidate_range = mlx5_ib_invalidate_range, }; int mlx5_ib_odp_init_one(struct mlx5_ib_dev *dev) @@ -1588,114 +1661,128 @@ int mlx5_ib_odp_init(void) struct prefetch_mr_work { struct work_struct work; - struct ib_pd *pd; u32 pf_flags; u32 num_sge; - struct ib_sge sg_list[0]; + struct { + u64 io_virt; + struct mlx5_ib_mr *mr; + size_t length; + } frags[]; }; -static void num_pending_prefetch_dec(struct mlx5_ib_dev *dev, - struct ib_sge *sg_list, u32 num_sge, - u32 from) +static void destroy_prefetch_work(struct prefetch_mr_work *work) { u32 i; - int srcu_key; - - srcu_key = srcu_read_lock(&dev->mr_srcu); - - for (i = from; i < num_sge; ++i) { - struct mlx5_core_mkey *mmkey; - struct mlx5_ib_mr *mr; - mmkey = xa_load(&dev->mdev->priv.mkey_table, - mlx5_base_mkey(sg_list[i].lkey)); - mr = container_of(mmkey, struct mlx5_ib_mr, mmkey); - atomic_dec(&mr->num_pending_prefetch); - } - - srcu_read_unlock(&dev->mr_srcu, srcu_key); + for (i = 0; i < work->num_sge; ++i) + atomic_dec(&work->frags[i].mr->num_deferred_work); + kvfree(work); } -static bool num_pending_prefetch_inc(struct ib_pd *pd, - struct ib_sge *sg_list, u32 num_sge) +static struct mlx5_ib_mr * +get_prefetchable_mr(struct ib_pd *pd, enum ib_uverbs_advise_mr_advice advice, + u32 lkey) { struct mlx5_ib_dev *dev = to_mdev(pd->device); - bool ret = true; - u32 i; + struct mlx5_core_mkey *mmkey; + struct ib_umem_odp *odp; + struct mlx5_ib_mr *mr; - for (i = 0; i < num_sge; ++i) { - struct mlx5_core_mkey *mmkey; - struct mlx5_ib_mr *mr; + lockdep_assert_held(&dev->odp_srcu); - mmkey = xa_load(&dev->mdev->priv.mkey_table, - mlx5_base_mkey(sg_list[i].lkey)); - if (!mmkey || mmkey->key != sg_list[i].lkey) { - ret = false; - break; - } + mmkey = xa_load(&dev->odp_mkeys, mlx5_base_mkey(lkey)); + if (!mmkey || mmkey->key != lkey || mmkey->type != MLX5_MKEY_MR) + return NULL; - if (mmkey->type != MLX5_MKEY_MR) { - ret = false; - break; - } + mr = container_of(mmkey, struct mlx5_ib_mr, mmkey); - mr = container_of(mmkey, struct mlx5_ib_mr, mmkey); + if (mr->ibmr.pd != pd) + return NULL; - if (mr->ibmr.pd != pd) { - ret = false; - break; - } + odp = to_ib_umem_odp(mr->umem); - if (!mr->live) { - ret = false; - break; - } + /* prefetch with write-access must be supported by the MR */ + if (advice == IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH_WRITE && + !odp->umem.writable) + return NULL; - atomic_inc(&mr->num_pending_prefetch); - } + return mr; +} - if (!ret) - num_pending_prefetch_dec(dev, sg_list, i, 0); +static void mlx5_ib_prefetch_mr_work(struct work_struct *w) +{ + struct prefetch_mr_work *work = + container_of(w, struct prefetch_mr_work, work); + u32 bytes_mapped = 0; + u32 i; - return ret; + for (i = 0; i < work->num_sge; ++i) + pagefault_mr(work->frags[i].mr, work->frags[i].io_virt, + work->frags[i].length, &bytes_mapped, + work->pf_flags); + + destroy_prefetch_work(work); } -static int mlx5_ib_prefetch_sg_list(struct ib_pd *pd, u32 pf_flags, - struct ib_sge *sg_list, u32 num_sge) +static bool init_prefetch_work(struct ib_pd *pd, + enum ib_uverbs_advise_mr_advice advice, + u32 pf_flags, struct prefetch_mr_work *work, + struct ib_sge *sg_list, u32 num_sge) { u32 i; - int ret = 0; - struct mlx5_ib_dev *dev = to_mdev(pd->device); + + INIT_WORK(&work->work, mlx5_ib_prefetch_mr_work); + work->pf_flags = pf_flags; for (i = 0; i < num_sge; ++i) { - struct ib_sge *sg = &sg_list[i]; - int bytes_committed = 0; + work->frags[i].io_virt = sg_list[i].addr; + work->frags[i].length = sg_list[i].length; + work->frags[i].mr = + get_prefetchable_mr(pd, advice, sg_list[i].lkey); + if (!work->frags[i].mr) { + work->num_sge = i - 1; + if (i) + destroy_prefetch_work(work); + return false; + } - ret = pagefault_single_data_segment(dev, pd, sg->lkey, sg->addr, - sg->length, - &bytes_committed, NULL, - pf_flags); - if (ret < 0) - break; + /* Keep the MR pointer will valid outside the SRCU */ + atomic_inc(&work->frags[i].mr->num_deferred_work); } - - return ret < 0 ? ret : 0; + work->num_sge = num_sge; + return true; } -static void mlx5_ib_prefetch_mr_work(struct work_struct *work) +static int mlx5_ib_prefetch_sg_list(struct ib_pd *pd, + enum ib_uverbs_advise_mr_advice advice, + u32 pf_flags, struct ib_sge *sg_list, + u32 num_sge) { - struct prefetch_mr_work *w = - container_of(work, struct prefetch_mr_work, work); + struct mlx5_ib_dev *dev = to_mdev(pd->device); + u32 bytes_mapped = 0; + int srcu_key; + int ret = 0; + u32 i; + + srcu_key = srcu_read_lock(&dev->odp_srcu); + for (i = 0; i < num_sge; ++i) { + struct mlx5_ib_mr *mr; - if (ib_device_try_get(w->pd->device)) { - mlx5_ib_prefetch_sg_list(w->pd, w->pf_flags, w->sg_list, - w->num_sge); - ib_device_put(w->pd->device); + mr = get_prefetchable_mr(pd, advice, sg_list[i].lkey); + if (!mr) { + ret = -ENOENT; + goto out; + } + ret = pagefault_mr(mr, sg_list[i].addr, sg_list[i].length, + &bytes_mapped, pf_flags); + if (ret < 0) + goto out; } + ret = 0; - num_pending_prefetch_dec(to_mdev(w->pd->device), w->sg_list, - w->num_sge, 0); - kvfree(w); +out: + srcu_read_unlock(&dev->odp_srcu, srcu_key); + return ret; } int mlx5_ib_advise_mr_prefetch(struct ib_pd *pd, @@ -1703,43 +1790,27 @@ int mlx5_ib_advise_mr_prefetch(struct ib_pd *pd, u32 flags, struct ib_sge *sg_list, u32 num_sge) { struct mlx5_ib_dev *dev = to_mdev(pd->device); - u32 pf_flags = MLX5_PF_FLAGS_PREFETCH; + u32 pf_flags = 0; struct prefetch_mr_work *work; - bool valid_req; int srcu_key; if (advice == IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH) pf_flags |= MLX5_PF_FLAGS_DOWNGRADE; if (flags & IB_UVERBS_ADVISE_MR_FLAG_FLUSH) - return mlx5_ib_prefetch_sg_list(pd, pf_flags, sg_list, + return mlx5_ib_prefetch_sg_list(pd, advice, pf_flags, sg_list, num_sge); - work = kvzalloc(struct_size(work, sg_list, num_sge), GFP_KERNEL); + work = kvzalloc(struct_size(work, frags, num_sge), GFP_KERNEL); if (!work) return -ENOMEM; - memcpy(work->sg_list, sg_list, num_sge * sizeof(struct ib_sge)); - - /* It is guaranteed that the pd when work is executed is the pd when - * work was queued since pd can't be destroyed while it holds MRs and - * destroying a MR leads to flushing the workquque - */ - work->pd = pd; - work->pf_flags = pf_flags; - work->num_sge = num_sge; - - INIT_WORK(&work->work, mlx5_ib_prefetch_mr_work); - - srcu_key = srcu_read_lock(&dev->mr_srcu); - - valid_req = num_pending_prefetch_inc(pd, sg_list, num_sge); - if (valid_req) - queue_work(system_unbound_wq, &work->work); - else - kvfree(work); - - srcu_read_unlock(&dev->mr_srcu, srcu_key); - - return valid_req ? 0 : -EINVAL; + srcu_key = srcu_read_lock(&dev->odp_srcu); + if (!init_prefetch_work(pd, advice, pf_flags, work, sg_list, num_sge)) { + srcu_read_unlock(&dev->odp_srcu, srcu_key); + return -EINVAL; + } + queue_work(system_unbound_wq, &work->work); + srcu_read_unlock(&dev->odp_srcu, srcu_key); + return 0; } |