diff options
48 files changed, 3318 insertions, 317 deletions
diff --git a/drivers/infiniband/Kconfig b/drivers/infiniband/Kconfig index 77089399359b..b899531498eb 100644 --- a/drivers/infiniband/Kconfig +++ b/drivers/infiniband/Kconfig @@ -38,6 +38,17 @@ config INFINIBAND_USER_MEM depends on INFINIBAND_USER_ACCESS != n default y +config INFINIBAND_ON_DEMAND_PAGING + bool "InfiniBand on-demand paging support" + depends on INFINIBAND_USER_MEM + select MMU_NOTIFIER + default y + ---help--- + On demand paging support for the InfiniBand subsystem. + Together with driver support this allows registration of + memory regions without pinning their pages, fetching the + pages on demand instead. + config INFINIBAND_ADDR_TRANS bool depends on INFINIBAND diff --git a/drivers/infiniband/core/Makefile b/drivers/infiniband/core/Makefile index ffd0af6734af..acf736764445 100644 --- a/drivers/infiniband/core/Makefile +++ b/drivers/infiniband/core/Makefile @@ -11,6 +11,7 @@ obj-$(CONFIG_INFINIBAND_USER_ACCESS) += ib_uverbs.o ib_ucm.o \ ib_core-y := packer.o ud_header.o verbs.o sysfs.o \ device.o fmr_pool.o cache.o netlink.o ib_core-$(CONFIG_INFINIBAND_USER_MEM) += umem.o +ib_core-$(CONFIG_INFINIBAND_ON_DEMAND_PAGING) += umem_odp.o umem_rbtree.o ib_mad-y := mad.o smi.o agent.o mad_rmpp.o diff --git a/drivers/infiniband/core/addr.c b/drivers/infiniband/core/addr.c index 8172d37f9add..f80da50d84a5 100644 --- a/drivers/infiniband/core/addr.c +++ b/drivers/infiniband/core/addr.c @@ -176,8 +176,8 @@ static void set_timeout(unsigned long time) unsigned long delay; delay = time - jiffies; - if ((long)delay <= 0) - delay = 1; + if ((long)delay < 0) + delay = 0; mod_delayed_work(addr_wq, &work, delay); } diff --git a/drivers/infiniband/core/multicast.c b/drivers/infiniband/core/multicast.c index d2360a8ef0b2..fa17b552ff78 100644 --- a/drivers/infiniband/core/multicast.c +++ b/drivers/infiniband/core/multicast.c @@ -525,17 +525,22 @@ static void join_handler(int status, struct ib_sa_mcmember_rec *rec, if (status) process_join_error(group, status); else { + int mgids_changed, is_mgid0; ib_find_pkey(group->port->dev->device, group->port->port_num, be16_to_cpu(rec->pkey), &pkey_index); spin_lock_irq(&group->port->lock); - group->rec = *rec; if (group->state == MCAST_BUSY && group->pkey_index == MCAST_INVALID_PKEY_INDEX) group->pkey_index = pkey_index; - if (!memcmp(&mgid0, &group->rec.mgid, sizeof mgid0)) { + mgids_changed = memcmp(&rec->mgid, &group->rec.mgid, + sizeof(group->rec.mgid)); + group->rec = *rec; + if (mgids_changed) { rb_erase(&group->node, &group->port->table); - mcast_insert(group->port, group, 1); + is_mgid0 = !memcmp(&mgid0, &group->rec.mgid, + sizeof(mgid0)); + mcast_insert(group->port, group, is_mgid0); } spin_unlock_irq(&group->port->lock); } diff --git a/drivers/infiniband/core/umem.c b/drivers/infiniband/core/umem.c index df0c4f605a21..aec7a6aa2951 100644 --- a/drivers/infiniband/core/umem.c +++ b/drivers/infiniband/core/umem.c @@ -39,6 +39,7 @@ #include <linux/hugetlb.h> #include <linux/dma-attrs.h> #include <linux/slab.h> +#include <rdma/ib_umem_odp.h> #include "uverbs.h" @@ -69,6 +70,10 @@ static void __ib_umem_release(struct ib_device *dev, struct ib_umem *umem, int d /** * ib_umem_get - Pin and DMA map userspace memory. + * + * If access flags indicate ODP memory, avoid pinning. Instead, stores + * the mm for future page fault handling in conjunction with MMU notifiers. + * * @context: userspace context to pin memory for * @addr: userspace virtual address to start at * @size: length of region to pin @@ -103,17 +108,30 @@ struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr, umem->context = context; umem->length = size; - umem->offset = addr & ~PAGE_MASK; + umem->address = addr; umem->page_size = PAGE_SIZE; umem->pid = get_task_pid(current, PIDTYPE_PID); /* - * We ask for writable memory if any access flags other than - * "remote read" are set. "Local write" and "remote write" + * We ask for writable memory if any of the following + * access flags are set. "Local write" and "remote write" * obviously require write access. "Remote atomic" can do * things like fetch and add, which will modify memory, and * "MW bind" can change permissions by binding a window. */ - umem->writable = !!(access & ~IB_ACCESS_REMOTE_READ); + umem->writable = !!(access & + (IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_WRITE | + IB_ACCESS_REMOTE_ATOMIC | IB_ACCESS_MW_BIND)); + + if (access & IB_ACCESS_ON_DEMAND) { + ret = ib_umem_odp_get(context, umem); + if (ret) { + kfree(umem); + return ERR_PTR(ret); + } + return umem; + } + + umem->odp_data = NULL; /* We assume the memory is from hugetlb until proved otherwise */ umem->hugetlb = 1; @@ -132,7 +150,7 @@ struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr, if (!vma_list) umem->hugetlb = 0; - npages = PAGE_ALIGN(size + umem->offset) >> PAGE_SHIFT; + npages = ib_umem_num_pages(umem); down_write(¤t->mm->mmap_sem); @@ -235,6 +253,11 @@ void ib_umem_release(struct ib_umem *umem) struct task_struct *task; unsigned long diff; + if (umem->odp_data) { + ib_umem_odp_release(umem); + return; + } + __ib_umem_release(umem->context->device, umem, 1); task = get_pid_task(umem->pid, PIDTYPE_PID); @@ -246,7 +269,7 @@ void ib_umem_release(struct ib_umem *umem) if (!mm) goto out; - diff = PAGE_ALIGN(umem->length + umem->offset) >> PAGE_SHIFT; + diff = ib_umem_num_pages(umem); /* * We may be called with the mm's mmap_sem already held. This @@ -283,6 +306,9 @@ int ib_umem_page_count(struct ib_umem *umem) int n; struct scatterlist *sg; + if (umem->odp_data) + return ib_umem_num_pages(umem); + shift = ilog2(umem->page_size); n = 0; @@ -292,3 +318,37 @@ int ib_umem_page_count(struct ib_umem *umem) return n; } EXPORT_SYMBOL(ib_umem_page_count); + +/* + * Copy from the given ib_umem's pages to the given buffer. + * + * umem - the umem to copy from + * offset - offset to start copying from + * dst - destination buffer + * length - buffer length + * + * Returns 0 on success, or an error code. + */ +int ib_umem_copy_from(void *dst, struct ib_umem *umem, size_t offset, + size_t length) +{ + size_t end = offset + length; + int ret; + + if (offset > umem->length || length > umem->length - offset) { + pr_err("ib_umem_copy_from not in range. offset: %zd umem length: %zd end: %zd\n", + offset, umem->length, end); + return -EINVAL; + } + + ret = sg_pcopy_to_buffer(umem->sg_head.sgl, umem->nmap, dst, length, + offset + ib_umem_offset(umem)); + + if (ret < 0) + return ret; + else if (ret != length) + return -EINVAL; + else + return 0; +} +EXPORT_SYMBOL(ib_umem_copy_from); diff --git a/drivers/infiniband/core/umem_odp.c b/drivers/infiniband/core/umem_odp.c new file mode 100644 index 000000000000..6095872549e7 --- /dev/null +++ b/drivers/infiniband/core/umem_odp.c @@ -0,0 +1,668 @@ +/* + * Copyright (c) 2014 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include <linux/types.h> +#include <linux/sched.h> +#include <linux/pid.h> +#include <linux/slab.h> +#include <linux/export.h> +#include <linux/vmalloc.h> + +#include <rdma/ib_verbs.h> +#include <rdma/ib_umem.h> +#include <rdma/ib_umem_odp.h> + +static void ib_umem_notifier_start_account(struct ib_umem *item) +{ + mutex_lock(&item->odp_data->umem_mutex); + + /* Only update private counters for this umem if it has them. + * Otherwise skip it. All page faults will be delayed for this umem. */ + if (item->odp_data->mn_counters_active) { + int notifiers_count = item->odp_data->notifiers_count++; + + if (notifiers_count == 0) + /* Initialize the completion object for waiting on + * notifiers. Since notifier_count is zero, no one + * should be waiting right now. */ + reinit_completion(&item->odp_data->notifier_completion); + } + mutex_unlock(&item->odp_data->umem_mutex); +} + +static void ib_umem_notifier_end_account(struct ib_umem *item) +{ + mutex_lock(&item->odp_data->umem_mutex); + + /* Only update private counters for this umem if it has them. + * Otherwise skip it. All page faults will be delayed for this umem. */ + if (item->odp_data->mn_counters_active) { + /* + * This sequence increase will notify the QP page fault that + * the page that is going to be mapped in the spte could have + * been freed. + */ + ++item->odp_data->notifiers_seq; + if (--item->odp_data->notifiers_count == 0) + complete_all(&item->odp_data->notifier_completion); + } + mutex_unlock(&item->odp_data->umem_mutex); +} + +/* Account for a new mmu notifier in an ib_ucontext. */ +static void ib_ucontext_notifier_start_account(struct ib_ucontext *context) +{ + atomic_inc(&context->notifier_count); +} + +/* Account for a terminating mmu notifier in an ib_ucontext. + * + * Must be called with the ib_ucontext->umem_rwsem semaphore unlocked, since + * the function takes the semaphore itself. */ +static void ib_ucontext_notifier_end_account(struct ib_ucontext *context) +{ + int zero_notifiers = atomic_dec_and_test(&context->notifier_count); + + if (zero_notifiers && + !list_empty(&context->no_private_counters)) { + /* No currently running mmu notifiers. Now is the chance to + * add private accounting to all previously added umems. */ + struct ib_umem_odp *odp_data, *next; + + /* Prevent concurrent mmu notifiers from working on the + * no_private_counters list. */ + down_write(&context->umem_rwsem); + + /* Read the notifier_count again, with the umem_rwsem + * semaphore taken for write. */ + if (!atomic_read(&context->notifier_count)) { + list_for_each_entry_safe(odp_data, next, + &context->no_private_counters, + no_private_counters) { + mutex_lock(&odp_data->umem_mutex); + odp_data->mn_counters_active = true; + list_del(&odp_data->no_private_counters); + complete_all(&odp_data->notifier_completion); + mutex_unlock(&odp_data->umem_mutex); + } + } + + up_write(&context->umem_rwsem); + } +} + +static int ib_umem_notifier_release_trampoline(struct ib_umem *item, u64 start, + u64 end, void *cookie) { + /* + * Increase the number of notifiers running, to + * prevent any further fault handling on this MR. + */ + ib_umem_notifier_start_account(item); + item->odp_data->dying = 1; + /* Make sure that the fact the umem is dying is out before we release + * all pending page faults. */ + smp_wmb(); + complete_all(&item->odp_data->notifier_completion); + item->context->invalidate_range(item, ib_umem_start(item), + ib_umem_end(item)); + return 0; +} + +static void ib_umem_notifier_release(struct mmu_notifier *mn, + struct mm_struct *mm) +{ + struct ib_ucontext *context = container_of(mn, struct ib_ucontext, mn); + + if (!context->invalidate_range) + return; + + ib_ucontext_notifier_start_account(context); + down_read(&context->umem_rwsem); + rbt_ib_umem_for_each_in_range(&context->umem_tree, 0, + ULLONG_MAX, + ib_umem_notifier_release_trampoline, + NULL); + up_read(&context->umem_rwsem); +} + +static int invalidate_page_trampoline(struct ib_umem *item, u64 start, + u64 end, void *cookie) +{ + ib_umem_notifier_start_account(item); + item->context->invalidate_range(item, start, start + PAGE_SIZE); + ib_umem_notifier_end_account(item); + return 0; +} + +static void ib_umem_notifier_invalidate_page(struct mmu_notifier *mn, + struct mm_struct *mm, + unsigned long address) +{ + struct ib_ucontext *context = container_of(mn, struct ib_ucontext, mn); + + if (!context->invalidate_range) + return; + + ib_ucontext_notifier_start_account(context); + down_read(&context->umem_rwsem); + rbt_ib_umem_for_each_in_range(&context->umem_tree, address, + address + PAGE_SIZE, + invalidate_page_trampoline, NULL); + up_read(&context->umem_rwsem); + ib_ucontext_notifier_end_account(context); +} + +static int invalidate_range_start_trampoline(struct ib_umem *item, u64 start, + u64 end, void *cookie) +{ + ib_umem_notifier_start_account(item); + item->context->invalidate_range(item, start, end); + return 0; +} + +static void ib_umem_notifier_invalidate_range_start(struct mmu_notifier *mn, + struct mm_struct *mm, + unsigned long start, + unsigned long end) +{ + struct ib_ucontext *context = container_of(mn, struct ib_ucontext, mn); + + if (!context->invalidate_range) + return; + + ib_ucontext_notifier_start_account(context); + down_read(&context->umem_rwsem); + rbt_ib_umem_for_each_in_range(&context->umem_tree, start, + end, + invalidate_range_start_trampoline, NULL); + up_read(&context->umem_rwsem); +} + +static int invalidate_range_end_trampoline(struct ib_umem *item, u64 start, + u64 end, void *cookie) +{ + ib_umem_notifier_end_account(item); + return 0; +} + +static void ib_umem_notifier_invalidate_range_end(struct mmu_notifier *mn, + struct mm_struct *mm, + unsigned long start, + unsigned long end) +{ + struct ib_ucontext *context = container_of(mn, struct ib_ucontext, mn); + + if (!context->invalidate_range) + return; + + down_read(&context->umem_rwsem); + rbt_ib_umem_for_each_in_range(&context->umem_tree, start, + end, + invalidate_range_end_trampoline, NULL); + up_read(&context->umem_rwsem); + ib_ucontext_notifier_end_account(context); +} + +static struct mmu_notifier_ops ib_umem_notifiers = { + .release = ib_umem_notifier_release, + .invalidate_page = ib_umem_notifier_invalidate_page, + .invalidate_range_start = ib_umem_notifier_invalidate_range_start, + .invalidate_range_end = ib_umem_notifier_invalidate_range_end, +}; + +int ib_umem_odp_get(struct ib_ucontext *context, struct ib_umem *umem) +{ + int ret_val; + struct pid *our_pid; + struct mm_struct *mm = get_task_mm(current); + + if (!mm) + return -EINVAL; + + /* Prevent creating ODP MRs in child processes */ + rcu_read_lock(); + our_pid = get_task_pid(current->group_leader, PIDTYPE_PID); + rcu_read_unlock(); + put_pid(our_pid); + if (context->tgid != our_pid) { + ret_val = -EINVAL; + goto out_mm; + } + + umem->hugetlb = 0; + umem->odp_data = kzalloc(sizeof(*umem->odp_data), GFP_KERNEL); + if (!umem->odp_data) { + ret_val = -ENOMEM; + goto out_mm; + } + umem->odp_data->umem = umem; + + mutex_init(&umem->odp_data->umem_mutex); + + init_completion(&umem->odp_data->notifier_completion); + + umem->odp_data->page_list = vzalloc(ib_umem_num_pages(umem) * + sizeof(*umem->odp_data->page_list)); + if (!umem->odp_data->page_list) { + ret_val = -ENOMEM; + goto out_odp_data; + } + + umem->odp_data->dma_list = vzalloc(ib_umem_num_pages(umem) * + sizeof(*umem->odp_data->dma_list)); + if (!umem->odp_data->dma_list) { + ret_val = -ENOMEM; + goto out_page_list; + } + + /* + * When using MMU notifiers, we will get a + * notification before the "current" task (and MM) is + * destroyed. We use the umem_rwsem semaphore to synchronize. + */ + down_write(&context->umem_rwsem); + context->odp_mrs_count++; + if (likely(ib_umem_start(umem) != ib_umem_end(umem))) + rbt_ib_umem_insert(&umem->odp_data->interval_tree, + &context->umem_tree); + if (likely(!atomic_read(&context->notifier_count))) + umem->odp_data->mn_counters_active = true; + else + list_add(&umem->odp_data->no_private_counters, + &context->no_private_counters); + downgrade_write(&context->umem_rwsem); + + if (context->odp_mrs_count == 1) { + /* + * Note that at this point, no MMU notifier is running + * for this context! + */ + atomic_set(&context->notifier_count, 0); + INIT_HLIST_NODE(&context->mn.hlist); + context->mn.ops = &ib_umem_notifiers; + /* + * Lock-dep detects a false positive for mmap_sem vs. + * umem_rwsem, due to not grasping downgrade_write correctly. + */ + lockdep_off(); + ret_val = mmu_notifier_register(&context->mn, mm); + lockdep_on(); + if (ret_val) { + pr_err("Failed to register mmu_notifier %d\n", ret_val); + ret_val = -EBUSY; + goto out_mutex; + } + } + + up_read(&context->umem_rwsem); + + /* + * Note that doing an mmput can cause a notifier for the relevant mm. + * If the notifier is called while we hold the umem_rwsem, this will + * cause a deadlock. Therefore, we release the reference only after we + * released the semaphore. + */ + mmput(mm); + return 0; + +out_mutex: + up_read(&context->umem_rwsem); + vfree(umem->odp_data->dma_list); +out_page_list: + vfree(umem->odp_data->page_list); +out_odp_data: + kfree(umem->odp_data); +out_mm: + mmput(mm); + return ret_val; +} + +void ib_umem_odp_release(struct ib_umem *umem) +{ + struct ib_ucontext *context = umem->context; + + /* + * Ensure that no more pages are mapped in the umem. + * + * It is the driver's responsibility to ensure, before calling us, + * that the hardware will not attempt to access the MR any more. + */ + ib_umem_odp_unmap_dma_pages(umem, ib_umem_start(umem), + ib_umem_end(umem)); + + down_write(&context->umem_rwsem); + if (likely(ib_umem_start(umem) != ib_umem_end(umem))) + rbt_ib_umem_remove(&umem->odp_data->interval_tree, + &context->umem_tree); + context->odp_mrs_count--; + if (!umem->odp_data->mn_counters_active) { + list_del(&umem->odp_data->no_private_counters); + complete_all(&umem->odp_data->notifier_completion); + } + + /* + * Downgrade the lock to a read lock. This ensures that the notifiers + * (who lock the mutex for reading) will be able to finish, and we + * will be able to enventually obtain the mmu notifiers SRCU. Note + * that since we are doing it atomically, no other user could register + * and unregister while we do the check. + */ + downgrade_write(&context->umem_rwsem); + if (!context->odp_mrs_count) { + struct task_struct *owning_process = NULL; + struct mm_struct *owning_mm = NULL; + + owning_process = get_pid_task(context->tgid, + PIDTYPE_PID); + if (owning_process == NULL) + /* + * The process is already dead, notifier were removed + * already. + */ + goto out; + + owning_mm = get_task_mm(owning_process); + if (owning_mm == NULL) + /* + * The process' mm is already dead, notifier were + * removed already. + */ + goto out_put_task; + mmu_notifier_unregister(&context->mn, owning_mm); + + mmput(owning_mm); + +out_put_task: + put_task_struct(owning_process); + } +out: + up_read(&context->umem_rwsem); + + vfree(umem->odp_data->dma_list); + vfree(umem->odp_data->page_list); + kfree(umem->odp_data); + kfree(umem); +} + +/* + * Map for DMA and insert a single page into the on-demand paging page tables. + * + * @umem: the umem to insert the page to. + * @page_index: index in the umem to add the page to. + * @page: the page struct to map and add. + * @access_mask: access permissions needed for this page. + * @current_seq: sequence number for synchronization with invalidations. + * the sequence number is taken from + * umem->odp_data->notifiers_seq. + * + * The function returns -EFAULT if the DMA mapping operation fails. It returns + * -EAGAIN if a concurrent invalidation prevents us from updating the page. + * + * The page is released via put_page even if the operation failed. For + * on-demand pinning, the page is released whenever it isn't stored in the + * umem. + */ +static int ib_umem_odp_map_dma_single_page( + struct ib_umem *umem, + int page_index, + u64 base_virt_addr, + struct page *page, + u64 access_mask, + unsigned long current_seq) +{ + struct ib_device *dev = umem->context->device; + dma_addr_t dma_addr; + int stored_page = 0; + int remove_existing_mapping = 0; + int ret = 0; + + mutex_lock(&umem->odp_data->umem_mutex); + /* + * Note: we avoid writing if seq is different from the initial seq, to + * handle case of a racing notifier. This check also allows us to bail + * early if we have a notifier running in parallel with us. + */ + if (ib_umem_mmu_notifier_retry(umem, current_seq)) { + ret = -EAGAIN; + goto out; + } + if (!(umem->odp_data->dma_list[page_index])) { + dma_addr = ib_dma_map_page(dev, + page, + 0, PAGE_SIZE, + DMA_BIDIRECTIONAL); + if (ib_dma_mapping_error(dev, dma_addr)) { + ret = -EFAULT; + goto out; + } + umem->odp_data->dma_list[page_index] = dma_addr | access_mask; + umem->odp_data->page_list[page_index] = page; + stored_page = 1; + } else if (umem->odp_data->page_list[page_index] == page) { + umem->odp_data->dma_list[page_index] |= access_mask; + } else { + pr_err("error: got different pages in IB device and from get_user_pages. IB device page: %p, gup page: %p\n", + umem->odp_data->page_list[page_index], page); + /* Better remove the mapping now, to prevent any further + * damage. */ + remove_existing_mapping = 1; + } + +out: + mutex_unlock(&umem->odp_data->umem_mutex); + + /* On Demand Paging - avoid pinning the page */ + if (umem->context->invalidate_range || !stored_page) + put_page(page); + + if (remove_existing_mapping && umem->context->invalidate_range) { + invalidate_page_trampoline( + umem, + base_virt_addr + (page_index * PAGE_SIZE), + base_virt_addr + ((page_index+1)*PAGE_SIZE), + NULL); + ret = -EAGAIN; + } + + return ret; +} + +/** + * ib_umem_odp_map_dma_pages - Pin and DMA map userspace memory in an ODP MR. + * + * Pins the range of pages passed in the argument, and maps them to + * DMA addresses. The DMA addresses of the mapped pages is updated in + * umem->odp_data->dma_list. + * + * Returns the number of pages mapped in success, negative error code + * for failure. + * An -EAGAIN error code is returned when a concurrent mmu notifier prevents + * the function from completing its task. + * + * @umem: the umem to map and pin + * @user_virt: the address from which we need to map. + * @bcnt: the minimal number of bytes to pin and map. The mapping might be + * bigger due to alignment, and may also be smaller in case of an error + * pinning or mapping a page. The actual pages mapped is returned in + * the return value. + * @access_mask: bit mask of the requested access permissions for the given + * range. + * @current_seq: the MMU notifiers sequance value for synchronization with + * invalidations. the sequance number is read from + * umem->odp_data->notifiers_seq before calling this function + */ +int ib_umem_odp_map_dma_pages(struct ib_umem *umem, u64 user_virt, u64 bcnt, + u64 access_mask, unsigned long current_seq) +{ + struct task_struct *owning_process = NULL; + struct mm_struct *owning_mm = NULL; + struct page **local_page_list = NULL; + u64 off; + int j, k, ret = 0, start_idx, npages = 0; + u64 base_virt_addr; + + if (access_mask == 0) + return -EINVAL; + + if (user_virt < ib_umem_start(umem) || + user_virt + bcnt > ib_umem_end(umem)) + return -EFAULT; + + local_page_list = (struct page **)__get_free_page(GFP_KERNEL); + if (!local_page_list) + return -ENOMEM; + + off = user_virt & (~PAGE_MASK); + user_virt = user_virt & PAGE_MASK; + base_virt_addr = user_virt; + bcnt += off; /* Charge for the first page offset as well. */ + + owning_process = get_pid_task(umem->context->tgid, PIDTYPE_PID); + if (owning_process == NULL) { + ret = -EINVAL; + goto out_no_task; + } + + owning_mm = get_task_mm(owning_process); + if (owning_mm == NULL) { + ret = -EINVAL; + goto out_put_task; + } + + start_idx = (user_virt - ib_umem_start(umem)) >> PAGE_SHIFT; + k = start_idx; + + while (bcnt > 0) { + const size_t gup_num_pages = + min_t(size_t, ALIGN(bcnt, PAGE_SIZE) / PAGE_SIZE, + PAGE_SIZE / sizeof(struct page *)); + + down_read(&owning_mm->mmap_sem); + /* + * Note: this might result in redundent page getting. We can + * avoid this by checking dma_list to be 0 before calling + * get_user_pages. However, this make the code much more + * complex (and doesn't gain us much performance in most use + * cases). + */ + npages = get_user_pages(owning_process, owning_mm, user_virt, + gup_num_pages, + access_mask & ODP_WRITE_ALLOWED_BIT, 0, + local_page_list, NULL); + up_read(&owning_mm->mmap_sem); + + if (npages < 0) + break; + + bcnt -= min_t(size_t, npages << PAGE_SHIFT, bcnt); + user_virt += npages << PAGE_SHIFT; + for (j = 0; j < npages; ++j) { + ret = ib_umem_odp_map_dma_single_page( + umem, k, base_virt_addr, local_page_list[j], + access_mask, current_seq); + if (ret < 0) + break; + k++; + } + + if (ret < 0) { + /* Release left over pages when handling errors. */ + for (++j; j < npages; ++j) + put_page(local_page_list[j]); + break; + } + } + + if (ret >= 0) { + if (npages < 0 && k == start_idx) + ret = npages; + else + ret = k - start_idx; + } + + mmput(owning_mm); +out_put_task: + put_task_struct(owning_process); +out_no_task: + free_page((unsigned long)local_page_list); + return ret; +} +EXPORT_SYMBOL(ib_umem_odp_map_dma_pages); + +void ib_umem_odp_unmap_dma_pages(struct ib_umem *umem, u64 virt, + u64 bound) +{ + int idx; + u64 addr; + struct ib_device *dev = umem->context->device; + + virt = max_t(u64, virt, ib_umem_start(umem)); + bound = min_t(u64, bound, ib_umem_end(umem)); + /* Note that during the run of this function, the + * notifiers_count of the MR is > 0, preventing any racing + * faults from completion. We might be racing with other + * invalidations, so we must make sure we free each page only + * once. */ + for (addr = virt; addr < bound; addr += (u64)umem->page_size) { + idx = (addr - ib_umem_start(umem)) / PAGE_SIZE; + mutex_lock(&umem->odp_data->umem_mutex); + if (umem->odp_data->page_list[idx]) { + struct page *page = umem->odp_data->page_list[idx]; + struct page *head_page = compound_head(page); + dma_addr_t dma = umem->odp_data->dma_list[idx]; + dma_addr_t dma_addr = dma & ODP_DMA_ADDR_MASK; + + WARN_ON(!dma_addr); + + ib_dma_unmap_page(dev, dma_addr, PAGE_SIZE, + DMA_BIDIRECTIONAL); + if (dma & ODP_WRITE_ALLOWED_BIT) + /* + * set_page_dirty prefers being called with + * the page lock. However, MMU notifiers are + * called sometimes with and sometimes without + * the lock. We rely on the umem_mutex instead + * to prevent other mmu notifiers from + * continuing and allowing the page mapping to + * be removed. + */ + set_page_dirty(head_page); + /* on demand pinning support */ + if (!umem->context->invalidate_range) + put_page(page); + umem->odp_data->page_list[idx] = NULL; + umem->odp_data->dma_list[idx] = 0; + } + mutex_unlock(&umem->odp_data->umem_mutex); + } +} +EXPORT_SYMBOL(ib_umem_odp_unmap_dma_pages); diff --git a/drivers/infiniband/core/umem_rbtree.c b/drivers/infiniband/core/umem_rbtree.c new file mode 100644 index 000000000000..727d788448f5 --- /dev/null +++ b/drivers/infiniband/core/umem_rbtree.c @@ -0,0 +1,94 @@ +/* + * Copyright (c) 2014 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/interval_tree_generic.h> +#include <linux/sched.h> +#include <linux/gfp.h> +#include <rdma/ib_umem_odp.h> + +/* + * The ib_umem list keeps track of memory regions for which the HW + * device request to receive notification when the related memory + * mapping is changed. + * + * ib_umem_lock protects the list. + */ + +static inline u64 node_start(struct umem_odp_node *n) +{ + struct ib_umem_odp *umem_odp = + container_of(n, struct ib_umem_odp, interval_tree); + + return ib_umem_start(umem_odp->umem); +} + +/* Note that the representation of the intervals in the interval tree + * considers the ending point as contained in the interval, while the + * function ib_umem_end returns the first address which is not contained + * in the umem. + */ +static inline u64 node_last(struct umem_odp_node *n) +{ + struct ib_umem_odp *umem_odp = + container_of(n, struct ib_umem_odp, interval_tree); + + return ib_umem_end(umem_odp->umem) - 1; +} + +INTERVAL_TREE_DEFINE(struct umem_odp_node, rb, u64, __subtree_last, + node_start, node_last, , rbt_ib_umem) + +/* @last is not a part of the interval. See comment for function + * node_last. + */ +int rbt_ib_umem_for_each_in_range(struct rb_root *root, + u64 start, u64 last, + umem_call_back cb, + void *cookie) +{ + int ret_val = 0; + struct umem_odp_node *node; + struct ib_umem_odp *umem; + + if (unlikely(start == last)) + return ret_val; + + for (node = rbt_ib_umem_iter_first(root, start, last - 1); node; + node = rbt_ib_umem_iter_next(node, start, last - 1)) { + umem = container_of(node, struct ib_umem_odp, interval_tree); + ret_val = cb(umem->umem, start, last, cookie) || ret_val; + } + + return ret_val; +} diff --git a/drivers/infiniband/core/uverbs.h b/drivers/infiniband/core/uverbs.h index 643c08a025a5..b716b0815644 100644 --- a/drivers/infiniband/core/uverbs.h +++ b/drivers/infiniband/core/uverbs.h @@ -258,5 +258,6 @@ IB_UVERBS_DECLARE_CMD(close_xrcd); IB_UVERBS_DECLARE_EX_CMD(create_flow); IB_UVERBS_DECLARE_EX_CMD(destroy_flow); +IB_UVERBS_DECLARE_EX_CMD(query_device); #endif /* UVERBS_H */ diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c index 5ba2a86aab6a..532d8eba8b02 100644 --- a/drivers/infiniband/core/uverbs_cmd.c +++ b/drivers/infiniband/core/uverbs_cmd.c @@ -36,6 +36,7 @@ #include <linux/file.h> #include <linux/fs.h> #include <linux/slab.h> +#include <linux/sched.h> #include <asm/uaccess.h> @@ -288,6 +289,9 @@ ssize_t ib_uverbs_get_context(struct ib_uverbs_file *file, struct ib_uverbs_get_context_resp resp; struct ib_udata udata; struct ib_device *ibdev = file->device->ib_dev; +#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING + struct ib_device_attr dev_attr; +#endif struct ib_ucontext *ucontext; struct file *filp; int ret; @@ -325,8 +329,25 @@ ssize_t ib_uverbs_get_context(struct ib_uverbs_file *file, INIT_LIST_HEAD(&ucontext->ah_list); INIT_LIST_HEAD(&ucontext->xrcd_list); INIT_LIST_HEAD(&ucontext->rule_list); + rcu_read_lock(); + ucontext->tgid = get_task_pid(current->group_leader, PIDTYPE_PID); + rcu_read_unlock(); ucontext->closing = 0; +#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING + ucontext->umem_tree = RB_ROOT; + init_rwsem(&ucontext->umem_rwsem); + ucontext->odp_mrs_count = 0; + INIT_LIST_HEAD(&ucontext->no_private_counters); + + ret = ib_query_device(ibdev, &dev_attr); + if (ret) + goto err_free; + if (!(dev_attr.device_cap_flags & IB_DEVICE_ON_DEMAND_PAGING)) + ucontext->invalidate_range = NULL; + +#endif + resp.num_comp_vectors = file->device->num_comp_vectors; ret = get_unused_fd_flags(O_CLOEXEC); @@ -371,6 +392,7 @@ err_fd: put_unused_fd(resp.async_fd); err_free: + put_pid(ucontext->tgid); ibdev->dealloc_ucontext(ucontext); err: @@ -378,6 +400,52 @@ err: return ret; } +static void copy_query_dev_fields(struct ib_uverbs_file *file, + struct ib_uverbs_query_device_resp *resp, + struct ib_device_attr *attr) +{ + resp->fw_ver = attr->fw_ver; + resp->node_guid = file->device->ib_dev->node_guid; + resp->sys_image_guid = attr->sys_image_guid; + resp->max_mr_size = attr->max_mr_size; + resp->page_size_cap = attr->page_size_cap; + resp->vendor_id = attr->vendor_id; + resp->vendor_part_id = attr->vendor_part_id; + resp->hw_ver = attr->hw_ver; + resp->max_qp = attr->max_qp; + resp->max_qp_wr = attr->max_qp_wr; + resp->device_cap_flags = attr->device_cap_flags; + resp->max_sge = attr->max_sge; + resp->max_sge_rd = attr->max_sge_rd; + resp->max_cq = attr->max_cq; + resp->max_cqe = attr->max_cqe; + resp->max_mr = attr->max_mr; + resp->max_pd = attr->max_pd; + resp->max_qp_rd_atom = attr->max_qp_rd_atom; + resp->max_ee_rd_atom = attr->max_ee_rd_atom; + resp->max_res_rd_atom = attr->max_res_rd_atom; + resp->max_qp_init_rd_atom = attr->max_qp_init_rd_atom; + resp->max_ee_init_rd_atom = attr->max_ee_init_rd_atom; + resp->atomic_cap = attr->atomic_cap; + resp->max_ee = attr->max_ee; + resp->max_rdd = attr->max_rdd; + resp->max_mw = attr->max_mw; + resp->max_raw_ipv6_qp = attr->max_raw_ipv6_qp; + resp->max_raw_ethy_qp = attr->max_raw_ethy_qp; + resp->max_mcast_grp = attr->max_mcast_grp; + resp->max_mcast_qp_attach = attr->max_mcast_qp_attach; + resp->max_total_mcast_qp_attach = attr->max_total_mcast_qp_attach; + resp->max_ah = attr->max_ah; + resp->max_fmr = attr->max_fmr; + resp->max_map_per_fmr = attr->max_map_per_fmr; + resp->max_srq = attr->max_srq; + resp->max_srq_wr = attr->max_srq_wr; + resp->max_srq_sge = attr->max_srq_sge; + resp->max_pkeys = attr->max_pkeys; + resp->local_ca_ack_delay = attr->local_ca_ack_delay; + resp->phys_port_cnt = file->device->ib_dev->phys_port_cnt; +} + ssize_t ib_uverbs_query_device(struct ib_uverbs_file *file, const char __user *buf, int in_len, int out_len) @@ -398,47 +466,7 @@ ssize_t ib_uverbs_query_device(struct ib_uverbs_file *file, return ret; memset(&resp, 0, sizeof resp); - - resp.fw_ver = attr.fw_ver; - resp.node_guid = file->device->ib_dev->node_guid; - resp.sys_image_guid = attr.sys_image_guid; - resp.max_mr_size = attr.max_mr_size; - resp.page_size_cap = attr.page_size_cap; - resp.vendor_id = attr.vendor_id; - resp.vendor_part_id = attr.vendor_part_id; - resp.hw_ver = attr.hw_ver; - resp.max_qp = attr.max_qp; - resp.max_qp_wr = attr.max_qp_wr; - resp.device_cap_flags = attr.device_cap_flags; - resp.max_sge = attr.max_sge; - resp.max_sge_rd = attr.max_sge_rd; - resp.max_cq = attr.max_cq; - resp.max_cqe = attr.max_cqe; - resp.max_mr = attr.max_mr; - resp.max_pd = attr.max_pd; - resp.max_qp_rd_atom = attr.max_qp_rd_atom; - resp.max_ee_rd_atom = attr.max_ee_rd_atom; - resp.max_res_rd_atom = attr.max_res_rd_atom; - resp.max_qp_init_rd_atom = attr.max_qp_init_rd_atom; - resp.max_ee_init_rd_atom = attr.max_ee_init_rd_atom; - resp.atomic_cap = attr.atomic_cap; - resp.max_ee = attr.max_ee; - resp.max_rdd = attr.max_rdd; - resp.max_mw = attr.max_mw; - resp.max_raw_ipv6_qp = attr.max_raw_ipv6_qp; - resp.max_raw_ethy_qp = attr.max_raw_ethy_qp; - resp.max_mcast_grp = attr.max_mcast_grp; - resp.max_mcast_qp_attach = attr.max_mcast_qp_attach; - resp.max_total_mcast_qp_attach = attr.max_total_mcast_qp_attach; - resp.max_ah = attr.max_ah; - resp.max_fmr = attr.max_fmr; - resp.max_map_per_fmr = attr.max_map_per_fmr; - resp.max_srq = attr.max_srq; - resp.max_srq_wr = attr.max_srq_wr; - resp.max_srq_sge = attr.max_srq_sge; - resp.max_pkeys = attr.max_pkeys; - resp.local_ca_ack_delay = attr.local_ca_ack_delay; - resp.phys_port_cnt = file->device->ib_dev->phys_port_cnt; + copy_query_dev_fields(file, &resp, &attr); if (copy_to_user((void __user *) (unsigned long) cmd.response, &resp, sizeof resp)) @@ -947,6 +975,18 @@ ssize_t ib_uverbs_reg_mr(struct ib_uverbs_file *file, goto err_free; } + if (cmd.access_flags & IB_ACCESS_ON_DEMAND) { + struct ib_device_attr attr; + + ret = ib_query_device(pd->device, &attr); + if (ret || !(attr.device_cap_flags & + IB_DEVICE_ON_DEMAND_PAGING)) { + pr_debug("ODP support not available\n"); + ret = -EINVAL; + goto err_put; + } + } + mr = pd->device->reg_user_mr(pd, cmd.start, cmd.length, cmd.hca_va, cmd.access_flags, &udata); if (IS_ERR(mr)) { @@ -3253,3 +3293,52 @@ ssize_t ib_uverbs_destroy_srq(struct ib_uverbs_file *file, return ret ? ret : in_len; } + +int ib_uverbs_ex_query_device(struct ib_uverbs_file *file, + struct ib_udata *ucore, + struct ib_udata *uhw) +{ + struct ib_uverbs_ex_query_device_resp resp; + struct ib_uverbs_ex_query_device cmd; + struct ib_device_attr attr; + struct ib_device *device; + int err; + + device = file->device->ib_dev; + if (ucore->inlen < sizeof(cmd)) + return -EINVAL; + + err = ib_copy_from_udata(&cmd, ucore, sizeof(cmd)); + if (err) + return err; + + if (cmd.reserved) + return -EINVAL; + + err = device->query_device(device, &attr); + if (err) + return err; + + memset(&resp, 0, sizeof(resp)); + copy_query_dev_fields(file, &resp.base, &attr); + resp.comp_mask = 0; + +#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING + if (cmd.comp_mask & IB_USER_VERBS_EX_QUERY_DEVICE_ODP) { + resp.odp_caps.general_caps = attr.odp_caps.general_caps; + resp.odp_caps.per_transport_caps.rc_odp_caps = + attr.odp_caps.per_transport_caps.rc_odp_caps; + resp.odp_caps.per_transport_caps.uc_odp_caps = + attr.odp_caps.per_transport_caps.uc_odp_caps; + resp.odp_caps.per_transport_caps.ud_odp_caps = + attr.odp_caps.per_transport_caps.ud_odp_caps; + resp.comp_mask |= IB_USER_VERBS_EX_QUERY_DEVICE_ODP; + } +#endif + + err = ib_copy_to_udata(ucore, &resp, sizeof(resp)); + if (err) + return err; + + return 0; +} diff --git a/drivers/infiniband/core/uverbs_main.c b/drivers/infiniband/core/uverbs_main.c index 71ab83fde472..e6c23b9eab33 100644 --- a/drivers/infiniband/core/uverbs_main.c +++ b/drivers/infiniband/core/uverbs_main.c @@ -122,7 +122,8 @@ static int (*uverbs_ex_cmd_table[])(struct ib_uverbs_file *file, struct ib_udata *ucore, struct ib_udata *uhw) = { [IB_USER_VERBS_EX_CMD_CREATE_FLOW] = ib_uverbs_ex_create_flow, - [IB_USER_VERBS_EX_CMD_DESTROY_FLOW] = ib_uverbs_ex_destroy_flow + [IB_USER_VERBS_EX_CMD_DESTROY_FLOW] = ib_uverbs_ex_destroy_flow, + [IB_USER_VERBS_EX_CMD_QUERY_DEVICE] = ib_uverbs_ex_query_device }; static void ib_uverbs_add_one(struct ib_device *device); @@ -296,6 +297,8 @@ static int ib_uverbs_cleanup_ucontext(struct ib_uverbs_file *file, kfree(uobj); } + put_pid(context->tgid); + return context->device->dealloc_ucontext(context); } diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c index c2b89cc5dbca..f93eb8da7b5a 100644 --- a/drivers/infiniband/core/verbs.c +++ b/drivers/infiniband/core/verbs.c @@ -879,7 +879,8 @@ int ib_resolve_eth_l2_attrs(struct ib_qp *qp, if (rdma_link_local_addr((struct in6_addr *)qp_attr->ah_attr.grh.dgid.raw)) { rdma_get_ll_mac((struct in6_addr *)qp_attr->ah_attr.grh.dgid.raw, qp_attr->ah_attr.dmac); rdma_get_ll_mac((struct in6_addr *)sgid.raw, qp_attr->smac); - qp_attr->vlan_id = rdma_get_vlan_id(&sgid); + if (!(*qp_attr_mask & IB_QP_VID)) + qp_attr->vlan_id = rdma_get_vlan_id(&sgid); } else { ret = rdma_addr_find_dmac_by_grh(&sgid, &qp_attr->ah_attr.grh.dgid, qp_attr->ah_attr.dmac, &qp_attr->vlan_id); diff --git a/drivers/infiniband/hw/amso1100/c2_provider.c b/drivers/infiniband/hw/amso1100/c2_provider.c index 2d5cbf4363e4..bdf3507810cb 100644 --- a/drivers/infiniband/hw/amso1100/c2_provider.c +++ b/drivers/infiniband/hw/amso1100/c2_provider.c @@ -476,7 +476,7 @@ static struct ib_mr *c2_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, c2mr->umem->page_size, i, length, - c2mr->umem->offset, + ib_umem_offset(c2mr->umem), &kva, c2_convert_access(acc), c2mr); diff --git a/drivers/infiniband/hw/cxgb4/cm.c b/drivers/infiniband/hw/cxgb4/cm.c index 4b8c6116c058..9edc200b311d 100644 --- a/drivers/infiniband/hw/cxgb4/cm.c +++ b/drivers/infiniband/hw/cxgb4/cm.c @@ -1640,7 +1640,8 @@ static void process_mpa_request(struct c4iw_ep *ep, struct sk_buff *skb) __state_set(&ep->com, MPA_REQ_RCVD); /* drive upcall */ - mutex_lock(&ep->parent_ep->com.mutex); + mutex_lock_nested(&ep->parent_ep->com.mutex, + SINGLE_DEPTH_NESTING); if (ep->parent_ep->com.state != DEAD) { if (connect_request_upcall(ep)) abort_connection(ep, skb, GFP_KERNEL); @@ -3126,6 +3127,8 @@ static int create_server6(struct c4iw_dev *dev, struct c4iw_listen_ep *ep) err = c4iw_wait_for_reply(&ep->com.dev->rdev, &ep->com.wr_wait, 0, 0, __func__); + else if (err > 0) + err = net_xmit_errno(err); if (err) pr_err("cxgb4_create_server6/filter failed err %d stid %d laddr %pI6 lport %d\n", err, ep->stid, @@ -3159,6 +3162,8 @@ static int create_server4(struct c4iw_dev *dev, struct c4iw_listen_ep *ep) err = c4iw_wait_for_reply(&ep->com.dev->rdev, &ep->com.wr_wait, 0, 0, __func__); + else if (err > 0) + err = net_xmit_errno(err); } if (err) pr_err("cxgb4_create_server/filter failed err %d stid %d laddr %pI4 lport %d\n" diff --git a/drivers/infiniband/hw/cxgb4/device.c b/drivers/infiniband/hw/cxgb4/device.c index 72f1f052e88c..eb5df4e62703 100644 --- a/drivers/infiniband/hw/cxgb4/device.c +++ b/drivers/infiniband/hw/cxgb4/device.c @@ -670,7 +670,7 @@ static int ep_open(struct inode *inode, struct file *file) idr_for_each(&epd->devp->stid_idr, count_idrs, &count); spin_unlock_irq(&epd->devp->lock); - epd->bufsize = count * 160; + epd->bufsize = count * 240; epd->buf = vmalloc(epd->bufsize); if (!epd->buf) { ret = -ENOMEM; diff --git a/drivers/infiniband/hw/cxgb4/mem.c b/drivers/infiniband/hw/cxgb4/mem.c index 0744455cd88b..cb43c2299ac0 100644 --- a/drivers/infiniband/hw/cxgb4/mem.c +++ b/drivers/infiniband/hw/cxgb4/mem.c @@ -50,6 +50,13 @@ static int inline_threshold = C4IW_INLINE_THRESHOLD; module_param(inline_threshold, int, 0644); MODULE_PARM_DESC(inline_threshold, "inline vs dsgl threshold (default=128)"); +static int mr_exceeds_hw_limits(struct c4iw_dev *dev, u64 length) +{ + return (is_t4(dev->rdev.lldi.adapter_type) || + is_t5(dev->rdev.lldi.adapter_type)) && + length >= 8*1024*1024*1024ULL; +} + static int _c4iw_write_mem_dma_aligned(struct c4iw_rdev *rdev, u32 addr, u32 len, dma_addr_t data, int wait) { @@ -369,9 +376,11 @@ static int register_mem(struct c4iw_dev *rhp, struct c4iw_pd *php, int ret; ret = write_tpt_entry(&rhp->rdev, 0, &stag, 1, mhp->attr.pdid, - FW_RI_STAG_NSMR, mhp->attr.perms, + FW_RI_STAG_NSMR, mhp->attr.len ? + mhp->attr.perms : 0, mhp->attr.mw_bind_enable, mhp->attr.zbva, - mhp->attr.va_fbo, mhp->attr.len, shift - 12, + mhp->attr.va_fbo, mhp->attr.len ? + mhp->attr.len : -1, shift - 12, mhp->attr.pbl_size, mhp->attr.pbl_addr); if (ret) return ret; @@ -536,6 +545,11 @@ int c4iw_reregister_phys_mem(struct ib_mr *mr, int mr_rereg_mask, return ret; } + if (mr_exceeds_hw_limits(rhp, total_size)) { + kfree(page_list); + return -EINVAL; + } + ret = reregister_mem(rhp, php, &mh, shift, npages); kfree(page_list); if (ret) @@ -596,6 +610,12 @@ struct ib_mr *c4iw_register_phys_mem(struct ib_pd *pd, if (ret) goto err; + if (mr_exceeds_hw_limits(rhp, total_size)) { + kfree(page_list); + ret = -EINVAL; + goto err; + } + ret = alloc_pbl(mhp, npages); if (ret) { kfree(page_list); @@ -699,6 +719,10 @@ struct ib_mr *c4iw_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, php = to_c4iw_pd(pd); rhp = php->rhp; + + if (mr_exceeds_hw_limits(rhp, length)) + return ERR_PTR(-EINVAL); + mhp = kzalloc(sizeof(*mhp), GFP_KERNEL); if (!mhp) return ERR_PTR(-ENOMEM); diff --git a/drivers/infiniband/hw/cxgb4/qp.c b/drivers/infiniband/hw/cxgb4/qp.c index 2ed3ece2b2ee..bb85d479e66e 100644 --- a/drivers/infiniband/hw/cxgb4/qp.c +++ b/drivers/infiniband/hw/cxgb4/qp.c @@ -1538,9 +1538,9 @@ err: set_state(qhp, C4IW_QP_STATE_ERROR); free = 1; abort = 1; - wake_up(&qhp->wait); BUG_ON(!ep); flush_qp(qhp); + wake_up(&qhp->wait); out: mutex_unlock(&qhp->mutex); diff --git a/drivers/infiniband/hw/ehca/ehca_mrmw.c b/drivers/infiniband/hw/ehca/ehca_mrmw.c index 3488e8c9fcb4..f914b30999f8 100644 --- a/drivers/infiniband/hw/ehca/ehca_mrmw.c +++ b/drivers/infiniband/hw/ehca/ehca_mrmw.c @@ -399,7 +399,7 @@ reg_user_mr_fallback: pginfo.num_kpages = num_kpages; pginfo.num_hwpages = num_hwpages; pginfo.u.usr.region = e_mr->umem; - pginfo.next_hwpage = e_mr->umem->offset / hwpage_size; + pginfo.next_hwpage = ib_umem_offset(e_mr->umem) / hwpage_size; pginfo.u.usr.next_sg = pginfo.u.usr.region->sg_head.sgl; ret = ehca_reg_mr(shca, e_mr, (u64 *)virt, length, mr_access_flags, e_pd, &pginfo, &e_mr->ib.ib_mr.lkey, diff --git a/drivers/infiniband/hw/ipath/ipath_mr.c b/drivers/infiniband/hw/ipath/ipath_mr.c index 5e61e9bff697..c7278f6a8217 100644 --- a/drivers/infiniband/hw/ipath/ipath_mr.c +++ b/drivers/infiniband/hw/ipath/ipath_mr.c @@ -214,7 +214,7 @@ struct ib_mr *ipath_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, mr->mr.user_base = start; mr->mr.iova = virt_addr; mr->mr.length = length; - mr->mr.offset = umem->offset; + mr->mr.offset = ib_umem_offset(umem); mr->mr.access_flags = mr_access_flags; mr->mr.max_segs = n; mr->umem = umem; diff --git a/drivers/infiniband/hw/mlx4/mr.c b/drivers/infiniband/hw/mlx4/mr.c index 8f9325cfc85d..c36ccbd9a644 100644 --- a/drivers/infiniband/hw/mlx4/mr.c +++ b/drivers/infiniband/hw/mlx4/mr.c @@ -223,7 +223,6 @@ int mlx4_ib_rereg_user_mr(struct ib_mr *mr, int flags, if (flags & IB_MR_REREG_TRANS) { int shift; - int err; int n; mlx4_mr_rereg_mem_cleanup(dev->dev, &mmr->mmr); diff --git a/drivers/infiniband/hw/mlx5/Makefile b/drivers/infiniband/hw/mlx5/Makefile index 4ea0135af484..27a70159e2ea 100644 --- a/drivers/infiniband/hw/mlx5/Makefile +++ b/drivers/infiniband/hw/mlx5/Makefile @@ -1,3 +1,4 @@ obj-$(CONFIG_MLX5_INFINIBAND) += mlx5_ib.o mlx5_ib-y := main.o cq.o doorbell.o qp.o mem.o srq.o mr.o ah.o mad.o +mlx5_ib-$(CONFIG_INFINIBAND_ON_DEMAND_PAGING) += odp.o diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index 1ba6c42e4df8..8a87404e9c76 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -244,6 +244,12 @@ static int mlx5_ib_query_device(struct ib_device *ibdev, props->max_mcast_grp; props->max_map_per_fmr = INT_MAX; /* no limit in ConnectIB */ +#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING + if (dev->mdev->caps.gen.flags & MLX5_DEV_CAP_FLAG_ON_DMND_PG) + props->device_cap_flags |= IB_DEVICE_ON_DEMAND_PAGING; + props->odp_caps = dev->odp_caps; +#endif + out: kfree(in_mad); kfree(out_mad); @@ -568,6 +574,10 @@ static struct ib_ucontext *mlx5_ib_alloc_ucontext(struct ib_device *ibdev, goto out_count; } +#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING + context->ibucontext.invalidate_range = &mlx5_ib_invalidate_range; +#endif + INIT_LIST_HEAD(&context->db_page_list); mutex_init(&context->db_page_mutex); @@ -858,7 +868,7 @@ static ssize_t show_reg_pages(struct device *device, struct mlx5_ib_dev *dev = container_of(device, struct mlx5_ib_dev, ib_dev.dev); - return sprintf(buf, "%d\n", dev->mdev->priv.reg_pages); + return sprintf(buf, "%d\n", atomic_read(&dev->mdev->priv.reg_pages)); } static ssize_t show_hca(struct device *device, struct device_attribute *attr, @@ -1321,6 +1331,8 @@ static void *mlx5_ib_add(struct mlx5_core_dev *mdev) (1ull << IB_USER_VERBS_CMD_DESTROY_SRQ) | (1ull << IB_USER_VERBS_CMD_CREATE_XSRQ) | (1ull << IB_USER_VERBS_CMD_OPEN_QP); + dev->ib_dev.uverbs_ex_cmd_mask = + (1ull << IB_USER_VERBS_EX_CMD_QUERY_DEVICE); dev->ib_dev.query_device = mlx5_ib_query_device; dev->ib_dev.query_port = mlx5_ib_query_port; @@ -1366,6 +1378,8 @@ static void *mlx5_ib_add(struct mlx5_core_dev *mdev) dev->ib_dev.free_fast_reg_page_list = mlx5_ib_free_fast_reg_page_list; dev->ib_dev.check_mr_status = mlx5_ib_check_mr_status; + mlx5_ib_internal_query_odp_caps(dev); + if (mdev->caps.gen.flags & MLX5_DEV_CAP_FLAG_XRC) { dev->ib_dev.alloc_xrcd = mlx5_ib_alloc_xrcd; dev->ib_dev.dealloc_xrcd = mlx5_ib_dealloc_xrcd; @@ -1379,16 +1393,19 @@ static void *mlx5_ib_add(struct mlx5_core_dev *mdev) goto err_eqs; mutex_init(&dev->cap_mask_mutex); - spin_lock_init(&dev->mr_lock); err = create_dev_resources(&dev->devr); if (err) goto err_eqs; - err = ib_register_device(&dev->ib_dev, NULL); + err = mlx5_ib_odp_init_one(dev); if (err) goto err_rsrc; + err = ib_register_device(&dev->ib_dev, NULL); + if (err) + goto err_odp; + err = create_umr_res(dev); if (err) goto err_dev; @@ -1410,6 +1427,9 @@ err_umrc: err_dev: ib_unregister_device(&dev->ib_dev); +err_odp: + mlx5_ib_odp_remove_one(dev); + err_rsrc: destroy_dev_resources(&dev->devr); @@ -1425,8 +1445,10 @@ err_dealloc: static void mlx5_ib_remove(struct mlx5_core_dev *mdev, void *context) { struct mlx5_ib_dev *dev = context; + ib_unregister_device(&dev->ib_dev); destroy_umrc_res(dev); + mlx5_ib_odp_remove_one(dev); destroy_dev_resources(&dev->devr); free_comp_eqs(dev); ib_dealloc_device(&dev->ib_dev); @@ -1440,15 +1462,30 @@ static struct mlx5_interface mlx5_ib_interface = { static int __init mlx5_ib_init(void) { + int err; + if (deprecated_prof_sel != 2) pr_warn("prof_sel is deprecated for mlx5_ib, set it for mlx5_core\n"); - return mlx5_register_interface(&mlx5_ib_interface); + err = mlx5_ib_odp_init(); + if (err) + return err; + + err = mlx5_register_interface(&mlx5_ib_interface); + if (err) + goto clean_odp; + + return err; + +clean_odp: + mlx5_ib_odp_cleanup(); + return err; } static void __exit mlx5_ib_cleanup(void) { mlx5_unregister_interface(&mlx5_ib_interface); + mlx5_ib_odp_cleanup(); } module_init(mlx5_ib_init); diff --git a/drivers/infiniband/hw/mlx5/mem.c b/drivers/infiniband/hw/mlx5/mem.c index dae07eae9507..b56e4c5593ee 100644 --- a/drivers/infiniband/hw/mlx5/mem.c +++ b/drivers/infiniband/hw/mlx5/mem.c @@ -32,6 +32,7 @@ #include <linux/module.h> #include <rdma/ib_umem.h> +#include <rdma/ib_umem_odp.h> #include "mlx5_ib.h" /* @umem: umem object to scan @@ -57,6 +58,17 @@ void mlx5_ib_cont_pages(struct ib_umem *umem, u64 addr, int *count, int *shift, int entry; unsigned long page_shift = ilog2(umem->page_size); + /* With ODP we must always match OS page size. */ + if (umem->odp_data) { + *count = ib_umem_page_count(umem); + *shift = PAGE_SHIFT; + *ncont = *count; + if (order) + *order = ilog2(roundup_pow_of_two(*count)); + + return; + } + addr = addr >> page_shift; tmp = (unsigned long)addr; m = find_first_bit(&tmp, sizeof(tmp)); @@ -108,8 +120,36 @@ void mlx5_ib_cont_pages(struct ib_umem *umem, u64 addr, int *count, int *shift, *count = i; } -void mlx5_ib_populate_pas(struct mlx5_ib_dev *dev, struct ib_umem *umem, - int page_shift, __be64 *pas, int umr) +#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING +static u64 umem_dma_to_mtt(dma_addr_t umem_dma) +{ + u64 mtt_entry = umem_dma & ODP_DMA_ADDR_MASK; + + if (umem_dma & ODP_READ_ALLOWED_BIT) + mtt_entry |= MLX5_IB_MTT_READ; + if (umem_dma & ODP_WRITE_ALLOWED_BIT) + mtt_entry |= MLX5_IB_MTT_WRITE; + + return mtt_entry; +} +#endif + +/* + * Populate the given array with bus addresses from the umem. + * + * dev - mlx5_ib device + * umem - umem to use to fill the pages + * page_shift - determines the page size used in the resulting array + * offset - offset into the umem to start from, + * only implemented for ODP umems + * num_pages - total number of pages to fill + * pas - bus addresses array to fill + * access_flags - access flags to set on all present pages. + use enum mlx5_ib_mtt_access_flags for this. + */ +void __mlx5_ib_populate_pas(struct mlx5_ib_dev *dev, struct ib_umem *umem, + int page_shift, size_t offset, size_t num_pages, + __be64 *pas, int access_flags) { unsigned long umem_page_shift = ilog2(umem->page_size); int shift = page_shift - umem_page_shift; @@ -120,6 +160,21 @@ void mlx5_ib_populate_pas(struct mlx5_ib_dev *dev, struct ib_umem *umem, int len; struct scatterlist *sg; int entry; +#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING + const bool odp = umem->odp_data != NULL; + + if (odp) { + WARN_ON(shift != 0); + WARN_ON(access_flags != (MLX5_IB_MTT_READ | MLX5_IB_MTT_WRITE)); + + for (i = 0; i < num_pages; ++i) { + dma_addr_t pa = umem->odp_data->dma_list[offset + i]; + + pas[i] = cpu_to_be64(umem_dma_to_mtt(pa)); + } + return; + } +#endif i = 0; for_each_sg(umem->sg_head.sgl, sg, umem->nmap, entry) { @@ -128,8 +183,7 @@ void mlx5_ib_populate_pas(struct mlx5_ib_dev *dev, struct ib_umem *umem, for (k = 0; k < len; k++) { if (!(i & mask)) { cur = base + (k << umem_page_shift); - if (umr) - cur |= 3; + cur |= access_flags; pas[i >> shift] = cpu_to_be64(cur); mlx5_ib_dbg(dev, "pas[%d] 0x%llx\n", @@ -142,6 +196,13 @@ void mlx5_ib_populate_pas(struct mlx5_ib_dev *dev, struct ib_umem *umem, } } +void mlx5_ib_populate_pas(struct mlx5_ib_dev *dev, struct ib_umem *umem, + int page_shift, __be64 *pas, int access_flags) +{ + return __mlx5_ib_populate_pas(dev, umem, page_shift, 0, + ib_umem_num_pages(umem), pas, + access_flags); +} int mlx5_ib_get_buf_offset(u64 addr, int page_shift, u32 *offset) { u64 page_size; diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h index 386780f0d1e1..83f22fe297c8 100644 --- a/drivers/infiniband/hw/mlx5/mlx5_ib.h +++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h @@ -111,6 +111,8 @@ struct mlx5_ib_pd { */ #define MLX5_IB_SEND_UMR_UNREG IB_SEND_RESERVED_START +#define MLX5_IB_SEND_UMR_FAIL_IF_FREE (IB_SEND_RESERVED_START << 1) +#define MLX5_IB_SEND_UMR_UPDATE_MTT (IB_SEND_RESERVED_START << 2) #define MLX5_IB_QPT_REG_UMR IB_QPT_RESERVED1 #define MLX5_IB_WR_UMR IB_WR_RESERVED1 @@ -147,6 +149,29 @@ enum { MLX5_QP_EMPTY }; +/* + * Connect-IB can trigger up to four concurrent pagefaults + * per-QP. + */ +enum mlx5_ib_pagefault_context { + MLX5_IB_PAGEFAULT_RESPONDER_READ, + MLX5_IB_PAGEFAULT_REQUESTOR_READ, + MLX5_IB_PAGEFAULT_RESPONDER_WRITE, + MLX5_IB_PAGEFAULT_REQUESTOR_WRITE, + MLX5_IB_PAGEFAULT_CONTEXTS +}; + +static inline enum mlx5_ib_pagefault_context + mlx5_ib_get_pagefault_context(struct mlx5_pagefault *pagefault) +{ + return pagefault->flags & (MLX5_PFAULT_REQUESTOR | MLX5_PFAULT_WRITE); +} + +struct mlx5_ib_pfault { + struct work_struct work; + struct mlx5_pagefault mpfault; +}; + struct mlx5_ib_qp { struct ib_qp ibqp; struct mlx5_core_qp mqp; @@ -192,6 +217,21 @@ struct mlx5_ib_qp { /* Store signature errors */ bool signature_en; + +#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING + /* + * A flag that is true for QP's that are in a state that doesn't + * allow page faults, and shouldn't schedule any more faults. + */ + int disable_page_faults; + /* + * The disable_page_faults_lock protects a QP's disable_page_faults + * field, allowing for a thread to atomically check whether the QP + * allows page faults, and if so schedule a page fault. + */ + spinlock_t disable_page_faults_lock; + struct mlx5_ib_pfault pagefaults[MLX5_IB_PAGEFAULT_CONTEXTS]; +#endif }; struct mlx5_ib_cq_buf { @@ -206,6 +246,19 @@ enum mlx5_ib_qp_flags { MLX5_IB_QP_SIGNATURE_HANDLING = 1 << 1, }; +struct mlx5_umr_wr { + union { + u64 virt_addr; + u64 offset; + } target; + struct ib_pd *pd; + unsigned int page_shift; + unsigned int npages; + u32 length; + int access_flags; + u32 mkey; +}; + struct mlx5_shared_mr_info { int mr_id; struct ib_umem *umem; @@ -253,6 +306,13 @@ struct mlx5_ib_xrcd { u32 xrcdn; }; +enum mlx5_ib_mtt_access_flags { + MLX5_IB_MTT_READ = (1 << 0), + MLX5_IB_MTT_WRITE = (1 << 1), +}; + +#define MLX5_IB_MTT_PRESENT (MLX5_IB_MTT_READ | MLX5_IB_MTT_WRITE) + struct mlx5_ib_mr { struct ib_mr ibmr; struct mlx5_core_mr mmr; @@ -261,12 +321,11 @@ struct mlx5_ib_mr { struct list_head list; int order; int umred; - __be64 *pas; - dma_addr_t dma; int npages; struct mlx5_ib_dev *dev; struct mlx5_create_mkey_mbox_out out; struct mlx5_core_sig_ctx *sig; + int live; }; struct mlx5_ib_fast_reg_page_list { @@ -372,11 +431,18 @@ struct mlx5_ib_dev { struct umr_common umrc; /* sync used page count stats */ - spinlock_t mr_lock; struct mlx5_ib_resources devr; struct mlx5_mr_cache cache; struct timer_list delay_timer; int fill_delay; +#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING + struct ib_odp_caps odp_caps; + /* + * Sleepable RCU that prevents destruction of MRs while they are still + * being used by a page fault handler. + */ + struct srcu_struct mr_srcu; +#endif }; static inline struct mlx5_ib_cq *to_mibcq(struct mlx5_core_cq *mcq) @@ -490,6 +556,8 @@ int mlx5_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, int mlx5_ib_post_recv(struct ib_qp *ibqp, struct ib_recv_wr *wr, struct ib_recv_wr **bad_wr); void *mlx5_get_send_wqe(struct mlx5_ib_qp *qp, int n); +int mlx5_ib_read_user_wqe(struct mlx5_ib_qp *qp, int send, int wqe_index, + void *buffer, u32 length); struct ib_cq *mlx5_ib_create_cq(struct ib_device *ibdev, int entries, int vector, struct ib_ucontext *context, struct ib_udata *udata); @@ -502,6 +570,8 @@ struct ib_mr *mlx5_ib_get_dma_mr(struct ib_pd *pd, int acc); struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, u64 virt_addr, int access_flags, struct ib_udata *udata); +int mlx5_ib_update_mtt(struct mlx5_ib_mr *mr, u64 start_page_index, + int npages, int zap); int mlx5_ib_dereg_mr(struct ib_mr *ibmr); int mlx5_ib_destroy_mr(struct ib_mr *ibmr); struct ib_mr *mlx5_ib_create_mr(struct ib_pd *pd, @@ -533,8 +603,11 @@ int mlx5_ib_init_fmr(struct mlx5_ib_dev *dev); void mlx5_ib_cleanup_fmr(struct mlx5_ib_dev *dev); void mlx5_ib_cont_pages(struct ib_umem *umem, u64 addr, int *count, int *shift, int *ncont, int *order); +void __mlx5_ib_populate_pas(struct mlx5_ib_dev *dev, struct ib_umem *umem, + int page_shift, size_t offset, size_t num_pages, + __be64 *pas, int access_flags); void mlx5_ib_populate_pas(struct mlx5_ib_dev *dev, struct ib_umem *umem, - int page_shift, __be64 *pas, int umr); + int page_shift, __be64 *pas, int access_flags); void mlx5_ib_copy_pas(u64 *old, u64 *new, int step, int num); int mlx5_ib_get_cqe_size(struct mlx5_ib_dev *dev, struct ib_cq *ibcq); int mlx5_mr_cache_init(struct mlx5_ib_dev *dev); @@ -544,6 +617,38 @@ void mlx5_umr_cq_handler(struct ib_cq *cq, void *cq_context); int mlx5_ib_check_mr_status(struct ib_mr *ibmr, u32 check_mask, struct ib_mr_status *mr_status); +#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING +extern struct workqueue_struct *mlx5_ib_page_fault_wq; + +int mlx5_ib_internal_query_odp_caps(struct mlx5_ib_dev *dev); +void mlx5_ib_mr_pfault_handler(struct mlx5_ib_qp *qp, + struct mlx5_ib_pfault *pfault); +void mlx5_ib_odp_create_qp(struct mlx5_ib_qp *qp); +int mlx5_ib_odp_init_one(struct mlx5_ib_dev *ibdev); +void mlx5_ib_odp_remove_one(struct mlx5_ib_dev *ibdev); +int __init mlx5_ib_odp_init(void); +void mlx5_ib_odp_cleanup(void); +void mlx5_ib_qp_disable_pagefaults(struct mlx5_ib_qp *qp); +void mlx5_ib_qp_enable_pagefaults(struct mlx5_ib_qp *qp); +void mlx5_ib_invalidate_range(struct ib_umem *umem, unsigned long start, + unsigned long end); + +#else /* CONFIG_INFINIBAND_ON_DEMAND_PAGING */ +static inline int mlx5_ib_internal_query_odp_caps(struct mlx5_ib_dev *dev) +{ + return 0; +} + +static inline void mlx5_ib_odp_create_qp(struct mlx5_ib_qp *qp) {} +static inline int mlx5_ib_odp_init_one(struct mlx5_ib_dev *ibdev) { return 0; } +static inline void mlx5_ib_odp_remove_one(struct mlx5_ib_dev *ibdev) {} +static inline int mlx5_ib_odp_init(void) { return 0; } +static inline void mlx5_ib_odp_cleanup(void) {} +static inline void mlx5_ib_qp_disable_pagefaults(struct mlx5_ib_qp *qp) {} +static inline void mlx5_ib_qp_enable_pagefaults(struct mlx5_ib_qp *qp) {} + +#endif /* CONFIG_INFINIBAND_ON_DEMAND_PAGING */ + static inline void init_query_mad(struct ib_smp *mad) { mad->base_version = 1; @@ -561,4 +666,7 @@ static inline u8 convert_access(int acc) MLX5_PERM_LOCAL_READ; } +#define MLX5_MAX_UMR_SHIFT 16 +#define MLX5_MAX_UMR_PAGES (1 << MLX5_MAX_UMR_SHIFT) + #endif /* MLX5_IB_H */ diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c index 5a80dd993761..32a28bd50b20 100644 --- a/drivers/infiniband/hw/mlx5/mr.c +++ b/drivers/infiniband/hw/mlx5/mr.c @@ -37,21 +37,34 @@ #include <linux/export.h> #include <linux/delay.h> #include <rdma/ib_umem.h> +#include <rdma/ib_umem_odp.h> +#include <rdma/ib_verbs.h> #include "mlx5_ib.h" enum { MAX_PENDING_REG_MR = 8, }; -enum { - MLX5_UMR_ALIGN = 2048 -}; +#define MLX5_UMR_ALIGN 2048 +#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING +static __be64 mlx5_ib_update_mtt_emergency_buffer[ + MLX5_UMR_MTT_MIN_CHUNK_SIZE/sizeof(__be64)] + __aligned(MLX5_UMR_ALIGN); +static DEFINE_MUTEX(mlx5_ib_update_mtt_emergency_buffer_mutex); +#endif + +static int clean_mr(struct mlx5_ib_mr *mr); -static __be64 *mr_align(__be64 *ptr, int align) +static int destroy_mkey(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr) { - unsigned long mask = align - 1; + int err = mlx5_core_destroy_mkey(dev->mdev, &mr->mmr); - return (__be64 *)(((unsigned long)ptr + mask) & ~mask); +#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING + /* Wait until all page fault handlers using the mr complete. */ + synchronize_srcu(&dev->mr_srcu); +#endif + + return err; } static int order2idx(struct mlx5_ib_dev *dev, int order) @@ -146,7 +159,7 @@ static int add_keys(struct mlx5_ib_dev *dev, int c, int num) mr->order = ent->order; mr->umred = 1; mr->dev = dev; - in->seg.status = 1 << 6; + in->seg.status = MLX5_MKEY_STATUS_FREE; in->seg.xlt_oct_size = cpu_to_be32((npages + 1) / 2); in->seg.qpn_mkey7_0 = cpu_to_be32(0xffffff << 8); in->seg.flags = MLX5_ACCESS_MODE_MTT | MLX5_PERM_UMR_EN; @@ -191,7 +204,7 @@ static void remove_keys(struct mlx5_ib_dev *dev, int c, int num) ent->cur--; ent->size--; spin_unlock_irq(&ent->lock); - err = mlx5_core_destroy_mkey(dev->mdev, &mr->mmr); + err = destroy_mkey(dev, mr); if (err) mlx5_ib_warn(dev, "failed destroy mkey\n"); else @@ -482,7 +495,7 @@ static void clean_keys(struct mlx5_ib_dev *dev, int c) ent->cur--; ent->size--; spin_unlock_irq(&ent->lock); - err = mlx5_core_destroy_mkey(dev->mdev, &mr->mmr); + err = destroy_mkey(dev, mr); if (err) mlx5_ib_warn(dev, "failed destroy mkey\n"); else @@ -668,7 +681,7 @@ static int get_octo_len(u64 addr, u64 len, int page_size) static int use_umr(int order) { - return order <= 17; + return order <= MLX5_MAX_UMR_SHIFT; } static void prep_umr_reg_wqe(struct ib_pd *pd, struct ib_send_wr *wr, @@ -678,6 +691,7 @@ static void prep_umr_reg_wqe(struct ib_pd *pd, struct ib_send_wr *wr, { struct mlx5_ib_dev *dev = to_mdev(pd->device); struct ib_mr *mr = dev->umrc.mr; + struct mlx5_umr_wr *umrwr = (struct mlx5_umr_wr *)&wr->wr.fast_reg; sg->addr = dma; sg->length = ALIGN(sizeof(u64) * n, 64); @@ -692,21 +706,24 @@ static void prep_umr_reg_wqe(struct ib_pd *pd, struct ib_send_wr *wr, wr->num_sge = 0; wr->opcode = MLX5_IB_WR_UMR; - wr->wr.fast_reg.page_list_len = n; - wr->wr.fast_reg.page_shift = page_shift; - wr->wr.fast_reg.rkey = key; - wr->wr.fast_reg.iova_start = virt_addr; - wr->wr.fast_reg.length = len; - wr->wr.fast_reg.access_flags = access_flags; - wr->wr.fast_reg.page_list = (struct ib_fast_reg_page_list *)pd; + + umrwr->npages = n; + umrwr->page_shift = page_shift; + umrwr->mkey = key; + umrwr->target.virt_addr = virt_addr; + umrwr->length = len; + umrwr->access_flags = access_flags; + umrwr->pd = pd; } static void prep_umr_unreg_wqe(struct mlx5_ib_dev *dev, struct ib_send_wr *wr, u32 key) { - wr->send_flags = MLX5_IB_SEND_UMR_UNREG; + struct mlx5_umr_wr *umrwr = (struct mlx5_umr_wr *)&wr->wr.fast_reg; + + wr->send_flags = MLX5_IB_SEND_UMR_UNREG | MLX5_IB_SEND_UMR_FAIL_IF_FREE; wr->opcode = MLX5_IB_WR_UMR; - wr->wr.fast_reg.rkey = key; + umrwr->mkey = key; } void mlx5_umr_cq_handler(struct ib_cq *cq, void *cq_context) @@ -742,7 +759,10 @@ static struct mlx5_ib_mr *reg_umr(struct ib_pd *pd, struct ib_umem *umem, struct ib_send_wr wr, *bad; struct mlx5_ib_mr *mr; struct ib_sge sg; - int size = sizeof(u64) * npages; + int size; + __be64 *mr_pas; + __be64 *pas; + dma_addr_t dma; int err = 0; int i; @@ -761,25 +781,31 @@ static struct mlx5_ib_mr *reg_umr(struct ib_pd *pd, struct ib_umem *umem, if (!mr) return ERR_PTR(-EAGAIN); - mr->pas = kmalloc(size + MLX5_UMR_ALIGN - 1, GFP_KERNEL); - if (!mr->pas) { + /* UMR copies MTTs in units of MLX5_UMR_MTT_ALIGNMENT bytes. + * To avoid copying garbage after the pas array, we allocate + * a little more. */ + size = ALIGN(sizeof(u64) * npages, MLX5_UMR_MTT_ALIGNMENT); + mr_pas = kmalloc(size + MLX5_UMR_ALIGN - 1, GFP_KERNEL); + if (!mr_pas) { err = -ENOMEM; goto free_mr; } - mlx5_ib_populate_pas(dev, umem, page_shift, - mr_align(mr->pas, MLX5_UMR_ALIGN), 1); + pas = PTR_ALIGN(mr_pas, MLX5_UMR_ALIGN); + mlx5_ib_populate_pas(dev, umem, page_shift, pas, MLX5_IB_MTT_PRESENT); + /* Clear padding after the actual pages. */ + memset(pas + npages, 0, size - npages * sizeof(u64)); - mr->dma = dma_map_single(ddev, mr_align(mr->pas, MLX5_UMR_ALIGN), size, - DMA_TO_DEVICE); - if (dma_mapping_error(ddev, mr->dma)) { + dma = dma_map_single(ddev, pas, size, DMA_TO_DEVICE); + if (dma_mapping_error(ddev, dma)) { err = -ENOMEM; goto free_pas; } memset(&wr, 0, sizeof(wr)); wr.wr_id = (u64)(unsigned long)&umr_context; - prep_umr_reg_wqe(pd, &wr, &sg, mr->dma, npages, mr->mmr.key, page_shift, virt_addr, len, access_flags); + prep_umr_reg_wqe(pd, &wr, &sg, dma, npages, mr->mmr.key, page_shift, + virt_addr, len, access_flags); mlx5_ib_init_umr_context(&umr_context); down(&umrc->sem); @@ -799,12 +825,14 @@ static struct mlx5_ib_mr *reg_umr(struct ib_pd *pd, struct ib_umem *umem, mr->mmr.size = len; mr->mmr.pd = to_mpd(pd)->pdn; + mr->live = 1; + unmap_dma: up(&umrc->sem); - dma_unmap_single(ddev, mr->dma, size, DMA_TO_DEVICE); + dma_unmap_single(ddev, dma, size, DMA_TO_DEVICE); free_pas: - kfree(mr->pas); + kfree(mr_pas); free_mr: if (err) { @@ -815,6 +843,128 @@ free_mr: return mr; } +#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING +int mlx5_ib_update_mtt(struct mlx5_ib_mr *mr, u64 start_page_index, int npages, + int zap) +{ + struct mlx5_ib_dev *dev = mr->dev; + struct device *ddev = dev->ib_dev.dma_device; + struct umr_common *umrc = &dev->umrc; + struct mlx5_ib_umr_context umr_context; + struct ib_umem *umem = mr->umem; + int size; + __be64 *pas; + dma_addr_t dma; + struct ib_send_wr wr, *bad; + struct mlx5_umr_wr *umrwr = (struct mlx5_umr_wr *)&wr.wr.fast_reg; + struct ib_sge sg; + int err = 0; + const int page_index_alignment = MLX5_UMR_MTT_ALIGNMENT / sizeof(u64); + const int page_index_mask = page_index_alignment - 1; + size_t pages_mapped = 0; + size_t pages_to_map = 0; + size_t pages_iter = 0; + int use_emergency_buf = 0; + + /* UMR copies MTTs in units of MLX5_UMR_MTT_ALIGNMENT bytes, + * so we need to align the offset and length accordingly */ + if (start_page_index & page_index_mask) { + npages += start_page_index & page_index_mask; + start_page_index &= ~page_index_mask; + } + + pages_to_map = ALIGN(npages, page_index_alignment); + + if (start_page_index + pages_to_map > MLX5_MAX_UMR_PAGES) + return -EINVAL; + + size = sizeof(u64) * pages_to_map; + size = min_t(int, PAGE_SIZE, size); + /* We allocate with GFP_ATOMIC to avoid recursion into page-reclaim + * code, when we are called from an invalidation. The pas buffer must + * be 2k-aligned for Connect-IB. */ + pas = (__be64 *)get_zeroed_page(GFP_ATOMIC); + if (!pas) { + mlx5_ib_warn(dev, "unable to allocate memory during MTT update, falling back to slower chunked mechanism.\n"); + pas = mlx5_ib_update_mtt_emergency_buffer; + size = MLX5_UMR_MTT_MIN_CHUNK_SIZE; + use_emergency_buf = 1; + mutex_lock(&mlx5_ib_update_mtt_emergency_buffer_mutex); + memset(pas, 0, size); + } + pages_iter = size / sizeof(u64); + dma = dma_map_single(ddev, pas, size, DMA_TO_DEVICE); + if (dma_mapping_error(ddev, dma)) { + mlx5_ib_err(dev, "unable to map DMA during MTT update.\n"); + err = -ENOMEM; + goto free_pas; + } + + for (pages_mapped = 0; + pages_mapped < pages_to_map && !err; + pages_mapped += pages_iter, start_page_index += pages_iter) { + dma_sync_single_for_cpu(ddev, dma, size, DMA_TO_DEVICE); + + npages = min_t(size_t, + pages_iter, + ib_umem_num_pages(umem) - start_page_index); + + if (!zap) { + __mlx5_ib_populate_pas(dev, umem, PAGE_SHIFT, + start_page_index, npages, pas, + MLX5_IB_MTT_PRESENT); + /* Clear padding after the pages brought from the + * umem. */ + memset(pas + npages, 0, size - npages * sizeof(u64)); + } + + dma_sync_single_for_device(ddev, dma, size, DMA_TO_DEVICE); + + memset(&wr, 0, sizeof(wr)); + wr.wr_id = (u64)(unsigned long)&umr_context; + + sg.addr = dma; + sg.length = ALIGN(npages * sizeof(u64), + MLX5_UMR_MTT_ALIGNMENT); + sg.lkey = dev->umrc.mr->lkey; + + wr.send_flags = MLX5_IB_SEND_UMR_FAIL_IF_FREE | + MLX5_IB_SEND_UMR_UPDATE_MTT; + wr.sg_list = &sg; + wr.num_sge = 1; + wr.opcode = MLX5_IB_WR_UMR; + umrwr->npages = sg.length / sizeof(u64); + umrwr->page_shift = PAGE_SHIFT; + umrwr->mkey = mr->mmr.key; + umrwr->target.offset = start_page_index; + + mlx5_ib_init_umr_context(&umr_context); + down(&umrc->sem); + err = ib_post_send(umrc->qp, &wr, &bad); + if (err) { + mlx5_ib_err(dev, "UMR post send failed, err %d\n", err); + } else { + wait_for_completion(&umr_context.done); + if (umr_context.status != IB_WC_SUCCESS) { + mlx5_ib_err(dev, "UMR completion failed, code %d\n", + umr_context.status); + err = -EFAULT; + } + } + up(&umrc->sem); + } + dma_unmap_single(ddev, dma, size, DMA_TO_DEVICE); + +free_pas: + if (!use_emergency_buf) + free_page((unsigned long)pas); + else + mutex_unlock(&mlx5_ib_update_mtt_emergency_buffer_mutex); + + return err; +} +#endif + static struct mlx5_ib_mr *reg_create(struct ib_pd *pd, u64 virt_addr, u64 length, struct ib_umem *umem, int npages, int page_shift, @@ -825,6 +975,8 @@ static struct mlx5_ib_mr *reg_create(struct ib_pd *pd, u64 virt_addr, struct mlx5_ib_mr *mr; int inlen; int err; + bool pg_cap = !!(dev->mdev->caps.gen.flags & + MLX5_DEV_CAP_FLAG_ON_DMND_PG); mr = kzalloc(sizeof(*mr), GFP_KERNEL); if (!mr) @@ -836,8 +988,12 @@ static struct mlx5_ib_mr *reg_create(struct ib_pd *pd, u64 virt_addr, err = -ENOMEM; goto err_1; } - mlx5_ib_populate_pas(dev, umem, page_shift, in->pas, 0); + mlx5_ib_populate_pas(dev, umem, page_shift, in->pas, + pg_cap ? MLX5_IB_MTT_PRESENT : 0); + /* The MLX5_MKEY_INBOX_PG_ACCESS bit allows setting the access flags + * in the page list submitted with the command. */ + in->flags = pg_cap ? cpu_to_be32(MLX5_MKEY_INBOX_PG_ACCESS) : 0; in->seg.flags = convert_access(access_flags) | MLX5_ACCESS_MODE_MTT; in->seg.flags_pd = cpu_to_be32(to_mpd(pd)->pdn); @@ -856,6 +1012,7 @@ static struct mlx5_ib_mr *reg_create(struct ib_pd *pd, u64 virt_addr, goto err_2; } mr->umem = umem; + mr->live = 1; kvfree(in); mlx5_ib_dbg(dev, "mkey = 0x%x\n", mr->mmr.key); @@ -910,6 +1067,10 @@ struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, mlx5_ib_dbg(dev, "cache empty for order %d", order); mr = NULL; } + } else if (access_flags & IB_ACCESS_ON_DEMAND) { + err = -EINVAL; + pr_err("Got MR registration for ODP MR > 512MB, not supported for Connect-IB"); + goto error; } if (!mr) @@ -925,16 +1086,51 @@ struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, mr->umem = umem; mr->npages = npages; - spin_lock(&dev->mr_lock); - dev->mdev->priv.reg_pages += npages; - spin_unlock(&dev->mr_lock); + atomic_add(npages, &dev->mdev->priv.reg_pages); mr->ibmr.lkey = mr->mmr.key; mr->ibmr.rkey = mr->mmr.key; +#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING + if (umem->odp_data) { + /* + * This barrier prevents the compiler from moving the + * setting of umem->odp_data->private to point to our + * MR, before reg_umr finished, to ensure that the MR + * initialization have finished before starting to + * handle invalidations. + */ + smp_wmb(); + mr->umem->odp_data->private = mr; + /* + * Make sure we will see the new + * umem->odp_data->private value in the invalidation + * routines, before we can get page faults on the + * MR. Page faults can happen once we put the MR in + * the tree, below this line. Without the barrier, + * there can be a fault handling and an invalidation + * before umem->odp_data->private == mr is visible to + * the invalidation handler. + */ + smp_wmb(); + } +#endif + return &mr->ibmr; error: + /* + * Destroy the umem *before* destroying the MR, to ensure we + * will not have any in-flight notifiers when destroying the + * MR. + * + * As the MR is completely invalid to begin with, and this + * error path is only taken if we can't push the mr entry into + * the pagefault tree, this is safe. + */ + ib_umem_release(umem); + /* Kill the MR, and return an error code. */ + clean_mr(mr); return ERR_PTR(err); } @@ -971,17 +1167,14 @@ error: return err; } -int mlx5_ib_dereg_mr(struct ib_mr *ibmr) +static int clean_mr(struct mlx5_ib_mr *mr) { - struct mlx5_ib_dev *dev = to_mdev(ibmr->device); - struct mlx5_ib_mr *mr = to_mmr(ibmr); - struct ib_umem *umem = mr->umem; - int npages = mr->npages; + struct mlx5_ib_dev *dev = to_mdev(mr->ibmr.device); int umred = mr->umred; int err; if (!umred) { - err = mlx5_core_destroy_mkey(dev->mdev, &mr->mmr); + err = destroy_mkey(dev, mr); if (err) { mlx5_ib_warn(dev, "failed to destroy mkey 0x%x (%d)\n", mr->mmr.key, err); @@ -996,15 +1189,47 @@ int mlx5_ib_dereg_mr(struct ib_mr *ibmr) free_cached_mr(dev, mr); } - if (umem) { + if (!umred) + kfree(mr); + + return 0; +} + +int mlx5_ib_dereg_mr(struct ib_mr *ibmr) +{ + struct mlx5_ib_dev *dev = to_mdev(ibmr->device); + struct mlx5_ib_mr *mr = to_mmr(ibmr); + int npages = mr->npages; + struct ib_umem *umem = mr->umem; + +#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING + if (umem && umem->odp_data) { + /* Prevent new page faults from succeeding */ + mr->live = 0; + /* Wait for all running page-fault handlers to finish. */ + synchronize_srcu(&dev->mr_srcu); + /* Destroy all page mappings */ + mlx5_ib_invalidate_range(umem, ib_umem_start(umem), + ib_umem_end(umem)); + /* + * We kill the umem before the MR for ODP, + * so that there will not be any invalidations in + * flight, looking at the *mr struct. + */ ib_umem_release(umem); - spin_lock(&dev->mr_lock); - dev->mdev->priv.reg_pages -= npages; - spin_unlock(&dev->mr_lock); + atomic_sub(npages, &dev->mdev->priv.reg_pages); + + /* Avoid double-freeing the umem. */ + umem = NULL; } +#endif - if (!umred) - kfree(mr); + clean_mr(mr); + + if (umem) { + ib_umem_release(umem); + atomic_sub(npages, &dev->mdev->priv.reg_pages); + } return 0; } @@ -1028,7 +1253,7 @@ struct ib_mr *mlx5_ib_create_mr(struct ib_pd *pd, goto err_free; } - in->seg.status = 1 << 6; /* free */ + in->seg.status = MLX5_MKEY_STATUS_FREE; in->seg.xlt_oct_size = cpu_to_be32(ndescs); in->seg.qpn_mkey7_0 = cpu_to_be32(0xffffff << 8); in->seg.flags_pd = cpu_to_be32(to_mpd(pd)->pdn); @@ -1113,7 +1338,7 @@ int mlx5_ib_destroy_mr(struct ib_mr *ibmr) kfree(mr->sig); } - err = mlx5_core_destroy_mkey(dev->mdev, &mr->mmr); + err = destroy_mkey(dev, mr); if (err) { mlx5_ib_warn(dev, "failed to destroy mkey 0x%x (%d)\n", mr->mmr.key, err); @@ -1143,7 +1368,7 @@ struct ib_mr *mlx5_ib_alloc_fast_reg_mr(struct ib_pd *pd, goto err_free; } - in->seg.status = 1 << 6; /* free */ + in->seg.status = MLX5_MKEY_STATUS_FREE; in->seg.xlt_oct_size = cpu_to_be32((max_page_list_len + 1) / 2); in->seg.qpn_mkey7_0 = cpu_to_be32(0xffffff << 8); in->seg.flags = MLX5_PERM_UMR_EN | MLX5_ACCESS_MODE_MTT; diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c new file mode 100644 index 000000000000..a2c541c4809a --- /dev/null +++ b/drivers/infiniband/hw/mlx5/odp.c @@ -0,0 +1,798 @@ +/* + * Copyright (c) 2014 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include <rdma/ib_umem.h> +#include <rdma/ib_umem_odp.h> + +#include "mlx5_ib.h" + +#define MAX_PREFETCH_LEN (4*1024*1024U) + +/* Timeout in ms to wait for an active mmu notifier to complete when handling + * a pagefault. */ +#define MMU_NOTIFIER_TIMEOUT 1000 + +struct workqueue_struct *mlx5_ib_page_fault_wq; + +void mlx5_ib_invalidate_range(struct ib_umem *umem, unsigned long start, + unsigned long end) +{ + struct mlx5_ib_mr *mr; + const u64 umr_block_mask = (MLX5_UMR_MTT_ALIGNMENT / sizeof(u64)) - 1; + u64 idx = 0, blk_start_idx = 0; + int in_block = 0; + u64 addr; + + if (!umem || !umem->odp_data) { + pr_err("invalidation called on NULL umem or non-ODP umem\n"); + return; + } + + mr = umem->odp_data->private; + + if (!mr || !mr->ibmr.pd) + return; + + start = max_t(u64, ib_umem_start(umem), start); + end = min_t(u64, ib_umem_end(umem), end); + + /* + * Iteration one - zap the HW's MTTs. The notifiers_count ensures that + * while we are doing the invalidation, no page fault will attempt to + * overwrite the same MTTs. Concurent invalidations might race us, + * but they will write 0s as well, so no difference in the end result. + */ + + for (addr = start; addr < end; addr += (u64)umem->page_size) { + idx = (addr - ib_umem_start(umem)) / PAGE_SIZE; + /* + * Strive to write the MTTs in chunks, but avoid overwriting + * non-existing MTTs. The huristic here can be improved to + * estimate the cost of another UMR vs. the cost of bigger + * UMR. + */ + if (umem->odp_data->dma_list[idx] & + (ODP_READ_ALLOWED_BIT | ODP_WRITE_ALLOWED_BIT)) { + if (!in_block) { + blk_start_idx = idx; + in_block = 1; + } + } else { + u64 umr_offset = idx & umr_block_mask; + + if (in_block && umr_offset == 0) { + mlx5_ib_update_mtt(mr, blk_start_idx, + idx - blk_start_idx, 1); + in_block = 0; + } + } + } + if (in_block) + mlx5_ib_update_mtt(mr, blk_start_idx, idx - blk_start_idx + 1, + 1); + + /* + * We are now sure that the device will not access the + * memory. We can safely unmap it, and mark it as dirty if + * needed. + */ + + ib_umem_odp_unmap_dma_pages(umem, start, end); +} + +#define COPY_ODP_BIT_MLX_TO_IB(reg, ib_caps, field_name, bit_name) do { \ + if (be32_to_cpu(reg.field_name) & MLX5_ODP_SUPPORT_##bit_name) \ + ib_caps->field_name |= IB_ODP_SUPPORT_##bit_name; \ +} while (0) + +int mlx5_ib_internal_query_odp_caps(struct mlx5_ib_dev *dev) +{ + int err; + struct mlx5_odp_caps hw_caps; + struct ib_odp_caps *caps = &dev->odp_caps; + + memset(caps, 0, sizeof(*caps)); + + if (!(dev->mdev->caps.gen.flags & MLX5_DEV_CAP_FLAG_ON_DMND_PG)) + return 0; + + err = mlx5_query_odp_caps(dev->mdev, &hw_caps); + if (err) + goto out; + + caps->general_caps = IB_ODP_SUPPORT; + COPY_ODP_BIT_MLX_TO_IB(hw_caps, caps, per_transport_caps.ud_odp_caps, + SEND); + COPY_ODP_BIT_MLX_TO_IB(hw_caps, caps, per_transport_caps.rc_odp_caps, + SEND); + COPY_ODP_BIT_MLX_TO_IB(hw_caps, caps, per_transport_caps.rc_odp_caps, + RECV); + COPY_ODP_BIT_MLX_TO_IB(hw_caps, caps, per_transport_caps.rc_odp_caps, + WRITE); + COPY_ODP_BIT_MLX_TO_IB(hw_caps, caps, per_transport_caps.rc_odp_caps, + READ); + +out: + return err; +} + +static struct mlx5_ib_mr *mlx5_ib_odp_find_mr_lkey(struct mlx5_ib_dev *dev, + u32 key) +{ + u32 base_key = mlx5_base_mkey(key); + struct mlx5_core_mr *mmr = __mlx5_mr_lookup(dev->mdev, base_key); + struct mlx5_ib_mr *mr = container_of(mmr, struct mlx5_ib_mr, mmr); + + if (!mmr || mmr->key != key || !mr->live) + return NULL; + + return container_of(mmr, struct mlx5_ib_mr, mmr); +} + +static void mlx5_ib_page_fault_resume(struct mlx5_ib_qp *qp, + struct mlx5_ib_pfault *pfault, + int error) { + struct mlx5_ib_dev *dev = to_mdev(qp->ibqp.pd->device); + int ret = mlx5_core_page_fault_resume(dev->mdev, qp->mqp.qpn, + pfault->mpfault.flags, + error); + if (ret) + pr_err("Failed to resolve the page fault on QP 0x%x\n", + qp->mqp.qpn); +} + +/* + * Handle a single data segment in a page-fault WQE. + * + * Returns number of pages retrieved on success. The caller will continue to + * the next data segment. + * Can return the following error codes: + * -EAGAIN to designate a temporary error. The caller will abort handling the + * page fault and resolve it. + * -EFAULT when there's an error mapping the requested pages. The caller will + * abort the page fault handling and possibly move the QP to an error state. + * On other errors the QP should also be closed with an error. + */ +static int pagefault_single_data_segment(struct mlx5_ib_qp *qp, + struct mlx5_ib_pfault *pfault, + u32 key, u64 io_virt, size_t bcnt, + u32 *bytes_mapped) +{ + struct mlx5_ib_dev *mib_dev = to_mdev(qp->ibqp.pd->device); + int srcu_key; + unsigned int current_seq; + u64 start_idx; + int npages = 0, ret = 0; + struct mlx5_ib_mr *mr; + u64 access_mask = ODP_READ_ALLOWED_BIT; + + srcu_key = srcu_read_lock(&mib_dev->mr_srcu); + mr = mlx5_ib_odp_find_mr_lkey(mib_dev, key); + /* + * If we didn't find the MR, it means the MR was closed while we were + * handling the ODP event. In this case we return -EFAULT so that the + * QP will be closed. + */ + if (!mr || !mr->ibmr.pd) { + pr_err("Failed to find relevant mr for lkey=0x%06x, probably the MR was destroyed\n", + key); + ret = -EFAULT; + goto srcu_unlock; + } + if (!mr->umem->odp_data) { + pr_debug("skipping non ODP MR (lkey=0x%06x) in page fault handler.\n", + key); + if (bytes_mapped) + *bytes_mapped += + (bcnt - pfault->mpfault.bytes_committed); + goto srcu_unlock; + } + if (mr->ibmr.pd != qp->ibqp.pd) { + pr_err("Page-fault with different PDs for QP and MR.\n"); + ret = -EFAULT; + goto srcu_unlock; + } + + current_seq = ACCESS_ONCE(mr->umem->odp_data->notifiers_seq); + /* + * Ensure the sequence number is valid for some time before we call + * gup. + */ + smp_rmb(); + + /* + * Avoid branches - this code will perform correctly + * in all iterations (in iteration 2 and above, + * bytes_committed == 0). + */ + io_virt += pfault->mpfault.bytes_committed; + bcnt -= pfault->mpfault.bytes_committed; + + start_idx = (io_virt - (mr->mmr.iova & PAGE_MASK)) >> PAGE_SHIFT; + + if (mr->umem->writable) + access_mask |= ODP_WRITE_ALLOWED_BIT; + npages = ib_umem_odp_map_dma_pages(mr->umem, io_virt, bcnt, + access_mask, current_seq); + if (npages < 0) { + ret = npages; + goto srcu_unlock; + } + + if (npages > 0) { + mutex_lock(&mr->umem->odp_data->umem_mutex); + if (!ib_umem_mmu_notifier_retry(mr->umem, current_seq)) { + /* + * No need to check whether the MTTs really belong to + * this MR, since ib_umem_odp_map_dma_pages already + * checks this. + */ + ret = mlx5_ib_update_mtt(mr, start_idx, npages, 0); + } else { + ret = -EAGAIN; + } + mutex_unlock(&mr->umem->odp_data->umem_mutex); + if (ret < 0) { + if (ret != -EAGAIN) + pr_err("Failed to update mkey page tables\n"); + goto srcu_unlock; + } + + if (bytes_mapped) { + u32 new_mappings = npages * PAGE_SIZE - + (io_virt - round_down(io_virt, PAGE_SIZE)); + *bytes_mapped += min_t(u32, new_mappings, bcnt); + } + } + +srcu_unlock: + if (ret == -EAGAIN) { + if (!mr->umem->odp_data->dying) { + struct ib_umem_odp *odp_data = mr->umem->odp_data; + unsigned long timeout = + msecs_to_jiffies(MMU_NOTIFIER_TIMEOUT); + + if (!wait_for_completion_timeout( + &odp_data->notifier_completion, + timeout)) { + pr_warn("timeout waiting for mmu notifier completion\n"); + } + } else { + /* The MR is being killed, kill the QP as well. */ + ret = -EFAULT; + } + } + srcu_read_unlock(&mib_dev->mr_srcu, srcu_key); + pfault->mpfault.bytes_committed = 0; + return ret ? ret : npages; +} + +/** + * Parse a series of data segments for page fault handling. + * + * @qp the QP on which the fault occurred. + * @pfault contains page fault information. + * @wqe points at the first data segment in the WQE. + * @wqe_end points after the end of the WQE. + * @bytes_mapped receives the number of bytes that the function was able to + * map. This allows the caller to decide intelligently whether + * enough memory was mapped to resolve the page fault + * successfully (e.g. enough for the next MTU, or the entire + * WQE). + * @total_wqe_bytes receives the total data size of this WQE in bytes (minus + * the committed bytes). + * + * Returns the number of pages loaded if positive, zero for an empty WQE, or a + * negative error code. + */ +static int pagefault_data_segments(struct mlx5_ib_qp *qp, + struct mlx5_ib_pfault *pfault, void *wqe, + void *wqe_end, u32 *bytes_mapped, + u32 *total_wqe_bytes, int receive_queue) +{ + int ret = 0, npages = 0; + u64 io_virt; + u32 key; + u32 byte_count; + size_t bcnt; + int inline_segment; + + /* Skip SRQ next-WQE segment. */ + if (receive_queue && qp->ibqp.srq) + wqe += sizeof(struct mlx5_wqe_srq_next_seg); + + if (bytes_mapped) + *bytes_mapped = 0; + if (total_wqe_bytes) + *total_wqe_bytes = 0; + + while (wqe < wqe_end) { + struct mlx5_wqe_data_seg *dseg = wqe; + + io_virt = be64_to_cpu(dseg->addr); + key = be32_to_cpu(dseg->lkey); + byte_count = be32_to_cpu(dseg->byte_count); + inline_segment = !!(byte_count & MLX5_INLINE_SEG); + bcnt = byte_count & ~MLX5_INLINE_SEG; + + if (inline_segment) { + bcnt = bcnt & MLX5_WQE_INLINE_SEG_BYTE_COUNT_MASK; + wqe += ALIGN(sizeof(struct mlx5_wqe_inline_seg) + bcnt, + 16); + } else { + wqe += sizeof(*dseg); + } + + /* receive WQE end of sg list. */ + if (receive_queue && bcnt == 0 && key == MLX5_INVALID_LKEY && + io_virt == 0) + break; + + if (!inline_segment && total_wqe_bytes) { + *total_wqe_bytes += bcnt - min_t(size_t, bcnt, + pfault->mpfault.bytes_committed); + } + + /* A zero length data segment designates a length of 2GB. */ + if (bcnt == 0) + bcnt = 1U << 31; + + if (inline_segment || bcnt <= pfault->mpfault.bytes_committed) { + pfault->mpfault.bytes_committed -= + min_t(size_t, bcnt, + pfault->mpfault.bytes_committed); + continue; + } + + ret = pagefault_single_data_segment(qp, pfault, key, io_virt, + bcnt, bytes_mapped); + if (ret < 0) + break; + npages += ret; + } + + return ret < 0 ? ret : npages; +} + +/* + * Parse initiator WQE. Advances the wqe pointer to point at the + * scatter-gather list, and set wqe_end to the end of the WQE. + */ +static int mlx5_ib_mr_initiator_pfault_handler( + struct mlx5_ib_qp *qp, struct mlx5_ib_pfault *pfault, + void **wqe, void **wqe_end, int wqe_length) +{ + struct mlx5_ib_dev *dev = to_mdev(qp->ibqp.pd->device); + struct mlx5_wqe_ctrl_seg *ctrl = *wqe; + u16 wqe_index = pfault->mpfault.wqe.wqe_index; + unsigned ds, opcode; +#if defined(DEBUG) + u32 ctrl_wqe_index, ctrl_qpn; +#endif + + ds = be32_to_cpu(ctrl->qpn_ds) & MLX5_WQE_CTRL_DS_MASK; + if (ds * MLX5_WQE_DS_UNITS > wqe_length) { + mlx5_ib_err(dev, "Unable to read the complete WQE. ds = 0x%x, ret = 0x%x\n", + ds, wqe_length); + return -EFAULT; + } + + if (ds == 0) { + mlx5_ib_err(dev, "Got WQE with zero DS. wqe_index=%x, qpn=%x\n", + wqe_index, qp->mqp.qpn); + return -EFAULT; + } + +#if defined(DEBUG) + ctrl_wqe_index = (be32_to_cpu(ctrl->opmod_idx_opcode) & + MLX5_WQE_CTRL_WQE_INDEX_MASK) >> + MLX5_WQE_CTRL_WQE_INDEX_SHIFT; + if (wqe_index != ctrl_wqe_index) { + mlx5_ib_err(dev, "Got WQE with invalid wqe_index. wqe_index=0x%x, qpn=0x%x ctrl->wqe_index=0x%x\n", + wqe_index, qp->mqp.qpn, + ctrl_wqe_index); + return -EFAULT; + } + + ctrl_qpn = (be32_to_cpu(ctrl->qpn_ds) & MLX5_WQE_CTRL_QPN_MASK) >> + MLX5_WQE_CTRL_QPN_SHIFT; + if (qp->mqp.qpn != ctrl_qpn) { + mlx5_ib_err(dev, "Got WQE with incorrect QP number. wqe_index=0x%x, qpn=0x%x ctrl->qpn=0x%x\n", + wqe_index, qp->mqp.qpn, + ctrl_qpn); + return -EFAULT; + } +#endif /* DEBUG */ + + *wqe_end = *wqe + ds * MLX5_WQE_DS_UNITS; + *wqe += sizeof(*ctrl); + + opcode = be32_to_cpu(ctrl->opmod_idx_opcode) & + MLX5_WQE_CTRL_OPCODE_MASK; + switch (qp->ibqp.qp_type) { + case IB_QPT_RC: + switch (opcode) { + case MLX5_OPCODE_SEND: + case MLX5_OPCODE_SEND_IMM: + case MLX5_OPCODE_SEND_INVAL: + if (!(dev->odp_caps.per_transport_caps.rc_odp_caps & + IB_ODP_SUPPORT_SEND)) + goto invalid_transport_or_opcode; + break; + case MLX5_OPCODE_RDMA_WRITE: + case MLX5_OPCODE_RDMA_WRITE_IMM: + if (!(dev->odp_caps.per_transport_caps.rc_odp_caps & + IB_ODP_SUPPORT_WRITE)) + goto invalid_transport_or_opcode; + *wqe += sizeof(struct mlx5_wqe_raddr_seg); + break; + case MLX5_OPCODE_RDMA_READ: + if (!(dev->odp_caps.per_transport_caps.rc_odp_caps & + IB_ODP_SUPPORT_READ)) + goto invalid_transport_or_opcode; + *wqe += sizeof(struct mlx5_wqe_raddr_seg); + break; + default: + goto invalid_transport_or_opcode; + } + break; + case IB_QPT_UD: + switch (opcode) { + case MLX5_OPCODE_SEND: + case MLX5_OPCODE_SEND_IMM: + if (!(dev->odp_caps.per_transport_caps.ud_odp_caps & + IB_ODP_SUPPORT_SEND)) + goto invalid_transport_or_opcode; + *wqe += sizeof(struct mlx5_wqe_datagram_seg); + break; + default: + goto invalid_transport_or_opcode; + } + break; + default: +invalid_transport_or_opcode: + mlx5_ib_err(dev, "ODP fault on QP of an unsupported opcode or transport. transport: 0x%x opcode: 0x%x.\n", + qp->ibqp.qp_type, opcode); + return -EFAULT; + } + + return 0; +} + +/* + * Parse responder WQE. Advances the wqe pointer to point at the + * scatter-gather list, and set wqe_end to the end of the WQE. + */ +static int mlx5_ib_mr_responder_pfault_handler( + struct mlx5_ib_qp *qp, struct mlx5_ib_pfault *pfault, + void **wqe, void **wqe_end, int wqe_length) +{ + struct mlx5_ib_dev *dev = to_mdev(qp->ibqp.pd->device); + struct mlx5_ib_wq *wq = &qp->rq; + int wqe_size = 1 << wq->wqe_shift; + + if (qp->ibqp.srq) { + mlx5_ib_err(dev, "ODP fault on SRQ is not supported\n"); + return -EFAULT; + } + + if (qp->wq_sig) { + mlx5_ib_err(dev, "ODP fault with WQE signatures is not supported\n"); + return -EFAULT; + } + + if (wqe_size > wqe_length) { + mlx5_ib_err(dev, "Couldn't read all of the receive WQE's content\n"); + return -EFAULT; + } + + switch (qp->ibqp.qp_type) { + case IB_QPT_RC: + if (!(dev->odp_caps.per_transport_caps.rc_odp_caps & + IB_ODP_SUPPORT_RECV)) + goto invalid_transport_or_opcode; + break; + default: +invalid_transport_or_opcode: + mlx5_ib_err(dev, "ODP fault on QP of an unsupported transport. transport: 0x%x\n", + qp->ibqp.qp_type); + return -EFAULT; + } + + *wqe_end = *wqe + wqe_size; + + return 0; +} + +static void mlx5_ib_mr_wqe_pfault_handler(struct mlx5_ib_qp *qp, + struct mlx5_ib_pfault *pfault) +{ + struct mlx5_ib_dev *dev = to_mdev(qp->ibqp.pd->device); + int ret; + void *wqe, *wqe_end; + u32 bytes_mapped, total_wqe_bytes; + char *buffer = NULL; + int resume_with_error = 0; + u16 wqe_index = pfault->mpfault.wqe.wqe_index; + int requestor = pfault->mpfault.flags & MLX5_PFAULT_REQUESTOR; + + buffer = (char *)__get_free_page(GFP_KERNEL); + if (!buffer) { + mlx5_ib_err(dev, "Error allocating memory for IO page fault handling.\n"); + resume_with_error = 1; + goto resolve_page_fault; + } + + ret = mlx5_ib_read_user_wqe(qp, requestor, wqe_index, buffer, + PAGE_SIZE); + if (ret < 0) { + mlx5_ib_err(dev, "Failed reading a WQE following page fault, error=%x, wqe_index=%x, qpn=%x\n", + -ret, wqe_index, qp->mqp.qpn); + resume_with_error = 1; + goto resolve_page_fault; + } + + wqe = buffer; + if (requestor) + ret = mlx5_ib_mr_initiator_pfault_handler(qp, pfault, &wqe, + &wqe_end, ret); + else + ret = mlx5_ib_mr_responder_pfault_handler(qp, pfault, &wqe, + &wqe_end, ret); + if (ret < 0) { + resume_with_error = 1; + goto resolve_page_fault; + } + + if (wqe >= wqe_end) { + mlx5_ib_err(dev, "ODP fault on invalid WQE.\n"); + resume_with_error = 1; + goto resolve_page_fault; + } + + ret = pagefault_data_segments(qp, pfault, wqe, wqe_end, &bytes_mapped, + &total_wqe_bytes, !requestor); + if (ret == -EAGAIN) { + goto resolve_page_fault; + } else if (ret < 0 || total_wqe_bytes > bytes_mapped) { + mlx5_ib_err(dev, "Error getting user pages for page fault. Error: 0x%x\n", + -ret); + resume_with_error = 1; + goto resolve_page_fault; + } + +resolve_page_fault: + mlx5_ib_page_fault_resume(qp, pfault, resume_with_error); + mlx5_ib_dbg(dev, "PAGE FAULT completed. QP 0x%x resume_with_error=%d, flags: 0x%x\n", + qp->mqp.qpn, resume_with_error, pfault->mpfault.flags); + + free_page((unsigned long)buffer); +} + +static int pages_in_range(u64 address, u32 length) +{ + return (ALIGN(address + length, PAGE_SIZE) - + (address & PAGE_MASK)) >> PAGE_SHIFT; +} + +static void mlx5_ib_mr_rdma_pfault_handler(struct mlx5_ib_qp *qp, + struct mlx5_ib_pfault *pfault) +{ + struct mlx5_pagefault *mpfault = &pfault->mpfault; + u64 address; + u32 length; + u32 prefetch_len = mpfault->bytes_committed; + int prefetch_activated = 0; + u32 rkey = mpfault->rdma.r_key; + int ret; + + /* The RDMA responder handler handles the page fault in two parts. + * First it brings the necessary pages for the current packet + * (and uses the pfault context), and then (after resuming the QP) + * prefetches more pages. The second operation cannot use the pfault + * context and therefore uses the dummy_pfault context allocated on + * the stack */ + struct mlx5_ib_pfault dummy_pfault = {}; + + dummy_pfault.mpfault.bytes_committed = 0; + + mpfault->rdma.rdma_va += mpfault->bytes_committed; + mpfault->rdma.rdma_op_len -= min(mpfault->bytes_committed, + mpfault->rdma.rdma_op_len); + mpfault->bytes_committed = 0; + + address = mpfault->rdma.rdma_va; + length = mpfault->rdma.rdma_op_len; + + /* For some operations, the hardware cannot tell the exact message + * length, and in those cases it reports zero. Use prefetch + * logic. */ + if (length == 0) { + prefetch_activated = 1; + length = mpfault->rdma.packet_size; + prefetch_len = min(MAX_PREFETCH_LEN, prefetch_len); + } + + ret = pagefault_single_data_segment(qp, pfault, rkey, address, length, + NULL); + if (ret == -EAGAIN) { + /* We're racing with an invalidation, don't prefetch */ + prefetch_activated = 0; + } else if (ret < 0 || pages_in_range(address, length) > ret) { + mlx5_ib_page_fault_resume(qp, pfault, 1); + return; + } + + mlx5_ib_page_fault_resume(qp, pfault, 0); + + /* At this point, there might be a new pagefault already arriving in + * the eq, switch to the dummy pagefault for the rest of the + * processing. We're still OK with the objects being alive as the + * work-queue is being fenced. */ + + if (prefetch_activated) { + ret = pagefault_single_data_segment(qp, &dummy_pfault, rkey, + address, + prefetch_len, + NULL); + if (ret < 0) { + pr_warn("Prefetch failed (ret = %d, prefetch_activated = %d) for QPN %d, address: 0x%.16llx, length = 0x%.16x\n", + ret, prefetch_activated, + qp->ibqp.qp_num, address, prefetch_len); + } + } +} + +void mlx5_ib_mr_pfault_handler(struct mlx5_ib_qp *qp, + struct mlx5_ib_pfault *pfault) +{ + u8 event_subtype = pfault->mpfault.event_subtype; + + switch (event_subtype) { + case MLX5_PFAULT_SUBTYPE_WQE: + mlx5_ib_mr_wqe_pfault_handler(qp, pfault); + break; + case MLX5_PFAULT_SUBTYPE_RDMA: + mlx5_ib_mr_rdma_pfault_handler(qp, pfault); + break; + default: + pr_warn("Invalid page fault event subtype: 0x%x\n", + event_subtype); + mlx5_ib_page_fault_resume(qp, pfault, 1); + break; + } +} + +static void mlx5_ib_qp_pfault_action(struct work_struct *work) +{ + struct mlx5_ib_pfault *pfault = container_of(work, + struct mlx5_ib_pfault, + work); + enum mlx5_ib_pagefault_context context = + mlx5_ib_get_pagefault_context(&pfault->mpfault); + struct mlx5_ib_qp *qp = container_of(pfault, struct mlx5_ib_qp, + pagefaults[context]); + mlx5_ib_mr_pfault_handler(qp, pfault); +} + +void mlx5_ib_qp_disable_pagefaults(struct mlx5_ib_qp *qp) +{ + unsigned long flags; + + spin_lock_irqsave(&qp->disable_page_faults_lock, flags); + qp->disable_page_faults = 1; + spin_unlock_irqrestore(&qp->disable_page_faults_lock, flags); + + /* + * Note that at this point, we are guarenteed that no more + * work queue elements will be posted to the work queue with + * the QP we are closing. + */ + flush_workqueue(mlx5_ib_page_fault_wq); +} + +void mlx5_ib_qp_enable_pagefaults(struct mlx5_ib_qp *qp) +{ + unsigned long flags; + + spin_lock_irqsave(&qp->disable_page_faults_lock, flags); + qp->disable_page_faults = 0; + spin_unlock_irqrestore(&qp->disable_page_faults_lock, flags); +} + +static void mlx5_ib_pfault_handler(struct mlx5_core_qp *qp, + struct mlx5_pagefault *pfault) +{ + /* + * Note that we will only get one fault event per QP per context + * (responder/initiator, read/write), until we resolve the page fault + * with the mlx5_ib_page_fault_resume command. Since this function is + * called from within the work element, there is no risk of missing + * events. + */ + struct mlx5_ib_qp *mibqp = to_mibqp(qp); + enum mlx5_ib_pagefault_context context = + mlx5_ib_get_pagefault_context(pfault); + struct mlx5_ib_pfault *qp_pfault = &mibqp->pagefaults[context]; + + qp_pfault->mpfault = *pfault; + + /* No need to stop interrupts here since we are in an interrupt */ + spin_lock(&mibqp->disable_page_faults_lock); + if (!mibqp->disable_page_faults) + queue_work(mlx5_ib_page_fault_wq, &qp_pfault->work); + spin_unlock(&mibqp->disable_page_faults_lock); +} + +void mlx5_ib_odp_create_qp(struct mlx5_ib_qp *qp) +{ + int i; + + qp->disable_page_faults = 1; + spin_lock_init(&qp->disable_page_faults_lock); + + qp->mqp.pfault_handler = mlx5_ib_pfault_handler; + + for (i = 0; i < MLX5_IB_PAGEFAULT_CONTEXTS; ++i) + INIT_WORK(&qp->pagefaults[i].work, mlx5_ib_qp_pfault_action); +} + +int mlx5_ib_odp_init_one(struct mlx5_ib_dev *ibdev) +{ + int ret; + + ret = init_srcu_struct(&ibdev->mr_srcu); + if (ret) + return ret; + + return 0; +} + +void mlx5_ib_odp_remove_one(struct mlx5_ib_dev *ibdev) +{ + cleanup_srcu_struct(&ibdev->mr_srcu); +} + +int __init mlx5_ib_odp_init(void) +{ + mlx5_ib_page_fault_wq = + create_singlethread_workqueue("mlx5_ib_page_faults"); + if (!mlx5_ib_page_fault_wq) + return -ENOMEM; + + return 0; +} + +void mlx5_ib_odp_cleanup(void) +{ + destroy_workqueue(mlx5_ib_page_fault_wq); +} diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c index 1cae1c7132b4..be0cd358b080 100644 --- a/drivers/infiniband/hw/mlx5/qp.c +++ b/drivers/infiniband/hw/mlx5/qp.c @@ -70,15 +70,6 @@ static const u32 mlx5_ib_opcode[] = { [MLX5_IB_WR_UMR] = MLX5_OPCODE_UMR, }; -struct umr_wr { - u64 virt_addr; - struct ib_pd *pd; - unsigned int page_shift; - unsigned int npages; - u32 length; - int access_flags; - u32 mkey; -}; static int is_qp0(enum ib_qp_type qp_type) { @@ -110,6 +101,77 @@ void *mlx5_get_send_wqe(struct mlx5_ib_qp *qp, int n) return get_wqe(qp, qp->sq.offset + (n << MLX5_IB_SQ_STRIDE)); } +/** + * mlx5_ib_read_user_wqe() - Copy a user-space WQE to kernel space. + * + * @qp: QP to copy from. + * @send: copy from the send queue when non-zero, use the receive queue + * otherwise. + * @wqe_index: index to start copying from. For send work queues, the + * wqe_index is in units of MLX5_SEND_WQE_BB. + * For receive work queue, it is the number of work queue + * element in the queue. + * @buffer: destination buffer. + * @length: maximum number of bytes to copy. + * + * Copies at least a single WQE, but may copy more data. + * + * Return: the number of bytes copied, or an error code. + */ +int mlx5_ib_read_user_wqe(struct mlx5_ib_qp *qp, int send, int wqe_index, + void *buffer, u32 length) +{ + struct ib_device *ibdev = qp->ibqp.device; + struct mlx5_ib_dev *dev = to_mdev(ibdev); + struct mlx5_ib_wq *wq = send ? &qp->sq : &qp->rq; + size_t offset; + size_t wq_end; + struct ib_umem *umem = qp->umem; + u32 first_copy_length; + int wqe_length; + int ret; + + if (wq->wqe_cnt == 0) { + mlx5_ib_dbg(dev, "mlx5_ib_read_user_wqe for a QP with wqe_cnt == 0. qp_type: 0x%x\n", + qp->ibqp.qp_type); + return -EINVAL; + } + + offset = wq->offset + ((wqe_index % wq->wqe_cnt) << wq->wqe_shift); + wq_end = wq->offset + (wq->wqe_cnt << wq->wqe_shift); + + if (send && length < sizeof(struct mlx5_wqe_ctrl_seg)) + return -EINVAL; + + if (offset > umem->length || + (send && offset + sizeof(struct mlx5_wqe_ctrl_seg) > umem->length)) + return -EINVAL; + + first_copy_length = min_t(u32, offset + length, wq_end) - offset; + ret = ib_umem_copy_from(buffer, umem, offset, first_copy_length); + if (ret) + return ret; + + if (send) { + struct mlx5_wqe_ctrl_seg *ctrl = buffer; + int ds = be32_to_cpu(ctrl->qpn_ds) & MLX5_WQE_CTRL_DS_MASK; + + wqe_length = ds * MLX5_WQE_DS_UNITS; + } else { + wqe_length = 1 << wq->wqe_shift; + } + + if (wqe_length <= first_copy_length) + return first_copy_length; + + ret = ib_umem_copy_from(buffer + first_copy_length, umem, wq->offset, + wqe_length - first_copy_length); + if (ret) + return ret; + + return wqe_length; +} + static void mlx5_ib_qp_event(struct mlx5_core_qp *qp, int type) { struct ib_qp *ibqp = &to_mibqp(qp)->ibqp; @@ -814,6 +876,8 @@ static int create_qp_common(struct mlx5_ib_dev *dev, struct ib_pd *pd, int inlen = sizeof(*in); int err; + mlx5_ib_odp_create_qp(qp); + gen = &dev->mdev->caps.gen; mutex_init(&qp->mutex); spin_lock_init(&qp->sq.lock); @@ -1098,11 +1162,13 @@ static void destroy_qp_common(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp) in = kzalloc(sizeof(*in), GFP_KERNEL); if (!in) return; - if (qp->state != IB_QPS_RESET) + if (qp->state != IB_QPS_RESET) { + mlx5_ib_qp_disable_pagefaults(qp); if (mlx5_core_qp_modify(dev->mdev, to_mlx5_state(qp->state), MLX5_QP_STATE_RST, in, sizeof(*in), &qp->mqp)) mlx5_ib_warn(dev, "mlx5_ib: modify QP %06x to RESET failed\n", qp->mqp.qpn); + } get_cqs(qp, &send_cq, &recv_cq); @@ -1650,6 +1716,15 @@ static int __mlx5_ib_modify_qp(struct ib_qp *ibqp, if (mlx5_st < 0) goto out; + /* If moving to a reset or error state, we must disable page faults on + * this QP and flush all current page faults. Otherwise a stale page + * fault may attempt to work on this QP after it is reset and moved + * again to RTS, and may cause the driver and the device to get out of + * sync. */ + if (cur_state != IB_QPS_RESET && cur_state != IB_QPS_ERR && + (new_state == IB_QPS_RESET || new_state == IB_QPS_ERR)) + mlx5_ib_qp_disable_pagefaults(qp); + optpar = ib_mask_to_mlx5_opt(attr_mask); optpar &= opt_mask[mlx5_cur][mlx5_new][mlx5_st]; in->optparam = cpu_to_be32(optpar); @@ -1659,6 +1734,9 @@ static int __mlx5_ib_modify_qp(struct ib_qp *ibqp, if (err) goto out; + if (cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT) + mlx5_ib_qp_enable_pagefaults(qp); + qp->state = new_state; if (attr_mask & IB_QP_ACCESS_FLAGS) @@ -1848,37 +1926,70 @@ static void set_frwr_umr_segment(struct mlx5_wqe_umr_ctrl_seg *umr, umr->mkey_mask = frwr_mkey_mask(); } +static __be64 get_umr_reg_mr_mask(void) +{ + u64 result; + + result = MLX5_MKEY_MASK_LEN | + MLX5_MKEY_MASK_PAGE_SIZE | + MLX5_MKEY_MASK_START_ADDR | + MLX5_MKEY_MASK_PD | + MLX5_MKEY_MASK_LR | + MLX5_MKEY_MASK_LW | + MLX5_MKEY_MASK_KEY | + MLX5_MKEY_MASK_RR | + MLX5_MKEY_MASK_RW | + MLX5_MKEY_MASK_A | + MLX5_MKEY_MASK_FREE; + + return cpu_to_be64(result); +} + +static __be64 get_umr_unreg_mr_mask(void) +{ + u64 result; + + result = MLX5_MKEY_MASK_FREE; + + return cpu_to_be64(result); +} + +static __be64 get_umr_update_mtt_mask(void) +{ + u64 result; + + result = MLX5_MKEY_MASK_FREE; + + return cpu_to_be64(result); +} + static void set_reg_umr_segment(struct mlx5_wqe_umr_ctrl_seg *umr, struct ib_send_wr *wr) { - struct umr_wr *umrwr = (struct umr_wr *)&wr->wr.fast_reg; - u64 mask; + struct mlx5_umr_wr *umrwr = (struct mlx5_umr_wr *)&wr->wr.fast_reg; memset(umr, 0, sizeof(*umr)); + if (wr->send_flags & MLX5_IB_SEND_UMR_FAIL_IF_FREE) + umr->flags = MLX5_UMR_CHECK_FREE; /* fail if free */ + else + umr->flags = MLX5_UMR_CHECK_NOT_FREE; /* fail if not free */ + if (!(wr->send_flags & MLX5_IB_SEND_UMR_UNREG)) { - umr->flags = 1 << 5; /* fail if not free */ umr->klm_octowords = get_klm_octo(umrwr->npages); - mask = MLX5_MKEY_MASK_LEN | - MLX5_MKEY_MASK_PAGE_SIZE | - MLX5_MKEY_MASK_START_ADDR | - MLX5_MKEY_MASK_PD | - MLX5_MKEY_MASK_LR | - MLX5_MKEY_MASK_LW | - MLX5_MKEY_MASK_KEY | - MLX5_MKEY_MASK_RR | - MLX5_MKEY_MASK_RW | - MLX5_MKEY_MASK_A | - MLX5_MKEY_MASK_FREE; - umr->mkey_mask = cpu_to_be64(mask); + if (wr->send_flags & MLX5_IB_SEND_UMR_UPDATE_MTT) { + umr->mkey_mask = get_umr_update_mtt_mask(); + umr->bsf_octowords = get_klm_octo(umrwr->target.offset); + umr->flags |= MLX5_UMR_TRANSLATION_OFFSET_EN; + } else { + umr->mkey_mask = get_umr_reg_mr_mask(); + } } else { - umr->flags = 2 << 5; /* fail if free */ - mask = MLX5_MKEY_MASK_FREE; - umr->mkey_mask = cpu_to_be64(mask); + umr->mkey_mask = get_umr_unreg_mr_mask(); } if (!wr->num_sge) - umr->flags |= (1 << 7); /* inline */ + umr->flags |= MLX5_UMR_INLINE; } static u8 get_umr_flags(int acc) @@ -1895,7 +2006,7 @@ static void set_mkey_segment(struct mlx5_mkey_seg *seg, struct ib_send_wr *wr, { memset(seg, 0, sizeof(*seg)); if (li) { - seg->status = 1 << 6; + seg->status = MLX5_MKEY_STATUS_FREE; return; } @@ -1912,19 +2023,23 @@ static void set_mkey_segment(struct mlx5_mkey_seg *seg, struct ib_send_wr *wr, static void set_reg_mkey_segment(struct mlx5_mkey_seg *seg, struct ib_send_wr *wr) { + struct mlx5_umr_wr *umrwr = (struct mlx5_umr_wr *)&wr->wr.fast_reg; + memset(seg, 0, sizeof(*seg)); if (wr->send_flags & MLX5_IB_SEND_UMR_UNREG) { - seg->status = 1 << 6; + seg->status = MLX5_MKEY_STATUS_FREE; return; } - seg->flags = convert_access(wr->wr.fast_reg.access_flags); - seg->flags_pd = cpu_to_be32(to_mpd((struct ib_pd *)wr->wr.fast_reg.page_list)->pdn); - seg->start_addr = cpu_to_be64(wr->wr.fast_reg.iova_start); - seg->len = cpu_to_be64(wr->wr.fast_reg.length); - seg->log2_page_size = wr->wr.fast_reg.page_shift; + seg->flags = convert_access(umrwr->access_flags); + if (!(wr->send_flags & MLX5_IB_SEND_UMR_UPDATE_MTT)) { + seg->flags_pd = cpu_to_be32(to_mpd(umrwr->pd)->pdn); + seg->start_addr = cpu_to_be64(umrwr->target.virt_addr); + } + seg->len = cpu_to_be64(umrwr->length); + seg->log2_page_size = umrwr->page_shift; seg->qpn_mkey7_0 = cpu_to_be32(0xffffff00 | - mlx5_mkey_variant(wr->wr.fast_reg.rkey)); + mlx5_mkey_variant(umrwr->mkey)); } static void set_frwr_pages(struct mlx5_wqe_data_seg *dseg, @@ -2927,6 +3042,14 @@ int mlx5_ib_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr, int qp_attr int mlx5_state; int err = 0; +#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING + /* + * Wait for any outstanding page faults, in case the user frees memory + * based upon this query's result. + */ + flush_workqueue(mlx5_ib_page_fault_wq); +#endif + mutex_lock(&qp->mutex); outb = kzalloc(sizeof(*outb), GFP_KERNEL); if (!outb) { diff --git a/drivers/infiniband/hw/nes/nes_verbs.c b/drivers/infiniband/hw/nes/nes_verbs.c index fef067c959fc..c0d0296e7a00 100644 --- a/drivers/infiniband/hw/nes/nes_verbs.c +++ b/drivers/infiniband/hw/nes/nes_verbs.c @@ -2341,9 +2341,9 @@ static struct ib_mr *nes_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, nes_debug(NES_DBG_MR, "User base = 0x%lX, Virt base = 0x%lX, length = %u," " offset = %u, page size = %u.\n", (unsigned long int)start, (unsigned long int)virt, (u32)length, - region->offset, region->page_size); + ib_umem_offset(region), region->page_size); - skip_pages = ((u32)region->offset) >> 12; + skip_pages = ((u32)ib_umem_offset(region)) >> 12; if (ib_copy_from_udata(&req, udata, sizeof(req))) { ib_umem_release(region); @@ -2408,7 +2408,7 @@ static struct ib_mr *nes_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, region_length -= skip_pages << 12; for (page_index = skip_pages; page_index < chunk_pages; page_index++) { skip_pages = 0; - if ((page_count != 0) && (page_count<<12)-(region->offset&(4096-1)) >= region->length) + if ((page_count != 0) && (page_count << 12) - (ib_umem_offset(region) & (4096 - 1)) >= region->length) goto enough_pages; if ((page_count&0x01FF) == 0) { if (page_count >= 1024 * 512) { diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_ah.c b/drivers/infiniband/hw/ocrdma/ocrdma_ah.c index ac02ce4e8040..f3cc8c9e65ae 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma_ah.c +++ b/drivers/infiniband/hw/ocrdma/ocrdma_ah.c @@ -96,7 +96,6 @@ struct ib_ah *ocrdma_create_ah(struct ib_pd *ibpd, struct ib_ah_attr *attr) struct ocrdma_pd *pd = get_ocrdma_pd(ibpd); struct ocrdma_dev *dev = get_ocrdma_dev(ibpd->device); union ib_gid sgid; - u8 zmac[ETH_ALEN]; if (!(attr->ah_flags & IB_AH_GRH)) return ERR_PTR(-EINVAL); @@ -118,9 +117,7 @@ struct ib_ah *ocrdma_create_ah(struct ib_pd *ibpd, struct ib_ah_attr *attr) goto av_conf_err; } - memset(&zmac, 0, ETH_ALEN); - if (pd->uctx && - memcmp(attr->dmac, &zmac, ETH_ALEN)) { + if (pd->uctx) { status = rdma_addr_find_dmac_by_grh(&sgid, &attr->grh.dgid, attr->dmac, &attr->vlan_id); if (status) { diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c index 4c68305ee781..fb8d8c4dfbb9 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c +++ b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c @@ -805,7 +805,7 @@ struct ib_mr *ocrdma_reg_user_mr(struct ib_pd *ibpd, u64 start, u64 len, goto umem_err; mr->hwmr.pbe_size = mr->umem->page_size; - mr->hwmr.fbo = mr->umem->offset; + mr->hwmr.fbo = ib_umem_offset(mr->umem); mr->hwmr.va = usr_addr; mr->hwmr.len = len; mr->hwmr.remote_wr = (acc & IB_ACCESS_REMOTE_WRITE) ? 1 : 0; @@ -1410,6 +1410,8 @@ int ocrdma_query_qp(struct ib_qp *ibqp, mutex_unlock(&dev->dev_lock); if (status) goto mbx_err; + if (qp->qp_type == IB_QPT_UD) + qp_attr->qkey = params.qkey; qp_attr->qp_state = get_ibqp_state(IB_QPS_INIT); qp_attr->cur_qp_state = get_ibqp_state(IB_QPS_INIT); qp_attr->path_mtu = diff --git a/drivers/infiniband/hw/qib/qib_mr.c b/drivers/infiniband/hw/qib/qib_mr.c index 9bbb55347cc1..a77fb4fb14e4 100644 --- a/drivers/infiniband/hw/qib/qib_mr.c +++ b/drivers/infiniband/hw/qib/qib_mr.c @@ -258,7 +258,7 @@ struct ib_mr *qib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, mr->mr.user_base = start; mr->mr.iova = virt_addr; mr->mr.length = length; - mr->mr.offset = umem->offset; + mr->mr.offset = ib_umem_offset(umem); mr->mr.access_flags = mr_access_flags; mr->umem = umem; diff --git a/drivers/infiniband/ulp/ipoib/ipoib.h b/drivers/infiniband/ulp/ipoib/ipoib.h index d7562beb5423..8ba80a6d3a46 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib.h +++ b/drivers/infiniband/ulp/ipoib/ipoib.h @@ -98,9 +98,15 @@ enum { IPOIB_MCAST_FLAG_FOUND = 0, /* used in set_multicast_list */ IPOIB_MCAST_FLAG_SENDONLY = 1, - IPOIB_MCAST_FLAG_BUSY = 2, /* joining or already joined */ + /* + * For IPOIB_MCAST_FLAG_BUSY + * When set, in flight join and mcast->mc is unreliable + * When clear and mcast->mc IS_ERR_OR_NULL, need to restart or + * haven't started yet + * When clear and mcast->mc is valid pointer, join was successful + */ + IPOIB_MCAST_FLAG_BUSY = 2, IPOIB_MCAST_FLAG_ATTACHED = 3, - IPOIB_MCAST_JOIN_STARTED = 4, MAX_SEND_CQE = 16, IPOIB_CM_COPYBREAK = 256, @@ -317,6 +323,7 @@ struct ipoib_dev_priv { struct list_head multicast_list; struct rb_root multicast_tree; + struct workqueue_struct *wq; struct delayed_work mcast_task; struct work_struct carrier_on_task; struct work_struct flush_light; @@ -477,10 +484,10 @@ void ipoib_ib_dev_flush_heavy(struct work_struct *work); void ipoib_pkey_event(struct work_struct *work); void ipoib_ib_dev_cleanup(struct net_device *dev); -int ipoib_ib_dev_open(struct net_device *dev, int flush); +int ipoib_ib_dev_open(struct net_device *dev); int ipoib_ib_dev_up(struct net_device *dev); -int ipoib_ib_dev_down(struct net_device *dev, int flush); -int ipoib_ib_dev_stop(struct net_device *dev, int flush); +int ipoib_ib_dev_down(struct net_device *dev); +int ipoib_ib_dev_stop(struct net_device *dev); void ipoib_pkey_dev_check_presence(struct net_device *dev); int ipoib_dev_init(struct net_device *dev, struct ib_device *ca, int port); @@ -492,7 +499,7 @@ void ipoib_mcast_send(struct net_device *dev, u8 *daddr, struct sk_buff *skb); void ipoib_mcast_restart_task(struct work_struct *work); int ipoib_mcast_start_thread(struct net_device *dev); -int ipoib_mcast_stop_thread(struct net_device *dev, int flush); +int ipoib_mcast_stop_thread(struct net_device *dev); void ipoib_mcast_dev_down(struct net_device *dev); void ipoib_mcast_dev_flush(struct net_device *dev); diff --git a/drivers/infiniband/ulp/ipoib/ipoib_cm.c b/drivers/infiniband/ulp/ipoib/ipoib_cm.c index 933efcea0d03..56959adb6c7d 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_cm.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_cm.c @@ -474,7 +474,7 @@ static int ipoib_cm_req_handler(struct ib_cm_id *cm_id, struct ib_cm_event *even } spin_lock_irq(&priv->lock); - queue_delayed_work(ipoib_workqueue, + queue_delayed_work(priv->wq, &priv->cm.stale_task, IPOIB_CM_RX_DELAY); /* Add this entry to passive ids list head, but do not re-add it * if IB_EVENT_QP_LAST_WQE_REACHED has moved it to flush list. */ @@ -576,7 +576,7 @@ void ipoib_cm_handle_rx_wc(struct net_device *dev, struct ib_wc *wc) spin_lock_irqsave(&priv->lock, flags); list_splice_init(&priv->cm.rx_drain_list, &priv->cm.rx_reap_list); ipoib_cm_start_rx_drain(priv); - queue_work(ipoib_workqueue, &priv->cm.rx_reap_task); + queue_work(priv->wq, &priv->cm.rx_reap_task); spin_unlock_irqrestore(&priv->lock, flags); } else ipoib_warn(priv, "cm recv completion event with wrid %d (> %d)\n", @@ -603,7 +603,7 @@ void ipoib_cm_handle_rx_wc(struct net_device *dev, struct ib_wc *wc) spin_lock_irqsave(&priv->lock, flags); list_move(&p->list, &priv->cm.rx_reap_list); spin_unlock_irqrestore(&priv->lock, flags); - queue_work(ipoib_workqueue, &priv->cm.rx_reap_task); + queue_work(priv->wq, &priv->cm.rx_reap_task); } return; } @@ -827,7 +827,7 @@ void ipoib_cm_handle_tx_wc(struct net_device *dev, struct ib_wc *wc) if (test_and_clear_bit(IPOIB_FLAG_INITIALIZED, &tx->flags)) { list_move(&tx->list, &priv->cm.reap_list); - queue_work(ipoib_workqueue, &priv->cm.reap_task); + queue_work(priv->wq, &priv->cm.reap_task); } clear_bit(IPOIB_FLAG_OPER_UP, &tx->flags); @@ -1255,7 +1255,7 @@ static int ipoib_cm_tx_handler(struct ib_cm_id *cm_id, if (test_and_clear_bit(IPOIB_FLAG_INITIALIZED, &tx->flags)) { list_move(&tx->list, &priv->cm.reap_list); - queue_work(ipoib_workqueue, &priv->cm.reap_task); + queue_work(priv->wq, &priv->cm.reap_task); } spin_unlock_irqrestore(&priv->lock, flags); @@ -1284,7 +1284,7 @@ struct ipoib_cm_tx *ipoib_cm_create_tx(struct net_device *dev, struct ipoib_path tx->dev = dev; list_add(&tx->list, &priv->cm.start_list); set_bit(IPOIB_FLAG_INITIALIZED, &tx->flags); - queue_work(ipoib_workqueue, &priv->cm.start_task); + queue_work(priv->wq, &priv->cm.start_task); return tx; } @@ -1295,7 +1295,7 @@ void ipoib_cm_destroy_tx(struct ipoib_cm_tx *tx) if (test_and_clear_bit(IPOIB_FLAG_INITIALIZED, &tx->flags)) { spin_lock_irqsave(&priv->lock, flags); list_move(&tx->list, &priv->cm.reap_list); - queue_work(ipoib_workqueue, &priv->cm.reap_task); + queue_work(priv->wq, &priv->cm.reap_task); ipoib_dbg(priv, "Reap connection for gid %pI6\n", tx->neigh->daddr + 4); tx->neigh = NULL; @@ -1417,7 +1417,7 @@ void ipoib_cm_skb_too_long(struct net_device *dev, struct sk_buff *skb, skb_queue_tail(&priv->cm.skb_queue, skb); if (e) - queue_work(ipoib_workqueue, &priv->cm.skb_task); + queue_work(priv->wq, &priv->cm.skb_task); } static void ipoib_cm_rx_reap(struct work_struct *work) @@ -1450,7 +1450,7 @@ static void ipoib_cm_stale_task(struct work_struct *work) } if (!list_empty(&priv->cm.passive_ids)) - queue_delayed_work(ipoib_workqueue, + queue_delayed_work(priv->wq, &priv->cm.stale_task, IPOIB_CM_RX_DELAY); spin_unlock_irq(&priv->lock); } diff --git a/drivers/infiniband/ulp/ipoib/ipoib_ib.c b/drivers/infiniband/ulp/ipoib/ipoib_ib.c index 72626c348174..fe65abb5150c 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_ib.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_ib.c @@ -655,7 +655,7 @@ void ipoib_reap_ah(struct work_struct *work) __ipoib_reap_ah(dev); if (!test_bit(IPOIB_STOP_REAPER, &priv->flags)) - queue_delayed_work(ipoib_workqueue, &priv->ah_reap_task, + queue_delayed_work(priv->wq, &priv->ah_reap_task, round_jiffies_relative(HZ)); } @@ -664,7 +664,7 @@ static void ipoib_ib_tx_timer_func(unsigned long ctx) drain_tx_cq((struct net_device *)ctx); } -int ipoib_ib_dev_open(struct net_device *dev, int flush) +int ipoib_ib_dev_open(struct net_device *dev) { struct ipoib_dev_priv *priv = netdev_priv(dev); int ret; @@ -696,7 +696,7 @@ int ipoib_ib_dev_open(struct net_device *dev, int flush) } clear_bit(IPOIB_STOP_REAPER, &priv->flags); - queue_delayed_work(ipoib_workqueue, &priv->ah_reap_task, + queue_delayed_work(priv->wq, &priv->ah_reap_task, round_jiffies_relative(HZ)); if (!test_and_set_bit(IPOIB_FLAG_INITIALIZED, &priv->flags)) @@ -706,7 +706,7 @@ int ipoib_ib_dev_open(struct net_device *dev, int flush) dev_stop: if (!test_and_set_bit(IPOIB_FLAG_INITIALIZED, &priv->flags)) napi_enable(&priv->napi); - ipoib_ib_dev_stop(dev, flush); + ipoib_ib_dev_stop(dev); return -1; } @@ -738,7 +738,7 @@ int ipoib_ib_dev_up(struct net_device *dev) return ipoib_mcast_start_thread(dev); } -int ipoib_ib_dev_down(struct net_device *dev, int flush) +int ipoib_ib_dev_down(struct net_device *dev) { struct ipoib_dev_priv *priv = netdev_priv(dev); @@ -747,7 +747,7 @@ int ipoib_ib_dev_down(struct net_device *dev, int flush) clear_bit(IPOIB_FLAG_OPER_UP, &priv->flags); netif_carrier_off(dev); - ipoib_mcast_stop_thread(dev, flush); + ipoib_mcast_stop_thread(dev); ipoib_mcast_dev_flush(dev); ipoib_flush_paths(dev); @@ -807,7 +807,7 @@ void ipoib_drain_cq(struct net_device *dev) local_bh_enable(); } -int ipoib_ib_dev_stop(struct net_device *dev, int flush) +int ipoib_ib_dev_stop(struct net_device *dev) { struct ipoib_dev_priv *priv = netdev_priv(dev); struct ib_qp_attr qp_attr; @@ -880,8 +880,7 @@ timeout: /* Wait for all AHs to be reaped */ set_bit(IPOIB_STOP_REAPER, &priv->flags); cancel_delayed_work(&priv->ah_reap_task); - if (flush) - flush_workqueue(ipoib_workqueue); + flush_workqueue(priv->wq); begin = jiffies; @@ -918,7 +917,7 @@ int ipoib_ib_dev_init(struct net_device *dev, struct ib_device *ca, int port) (unsigned long) dev); if (dev->flags & IFF_UP) { - if (ipoib_ib_dev_open(dev, 1)) { + if (ipoib_ib_dev_open(dev)) { ipoib_transport_dev_cleanup(dev); return -ENODEV; } @@ -1040,12 +1039,12 @@ static void __ipoib_ib_dev_flush(struct ipoib_dev_priv *priv, } if (level >= IPOIB_FLUSH_NORMAL) - ipoib_ib_dev_down(dev, 0); + ipoib_ib_dev_down(dev); if (level == IPOIB_FLUSH_HEAVY) { if (test_bit(IPOIB_FLAG_INITIALIZED, &priv->flags)) - ipoib_ib_dev_stop(dev, 0); - if (ipoib_ib_dev_open(dev, 0) != 0) + ipoib_ib_dev_stop(dev); + if (ipoib_ib_dev_open(dev) != 0) return; if (netif_queue_stopped(dev)) netif_start_queue(dev); @@ -1097,7 +1096,7 @@ void ipoib_ib_dev_cleanup(struct net_device *dev) */ ipoib_flush_paths(dev); - ipoib_mcast_stop_thread(dev, 1); + ipoib_mcast_stop_thread(dev); ipoib_mcast_dev_flush(dev); ipoib_transport_dev_cleanup(dev); diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c index 58b5aa3b6f2d..6bad17d4d588 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_main.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c @@ -108,7 +108,7 @@ int ipoib_open(struct net_device *dev) set_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags); - if (ipoib_ib_dev_open(dev, 1)) { + if (ipoib_ib_dev_open(dev)) { if (!test_bit(IPOIB_PKEY_ASSIGNED, &priv->flags)) return 0; goto err_disable; @@ -139,7 +139,7 @@ int ipoib_open(struct net_device *dev) return 0; err_stop: - ipoib_ib_dev_stop(dev, 1); + ipoib_ib_dev_stop(dev); err_disable: clear_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags); @@ -157,8 +157,8 @@ static int ipoib_stop(struct net_device *dev) netif_stop_queue(dev); - ipoib_ib_dev_down(dev, 1); - ipoib_ib_dev_stop(dev, 0); + ipoib_ib_dev_down(dev); + ipoib_ib_dev_stop(dev); if (!test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags)) { struct ipoib_dev_priv *cpriv; @@ -839,7 +839,7 @@ static void ipoib_set_mcast_list(struct net_device *dev) return; } - queue_work(ipoib_workqueue, &priv->restart_task); + queue_work(priv->wq, &priv->restart_task); } static u32 ipoib_addr_hash(struct ipoib_neigh_hash *htbl, u8 *daddr) @@ -954,7 +954,7 @@ static void ipoib_reap_neigh(struct work_struct *work) __ipoib_reap_neigh(priv); if (!test_bit(IPOIB_STOP_NEIGH_GC, &priv->flags)) - queue_delayed_work(ipoib_workqueue, &priv->neigh_reap_task, + queue_delayed_work(priv->wq, &priv->neigh_reap_task, arp_tbl.gc_interval); } @@ -1133,7 +1133,7 @@ static int ipoib_neigh_hash_init(struct ipoib_dev_priv *priv) /* start garbage collection */ clear_bit(IPOIB_STOP_NEIGH_GC, &priv->flags); - queue_delayed_work(ipoib_workqueue, &priv->neigh_reap_task, + queue_delayed_work(priv->wq, &priv->neigh_reap_task, arp_tbl.gc_interval); return 0; @@ -1262,15 +1262,13 @@ int ipoib_dev_init(struct net_device *dev, struct ib_device *ca, int port) { struct ipoib_dev_priv *priv = netdev_priv(dev); - if (ipoib_neigh_hash_init(priv) < 0) - goto out; /* Allocate RX/TX "rings" to hold queued skbs */ priv->rx_ring = kzalloc(ipoib_recvq_size * sizeof *priv->rx_ring, GFP_KERNEL); if (!priv->rx_ring) { printk(KERN_WARNING "%s: failed to allocate RX ring (%d entries)\n", ca->name, ipoib_recvq_size); - goto out_neigh_hash_cleanup; + goto out; } priv->tx_ring = vzalloc(ipoib_sendq_size * sizeof *priv->tx_ring); @@ -1285,16 +1283,24 @@ int ipoib_dev_init(struct net_device *dev, struct ib_device *ca, int port) if (ipoib_ib_dev_init(dev, ca, port)) goto out_tx_ring_cleanup; + /* + * Must be after ipoib_ib_dev_init so we can allocate a per + * device wq there and use it here + */ + if (ipoib_neigh_hash_init(priv) < 0) + goto out_dev_uninit; + return 0; +out_dev_uninit: + ipoib_ib_dev_cleanup(dev); + out_tx_ring_cleanup: vfree(priv->tx_ring); out_rx_ring_cleanup: kfree(priv->rx_ring); -out_neigh_hash_cleanup: - ipoib_neigh_hash_uninit(dev); out: return -ENOMEM; } @@ -1317,6 +1323,12 @@ void ipoib_dev_cleanup(struct net_device *dev) } unregister_netdevice_many(&head); + /* + * Must be before ipoib_ib_dev_cleanup or we delete an in use + * work queue + */ + ipoib_neigh_hash_uninit(dev); + ipoib_ib_dev_cleanup(dev); kfree(priv->rx_ring); @@ -1324,8 +1336,6 @@ void ipoib_dev_cleanup(struct net_device *dev) priv->rx_ring = NULL; priv->tx_ring = NULL; - - ipoib_neigh_hash_uninit(dev); } static const struct header_ops ipoib_header_ops = { @@ -1636,7 +1646,7 @@ register_failed: /* Stop GC if started before flush */ set_bit(IPOIB_STOP_NEIGH_GC, &priv->flags); cancel_delayed_work(&priv->neigh_reap_task); - flush_workqueue(ipoib_workqueue); + flush_workqueue(priv->wq); event_failed: ipoib_dev_cleanup(priv->dev); @@ -1707,7 +1717,7 @@ static void ipoib_remove_one(struct ib_device *device) /* Stop GC */ set_bit(IPOIB_STOP_NEIGH_GC, &priv->flags); cancel_delayed_work(&priv->neigh_reap_task); - flush_workqueue(ipoib_workqueue); + flush_workqueue(priv->wq); unregister_netdev(priv->dev); free_netdev(priv->dev); @@ -1748,8 +1758,13 @@ static int __init ipoib_init_module(void) * unregister_netdev() and linkwatch_event take the rtnl lock, * so flush_scheduled_work() can deadlock during device * removal. + * + * In addition, bringing one device up and another down at the + * same time can deadlock a single workqueue, so we have this + * global fallback workqueue, but we also attempt to open a + * per device workqueue each time we bring an interface up */ - ipoib_workqueue = create_singlethread_workqueue("ipoib"); + ipoib_workqueue = create_singlethread_workqueue("ipoib_flush"); if (!ipoib_workqueue) { ret = -ENOMEM; goto err_fs; diff --git a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c index ffb83b5f7e80..bc50dd0d0e4d 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c @@ -190,12 +190,6 @@ static int ipoib_mcast_join_finish(struct ipoib_mcast *mcast, spin_unlock_irq(&priv->lock); priv->tx_wr.wr.ud.remote_qkey = priv->qkey; set_qkey = 1; - - if (!ipoib_cm_admin_enabled(dev)) { - rtnl_lock(); - dev_set_mtu(dev, min(priv->mcast_mtu, priv->admin_mtu)); - rtnl_unlock(); - } } if (!test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags)) { @@ -277,16 +271,27 @@ ipoib_mcast_sendonly_join_complete(int status, struct ipoib_mcast *mcast = multicast->context; struct net_device *dev = mcast->dev; + /* + * We have to take the mutex to force mcast_sendonly_join to + * return from ib_sa_multicast_join and set mcast->mc to a + * valid value. Otherwise we were racing with ourselves in + * that we might fail here, but get a valid return from + * ib_sa_multicast_join after we had cleared mcast->mc here, + * resulting in mis-matched joins and leaves and a deadlock + */ + mutex_lock(&mcast_mutex); + /* We trap for port events ourselves. */ if (status == -ENETRESET) - return 0; + goto out; if (!status) status = ipoib_mcast_join_finish(mcast, &multicast->rec); if (status) { if (mcast->logcount++ < 20) - ipoib_dbg_mcast(netdev_priv(dev), "multicast join failed for %pI6, status %d\n", + ipoib_dbg_mcast(netdev_priv(dev), "sendonly multicast " + "join failed for %pI6, status %d\n", mcast->mcmember.mgid.raw, status); /* Flush out any queued packets */ @@ -296,11 +301,15 @@ ipoib_mcast_sendonly_join_complete(int status, dev_kfree_skb_any(skb_dequeue(&mcast->pkt_queue)); } netif_tx_unlock_bh(dev); - - /* Clear the busy flag so we try again */ - status = test_and_clear_bit(IPOIB_MCAST_FLAG_BUSY, - &mcast->flags); } +out: + clear_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags); + if (status) + mcast->mc = NULL; + complete(&mcast->done); + if (status == -ENETRESET) + status = 0; + mutex_unlock(&mcast_mutex); return status; } @@ -318,12 +327,14 @@ static int ipoib_mcast_sendonly_join(struct ipoib_mcast *mcast) int ret = 0; if (!test_bit(IPOIB_FLAG_OPER_UP, &priv->flags)) { - ipoib_dbg_mcast(priv, "device shutting down, no multicast joins\n"); + ipoib_dbg_mcast(priv, "device shutting down, no sendonly " + "multicast joins\n"); return -ENODEV; } - if (test_and_set_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags)) { - ipoib_dbg_mcast(priv, "multicast entry busy, skipping\n"); + if (test_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags)) { + ipoib_dbg_mcast(priv, "multicast entry busy, skipping " + "sendonly join\n"); return -EBUSY; } @@ -331,6 +342,9 @@ static int ipoib_mcast_sendonly_join(struct ipoib_mcast *mcast) rec.port_gid = priv->local_gid; rec.pkey = cpu_to_be16(priv->pkey); + mutex_lock(&mcast_mutex); + init_completion(&mcast->done); + set_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags); mcast->mc = ib_sa_join_multicast(&ipoib_sa_client, priv->ca, priv->port, &rec, IB_SA_MCMEMBER_REC_MGID | @@ -343,12 +357,14 @@ static int ipoib_mcast_sendonly_join(struct ipoib_mcast *mcast) if (IS_ERR(mcast->mc)) { ret = PTR_ERR(mcast->mc); clear_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags); - ipoib_warn(priv, "ib_sa_join_multicast failed (ret = %d)\n", - ret); + complete(&mcast->done); + ipoib_warn(priv, "ib_sa_join_multicast for sendonly join " + "failed (ret = %d)\n", ret); } else { - ipoib_dbg_mcast(priv, "no multicast record for %pI6, starting join\n", - mcast->mcmember.mgid.raw); + ipoib_dbg_mcast(priv, "no multicast record for %pI6, starting " + "sendonly join\n", mcast->mcmember.mgid.raw); } + mutex_unlock(&mcast_mutex); return ret; } @@ -359,18 +375,29 @@ void ipoib_mcast_carrier_on_task(struct work_struct *work) carrier_on_task); struct ib_port_attr attr; - /* - * Take rtnl_lock to avoid racing with ipoib_stop() and - * turning the carrier back on while a device is being - * removed. - */ if (ib_query_port(priv->ca, priv->port, &attr) || attr.state != IB_PORT_ACTIVE) { ipoib_dbg(priv, "Keeping carrier off until IB port is active\n"); return; } - rtnl_lock(); + /* + * Take rtnl_lock to avoid racing with ipoib_stop() and + * turning the carrier back on while a device is being + * removed. However, ipoib_stop() will attempt to flush + * the workqueue while holding the rtnl lock, so loop + * on trylock until either we get the lock or we see + * FLAG_ADMIN_UP go away as that signals that we are bailing + * and can safely ignore the carrier on work. + */ + while (!rtnl_trylock()) { + if (!test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags)) + return; + else + msleep(20); + } + if (!ipoib_cm_admin_enabled(priv->dev)) + dev_set_mtu(priv->dev, min(priv->mcast_mtu, priv->admin_mtu)); netif_carrier_on(priv->dev); rtnl_unlock(); } @@ -385,60 +412,63 @@ static int ipoib_mcast_join_complete(int status, ipoib_dbg_mcast(priv, "join completion for %pI6 (status %d)\n", mcast->mcmember.mgid.raw, status); + /* + * We have to take the mutex to force mcast_join to + * return from ib_sa_multicast_join and set mcast->mc to a + * valid value. Otherwise we were racing with ourselves in + * that we might fail here, but get a valid return from + * ib_sa_multicast_join after we had cleared mcast->mc here, + * resulting in mis-matched joins and leaves and a deadlock + */ + mutex_lock(&mcast_mutex); + /* We trap for port events ourselves. */ - if (status == -ENETRESET) { - status = 0; + if (status == -ENETRESET) goto out; - } if (!status) status = ipoib_mcast_join_finish(mcast, &multicast->rec); if (!status) { mcast->backoff = 1; - mutex_lock(&mcast_mutex); if (test_bit(IPOIB_MCAST_RUN, &priv->flags)) - queue_delayed_work(ipoib_workqueue, - &priv->mcast_task, 0); - mutex_unlock(&mcast_mutex); + queue_delayed_work(priv->wq, &priv->mcast_task, 0); /* - * Defer carrier on work to ipoib_workqueue to avoid a + * Defer carrier on work to priv->wq to avoid a * deadlock on rtnl_lock here. */ if (mcast == priv->broadcast) - queue_work(ipoib_workqueue, &priv->carrier_on_task); - - status = 0; - goto out; - } - - if (mcast->logcount++ < 20) { - if (status == -ETIMEDOUT || status == -EAGAIN) { - ipoib_dbg_mcast(priv, "multicast join failed for %pI6, status %d\n", - mcast->mcmember.mgid.raw, status); - } else { - ipoib_warn(priv, "multicast join failed for %pI6, status %d\n", - mcast->mcmember.mgid.raw, status); + queue_work(priv->wq, &priv->carrier_on_task); + } else { + if (mcast->logcount++ < 20) { + if (status == -ETIMEDOUT || status == -EAGAIN) { + ipoib_dbg_mcast(priv, "multicast join failed for %pI6, status %d\n", + mcast->mcmember.mgid.raw, status); + } else { + ipoib_warn(priv, "multicast join failed for %pI6, status %d\n", + mcast->mcmember.mgid.raw, status); + } } - } - - mcast->backoff *= 2; - if (mcast->backoff > IPOIB_MAX_BACKOFF_SECONDS) - mcast->backoff = IPOIB_MAX_BACKOFF_SECONDS; - /* Clear the busy flag so we try again */ - status = test_and_clear_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags); - - mutex_lock(&mcast_mutex); + mcast->backoff *= 2; + if (mcast->backoff > IPOIB_MAX_BACKOFF_SECONDS) + mcast->backoff = IPOIB_MAX_BACKOFF_SECONDS; + } +out: spin_lock_irq(&priv->lock); - if (test_bit(IPOIB_MCAST_RUN, &priv->flags)) - queue_delayed_work(ipoib_workqueue, &priv->mcast_task, + clear_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags); + if (status) + mcast->mc = NULL; + complete(&mcast->done); + if (status == -ENETRESET) + status = 0; + if (status && test_bit(IPOIB_MCAST_RUN, &priv->flags)) + queue_delayed_work(priv->wq, &priv->mcast_task, mcast->backoff * HZ); spin_unlock_irq(&priv->lock); mutex_unlock(&mcast_mutex); -out: - complete(&mcast->done); + return status; } @@ -487,10 +517,9 @@ static void ipoib_mcast_join(struct net_device *dev, struct ipoib_mcast *mcast, rec.hop_limit = priv->broadcast->mcmember.hop_limit; } - set_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags); + mutex_lock(&mcast_mutex); init_completion(&mcast->done); - set_bit(IPOIB_MCAST_JOIN_STARTED, &mcast->flags); - + set_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags); mcast->mc = ib_sa_join_multicast(&ipoib_sa_client, priv->ca, priv->port, &rec, comp_mask, GFP_KERNEL, ipoib_mcast_join_complete, mcast); @@ -504,13 +533,11 @@ static void ipoib_mcast_join(struct net_device *dev, struct ipoib_mcast *mcast, if (mcast->backoff > IPOIB_MAX_BACKOFF_SECONDS) mcast->backoff = IPOIB_MAX_BACKOFF_SECONDS; - mutex_lock(&mcast_mutex); if (test_bit(IPOIB_MCAST_RUN, &priv->flags)) - queue_delayed_work(ipoib_workqueue, - &priv->mcast_task, + queue_delayed_work(priv->wq, &priv->mcast_task, mcast->backoff * HZ); - mutex_unlock(&mcast_mutex); } + mutex_unlock(&mcast_mutex); } void ipoib_mcast_join_task(struct work_struct *work) @@ -547,8 +574,8 @@ void ipoib_mcast_join_task(struct work_struct *work) ipoib_warn(priv, "failed to allocate broadcast group\n"); mutex_lock(&mcast_mutex); if (test_bit(IPOIB_MCAST_RUN, &priv->flags)) - queue_delayed_work(ipoib_workqueue, - &priv->mcast_task, HZ); + queue_delayed_work(priv->wq, &priv->mcast_task, + HZ); mutex_unlock(&mcast_mutex); return; } @@ -563,7 +590,8 @@ void ipoib_mcast_join_task(struct work_struct *work) } if (!test_bit(IPOIB_MCAST_FLAG_ATTACHED, &priv->broadcast->flags)) { - if (!test_bit(IPOIB_MCAST_FLAG_BUSY, &priv->broadcast->flags)) + if (IS_ERR_OR_NULL(priv->broadcast->mc) && + !test_bit(IPOIB_MCAST_FLAG_BUSY, &priv->broadcast->flags)) ipoib_mcast_join(dev, priv->broadcast, 0); return; } @@ -571,23 +599,33 @@ void ipoib_mcast_join_task(struct work_struct *work) while (1) { struct ipoib_mcast *mcast = NULL; + /* + * Need the mutex so our flags are consistent, need the + * priv->lock so we don't race with list removals in either + * mcast_dev_flush or mcast_restart_task + */ + mutex_lock(&mcast_mutex); spin_lock_irq(&priv->lock); list_for_each_entry(mcast, &priv->multicast_list, list) { - if (!test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags) - && !test_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags) - && !test_bit(IPOIB_MCAST_FLAG_ATTACHED, &mcast->flags)) { + if (IS_ERR_OR_NULL(mcast->mc) && + !test_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags) && + !test_bit(IPOIB_MCAST_FLAG_ATTACHED, &mcast->flags)) { /* Found the next unjoined group */ break; } } spin_unlock_irq(&priv->lock); + mutex_unlock(&mcast_mutex); if (&mcast->list == &priv->multicast_list) { /* All done */ break; } - ipoib_mcast_join(dev, mcast, 1); + if (test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags)) + ipoib_mcast_sendonly_join(mcast); + else + ipoib_mcast_join(dev, mcast, 1); return; } @@ -604,13 +642,13 @@ int ipoib_mcast_start_thread(struct net_device *dev) mutex_lock(&mcast_mutex); if (!test_and_set_bit(IPOIB_MCAST_RUN, &priv->flags)) - queue_delayed_work(ipoib_workqueue, &priv->mcast_task, 0); + queue_delayed_work(priv->wq, &priv->mcast_task, 0); mutex_unlock(&mcast_mutex); return 0; } -int ipoib_mcast_stop_thread(struct net_device *dev, int flush) +int ipoib_mcast_stop_thread(struct net_device *dev) { struct ipoib_dev_priv *priv = netdev_priv(dev); @@ -621,8 +659,7 @@ int ipoib_mcast_stop_thread(struct net_device *dev, int flush) cancel_delayed_work(&priv->mcast_task); mutex_unlock(&mcast_mutex); - if (flush) - flush_workqueue(ipoib_workqueue); + flush_workqueue(priv->wq); return 0; } @@ -633,6 +670,9 @@ static int ipoib_mcast_leave(struct net_device *dev, struct ipoib_mcast *mcast) int ret = 0; if (test_and_clear_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags)) + ipoib_warn(priv, "ipoib_mcast_leave on an in-flight join\n"); + + if (!IS_ERR_OR_NULL(mcast->mc)) ib_sa_free_multicast(mcast->mc); if (test_and_clear_bit(IPOIB_MCAST_FLAG_ATTACHED, &mcast->flags)) { @@ -685,6 +725,8 @@ void ipoib_mcast_send(struct net_device *dev, u8 *daddr, struct sk_buff *skb) memcpy(mcast->mcmember.mgid.raw, mgid, sizeof (union ib_gid)); __ipoib_mcast_add(dev, mcast); list_add_tail(&mcast->list, &priv->multicast_list); + if (!test_and_set_bit(IPOIB_MCAST_RUN, &priv->flags)) + queue_delayed_work(priv->wq, &priv->mcast_task, 0); } if (!mcast->ah) { @@ -698,8 +740,6 @@ void ipoib_mcast_send(struct net_device *dev, u8 *daddr, struct sk_buff *skb) if (test_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags)) ipoib_dbg_mcast(priv, "no address vector, " "but multicast join already started\n"); - else if (test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags)) - ipoib_mcast_sendonly_join(mcast); /* * If lookup completes between here and out:, don't @@ -759,9 +799,12 @@ void ipoib_mcast_dev_flush(struct net_device *dev) spin_unlock_irqrestore(&priv->lock, flags); - /* seperate between the wait to the leave*/ + /* + * make sure the in-flight joins have finished before we attempt + * to leave + */ list_for_each_entry_safe(mcast, tmcast, &remove_list, list) - if (test_bit(IPOIB_MCAST_JOIN_STARTED, &mcast->flags)) + if (test_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags)) wait_for_completion(&mcast->done); list_for_each_entry_safe(mcast, tmcast, &remove_list, list) { @@ -794,8 +837,6 @@ void ipoib_mcast_restart_task(struct work_struct *work) ipoib_dbg_mcast(priv, "restarting multicast task\n"); - ipoib_mcast_stop_thread(dev, 0); - local_irq_save(flags); netif_addr_lock(dev); spin_lock(&priv->lock); @@ -880,14 +921,38 @@ void ipoib_mcast_restart_task(struct work_struct *work) netif_addr_unlock(dev); local_irq_restore(flags); - /* We have to cancel outside of the spinlock */ + /* + * make sure the in-flight joins have finished before we attempt + * to leave + */ + list_for_each_entry_safe(mcast, tmcast, &remove_list, list) + if (test_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags)) + wait_for_completion(&mcast->done); + + /* + * We have to cancel outside of the spinlock, but we have to + * take the rtnl lock or else we race with the removal of + * entries from the remove list in mcast_dev_flush as part + * of ipoib_stop(). We detect the drop of the ADMIN_UP flag + * to signal that we have hit this particular race, and we + * return since we know we don't need to do anything else + * anyway. + */ + while (!rtnl_trylock()) { + if (!test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags)) + return; + else + msleep(20); + } list_for_each_entry_safe(mcast, tmcast, &remove_list, list) { ipoib_mcast_leave(mcast->dev, mcast); ipoib_mcast_free(mcast); } - - if (test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags)) - ipoib_mcast_start_thread(dev); + /* + * Restart our join task if needed + */ + ipoib_mcast_start_thread(dev); + rtnl_unlock(); } #ifdef CONFIG_INFINIBAND_IPOIB_DEBUG diff --git a/drivers/infiniband/ulp/ipoib/ipoib_verbs.c b/drivers/infiniband/ulp/ipoib/ipoib_verbs.c index c56d5d44c53b..b72a753eb41d 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_verbs.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_verbs.c @@ -145,10 +145,20 @@ int ipoib_transport_dev_init(struct net_device *dev, struct ib_device *ca) int ret, size; int i; + /* + * the various IPoIB tasks assume they will never race against + * themselves, so always use a single thread workqueue + */ + priv->wq = create_singlethread_workqueue("ipoib_wq"); + if (!priv->wq) { + printk(KERN_WARNING "ipoib: failed to allocate device WQ\n"); + return -ENODEV; + } + priv->pd = ib_alloc_pd(priv->ca); if (IS_ERR(priv->pd)) { printk(KERN_WARNING "%s: failed to allocate PD\n", ca->name); - return -ENODEV; + goto out_free_wq; } priv->mr = ib_get_dma_mr(priv->pd, IB_ACCESS_LOCAL_WRITE); @@ -242,6 +252,10 @@ out_free_mr: out_free_pd: ib_dealloc_pd(priv->pd); + +out_free_wq: + destroy_workqueue(priv->wq); + priv->wq = NULL; return -ENODEV; } @@ -270,6 +284,12 @@ void ipoib_transport_dev_cleanup(struct net_device *dev) if (ib_dealloc_pd(priv->pd)) ipoib_warn(priv, "ib_dealloc_pd failed\n"); + + if (priv->wq) { + flush_workqueue(priv->wq); + destroy_workqueue(priv->wq); + priv->wq = NULL; + } } void ipoib_event(struct ib_event_handler *handler, diff --git a/drivers/infiniband/ulp/srp/ib_srp.c b/drivers/infiniband/ulp/srp/ib_srp.c index 5461924c9f10..db3c8c851af1 100644 --- a/drivers/infiniband/ulp/srp/ib_srp.c +++ b/drivers/infiniband/ulp/srp/ib_srp.c @@ -2929,7 +2929,7 @@ static int srp_parse_options(const char *buf, struct srp_target_port *target) return -ENOMEM; sep_opt = options; - while ((p = strsep(&sep_opt, ",")) != NULL) { + while ((p = strsep(&sep_opt, ",\n")) != NULL) { if (!*p) continue; diff --git a/drivers/net/ethernet/mellanox/mlx4/main.c b/drivers/net/ethernet/mellanox/mlx4/main.c index e25436b24ce7..629f9f1435a5 100644 --- a/drivers/net/ethernet/mellanox/mlx4/main.c +++ b/drivers/net/ethernet/mellanox/mlx4/main.c @@ -171,9 +171,9 @@ int mlx4_check_port_params(struct mlx4_dev *dev, { int i; - for (i = 0; i < dev->caps.num_ports - 1; i++) { - if (port_type[i] != port_type[i + 1]) { - if (!(dev->caps.flags & MLX4_DEV_CAP_FLAG_DPDP)) { + if (!(dev->caps.flags & MLX4_DEV_CAP_FLAG_DPDP)) { + for (i = 0; i < dev->caps.num_ports - 1; i++) { + if (port_type[i] != port_type[i + 1]) { mlx4_err(dev, "Only same port types supported on this HCA, aborting\n"); return -EINVAL; } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eq.c b/drivers/net/ethernet/mellanox/mlx5/core/eq.c index ab684463780b..da82991239a8 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eq.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/eq.c @@ -157,6 +157,8 @@ static const char *eqe_type_str(u8 type) return "MLX5_EVENT_TYPE_CMD"; case MLX5_EVENT_TYPE_PAGE_REQUEST: return "MLX5_EVENT_TYPE_PAGE_REQUEST"; + case MLX5_EVENT_TYPE_PAGE_FAULT: + return "MLX5_EVENT_TYPE_PAGE_FAULT"; default: return "Unrecognized event"; } @@ -279,6 +281,11 @@ static int mlx5_eq_int(struct mlx5_core_dev *dev, struct mlx5_eq *eq) } break; +#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING + case MLX5_EVENT_TYPE_PAGE_FAULT: + mlx5_eq_pagefault(dev, eqe); + break; +#endif default: mlx5_core_warn(dev, "Unhandled event 0x%x on EQ 0x%x\n", @@ -446,8 +453,12 @@ void mlx5_eq_cleanup(struct mlx5_core_dev *dev) int mlx5_start_eqs(struct mlx5_core_dev *dev) { struct mlx5_eq_table *table = &dev->priv.eq_table; + u32 async_event_mask = MLX5_ASYNC_EVENT_MASK; int err; + if (dev->caps.gen.flags & MLX5_DEV_CAP_FLAG_ON_DMND_PG) + async_event_mask |= (1ull << MLX5_EVENT_TYPE_PAGE_FAULT); + err = mlx5_create_map_eq(dev, &table->cmd_eq, MLX5_EQ_VEC_CMD, MLX5_NUM_CMD_EQE, 1ull << MLX5_EVENT_TYPE_CMD, "mlx5_cmd_eq", &dev->priv.uuari.uars[0]); @@ -459,7 +470,7 @@ int mlx5_start_eqs(struct mlx5_core_dev *dev) mlx5_cmd_use_events(dev); err = mlx5_create_map_eq(dev, &table->async_eq, MLX5_EQ_VEC_ASYNC, - MLX5_NUM_ASYNC_EQE, MLX5_ASYNC_EVENT_MASK, + MLX5_NUM_ASYNC_EQE, async_event_mask, "mlx5_async_eq", &dev->priv.uuari.uars[0]); if (err) { mlx5_core_warn(dev, "failed to create async EQ %d\n", err); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fw.c b/drivers/net/ethernet/mellanox/mlx5/core/fw.c index 087c4c797deb..06f9036acd83 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fw.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/fw.c @@ -69,6 +69,46 @@ int mlx5_cmd_query_hca_cap(struct mlx5_core_dev *dev, struct mlx5_caps *caps) return mlx5_core_get_caps(dev, caps, HCA_CAP_OPMOD_GET_CUR); } +int mlx5_query_odp_caps(struct mlx5_core_dev *dev, struct mlx5_odp_caps *caps) +{ + u8 in[MLX5_ST_SZ_BYTES(query_hca_cap_in)]; + int out_sz = MLX5_ST_SZ_BYTES(query_hca_cap_out); + void *out; + int err; + + if (!(dev->caps.gen.flags & MLX5_DEV_CAP_FLAG_ON_DMND_PG)) + return -ENOTSUPP; + + memset(in, 0, sizeof(in)); + out = kzalloc(out_sz, GFP_KERNEL); + if (!out) + return -ENOMEM; + MLX5_SET(query_hca_cap_in, in, opcode, MLX5_CMD_OP_QUERY_HCA_CAP); + MLX5_SET(query_hca_cap_in, in, op_mod, HCA_CAP_OPMOD_GET_ODP_CUR); + err = mlx5_cmd_exec(dev, in, sizeof(in), out, out_sz); + if (err) + goto out; + + err = mlx5_cmd_status_to_err_v2(out); + if (err) { + mlx5_core_warn(dev, "query cur hca ODP caps failed, %d\n", err); + goto out; + } + + memcpy(caps, MLX5_ADDR_OF(query_hca_cap_out, out, capability_struct), + sizeof(*caps)); + + mlx5_core_dbg(dev, "on-demand paging capabilities:\nrc: %08x\nuc: %08x\nud: %08x\n", + be32_to_cpu(caps->per_transport_caps.rc_odp_caps), + be32_to_cpu(caps->per_transport_caps.uc_odp_caps), + be32_to_cpu(caps->per_transport_caps.ud_odp_caps)); + +out: + kfree(out); + return err; +} +EXPORT_SYMBOL(mlx5_query_odp_caps); + int mlx5_cmd_init_hca(struct mlx5_core_dev *dev) { struct mlx5_cmd_init_hca_mbox_in in; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/qp.c b/drivers/net/ethernet/mellanox/mlx5/core/qp.c index 5261a2b0da43..575d853dbe05 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/qp.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/qp.c @@ -88,6 +88,95 @@ void mlx5_rsc_event(struct mlx5_core_dev *dev, u32 rsn, int event_type) mlx5_core_put_rsc(common); } +#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING +void mlx5_eq_pagefault(struct mlx5_core_dev *dev, struct mlx5_eqe *eqe) +{ + struct mlx5_eqe_page_fault *pf_eqe = &eqe->data.page_fault; + int qpn = be32_to_cpu(pf_eqe->flags_qpn) & MLX5_QPN_MASK; + struct mlx5_core_rsc_common *common = mlx5_get_rsc(dev, qpn); + struct mlx5_core_qp *qp = + container_of(common, struct mlx5_core_qp, common); + struct mlx5_pagefault pfault; + + if (!qp) { + mlx5_core_warn(dev, "ODP event for non-existent QP %06x\n", + qpn); + return; + } + + pfault.event_subtype = eqe->sub_type; + pfault.flags = (be32_to_cpu(pf_eqe->flags_qpn) >> MLX5_QPN_BITS) & + (MLX5_PFAULT_REQUESTOR | MLX5_PFAULT_WRITE | MLX5_PFAULT_RDMA); + pfault.bytes_committed = be32_to_cpu( + pf_eqe->bytes_committed); + + mlx5_core_dbg(dev, + "PAGE_FAULT: subtype: 0x%02x, flags: 0x%02x,\n", + eqe->sub_type, pfault.flags); + + switch (eqe->sub_type) { + case MLX5_PFAULT_SUBTYPE_RDMA: + /* RDMA based event */ + pfault.rdma.r_key = + be32_to_cpu(pf_eqe->rdma.r_key); + pfault.rdma.packet_size = + be16_to_cpu(pf_eqe->rdma.packet_length); + pfault.rdma.rdma_op_len = + be32_to_cpu(pf_eqe->rdma.rdma_op_len); + pfault.rdma.rdma_va = + be64_to_cpu(pf_eqe->rdma.rdma_va); + mlx5_core_dbg(dev, + "PAGE_FAULT: qpn: 0x%06x, r_key: 0x%08x,\n", + qpn, pfault.rdma.r_key); + mlx5_core_dbg(dev, + "PAGE_FAULT: rdma_op_len: 0x%08x,\n", + pfault.rdma.rdma_op_len); + mlx5_core_dbg(dev, + "PAGE_FAULT: rdma_va: 0x%016llx,\n", + pfault.rdma.rdma_va); + mlx5_core_dbg(dev, + "PAGE_FAULT: bytes_committed: 0x%06x\n", + pfault.bytes_committed); + break; + + case MLX5_PFAULT_SUBTYPE_WQE: + /* WQE based event */ + pfault.wqe.wqe_index = + be16_to_cpu(pf_eqe->wqe.wqe_index); + pfault.wqe.packet_size = + be16_to_cpu(pf_eqe->wqe.packet_length); + mlx5_core_dbg(dev, + "PAGE_FAULT: qpn: 0x%06x, wqe_index: 0x%04x,\n", + qpn, pfault.wqe.wqe_index); + mlx5_core_dbg(dev, + "PAGE_FAULT: bytes_committed: 0x%06x\n", + pfault.bytes_committed); + break; + + default: + mlx5_core_warn(dev, + "Unsupported page fault event sub-type: 0x%02hhx, QP %06x\n", + eqe->sub_type, qpn); + /* Unsupported page faults should still be resolved by the + * page fault handler + */ + } + + if (qp->pfault_handler) { + qp->pfault_handler(qp, &pfault); + } else { + mlx5_core_err(dev, + "ODP event for QP %08x, without a fault handler in QP\n", + qpn); + /* Page fault will remain unresolved. QP will hang until it is + * destroyed + */ + } + + mlx5_core_put_rsc(common); +} +#endif + int mlx5_core_create_qp(struct mlx5_core_dev *dev, struct mlx5_core_qp *qp, struct mlx5_create_qp_mbox_in *in, @@ -322,3 +411,33 @@ int mlx5_core_xrcd_dealloc(struct mlx5_core_dev *dev, u32 xrcdn) return err; } EXPORT_SYMBOL_GPL(mlx5_core_xrcd_dealloc); + +#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING +int mlx5_core_page_fault_resume(struct mlx5_core_dev *dev, u32 qpn, + u8 flags, int error) +{ + struct mlx5_page_fault_resume_mbox_in in; + struct mlx5_page_fault_resume_mbox_out out; + int err; + + memset(&in, 0, sizeof(in)); + memset(&out, 0, sizeof(out)); + in.hdr.opcode = cpu_to_be16(MLX5_CMD_OP_PAGE_FAULT_RESUME); + in.hdr.opmod = 0; + flags &= (MLX5_PAGE_FAULT_RESUME_REQUESTOR | + MLX5_PAGE_FAULT_RESUME_WRITE | + MLX5_PAGE_FAULT_RESUME_RDMA); + flags |= (error ? MLX5_PAGE_FAULT_RESUME_ERROR : 0); + in.flags_qpn = cpu_to_be32((qpn & MLX5_QPN_MASK) | + (flags << MLX5_QPN_BITS)); + err = mlx5_cmd_exec(dev, &in, sizeof(in), &out, sizeof(out)); + if (err) + return err; + + if (out.hdr.status) + err = mlx5_cmd_status_to_err(&out.hdr); + + return err; +} +EXPORT_SYMBOL_GPL(mlx5_core_page_fault_resume); +#endif diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index ea4f1c46f761..4e5bd813bb9a 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h @@ -120,6 +120,15 @@ enum { }; enum { + MLX5_MKEY_INBOX_PG_ACCESS = 1 << 31 +}; + +enum { + MLX5_PFAULT_SUBTYPE_WQE = 0, + MLX5_PFAULT_SUBTYPE_RDMA = 1, +}; + +enum { MLX5_PERM_LOCAL_READ = 1 << 2, MLX5_PERM_LOCAL_WRITE = 1 << 3, MLX5_PERM_REMOTE_READ = 1 << 4, @@ -180,6 +189,19 @@ enum { MLX5_MKEY_MASK_FREE = 1ull << 29, }; +enum { + MLX5_UMR_TRANSLATION_OFFSET_EN = (1 << 4), + + MLX5_UMR_CHECK_NOT_FREE = (1 << 5), + MLX5_UMR_CHECK_FREE = (2 << 5), + + MLX5_UMR_INLINE = (1 << 7), +}; + +#define MLX5_UMR_MTT_ALIGNMENT 0x40 +#define MLX5_UMR_MTT_MASK (MLX5_UMR_MTT_ALIGNMENT - 1) +#define MLX5_UMR_MTT_MIN_CHUNK_SIZE MLX5_UMR_MTT_ALIGNMENT + enum mlx5_event { MLX5_EVENT_TYPE_COMP = 0x0, @@ -206,6 +228,8 @@ enum mlx5_event { MLX5_EVENT_TYPE_CMD = 0x0a, MLX5_EVENT_TYPE_PAGE_REQUEST = 0xb, + + MLX5_EVENT_TYPE_PAGE_FAULT = 0xc, }; enum { @@ -225,6 +249,7 @@ enum { MLX5_DEV_CAP_FLAG_APM = 1LL << 17, MLX5_DEV_CAP_FLAG_ATOMIC = 1LL << 18, MLX5_DEV_CAP_FLAG_BLOCK_MCAST = 1LL << 23, + MLX5_DEV_CAP_FLAG_ON_DMND_PG = 1LL << 24, MLX5_DEV_CAP_FLAG_CQ_MODER = 1LL << 29, MLX5_DEV_CAP_FLAG_RESIZE_CQ = 1LL << 30, MLX5_DEV_CAP_FLAG_DCT = 1LL << 37, @@ -290,6 +315,8 @@ enum { enum { HCA_CAP_OPMOD_GET_MAX = 0, HCA_CAP_OPMOD_GET_CUR = 1, + HCA_CAP_OPMOD_GET_ODP_MAX = 4, + HCA_CAP_OPMOD_GET_ODP_CUR = 5 }; struct mlx5_inbox_hdr { @@ -319,6 +346,23 @@ struct mlx5_cmd_query_adapter_mbox_out { u8 vsd_psid[16]; }; +enum mlx5_odp_transport_cap_bits { + MLX5_ODP_SUPPORT_SEND = 1 << 31, + MLX5_ODP_SUPPORT_RECV = 1 << 30, + MLX5_ODP_SUPPORT_WRITE = 1 << 29, + MLX5_ODP_SUPPORT_READ = 1 << 28, +}; + +struct mlx5_odp_caps { + char reserved[0x10]; + struct { + __be32 rc_odp_caps; + __be32 uc_odp_caps; + __be32 ud_odp_caps; + } per_transport_caps; + char reserved2[0xe4]; +}; + struct mlx5_cmd_init_hca_mbox_in { struct mlx5_inbox_hdr hdr; u8 rsvd0[2]; @@ -439,6 +483,27 @@ struct mlx5_eqe_page_req { __be32 rsvd1[5]; }; +struct mlx5_eqe_page_fault { + __be32 bytes_committed; + union { + struct { + u16 reserved1; + __be16 wqe_index; + u16 reserved2; + __be16 packet_length; + u8 reserved3[12]; + } __packed wqe; + struct { + __be32 r_key; + u16 reserved1; + __be16 packet_length; + __be32 rdma_op_len; + __be64 rdma_va; + } __packed rdma; + } __packed; + __be32 flags_qpn; +} __packed; + union ev_data { __be32 raw[7]; struct mlx5_eqe_cmd cmd; @@ -450,6 +515,7 @@ union ev_data { struct mlx5_eqe_congestion cong; struct mlx5_eqe_stall_vl stall_vl; struct mlx5_eqe_page_req req_pages; + struct mlx5_eqe_page_fault page_fault; } __packed; struct mlx5_eqe { @@ -776,6 +842,10 @@ struct mlx5_query_eq_mbox_out { struct mlx5_eq_context ctx; }; +enum { + MLX5_MKEY_STATUS_FREE = 1 << 6, +}; + struct mlx5_mkey_seg { /* This is a two bit field occupying bits 31-30. * bit 31 is always 0, @@ -812,7 +882,7 @@ struct mlx5_query_special_ctxs_mbox_out { struct mlx5_create_mkey_mbox_in { struct mlx5_inbox_hdr hdr; __be32 input_mkey_index; - u8 rsvd0[4]; + __be32 flags; struct mlx5_mkey_seg seg; u8 rsvd1[16]; __be32 xlat_oct_act_size; diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index b1bf41556b32..166d9315fe4b 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -113,6 +113,13 @@ enum { MLX5_REG_HOST_ENDIANNESS = 0x7004, }; +enum mlx5_page_fault_resume_flags { + MLX5_PAGE_FAULT_RESUME_REQUESTOR = 1 << 0, + MLX5_PAGE_FAULT_RESUME_WRITE = 1 << 1, + MLX5_PAGE_FAULT_RESUME_RDMA = 1 << 2, + MLX5_PAGE_FAULT_RESUME_ERROR = 1 << 7, +}; + enum dbg_rsc_type { MLX5_DBG_RSC_QP, MLX5_DBG_RSC_EQ, @@ -467,7 +474,7 @@ struct mlx5_priv { struct workqueue_struct *pg_wq; struct rb_root page_root; int fw_pages; - int reg_pages; + atomic_t reg_pages; struct list_head free_list; struct mlx5_core_health health; @@ -703,6 +710,9 @@ void mlx5_eq_cleanup(struct mlx5_core_dev *dev); void mlx5_fill_page_array(struct mlx5_buf *buf, __be64 *pas); void mlx5_cq_completion(struct mlx5_core_dev *dev, u32 cqn); void mlx5_rsc_event(struct mlx5_core_dev *dev, u32 rsn, int event_type); +#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING +void mlx5_eq_pagefault(struct mlx5_core_dev *dev, struct mlx5_eqe *eqe); +#endif void mlx5_srq_event(struct mlx5_core_dev *dev, u32 srqn, int event_type); struct mlx5_core_srq *mlx5_core_get_srq(struct mlx5_core_dev *dev, u32 srqn); void mlx5_cmd_comp_handler(struct mlx5_core_dev *dev, unsigned long vector); @@ -740,6 +750,8 @@ int mlx5_core_create_psv(struct mlx5_core_dev *dev, u32 pdn, int npsvs, u32 *sig_index); int mlx5_core_destroy_psv(struct mlx5_core_dev *dev, int psv_num); void mlx5_core_put_rsc(struct mlx5_core_rsc_common *common); +int mlx5_query_odp_caps(struct mlx5_core_dev *dev, + struct mlx5_odp_caps *odp_caps); static inline u32 mlx5_mkey_to_idx(u32 mkey) { diff --git a/include/linux/mlx5/qp.h b/include/linux/mlx5/qp.h index 3fa075daeb1d..61f7a342d1bf 100644 --- a/include/linux/mlx5/qp.h +++ b/include/linux/mlx5/qp.h @@ -50,6 +50,9 @@ #define MLX5_BSF_APPTAG_ESCAPE 0x1 #define MLX5_BSF_APPREF_ESCAPE 0x2 +#define MLX5_QPN_BITS 24 +#define MLX5_QPN_MASK ((1 << MLX5_QPN_BITS) - 1) + enum mlx5_qp_optpar { MLX5_QP_OPTPAR_ALT_ADDR_PATH = 1 << 0, MLX5_QP_OPTPAR_RRE = 1 << 1, @@ -189,6 +192,14 @@ struct mlx5_wqe_ctrl_seg { __be32 imm; }; +#define MLX5_WQE_CTRL_DS_MASK 0x3f +#define MLX5_WQE_CTRL_QPN_MASK 0xffffff00 +#define MLX5_WQE_CTRL_QPN_SHIFT 8 +#define MLX5_WQE_DS_UNITS 16 +#define MLX5_WQE_CTRL_OPCODE_MASK 0xff +#define MLX5_WQE_CTRL_WQE_INDEX_MASK 0x00ffff00 +#define MLX5_WQE_CTRL_WQE_INDEX_SHIFT 8 + struct mlx5_wqe_xrc_seg { __be32 xrc_srqn; u8 rsvd[12]; @@ -292,6 +303,8 @@ struct mlx5_wqe_signature_seg { u8 rsvd1[11]; }; +#define MLX5_WQE_INLINE_SEG_BYTE_COUNT_MASK 0x3ff + struct mlx5_wqe_inline_seg { __be32 byte_count; }; @@ -360,9 +373,46 @@ struct mlx5_stride_block_ctrl_seg { __be16 num_entries; }; +enum mlx5_pagefault_flags { + MLX5_PFAULT_REQUESTOR = 1 << 0, + MLX5_PFAULT_WRITE = 1 << 1, + MLX5_PFAULT_RDMA = 1 << 2, +}; + +/* Contains the details of a pagefault. */ +struct mlx5_pagefault { + u32 bytes_committed; + u8 event_subtype; + enum mlx5_pagefault_flags flags; + union { + /* Initiator or send message responder pagefault details. */ + struct { + /* Received packet size, only valid for responders. */ + u32 packet_size; + /* + * WQE index. Refers to either the send queue or + * receive queue, according to event_subtype. + */ + u16 wqe_index; + } wqe; + /* RDMA responder pagefault details */ + struct { + u32 r_key; + /* + * Received packet size, minimal size page fault + * resolution required for forward progress. + */ + u32 packet_size; + u32 rdma_op_len; + u64 rdma_va; + } rdma; + }; +}; + struct mlx5_core_qp { struct mlx5_core_rsc_common common; /* must be first */ void (*event) (struct mlx5_core_qp *, int); + void (*pfault_handler)(struct mlx5_core_qp *, struct mlx5_pagefault *); int qpn; struct mlx5_rsc_debug *dbg; int pid; @@ -530,6 +580,17 @@ static inline struct mlx5_core_mr *__mlx5_mr_lookup(struct mlx5_core_dev *dev, u return radix_tree_lookup(&dev->priv.mr_table.tree, key); } +struct mlx5_page_fault_resume_mbox_in { + struct mlx5_inbox_hdr hdr; + __be32 flags_qpn; + u8 reserved[4]; +}; + +struct mlx5_page_fault_resume_mbox_out { + struct mlx5_outbox_hdr hdr; + u8 rsvd[8]; +}; + int mlx5_core_create_qp(struct mlx5_core_dev *dev, struct mlx5_core_qp *qp, struct mlx5_create_qp_mbox_in *in, @@ -549,6 +610,10 @@ void mlx5_init_qp_table(struct mlx5_core_dev *dev); void mlx5_cleanup_qp_table(struct mlx5_core_dev *dev); int mlx5_debug_qp_add(struct mlx5_core_dev *dev, struct mlx5_core_qp *qp); void mlx5_debug_qp_remove(struct mlx5_core_dev *dev, struct mlx5_core_qp *qp); +#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING +int mlx5_core_page_fault_resume(struct mlx5_core_dev *dev, u32 qpn, + u8 context, int error); +#endif static inline const char *mlx5_qp_type_str(int type) { diff --git a/include/rdma/ib_umem.h b/include/rdma/ib_umem.h index a2bf41e0bde9..2d83cfd7e6ce 100644 --- a/include/rdma/ib_umem.h +++ b/include/rdma/ib_umem.h @@ -38,11 +38,12 @@ #include <linux/workqueue.h> struct ib_ucontext; +struct ib_umem_odp; struct ib_umem { struct ib_ucontext *context; size_t length; - int offset; + unsigned long address; int page_size; int writable; int hugetlb; @@ -50,17 +51,43 @@ struct ib_umem { struct pid *pid; struct mm_struct *mm; unsigned long diff; + struct ib_umem_odp *odp_data; struct sg_table sg_head; int nmap; int npages; }; +/* Returns the offset of the umem start relative to the first page. */ +static inline int ib_umem_offset(struct ib_umem *umem) +{ + return umem->address & ((unsigned long)umem->page_size - 1); +} + +/* Returns the first page of an ODP umem. */ +static inline unsigned long ib_umem_start(struct ib_umem *umem) +{ + return umem->address - ib_umem_offset(umem); +} + +/* Returns the address of the page after the last one of an ODP umem. */ +static inline unsigned long ib_umem_end(struct ib_umem *umem) +{ + return PAGE_ALIGN(umem->address + umem->length); +} + +static inline size_t ib_umem_num_pages(struct ib_umem *umem) +{ + return (ib_umem_end(umem) - ib_umem_start(umem)) >> PAGE_SHIFT; +} + #ifdef CONFIG_INFINIBAND_USER_MEM struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr, size_t size, int access, int dmasync); void ib_umem_release(struct ib_umem *umem); int ib_umem_page_count(struct ib_umem *umem); +int ib_umem_copy_from(void *dst, struct ib_umem *umem, size_t offset, + size_t length); #else /* CONFIG_INFINIBAND_USER_MEM */ @@ -73,7 +100,10 @@ static inline struct ib_umem *ib_umem_get(struct ib_ucontext *context, } static inline void ib_umem_release(struct ib_umem *umem) { } static inline int ib_umem_page_count(struct ib_umem *umem) { return 0; } - +static inline int ib_umem_copy_from(void *dst, struct ib_umem *umem, size_t offset, + size_t length) { + return -EINVAL; +} #endif /* CONFIG_INFINIBAND_USER_MEM */ #endif /* IB_UMEM_H */ diff --git a/include/rdma/ib_umem_odp.h b/include/rdma/ib_umem_odp.h new file mode 100644 index 000000000000..3da0b167041b --- /dev/null +++ b/include/rdma/ib_umem_odp.h @@ -0,0 +1,160 @@ +/* + * Copyright (c) 2014 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef IB_UMEM_ODP_H +#define IB_UMEM_ODP_H + +#include <rdma/ib_umem.h> +#include <rdma/ib_verbs.h> +#include <linux/interval_tree.h> + +struct umem_odp_node { + u64 __subtree_last; + struct rb_node rb; +}; + +struct ib_umem_odp { + /* + * An array of the pages included in the on-demand paging umem. + * Indices of pages that are currently not mapped into the device will + * contain NULL. + */ + struct page **page_list; + /* + * An array of the same size as page_list, with DMA addresses mapped + * for pages the pages in page_list. The lower two bits designate + * access permissions. See ODP_READ_ALLOWED_BIT and + * ODP_WRITE_ALLOWED_BIT. + */ + dma_addr_t *dma_list; + /* + * The umem_mutex protects the page_list and dma_list fields of an ODP + * umem, allowing only a single thread to map/unmap pages. The mutex + * also protects access to the mmu notifier counters. + */ + struct mutex umem_mutex; + void *private; /* for the HW driver to use. */ + + /* When false, use the notifier counter in the ucontext struct. */ + bool mn_counters_active; + int notifiers_seq; + int notifiers_count; + + /* A linked list of umems that don't have private mmu notifier + * counters yet. */ + struct list_head no_private_counters; + struct ib_umem *umem; + + /* Tree tracking */ + struct umem_odp_node interval_tree; + + struct completion notifier_completion; + int dying; +}; + +#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING + +int ib_umem_odp_get(struct ib_ucontext *context, struct ib_umem *umem); + +void ib_umem_odp_release(struct ib_umem *umem); + +/* + * The lower 2 bits of the DMA address signal the R/W permissions for + * the entry. To upgrade the permissions, provide the appropriate + * bitmask to the map_dma_pages function. + * + * Be aware that upgrading a mapped address might result in change of + * the DMA address for the page. + */ +#define ODP_READ_ALLOWED_BIT (1<<0ULL) +#define ODP_WRITE_ALLOWED_BIT (1<<1ULL) + +#define ODP_DMA_ADDR_MASK (~(ODP_READ_ALLOWED_BIT | ODP_WRITE_ALLOWED_BIT)) + +int ib_umem_odp_map_dma_pages(struct ib_umem *umem, u64 start_offset, u64 bcnt, + u64 access_mask, unsigned long current_seq); + +void ib_umem_odp_unmap_dma_pages(struct ib_umem *umem, u64 start_offset, + u64 bound); + +void rbt_ib_umem_insert(struct umem_odp_node *node, struct rb_root *root); +void rbt_ib_umem_remove(struct umem_odp_node *node, struct rb_root *root); +typedef int (*umem_call_back)(struct ib_umem *item, u64 start, u64 end, + void *cookie); +/* + * Call the callback on each ib_umem in the range. Returns the logical or of + * the return values of the functions called. + */ +int rbt_ib_umem_for_each_in_range(struct rb_root *root, u64 start, u64 end, + umem_call_back cb, void *cookie); + +struct umem_odp_node *rbt_ib_umem_iter_first(struct rb_root *root, + u64 start, u64 last); +struct umem_odp_node *rbt_ib_umem_iter_next(struct umem_odp_node *node, + u64 start, u64 last); + +static inline int ib_umem_mmu_notifier_retry(struct ib_umem *item, + unsigned long mmu_seq) +{ + /* + * This code is strongly based on the KVM code from + * mmu_notifier_retry. Should be called with + * the relevant locks taken (item->odp_data->umem_mutex + * and the ucontext umem_mutex semaphore locked for read). + */ + + /* Do not allow page faults while the new ib_umem hasn't seen a state + * with zero notifiers yet, and doesn't have its own valid set of + * private counters. */ + if (!item->odp_data->mn_counters_active) + return 1; + + if (unlikely(item->odp_data->notifiers_count)) + return 1; + if (item->odp_data->notifiers_seq != mmu_seq) + return 1; + return 0; +} + +#else /* CONFIG_INFINIBAND_ON_DEMAND_PAGING */ + +static inline int ib_umem_odp_get(struct ib_ucontext *context, + struct ib_umem *umem) +{ + return -EINVAL; +} + +static inline void ib_umem_odp_release(struct ib_umem *umem) {} + +#endif /* CONFIG_INFINIBAND_ON_DEMAND_PAGING */ + +#endif /* IB_UMEM_ODP_H */ diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 470a011d6fa4..0d74f1de99aa 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -51,6 +51,7 @@ #include <uapi/linux/if_ether.h> #include <linux/atomic.h> +#include <linux/mmu_notifier.h> #include <asm/uaccess.h> extern struct workqueue_struct *ib_wq; @@ -123,7 +124,8 @@ enum ib_device_cap_flags { IB_DEVICE_MEM_WINDOW_TYPE_2A = (1<<23), IB_DEVICE_MEM_WINDOW_TYPE_2B = (1<<24), IB_DEVICE_MANAGED_FLOW_STEERING = (1<<29), - IB_DEVICE_SIGNATURE_HANDOVER = (1<<30) + IB_DEVICE_SIGNATURE_HANDOVER = (1<<30), + IB_DEVICE_ON_DEMAND_PAGING = (1<<31), }; enum ib_signature_prot_cap { @@ -143,6 +145,27 @@ enum ib_atomic_cap { IB_ATOMIC_GLOB }; +enum ib_odp_general_cap_bits { + IB_ODP_SUPPORT = 1 << 0, +}; + +enum ib_odp_transport_cap_bits { + IB_ODP_SUPPORT_SEND = 1 << 0, + IB_ODP_SUPPORT_RECV = 1 << 1, + IB_ODP_SUPPORT_WRITE = 1 << 2, + IB_ODP_SUPPORT_READ = 1 << 3, + IB_ODP_SUPPORT_ATOMIC = 1 << 4, +}; + +struct ib_odp_caps { + uint64_t general_caps; + struct { + uint32_t rc_odp_caps; + uint32_t uc_odp_caps; + uint32_t ud_odp_caps; + } per_transport_caps; +}; + struct ib_device_attr { u64 fw_ver; __be64 sys_image_guid; @@ -186,6 +209,7 @@ struct ib_device_attr { u8 local_ca_ack_delay; int sig_prot_cap; int sig_guard_cap; + struct ib_odp_caps odp_caps; }; enum ib_mtu { @@ -1073,7 +1097,8 @@ enum ib_access_flags { IB_ACCESS_REMOTE_READ = (1<<2), IB_ACCESS_REMOTE_ATOMIC = (1<<3), IB_ACCESS_MW_BIND = (1<<4), - IB_ZERO_BASED = (1<<5) + IB_ZERO_BASED = (1<<5), + IB_ACCESS_ON_DEMAND = (1<<6), }; struct ib_phys_buf { @@ -1115,6 +1140,8 @@ struct ib_fmr_attr { u8 page_shift; }; +struct ib_umem; + struct ib_ucontext { struct ib_device *device; struct list_head pd_list; @@ -1127,6 +1154,24 @@ struct ib_ucontext { struct list_head xrcd_list; struct list_head rule_list; int closing; + + struct pid *tgid; +#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING + struct rb_root umem_tree; + /* + * Protects .umem_rbroot and tree, as well as odp_mrs_count and + * mmu notifiers registration. + */ + struct rw_semaphore umem_rwsem; + void (*invalidate_range)(struct ib_umem *umem, + unsigned long start, unsigned long end); + + struct mmu_notifier mn; + atomic_t notifier_count; + /* A list of umems that don't have private mmu notifier counters yet. */ + struct list_head no_private_counters; + int odp_mrs_count; +#endif }; struct ib_uobject { @@ -1662,7 +1707,10 @@ static inline int ib_copy_from_udata(void *dest, struct ib_udata *udata, size_t static inline int ib_copy_to_udata(struct ib_udata *udata, void *src, size_t len) { - return copy_to_user(udata->outbuf, src, len) ? -EFAULT : 0; + size_t copy_sz; + + copy_sz = min_t(size_t, len, udata->outlen); + return copy_to_user(udata->outbuf, src, copy_sz) ? -EFAULT : 0; } /** diff --git a/include/uapi/rdma/ib_user_verbs.h b/include/uapi/rdma/ib_user_verbs.h index 26daf55ff76e..4275b961bf60 100644 --- a/include/uapi/rdma/ib_user_verbs.h +++ b/include/uapi/rdma/ib_user_verbs.h @@ -90,8 +90,9 @@ enum { }; enum { + IB_USER_VERBS_EX_CMD_QUERY_DEVICE = IB_USER_VERBS_CMD_QUERY_DEVICE, IB_USER_VERBS_EX_CMD_CREATE_FLOW = IB_USER_VERBS_CMD_THRESHOLD, - IB_USER_VERBS_EX_CMD_DESTROY_FLOW + IB_USER_VERBS_EX_CMD_DESTROY_FLOW, }; /* @@ -201,6 +202,32 @@ struct ib_uverbs_query_device_resp { __u8 reserved[4]; }; +enum { + IB_USER_VERBS_EX_QUERY_DEVICE_ODP = 1ULL << 0, +}; + +struct ib_uverbs_ex_query_device { + __u32 comp_mask; + __u32 reserved; +}; + +struct ib_uverbs_odp_caps { + __u64 general_caps; + struct { + __u32 rc_odp_caps; + __u32 uc_odp_caps; + __u32 ud_odp_caps; + } per_transport_caps; + __u32 reserved; +}; + +struct ib_uverbs_ex_query_device_resp { + struct ib_uverbs_query_device_resp base; + __u32 comp_mask; + __u32 reserved; + struct ib_uverbs_odp_caps odp_caps; +}; + struct ib_uverbs_query_port { __u64 response; __u8 port_num; |