45 files changed, 1995 insertions, 821 deletions
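The bulk of the fs/lockd changes below convert plain atomic_t reference counters (nlm_host.h_count, nsm_handle.sm_count, nlm_lockowner.count, nlm_rqst.a_count) to the kernel's refcount_t API, which saturates instead of wrapping and warns on misuse. A minimal sketch of the get/put pattern being adopted follows; the struct and helper names here are illustrative only and do not appear in the patch:

	#include <linux/refcount.h>
	#include <linux/spinlock.h>
	#include <linux/list.h>
	#include <linux/slab.h>

	struct demo_handle {			/* illustrative only */
		struct list_head	link;
		refcount_t		count;	/* refcount_set(&count, 1) at allocation */
		spinlock_t		*lock;	/* protects the containing list */
	};

	static struct demo_handle *demo_get(struct demo_handle *h)
	{
		refcount_inc(&h->count);	/* replaces atomic_inc() */
		return h;
	}

	static void demo_put(struct demo_handle *h)
	{
		/* replaces atomic_dec_and_lock(): h->lock is taken only when
		 * the count reaches zero, and we unlock after unlinking */
		if (!refcount_dec_and_lock(&h->count, h->lock))
			return;
		list_del(&h->link);
		spin_unlock(h->lock);
		kfree(h);
	}

The garbage-collection case is handled with refcount_dec_if_one(), as in nlm_gc_hosts() below: the count is dropped to zero only if it is currently exactly 1, so a host is destroyed only when the collector holds the last reference.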
diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c index 066ac313ae5c..a2c0dfc6fdc0 100644 --- a/fs/lockd/clntproc.c +++ b/fs/lockd/clntproc.c @@ -48,13 +48,13 @@ void nlmclnt_next_cookie(struct nlm_cookie *c) static struct nlm_lockowner *nlm_get_lockowner(struct nlm_lockowner *lockowner) { - atomic_inc(&lockowner->count); + refcount_inc(&lockowner->count); return lockowner; } static void nlm_put_lockowner(struct nlm_lockowner *lockowner) { - if (!atomic_dec_and_lock(&lockowner->count, &lockowner->host->h_lock)) + if (!refcount_dec_and_lock(&lockowner->count, &lockowner->host->h_lock)) return; list_del(&lockowner->list); spin_unlock(&lockowner->host->h_lock); @@ -105,7 +105,7 @@ static struct nlm_lockowner *nlm_find_lockowner(struct nlm_host *host, fl_owner_ res = __nlm_find_lockowner(host, owner); if (res == NULL && new != NULL) { res = new; - atomic_set(&new->count, 1); + refcount_set(&new->count, 1); new->owner = owner; new->pid = __nlm_alloc_pid(host); new->host = nlm_get_host(host); @@ -204,7 +204,7 @@ struct nlm_rqst *nlm_alloc_call(struct nlm_host *host) for(;;) { call = kzalloc(sizeof(*call), GFP_KERNEL); if (call != NULL) { - atomic_set(&call->a_count, 1); + refcount_set(&call->a_count, 1); locks_init_lock(&call->a_args.lock.fl); locks_init_lock(&call->a_res.lock.fl); call->a_host = nlm_get_host(host); @@ -222,7 +222,7 @@ void nlmclnt_release_call(struct nlm_rqst *call) { const struct nlmclnt_operations *nlmclnt_ops = call->a_host->h_nlmclnt_ops; - if (!atomic_dec_and_test(&call->a_count)) + if (!refcount_dec_and_test(&call->a_count)) return; if (nlmclnt_ops && nlmclnt_ops->nlmclnt_release_call) nlmclnt_ops->nlmclnt_release_call(call->a_callback_data); @@ -678,7 +678,7 @@ nlmclnt_unlock(struct nlm_rqst *req, struct file_lock *fl) goto out; } - atomic_inc(&req->a_count); + refcount_inc(&req->a_count); status = nlmclnt_async_call(nfs_file_cred(fl->fl_file), req, NLMPROC_UNLOCK, &nlmclnt_unlock_ops); if (status < 0) @@ -769,7 +769,7 @@ static int nlmclnt_cancel(struct nlm_host *host, int block, struct file_lock *fl nlmclnt_setlockargs(req, fl); req->a_args.block = block; - atomic_inc(&req->a_count); + refcount_inc(&req->a_count); status = nlmclnt_async_call(nfs_file_cred(fl->fl_file), req, NLMPROC_CANCEL, &nlmclnt_cancel_ops); if (status == 0 && req->a_res.status == nlm_lck_denied) diff --git a/fs/lockd/host.c b/fs/lockd/host.c index 826a89184f90..d35cd6be0675 100644 --- a/fs/lockd/host.c +++ b/fs/lockd/host.c @@ -114,7 +114,7 @@ static struct nlm_host *nlm_alloc_host(struct nlm_lookup_host_info *ni, unsigned long now = jiffies; if (nsm != NULL) - atomic_inc(&nsm->sm_count); + refcount_inc(&nsm->sm_count); else { host = NULL; nsm = nsm_get_handle(ni->net, ni->sap, ni->salen, @@ -151,7 +151,7 @@ static struct nlm_host *nlm_alloc_host(struct nlm_lookup_host_info *ni, host->h_state = 0; host->h_nsmstate = 0; host->h_pidcount = 0; - atomic_set(&host->h_count, 1); + refcount_set(&host->h_count, 1); mutex_init(&host->h_mutex); host->h_nextrebind = now + NLM_HOST_REBIND; host->h_expires = now + NLM_HOST_EXPIRE; @@ -290,7 +290,7 @@ void nlmclnt_release_host(struct nlm_host *host) WARN_ON_ONCE(host->h_server); - if (atomic_dec_and_test(&host->h_count)) { + if (refcount_dec_and_test(&host->h_count)) { WARN_ON_ONCE(!list_empty(&host->h_lockowners)); WARN_ON_ONCE(!list_empty(&host->h_granted)); WARN_ON_ONCE(!list_empty(&host->h_reclaim)); @@ -388,6 +388,8 @@ struct nlm_host *nlmsvc_lookup_host(const struct svc_rqst *rqstp, ln->nrhosts++; nrhosts++; + refcount_inc(&host->h_count); + 
dprintk("lockd: %s created host %s (%s)\n", __func__, host->h_name, host->h_addrbuf); @@ -410,7 +412,7 @@ void nlmsvc_release_host(struct nlm_host *host) dprintk("lockd: release server host %s\n", host->h_name); WARN_ON_ONCE(!host->h_server); - atomic_dec(&host->h_count); + refcount_dec(&host->h_count); } /* @@ -504,7 +506,7 @@ struct nlm_host * nlm_get_host(struct nlm_host *host) { if (host) { dprintk("lockd: get host %s\n", host->h_name); - atomic_inc(&host->h_count); + refcount_inc(&host->h_count); host->h_expires = jiffies + NLM_HOST_EXPIRE; } return host; @@ -593,7 +595,7 @@ static void nlm_complain_hosts(struct net *net) if (net && host->net != net) continue; dprintk(" %s (cnt %d use %d exp %ld net %x)\n", - host->h_name, atomic_read(&host->h_count), + host->h_name, refcount_read(&host->h_count), host->h_inuse, host->h_expires, host->net->ns.inum); } } @@ -662,16 +664,16 @@ nlm_gc_hosts(struct net *net) for_each_host_safe(host, next, chain, nlm_server_hosts) { if (net && host->net != net) continue; - if (atomic_read(&host->h_count) || host->h_inuse - || time_before(jiffies, host->h_expires)) { + if (host->h_inuse || time_before(jiffies, host->h_expires)) { dprintk("nlm_gc_hosts skipping %s " "(cnt %d use %d exp %ld net %x)\n", - host->h_name, atomic_read(&host->h_count), + host->h_name, refcount_read(&host->h_count), host->h_inuse, host->h_expires, host->net->ns.inum); continue; } - nlm_destroy_host_locked(host); + if (refcount_dec_if_one(&host->h_count)) + nlm_destroy_host_locked(host); } if (net) { diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c index 96cfb2967ac7..654594ef4f94 100644 --- a/fs/lockd/mon.c +++ b/fs/lockd/mon.c @@ -191,7 +191,7 @@ void nsm_unmonitor(const struct nlm_host *host) struct nsm_res res; int status; - if (atomic_read(&nsm->sm_count) == 1 + if (refcount_read(&nsm->sm_count) == 1 && nsm->sm_monitored && !nsm->sm_sticky) { dprintk("lockd: nsm_unmonitor(%s)\n", nsm->sm_name); @@ -279,7 +279,7 @@ static struct nsm_handle *nsm_create_handle(const struct sockaddr *sap, if (unlikely(new == NULL)) return NULL; - atomic_set(&new->sm_count, 1); + refcount_set(&new->sm_count, 1); new->sm_name = (char *)(new + 1); memcpy(nsm_addr(new), sap, salen); new->sm_addrlen = salen; @@ -337,13 +337,13 @@ retry: cached = nsm_lookup_addr(&ln->nsm_handles, sap); if (cached != NULL) { - atomic_inc(&cached->sm_count); + refcount_inc(&cached->sm_count); spin_unlock(&nsm_lock); kfree(new); dprintk("lockd: found nsm_handle for %s (%s), " "cnt %d\n", cached->sm_name, cached->sm_addrbuf, - atomic_read(&cached->sm_count)); + refcount_read(&cached->sm_count)); return cached; } @@ -388,12 +388,12 @@ struct nsm_handle *nsm_reboot_lookup(const struct net *net, return cached; } - atomic_inc(&cached->sm_count); + refcount_inc(&cached->sm_count); spin_unlock(&nsm_lock); dprintk("lockd: host %s (%s) rebooted, cnt %d\n", cached->sm_name, cached->sm_addrbuf, - atomic_read(&cached->sm_count)); + refcount_read(&cached->sm_count)); return cached; } @@ -404,7 +404,7 @@ struct nsm_handle *nsm_reboot_lookup(const struct net *net, */ void nsm_release(struct nsm_handle *nsm) { - if (atomic_dec_and_lock(&nsm->sm_count, &nsm_lock)) { + if (refcount_dec_and_lock(&nsm->sm_count, &nsm_lock)) { list_del(&nsm->sm_link); spin_unlock(&nsm_lock); dprintk("lockd: destroyed nsm_handle for %s (%s)\n", diff --git a/fs/lockd/svcproc.c b/fs/lockd/svcproc.c index 0d670c5c378f..ea77c66d3cc3 100644 --- a/fs/lockd/svcproc.c +++ b/fs/lockd/svcproc.c @@ -295,7 +295,7 @@ static void nlmsvc_callback_exit(struct rpc_task *task, void 
*data) void nlmsvc_release_call(struct nlm_rqst *call) { - if (!atomic_dec_and_test(&call->a_count)) + if (!refcount_dec_and_test(&call->a_count)) return; nlmsvc_release_host(call->a_host); kfree(call); diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c index 995d707537da..7cb5c38c19e4 100644 --- a/fs/nfs/blocklayout/blocklayout.c +++ b/fs/nfs/blocklayout/blocklayout.c @@ -137,6 +137,11 @@ bl_alloc_init_bio(int npg, struct block_device *bdev, sector_t disk_sector, return bio; } +static bool offset_in_map(u64 offset, struct pnfs_block_dev_map *map) +{ + return offset >= map->start && offset < map->start + map->len; +} + static struct bio * do_add_page_to_bio(struct bio *bio, int npg, int rw, sector_t isect, struct page *page, struct pnfs_block_dev_map *map, @@ -156,8 +161,8 @@ do_add_page_to_bio(struct bio *bio, int npg, int rw, sector_t isect, /* translate to physical disk offset */ disk_addr = (u64)isect << SECTOR_SHIFT; - if (disk_addr < map->start || disk_addr >= map->start + map->len) { - if (!dev->map(dev, disk_addr, map)) + if (!offset_in_map(disk_addr, map)) { + if (!dev->map(dev, disk_addr, map) || !offset_in_map(disk_addr, map)) return ERR_PTR(-EIO); bio = bl_submit_bio(bio); } @@ -184,6 +189,29 @@ retry: return bio; } +static void bl_mark_devices_unavailable(struct nfs_pgio_header *header, bool rw) +{ + struct pnfs_block_layout *bl = BLK_LSEG2EXT(header->lseg); + size_t bytes_left = header->args.count; + sector_t isect, extent_length = 0; + struct pnfs_block_extent be; + + isect = header->args.offset >> SECTOR_SHIFT; + bytes_left += header->args.offset - (isect << SECTOR_SHIFT); + + while (bytes_left > 0) { + if (!ext_tree_lookup(bl, isect, &be, rw)) + return; + extent_length = be.be_length - (isect - be.be_f_offset); + nfs4_mark_deviceid_unavailable(be.be_device); + isect += extent_length; + if (bytes_left > extent_length << SECTOR_SHIFT) + bytes_left -= extent_length << SECTOR_SHIFT; + else + bytes_left = 0; + } +} + static void bl_end_io_read(struct bio *bio) { struct parallel_io *par = bio->bi_private; @@ -194,6 +222,7 @@ static void bl_end_io_read(struct bio *bio) if (!header->pnfs_error) header->pnfs_error = -EIO; pnfs_set_lo_fail(header->lseg); + bl_mark_devices_unavailable(header, false); } bio_put(bio); @@ -323,6 +352,7 @@ static void bl_end_io_write(struct bio *bio) if (!header->pnfs_error) header->pnfs_error = -EIO; pnfs_set_lo_fail(header->lseg); + bl_mark_devices_unavailable(header, true); } bio_put(bio); put_parallel(par); @@ -552,6 +582,31 @@ static int decode_sector_number(__be32 **rp, sector_t *sp) return 0; } +static struct nfs4_deviceid_node * +bl_find_get_deviceid(struct nfs_server *server, + const struct nfs4_deviceid *id, struct rpc_cred *cred, + gfp_t gfp_mask) +{ + struct nfs4_deviceid_node *node; + unsigned long start, end; + +retry: + node = nfs4_find_get_deviceid(server, id, cred, gfp_mask); + if (!node) + return ERR_PTR(-ENODEV); + + if (test_bit(NFS_DEVICEID_UNAVAILABLE, &node->flags) == 0) + return node; + + end = jiffies; + start = end - PNFS_DEVICE_RETRY_TIMEOUT; + if (!time_in_range(node->timestamp_unavailable, start, end)) { + nfs4_delete_deviceid(node->ld, node->nfs_client, id); + goto retry; + } + return ERR_PTR(-ENODEV); +} + static int bl_alloc_extent(struct xdr_stream *xdr, struct pnfs_layout_hdr *lo, struct layout_verification *lv, struct list_head *extents, @@ -573,16 +628,18 @@ bl_alloc_extent(struct xdr_stream *xdr, struct pnfs_layout_hdr *lo, memcpy(&id, p, NFS4_DEVICEID4_SIZE); p += 
XDR_QUADLEN(NFS4_DEVICEID4_SIZE); - error = -EIO; - be->be_device = nfs4_find_get_deviceid(NFS_SERVER(lo->plh_inode), &id, + be->be_device = bl_find_get_deviceid(NFS_SERVER(lo->plh_inode), &id, lo->plh_lc_cred, gfp_mask); - if (!be->be_device) + if (IS_ERR(be->be_device)) { + error = PTR_ERR(be->be_device); goto out_free_be; + } /* * The next three values are read in as bytes, but stored in the * extent structure in 512-byte granularity. */ + error = -EIO; if (decode_sector_number(&p, &be->be_f_offset) < 0) goto out_put_deviceid; if (decode_sector_number(&p, &be->be_length) < 0) @@ -692,11 +749,16 @@ out_free_scratch: __free_page(scratch); out: dprintk("%s returns %d\n", __func__, status); - if (status) { + switch (status) { + case -ENODEV: + /* Our extent block devices are unavailable */ + set_bit(NFS_LSEG_UNAVAILABLE, &lseg->pls_flags); + case 0: + return lseg; + default: kfree(lseg); return ERR_PTR(status); } - return lseg; } static void @@ -798,6 +860,13 @@ bl_pg_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *req) } pnfs_generic_pg_init_read(pgio, req); + + if (pgio->pg_lseg && + test_bit(NFS_LSEG_UNAVAILABLE, &pgio->pg_lseg->pls_flags)) { + pnfs_error_mark_layout_for_return(pgio->pg_inode, pgio->pg_lseg); + pnfs_set_lo_fail(pgio->pg_lseg); + nfs_pageio_reset_read_mds(pgio); + } } /* @@ -853,6 +922,14 @@ bl_pg_init_write(struct nfs_pageio_descriptor *pgio, struct nfs_page *req) wb_size = nfs_dreq_bytes_left(pgio->pg_dreq); pnfs_generic_pg_init_write(pgio, req, wb_size); + + if (pgio->pg_lseg && + test_bit(NFS_LSEG_UNAVAILABLE, &pgio->pg_lseg->pls_flags)) { + + pnfs_error_mark_layout_for_return(pgio->pg_inode, pgio->pg_lseg); + pnfs_set_lo_fail(pgio->pg_lseg); + nfs_pageio_reset_write_mds(pgio); + } } /* @@ -887,6 +964,7 @@ static struct pnfs_layoutdriver_type blocklayout_type = { .name = "LAYOUT_BLOCK_VOLUME", .owner = THIS_MODULE, .flags = PNFS_LAYOUTRET_ON_SETATTR | + PNFS_LAYOUTRET_ON_ERROR | PNFS_READ_WHOLE_PAGE, .read_pagelist = bl_read_pagelist, .write_pagelist = bl_write_pagelist, @@ -910,6 +988,7 @@ static struct pnfs_layoutdriver_type scsilayout_type = { .name = "LAYOUT_SCSI", .owner = THIS_MODULE, .flags = PNFS_LAYOUTRET_ON_SETATTR | + PNFS_LAYOUTRET_ON_ERROR | PNFS_READ_WHOLE_PAGE, .read_pagelist = bl_read_pagelist, .write_pagelist = bl_write_pagelist, @@ -967,6 +1046,7 @@ static void __exit nfs4blocklayout_exit(void) } MODULE_ALIAS("nfs-layouttype4-3"); +MODULE_ALIAS("nfs-layouttype4-5"); module_init(nfs4blocklayout_init); module_exit(nfs4blocklayout_exit); diff --git a/fs/nfs/blocklayout/blocklayout.h b/fs/nfs/blocklayout/blocklayout.h index efc007f00742..716bc75e9ed2 100644 --- a/fs/nfs/blocklayout/blocklayout.h +++ b/fs/nfs/blocklayout/blocklayout.h @@ -92,10 +92,9 @@ struct pnfs_block_volume { }; struct pnfs_block_dev_map { - sector_t start; - sector_t len; - - sector_t disk_offset; + u64 start; + u64 len; + u64 disk_offset; struct block_device *bdev; }; diff --git a/fs/nfs/blocklayout/dev.c b/fs/nfs/blocklayout/dev.c index 95f74bd2c067..a7efd83779d2 100644 --- a/fs/nfs/blocklayout/dev.c +++ b/fs/nfs/blocklayout/dev.c @@ -533,14 +533,11 @@ bl_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev, goto out_free_volumes; ret = bl_parse_deviceid(server, top, volumes, nr_volumes - 1, gfp_mask); - if (ret) { - bl_free_device(top); - kfree(top); - goto out_free_volumes; - } node = &top->node; nfs4_init_deviceid_node(node, server, &pdev->dev_id); + if (ret) + nfs4_mark_deviceid_unavailable(node); out_free_volumes: kfree(volumes); diff --git 
a/fs/nfs/direct.c b/fs/nfs/direct.c index d2972d537469..8c10b0562e75 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -775,10 +775,8 @@ static void nfs_direct_write_completion(struct nfs_pgio_header *hdr) spin_lock(&dreq->lock); - if (test_bit(NFS_IOHDR_ERROR, &hdr->flags)) { - dreq->flags = 0; + if (test_bit(NFS_IOHDR_ERROR, &hdr->flags)) dreq->error = hdr->error; - } if (dreq->error == 0) { nfs_direct_good_bytes(dreq, hdr); if (nfs_write_need_commit(hdr)) { diff --git a/fs/nfs/export.c b/fs/nfs/export.c index 83fd09fc8f77..ab5de3246c5c 100644 --- a/fs/nfs/export.c +++ b/fs/nfs/export.c @@ -48,10 +48,6 @@ nfs_encode_fh(struct inode *inode, __u32 *p, int *max_len, struct inode *parent) *max_len = len; return FILEID_INVALID; } - if (IS_AUTOMOUNT(inode)) { - *max_len = FILEID_INVALID; - goto out; - } p[FILEID_HIGH_OFF] = NFS_FILEID(inode) >> 32; p[FILEID_LOW_OFF] = NFS_FILEID(inode); @@ -59,7 +55,6 @@ nfs_encode_fh(struct inode *inode, __u32 *p, int *max_len, struct inode *parent) p[len - 1] = 0; /* Padding */ nfs_copy_fh(clnt_fh, server_fh); *max_len = len; -out: dprintk("%s: result fh fileid %llu mode %u size %d\n", __func__, NFS_FILEID(inode), inode->i_mode, *max_len); return *max_len; diff --git a/fs/nfs/filelayout/filelayout.c b/fs/nfs/filelayout/filelayout.c index 4e54d8b5413a..d175724ff566 100644 --- a/fs/nfs/filelayout/filelayout.c +++ b/fs/nfs/filelayout/filelayout.c @@ -895,9 +895,7 @@ fl_pnfs_update_layout(struct inode *ino, lseg = pnfs_update_layout(ino, ctx, pos, count, iomode, strict_iomode, gfp_flags); - if (!lseg) - lseg = ERR_PTR(-ENOMEM); - if (IS_ERR(lseg)) + if (IS_ERR_OR_NULL(lseg)) goto out; lo = NFS_I(ino)->layout; diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 93552c482992..ceeaf0fb6657 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -735,12 +735,20 @@ int nfs_getattr(const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) { struct inode *inode = d_inode(path->dentry); - int need_atime = NFS_I(inode)->cache_validity & NFS_INO_INVALID_ATIME; + struct nfs_server *server = NFS_SERVER(inode); + unsigned long cache_validity; int err = 0; + bool force_sync = query_flags & AT_STATX_FORCE_SYNC; + bool do_update = false; trace_nfs_getattr_enter(inode); + + if ((query_flags & AT_STATX_DONT_SYNC) && !force_sync) + goto out_no_update; + /* Flush out writes to the server in order to update c/mtime. */ - if (S_ISREG(inode->i_mode)) { + if ((request_mask & (STATX_CTIME|STATX_MTIME)) && + S_ISREG(inode->i_mode)) { err = filemap_write_and_wait(inode->i_mapping); if (err) goto out; @@ -757,24 +765,42 @@ int nfs_getattr(const struct path *path, struct kstat *stat, */ if ((path->mnt->mnt_flags & MNT_NOATIME) || ((path->mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))) - need_atime = 0; - - if (need_atime || nfs_need_revalidate_inode(inode)) { - struct nfs_server *server = NFS_SERVER(inode); - + request_mask &= ~STATX_ATIME; + + /* Is the user requesting attributes that might need revalidation? 
*/ + if (!(request_mask & (STATX_MODE|STATX_NLINK|STATX_ATIME|STATX_CTIME| + STATX_MTIME|STATX_UID|STATX_GID| + STATX_SIZE|STATX_BLOCKS))) + goto out_no_revalidate; + + /* Check whether the cached attributes are stale */ + do_update |= force_sync || nfs_attribute_cache_expired(inode); + cache_validity = READ_ONCE(NFS_I(inode)->cache_validity); + do_update |= cache_validity & + (NFS_INO_INVALID_ATTR|NFS_INO_INVALID_LABEL); + if (request_mask & STATX_ATIME) + do_update |= cache_validity & NFS_INO_INVALID_ATIME; + if (request_mask & (STATX_CTIME|STATX_MTIME)) + do_update |= cache_validity & NFS_INO_REVAL_PAGECACHE; + if (do_update) { + /* Update the attribute cache */ if (!(server->flags & NFS_MOUNT_NOAC)) nfs_readdirplus_parent_cache_miss(path->dentry); else nfs_readdirplus_parent_cache_hit(path->dentry); err = __nfs_revalidate_inode(server, inode); + if (err) + goto out; } else nfs_readdirplus_parent_cache_hit(path->dentry); - if (!err) { - generic_fillattr(inode, stat); - stat->ino = nfs_compat_user_ino64(NFS_FILEID(inode)); - if (S_ISDIR(inode->i_mode)) - stat->blksize = NFS_SERVER(inode)->dtsize; - } +out_no_revalidate: + /* Only return attributes that were revalidated. */ + stat->result_mask &= request_mask; +out_no_update: + generic_fillattr(inode, stat); + stat->ino = nfs_compat_user_ino64(NFS_FILEID(inode)); + if (S_ISDIR(inode->i_mode)) + stat->blksize = NFS_SERVER(inode)->dtsize; out: trace_nfs_getattr_exit(inode, err); return err; @@ -1144,7 +1170,6 @@ static int nfs_invalidate_mapping(struct inode *inode, struct address_space *map if (mapping->nrpages != 0) { if (S_ISREG(inode->i_mode)) { - unmap_mapping_range(mapping, 0, 0, 0); ret = nfs_sync_mapping(mapping); if (ret < 0) return ret; diff --git a/fs/nfs/io.c b/fs/nfs/io.c index 20fef85d2bb1..9034b4926909 100644 --- a/fs/nfs/io.c +++ b/fs/nfs/io.c @@ -99,7 +99,7 @@ static void nfs_block_buffered(struct nfs_inode *nfsi, struct inode *inode) { if (!test_bit(NFS_INO_ODIRECT, &nfsi->flags)) { set_bit(NFS_INO_ODIRECT, &nfsi->flags); - nfs_wb_all(inode); + nfs_sync_mapping(inode->i_mapping); } } diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c index 65a7e5da508c..04612c24d394 100644 --- a/fs/nfs/nfs4client.c +++ b/fs/nfs/nfs4client.c @@ -861,6 +861,7 @@ static int nfs4_set_client(struct nfs_server *server, set_bit(NFS_CS_MIGRATION, &cl_init.init_flags); if (test_bit(NFS_MIG_TSM_POSSIBLE, &server->mig_status)) set_bit(NFS_CS_TSM_POSSIBLE, &cl_init.init_flags); + server->port = rpc_get_port(addr); /* Allocate or find a client reference we can use */ clp = nfs_get_client(&cl_init); @@ -1123,19 +1124,36 @@ struct nfs_server *nfs4_create_referral_server(struct nfs_clone_mount *data, /* Initialise the client representation from the parent server */ nfs_server_copy_userdata(server, parent_server); - /* Get a client representation. 
- * Note: NFSv4 always uses TCP, */ + /* Get a client representation */ +#ifdef CONFIG_SUNRPC_XPRT_RDMA + rpc_set_port(data->addr, NFS_RDMA_PORT); error = nfs4_set_client(server, data->hostname, data->addr, data->addrlen, parent_client->cl_ipaddr, - rpc_protocol(parent_server->client), + XPRT_TRANSPORT_RDMA, + parent_server->client->cl_timeout, + parent_client->cl_mvops->minor_version, + parent_client->cl_net); + if (!error) + goto init_server; +#endif /* CONFIG_SUNRPC_XPRT_RDMA */ + + rpc_set_port(data->addr, NFS_PORT); + error = nfs4_set_client(server, data->hostname, + data->addr, + data->addrlen, + parent_client->cl_ipaddr, + XPRT_TRANSPORT_TCP, parent_server->client->cl_timeout, parent_client->cl_mvops->minor_version, parent_client->cl_net); if (error < 0) goto error; +#ifdef CONFIG_SUNRPC_XPRT_RDMA +init_server: +#endif error = nfs_init_server_rpcclient(server, parent_server->client->cl_timeout, data->authflavor); if (error < 0) goto error; diff --git a/fs/nfs/nfs4idmap.c b/fs/nfs/nfs4idmap.c index 30426c1a1bbd..22dc30a679a0 100644 --- a/fs/nfs/nfs4idmap.c +++ b/fs/nfs/nfs4idmap.c @@ -568,9 +568,13 @@ static int nfs_idmap_legacy_upcall(struct key_construction *cons, struct idmap_msg *im; struct idmap *idmap = (struct idmap *)aux; struct key *key = cons->key; - int ret = -ENOMEM; + int ret = -ENOKEY; + + if (!aux) + goto out1; /* msg and im are freed in idmap_pipe_destroy_msg */ + ret = -ENOMEM; data = kzalloc(sizeof(*data), GFP_KERNEL); if (!data) goto out1; diff --git a/fs/nfs/nfs4namespace.c b/fs/nfs/nfs4namespace.c index 8c3f327d858d..24f06dcc2b08 100644 --- a/fs/nfs/nfs4namespace.c +++ b/fs/nfs/nfs4namespace.c @@ -270,8 +270,6 @@ static struct vfsmount *try_location(struct nfs_clone_mount *mountdata, if (mountdata->addrlen == 0) continue; - rpc_set_port(mountdata->addr, NFS_PORT); - memcpy(page2, buf->data, buf->len); page2[buf->len] = '\0'; mountdata->hostname = page2; diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 17a03f2c4330..47f3c273245e 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -2020,7 +2020,7 @@ static int nfs4_open_reclaim(struct nfs4_state_owner *sp, struct nfs4_state *sta return ret; } -static int nfs4_handle_delegation_recall_error(struct nfs_server *server, struct nfs4_state *state, const nfs4_stateid *stateid, int err) +static int nfs4_handle_delegation_recall_error(struct nfs_server *server, struct nfs4_state *state, const nfs4_stateid *stateid, struct file_lock *fl, int err) { switch (err) { default: @@ -2067,7 +2067,11 @@ static int nfs4_handle_delegation_recall_error(struct nfs_server *server, struct return -EAGAIN; case -ENOMEM: case -NFS4ERR_DENIED: - /* kill_proc(fl->fl_pid, SIGLOST, 1); */ + if (fl) { + struct nfs4_lock_state *lsp = fl->fl_u.nfs4_fl.owner; + if (lsp) + set_bit(NFS_LOCK_LOST, &lsp->ls_flags); + } return 0; } return err; @@ -2103,7 +2107,7 @@ int nfs4_open_delegation_recall(struct nfs_open_context *ctx, err = nfs4_open_recover_helper(opendata, FMODE_READ); } nfs4_opendata_put(opendata); - return nfs4_handle_delegation_recall_error(server, state, stateid, err); + return nfs4_handle_delegation_recall_error(server, state, stateid, NULL, err); } static void nfs4_open_confirm_prepare(struct rpc_task *task, void *calldata) @@ -3150,6 +3154,11 @@ static void nfs4_close_done(struct rpc_task *task, void *data) struct nfs4_state *state = calldata->state; struct nfs_server *server = NFS_SERVER(calldata->inode); nfs4_stateid *res_stateid = NULL; + struct nfs4_exception exception = { + .state = state, + .inode = calldata->inode, + 
.stateid = &calldata->arg.stateid, + }; dprintk("%s: begin!\n", __func__); if (!nfs4_sequence_done(task, &calldata->res.seq_res)) @@ -3215,7 +3224,9 @@ static void nfs4_close_done(struct rpc_task *task, void *data) case -NFS4ERR_BAD_STATEID: break; default: - if (nfs4_async_handle_error(task, server, state, NULL) == -EAGAIN) + task->tk_status = nfs4_async_handle_exception(task, + server, task->tk_status, &exception); + if (exception.retry) goto out_restart; } nfs_clear_open_stateid(state, &calldata->arg.stateid, @@ -5759,6 +5770,10 @@ struct nfs4_delegreturndata { static void nfs4_delegreturn_done(struct rpc_task *task, void *calldata) { struct nfs4_delegreturndata *data = calldata; + struct nfs4_exception exception = { + .inode = data->inode, + .stateid = &data->stateid, + }; if (!nfs4_sequence_done(task, &data->res.seq_res)) return; @@ -5820,10 +5835,11 @@ static void nfs4_delegreturn_done(struct rpc_task *task, void *calldata) } /* Fallthrough */ default: - if (nfs4_async_handle_error(task, data->res.server, - NULL, NULL) == -EAGAIN) { + task->tk_status = nfs4_async_handle_exception(task, + data->res.server, task->tk_status, + &exception); + if (exception.retry) goto out_restart; - } } data->rpc_status = task->tk_status; return; @@ -6061,6 +6077,10 @@ static void nfs4_locku_release_calldata(void *data) static void nfs4_locku_done(struct rpc_task *task, void *data) { struct nfs4_unlockdata *calldata = data; + struct nfs4_exception exception = { + .inode = calldata->lsp->ls_state->inode, + .stateid = &calldata->arg.stateid, + }; if (!nfs4_sequence_done(task, &calldata->res.seq_res)) return; @@ -6084,8 +6104,10 @@ static void nfs4_locku_done(struct rpc_task *task, void *data) rpc_restart_call_prepare(task); break; default: - if (nfs4_async_handle_error(task, calldata->server, - NULL, NULL) == -EAGAIN) + task->tk_status = nfs4_async_handle_exception(task, + calldata->server, task->tk_status, + &exception); + if (exception.retry) rpc_restart_call_prepare(task); } nfs_release_seqid(calldata->arg.seqid); @@ -6741,7 +6763,7 @@ int nfs4_lock_delegation_recall(struct file_lock *fl, struct nfs4_state *state, if (err != 0) return err; err = _nfs4_do_setlk(state, F_SETLK, fl, NFS_LOCK_NEW); - return nfs4_handle_delegation_recall_error(server, state, stateid, err); + return nfs4_handle_delegation_recall_error(server, state, stateid, fl, err); } struct nfs_release_lockowner_data { diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index e4f4a09ed9f4..91a4d4eeb235 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -1482,6 +1482,7 @@ static int nfs4_reclaim_locks(struct nfs4_state *state, const struct nfs4_state_ struct inode *inode = state->inode; struct nfs_inode *nfsi = NFS_I(inode); struct file_lock *fl; + struct nfs4_lock_state *lsp; int status = 0; struct file_lock_context *flctx = inode->i_flctx; struct list_head *list; @@ -1522,7 +1523,9 @@ restart: case -NFS4ERR_DENIED: case -NFS4ERR_RECLAIM_BAD: case -NFS4ERR_RECLAIM_CONFLICT: - /* kill_proc(fl->fl_pid, SIGLOST, 1); */ + lsp = fl->fl_u.nfs4_fl.owner; + if (lsp) + set_bit(NFS_LOCK_LOST, &lsp->ls_flags); status = 0; } spin_lock(&flctx->flc_lock); diff --git a/fs/nfs/nfs4sysctl.c b/fs/nfs/nfs4sysctl.c index 0d91d84e5822..c394e4447100 100644 --- a/fs/nfs/nfs4sysctl.c +++ b/fs/nfs/nfs4sysctl.c @@ -32,7 +32,7 @@ static struct ctl_table nfs4_cb_sysctls[] = { .data = &nfs_idmap_cache_timeout, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = proc_dointvec_jiffies, + .proc_handler = proc_dointvec, }, { } }; diff --git a/fs/nfs/nfs4xdr.c 
b/fs/nfs/nfs4xdr.c index 77c6729e57f0..65c9c4175145 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -7678,6 +7678,22 @@ nfs4_stat_to_errno(int stat) .p_name = #proc, \ } +#if defined(CONFIG_NFS_V4_1) +#define PROC41(proc, argtype, restype) \ + PROC(proc, argtype, restype) +#else +#define PROC41(proc, argtype, restype) \ + STUB(proc) +#endif + +#if defined(CONFIG_NFS_V4_2) +#define PROC42(proc, argtype, restype) \ + PROC(proc, argtype, restype) +#else +#define PROC42(proc, argtype, restype) \ + STUB(proc) +#endif + const struct rpc_procinfo nfs4_procedures[] = { PROC(READ, enc_read, dec_read), PROC(WRITE, enc_write, dec_write), @@ -7698,7 +7714,6 @@ const struct rpc_procinfo nfs4_procedures[] = { PROC(ACCESS, enc_access, dec_access), PROC(GETATTR, enc_getattr, dec_getattr), PROC(LOOKUP, enc_lookup, dec_lookup), - PROC(LOOKUPP, enc_lookupp, dec_lookupp), PROC(LOOKUP_ROOT, enc_lookup_root, dec_lookup_root), PROC(REMOVE, enc_remove, dec_remove), PROC(RENAME, enc_rename, dec_rename), @@ -7717,33 +7732,30 @@ const struct rpc_procinfo nfs4_procedures[] = { PROC(RELEASE_LOCKOWNER, enc_release_lockowner, dec_release_lockowner), PROC(SECINFO, enc_secinfo, dec_secinfo), PROC(FSID_PRESENT, enc_fsid_present, dec_fsid_present), -#if defined(CONFIG_NFS_V4_1) - PROC(EXCHANGE_ID, enc_exchange_id, dec_exchange_id), - PROC(CREATE_SESSION, enc_create_session, dec_create_session), - PROC(DESTROY_SESSION, enc_destroy_session, dec_destroy_session), - PROC(SEQUENCE, enc_sequence, dec_sequence), - PROC(GET_LEASE_TIME, enc_get_lease_time, dec_get_lease_time), - PROC(RECLAIM_COMPLETE, enc_reclaim_complete, dec_reclaim_complete), - PROC(GETDEVICEINFO, enc_getdeviceinfo, dec_getdeviceinfo), - PROC(LAYOUTGET, enc_layoutget, dec_layoutget), - PROC(LAYOUTCOMMIT, enc_layoutcommit, dec_layoutcommit), - PROC(LAYOUTRETURN, enc_layoutreturn, dec_layoutreturn), - PROC(SECINFO_NO_NAME, enc_secinfo_no_name, dec_secinfo_no_name), - PROC(TEST_STATEID, enc_test_stateid, dec_test_stateid), - PROC(FREE_STATEID, enc_free_stateid, dec_free_stateid), + PROC41(EXCHANGE_ID, enc_exchange_id, dec_exchange_id), + PROC41(CREATE_SESSION, enc_create_session, dec_create_session), + PROC41(DESTROY_SESSION, enc_destroy_session, dec_destroy_session), + PROC41(SEQUENCE, enc_sequence, dec_sequence), + PROC41(GET_LEASE_TIME, enc_get_lease_time, dec_get_lease_time), + PROC41(RECLAIM_COMPLETE,enc_reclaim_complete, dec_reclaim_complete), + PROC41(GETDEVICEINFO, enc_getdeviceinfo, dec_getdeviceinfo), + PROC41(LAYOUTGET, enc_layoutget, dec_layoutget), + PROC41(LAYOUTCOMMIT, enc_layoutcommit, dec_layoutcommit), + PROC41(LAYOUTRETURN, enc_layoutreturn, dec_layoutreturn), + PROC41(SECINFO_NO_NAME, enc_secinfo_no_name, dec_secinfo_no_name), + PROC41(TEST_STATEID, enc_test_stateid, dec_test_stateid), + PROC41(FREE_STATEID, enc_free_stateid, dec_free_stateid), STUB(GETDEVICELIST), - PROC(BIND_CONN_TO_SESSION, + PROC41(BIND_CONN_TO_SESSION, enc_bind_conn_to_session, dec_bind_conn_to_session), - PROC(DESTROY_CLIENTID, enc_destroy_clientid, dec_destroy_clientid), -#endif /* CONFIG_NFS_V4_1 */ -#ifdef CONFIG_NFS_V4_2 - PROC(SEEK, enc_seek, dec_seek), - PROC(ALLOCATE, enc_allocate, dec_allocate), - PROC(DEALLOCATE, enc_deallocate, dec_deallocate), - PROC(LAYOUTSTATS, enc_layoutstats, dec_layoutstats), - PROC(CLONE, enc_clone, dec_clone), - PROC(COPY, enc_copy, dec_copy), -#endif /* CONFIG_NFS_V4_2 */ + PROC41(DESTROY_CLIENTID,enc_destroy_clientid, dec_destroy_clientid), + PROC42(SEEK, enc_seek, dec_seek), + PROC42(ALLOCATE, enc_allocate, dec_allocate), + 
PROC42(DEALLOCATE, enc_deallocate, dec_deallocate), + PROC42(LAYOUTSTATS, enc_layoutstats, dec_layoutstats), + PROC42(CLONE, enc_clone, dec_clone), + PROC42(COPY, enc_copy, dec_copy), + PROC(LOOKUPP, enc_lookupp, dec_lookupp), }; static unsigned int nfs_version4_counts[ARRAY_SIZE(nfs4_procedures)]; diff --git a/fs/nfs/nfstrace.h b/fs/nfs/nfstrace.h index 610d89d8942e..bd60f8d1e181 100644 --- a/fs/nfs/nfstrace.h +++ b/fs/nfs/nfstrace.h @@ -797,15 +797,15 @@ TRACE_EVENT(nfs_readpage_done, ) ); -/* - * XXX: I tried using NFS_UNSTABLE and friends in this table, but they - * all evaluate to 0 for some reason, even if I include linux/nfs.h. - */ +TRACE_DEFINE_ENUM(NFS_UNSTABLE); +TRACE_DEFINE_ENUM(NFS_DATA_SYNC); +TRACE_DEFINE_ENUM(NFS_FILE_SYNC); + #define nfs_show_stable(stable) \ __print_symbolic(stable, \ - { 0, " (UNSTABLE)" }, \ - { 1, " (DATA_SYNC)" }, \ - { 2, " (FILE_SYNC)" }) + { NFS_UNSTABLE, "UNSTABLE" }, \ + { NFS_DATA_SYNC, "DATA_SYNC" }, \ + { NFS_FILE_SYNC, "FILE_SYNC" }) TRACE_EVENT(nfs_initiate_write, TP_PROTO( @@ -838,12 +838,12 @@ TRACE_EVENT(nfs_initiate_write, TP_printk( "fileid=%02x:%02x:%llu fhandle=0x%08x " - "offset=%lld count=%lu stable=%d%s", + "offset=%lld count=%lu stable=%s", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long long)__entry->fileid, __entry->fhandle, __entry->offset, __entry->count, - __entry->stable, nfs_show_stable(__entry->stable) + nfs_show_stable(__entry->stable) ) ); @@ -882,13 +882,13 @@ TRACE_EVENT(nfs_writeback_done, TP_printk( "fileid=%02x:%02x:%llu fhandle=0x%08x " - "offset=%lld status=%d stable=%d%s " + "offset=%lld status=%d stable=%s " "verifier 0x%016llx", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long long)__entry->fileid, __entry->fhandle, __entry->offset, __entry->status, - __entry->stable, nfs_show_stable(__entry->stable), + nfs_show_stable(__entry->stable), __entry->verifier ) ); diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index d0543e19098a..18a7626ac638 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -537,7 +537,7 @@ EXPORT_SYMBOL_GPL(nfs_pgio_header_free); * @cinfo: Commit information for the call (writes only) */ static void nfs_pgio_rpcsetup(struct nfs_pgio_header *hdr, - unsigned int count, unsigned int offset, + unsigned int count, int how, struct nfs_commit_info *cinfo) { struct nfs_page *req = hdr->req; @@ -546,10 +546,10 @@ static void nfs_pgio_rpcsetup(struct nfs_pgio_header *hdr, * NB: take care not to mess about with hdr->commit et al. 
*/ hdr->args.fh = NFS_FH(hdr->inode); - hdr->args.offset = req_offset(req) + offset; + hdr->args.offset = req_offset(req); /* pnfs_set_layoutcommit needs this */ hdr->mds_offset = hdr->args.offset; - hdr->args.pgbase = req->wb_pgbase + offset; + hdr->args.pgbase = req->wb_pgbase; hdr->args.pages = hdr->page_array.pagevec; hdr->args.count = count; hdr->args.context = get_nfs_open_context(req->wb_context); @@ -789,7 +789,7 @@ int nfs_generic_pgio(struct nfs_pageio_descriptor *desc, desc->pg_ioflags &= ~FLUSH_COND_STABLE; /* Set up the argument struct */ - nfs_pgio_rpcsetup(hdr, mirror->pg_count, 0, desc->pg_ioflags, &cinfo); + nfs_pgio_rpcsetup(hdr, mirror->pg_count, desc->pg_ioflags, &cinfo); desc->pg_rpc_callops = &nfs_pgio_common_ops; return 0; } diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index d602fe9e1ac8..c13e826614b5 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -655,7 +655,7 @@ pnfs_mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo, return 0; list_for_each_entry_safe(lseg, next, &lo->plh_segs, pls_list) if (pnfs_match_lseg_recall(lseg, recall_range, seq)) { - dprintk("%s: freeing lseg %p iomode %d seq %u" + dprintk("%s: freeing lseg %p iomode %d seq %u " "offset %llu length %llu\n", __func__, lseg, lseg->pls_range.iomode, lseg->pls_seq, lseg->pls_range.offset, lseg->pls_range.length); @@ -2255,7 +2255,7 @@ pnfs_write_through_mds(struct nfs_pageio_descriptor *desc, nfs_pageio_reset_write_mds(desc); mirror->pg_recoalesce = 1; } - hdr->release(hdr); + hdr->completion_ops->completion(hdr); } static enum pnfs_try_status @@ -2378,7 +2378,7 @@ pnfs_read_through_mds(struct nfs_pageio_descriptor *desc, nfs_pageio_reset_read_mds(desc); mirror->pg_recoalesce = 1; } - hdr->release(hdr); + hdr->completion_ops->completion(hdr); } /* diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h index 8d507c361d98..daf6cbf5c15f 100644 --- a/fs/nfs/pnfs.h +++ b/fs/nfs/pnfs.h @@ -40,6 +40,7 @@ enum { NFS_LSEG_ROC, /* roc bit received from server */ NFS_LSEG_LAYOUTCOMMIT, /* layoutcommit bit set for layoutcommit */ NFS_LSEG_LAYOUTRETURN, /* layoutreturn bit set for layoutreturn */ + NFS_LSEG_UNAVAILABLE, /* unavailable bit set for temporary problem */ }; /* Individual ip address */ @@ -86,6 +87,7 @@ enum pnfs_try_status { */ #define NFS4_DEF_DS_TIMEO 600 /* in tenths of a second */ #define NFS4_DEF_DS_RETRANS 5 +#define PNFS_DEVICE_RETRY_TIMEOUT (120*HZ) /* error codes for internal use */ #define NFS4ERR_RESET_TO_MDS 12001 @@ -524,8 +526,10 @@ static inline int pnfs_return_layout(struct inode *ino) struct nfs_inode *nfsi = NFS_I(ino); struct nfs_server *nfss = NFS_SERVER(ino); - if (pnfs_enabled_sb(nfss) && nfsi->layout) + if (pnfs_enabled_sb(nfss) && nfsi->layout) { + set_bit(NFS_LAYOUT_RETURN_REQUESTED, &nfsi->layout->plh_flags); return _pnfs_return_layout(ino); + } return 0; } diff --git a/fs/nfs/pnfs_dev.c b/fs/nfs/pnfs_dev.c index 2961fcd7a2df..e8a07b3f9aaa 100644 --- a/fs/nfs/pnfs_dev.c +++ b/fs/nfs/pnfs_dev.c @@ -43,7 +43,6 @@ #define NFS4_DEVICE_ID_HASH_SIZE (1 << NFS4_DEVICE_ID_HASH_BITS) #define NFS4_DEVICE_ID_HASH_MASK (NFS4_DEVICE_ID_HASH_SIZE - 1) -#define PNFS_DEVICE_RETRY_TIMEOUT (120*HZ) static struct hlist_head nfs4_deviceid_cache[NFS4_DEVICE_ID_HASH_SIZE]; static DEFINE_SPINLOCK(nfs4_deviceid_lock); diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 12b2d477836b..7428a669d7a7 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -1835,6 +1835,8 @@ static void nfs_commit_release_pages(struct nfs_commit_data *data) set_bit(NFS_CONTEXT_RESEND_WRITES, &req->wb_context->flags); next: 
nfs_unlock_and_release_request(req); + /* Latency breaker */ + cond_resched(); } nfss = NFS_SERVER(data->inode); if (atomic_long_read(&nfss->writeback) < NFS_CONGESTION_OFF_THRESH) diff --git a/include/linux/lockd/lockd.h b/include/linux/lockd/lockd.h index d7d313fb9cd4..4fd95dbeb52f 100644 --- a/include/linux/lockd/lockd.h +++ b/include/linux/lockd/lockd.h @@ -17,6 +17,7 @@ #include <net/ipv6.h> #include <linux/fs.h> #include <linux/kref.h> +#include <linux/refcount.h> #include <linux/utsname.h> #include <linux/lockd/bind.h> #include <linux/lockd/xdr.h> @@ -58,7 +59,7 @@ struct nlm_host { u32 h_state; /* pseudo-state counter */ u32 h_nsmstate; /* true remote NSM state */ u32 h_pidcount; /* Pseudopids */ - atomic_t h_count; /* reference count */ + refcount_t h_count; /* reference count */ struct mutex h_mutex; /* mutex for pmap binding */ unsigned long h_nextrebind; /* next portmap call */ unsigned long h_expires; /* eligible for GC */ @@ -83,7 +84,7 @@ struct nlm_host { struct nsm_handle { struct list_head sm_link; - atomic_t sm_count; + refcount_t sm_count; char *sm_mon_name; char *sm_name; struct sockaddr_storage sm_addr; @@ -122,7 +123,7 @@ static inline struct sockaddr *nlm_srcaddr(const struct nlm_host *host) */ struct nlm_lockowner { struct list_head list; - atomic_t count; + refcount_t count; struct nlm_host *host; fl_owner_t owner; @@ -136,7 +137,7 @@ struct nlm_wait; */ #define NLMCLNT_OHSIZE ((__NEW_UTS_LEN) + 10u) struct nlm_rqst { - atomic_t a_count; + refcount_t a_count; unsigned int a_flags; /* initial RPC task flags */ struct nlm_host * a_host; /* host handle */ struct nlm_args a_args; /* arguments */ diff --git a/include/linux/nfs4.h b/include/linux/nfs4.h index 47adac640191..57ffaa20d564 100644 --- a/include/linux/nfs4.h +++ b/include/linux/nfs4.h @@ -457,7 +457,12 @@ enum lock_type4 { #define NFS4_DEBUG 1 -/* Index of predefined Linux client operations */ +/* + * Index of predefined Linux client operations + * + * To ensure that /proc/net/rpc/nfs remains correctly ordered, please + * append only to this enum when adding new client operations. 
+ */ enum { NFSPROC4_CLNT_NULL = 0, /* Unused */ @@ -480,7 +485,6 @@ enum { NFSPROC4_CLNT_ACCESS, NFSPROC4_CLNT_GETATTR, NFSPROC4_CLNT_LOOKUP, - NFSPROC4_CLNT_LOOKUPP, NFSPROC4_CLNT_LOOKUP_ROOT, NFSPROC4_CLNT_REMOVE, NFSPROC4_CLNT_RENAME, @@ -500,7 +504,6 @@ enum { NFSPROC4_CLNT_SECINFO, NFSPROC4_CLNT_FSID_PRESENT, - /* nfs41 */ NFSPROC4_CLNT_EXCHANGE_ID, NFSPROC4_CLNT_CREATE_SESSION, NFSPROC4_CLNT_DESTROY_SESSION, @@ -518,13 +521,14 @@ enum { NFSPROC4_CLNT_BIND_CONN_TO_SESSION, NFSPROC4_CLNT_DESTROY_CLIENTID, - /* nfs42 */ NFSPROC4_CLNT_SEEK, NFSPROC4_CLNT_ALLOCATE, NFSPROC4_CLNT_DEALLOCATE, NFSPROC4_CLNT_LAYOUTSTATS, NFSPROC4_CLNT_CLONE, NFSPROC4_CLNT_COPY, + + NFSPROC4_CLNT_LOOKUPP, }; /* nfs41 types */ diff --git a/include/linux/sunrpc/clnt.h b/include/linux/sunrpc/clnt.h index 71c237e8240e..ed761f751ecb 100644 --- a/include/linux/sunrpc/clnt.h +++ b/include/linux/sunrpc/clnt.h @@ -179,7 +179,6 @@ struct rpc_task *rpc_call_null(struct rpc_clnt *clnt, struct rpc_cred *cred, int rpc_restart_call_prepare(struct rpc_task *); int rpc_restart_call(struct rpc_task *); void rpc_setbufsize(struct rpc_clnt *, unsigned int, unsigned int); -int rpc_protocol(struct rpc_clnt *); struct net * rpc_net_ns(struct rpc_clnt *); size_t rpc_max_payload(struct rpc_clnt *); size_t rpc_max_bc_payload(struct rpc_clnt *); diff --git a/include/linux/sunrpc/xprtrdma.h b/include/linux/sunrpc/xprtrdma.h index 221b7a2e5406..5859563e3c1f 100644 --- a/include/linux/sunrpc/xprtrdma.h +++ b/include/linux/sunrpc/xprtrdma.h @@ -64,7 +64,7 @@ enum rpcrdma_memreg { RPCRDMA_MEMWINDOWS, RPCRDMA_MEMWINDOWS_ASYNC, RPCRDMA_MTHCAFMR, - RPCRDMA_FRMR, + RPCRDMA_FRWR, RPCRDMA_ALLPHYSICAL, RPCRDMA_LAST }; diff --git a/include/trace/events/rdma.h b/include/trace/events/rdma.h new file mode 100644 index 000000000000..aa19afc73a4e --- /dev/null +++ b/include/trace/events/rdma.h @@ -0,0 +1,129 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2017 Oracle. All rights reserved. 
+ */ + +/* + * enum ib_event_type, from include/rdma/ib_verbs.h + */ + +#define IB_EVENT_LIST \ + ib_event(CQ_ERR) \ + ib_event(QP_FATAL) \ + ib_event(QP_REQ_ERR) \ + ib_event(QP_ACCESS_ERR) \ + ib_event(COMM_EST) \ + ib_event(SQ_DRAINED) \ + ib_event(PATH_MIG) \ + ib_event(PATH_MIG_ERR) \ + ib_event(DEVICE_FATAL) \ + ib_event(PORT_ACTIVE) \ + ib_event(PORT_ERR) \ + ib_event(LID_CHANGE) \ + ib_event(PKEY_CHANGE) \ + ib_event(SM_CHANGE) \ + ib_event(SRQ_ERR) \ + ib_event(SRQ_LIMIT_REACHED) \ + ib_event(QP_LAST_WQE_REACHED) \ + ib_event(CLIENT_REREGISTER) \ + ib_event(GID_CHANGE) \ + ib_event_end(WQ_FATAL) + +#undef ib_event +#undef ib_event_end + +#define ib_event(x) TRACE_DEFINE_ENUM(IB_EVENT_##x); +#define ib_event_end(x) TRACE_DEFINE_ENUM(IB_EVENT_##x); + +IB_EVENT_LIST + +#undef ib_event +#undef ib_event_end + +#define ib_event(x) { IB_EVENT_##x, #x }, +#define ib_event_end(x) { IB_EVENT_##x, #x } + +#define rdma_show_ib_event(x) \ + __print_symbolic(x, IB_EVENT_LIST) + +/* + * enum ib_wc_status type, from include/rdma/ib_verbs.h + */ +#define IB_WC_STATUS_LIST \ + ib_wc_status(SUCCESS) \ + ib_wc_status(LOC_LEN_ERR) \ + ib_wc_status(LOC_QP_OP_ERR) \ + ib_wc_status(LOC_EEC_OP_ERR) \ + ib_wc_status(LOC_PROT_ERR) \ + ib_wc_status(WR_FLUSH_ERR) \ + ib_wc_status(MW_BIND_ERR) \ + ib_wc_status(BAD_RESP_ERR) \ + ib_wc_status(LOC_ACCESS_ERR) \ + ib_wc_status(REM_INV_REQ_ERR) \ + ib_wc_status(REM_ACCESS_ERR) \ + ib_wc_status(REM_OP_ERR) \ + ib_wc_status(RETRY_EXC_ERR) \ + ib_wc_status(RNR_RETRY_EXC_ERR) \ + ib_wc_status(LOC_RDD_VIOL_ERR) \ + ib_wc_status(REM_INV_RD_REQ_ERR) \ + ib_wc_status(REM_ABORT_ERR) \ + ib_wc_status(INV_EECN_ERR) \ + ib_wc_status(INV_EEC_STATE_ERR) \ + ib_wc_status(FATAL_ERR) \ + ib_wc_status(RESP_TIMEOUT_ERR) \ + ib_wc_status_end(GENERAL_ERR) + +#undef ib_wc_status +#undef ib_wc_status_end + +#define ib_wc_status(x) TRACE_DEFINE_ENUM(IB_WC_##x); +#define ib_wc_status_end(x) TRACE_DEFINE_ENUM(IB_WC_##x); + +IB_WC_STATUS_LIST + +#undef ib_wc_status +#undef ib_wc_status_end + +#define ib_wc_status(x) { IB_WC_##x, #x }, +#define ib_wc_status_end(x) { IB_WC_##x, #x } + +#define rdma_show_wc_status(x) \ + __print_symbolic(x, IB_WC_STATUS_LIST) + +/* + * enum rdma_cm_event_type, from include/rdma/rdma_cm.h + */ +#define RDMA_CM_EVENT_LIST \ + rdma_cm_event(ADDR_RESOLVED) \ + rdma_cm_event(ADDR_ERROR) \ + rdma_cm_event(ROUTE_RESOLVED) \ + rdma_cm_event(ROUTE_ERROR) \ + rdma_cm_event(CONNECT_REQUEST) \ + rdma_cm_event(CONNECT_RESPONSE) \ + rdma_cm_event(CONNECT_ERROR) \ + rdma_cm_event(UNREACHABLE) \ + rdma_cm_event(REJECTED) \ + rdma_cm_event(ESTABLISHED) \ + rdma_cm_event(DISCONNECTED) \ + rdma_cm_event(DEVICE_REMOVAL) \ + rdma_cm_event(MULTICAST_JOIN) \ + rdma_cm_event(MULTICAST_ERROR) \ + rdma_cm_event(ADDR_CHANGE) \ + rdma_cm_event_end(TIMEWAIT_EXIT) + +#undef rdma_cm_event +#undef rdma_cm_event_end + +#define rdma_cm_event(x) TRACE_DEFINE_ENUM(RDMA_CM_EVENT_##x); +#define rdma_cm_event_end(x) TRACE_DEFINE_ENUM(RDMA_CM_EVENT_##x); + +RDMA_CM_EVENT_LIST + +#undef rdma_cm_event +#undef rdma_cm_event_end + +#define rdma_cm_event(x) { RDMA_CM_EVENT_##x, #x }, +#define rdma_cm_event_end(x) { RDMA_CM_EVENT_##x, #x } + +#define rdma_show_cm_event(x) \ + __print_symbolic(x, RDMA_CM_EVENT_LIST) diff --git a/include/trace/events/rpcrdma.h b/include/trace/events/rpcrdma.h new file mode 100644 index 000000000000..50ed3f8bf534 --- /dev/null +++ b/include/trace/events/rpcrdma.h @@ -0,0 +1,890 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2017 Oracle. All rights reserved. 
+ */ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM rpcrdma + +#if !defined(_TRACE_RPCRDMA_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_RPCRDMA_H + +#include <linux/tracepoint.h> +#include <trace/events/rdma.h> + +/** + ** Event classes + **/ + +DECLARE_EVENT_CLASS(xprtrdma_reply_event, + TP_PROTO( + const struct rpcrdma_rep *rep + ), + + TP_ARGS(rep), + + TP_STRUCT__entry( + __field(const void *, rep) + __field(const void *, r_xprt) + __field(u32, xid) + __field(u32, version) + __field(u32, proc) + ), + + TP_fast_assign( + __entry->rep = rep; + __entry->r_xprt = rep->rr_rxprt; + __entry->xid = be32_to_cpu(rep->rr_xid); + __entry->version = be32_to_cpu(rep->rr_vers); + __entry->proc = be32_to_cpu(rep->rr_proc); + ), + + TP_printk("rxprt %p xid=0x%08x rep=%p: version %u proc %u", + __entry->r_xprt, __entry->xid, __entry->rep, + __entry->version, __entry->proc + ) +); + +#define DEFINE_REPLY_EVENT(name) \ + DEFINE_EVENT(xprtrdma_reply_event, name, \ + TP_PROTO( \ + const struct rpcrdma_rep *rep \ + ), \ + TP_ARGS(rep)) + +DECLARE_EVENT_CLASS(xprtrdma_rxprt, + TP_PROTO( + const struct rpcrdma_xprt *r_xprt + ), + + TP_ARGS(r_xprt), + + TP_STRUCT__entry( + __field(const void *, r_xprt) + __string(addr, rpcrdma_addrstr(r_xprt)) + __string(port, rpcrdma_portstr(r_xprt)) + ), + + TP_fast_assign( + __entry->r_xprt = r_xprt; + __assign_str(addr, rpcrdma_addrstr(r_xprt)); + __assign_str(port, rpcrdma_portstr(r_xprt)); + ), + + TP_printk("peer=[%s]:%s r_xprt=%p", + __get_str(addr), __get_str(port), __entry->r_xprt + ) +); + +#define DEFINE_RXPRT_EVENT(name) \ + DEFINE_EVENT(xprtrdma_rxprt, name, \ + TP_PROTO( \ + const struct rpcrdma_xprt *r_xprt \ + ), \ + TP_ARGS(r_xprt)) + +DECLARE_EVENT_CLASS(xprtrdma_rdch_event, + TP_PROTO( + const struct rpc_task *task, + unsigned int pos, + struct rpcrdma_mr *mr, + int nsegs + ), + + TP_ARGS(task, pos, mr, nsegs), + + TP_STRUCT__entry( + __field(unsigned int, task_id) + __field(unsigned int, client_id) + __field(const void *, mr) + __field(unsigned int, pos) + __field(int, nents) + __field(u32, handle) + __field(u32, length) + __field(u64, offset) + __field(int, nsegs) + ), + + TP_fast_assign( + __entry->task_id = task->tk_pid; + __entry->client_id = task->tk_client->cl_clid; + __entry->mr = mr; + __entry->pos = pos; + __entry->nents = mr->mr_nents; + __entry->handle = mr->mr_handle; + __entry->length = mr->mr_length; + __entry->offset = mr->mr_offset; + __entry->nsegs = nsegs; + ), + + TP_printk("task:%u@%u mr=%p pos=%u %u@0x%016llx:0x%08x (%s)", + __entry->task_id, __entry->client_id, __entry->mr, + __entry->pos, __entry->length, + (unsigned long long)__entry->offset, __entry->handle, + __entry->nents < __entry->nsegs ? 
"more" : "last" + ) +); + +#define DEFINE_RDCH_EVENT(name) \ + DEFINE_EVENT(xprtrdma_rdch_event, name, \ + TP_PROTO( \ + const struct rpc_task *task, \ + unsigned int pos, \ + struct rpcrdma_mr *mr, \ + int nsegs \ + ), \ + TP_ARGS(task, pos, mr, nsegs)) + +DECLARE_EVENT_CLASS(xprtrdma_wrch_event, + TP_PROTO( + const struct rpc_task *task, + struct rpcrdma_mr *mr, + int nsegs + ), + + TP_ARGS(task, mr, nsegs), + + TP_STRUCT__entry( + __field(unsigned int, task_id) + __field(unsigned int, client_id) + __field(const void *, mr) + __field(int, nents) + __field(u32, handle) + __field(u32, length) + __field(u64, offset) + __field(int, nsegs) + ), + + TP_fast_assign( + __entry->task_id = task->tk_pid; + __entry->client_id = task->tk_client->cl_clid; + __entry->mr = mr; + __entry->nents = mr->mr_nents; + __entry->handle = mr->mr_handle; + __entry->length = mr->mr_length; + __entry->offset = mr->mr_offset; + __entry->nsegs = nsegs; + ), + + TP_printk("task:%u@%u mr=%p %u@0x%016llx:0x%08x (%s)", + __entry->task_id, __entry->client_id, __entry->mr, + __entry->length, (unsigned long long)__entry->offset, + __entry->handle, + __entry->nents < __entry->nsegs ? "more" : "last" + ) +); + +#define DEFINE_WRCH_EVENT(name) \ + DEFINE_EVENT(xprtrdma_wrch_event, name, \ + TP_PROTO( \ + const struct rpc_task *task, \ + struct rpcrdma_mr *mr, \ + int nsegs \ + ), \ + TP_ARGS(task, mr, nsegs)) + +TRACE_DEFINE_ENUM(FRWR_IS_INVALID); +TRACE_DEFINE_ENUM(FRWR_IS_VALID); +TRACE_DEFINE_ENUM(FRWR_FLUSHED_FR); +TRACE_DEFINE_ENUM(FRWR_FLUSHED_LI); + +#define xprtrdma_show_frwr_state(x) \ + __print_symbolic(x, \ + { FRWR_IS_INVALID, "INVALID" }, \ + { FRWR_IS_VALID, "VALID" }, \ + { FRWR_FLUSHED_FR, "FLUSHED_FR" }, \ + { FRWR_FLUSHED_LI, "FLUSHED_LI" }) + +DECLARE_EVENT_CLASS(xprtrdma_frwr_done, + TP_PROTO( + const struct ib_wc *wc, + const struct rpcrdma_frwr *frwr + ), + + TP_ARGS(wc, frwr), + + TP_STRUCT__entry( + __field(const void *, mr) + __field(unsigned int, state) + __field(unsigned int, status) + __field(unsigned int, vendor_err) + ), + + TP_fast_assign( + __entry->mr = container_of(frwr, struct rpcrdma_mr, frwr); + __entry->state = frwr->fr_state; + __entry->status = wc->status; + __entry->vendor_err = __entry->status ? 
wc->vendor_err : 0; + ), + + TP_printk( + "mr=%p state=%s: %s (%u/0x%x)", + __entry->mr, xprtrdma_show_frwr_state(__entry->state), + rdma_show_wc_status(__entry->status), + __entry->status, __entry->vendor_err + ) +); + +#define DEFINE_FRWR_DONE_EVENT(name) \ + DEFINE_EVENT(xprtrdma_frwr_done, name, \ + TP_PROTO( \ + const struct ib_wc *wc, \ + const struct rpcrdma_frwr *frwr \ + ), \ + TP_ARGS(wc, frwr)) + +DECLARE_EVENT_CLASS(xprtrdma_mr, + TP_PROTO( + const struct rpcrdma_mr *mr + ), + + TP_ARGS(mr), + + TP_STRUCT__entry( + __field(const void *, mr) + __field(u32, handle) + __field(u32, length) + __field(u64, offset) + ), + + TP_fast_assign( + __entry->mr = mr; + __entry->handle = mr->mr_handle; + __entry->length = mr->mr_length; + __entry->offset = mr->mr_offset; + ), + + TP_printk("mr=%p %u@0x%016llx:0x%08x", + __entry->mr, __entry->length, + (unsigned long long)__entry->offset, + __entry->handle + ) +); + +#define DEFINE_MR_EVENT(name) \ + DEFINE_EVENT(xprtrdma_mr, name, \ + TP_PROTO( \ + const struct rpcrdma_mr *mr \ + ), \ + TP_ARGS(mr)) + +DECLARE_EVENT_CLASS(xprtrdma_cb_event, + TP_PROTO( + const struct rpc_rqst *rqst + ), + + TP_ARGS(rqst), + + TP_STRUCT__entry( + __field(const void *, rqst) + __field(const void *, rep) + __field(const void *, req) + __field(u32, xid) + ), + + TP_fast_assign( + __entry->rqst = rqst; + __entry->req = rpcr_to_rdmar(rqst); + __entry->rep = rpcr_to_rdmar(rqst)->rl_reply; + __entry->xid = be32_to_cpu(rqst->rq_xid); + ), + + TP_printk("xid=0x%08x, rqst=%p req=%p rep=%p", + __entry->xid, __entry->rqst, __entry->req, __entry->rep + ) +); + +#define DEFINE_CB_EVENT(name) \ + DEFINE_EVENT(xprtrdma_cb_event, name, \ + TP_PROTO( \ + const struct rpc_rqst *rqst \ + ), \ + TP_ARGS(rqst)) + +/** + ** Connection events + **/ + +TRACE_EVENT(xprtrdma_conn_upcall, + TP_PROTO( + const struct rpcrdma_xprt *r_xprt, + struct rdma_cm_event *event + ), + + TP_ARGS(r_xprt, event), + + TP_STRUCT__entry( + __field(const void *, r_xprt) + __field(unsigned int, event) + __field(int, status) + __string(addr, rpcrdma_addrstr(r_xprt)) + __string(port, rpcrdma_portstr(r_xprt)) + ), + + TP_fast_assign( + __entry->r_xprt = r_xprt; + __entry->event = event->event; + __entry->status = event->status; + __assign_str(addr, rpcrdma_addrstr(r_xprt)); + __assign_str(port, rpcrdma_portstr(r_xprt)); + ), + + TP_printk("peer=[%s]:%s r_xprt=%p: %s (%u/%d)", + __get_str(addr), __get_str(port), + __entry->r_xprt, rdma_show_cm_event(__entry->event), + __entry->event, __entry->status + ) +); + +TRACE_EVENT(xprtrdma_disconnect, + TP_PROTO( + const struct rpcrdma_xprt *r_xprt, + int status + ), + + TP_ARGS(r_xprt, status), + + TP_STRUCT__entry( + __field(const void *, r_xprt) + __field(int, status) + __field(int, connected) + __string(addr, rpcrdma_addrstr(r_xprt)) + __string(port, rpcrdma_portstr(r_xprt)) + ), + + TP_fast_assign( + __entry->r_xprt = r_xprt; + __entry->status = status; + __entry->connected = r_xprt->rx_ep.rep_connected; + __assign_str(addr, rpcrdma_addrstr(r_xprt)); + __assign_str(port, rpcrdma_portstr(r_xprt)); + ), + + TP_printk("peer=[%s]:%s r_xprt=%p: status=%d %sconnected", + __get_str(addr), __get_str(port), + __entry->r_xprt, __entry->status, + __entry->connected == 1 ? 
"still " : "dis" + ) +); + +DEFINE_RXPRT_EVENT(xprtrdma_conn_start); +DEFINE_RXPRT_EVENT(xprtrdma_conn_tout); +DEFINE_RXPRT_EVENT(xprtrdma_create); +DEFINE_RXPRT_EVENT(xprtrdma_destroy); +DEFINE_RXPRT_EVENT(xprtrdma_remove); +DEFINE_RXPRT_EVENT(xprtrdma_reinsert); +DEFINE_RXPRT_EVENT(xprtrdma_reconnect); +DEFINE_RXPRT_EVENT(xprtrdma_inject_dsc); + +TRACE_EVENT(xprtrdma_qp_error, + TP_PROTO( + const struct rpcrdma_xprt *r_xprt, + const struct ib_event *event + ), + + TP_ARGS(r_xprt, event), + + TP_STRUCT__entry( + __field(const void *, r_xprt) + __field(unsigned int, event) + __string(name, event->device->name) + __string(addr, rpcrdma_addrstr(r_xprt)) + __string(port, rpcrdma_portstr(r_xprt)) + ), + + TP_fast_assign( + __entry->r_xprt = r_xprt; + __entry->event = event->event; + __assign_str(name, event->device->name); + __assign_str(addr, rpcrdma_addrstr(r_xprt)); + __assign_str(port, rpcrdma_portstr(r_xprt)); + ), + + TP_printk("peer=[%s]:%s r_xprt=%p: dev %s: %s (%u)", + __get_str(addr), __get_str(port), __entry->r_xprt, + __get_str(name), rdma_show_ib_event(__entry->event), + __entry->event + ) +); + +/** + ** Call events + **/ + +TRACE_EVENT(xprtrdma_createmrs, + TP_PROTO( + const struct rpcrdma_xprt *r_xprt, + unsigned int count + ), + + TP_ARGS(r_xprt, count), + + TP_STRUCT__entry( + __field(const void *, r_xprt) + __field(unsigned int, count) + ), + + TP_fast_assign( + __entry->r_xprt = r_xprt; + __entry->count = count; + ), + + TP_printk("r_xprt=%p: created %u MRs", + __entry->r_xprt, __entry->count + ) +); + +DEFINE_RXPRT_EVENT(xprtrdma_nomrs); + +DEFINE_RDCH_EVENT(xprtrdma_read_chunk); +DEFINE_WRCH_EVENT(xprtrdma_write_chunk); +DEFINE_WRCH_EVENT(xprtrdma_reply_chunk); + +TRACE_DEFINE_ENUM(rpcrdma_noch); +TRACE_DEFINE_ENUM(rpcrdma_readch); +TRACE_DEFINE_ENUM(rpcrdma_areadch); +TRACE_DEFINE_ENUM(rpcrdma_writech); +TRACE_DEFINE_ENUM(rpcrdma_replych); + +#define xprtrdma_show_chunktype(x) \ + __print_symbolic(x, \ + { rpcrdma_noch, "inline" }, \ + { rpcrdma_readch, "read list" }, \ + { rpcrdma_areadch, "*read list" }, \ + { rpcrdma_writech, "write list" }, \ + { rpcrdma_replych, "reply chunk" }) + +TRACE_EVENT(xprtrdma_marshal, + TP_PROTO( + const struct rpc_rqst *rqst, + unsigned int hdrlen, + unsigned int rtype, + unsigned int wtype + ), + + TP_ARGS(rqst, hdrlen, rtype, wtype), + + TP_STRUCT__entry( + __field(unsigned int, task_id) + __field(unsigned int, client_id) + __field(u32, xid) + __field(unsigned int, hdrlen) + __field(unsigned int, headlen) + __field(unsigned int, pagelen) + __field(unsigned int, taillen) + __field(unsigned int, rtype) + __field(unsigned int, wtype) + ), + + TP_fast_assign( + __entry->task_id = rqst->rq_task->tk_pid; + __entry->client_id = rqst->rq_task->tk_client->cl_clid; + __entry->xid = be32_to_cpu(rqst->rq_xid); + __entry->hdrlen = hdrlen; + __entry->headlen = rqst->rq_snd_buf.head[0].iov_len; + __entry->pagelen = rqst->rq_snd_buf.page_len; + __entry->taillen = rqst->rq_snd_buf.tail[0].iov_len; + __entry->rtype = rtype; + __entry->wtype = wtype; + ), + + TP_printk("task:%u@%u xid=0x%08x: hdr=%u xdr=%u/%u/%u %s/%s", + __entry->task_id, __entry->client_id, __entry->xid, + __entry->hdrlen, + __entry->headlen, __entry->pagelen, __entry->taillen, + xprtrdma_show_chunktype(__entry->rtype), + xprtrdma_show_chunktype(__entry->wtype) + ) +); + +TRACE_EVENT(xprtrdma_post_send, + TP_PROTO( + const struct rpcrdma_req *req, + int status + ), + + TP_ARGS(req, status), + + TP_STRUCT__entry( + __field(const void *, req) + __field(int, num_sge) + __field(bool, 
signaled) + __field(int, status) + ), + + TP_fast_assign( + __entry->req = req; + __entry->num_sge = req->rl_sendctx->sc_wr.num_sge; + __entry->signaled = req->rl_sendctx->sc_wr.send_flags & + IB_SEND_SIGNALED; + __entry->status = status; + ), + + TP_printk("req=%p, %d SGEs%s, status=%d", + __entry->req, __entry->num_sge, + (__entry->signaled ? ", signaled" : ""), + __entry->status + ) +); + +TRACE_EVENT(xprtrdma_post_recv, + TP_PROTO( + const struct rpcrdma_rep *rep, + int status + ), + + TP_ARGS(rep, status), + + TP_STRUCT__entry( + __field(const void *, rep) + __field(int, status) + ), + + TP_fast_assign( + __entry->rep = rep; + __entry->status = status; + ), + + TP_printk("rep=%p status=%d", + __entry->rep, __entry->status + ) +); + +/** + ** Completion events + **/ + +TRACE_EVENT(xprtrdma_wc_send, + TP_PROTO( + const struct rpcrdma_sendctx *sc, + const struct ib_wc *wc + ), + + TP_ARGS(sc, wc), + + TP_STRUCT__entry( + __field(const void *, req) + __field(unsigned int, unmap_count) + __field(unsigned int, status) + __field(unsigned int, vendor_err) + ), + + TP_fast_assign( + __entry->req = sc->sc_req; + __entry->unmap_count = sc->sc_unmap_count; + __entry->status = wc->status; + __entry->vendor_err = __entry->status ? wc->vendor_err : 0; + ), + + TP_printk("req=%p, unmapped %u pages: %s (%u/0x%x)", + __entry->req, __entry->unmap_count, + rdma_show_wc_status(__entry->status), + __entry->status, __entry->vendor_err + ) +); + +TRACE_EVENT(xprtrdma_wc_receive, + TP_PROTO( + const struct rpcrdma_rep *rep, + const struct ib_wc *wc + ), + + TP_ARGS(rep, wc), + + TP_STRUCT__entry( + __field(const void *, rep) + __field(unsigned int, byte_len) + __field(unsigned int, status) + __field(unsigned int, vendor_err) + ), + + TP_fast_assign( + __entry->rep = rep; + __entry->byte_len = wc->byte_len; + __entry->status = wc->status; + __entry->vendor_err = __entry->status ? 
wc->vendor_err : 0; + ), + + TP_printk("rep=%p, %u bytes: %s (%u/0x%x)", + __entry->rep, __entry->byte_len, + rdma_show_wc_status(__entry->status), + __entry->status, __entry->vendor_err + ) +); + +DEFINE_FRWR_DONE_EVENT(xprtrdma_wc_fastreg); +DEFINE_FRWR_DONE_EVENT(xprtrdma_wc_li); +DEFINE_FRWR_DONE_EVENT(xprtrdma_wc_li_wake); + +DEFINE_MR_EVENT(xprtrdma_localinv); +DEFINE_MR_EVENT(xprtrdma_dma_unmap); +DEFINE_MR_EVENT(xprtrdma_remoteinv); +DEFINE_MR_EVENT(xprtrdma_recover_mr); + +/** + ** Reply events + **/ + +TRACE_EVENT(xprtrdma_reply, + TP_PROTO( + const struct rpc_task *task, + const struct rpcrdma_rep *rep, + const struct rpcrdma_req *req, + unsigned int credits + ), + + TP_ARGS(task, rep, req, credits), + + TP_STRUCT__entry( + __field(unsigned int, task_id) + __field(unsigned int, client_id) + __field(const void *, rep) + __field(const void *, req) + __field(u32, xid) + __field(unsigned int, credits) + ), + + TP_fast_assign( + __entry->task_id = task->tk_pid; + __entry->client_id = task->tk_client->cl_clid; + __entry->rep = rep; + __entry->req = req; + __entry->xid = be32_to_cpu(rep->rr_xid); + __entry->credits = credits; + ), + + TP_printk("task:%u@%u xid=0x%08x, %u credits, rep=%p -> req=%p", + __entry->task_id, __entry->client_id, __entry->xid, + __entry->credits, __entry->rep, __entry->req + ) +); + +TRACE_EVENT(xprtrdma_defer_cmp, + TP_PROTO( + const struct rpcrdma_rep *rep + ), + + TP_ARGS(rep), + + TP_STRUCT__entry( + __field(unsigned int, task_id) + __field(unsigned int, client_id) + __field(const void *, rep) + __field(u32, xid) + ), + + TP_fast_assign( + __entry->task_id = rep->rr_rqst->rq_task->tk_pid; + __entry->client_id = rep->rr_rqst->rq_task->tk_client->cl_clid; + __entry->rep = rep; + __entry->xid = be32_to_cpu(rep->rr_xid); + ), + + TP_printk("task:%u@%u xid=0x%08x rep=%p", + __entry->task_id, __entry->client_id, __entry->xid, + __entry->rep + ) +); + +DEFINE_REPLY_EVENT(xprtrdma_reply_vers); +DEFINE_REPLY_EVENT(xprtrdma_reply_rqst); +DEFINE_REPLY_EVENT(xprtrdma_reply_short); +DEFINE_REPLY_EVENT(xprtrdma_reply_hdr); + +TRACE_EVENT(xprtrdma_fixup, + TP_PROTO( + const struct rpc_rqst *rqst, + int len, + int hdrlen + ), + + TP_ARGS(rqst, len, hdrlen), + + TP_STRUCT__entry( + __field(unsigned int, task_id) + __field(unsigned int, client_id) + __field(const void *, base) + __field(int, len) + __field(int, hdrlen) + ), + + TP_fast_assign( + __entry->task_id = rqst->rq_task->tk_pid; + __entry->client_id = rqst->rq_task->tk_client->cl_clid; + __entry->base = rqst->rq_rcv_buf.head[0].iov_base; + __entry->len = len; + __entry->hdrlen = hdrlen; + ), + + TP_printk("task:%u@%u base=%p len=%d hdrlen=%d", + __entry->task_id, __entry->client_id, + __entry->base, __entry->len, __entry->hdrlen + ) +); + +TRACE_EVENT(xprtrdma_fixup_pg, + TP_PROTO( + const struct rpc_rqst *rqst, + int pageno, + const void *pos, + int len, + int curlen + ), + + TP_ARGS(rqst, pageno, pos, len, curlen), + + TP_STRUCT__entry( + __field(unsigned int, task_id) + __field(unsigned int, client_id) + __field(const void *, pos) + __field(int, pageno) + __field(int, len) + __field(int, curlen) + ), + + TP_fast_assign( + __entry->task_id = rqst->rq_task->tk_pid; + __entry->client_id = rqst->rq_task->tk_client->cl_clid; + __entry->pos = pos; + __entry->pageno = pageno; + __entry->len = len; + __entry->curlen = curlen; + ), + + TP_printk("task:%u@%u pageno=%d pos=%p len=%d curlen=%d", + __entry->task_id, __entry->client_id, + __entry->pageno, __entry->pos, __entry->len, __entry->curlen + ) +); + 
+TRACE_EVENT(xprtrdma_decode_seg, + TP_PROTO( + u32 handle, + u32 length, + u64 offset + ), + + TP_ARGS(handle, length, offset), + + TP_STRUCT__entry( + __field(u32, handle) + __field(u32, length) + __field(u64, offset) + ), + + TP_fast_assign( + __entry->handle = handle; + __entry->length = length; + __entry->offset = offset; + ), + + TP_printk("%u@0x%016llx:0x%08x", + __entry->length, (unsigned long long)__entry->offset, + __entry->handle + ) +); + +/** + ** Allocation/release of rpcrdma_reqs and rpcrdma_reps + **/ + +TRACE_EVENT(xprtrdma_allocate, + TP_PROTO( + const struct rpc_task *task, + const struct rpcrdma_req *req + ), + + TP_ARGS(task, req), + + TP_STRUCT__entry( + __field(unsigned int, task_id) + __field(unsigned int, client_id) + __field(const void *, req) + __field(const void *, rep) + __field(size_t, callsize) + __field(size_t, rcvsize) + ), + + TP_fast_assign( + __entry->task_id = task->tk_pid; + __entry->client_id = task->tk_client->cl_clid; + __entry->req = req; + __entry->rep = req ? req->rl_reply : NULL; + __entry->callsize = task->tk_rqstp->rq_callsize; + __entry->rcvsize = task->tk_rqstp->rq_rcvsize; + ), + + TP_printk("task:%u@%u req=%p rep=%p (%zu, %zu)", + __entry->task_id, __entry->client_id, + __entry->req, __entry->rep, + __entry->callsize, __entry->rcvsize + ) +); + +TRACE_EVENT(xprtrdma_rpc_done, + TP_PROTO( + const struct rpc_task *task, + const struct rpcrdma_req *req + ), + + TP_ARGS(task, req), + + TP_STRUCT__entry( + __field(unsigned int, task_id) + __field(unsigned int, client_id) + __field(const void *, req) + __field(const void *, rep) + ), + + TP_fast_assign( + __entry->task_id = task->tk_pid; + __entry->client_id = task->tk_client->cl_clid; + __entry->req = req; + __entry->rep = req->rl_reply; + ), + + TP_printk("task:%u@%u req=%p rep=%p", + __entry->task_id, __entry->client_id, + __entry->req, __entry->rep + ) +); + +DEFINE_RXPRT_EVENT(xprtrdma_noreps); + +/** + ** Callback events + **/ + +TRACE_EVENT(xprtrdma_cb_setup, + TP_PROTO( + const struct rpcrdma_xprt *r_xprt, + unsigned int reqs + ), + + TP_ARGS(r_xprt, reqs), + + TP_STRUCT__entry( + __field(const void *, r_xprt) + __field(unsigned int, reqs) + __string(addr, rpcrdma_addrstr(r_xprt)) + __string(port, rpcrdma_portstr(r_xprt)) + ), + + TP_fast_assign( + __entry->r_xprt = r_xprt; + __entry->reqs = reqs; + __assign_str(addr, rpcrdma_addrstr(r_xprt)); + __assign_str(port, rpcrdma_portstr(r_xprt)); + ), + + TP_printk("peer=[%s]:%s r_xprt=%p: %u reqs", + __get_str(addr), __get_str(port), + __entry->r_xprt, __entry->reqs + ) +); + +DEFINE_CB_EVENT(xprtrdma_cb_call); +DEFINE_CB_EVENT(xprtrdma_cb_reply); + +#endif /* _TRACE_RPCRDMA_H */ + +#include <trace/define_trace.h> diff --git a/include/trace/events/sunrpc.h b/include/trace/events/sunrpc.h index 8c153f68509e..970c91a83173 100644 --- a/include/trace/events/sunrpc.h +++ b/include/trace/events/sunrpc.h @@ -32,7 +32,7 @@ DECLARE_EVENT_CLASS(rpc_task_status, __entry->status = task->tk_status; ), - TP_printk("task:%u@%u, status %d", + TP_printk("task:%u@%u status=%d", __entry->task_id, __entry->client_id, __entry->status) ); @@ -66,7 +66,7 @@ TRACE_EVENT(rpc_connect_status, __entry->status = status; ), - TP_printk("task:%u@%u, status %d", + TP_printk("task:%u@%u status=%d", __entry->task_id, __entry->client_id, __entry->status) ); @@ -175,7 +175,7 @@ DECLARE_EVENT_CLASS(rpc_task_queued, ), TP_fast_assign( - __entry->client_id = clnt->cl_clid; + __entry->client_id = clnt ? 
clnt->cl_clid : -1; __entry->task_id = task->tk_pid; __entry->timeout = task->tk_timeout; __entry->runstate = task->tk_runstate; @@ -184,7 +184,7 @@ DECLARE_EVENT_CLASS(rpc_task_queued, __assign_str(q_name, rpc_qname(q)); ), - TP_printk("task:%u@%u flags=%4.4x state=%4.4lx status=%d timeout=%lu queue=%s", + TP_printk("task:%u@%d flags=%4.4x state=%4.4lx status=%d timeout=%lu queue=%s", __entry->task_id, __entry->client_id, __entry->flags, __entry->runstate, @@ -390,6 +390,10 @@ DECLARE_EVENT_CLASS(rpc_xprt_event, __entry->status) ); +DEFINE_EVENT(rpc_xprt_event, xprt_timer, + TP_PROTO(struct rpc_xprt *xprt, __be32 xid, int status), + TP_ARGS(xprt, xid, status)); + DEFINE_EVENT(rpc_xprt_event, xprt_lookup_rqst, TP_PROTO(struct rpc_xprt *xprt, __be32 xid, int status), TP_ARGS(xprt, xid, status)); diff --git a/include/uapi/linux/nfs.h b/include/uapi/linux/nfs.h index 057d22a48416..946cb62d64b0 100644 --- a/include/uapi/linux/nfs.h +++ b/include/uapi/linux/nfs.h @@ -12,6 +12,7 @@ #define NFS_PROGRAM 100003 #define NFS_PORT 2049 +#define NFS_RDMA_PORT 20049 #define NFS_MAXDATA 8192 #define NFS_MAXPATHLEN 1024 #define NFS_MAXNAMLEN 255 diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index e2a4184f3c5d..6e432ecd7f99 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -1376,22 +1376,6 @@ rpc_setbufsize(struct rpc_clnt *clnt, unsigned int sndsize, unsigned int rcvsize EXPORT_SYMBOL_GPL(rpc_setbufsize); /** - * rpc_protocol - Get transport protocol number for an RPC client - * @clnt: RPC client to query - * - */ -int rpc_protocol(struct rpc_clnt *clnt) -{ - int protocol; - - rcu_read_lock(); - protocol = rcu_dereference(clnt->cl_xprt)->prot; - rcu_read_unlock(); - return protocol; -} -EXPORT_SYMBOL_GPL(rpc_protocol); - -/** * rpc_net_ns - Get the network namespace for this RPC client * @clnt: RPC client to query * diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c index b1b49edd7c4d..896691afbb1a 100644 --- a/net/sunrpc/sched.c +++ b/net/sunrpc/sched.c @@ -755,22 +755,20 @@ static void __rpc_execute(struct rpc_task *task) void (*do_action)(struct rpc_task *); /* - * Execute any pending callback first. + * Perform the next FSM step or a pending callback. + * + * tk_action may be NULL if the task has been killed. + * In particular, note that rpc_killall_tasks may + * do this at any time, so beware when dereferencing. */ - do_action = task->tk_callback; - task->tk_callback = NULL; - if (do_action == NULL) { - /* - * Perform the next FSM step. - * tk_action may be NULL if the task has been killed. - * In particular, note that rpc_killall_tasks may - * do this at any time, so beware when dereferencing. 
- */ - do_action = task->tk_action; - if (do_action == NULL) - break; + do_action = task->tk_action; + if (task->tk_callback) { + do_action = task->tk_callback; + task->tk_callback = NULL; } - trace_rpc_task_run_action(task->tk_client, task, task->tk_action); + if (!do_action) + break; + trace_rpc_task_run_action(task->tk_client, task, do_action); do_action(task); /* diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index 33b74fd84051..2436fd1125fc 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -940,8 +940,8 @@ static void xprt_timer(struct rpc_task *task) if (task->tk_status != -ETIMEDOUT) return; - dprintk("RPC: %5u xprt_timer\n", task->tk_pid); + trace_xprt_timer(xprt, req->rq_xid, task->tk_status); if (!req->rq_reply_bytes_recvd) { if (xprt->ops->timer) xprt->ops->timer(xprt, task); diff --git a/net/sunrpc/xprtrdma/backchannel.c b/net/sunrpc/xprtrdma/backchannel.c index 8b818bb3518a..ed1a4a3065ee 100644 --- a/net/sunrpc/xprtrdma/backchannel.c +++ b/net/sunrpc/xprtrdma/backchannel.c @@ -43,7 +43,6 @@ static int rpcrdma_bc_setup_rqst(struct rpcrdma_xprt *r_xprt, req = rpcrdma_create_req(r_xprt); if (IS_ERR(req)) return PTR_ERR(req); - __set_bit(RPCRDMA_REQ_F_BACKCHANNEL, &req->rl_flags); rb = rpcrdma_alloc_regbuf(RPCRDMA_HDRBUF_SIZE, DMA_TO_DEVICE, GFP_KERNEL); @@ -74,21 +73,13 @@ out_fail: static int rpcrdma_bc_setup_reps(struct rpcrdma_xprt *r_xprt, unsigned int count) { - struct rpcrdma_rep *rep; int rc = 0; while (count--) { - rep = rpcrdma_create_rep(r_xprt); - if (IS_ERR(rep)) { - pr_err("RPC: %s: reply buffer alloc failed\n", - __func__); - rc = PTR_ERR(rep); + rc = rpcrdma_create_rep(r_xprt); + if (rc) break; - } - - rpcrdma_recv_buffer_put(rep); } - return rc; } @@ -129,6 +120,7 @@ int xprt_rdma_bc_setup(struct rpc_xprt *xprt, unsigned int reqs) rqst->rq_xprt = &r_xprt->rx_xprt; INIT_LIST_HEAD(&rqst->rq_list); INIT_LIST_HEAD(&rqst->rq_bc_list); + __set_bit(RPC_BC_PA_IN_USE, &rqst->rq_bc_pa_state); if (rpcrdma_bc_setup_rqst(r_xprt, rqst)) goto out_free; @@ -148,7 +140,7 @@ int xprt_rdma_bc_setup(struct rpc_xprt *xprt, unsigned int reqs) buffer->rb_bc_srv_max_requests = reqs; request_module("svcrdma"); - + trace_xprtrdma_cb_setup(r_xprt, reqs); return 0; out_free: @@ -196,13 +188,7 @@ size_t xprt_rdma_bc_maxpayload(struct rpc_xprt *xprt) return maxmsg - RPCRDMA_HDRLEN_MIN; } -/** - * rpcrdma_bc_marshal_reply - Send backwards direction reply - * @rqst: buffer containing RPC reply data - * - * Returns zero on success. - */ -int rpcrdma_bc_marshal_reply(struct rpc_rqst *rqst) +static int rpcrdma_bc_marshal_reply(struct rpc_rqst *rqst) { struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(rqst->rq_xprt); struct rpcrdma_req *req = rpcr_to_rdmar(rqst); @@ -226,7 +212,46 @@ int rpcrdma_bc_marshal_reply(struct rpc_rqst *rqst) if (rpcrdma_prepare_send_sges(r_xprt, req, RPCRDMA_HDRLEN_MIN, &rqst->rq_snd_buf, rpcrdma_noch)) return -EIO; + + trace_xprtrdma_cb_reply(rqst); + return 0; +} + +/** + * xprt_rdma_bc_send_reply - marshal and send a backchannel reply + * @rqst: RPC rqst with a backchannel RPC reply in rq_snd_buf + * + * Caller holds the transport's write lock. + * + * Returns: + * %0 if the RPC message has been sent + * %-ENOTCONN if the caller should reconnect and call again + * %-EIO if a permanent error occurred and the request was not + * sent. Do not try to send this message again. 
+ */ +int xprt_rdma_bc_send_reply(struct rpc_rqst *rqst) +{ + struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(rqst->rq_xprt); + struct rpcrdma_req *req = rpcr_to_rdmar(rqst); + int rc; + + if (!xprt_connected(rqst->rq_xprt)) + goto drop_connection; + + rc = rpcrdma_bc_marshal_reply(rqst); + if (rc < 0) + goto failed_marshal; + + if (rpcrdma_ep_post(&r_xprt->rx_ia, &r_xprt->rx_ep, req)) + goto drop_connection; return 0; + +failed_marshal: + if (rc != -ENOTCONN) + return rc; +drop_connection: + xprt_disconnect_done(rqst->rq_xprt); + return -ENOTCONN; } /** @@ -262,11 +287,6 @@ void xprt_rdma_bc_free_rqst(struct rpc_rqst *rqst) dprintk("RPC: %s: freeing rqst %p (req %p)\n", __func__, rqst, rpcr_to_rdmar(rqst)); - smp_mb__before_atomic(); - WARN_ON_ONCE(!test_bit(RPC_BC_PA_IN_USE, &rqst->rq_bc_pa_state)); - clear_bit(RPC_BC_PA_IN_USE, &rqst->rq_bc_pa_state); - smp_mb__after_atomic(); - spin_lock_bh(&xprt->bc_pa_lock); list_add_tail(&rqst->rq_bc_pa_list, &xprt->bc_pa_list); spin_unlock_bh(&xprt->bc_pa_lock); @@ -274,7 +294,7 @@ void xprt_rdma_bc_free_rqst(struct rpc_rqst *rqst) /** * rpcrdma_bc_receive_call - Handle a backward direction call - * @xprt: transport receiving the call + * @r_xprt: transport receiving the call * @rep: receive buffer containing the call * * Operational assumptions: @@ -313,7 +333,6 @@ void rpcrdma_bc_receive_call(struct rpcrdma_xprt *r_xprt, struct rpc_rqst, rq_bc_pa_list); list_del(&rqst->rq_bc_pa_list); spin_unlock(&xprt->bc_pa_lock); - dprintk("RPC: %s: using rqst %p\n", __func__, rqst); /* Prepare rqst */ rqst->rq_reply_bytes_recvd = 0; @@ -321,7 +340,6 @@ void rpcrdma_bc_receive_call(struct rpcrdma_xprt *r_xprt, rqst->rq_xid = *p; rqst->rq_private_buf.len = size; - set_bit(RPC_BC_PA_IN_USE, &rqst->rq_bc_pa_state); buf = &rqst->rq_rcv_buf; memset(buf, 0, sizeof(*buf)); @@ -335,12 +353,8 @@ void rpcrdma_bc_receive_call(struct rpcrdma_xprt *r_xprt, * the Upper Layer is done decoding it. */ req = rpcr_to_rdmar(rqst); - dprintk("RPC: %s: attaching rep %p to req %p\n", - __func__, rep, req); req->rl_reply = rep; - - /* Defeat the retransmit detection logic in send_request */ - req->rl_connect_cookie = 0; + trace_xprtrdma_cb_call(rqst); /* Queue rqst for ULP's callback service */ bc_serv = xprt->bc_serv; diff --git a/net/sunrpc/xprtrdma/fmr_ops.c b/net/sunrpc/xprtrdma/fmr_ops.c index 29fc84c7ff98..d5f95bb39300 100644 --- a/net/sunrpc/xprtrdma/fmr_ops.c +++ b/net/sunrpc/xprtrdma/fmr_ops.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 /* - * Copyright (c) 2015 Oracle. All rights reserved. + * Copyright (c) 2015, 2017 Oracle. All rights reserved. * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved. 
*/ @@ -47,7 +47,7 @@ fmr_is_supported(struct rpcrdma_ia *ia) } static int -fmr_op_init_mr(struct rpcrdma_ia *ia, struct rpcrdma_mw *mw) +fmr_op_init_mr(struct rpcrdma_ia *ia, struct rpcrdma_mr *mr) { static struct ib_fmr_attr fmr_attr = { .max_pages = RPCRDMA_MAX_FMR_SGES, @@ -55,106 +55,108 @@ fmr_op_init_mr(struct rpcrdma_ia *ia, struct rpcrdma_mw *mw) .page_shift = PAGE_SHIFT }; - mw->fmr.fm_physaddrs = kcalloc(RPCRDMA_MAX_FMR_SGES, + mr->fmr.fm_physaddrs = kcalloc(RPCRDMA_MAX_FMR_SGES, sizeof(u64), GFP_KERNEL); - if (!mw->fmr.fm_physaddrs) + if (!mr->fmr.fm_physaddrs) goto out_free; - mw->mw_sg = kcalloc(RPCRDMA_MAX_FMR_SGES, - sizeof(*mw->mw_sg), GFP_KERNEL); - if (!mw->mw_sg) + mr->mr_sg = kcalloc(RPCRDMA_MAX_FMR_SGES, + sizeof(*mr->mr_sg), GFP_KERNEL); + if (!mr->mr_sg) goto out_free; - sg_init_table(mw->mw_sg, RPCRDMA_MAX_FMR_SGES); + sg_init_table(mr->mr_sg, RPCRDMA_MAX_FMR_SGES); - mw->fmr.fm_mr = ib_alloc_fmr(ia->ri_pd, RPCRDMA_FMR_ACCESS_FLAGS, + mr->fmr.fm_mr = ib_alloc_fmr(ia->ri_pd, RPCRDMA_FMR_ACCESS_FLAGS, &fmr_attr); - if (IS_ERR(mw->fmr.fm_mr)) + if (IS_ERR(mr->fmr.fm_mr)) goto out_fmr_err; return 0; out_fmr_err: dprintk("RPC: %s: ib_alloc_fmr returned %ld\n", __func__, - PTR_ERR(mw->fmr.fm_mr)); + PTR_ERR(mr->fmr.fm_mr)); out_free: - kfree(mw->mw_sg); - kfree(mw->fmr.fm_physaddrs); + kfree(mr->mr_sg); + kfree(mr->fmr.fm_physaddrs); return -ENOMEM; } static int -__fmr_unmap(struct rpcrdma_mw *mw) +__fmr_unmap(struct rpcrdma_mr *mr) { LIST_HEAD(l); int rc; - list_add(&mw->fmr.fm_mr->list, &l); + list_add(&mr->fmr.fm_mr->list, &l); rc = ib_unmap_fmr(&l); - list_del(&mw->fmr.fm_mr->list); + list_del(&mr->fmr.fm_mr->list); return rc; } static void -fmr_op_release_mr(struct rpcrdma_mw *r) +fmr_op_release_mr(struct rpcrdma_mr *mr) { LIST_HEAD(unmap_list); int rc; /* Ensure MW is not on any rl_registered list */ - if (!list_empty(&r->mw_list)) - list_del(&r->mw_list); + if (!list_empty(&mr->mr_list)) + list_del(&mr->mr_list); - kfree(r->fmr.fm_physaddrs); - kfree(r->mw_sg); + kfree(mr->fmr.fm_physaddrs); + kfree(mr->mr_sg); /* In case this one was left mapped, try to unmap it * to prevent dealloc_fmr from failing with EBUSY */ - rc = __fmr_unmap(r); + rc = __fmr_unmap(mr); if (rc) pr_err("rpcrdma: final ib_unmap_fmr for %p failed %i\n", - r, rc); + mr, rc); - rc = ib_dealloc_fmr(r->fmr.fm_mr); + rc = ib_dealloc_fmr(mr->fmr.fm_mr); if (rc) pr_err("rpcrdma: final ib_dealloc_fmr for %p returned %i\n", - r, rc); + mr, rc); - kfree(r); + kfree(mr); } /* Reset of a single FMR. 
*/ static void -fmr_op_recover_mr(struct rpcrdma_mw *mw) +fmr_op_recover_mr(struct rpcrdma_mr *mr) { - struct rpcrdma_xprt *r_xprt = mw->mw_xprt; + struct rpcrdma_xprt *r_xprt = mr->mr_xprt; int rc; /* ORDER: invalidate first */ - rc = __fmr_unmap(mw); - - /* ORDER: then DMA unmap */ - ib_dma_unmap_sg(r_xprt->rx_ia.ri_device, - mw->mw_sg, mw->mw_nents, mw->mw_dir); + rc = __fmr_unmap(mr); if (rc) goto out_release; - rpcrdma_put_mw(r_xprt, mw); + /* ORDER: then DMA unmap */ + rpcrdma_mr_unmap_and_put(mr); + r_xprt->rx_stats.mrs_recovered++; return; out_release: - pr_err("rpcrdma: FMR reset failed (%d), %p released\n", rc, mw); + pr_err("rpcrdma: FMR reset failed (%d), %p released\n", rc, mr); r_xprt->rx_stats.mrs_orphaned++; - spin_lock(&r_xprt->rx_buf.rb_mwlock); - list_del(&mw->mw_all); - spin_unlock(&r_xprt->rx_buf.rb_mwlock); + trace_xprtrdma_dma_unmap(mr); + ib_dma_unmap_sg(r_xprt->rx_ia.ri_device, + mr->mr_sg, mr->mr_nents, mr->mr_dir); + + spin_lock(&r_xprt->rx_buf.rb_mrlock); + list_del(&mr->mr_all); + spin_unlock(&r_xprt->rx_buf.rb_mrlock); - fmr_op_release_mr(mw); + fmr_op_release_mr(mr); } static int @@ -180,15 +182,15 @@ fmr_op_maxpages(struct rpcrdma_xprt *r_xprt) */ static struct rpcrdma_mr_seg * fmr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, - int nsegs, bool writing, struct rpcrdma_mw **out) + int nsegs, bool writing, struct rpcrdma_mr **out) { struct rpcrdma_mr_seg *seg1 = seg; int len, pageoff, i, rc; - struct rpcrdma_mw *mw; + struct rpcrdma_mr *mr; u64 *dma_pages; - mw = rpcrdma_get_mw(r_xprt); - if (!mw) + mr = rpcrdma_mr_get(r_xprt); + if (!mr) return ERR_PTR(-ENOBUFS); pageoff = offset_in_page(seg1->mr_offset); @@ -199,12 +201,12 @@ fmr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, nsegs = RPCRDMA_MAX_FMR_SGES; for (i = 0; i < nsegs;) { if (seg->mr_page) - sg_set_page(&mw->mw_sg[i], + sg_set_page(&mr->mr_sg[i], seg->mr_page, seg->mr_len, offset_in_page(seg->mr_offset)); else - sg_set_buf(&mw->mw_sg[i], seg->mr_offset, + sg_set_buf(&mr->mr_sg[i], seg->mr_offset, seg->mr_len); len += seg->mr_len; ++seg; @@ -214,40 +216,38 @@ fmr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len)) break; } - mw->mw_dir = rpcrdma_data_dir(writing); + mr->mr_dir = rpcrdma_data_dir(writing); - mw->mw_nents = ib_dma_map_sg(r_xprt->rx_ia.ri_device, - mw->mw_sg, i, mw->mw_dir); - if (!mw->mw_nents) + mr->mr_nents = ib_dma_map_sg(r_xprt->rx_ia.ri_device, + mr->mr_sg, i, mr->mr_dir); + if (!mr->mr_nents) goto out_dmamap_err; - for (i = 0, dma_pages = mw->fmr.fm_physaddrs; i < mw->mw_nents; i++) - dma_pages[i] = sg_dma_address(&mw->mw_sg[i]); - rc = ib_map_phys_fmr(mw->fmr.fm_mr, dma_pages, mw->mw_nents, + for (i = 0, dma_pages = mr->fmr.fm_physaddrs; i < mr->mr_nents; i++) + dma_pages[i] = sg_dma_address(&mr->mr_sg[i]); + rc = ib_map_phys_fmr(mr->fmr.fm_mr, dma_pages, mr->mr_nents, dma_pages[0]); if (rc) goto out_maperr; - mw->mw_handle = mw->fmr.fm_mr->rkey; - mw->mw_length = len; - mw->mw_offset = dma_pages[0] + pageoff; + mr->mr_handle = mr->fmr.fm_mr->rkey; + mr->mr_length = len; + mr->mr_offset = dma_pages[0] + pageoff; - *out = mw; + *out = mr; return seg; out_dmamap_err: pr_err("rpcrdma: failed to DMA map sg %p sg_nents %d\n", - mw->mw_sg, i); - rpcrdma_put_mw(r_xprt, mw); + mr->mr_sg, i); + rpcrdma_mr_put(mr); return ERR_PTR(-EIO); out_maperr: pr_err("rpcrdma: ib_map_phys_fmr %u@0x%llx+%i (%d) status %i\n", len, (unsigned long long)dma_pages[0], - pageoff, mw->mw_nents, rc); - 
ib_dma_unmap_sg(r_xprt->rx_ia.ri_device, - mw->mw_sg, mw->mw_nents, mw->mw_dir); - rpcrdma_put_mw(r_xprt, mw); + pageoff, mr->mr_nents, rc); + rpcrdma_mr_unmap_and_put(mr); return ERR_PTR(-EIO); } @@ -256,13 +256,13 @@ out_maperr: * Sleeps until it is safe for the host CPU to access the * previously mapped memory regions. * - * Caller ensures that @mws is not empty before the call. This + * Caller ensures that @mrs is not empty before the call. This * function empties the list. */ static void -fmr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct list_head *mws) +fmr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct list_head *mrs) { - struct rpcrdma_mw *mw; + struct rpcrdma_mr *mr; LIST_HEAD(unmap_list); int rc; @@ -271,10 +271,11 @@ fmr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct list_head *mws) * ib_unmap_fmr() is slow, so use a single call instead * of one call per mapped FMR. */ - list_for_each_entry(mw, mws, mw_list) { + list_for_each_entry(mr, mrs, mr_list) { dprintk("RPC: %s: unmapping fmr %p\n", - __func__, &mw->fmr); - list_add_tail(&mw->fmr.fm_mr->list, &unmap_list); + __func__, &mr->fmr); + trace_xprtrdma_localinv(mr); + list_add_tail(&mr->fmr.fm_mr->list, &unmap_list); } r_xprt->rx_stats.local_inv_needed++; rc = ib_unmap_fmr(&unmap_list); @@ -284,14 +285,10 @@ fmr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct list_head *mws) /* ORDER: Now DMA unmap all of the req's MRs, and return * them to the free MW list. */ - while (!list_empty(mws)) { - mw = rpcrdma_pop_mw(mws); - dprintk("RPC: %s: DMA unmapping fmr %p\n", - __func__, &mw->fmr); - list_del(&mw->fmr.fm_mr->list); - ib_dma_unmap_sg(r_xprt->rx_ia.ri_device, - mw->mw_sg, mw->mw_nents, mw->mw_dir); - rpcrdma_put_mw(r_xprt, mw); + while (!list_empty(mrs)) { + mr = rpcrdma_mr_pop(mrs); + list_del(&mr->fmr.fm_mr->list); + rpcrdma_mr_unmap_and_put(mr); } return; @@ -299,10 +296,10 @@ fmr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct list_head *mws) out_reset: pr_err("rpcrdma: ib_unmap_fmr failed (%i)\n", rc); - while (!list_empty(mws)) { - mw = rpcrdma_pop_mw(mws); - list_del(&mw->fmr.fm_mr->list); - fmr_op_recover_mr(mw); + while (!list_empty(mrs)) { + mr = rpcrdma_mr_pop(mrs); + list_del(&mr->fmr.fm_mr->list); + fmr_op_recover_mr(mr); } } diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c index 773e66e10a15..90f688f19783 100644 --- a/net/sunrpc/xprtrdma/frwr_ops.c +++ b/net/sunrpc/xprtrdma/frwr_ops.c @@ -1,11 +1,11 @@ // SPDX-License-Identifier: GPL-2.0 /* - * Copyright (c) 2015 Oracle. All rights reserved. + * Copyright (c) 2015, 2017 Oracle. All rights reserved. * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved. */ /* Lightweight memory registration using Fast Registration Work - * Requests (FRWR). Also referred to sometimes as FRMR mode. + * Requests (FRWR). * * FRWR features ordered asynchronous registration and deregistration * of arbitrarily sized memory regions. This is the fastest and safest @@ -15,9 +15,9 @@ /* Normal operation * * A Memory Region is prepared for RDMA READ or WRITE using a FAST_REG - * Work Request (frmr_op_map). When the RDMA operation is finished, this + * Work Request (frwr_op_map). When the RDMA operation is finished, this * Memory Region is invalidated using a LOCAL_INV Work Request - * (frmr_op_unmap). + * (frwr_op_unmap_sync). 
* * Typically these Work Requests are not signaled, and neither are RDMA * SEND Work Requests (with the exception of signaling occasionally to @@ -26,7 +26,7 @@ * * As an optimization, frwr_op_unmap marks MRs INVALID before the * LOCAL_INV WR is posted. If posting succeeds, the MR is placed on - * rb_mws immediately so that no work (like managing a linked list + * rb_mrs immediately so that no work (like managing a linked list * under a spinlock) is needed in the completion upcall. * * But this means that frwr_op_map() can occasionally encounter an MR @@ -60,7 +60,7 @@ * When frwr_op_map encounters FLUSHED and VALID MRs, they are recovered * with ib_dereg_mr and then are re-initialized. Because MR recovery * allocates fresh resources, it is deferred to a workqueue, and the - * recovered MRs are placed back on the rb_mws list when recovery is + * recovered MRs are placed back on the rb_mrs list when recovery is * complete. frwr_op_map allocates another MR for the current RPC while * the broken MR is reset. * @@ -96,26 +96,26 @@ out_not_supported: } static int -frwr_op_init_mr(struct rpcrdma_ia *ia, struct rpcrdma_mw *r) +frwr_op_init_mr(struct rpcrdma_ia *ia, struct rpcrdma_mr *mr) { - unsigned int depth = ia->ri_max_frmr_depth; - struct rpcrdma_frmr *f = &r->frmr; + unsigned int depth = ia->ri_max_frwr_depth; + struct rpcrdma_frwr *frwr = &mr->frwr; int rc; - f->fr_mr = ib_alloc_mr(ia->ri_pd, ia->ri_mrtype, depth); - if (IS_ERR(f->fr_mr)) + frwr->fr_mr = ib_alloc_mr(ia->ri_pd, ia->ri_mrtype, depth); + if (IS_ERR(frwr->fr_mr)) goto out_mr_err; - r->mw_sg = kcalloc(depth, sizeof(*r->mw_sg), GFP_KERNEL); - if (!r->mw_sg) + mr->mr_sg = kcalloc(depth, sizeof(*mr->mr_sg), GFP_KERNEL); + if (!mr->mr_sg) goto out_list_err; - sg_init_table(r->mw_sg, depth); - init_completion(&f->fr_linv_done); + sg_init_table(mr->mr_sg, depth); + init_completion(&frwr->fr_linv_done); return 0; out_mr_err: - rc = PTR_ERR(f->fr_mr); + rc = PTR_ERR(frwr->fr_mr); dprintk("RPC: %s: ib_alloc_mr status %i\n", __func__, rc); return rc; @@ -124,83 +124,85 @@ out_list_err: rc = -ENOMEM; dprintk("RPC: %s: sg allocation failure\n", __func__); - ib_dereg_mr(f->fr_mr); + ib_dereg_mr(frwr->fr_mr); return rc; } static void -frwr_op_release_mr(struct rpcrdma_mw *r) +frwr_op_release_mr(struct rpcrdma_mr *mr) { int rc; - /* Ensure MW is not on any rl_registered list */ - if (!list_empty(&r->mw_list)) - list_del(&r->mw_list); + /* Ensure MR is not on any rl_registered list */ + if (!list_empty(&mr->mr_list)) + list_del(&mr->mr_list); - rc = ib_dereg_mr(r->frmr.fr_mr); + rc = ib_dereg_mr(mr->frwr.fr_mr); if (rc) pr_err("rpcrdma: final ib_dereg_mr for %p returned %i\n", - r, rc); - kfree(r->mw_sg); - kfree(r); + mr, rc); + kfree(mr->mr_sg); + kfree(mr); } static int -__frwr_reset_mr(struct rpcrdma_ia *ia, struct rpcrdma_mw *r) +__frwr_mr_reset(struct rpcrdma_ia *ia, struct rpcrdma_mr *mr) { - struct rpcrdma_frmr *f = &r->frmr; + struct rpcrdma_frwr *frwr = &mr->frwr; int rc; - rc = ib_dereg_mr(f->fr_mr); + rc = ib_dereg_mr(frwr->fr_mr); if (rc) { pr_warn("rpcrdma: ib_dereg_mr status %d, frwr %p orphaned\n", - rc, r); + rc, mr); return rc; } - f->fr_mr = ib_alloc_mr(ia->ri_pd, ia->ri_mrtype, - ia->ri_max_frmr_depth); - if (IS_ERR(f->fr_mr)) { + frwr->fr_mr = ib_alloc_mr(ia->ri_pd, ia->ri_mrtype, + ia->ri_max_frwr_depth); + if (IS_ERR(frwr->fr_mr)) { pr_warn("rpcrdma: ib_alloc_mr status %ld, frwr %p orphaned\n", - PTR_ERR(f->fr_mr), r); - return PTR_ERR(f->fr_mr); + PTR_ERR(frwr->fr_mr), mr); + return PTR_ERR(frwr->fr_mr); } - 
dprintk("RPC: %s: recovered FRMR %p\n", __func__, f); - f->fr_state = FRMR_IS_INVALID; + dprintk("RPC: %s: recovered FRWR %p\n", __func__, frwr); + frwr->fr_state = FRWR_IS_INVALID; return 0; } -/* Reset of a single FRMR. Generate a fresh rkey by replacing the MR. +/* Reset of a single FRWR. Generate a fresh rkey by replacing the MR. */ static void -frwr_op_recover_mr(struct rpcrdma_mw *mw) +frwr_op_recover_mr(struct rpcrdma_mr *mr) { - enum rpcrdma_frmr_state state = mw->frmr.fr_state; - struct rpcrdma_xprt *r_xprt = mw->mw_xprt; + enum rpcrdma_frwr_state state = mr->frwr.fr_state; + struct rpcrdma_xprt *r_xprt = mr->mr_xprt; struct rpcrdma_ia *ia = &r_xprt->rx_ia; int rc; - rc = __frwr_reset_mr(ia, mw); - if (state != FRMR_FLUSHED_LI) + rc = __frwr_mr_reset(ia, mr); + if (state != FRWR_FLUSHED_LI) { + trace_xprtrdma_dma_unmap(mr); ib_dma_unmap_sg(ia->ri_device, - mw->mw_sg, mw->mw_nents, mw->mw_dir); + mr->mr_sg, mr->mr_nents, mr->mr_dir); + } if (rc) goto out_release; - rpcrdma_put_mw(r_xprt, mw); + rpcrdma_mr_put(mr); r_xprt->rx_stats.mrs_recovered++; return; out_release: - pr_err("rpcrdma: FRMR reset failed %d, %p release\n", rc, mw); + pr_err("rpcrdma: FRWR reset failed %d, %p release\n", rc, mr); r_xprt->rx_stats.mrs_orphaned++; - spin_lock(&r_xprt->rx_buf.rb_mwlock); - list_del(&mw->mw_all); - spin_unlock(&r_xprt->rx_buf.rb_mwlock); + spin_lock(&r_xprt->rx_buf.rb_mrlock); + list_del(&mr->mr_all); + spin_unlock(&r_xprt->rx_buf.rb_mrlock); - frwr_op_release_mr(mw); + frwr_op_release_mr(mr); } static int @@ -214,31 +216,31 @@ frwr_op_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep, if (attrs->device_cap_flags & IB_DEVICE_SG_GAPS_REG) ia->ri_mrtype = IB_MR_TYPE_SG_GAPS; - ia->ri_max_frmr_depth = + ia->ri_max_frwr_depth = min_t(unsigned int, RPCRDMA_MAX_DATA_SEGS, attrs->max_fast_reg_page_list_len); dprintk("RPC: %s: device's max FR page list len = %u\n", - __func__, ia->ri_max_frmr_depth); - - /* Add room for frmr register and invalidate WRs. - * 1. FRMR reg WR for head - * 2. FRMR invalidate WR for head - * 3. N FRMR reg WRs for pagelist - * 4. N FRMR invalidate WRs for pagelist - * 5. FRMR reg WR for tail - * 6. FRMR invalidate WR for tail + __func__, ia->ri_max_frwr_depth); + + /* Add room for frwr register and invalidate WRs. + * 1. FRWR reg WR for head + * 2. FRWR invalidate WR for head + * 3. N FRWR reg WRs for pagelist + * 4. N FRWR invalidate WRs for pagelist + * 5. FRWR reg WR for tail + * 6. FRWR invalidate WR for tail * 7. The RDMA_SEND WR */ depth = 7; - /* Calculate N if the device max FRMR depth is smaller than + /* Calculate N if the device max FRWR depth is smaller than * RPCRDMA_MAX_DATA_SEGS. 
*/ - if (ia->ri_max_frmr_depth < RPCRDMA_MAX_DATA_SEGS) { - delta = RPCRDMA_MAX_DATA_SEGS - ia->ri_max_frmr_depth; + if (ia->ri_max_frwr_depth < RPCRDMA_MAX_DATA_SEGS) { + delta = RPCRDMA_MAX_DATA_SEGS - ia->ri_max_frwr_depth; do { - depth += 2; /* FRMR reg + invalidate */ - delta -= ia->ri_max_frmr_depth; + depth += 2; /* FRWR reg + invalidate */ + delta -= ia->ri_max_frwr_depth; } while (delta > 0); } @@ -252,7 +254,7 @@ frwr_op_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep, } ia->ri_max_segs = max_t(unsigned int, 1, RPCRDMA_MAX_DATA_SEGS / - ia->ri_max_frmr_depth); + ia->ri_max_frwr_depth); return 0; } @@ -265,7 +267,7 @@ frwr_op_maxpages(struct rpcrdma_xprt *r_xprt) struct rpcrdma_ia *ia = &r_xprt->rx_ia; return min_t(unsigned int, RPCRDMA_MAX_DATA_SEGS, - RPCRDMA_MAX_HDR_SEGS * ia->ri_max_frmr_depth); + RPCRDMA_MAX_HDR_SEGS * ia->ri_max_frwr_depth); } static void @@ -286,16 +288,16 @@ __frwr_sendcompletion_flush(struct ib_wc *wc, const char *wr) static void frwr_wc_fastreg(struct ib_cq *cq, struct ib_wc *wc) { - struct rpcrdma_frmr *frmr; - struct ib_cqe *cqe; + struct ib_cqe *cqe = wc->wr_cqe; + struct rpcrdma_frwr *frwr = + container_of(cqe, struct rpcrdma_frwr, fr_cqe); /* WARNING: Only wr_cqe and status are reliable at this point */ if (wc->status != IB_WC_SUCCESS) { - cqe = wc->wr_cqe; - frmr = container_of(cqe, struct rpcrdma_frmr, fr_cqe); - frmr->fr_state = FRMR_FLUSHED_FR; + frwr->fr_state = FRWR_FLUSHED_FR; __frwr_sendcompletion_flush(wc, "fastreg"); } + trace_xprtrdma_wc_fastreg(wc, frwr); } /** @@ -307,16 +309,16 @@ frwr_wc_fastreg(struct ib_cq *cq, struct ib_wc *wc) static void frwr_wc_localinv(struct ib_cq *cq, struct ib_wc *wc) { - struct rpcrdma_frmr *frmr; - struct ib_cqe *cqe; + struct ib_cqe *cqe = wc->wr_cqe; + struct rpcrdma_frwr *frwr = container_of(cqe, struct rpcrdma_frwr, + fr_cqe); /* WARNING: Only wr_cqe and status are reliable at this point */ if (wc->status != IB_WC_SUCCESS) { - cqe = wc->wr_cqe; - frmr = container_of(cqe, struct rpcrdma_frmr, fr_cqe); - frmr->fr_state = FRMR_FLUSHED_LI; + frwr->fr_state = FRWR_FLUSHED_LI; __frwr_sendcompletion_flush(wc, "localinv"); } + trace_xprtrdma_wc_li(wc, frwr); } /** @@ -329,17 +331,17 @@ frwr_wc_localinv(struct ib_cq *cq, struct ib_wc *wc) static void frwr_wc_localinv_wake(struct ib_cq *cq, struct ib_wc *wc) { - struct rpcrdma_frmr *frmr; - struct ib_cqe *cqe; + struct ib_cqe *cqe = wc->wr_cqe; + struct rpcrdma_frwr *frwr = container_of(cqe, struct rpcrdma_frwr, + fr_cqe); /* WARNING: Only wr_cqe and status are reliable at this point */ - cqe = wc->wr_cqe; - frmr = container_of(cqe, struct rpcrdma_frmr, fr_cqe); if (wc->status != IB_WC_SUCCESS) { - frmr->fr_state = FRMR_FLUSHED_LI; + frwr->fr_state = FRWR_FLUSHED_LI; __frwr_sendcompletion_flush(wc, "localinv"); } - complete(&frmr->fr_linv_done); + complete(&frwr->fr_linv_done); + trace_xprtrdma_wc_li_wake(wc, frwr); } /* Post a REG_MR Work Request to register a memory region @@ -347,41 +349,39 @@ frwr_wc_localinv_wake(struct ib_cq *cq, struct ib_wc *wc) */ static struct rpcrdma_mr_seg * frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, - int nsegs, bool writing, struct rpcrdma_mw **out) + int nsegs, bool writing, struct rpcrdma_mr **out) { struct rpcrdma_ia *ia = &r_xprt->rx_ia; bool holes_ok = ia->ri_mrtype == IB_MR_TYPE_SG_GAPS; - struct rpcrdma_mw *mw; - struct rpcrdma_frmr *frmr; - struct ib_mr *mr; + struct rpcrdma_frwr *frwr; + struct rpcrdma_mr *mr; + struct ib_mr *ibmr; struct ib_reg_wr *reg_wr; struct ib_send_wr *bad_wr; int rc, i, 
n; u8 key; - mw = NULL; + mr = NULL; do { - if (mw) - rpcrdma_defer_mr_recovery(mw); - mw = rpcrdma_get_mw(r_xprt); - if (!mw) + if (mr) + rpcrdma_mr_defer_recovery(mr); + mr = rpcrdma_mr_get(r_xprt); + if (!mr) return ERR_PTR(-ENOBUFS); - } while (mw->frmr.fr_state != FRMR_IS_INVALID); - frmr = &mw->frmr; - frmr->fr_state = FRMR_IS_VALID; - mr = frmr->fr_mr; - reg_wr = &frmr->fr_regwr; - - if (nsegs > ia->ri_max_frmr_depth) - nsegs = ia->ri_max_frmr_depth; + } while (mr->frwr.fr_state != FRWR_IS_INVALID); + frwr = &mr->frwr; + frwr->fr_state = FRWR_IS_VALID; + + if (nsegs > ia->ri_max_frwr_depth) + nsegs = ia->ri_max_frwr_depth; for (i = 0; i < nsegs;) { if (seg->mr_page) - sg_set_page(&mw->mw_sg[i], + sg_set_page(&mr->mr_sg[i], seg->mr_page, seg->mr_len, offset_in_page(seg->mr_offset)); else - sg_set_buf(&mw->mw_sg[i], seg->mr_offset, + sg_set_buf(&mr->mr_sg[i], seg->mr_offset, seg->mr_len); ++seg; @@ -392,30 +392,29 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len)) break; } - mw->mw_dir = rpcrdma_data_dir(writing); + mr->mr_dir = rpcrdma_data_dir(writing); - mw->mw_nents = ib_dma_map_sg(ia->ri_device, mw->mw_sg, i, mw->mw_dir); - if (!mw->mw_nents) + mr->mr_nents = ib_dma_map_sg(ia->ri_device, mr->mr_sg, i, mr->mr_dir); + if (!mr->mr_nents) goto out_dmamap_err; - n = ib_map_mr_sg(mr, mw->mw_sg, mw->mw_nents, NULL, PAGE_SIZE); - if (unlikely(n != mw->mw_nents)) + ibmr = frwr->fr_mr; + n = ib_map_mr_sg(ibmr, mr->mr_sg, mr->mr_nents, NULL, PAGE_SIZE); + if (unlikely(n != mr->mr_nents)) goto out_mapmr_err; - dprintk("RPC: %s: Using frmr %p to map %u segments (%llu bytes)\n", - __func__, frmr, mw->mw_nents, mr->length); - - key = (u8)(mr->rkey & 0x000000FF); - ib_update_fast_reg_key(mr, ++key); + key = (u8)(ibmr->rkey & 0x000000FF); + ib_update_fast_reg_key(ibmr, ++key); + reg_wr = &frwr->fr_regwr; reg_wr->wr.next = NULL; reg_wr->wr.opcode = IB_WR_REG_MR; - frmr->fr_cqe.done = frwr_wc_fastreg; - reg_wr->wr.wr_cqe = &frmr->fr_cqe; + frwr->fr_cqe.done = frwr_wc_fastreg; + reg_wr->wr.wr_cqe = &frwr->fr_cqe; reg_wr->wr.num_sge = 0; reg_wr->wr.send_flags = 0; - reg_wr->mr = mr; - reg_wr->key = mr->rkey; + reg_wr->mr = ibmr; + reg_wr->key = ibmr->rkey; reg_wr->access = writing ? 
IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE : IB_ACCESS_REMOTE_READ; @@ -424,47 +423,64 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, if (rc) goto out_senderr; - mw->mw_handle = mr->rkey; - mw->mw_length = mr->length; - mw->mw_offset = mr->iova; + mr->mr_handle = ibmr->rkey; + mr->mr_length = ibmr->length; + mr->mr_offset = ibmr->iova; - *out = mw; + *out = mr; return seg; out_dmamap_err: pr_err("rpcrdma: failed to DMA map sg %p sg_nents %d\n", - mw->mw_sg, i); - frmr->fr_state = FRMR_IS_INVALID; - rpcrdma_put_mw(r_xprt, mw); + mr->mr_sg, i); + frwr->fr_state = FRWR_IS_INVALID; + rpcrdma_mr_put(mr); return ERR_PTR(-EIO); out_mapmr_err: pr_err("rpcrdma: failed to map mr %p (%d/%d)\n", - frmr->fr_mr, n, mw->mw_nents); - rpcrdma_defer_mr_recovery(mw); + frwr->fr_mr, n, mr->mr_nents); + rpcrdma_mr_defer_recovery(mr); return ERR_PTR(-EIO); out_senderr: - pr_err("rpcrdma: FRMR registration ib_post_send returned %i\n", rc); - rpcrdma_defer_mr_recovery(mw); + pr_err("rpcrdma: FRWR registration ib_post_send returned %i\n", rc); + rpcrdma_mr_defer_recovery(mr); return ERR_PTR(-ENOTCONN); } +/* Handle a remotely invalidated mr on the @mrs list + */ +static void +frwr_op_reminv(struct rpcrdma_rep *rep, struct list_head *mrs) +{ + struct rpcrdma_mr *mr; + + list_for_each_entry(mr, mrs, mr_list) + if (mr->mr_handle == rep->rr_inv_rkey) { + list_del(&mr->mr_list); + trace_xprtrdma_remoteinv(mr); + mr->frwr.fr_state = FRWR_IS_INVALID; + rpcrdma_mr_unmap_and_put(mr); + break; /* only one invalidated MR per RPC */ + } +} + /* Invalidate all memory regions that were registered for "req". * * Sleeps until it is safe for the host CPU to access the * previously mapped memory regions. * - * Caller ensures that @mws is not empty before the call. This + * Caller ensures that @mrs is not empty before the call. This * function empties the list. */ static void -frwr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct list_head *mws) +frwr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct list_head *mrs) { struct ib_send_wr *first, **prev, *last, *bad_wr; struct rpcrdma_ia *ia = &r_xprt->rx_ia; - struct rpcrdma_frmr *f; - struct rpcrdma_mw *mw; + struct rpcrdma_frwr *frwr; + struct rpcrdma_mr *mr; int count, rc; /* ORDER: Invalidate all of the MRs first @@ -472,31 +488,27 @@ frwr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct list_head *mws) * Chain the LOCAL_INV Work Requests and post them with * a single ib_post_send() call. */ - f = NULL; + frwr = NULL; count = 0; prev = &first; - list_for_each_entry(mw, mws, mw_list) { - mw->frmr.fr_state = FRMR_IS_INVALID; + list_for_each_entry(mr, mrs, mr_list) { + mr->frwr.fr_state = FRWR_IS_INVALID; - if (mw->mw_flags & RPCRDMA_MW_F_RI) - continue; + frwr = &mr->frwr; + trace_xprtrdma_localinv(mr); - f = &mw->frmr; - dprintk("RPC: %s: invalidating frmr %p\n", - __func__, f); - - f->fr_cqe.done = frwr_wc_localinv; - last = &f->fr_invwr; + frwr->fr_cqe.done = frwr_wc_localinv; + last = &frwr->fr_invwr; memset(last, 0, sizeof(*last)); - last->wr_cqe = &f->fr_cqe; + last->wr_cqe = &frwr->fr_cqe; last->opcode = IB_WR_LOCAL_INV; - last->ex.invalidate_rkey = mw->mw_handle; + last->ex.invalidate_rkey = mr->mr_handle; count++; *prev = last; prev = &last->next; } - if (!f) + if (!frwr) goto unmap; /* Strong send queue ordering guarantees that when the @@ -504,8 +516,8 @@ frwr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct list_head *mws) * are complete. 
*/ last->send_flags = IB_SEND_SIGNALED; - f->fr_cqe.done = frwr_wc_localinv_wake; - reinit_completion(&f->fr_linv_done); + frwr->fr_cqe.done = frwr_wc_localinv_wake; + reinit_completion(&frwr->fr_linv_done); /* Transport disconnect drains the receive CQ before it * replaces the QP. The RPC reply handler won't call us @@ -515,36 +527,32 @@ frwr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct list_head *mws) bad_wr = NULL; rc = ib_post_send(ia->ri_id->qp, first, &bad_wr); if (bad_wr != first) - wait_for_completion(&f->fr_linv_done); + wait_for_completion(&frwr->fr_linv_done); if (rc) goto reset_mrs; /* ORDER: Now DMA unmap all of the MRs, and return - * them to the free MW list. + * them to the free MR list. */ unmap: - while (!list_empty(mws)) { - mw = rpcrdma_pop_mw(mws); - dprintk("RPC: %s: DMA unmapping frmr %p\n", - __func__, &mw->frmr); - ib_dma_unmap_sg(ia->ri_device, - mw->mw_sg, mw->mw_nents, mw->mw_dir); - rpcrdma_put_mw(r_xprt, mw); + while (!list_empty(mrs)) { + mr = rpcrdma_mr_pop(mrs); + rpcrdma_mr_unmap_and_put(mr); } return; reset_mrs: - pr_err("rpcrdma: FRMR invalidate ib_post_send returned %i\n", rc); + pr_err("rpcrdma: FRWR invalidate ib_post_send returned %i\n", rc); /* Find and reset the MRs in the LOCAL_INV WRs that did not * get posted. */ while (bad_wr) { - f = container_of(bad_wr, struct rpcrdma_frmr, - fr_invwr); - mw = container_of(f, struct rpcrdma_mw, frmr); + frwr = container_of(bad_wr, struct rpcrdma_frwr, + fr_invwr); + mr = container_of(frwr, struct rpcrdma_mr, frwr); - __frwr_reset_mr(ia, mw); + __frwr_mr_reset(ia, mr); bad_wr = bad_wr->next; } @@ -553,6 +561,7 @@ reset_mrs: const struct rpcrdma_memreg_ops rpcrdma_frwr_memreg_ops = { .ro_map = frwr_op_map, + .ro_reminv = frwr_op_reminv, .ro_unmap_sync = frwr_op_unmap_sync, .ro_recover_mr = frwr_op_recover_mr, .ro_open = frwr_op_open, diff --git a/net/sunrpc/xprtrdma/module.c b/net/sunrpc/xprtrdma/module.c index 560712bd9fa2..a762d192372b 100644 --- a/net/sunrpc/xprtrdma/module.c +++ b/net/sunrpc/xprtrdma/module.c @@ -1,18 +1,20 @@ /* - * Copyright (c) 2015 Oracle. All rights reserved. + * Copyright (c) 2015, 2017 Oracle. All rights reserved. 
*/ /* rpcrdma.ko module initialization */ +#include <linux/types.h> +#include <linux/compiler.h> #include <linux/module.h> #include <linux/init.h> #include <linux/sunrpc/svc_rdma.h> -#include "xprt_rdma.h" -#if IS_ENABLED(CONFIG_SUNRPC_DEBUG) -# define RPCDBG_FACILITY RPCDBG_TRANS -#endif +#include <asm/swab.h> + +#define CREATE_TRACE_POINTS +#include "xprt_rdma.h" MODULE_AUTHOR("Open Grid Computing and Network Appliance, Inc."); MODULE_DESCRIPTION("RPC/RDMA Transport"); diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c index a3f2ab283aeb..162e5dd82466 100644 --- a/net/sunrpc/xprtrdma/rpc_rdma.c +++ b/net/sunrpc/xprtrdma/rpc_rdma.c @@ -292,15 +292,15 @@ encode_item_not_present(struct xdr_stream *xdr) } static void -xdr_encode_rdma_segment(__be32 *iptr, struct rpcrdma_mw *mw) +xdr_encode_rdma_segment(__be32 *iptr, struct rpcrdma_mr *mr) { - *iptr++ = cpu_to_be32(mw->mw_handle); - *iptr++ = cpu_to_be32(mw->mw_length); - xdr_encode_hyper(iptr, mw->mw_offset); + *iptr++ = cpu_to_be32(mr->mr_handle); + *iptr++ = cpu_to_be32(mr->mr_length); + xdr_encode_hyper(iptr, mr->mr_offset); } static int -encode_rdma_segment(struct xdr_stream *xdr, struct rpcrdma_mw *mw) +encode_rdma_segment(struct xdr_stream *xdr, struct rpcrdma_mr *mr) { __be32 *p; @@ -308,12 +308,12 @@ encode_rdma_segment(struct xdr_stream *xdr, struct rpcrdma_mw *mw) if (unlikely(!p)) return -EMSGSIZE; - xdr_encode_rdma_segment(p, mw); + xdr_encode_rdma_segment(p, mr); return 0; } static int -encode_read_segment(struct xdr_stream *xdr, struct rpcrdma_mw *mw, +encode_read_segment(struct xdr_stream *xdr, struct rpcrdma_mr *mr, u32 position) { __be32 *p; @@ -324,7 +324,7 @@ encode_read_segment(struct xdr_stream *xdr, struct rpcrdma_mw *mw, *p++ = xdr_one; /* Item present */ *p++ = cpu_to_be32(position); - xdr_encode_rdma_segment(p, mw); + xdr_encode_rdma_segment(p, mr); return 0; } @@ -348,7 +348,7 @@ rpcrdma_encode_read_list(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, { struct xdr_stream *xdr = &req->rl_stream; struct rpcrdma_mr_seg *seg; - struct rpcrdma_mw *mw; + struct rpcrdma_mr *mr; unsigned int pos; int nsegs; @@ -363,21 +363,17 @@ rpcrdma_encode_read_list(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, do { seg = r_xprt->rx_ia.ri_ops->ro_map(r_xprt, seg, nsegs, - false, &mw); + false, &mr); if (IS_ERR(seg)) return PTR_ERR(seg); - rpcrdma_push_mw(mw, &req->rl_registered); + rpcrdma_mr_push(mr, &req->rl_registered); - if (encode_read_segment(xdr, mw, pos) < 0) + if (encode_read_segment(xdr, mr, pos) < 0) return -EMSGSIZE; - dprintk("RPC: %5u %s: pos %u %u@0x%016llx:0x%08x (%s)\n", - rqst->rq_task->tk_pid, __func__, pos, - mw->mw_length, (unsigned long long)mw->mw_offset, - mw->mw_handle, mw->mw_nents < nsegs ? 
"more" : "last"); - + trace_xprtrdma_read_chunk(rqst->rq_task, pos, mr, nsegs); r_xprt->rx_stats.read_chunk_count++; - nsegs -= mw->mw_nents; + nsegs -= mr->mr_nents; } while (nsegs); return 0; @@ -404,7 +400,7 @@ rpcrdma_encode_write_list(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, { struct xdr_stream *xdr = &req->rl_stream; struct rpcrdma_mr_seg *seg; - struct rpcrdma_mw *mw; + struct rpcrdma_mr *mr; int nsegs, nchunks; __be32 *segcount; @@ -425,23 +421,19 @@ rpcrdma_encode_write_list(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, nchunks = 0; do { seg = r_xprt->rx_ia.ri_ops->ro_map(r_xprt, seg, nsegs, - true, &mw); + true, &mr); if (IS_ERR(seg)) return PTR_ERR(seg); - rpcrdma_push_mw(mw, &req->rl_registered); + rpcrdma_mr_push(mr, &req->rl_registered); - if (encode_rdma_segment(xdr, mw) < 0) + if (encode_rdma_segment(xdr, mr) < 0) return -EMSGSIZE; - dprintk("RPC: %5u %s: %u@0x016%llx:0x%08x (%s)\n", - rqst->rq_task->tk_pid, __func__, - mw->mw_length, (unsigned long long)mw->mw_offset, - mw->mw_handle, mw->mw_nents < nsegs ? "more" : "last"); - + trace_xprtrdma_write_chunk(rqst->rq_task, mr, nsegs); r_xprt->rx_stats.write_chunk_count++; - r_xprt->rx_stats.total_rdma_request += seg->mr_len; + r_xprt->rx_stats.total_rdma_request += mr->mr_length; nchunks++; - nsegs -= mw->mw_nents; + nsegs -= mr->mr_nents; } while (nsegs); /* Update count of segments in this Write chunk */ @@ -468,7 +460,7 @@ rpcrdma_encode_reply_chunk(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, { struct xdr_stream *xdr = &req->rl_stream; struct rpcrdma_mr_seg *seg; - struct rpcrdma_mw *mw; + struct rpcrdma_mr *mr; int nsegs, nchunks; __be32 *segcount; @@ -487,23 +479,19 @@ rpcrdma_encode_reply_chunk(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, nchunks = 0; do { seg = r_xprt->rx_ia.ri_ops->ro_map(r_xprt, seg, nsegs, - true, &mw); + true, &mr); if (IS_ERR(seg)) return PTR_ERR(seg); - rpcrdma_push_mw(mw, &req->rl_registered); + rpcrdma_mr_push(mr, &req->rl_registered); - if (encode_rdma_segment(xdr, mw) < 0) + if (encode_rdma_segment(xdr, mr) < 0) return -EMSGSIZE; - dprintk("RPC: %5u %s: %u@0x%016llx:0x%08x (%s)\n", - rqst->rq_task->tk_pid, __func__, - mw->mw_length, (unsigned long long)mw->mw_offset, - mw->mw_handle, mw->mw_nents < nsegs ? "more" : "last"); - + trace_xprtrdma_reply_chunk(rqst->rq_task, mr, nsegs); r_xprt->rx_stats.reply_chunk_count++; - r_xprt->rx_stats.total_rdma_request += seg->mr_len; + r_xprt->rx_stats.total_rdma_request += mr->mr_length; nchunks++; - nsegs -= mw->mw_nents; + nsegs -= mr->mr_nents; } while (nsegs); /* Update count of segments in the Reply chunk */ @@ -524,9 +512,6 @@ rpcrdma_unmap_sendctx(struct rpcrdma_sendctx *sc) struct ib_sge *sge; unsigned int count; - dprintk("RPC: %s: unmapping %u sges for sc=%p\n", - __func__, sc->sc_unmap_count, sc); - /* The first two SGEs contain the transport header and * the inline buffer. These are always left mapped so * they can be cheaply re-used. @@ -754,11 +739,6 @@ rpcrdma_marshal_req(struct rpcrdma_xprt *r_xprt, struct rpc_rqst *rqst) __be32 *p; int ret; -#if defined(CONFIG_SUNRPC_BACKCHANNEL) - if (test_bit(RPC_BC_PA_IN_USE, &rqst->rq_bc_pa_state)) - return rpcrdma_bc_marshal_reply(rqst); -#endif - rpcrdma_set_xdrlen(&req->rl_hdrbuf, 0); xdr_init_encode(xdr, &req->rl_hdrbuf, req->rl_rdmabuf->rg_base); @@ -821,6 +801,17 @@ rpcrdma_marshal_req(struct rpcrdma_xprt *r_xprt, struct rpc_rqst *rqst) rtype = rpcrdma_areadch; } + /* If this is a retransmit, discard previously registered + * chunks. 
Very likely the connection has been replaced, + * so these registrations are invalid and unusable. + */ + while (unlikely(!list_empty(&req->rl_registered))) { + struct rpcrdma_mr *mr; + + mr = rpcrdma_mr_pop(&req->rl_registered); + rpcrdma_mr_defer_recovery(mr); + } + /* This implementation supports the following combinations * of chunk lists in one RPC-over-RDMA Call message: * @@ -868,10 +859,7 @@ rpcrdma_marshal_req(struct rpcrdma_xprt *r_xprt, struct rpc_rqst *rqst) if (ret) goto out_err; - dprintk("RPC: %5u %s: %s/%s: hdrlen %u rpclen\n", - rqst->rq_task->tk_pid, __func__, - transfertypes[rtype], transfertypes[wtype], - xdr_stream_pos(xdr)); + trace_xprtrdma_marshal(rqst, xdr_stream_pos(xdr), rtype, wtype); ret = rpcrdma_prepare_send_sges(r_xprt, req, xdr_stream_pos(xdr), &rqst->rq_snd_buf, rtype); @@ -926,8 +914,7 @@ rpcrdma_inline_fixup(struct rpc_rqst *rqst, char *srcp, int copy_len, int pad) curlen = rqst->rq_rcv_buf.head[0].iov_len; if (curlen > copy_len) curlen = copy_len; - dprintk("RPC: %s: srcp 0x%p len %d hdrlen %d\n", - __func__, srcp, copy_len, curlen); + trace_xprtrdma_fixup(rqst, copy_len, curlen); srcp += curlen; copy_len -= curlen; @@ -947,9 +934,8 @@ rpcrdma_inline_fixup(struct rpc_rqst *rqst, char *srcp, int copy_len, int pad) if (curlen > pagelist_len) curlen = pagelist_len; - dprintk("RPC: %s: page %d" - " srcp 0x%p len %d curlen %d\n", - __func__, i, srcp, copy_len, curlen); + trace_xprtrdma_fixup_pg(rqst, i, srcp, + copy_len, curlen); destp = kmap_atomic(ppages[i]); memcpy(destp + page_base, srcp, curlen); flush_dcache_page(ppages[i]); @@ -984,24 +970,6 @@ rpcrdma_inline_fixup(struct rpc_rqst *rqst, char *srcp, int copy_len, int pad) return fixup_copy_count; } -/* Caller must guarantee @rep remains stable during this call. - */ -static void -rpcrdma_mark_remote_invalidation(struct list_head *mws, - struct rpcrdma_rep *rep) -{ - struct rpcrdma_mw *mw; - - if (!(rep->rr_wc_flags & IB_WC_WITH_INVALIDATE)) - return; - - list_for_each_entry(mw, mws, mw_list) - if (mw->mw_handle == rep->rr_inv_rkey) { - mw->mw_flags = RPCRDMA_MW_F_RI; - break; /* only one invalidated MR per RPC */ - } -} - /* By convention, backchannel calls arrive via rdma_msg type * messages, and never populate the chunk lists. This makes * the RPC/RDMA header small and fixed in size, so it is @@ -1058,26 +1026,19 @@ out_short: static int decode_rdma_segment(struct xdr_stream *xdr, u32 *length) { + u32 handle; + u64 offset; __be32 *p; p = xdr_inline_decode(xdr, 4 * sizeof(*p)); if (unlikely(!p)) return -EIO; - ifdebug(FACILITY) { - u64 offset; - u32 handle; - - handle = be32_to_cpup(p++); - *length = be32_to_cpup(p++); - xdr_decode_hyper(p, &offset); - dprintk("RPC: %s: segment %u@0x%016llx:0x%08x\n", - __func__, *length, (unsigned long long)offset, - handle); - } else { - *length = be32_to_cpup(p + 1); - } + handle = be32_to_cpup(p++); + *length = be32_to_cpup(p++); + xdr_decode_hyper(p, &offset); + trace_xprtrdma_decode_seg(handle, *length, offset); return 0; } @@ -1098,8 +1059,6 @@ static int decode_write_chunk(struct xdr_stream *xdr, u32 *length) *length += seglength; } - dprintk("RPC: %s: segcount=%u, %u bytes\n", - __func__, be32_to_cpup(p), *length); return 0; } @@ -1296,8 +1255,7 @@ out: * being marshaled. 
*/ out_badheader: - dprintk("RPC: %5u %s: invalid rpcrdma reply (type %u)\n", - rqst->rq_task->tk_pid, __func__, be32_to_cpu(rep->rr_proc)); + trace_xprtrdma_reply_hdr(rep); r_xprt->rx_stats.bad_reply_count++; status = -EIO; goto out; @@ -1339,9 +1297,12 @@ void rpcrdma_deferred_completion(struct work_struct *work) struct rpcrdma_rep *rep = container_of(work, struct rpcrdma_rep, rr_work); struct rpcrdma_req *req = rpcr_to_rdmar(rep->rr_rqst); + struct rpcrdma_xprt *r_xprt = rep->rr_rxprt; - rpcrdma_mark_remote_invalidation(&req->rl_registered, rep); - rpcrdma_release_rqst(rep->rr_rxprt, req); + trace_xprtrdma_defer_cmp(rep); + if (rep->rr_wc_flags & IB_WC_WITH_INVALIDATE) + r_xprt->rx_ia.ri_ops->ro_reminv(rep, &req->rl_registered); + rpcrdma_release_rqst(r_xprt, req); rpcrdma_complete_rqst(rep); } @@ -1360,8 +1321,6 @@ void rpcrdma_reply_handler(struct rpcrdma_rep *rep) u32 credits; __be32 *p; - dprintk("RPC: %s: incoming rep %p\n", __func__, rep); - if (rep->rr_hdrbuf.head[0].iov_len == 0) goto out_badstatus; @@ -1405,8 +1364,7 @@ void rpcrdma_reply_handler(struct rpcrdma_rep *rep) rep->rr_rqst = rqst; clear_bit(RPCRDMA_REQ_F_PENDING, &req->rl_flags); - dprintk("RPC: %s: reply %p completes request %p (xid 0x%08x)\n", - __func__, rep, req, be32_to_cpu(rep->rr_xid)); + trace_xprtrdma_reply(rqst->rq_task, rep, req, credits); queue_work_on(req->rl_cpu, rpcrdma_receive_wq, &rep->rr_work); return; @@ -1420,8 +1378,7 @@ out_badstatus: return; out_badversion: - dprintk("RPC: %s: invalid version %d\n", - __func__, be32_to_cpu(rep->rr_vers)); + trace_xprtrdma_reply_vers(rep); goto repost; /* The RPC transaction has already been terminated, or the header @@ -1429,12 +1386,11 @@ out_badversion: */ out_norqst: spin_unlock(&xprt->recv_lock); - dprintk("RPC: %s: no match for incoming xid 0x%08x\n", - __func__, be32_to_cpu(rep->rr_xid)); + trace_xprtrdma_reply_rqst(rep); goto repost; out_shortreply: - dprintk("RPC: %s: short/invalid reply\n", __func__); + trace_xprtrdma_reply_short(rep); /* If no pending RPC transaction was matched, post a replacement * receive buffer before returning. diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c index 6ee1ad8978f3..4b1ecfe979cf 100644 --- a/net/sunrpc/xprtrdma/transport.c +++ b/net/sunrpc/xprtrdma/transport.c @@ -67,8 +67,7 @@ static unsigned int xprt_rdma_slot_table_entries = RPCRDMA_DEF_SLOT_TABLE; unsigned int xprt_rdma_max_inline_read = RPCRDMA_DEF_INLINE; static unsigned int xprt_rdma_max_inline_write = RPCRDMA_DEF_INLINE; -static unsigned int xprt_rdma_inline_write_padding; -unsigned int xprt_rdma_memreg_strategy = RPCRDMA_FRMR; +unsigned int xprt_rdma_memreg_strategy = RPCRDMA_FRWR; int xprt_rdma_pad_optimize; #if IS_ENABLED(CONFIG_SUNRPC_DEBUG) @@ -81,6 +80,7 @@ static unsigned int zero; static unsigned int max_padding = PAGE_SIZE; static unsigned int min_memreg = RPCRDMA_BOUNCEBUFFERS; static unsigned int max_memreg = RPCRDMA_LAST - 1; +static unsigned int dummy; static struct ctl_table_header *sunrpc_table_header; @@ -114,7 +114,7 @@ static struct ctl_table xr_tunables_table[] = { }, { .procname = "rdma_inline_write_padding", - .data = &xprt_rdma_inline_write_padding, + .data = &dummy, .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_minmax, @@ -259,13 +259,10 @@ xprt_rdma_connect_worker(struct work_struct *work) xprt_clear_connected(xprt); - dprintk("RPC: %s: %sconnect\n", __func__, - r_xprt->rx_ep.rep_connected != 0 ? 
"re" : ""); rc = rpcrdma_ep_connect(&r_xprt->rx_ep, &r_xprt->rx_ia); if (rc) xprt_wake_pending_tasks(xprt, rc); - dprintk("RPC: %s: exit\n", __func__); xprt_clear_connecting(xprt); } @@ -275,7 +272,7 @@ xprt_rdma_inject_disconnect(struct rpc_xprt *xprt) struct rpcrdma_xprt *r_xprt = container_of(xprt, struct rpcrdma_xprt, rx_xprt); - pr_info("rpcrdma: injecting transport disconnect on xprt=%p\n", xprt); + trace_xprtrdma_inject_dsc(r_xprt); rdma_disconnect(r_xprt->rx_ia.ri_id); } @@ -295,7 +292,7 @@ xprt_rdma_destroy(struct rpc_xprt *xprt) { struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); - dprintk("RPC: %s: called\n", __func__); + trace_xprtrdma_destroy(r_xprt); cancel_delayed_work_sync(&r_xprt->rx_connect_worker); @@ -306,11 +303,8 @@ xprt_rdma_destroy(struct rpc_xprt *xprt) rpcrdma_ia_close(&r_xprt->rx_ia); xprt_rdma_free_addresses(xprt); - xprt_free(xprt); - dprintk("RPC: %s: returning\n", __func__); - module_put(THIS_MODULE); } @@ -361,9 +355,7 @@ xprt_setup_rdma(struct xprt_create *args) /* * Set up RDMA-specific connect data. */ - - sap = (struct sockaddr *)&cdata.addr; - memcpy(sap, args->dstaddr, args->addrlen); + sap = args->dstaddr; /* Ensure xprt->addr holds valid server TCP (not RDMA) * address, for any side protocols which peek at it */ @@ -373,6 +365,7 @@ xprt_setup_rdma(struct xprt_create *args) if (rpc_get_port(sap)) xprt_set_bound(xprt); + xprt_rdma_format_addresses(xprt, sap); cdata.max_requests = xprt->max_reqs; @@ -387,8 +380,6 @@ xprt_setup_rdma(struct xprt_create *args) if (cdata.inline_rsize > cdata.rsize) cdata.inline_rsize = cdata.rsize; - cdata.padding = xprt_rdma_inline_write_padding; - /* * Create new transport instance, which includes initialized * o ia @@ -398,7 +389,7 @@ xprt_setup_rdma(struct xprt_create *args) new_xprt = rpcx_to_rdmax(xprt); - rc = rpcrdma_ia_open(new_xprt, sap); + rc = rpcrdma_ia_open(new_xprt); if (rc) goto out1; @@ -407,31 +398,19 @@ xprt_setup_rdma(struct xprt_create *args) */ new_xprt->rx_data = cdata; new_ep = &new_xprt->rx_ep; - new_ep->rep_remote_addr = cdata.addr; rc = rpcrdma_ep_create(&new_xprt->rx_ep, &new_xprt->rx_ia, &new_xprt->rx_data); if (rc) goto out2; - /* - * Allocate pre-registered send and receive buffers for headers and - * any inline data. Also specify any padding which will be provided - * from a preregistered zero buffer. - */ rc = rpcrdma_buffer_create(new_xprt); if (rc) goto out3; - /* - * Register a callback for connection events. This is necessary because - * connection loss notification is async. We also catch connection loss - * when reaping receives. 
- */ INIT_DELAYED_WORK(&new_xprt->rx_connect_worker, xprt_rdma_connect_worker); - xprt_rdma_format_addresses(xprt, sap); xprt->max_payload = new_xprt->rx_ia.ri_ops->ro_maxpages(new_xprt); if (xprt->max_payload == 0) goto out4; @@ -445,16 +424,19 @@ xprt_setup_rdma(struct xprt_create *args) dprintk("RPC: %s: %s:%s\n", __func__, xprt->address_strings[RPC_DISPLAY_ADDR], xprt->address_strings[RPC_DISPLAY_PORT]); + trace_xprtrdma_create(new_xprt); return xprt; out4: - xprt_rdma_free_addresses(xprt); - rc = -EINVAL; + rpcrdma_buffer_destroy(&new_xprt->rx_buf); + rc = -ENODEV; out3: rpcrdma_ep_destroy(new_ep, &new_xprt->rx_ia); out2: rpcrdma_ia_close(&new_xprt->rx_ia); out1: + trace_xprtrdma_destroy(new_xprt); + xprt_rdma_free_addresses(xprt); xprt_free(xprt); return ERR_PTR(rc); } @@ -488,16 +470,34 @@ xprt_rdma_close(struct rpc_xprt *xprt) rpcrdma_ep_disconnect(ep, ia); } +/** + * xprt_rdma_set_port - update server port with rpcbind result + * @xprt: controlling RPC transport + * @port: new port value + * + * Transport connect status is unchanged. + */ static void xprt_rdma_set_port(struct rpc_xprt *xprt, u16 port) { - struct sockaddr_in *sap; + struct sockaddr *sap = (struct sockaddr *)&xprt->addr; + char buf[8]; - sap = (struct sockaddr_in *)&xprt->addr; - sap->sin_port = htons(port); - sap = (struct sockaddr_in *)&rpcx_to_rdmad(xprt).addr; - sap->sin_port = htons(port); - dprintk("RPC: %s: %u\n", __func__, port); + dprintk("RPC: %s: setting port for xprt %p (%s:%s) to %u\n", + __func__, xprt, + xprt->address_strings[RPC_DISPLAY_ADDR], + xprt->address_strings[RPC_DISPLAY_PORT], + port); + + rpc_set_port(sap, port); + + kfree(xprt->address_strings[RPC_DISPLAY_PORT]); + snprintf(buf, sizeof(buf), "%u", port); + xprt->address_strings[RPC_DISPLAY_PORT] = kstrdup(buf, GFP_KERNEL); + + kfree(xprt->address_strings[RPC_DISPLAY_HEX_PORT]); + snprintf(buf, sizeof(buf), "%4hx", port); + xprt->address_strings[RPC_DISPLAY_HEX_PORT] = kstrdup(buf, GFP_KERNEL); } /** @@ -516,8 +516,6 @@ xprt_rdma_set_port(struct rpc_xprt *xprt, u16 port) static void xprt_rdma_timer(struct rpc_xprt *xprt, struct rpc_task *task) { - dprintk("RPC: %5u %s: xprt = %p\n", task->tk_pid, __func__, xprt); - xprt_force_disconnect(xprt); } @@ -640,7 +638,7 @@ xprt_rdma_allocate(struct rpc_task *task) req = rpcrdma_buffer_get(&r_xprt->rx_buf); if (req == NULL) - return -ENOMEM; + goto out_get; flags = RPCRDMA_DEF_GFP; if (RPC_IS_SWAPPER(task)) @@ -653,19 +651,18 @@ xprt_rdma_allocate(struct rpc_task *task) if (!rpcrdma_get_recvbuf(r_xprt, req, rqst->rq_rcvsize, flags)) goto out_fail; - dprintk("RPC: %5u %s: send size = %zd, recv size = %zd, req = %p\n", - task->tk_pid, __func__, rqst->rq_callsize, - rqst->rq_rcvsize, req); - req->rl_cpu = smp_processor_id(); req->rl_connect_cookie = 0; /* our reserved value */ rpcrdma_set_xprtdata(rqst, req); rqst->rq_buffer = req->rl_sendbuf->rg_base; rqst->rq_rbuffer = req->rl_recvbuf->rg_base; + trace_xprtrdma_allocate(task, req); return 0; out_fail: rpcrdma_buffer_put(req); +out_get: + trace_xprtrdma_allocate(task, NULL); return -ENOMEM; } @@ -682,13 +679,9 @@ xprt_rdma_free(struct rpc_task *task) struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(rqst->rq_xprt); struct rpcrdma_req *req = rpcr_to_rdmar(rqst); - if (test_bit(RPCRDMA_REQ_F_BACKCHANNEL, &req->rl_flags)) - return; - - dprintk("RPC: %s: called on 0x%p\n", __func__, req->rl_reply); - if (test_bit(RPCRDMA_REQ_F_PENDING, &req->rl_flags)) rpcrdma_release_rqst(r_xprt, req); + trace_xprtrdma_rpc_done(task, req); rpcrdma_buffer_put(req); } @@ -698,22 
+691,12 @@ xprt_rdma_free(struct rpc_task *task) * * Caller holds the transport's write lock. * - * Return values: - * 0: The request has been sent - * ENOTCONN: Caller needs to invoke connect logic then call again - * ENOBUFS: Call again later to send the request - * EIO: A permanent error occurred. The request was not sent, - * and don't try it again - * - * send_request invokes the meat of RPC RDMA. It must do the following: - * - * 1. Marshal the RPC request into an RPC RDMA request, which means - * putting a header in front of data, and creating IOVs for RDMA - * from those in the request. - * 2. In marshaling, detect opportunities for RDMA, and use them. - * 3. Post a recv message to set up asynch completion, then send - * the request (rpcrdma_ep_post). - * 4. No partial sends are possible in the RPC-RDMA protocol (as in UDP). + * Returns: + * %0 if the RPC message has been sent + * %-ENOTCONN if the caller should reconnect and call again + * %-ENOBUFS if the caller should call again later + * %-EIO if a permanent error occurred and the request was not + * sent. Do not try to send this message again. */ static int xprt_rdma_send_request(struct rpc_task *task) @@ -724,14 +707,14 @@ xprt_rdma_send_request(struct rpc_task *task) struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); int rc = 0; +#if defined(CONFIG_SUNRPC_BACKCHANNEL) + if (unlikely(!rqst->rq_buffer)) + return xprt_rdma_bc_send_reply(rqst); +#endif /* CONFIG_SUNRPC_BACKCHANNEL */ + if (!xprt_connected(xprt)) goto drop_connection; - /* On retransmit, remove any previously registered chunks */ - if (unlikely(!list_empty(&req->rl_registered))) - r_xprt->rx_ia.ri_ops->ro_unmap_sync(r_xprt, - &req->rl_registered); - rc = rpcrdma_marshal_req(r_xprt, rqst); if (rc < 0) goto failed_marshal; @@ -744,7 +727,7 @@ xprt_rdma_send_request(struct rpc_task *task) goto drop_connection; req->rl_connect_cookie = xprt->connect_cookie; - set_bit(RPCRDMA_REQ_F_PENDING, &req->rl_flags); + __set_bit(RPCRDMA_REQ_F_PENDING, &req->rl_flags); if (rpcrdma_ep_post(&r_xprt->rx_ia, &r_xprt->rx_ep, req)) goto drop_connection; @@ -904,8 +887,7 @@ int xprt_rdma_init(void) "\tMaxInlineRead %d\n\tMaxInlineWrite %d\n", xprt_rdma_slot_table_entries, xprt_rdma_max_inline_read, xprt_rdma_max_inline_write); - dprintk("\tPadding %d\n\tMemreg %d\n", - xprt_rdma_inline_write_padding, xprt_rdma_memreg_strategy); + dprintk("\tPadding 0\n\tMemreg %d\n", xprt_rdma_memreg_strategy); #if IS_ENABLED(CONFIG_SUNRPC_DEBUG) if (!sunrpc_table_header) diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index 8607c029c0dd..f4eb63e8e689 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -71,8 +71,8 @@ /* * internal functions */ -static void rpcrdma_create_mrs(struct rpcrdma_xprt *r_xprt); -static void rpcrdma_destroy_mrs(struct rpcrdma_buffer *buf); +static void rpcrdma_mrs_create(struct rpcrdma_xprt *r_xprt); +static void rpcrdma_mrs_destroy(struct rpcrdma_buffer *buf); static void rpcrdma_dma_unmap_regbuf(struct rpcrdma_regbuf *rb); struct workqueue_struct *rpcrdma_receive_wq __read_mostly; @@ -108,7 +108,10 @@ static void rpcrdma_qp_async_error_upcall(struct ib_event *event, void *context) { struct rpcrdma_ep *ep = context; + struct rpcrdma_xprt *r_xprt = container_of(ep, struct rpcrdma_xprt, + rx_ep); + trace_xprtrdma_qp_error(r_xprt, event); pr_err("rpcrdma: %s on device %s ep %p\n", ib_event_msg(event->event), event->device->name, context); @@ -133,6 +136,7 @@ rpcrdma_wc_send(struct ib_cq *cq, struct ib_wc *wc) container_of(cqe, 
struct rpcrdma_sendctx, sc_cqe); /* WARNING: Only wr_cqe and status are reliable at this point */ + trace_xprtrdma_wc_send(sc, wc); if (wc->status != IB_WC_SUCCESS && wc->status != IB_WC_WR_FLUSH_ERR) pr_err("rpcrdma: Send: %s (%u/0x%x)\n", ib_wc_status_msg(wc->status), @@ -155,13 +159,11 @@ rpcrdma_wc_receive(struct ib_cq *cq, struct ib_wc *wc) rr_cqe); /* WARNING: Only wr_id and status are reliable at this point */ + trace_xprtrdma_wc_receive(rep, wc); if (wc->status != IB_WC_SUCCESS) goto out_fail; /* status == SUCCESS means all fields in wc are trustworthy */ - dprintk("RPC: %s: rep %p opcode 'recv', length %u: success\n", - __func__, rep, wc->byte_len); - rpcrdma_set_xdrlen(&rep->rr_hdrbuf, wc->byte_len); rep->rr_wc_flags = wc->wc_flags; rep->rr_inv_rkey = wc->ex.invalidate_rkey; @@ -192,7 +194,6 @@ rpcrdma_update_connect_private(struct rpcrdma_xprt *r_xprt, unsigned int rsize, wsize; /* Default settings for RPC-over-RDMA Version One */ - r_xprt->rx_ia.ri_reminv_expected = false; r_xprt->rx_ia.ri_implicit_roundup = xprt_rdma_pad_optimize; rsize = RPCRDMA_V1_DEF_INLINE_SIZE; wsize = RPCRDMA_V1_DEF_INLINE_SIZE; @@ -200,7 +201,6 @@ rpcrdma_update_connect_private(struct rpcrdma_xprt *r_xprt, if (pmsg && pmsg->cp_magic == rpcrdma_cmp_magic && pmsg->cp_version == RPCRDMA_CMP_VERSION) { - r_xprt->rx_ia.ri_reminv_expected = true; r_xprt->rx_ia.ri_implicit_roundup = true; rsize = rpcrdma_decode_buffer_size(pmsg->cp_send_size); wsize = rpcrdma_decode_buffer_size(pmsg->cp_recv_size); @@ -221,11 +221,9 @@ rpcrdma_conn_upcall(struct rdma_cm_id *id, struct rdma_cm_event *event) struct rpcrdma_xprt *xprt = id->context; struct rpcrdma_ia *ia = &xprt->rx_ia; struct rpcrdma_ep *ep = &xprt->rx_ep; -#if IS_ENABLED(CONFIG_SUNRPC_DEBUG) - struct sockaddr *sap = (struct sockaddr *)&ep->rep_remote_addr; -#endif int connstate = 0; + trace_xprtrdma_conn_upcall(xprt, event); switch (event->event) { case RDMA_CM_EVENT_ADDR_RESOLVED: case RDMA_CM_EVENT_ROUTE_RESOLVED: @@ -234,21 +232,17 @@ rpcrdma_conn_upcall(struct rdma_cm_id *id, struct rdma_cm_event *event) break; case RDMA_CM_EVENT_ADDR_ERROR: ia->ri_async_rc = -EHOSTUNREACH; - dprintk("RPC: %s: CM address resolution error, ep 0x%p\n", - __func__, ep); complete(&ia->ri_done); break; case RDMA_CM_EVENT_ROUTE_ERROR: ia->ri_async_rc = -ENETUNREACH; - dprintk("RPC: %s: CM route resolution error, ep 0x%p\n", - __func__, ep); complete(&ia->ri_done); break; case RDMA_CM_EVENT_DEVICE_REMOVAL: #if IS_ENABLED(CONFIG_SUNRPC_DEBUG) - pr_info("rpcrdma: removing device %s for %pIS:%u\n", + pr_info("rpcrdma: removing device %s for %s:%s\n", ia->ri_device->name, - sap, rpc_get_port(sap)); + rpcrdma_addrstr(xprt), rpcrdma_portstr(xprt)); #endif set_bit(RPCRDMA_IAF_REMOVING, &ia->ri_flags); ep->rep_connected = -ENODEV; @@ -271,8 +265,8 @@ rpcrdma_conn_upcall(struct rdma_cm_id *id, struct rdma_cm_event *event) connstate = -ENETDOWN; goto connected; case RDMA_CM_EVENT_REJECTED: - dprintk("rpcrdma: connection to %pIS:%u rejected: %s\n", - sap, rpc_get_port(sap), + dprintk("rpcrdma: connection to %s:%s rejected: %s\n", + rpcrdma_addrstr(xprt), rpcrdma_portstr(xprt), rdma_reject_msg(id, event->status)); connstate = -ECONNREFUSED; if (event->status == IB_CM_REJ_STALE_CONN) @@ -287,8 +281,9 @@ connected: wake_up_all(&ep->rep_connect_wait); /*FALLTHROUGH*/ default: - dprintk("RPC: %s: %pIS:%u on %s/%s (ep 0x%p): %s\n", - __func__, sap, rpc_get_port(sap), + dprintk("RPC: %s: %s:%s on %s/%s (ep 0x%p): %s\n", + __func__, + rpcrdma_addrstr(xprt), rpcrdma_portstr(xprt), 
ia->ri_device->name, ia->ri_ops->ro_displayname, ep, rdma_event_msg(event->event)); break; @@ -298,13 +293,14 @@ connected: } static struct rdma_cm_id * -rpcrdma_create_id(struct rpcrdma_xprt *xprt, - struct rpcrdma_ia *ia, struct sockaddr *addr) +rpcrdma_create_id(struct rpcrdma_xprt *xprt, struct rpcrdma_ia *ia) { unsigned long wtimeout = msecs_to_jiffies(RDMA_RESOLVE_TIMEOUT) + 1; struct rdma_cm_id *id; int rc; + trace_xprtrdma_conn_start(xprt); + init_completion(&ia->ri_done); init_completion(&ia->ri_remove_done); @@ -318,7 +314,9 @@ rpcrdma_create_id(struct rpcrdma_xprt *xprt, } ia->ri_async_rc = -ETIMEDOUT; - rc = rdma_resolve_addr(id, NULL, addr, RDMA_RESOLVE_TIMEOUT); + rc = rdma_resolve_addr(id, NULL, + (struct sockaddr *)&xprt->rx_xprt.addr, + RDMA_RESOLVE_TIMEOUT); if (rc) { dprintk("RPC: %s: rdma_resolve_addr() failed %i\n", __func__, rc); @@ -326,8 +324,7 @@ rpcrdma_create_id(struct rpcrdma_xprt *xprt, } rc = wait_for_completion_interruptible_timeout(&ia->ri_done, wtimeout); if (rc < 0) { - dprintk("RPC: %s: wait() exited: %i\n", - __func__, rc); + trace_xprtrdma_conn_tout(xprt); goto out; } @@ -344,8 +341,7 @@ rpcrdma_create_id(struct rpcrdma_xprt *xprt, } rc = wait_for_completion_interruptible_timeout(&ia->ri_done, wtimeout); if (rc < 0) { - dprintk("RPC: %s: wait() exited: %i\n", - __func__, rc); + trace_xprtrdma_conn_tout(xprt); goto out; } rc = ia->ri_async_rc; @@ -365,19 +361,18 @@ out: /** * rpcrdma_ia_open - Open and initialize an Interface Adapter. - * @xprt: controlling transport - * @addr: IP address of remote peer + * @xprt: transport with IA to (re)initialize * * Returns 0 on success, negative errno if an appropriate * Interface Adapter could not be found and opened. */ int -rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr) +rpcrdma_ia_open(struct rpcrdma_xprt *xprt) { struct rpcrdma_ia *ia = &xprt->rx_ia; int rc; - ia->ri_id = rpcrdma_create_id(xprt, ia, addr); + ia->ri_id = rpcrdma_create_id(xprt, ia); if (IS_ERR(ia->ri_id)) { rc = PTR_ERR(ia->ri_id); goto out_err; @@ -392,7 +387,7 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr) } switch (xprt_rdma_memreg_strategy) { - case RPCRDMA_FRMR: + case RPCRDMA_FRWR: if (frwr_is_supported(ia)) { ia->ri_ops = &rpcrdma_frwr_memreg_ops; break; @@ -462,10 +457,12 @@ rpcrdma_ia_remove(struct rpcrdma_ia *ia) rpcrdma_dma_unmap_regbuf(req->rl_sendbuf); rpcrdma_dma_unmap_regbuf(req->rl_recvbuf); } - rpcrdma_destroy_mrs(buf); + rpcrdma_mrs_destroy(buf); /* Allow waiters to continue */ complete(&ia->ri_remove_done); + + trace_xprtrdma_remove(r_xprt); } /** @@ -476,7 +473,6 @@ rpcrdma_ia_remove(struct rpcrdma_ia *ia) void rpcrdma_ia_close(struct rpcrdma_ia *ia) { - dprintk("RPC: %s: entering\n", __func__); if (ia->ri_id != NULL && !IS_ERR(ia->ri_id)) { if (ia->ri_id->qp) rdma_destroy_qp(ia->ri_id); @@ -630,9 +626,6 @@ out1: void rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia) { - dprintk("RPC: %s: entering, connected is %d\n", - __func__, ep->rep_connected); - cancel_delayed_work_sync(&ep->rep_connect_worker); if (ia->ri_id->qp) { @@ -653,13 +646,12 @@ static int rpcrdma_ep_recreate_xprt(struct rpcrdma_xprt *r_xprt, struct rpcrdma_ep *ep, struct rpcrdma_ia *ia) { - struct sockaddr *sap = (struct sockaddr *)&r_xprt->rx_data.addr; int rc, err; - pr_info("%s: r_xprt = %p\n", __func__, r_xprt); + trace_xprtrdma_reinsert(r_xprt); rc = -EHOSTUNREACH; - if (rpcrdma_ia_open(r_xprt, sap)) + if (rpcrdma_ia_open(r_xprt)) goto out1; rc = -ENOMEM; @@ -676,7 +668,7 @@ 
rpcrdma_ep_recreate_xprt(struct rpcrdma_xprt *r_xprt, goto out3; } - rpcrdma_create_mrs(r_xprt); + rpcrdma_mrs_create(r_xprt); return 0; out3: @@ -691,16 +683,15 @@ static int rpcrdma_ep_reconnect(struct rpcrdma_xprt *r_xprt, struct rpcrdma_ep *ep, struct rpcrdma_ia *ia) { - struct sockaddr *sap = (struct sockaddr *)&r_xprt->rx_data.addr; struct rdma_cm_id *id, *old; int err, rc; - dprintk("RPC: %s: reconnecting...\n", __func__); + trace_xprtrdma_reconnect(r_xprt); rpcrdma_ep_disconnect(ep, ia); rc = -EHOSTUNREACH; - id = rpcrdma_create_id(r_xprt, ia, sap); + id = rpcrdma_create_id(r_xprt, ia); if (IS_ERR(id)) goto out; @@ -817,16 +808,14 @@ rpcrdma_ep_disconnect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia) int rc; rc = rdma_disconnect(ia->ri_id); - if (!rc) { + if (!rc) /* returns without wait if not connected */ wait_event_interruptible(ep->rep_connect_wait, ep->rep_connected != 1); - dprintk("RPC: %s: after wait, %sconnected\n", __func__, - (ep->rep_connected == 1) ? "still " : "dis"); - } else { - dprintk("RPC: %s: rdma_disconnect %i\n", __func__, rc); + else ep->rep_connected = rc; - } + trace_xprtrdma_disconnect(container_of(ep, struct rpcrdma_xprt, + rx_ep), rc); ib_drain_qp(ia->ri_id->qp); } @@ -998,15 +987,15 @@ rpcrdma_mr_recovery_worker(struct work_struct *work) { struct rpcrdma_buffer *buf = container_of(work, struct rpcrdma_buffer, rb_recovery_worker.work); - struct rpcrdma_mw *mw; + struct rpcrdma_mr *mr; spin_lock(&buf->rb_recovery_lock); while (!list_empty(&buf->rb_stale_mrs)) { - mw = rpcrdma_pop_mw(&buf->rb_stale_mrs); + mr = rpcrdma_mr_pop(&buf->rb_stale_mrs); spin_unlock(&buf->rb_recovery_lock); - dprintk("RPC: %s: recovering MR %p\n", __func__, mw); - mw->mw_xprt->rx_ia.ri_ops->ro_recover_mr(mw); + trace_xprtrdma_recover_mr(mr); + mr->mr_xprt->rx_ia.ri_ops->ro_recover_mr(mr); spin_lock(&buf->rb_recovery_lock); } @@ -1014,20 +1003,20 @@ rpcrdma_mr_recovery_worker(struct work_struct *work) } void -rpcrdma_defer_mr_recovery(struct rpcrdma_mw *mw) +rpcrdma_mr_defer_recovery(struct rpcrdma_mr *mr) { - struct rpcrdma_xprt *r_xprt = mw->mw_xprt; + struct rpcrdma_xprt *r_xprt = mr->mr_xprt; struct rpcrdma_buffer *buf = &r_xprt->rx_buf; spin_lock(&buf->rb_recovery_lock); - rpcrdma_push_mw(mw, &buf->rb_stale_mrs); + rpcrdma_mr_push(mr, &buf->rb_stale_mrs); spin_unlock(&buf->rb_recovery_lock); schedule_delayed_work(&buf->rb_recovery_worker, 0); } static void -rpcrdma_create_mrs(struct rpcrdma_xprt *r_xprt) +rpcrdma_mrs_create(struct rpcrdma_xprt *r_xprt) { struct rpcrdma_buffer *buf = &r_xprt->rx_buf; struct rpcrdma_ia *ia = &r_xprt->rx_ia; @@ -1036,32 +1025,32 @@ rpcrdma_create_mrs(struct rpcrdma_xprt *r_xprt) LIST_HEAD(all); for (count = 0; count < 32; count++) { - struct rpcrdma_mw *mw; + struct rpcrdma_mr *mr; int rc; - mw = kzalloc(sizeof(*mw), GFP_KERNEL); - if (!mw) + mr = kzalloc(sizeof(*mr), GFP_KERNEL); + if (!mr) break; - rc = ia->ri_ops->ro_init_mr(ia, mw); + rc = ia->ri_ops->ro_init_mr(ia, mr); if (rc) { - kfree(mw); + kfree(mr); break; } - mw->mw_xprt = r_xprt; + mr->mr_xprt = r_xprt; - list_add(&mw->mw_list, &free); - list_add(&mw->mw_all, &all); + list_add(&mr->mr_list, &free); + list_add(&mr->mr_all, &all); } - spin_lock(&buf->rb_mwlock); - list_splice(&free, &buf->rb_mws); + spin_lock(&buf->rb_mrlock); + list_splice(&free, &buf->rb_mrs); list_splice(&all, &buf->rb_all); r_xprt->rx_stats.mrs_allocated += count; - spin_unlock(&buf->rb_mwlock); + spin_unlock(&buf->rb_mrlock); - dprintk("RPC: %s: created %u MRs\n", __func__, count); + 
trace_xprtrdma_createmrs(r_xprt, count); } static void @@ -1072,7 +1061,7 @@ rpcrdma_mr_refresh_worker(struct work_struct *work) struct rpcrdma_xprt *r_xprt = container_of(buf, struct rpcrdma_xprt, rx_buf); - rpcrdma_create_mrs(r_xprt); + rpcrdma_mrs_create(r_xprt); } struct rpcrdma_req * @@ -1093,10 +1082,17 @@ rpcrdma_create_req(struct rpcrdma_xprt *r_xprt) return req; } -struct rpcrdma_rep * +/** + * rpcrdma_create_rep - Allocate an rpcrdma_rep object + * @r_xprt: controlling transport + * + * Returns 0 on success or a negative errno on failure. + */ +int rpcrdma_create_rep(struct rpcrdma_xprt *r_xprt) { struct rpcrdma_create_data_internal *cdata = &r_xprt->rx_data; + struct rpcrdma_buffer *buf = &r_xprt->rx_buf; struct rpcrdma_rep *rep; int rc; @@ -1121,12 +1117,18 @@ rpcrdma_create_rep(struct rpcrdma_xprt *r_xprt) rep->rr_recv_wr.wr_cqe = &rep->rr_cqe; rep->rr_recv_wr.sg_list = &rep->rr_rdmabuf->rg_iov; rep->rr_recv_wr.num_sge = 1; - return rep; + + spin_lock(&buf->rb_lock); + list_add(&rep->rr_list, &buf->rb_recv_bufs); + spin_unlock(&buf->rb_lock); + return 0; out_free: kfree(rep); out: - return ERR_PTR(rc); + dprintk("RPC: %s: reply buffer %d alloc failed\n", + __func__, rc); + return rc; } int @@ -1137,10 +1139,10 @@ rpcrdma_buffer_create(struct rpcrdma_xprt *r_xprt) buf->rb_max_requests = r_xprt->rx_data.max_requests; buf->rb_bc_srv_max_requests = 0; - spin_lock_init(&buf->rb_mwlock); + spin_lock_init(&buf->rb_mrlock); spin_lock_init(&buf->rb_lock); spin_lock_init(&buf->rb_recovery_lock); - INIT_LIST_HEAD(&buf->rb_mws); + INIT_LIST_HEAD(&buf->rb_mrs); INIT_LIST_HEAD(&buf->rb_all); INIT_LIST_HEAD(&buf->rb_stale_mrs); INIT_DELAYED_WORK(&buf->rb_refresh_worker, @@ -1148,7 +1150,7 @@ rpcrdma_buffer_create(struct rpcrdma_xprt *r_xprt) INIT_DELAYED_WORK(&buf->rb_recovery_worker, rpcrdma_mr_recovery_worker); - rpcrdma_create_mrs(r_xprt); + rpcrdma_mrs_create(r_xprt); INIT_LIST_HEAD(&buf->rb_send_bufs); INIT_LIST_HEAD(&buf->rb_allreqs); @@ -1167,17 +1169,10 @@ rpcrdma_buffer_create(struct rpcrdma_xprt *r_xprt) } INIT_LIST_HEAD(&buf->rb_recv_bufs); - for (i = 0; i < buf->rb_max_requests + RPCRDMA_MAX_BC_REQUESTS; i++) { - struct rpcrdma_rep *rep; - - rep = rpcrdma_create_rep(r_xprt); - if (IS_ERR(rep)) { - dprintk("RPC: %s: reply buffer %d alloc failed\n", - __func__, i); - rc = PTR_ERR(rep); + for (i = 0; i <= buf->rb_max_requests; i++) { + rc = rpcrdma_create_rep(r_xprt); + if (rc) goto out; - } - list_add(&rep->rr_list, &buf->rb_recv_bufs); } rc = rpcrdma_sendctxs_create(r_xprt); @@ -1229,26 +1224,26 @@ rpcrdma_destroy_req(struct rpcrdma_req *req) } static void -rpcrdma_destroy_mrs(struct rpcrdma_buffer *buf) +rpcrdma_mrs_destroy(struct rpcrdma_buffer *buf) { struct rpcrdma_xprt *r_xprt = container_of(buf, struct rpcrdma_xprt, rx_buf); struct rpcrdma_ia *ia = rdmab_to_ia(buf); - struct rpcrdma_mw *mw; + struct rpcrdma_mr *mr; unsigned int count; count = 0; - spin_lock(&buf->rb_mwlock); + spin_lock(&buf->rb_mrlock); while (!list_empty(&buf->rb_all)) { - mw = list_entry(buf->rb_all.next, struct rpcrdma_mw, mw_all); - list_del(&mw->mw_all); + mr = list_entry(buf->rb_all.next, struct rpcrdma_mr, mr_all); + list_del(&mr->mr_all); - spin_unlock(&buf->rb_mwlock); - ia->ri_ops->ro_release_mr(mw); + spin_unlock(&buf->rb_mrlock); + ia->ri_ops->ro_release_mr(mr); count++; - spin_lock(&buf->rb_mwlock); + spin_lock(&buf->rb_mrlock); } - spin_unlock(&buf->rb_mwlock); + spin_unlock(&buf->rb_mrlock); r_xprt->rx_stats.mrs_allocated = 0; dprintk("RPC: %s: released %u MRs\n", __func__, count); @@ 
-1285,27 +1280,33 @@ rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf) spin_unlock(&buf->rb_reqslock); buf->rb_recv_count = 0; - rpcrdma_destroy_mrs(buf); + rpcrdma_mrs_destroy(buf); } -struct rpcrdma_mw * -rpcrdma_get_mw(struct rpcrdma_xprt *r_xprt) +/** + * rpcrdma_mr_get - Allocate an rpcrdma_mr object + * @r_xprt: controlling transport + * + * Returns an initialized rpcrdma_mr or NULL if no free + * rpcrdma_mr objects are available. + */ +struct rpcrdma_mr * +rpcrdma_mr_get(struct rpcrdma_xprt *r_xprt) { struct rpcrdma_buffer *buf = &r_xprt->rx_buf; - struct rpcrdma_mw *mw = NULL; + struct rpcrdma_mr *mr = NULL; - spin_lock(&buf->rb_mwlock); - if (!list_empty(&buf->rb_mws)) - mw = rpcrdma_pop_mw(&buf->rb_mws); - spin_unlock(&buf->rb_mwlock); + spin_lock(&buf->rb_mrlock); + if (!list_empty(&buf->rb_mrs)) + mr = rpcrdma_mr_pop(&buf->rb_mrs); + spin_unlock(&buf->rb_mrlock); - if (!mw) - goto out_nomws; - mw->mw_flags = 0; - return mw; + if (!mr) + goto out_nomrs; + return mr; -out_nomws: - dprintk("RPC: %s: no MWs available\n", __func__); +out_nomrs: + trace_xprtrdma_nomrs(r_xprt); if (r_xprt->rx_ep.rep_connected != -ENODEV) schedule_delayed_work(&buf->rb_refresh_worker, 0); @@ -1315,14 +1316,39 @@ out_nomws: return NULL; } +static void +__rpcrdma_mr_put(struct rpcrdma_buffer *buf, struct rpcrdma_mr *mr) +{ + spin_lock(&buf->rb_mrlock); + rpcrdma_mr_push(mr, &buf->rb_mrs); + spin_unlock(&buf->rb_mrlock); +} + +/** + * rpcrdma_mr_put - Release an rpcrdma_mr object + * @mr: object to release + * + */ void -rpcrdma_put_mw(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mw *mw) +rpcrdma_mr_put(struct rpcrdma_mr *mr) { - struct rpcrdma_buffer *buf = &r_xprt->rx_buf; + __rpcrdma_mr_put(&mr->mr_xprt->rx_buf, mr); +} + +/** + * rpcrdma_mr_unmap_and_put - DMA unmap an MR and release it + * @mr: object to release + * + */ +void +rpcrdma_mr_unmap_and_put(struct rpcrdma_mr *mr) +{ + struct rpcrdma_xprt *r_xprt = mr->mr_xprt; - spin_lock(&buf->rb_mwlock); - rpcrdma_push_mw(mw, &buf->rb_mws); - spin_unlock(&buf->rb_mwlock); + trace_xprtrdma_dma_unmap(mr); + ib_dma_unmap_sg(r_xprt->rx_ia.ri_device, + mr->mr_sg, mr->mr_nents, mr->mr_dir); + __rpcrdma_mr_put(&r_xprt->rx_buf, mr); } static struct rpcrdma_rep * @@ -1359,11 +1385,11 @@ rpcrdma_buffer_get(struct rpcrdma_buffer *buffers) req = rpcrdma_buffer_get_req_locked(buffers); req->rl_reply = rpcrdma_buffer_get_rep(buffers); spin_unlock(&buffers->rb_lock); + return req; out_reqbuf: spin_unlock(&buffers->rb_lock); - pr_warn("RPC: %s: out of request buffers\n", __func__); return NULL; } @@ -1519,9 +1545,6 @@ rpcrdma_ep_post(struct rpcrdma_ia *ia, req->rl_reply = NULL; } - dprintk("RPC: %s: posting %d s/g entries\n", - __func__, send_wr->num_sge); - if (!ep->rep_send_count || test_bit(RPCRDMA_REQ_F_TX_RESOURCES, &req->rl_flags)) { send_wr->send_flags |= IB_SEND_SIGNALED; @@ -1530,14 +1553,12 @@ rpcrdma_ep_post(struct rpcrdma_ia *ia, send_wr->send_flags &= ~IB_SEND_SIGNALED; --ep->rep_send_count; } + rc = ib_post_send(ia->ri_id->qp, send_wr, &send_wr_fail); + trace_xprtrdma_post_send(req, rc); if (rc) - goto out_postsend_err; + return -ENOTCONN; return 0; - -out_postsend_err: - pr_err("rpcrdma: RDMA Send ib_post_send returned %i\n", rc); - return -ENOTCONN; } int @@ -1550,23 +1571,20 @@ rpcrdma_ep_post_recv(struct rpcrdma_ia *ia, if (!rpcrdma_dma_map_regbuf(ia, rep->rr_rdmabuf)) goto out_map; rc = ib_post_recv(ia->ri_id->qp, &rep->rr_recv_wr, &recv_wr_fail); + trace_xprtrdma_post_recv(rep, rc); if (rc) - goto out_postrecv; + return -ENOTCONN; return 0; out_map: 
pr_err("rpcrdma: failed to DMA map the Receive buffer\n"); return -EIO; - -out_postrecv: - pr_err("rpcrdma: ib_post_recv returned %i\n", rc); - return -ENOTCONN; } /** * rpcrdma_ep_post_extra_recv - Post buffers for incoming backchannel requests * @r_xprt: transport associated with these backchannel resources - * @min_reqs: minimum number of incoming requests expected + * @count: minimum number of incoming requests expected * * Returns zero if all requested buffers were posted, or a negative errno. */ @@ -1594,7 +1612,7 @@ rpcrdma_ep_post_extra_recv(struct rpcrdma_xprt *r_xprt, unsigned int count) out_reqbuf: spin_unlock(&buffers->rb_lock); - pr_warn("%s: no extra receive buffers\n", __func__); + trace_xprtrdma_noreps(r_xprt); return -ENOMEM; out_rc: diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index 1342f743f1c4..69883a960a3f 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -73,11 +73,10 @@ struct rpcrdma_ia { struct completion ri_remove_done; int ri_async_rc; unsigned int ri_max_segs; - unsigned int ri_max_frmr_depth; + unsigned int ri_max_frwr_depth; unsigned int ri_max_inline_write; unsigned int ri_max_inline_read; unsigned int ri_max_send_sges; - bool ri_reminv_expected; bool ri_implicit_roundup; enum ib_mr_type ri_mrtype; unsigned long ri_flags; @@ -101,7 +100,6 @@ struct rpcrdma_ep { wait_queue_head_t rep_connect_wait; struct rpcrdma_connect_private rep_cm_private; struct rdma_conn_param rep_remote_cma; - struct sockaddr_storage rep_remote_addr; struct delayed_work rep_connect_worker; }; @@ -232,29 +230,29 @@ enum { }; /* - * struct rpcrdma_mw - external memory region metadata + * struct rpcrdma_mr - external memory region metadata * * An external memory region is any buffer or page that is registered * on the fly (ie, not pre-registered). * - * Each rpcrdma_buffer has a list of free MWs anchored in rb_mws. During + * Each rpcrdma_buffer has a list of free MWs anchored in rb_mrs. During * call_allocate, rpcrdma_buffer_get() assigns one to each segment in * an rpcrdma_req. Then rpcrdma_register_external() grabs these to keep * track of registration metadata while each RPC is pending. * rpcrdma_deregister_external() uses this metadata to unmap and * release these resources when an RPC is complete. 
*/ -enum rpcrdma_frmr_state { - FRMR_IS_INVALID, /* ready to be used */ - FRMR_IS_VALID, /* in use */ - FRMR_FLUSHED_FR, /* flushed FASTREG WR */ - FRMR_FLUSHED_LI, /* flushed LOCALINV WR */ +enum rpcrdma_frwr_state { + FRWR_IS_INVALID, /* ready to be used */ + FRWR_IS_VALID, /* in use */ + FRWR_FLUSHED_FR, /* flushed FASTREG WR */ + FRWR_FLUSHED_LI, /* flushed LOCALINV WR */ }; -struct rpcrdma_frmr { +struct rpcrdma_frwr { struct ib_mr *fr_mr; struct ib_cqe fr_cqe; - enum rpcrdma_frmr_state fr_state; + enum rpcrdma_frwr_state fr_state; struct completion fr_linv_done; union { struct ib_reg_wr fr_regwr; @@ -267,26 +265,20 @@ struct rpcrdma_fmr { u64 *fm_physaddrs; }; -struct rpcrdma_mw { - struct list_head mw_list; - struct scatterlist *mw_sg; - int mw_nents; - enum dma_data_direction mw_dir; - unsigned long mw_flags; +struct rpcrdma_mr { + struct list_head mr_list; + struct scatterlist *mr_sg; + int mr_nents; + enum dma_data_direction mr_dir; union { struct rpcrdma_fmr fmr; - struct rpcrdma_frmr frmr; + struct rpcrdma_frwr frwr; }; - struct rpcrdma_xprt *mw_xprt; - u32 mw_handle; - u32 mw_length; - u64 mw_offset; - struct list_head mw_all; -}; - -/* mw_flags */ -enum { - RPCRDMA_MW_F_RI = 1, + struct rpcrdma_xprt *mr_xprt; + u32 mr_handle; + u32 mr_length; + u64 mr_offset; + struct list_head mr_all; }; /* @@ -362,8 +354,7 @@ struct rpcrdma_req { /* rl_flags */ enum { - RPCRDMA_REQ_F_BACKCHANNEL = 0, - RPCRDMA_REQ_F_PENDING, + RPCRDMA_REQ_F_PENDING = 0, RPCRDMA_REQ_F_TX_RESOURCES, }; @@ -374,25 +365,25 @@ rpcrdma_set_xprtdata(struct rpc_rqst *rqst, struct rpcrdma_req *req) } static inline struct rpcrdma_req * -rpcr_to_rdmar(struct rpc_rqst *rqst) +rpcr_to_rdmar(const struct rpc_rqst *rqst) { return rqst->rq_xprtdata; } static inline void -rpcrdma_push_mw(struct rpcrdma_mw *mw, struct list_head *list) +rpcrdma_mr_push(struct rpcrdma_mr *mr, struct list_head *list) { - list_add_tail(&mw->mw_list, list); + list_add_tail(&mr->mr_list, list); } -static inline struct rpcrdma_mw * -rpcrdma_pop_mw(struct list_head *list) +static inline struct rpcrdma_mr * +rpcrdma_mr_pop(struct list_head *list) { - struct rpcrdma_mw *mw; + struct rpcrdma_mr *mr; - mw = list_first_entry(list, struct rpcrdma_mw, mw_list); - list_del(&mw->mw_list); - return mw; + mr = list_first_entry(list, struct rpcrdma_mr, mr_list); + list_del(&mr->mr_list); + return mr; } /* @@ -402,8 +393,8 @@ rpcrdma_pop_mw(struct list_head *list) * One of these is associated with a transport instance */ struct rpcrdma_buffer { - spinlock_t rb_mwlock; /* protect rb_mws list */ - struct list_head rb_mws; + spinlock_t rb_mrlock; /* protect rb_mrs list */ + struct list_head rb_mrs; struct list_head rb_all; unsigned long rb_sc_head; @@ -438,13 +429,11 @@ struct rpcrdma_buffer { * This data should be set with mount options */ struct rpcrdma_create_data_internal { - struct sockaddr_storage addr; /* RDMA server address */ unsigned int max_requests; /* max requests (slots) in flight */ unsigned int rsize; /* mount rsize - max read hdr+data */ unsigned int wsize; /* mount wsize - max write hdr+data */ unsigned int inline_rsize; /* max non-rdma read data payload */ unsigned int inline_wsize; /* max non-rdma write data payload */ - unsigned int padding; /* non-rdma write header padding */ }; /* @@ -484,17 +473,19 @@ struct rpcrdma_memreg_ops { struct rpcrdma_mr_seg * (*ro_map)(struct rpcrdma_xprt *, struct rpcrdma_mr_seg *, int, bool, - struct rpcrdma_mw **); + struct rpcrdma_mr **); + void (*ro_reminv)(struct rpcrdma_rep *rep, + struct list_head *mrs); 
void (*ro_unmap_sync)(struct rpcrdma_xprt *, struct list_head *); - void (*ro_recover_mr)(struct rpcrdma_mw *); + void (*ro_recover_mr)(struct rpcrdma_mr *mr); int (*ro_open)(struct rpcrdma_ia *, struct rpcrdma_ep *, struct rpcrdma_create_data_internal *); size_t (*ro_maxpages)(struct rpcrdma_xprt *); int (*ro_init_mr)(struct rpcrdma_ia *, - struct rpcrdma_mw *); - void (*ro_release_mr)(struct rpcrdma_mw *); + struct rpcrdma_mr *); + void (*ro_release_mr)(struct rpcrdma_mr *mr); const char *ro_displayname; const int ro_send_w_inv_ok; }; @@ -525,6 +516,18 @@ struct rpcrdma_xprt { #define rpcx_to_rdmax(x) container_of(x, struct rpcrdma_xprt, rx_xprt) #define rpcx_to_rdmad(x) (rpcx_to_rdmax(x)->rx_data) +static inline const char * +rpcrdma_addrstr(const struct rpcrdma_xprt *r_xprt) +{ + return r_xprt->rx_xprt.address_strings[RPC_DISPLAY_ADDR]; +} + +static inline const char * +rpcrdma_portstr(const struct rpcrdma_xprt *r_xprt) +{ + return r_xprt->rx_xprt.address_strings[RPC_DISPLAY_PORT]; +} + /* Setting this to 0 ensures interoperability with early servers. * Setting this to 1 enhances certain unaligned read/write performance. * Default is 0, see sysctl entry and rpc_rdma.c rpcrdma_convert_iovs() */ @@ -538,7 +541,7 @@ extern unsigned int xprt_rdma_memreg_strategy; /* * Interface Adapter calls - xprtrdma/verbs.c */ -int rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr); +int rpcrdma_ia_open(struct rpcrdma_xprt *xprt); void rpcrdma_ia_remove(struct rpcrdma_ia *ia); void rpcrdma_ia_close(struct rpcrdma_ia *); bool frwr_is_supported(struct rpcrdma_ia *); @@ -564,22 +567,23 @@ int rpcrdma_ep_post_recv(struct rpcrdma_ia *, struct rpcrdma_rep *); * Buffer calls - xprtrdma/verbs.c */ struct rpcrdma_req *rpcrdma_create_req(struct rpcrdma_xprt *); -struct rpcrdma_rep *rpcrdma_create_rep(struct rpcrdma_xprt *); void rpcrdma_destroy_req(struct rpcrdma_req *); +int rpcrdma_create_rep(struct rpcrdma_xprt *r_xprt); int rpcrdma_buffer_create(struct rpcrdma_xprt *); void rpcrdma_buffer_destroy(struct rpcrdma_buffer *); struct rpcrdma_sendctx *rpcrdma_sendctx_get_locked(struct rpcrdma_buffer *buf); void rpcrdma_sendctx_put_locked(struct rpcrdma_sendctx *sc); -struct rpcrdma_mw *rpcrdma_get_mw(struct rpcrdma_xprt *); -void rpcrdma_put_mw(struct rpcrdma_xprt *, struct rpcrdma_mw *); +struct rpcrdma_mr *rpcrdma_mr_get(struct rpcrdma_xprt *r_xprt); +void rpcrdma_mr_put(struct rpcrdma_mr *mr); +void rpcrdma_mr_unmap_and_put(struct rpcrdma_mr *mr); +void rpcrdma_mr_defer_recovery(struct rpcrdma_mr *mr); + struct rpcrdma_req *rpcrdma_buffer_get(struct rpcrdma_buffer *); void rpcrdma_buffer_put(struct rpcrdma_req *); void rpcrdma_recv_buffer_get(struct rpcrdma_req *); void rpcrdma_recv_buffer_put(struct rpcrdma_rep *); -void rpcrdma_defer_mr_recovery(struct rpcrdma_mw *); - struct rpcrdma_regbuf *rpcrdma_alloc_regbuf(size_t, enum dma_data_direction, gfp_t); bool __rpcrdma_dma_map_regbuf(struct rpcrdma_ia *, struct rpcrdma_regbuf *); @@ -663,7 +667,7 @@ int xprt_rdma_bc_up(struct svc_serv *, struct net *); size_t xprt_rdma_bc_maxpayload(struct rpc_xprt *); int rpcrdma_bc_post_recv(struct rpcrdma_xprt *, unsigned int); void rpcrdma_bc_receive_call(struct rpcrdma_xprt *, struct rpcrdma_rep *); -int rpcrdma_bc_marshal_reply(struct rpc_rqst *); +int xprt_rdma_bc_send_reply(struct rpc_rqst *rqst); void xprt_rdma_bc_free_rqst(struct rpc_rqst *); void xprt_rdma_bc_destroy(struct rpc_xprt *, unsigned int); #endif /* CONFIG_SUNRPC_BACKCHANNEL */ @@ -671,3 +675,5 @@ void xprt_rdma_bc_destroy(struct rpc_xprt *, 
unsigned int); extern struct xprt_class xprt_rdma_bc; #endif /* _LINUX_SUNRPC_XPRT_RDMA_H */ + +#include <trace/events/rpcrdma.h> diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 6d0cc3b8f932..18803021f242 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -52,6 +52,8 @@ #include "sunrpc.h" +#define RPC_TCP_READ_CHUNK_SZ (3*512*1024) + static void xs_close(struct rpc_xprt *xprt); static void xs_tcp_set_socket_timeouts(struct rpc_xprt *xprt, struct socket *sock); @@ -1003,6 +1005,7 @@ static void xs_local_data_receive(struct sock_xprt *transport) struct sock *sk; int err; +restart: mutex_lock(&transport->recv_mutex); sk = transport->inet; if (sk == NULL) @@ -1016,6 +1019,11 @@ static void xs_local_data_receive(struct sock_xprt *transport) } if (!test_and_clear_bit(XPRT_SOCK_DATA_READY, &transport->sock_state)) break; + if (need_resched()) { + mutex_unlock(&transport->recv_mutex); + cond_resched(); + goto restart; + } } out: mutex_unlock(&transport->recv_mutex); @@ -1094,6 +1102,7 @@ static void xs_udp_data_receive(struct sock_xprt *transport) struct sock *sk; int err; +restart: mutex_lock(&transport->recv_mutex); sk = transport->inet; if (sk == NULL) @@ -1107,6 +1116,11 @@ static void xs_udp_data_receive(struct sock_xprt *transport) } if (!test_and_clear_bit(XPRT_SOCK_DATA_READY, &transport->sock_state)) break; + if (need_resched()) { + mutex_unlock(&transport->recv_mutex); + cond_resched(); + goto restart; + } } out: mutex_unlock(&transport->recv_mutex); @@ -1479,6 +1493,7 @@ static int xs_tcp_data_recv(read_descriptor_t *rd_desc, struct sk_buff *skb, uns .offset = offset, .count = len, }; + size_t ret; dprintk("RPC: xs_tcp_data_recv started\n"); do { @@ -1507,9 +1522,14 @@ static int xs_tcp_data_recv(read_descriptor_t *rd_desc, struct sk_buff *skb, uns /* Skip over any trailing bytes on short reads */ xs_tcp_read_discard(transport, &desc); } while (desc.count); + ret = len - desc.count; + if (ret < rd_desc->count) + rd_desc->count -= ret; + else + rd_desc->count = 0; trace_xs_tcp_data_recv(transport); dprintk("RPC: xs_tcp_data_recv done\n"); - return len - desc.count; + return ret; } static void xs_tcp_data_receive(struct sock_xprt *transport) @@ -1517,30 +1537,34 @@ static void xs_tcp_data_receive(struct sock_xprt *transport) struct rpc_xprt *xprt = &transport->xprt; struct sock *sk; read_descriptor_t rd_desc = { - .count = 2*1024*1024, .arg.data = xprt, }; unsigned long total = 0; - int loop; int read = 0; +restart: mutex_lock(&transport->recv_mutex); sk = transport->inet; if (sk == NULL) goto out; /* We use rd_desc to pass struct xprt to xs_tcp_data_recv */ - for (loop = 0; loop < 64; loop++) { + for (;;) { + rd_desc.count = RPC_TCP_READ_CHUNK_SZ; lock_sock(sk); read = tcp_read_sock(sk, &rd_desc, xs_tcp_data_recv); - if (read <= 0) { + if (rd_desc.count != 0 || read < 0) { clear_bit(XPRT_SOCK_DATA_READY, &transport->sock_state); release_sock(sk); break; } release_sock(sk); total += read; - rd_desc.count = 65536; + if (need_resched()) { + mutex_unlock(&transport->recv_mutex); + cond_resched(); + goto restart; + } } if (test_bit(XPRT_SOCK_DATA_READY, &transport->sock_state)) queue_work(xprtiod_workqueue, &transport->recv_worker); |
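
The xprtsock.c hunks above replace the fixed 64-iteration receive loop with a byte budget (RPC_TCP_READ_CHUNK_SZ) and a cond_resched() between chunks, so one busy socket cannot monopolize the receive worker. As a rough illustration of that pattern only, here is a minimal user-space sketch; it is not part of the patch, and READ_CHUNK_SZ, handle_data(), drain_socket(), and the plain read()/sched_yield() calls are stand-ins for the kernel's tcp_read_sock()/cond_resched() machinery, assuming a non-blocking socket.

#include <sched.h>
#include <stddef.h>
#include <stdio.h>
#include <sys/types.h>
#include <unistd.h>

#define READ_CHUNK_SZ (3 * 512 * 1024)	/* mirrors RPC_TCP_READ_CHUNK_SZ */

/* Hypothetical consumer of received bytes; a real caller would parse
 * RPC records here. */
static void handle_data(const char *buf, size_t len)
{
	(void)buf;
	fprintf(stderr, "consumed %zu bytes\n", len);
}

/* Drain a non-blocking descriptor in bounded chunks, yielding between
 * chunks so other work gets a chance to run. */
static void drain_socket(int fd)
{
	char buf[64 * 1024];

	for (;;) {
		size_t budget = READ_CHUNK_SZ;

		while (budget > 0) {
			size_t want = budget < sizeof(buf) ? budget : sizeof(buf);
			ssize_t n = read(fd, buf, want);

			if (n <= 0)
				return;	/* EOF, error, or EAGAIN: nothing left */
			handle_data(buf, (size_t)n);
			budget -= (size_t)n;
		}

		/* A full chunk was consumed; more data may still be queued,
		 * but yield before starting the next pass. */
		sched_yield();
	}
}

int main(void)
{
	drain_socket(STDIN_FILENO);	/* e.g. pipe data in for a quick test */
	return 0;
}

The detail mirrored here is that the loop bounds how many bytes it processes before yielding, rather than capping the number of iterations, which matches the intent of the rd_desc.count accounting added to xs_tcp_data_recv() above.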