From 3132e49ecef9dab43d858d8e7066662c6a1efb16 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Wed, 10 Aug 2016 15:58:24 -0400 Subject: pnfs: track multiple layout types in fsinfo structure Current NFSv4.1/pNFS client assumes that MDS supports only one layout type. While it's true for most existing servers, nevertheless, this can be change in the near future. For now, this patch just plumbs in the ability to track a list of layouts in the fsinfo structure. The existing behavior of the client is preserved, by having it just select the first entry in the list. Signed-off-by: Tigran Mkrtchyan Signed-off-by: Jeff Layton Reviewed-by: J. Bruce Fields Signed-off-by: Anna Schumaker --- include/linux/nfs_xdr.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index 7cc0deee5bde..f11b26ed001b 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -124,6 +124,11 @@ struct nfs_fattr { | NFS_ATTR_FATTR_SPACE_USED \ | NFS_ATTR_FATTR_V4_SECURITY_LABEL) +/* + * Maximal number of supported layout drivers. + */ +#define NFS_MAX_LAYOUT_TYPES 8 + /* * Info on the file system */ @@ -139,7 +144,7 @@ struct nfs_fsinfo { __u64 maxfilesize; struct timespec time_delta; /* server time granularity */ __u32 lease_time; /* in seconds */ - __u32 layouttype; /* supported pnfs layout driver */ + __u32 layouttype[NFS_MAX_LAYOUT_TYPES]; /* supported pnfs layout driver */ __u32 blksize; /* preferred pnfs io block size */ __u32 clone_blksize; /* granularity of a CLONE operation */ }; -- cgit v1.2.3 From 3b58a8a9049d5e191402665c339690a148504358 Mon Sep 17 00:00:00 2001 From: Andy Adamson Date: Fri, 9 Sep 2016 09:22:23 -0400 Subject: SUNRPC rpc_clnt_xprt_switch_put Give the NFS layer access to the xprt_switch_put function Signed-off-by: Andy Adamson Signed-off-by: Anna Schumaker --- include/linux/sunrpc/clnt.h | 2 ++ net/sunrpc/clnt.c | 6 ++++++ 2 files changed, 8 insertions(+) (limited to 'include') diff --git a/include/linux/sunrpc/clnt.h b/include/linux/sunrpc/clnt.h index 5c02b0691587..c12f86b752cb 100644 --- a/include/linux/sunrpc/clnt.h +++ b/include/linux/sunrpc/clnt.h @@ -199,5 +199,7 @@ void rpc_cap_max_reconnect_timeout(struct rpc_clnt *clnt, unsigned long timeo); const char *rpc_proc_name(const struct rpc_task *task); + +void rpc_clnt_xprt_switch_put(struct rpc_clnt *); #endif /* __KERNEL__ */ #endif /* _LINUX_SUNRPC_CLNT_H */ diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index 4bb526f57a09..0ff5cbf3d631 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -2695,6 +2695,12 @@ rpc_cap_max_reconnect_timeout(struct rpc_clnt *clnt, unsigned long timeo) } EXPORT_SYMBOL_GPL(rpc_cap_max_reconnect_timeout); +void rpc_clnt_xprt_switch_put(struct rpc_clnt *clnt) +{ + xprt_switch_put(rcu_dereference(clnt->cl_xpi.xpi_xpswitch)); +} +EXPORT_SYMBOL_GPL(rpc_clnt_xprt_switch_put); + #if IS_ENABLED(CONFIG_SUNRPC_DEBUG) static void rpc_show_header(void) { -- cgit v1.2.3 From dd69171769cf4649a7ff3755e91cbd242a833727 Mon Sep 17 00:00:00 2001 From: Andy Adamson Date: Fri, 9 Sep 2016 09:22:24 -0400 Subject: SUNRPC rpc_clnt_xprt_switch_add_xprt Give the NFS layer access to the rpc_xprt_switch_add_xprt function Signed-off-by: Andy Adamson Signed-off-by: Anna Schumaker --- include/linux/sunrpc/clnt.h | 1 + net/sunrpc/clnt.c | 7 +++++++ 2 files changed, 8 insertions(+) (limited to 'include') diff --git a/include/linux/sunrpc/clnt.h b/include/linux/sunrpc/clnt.h index c12f86b752cb..b069d6e2c3d6 100644 --- a/include/linux/sunrpc/clnt.h +++ b/include/linux/sunrpc/clnt.h @@ -201,5 +201,6 @@ void rpc_cap_max_reconnect_timeout(struct rpc_clnt *clnt, const char *rpc_proc_name(const struct rpc_task *task); void rpc_clnt_xprt_switch_put(struct rpc_clnt *); +void rpc_clnt_xprt_switch_add_xprt(struct rpc_clnt *, struct rpc_xprt *); #endif /* __KERNEL__ */ #endif /* _LINUX_SUNRPC_CLNT_H */ diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index 0ff5cbf3d631..43ec46547dc9 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -2701,6 +2701,13 @@ void rpc_clnt_xprt_switch_put(struct rpc_clnt *clnt) } EXPORT_SYMBOL_GPL(rpc_clnt_xprt_switch_put); +void rpc_clnt_xprt_switch_add_xprt(struct rpc_clnt *clnt, struct rpc_xprt *xprt) +{ + rpc_xprt_switch_add_xprt(rcu_dereference(clnt->cl_xpi.xpi_xpswitch), + xprt); +} +EXPORT_SYMBOL_GPL(rpc_clnt_xprt_switch_add_xprt); + #if IS_ENABLED(CONFIG_SUNRPC_DEBUG) static void rpc_show_header(void) { -- cgit v1.2.3 From 39e5d2df959dd4aea81fa33d765d2a5cc67a0512 Mon Sep 17 00:00:00 2001 From: Andy Adamson Date: Fri, 9 Sep 2016 09:22:25 -0400 Subject: SUNRPC search xprt switch for sockaddr Signed-off-by: Andy Adamson Signed-off-by: Anna Schumaker --- include/linux/sunrpc/clnt.h | 2 ++ include/linux/sunrpc/xprtmultipath.h | 2 ++ net/sunrpc/clnt.c | 15 +++++++++++++++ net/sunrpc/xprtmultipath.c | 24 +++++++++++++++++++++++- 4 files changed, 42 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/sunrpc/clnt.h b/include/linux/sunrpc/clnt.h index b069d6e2c3d6..35cc539e2921 100644 --- a/include/linux/sunrpc/clnt.h +++ b/include/linux/sunrpc/clnt.h @@ -202,5 +202,7 @@ const char *rpc_proc_name(const struct rpc_task *task); void rpc_clnt_xprt_switch_put(struct rpc_clnt *); void rpc_clnt_xprt_switch_add_xprt(struct rpc_clnt *, struct rpc_xprt *); +bool rpc_clnt_xprt_switch_has_addr(struct rpc_clnt *clnt, + const struct sockaddr *sap); #endif /* __KERNEL__ */ #endif /* _LINUX_SUNRPC_CLNT_H */ diff --git a/include/linux/sunrpc/xprtmultipath.h b/include/linux/sunrpc/xprtmultipath.h index 5a9acffa41be..507418c1c69e 100644 --- a/include/linux/sunrpc/xprtmultipath.h +++ b/include/linux/sunrpc/xprtmultipath.h @@ -66,4 +66,6 @@ extern struct rpc_xprt *xprt_iter_xprt(struct rpc_xprt_iter *xpi); extern struct rpc_xprt *xprt_iter_get_xprt(struct rpc_xprt_iter *xpi); extern struct rpc_xprt *xprt_iter_get_next(struct rpc_xprt_iter *xpi); +extern bool rpc_xprt_switch_has_addr(struct rpc_xprt_switch *xps, + const struct sockaddr *sap); #endif diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index 43ec46547dc9..8d68efd2026f 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -2708,6 +2708,21 @@ void rpc_clnt_xprt_switch_add_xprt(struct rpc_clnt *clnt, struct rpc_xprt *xprt) } EXPORT_SYMBOL_GPL(rpc_clnt_xprt_switch_add_xprt); +bool rpc_clnt_xprt_switch_has_addr(struct rpc_clnt *clnt, + const struct sockaddr *sap) +{ + struct rpc_xprt_switch *xps; + bool ret; + + xps = rcu_dereference(clnt->cl_xpi.xpi_xpswitch); + + rcu_read_lock(); + ret = rpc_xprt_switch_has_addr(xps, sap); + rcu_read_unlock(); + return ret; +} +EXPORT_SYMBOL_GPL(rpc_clnt_xprt_switch_has_addr); + #if IS_ENABLED(CONFIG_SUNRPC_DEBUG) static void rpc_show_header(void) { diff --git a/net/sunrpc/xprtmultipath.c b/net/sunrpc/xprtmultipath.c index 66c9d63f4797..ae92a9e9ba52 100644 --- a/net/sunrpc/xprtmultipath.c +++ b/net/sunrpc/xprtmultipath.c @@ -15,6 +15,7 @@ #include #include #include +#include #include typedef struct rpc_xprt *(*xprt_switch_find_xprt_t)(struct list_head *head, @@ -49,7 +50,8 @@ void rpc_xprt_switch_add_xprt(struct rpc_xprt_switch *xps, if (xprt == NULL) return; spin_lock(&xps->xps_lock); - if (xps->xps_net == xprt->xprt_net || xps->xps_net == NULL) + if ((xps->xps_net == xprt->xprt_net || xps->xps_net == NULL) && + !rpc_xprt_switch_has_addr(xps, (struct sockaddr *)&xprt->addr)) xprt_switch_add_xprt_locked(xps, xprt); spin_unlock(&xps->xps_lock); } @@ -232,6 +234,26 @@ struct rpc_xprt *xprt_iter_current_entry(struct rpc_xprt_iter *xpi) return xprt_switch_find_current_entry(head, xpi->xpi_cursor); } +bool rpc_xprt_switch_has_addr(struct rpc_xprt_switch *xps, + const struct sockaddr *sap) +{ + struct list_head *head; + struct rpc_xprt *pos; + + if (xps == NULL || sap == NULL) + return false; + + head = &xps->xps_xprt_list; + list_for_each_entry_rcu(pos, head, xprt_switch) { + if (rpc_cmp_addr_port(sap, (struct sockaddr *)&pos->addr)) { + pr_info("RPC: addr %s already in xprt switch\n", + pos->address_strings[RPC_DISPLAY_ADDR]); + return true; + } + } + return false; +} + static struct rpc_xprt *xprt_switch_find_next_entry(struct list_head *head, const struct rpc_xprt *cur) -- cgit v1.2.3 From fda0ab41170ee0a1c7a3781ff8cfb4395c3dd784 Mon Sep 17 00:00:00 2001 From: Andy Adamson Date: Fri, 9 Sep 2016 09:22:26 -0400 Subject: SUNRPC: rpc_clnt_add_xprt setup function for NFS layer Use a setup function to call into the NFS layer to test an rpc_xprt for session trunking so as to not leak the rpc_xprt_switch into the nfs layer. Search for the address in the rpc_xprt_switch first so as not to put an unnecessary EXCHANGE_ID on the wire. Signed-off-by: Andy Adamson Signed-off-by: Anna Schumaker --- include/linux/sunrpc/clnt.h | 12 +++++++++ net/sunrpc/clnt.c | 64 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 76 insertions(+) (limited to 'include') diff --git a/include/linux/sunrpc/clnt.h b/include/linux/sunrpc/clnt.h index 35cc539e2921..85cc819676e8 100644 --- a/include/linux/sunrpc/clnt.h +++ b/include/linux/sunrpc/clnt.h @@ -125,6 +125,13 @@ struct rpc_create_args { struct svc_xprt *bc_xprt; /* NFSv4.1 backchannel */ }; +struct rpc_add_xprt_test { + int (*add_xprt_test)(struct rpc_clnt *, + struct rpc_xprt *, + void *calldata); + void *data; +}; + /* Values for "flags" field */ #define RPC_CLNT_CREATE_HARDRTRY (1UL << 0) #define RPC_CLNT_CREATE_AUTOBIND (1UL << 2) @@ -198,6 +205,11 @@ int rpc_clnt_add_xprt(struct rpc_clnt *, struct xprt_create *, void rpc_cap_max_reconnect_timeout(struct rpc_clnt *clnt, unsigned long timeo); +int rpc_clnt_setup_test_and_add_xprt(struct rpc_clnt *, + struct rpc_xprt_switch *, + struct rpc_xprt *, + void *); + const char *rpc_proc_name(const struct rpc_task *task); void rpc_clnt_xprt_switch_put(struct rpc_clnt *); diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index 8d68efd2026f..b614cb1356b9 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -2613,6 +2613,70 @@ int rpc_clnt_test_and_add_xprt(struct rpc_clnt *clnt, } EXPORT_SYMBOL_GPL(rpc_clnt_test_and_add_xprt); +/** + * rpc_clnt_setup_test_and_add_xprt() + * + * This is an rpc_clnt_add_xprt setup() function which returns 1 so: + * 1) caller of the test function must dereference the rpc_xprt_switch + * and the rpc_xprt. + * 2) test function must call rpc_xprt_switch_add_xprt, usually in + * the rpc_call_done routine. + * + * Upon success (return of 1), the test function adds the new + * transport to the rpc_clnt xprt switch + * + * @clnt: struct rpc_clnt to get the new transport + * @xps: the rpc_xprt_switch to hold the new transport + * @xprt: the rpc_xprt to test + * @data: a struct rpc_add_xprt_test pointer that holds the test function + * and test function call data + */ +int rpc_clnt_setup_test_and_add_xprt(struct rpc_clnt *clnt, + struct rpc_xprt_switch *xps, + struct rpc_xprt *xprt, + void *data) +{ + struct rpc_cred *cred; + struct rpc_task *task; + struct rpc_add_xprt_test *xtest = (struct rpc_add_xprt_test *)data; + int status = -EADDRINUSE; + + xprt = xprt_get(xprt); + xprt_switch_get(xps); + + if (rpc_xprt_switch_has_addr(xps, (struct sockaddr *)&xprt->addr)) + goto out_err; + + /* Test the connection */ + cred = authnull_ops.lookup_cred(NULL, NULL, 0); + task = rpc_call_null_helper(clnt, xprt, cred, + RPC_TASK_SOFT | RPC_TASK_SOFTCONN, + NULL, NULL); + put_rpccred(cred); + if (IS_ERR(task)) { + status = PTR_ERR(task); + goto out_err; + } + status = task->tk_status; + rpc_put_task(task); + + if (status < 0) + goto out_err; + + /* rpc_xprt_switch and rpc_xprt are deferrenced by add_xprt_test() */ + xtest->add_xprt_test(clnt, xprt, xtest->data); + + /* so that rpc_clnt_add_xprt does not call rpc_xprt_switch_add_xprt */ + return 1; +out_err: + xprt_put(xprt); + xprt_switch_put(xps); + pr_info("RPC: rpc_clnt_test_xprt failed: %d addr %s not added\n", + status, xprt->address_strings[RPC_DISPLAY_ADDR]); + return status; +} +EXPORT_SYMBOL_GPL(rpc_clnt_setup_test_and_add_xprt); + /** * rpc_clnt_add_xprt - Add a new transport to a rpc_clnt * @clnt: pointer to struct rpc_clnt -- cgit v1.2.3 From b9c5bc03be6aae41990efd09f83cf70a89ac9f4b Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 15 Sep 2016 10:55:12 -0400 Subject: SUNRPC: Refactor rpc_xdr_buf_init() Clean up: there is some XDR initialization logic that is common to the forward channel and backchannel. Move it to an XDR header so it can be shared. rpc_rqst::rq_buffer points to a buffer containing big-endian data. Update its annotation as part of the clean up. Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- include/linux/sunrpc/xdr.h | 12 ++++++++++++ include/linux/sunrpc/xprt.h | 2 +- net/sunrpc/backchannel_rqst.c | 8 +------- net/sunrpc/clnt.c | 24 ++++++------------------ net/sunrpc/xprtrdma/backchannel.c | 12 +----------- 5 files changed, 21 insertions(+), 37 deletions(-) (limited to 'include') diff --git a/include/linux/sunrpc/xdr.h b/include/linux/sunrpc/xdr.h index 70c6b92e15a7..56c48c884a24 100644 --- a/include/linux/sunrpc/xdr.h +++ b/include/linux/sunrpc/xdr.h @@ -67,6 +67,18 @@ struct xdr_buf { len; /* Length of XDR encoded message */ }; +static inline void +xdr_buf_init(struct xdr_buf *buf, void *start, size_t len) +{ + buf->head[0].iov_base = start; + buf->head[0].iov_len = len; + buf->tail[0].iov_len = 0; + buf->page_len = 0; + buf->flags = 0; + buf->len = 0; + buf->buflen = len; +} + /* * pre-xdr'ed macros. */ diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h index a16070dd03ee..6f1d41b559a3 100644 --- a/include/linux/sunrpc/xprt.h +++ b/include/linux/sunrpc/xprt.h @@ -83,7 +83,7 @@ struct rpc_rqst { void (*rq_release_snd_buf)(struct rpc_rqst *); /* release rq_enc_pages */ struct list_head rq_list; - __u32 * rq_buffer; /* XDR encode buffer */ + void *rq_buffer; /* Call XDR encode buffer */ size_t rq_callsize, rq_rcvsize; size_t rq_xmit_bytes_sent; /* total bytes sent */ diff --git a/net/sunrpc/backchannel_rqst.c b/net/sunrpc/backchannel_rqst.c index 229956bf8457..ac701c28f44f 100644 --- a/net/sunrpc/backchannel_rqst.c +++ b/net/sunrpc/backchannel_rqst.c @@ -76,13 +76,7 @@ static int xprt_alloc_xdr_buf(struct xdr_buf *buf, gfp_t gfp_flags) page = alloc_page(gfp_flags); if (page == NULL) return -ENOMEM; - buf->head[0].iov_base = page_address(page); - buf->head[0].iov_len = PAGE_SIZE; - buf->tail[0].iov_base = NULL; - buf->tail[0].iov_len = 0; - buf->page_len = 0; - buf->len = 0; - buf->buflen = PAGE_SIZE; + xdr_buf_init(buf, page_address(page), PAGE_SIZE); return 0; } diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index b614cb1356b9..6481986be7a7 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -1746,18 +1746,6 @@ rpc_task_force_reencode(struct rpc_task *task) task->tk_rqstp->rq_bytes_sent = 0; } -static inline void -rpc_xdr_buf_init(struct xdr_buf *buf, void *start, size_t len) -{ - buf->head[0].iov_base = start; - buf->head[0].iov_len = len; - buf->tail[0].iov_len = 0; - buf->page_len = 0; - buf->flags = 0; - buf->len = 0; - buf->buflen = len; -} - /* * 3. Encode arguments of an RPC call */ @@ -1770,12 +1758,12 @@ rpc_xdr_encode(struct rpc_task *task) dprint_status(task); - rpc_xdr_buf_init(&req->rq_snd_buf, - req->rq_buffer, - req->rq_callsize); - rpc_xdr_buf_init(&req->rq_rcv_buf, - (char *)req->rq_buffer + req->rq_callsize, - req->rq_rcvsize); + xdr_buf_init(&req->rq_snd_buf, + req->rq_buffer, + req->rq_callsize); + xdr_buf_init(&req->rq_rcv_buf, + (char *)req->rq_buffer + req->rq_callsize, + req->rq_rcvsize); p = rpc_encode_header(task); if (p == NULL) { diff --git a/net/sunrpc/xprtrdma/backchannel.c b/net/sunrpc/xprtrdma/backchannel.c index 5f60ab2f858a..d3cfaf281e55 100644 --- a/net/sunrpc/xprtrdma/backchannel.c +++ b/net/sunrpc/xprtrdma/backchannel.c @@ -38,7 +38,6 @@ static int rpcrdma_bc_setup_rqst(struct rpcrdma_xprt *r_xprt, struct rpcrdma_ia *ia = &r_xprt->rx_ia; struct rpcrdma_regbuf *rb; struct rpcrdma_req *req; - struct xdr_buf *buf; size_t size; req = rpcrdma_create_req(r_xprt); @@ -60,16 +59,7 @@ static int rpcrdma_bc_setup_rqst(struct rpcrdma_xprt *r_xprt, req->rl_sendbuf = rb; /* so that rpcr_to_rdmar works when receiving a request */ rqst->rq_buffer = (void *)req->rl_sendbuf->rg_base; - - buf = &rqst->rq_snd_buf; - buf->head[0].iov_base = rqst->rq_buffer; - buf->head[0].iov_len = 0; - buf->tail[0].iov_base = NULL; - buf->tail[0].iov_len = 0; - buf->page_len = 0; - buf->len = 0; - buf->buflen = size; - + xdr_buf_init(&rqst->rq_snd_buf, rqst->rq_buffer, size); return 0; out_fail: -- cgit v1.2.3 From 5fe6eaa1f9a00b9a5927e3b791ecad2f3eaab130 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 15 Sep 2016 10:55:20 -0400 Subject: SUNRPC: Generalize the RPC buffer allocation API xprtrdma needs to allocate the Call and Reply buffers separately. TBH, the reliance on using a single buffer for the pair of XDR buffers is transport implementation-specific. Transports that want to allocate separate Call and Reply buffers will ignore the "size" argument anyway. Don't bother passing it. The buf_alloc method can't return two pointers. Instead, make the method's return value an error code, and set the rq_buffer pointer in the method itself. This gives call_allocate an opportunity to terminate an RPC instead of looping forever when a permanent problem occurs. If a request is just bogus, or the transport is in a state where it can't allocate resources for any request, there needs to be a way to kill the RPC right there and not loop. This immediately fixes a rare problem in the backchannel send path, which loops if the server happens to send a CB request whose call+reply size is larger than a page (which it shouldn't do yet). One more issue: looks like xprt_inject_disconnect was incorrectly placed in the failure path in call_allocate. It needs to be in the success path, as it is for other call-sites. Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- include/linux/sunrpc/sched.h | 2 +- include/linux/sunrpc/xprt.h | 2 +- net/sunrpc/clnt.c | 12 ++++++++---- net/sunrpc/sched.c | 24 +++++++++++++++--------- net/sunrpc/xprtrdma/svc_rdma_backchannel.c | 17 +++++++++-------- net/sunrpc/xprtrdma/transport.c | 26 ++++++++++++++++++-------- net/sunrpc/xprtsock.c | 17 +++++++++++------ 7 files changed, 63 insertions(+), 37 deletions(-) (limited to 'include') diff --git a/include/linux/sunrpc/sched.h b/include/linux/sunrpc/sched.h index 817af0b4385e..38d4c1b378f2 100644 --- a/include/linux/sunrpc/sched.h +++ b/include/linux/sunrpc/sched.h @@ -239,7 +239,7 @@ struct rpc_task *rpc_wake_up_first(struct rpc_wait_queue *, void *); void rpc_wake_up_status(struct rpc_wait_queue *, int); void rpc_delay(struct rpc_task *, unsigned long); -void * rpc_malloc(struct rpc_task *, size_t); +int rpc_malloc(struct rpc_task *); void rpc_free(void *); int rpciod_up(void); void rpciod_down(void); diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h index 6f1d41b559a3..c01f468fb374 100644 --- a/include/linux/sunrpc/xprt.h +++ b/include/linux/sunrpc/xprt.h @@ -127,7 +127,7 @@ struct rpc_xprt_ops { void (*rpcbind)(struct rpc_task *task); void (*set_port)(struct rpc_xprt *xprt, unsigned short port); void (*connect)(struct rpc_xprt *xprt, struct rpc_task *task); - void * (*buf_alloc)(struct rpc_task *task, size_t size); + int (*buf_alloc)(struct rpc_task *task); void (*buf_free)(void *buffer); int (*send_request)(struct rpc_task *task); void (*set_retrans_timeout)(struct rpc_task *task); diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index 6481986be7a7..5499fda0c1f3 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -1691,6 +1691,7 @@ call_allocate(struct rpc_task *task) struct rpc_rqst *req = task->tk_rqstp; struct rpc_xprt *xprt = req->rq_xprt; struct rpc_procinfo *proc = task->tk_msg.rpc_proc; + int status; dprint_status(task); @@ -1716,11 +1717,14 @@ call_allocate(struct rpc_task *task) req->rq_rcvsize = RPC_REPHDRSIZE + slack + proc->p_replen; req->rq_rcvsize <<= 2; - req->rq_buffer = xprt->ops->buf_alloc(task, - req->rq_callsize + req->rq_rcvsize); - if (req->rq_buffer != NULL) - return; + status = xprt->ops->buf_alloc(task); xprt_inject_disconnect(xprt); + if (status == 0) + return; + if (status != -ENOMEM) { + rpc_exit(task, status); + return; + } dprintk("RPC: %5u rpc_buffer allocation failed\n", task->tk_pid); diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c index 9ae588511aaf..b964d40b259b 100644 --- a/net/sunrpc/sched.c +++ b/net/sunrpc/sched.c @@ -849,14 +849,17 @@ static void rpc_async_schedule(struct work_struct *work) } /** - * rpc_malloc - allocate an RPC buffer - * @task: RPC task that will use this buffer - * @size: requested byte size + * rpc_malloc - allocate RPC buffer resources + * @task: RPC task + * + * A single memory region is allocated, which is split between the + * RPC call and RPC reply that this task is being used for. When + * this RPC is retired, the memory is released by calling rpc_free. * * To prevent rpciod from hanging, this allocator never sleeps, - * returning NULL and suppressing warning if the request cannot be serviced - * immediately. - * The caller can arrange to sleep in a way that is safe for rpciod. + * returning -ENOMEM and suppressing warning if the request cannot + * be serviced immediately. The caller can arrange to sleep in a + * way that is safe for rpciod. * * Most requests are 'small' (under 2KiB) and can be serviced from a * mempool, ensuring that NFS reads and writes can always proceed, @@ -865,8 +868,10 @@ static void rpc_async_schedule(struct work_struct *work) * In order to avoid memory starvation triggering more writebacks of * NFS requests, we avoid using GFP_KERNEL. */ -void *rpc_malloc(struct rpc_task *task, size_t size) +int rpc_malloc(struct rpc_task *task) { + struct rpc_rqst *rqst = task->tk_rqstp; + size_t size = rqst->rq_callsize + rqst->rq_rcvsize; struct rpc_buffer *buf; gfp_t gfp = GFP_NOIO | __GFP_NOWARN; @@ -880,12 +885,13 @@ void *rpc_malloc(struct rpc_task *task, size_t size) buf = kmalloc(size, gfp); if (!buf) - return NULL; + return -ENOMEM; buf->len = size; dprintk("RPC: %5u allocated buffer of size %zu at %p\n", task->tk_pid, size, buf); - return &buf->data; + rqst->rq_buffer = buf->data; + return 0; } EXPORT_SYMBOL_GPL(rpc_malloc); diff --git a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c index a2a7519b0f23..124688ba67e5 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c +++ b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c @@ -159,29 +159,30 @@ out_unmap: /* Server-side transport endpoint wants a whole page for its send * buffer. The client RPC code constructs the RPC header in this * buffer before it invokes ->send_request. - * - * Returns NULL if there was a temporary allocation failure. */ -static void * -xprt_rdma_bc_allocate(struct rpc_task *task, size_t size) +static int +xprt_rdma_bc_allocate(struct rpc_task *task) { struct rpc_rqst *rqst = task->tk_rqstp; struct svc_xprt *sxprt = rqst->rq_xprt->bc_xprt; + size_t size = rqst->rq_callsize; struct svcxprt_rdma *rdma; struct page *page; rdma = container_of(sxprt, struct svcxprt_rdma, sc_xprt); - /* Prevent an infinite loop: try to make this case work */ - if (size > PAGE_SIZE) + if (size > PAGE_SIZE) { WARN_ONCE(1, "svcrdma: large bc buffer request (size %zu)\n", size); + return -EINVAL; + } page = alloc_page(RPCRDMA_DEF_GFP); if (!page) - return NULL; + return -ENOMEM; - return page_address(page); + rqst->rq_buffer = page_address(page); + return 0; } static void diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c index be95eced0726..daa7d4d43fd8 100644 --- a/net/sunrpc/xprtrdma/transport.c +++ b/net/sunrpc/xprtrdma/transport.c @@ -477,7 +477,15 @@ xprt_rdma_connect(struct rpc_xprt *xprt, struct rpc_task *task) } } -/* +/** + * xprt_rdma_allocate - allocate transport resources for an RPC + * @task: RPC task + * + * Return values: + * 0: Success; rq_buffer points to RPC buffer to use + * ENOMEM: Out of memory, call again later + * EIO: A permanent error occurred, do not retry + * * The RDMA allocate/free functions need the task structure as a place * to hide the struct rpcrdma_req, which is necessary for the actual send/recv * sequence. @@ -486,11 +494,12 @@ xprt_rdma_connect(struct rpc_xprt *xprt, struct rpc_task *task) * (rq_send_buf and rq_rcv_buf are both part of a single contiguous buffer). * We may register rq_rcv_buf when using reply chunks. */ -static void * -xprt_rdma_allocate(struct rpc_task *task, size_t size) +static int +xprt_rdma_allocate(struct rpc_task *task) { - struct rpc_xprt *xprt = task->tk_rqstp->rq_xprt; - struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); + struct rpc_rqst *rqst = task->tk_rqstp; + size_t size = rqst->rq_callsize + rqst->rq_rcvsize; + struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(rqst->rq_xprt); struct rpcrdma_regbuf *rb; struct rpcrdma_req *req; size_t min_size; @@ -498,7 +507,7 @@ xprt_rdma_allocate(struct rpc_task *task, size_t size) req = rpcrdma_buffer_get(&r_xprt->rx_buf); if (req == NULL) - return NULL; + return -ENOMEM; flags = RPCRDMA_DEF_GFP; if (RPC_IS_SWAPPER(task)) @@ -515,7 +524,8 @@ out: dprintk("RPC: %s: size %zd, request 0x%p\n", __func__, size, req); req->rl_connect_cookie = 0; /* our reserved value */ req->rl_task = task; - return req->rl_sendbuf->rg_base; + rqst->rq_buffer = req->rl_sendbuf->rg_base; + return 0; out_rdmabuf: min_size = r_xprt->rx_data.inline_wsize; @@ -558,7 +568,7 @@ out_sendbuf: out_fail: rpcrdma_buffer_put(req); - return NULL; + return -ENOMEM; } /* diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index bf168838a029..bd30b4b18d72 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -2533,23 +2533,28 @@ static void xs_tcp_print_stats(struct rpc_xprt *xprt, struct seq_file *seq) * we allocate pages instead doing a kmalloc like rpc_malloc is because we want * to use the server side send routines. */ -static void *bc_malloc(struct rpc_task *task, size_t size) +static int bc_malloc(struct rpc_task *task) { + struct rpc_rqst *rqst = task->tk_rqstp; + size_t size = rqst->rq_callsize; struct page *page; struct rpc_buffer *buf; - WARN_ON_ONCE(size > PAGE_SIZE - sizeof(struct rpc_buffer)); - if (size > PAGE_SIZE - sizeof(struct rpc_buffer)) - return NULL; + if (size > PAGE_SIZE - sizeof(struct rpc_buffer)) { + WARN_ONCE(1, "xprtsock: large bc buffer request (size %zu)\n", + size); + return -EINVAL; + } page = alloc_page(GFP_KERNEL); if (!page) - return NULL; + return -ENOMEM; buf = page_address(page); buf->len = PAGE_SIZE; - return buf->data; + rqst->rq_buffer = buf->data; + return 0; } /* -- cgit v1.2.3 From 3435c74aed2d7b743ccbf34616c523ebee7be943 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 15 Sep 2016 10:55:29 -0400 Subject: SUNRPC: Generalize the RPC buffer release API xprtrdma needs to allocate the Call and Reply buffers separately. TBH, the reliance on using a single buffer for the pair of XDR buffers is transport implementation-specific. Instead of passing just the rq_buffer into the buf_free method, pass the task structure and let buf_free take care of freeing both XDR buffers at once. There's a micro-optimization here. In the common case, both xprt_release and the transport's buf_free method were checking if rq_buffer was NULL. Now the check is done only once per RPC. Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- include/linux/sunrpc/sched.h | 2 +- include/linux/sunrpc/xprt.h | 2 +- net/sunrpc/sched.c | 10 ++++------ net/sunrpc/xprt.c | 2 +- net/sunrpc/xprtrdma/svc_rdma_backchannel.c | 2 +- net/sunrpc/xprtrdma/transport.c | 26 ++++++++++---------------- net/sunrpc/xprtrdma/xprt_rdma.h | 1 - net/sunrpc/xprtsock.c | 6 ++---- 8 files changed, 20 insertions(+), 31 deletions(-) (limited to 'include') diff --git a/include/linux/sunrpc/sched.h b/include/linux/sunrpc/sched.h index 38d4c1b378f2..7ba040c797ec 100644 --- a/include/linux/sunrpc/sched.h +++ b/include/linux/sunrpc/sched.h @@ -240,7 +240,7 @@ struct rpc_task *rpc_wake_up_first(struct rpc_wait_queue *, void rpc_wake_up_status(struct rpc_wait_queue *, int); void rpc_delay(struct rpc_task *, unsigned long); int rpc_malloc(struct rpc_task *); -void rpc_free(void *); +void rpc_free(struct rpc_task *); int rpciod_up(void); void rpciod_down(void); int __rpc_wait_for_completion_task(struct rpc_task *task, wait_bit_action_f *); diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h index c01f468fb374..72c2aebc592b 100644 --- a/include/linux/sunrpc/xprt.h +++ b/include/linux/sunrpc/xprt.h @@ -128,7 +128,7 @@ struct rpc_xprt_ops { void (*set_port)(struct rpc_xprt *xprt, unsigned short port); void (*connect)(struct rpc_xprt *xprt, struct rpc_task *task); int (*buf_alloc)(struct rpc_task *task); - void (*buf_free)(void *buffer); + void (*buf_free)(struct rpc_task *task); int (*send_request)(struct rpc_task *task); void (*set_retrans_timeout)(struct rpc_task *task); void (*timer)(struct rpc_xprt *xprt, struct rpc_task *task); diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c index b964d40b259b..6690ebc774ed 100644 --- a/net/sunrpc/sched.c +++ b/net/sunrpc/sched.c @@ -896,18 +896,16 @@ int rpc_malloc(struct rpc_task *task) EXPORT_SYMBOL_GPL(rpc_malloc); /** - * rpc_free - free buffer allocated via rpc_malloc - * @buffer: buffer to free + * rpc_free - free RPC buffer resources allocated via rpc_malloc + * @task: RPC task * */ -void rpc_free(void *buffer) +void rpc_free(struct rpc_task *task) { + void *buffer = task->tk_rqstp->rq_buffer; size_t size; struct rpc_buffer *buf; - if (!buffer) - return; - buf = container_of(buffer, struct rpc_buffer, data); size = buf->len; diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index ea244b29138b..685e6d225414 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -1295,7 +1295,7 @@ void xprt_release(struct rpc_task *task) xprt_schedule_autodisconnect(xprt); spin_unlock_bh(&xprt->transport_lock); if (req->rq_buffer) - xprt->ops->buf_free(req->rq_buffer); + xprt->ops->buf_free(task); xprt_inject_disconnect(xprt); if (req->rq_cred != NULL) put_rpccred(req->rq_cred); diff --git a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c index 124688ba67e5..fa893507d9eb 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c +++ b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c @@ -186,7 +186,7 @@ xprt_rdma_bc_allocate(struct rpc_task *task) } static void -xprt_rdma_bc_free(void *buffer) +xprt_rdma_bc_free(struct rpc_task *task) { /* No-op: ctxt and page have already been freed. */ } diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c index daa7d4d43fd8..ebf14ba437c6 100644 --- a/net/sunrpc/xprtrdma/transport.c +++ b/net/sunrpc/xprtrdma/transport.c @@ -523,7 +523,6 @@ xprt_rdma_allocate(struct rpc_task *task) out: dprintk("RPC: %s: size %zd, request 0x%p\n", __func__, size, req); req->rl_connect_cookie = 0; /* our reserved value */ - req->rl_task = task; rqst->rq_buffer = req->rl_sendbuf->rg_base; return 0; @@ -571,31 +570,26 @@ out_fail: return -ENOMEM; } -/* - * This function returns all RDMA resources to the pool. +/** + * xprt_rdma_free - release resources allocated by xprt_rdma_allocate + * @task: RPC task + * + * Caller guarantees rqst->rq_buffer is non-NULL. */ static void -xprt_rdma_free(void *buffer) +xprt_rdma_free(struct rpc_task *task) { - struct rpcrdma_req *req; - struct rpcrdma_xprt *r_xprt; - struct rpcrdma_regbuf *rb; - - if (buffer == NULL) - return; + struct rpc_rqst *rqst = task->tk_rqstp; + struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(rqst->rq_xprt); + struct rpcrdma_req *req = rpcr_to_rdmar(rqst); - rb = container_of(buffer, struct rpcrdma_regbuf, rg_base[0]); - req = rb->rg_owner; if (req->rl_backchannel) return; - r_xprt = container_of(req->rl_buffer, struct rpcrdma_xprt, rx_buf); - dprintk("RPC: %s: called on 0x%p\n", __func__, req->rl_reply); r_xprt->rx_ia.ri_ops->ro_unmap_safe(r_xprt, req, - !RPC_IS_ASYNC(req->rl_task)); - + !RPC_IS_ASYNC(task)); rpcrdma_buffer_put(req); } diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index 9df47c857d27..4838a85bdcf6 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -283,7 +283,6 @@ struct rpcrdma_req { struct list_head rl_free; unsigned int rl_niovs; unsigned int rl_connect_cookie; - struct rpc_task *rl_task; struct rpcrdma_buffer *rl_buffer; struct rpcrdma_rep *rl_reply;/* holder for reply buffer */ struct ib_sge rl_send_iov[RPCRDMA_MAX_IOVS]; diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index bd30b4b18d72..bde39f2ff6e5 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -2560,13 +2560,11 @@ static int bc_malloc(struct rpc_task *task) /* * Free the space allocated in the bc_alloc routine */ -static void bc_free(void *buffer) +static void bc_free(struct rpc_task *task) { + void *buffer = task->tk_rqstp->rq_buffer; struct rpc_buffer *buf; - if (!buffer) - return; - buf = container_of(buffer, struct rpc_buffer, data); free_page((unsigned long)buf); } -- cgit v1.2.3 From 68778945e46f143ed7974b427a8065f69a4ce944 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 15 Sep 2016 10:55:37 -0400 Subject: SUNRPC: Separate buffer pointers for RPC Call and Reply messages For xprtrdma, the RPC Call and Reply buffers are involved in real I/O operations. To start with, the DMA direction of the I/O for a Call is opposite that of a Reply. In the current arrangement, the Reply buffer address is on a four-byte alignment just past the call buffer. Would be friendlier on some platforms if that was at a DMA cache alignment instead. Because the current arrangement allocates a single memory region which contains both buffers, the RPC Reply buffer often contains a page boundary in it when the Call buffer is large enough (which is frequent). It would be a little nicer for setting up DMA operations (and possible registration of the Reply buffer) if the two buffers were separated, well-aligned, and contained as few page boundaries as possible. Now, I could just pad out the single memory region used for the pair of buffers. But frequently that would mean a lot of unused space to ensure the Reply buffer did not have a page boundary. Add a separate pointer to rpc_rqst that points right to the RPC Reply buffer. This makes no difference to xprtsock, but it will help xprtrdma in subsequent patches. Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- include/linux/sunrpc/xprt.h | 5 +++-- net/sunrpc/clnt.c | 2 +- net/sunrpc/sched.c | 1 + net/sunrpc/xprtrdma/transport.c | 1 + 4 files changed, 6 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h index 72c2aebc592b..46f069efa056 100644 --- a/include/linux/sunrpc/xprt.h +++ b/include/linux/sunrpc/xprt.h @@ -84,8 +84,9 @@ struct rpc_rqst { struct list_head rq_list; void *rq_buffer; /* Call XDR encode buffer */ - size_t rq_callsize, - rq_rcvsize; + size_t rq_callsize; + void *rq_rbuffer; /* Reply XDR decode buffer */ + size_t rq_rcvsize; size_t rq_xmit_bytes_sent; /* total bytes sent */ size_t rq_reply_bytes_recvd; /* total reply bytes */ /* received */ diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index 5499fda0c1f3..34dd7b26ee5f 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -1766,7 +1766,7 @@ rpc_xdr_encode(struct rpc_task *task) req->rq_buffer, req->rq_callsize); xdr_buf_init(&req->rq_rcv_buf, - (char *)req->rq_buffer + req->rq_callsize, + req->rq_rbuffer, req->rq_rcvsize); p = rpc_encode_header(task); diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c index 6690ebc774ed..5db68b371db2 100644 --- a/net/sunrpc/sched.c +++ b/net/sunrpc/sched.c @@ -891,6 +891,7 @@ int rpc_malloc(struct rpc_task *task) dprintk("RPC: %5u allocated buffer of size %zu at %p\n", task->tk_pid, size, buf); rqst->rq_buffer = buf->data; + rqst->rq_rbuffer = (char *)rqst->rq_buffer + rqst->rq_callsize; return 0; } EXPORT_SYMBOL_GPL(rpc_malloc); diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c index ebf14ba437c6..136caf3dd299 100644 --- a/net/sunrpc/xprtrdma/transport.c +++ b/net/sunrpc/xprtrdma/transport.c @@ -524,6 +524,7 @@ out: dprintk("RPC: %s: size %zd, request 0x%p\n", __func__, size, req); req->rl_connect_cookie = 0; /* our reserved value */ rqst->rq_buffer = req->rl_sendbuf->rg_base; + rqst->rq_rbuffer = (char *)rqst->rq_buffer + rqst->rq_rcvsize; return 0; out_rdmabuf: -- cgit v1.2.3 From 5a6d1db4556940533f1a5b6521e522f3e46508ed Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 15 Sep 2016 10:55:45 -0400 Subject: SUNRPC: Add a transport-specific private field in rpc_rqst Currently there's a hidden and indirect mechanism for finding the rpcrdma_req that goes with an rpc_rqst. It depends on getting from the rq_buffer pointer in struct rpc_rqst to the struct rpcrdma_regbuf that controls that buffer, and then to the struct rpcrdma_req it goes with. This was done back in the day to avoid the need to add a per-rqst pointer or to alter the buf_free API when support for RPC-over-RDMA was introduced. I'm about to change the way regbuf's work to support larger inline thresholds. Now is a good time to replace this indirect mechanism with something that is more straightforward. I guess this should be considered a clean up. Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- include/linux/sunrpc/xprt.h | 1 + net/sunrpc/xprtrdma/backchannel.c | 6 ++---- net/sunrpc/xprtrdma/transport.c | 2 +- net/sunrpc/xprtrdma/verbs.c | 1 - net/sunrpc/xprtrdma/xprt_rdma.h | 13 +++++++------ 5 files changed, 11 insertions(+), 12 deletions(-) (limited to 'include') diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h index 46f069efa056..a5da60b24d83 100644 --- a/include/linux/sunrpc/xprt.h +++ b/include/linux/sunrpc/xprt.h @@ -83,6 +83,7 @@ struct rpc_rqst { void (*rq_release_snd_buf)(struct rpc_rqst *); /* release rq_enc_pages */ struct list_head rq_list; + void *rq_xprtdata; /* Per-xprt private data */ void *rq_buffer; /* Call XDR encode buffer */ size_t rq_callsize; void *rq_rbuffer; /* Reply XDR decode buffer */ diff --git a/net/sunrpc/xprtrdma/backchannel.c b/net/sunrpc/xprtrdma/backchannel.c index d3cfaf281e55..c4904f881640 100644 --- a/net/sunrpc/xprtrdma/backchannel.c +++ b/net/sunrpc/xprtrdma/backchannel.c @@ -55,11 +55,9 @@ static int rpcrdma_bc_setup_rqst(struct rpcrdma_xprt *r_xprt, rb = rpcrdma_alloc_regbuf(ia, size, GFP_KERNEL); if (IS_ERR(rb)) goto out_fail; - rb->rg_owner = req; req->rl_sendbuf = rb; - /* so that rpcr_to_rdmar works when receiving a request */ - rqst->rq_buffer = (void *)req->rl_sendbuf->rg_base; - xdr_buf_init(&rqst->rq_snd_buf, rqst->rq_buffer, size); + xdr_buf_init(&rqst->rq_snd_buf, rb->rg_base, size); + rpcrdma_set_xprtdata(rqst, req); return 0; out_fail: diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c index 136caf3dd299..d83bffa92dfc 100644 --- a/net/sunrpc/xprtrdma/transport.c +++ b/net/sunrpc/xprtrdma/transport.c @@ -523,6 +523,7 @@ xprt_rdma_allocate(struct rpc_task *task) out: dprintk("RPC: %s: size %zd, request 0x%p\n", __func__, size, req); req->rl_connect_cookie = 0; /* our reserved value */ + rpcrdma_set_xprtdata(rqst, req); rqst->rq_buffer = req->rl_sendbuf->rg_base; rqst->rq_rbuffer = (char *)rqst->rq_buffer + rqst->rq_rcvsize; return 0; @@ -559,7 +560,6 @@ out_sendbuf: rb = rpcrdma_alloc_regbuf(&r_xprt->rx_ia, size, flags); if (IS_ERR(rb)) goto out_fail; - rb->rg_owner = req; r_xprt->rx_stats.hardway_register_count += size; rpcrdma_free_regbuf(&r_xprt->rx_ia, req->rl_sendbuf); diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index 799cce6cbe45..93def0bf07af 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -1210,7 +1210,6 @@ rpcrdma_alloc_regbuf(struct rpcrdma_ia *ia, size_t size, gfp_t flags) iov->length = size; iov->lkey = ia->ri_pd->local_dma_lkey; rb->rg_size = size; - rb->rg_owner = NULL; return rb; out_free: diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index 4838a85bdcf6..484855eddb85 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -113,7 +113,6 @@ struct rpcrdma_ep { struct rpcrdma_regbuf { size_t rg_size; - struct rpcrdma_req *rg_owner; struct ib_sge rg_iov; __be32 rg_base[0] __attribute__ ((aligned(256))); }; @@ -297,14 +296,16 @@ struct rpcrdma_req { struct rpcrdma_mr_seg rl_segments[RPCRDMA_MAX_SEGS]; }; +static inline void +rpcrdma_set_xprtdata(struct rpc_rqst *rqst, struct rpcrdma_req *req) +{ + rqst->rq_xprtdata = req; +} + static inline struct rpcrdma_req * rpcr_to_rdmar(struct rpc_rqst *rqst) { - void *buffer = rqst->rq_buffer; - struct rpcrdma_regbuf *rb; - - rb = container_of(buffer, struct rpcrdma_regbuf, rg_base); - return rb->rg_owner; + return rqst->rq_xprtdata; } /* -- cgit v1.2.3 From ff06bd191e722393d9abf7d6f9767f195274e909 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 15 Sep 2016 10:56:59 -0400 Subject: rpcrdma: RDMA/CM private message data structure Introduce data structure used by both client and server to exchange implementation details during RDMA/CM connection establishment. This is an experimental out-of-band exchange between Linux RPC-over-RDMA Version One implementations, replacing the deprecated CCP (see RFC 5666bis). The purpose of this extension is to enable prototyping of features that might be introduced in a subsequent version of RPC-over-RDMA. Suggested by Christoph Hellwig and Devesh Sharma. Signed-off-by: Chuck Lever Reviewed-by: Sagi Grimberg Signed-off-by: Anna Schumaker --- include/linux/sunrpc/rpc_rdma.h | 35 +++++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) (limited to 'include') diff --git a/include/linux/sunrpc/rpc_rdma.h b/include/linux/sunrpc/rpc_rdma.h index 3b1ff38f0c37..a7da6bf56610 100644 --- a/include/linux/sunrpc/rpc_rdma.h +++ b/include/linux/sunrpc/rpc_rdma.h @@ -41,6 +41,7 @@ #define _LINUX_SUNRPC_RPC_RDMA_H #include +#include #define RPCRDMA_VERSION 1 #define rpcrdma_version cpu_to_be32(RPCRDMA_VERSION) @@ -129,4 +130,38 @@ enum rpcrdma_proc { #define rdma_done cpu_to_be32(RDMA_DONE) #define rdma_error cpu_to_be32(RDMA_ERROR) +/* + * Private extension to RPC-over-RDMA Version One. + * Message passed during RDMA-CM connection set-up. + * + * Add new fields at the end, and don't permute existing + * fields. + */ +struct rpcrdma_connect_private { + __be32 cp_magic; + u8 cp_version; + u8 cp_flags; + u8 cp_send_size; + u8 cp_recv_size; +} __packed; + +#define rpcrdma_cmp_magic __cpu_to_be32(0xf6ab0e18) + +enum { + RPCRDMA_CMP_VERSION = 1, + RPCRDMA_CMP_F_SND_W_INV_OK = BIT(0), +}; + +static inline u8 +rpcrdma_encode_buffer_size(unsigned int size) +{ + return (size >> 10) - 1; +} + +static inline unsigned int +rpcrdma_decode_buffer_size(u8 val) +{ + return ((unsigned int)val + 1) << 10; +} + #endif /* _LINUX_SUNRPC_RPC_RDMA_H */ -- cgit v1.2.3 From 87cfb9a0c85ce4a0c96a4f3d692a85519b933ade Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 15 Sep 2016 10:57:07 -0400 Subject: xprtrdma: Client-side support for rpcrdma_connect_private Send an RDMA-CM private message on connect, and look for one during a connection-established event. Both sides can communicate their various implementation limits. Implementations that don't support this sideband protocol ignore it. Once the client knows the server's inline threshold maxima, it can adjust the use of Reply chunks, and eliminate most use of Position Zero Read chunks. Moderately-sized I/O can be done using a pure inline RDMA Send instead of RDMA operations that require memory registration. Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- include/linux/sunrpc/rpc_rdma.h | 4 ++++ net/sunrpc/xprtrdma/fmr_ops.c | 5 ++--- net/sunrpc/xprtrdma/frwr_ops.c | 5 ++--- net/sunrpc/xprtrdma/rpc_rdma.c | 8 +++++--- net/sunrpc/xprtrdma/verbs.c | 40 +++++++++++++++++++++++++++++++++++++--- net/sunrpc/xprtrdma/xprt_rdma.h | 6 +++--- 6 files changed, 53 insertions(+), 15 deletions(-) (limited to 'include') diff --git a/include/linux/sunrpc/rpc_rdma.h b/include/linux/sunrpc/rpc_rdma.h index a7da6bf56610..cfda6adcf33c 100644 --- a/include/linux/sunrpc/rpc_rdma.h +++ b/include/linux/sunrpc/rpc_rdma.h @@ -46,6 +46,10 @@ #define RPCRDMA_VERSION 1 #define rpcrdma_version cpu_to_be32(RPCRDMA_VERSION) +enum { + RPCRDMA_V1_DEF_INLINE_SIZE = 1024, +}; + struct rpcrdma_segment { __be32 rs_handle; /* Registered memory handle */ __be32 rs_length; /* Length of the chunk in bytes */ diff --git a/net/sunrpc/xprtrdma/fmr_ops.c b/net/sunrpc/xprtrdma/fmr_ops.c index 21cb3b150b37..16690a1b653e 100644 --- a/net/sunrpc/xprtrdma/fmr_ops.c +++ b/net/sunrpc/xprtrdma/fmr_ops.c @@ -160,9 +160,8 @@ static int fmr_op_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep, struct rpcrdma_create_data_internal *cdata) { - rpcrdma_set_max_header_sizes(ia, cdata, max_t(unsigned int, 1, - RPCRDMA_MAX_DATA_SEGS / - RPCRDMA_MAX_FMR_SGES)); + ia->ri_max_segs = max_t(unsigned int, 1, RPCRDMA_MAX_DATA_SEGS / + RPCRDMA_MAX_FMR_SGES); return 0; } diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c index 892b5e1d9b09..fcfcf3ac030c 100644 --- a/net/sunrpc/xprtrdma/frwr_ops.c +++ b/net/sunrpc/xprtrdma/frwr_ops.c @@ -242,9 +242,8 @@ frwr_op_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep, depth; } - rpcrdma_set_max_header_sizes(ia, cdata, max_t(unsigned int, 1, - RPCRDMA_MAX_DATA_SEGS / - ia->ri_max_frmr_depth)); + ia->ri_max_segs = max_t(unsigned int, 1, RPCRDMA_MAX_DATA_SEGS / + ia->ri_max_frmr_depth); return 0; } diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c index c2906e314287..ea734c2c7ddb 100644 --- a/net/sunrpc/xprtrdma/rpc_rdma.c +++ b/net/sunrpc/xprtrdma/rpc_rdma.c @@ -118,10 +118,12 @@ static unsigned int rpcrdma_max_reply_header_size(unsigned int maxsegs) return size; } -void rpcrdma_set_max_header_sizes(struct rpcrdma_ia *ia, - struct rpcrdma_create_data_internal *cdata, - unsigned int maxsegs) +void rpcrdma_set_max_header_sizes(struct rpcrdma_xprt *r_xprt) { + struct rpcrdma_create_data_internal *cdata = &r_xprt->rx_data; + struct rpcrdma_ia *ia = &r_xprt->rx_ia; + unsigned int maxsegs = ia->ri_max_segs; + ia->ri_max_inline_write = cdata->inline_wsize - rpcrdma_max_call_header_size(maxsegs); ia->ri_max_inline_read = cdata->inline_rsize - diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index a49c788aa59a..6bab8416a4fc 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -204,6 +204,33 @@ out_fail: goto out_schedule; } +static void +rpcrdma_update_connect_private(struct rpcrdma_xprt *r_xprt, + struct rdma_conn_param *param) +{ + struct rpcrdma_create_data_internal *cdata = &r_xprt->rx_data; + const struct rpcrdma_connect_private *pmsg = param->private_data; + unsigned int rsize, wsize; + + rsize = RPCRDMA_V1_DEF_INLINE_SIZE; + wsize = RPCRDMA_V1_DEF_INLINE_SIZE; + + if (pmsg && + pmsg->cp_magic == rpcrdma_cmp_magic && + pmsg->cp_version == RPCRDMA_CMP_VERSION) { + rsize = rpcrdma_decode_buffer_size(pmsg->cp_send_size); + wsize = rpcrdma_decode_buffer_size(pmsg->cp_recv_size); + } + + if (rsize < cdata->inline_rsize) + cdata->inline_rsize = rsize; + if (wsize < cdata->inline_wsize) + cdata->inline_wsize = wsize; + pr_info("rpcrdma: max send %u, max recv %u\n", + cdata->inline_wsize, cdata->inline_rsize); + rpcrdma_set_max_header_sizes(r_xprt); +} + static int rpcrdma_conn_upcall(struct rdma_cm_id *id, struct rdma_cm_event *event) { @@ -244,6 +271,7 @@ rpcrdma_conn_upcall(struct rdma_cm_id *id, struct rdma_cm_event *event) " (%d initiator)\n", __func__, attr->max_dest_rd_atomic, attr->max_rd_atomic); + rpcrdma_update_connect_private(xprt, &event->param.conn); goto connected; case RDMA_CM_EVENT_CONNECT_ERROR: connstate = -ENOTCONN; @@ -454,6 +482,7 @@ int rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia, struct rpcrdma_create_data_internal *cdata) { + struct rpcrdma_connect_private *pmsg = &ep->rep_cm_private; struct ib_cq *sendcq, *recvcq; unsigned int max_qp_wr; int rc; @@ -536,9 +565,14 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia, /* Initialize cma parameters */ memset(&ep->rep_remote_cma, 0, sizeof(ep->rep_remote_cma)); - /* RPC/RDMA does not use private data */ - ep->rep_remote_cma.private_data = NULL; - ep->rep_remote_cma.private_data_len = 0; + /* Prepare RDMA-CM private message */ + pmsg->cp_magic = rpcrdma_cmp_magic; + pmsg->cp_version = RPCRDMA_CMP_VERSION; + pmsg->cp_flags = 0; + pmsg->cp_send_size = rpcrdma_encode_buffer_size(cdata->inline_wsize); + pmsg->cp_recv_size = rpcrdma_encode_buffer_size(cdata->inline_rsize); + ep->rep_remote_cma.private_data = pmsg; + ep->rep_remote_cma.private_data_len = sizeof(*pmsg); /* Client offers RDMA Read but does not initiate */ ep->rep_remote_cma.initiator_depth = 0; diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index 9aabca68c49d..89df1680b1eb 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -70,6 +70,7 @@ struct rpcrdma_ia { struct ib_pd *ri_pd; struct completion ri_done; int ri_async_rc; + unsigned int ri_max_segs; unsigned int ri_max_frmr_depth; unsigned int ri_max_inline_write; unsigned int ri_max_inline_read; @@ -87,6 +88,7 @@ struct rpcrdma_ep { int rep_connected; struct ib_qp_init_attr rep_attr; wait_queue_head_t rep_connect_wait; + struct rpcrdma_connect_private rep_cm_private; struct rdma_conn_param rep_remote_cma; struct sockaddr_storage rep_remote_addr; struct delayed_work rep_connect_worker; @@ -523,9 +525,7 @@ void rpcrdma_reply_handler(struct rpcrdma_rep *); * RPC/RDMA protocol calls - xprtrdma/rpc_rdma.c */ int rpcrdma_marshal_req(struct rpc_rqst *); -void rpcrdma_set_max_header_sizes(struct rpcrdma_ia *, - struct rpcrdma_create_data_internal *, - unsigned int); +void rpcrdma_set_max_header_sizes(struct rpcrdma_xprt *); /* RPC/RDMA module init - xprtrdma/transport.c */ -- cgit v1.2.3 From 44829d02d2d7a7064842ecf36239ea24df1cdf58 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 15 Sep 2016 10:57:32 -0400 Subject: xprtrdma: Support larger inline thresholds The Version One default inline threshold is still 1KB. But allow testing with thresholds up to 64KB. This maximum is somewhat arbitrary. There's no fundamental architectural limit I'm aware of, but it's good to keep the size of Receive buffers reasonable. Now that Send can use a s/g list, a Send buffer is only as large as each RPC requires. Receive buffers are always the size of the inline threshold, however. Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- include/linux/sunrpc/xprtrdma.h | 4 ++-- net/sunrpc/xprtrdma/transport.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/sunrpc/xprtrdma.h b/include/linux/sunrpc/xprtrdma.h index 39267dc3486a..221b7a2e5406 100644 --- a/include/linux/sunrpc/xprtrdma.h +++ b/include/linux/sunrpc/xprtrdma.h @@ -53,8 +53,8 @@ #define RPCRDMA_MAX_SLOT_TABLE (256U) #define RPCRDMA_MIN_INLINE (1024) /* min inline thresh */ -#define RPCRDMA_DEF_INLINE (1024) /* default inline thresh */ -#define RPCRDMA_MAX_INLINE (3068) /* max inline thresh */ +#define RPCRDMA_DEF_INLINE (4096) /* default inline thresh */ +#define RPCRDMA_MAX_INLINE (65536) /* max inline thresh */ /* Memory registration strategies, by number. * This is part of a kernel / user space API. Do not remove. */ diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c index 6a358ab6ce27..ed5e285fd2ea 100644 --- a/net/sunrpc/xprtrdma/transport.c +++ b/net/sunrpc/xprtrdma/transport.c @@ -97,7 +97,7 @@ static struct ctl_table xr_tunables_table[] = { .data = &xprt_rdma_max_inline_read, .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = proc_dointvec_minmax, .extra1 = &min_inline_size, .extra2 = &max_inline_size, }, @@ -106,7 +106,7 @@ static struct ctl_table xr_tunables_table[] = { .data = &xprt_rdma_max_inline_write, .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = proc_dointvec_minmax, .extra1 = &min_inline_size, .extra2 = &max_inline_size, }, -- cgit v1.2.3 From ca440c383a588091cae9fbce610b86a6e9d961ad Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Thu, 15 Sep 2016 14:40:49 -0400 Subject: pnfs: add a new mechanism to select a layout driver according to an ordered list Currently, the layout driver selection code always chooses the first one from the list. That's not really ideal however, as the server can send the list of layout types in any order that it likes. It's up to the client to select the best one for its needs. This patch adds an ordered list of preferred driver types and has the selection code sort the list of available layout drivers according to it. Any unrecognized layout type is sorted to the end of the list. For now, the order of preference is hardcoded, but it should be possible to make this configurable in the future. Signed-off-by: Jeff Layton Reviewed-by: J. Bruce Fields Signed-off-by: Anna Schumaker --- fs/nfs/client.c | 1 + fs/nfs/nfs4proc.c | 2 +- fs/nfs/nfs4xdr.c | 31 +++++++++++++++------------ fs/nfs/pnfs.c | 56 ++++++++++++++++++++++++++++++++++++++++++------- fs/nfs/pnfs.h | 5 +++-- include/linux/nfs_xdr.h | 1 + 6 files changed, 72 insertions(+), 24 deletions(-) (limited to 'include') diff --git a/fs/nfs/client.c b/fs/nfs/client.c index 535c2563b701..51136b03788d 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -788,6 +788,7 @@ int nfs_probe_fsinfo(struct nfs_server *server, struct nfs_fh *mntfh, struct nfs } fsinfo.fattr = fattr; + fsinfo.nlayouttypes = 0; memset(fsinfo.layouttype, 0, sizeof(fsinfo.layouttype)); error = clp->rpc_ops->fsinfo(server, mntfh, &fsinfo); if (error < 0) diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index cfdf45a14cc3..acf5a2caa423 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -4312,7 +4312,7 @@ static int nfs4_proc_fsinfo(struct nfs_server *server, struct nfs_fh *fhandle, s if (error == 0) { /* block layout checks this! */ server->pnfs_blksize = fsinfo->blksize; - set_pnfs_layoutdriver(server, fhandle, fsinfo->layouttype); + set_pnfs_layoutdriver(server, fhandle, fsinfo); } return error; diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 41a02f994976..17b4e059c588 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -4728,29 +4728,34 @@ static int decode_getfattr(struct xdr_stream *xdr, struct nfs_fattr *fattr, * Decode potentially multiple layout types. */ static int decode_pnfs_layout_types(struct xdr_stream *xdr, - uint32_t *layouttype) + struct nfs_fsinfo *fsinfo) { __be32 *p; - uint32_t num, i; + uint32_t i; p = xdr_inline_decode(xdr, 4); if (unlikely(!p)) goto out_overflow; - num = be32_to_cpup(p); + fsinfo->nlayouttypes = be32_to_cpup(p); /* pNFS is not supported by the underlying file system */ - if (num == 0) { + if (fsinfo->nlayouttypes == 0) return 0; - } - if (num > NFS_MAX_LAYOUT_TYPES) - printk(KERN_INFO "NFS: %s: Warning: Too many (%d) pNFS layout types\n", __func__, num); /* Decode and set first layout type, move xdr->p past unused types */ - p = xdr_inline_decode(xdr, num * 4); + p = xdr_inline_decode(xdr, fsinfo->nlayouttypes * 4); if (unlikely(!p)) goto out_overflow; - for(i = 0; i < num && i < NFS_MAX_LAYOUT_TYPES; i++) - layouttype[i] = be32_to_cpup(p++); + + /* If we get too many, then just cap it at the max */ + if (fsinfo->nlayouttypes > NFS_MAX_LAYOUT_TYPES) { + printk(KERN_INFO "NFS: %s: Warning: Too many (%u) pNFS layout types\n", + __func__, fsinfo->nlayouttypes); + fsinfo->nlayouttypes = NFS_MAX_LAYOUT_TYPES; + } + + for(i = 0; i < fsinfo->nlayouttypes; ++i) + fsinfo->layouttype[i] = be32_to_cpup(p++); return 0; out_overflow: print_overflow_msg(__func__, xdr); @@ -4762,7 +4767,7 @@ out_overflow: * Note we must ensure that layouttype is set in any non-error case. */ static int decode_attr_pnfstype(struct xdr_stream *xdr, uint32_t *bitmap, - uint32_t *layouttype) + struct nfs_fsinfo *fsinfo) { int status = 0; @@ -4770,7 +4775,7 @@ static int decode_attr_pnfstype(struct xdr_stream *xdr, uint32_t *bitmap, if (unlikely(bitmap[1] & (FATTR4_WORD1_FS_LAYOUT_TYPES - 1U))) return -EIO; if (bitmap[1] & FATTR4_WORD1_FS_LAYOUT_TYPES) { - status = decode_pnfs_layout_types(xdr, layouttype); + status = decode_pnfs_layout_types(xdr, fsinfo); bitmap[1] &= ~FATTR4_WORD1_FS_LAYOUT_TYPES; } return status; @@ -4853,7 +4858,7 @@ static int decode_fsinfo(struct xdr_stream *xdr, struct nfs_fsinfo *fsinfo) status = decode_attr_time_delta(xdr, bitmap, &fsinfo->time_delta); if (status != 0) goto xdr_error; - status = decode_attr_pnfstype(xdr, bitmap, fsinfo->layouttype); + status = decode_attr_pnfstype(xdr, bitmap, fsinfo); if (status != 0) goto xdr_error; diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index a6a683fbd230..b588ccf05045 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -30,6 +30,7 @@ #include #include #include +#include #include "internal.h" #include "pnfs.h" #include "iostat.h" @@ -98,6 +99,39 @@ unset_pnfs_layoutdriver(struct nfs_server *nfss) nfss->pnfs_curr_ld = NULL; } +/* + * When the server sends a list of layout types, we choose one in the order + * given in the list below. + * + * FIXME: should this list be configurable in some fashion? module param? + * mount option? something else? + */ +static const u32 ld_prefs[] = { + LAYOUT_SCSI, + LAYOUT_BLOCK_VOLUME, + LAYOUT_OSD2_OBJECTS, + LAYOUT_FLEX_FILES, + LAYOUT_NFSV4_1_FILES, + 0 +}; + +static int +ld_cmp(const void *e1, const void *e2) +{ + u32 ld1 = *((u32 *)e1); + u32 ld2 = *((u32 *)e2); + int i; + + for (i = 0; ld_prefs[i] != 0; i++) { + if (ld1 == ld_prefs[i]) + return -1; + + if (ld2 == ld_prefs[i]) + return 1; + } + return 0; +} + /* * Try to set the server's pnfs module to the pnfs layout type specified by id. * Currently only one pNFS layout driver per filesystem is supported. @@ -106,10 +140,11 @@ unset_pnfs_layoutdriver(struct nfs_server *nfss) */ void set_pnfs_layoutdriver(struct nfs_server *server, const struct nfs_fh *mntfh, - u32 *ids) + struct nfs_fsinfo *fsinfo) { struct pnfs_layoutdriver_type *ld_type = NULL; u32 id; + int i; if (!(server->nfs_client->cl_exchange_flags & (EXCHGID4_FLAG_USE_NON_PNFS | EXCHGID4_FLAG_USE_PNFS_MDS))) { @@ -118,18 +153,23 @@ set_pnfs_layoutdriver(struct nfs_server *server, const struct nfs_fh *mntfh, goto out_no_driver; } - id = ids[0]; - if (!id) - goto out_no_driver; + sort(fsinfo->layouttype, fsinfo->nlayouttypes, + sizeof(*fsinfo->layouttype), ld_cmp, NULL); - ld_type = find_pnfs_driver(id); - if (!ld_type) { - request_module("%s-%u", LAYOUT_NFSV4_1_MODULE_PREFIX, id); + for (i = 0; i < fsinfo->nlayouttypes; i++) { + id = fsinfo->layouttype[i]; ld_type = find_pnfs_driver(id); + if (!ld_type) { + request_module("%s-%u", LAYOUT_NFSV4_1_MODULE_PREFIX, + id); + ld_type = find_pnfs_driver(id); + } + if (ld_type) + break; } if (!ld_type) { - dprintk("%s: No pNFS module found for %u.\n", __func__, id); + dprintk("%s: No pNFS module found!\n", __func__); goto out_no_driver; } diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h index be515e6a3823..5c295512c967 100644 --- a/fs/nfs/pnfs.h +++ b/fs/nfs/pnfs.h @@ -236,7 +236,7 @@ void pnfs_get_layout_hdr(struct pnfs_layout_hdr *lo); void pnfs_put_lseg(struct pnfs_layout_segment *lseg); void pnfs_put_lseg_locked(struct pnfs_layout_segment *lseg); -void set_pnfs_layoutdriver(struct nfs_server *, const struct nfs_fh *, u32 *); +void set_pnfs_layoutdriver(struct nfs_server *, const struct nfs_fh *, struct nfs_fsinfo *); void unset_pnfs_layoutdriver(struct nfs_server *); void pnfs_generic_pg_init_read(struct nfs_pageio_descriptor *, struct nfs_page *); int pnfs_generic_pg_readpages(struct nfs_pageio_descriptor *desc); @@ -657,7 +657,8 @@ pnfs_wait_on_layoutreturn(struct inode *ino, struct rpc_task *task) } static inline void set_pnfs_layoutdriver(struct nfs_server *s, - const struct nfs_fh *mntfh, u32 *ids) + const struct nfs_fh *mntfh, + struct nfs_fsinfo *fsinfo) { } diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index f11b26ed001b..beb1e10f446e 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -144,6 +144,7 @@ struct nfs_fsinfo { __u64 maxfilesize; struct timespec time_delta; /* server time granularity */ __u32 lease_time; /* in seconds */ + __u32 nlayouttypes; /* number of layouttypes */ __u32 layouttype[NFS_MAX_LAYOUT_TYPES]; /* supported pnfs layout driver */ __u32 blksize; /* preferred pnfs io block size */ __u32 clone_blksize; /* granularity of a CLONE operation */ -- cgit v1.2.3 From eed7c4143dd993ef380340af64b8f2e7a79102c8 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Sat, 17 Sep 2016 18:17:34 -0400 Subject: nfs: add a new NFS4_OPEN_RESULT_MAY_NOTIFY_LOCK constant As defined in RFC 5661, section 18.16. Signed-off-by: Jeff Layton Signed-off-by: Anna Schumaker --- include/uapi/linux/nfs4.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/nfs4.h b/include/uapi/linux/nfs4.h index 2b871e0858d9..4ae62796bfde 100644 --- a/include/uapi/linux/nfs4.h +++ b/include/uapi/linux/nfs4.h @@ -39,8 +39,9 @@ #define NFS4_FH_VOL_MIGRATION 0x0004 #define NFS4_FH_VOL_RENAME 0x0008 -#define NFS4_OPEN_RESULT_CONFIRM 0x0002 -#define NFS4_OPEN_RESULT_LOCKTYPE_POSIX 0x0004 +#define NFS4_OPEN_RESULT_CONFIRM 0x0002 +#define NFS4_OPEN_RESULT_LOCKTYPE_POSIX 0x0004 +#define NFS4_OPEN_RESULT_MAY_NOTIFY_LOCK 0x0020 #define NFS4_SHARE_ACCESS_MASK 0x000F #define NFS4_SHARE_ACCESS_READ 0x0001 -- cgit v1.2.3 From a1d617d8f134679741b0b35e8e1436b015ac5538 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Sat, 17 Sep 2016 18:17:39 -0400 Subject: nfs: allow blocking locks to be awoken by lock callbacks Add a waitqueue head to the client structure. Have clients set a wait on that queue prior to requesting a lock from the server. If the lock is blocked, then we can use that to wait for wakeups. Note that we do need to do this "manually" since we need to set the wait on the waitqueue prior to requesting the lock, but requesting a lock can involve activities that can block. However, only do that for NFSv4.1 locks, either by compiling out all of the waitqueue handling when CONFIG_NFS_V4_1 is disabled, or skipping all of it at runtime if we're dealing with v4.0, or v4.1 servers that don't send lock callbacks. Note too that even when we expect to get a lock callback, RFC5661 section 20.11.4 is pretty clear that we still need to poll for them, so we do still sleep on a timeout. We do however always poll at the longest interval in that case. Signed-off-by: Jeff Layton [Anna: nfs4_retry_setlk() "status" should default to -ERESTARTSYS] Signed-off-by: Anna Schumaker --- fs/nfs/callback_proc.c | 4 ++ fs/nfs/nfs4client.c | 3 ++ fs/nfs/nfs4proc.c | 94 ++++++++++++++++++++++++++++++++++++++++++++++- include/linux/nfs_fs_sb.h | 3 ++ 4 files changed, 103 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c index 974881824414..e9aa235e9d10 100644 --- a/fs/nfs/callback_proc.c +++ b/fs/nfs/callback_proc.c @@ -638,6 +638,10 @@ __be32 nfs4_callback_notify_lock(struct cb_notify_lock_args *args, void *dummy, dprintk_rcu("NFS: CB_NOTIFY_LOCK request from %s\n", rpc_peeraddr2str(cps->clp->cl_rpcclient, RPC_DISPLAY_ADDR)); + /* Don't wake anybody if the string looked bogus */ + if (args->cbnl_valid) + __wake_up(&cps->clp->cl_lock_waitq, TASK_NORMAL, 0, args); + return htonl(NFS4_OK); } #endif /* CONFIG_NFS_V4_1 */ diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c index a2c9654d018f..074ac7131459 100644 --- a/fs/nfs/nfs4client.c +++ b/fs/nfs/nfs4client.c @@ -199,6 +199,9 @@ struct nfs_client *nfs4_alloc_client(const struct nfs_client_initdata *cl_init) clp->cl_minorversion = cl_init->minorversion; clp->cl_mvops = nfs_v4_minor_ops[cl_init->minorversion]; clp->cl_mig_gen = 1; +#if IS_ENABLED(CONFIG_NFS_V4_1) + init_waitqueue_head(&clp->cl_lock_waitq); +#endif return clp; error: diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index a74c4167b5b0..5ec333f3a19b 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -6166,7 +6166,8 @@ static int nfs4_proc_setlk(struct nfs4_state *state, int cmd, struct file_lock * #define NFS4_LOCK_MAXTIMEOUT (30 * HZ) static int -nfs4_retry_setlk(struct nfs4_state *state, int cmd, struct file_lock *request) +nfs4_retry_setlk_simple(struct nfs4_state *state, int cmd, + struct file_lock *request) { int status = -ERESTARTSYS; unsigned long timeout = NFS4_LOCK_MINTIMEOUT; @@ -6183,6 +6184,97 @@ nfs4_retry_setlk(struct nfs4_state *state, int cmd, struct file_lock *request) return status; } +#ifdef CONFIG_NFS_V4_1 +struct nfs4_lock_waiter { + struct task_struct *task; + struct inode *inode; + struct nfs_lowner *owner; + bool notified; +}; + +static int +nfs4_wake_lock_waiter(wait_queue_t *wait, unsigned int mode, int flags, void *key) +{ + int ret; + struct cb_notify_lock_args *cbnl = key; + struct nfs4_lock_waiter *waiter = wait->private; + struct nfs_lowner *lowner = &cbnl->cbnl_owner, + *wowner = waiter->owner; + + /* Only wake if the callback was for the same owner */ + if (lowner->clientid != wowner->clientid || + lowner->id != wowner->id || + lowner->s_dev != wowner->s_dev) + return 0; + + /* Make sure it's for the right inode */ + if (nfs_compare_fh(NFS_FH(waiter->inode), &cbnl->cbnl_fh)) + return 0; + + waiter->notified = true; + + /* override "private" so we can use default_wake_function */ + wait->private = waiter->task; + ret = autoremove_wake_function(wait, mode, flags, key); + wait->private = waiter; + return ret; +} + +static int +nfs4_retry_setlk(struct nfs4_state *state, int cmd, struct file_lock *request) +{ + int status = -ERESTARTSYS; + unsigned long flags; + struct nfs4_lock_state *lsp = request->fl_u.nfs4_fl.owner; + struct nfs_server *server = NFS_SERVER(state->inode); + struct nfs_client *clp = server->nfs_client; + wait_queue_head_t *q = &clp->cl_lock_waitq; + struct nfs_lowner owner = { .clientid = clp->cl_clientid, + .id = lsp->ls_seqid.owner_id, + .s_dev = server->s_dev }; + struct nfs4_lock_waiter waiter = { .task = current, + .inode = state->inode, + .owner = &owner, + .notified = false }; + wait_queue_t wait; + + /* Don't bother with waitqueue if we don't expect a callback */ + if (!test_bit(NFS_STATE_MAY_NOTIFY_LOCK, &state->flags)) + return nfs4_retry_setlk_simple(state, cmd, request); + + init_wait(&wait); + wait.private = &waiter; + wait.func = nfs4_wake_lock_waiter; + add_wait_queue(q, &wait); + + while(!signalled()) { + status = nfs4_proc_setlk(state, cmd, request); + if ((status != -EAGAIN) || IS_SETLK(cmd)) + break; + + status = -ERESTARTSYS; + spin_lock_irqsave(&q->lock, flags); + if (waiter.notified) { + spin_unlock_irqrestore(&q->lock, flags); + continue; + } + set_current_state(TASK_INTERRUPTIBLE); + spin_unlock_irqrestore(&q->lock, flags); + + freezable_schedule_timeout_interruptible(NFS4_LOCK_MAXTIMEOUT); + } + + finish_wait(q, &wait); + return status; +} +#else /* !CONFIG_NFS_V4_1 */ +static inline int +nfs4_retry_setlk(struct nfs4_state *state, int cmd, struct file_lock *request) +{ + return nfs4_retry_setlk_simple(state, cmd, request); +} +#endif + static int nfs4_proc_lock(struct file *filp, int cmd, struct file_lock *request) { diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h index 14a762d2734d..b34097c67848 100644 --- a/include/linux/nfs_fs_sb.h +++ b/include/linux/nfs_fs_sb.h @@ -103,6 +103,9 @@ struct nfs_client { #define NFS_SP4_MACH_CRED_WRITE 5 /* WRITE */ #define NFS_SP4_MACH_CRED_COMMIT 6 /* COMMIT */ #define NFS_SP4_MACH_CRED_PNFS_CLEANUP 7 /* LAYOUTRETURN */ +#if IS_ENABLED(CONFIG_NFS_V4_1) + wait_queue_head_t cl_lock_waitq; +#endif /* CONFIG_NFS_V4_1 */ #endif /* CONFIG_NFS_V4 */ /* Our own IP address, as a null-terminated string. -- cgit v1.2.3 From f7a62adad01cdb2b64c5a17cdd440736b99a5829 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 22 Sep 2016 13:39:02 -0400 Subject: NFSv4.1: Allow revoked stateids to skip the call to TEST_STATEID In some cases (e.g. when the SEQ4_STATUS_EXPIRED_ALL_STATE_REVOKED sequence flag is set) we may already know that the stateid was revoked and that the only valid operation we can call is FREE_STATEID. In those cases, allow the stateid to carry the information in the type field, so that we skip the redundant call to TEST_STATEID. Signed-off-by: Trond Myklebust Tested-by: Oleg Drokin Signed-off-by: Anna Schumaker --- fs/nfs/nfs4proc.c | 32 +++++++++++++++++++++++--------- include/linux/nfs4.h | 1 + 2 files changed, 24 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index dfa46e49e356..02eab9113273 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -2422,18 +2422,29 @@ static int nfs41_test_and_free_expired_stateid(struct nfs_server *server, { int status; - status = nfs41_test_stateid(server, stateid, cred); + switch (stateid->type) { + default: + break; + case NFS4_INVALID_STATEID_TYPE: + case NFS4_SPECIAL_STATEID_TYPE: + return -NFS4ERR_BAD_STATEID; + case NFS4_REVOKED_STATEID_TYPE: + goto out_free; + } + status = nfs41_test_stateid(server, stateid, cred); switch (status) { case -NFS4ERR_EXPIRED: case -NFS4ERR_ADMIN_REVOKED: case -NFS4ERR_DELEG_REVOKED: - /* Ack the revoked state to the server */ - nfs41_free_stateid(server, stateid, cred); - case -NFS4ERR_BAD_STATEID: + break; + default: return status; } - return NFS_OK; +out_free: + /* Ack the revoked state to the server */ + nfs41_free_stateid(server, stateid, cred); + return -NFS4ERR_EXPIRED; } static void nfs41_check_delegation_stateid(struct nfs4_state *state) @@ -2468,7 +2479,7 @@ static void nfs41_check_delegation_stateid(struct nfs4_state *state) rcu_read_unlock(); status = nfs41_test_and_free_expired_stateid(server, &stateid, cred); trace_nfs4_test_delegation_stateid(state, NULL, status); - if (status != NFS_OK) + if (status == -NFS4ERR_EXPIRED || status == -NFS4ERR_BAD_STATEID) nfs_finish_clear_delegation_stateid(state, &stateid); put_rpccred(cred); @@ -2497,7 +2508,7 @@ static int nfs41_check_open_stateid(struct nfs4_state *state) status = nfs41_test_and_free_expired_stateid(server, stateid, cred); trace_nfs4_test_open_stateid(state, NULL, status); - if (status != NFS_OK) { + if (status == -NFS4ERR_EXPIRED || status == -NFS4ERR_BAD_STATEID) { clear_bit(NFS_O_RDONLY_STATE, &state->flags); clear_bit(NFS_O_WRONLY_STATE, &state->flags); clear_bit(NFS_O_RDWR_STATE, &state->flags); @@ -6105,7 +6116,7 @@ out: */ static int nfs41_check_expired_locks(struct nfs4_state *state) { - int status, ret = -NFS4ERR_BAD_STATEID; + int status, ret = NFS_OK; struct nfs4_lock_state *lsp; struct nfs_server *server = NFS_SERVER(state->inode); @@ -6117,9 +6128,12 @@ static int nfs41_check_expired_locks(struct nfs4_state *state) &lsp->ls_stateid, cred); trace_nfs4_test_lock_stateid(state, lsp, status); - if (status != NFS_OK) { + if (status == -NFS4ERR_EXPIRED || + status == -NFS4ERR_BAD_STATEID) clear_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags); + else if (status != NFS_OK) { ret = status; + break; } } }; diff --git a/include/linux/nfs4.h b/include/linux/nfs4.h index c6564ada9beb..9094faf0699d 100644 --- a/include/linux/nfs4.h +++ b/include/linux/nfs4.h @@ -67,6 +67,7 @@ struct nfs4_stateid_struct { NFS4_DELEGATION_STATEID_TYPE, NFS4_LAYOUT_STATEID_TYPE, NFS4_PNFS_DS_STATEID_TYPE, + NFS4_REVOKED_STATEID_TYPE, } type; }; -- cgit v1.2.3 From e856a231d5d5742fe7c63e3a2b266bef668af5b4 Mon Sep 17 00:00:00 2001 From: Frank Sorenson Date: Thu, 29 Sep 2016 10:44:37 -0500 Subject: sunrpc: add hash_cred() function to rpc_authops struct Currently, a single hash algorithm is used to hash the auth_cred for the credcache for all rpc_auth types. Add a hash_cred() function to the rpc_authops struct to allow a hash function specific to each auth flavor. Signed-off-by: Frank Sorenson Signed-off-by: Anna Schumaker --- include/linux/sunrpc/auth.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/sunrpc/auth.h b/include/linux/sunrpc/auth.h index 4ccf184e971f..b1bc62ba20a2 100644 --- a/include/linux/sunrpc/auth.h +++ b/include/linux/sunrpc/auth.h @@ -131,6 +131,7 @@ struct rpc_authops { struct rpc_auth * (*create)(struct rpc_auth_create_args *, struct rpc_clnt *); void (*destroy)(struct rpc_auth *); + int (*hash_cred)(struct auth_cred *, unsigned int); struct rpc_cred * (*lookup_cred)(struct rpc_auth *, struct auth_cred *, int); struct rpc_cred * (*crcreate)(struct rpc_auth*, struct auth_cred *, int, gfp_t); int (*list_pseudoflavors)(rpc_authflavor_t *, int); -- cgit v1.2.3