diff options
Diffstat (limited to 'net')
98 files changed, 3391 insertions, 2681 deletions
diff --git a/net/9p/Makefile b/net/9p/Makefile index c0486cfc85d9..aa0a5641e5d0 100644 --- a/net/9p/Makefile +++ b/net/9p/Makefile @@ -8,7 +8,6 @@ obj-$(CONFIG_NET_9P_RDMA) += 9pnet_rdma.o mod.o \ client.o \ error.o \ - util.o \ protocol.o \ trans_fd.o \ trans_common.o \ diff --git a/net/9p/client.c b/net/9p/client.c index deae53a7dffc..5f23e18eecc0 100644 --- a/net/9p/client.c +++ b/net/9p/client.c @@ -231,144 +231,170 @@ free_and_return: return ret; } -static struct p9_fcall *p9_fcall_alloc(int alloc_msize) +static int p9_fcall_init(struct p9_client *c, struct p9_fcall *fc, + int alloc_msize) { - struct p9_fcall *fc; - fc = kmalloc(sizeof(struct p9_fcall) + alloc_msize, GFP_NOFS); - if (!fc) - return NULL; + if (likely(c->fcall_cache) && alloc_msize == c->msize) { + fc->sdata = kmem_cache_alloc(c->fcall_cache, GFP_NOFS); + fc->cache = c->fcall_cache; + } else { + fc->sdata = kmalloc(alloc_msize, GFP_NOFS); + fc->cache = NULL; + } + if (!fc->sdata) + return -ENOMEM; fc->capacity = alloc_msize; - fc->sdata = (char *) fc + sizeof(struct p9_fcall); - return fc; + return 0; +} + +void p9_fcall_fini(struct p9_fcall *fc) +{ + /* sdata can be NULL for interrupted requests in trans_rdma, + * and kmem_cache_free does not do NULL-check for us + */ + if (unlikely(!fc->sdata)) + return; + + if (fc->cache) + kmem_cache_free(fc->cache, fc->sdata); + else + kfree(fc->sdata); } +EXPORT_SYMBOL(p9_fcall_fini); + +static struct kmem_cache *p9_req_cache; /** - * p9_tag_alloc - lookup/allocate a request by tag - * @c: client session to lookup tag within - * @tag: numeric id for transaction - * - * this is a simple array lookup, but will grow the - * request_slots as necessary to accommodate transaction - * ids which did not previously have a slot. - * - * this code relies on the client spinlock to manage locks, its - * possible we should switch to something else, but I'd rather - * stick with something low-overhead for the common case. + * p9_req_alloc - Allocate a new request. + * @c: Client session. + * @type: Transaction type. + * @max_size: Maximum packet size for this request. * + * Context: Process context. + * Return: Pointer to new request. */ - static struct p9_req_t * -p9_tag_alloc(struct p9_client *c, u16 tag, unsigned int max_size) +p9_tag_alloc(struct p9_client *c, int8_t type, unsigned int max_size) { - unsigned long flags; - int row, col; - struct p9_req_t *req; + struct p9_req_t *req = kmem_cache_alloc(p9_req_cache, GFP_NOFS); int alloc_msize = min(c->msize, max_size); + int tag; - /* This looks up the original request by tag so we know which - * buffer to read the data into */ - tag++; - - if (tag >= c->max_tag) { - spin_lock_irqsave(&c->lock, flags); - /* check again since original check was outside of lock */ - while (tag >= c->max_tag) { - row = (tag / P9_ROW_MAXTAG); - c->reqs[row] = kcalloc(P9_ROW_MAXTAG, - sizeof(struct p9_req_t), GFP_ATOMIC); - - if (!c->reqs[row]) { - pr_err("Couldn't grow tag array\n"); - spin_unlock_irqrestore(&c->lock, flags); - return ERR_PTR(-ENOMEM); - } - for (col = 0; col < P9_ROW_MAXTAG; col++) { - req = &c->reqs[row][col]; - req->status = REQ_STATUS_IDLE; - init_waitqueue_head(&req->wq); - } - c->max_tag += P9_ROW_MAXTAG; - } - spin_unlock_irqrestore(&c->lock, flags); - } - row = tag / P9_ROW_MAXTAG; - col = tag % P9_ROW_MAXTAG; - - req = &c->reqs[row][col]; - if (!req->tc) - req->tc = p9_fcall_alloc(alloc_msize); - if (!req->rc) - req->rc = p9_fcall_alloc(alloc_msize); - if (!req->tc || !req->rc) - goto grow_failed; + if (!req) + return ERR_PTR(-ENOMEM); - p9pdu_reset(req->tc); - p9pdu_reset(req->rc); + if (p9_fcall_init(c, &req->tc, alloc_msize)) + goto free_req; + if (p9_fcall_init(c, &req->rc, alloc_msize)) + goto free; - req->tc->tag = tag-1; + p9pdu_reset(&req->tc); + p9pdu_reset(&req->rc); req->status = REQ_STATUS_ALLOC; + init_waitqueue_head(&req->wq); + INIT_LIST_HEAD(&req->req_list); + + idr_preload(GFP_NOFS); + spin_lock_irq(&c->lock); + if (type == P9_TVERSION) + tag = idr_alloc(&c->reqs, req, P9_NOTAG, P9_NOTAG + 1, + GFP_NOWAIT); + else + tag = idr_alloc(&c->reqs, req, 0, P9_NOTAG, GFP_NOWAIT); + req->tc.tag = tag; + spin_unlock_irq(&c->lock); + idr_preload_end(); + if (tag < 0) + goto free; + + /* Init ref to two because in the general case there is one ref + * that is put asynchronously by a writer thread, one ref + * temporarily given by p9_tag_lookup and put by p9_client_cb + * in the recv thread, and one ref put by p9_tag_remove in the + * main thread. The only exception is virtio that does not use + * p9_tag_lookup but does not have a writer thread either + * (the write happens synchronously in the request/zc_request + * callback), so p9_client_cb eats the second ref there + * as the pointer is duplicated directly by virtqueue_add_sgs() + */ + refcount_set(&req->refcount.refcount, 2); return req; -grow_failed: - pr_err("Couldn't grow tag array\n"); - kfree(req->tc); - kfree(req->rc); - req->tc = req->rc = NULL; +free: + p9_fcall_fini(&req->tc); + p9_fcall_fini(&req->rc); +free_req: + kmem_cache_free(p9_req_cache, req); return ERR_PTR(-ENOMEM); } /** - * p9_tag_lookup - lookup a request by tag - * @c: client session to lookup tag within - * @tag: numeric id for transaction + * p9_tag_lookup - Look up a request by tag. + * @c: Client session. + * @tag: Transaction ID. * + * Context: Any context. + * Return: A request, or %NULL if there is no request with that tag. */ - struct p9_req_t *p9_tag_lookup(struct p9_client *c, u16 tag) { - int row, col; - - /* This looks up the original request by tag so we know which - * buffer to read the data into */ - tag++; - - if (tag >= c->max_tag) - return NULL; + struct p9_req_t *req; - row = tag / P9_ROW_MAXTAG; - col = tag % P9_ROW_MAXTAG; + rcu_read_lock(); +again: + req = idr_find(&c->reqs, tag); + if (req) { + /* We have to be careful with the req found under rcu_read_lock + * Thanks to SLAB_TYPESAFE_BY_RCU we can safely try to get the + * ref again without corrupting other data, then check again + * that the tag matches once we have the ref + */ + if (!p9_req_try_get(req)) + goto again; + if (req->tc.tag != tag) { + p9_req_put(req); + goto again; + } + } + rcu_read_unlock(); - return &c->reqs[row][col]; + return req; } EXPORT_SYMBOL(p9_tag_lookup); /** - * p9_tag_init - setup tags structure and contents - * @c: v9fs client struct - * - * This initializes the tags structure for each client instance. + * p9_tag_remove - Remove a tag. + * @c: Client session. + * @r: Request of reference. * + * Context: Any context. */ +static int p9_tag_remove(struct p9_client *c, struct p9_req_t *r) +{ + unsigned long flags; + u16 tag = r->tc.tag; + + p9_debug(P9_DEBUG_MUX, "clnt %p req %p tag: %d\n", c, r, tag); + spin_lock_irqsave(&c->lock, flags); + idr_remove(&c->reqs, tag); + spin_unlock_irqrestore(&c->lock, flags); + return p9_req_put(r); +} -static int p9_tag_init(struct p9_client *c) +static void p9_req_free(struct kref *ref) { - int err = 0; + struct p9_req_t *r = container_of(ref, struct p9_req_t, refcount); + p9_fcall_fini(&r->tc); + p9_fcall_fini(&r->rc); + kmem_cache_free(p9_req_cache, r); +} - c->tagpool = p9_idpool_create(); - if (IS_ERR(c->tagpool)) { - err = PTR_ERR(c->tagpool); - goto error; - } - err = p9_idpool_get(c->tagpool); /* reserve tag 0 */ - if (err < 0) { - p9_idpool_destroy(c->tagpool); - goto error; - } - c->max_tag = 0; -error: - return err; +int p9_req_put(struct p9_req_t *r) +{ + return kref_put(&r->refcount, p9_req_free); } +EXPORT_SYMBOL(p9_req_put); /** * p9_tag_cleanup - cleans up tags structure and reclaims resources @@ -379,52 +405,17 @@ error: */ static void p9_tag_cleanup(struct p9_client *c) { - int row, col; - - /* check to insure all requests are idle */ - for (row = 0; row < (c->max_tag/P9_ROW_MAXTAG); row++) { - for (col = 0; col < P9_ROW_MAXTAG; col++) { - if (c->reqs[row][col].status != REQ_STATUS_IDLE) { - p9_debug(P9_DEBUG_MUX, - "Attempting to cleanup non-free tag %d,%d\n", - row, col); - /* TODO: delay execution of cleanup */ - return; - } - } - } - - if (c->tagpool) { - p9_idpool_put(0, c->tagpool); /* free reserved tag 0 */ - p9_idpool_destroy(c->tagpool); - } + struct p9_req_t *req; + int id; - /* free requests associated with tags */ - for (row = 0; row < (c->max_tag/P9_ROW_MAXTAG); row++) { - for (col = 0; col < P9_ROW_MAXTAG; col++) { - kfree(c->reqs[row][col].tc); - kfree(c->reqs[row][col].rc); - } - kfree(c->reqs[row]); + rcu_read_lock(); + idr_for_each_entry(&c->reqs, req, id) { + pr_info("Tag %d still in use\n", id); + if (p9_tag_remove(c, req) == 0) + pr_warn("Packet with tag %d has still references", + req->tc.tag); } - c->max_tag = 0; -} - -/** - * p9_free_req - free a request and clean-up as necessary - * c: client state - * r: request to release - * - */ - -static void p9_free_req(struct p9_client *c, struct p9_req_t *r) -{ - int tag = r->tc->tag; - p9_debug(P9_DEBUG_MUX, "clnt %p req %p tag: %d\n", c, r, tag); - - r->status = REQ_STATUS_IDLE; - if (tag != P9_NOTAG && p9_idpool_check(tag, c->tagpool)) - p9_idpool_put(tag, c->tagpool); + rcu_read_unlock(); } /** @@ -435,7 +426,7 @@ static void p9_free_req(struct p9_client *c, struct p9_req_t *r) */ void p9_client_cb(struct p9_client *c, struct p9_req_t *req, int status) { - p9_debug(P9_DEBUG_MUX, " tag %d\n", req->tc->tag); + p9_debug(P9_DEBUG_MUX, " tag %d\n", req->tc.tag); /* * This barrier is needed to make sure any change made to req before @@ -445,7 +436,8 @@ void p9_client_cb(struct p9_client *c, struct p9_req_t *req, int status) req->status = status; wake_up(&req->wq); - p9_debug(P9_DEBUG_MUX, "wakeup: %d\n", req->tc->tag); + p9_debug(P9_DEBUG_MUX, "wakeup: %d\n", req->tc.tag); + p9_req_put(req); } EXPORT_SYMBOL(p9_client_cb); @@ -516,18 +508,18 @@ static int p9_check_errors(struct p9_client *c, struct p9_req_t *req) int err; int ecode; - err = p9_parse_header(req->rc, NULL, &type, NULL, 0); - if (req->rc->size >= c->msize) { + err = p9_parse_header(&req->rc, NULL, &type, NULL, 0); + if (req->rc.size >= c->msize) { p9_debug(P9_DEBUG_ERROR, "requested packet size too big: %d\n", - req->rc->size); + req->rc.size); return -EIO; } /* * dump the response from server * This should be after check errors which poplulate pdu_fcall. */ - trace_9p_protocol_dump(c, req->rc); + trace_9p_protocol_dump(c, &req->rc); if (err) { p9_debug(P9_DEBUG_ERROR, "couldn't parse header %d\n", err); return err; @@ -537,7 +529,7 @@ static int p9_check_errors(struct p9_client *c, struct p9_req_t *req) if (!p9_is_proto_dotl(c)) { char *ename; - err = p9pdu_readf(req->rc, c->proto_version, "s?d", + err = p9pdu_readf(&req->rc, c->proto_version, "s?d", &ename, &ecode); if (err) goto out_err; @@ -553,7 +545,7 @@ static int p9_check_errors(struct p9_client *c, struct p9_req_t *req) } kfree(ename); } else { - err = p9pdu_readf(req->rc, c->proto_version, "d", &ecode); + err = p9pdu_readf(&req->rc, c->proto_version, "d", &ecode); err = -ecode; p9_debug(P9_DEBUG_9P, "<<< RLERROR (%d)\n", -ecode); @@ -587,12 +579,12 @@ static int p9_check_zc_errors(struct p9_client *c, struct p9_req_t *req, int8_t type; char *ename = NULL; - err = p9_parse_header(req->rc, NULL, &type, NULL, 0); + err = p9_parse_header(&req->rc, NULL, &type, NULL, 0); /* * dump the response from server * This should be after parse_header which poplulate pdu_fcall. */ - trace_9p_protocol_dump(c, req->rc); + trace_9p_protocol_dump(c, &req->rc); if (err) { p9_debug(P9_DEBUG_ERROR, "couldn't parse header %d\n", err); return err; @@ -607,13 +599,13 @@ static int p9_check_zc_errors(struct p9_client *c, struct p9_req_t *req, /* 7 = header size for RERROR; */ int inline_len = in_hdrlen - 7; - len = req->rc->size - req->rc->offset; + len = req->rc.size - req->rc.offset; if (len > (P9_ZC_HDR_SZ - 7)) { err = -EFAULT; goto out_err; } - ename = &req->rc->sdata[req->rc->offset]; + ename = &req->rc.sdata[req->rc.offset]; if (len > inline_len) { /* We have error in external buffer */ if (!copy_from_iter_full(ename + inline_len, @@ -623,7 +615,7 @@ static int p9_check_zc_errors(struct p9_client *c, struct p9_req_t *req, } } ename = NULL; - err = p9pdu_readf(req->rc, c->proto_version, "s?d", + err = p9pdu_readf(&req->rc, c->proto_version, "s?d", &ename, &ecode); if (err) goto out_err; @@ -639,7 +631,7 @@ static int p9_check_zc_errors(struct p9_client *c, struct p9_req_t *req, } kfree(ename); } else { - err = p9pdu_readf(req->rc, c->proto_version, "d", &ecode); + err = p9pdu_readf(&req->rc, c->proto_version, "d", &ecode); err = -ecode; p9_debug(P9_DEBUG_9P, "<<< RLERROR (%d)\n", -ecode); @@ -672,7 +664,7 @@ static int p9_client_flush(struct p9_client *c, struct p9_req_t *oldreq) int16_t oldtag; int err; - err = p9_parse_header(oldreq->tc, NULL, NULL, &oldtag, 1); + err = p9_parse_header(&oldreq->tc, NULL, NULL, &oldtag, 1); if (err) return err; @@ -686,11 +678,12 @@ static int p9_client_flush(struct p9_client *c, struct p9_req_t *oldreq) * if we haven't received a response for oldreq, * remove it from the list */ - if (oldreq->status == REQ_STATUS_SENT) + if (oldreq->status == REQ_STATUS_SENT) { if (c->trans_mod->cancelled) c->trans_mod->cancelled(c, oldreq); + } - p9_free_req(c, req); + p9_tag_remove(c, req); return 0; } @@ -698,7 +691,7 @@ static struct p9_req_t *p9_client_prepare_req(struct p9_client *c, int8_t type, int req_size, const char *fmt, va_list ap) { - int tag, err; + int err; struct p9_req_t *req; p9_debug(P9_DEBUG_MUX, "client %p op %d\n", c, type); @@ -711,27 +704,22 @@ static struct p9_req_t *p9_client_prepare_req(struct p9_client *c, if ((c->status == BeginDisconnect) && (type != P9_TCLUNK)) return ERR_PTR(-EIO); - tag = P9_NOTAG; - if (type != P9_TVERSION) { - tag = p9_idpool_get(c->tagpool); - if (tag < 0) - return ERR_PTR(-ENOMEM); - } - - req = p9_tag_alloc(c, tag, req_size); + req = p9_tag_alloc(c, type, req_size); if (IS_ERR(req)) return req; /* marshall the data */ - p9pdu_prepare(req->tc, tag, type); - err = p9pdu_vwritef(req->tc, c->proto_version, fmt, ap); + p9pdu_prepare(&req->tc, req->tc.tag, type); + err = p9pdu_vwritef(&req->tc, c->proto_version, fmt, ap); if (err) goto reterr; - p9pdu_finalize(c, req->tc); - trace_9p_client_req(c, type, tag); + p9pdu_finalize(c, &req->tc); + trace_9p_client_req(c, type, req->tc.tag); return req; reterr: - p9_free_req(c, req); + p9_tag_remove(c, req); + /* We have to put also the 2nd reference as it won't be used */ + p9_req_put(req); return ERR_PTR(err); } @@ -741,7 +729,7 @@ reterr: * @type: type of request * @fmt: protocol format string (see protocol.c) * - * Returns request structure (which client must free using p9_free_req) + * Returns request structure (which client must free using p9_tag_remove) */ static struct p9_req_t * @@ -766,6 +754,8 @@ p9_client_rpc(struct p9_client *c, int8_t type, const char *fmt, ...) err = c->trans_mod->request(c, req); if (err < 0) { + /* write won't happen */ + p9_req_put(req); if (err != -ERESTARTSYS && err != -EFAULT) c->status = Disconnected; goto recalc_sigpending; @@ -813,11 +803,11 @@ recalc_sigpending: goto reterr; err = p9_check_errors(c, req); - trace_9p_client_res(c, type, req->rc->tag, err); + trace_9p_client_res(c, type, req->rc.tag, err); if (!err) return req; reterr: - p9_free_req(c, req); + p9_tag_remove(c, req); return ERR_PTR(safe_errno(err)); } @@ -832,7 +822,7 @@ reterr: * @hdrlen: reader header size, This is the size of response protocol data * @fmt: protocol format string (see protocol.c) * - * Returns request structure (which client must free using p9_free_req) + * Returns request structure (which client must free using p9_tag_remove) */ static struct p9_req_t *p9_client_zc_rpc(struct p9_client *c, int8_t type, struct iov_iter *uidata, @@ -895,11 +885,11 @@ recalc_sigpending: goto reterr; err = p9_check_zc_errors(c, req, uidata, in_hdrlen); - trace_9p_client_res(c, type, req->rc->tag, err); + trace_9p_client_res(c, type, req->rc.tag, err); if (!err) return req; reterr: - p9_free_req(c, req); + p9_tag_remove(c, req); return ERR_PTR(safe_errno(err)); } @@ -978,10 +968,10 @@ static int p9_client_version(struct p9_client *c) if (IS_ERR(req)) return PTR_ERR(req); - err = p9pdu_readf(req->rc, c->proto_version, "ds", &msize, &version); + err = p9pdu_readf(&req->rc, c->proto_version, "ds", &msize, &version); if (err) { p9_debug(P9_DEBUG_9P, "version error %d\n", err); - trace_9p_protocol_dump(c, req->rc); + trace_9p_protocol_dump(c, &req->rc); goto error; } @@ -1002,7 +992,7 @@ static int p9_client_version(struct p9_client *c) error: kfree(version); - p9_free_req(c, req); + p9_tag_remove(c, req); return err; } @@ -1020,20 +1010,18 @@ struct p9_client *p9_client_create(const char *dev_name, char *options) clnt->trans_mod = NULL; clnt->trans = NULL; + clnt->fcall_cache = NULL; client_id = utsname()->nodename; memcpy(clnt->name, client_id, strlen(client_id) + 1); spin_lock_init(&clnt->lock); idr_init(&clnt->fids); - - err = p9_tag_init(clnt); - if (err < 0) - goto free_client; + idr_init(&clnt->reqs); err = parse_opts(options, clnt); if (err < 0) - goto destroy_tagpool; + goto free_client; if (!clnt->trans_mod) clnt->trans_mod = v9fs_get_default_trans(); @@ -1042,7 +1030,7 @@ struct p9_client *p9_client_create(const char *dev_name, char *options) err = -EPROTONOSUPPORT; p9_debug(P9_DEBUG_ERROR, "No transport defined or default transport\n"); - goto destroy_tagpool; + goto free_client; } p9_debug(P9_DEBUG_MUX, "clnt %p trans %p msize %d protocol %d\n", @@ -1059,14 +1047,21 @@ struct p9_client *p9_client_create(const char *dev_name, char *options) if (err) goto close_trans; + /* P9_HDRSZ + 4 is the smallest packet header we can have that is + * followed by data accessed from userspace by read + */ + clnt->fcall_cache = + kmem_cache_create_usercopy("9p-fcall-cache", clnt->msize, + 0, 0, P9_HDRSZ + 4, + clnt->msize - (P9_HDRSZ + 4), + NULL); + return clnt; close_trans: clnt->trans_mod->close(clnt); put_trans: v9fs_put_trans(clnt->trans_mod); -destroy_tagpool: - p9_idpool_destroy(clnt->tagpool); free_client: kfree(clnt); return ERR_PTR(err); @@ -1092,6 +1087,7 @@ void p9_client_destroy(struct p9_client *clnt) p9_tag_cleanup(clnt); + kmem_cache_destroy(clnt->fcall_cache); kfree(clnt); } EXPORT_SYMBOL(p9_client_destroy); @@ -1135,10 +1131,10 @@ struct p9_fid *p9_client_attach(struct p9_client *clnt, struct p9_fid *afid, goto error; } - err = p9pdu_readf(req->rc, clnt->proto_version, "Q", &qid); + err = p9pdu_readf(&req->rc, clnt->proto_version, "Q", &qid); if (err) { - trace_9p_protocol_dump(clnt, req->rc); - p9_free_req(clnt, req); + trace_9p_protocol_dump(clnt, &req->rc); + p9_tag_remove(clnt, req); goto error; } @@ -1147,7 +1143,7 @@ struct p9_fid *p9_client_attach(struct p9_client *clnt, struct p9_fid *afid, memmove(&fid->qid, &qid, sizeof(struct p9_qid)); - p9_free_req(clnt, req); + p9_tag_remove(clnt, req); return fid; error: @@ -1192,13 +1188,13 @@ struct p9_fid *p9_client_walk(struct p9_fid *oldfid, uint16_t nwname, goto error; } - err = p9pdu_readf(req->rc, clnt->proto_version, "R", &nwqids, &wqids); + err = p9pdu_readf(&req->rc, clnt->proto_version, "R", &nwqids, &wqids); if (err) { - trace_9p_protocol_dump(clnt, req->rc); - p9_free_req(clnt, req); + trace_9p_protocol_dump(clnt, &req->rc); + p9_tag_remove(clnt, req); goto clunk_fid; } - p9_free_req(clnt, req); + p9_tag_remove(clnt, req); p9_debug(P9_DEBUG_9P, "<<< RWALK nwqid %d:\n", nwqids); @@ -1259,9 +1255,9 @@ int p9_client_open(struct p9_fid *fid, int mode) goto error; } - err = p9pdu_readf(req->rc, clnt->proto_version, "Qd", &qid, &iounit); + err = p9pdu_readf(&req->rc, clnt->proto_version, "Qd", &qid, &iounit); if (err) { - trace_9p_protocol_dump(clnt, req->rc); + trace_9p_protocol_dump(clnt, &req->rc); goto free_and_error; } @@ -1273,7 +1269,7 @@ int p9_client_open(struct p9_fid *fid, int mode) fid->iounit = iounit; free_and_error: - p9_free_req(clnt, req); + p9_tag_remove(clnt, req); error: return err; } @@ -1303,9 +1299,9 @@ int p9_client_create_dotl(struct p9_fid *ofid, const char *name, u32 flags, u32 goto error; } - err = p9pdu_readf(req->rc, clnt->proto_version, "Qd", qid, &iounit); + err = p9pdu_readf(&req->rc, clnt->proto_version, "Qd", qid, &iounit); if (err) { - trace_9p_protocol_dump(clnt, req->rc); + trace_9p_protocol_dump(clnt, &req->rc); goto free_and_error; } @@ -1318,7 +1314,7 @@ int p9_client_create_dotl(struct p9_fid *ofid, const char *name, u32 flags, u32 ofid->iounit = iounit; free_and_error: - p9_free_req(clnt, req); + p9_tag_remove(clnt, req); error: return err; } @@ -1348,9 +1344,9 @@ int p9_client_fcreate(struct p9_fid *fid, const char *name, u32 perm, int mode, goto error; } - err = p9pdu_readf(req->rc, clnt->proto_version, "Qd", &qid, &iounit); + err = p9pdu_readf(&req->rc, clnt->proto_version, "Qd", &qid, &iounit); if (err) { - trace_9p_protocol_dump(clnt, req->rc); + trace_9p_protocol_dump(clnt, &req->rc); goto free_and_error; } @@ -1363,7 +1359,7 @@ int p9_client_fcreate(struct p9_fid *fid, const char *name, u32 perm, int mode, fid->iounit = iounit; free_and_error: - p9_free_req(clnt, req); + p9_tag_remove(clnt, req); error: return err; } @@ -1387,9 +1383,9 @@ int p9_client_symlink(struct p9_fid *dfid, const char *name, goto error; } - err = p9pdu_readf(req->rc, clnt->proto_version, "Q", qid); + err = p9pdu_readf(&req->rc, clnt->proto_version, "Q", qid); if (err) { - trace_9p_protocol_dump(clnt, req->rc); + trace_9p_protocol_dump(clnt, &req->rc); goto free_and_error; } @@ -1397,7 +1393,7 @@ int p9_client_symlink(struct p9_fid *dfid, const char *name, qid->type, (unsigned long long)qid->path, qid->version); free_and_error: - p9_free_req(clnt, req); + p9_tag_remove(clnt, req); error: return err; } @@ -1417,7 +1413,7 @@ int p9_client_link(struct p9_fid *dfid, struct p9_fid *oldfid, const char *newna return PTR_ERR(req); p9_debug(P9_DEBUG_9P, "<<< RLINK\n"); - p9_free_req(clnt, req); + p9_tag_remove(clnt, req); return 0; } EXPORT_SYMBOL(p9_client_link); @@ -1441,7 +1437,7 @@ int p9_client_fsync(struct p9_fid *fid, int datasync) p9_debug(P9_DEBUG_9P, "<<< RFSYNC fid %d\n", fid->fid); - p9_free_req(clnt, req); + p9_tag_remove(clnt, req); error: return err; @@ -1476,7 +1472,7 @@ again: p9_debug(P9_DEBUG_9P, "<<< RCLUNK fid %d\n", fid->fid); - p9_free_req(clnt, req); + p9_tag_remove(clnt, req); error: /* * Fid is not valid even after a failed clunk @@ -1510,7 +1506,7 @@ int p9_client_remove(struct p9_fid *fid) p9_debug(P9_DEBUG_9P, "<<< RREMOVE fid %d\n", fid->fid); - p9_free_req(clnt, req); + p9_tag_remove(clnt, req); error: if (err == -ERESTARTSYS) p9_client_clunk(fid); @@ -1537,7 +1533,7 @@ int p9_client_unlinkat(struct p9_fid *dfid, const char *name, int flags) } p9_debug(P9_DEBUG_9P, "<<< RUNLINKAT fid %d %s\n", dfid->fid, name); - p9_free_req(clnt, req); + p9_tag_remove(clnt, req); error: return err; } @@ -1585,11 +1581,11 @@ p9_client_read(struct p9_fid *fid, u64 offset, struct iov_iter *to, int *err) break; } - *err = p9pdu_readf(req->rc, clnt->proto_version, + *err = p9pdu_readf(&req->rc, clnt->proto_version, "D", &count, &dataptr); if (*err) { - trace_9p_protocol_dump(clnt, req->rc); - p9_free_req(clnt, req); + trace_9p_protocol_dump(clnt, &req->rc); + p9_tag_remove(clnt, req); break; } if (rsize < count) { @@ -1599,7 +1595,7 @@ p9_client_read(struct p9_fid *fid, u64 offset, struct iov_iter *to, int *err) p9_debug(P9_DEBUG_9P, "<<< RREAD count %d\n", count); if (!count) { - p9_free_req(clnt, req); + p9_tag_remove(clnt, req); break; } @@ -1609,7 +1605,7 @@ p9_client_read(struct p9_fid *fid, u64 offset, struct iov_iter *to, int *err) offset += n; if (n != count) { *err = -EFAULT; - p9_free_req(clnt, req); + p9_tag_remove(clnt, req); break; } } else { @@ -1617,7 +1613,7 @@ p9_client_read(struct p9_fid *fid, u64 offset, struct iov_iter *to, int *err) total += count; offset += count; } - p9_free_req(clnt, req); + p9_tag_remove(clnt, req); } return total; } @@ -1658,10 +1654,10 @@ p9_client_write(struct p9_fid *fid, u64 offset, struct iov_iter *from, int *err) break; } - *err = p9pdu_readf(req->rc, clnt->proto_version, "d", &count); + *err = p9pdu_readf(&req->rc, clnt->proto_version, "d", &count); if (*err) { - trace_9p_protocol_dump(clnt, req->rc); - p9_free_req(clnt, req); + trace_9p_protocol_dump(clnt, &req->rc); + p9_tag_remove(clnt, req); break; } if (rsize < count) { @@ -1671,7 +1667,7 @@ p9_client_write(struct p9_fid *fid, u64 offset, struct iov_iter *from, int *err) p9_debug(P9_DEBUG_9P, "<<< RWRITE count %d\n", count); - p9_free_req(clnt, req); + p9_tag_remove(clnt, req); iov_iter_advance(from, count); total += count; offset += count; @@ -1702,10 +1698,10 @@ struct p9_wstat *p9_client_stat(struct p9_fid *fid) goto error; } - err = p9pdu_readf(req->rc, clnt->proto_version, "wS", &ignored, ret); + err = p9pdu_readf(&req->rc, clnt->proto_version, "wS", &ignored, ret); if (err) { - trace_9p_protocol_dump(clnt, req->rc); - p9_free_req(clnt, req); + trace_9p_protocol_dump(clnt, &req->rc); + p9_tag_remove(clnt, req); goto error; } @@ -1722,7 +1718,7 @@ struct p9_wstat *p9_client_stat(struct p9_fid *fid) from_kgid(&init_user_ns, ret->n_gid), from_kuid(&init_user_ns, ret->n_muid)); - p9_free_req(clnt, req); + p9_tag_remove(clnt, req); return ret; error: @@ -1755,10 +1751,10 @@ struct p9_stat_dotl *p9_client_getattr_dotl(struct p9_fid *fid, goto error; } - err = p9pdu_readf(req->rc, clnt->proto_version, "A", ret); + err = p9pdu_readf(&req->rc, clnt->proto_version, "A", ret); if (err) { - trace_9p_protocol_dump(clnt, req->rc); - p9_free_req(clnt, req); + trace_9p_protocol_dump(clnt, &req->rc); + p9_tag_remove(clnt, req); goto error; } @@ -1783,7 +1779,7 @@ struct p9_stat_dotl *p9_client_getattr_dotl(struct p9_fid *fid, ret->st_ctime_nsec, ret->st_btime_sec, ret->st_btime_nsec, ret->st_gen, ret->st_data_version); - p9_free_req(clnt, req); + p9_tag_remove(clnt, req); return ret; error: @@ -1852,7 +1848,7 @@ int p9_client_wstat(struct p9_fid *fid, struct p9_wstat *wst) p9_debug(P9_DEBUG_9P, "<<< RWSTAT fid %d\n", fid->fid); - p9_free_req(clnt, req); + p9_tag_remove(clnt, req); error: return err; } @@ -1884,7 +1880,7 @@ int p9_client_setattr(struct p9_fid *fid, struct p9_iattr_dotl *p9attr) goto error; } p9_debug(P9_DEBUG_9P, "<<< RSETATTR fid %d\n", fid->fid); - p9_free_req(clnt, req); + p9_tag_remove(clnt, req); error: return err; } @@ -1907,12 +1903,12 @@ int p9_client_statfs(struct p9_fid *fid, struct p9_rstatfs *sb) goto error; } - err = p9pdu_readf(req->rc, clnt->proto_version, "ddqqqqqqd", &sb->type, - &sb->bsize, &sb->blocks, &sb->bfree, &sb->bavail, - &sb->files, &sb->ffree, &sb->fsid, &sb->namelen); + err = p9pdu_readf(&req->rc, clnt->proto_version, "ddqqqqqqd", &sb->type, + &sb->bsize, &sb->blocks, &sb->bfree, &sb->bavail, + &sb->files, &sb->ffree, &sb->fsid, &sb->namelen); if (err) { - trace_9p_protocol_dump(clnt, req->rc); - p9_free_req(clnt, req); + trace_9p_protocol_dump(clnt, &req->rc); + p9_tag_remove(clnt, req); goto error; } @@ -1923,7 +1919,7 @@ int p9_client_statfs(struct p9_fid *fid, struct p9_rstatfs *sb) sb->blocks, sb->bfree, sb->bavail, sb->files, sb->ffree, sb->fsid, (long int)sb->namelen); - p9_free_req(clnt, req); + p9_tag_remove(clnt, req); error: return err; } @@ -1951,7 +1947,7 @@ int p9_client_rename(struct p9_fid *fid, p9_debug(P9_DEBUG_9P, "<<< RRENAME fid %d\n", fid->fid); - p9_free_req(clnt, req); + p9_tag_remove(clnt, req); error: return err; } @@ -1981,7 +1977,7 @@ int p9_client_renameat(struct p9_fid *olddirfid, const char *old_name, p9_debug(P9_DEBUG_9P, "<<< RRENAMEAT newdirfid %d new name %s\n", newdirfid->fid, new_name); - p9_free_req(clnt, req); + p9_tag_remove(clnt, req); error: return err; } @@ -2015,13 +2011,13 @@ struct p9_fid *p9_client_xattrwalk(struct p9_fid *file_fid, err = PTR_ERR(req); goto error; } - err = p9pdu_readf(req->rc, clnt->proto_version, "q", attr_size); + err = p9pdu_readf(&req->rc, clnt->proto_version, "q", attr_size); if (err) { - trace_9p_protocol_dump(clnt, req->rc); - p9_free_req(clnt, req); + trace_9p_protocol_dump(clnt, &req->rc); + p9_tag_remove(clnt, req); goto clunk_fid; } - p9_free_req(clnt, req); + p9_tag_remove(clnt, req); p9_debug(P9_DEBUG_9P, "<<< RXATTRWALK fid %d size %llu\n", attr_fid->fid, *attr_size); return attr_fid; @@ -2055,7 +2051,7 @@ int p9_client_xattrcreate(struct p9_fid *fid, const char *name, goto error; } p9_debug(P9_DEBUG_9P, "<<< RXATTRCREATE fid %d\n", fid->fid); - p9_free_req(clnt, req); + p9_tag_remove(clnt, req); error: return err; } @@ -2103,9 +2099,9 @@ int p9_client_readdir(struct p9_fid *fid, char *data, u32 count, u64 offset) goto error; } - err = p9pdu_readf(req->rc, clnt->proto_version, "D", &count, &dataptr); + err = p9pdu_readf(&req->rc, clnt->proto_version, "D", &count, &dataptr); if (err) { - trace_9p_protocol_dump(clnt, req->rc); + trace_9p_protocol_dump(clnt, &req->rc); goto free_and_error; } if (rsize < count) { @@ -2118,11 +2114,11 @@ int p9_client_readdir(struct p9_fid *fid, char *data, u32 count, u64 offset) if (non_zc) memmove(data, dataptr, count); - p9_free_req(clnt, req); + p9_tag_remove(clnt, req); return count; free_and_error: - p9_free_req(clnt, req); + p9_tag_remove(clnt, req); error: return err; } @@ -2144,16 +2140,16 @@ int p9_client_mknod_dotl(struct p9_fid *fid, const char *name, int mode, if (IS_ERR(req)) return PTR_ERR(req); - err = p9pdu_readf(req->rc, clnt->proto_version, "Q", qid); + err = p9pdu_readf(&req->rc, clnt->proto_version, "Q", qid); if (err) { - trace_9p_protocol_dump(clnt, req->rc); + trace_9p_protocol_dump(clnt, &req->rc); goto error; } p9_debug(P9_DEBUG_9P, "<<< RMKNOD qid %x.%llx.%x\n", qid->type, (unsigned long long)qid->path, qid->version); error: - p9_free_req(clnt, req); + p9_tag_remove(clnt, req); return err; } @@ -2175,16 +2171,16 @@ int p9_client_mkdir_dotl(struct p9_fid *fid, const char *name, int mode, if (IS_ERR(req)) return PTR_ERR(req); - err = p9pdu_readf(req->rc, clnt->proto_version, "Q", qid); + err = p9pdu_readf(&req->rc, clnt->proto_version, "Q", qid); if (err) { - trace_9p_protocol_dump(clnt, req->rc); + trace_9p_protocol_dump(clnt, &req->rc); goto error; } p9_debug(P9_DEBUG_9P, "<<< RMKDIR qid %x.%llx.%x\n", qid->type, (unsigned long long)qid->path, qid->version); error: - p9_free_req(clnt, req); + p9_tag_remove(clnt, req); return err; } @@ -2210,14 +2206,14 @@ int p9_client_lock_dotl(struct p9_fid *fid, struct p9_flock *flock, u8 *status) if (IS_ERR(req)) return PTR_ERR(req); - err = p9pdu_readf(req->rc, clnt->proto_version, "b", status); + err = p9pdu_readf(&req->rc, clnt->proto_version, "b", status); if (err) { - trace_9p_protocol_dump(clnt, req->rc); + trace_9p_protocol_dump(clnt, &req->rc); goto error; } p9_debug(P9_DEBUG_9P, "<<< RLOCK status %i\n", *status); error: - p9_free_req(clnt, req); + p9_tag_remove(clnt, req); return err; } @@ -2241,18 +2237,18 @@ int p9_client_getlock_dotl(struct p9_fid *fid, struct p9_getlock *glock) if (IS_ERR(req)) return PTR_ERR(req); - err = p9pdu_readf(req->rc, clnt->proto_version, "bqqds", &glock->type, - &glock->start, &glock->length, &glock->proc_id, - &glock->client_id); + err = p9pdu_readf(&req->rc, clnt->proto_version, "bqqds", &glock->type, + &glock->start, &glock->length, &glock->proc_id, + &glock->client_id); if (err) { - trace_9p_protocol_dump(clnt, req->rc); + trace_9p_protocol_dump(clnt, &req->rc); goto error; } p9_debug(P9_DEBUG_9P, "<<< RGETLOCK type %i start %lld length %lld " "proc_id %d client_id %s\n", glock->type, glock->start, glock->length, glock->proc_id, glock->client_id); error: - p9_free_req(clnt, req); + p9_tag_remove(clnt, req); return err; } EXPORT_SYMBOL(p9_client_getlock_dotl); @@ -2271,14 +2267,25 @@ int p9_client_readlink(struct p9_fid *fid, char **target) if (IS_ERR(req)) return PTR_ERR(req); - err = p9pdu_readf(req->rc, clnt->proto_version, "s", target); + err = p9pdu_readf(&req->rc, clnt->proto_version, "s", target); if (err) { - trace_9p_protocol_dump(clnt, req->rc); + trace_9p_protocol_dump(clnt, &req->rc); goto error; } p9_debug(P9_DEBUG_9P, "<<< RREADLINK target %s\n", *target); error: - p9_free_req(clnt, req); + p9_tag_remove(clnt, req); return err; } EXPORT_SYMBOL(p9_client_readlink); + +int __init p9_client_init(void) +{ + p9_req_cache = KMEM_CACHE(p9_req_t, SLAB_TYPESAFE_BY_RCU); + return p9_req_cache ? 0 : -ENOMEM; +} + +void __exit p9_client_exit(void) +{ + kmem_cache_destroy(p9_req_cache); +} diff --git a/net/9p/mod.c b/net/9p/mod.c index 253ba824a325..0da56d6af73b 100644 --- a/net/9p/mod.c +++ b/net/9p/mod.c @@ -171,11 +171,17 @@ void v9fs_put_trans(struct p9_trans_module *m) */ static int __init init_p9(void) { + int ret; + + ret = p9_client_init(); + if (ret) + return ret; + p9_error_init(); pr_info("Installing 9P2000 support\n"); p9_trans_fd_init(); - return 0; + return ret; } /** @@ -188,6 +194,7 @@ static void __exit exit_p9(void) pr_info("Unloading 9P2000 support\n"); p9_trans_fd_exit(); + p9_client_exit(); } module_init(init_p9) diff --git a/net/9p/protocol.c b/net/9p/protocol.c index 4a1e1dd30b52..462ba144cb39 100644 --- a/net/9p/protocol.c +++ b/net/9p/protocol.c @@ -46,10 +46,15 @@ p9pdu_writef(struct p9_fcall *pdu, int proto_version, const char *fmt, ...); void p9stat_free(struct p9_wstat *stbuf) { kfree(stbuf->name); + stbuf->name = NULL; kfree(stbuf->uid); + stbuf->uid = NULL; kfree(stbuf->gid); + stbuf->gid = NULL; kfree(stbuf->muid); + stbuf->muid = NULL; kfree(stbuf->extension); + stbuf->extension = NULL; } EXPORT_SYMBOL(p9stat_free); @@ -566,9 +571,10 @@ int p9stat_read(struct p9_client *clnt, char *buf, int len, struct p9_wstat *st) if (ret) { p9_debug(P9_DEBUG_9P, "<<< p9stat_read failed: %d\n", ret); trace_9p_protocol_dump(clnt, &fake_pdu); + return ret; } - return ret; + return fake_pdu.offset; } EXPORT_SYMBOL(p9stat_read); @@ -617,13 +623,19 @@ int p9dirent_read(struct p9_client *clnt, char *buf, int len, if (ret) { p9_debug(P9_DEBUG_9P, "<<< p9dirent_read failed: %d\n", ret); trace_9p_protocol_dump(clnt, &fake_pdu); - goto out; + return ret; } - strcpy(dirent->d_name, nameptr); + ret = strscpy(dirent->d_name, nameptr, sizeof(dirent->d_name)); + if (ret < 0) { + p9_debug(P9_DEBUG_ERROR, + "On the wire dirent name too long: %s\n", + nameptr); + kfree(nameptr); + return ret; + } kfree(nameptr); -out: return fake_pdu.offset; } EXPORT_SYMBOL(p9dirent_read); diff --git a/net/9p/trans_fd.c b/net/9p/trans_fd.c index e2ef3c782c53..f868cf6fba79 100644 --- a/net/9p/trans_fd.c +++ b/net/9p/trans_fd.c @@ -131,7 +131,8 @@ struct p9_conn { int err; struct list_head req_list; struct list_head unsent_req_list; - struct p9_req_t *req; + struct p9_req_t *rreq; + struct p9_req_t *wreq; char tmp_buf[7]; struct p9_fcall rc; int wpos; @@ -291,7 +292,6 @@ static void p9_read_work(struct work_struct *work) __poll_t n; int err; struct p9_conn *m; - int status = REQ_STATUS_ERROR; m = container_of(work, struct p9_conn, rq); @@ -322,7 +322,7 @@ static void p9_read_work(struct work_struct *work) m->rc.offset += err; /* header read in */ - if ((!m->req) && (m->rc.offset == m->rc.capacity)) { + if ((!m->rreq) && (m->rc.offset == m->rc.capacity)) { p9_debug(P9_DEBUG_TRANS, "got new header\n"); /* Header size */ @@ -346,23 +346,23 @@ static void p9_read_work(struct work_struct *work) "mux %p pkt: size: %d bytes tag: %d\n", m, m->rc.size, m->rc.tag); - m->req = p9_tag_lookup(m->client, m->rc.tag); - if (!m->req || (m->req->status != REQ_STATUS_SENT)) { + m->rreq = p9_tag_lookup(m->client, m->rc.tag); + if (!m->rreq || (m->rreq->status != REQ_STATUS_SENT)) { p9_debug(P9_DEBUG_ERROR, "Unexpected packet tag %d\n", m->rc.tag); err = -EIO; goto error; } - if (m->req->rc == NULL) { + if (!m->rreq->rc.sdata) { p9_debug(P9_DEBUG_ERROR, "No recv fcall for tag %d (req %p), disconnecting!\n", - m->rc.tag, m->req); - m->req = NULL; + m->rc.tag, m->rreq); + m->rreq = NULL; err = -EIO; goto error; } - m->rc.sdata = (char *)m->req->rc + sizeof(struct p9_fcall); + m->rc.sdata = m->rreq->rc.sdata; memcpy(m->rc.sdata, m->tmp_buf, m->rc.capacity); m->rc.capacity = m->rc.size; } @@ -370,20 +370,27 @@ static void p9_read_work(struct work_struct *work) /* packet is read in * not an else because some packets (like clunk) have no payload */ - if ((m->req) && (m->rc.offset == m->rc.capacity)) { + if ((m->rreq) && (m->rc.offset == m->rc.capacity)) { p9_debug(P9_DEBUG_TRANS, "got new packet\n"); - m->req->rc->size = m->rc.offset; + m->rreq->rc.size = m->rc.offset; spin_lock(&m->client->lock); - if (m->req->status != REQ_STATUS_ERROR) - status = REQ_STATUS_RCVD; - list_del(&m->req->req_list); - /* update req->status while holding client->lock */ - p9_client_cb(m->client, m->req, status); + if (m->rreq->status == REQ_STATUS_SENT) { + list_del(&m->rreq->req_list); + p9_client_cb(m->client, m->rreq, REQ_STATUS_RCVD); + } else { + spin_unlock(&m->client->lock); + p9_debug(P9_DEBUG_ERROR, + "Request tag %d errored out while we were reading the reply\n", + m->rc.tag); + err = -EIO; + goto error; + } spin_unlock(&m->client->lock); m->rc.sdata = NULL; m->rc.offset = 0; m->rc.capacity = 0; - m->req = NULL; + p9_req_put(m->rreq); + m->rreq = NULL; } end_clear: @@ -469,9 +476,11 @@ static void p9_write_work(struct work_struct *work) p9_debug(P9_DEBUG_TRANS, "move req %p\n", req); list_move_tail(&req->req_list, &m->req_list); - m->wbuf = req->tc->sdata; - m->wsize = req->tc->size; + m->wbuf = req->tc.sdata; + m->wsize = req->tc.size; m->wpos = 0; + p9_req_get(req); + m->wreq = req; spin_unlock(&m->client->lock); } @@ -492,8 +501,11 @@ static void p9_write_work(struct work_struct *work) } m->wpos += err; - if (m->wpos == m->wsize) + if (m->wpos == m->wsize) { m->wpos = m->wsize = 0; + p9_req_put(m->wreq); + m->wreq = NULL; + } end_clear: clear_bit(Wworksched, &m->wsched); @@ -663,7 +675,7 @@ static int p9_fd_request(struct p9_client *client, struct p9_req_t *req) struct p9_conn *m = &ts->conn; p9_debug(P9_DEBUG_TRANS, "mux %p task %p tcall %p id %d\n", - m, current, req->tc, req->tc->id); + m, current, &req->tc, req->tc.id); if (m->err < 0) return m->err; @@ -694,6 +706,7 @@ static int p9_fd_cancel(struct p9_client *client, struct p9_req_t *req) if (req->status == REQ_STATUS_UNSENT) { list_del(&req->req_list); req->status = REQ_STATUS_FLSHD; + p9_req_put(req); ret = 0; } spin_unlock(&client->lock); @@ -711,6 +724,7 @@ static int p9_fd_cancelled(struct p9_client *client, struct p9_req_t *req) spin_lock(&client->lock); list_del(&req->req_list); spin_unlock(&client->lock); + p9_req_put(req); return 0; } @@ -862,7 +876,15 @@ static void p9_conn_destroy(struct p9_conn *m) p9_mux_poll_stop(m); cancel_work_sync(&m->rq); + if (m->rreq) { + p9_req_put(m->rreq); + m->rreq = NULL; + } cancel_work_sync(&m->wq); + if (m->wreq) { + p9_req_put(m->wreq); + m->wreq = NULL; + } p9_conn_cancel(m, -ECONNRESET); diff --git a/net/9p/trans_rdma.c b/net/9p/trans_rdma.c index b513cffeeb3c..119103bfa82e 100644 --- a/net/9p/trans_rdma.c +++ b/net/9p/trans_rdma.c @@ -122,7 +122,7 @@ struct p9_rdma_context { dma_addr_t busa; union { struct p9_req_t *req; - struct p9_fcall *rc; + struct p9_fcall rc; }; }; @@ -274,8 +274,7 @@ p9_cm_event_handler(struct rdma_cm_id *id, struct rdma_cm_event *event) case RDMA_CM_EVENT_DISCONNECTED: if (rdma) rdma->state = P9_RDMA_CLOSED; - if (c) - c->status = Disconnected; + c->status = Disconnected; break; case RDMA_CM_EVENT_TIMEWAIT_EXIT: @@ -320,8 +319,8 @@ recv_done(struct ib_cq *cq, struct ib_wc *wc) if (wc->status != IB_WC_SUCCESS) goto err_out; - c->rc->size = wc->byte_len; - err = p9_parse_header(c->rc, NULL, NULL, &tag, 1); + c->rc.size = wc->byte_len; + err = p9_parse_header(&c->rc, NULL, NULL, &tag, 1); if (err) goto err_out; @@ -331,12 +330,13 @@ recv_done(struct ib_cq *cq, struct ib_wc *wc) /* Check that we have not yet received a reply for this request. */ - if (unlikely(req->rc)) { + if (unlikely(req->rc.sdata)) { pr_err("Duplicate reply for request %d", tag); goto err_out; } - req->rc = c->rc; + req->rc.size = c->rc.size; + req->rc.sdata = c->rc.sdata; p9_client_cb(client, req, REQ_STATUS_RCVD); out: @@ -361,9 +361,10 @@ send_done(struct ib_cq *cq, struct ib_wc *wc) container_of(wc->wr_cqe, struct p9_rdma_context, cqe); ib_dma_unmap_single(rdma->cm_id->device, - c->busa, c->req->tc->size, + c->busa, c->req->tc.size, DMA_TO_DEVICE); up(&rdma->sq_sem); + p9_req_put(c->req); kfree(c); } @@ -401,7 +402,7 @@ post_recv(struct p9_client *client, struct p9_rdma_context *c) struct ib_sge sge; c->busa = ib_dma_map_single(rdma->cm_id->device, - c->rc->sdata, client->msize, + c->rc.sdata, client->msize, DMA_FROM_DEVICE); if (ib_dma_mapping_error(rdma->cm_id->device, c->busa)) goto error; @@ -443,9 +444,9 @@ static int rdma_request(struct p9_client *client, struct p9_req_t *req) **/ if (unlikely(atomic_read(&rdma->excess_rc) > 0)) { if ((atomic_sub_return(1, &rdma->excess_rc) >= 0)) { - /* Got one ! */ - kfree(req->rc); - req->rc = NULL; + /* Got one! */ + p9_fcall_fini(&req->rc); + req->rc.sdata = NULL; goto dont_need_post_recv; } else { /* We raced and lost. */ @@ -459,7 +460,7 @@ static int rdma_request(struct p9_client *client, struct p9_req_t *req) err = -ENOMEM; goto recv_error; } - rpl_context->rc = req->rc; + rpl_context->rc.sdata = req->rc.sdata; /* * Post a receive buffer for this request. We need to ensure @@ -475,11 +476,11 @@ static int rdma_request(struct p9_client *client, struct p9_req_t *req) err = post_recv(client, rpl_context); if (err) { - p9_debug(P9_DEBUG_FCALL, "POST RECV failed\n"); + p9_debug(P9_DEBUG_ERROR, "POST RECV failed: %d\n", err); goto recv_error; } /* remove posted receive buffer from request structure */ - req->rc = NULL; + req->rc.sdata = NULL; dont_need_post_recv: /* Post the request */ @@ -491,7 +492,7 @@ dont_need_post_recv: c->req = req; c->busa = ib_dma_map_single(rdma->cm_id->device, - c->req->tc->sdata, c->req->tc->size, + c->req->tc.sdata, c->req->tc.size, DMA_TO_DEVICE); if (ib_dma_mapping_error(rdma->cm_id->device, c->busa)) { err = -EIO; @@ -501,7 +502,7 @@ dont_need_post_recv: c->cqe.done = send_done; sge.addr = c->busa; - sge.length = c->req->tc->size; + sge.length = c->req->tc.size; sge.lkey = rdma->pd->local_dma_lkey; wr.next = NULL; @@ -544,7 +545,7 @@ dont_need_post_recv: recv_error: kfree(rpl_context); spin_lock_irqsave(&rdma->req_lock, flags); - if (rdma->state < P9_RDMA_CLOSING) { + if (err != -EINTR && rdma->state < P9_RDMA_CLOSING) { rdma->state = P9_RDMA_CLOSING; spin_unlock_irqrestore(&rdma->req_lock, flags); rdma_disconnect(rdma->cm_id); diff --git a/net/9p/trans_virtio.c b/net/9p/trans_virtio.c index 7728b0acde09..eb596c2ed546 100644 --- a/net/9p/trans_virtio.c +++ b/net/9p/trans_virtio.c @@ -155,7 +155,7 @@ static void req_done(struct virtqueue *vq) } if (len) { - req->rc->size = len; + req->rc.size = len; p9_client_cb(chan->client, req, REQ_STATUS_RCVD); } } @@ -207,6 +207,13 @@ static int p9_virtio_cancel(struct p9_client *client, struct p9_req_t *req) return 1; } +/* Reply won't come, so drop req ref */ +static int p9_virtio_cancelled(struct p9_client *client, struct p9_req_t *req) +{ + p9_req_put(req); + return 0; +} + /** * pack_sg_list_p - Just like pack_sg_list. Instead of taking a buffer, * this takes a list of pages. @@ -273,12 +280,12 @@ req_retry: out_sgs = in_sgs = 0; /* Handle out VirtIO ring buffers */ out = pack_sg_list(chan->sg, 0, - VIRTQUEUE_NUM, req->tc->sdata, req->tc->size); + VIRTQUEUE_NUM, req->tc.sdata, req->tc.size); if (out) sgs[out_sgs++] = chan->sg; in = pack_sg_list(chan->sg, out, - VIRTQUEUE_NUM, req->rc->sdata, req->rc->capacity); + VIRTQUEUE_NUM, req->rc.sdata, req->rc.capacity); if (in) sgs[out_sgs + in_sgs++] = chan->sg + out; @@ -404,6 +411,7 @@ p9_virtio_zc_request(struct p9_client *client, struct p9_req_t *req, struct scatterlist *sgs[4]; size_t offs; int need_drop = 0; + int kicked = 0; p9_debug(P9_DEBUG_TRANS, "virtio request\n"); @@ -411,29 +419,33 @@ p9_virtio_zc_request(struct p9_client *client, struct p9_req_t *req, __le32 sz; int n = p9_get_mapped_pages(chan, &out_pages, uodata, outlen, &offs, &need_drop); - if (n < 0) - return n; + if (n < 0) { + err = n; + goto err_out; + } out_nr_pages = DIV_ROUND_UP(n + offs, PAGE_SIZE); if (n != outlen) { __le32 v = cpu_to_le32(n); - memcpy(&req->tc->sdata[req->tc->size - 4], &v, 4); + memcpy(&req->tc.sdata[req->tc.size - 4], &v, 4); outlen = n; } /* The size field of the message must include the length of the * header and the length of the data. We didn't actually know * the length of the data until this point so add it in now. */ - sz = cpu_to_le32(req->tc->size + outlen); - memcpy(&req->tc->sdata[0], &sz, sizeof(sz)); + sz = cpu_to_le32(req->tc.size + outlen); + memcpy(&req->tc.sdata[0], &sz, sizeof(sz)); } else if (uidata) { int n = p9_get_mapped_pages(chan, &in_pages, uidata, inlen, &offs, &need_drop); - if (n < 0) - return n; + if (n < 0) { + err = n; + goto err_out; + } in_nr_pages = DIV_ROUND_UP(n + offs, PAGE_SIZE); if (n != inlen) { __le32 v = cpu_to_le32(n); - memcpy(&req->tc->sdata[req->tc->size - 4], &v, 4); + memcpy(&req->tc.sdata[req->tc.size - 4], &v, 4); inlen = n; } } @@ -445,7 +457,7 @@ req_retry_pinned: /* out data */ out = pack_sg_list(chan->sg, 0, - VIRTQUEUE_NUM, req->tc->sdata, req->tc->size); + VIRTQUEUE_NUM, req->tc.sdata, req->tc.size); if (out) sgs[out_sgs++] = chan->sg; @@ -464,7 +476,7 @@ req_retry_pinned: * alloced memory and payload onto the user buffer. */ in = pack_sg_list(chan->sg, out, - VIRTQUEUE_NUM, req->rc->sdata, in_hdr_len); + VIRTQUEUE_NUM, req->rc.sdata, in_hdr_len); if (in) sgs[out_sgs + in_sgs++] = chan->sg + out; @@ -498,6 +510,7 @@ req_retry_pinned: } virtqueue_kick(chan->vq); spin_unlock_irqrestore(&chan->lock, flags); + kicked = 1; p9_debug(P9_DEBUG_TRANS, "virtio request kicked\n"); err = wait_event_killable(req->wq, req->status >= REQ_STATUS_RCVD); /* @@ -518,6 +531,10 @@ err_out: } kvfree(in_pages); kvfree(out_pages); + if (!kicked) { + /* reply won't come */ + p9_req_put(req); + } return err; } @@ -750,6 +767,7 @@ static struct p9_trans_module p9_virtio_trans = { .request = p9_virtio_request, .zc_request = p9_virtio_zc_request, .cancel = p9_virtio_cancel, + .cancelled = p9_virtio_cancelled, /* * We leave one entry for input and one entry for response * headers. We also skip one more entry to accomodate, address diff --git a/net/9p/trans_xen.c b/net/9p/trans_xen.c index c2d54ac76bfd..e2fbf3677b9b 100644 --- a/net/9p/trans_xen.c +++ b/net/9p/trans_xen.c @@ -141,7 +141,7 @@ static int p9_xen_request(struct p9_client *client, struct p9_req_t *p9_req) struct xen_9pfs_front_priv *priv = NULL; RING_IDX cons, prod, masked_cons, masked_prod; unsigned long flags; - u32 size = p9_req->tc->size; + u32 size = p9_req->tc.size; struct xen_9pfs_dataring *ring; int num; @@ -154,7 +154,7 @@ static int p9_xen_request(struct p9_client *client, struct p9_req_t *p9_req) if (!priv || priv->client != client) return -EINVAL; - num = p9_req->tc->tag % priv->num_rings; + num = p9_req->tc.tag % priv->num_rings; ring = &priv->rings[num]; again: @@ -176,7 +176,7 @@ again: masked_prod = xen_9pfs_mask(prod, XEN_9PFS_RING_SIZE); masked_cons = xen_9pfs_mask(cons, XEN_9PFS_RING_SIZE); - xen_9pfs_write_packet(ring->data.out, p9_req->tc->sdata, size, + xen_9pfs_write_packet(ring->data.out, p9_req->tc.sdata, size, &masked_prod, masked_cons, XEN_9PFS_RING_SIZE); p9_req->status = REQ_STATUS_SENT; @@ -185,6 +185,7 @@ again: ring->intf->out_prod = prod; spin_unlock_irqrestore(&ring->lock, flags); notify_remote_via_irq(ring->irq); + p9_req_put(p9_req); return 0; } @@ -229,12 +230,12 @@ static void p9_xen_response(struct work_struct *work) continue; } - memcpy(req->rc, &h, sizeof(h)); - req->rc->offset = 0; + memcpy(&req->rc, &h, sizeof(h)); + req->rc.offset = 0; masked_cons = xen_9pfs_mask(cons, XEN_9PFS_RING_SIZE); /* Then, read the whole packet (including the header) */ - xen_9pfs_read_packet(req->rc->sdata, ring->data.in, h.size, + xen_9pfs_read_packet(req->rc.sdata, ring->data.in, h.size, masked_prod, &masked_cons, XEN_9PFS_RING_SIZE); @@ -391,8 +392,8 @@ static int xen_9pfs_front_probe(struct xenbus_device *dev, unsigned int max_rings, max_ring_order, len = 0; versions = xenbus_read(XBT_NIL, dev->otherend, "versions", &len); - if (!len) - return -EINVAL; + if (IS_ERR(versions)) + return PTR_ERR(versions); if (strcmp(versions, "1")) { kfree(versions); return -EINVAL; diff --git a/net/9p/util.c b/net/9p/util.c deleted file mode 100644 index 55ad98277e85..000000000000 --- a/net/9p/util.c +++ /dev/null @@ -1,140 +0,0 @@ -/* - * net/9p/util.c - * - * This file contains some helper functions - * - * Copyright (C) 2007 by Latchesar Ionkov <lucho@ionkov.net> - * Copyright (C) 2004 by Eric Van Hensbergen <ericvh@gmail.com> - * Copyright (C) 2002 by Ron Minnich <rminnich@lanl.gov> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to: - * Free Software Foundation - * 51 Franklin Street, Fifth Floor - * Boston, MA 02111-1301 USA - * - */ - -#include <linux/module.h> -#include <linux/errno.h> -#include <linux/fs.h> -#include <linux/sched.h> -#include <linux/parser.h> -#include <linux/idr.h> -#include <linux/slab.h> -#include <net/9p/9p.h> - -/** - * struct p9_idpool - per-connection accounting for tag idpool - * @lock: protects the pool - * @pool: idr to allocate tag id from - * - */ - -struct p9_idpool { - spinlock_t lock; - struct idr pool; -}; - -/** - * p9_idpool_create - create a new per-connection id pool - * - */ - -struct p9_idpool *p9_idpool_create(void) -{ - struct p9_idpool *p; - - p = kmalloc(sizeof(struct p9_idpool), GFP_KERNEL); - if (!p) - return ERR_PTR(-ENOMEM); - - spin_lock_init(&p->lock); - idr_init(&p->pool); - - return p; -} -EXPORT_SYMBOL(p9_idpool_create); - -/** - * p9_idpool_destroy - create a new per-connection id pool - * @p: idpool to destroy - */ - -void p9_idpool_destroy(struct p9_idpool *p) -{ - idr_destroy(&p->pool); - kfree(p); -} -EXPORT_SYMBOL(p9_idpool_destroy); - -/** - * p9_idpool_get - allocate numeric id from pool - * @p: pool to allocate from - * - * Bugs: This seems to be an awful generic function, should it be in idr.c with - * the lock included in struct idr? - */ - -int p9_idpool_get(struct p9_idpool *p) -{ - int i; - unsigned long flags; - - idr_preload(GFP_NOFS); - spin_lock_irqsave(&p->lock, flags); - - /* no need to store exactly p, we just need something non-null */ - i = idr_alloc(&p->pool, p, 0, 0, GFP_NOWAIT); - - spin_unlock_irqrestore(&p->lock, flags); - idr_preload_end(); - if (i < 0) - return -1; - - p9_debug(P9_DEBUG_MUX, " id %d pool %p\n", i, p); - return i; -} -EXPORT_SYMBOL(p9_idpool_get); - -/** - * p9_idpool_put - release numeric id from pool - * @id: numeric id which is being released - * @p: pool to release id into - * - * Bugs: This seems to be an awful generic function, should it be in idr.c with - * the lock included in struct idr? - */ - -void p9_idpool_put(int id, struct p9_idpool *p) -{ - unsigned long flags; - - p9_debug(P9_DEBUG_MUX, " id %d pool %p\n", id, p); - - spin_lock_irqsave(&p->lock, flags); - idr_remove(&p->pool, id); - spin_unlock_irqrestore(&p->lock, flags); -} -EXPORT_SYMBOL(p9_idpool_put); - -/** - * p9_idpool_check - check if the specified id is available - * @id: id to check - * @p: pool to check - */ - -int p9_idpool_check(int id, struct p9_idpool *p) -{ - return idr_find(&p->pool, id) != NULL; -} -EXPORT_SYMBOL(p9_idpool_check); diff --git a/net/bluetooth/bnep/sock.c b/net/bluetooth/bnep/sock.c index 00deacdcb51c..cfd83c5521ae 100644 --- a/net/bluetooth/bnep/sock.c +++ b/net/bluetooth/bnep/sock.c @@ -49,18 +49,17 @@ static int bnep_sock_release(struct socket *sock) return 0; } -static int bnep_sock_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) +static int do_bnep_sock_ioctl(struct socket *sock, unsigned int cmd, void __user *argp) { struct bnep_connlist_req cl; struct bnep_connadd_req ca; struct bnep_conndel_req cd; struct bnep_conninfo ci; struct socket *nsock; - void __user *argp = (void __user *)arg; __u32 supp_feat = BIT(BNEP_SETUP_RESPONSE); int err; - BT_DBG("cmd %x arg %lx", cmd, arg); + BT_DBG("cmd %x arg %p", cmd, argp); switch (cmd) { case BNEPCONNADD: @@ -134,16 +133,22 @@ static int bnep_sock_ioctl(struct socket *sock, unsigned int cmd, unsigned long return 0; } +static int bnep_sock_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) +{ + return do_bnep_sock_ioctl(sock, cmd, (void __user *)arg); +} + #ifdef CONFIG_COMPAT static int bnep_sock_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) { + void __user *argp = compat_ptr(arg); if (cmd == BNEPGETCONNLIST) { struct bnep_connlist_req cl; + unsigned __user *p = argp; u32 uci; int err; - if (get_user(cl.cnum, (u32 __user *) arg) || - get_user(uci, (u32 __user *) (arg + 4))) + if (get_user(cl.cnum, p) || get_user(uci, p + 1)) return -EFAULT; cl.ci = compat_ptr(uci); @@ -153,13 +158,13 @@ static int bnep_sock_compat_ioctl(struct socket *sock, unsigned int cmd, unsigne err = bnep_get_connlist(&cl); - if (!err && put_user(cl.cnum, (u32 __user *) arg)) + if (!err && put_user(cl.cnum, p)) err = -EFAULT; return err; } - return bnep_sock_ioctl(sock, cmd, arg); + return do_bnep_sock_ioctl(sock, cmd, argp); } #endif diff --git a/net/bluetooth/cmtp/sock.c b/net/bluetooth/cmtp/sock.c index e08f28fadd65..defdd4871919 100644 --- a/net/bluetooth/cmtp/sock.c +++ b/net/bluetooth/cmtp/sock.c @@ -63,17 +63,16 @@ static int cmtp_sock_release(struct socket *sock) return 0; } -static int cmtp_sock_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) +static int do_cmtp_sock_ioctl(struct socket *sock, unsigned int cmd, void __user *argp) { struct cmtp_connadd_req ca; struct cmtp_conndel_req cd; struct cmtp_connlist_req cl; struct cmtp_conninfo ci; struct socket *nsock; - void __user *argp = (void __user *)arg; int err; - BT_DBG("cmd %x arg %lx", cmd, arg); + BT_DBG("cmd %x arg %p", cmd, argp); switch (cmd) { case CMTPCONNADD: @@ -137,16 +136,22 @@ static int cmtp_sock_ioctl(struct socket *sock, unsigned int cmd, unsigned long return -EINVAL; } +static int cmtp_sock_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) +{ + return do_cmtp_sock_ioctl(sock, cmd, (void __user *)arg); +} + #ifdef CONFIG_COMPAT static int cmtp_sock_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) { + void __user *argp = compat_ptr(arg); if (cmd == CMTPGETCONNLIST) { struct cmtp_connlist_req cl; + u32 __user *p = argp; u32 uci; int err; - if (get_user(cl.cnum, (u32 __user *) arg) || - get_user(uci, (u32 __user *) (arg + 4))) + if (get_user(cl.cnum, p) || get_user(uci, p + 1)) return -EFAULT; cl.ci = compat_ptr(uci); @@ -156,13 +161,13 @@ static int cmtp_sock_compat_ioctl(struct socket *sock, unsigned int cmd, unsigne err = cmtp_get_connlist(&cl); - if (!err && put_user(cl.cnum, (u32 __user *) arg)) + if (!err && put_user(cl.cnum, p)) err = -EFAULT; return err; } - return cmtp_sock_ioctl(sock, cmd, arg); + return do_cmtp_sock_ioctl(sock, cmd, argp); } #endif diff --git a/net/bluetooth/hidp/core.c b/net/bluetooth/hidp/core.c index 3734dc1788b4..a442e21f3894 100644 --- a/net/bluetooth/hidp/core.c +++ b/net/bluetooth/hidp/core.c @@ -649,7 +649,7 @@ static void hidp_process_transmit(struct hidp_session *session, } static int hidp_setup_input(struct hidp_session *session, - struct hidp_connadd_req *req) + const struct hidp_connadd_req *req) { struct input_dev *input; int i; @@ -748,7 +748,7 @@ EXPORT_SYMBOL_GPL(hidp_hid_driver); /* This function sets up the hid device. It does not add it to the HID system. That is done in hidp_add_connection(). */ static int hidp_setup_hid(struct hidp_session *session, - struct hidp_connadd_req *req) + const struct hidp_connadd_req *req) { struct hid_device *hid; int err; @@ -807,7 +807,7 @@ fault: /* initialize session devices */ static int hidp_session_dev_init(struct hidp_session *session, - struct hidp_connadd_req *req) + const struct hidp_connadd_req *req) { int ret; @@ -906,7 +906,7 @@ static void hidp_session_dev_work(struct work_struct *work) static int hidp_session_new(struct hidp_session **out, const bdaddr_t *bdaddr, struct socket *ctrl_sock, struct socket *intr_sock, - struct hidp_connadd_req *req, + const struct hidp_connadd_req *req, struct l2cap_conn *conn) { struct hidp_session *session; @@ -1338,7 +1338,7 @@ static int hidp_verify_sockets(struct socket *ctrl_sock, return 0; } -int hidp_connection_add(struct hidp_connadd_req *req, +int hidp_connection_add(const struct hidp_connadd_req *req, struct socket *ctrl_sock, struct socket *intr_sock) { diff --git a/net/bluetooth/hidp/hidp.h b/net/bluetooth/hidp/hidp.h index 8798492a6e99..6ef88d0a1919 100644 --- a/net/bluetooth/hidp/hidp.h +++ b/net/bluetooth/hidp/hidp.h @@ -122,7 +122,7 @@ struct hidp_connlist_req { struct hidp_conninfo __user *ci; }; -int hidp_connection_add(struct hidp_connadd_req *req, struct socket *ctrl_sock, struct socket *intr_sock); +int hidp_connection_add(const struct hidp_connadd_req *req, struct socket *ctrl_sock, struct socket *intr_sock); int hidp_connection_del(struct hidp_conndel_req *req); int hidp_get_connlist(struct hidp_connlist_req *req); int hidp_get_conninfo(struct hidp_conninfo *ci); diff --git a/net/bluetooth/hidp/sock.c b/net/bluetooth/hidp/sock.c index 1eaac01f85de..9f85a1943be9 100644 --- a/net/bluetooth/hidp/sock.c +++ b/net/bluetooth/hidp/sock.c @@ -46,9 +46,8 @@ static int hidp_sock_release(struct socket *sock) return 0; } -static int hidp_sock_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) +static int do_hidp_sock_ioctl(struct socket *sock, unsigned int cmd, void __user *argp) { - void __user *argp = (void __user *) arg; struct hidp_connadd_req ca; struct hidp_conndel_req cd; struct hidp_connlist_req cl; @@ -57,7 +56,7 @@ static int hidp_sock_ioctl(struct socket *sock, unsigned int cmd, unsigned long struct socket *isock; int err; - BT_DBG("cmd %x arg %lx", cmd, arg); + BT_DBG("cmd %x arg %p", cmd, argp); switch (cmd) { case HIDPCONNADD: @@ -122,6 +121,11 @@ static int hidp_sock_ioctl(struct socket *sock, unsigned int cmd, unsigned long return -EINVAL; } +static int hidp_sock_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) +{ + return do_hidp_sock_ioctl(sock, cmd, (void __user *)arg); +} + #ifdef CONFIG_COMPAT struct compat_hidp_connadd_req { int ctrl_sock; /* Connected control socket */ @@ -141,13 +145,15 @@ struct compat_hidp_connadd_req { static int hidp_sock_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) { + void __user *argp = compat_ptr(arg); + int err; + if (cmd == HIDPGETCONNLIST) { struct hidp_connlist_req cl; + u32 __user *p = argp; u32 uci; - int err; - if (get_user(cl.cnum, (u32 __user *) arg) || - get_user(uci, (u32 __user *) (arg + 4))) + if (get_user(cl.cnum, p) || get_user(uci, p + 1)) return -EFAULT; cl.ci = compat_ptr(uci); @@ -157,39 +163,54 @@ static int hidp_sock_compat_ioctl(struct socket *sock, unsigned int cmd, unsigne err = hidp_get_connlist(&cl); - if (!err && put_user(cl.cnum, (u32 __user *) arg)) + if (!err && put_user(cl.cnum, p)) err = -EFAULT; return err; } else if (cmd == HIDPCONNADD) { - struct compat_hidp_connadd_req ca; - struct hidp_connadd_req __user *uca; + struct compat_hidp_connadd_req ca32; + struct hidp_connadd_req ca; + struct socket *csock; + struct socket *isock; - uca = compat_alloc_user_space(sizeof(*uca)); + if (!capable(CAP_NET_ADMIN)) + return -EPERM; - if (copy_from_user(&ca, (void __user *) arg, sizeof(ca))) + if (copy_from_user(&ca32, (void __user *) arg, sizeof(ca32))) return -EFAULT; - if (put_user(ca.ctrl_sock, &uca->ctrl_sock) || - put_user(ca.intr_sock, &uca->intr_sock) || - put_user(ca.parser, &uca->parser) || - put_user(ca.rd_size, &uca->rd_size) || - put_user(compat_ptr(ca.rd_data), &uca->rd_data) || - put_user(ca.country, &uca->country) || - put_user(ca.subclass, &uca->subclass) || - put_user(ca.vendor, &uca->vendor) || - put_user(ca.product, &uca->product) || - put_user(ca.version, &uca->version) || - put_user(ca.flags, &uca->flags) || - put_user(ca.idle_to, &uca->idle_to) || - copy_to_user(&uca->name[0], &ca.name[0], 128)) - return -EFAULT; + ca.ctrl_sock = ca32.ctrl_sock; + ca.intr_sock = ca32.intr_sock; + ca.parser = ca32.parser; + ca.rd_size = ca32.rd_size; + ca.rd_data = compat_ptr(ca32.rd_data); + ca.country = ca32.country; + ca.subclass = ca32.subclass; + ca.vendor = ca32.vendor; + ca.product = ca32.product; + ca.version = ca32.version; + ca.flags = ca32.flags; + ca.idle_to = ca32.idle_to; + memcpy(ca.name, ca32.name, 128); + + csock = sockfd_lookup(ca.ctrl_sock, &err); + if (!csock) + return err; - arg = (unsigned long) uca; + isock = sockfd_lookup(ca.intr_sock, &err); + if (!isock) { + sockfd_put(csock); + return err; + } - /* Fall through. We don't actually write back any _changes_ - to the structure anyway, so there's no need to copy back - into the original compat version */ + err = hidp_connection_add(&ca, csock, isock); + if (!err && copy_to_user(argp, &ca32, sizeof(ca32))) + err = -EFAULT; + + sockfd_put(csock); + sockfd_put(isock); + + return err; } return hidp_sock_ioctl(sock, cmd, arg); diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c index 024139b51d3a..6bac0d6b7b94 100644 --- a/net/bridge/br_multicast.c +++ b/net/bridge/br_multicast.c @@ -1422,7 +1422,14 @@ static void br_multicast_query_received(struct net_bridge *br, return; br_multicast_update_query_timer(br, query, max_delay); - br_multicast_mark_router(br, port); + + /* Based on RFC4541, section 2.1.1 IGMP Forwarding Rules, + * the arrival port for IGMP Queries where the source address + * is 0.0.0.0 should not be added to router port list. + */ + if ((saddr->proto == htons(ETH_P_IP) && saddr->u.ip4) || + saddr->proto == htons(ETH_P_IPV6)) + br_multicast_mark_router(br, port); } static void br_ip4_multicast_query(struct net_bridge *br, diff --git a/net/ceph/crypto.c b/net/ceph/crypto.c index 02172c408ff2..5d6724cee38f 100644 --- a/net/ceph/crypto.c +++ b/net/ceph/crypto.c @@ -46,9 +46,9 @@ static int set_secret(struct ceph_crypto_key *key, void *buf) goto fail; } - /* crypto_alloc_skcipher() allocates with GFP_KERNEL */ + /* crypto_alloc_sync_skcipher() allocates with GFP_KERNEL */ noio_flag = memalloc_noio_save(); - key->tfm = crypto_alloc_skcipher("cbc(aes)", 0, CRYPTO_ALG_ASYNC); + key->tfm = crypto_alloc_sync_skcipher("cbc(aes)", 0, 0); memalloc_noio_restore(noio_flag); if (IS_ERR(key->tfm)) { ret = PTR_ERR(key->tfm); @@ -56,7 +56,7 @@ static int set_secret(struct ceph_crypto_key *key, void *buf) goto fail; } - ret = crypto_skcipher_setkey(key->tfm, key->key, key->len); + ret = crypto_sync_skcipher_setkey(key->tfm, key->key, key->len); if (ret) goto fail; @@ -136,7 +136,7 @@ void ceph_crypto_key_destroy(struct ceph_crypto_key *key) if (key) { kfree(key->key); key->key = NULL; - crypto_free_skcipher(key->tfm); + crypto_free_sync_skcipher(key->tfm); key->tfm = NULL; } } @@ -216,7 +216,7 @@ static void teardown_sgtable(struct sg_table *sgt) static int ceph_aes_crypt(const struct ceph_crypto_key *key, bool encrypt, void *buf, int buf_len, int in_len, int *pout_len) { - SKCIPHER_REQUEST_ON_STACK(req, key->tfm); + SYNC_SKCIPHER_REQUEST_ON_STACK(req, key->tfm); struct sg_table sgt; struct scatterlist prealloc_sg; char iv[AES_BLOCK_SIZE] __aligned(8); @@ -232,7 +232,7 @@ static int ceph_aes_crypt(const struct ceph_crypto_key *key, bool encrypt, return ret; memcpy(iv, aes_iv, AES_BLOCK_SIZE); - skcipher_request_set_tfm(req, key->tfm); + skcipher_request_set_sync_tfm(req, key->tfm); skcipher_request_set_callback(req, 0, NULL, NULL); skcipher_request_set_crypt(req, sgt.sgl, sgt.sgl, crypt_len, iv); diff --git a/net/ceph/crypto.h b/net/ceph/crypto.h index bb45c7d43739..96ef4d860bc9 100644 --- a/net/ceph/crypto.h +++ b/net/ceph/crypto.h @@ -13,7 +13,7 @@ struct ceph_crypto_key { struct ceph_timespec created; int len; void *key; - struct crypto_skcipher *tfm; + struct crypto_sync_skcipher *tfm; }; int ceph_crypto_key_clone(struct ceph_crypto_key *dst, diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index 0a187196aeed..88e35830198c 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -156,7 +156,6 @@ static bool con_flag_test_and_set(struct ceph_connection *con, /* Slab caches for frequently-allocated structures */ static struct kmem_cache *ceph_msg_cache; -static struct kmem_cache *ceph_msg_data_cache; /* static tag bytes (protocol control messages) */ static char tag_msg = CEPH_MSGR_TAG_MSG; @@ -235,23 +234,11 @@ static int ceph_msgr_slab_init(void) if (!ceph_msg_cache) return -ENOMEM; - BUG_ON(ceph_msg_data_cache); - ceph_msg_data_cache = KMEM_CACHE(ceph_msg_data, 0); - if (ceph_msg_data_cache) - return 0; - - kmem_cache_destroy(ceph_msg_cache); - ceph_msg_cache = NULL; - - return -ENOMEM; + return 0; } static void ceph_msgr_slab_exit(void) { - BUG_ON(!ceph_msg_data_cache); - kmem_cache_destroy(ceph_msg_data_cache); - ceph_msg_data_cache = NULL; - BUG_ON(!ceph_msg_cache); kmem_cache_destroy(ceph_msg_cache); ceph_msg_cache = NULL; @@ -1141,16 +1128,13 @@ static void __ceph_msg_data_cursor_init(struct ceph_msg_data_cursor *cursor) static void ceph_msg_data_cursor_init(struct ceph_msg *msg, size_t length) { struct ceph_msg_data_cursor *cursor = &msg->cursor; - struct ceph_msg_data *data; BUG_ON(!length); BUG_ON(length > msg->data_length); - BUG_ON(list_empty(&msg->data)); + BUG_ON(!msg->num_data_items); - cursor->data_head = &msg->data; cursor->total_resid = length; - data = list_first_entry(&msg->data, struct ceph_msg_data, links); - cursor->data = data; + cursor->data = msg->data; __ceph_msg_data_cursor_init(cursor); } @@ -1231,8 +1215,7 @@ static void ceph_msg_data_advance(struct ceph_msg_data_cursor *cursor, if (!cursor->resid && cursor->total_resid) { WARN_ON(!cursor->last_piece); - BUG_ON(list_is_last(&cursor->data->links, cursor->data_head)); - cursor->data = list_next_entry(cursor->data, links); + cursor->data++; __ceph_msg_data_cursor_init(cursor); new_piece = true; } @@ -1248,9 +1231,6 @@ static size_t sizeof_footer(struct ceph_connection *con) static void prepare_message_data(struct ceph_msg *msg, u32 data_len) { - BUG_ON(!msg); - BUG_ON(!data_len); - /* Initialize data cursor */ ceph_msg_data_cursor_init(msg, (size_t)data_len); @@ -1590,7 +1570,7 @@ static int write_partial_message_data(struct ceph_connection *con) dout("%s %p msg %p\n", __func__, con, msg); - if (list_empty(&msg->data)) + if (!msg->num_data_items) return -EINVAL; /* @@ -2347,8 +2327,7 @@ static int read_partial_msg_data(struct ceph_connection *con) u32 crc = 0; int ret; - BUG_ON(!msg); - if (list_empty(&msg->data)) + if (!msg->num_data_items) return -EIO; if (do_datacrc) @@ -3256,32 +3235,16 @@ bool ceph_con_keepalive_expired(struct ceph_connection *con, return false; } -static struct ceph_msg_data *ceph_msg_data_create(enum ceph_msg_data_type type) +static struct ceph_msg_data *ceph_msg_data_add(struct ceph_msg *msg) { - struct ceph_msg_data *data; - - if (WARN_ON(!ceph_msg_data_type_valid(type))) - return NULL; - - data = kmem_cache_zalloc(ceph_msg_data_cache, GFP_NOFS); - if (!data) - return NULL; - - data->type = type; - INIT_LIST_HEAD(&data->links); - - return data; + BUG_ON(msg->num_data_items >= msg->max_data_items); + return &msg->data[msg->num_data_items++]; } static void ceph_msg_data_destroy(struct ceph_msg_data *data) { - if (!data) - return; - - WARN_ON(!list_empty(&data->links)); if (data->type == CEPH_MSG_DATA_PAGELIST) ceph_pagelist_release(data->pagelist); - kmem_cache_free(ceph_msg_data_cache, data); } void ceph_msg_data_add_pages(struct ceph_msg *msg, struct page **pages, @@ -3292,13 +3255,12 @@ void ceph_msg_data_add_pages(struct ceph_msg *msg, struct page **pages, BUG_ON(!pages); BUG_ON(!length); - data = ceph_msg_data_create(CEPH_MSG_DATA_PAGES); - BUG_ON(!data); + data = ceph_msg_data_add(msg); + data->type = CEPH_MSG_DATA_PAGES; data->pages = pages; data->length = length; data->alignment = alignment & ~PAGE_MASK; - list_add_tail(&data->links, &msg->data); msg->data_length += length; } EXPORT_SYMBOL(ceph_msg_data_add_pages); @@ -3311,11 +3273,11 @@ void ceph_msg_data_add_pagelist(struct ceph_msg *msg, BUG_ON(!pagelist); BUG_ON(!pagelist->length); - data = ceph_msg_data_create(CEPH_MSG_DATA_PAGELIST); - BUG_ON(!data); + data = ceph_msg_data_add(msg); + data->type = CEPH_MSG_DATA_PAGELIST; + refcount_inc(&pagelist->refcnt); data->pagelist = pagelist; - list_add_tail(&data->links, &msg->data); msg->data_length += pagelist->length; } EXPORT_SYMBOL(ceph_msg_data_add_pagelist); @@ -3326,12 +3288,11 @@ void ceph_msg_data_add_bio(struct ceph_msg *msg, struct ceph_bio_iter *bio_pos, { struct ceph_msg_data *data; - data = ceph_msg_data_create(CEPH_MSG_DATA_BIO); - BUG_ON(!data); + data = ceph_msg_data_add(msg); + data->type = CEPH_MSG_DATA_BIO; data->bio_pos = *bio_pos; data->bio_length = length; - list_add_tail(&data->links, &msg->data); msg->data_length += length; } EXPORT_SYMBOL(ceph_msg_data_add_bio); @@ -3342,11 +3303,10 @@ void ceph_msg_data_add_bvecs(struct ceph_msg *msg, { struct ceph_msg_data *data; - data = ceph_msg_data_create(CEPH_MSG_DATA_BVECS); - BUG_ON(!data); + data = ceph_msg_data_add(msg); + data->type = CEPH_MSG_DATA_BVECS; data->bvec_pos = *bvec_pos; - list_add_tail(&data->links, &msg->data); msg->data_length += bvec_pos->iter.bi_size; } EXPORT_SYMBOL(ceph_msg_data_add_bvecs); @@ -3355,8 +3315,8 @@ EXPORT_SYMBOL(ceph_msg_data_add_bvecs); * construct a new message with given type, size * the new msg has a ref count of 1. */ -struct ceph_msg *ceph_msg_new(int type, int front_len, gfp_t flags, - bool can_fail) +struct ceph_msg *ceph_msg_new2(int type, int front_len, int max_data_items, + gfp_t flags, bool can_fail) { struct ceph_msg *m; @@ -3370,7 +3330,6 @@ struct ceph_msg *ceph_msg_new(int type, int front_len, gfp_t flags, INIT_LIST_HEAD(&m->list_head); kref_init(&m->kref); - INIT_LIST_HEAD(&m->data); /* front */ if (front_len) { @@ -3385,6 +3344,15 @@ struct ceph_msg *ceph_msg_new(int type, int front_len, gfp_t flags, } m->front_alloc_len = m->front.iov_len = front_len; + if (max_data_items) { + m->data = kmalloc_array(max_data_items, sizeof(*m->data), + flags); + if (!m->data) + goto out2; + + m->max_data_items = max_data_items; + } + dout("ceph_msg_new %p front %d\n", m, front_len); return m; @@ -3401,6 +3369,13 @@ out: } return NULL; } +EXPORT_SYMBOL(ceph_msg_new2); + +struct ceph_msg *ceph_msg_new(int type, int front_len, gfp_t flags, + bool can_fail) +{ + return ceph_msg_new2(type, front_len, 0, flags, can_fail); +} EXPORT_SYMBOL(ceph_msg_new); /* @@ -3496,13 +3471,14 @@ static void ceph_msg_free(struct ceph_msg *m) { dout("%s %p\n", __func__, m); kvfree(m->front.iov_base); + kfree(m->data); kmem_cache_free(ceph_msg_cache, m); } static void ceph_msg_release(struct kref *kref) { struct ceph_msg *m = container_of(kref, struct ceph_msg, kref); - struct ceph_msg_data *data, *next; + int i; dout("%s %p\n", __func__, m); WARN_ON(!list_empty(&m->list_head)); @@ -3515,11 +3491,8 @@ static void ceph_msg_release(struct kref *kref) m->middle = NULL; } - list_for_each_entry_safe(data, next, &m->data, links) { - list_del_init(&data->links); - ceph_msg_data_destroy(data); - } - m->data_length = 0; + for (i = 0; i < m->num_data_items; i++) + ceph_msg_data_destroy(&m->data[i]); if (m->pool) ceph_msgpool_put(m->pool, m); diff --git a/net/ceph/msgpool.c b/net/ceph/msgpool.c index 72571535883f..e3ecb80cd182 100644 --- a/net/ceph/msgpool.c +++ b/net/ceph/msgpool.c @@ -14,7 +14,8 @@ static void *msgpool_alloc(gfp_t gfp_mask, void *arg) struct ceph_msgpool *pool = arg; struct ceph_msg *msg; - msg = ceph_msg_new(pool->type, pool->front_len, gfp_mask, true); + msg = ceph_msg_new2(pool->type, pool->front_len, pool->max_data_items, + gfp_mask, true); if (!msg) { dout("msgpool_alloc %s failed\n", pool->name); } else { @@ -35,11 +36,13 @@ static void msgpool_free(void *element, void *arg) } int ceph_msgpool_init(struct ceph_msgpool *pool, int type, - int front_len, int size, bool blocking, const char *name) + int front_len, int max_data_items, int size, + const char *name) { dout("msgpool %s init\n", name); pool->type = type; pool->front_len = front_len; + pool->max_data_items = max_data_items; pool->pool = mempool_create(size, msgpool_alloc, msgpool_free, pool); if (!pool->pool) return -ENOMEM; @@ -53,18 +56,21 @@ void ceph_msgpool_destroy(struct ceph_msgpool *pool) mempool_destroy(pool->pool); } -struct ceph_msg *ceph_msgpool_get(struct ceph_msgpool *pool, - int front_len) +struct ceph_msg *ceph_msgpool_get(struct ceph_msgpool *pool, int front_len, + int max_data_items) { struct ceph_msg *msg; - if (front_len > pool->front_len) { - dout("msgpool_get %s need front %d, pool size is %d\n", - pool->name, front_len, pool->front_len); - WARN_ON(1); + if (front_len > pool->front_len || + max_data_items > pool->max_data_items) { + pr_warn_ratelimited("%s need %d/%d, pool %s has %d/%d\n", + __func__, front_len, max_data_items, pool->name, + pool->front_len, pool->max_data_items); + WARN_ON_ONCE(1); /* try to alloc a fresh message */ - return ceph_msg_new(pool->type, front_len, GFP_NOFS, false); + return ceph_msg_new2(pool->type, front_len, max_data_items, + GFP_NOFS, false); } msg = mempool_alloc(pool->pool, GFP_NOFS); @@ -80,6 +86,9 @@ void ceph_msgpool_put(struct ceph_msgpool *pool, struct ceph_msg *msg) msg->front.iov_len = pool->front_len; msg->hdr.front_len = cpu_to_le32(pool->front_len); + msg->data_length = 0; + msg->num_data_items = 0; + kref_init(&msg->kref); /* retake single ref */ mempool_free(msg, pool->pool); } diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 60934bd8796c..d23a9f81f3d7 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -126,6 +126,9 @@ static void ceph_osd_data_init(struct ceph_osd_data *osd_data) osd_data->type = CEPH_OSD_DATA_TYPE_NONE; } +/* + * Consumes @pages if @own_pages is true. + */ static void ceph_osd_data_pages_init(struct ceph_osd_data *osd_data, struct page **pages, u64 length, u32 alignment, bool pages_from_pool, bool own_pages) @@ -138,6 +141,9 @@ static void ceph_osd_data_pages_init(struct ceph_osd_data *osd_data, osd_data->own_pages = own_pages; } +/* + * Consumes a ref on @pagelist. + */ static void ceph_osd_data_pagelist_init(struct ceph_osd_data *osd_data, struct ceph_pagelist *pagelist) { @@ -362,6 +368,8 @@ static void ceph_osd_data_release(struct ceph_osd_data *osd_data) num_pages = calc_pages_for((u64)osd_data->alignment, (u64)osd_data->length); ceph_release_page_vector(osd_data->pages, num_pages); + } else if (osd_data->type == CEPH_OSD_DATA_TYPE_PAGELIST) { + ceph_pagelist_release(osd_data->pagelist); } ceph_osd_data_init(osd_data); } @@ -402,6 +410,9 @@ static void osd_req_op_data_release(struct ceph_osd_request *osd_req, case CEPH_OSD_OP_LIST_WATCHERS: ceph_osd_data_release(&op->list_watchers.response_data); break; + case CEPH_OSD_OP_COPY_FROM: + ceph_osd_data_release(&op->copy_from.osd_data); + break; default: break; } @@ -606,12 +617,15 @@ static int ceph_oloc_encoding_size(const struct ceph_object_locator *oloc) return 8 + 4 + 4 + 4 + (oloc->pool_ns ? oloc->pool_ns->len : 0); } -int ceph_osdc_alloc_messages(struct ceph_osd_request *req, gfp_t gfp) +static int __ceph_osdc_alloc_messages(struct ceph_osd_request *req, gfp_t gfp, + int num_request_data_items, + int num_reply_data_items) { struct ceph_osd_client *osdc = req->r_osdc; struct ceph_msg *msg; int msg_size; + WARN_ON(req->r_request || req->r_reply); WARN_ON(ceph_oid_empty(&req->r_base_oid)); WARN_ON(ceph_oloc_empty(&req->r_base_oloc)); @@ -633,9 +647,11 @@ int ceph_osdc_alloc_messages(struct ceph_osd_request *req, gfp_t gfp) msg_size += 4 + 8; /* retry_attempt, features */ if (req->r_mempool) - msg = ceph_msgpool_get(&osdc->msgpool_op, 0); + msg = ceph_msgpool_get(&osdc->msgpool_op, msg_size, + num_request_data_items); else - msg = ceph_msg_new(CEPH_MSG_OSD_OP, msg_size, gfp, true); + msg = ceph_msg_new2(CEPH_MSG_OSD_OP, msg_size, + num_request_data_items, gfp, true); if (!msg) return -ENOMEM; @@ -648,9 +664,11 @@ int ceph_osdc_alloc_messages(struct ceph_osd_request *req, gfp_t gfp) msg_size += req->r_num_ops * sizeof(struct ceph_osd_op); if (req->r_mempool) - msg = ceph_msgpool_get(&osdc->msgpool_op_reply, 0); + msg = ceph_msgpool_get(&osdc->msgpool_op_reply, msg_size, + num_reply_data_items); else - msg = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, msg_size, gfp, true); + msg = ceph_msg_new2(CEPH_MSG_OSD_OPREPLY, msg_size, + num_reply_data_items, gfp, true); if (!msg) return -ENOMEM; @@ -658,7 +676,6 @@ int ceph_osdc_alloc_messages(struct ceph_osd_request *req, gfp_t gfp) return 0; } -EXPORT_SYMBOL(ceph_osdc_alloc_messages); static bool osd_req_opcode_valid(u16 opcode) { @@ -671,6 +688,65 @@ __CEPH_FORALL_OSD_OPS(GENERATE_CASE) } } +static void get_num_data_items(struct ceph_osd_request *req, + int *num_request_data_items, + int *num_reply_data_items) +{ + struct ceph_osd_req_op *op; + + *num_request_data_items = 0; + *num_reply_data_items = 0; + + for (op = req->r_ops; op != &req->r_ops[req->r_num_ops]; op++) { + switch (op->op) { + /* request */ + case CEPH_OSD_OP_WRITE: + case CEPH_OSD_OP_WRITEFULL: + case CEPH_OSD_OP_SETXATTR: + case CEPH_OSD_OP_CMPXATTR: + case CEPH_OSD_OP_NOTIFY_ACK: + case CEPH_OSD_OP_COPY_FROM: + *num_request_data_items += 1; + break; + + /* reply */ + case CEPH_OSD_OP_STAT: + case CEPH_OSD_OP_READ: + case CEPH_OSD_OP_LIST_WATCHERS: + *num_reply_data_items += 1; + break; + + /* both */ + case CEPH_OSD_OP_NOTIFY: + *num_request_data_items += 1; + *num_reply_data_items += 1; + break; + case CEPH_OSD_OP_CALL: + *num_request_data_items += 2; + *num_reply_data_items += 1; + break; + + default: + WARN_ON(!osd_req_opcode_valid(op->op)); + break; + } + } +} + +/* + * oid, oloc and OSD op opcode(s) must be filled in before this function + * is called. + */ +int ceph_osdc_alloc_messages(struct ceph_osd_request *req, gfp_t gfp) +{ + int num_request_data_items, num_reply_data_items; + + get_num_data_items(req, &num_request_data_items, &num_reply_data_items); + return __ceph_osdc_alloc_messages(req, gfp, num_request_data_items, + num_reply_data_items); +} +EXPORT_SYMBOL(ceph_osdc_alloc_messages); + /* * This is an osd op init function for opcodes that have no data or * other information associated with them. It also serves as a @@ -767,22 +843,19 @@ void osd_req_op_extent_dup_last(struct ceph_osd_request *osd_req, EXPORT_SYMBOL(osd_req_op_extent_dup_last); int osd_req_op_cls_init(struct ceph_osd_request *osd_req, unsigned int which, - u16 opcode, const char *class, const char *method) + const char *class, const char *method) { - struct ceph_osd_req_op *op = _osd_req_op_init(osd_req, which, - opcode, 0); + struct ceph_osd_req_op *op; struct ceph_pagelist *pagelist; size_t payload_len = 0; size_t size; - BUG_ON(opcode != CEPH_OSD_OP_CALL); + op = _osd_req_op_init(osd_req, which, CEPH_OSD_OP_CALL, 0); - pagelist = kmalloc(sizeof (*pagelist), GFP_NOFS); + pagelist = ceph_pagelist_alloc(GFP_NOFS); if (!pagelist) return -ENOMEM; - ceph_pagelist_init(pagelist); - op->cls.class_name = class; size = strlen(class); BUG_ON(size > (size_t) U8_MAX); @@ -815,12 +888,10 @@ int osd_req_op_xattr_init(struct ceph_osd_request *osd_req, unsigned int which, BUG_ON(opcode != CEPH_OSD_OP_SETXATTR && opcode != CEPH_OSD_OP_CMPXATTR); - pagelist = kmalloc(sizeof(*pagelist), GFP_NOFS); + pagelist = ceph_pagelist_alloc(GFP_NOFS); if (!pagelist) return -ENOMEM; - ceph_pagelist_init(pagelist); - payload_len = strlen(name); op->xattr.name_len = payload_len; ceph_pagelist_append(pagelist, name, payload_len); @@ -900,12 +971,6 @@ static void ceph_osdc_msg_data_add(struct ceph_msg *msg, static u32 osd_req_encode_op(struct ceph_osd_op *dst, const struct ceph_osd_req_op *src) { - if (WARN_ON(!osd_req_opcode_valid(src->op))) { - pr_err("unrecognized osd opcode %d\n", src->op); - - return 0; - } - switch (src->op) { case CEPH_OSD_OP_STAT: break; @@ -955,6 +1020,14 @@ static u32 osd_req_encode_op(struct ceph_osd_op *dst, case CEPH_OSD_OP_CREATE: case CEPH_OSD_OP_DELETE: break; + case CEPH_OSD_OP_COPY_FROM: + dst->copy_from.snapid = cpu_to_le64(src->copy_from.snapid); + dst->copy_from.src_version = + cpu_to_le64(src->copy_from.src_version); + dst->copy_from.flags = src->copy_from.flags; + dst->copy_from.src_fadvise_flags = + cpu_to_le32(src->copy_from.src_fadvise_flags); + break; default: pr_err("unsupported osd opcode %s\n", ceph_osd_op_name(src->op)); @@ -1038,7 +1111,15 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc, if (flags & CEPH_OSD_FLAG_WRITE) req->r_data_offset = off; - r = ceph_osdc_alloc_messages(req, GFP_NOFS); + if (num_ops > 1) + /* + * This is a special case for ceph_writepages_start(), but it + * also covers ceph_uninline_data(). If more multi-op request + * use cases emerge, we will need a separate helper. + */ + r = __ceph_osdc_alloc_messages(req, GFP_NOFS, num_ops, 0); + else + r = ceph_osdc_alloc_messages(req, GFP_NOFS); if (r) goto fail; @@ -1845,48 +1926,55 @@ static bool should_plug_request(struct ceph_osd_request *req) return true; } -static void setup_request_data(struct ceph_osd_request *req, - struct ceph_msg *msg) +/* + * Keep get_num_data_items() in sync with this function. + */ +static void setup_request_data(struct ceph_osd_request *req) { - u32 data_len = 0; - int i; + struct ceph_msg *request_msg = req->r_request; + struct ceph_msg *reply_msg = req->r_reply; + struct ceph_osd_req_op *op; - if (!list_empty(&msg->data)) + if (req->r_request->num_data_items || req->r_reply->num_data_items) return; - WARN_ON(msg->data_length); - for (i = 0; i < req->r_num_ops; i++) { - struct ceph_osd_req_op *op = &req->r_ops[i]; - + WARN_ON(request_msg->data_length || reply_msg->data_length); + for (op = req->r_ops; op != &req->r_ops[req->r_num_ops]; op++) { switch (op->op) { /* request */ case CEPH_OSD_OP_WRITE: case CEPH_OSD_OP_WRITEFULL: WARN_ON(op->indata_len != op->extent.length); - ceph_osdc_msg_data_add(msg, &op->extent.osd_data); + ceph_osdc_msg_data_add(request_msg, + &op->extent.osd_data); break; case CEPH_OSD_OP_SETXATTR: case CEPH_OSD_OP_CMPXATTR: WARN_ON(op->indata_len != op->xattr.name_len + op->xattr.value_len); - ceph_osdc_msg_data_add(msg, &op->xattr.osd_data); + ceph_osdc_msg_data_add(request_msg, + &op->xattr.osd_data); break; case CEPH_OSD_OP_NOTIFY_ACK: - ceph_osdc_msg_data_add(msg, + ceph_osdc_msg_data_add(request_msg, &op->notify_ack.request_data); break; + case CEPH_OSD_OP_COPY_FROM: + ceph_osdc_msg_data_add(request_msg, + &op->copy_from.osd_data); + break; /* reply */ case CEPH_OSD_OP_STAT: - ceph_osdc_msg_data_add(req->r_reply, + ceph_osdc_msg_data_add(reply_msg, &op->raw_data_in); break; case CEPH_OSD_OP_READ: - ceph_osdc_msg_data_add(req->r_reply, + ceph_osdc_msg_data_add(reply_msg, &op->extent.osd_data); break; case CEPH_OSD_OP_LIST_WATCHERS: - ceph_osdc_msg_data_add(req->r_reply, + ceph_osdc_msg_data_add(reply_msg, &op->list_watchers.response_data); break; @@ -1895,25 +1983,23 @@ static void setup_request_data(struct ceph_osd_request *req, WARN_ON(op->indata_len != op->cls.class_len + op->cls.method_len + op->cls.indata_len); - ceph_osdc_msg_data_add(msg, &op->cls.request_info); + ceph_osdc_msg_data_add(request_msg, + &op->cls.request_info); /* optional, can be NONE */ - ceph_osdc_msg_data_add(msg, &op->cls.request_data); + ceph_osdc_msg_data_add(request_msg, + &op->cls.request_data); /* optional, can be NONE */ - ceph_osdc_msg_data_add(req->r_reply, + ceph_osdc_msg_data_add(reply_msg, &op->cls.response_data); break; case CEPH_OSD_OP_NOTIFY: - ceph_osdc_msg_data_add(msg, + ceph_osdc_msg_data_add(request_msg, &op->notify.request_data); - ceph_osdc_msg_data_add(req->r_reply, + ceph_osdc_msg_data_add(reply_msg, &op->notify.response_data); break; } - - data_len += op->indata_len; } - - WARN_ON(data_len != msg->data_length); } static void encode_pgid(void **p, const struct ceph_pg *pgid) @@ -1961,7 +2047,7 @@ static void encode_request_partial(struct ceph_osd_request *req, req->r_data_offset || req->r_snapc); } - setup_request_data(req, msg); + setup_request_data(req); encode_spgid(&p, &req->r_t.spgid); /* actual spg */ ceph_encode_32(&p, req->r_t.pgid.seed); /* raw hash */ @@ -3001,11 +3087,21 @@ static void linger_submit(struct ceph_osd_linger_request *lreq) struct ceph_osd_client *osdc = lreq->osdc; struct ceph_osd *osd; + down_write(&osdc->lock); + linger_register(lreq); + if (lreq->is_watch) { + lreq->reg_req->r_ops[0].watch.cookie = lreq->linger_id; + lreq->ping_req->r_ops[0].watch.cookie = lreq->linger_id; + } else { + lreq->reg_req->r_ops[0].notify.cookie = lreq->linger_id; + } + calc_target(osdc, &lreq->t, NULL, false); osd = lookup_create_osd(osdc, lreq->t.osd, true); link_linger(osd, lreq); send_linger(lreq); + up_write(&osdc->lock); } static void cancel_linger_map_check(struct ceph_osd_linger_request *lreq) @@ -4318,9 +4414,7 @@ static void handle_watch_notify(struct ceph_osd_client *osdc, lreq->notify_id, notify_id); } else if (!completion_done(&lreq->notify_finish_wait)) { struct ceph_msg_data *data = - list_first_entry_or_null(&msg->data, - struct ceph_msg_data, - links); + msg->num_data_items ? &msg->data[0] : NULL; if (data) { if (lreq->preply_pages) { @@ -4476,6 +4570,23 @@ alloc_linger_request(struct ceph_osd_linger_request *lreq) ceph_oid_copy(&req->r_base_oid, &lreq->t.base_oid); ceph_oloc_copy(&req->r_base_oloc, &lreq->t.base_oloc); + return req; +} + +static struct ceph_osd_request * +alloc_watch_request(struct ceph_osd_linger_request *lreq, u8 watch_opcode) +{ + struct ceph_osd_request *req; + + req = alloc_linger_request(lreq); + if (!req) + return NULL; + + /* + * Pass 0 for cookie because we don't know it yet, it will be + * filled in by linger_submit(). + */ + osd_req_op_watch_init(req, 0, 0, watch_opcode); if (ceph_osdc_alloc_messages(req, GFP_NOIO)) { ceph_osdc_put_request(req); @@ -4514,27 +4625,19 @@ ceph_osdc_watch(struct ceph_osd_client *osdc, lreq->t.flags = CEPH_OSD_FLAG_WRITE; ktime_get_real_ts64(&lreq->mtime); - lreq->reg_req = alloc_linger_request(lreq); + lreq->reg_req = alloc_watch_request(lreq, CEPH_OSD_WATCH_OP_WATCH); if (!lreq->reg_req) { ret = -ENOMEM; goto err_put_lreq; } - lreq->ping_req = alloc_linger_request(lreq); + lreq->ping_req = alloc_watch_request(lreq, CEPH_OSD_WATCH_OP_PING); if (!lreq->ping_req) { ret = -ENOMEM; goto err_put_lreq; } - down_write(&osdc->lock); - linger_register(lreq); /* before osd_req_op_* */ - osd_req_op_watch_init(lreq->reg_req, 0, lreq->linger_id, - CEPH_OSD_WATCH_OP_WATCH); - osd_req_op_watch_init(lreq->ping_req, 0, lreq->linger_id, - CEPH_OSD_WATCH_OP_PING); linger_submit(lreq); - up_write(&osdc->lock); - ret = linger_reg_commit_wait(lreq); if (ret) { linger_cancel(lreq); @@ -4599,11 +4702,10 @@ static int osd_req_op_notify_ack_init(struct ceph_osd_request *req, int which, op = _osd_req_op_init(req, which, CEPH_OSD_OP_NOTIFY_ACK, 0); - pl = kmalloc(sizeof(*pl), GFP_NOIO); + pl = ceph_pagelist_alloc(GFP_NOIO); if (!pl) return -ENOMEM; - ceph_pagelist_init(pl); ret = ceph_pagelist_encode_64(pl, notify_id); ret |= ceph_pagelist_encode_64(pl, cookie); if (payload) { @@ -4641,12 +4743,12 @@ int ceph_osdc_notify_ack(struct ceph_osd_client *osdc, ceph_oloc_copy(&req->r_base_oloc, oloc); req->r_flags = CEPH_OSD_FLAG_READ; - ret = ceph_osdc_alloc_messages(req, GFP_NOIO); + ret = osd_req_op_notify_ack_init(req, 0, notify_id, cookie, payload, + payload_len); if (ret) goto out_put_req; - ret = osd_req_op_notify_ack_init(req, 0, notify_id, cookie, payload, - payload_len); + ret = ceph_osdc_alloc_messages(req, GFP_NOIO); if (ret) goto out_put_req; @@ -4670,11 +4772,10 @@ static int osd_req_op_notify_init(struct ceph_osd_request *req, int which, op = _osd_req_op_init(req, which, CEPH_OSD_OP_NOTIFY, 0); op->notify.cookie = cookie; - pl = kmalloc(sizeof(*pl), GFP_NOIO); + pl = ceph_pagelist_alloc(GFP_NOIO); if (!pl) return -ENOMEM; - ceph_pagelist_init(pl); ret = ceph_pagelist_encode_32(pl, 1); /* prot_ver */ ret |= ceph_pagelist_encode_32(pl, timeout); ret |= ceph_pagelist_encode_32(pl, payload_len); @@ -4733,29 +4834,30 @@ int ceph_osdc_notify(struct ceph_osd_client *osdc, goto out_put_lreq; } + /* + * Pass 0 for cookie because we don't know it yet, it will be + * filled in by linger_submit(). + */ + ret = osd_req_op_notify_init(lreq->reg_req, 0, 0, 1, timeout, + payload, payload_len); + if (ret) + goto out_put_lreq; + /* for notify_id */ pages = ceph_alloc_page_vector(1, GFP_NOIO); if (IS_ERR(pages)) { ret = PTR_ERR(pages); goto out_put_lreq; } - - down_write(&osdc->lock); - linger_register(lreq); /* before osd_req_op_* */ - ret = osd_req_op_notify_init(lreq->reg_req, 0, lreq->linger_id, 1, - timeout, payload, payload_len); - if (ret) { - linger_unregister(lreq); - up_write(&osdc->lock); - ceph_release_page_vector(pages, 1); - goto out_put_lreq; - } ceph_osd_data_pages_init(osd_req_op_data(lreq->reg_req, 0, notify, response_data), pages, PAGE_SIZE, 0, false, true); - linger_submit(lreq); - up_write(&osdc->lock); + ret = ceph_osdc_alloc_messages(lreq->reg_req, GFP_NOIO); + if (ret) + goto out_put_lreq; + + linger_submit(lreq); ret = linger_reg_commit_wait(lreq); if (!ret) ret = linger_notify_finish_wait(lreq); @@ -4881,10 +4983,6 @@ int ceph_osdc_list_watchers(struct ceph_osd_client *osdc, ceph_oloc_copy(&req->r_base_oloc, oloc); req->r_flags = CEPH_OSD_FLAG_READ; - ret = ceph_osdc_alloc_messages(req, GFP_NOIO); - if (ret) - goto out_put_req; - pages = ceph_alloc_page_vector(1, GFP_NOIO); if (IS_ERR(pages)) { ret = PTR_ERR(pages); @@ -4896,6 +4994,10 @@ int ceph_osdc_list_watchers(struct ceph_osd_client *osdc, response_data), pages, PAGE_SIZE, 0, false, true); + ret = ceph_osdc_alloc_messages(req, GFP_NOIO); + if (ret) + goto out_put_req; + ceph_osdc_start_request(osdc, req, false); ret = ceph_osdc_wait_request(osdc, req); if (ret >= 0) { @@ -4958,11 +5060,7 @@ int ceph_osdc_call(struct ceph_osd_client *osdc, ceph_oloc_copy(&req->r_base_oloc, oloc); req->r_flags = flags; - ret = ceph_osdc_alloc_messages(req, GFP_NOIO); - if (ret) - goto out_put_req; - - ret = osd_req_op_cls_init(req, 0, CEPH_OSD_OP_CALL, class, method); + ret = osd_req_op_cls_init(req, 0, class, method); if (ret) goto out_put_req; @@ -4973,6 +5071,10 @@ int ceph_osdc_call(struct ceph_osd_client *osdc, osd_req_op_cls_response_data_pages(req, 0, &resp_page, *resp_len, 0, false, false); + ret = ceph_osdc_alloc_messages(req, GFP_NOIO); + if (ret) + goto out_put_req; + ceph_osdc_start_request(osdc, req, false); ret = ceph_osdc_wait_request(osdc, req); if (ret >= 0) { @@ -5021,11 +5123,12 @@ int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client) goto out_map; err = ceph_msgpool_init(&osdc->msgpool_op, CEPH_MSG_OSD_OP, - PAGE_SIZE, 10, true, "osd_op"); + PAGE_SIZE, CEPH_OSD_SLAB_OPS, 10, "osd_op"); if (err < 0) goto out_mempool; err = ceph_msgpool_init(&osdc->msgpool_op_reply, CEPH_MSG_OSD_OPREPLY, - PAGE_SIZE, 10, true, "osd_op_reply"); + PAGE_SIZE, CEPH_OSD_SLAB_OPS, 10, + "osd_op_reply"); if (err < 0) goto out_msgpool; @@ -5168,6 +5271,80 @@ int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct ceph_vino vino, } EXPORT_SYMBOL(ceph_osdc_writepages); +static int osd_req_op_copy_from_init(struct ceph_osd_request *req, + u64 src_snapid, u64 src_version, + struct ceph_object_id *src_oid, + struct ceph_object_locator *src_oloc, + u32 src_fadvise_flags, + u32 dst_fadvise_flags, + u8 copy_from_flags) +{ + struct ceph_osd_req_op *op; + struct page **pages; + void *p, *end; + + pages = ceph_alloc_page_vector(1, GFP_KERNEL); + if (IS_ERR(pages)) + return PTR_ERR(pages); + + op = _osd_req_op_init(req, 0, CEPH_OSD_OP_COPY_FROM, dst_fadvise_flags); + op->copy_from.snapid = src_snapid; + op->copy_from.src_version = src_version; + op->copy_from.flags = copy_from_flags; + op->copy_from.src_fadvise_flags = src_fadvise_flags; + + p = page_address(pages[0]); + end = p + PAGE_SIZE; + ceph_encode_string(&p, end, src_oid->name, src_oid->name_len); + encode_oloc(&p, end, src_oloc); + op->indata_len = PAGE_SIZE - (end - p); + + ceph_osd_data_pages_init(&op->copy_from.osd_data, pages, + op->indata_len, 0, false, true); + return 0; +} + +int ceph_osdc_copy_from(struct ceph_osd_client *osdc, + u64 src_snapid, u64 src_version, + struct ceph_object_id *src_oid, + struct ceph_object_locator *src_oloc, + u32 src_fadvise_flags, + struct ceph_object_id *dst_oid, + struct ceph_object_locator *dst_oloc, + u32 dst_fadvise_flags, + u8 copy_from_flags) +{ + struct ceph_osd_request *req; + int ret; + + req = ceph_osdc_alloc_request(osdc, NULL, 1, false, GFP_KERNEL); + if (!req) + return -ENOMEM; + + req->r_flags = CEPH_OSD_FLAG_WRITE; + + ceph_oloc_copy(&req->r_t.base_oloc, dst_oloc); + ceph_oid_copy(&req->r_t.base_oid, dst_oid); + + ret = osd_req_op_copy_from_init(req, src_snapid, src_version, src_oid, + src_oloc, src_fadvise_flags, + dst_fadvise_flags, copy_from_flags); + if (ret) + goto out; + + ret = ceph_osdc_alloc_messages(req, GFP_KERNEL); + if (ret) + goto out; + + ceph_osdc_start_request(osdc, req, false); + ret = ceph_osdc_wait_request(osdc, req); + +out: + ceph_osdc_put_request(req); + return ret; +} +EXPORT_SYMBOL(ceph_osdc_copy_from); + int __init ceph_osdc_setup(void) { size_t size = sizeof(struct ceph_osd_request) + @@ -5295,7 +5472,7 @@ static struct ceph_msg *alloc_msg_with_page_vector(struct ceph_msg_header *hdr) u32 front_len = le32_to_cpu(hdr->front_len); u32 data_len = le32_to_cpu(hdr->data_len); - m = ceph_msg_new(type, front_len, GFP_NOIO, false); + m = ceph_msg_new2(type, front_len, 1, GFP_NOIO, false); if (!m) return NULL; diff --git a/net/ceph/pagelist.c b/net/ceph/pagelist.c index 2ea0564771d2..65e34f78b05d 100644 --- a/net/ceph/pagelist.c +++ b/net/ceph/pagelist.c @@ -6,6 +6,26 @@ #include <linux/highmem.h> #include <linux/ceph/pagelist.h> +struct ceph_pagelist *ceph_pagelist_alloc(gfp_t gfp_flags) +{ + struct ceph_pagelist *pl; + + pl = kmalloc(sizeof(*pl), gfp_flags); + if (!pl) + return NULL; + + INIT_LIST_HEAD(&pl->head); + pl->mapped_tail = NULL; + pl->length = 0; + pl->room = 0; + INIT_LIST_HEAD(&pl->free_list); + pl->num_pages_free = 0; + refcount_set(&pl->refcnt, 1); + + return pl; +} +EXPORT_SYMBOL(ceph_pagelist_alloc); + static void ceph_pagelist_unmap_tail(struct ceph_pagelist *pl) { if (pl->mapped_tail) { diff --git a/net/compat.c b/net/compat.c index 3b2105f6549d..47a614b370cd 100644 --- a/net/compat.c +++ b/net/compat.c @@ -812,21 +812,21 @@ COMPAT_SYSCALL_DEFINE6(recvfrom, int, fd, void __user *, buf, compat_size_t, len static int __compat_sys_recvmmsg(int fd, struct compat_mmsghdr __user *mmsg, unsigned int vlen, unsigned int flags, - struct compat_timespec __user *timeout) + struct old_timespec32 __user *timeout) { int datagrams; - struct timespec ktspec; + struct timespec64 ktspec; if (timeout == NULL) return __sys_recvmmsg(fd, (struct mmsghdr __user *)mmsg, vlen, flags | MSG_CMSG_COMPAT, NULL); - if (compat_get_timespec(&ktspec, timeout)) + if (compat_get_timespec64(&ktspec, timeout)) return -EFAULT; datagrams = __sys_recvmmsg(fd, (struct mmsghdr __user *)mmsg, vlen, flags | MSG_CMSG_COMPAT, &ktspec); - if (datagrams > 0 && compat_put_timespec(&ktspec, timeout)) + if (datagrams > 0 && compat_put_timespec64(&ktspec, timeout)) datagrams = -EFAULT; return datagrams; @@ -834,7 +834,7 @@ static int __compat_sys_recvmmsg(int fd, struct compat_mmsghdr __user *mmsg, COMPAT_SYSCALL_DEFINE5(recvmmsg, int, fd, struct compat_mmsghdr __user *, mmsg, unsigned int, vlen, unsigned int, flags, - struct compat_timespec __user *, timeout) + struct old_timespec32 __user *, timeout) { return __compat_sys_recvmmsg(fd, mmsg, vlen, flags, timeout); } diff --git a/net/core/dev.c b/net/core/dev.c index 022ad73d6253..0ffcbdd55fa9 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3272,7 +3272,7 @@ struct sk_buff *dev_hard_start_xmit(struct sk_buff *first, struct net_device *de } skb = next; - if (netif_xmit_stopped(txq) && skb) { + if (netif_tx_queue_stopped(txq) && skb) { rc = NETDEV_TX_BUSY; break; } @@ -5457,7 +5457,7 @@ static void gro_flush_oldest(struct list_head *head) /* Do not adjust napi->gro_hash[].count, caller is adding a new * SKB to the chain. */ - list_del(&oldest->list); + skb_list_del_init(oldest); napi_gro_complete(oldest); } diff --git a/net/core/filter.c b/net/core/filter.c index 35c6933c2622..e521c5ebc7d1 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -5264,8 +5264,6 @@ sk_msg_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_msg_pull_data_proto; case BPF_FUNC_msg_push_data: return &bpf_msg_push_data_proto; - case BPF_FUNC_get_local_storage: - return &bpf_get_local_storage_proto; default: return bpf_base_func_proto(func_id); } @@ -5296,8 +5294,6 @@ sk_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_sk_redirect_map_proto; case BPF_FUNC_sk_redirect_hash: return &bpf_sk_redirect_hash_proto; - case BPF_FUNC_get_local_storage: - return &bpf_get_local_storage_proto; #ifdef CONFIG_INET case BPF_FUNC_sk_lookup_tcp: return &bpf_sk_lookup_tcp_proto; @@ -5496,7 +5492,13 @@ static bool cg_skb_is_valid_access(int off, int size, case bpf_ctx_range(struct __sk_buff, data_meta): case bpf_ctx_range(struct __sk_buff, flow_keys): return false; + case bpf_ctx_range(struct __sk_buff, data): + case bpf_ctx_range(struct __sk_buff, data_end): + if (!capable(CAP_SYS_ADMIN)) + return false; + break; } + if (type == BPF_WRITE) { switch (off) { case bpf_ctx_range(struct __sk_buff, mark): @@ -5638,6 +5640,15 @@ static bool sock_filter_is_valid_access(int off, int size, prog->expected_attach_type); } +static int bpf_noop_prologue(struct bpf_insn *insn_buf, bool direct_write, + const struct bpf_prog *prog) +{ + /* Neither direct read nor direct write requires any preliminary + * action. + */ + return 0; +} + static int bpf_unclone_prologue(struct bpf_insn *insn_buf, bool direct_write, const struct bpf_prog *prog, int drop_verdict) { @@ -7204,6 +7215,7 @@ const struct bpf_verifier_ops xdp_verifier_ops = { .get_func_proto = xdp_func_proto, .is_valid_access = xdp_is_valid_access, .convert_ctx_access = xdp_convert_ctx_access, + .gen_prologue = bpf_noop_prologue, }; const struct bpf_prog_ops xdp_prog_ops = { @@ -7302,6 +7314,7 @@ const struct bpf_verifier_ops sk_msg_verifier_ops = { .get_func_proto = sk_msg_func_proto, .is_valid_access = sk_msg_is_valid_access, .convert_ctx_access = sk_msg_convert_ctx_access, + .gen_prologue = bpf_noop_prologue, }; const struct bpf_prog_ops sk_msg_prog_ops = { diff --git a/net/core/neighbour.c b/net/core/neighbour.c index ee605d9d8bd4..41954e42a2de 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -2364,7 +2364,7 @@ static bool neigh_master_filtered(struct net_device *dev, int master_idx) if (!master_idx) return false; - master = netdev_master_upper_dev_get(dev); + master = dev ? netdev_master_upper_dev_get(dev) : NULL; if (!master || master->ifindex != master_idx) return true; @@ -2373,7 +2373,7 @@ static bool neigh_master_filtered(struct net_device *dev, int master_idx) static bool neigh_ifindex_filtered(struct net_device *dev, int filter_idx) { - if (filter_idx && dev->ifindex != filter_idx) + if (filter_idx && (!dev || dev->ifindex != filter_idx)) return true; return false; diff --git a/net/core/netclassid_cgroup.c b/net/core/netclassid_cgroup.c index 5e4f04004a49..7bf833598615 100644 --- a/net/core/netclassid_cgroup.c +++ b/net/core/netclassid_cgroup.c @@ -106,6 +106,7 @@ static int write_classid(struct cgroup_subsys_state *css, struct cftype *cft, iterate_fd(p->files, 0, update_classid_sock, (void *)(unsigned long)cs->classid); task_unlock(p); + cond_resched(); } css_task_iter_end(&it); diff --git a/net/core/netpoll.c b/net/core/netpoll.c index 5da9552b186b..2b9fdbc43205 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -717,7 +717,8 @@ int netpoll_setup(struct netpoll *np) read_lock_bh(&idev->lock); list_for_each_entry(ifp, &idev->addr_list, if_list) { - if (ipv6_addr_type(&ifp->addr) & IPV6_ADDR_LINKLOCAL) + if (!!(ipv6_addr_type(&ifp->addr) & IPV6_ADDR_LINKLOCAL) != + !!(ipv6_addr_type(&np->remote_ip.in6) & IPV6_ADDR_LINKLOCAL)) continue; np->local_ip.in6 = ifp->addr; err = 0; diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index f679c7a7d761..33d9227a8b80 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -3367,7 +3367,7 @@ static int rtnl_dump_all(struct sk_buff *skb, struct netlink_callback *cb) cb->seq = 0; } ret = dumpit(skb, cb); - if (ret < 0) + if (ret) break; } cb->family = idx; @@ -3600,6 +3600,11 @@ static int rtnl_fdb_add(struct sk_buff *skb, struct nlmsghdr *nlh, return -EINVAL; } + if (dev->type != ARPHRD_ETHER) { + NL_SET_ERR_MSG(extack, "FDB add only supported for Ethernet devices"); + return -EINVAL; + } + addr = nla_data(tb[NDA_LLADDR]); err = fdb_vid_parse(tb[NDA_VLAN], &vid, extack); @@ -3704,6 +3709,11 @@ static int rtnl_fdb_del(struct sk_buff *skb, struct nlmsghdr *nlh, return -EINVAL; } + if (dev->type != ARPHRD_ETHER) { + NL_SET_ERR_MSG(extack, "FDB delete only supported for Ethernet devices"); + return -EINVAL; + } + addr = nla_data(tb[NDA_LLADDR]); err = fdb_vid_parse(tb[NDA_VLAN], &vid, extack); diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 946de0e24c87..b4ee5c8b928f 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -4944,6 +4944,8 @@ static unsigned int skb_gso_mac_seglen(const struct sk_buff *skb) * * This is a helper to do that correctly considering GSO_BY_FRAGS. * + * @skb: GSO skb + * * @seg_len: The segmented length (from skb_gso_*_seglen). In the * GSO_BY_FRAGS case this will be [header sizes + GSO_BY_FRAGS]. * diff --git a/net/core/sock.c b/net/core/sock.c index 6fcc4bc07d19..080a880a1761 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -3279,6 +3279,7 @@ int sock_load_diag_module(int family, int protocol) #ifdef CONFIG_INET if (family == AF_INET && + protocol != IPPROTO_RAW && !rcu_access_pointer(inet_protos[protocol])) return -ENOENT; #endif diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index b1a2c5e38530..37b4667128a3 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -279,7 +279,6 @@ static int proc_dointvec_minmax_bpf_enable(struct ctl_table *table, int write, return ret; } -# ifdef CONFIG_HAVE_EBPF_JIT static int proc_dointvec_minmax_bpf_restricted(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, @@ -290,7 +289,6 @@ proc_dointvec_minmax_bpf_restricted(struct ctl_table *table, int write, return proc_dointvec_minmax(table, write, buffer, lenp, ppos); } -# endif #endif static struct ctl_table net_core_table[] = { @@ -397,6 +395,14 @@ static struct ctl_table net_core_table[] = { .extra2 = &one, }, # endif + { + .procname = "bpf_jit_limit", + .data = &bpf_jit_limit, + .maxlen = sizeof(int), + .mode = 0600, + .proc_handler = proc_dointvec_minmax_bpf_restricted, + .extra1 = &one, + }, #endif { .procname = "netdev_tstamp_prequeue", diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index 9250b309c742..a34602ae27de 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -1704,6 +1704,7 @@ static int inet_valid_dump_ifaddr_req(const struct nlmsghdr *nlh, net = rtnl_get_net_ns_capable(sk, fillargs->netnsid); if (IS_ERR(net)) { + fillargs->netnsid = -1; NL_SET_ERR_MSG(extack, "ipv4: Invalid target network namespace id"); return PTR_ERR(net); } diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index 4da39446da2d..765b2b32c4a4 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -111,13 +111,10 @@ #ifdef CONFIG_IP_MULTICAST /* Parameter names and values are taken from igmp-v2-06 draft */ -#define IGMP_V1_ROUTER_PRESENT_TIMEOUT (400*HZ) -#define IGMP_V2_ROUTER_PRESENT_TIMEOUT (400*HZ) #define IGMP_V2_UNSOLICITED_REPORT_INTERVAL (10*HZ) #define IGMP_V3_UNSOLICITED_REPORT_INTERVAL (1*HZ) +#define IGMP_QUERY_INTERVAL (125*HZ) #define IGMP_QUERY_RESPONSE_INTERVAL (10*HZ) -#define IGMP_QUERY_ROBUSTNESS_VARIABLE 2 - #define IGMP_INITIAL_REPORT_DELAY (1) @@ -935,13 +932,15 @@ static bool igmp_heard_query(struct in_device *in_dev, struct sk_buff *skb, max_delay = IGMP_QUERY_RESPONSE_INTERVAL; in_dev->mr_v1_seen = jiffies + - IGMP_V1_ROUTER_PRESENT_TIMEOUT; + (in_dev->mr_qrv * in_dev->mr_qi) + + in_dev->mr_qri; group = 0; } else { /* v2 router present */ max_delay = ih->code*(HZ/IGMP_TIMER_SCALE); in_dev->mr_v2_seen = jiffies + - IGMP_V2_ROUTER_PRESENT_TIMEOUT; + (in_dev->mr_qrv * in_dev->mr_qi) + + in_dev->mr_qri; } /* cancel the interface change timer */ in_dev->mr_ifc_count = 0; @@ -981,8 +980,21 @@ static bool igmp_heard_query(struct in_device *in_dev, struct sk_buff *skb, if (!max_delay) max_delay = 1; /* can't mod w/ 0 */ in_dev->mr_maxdelay = max_delay; - if (ih3->qrv) - in_dev->mr_qrv = ih3->qrv; + + /* RFC3376, 4.1.6. QRV and 4.1.7. QQIC, when the most recently + * received value was zero, use the default or statically + * configured value. + */ + in_dev->mr_qrv = ih3->qrv ?: net->ipv4.sysctl_igmp_qrv; + in_dev->mr_qi = IGMPV3_QQIC(ih3->qqic)*HZ ?: IGMP_QUERY_INTERVAL; + + /* RFC3376, 8.3. Query Response Interval: + * The number of seconds represented by the [Query Response + * Interval] must be less than the [Query Interval]. + */ + if (in_dev->mr_qri >= in_dev->mr_qi) + in_dev->mr_qri = (in_dev->mr_qi/HZ - 1)*HZ; + if (!group) { /* general query */ if (ih3->nsrcs) return true; /* no sources allowed */ @@ -1723,18 +1735,30 @@ void ip_mc_down(struct in_device *in_dev) ip_mc_dec_group(in_dev, IGMP_ALL_HOSTS); } -void ip_mc_init_dev(struct in_device *in_dev) -{ #ifdef CONFIG_IP_MULTICAST +static void ip_mc_reset(struct in_device *in_dev) +{ struct net *net = dev_net(in_dev->dev); + + in_dev->mr_qi = IGMP_QUERY_INTERVAL; + in_dev->mr_qri = IGMP_QUERY_RESPONSE_INTERVAL; + in_dev->mr_qrv = net->ipv4.sysctl_igmp_qrv; +} +#else +static void ip_mc_reset(struct in_device *in_dev) +{ +} #endif + +void ip_mc_init_dev(struct in_device *in_dev) +{ ASSERT_RTNL(); #ifdef CONFIG_IP_MULTICAST timer_setup(&in_dev->mr_gq_timer, igmp_gq_timer_expire, 0); timer_setup(&in_dev->mr_ifc_timer, igmp_ifc_timer_expire, 0); - in_dev->mr_qrv = net->ipv4.sysctl_igmp_qrv; #endif + ip_mc_reset(in_dev); spin_lock_init(&in_dev->mc_tomb_lock); } @@ -1744,15 +1768,10 @@ void ip_mc_init_dev(struct in_device *in_dev) void ip_mc_up(struct in_device *in_dev) { struct ip_mc_list *pmc; -#ifdef CONFIG_IP_MULTICAST - struct net *net = dev_net(in_dev->dev); -#endif ASSERT_RTNL(); -#ifdef CONFIG_IP_MULTICAST - in_dev->mr_qrv = net->ipv4.sysctl_igmp_qrv; -#endif + ip_mc_reset(in_dev); ip_mc_inc_group(in_dev, IGMP_ALL_HOSTS); for_each_pmc_rtnl(in_dev, pmc) { diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index f5c9ef2586de..411dd7a90046 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -19,7 +19,7 @@ #include <linux/slab.h> #include <linux/wait.h> #include <linux/vmalloc.h> -#include <linux/bootmem.h> +#include <linux/memblock.h> #include <net/addrconf.h> #include <net/inet_connection_sock.h> diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index 9b0158fa431f..d6ee343fdb86 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -722,10 +722,14 @@ struct sk_buff *ip_check_defrag(struct net *net, struct sk_buff *skb, u32 user) if (ip_is_fragment(&iph)) { skb = skb_share_check(skb, GFP_ATOMIC); if (skb) { - if (!pskb_may_pull(skb, netoff + iph.ihl * 4)) - return skb; - if (pskb_trim_rcsum(skb, netoff + len)) - return skb; + if (!pskb_may_pull(skb, netoff + iph.ihl * 4)) { + kfree_skb(skb); + return NULL; + } + if (pskb_trim_rcsum(skb, netoff + len)) { + kfree_skb(skb); + return NULL; + } memset(IPCB(skb), 0, sizeof(struct inet_skb_parm)); if (ip_defrag(net, skb, user)) return NULL; diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index 26c36cccabdc..fffcc130900e 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -1246,7 +1246,7 @@ int ip_setsockopt(struct sock *sk, int level, return -ENOPROTOOPT; err = do_ip_setsockopt(sk, level, optname, optval, optlen); -#ifdef CONFIG_BPFILTER +#if IS_ENABLED(CONFIG_BPFILTER_UMH) if (optname >= BPFILTER_IPT_SO_SET_REPLACE && optname < BPFILTER_IPT_SET_MAX) err = bpfilter_ip_set_sockopt(sk, optname, optval, optlen); @@ -1559,7 +1559,7 @@ int ip_getsockopt(struct sock *sk, int level, int err; err = do_ip_getsockopt(sk, level, optname, optval, optlen, 0); -#ifdef CONFIG_BPFILTER +#if IS_ENABLED(CONFIG_BPFILTER_UMH) if (optname >= BPFILTER_IPT_SO_GET_INFO && optname < BPFILTER_IPT_GET_MAX) err = bpfilter_ip_get_sockopt(sk, optname, optval, optlen); @@ -1596,7 +1596,7 @@ int compat_ip_getsockopt(struct sock *sk, int level, int optname, err = do_ip_getsockopt(sk, level, optname, optval, optlen, MSG_CMSG_COMPAT); -#ifdef CONFIG_BPFILTER +#if IS_ENABLED(CONFIG_BPFILTER_UMH) if (optname >= BPFILTER_IPT_SO_GET_INFO && optname < BPFILTER_IPT_GET_MAX) err = bpfilter_ip_get_sockopt(sk, optname, optval, optlen); diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 1834818ed07b..9e6bc4d6daa7 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -262,7 +262,7 @@ #include <linux/net.h> #include <linux/socket.h> #include <linux/random.h> -#include <linux/bootmem.h> +#include <linux/memblock.h> #include <linux/highmem.h> #include <linux/swap.h> #include <linux/cache.h> diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c index b7918d4caa30..3b45fe530f91 100644 --- a/net/ipv4/tcp_bpf.c +++ b/net/ipv4/tcp_bpf.c @@ -145,6 +145,7 @@ msg_bytes_ready: ret = err; goto out; } + copied = -EAGAIN; } ret = copied; out: diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 7e048288fcab..1976fddb9e00 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -81,7 +81,7 @@ #include <linux/uaccess.h> #include <asm/ioctls.h> -#include <linux/bootmem.h> +#include <linux/memblock.h> #include <linux/highmem.h> #include <linux/swap.h> #include <linux/types.h> @@ -609,8 +609,8 @@ void __udp4_lib_err(struct sk_buff *skb, u32 info, struct udp_table *udptable) struct net *net = dev_net(skb->dev); sk = __udp4_lib_lookup(net, iph->daddr, uh->dest, - iph->saddr, uh->source, skb->dev->ifindex, 0, - udptable, NULL); + iph->saddr, uh->source, skb->dev->ifindex, + inet_sdif(skb), udptable, NULL); if (!sk) { __ICMP_INC_STATS(net, ICMP_MIB_INERRORS); return; /* No socket for error */ diff --git a/net/ipv4/udp_diag.c b/net/ipv4/udp_diag.c index d9ad986c7b2c..5cbb9be05295 100644 --- a/net/ipv4/udp_diag.c +++ b/net/ipv4/udp_diag.c @@ -42,6 +42,7 @@ static int udp_dump_one(struct udp_table *tbl, struct sk_buff *in_skb, rcu_read_lock(); if (req->sdiag_family == AF_INET) + /* src and dst are swapped for historical reasons */ sk = __udp4_lib_lookup(net, req->id.idiag_src[0], req->id.idiag_sport, req->id.idiag_dst[0], req->id.idiag_dport, diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 7eb09c86fa13..63a808d5af15 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -5058,6 +5058,7 @@ static int inet6_valid_dump_ifaddr_req(const struct nlmsghdr *nlh, fillargs->netnsid = nla_get_s32(tb[i]); net = rtnl_get_net_ns_capable(sk, fillargs->netnsid); if (IS_ERR(net)) { + fillargs->netnsid = -1; NL_SET_ERR_MSG_MOD(extack, "Invalid target network namespace id"); return PTR_ERR(net); } diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index 3f4d61017a69..f0cd291034f0 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -1001,6 +1001,9 @@ static int __init inet6_init(void) err = ip6_flowlabel_init(); if (err) goto ip6_flowlabel_fail; + err = ipv6_anycast_init(); + if (err) + goto ipv6_anycast_fail; err = addrconf_init(); if (err) goto addrconf_fail; @@ -1091,6 +1094,8 @@ ipv6_frag_fail: ipv6_exthdrs_fail: addrconf_cleanup(); addrconf_fail: + ipv6_anycast_cleanup(); +ipv6_anycast_fail: ip6_flowlabel_cleanup(); ip6_flowlabel_fail: ndisc_late_cleanup(); diff --git a/net/ipv6/anycast.c b/net/ipv6/anycast.c index 4e0ff7031edd..94999058e110 100644 --- a/net/ipv6/anycast.c +++ b/net/ipv6/anycast.c @@ -44,8 +44,22 @@ #include <net/checksum.h> +#define IN6_ADDR_HSIZE_SHIFT 8 +#define IN6_ADDR_HSIZE BIT(IN6_ADDR_HSIZE_SHIFT) +/* anycast address hash table + */ +static struct hlist_head inet6_acaddr_lst[IN6_ADDR_HSIZE]; +static DEFINE_SPINLOCK(acaddr_hash_lock); + static int ipv6_dev_ac_dec(struct net_device *dev, const struct in6_addr *addr); +static u32 inet6_acaddr_hash(struct net *net, const struct in6_addr *addr) +{ + u32 val = ipv6_addr_hash(addr) ^ net_hash_mix(net); + + return hash_32(val, IN6_ADDR_HSIZE_SHIFT); +} + /* * socket join an anycast group */ @@ -204,16 +218,39 @@ void ipv6_sock_ac_close(struct sock *sk) rtnl_unlock(); } +static void ipv6_add_acaddr_hash(struct net *net, struct ifacaddr6 *aca) +{ + unsigned int hash = inet6_acaddr_hash(net, &aca->aca_addr); + + spin_lock(&acaddr_hash_lock); + hlist_add_head_rcu(&aca->aca_addr_lst, &inet6_acaddr_lst[hash]); + spin_unlock(&acaddr_hash_lock); +} + +static void ipv6_del_acaddr_hash(struct ifacaddr6 *aca) +{ + spin_lock(&acaddr_hash_lock); + hlist_del_init_rcu(&aca->aca_addr_lst); + spin_unlock(&acaddr_hash_lock); +} + static void aca_get(struct ifacaddr6 *aca) { refcount_inc(&aca->aca_refcnt); } +static void aca_free_rcu(struct rcu_head *h) +{ + struct ifacaddr6 *aca = container_of(h, struct ifacaddr6, rcu); + + fib6_info_release(aca->aca_rt); + kfree(aca); +} + static void aca_put(struct ifacaddr6 *ac) { if (refcount_dec_and_test(&ac->aca_refcnt)) { - fib6_info_release(ac->aca_rt); - kfree(ac); + call_rcu(&ac->rcu, aca_free_rcu); } } @@ -229,6 +266,7 @@ static struct ifacaddr6 *aca_alloc(struct fib6_info *f6i, aca->aca_addr = *addr; fib6_info_hold(f6i); aca->aca_rt = f6i; + INIT_HLIST_NODE(&aca->aca_addr_lst); aca->aca_users = 1; /* aca_tstamp should be updated upon changes */ aca->aca_cstamp = aca->aca_tstamp = jiffies; @@ -285,6 +323,8 @@ int __ipv6_dev_ac_inc(struct inet6_dev *idev, const struct in6_addr *addr) aca_get(aca); write_unlock_bh(&idev->lock); + ipv6_add_acaddr_hash(net, aca); + ip6_ins_rt(net, f6i); addrconf_join_solict(idev->dev, &aca->aca_addr); @@ -325,6 +365,7 @@ int __ipv6_dev_ac_dec(struct inet6_dev *idev, const struct in6_addr *addr) else idev->ac_list = aca->aca_next; write_unlock_bh(&idev->lock); + ipv6_del_acaddr_hash(aca); addrconf_leave_solict(idev, &aca->aca_addr); ip6_del_rt(dev_net(idev->dev), aca->aca_rt); @@ -352,6 +393,8 @@ void ipv6_ac_destroy_dev(struct inet6_dev *idev) idev->ac_list = aca->aca_next; write_unlock_bh(&idev->lock); + ipv6_del_acaddr_hash(aca); + addrconf_leave_solict(idev, &aca->aca_addr); ip6_del_rt(dev_net(idev->dev), aca->aca_rt); @@ -390,17 +433,25 @@ static bool ipv6_chk_acast_dev(struct net_device *dev, const struct in6_addr *ad bool ipv6_chk_acast_addr(struct net *net, struct net_device *dev, const struct in6_addr *addr) { + unsigned int hash = inet6_acaddr_hash(net, addr); + struct net_device *nh_dev; + struct ifacaddr6 *aca; bool found = false; rcu_read_lock(); if (dev) found = ipv6_chk_acast_dev(dev, addr); else - for_each_netdev_rcu(net, dev) - if (ipv6_chk_acast_dev(dev, addr)) { + hlist_for_each_entry_rcu(aca, &inet6_acaddr_lst[hash], + aca_addr_lst) { + nh_dev = fib6_info_nh_dev(aca->aca_rt); + if (!nh_dev || !net_eq(dev_net(nh_dev), net)) + continue; + if (ipv6_addr_equal(&aca->aca_addr, addr)) { found = true; break; } + } rcu_read_unlock(); return found; } @@ -540,3 +591,24 @@ void ac6_proc_exit(struct net *net) remove_proc_entry("anycast6", net->proc_net); } #endif + +/* Init / cleanup code + */ +int __init ipv6_anycast_init(void) +{ + int i; + + for (i = 0; i < IN6_ADDR_HSIZE; i++) + INIT_HLIST_HEAD(&inet6_acaddr_lst[i]); + return 0; +} + +void ipv6_anycast_cleanup(void) +{ + int i; + + spin_lock(&acaddr_hash_lock); + for (i = 0; i < IN6_ADDR_HSIZE; i++) + WARN_ON(!hlist_empty(&inet6_acaddr_lst[i])); + spin_unlock(&acaddr_hash_lock); +} diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 1b8bc008b53b..ae3786132c23 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -591,7 +591,7 @@ static int inet6_dump_fib(struct sk_buff *skb, struct netlink_callback *cb) /* fib entries are never clones */ if (arg.filter.flags & RTM_F_CLONED) - return skb->len; + goto out; w = (void *)cb->args[2]; if (!w) { @@ -621,7 +621,7 @@ static int inet6_dump_fib(struct sk_buff *skb, struct netlink_callback *cb) tb = fib6_get_table(net, arg.filter.table_id); if (!tb) { if (arg.filter.dump_all_families) - return skb->len; + goto out; NL_SET_ERR_MSG_MOD(cb->extack, "FIB table does not exist"); return -ENOENT; diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index a25cfdd47c89..659ecf4e4b3c 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -1732,10 +1732,9 @@ int ndisc_rcv(struct sk_buff *skb) return 0; } - memset(NEIGH_CB(skb), 0, sizeof(struct neighbour_cb)); - switch (msg->icmph.icmp6_type) { case NDISC_NEIGHBOUR_SOLICITATION: + memset(NEIGH_CB(skb), 0, sizeof(struct neighbour_cb)); ndisc_recv_ns(skb); break; diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 06d17ff3562f..d2d97d07ef27 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -478,7 +478,7 @@ void __udp6_lib_err(struct sk_buff *skb, struct inet6_skb_parm *opt, struct net *net = dev_net(skb->dev); sk = __udp6_lib_lookup(net, daddr, uh->dest, saddr, uh->source, - inet6_iif(skb), 0, udptable, skb); + inet6_iif(skb), inet6_sdif(skb), udptable, skb); if (!sk) { __ICMP6_INC_STATS(net, __in6_dev_get(skb->dev), ICMP6_MIB_INERRORS); diff --git a/net/mac802154/llsec.c b/net/mac802154/llsec.c index 2fb703d70803..7e29f88dbf6a 100644 --- a/net/mac802154/llsec.c +++ b/net/mac802154/llsec.c @@ -146,18 +146,18 @@ llsec_key_alloc(const struct ieee802154_llsec_key *template) goto err_tfm; } - key->tfm0 = crypto_alloc_skcipher("ctr(aes)", 0, CRYPTO_ALG_ASYNC); + key->tfm0 = crypto_alloc_sync_skcipher("ctr(aes)", 0, 0); if (IS_ERR(key->tfm0)) goto err_tfm; - if (crypto_skcipher_setkey(key->tfm0, template->key, + if (crypto_sync_skcipher_setkey(key->tfm0, template->key, IEEE802154_LLSEC_KEY_SIZE)) goto err_tfm0; return key; err_tfm0: - crypto_free_skcipher(key->tfm0); + crypto_free_sync_skcipher(key->tfm0); err_tfm: for (i = 0; i < ARRAY_SIZE(key->tfm); i++) if (key->tfm[i]) @@ -177,7 +177,7 @@ static void llsec_key_release(struct kref *ref) for (i = 0; i < ARRAY_SIZE(key->tfm); i++) crypto_free_aead(key->tfm[i]); - crypto_free_skcipher(key->tfm0); + crypto_free_sync_skcipher(key->tfm0); kzfree(key); } @@ -622,7 +622,7 @@ llsec_do_encrypt_unauth(struct sk_buff *skb, const struct mac802154_llsec *sec, { u8 iv[16]; struct scatterlist src; - SKCIPHER_REQUEST_ON_STACK(req, key->tfm0); + SYNC_SKCIPHER_REQUEST_ON_STACK(req, key->tfm0); int err, datalen; unsigned char *data; @@ -632,7 +632,7 @@ llsec_do_encrypt_unauth(struct sk_buff *skb, const struct mac802154_llsec *sec, datalen = skb_tail_pointer(skb) - data; sg_init_one(&src, data, datalen); - skcipher_request_set_tfm(req, key->tfm0); + skcipher_request_set_sync_tfm(req, key->tfm0); skcipher_request_set_callback(req, 0, NULL, NULL); skcipher_request_set_crypt(req, &src, &src, datalen, iv); err = crypto_skcipher_encrypt(req); @@ -840,7 +840,7 @@ llsec_do_decrypt_unauth(struct sk_buff *skb, const struct mac802154_llsec *sec, unsigned char *data; int datalen; struct scatterlist src; - SKCIPHER_REQUEST_ON_STACK(req, key->tfm0); + SYNC_SKCIPHER_REQUEST_ON_STACK(req, key->tfm0); int err; llsec_geniv(iv, dev_addr, &hdr->sec); @@ -849,7 +849,7 @@ llsec_do_decrypt_unauth(struct sk_buff *skb, const struct mac802154_llsec *sec, sg_init_one(&src, data, datalen); - skcipher_request_set_tfm(req, key->tfm0); + skcipher_request_set_sync_tfm(req, key->tfm0); skcipher_request_set_callback(req, 0, NULL, NULL); skcipher_request_set_crypt(req, &src, &src, datalen, iv); diff --git a/net/mac802154/llsec.h b/net/mac802154/llsec.h index 6f3b658e3279..8be46d74dc39 100644 --- a/net/mac802154/llsec.h +++ b/net/mac802154/llsec.h @@ -29,7 +29,7 @@ struct mac802154_llsec_key { /* one tfm for each authsize (4/8/16) */ struct crypto_aead *tfm[3]; - struct crypto_skcipher *tfm0; + struct crypto_sync_skcipher *tfm0; struct kref ref; }; diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c index 6bec37ab4472..a4660c48ff01 100644 --- a/net/openvswitch/conntrack.c +++ b/net/openvswitch/conntrack.c @@ -1203,7 +1203,8 @@ static int ovs_ct_commit(struct net *net, struct sw_flow_key *key, &info->labels.mask); if (err) return err; - } else if (labels_nonzero(&info->labels.mask)) { + } else if (IS_ENABLED(CONFIG_NF_CONNTRACK_LABELS) && + labels_nonzero(&info->labels.mask)) { err = ovs_ct_set_labels(ct, key, &info->labels.value, &info->labels.mask); if (err) diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c index a70097ecf33c..865ecef68196 100644 --- a/net/openvswitch/flow_netlink.c +++ b/net/openvswitch/flow_netlink.c @@ -3030,7 +3030,7 @@ static int __ovs_nla_copy_actions(struct net *net, const struct nlattr *attr, * is already present */ if (mac_proto != MAC_PROTO_NONE) return -EINVAL; - mac_proto = MAC_PROTO_NONE; + mac_proto = MAC_PROTO_ETHERNET; break; case OVS_ACTION_ATTR_POP_ETH: @@ -3038,7 +3038,7 @@ static int __ovs_nla_copy_actions(struct net *net, const struct nlattr *attr, return -EINVAL; if (vlan_tci & htons(VLAN_TAG_PRESENT)) return -EINVAL; - mac_proto = MAC_PROTO_ETHERNET; + mac_proto = MAC_PROTO_NONE; break; case OVS_ACTION_ATTR_PUSH_NSH: diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index 0a7c49e8e053..bc628acf4f4f 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -435,7 +435,7 @@ struct rxrpc_connection { struct sk_buff_head rx_queue; /* received conn-level packets */ const struct rxrpc_security *security; /* applied security module */ struct key *server_key; /* security for this service */ - struct crypto_skcipher *cipher; /* encryption handle */ + struct crypto_sync_skcipher *cipher; /* encryption handle */ struct rxrpc_crypt csum_iv; /* packet checksum base */ unsigned long flags; unsigned long events; @@ -611,6 +611,7 @@ struct rxrpc_call { * not hard-ACK'd packet follows this. */ rxrpc_seq_t tx_top; /* Highest Tx slot allocated. */ + u16 tx_backoff; /* Delay to insert due to Tx failure */ /* TCP-style slow-start congestion control [RFC5681]. Since the SMSS * is fixed, we keep these numbers in terms of segments (ie. DATA diff --git a/net/rxrpc/call_event.c b/net/rxrpc/call_event.c index 8e7434e92097..468efc3660c0 100644 --- a/net/rxrpc/call_event.c +++ b/net/rxrpc/call_event.c @@ -123,6 +123,7 @@ static void __rxrpc_propose_ACK(struct rxrpc_call *call, u8 ack_reason, else ack_at = expiry; + ack_at += READ_ONCE(call->tx_backoff); ack_at += now; if (time_before(ack_at, call->ack_at)) { WRITE_ONCE(call->ack_at, ack_at); @@ -311,6 +312,7 @@ void rxrpc_process_call(struct work_struct *work) container_of(work, struct rxrpc_call, processor); rxrpc_serial_t *send_ack; unsigned long now, next, t; + unsigned int iterations = 0; rxrpc_see_call(call); @@ -319,6 +321,11 @@ void rxrpc_process_call(struct work_struct *work) call->debug_id, rxrpc_call_states[call->state], call->events); recheck_state: + /* Limit the number of times we do this before returning to the manager */ + iterations++; + if (iterations > 5) + goto requeue; + if (test_and_clear_bit(RXRPC_CALL_EV_ABORT, &call->events)) { rxrpc_send_abort_packet(call); goto recheck_state; @@ -447,13 +454,16 @@ recheck_state: rxrpc_reduce_call_timer(call, next, now, rxrpc_timer_restart); /* other events may have been raised since we started checking */ - if (call->events && call->state < RXRPC_CALL_COMPLETE) { - __rxrpc_queue_call(call); - goto out; - } + if (call->events && call->state < RXRPC_CALL_COMPLETE) + goto requeue; out_put: rxrpc_put_call(call, rxrpc_call_put); out: _leave(""); + return; + +requeue: + __rxrpc_queue_call(call); + goto out; } diff --git a/net/rxrpc/output.c b/net/rxrpc/output.c index 189418888839..736aa9281100 100644 --- a/net/rxrpc/output.c +++ b/net/rxrpc/output.c @@ -35,6 +35,21 @@ struct rxrpc_abort_buffer { static const char rxrpc_keepalive_string[] = ""; /* + * Increase Tx backoff on transmission failure and clear it on success. + */ +static void rxrpc_tx_backoff(struct rxrpc_call *call, int ret) +{ + if (ret < 0) { + u16 tx_backoff = READ_ONCE(call->tx_backoff); + + if (tx_backoff < HZ) + WRITE_ONCE(call->tx_backoff, tx_backoff + 1); + } else { + WRITE_ONCE(call->tx_backoff, 0); + } +} + +/* * Arrange for a keepalive ping a certain time after we last transmitted. This * lets the far side know we're still interested in this call and helps keep * the route through any intervening firewall open. @@ -210,6 +225,7 @@ int rxrpc_send_ack_packet(struct rxrpc_call *call, bool ping, else trace_rxrpc_tx_packet(call->debug_id, &pkt->whdr, rxrpc_tx_point_call_ack); + rxrpc_tx_backoff(call, ret); if (call->state < RXRPC_CALL_COMPLETE) { if (ret < 0) { @@ -218,7 +234,7 @@ int rxrpc_send_ack_packet(struct rxrpc_call *call, bool ping, rxrpc_propose_ACK(call, pkt->ack.reason, ntohs(pkt->ack.maxSkew), ntohl(pkt->ack.serial), - true, true, + false, true, rxrpc_propose_ack_retry_tx); } else { spin_lock_bh(&call->lock); @@ -300,7 +316,7 @@ int rxrpc_send_abort_packet(struct rxrpc_call *call) else trace_rxrpc_tx_packet(call->debug_id, &pkt.whdr, rxrpc_tx_point_call_abort); - + rxrpc_tx_backoff(call, ret); rxrpc_put_connection(conn); return ret; @@ -413,6 +429,7 @@ int rxrpc_send_data_packet(struct rxrpc_call *call, struct sk_buff *skb, else trace_rxrpc_tx_packet(call->debug_id, &whdr, rxrpc_tx_point_call_data_nofrag); + rxrpc_tx_backoff(call, ret); if (ret == -EMSGSIZE) goto send_fragmentable; @@ -445,9 +462,18 @@ done: rxrpc_reduce_call_timer(call, expect_rx_by, nowj, rxrpc_timer_set_for_normal); } - } - rxrpc_set_keepalive(call); + rxrpc_set_keepalive(call); + } else { + /* Cancel the call if the initial transmission fails, + * particularly if that's due to network routing issues that + * aren't going away anytime soon. The layer above can arrange + * the retransmission. + */ + if (!test_and_set_bit(RXRPC_CALL_BEGAN_RX_TIMER, &call->flags)) + rxrpc_set_call_completion(call, RXRPC_CALL_LOCAL_ERROR, + RX_USER_ABORT, ret); + } _leave(" = %d [%u]", ret, call->peer->maxdata); return ret; @@ -506,6 +532,7 @@ send_fragmentable: else trace_rxrpc_tx_packet(call->debug_id, &whdr, rxrpc_tx_point_call_data_frag); + rxrpc_tx_backoff(call, ret); up_write(&conn->params.local->defrag_sem); goto done; diff --git a/net/rxrpc/rxkad.c b/net/rxrpc/rxkad.c index cea16838d588..cbef9ea43dec 100644 --- a/net/rxrpc/rxkad.c +++ b/net/rxrpc/rxkad.c @@ -46,7 +46,7 @@ struct rxkad_level2_hdr { * alloc routine, but since we have it to hand, we use it to decrypt RESPONSE * packets */ -static struct crypto_skcipher *rxkad_ci; +static struct crypto_sync_skcipher *rxkad_ci; static DEFINE_MUTEX(rxkad_ci_mutex); /* @@ -54,7 +54,7 @@ static DEFINE_MUTEX(rxkad_ci_mutex); */ static int rxkad_init_connection_security(struct rxrpc_connection *conn) { - struct crypto_skcipher *ci; + struct crypto_sync_skcipher *ci; struct rxrpc_key_token *token; int ret; @@ -63,14 +63,14 @@ static int rxkad_init_connection_security(struct rxrpc_connection *conn) token = conn->params.key->payload.data[0]; conn->security_ix = token->security_index; - ci = crypto_alloc_skcipher("pcbc(fcrypt)", 0, CRYPTO_ALG_ASYNC); + ci = crypto_alloc_sync_skcipher("pcbc(fcrypt)", 0, 0); if (IS_ERR(ci)) { _debug("no cipher"); ret = PTR_ERR(ci); goto error; } - if (crypto_skcipher_setkey(ci, token->kad->session_key, + if (crypto_sync_skcipher_setkey(ci, token->kad->session_key, sizeof(token->kad->session_key)) < 0) BUG(); @@ -104,7 +104,7 @@ error: static int rxkad_prime_packet_security(struct rxrpc_connection *conn) { struct rxrpc_key_token *token; - SKCIPHER_REQUEST_ON_STACK(req, conn->cipher); + SYNC_SKCIPHER_REQUEST_ON_STACK(req, conn->cipher); struct scatterlist sg; struct rxrpc_crypt iv; __be32 *tmpbuf; @@ -128,7 +128,7 @@ static int rxkad_prime_packet_security(struct rxrpc_connection *conn) tmpbuf[3] = htonl(conn->security_ix); sg_init_one(&sg, tmpbuf, tmpsize); - skcipher_request_set_tfm(req, conn->cipher); + skcipher_request_set_sync_tfm(req, conn->cipher); skcipher_request_set_callback(req, 0, NULL, NULL); skcipher_request_set_crypt(req, &sg, &sg, tmpsize, iv.x); crypto_skcipher_encrypt(req); @@ -167,7 +167,7 @@ static int rxkad_secure_packet_auth(const struct rxrpc_call *call, memset(&iv, 0, sizeof(iv)); sg_init_one(&sg, sechdr, 8); - skcipher_request_set_tfm(req, call->conn->cipher); + skcipher_request_set_sync_tfm(req, call->conn->cipher); skcipher_request_set_callback(req, 0, NULL, NULL); skcipher_request_set_crypt(req, &sg, &sg, 8, iv.x); crypto_skcipher_encrypt(req); @@ -212,7 +212,7 @@ static int rxkad_secure_packet_encrypt(const struct rxrpc_call *call, memcpy(&iv, token->kad->session_key, sizeof(iv)); sg_init_one(&sg[0], sechdr, sizeof(rxkhdr)); - skcipher_request_set_tfm(req, call->conn->cipher); + skcipher_request_set_sync_tfm(req, call->conn->cipher); skcipher_request_set_callback(req, 0, NULL, NULL); skcipher_request_set_crypt(req, &sg[0], &sg[0], sizeof(rxkhdr), iv.x); crypto_skcipher_encrypt(req); @@ -250,7 +250,7 @@ static int rxkad_secure_packet(struct rxrpc_call *call, void *sechdr) { struct rxrpc_skb_priv *sp; - SKCIPHER_REQUEST_ON_STACK(req, call->conn->cipher); + SYNC_SKCIPHER_REQUEST_ON_STACK(req, call->conn->cipher); struct rxrpc_crypt iv; struct scatterlist sg; u32 x, y; @@ -279,7 +279,7 @@ static int rxkad_secure_packet(struct rxrpc_call *call, call->crypto_buf[1] = htonl(x); sg_init_one(&sg, call->crypto_buf, 8); - skcipher_request_set_tfm(req, call->conn->cipher); + skcipher_request_set_sync_tfm(req, call->conn->cipher); skcipher_request_set_callback(req, 0, NULL, NULL); skcipher_request_set_crypt(req, &sg, &sg, 8, iv.x); crypto_skcipher_encrypt(req); @@ -352,7 +352,7 @@ static int rxkad_verify_packet_1(struct rxrpc_call *call, struct sk_buff *skb, /* start the decryption afresh */ memset(&iv, 0, sizeof(iv)); - skcipher_request_set_tfm(req, call->conn->cipher); + skcipher_request_set_sync_tfm(req, call->conn->cipher); skcipher_request_set_callback(req, 0, NULL, NULL); skcipher_request_set_crypt(req, sg, sg, 8, iv.x); crypto_skcipher_decrypt(req); @@ -450,7 +450,7 @@ static int rxkad_verify_packet_2(struct rxrpc_call *call, struct sk_buff *skb, token = call->conn->params.key->payload.data[0]; memcpy(&iv, token->kad->session_key, sizeof(iv)); - skcipher_request_set_tfm(req, call->conn->cipher); + skcipher_request_set_sync_tfm(req, call->conn->cipher); skcipher_request_set_callback(req, 0, NULL, NULL); skcipher_request_set_crypt(req, sg, sg, len, iv.x); crypto_skcipher_decrypt(req); @@ -506,7 +506,7 @@ static int rxkad_verify_packet(struct rxrpc_call *call, struct sk_buff *skb, unsigned int offset, unsigned int len, rxrpc_seq_t seq, u16 expected_cksum) { - SKCIPHER_REQUEST_ON_STACK(req, call->conn->cipher); + SYNC_SKCIPHER_REQUEST_ON_STACK(req, call->conn->cipher); struct rxrpc_crypt iv; struct scatterlist sg; bool aborted; @@ -529,7 +529,7 @@ static int rxkad_verify_packet(struct rxrpc_call *call, struct sk_buff *skb, call->crypto_buf[1] = htonl(x); sg_init_one(&sg, call->crypto_buf, 8); - skcipher_request_set_tfm(req, call->conn->cipher); + skcipher_request_set_sync_tfm(req, call->conn->cipher); skcipher_request_set_callback(req, 0, NULL, NULL); skcipher_request_set_crypt(req, &sg, &sg, 8, iv.x); crypto_skcipher_encrypt(req); @@ -755,7 +755,7 @@ static void rxkad_encrypt_response(struct rxrpc_connection *conn, struct rxkad_response *resp, const struct rxkad_key *s2) { - SKCIPHER_REQUEST_ON_STACK(req, conn->cipher); + SYNC_SKCIPHER_REQUEST_ON_STACK(req, conn->cipher); struct rxrpc_crypt iv; struct scatterlist sg[1]; @@ -764,7 +764,7 @@ static void rxkad_encrypt_response(struct rxrpc_connection *conn, sg_init_table(sg, 1); sg_set_buf(sg, &resp->encrypted, sizeof(resp->encrypted)); - skcipher_request_set_tfm(req, conn->cipher); + skcipher_request_set_sync_tfm(req, conn->cipher); skcipher_request_set_callback(req, 0, NULL, NULL); skcipher_request_set_crypt(req, sg, sg, sizeof(resp->encrypted), iv.x); crypto_skcipher_encrypt(req); @@ -1021,7 +1021,7 @@ static void rxkad_decrypt_response(struct rxrpc_connection *conn, struct rxkad_response *resp, const struct rxrpc_crypt *session_key) { - SKCIPHER_REQUEST_ON_STACK(req, rxkad_ci); + SYNC_SKCIPHER_REQUEST_ON_STACK(req, rxkad_ci); struct scatterlist sg[1]; struct rxrpc_crypt iv; @@ -1031,7 +1031,7 @@ static void rxkad_decrypt_response(struct rxrpc_connection *conn, ASSERT(rxkad_ci != NULL); mutex_lock(&rxkad_ci_mutex); - if (crypto_skcipher_setkey(rxkad_ci, session_key->x, + if (crypto_sync_skcipher_setkey(rxkad_ci, session_key->x, sizeof(*session_key)) < 0) BUG(); @@ -1039,7 +1039,7 @@ static void rxkad_decrypt_response(struct rxrpc_connection *conn, sg_init_table(sg, 1); sg_set_buf(sg, &resp->encrypted, sizeof(resp->encrypted)); - skcipher_request_set_tfm(req, rxkad_ci); + skcipher_request_set_sync_tfm(req, rxkad_ci); skcipher_request_set_callback(req, 0, NULL, NULL); skcipher_request_set_crypt(req, sg, sg, sizeof(resp->encrypted), iv.x); crypto_skcipher_decrypt(req); @@ -1218,7 +1218,7 @@ static void rxkad_clear(struct rxrpc_connection *conn) _enter(""); if (conn->cipher) - crypto_free_skcipher(conn->cipher); + crypto_free_sync_skcipher(conn->cipher); } /* @@ -1228,7 +1228,7 @@ static int rxkad_init(void) { /* pin the cipher we need so that the crypto layer doesn't invoke * keventd to go get it */ - rxkad_ci = crypto_alloc_skcipher("pcbc(fcrypt)", 0, CRYPTO_ALG_ASYNC); + rxkad_ci = crypto_alloc_sync_skcipher("pcbc(fcrypt)", 0, 0); return PTR_ERR_OR_ZERO(rxkad_ci); } @@ -1238,7 +1238,7 @@ static int rxkad_init(void) static void rxkad_exit(void) { if (rxkad_ci) - crypto_free_skcipher(rxkad_ci); + crypto_free_sync_skcipher(rxkad_ci); } /* diff --git a/net/sched/sch_gred.c b/net/sched/sch_gred.c index cbe4831f46f4..4a042abf844c 100644 --- a/net/sched/sch_gred.c +++ b/net/sched/sch_gred.c @@ -413,7 +413,7 @@ static int gred_change(struct Qdisc *sch, struct nlattr *opt, if (tb[TCA_GRED_PARMS] == NULL && tb[TCA_GRED_STAB] == NULL) { if (tb[TCA_GRED_LIMIT] != NULL) sch->limit = nla_get_u32(tb[TCA_GRED_LIMIT]); - return gred_change_table_def(sch, opt); + return gred_change_table_def(sch, tb[TCA_GRED_DPS]); } if (tb[TCA_GRED_PARMS] == NULL || diff --git a/net/sctp/associola.c b/net/sctp/associola.c index a827a1f562bf..6a28b96e779e 100644 --- a/net/sctp/associola.c +++ b/net/sctp/associola.c @@ -499,8 +499,9 @@ void sctp_assoc_set_primary(struct sctp_association *asoc, void sctp_assoc_rm_peer(struct sctp_association *asoc, struct sctp_transport *peer) { - struct list_head *pos; - struct sctp_transport *transport; + struct sctp_transport *transport; + struct list_head *pos; + struct sctp_chunk *ch; pr_debug("%s: association:%p addr:%pISpc\n", __func__, asoc, &peer->ipaddr.sa); @@ -564,7 +565,6 @@ void sctp_assoc_rm_peer(struct sctp_association *asoc, */ if (!list_empty(&peer->transmitted)) { struct sctp_transport *active = asoc->peer.active_path; - struct sctp_chunk *ch; /* Reset the transport of each chunk on this list */ list_for_each_entry(ch, &peer->transmitted, @@ -586,6 +586,10 @@ void sctp_assoc_rm_peer(struct sctp_association *asoc, sctp_transport_hold(active); } + list_for_each_entry(ch, &asoc->outqueue.out_chunk_list, list) + if (ch->transport == peer) + ch->transport = NULL; + asoc->peer.transport_count--; sctp_transport_free(peer); diff --git a/net/sctp/outqueue.c b/net/sctp/outqueue.c index 9cb854b05342..c37e1c2dec9d 100644 --- a/net/sctp/outqueue.c +++ b/net/sctp/outqueue.c @@ -212,7 +212,7 @@ void sctp_outq_init(struct sctp_association *asoc, struct sctp_outq *q) INIT_LIST_HEAD(&q->retransmit); INIT_LIST_HEAD(&q->sacked); INIT_LIST_HEAD(&q->abandoned); - sctp_sched_set_sched(asoc, SCTP_SS_FCFS); + sctp_sched_set_sched(asoc, SCTP_SS_DEFAULT); } /* Free the outqueue structure and any related pending chunks. diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c index e948db29ab53..9b277bd36d1a 100644 --- a/net/sctp/protocol.c +++ b/net/sctp/protocol.c @@ -46,7 +46,7 @@ #include <linux/netdevice.h> #include <linux/inetdevice.h> #include <linux/seq_file.h> -#include <linux/bootmem.h> +#include <linux/memblock.h> #include <linux/highmem.h> #include <linux/swap.h> #include <linux/slab.h> diff --git a/net/sctp/socket.c b/net/sctp/socket.c index fc0386e8ff23..739f3e50120d 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -7083,14 +7083,15 @@ static int sctp_getsockopt_pr_assocstatus(struct sock *sk, int len, } policy = params.sprstat_policy; - if (!policy || (policy & ~(SCTP_PR_SCTP_MASK | SCTP_PR_SCTP_ALL))) + if (!policy || (policy & ~(SCTP_PR_SCTP_MASK | SCTP_PR_SCTP_ALL)) || + ((policy & SCTP_PR_SCTP_ALL) && (policy & SCTP_PR_SCTP_MASK))) goto out; asoc = sctp_id2assoc(sk, params.sprstat_assoc_id); if (!asoc) goto out; - if (policy & SCTP_PR_SCTP_ALL) { + if (policy == SCTP_PR_SCTP_ALL) { params.sprstat_abandoned_unsent = 0; params.sprstat_abandoned_sent = 0; for (policy = 0; policy <= SCTP_PR_INDEX(MAX); policy++) { @@ -7142,7 +7143,8 @@ static int sctp_getsockopt_pr_streamstatus(struct sock *sk, int len, } policy = params.sprstat_policy; - if (!policy || (policy & ~(SCTP_PR_SCTP_MASK | SCTP_PR_SCTP_ALL))) + if (!policy || (policy & ~(SCTP_PR_SCTP_MASK | SCTP_PR_SCTP_ALL)) || + ((policy & SCTP_PR_SCTP_ALL) && (policy & SCTP_PR_SCTP_MASK))) goto out; asoc = sctp_id2assoc(sk, params.sprstat_assoc_id); diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index e871368500e3..18daebcef181 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -122,22 +122,17 @@ static void __smc_lgr_unregister_conn(struct smc_connection *conn) sock_put(&smc->sk); /* sock_hold in smc_lgr_register_conn() */ } -/* Unregister connection and trigger lgr freeing if applicable +/* Unregister connection from lgr */ static void smc_lgr_unregister_conn(struct smc_connection *conn) { struct smc_link_group *lgr = conn->lgr; - int reduced = 0; write_lock_bh(&lgr->conns_lock); if (conn->alert_token_local) { - reduced = 1; __smc_lgr_unregister_conn(conn); } write_unlock_bh(&lgr->conns_lock); - if (!reduced || lgr->conns_num) - return; - smc_lgr_schedule_free_work(lgr); } /* Send delete link, either as client to request the initiation @@ -291,7 +286,8 @@ out: return rc; } -static void smc_buf_unuse(struct smc_connection *conn) +static void smc_buf_unuse(struct smc_connection *conn, + struct smc_link_group *lgr) { if (conn->sndbuf_desc) conn->sndbuf_desc->used = 0; @@ -301,8 +297,6 @@ static void smc_buf_unuse(struct smc_connection *conn) conn->rmb_desc->used = 0; } else { /* buf registration failed, reuse not possible */ - struct smc_link_group *lgr = conn->lgr; - write_lock_bh(&lgr->rmbs_lock); list_del(&conn->rmb_desc->list); write_unlock_bh(&lgr->rmbs_lock); @@ -315,16 +309,21 @@ static void smc_buf_unuse(struct smc_connection *conn) /* remove a finished connection from its link group */ void smc_conn_free(struct smc_connection *conn) { - if (!conn->lgr) + struct smc_link_group *lgr = conn->lgr; + + if (!lgr) return; - if (conn->lgr->is_smcd) { + if (lgr->is_smcd) { smc_ism_unset_conn(conn); tasklet_kill(&conn->rx_tsklet); } else { smc_cdc_tx_dismiss_slots(conn); } - smc_lgr_unregister_conn(conn); - smc_buf_unuse(conn); + smc_lgr_unregister_conn(conn); /* unsets conn->lgr */ + smc_buf_unuse(conn, lgr); /* allow buffer reuse */ + + if (!lgr->conns_num) + smc_lgr_schedule_free_work(lgr); } static void smc_link_clear(struct smc_link *lnk) diff --git a/net/socket.c b/net/socket.c index b68801c7d0ab..99c96851469f 100644 --- a/net/socket.c +++ b/net/socket.c @@ -2342,7 +2342,7 @@ SYSCALL_DEFINE3(recvmsg, int, fd, struct user_msghdr __user *, msg, */ int __sys_recvmmsg(int fd, struct mmsghdr __user *mmsg, unsigned int vlen, - unsigned int flags, struct timespec *timeout) + unsigned int flags, struct timespec64 *timeout) { int fput_needed, err, datagrams; struct socket *sock; @@ -2407,8 +2407,7 @@ int __sys_recvmmsg(int fd, struct mmsghdr __user *mmsg, unsigned int vlen, if (timeout) { ktime_get_ts64(&timeout64); - *timeout = timespec64_to_timespec( - timespec64_sub(end_time, timeout64)); + *timeout = timespec64_sub(end_time, timeout64); if (timeout->tv_sec < 0) { timeout->tv_sec = timeout->tv_nsec = 0; break; @@ -2454,10 +2453,10 @@ out_put: static int do_sys_recvmmsg(int fd, struct mmsghdr __user *mmsg, unsigned int vlen, unsigned int flags, - struct timespec __user *timeout) + struct __kernel_timespec __user *timeout) { int datagrams; - struct timespec timeout_sys; + struct timespec64 timeout_sys; if (flags & MSG_CMSG_COMPAT) return -EINVAL; @@ -2465,13 +2464,12 @@ static int do_sys_recvmmsg(int fd, struct mmsghdr __user *mmsg, if (!timeout) return __sys_recvmmsg(fd, mmsg, vlen, flags, NULL); - if (copy_from_user(&timeout_sys, timeout, sizeof(timeout_sys))) + if (get_timespec64(&timeout_sys, timeout)) return -EFAULT; datagrams = __sys_recvmmsg(fd, mmsg, vlen, flags, &timeout_sys); - if (datagrams > 0 && - copy_to_user(timeout, &timeout_sys, sizeof(timeout_sys))) + if (datagrams > 0 && put_timespec64(&timeout_sys, timeout)) datagrams = -EFAULT; return datagrams; @@ -2479,7 +2477,7 @@ static int do_sys_recvmmsg(int fd, struct mmsghdr __user *mmsg, SYSCALL_DEFINE5(recvmmsg, int, fd, struct mmsghdr __user *, mmsg, unsigned int, vlen, unsigned int, flags, - struct timespec __user *, timeout) + struct __kernel_timespec __user *, timeout) { return do_sys_recvmmsg(fd, mmsg, vlen, flags, timeout); } @@ -2603,7 +2601,7 @@ SYSCALL_DEFINE2(socketcall, int, call, unsigned long __user *, args) break; case SYS_RECVMMSG: err = do_sys_recvmmsg(a0, (struct mmsghdr __user *)a1, a[2], - a[3], (struct timespec __user *)a[4]); + a[3], (struct __kernel_timespec __user *)a[4]); break; case SYS_ACCEPT4: err = __sys_accept4(a0, (struct sockaddr __user *)a1, diff --git a/net/sunrpc/auth.c b/net/sunrpc/auth.c index 305ecea92170..ad8ead738981 100644 --- a/net/sunrpc/auth.c +++ b/net/sunrpc/auth.c @@ -30,10 +30,9 @@ struct rpc_cred_cache { static unsigned int auth_hashbits = RPC_CREDCACHE_DEFAULT_HASHBITS; -static DEFINE_SPINLOCK(rpc_authflavor_lock); -static const struct rpc_authops *auth_flavors[RPC_AUTH_MAXFLAVOR] = { - &authnull_ops, /* AUTH_NULL */ - &authunix_ops, /* AUTH_UNIX */ +static const struct rpc_authops __rcu *auth_flavors[RPC_AUTH_MAXFLAVOR] = { + [RPC_AUTH_NULL] = (const struct rpc_authops __force __rcu *)&authnull_ops, + [RPC_AUTH_UNIX] = (const struct rpc_authops __force __rcu *)&authunix_ops, NULL, /* others can be loadable modules */ }; @@ -93,39 +92,65 @@ pseudoflavor_to_flavor(u32 flavor) { int rpcauth_register(const struct rpc_authops *ops) { + const struct rpc_authops *old; rpc_authflavor_t flavor; - int ret = -EPERM; if ((flavor = ops->au_flavor) >= RPC_AUTH_MAXFLAVOR) return -EINVAL; - spin_lock(&rpc_authflavor_lock); - if (auth_flavors[flavor] == NULL) { - auth_flavors[flavor] = ops; - ret = 0; - } - spin_unlock(&rpc_authflavor_lock); - return ret; + old = cmpxchg((const struct rpc_authops ** __force)&auth_flavors[flavor], NULL, ops); + if (old == NULL || old == ops) + return 0; + return -EPERM; } EXPORT_SYMBOL_GPL(rpcauth_register); int rpcauth_unregister(const struct rpc_authops *ops) { + const struct rpc_authops *old; rpc_authflavor_t flavor; - int ret = -EPERM; if ((flavor = ops->au_flavor) >= RPC_AUTH_MAXFLAVOR) return -EINVAL; - spin_lock(&rpc_authflavor_lock); - if (auth_flavors[flavor] == ops) { - auth_flavors[flavor] = NULL; - ret = 0; - } - spin_unlock(&rpc_authflavor_lock); - return ret; + + old = cmpxchg((const struct rpc_authops ** __force)&auth_flavors[flavor], ops, NULL); + if (old == ops || old == NULL) + return 0; + return -EPERM; } EXPORT_SYMBOL_GPL(rpcauth_unregister); +static const struct rpc_authops * +rpcauth_get_authops(rpc_authflavor_t flavor) +{ + const struct rpc_authops *ops; + + if (flavor >= RPC_AUTH_MAXFLAVOR) + return NULL; + + rcu_read_lock(); + ops = rcu_dereference(auth_flavors[flavor]); + if (ops == NULL) { + rcu_read_unlock(); + request_module("rpc-auth-%u", flavor); + rcu_read_lock(); + ops = rcu_dereference(auth_flavors[flavor]); + if (ops == NULL) + goto out; + } + if (!try_module_get(ops->owner)) + ops = NULL; +out: + rcu_read_unlock(); + return ops; +} + +static void +rpcauth_put_authops(const struct rpc_authops *ops) +{ + module_put(ops->owner); +} + /** * rpcauth_get_pseudoflavor - check if security flavor is supported * @flavor: a security flavor @@ -138,25 +163,16 @@ EXPORT_SYMBOL_GPL(rpcauth_unregister); rpc_authflavor_t rpcauth_get_pseudoflavor(rpc_authflavor_t flavor, struct rpcsec_gss_info *info) { - const struct rpc_authops *ops; + const struct rpc_authops *ops = rpcauth_get_authops(flavor); rpc_authflavor_t pseudoflavor; - ops = auth_flavors[flavor]; - if (ops == NULL) - request_module("rpc-auth-%u", flavor); - spin_lock(&rpc_authflavor_lock); - ops = auth_flavors[flavor]; - if (ops == NULL || !try_module_get(ops->owner)) { - spin_unlock(&rpc_authflavor_lock); + if (!ops) return RPC_AUTH_MAXFLAVOR; - } - spin_unlock(&rpc_authflavor_lock); - pseudoflavor = flavor; if (ops->info2flavor != NULL) pseudoflavor = ops->info2flavor(info); - module_put(ops->owner); + rpcauth_put_authops(ops); return pseudoflavor; } EXPORT_SYMBOL_GPL(rpcauth_get_pseudoflavor); @@ -176,25 +192,15 @@ rpcauth_get_gssinfo(rpc_authflavor_t pseudoflavor, struct rpcsec_gss_info *info) const struct rpc_authops *ops; int result; - if (flavor >= RPC_AUTH_MAXFLAVOR) - return -EINVAL; - - ops = auth_flavors[flavor]; + ops = rpcauth_get_authops(flavor); if (ops == NULL) - request_module("rpc-auth-%u", flavor); - spin_lock(&rpc_authflavor_lock); - ops = auth_flavors[flavor]; - if (ops == NULL || !try_module_get(ops->owner)) { - spin_unlock(&rpc_authflavor_lock); return -ENOENT; - } - spin_unlock(&rpc_authflavor_lock); result = -ENOENT; if (ops->flavor2info != NULL) result = ops->flavor2info(pseudoflavor, info); - module_put(ops->owner); + rpcauth_put_authops(ops); return result; } EXPORT_SYMBOL_GPL(rpcauth_get_gssinfo); @@ -212,15 +218,13 @@ EXPORT_SYMBOL_GPL(rpcauth_get_gssinfo); int rpcauth_list_flavors(rpc_authflavor_t *array, int size) { - rpc_authflavor_t flavor; - int result = 0; + const struct rpc_authops *ops; + rpc_authflavor_t flavor, pseudos[4]; + int i, len, result = 0; - spin_lock(&rpc_authflavor_lock); + rcu_read_lock(); for (flavor = 0; flavor < RPC_AUTH_MAXFLAVOR; flavor++) { - const struct rpc_authops *ops = auth_flavors[flavor]; - rpc_authflavor_t pseudos[4]; - int i, len; - + ops = rcu_dereference(auth_flavors[flavor]); if (result >= size) { result = -ENOMEM; break; @@ -245,7 +249,7 @@ rpcauth_list_flavors(rpc_authflavor_t *array, int size) array[result++] = pseudos[i]; } } - spin_unlock(&rpc_authflavor_lock); + rcu_read_unlock(); dprintk("RPC: %s returns %d\n", __func__, result); return result; @@ -255,25 +259,17 @@ EXPORT_SYMBOL_GPL(rpcauth_list_flavors); struct rpc_auth * rpcauth_create(const struct rpc_auth_create_args *args, struct rpc_clnt *clnt) { - struct rpc_auth *auth; + struct rpc_auth *auth = ERR_PTR(-EINVAL); const struct rpc_authops *ops; - u32 flavor = pseudoflavor_to_flavor(args->pseudoflavor); + u32 flavor = pseudoflavor_to_flavor(args->pseudoflavor); - auth = ERR_PTR(-EINVAL); - if (flavor >= RPC_AUTH_MAXFLAVOR) + ops = rpcauth_get_authops(flavor); + if (ops == NULL) goto out; - if ((ops = auth_flavors[flavor]) == NULL) - request_module("rpc-auth-%u", flavor); - spin_lock(&rpc_authflavor_lock); - ops = auth_flavors[flavor]; - if (ops == NULL || !try_module_get(ops->owner)) { - spin_unlock(&rpc_authflavor_lock); - goto out; - } - spin_unlock(&rpc_authflavor_lock); auth = ops->create(args, clnt); - module_put(ops->owner); + + rpcauth_put_authops(ops); if (IS_ERR(auth)) return auth; if (clnt->cl_auth) @@ -288,32 +284,37 @@ EXPORT_SYMBOL_GPL(rpcauth_create); void rpcauth_release(struct rpc_auth *auth) { - if (!atomic_dec_and_test(&auth->au_count)) + if (!refcount_dec_and_test(&auth->au_count)) return; auth->au_ops->destroy(auth); } static DEFINE_SPINLOCK(rpc_credcache_lock); -static void +/* + * On success, the caller is responsible for freeing the reference + * held by the hashtable + */ +static bool rpcauth_unhash_cred_locked(struct rpc_cred *cred) { + if (!test_and_clear_bit(RPCAUTH_CRED_HASHED, &cred->cr_flags)) + return false; hlist_del_rcu(&cred->cr_hash); - smp_mb__before_atomic(); - clear_bit(RPCAUTH_CRED_HASHED, &cred->cr_flags); + return true; } -static int +static bool rpcauth_unhash_cred(struct rpc_cred *cred) { spinlock_t *cache_lock; - int ret; + bool ret; + if (!test_bit(RPCAUTH_CRED_HASHED, &cred->cr_flags)) + return false; cache_lock = &cred->cr_auth->au_credcache->lock; spin_lock(cache_lock); - ret = atomic_read(&cred->cr_count) == 0; - if (ret) - rpcauth_unhash_cred_locked(cred); + ret = rpcauth_unhash_cred_locked(cred); spin_unlock(cache_lock); return ret; } @@ -392,6 +393,44 @@ void rpcauth_destroy_credlist(struct list_head *head) } } +static void +rpcauth_lru_add_locked(struct rpc_cred *cred) +{ + if (!list_empty(&cred->cr_lru)) + return; + number_cred_unused++; + list_add_tail(&cred->cr_lru, &cred_unused); +} + +static void +rpcauth_lru_add(struct rpc_cred *cred) +{ + if (!list_empty(&cred->cr_lru)) + return; + spin_lock(&rpc_credcache_lock); + rpcauth_lru_add_locked(cred); + spin_unlock(&rpc_credcache_lock); +} + +static void +rpcauth_lru_remove_locked(struct rpc_cred *cred) +{ + if (list_empty(&cred->cr_lru)) + return; + number_cred_unused--; + list_del_init(&cred->cr_lru); +} + +static void +rpcauth_lru_remove(struct rpc_cred *cred) +{ + if (list_empty(&cred->cr_lru)) + return; + spin_lock(&rpc_credcache_lock); + rpcauth_lru_remove_locked(cred); + spin_unlock(&rpc_credcache_lock); +} + /* * Clear the RPC credential cache, and delete those credentials * that are not referenced. @@ -411,13 +450,10 @@ rpcauth_clear_credcache(struct rpc_cred_cache *cache) head = &cache->hashtable[i]; while (!hlist_empty(head)) { cred = hlist_entry(head->first, struct rpc_cred, cr_hash); - get_rpccred(cred); - if (!list_empty(&cred->cr_lru)) { - list_del(&cred->cr_lru); - number_cred_unused--; - } - list_add_tail(&cred->cr_lru, &free); rpcauth_unhash_cred_locked(cred); + /* Note: We now hold a reference to cred */ + rpcauth_lru_remove_locked(cred); + list_add_tail(&cred->cr_lru, &free); } } spin_unlock(&cache->lock); @@ -451,7 +487,6 @@ EXPORT_SYMBOL_GPL(rpcauth_destroy_credcache); static long rpcauth_prune_expired(struct list_head *free, int nr_to_scan) { - spinlock_t *cache_lock; struct rpc_cred *cred, *next; unsigned long expired = jiffies - RPC_AUTH_EXPIRY_MORATORIUM; long freed = 0; @@ -460,32 +495,24 @@ rpcauth_prune_expired(struct list_head *free, int nr_to_scan) if (nr_to_scan-- == 0) break; + if (refcount_read(&cred->cr_count) > 1) { + rpcauth_lru_remove_locked(cred); + continue; + } /* * Enforce a 60 second garbage collection moratorium * Note that the cred_unused list must be time-ordered. */ - if (time_in_range(cred->cr_expire, expired, jiffies) && - test_bit(RPCAUTH_CRED_HASHED, &cred->cr_flags) != 0) { - freed = SHRINK_STOP; - break; - } - - list_del_init(&cred->cr_lru); - number_cred_unused--; - freed++; - if (atomic_read(&cred->cr_count) != 0) + if (!time_in_range(cred->cr_expire, expired, jiffies)) + continue; + if (!rpcauth_unhash_cred(cred)) continue; - cache_lock = &cred->cr_auth->au_credcache->lock; - spin_lock(cache_lock); - if (atomic_read(&cred->cr_count) == 0) { - get_rpccred(cred); - list_add_tail(&cred->cr_lru, free); - rpcauth_unhash_cred_locked(cred); - } - spin_unlock(cache_lock); + rpcauth_lru_remove_locked(cred); + freed++; + list_add_tail(&cred->cr_lru, free); } - return freed; + return freed ? freed : SHRINK_STOP; } static unsigned long @@ -561,19 +588,15 @@ rpcauth_lookup_credcache(struct rpc_auth *auth, struct auth_cred * acred, if (!entry->cr_ops->crmatch(acred, entry, flags)) continue; if (flags & RPCAUTH_LOOKUP_RCU) { - if (test_bit(RPCAUTH_CRED_HASHED, &entry->cr_flags) && - !test_bit(RPCAUTH_CRED_NEW, &entry->cr_flags)) - cred = entry; + if (test_bit(RPCAUTH_CRED_NEW, &entry->cr_flags) || + refcount_read(&entry->cr_count) == 0) + continue; + cred = entry; break; } - spin_lock(&cache->lock); - if (test_bit(RPCAUTH_CRED_HASHED, &entry->cr_flags) == 0) { - spin_unlock(&cache->lock); - continue; - } cred = get_rpccred(entry); - spin_unlock(&cache->lock); - break; + if (cred) + break; } rcu_read_unlock(); @@ -594,11 +617,13 @@ rpcauth_lookup_credcache(struct rpc_auth *auth, struct auth_cred * acred, if (!entry->cr_ops->crmatch(acred, entry, flags)) continue; cred = get_rpccred(entry); - break; + if (cred) + break; } if (cred == NULL) { cred = new; set_bit(RPCAUTH_CRED_HASHED, &cred->cr_flags); + refcount_inc(&cred->cr_count); hlist_add_head_rcu(&cred->cr_hash, &cache->hashtable[nr]); } else list_add_tail(&new->cr_lru, &free); @@ -645,7 +670,7 @@ rpcauth_init_cred(struct rpc_cred *cred, const struct auth_cred *acred, { INIT_HLIST_NODE(&cred->cr_hash); INIT_LIST_HEAD(&cred->cr_lru); - atomic_set(&cred->cr_count, 1); + refcount_set(&cred->cr_count, 1); cred->cr_auth = auth; cred->cr_ops = ops; cred->cr_expire = jiffies; @@ -713,36 +738,29 @@ put_rpccred(struct rpc_cred *cred) { if (cred == NULL) return; - /* Fast path for unhashed credentials */ - if (test_bit(RPCAUTH_CRED_HASHED, &cred->cr_flags) == 0) { - if (atomic_dec_and_test(&cred->cr_count)) - cred->cr_ops->crdestroy(cred); - return; - } - - if (!atomic_dec_and_lock(&cred->cr_count, &rpc_credcache_lock)) - return; - if (!list_empty(&cred->cr_lru)) { - number_cred_unused--; - list_del_init(&cred->cr_lru); - } - if (test_bit(RPCAUTH_CRED_HASHED, &cred->cr_flags) != 0) { - if (test_bit(RPCAUTH_CRED_UPTODATE, &cred->cr_flags) != 0) { - cred->cr_expire = jiffies; - list_add_tail(&cred->cr_lru, &cred_unused); - number_cred_unused++; - goto out_nodestroy; - } - if (!rpcauth_unhash_cred(cred)) { - /* We were hashed and someone looked us up... */ - goto out_nodestroy; - } + rcu_read_lock(); + if (refcount_dec_and_test(&cred->cr_count)) + goto destroy; + if (refcount_read(&cred->cr_count) != 1 || + !test_bit(RPCAUTH_CRED_HASHED, &cred->cr_flags)) + goto out; + if (test_bit(RPCAUTH_CRED_UPTODATE, &cred->cr_flags) != 0) { + cred->cr_expire = jiffies; + rpcauth_lru_add(cred); + /* Race breaker */ + if (unlikely(!test_bit(RPCAUTH_CRED_HASHED, &cred->cr_flags))) + rpcauth_lru_remove(cred); + } else if (rpcauth_unhash_cred(cred)) { + rpcauth_lru_remove(cred); + if (refcount_dec_and_test(&cred->cr_count)) + goto destroy; } - spin_unlock(&rpc_credcache_lock); - cred->cr_ops->crdestroy(cred); +out: + rcu_read_unlock(); return; -out_nodestroy: - spin_unlock(&rpc_credcache_lock); +destroy: + rcu_read_unlock(); + cred->cr_ops->crdestroy(cred); } EXPORT_SYMBOL_GPL(put_rpccred); @@ -817,6 +835,16 @@ rpcauth_unwrap_resp(struct rpc_task *task, kxdrdproc_t decode, void *rqstp, return rpcauth_unwrap_req_decode(decode, rqstp, data, obj); } +bool +rpcauth_xmit_need_reencode(struct rpc_task *task) +{ + struct rpc_cred *cred = task->tk_rqstp->rq_cred; + + if (!cred || !cred->cr_ops->crneed_reencode) + return false; + return cred->cr_ops->crneed_reencode(task); +} + int rpcauth_refreshcred(struct rpc_task *task) { diff --git a/net/sunrpc/auth_generic.c b/net/sunrpc/auth_generic.c index f1df9837f1ac..d8831b988b1e 100644 --- a/net/sunrpc/auth_generic.c +++ b/net/sunrpc/auth_generic.c @@ -274,7 +274,7 @@ static const struct rpc_authops generic_auth_ops = { static struct rpc_auth generic_auth = { .au_ops = &generic_auth_ops, - .au_count = ATOMIC_INIT(0), + .au_count = REFCOUNT_INIT(1), }; static bool generic_key_to_expire(struct rpc_cred *cred) diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c index 21c0aa0a0d1d..30f970cdc7f6 100644 --- a/net/sunrpc/auth_gss/auth_gss.c +++ b/net/sunrpc/auth_gss/auth_gss.c @@ -1058,7 +1058,7 @@ gss_create_new(const struct rpc_auth_create_args *args, struct rpc_clnt *clnt) auth->au_flavor = flavor; if (gss_pseudoflavor_to_datatouch(gss_auth->mech, flavor)) auth->au_flags |= RPCAUTH_AUTH_DATATOUCH; - atomic_set(&auth->au_count, 1); + refcount_set(&auth->au_count, 1); kref_init(&gss_auth->kref); err = rpcauth_init_credcache(auth); @@ -1187,7 +1187,7 @@ gss_auth_find_or_add_hashed(const struct rpc_auth_create_args *args, if (strcmp(gss_auth->target_name, args->target_name)) continue; } - if (!atomic_inc_not_zero(&gss_auth->rpc_auth.au_count)) + if (!refcount_inc_not_zero(&gss_auth->rpc_auth.au_count)) continue; goto out; } @@ -1984,6 +1984,46 @@ gss_unwrap_req_decode(kxdrdproc_t decode, struct rpc_rqst *rqstp, return decode(rqstp, &xdr, obj); } +static bool +gss_seq_is_newer(u32 new, u32 old) +{ + return (s32)(new - old) > 0; +} + +static bool +gss_xmit_need_reencode(struct rpc_task *task) +{ + struct rpc_rqst *req = task->tk_rqstp; + struct rpc_cred *cred = req->rq_cred; + struct gss_cl_ctx *ctx = gss_cred_get_ctx(cred); + u32 win, seq_xmit; + bool ret = true; + + if (!ctx) + return true; + + if (gss_seq_is_newer(req->rq_seqno, READ_ONCE(ctx->gc_seq))) + goto out; + + seq_xmit = READ_ONCE(ctx->gc_seq_xmit); + while (gss_seq_is_newer(req->rq_seqno, seq_xmit)) { + u32 tmp = seq_xmit; + + seq_xmit = cmpxchg(&ctx->gc_seq_xmit, tmp, req->rq_seqno); + if (seq_xmit == tmp) { + ret = false; + goto out; + } + } + + win = ctx->gc_win; + if (win > 0) + ret = !gss_seq_is_newer(req->rq_seqno, seq_xmit - win); +out: + gss_put_ctx(ctx); + return ret; +} + static int gss_unwrap_resp(struct rpc_task *task, kxdrdproc_t decode, void *rqstp, __be32 *p, void *obj) @@ -2052,6 +2092,7 @@ static const struct rpc_credops gss_credops = { .crunwrap_resp = gss_unwrap_resp, .crkey_timeout = gss_key_timeout, .crstringify_acceptor = gss_stringify_acceptor, + .crneed_reencode = gss_xmit_need_reencode, }; static const struct rpc_credops gss_nullops = { diff --git a/net/sunrpc/auth_gss/gss_krb5_crypto.c b/net/sunrpc/auth_gss/gss_krb5_crypto.c index 0220e1ca5280..4f43383971ba 100644 --- a/net/sunrpc/auth_gss/gss_krb5_crypto.c +++ b/net/sunrpc/auth_gss/gss_krb5_crypto.c @@ -53,7 +53,7 @@ u32 krb5_encrypt( - struct crypto_skcipher *tfm, + struct crypto_sync_skcipher *tfm, void * iv, void * in, void * out, @@ -62,24 +62,24 @@ krb5_encrypt( u32 ret = -EINVAL; struct scatterlist sg[1]; u8 local_iv[GSS_KRB5_MAX_BLOCKSIZE] = {0}; - SKCIPHER_REQUEST_ON_STACK(req, tfm); + SYNC_SKCIPHER_REQUEST_ON_STACK(req, tfm); - if (length % crypto_skcipher_blocksize(tfm) != 0) + if (length % crypto_sync_skcipher_blocksize(tfm) != 0) goto out; - if (crypto_skcipher_ivsize(tfm) > GSS_KRB5_MAX_BLOCKSIZE) { + if (crypto_sync_skcipher_ivsize(tfm) > GSS_KRB5_MAX_BLOCKSIZE) { dprintk("RPC: gss_k5encrypt: tfm iv size too large %d\n", - crypto_skcipher_ivsize(tfm)); + crypto_sync_skcipher_ivsize(tfm)); goto out; } if (iv) - memcpy(local_iv, iv, crypto_skcipher_ivsize(tfm)); + memcpy(local_iv, iv, crypto_sync_skcipher_ivsize(tfm)); memcpy(out, in, length); sg_init_one(sg, out, length); - skcipher_request_set_tfm(req, tfm); + skcipher_request_set_sync_tfm(req, tfm); skcipher_request_set_callback(req, 0, NULL, NULL); skcipher_request_set_crypt(req, sg, sg, length, local_iv); @@ -92,7 +92,7 @@ out: u32 krb5_decrypt( - struct crypto_skcipher *tfm, + struct crypto_sync_skcipher *tfm, void * iv, void * in, void * out, @@ -101,23 +101,23 @@ krb5_decrypt( u32 ret = -EINVAL; struct scatterlist sg[1]; u8 local_iv[GSS_KRB5_MAX_BLOCKSIZE] = {0}; - SKCIPHER_REQUEST_ON_STACK(req, tfm); + SYNC_SKCIPHER_REQUEST_ON_STACK(req, tfm); - if (length % crypto_skcipher_blocksize(tfm) != 0) + if (length % crypto_sync_skcipher_blocksize(tfm) != 0) goto out; - if (crypto_skcipher_ivsize(tfm) > GSS_KRB5_MAX_BLOCKSIZE) { + if (crypto_sync_skcipher_ivsize(tfm) > GSS_KRB5_MAX_BLOCKSIZE) { dprintk("RPC: gss_k5decrypt: tfm iv size too large %d\n", - crypto_skcipher_ivsize(tfm)); + crypto_sync_skcipher_ivsize(tfm)); goto out; } if (iv) - memcpy(local_iv,iv, crypto_skcipher_ivsize(tfm)); + memcpy(local_iv, iv, crypto_sync_skcipher_ivsize(tfm)); memcpy(out, in, length); sg_init_one(sg, out, length); - skcipher_request_set_tfm(req, tfm); + skcipher_request_set_sync_tfm(req, tfm); skcipher_request_set_callback(req, 0, NULL, NULL); skcipher_request_set_crypt(req, sg, sg, length, local_iv); @@ -466,7 +466,8 @@ encryptor(struct scatterlist *sg, void *data) { struct encryptor_desc *desc = data; struct xdr_buf *outbuf = desc->outbuf; - struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(desc->req); + struct crypto_sync_skcipher *tfm = + crypto_sync_skcipher_reqtfm(desc->req); struct page *in_page; int thislen = desc->fraglen + sg->length; int fraglen, ret; @@ -492,7 +493,7 @@ encryptor(struct scatterlist *sg, void *data) desc->fraglen += sg->length; desc->pos += sg->length; - fraglen = thislen & (crypto_skcipher_blocksize(tfm) - 1); + fraglen = thislen & (crypto_sync_skcipher_blocksize(tfm) - 1); thislen -= fraglen; if (thislen == 0) @@ -526,16 +527,16 @@ encryptor(struct scatterlist *sg, void *data) } int -gss_encrypt_xdr_buf(struct crypto_skcipher *tfm, struct xdr_buf *buf, +gss_encrypt_xdr_buf(struct crypto_sync_skcipher *tfm, struct xdr_buf *buf, int offset, struct page **pages) { int ret; struct encryptor_desc desc; - SKCIPHER_REQUEST_ON_STACK(req, tfm); + SYNC_SKCIPHER_REQUEST_ON_STACK(req, tfm); - BUG_ON((buf->len - offset) % crypto_skcipher_blocksize(tfm) != 0); + BUG_ON((buf->len - offset) % crypto_sync_skcipher_blocksize(tfm) != 0); - skcipher_request_set_tfm(req, tfm); + skcipher_request_set_sync_tfm(req, tfm); skcipher_request_set_callback(req, 0, NULL, NULL); memset(desc.iv, 0, sizeof(desc.iv)); @@ -567,7 +568,8 @@ decryptor(struct scatterlist *sg, void *data) { struct decryptor_desc *desc = data; int thislen = desc->fraglen + sg->length; - struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(desc->req); + struct crypto_sync_skcipher *tfm = + crypto_sync_skcipher_reqtfm(desc->req); int fraglen, ret; /* Worst case is 4 fragments: head, end of page 1, start @@ -578,7 +580,7 @@ decryptor(struct scatterlist *sg, void *data) desc->fragno++; desc->fraglen += sg->length; - fraglen = thislen & (crypto_skcipher_blocksize(tfm) - 1); + fraglen = thislen & (crypto_sync_skcipher_blocksize(tfm) - 1); thislen -= fraglen; if (thislen == 0) @@ -608,17 +610,17 @@ decryptor(struct scatterlist *sg, void *data) } int -gss_decrypt_xdr_buf(struct crypto_skcipher *tfm, struct xdr_buf *buf, +gss_decrypt_xdr_buf(struct crypto_sync_skcipher *tfm, struct xdr_buf *buf, int offset) { int ret; struct decryptor_desc desc; - SKCIPHER_REQUEST_ON_STACK(req, tfm); + SYNC_SKCIPHER_REQUEST_ON_STACK(req, tfm); /* XXXJBF: */ - BUG_ON((buf->len - offset) % crypto_skcipher_blocksize(tfm) != 0); + BUG_ON((buf->len - offset) % crypto_sync_skcipher_blocksize(tfm) != 0); - skcipher_request_set_tfm(req, tfm); + skcipher_request_set_sync_tfm(req, tfm); skcipher_request_set_callback(req, 0, NULL, NULL); memset(desc.iv, 0, sizeof(desc.iv)); @@ -672,12 +674,12 @@ xdr_extend_head(struct xdr_buf *buf, unsigned int base, unsigned int shiftlen) } static u32 -gss_krb5_cts_crypt(struct crypto_skcipher *cipher, struct xdr_buf *buf, +gss_krb5_cts_crypt(struct crypto_sync_skcipher *cipher, struct xdr_buf *buf, u32 offset, u8 *iv, struct page **pages, int encrypt) { u32 ret; struct scatterlist sg[1]; - SKCIPHER_REQUEST_ON_STACK(req, cipher); + SYNC_SKCIPHER_REQUEST_ON_STACK(req, cipher); u8 *data; struct page **save_pages; u32 len = buf->len - offset; @@ -706,7 +708,7 @@ gss_krb5_cts_crypt(struct crypto_skcipher *cipher, struct xdr_buf *buf, sg_init_one(sg, data, len); - skcipher_request_set_tfm(req, cipher); + skcipher_request_set_sync_tfm(req, cipher); skcipher_request_set_callback(req, 0, NULL, NULL); skcipher_request_set_crypt(req, sg, sg, len, iv); @@ -735,7 +737,7 @@ gss_krb5_aes_encrypt(struct krb5_ctx *kctx, u32 offset, struct xdr_netobj hmac; u8 *cksumkey; u8 *ecptr; - struct crypto_skcipher *cipher, *aux_cipher; + struct crypto_sync_skcipher *cipher, *aux_cipher; int blocksize; struct page **save_pages; int nblocks, nbytes; @@ -754,7 +756,7 @@ gss_krb5_aes_encrypt(struct krb5_ctx *kctx, u32 offset, cksumkey = kctx->acceptor_integ; usage = KG_USAGE_ACCEPTOR_SEAL; } - blocksize = crypto_skcipher_blocksize(cipher); + blocksize = crypto_sync_skcipher_blocksize(cipher); /* hide the gss token header and insert the confounder */ offset += GSS_KRB5_TOK_HDR_LEN; @@ -807,7 +809,7 @@ gss_krb5_aes_encrypt(struct krb5_ctx *kctx, u32 offset, memset(desc.iv, 0, sizeof(desc.iv)); if (cbcbytes) { - SKCIPHER_REQUEST_ON_STACK(req, aux_cipher); + SYNC_SKCIPHER_REQUEST_ON_STACK(req, aux_cipher); desc.pos = offset + GSS_KRB5_TOK_HDR_LEN; desc.fragno = 0; @@ -816,7 +818,7 @@ gss_krb5_aes_encrypt(struct krb5_ctx *kctx, u32 offset, desc.outbuf = buf; desc.req = req; - skcipher_request_set_tfm(req, aux_cipher); + skcipher_request_set_sync_tfm(req, aux_cipher); skcipher_request_set_callback(req, 0, NULL, NULL); sg_init_table(desc.infrags, 4); @@ -855,7 +857,7 @@ gss_krb5_aes_decrypt(struct krb5_ctx *kctx, u32 offset, struct xdr_buf *buf, struct xdr_buf subbuf; u32 ret = 0; u8 *cksum_key; - struct crypto_skcipher *cipher, *aux_cipher; + struct crypto_sync_skcipher *cipher, *aux_cipher; struct xdr_netobj our_hmac_obj; u8 our_hmac[GSS_KRB5_MAX_CKSUM_LEN]; u8 pkt_hmac[GSS_KRB5_MAX_CKSUM_LEN]; @@ -874,7 +876,7 @@ gss_krb5_aes_decrypt(struct krb5_ctx *kctx, u32 offset, struct xdr_buf *buf, cksum_key = kctx->initiator_integ; usage = KG_USAGE_INITIATOR_SEAL; } - blocksize = crypto_skcipher_blocksize(cipher); + blocksize = crypto_sync_skcipher_blocksize(cipher); /* create a segment skipping the header and leaving out the checksum */ @@ -891,13 +893,13 @@ gss_krb5_aes_decrypt(struct krb5_ctx *kctx, u32 offset, struct xdr_buf *buf, memset(desc.iv, 0, sizeof(desc.iv)); if (cbcbytes) { - SKCIPHER_REQUEST_ON_STACK(req, aux_cipher); + SYNC_SKCIPHER_REQUEST_ON_STACK(req, aux_cipher); desc.fragno = 0; desc.fraglen = 0; desc.req = req; - skcipher_request_set_tfm(req, aux_cipher); + skcipher_request_set_sync_tfm(req, aux_cipher); skcipher_request_set_callback(req, 0, NULL, NULL); sg_init_table(desc.frags, 4); @@ -946,7 +948,8 @@ out_err: * Set the key of the given cipher. */ int -krb5_rc4_setup_seq_key(struct krb5_ctx *kctx, struct crypto_skcipher *cipher, +krb5_rc4_setup_seq_key(struct krb5_ctx *kctx, + struct crypto_sync_skcipher *cipher, unsigned char *cksum) { struct crypto_shash *hmac; @@ -994,7 +997,7 @@ krb5_rc4_setup_seq_key(struct krb5_ctx *kctx, struct crypto_skcipher *cipher, if (err) goto out_err; - err = crypto_skcipher_setkey(cipher, Kseq, kctx->gk5e->keylength); + err = crypto_sync_skcipher_setkey(cipher, Kseq, kctx->gk5e->keylength); if (err) goto out_err; @@ -1012,7 +1015,8 @@ out_err: * Set the key of cipher kctx->enc. */ int -krb5_rc4_setup_enc_key(struct krb5_ctx *kctx, struct crypto_skcipher *cipher, +krb5_rc4_setup_enc_key(struct krb5_ctx *kctx, + struct crypto_sync_skcipher *cipher, s32 seqnum) { struct crypto_shash *hmac; @@ -1069,7 +1073,8 @@ krb5_rc4_setup_enc_key(struct krb5_ctx *kctx, struct crypto_skcipher *cipher, if (err) goto out_err; - err = crypto_skcipher_setkey(cipher, Kcrypt, kctx->gk5e->keylength); + err = crypto_sync_skcipher_setkey(cipher, Kcrypt, + kctx->gk5e->keylength); if (err) goto out_err; diff --git a/net/sunrpc/auth_gss/gss_krb5_keys.c b/net/sunrpc/auth_gss/gss_krb5_keys.c index f7fe2d2b851f..550fdf18d3b3 100644 --- a/net/sunrpc/auth_gss/gss_krb5_keys.c +++ b/net/sunrpc/auth_gss/gss_krb5_keys.c @@ -147,7 +147,7 @@ u32 krb5_derive_key(const struct gss_krb5_enctype *gk5e, size_t blocksize, keybytes, keylength, n; unsigned char *inblockdata, *outblockdata, *rawkey; struct xdr_netobj inblock, outblock; - struct crypto_skcipher *cipher; + struct crypto_sync_skcipher *cipher; u32 ret = EINVAL; blocksize = gk5e->blocksize; @@ -157,11 +157,10 @@ u32 krb5_derive_key(const struct gss_krb5_enctype *gk5e, if ((inkey->len != keylength) || (outkey->len != keylength)) goto err_return; - cipher = crypto_alloc_skcipher(gk5e->encrypt_name, 0, - CRYPTO_ALG_ASYNC); + cipher = crypto_alloc_sync_skcipher(gk5e->encrypt_name, 0, 0); if (IS_ERR(cipher)) goto err_return; - if (crypto_skcipher_setkey(cipher, inkey->data, inkey->len)) + if (crypto_sync_skcipher_setkey(cipher, inkey->data, inkey->len)) goto err_return; /* allocate and set up buffers */ @@ -238,7 +237,7 @@ err_free_in: memset(inblockdata, 0, blocksize); kfree(inblockdata); err_free_cipher: - crypto_free_skcipher(cipher); + crypto_free_sync_skcipher(cipher); err_return: return ret; } diff --git a/net/sunrpc/auth_gss/gss_krb5_mech.c b/net/sunrpc/auth_gss/gss_krb5_mech.c index 7bb2514aadd9..7f0424dfa8f6 100644 --- a/net/sunrpc/auth_gss/gss_krb5_mech.c +++ b/net/sunrpc/auth_gss/gss_krb5_mech.c @@ -218,7 +218,7 @@ simple_get_netobj(const void *p, const void *end, struct xdr_netobj *res) static inline const void * get_key(const void *p, const void *end, - struct krb5_ctx *ctx, struct crypto_skcipher **res) + struct krb5_ctx *ctx, struct crypto_sync_skcipher **res) { struct xdr_netobj key; int alg; @@ -246,15 +246,14 @@ get_key(const void *p, const void *end, if (IS_ERR(p)) goto out_err; - *res = crypto_alloc_skcipher(ctx->gk5e->encrypt_name, 0, - CRYPTO_ALG_ASYNC); + *res = crypto_alloc_sync_skcipher(ctx->gk5e->encrypt_name, 0, 0); if (IS_ERR(*res)) { printk(KERN_WARNING "gss_kerberos_mech: unable to initialize " "crypto algorithm %s\n", ctx->gk5e->encrypt_name); *res = NULL; goto out_err_free_key; } - if (crypto_skcipher_setkey(*res, key.data, key.len)) { + if (crypto_sync_skcipher_setkey(*res, key.data, key.len)) { printk(KERN_WARNING "gss_kerberos_mech: error setting key for " "crypto algorithm %s\n", ctx->gk5e->encrypt_name); goto out_err_free_tfm; @@ -264,7 +263,7 @@ get_key(const void *p, const void *end, return p; out_err_free_tfm: - crypto_free_skcipher(*res); + crypto_free_sync_skcipher(*res); out_err_free_key: kfree(key.data); p = ERR_PTR(-EINVAL); @@ -336,30 +335,30 @@ gss_import_v1_context(const void *p, const void *end, struct krb5_ctx *ctx) return 0; out_err_free_key2: - crypto_free_skcipher(ctx->seq); + crypto_free_sync_skcipher(ctx->seq); out_err_free_key1: - crypto_free_skcipher(ctx->enc); + crypto_free_sync_skcipher(ctx->enc); out_err_free_mech: kfree(ctx->mech_used.data); out_err: return PTR_ERR(p); } -static struct crypto_skcipher * +static struct crypto_sync_skcipher * context_v2_alloc_cipher(struct krb5_ctx *ctx, const char *cname, u8 *key) { - struct crypto_skcipher *cp; + struct crypto_sync_skcipher *cp; - cp = crypto_alloc_skcipher(cname, 0, CRYPTO_ALG_ASYNC); + cp = crypto_alloc_sync_skcipher(cname, 0, 0); if (IS_ERR(cp)) { dprintk("gss_kerberos_mech: unable to initialize " "crypto algorithm %s\n", cname); return NULL; } - if (crypto_skcipher_setkey(cp, key, ctx->gk5e->keylength)) { + if (crypto_sync_skcipher_setkey(cp, key, ctx->gk5e->keylength)) { dprintk("gss_kerberos_mech: error setting key for " "crypto algorithm %s\n", cname); - crypto_free_skcipher(cp); + crypto_free_sync_skcipher(cp); return NULL; } return cp; @@ -413,9 +412,9 @@ context_derive_keys_des3(struct krb5_ctx *ctx, gfp_t gfp_mask) return 0; out_free_enc: - crypto_free_skcipher(ctx->enc); + crypto_free_sync_skcipher(ctx->enc); out_free_seq: - crypto_free_skcipher(ctx->seq); + crypto_free_sync_skcipher(ctx->seq); out_err: return -EINVAL; } @@ -469,17 +468,15 @@ context_derive_keys_rc4(struct krb5_ctx *ctx) /* * allocate hash, and skciphers for data and seqnum encryption */ - ctx->enc = crypto_alloc_skcipher(ctx->gk5e->encrypt_name, 0, - CRYPTO_ALG_ASYNC); + ctx->enc = crypto_alloc_sync_skcipher(ctx->gk5e->encrypt_name, 0, 0); if (IS_ERR(ctx->enc)) { err = PTR_ERR(ctx->enc); goto out_err_free_hmac; } - ctx->seq = crypto_alloc_skcipher(ctx->gk5e->encrypt_name, 0, - CRYPTO_ALG_ASYNC); + ctx->seq = crypto_alloc_sync_skcipher(ctx->gk5e->encrypt_name, 0, 0); if (IS_ERR(ctx->seq)) { - crypto_free_skcipher(ctx->enc); + crypto_free_sync_skcipher(ctx->enc); err = PTR_ERR(ctx->seq); goto out_err_free_hmac; } @@ -591,7 +588,7 @@ context_derive_keys_new(struct krb5_ctx *ctx, gfp_t gfp_mask) context_v2_alloc_cipher(ctx, "cbc(aes)", ctx->acceptor_seal); if (ctx->acceptor_enc_aux == NULL) { - crypto_free_skcipher(ctx->initiator_enc_aux); + crypto_free_sync_skcipher(ctx->initiator_enc_aux); goto out_free_acceptor_enc; } } @@ -599,9 +596,9 @@ context_derive_keys_new(struct krb5_ctx *ctx, gfp_t gfp_mask) return 0; out_free_acceptor_enc: - crypto_free_skcipher(ctx->acceptor_enc); + crypto_free_sync_skcipher(ctx->acceptor_enc); out_free_initiator_enc: - crypto_free_skcipher(ctx->initiator_enc); + crypto_free_sync_skcipher(ctx->initiator_enc); out_err: return -EINVAL; } @@ -713,12 +710,12 @@ static void gss_delete_sec_context_kerberos(void *internal_ctx) { struct krb5_ctx *kctx = internal_ctx; - crypto_free_skcipher(kctx->seq); - crypto_free_skcipher(kctx->enc); - crypto_free_skcipher(kctx->acceptor_enc); - crypto_free_skcipher(kctx->initiator_enc); - crypto_free_skcipher(kctx->acceptor_enc_aux); - crypto_free_skcipher(kctx->initiator_enc_aux); + crypto_free_sync_skcipher(kctx->seq); + crypto_free_sync_skcipher(kctx->enc); + crypto_free_sync_skcipher(kctx->acceptor_enc); + crypto_free_sync_skcipher(kctx->initiator_enc); + crypto_free_sync_skcipher(kctx->acceptor_enc_aux); + crypto_free_sync_skcipher(kctx->initiator_enc_aux); kfree(kctx->mech_used.data); kfree(kctx); } diff --git a/net/sunrpc/auth_gss/gss_krb5_seal.c b/net/sunrpc/auth_gss/gss_krb5_seal.c index eaad9bc7a0bd..b4adeb06660b 100644 --- a/net/sunrpc/auth_gss/gss_krb5_seal.c +++ b/net/sunrpc/auth_gss/gss_krb5_seal.c @@ -63,13 +63,12 @@ #include <linux/sunrpc/gss_krb5.h> #include <linux/random.h> #include <linux/crypto.h> +#include <linux/atomic.h> #if IS_ENABLED(CONFIG_SUNRPC_DEBUG) # define RPCDBG_FACILITY RPCDBG_AUTH #endif -DEFINE_SPINLOCK(krb5_seq_lock); - static void * setup_token(struct krb5_ctx *ctx, struct xdr_netobj *token) { @@ -124,6 +123,30 @@ setup_token_v2(struct krb5_ctx *ctx, struct xdr_netobj *token) return krb5_hdr; } +u32 +gss_seq_send_fetch_and_inc(struct krb5_ctx *ctx) +{ + u32 old, seq_send = READ_ONCE(ctx->seq_send); + + do { + old = seq_send; + seq_send = cmpxchg(&ctx->seq_send, old, old + 1); + } while (old != seq_send); + return seq_send; +} + +u64 +gss_seq_send64_fetch_and_inc(struct krb5_ctx *ctx) +{ + u64 old, seq_send = READ_ONCE(ctx->seq_send); + + do { + old = seq_send; + seq_send = cmpxchg64(&ctx->seq_send64, old, old + 1); + } while (old != seq_send); + return seq_send; +} + static u32 gss_get_mic_v1(struct krb5_ctx *ctx, struct xdr_buf *text, struct xdr_netobj *token) @@ -154,9 +177,7 @@ gss_get_mic_v1(struct krb5_ctx *ctx, struct xdr_buf *text, memcpy(ptr + GSS_KRB5_TOK_HDR_LEN, md5cksum.data, md5cksum.len); - spin_lock(&krb5_seq_lock); - seq_send = ctx->seq_send++; - spin_unlock(&krb5_seq_lock); + seq_send = gss_seq_send_fetch_and_inc(ctx); if (krb5_make_seq_num(ctx, ctx->seq, ctx->initiate ? 0 : 0xff, seq_send, ptr + GSS_KRB5_TOK_HDR_LEN, ptr + 8)) @@ -174,7 +195,6 @@ gss_get_mic_v2(struct krb5_ctx *ctx, struct xdr_buf *text, .data = cksumdata}; void *krb5_hdr; s32 now; - u64 seq_send; u8 *cksumkey; unsigned int cksum_usage; __be64 seq_send_be64; @@ -185,11 +205,7 @@ gss_get_mic_v2(struct krb5_ctx *ctx, struct xdr_buf *text, /* Set up the sequence number. Now 64-bits in clear * text and w/o direction indicator */ - spin_lock(&krb5_seq_lock); - seq_send = ctx->seq_send64++; - spin_unlock(&krb5_seq_lock); - - seq_send_be64 = cpu_to_be64(seq_send); + seq_send_be64 = cpu_to_be64(gss_seq_send64_fetch_and_inc(ctx)); memcpy(krb5_hdr + 8, (char *) &seq_send_be64, 8); if (ctx->initiate) { diff --git a/net/sunrpc/auth_gss/gss_krb5_seqnum.c b/net/sunrpc/auth_gss/gss_krb5_seqnum.c index c8b9082f4a9d..fb6656295204 100644 --- a/net/sunrpc/auth_gss/gss_krb5_seqnum.c +++ b/net/sunrpc/auth_gss/gss_krb5_seqnum.c @@ -43,13 +43,12 @@ static s32 krb5_make_rc4_seq_num(struct krb5_ctx *kctx, int direction, s32 seqnum, unsigned char *cksum, unsigned char *buf) { - struct crypto_skcipher *cipher; + struct crypto_sync_skcipher *cipher; unsigned char plain[8]; s32 code; dprintk("RPC: %s:\n", __func__); - cipher = crypto_alloc_skcipher(kctx->gk5e->encrypt_name, 0, - CRYPTO_ALG_ASYNC); + cipher = crypto_alloc_sync_skcipher(kctx->gk5e->encrypt_name, 0, 0); if (IS_ERR(cipher)) return PTR_ERR(cipher); @@ -68,12 +67,12 @@ krb5_make_rc4_seq_num(struct krb5_ctx *kctx, int direction, s32 seqnum, code = krb5_encrypt(cipher, cksum, plain, buf, 8); out: - crypto_free_skcipher(cipher); + crypto_free_sync_skcipher(cipher); return code; } s32 krb5_make_seq_num(struct krb5_ctx *kctx, - struct crypto_skcipher *key, + struct crypto_sync_skcipher *key, int direction, u32 seqnum, unsigned char *cksum, unsigned char *buf) @@ -101,13 +100,12 @@ static s32 krb5_get_rc4_seq_num(struct krb5_ctx *kctx, unsigned char *cksum, unsigned char *buf, int *direction, s32 *seqnum) { - struct crypto_skcipher *cipher; + struct crypto_sync_skcipher *cipher; unsigned char plain[8]; s32 code; dprintk("RPC: %s:\n", __func__); - cipher = crypto_alloc_skcipher(kctx->gk5e->encrypt_name, 0, - CRYPTO_ALG_ASYNC); + cipher = crypto_alloc_sync_skcipher(kctx->gk5e->encrypt_name, 0, 0); if (IS_ERR(cipher)) return PTR_ERR(cipher); @@ -130,7 +128,7 @@ krb5_get_rc4_seq_num(struct krb5_ctx *kctx, unsigned char *cksum, *seqnum = ((plain[0] << 24) | (plain[1] << 16) | (plain[2] << 8) | (plain[3])); out: - crypto_free_skcipher(cipher); + crypto_free_sync_skcipher(cipher); return code; } @@ -142,7 +140,7 @@ krb5_get_seq_num(struct krb5_ctx *kctx, { s32 code; unsigned char plain[8]; - struct crypto_skcipher *key = kctx->seq; + struct crypto_sync_skcipher *key = kctx->seq; dprintk("RPC: krb5_get_seq_num:\n"); diff --git a/net/sunrpc/auth_gss/gss_krb5_wrap.c b/net/sunrpc/auth_gss/gss_krb5_wrap.c index 39a2e672900b..962fa84e6db1 100644 --- a/net/sunrpc/auth_gss/gss_krb5_wrap.c +++ b/net/sunrpc/auth_gss/gss_krb5_wrap.c @@ -174,7 +174,7 @@ gss_wrap_kerberos_v1(struct krb5_ctx *kctx, int offset, now = get_seconds(); - blocksize = crypto_skcipher_blocksize(kctx->enc); + blocksize = crypto_sync_skcipher_blocksize(kctx->enc); gss_krb5_add_padding(buf, offset, blocksize); BUG_ON((buf->len - offset) % blocksize); plainlen = conflen + buf->len - offset; @@ -228,9 +228,7 @@ gss_wrap_kerberos_v1(struct krb5_ctx *kctx, int offset, memcpy(ptr + GSS_KRB5_TOK_HDR_LEN, md5cksum.data, md5cksum.len); - spin_lock(&krb5_seq_lock); - seq_send = kctx->seq_send++; - spin_unlock(&krb5_seq_lock); + seq_send = gss_seq_send_fetch_and_inc(kctx); /* XXX would probably be more efficient to compute checksum * and encrypt at the same time: */ @@ -239,10 +237,10 @@ gss_wrap_kerberos_v1(struct krb5_ctx *kctx, int offset, return GSS_S_FAILURE; if (kctx->enctype == ENCTYPE_ARCFOUR_HMAC) { - struct crypto_skcipher *cipher; + struct crypto_sync_skcipher *cipher; int err; - cipher = crypto_alloc_skcipher(kctx->gk5e->encrypt_name, 0, - CRYPTO_ALG_ASYNC); + cipher = crypto_alloc_sync_skcipher(kctx->gk5e->encrypt_name, + 0, 0); if (IS_ERR(cipher)) return GSS_S_FAILURE; @@ -250,7 +248,7 @@ gss_wrap_kerberos_v1(struct krb5_ctx *kctx, int offset, err = gss_encrypt_xdr_buf(cipher, buf, offset + headlen - conflen, pages); - crypto_free_skcipher(cipher); + crypto_free_sync_skcipher(cipher); if (err) return GSS_S_FAILURE; } else { @@ -327,18 +325,18 @@ gss_unwrap_kerberos_v1(struct krb5_ctx *kctx, int offset, struct xdr_buf *buf) return GSS_S_BAD_SIG; if (kctx->enctype == ENCTYPE_ARCFOUR_HMAC) { - struct crypto_skcipher *cipher; + struct crypto_sync_skcipher *cipher; int err; - cipher = crypto_alloc_skcipher(kctx->gk5e->encrypt_name, 0, - CRYPTO_ALG_ASYNC); + cipher = crypto_alloc_sync_skcipher(kctx->gk5e->encrypt_name, + 0, 0); if (IS_ERR(cipher)) return GSS_S_FAILURE; krb5_rc4_setup_enc_key(kctx, cipher, seqnum); err = gss_decrypt_xdr_buf(cipher, buf, crypt_offset); - crypto_free_skcipher(cipher); + crypto_free_sync_skcipher(cipher); if (err) return GSS_S_DEFECTIVE_TOKEN; } else { @@ -371,7 +369,7 @@ gss_unwrap_kerberos_v1(struct krb5_ctx *kctx, int offset, struct xdr_buf *buf) /* Copy the data back to the right position. XXX: Would probably be * better to copy and encrypt at the same time. */ - blocksize = crypto_skcipher_blocksize(kctx->enc); + blocksize = crypto_sync_skcipher_blocksize(kctx->enc); data_start = ptr + (GSS_KRB5_TOK_HDR_LEN + kctx->gk5e->cksumlength) + conflen; orig_start = buf->head[0].iov_base + offset; @@ -477,9 +475,7 @@ gss_wrap_kerberos_v2(struct krb5_ctx *kctx, u32 offset, *be16ptr++ = 0; be64ptr = (__be64 *)be16ptr; - spin_lock(&krb5_seq_lock); - *be64ptr = cpu_to_be64(kctx->seq_send64++); - spin_unlock(&krb5_seq_lock); + *be64ptr = cpu_to_be64(gss_seq_send64_fetch_and_inc(kctx)); err = (*kctx->gk5e->encrypt_v2)(kctx, offset, buf, pages); if (err) diff --git a/net/sunrpc/auth_gss/gss_mech_switch.c b/net/sunrpc/auth_gss/gss_mech_switch.c index 5fec3abbe19b..16ac0f4cb7d8 100644 --- a/net/sunrpc/auth_gss/gss_mech_switch.c +++ b/net/sunrpc/auth_gss/gss_mech_switch.c @@ -117,7 +117,7 @@ int gss_mech_register(struct gss_api_mech *gm) if (status) return status; spin_lock(®istered_mechs_lock); - list_add(&gm->gm_list, ®istered_mechs); + list_add_rcu(&gm->gm_list, ®istered_mechs); spin_unlock(®istered_mechs_lock); dprintk("RPC: registered gss mechanism %s\n", gm->gm_name); return 0; @@ -132,7 +132,7 @@ EXPORT_SYMBOL_GPL(gss_mech_register); void gss_mech_unregister(struct gss_api_mech *gm) { spin_lock(®istered_mechs_lock); - list_del(&gm->gm_list); + list_del_rcu(&gm->gm_list); spin_unlock(®istered_mechs_lock); dprintk("RPC: unregistered gss mechanism %s\n", gm->gm_name); gss_mech_free(gm); @@ -151,15 +151,15 @@ _gss_mech_get_by_name(const char *name) { struct gss_api_mech *pos, *gm = NULL; - spin_lock(®istered_mechs_lock); - list_for_each_entry(pos, ®istered_mechs, gm_list) { + rcu_read_lock(); + list_for_each_entry_rcu(pos, ®istered_mechs, gm_list) { if (0 == strcmp(name, pos->gm_name)) { if (try_module_get(pos->gm_owner)) gm = pos; break; } } - spin_unlock(®istered_mechs_lock); + rcu_read_unlock(); return gm; } @@ -186,8 +186,8 @@ struct gss_api_mech *gss_mech_get_by_OID(struct rpcsec_gss_oid *obj) dprintk("RPC: %s(%s)\n", __func__, buf); request_module("rpc-auth-gss-%s", buf); - spin_lock(®istered_mechs_lock); - list_for_each_entry(pos, ®istered_mechs, gm_list) { + rcu_read_lock(); + list_for_each_entry_rcu(pos, ®istered_mechs, gm_list) { if (obj->len == pos->gm_oid.len) { if (0 == memcmp(obj->data, pos->gm_oid.data, obj->len)) { if (try_module_get(pos->gm_owner)) @@ -196,7 +196,7 @@ struct gss_api_mech *gss_mech_get_by_OID(struct rpcsec_gss_oid *obj) } } } - spin_unlock(®istered_mechs_lock); + rcu_read_unlock(); return gm; } @@ -216,15 +216,15 @@ static struct gss_api_mech *_gss_mech_get_by_pseudoflavor(u32 pseudoflavor) { struct gss_api_mech *gm = NULL, *pos; - spin_lock(®istered_mechs_lock); - list_for_each_entry(pos, ®istered_mechs, gm_list) { + rcu_read_lock(); + list_for_each_entry_rcu(pos, ®istered_mechs, gm_list) { if (!mech_supports_pseudoflavor(pos, pseudoflavor)) continue; if (try_module_get(pos->gm_owner)) gm = pos; break; } - spin_unlock(®istered_mechs_lock); + rcu_read_unlock(); return gm; } @@ -257,8 +257,8 @@ int gss_mech_list_pseudoflavors(rpc_authflavor_t *array_ptr, int size) struct gss_api_mech *pos = NULL; int j, i = 0; - spin_lock(®istered_mechs_lock); - list_for_each_entry(pos, ®istered_mechs, gm_list) { + rcu_read_lock(); + list_for_each_entry_rcu(pos, ®istered_mechs, gm_list) { for (j = 0; j < pos->gm_pf_num; j++) { if (i >= size) { spin_unlock(®istered_mechs_lock); @@ -267,7 +267,7 @@ int gss_mech_list_pseudoflavors(rpc_authflavor_t *array_ptr, int size) array_ptr[i++] = pos->gm_pfs[j].pseudoflavor; } } - spin_unlock(®istered_mechs_lock); + rcu_read_unlock(); return i; } diff --git a/net/sunrpc/auth_gss/gss_rpc_xdr.c b/net/sunrpc/auth_gss/gss_rpc_xdr.c index 444380f968f1..006062ad5f58 100644 --- a/net/sunrpc/auth_gss/gss_rpc_xdr.c +++ b/net/sunrpc/auth_gss/gss_rpc_xdr.c @@ -784,6 +784,7 @@ void gssx_enc_accept_sec_context(struct rpc_rqst *req, xdr_inline_pages(&req->rq_rcv_buf, PAGE_SIZE/2 /* pretty arbitrary */, arg->pages, 0 /* page base */, arg->npages * PAGE_SIZE); + req->rq_rcv_buf.flags |= XDRBUF_SPARSE_PAGES; done: if (err) dprintk("RPC: gssx_enc_accept_sec_context: %d\n", err); diff --git a/net/sunrpc/auth_gss/svcauth_gss.c b/net/sunrpc/auth_gss/svcauth_gss.c index 860f2a1bbb67..1ece4bc3eb8d 100644 --- a/net/sunrpc/auth_gss/svcauth_gss.c +++ b/net/sunrpc/auth_gss/svcauth_gss.c @@ -76,6 +76,7 @@ struct rsi { struct xdr_netobj in_handle, in_token; struct xdr_netobj out_handle, out_token; int major_status, minor_status; + struct rcu_head rcu_head; }; static struct rsi *rsi_update(struct cache_detail *cd, struct rsi *new, struct rsi *old); @@ -89,13 +90,21 @@ static void rsi_free(struct rsi *rsii) kfree(rsii->out_token.data); } -static void rsi_put(struct kref *ref) +static void rsi_free_rcu(struct rcu_head *head) { - struct rsi *rsii = container_of(ref, struct rsi, h.ref); + struct rsi *rsii = container_of(head, struct rsi, rcu_head); + rsi_free(rsii); kfree(rsii); } +static void rsi_put(struct kref *ref) +{ + struct rsi *rsii = container_of(ref, struct rsi, h.ref); + + call_rcu(&rsii->rcu_head, rsi_free_rcu); +} + static inline int rsi_hash(struct rsi *item) { return hash_mem(item->in_handle.data, item->in_handle.len, RSI_HASHBITS) @@ -282,7 +291,7 @@ static struct rsi *rsi_lookup(struct cache_detail *cd, struct rsi *item) struct cache_head *ch; int hash = rsi_hash(item); - ch = sunrpc_cache_lookup(cd, &item->h, hash); + ch = sunrpc_cache_lookup_rcu(cd, &item->h, hash); if (ch) return container_of(ch, struct rsi, h); else @@ -330,6 +339,7 @@ struct rsc { struct svc_cred cred; struct gss_svc_seq_data seqdata; struct gss_ctx *mechctx; + struct rcu_head rcu_head; }; static struct rsc *rsc_update(struct cache_detail *cd, struct rsc *new, struct rsc *old); @@ -343,12 +353,22 @@ static void rsc_free(struct rsc *rsci) free_svc_cred(&rsci->cred); } +static void rsc_free_rcu(struct rcu_head *head) +{ + struct rsc *rsci = container_of(head, struct rsc, rcu_head); + + kfree(rsci->handle.data); + kfree(rsci); +} + static void rsc_put(struct kref *ref) { struct rsc *rsci = container_of(ref, struct rsc, h.ref); - rsc_free(rsci); - kfree(rsci); + if (rsci->mechctx) + gss_delete_sec_context(&rsci->mechctx); + free_svc_cred(&rsci->cred); + call_rcu(&rsci->rcu_head, rsc_free_rcu); } static inline int @@ -542,7 +562,7 @@ static struct rsc *rsc_lookup(struct cache_detail *cd, struct rsc *item) struct cache_head *ch; int hash = rsc_hash(item); - ch = sunrpc_cache_lookup(cd, &item->h, hash); + ch = sunrpc_cache_lookup_rcu(cd, &item->h, hash); if (ch) return container_of(ch, struct rsc, h); else @@ -1764,14 +1784,21 @@ out_err: } static void -svcauth_gss_domain_release(struct auth_domain *dom) +svcauth_gss_domain_release_rcu(struct rcu_head *head) { + struct auth_domain *dom = container_of(head, struct auth_domain, rcu_head); struct gss_domain *gd = container_of(dom, struct gss_domain, h); kfree(dom->name); kfree(gd); } +static void +svcauth_gss_domain_release(struct auth_domain *dom) +{ + call_rcu(&dom->rcu_head, svcauth_gss_domain_release_rcu); +} + static struct auth_ops svcauthops_gss = { .name = "rpcsec_gss", .owner = THIS_MODULE, diff --git a/net/sunrpc/auth_null.c b/net/sunrpc/auth_null.c index 4b48228ee8c7..2694a1bc026b 100644 --- a/net/sunrpc/auth_null.c +++ b/net/sunrpc/auth_null.c @@ -21,7 +21,7 @@ static struct rpc_cred null_cred; static struct rpc_auth * nul_create(const struct rpc_auth_create_args *args, struct rpc_clnt *clnt) { - atomic_inc(&null_auth.au_count); + refcount_inc(&null_auth.au_count); return &null_auth; } @@ -119,7 +119,7 @@ struct rpc_auth null_auth = { .au_flags = RPCAUTH_AUTH_NO_CRKEY_TIMEOUT, .au_ops = &authnull_ops, .au_flavor = RPC_AUTH_NULL, - .au_count = ATOMIC_INIT(0), + .au_count = REFCOUNT_INIT(1), }; static @@ -138,6 +138,6 @@ struct rpc_cred null_cred = { .cr_lru = LIST_HEAD_INIT(null_cred.cr_lru), .cr_auth = &null_auth, .cr_ops = &null_credops, - .cr_count = ATOMIC_INIT(1), + .cr_count = REFCOUNT_INIT(2), .cr_flags = 1UL << RPCAUTH_CRED_UPTODATE, }; diff --git a/net/sunrpc/auth_unix.c b/net/sunrpc/auth_unix.c index 185e56d4f9ae..4c1c7e56288f 100644 --- a/net/sunrpc/auth_unix.c +++ b/net/sunrpc/auth_unix.c @@ -34,7 +34,7 @@ unx_create(const struct rpc_auth_create_args *args, struct rpc_clnt *clnt) { dprintk("RPC: creating UNIX authenticator for client %p\n", clnt); - atomic_inc(&unix_auth.au_count); + refcount_inc(&unix_auth.au_count); return &unix_auth; } @@ -239,7 +239,7 @@ struct rpc_auth unix_auth = { .au_flags = RPCAUTH_AUTH_NO_CRKEY_TIMEOUT, .au_ops = &authunix_ops, .au_flavor = RPC_AUTH_UNIX, - .au_count = ATOMIC_INIT(0), + .au_count = REFCOUNT_INIT(1), }; static diff --git a/net/sunrpc/backchannel_rqst.c b/net/sunrpc/backchannel_rqst.c index 3c15a99b9700..fa5ba6ed3197 100644 --- a/net/sunrpc/backchannel_rqst.c +++ b/net/sunrpc/backchannel_rqst.c @@ -91,7 +91,6 @@ struct rpc_rqst *xprt_alloc_bc_req(struct rpc_xprt *xprt, gfp_t gfp_flags) return NULL; req->rq_xprt = xprt; - INIT_LIST_HEAD(&req->rq_list); INIT_LIST_HEAD(&req->rq_bc_list); /* Preallocate one XDR receive buffer */ diff --git a/net/sunrpc/cache.c b/net/sunrpc/cache.c index 109fbe591e7b..f96345b1180e 100644 --- a/net/sunrpc/cache.c +++ b/net/sunrpc/cache.c @@ -54,28 +54,33 @@ static void cache_init(struct cache_head *h, struct cache_detail *detail) h->last_refresh = now; } -struct cache_head *sunrpc_cache_lookup(struct cache_detail *detail, - struct cache_head *key, int hash) +static struct cache_head *sunrpc_cache_find_rcu(struct cache_detail *detail, + struct cache_head *key, + int hash) { - struct cache_head *new = NULL, *freeme = NULL, *tmp = NULL; - struct hlist_head *head; - - head = &detail->hash_table[hash]; - - read_lock(&detail->hash_lock); + struct hlist_head *head = &detail->hash_table[hash]; + struct cache_head *tmp; - hlist_for_each_entry(tmp, head, cache_list) { + rcu_read_lock(); + hlist_for_each_entry_rcu(tmp, head, cache_list) { if (detail->match(tmp, key)) { if (cache_is_expired(detail, tmp)) - /* This entry is expired, we will discard it. */ - break; - cache_get(tmp); - read_unlock(&detail->hash_lock); + continue; + tmp = cache_get_rcu(tmp); + rcu_read_unlock(); return tmp; } } - read_unlock(&detail->hash_lock); - /* Didn't find anything, insert an empty entry */ + rcu_read_unlock(); + return NULL; +} + +static struct cache_head *sunrpc_cache_add_entry(struct cache_detail *detail, + struct cache_head *key, + int hash) +{ + struct cache_head *new, *tmp, *freeme = NULL; + struct hlist_head *head = &detail->hash_table[hash]; new = detail->alloc(); if (!new) @@ -87,35 +92,46 @@ struct cache_head *sunrpc_cache_lookup(struct cache_detail *detail, cache_init(new, detail); detail->init(new, key); - write_lock(&detail->hash_lock); + spin_lock(&detail->hash_lock); /* check if entry appeared while we slept */ - hlist_for_each_entry(tmp, head, cache_list) { + hlist_for_each_entry_rcu(tmp, head, cache_list) { if (detail->match(tmp, key)) { if (cache_is_expired(detail, tmp)) { - hlist_del_init(&tmp->cache_list); + hlist_del_init_rcu(&tmp->cache_list); detail->entries --; freeme = tmp; break; } cache_get(tmp); - write_unlock(&detail->hash_lock); + spin_unlock(&detail->hash_lock); cache_put(new, detail); return tmp; } } - hlist_add_head(&new->cache_list, head); + hlist_add_head_rcu(&new->cache_list, head); detail->entries++; cache_get(new); - write_unlock(&detail->hash_lock); + spin_unlock(&detail->hash_lock); if (freeme) cache_put(freeme, detail); return new; } -EXPORT_SYMBOL_GPL(sunrpc_cache_lookup); +struct cache_head *sunrpc_cache_lookup_rcu(struct cache_detail *detail, + struct cache_head *key, int hash) +{ + struct cache_head *ret; + + ret = sunrpc_cache_find_rcu(detail, key, hash); + if (ret) + return ret; + /* Didn't find anything, insert an empty entry */ + return sunrpc_cache_add_entry(detail, key, hash); +} +EXPORT_SYMBOL_GPL(sunrpc_cache_lookup_rcu); static void cache_dequeue(struct cache_detail *detail, struct cache_head *ch); @@ -151,18 +167,18 @@ struct cache_head *sunrpc_cache_update(struct cache_detail *detail, struct cache_head *tmp; if (!test_bit(CACHE_VALID, &old->flags)) { - write_lock(&detail->hash_lock); + spin_lock(&detail->hash_lock); if (!test_bit(CACHE_VALID, &old->flags)) { if (test_bit(CACHE_NEGATIVE, &new->flags)) set_bit(CACHE_NEGATIVE, &old->flags); else detail->update(old, new); cache_fresh_locked(old, new->expiry_time, detail); - write_unlock(&detail->hash_lock); + spin_unlock(&detail->hash_lock); cache_fresh_unlocked(old, detail); return old; } - write_unlock(&detail->hash_lock); + spin_unlock(&detail->hash_lock); } /* We need to insert a new entry */ tmp = detail->alloc(); @@ -173,7 +189,7 @@ struct cache_head *sunrpc_cache_update(struct cache_detail *detail, cache_init(tmp, detail); detail->init(tmp, old); - write_lock(&detail->hash_lock); + spin_lock(&detail->hash_lock); if (test_bit(CACHE_NEGATIVE, &new->flags)) set_bit(CACHE_NEGATIVE, &tmp->flags); else @@ -183,7 +199,7 @@ struct cache_head *sunrpc_cache_update(struct cache_detail *detail, cache_get(tmp); cache_fresh_locked(tmp, new->expiry_time, detail); cache_fresh_locked(old, 0, detail); - write_unlock(&detail->hash_lock); + spin_unlock(&detail->hash_lock); cache_fresh_unlocked(tmp, detail); cache_fresh_unlocked(old, detail); cache_put(old, detail); @@ -223,7 +239,7 @@ static int try_to_negate_entry(struct cache_detail *detail, struct cache_head *h { int rv; - write_lock(&detail->hash_lock); + spin_lock(&detail->hash_lock); rv = cache_is_valid(h); if (rv == -EAGAIN) { set_bit(CACHE_NEGATIVE, &h->flags); @@ -231,7 +247,7 @@ static int try_to_negate_entry(struct cache_detail *detail, struct cache_head *h detail); rv = -ENOENT; } - write_unlock(&detail->hash_lock); + spin_unlock(&detail->hash_lock); cache_fresh_unlocked(h, detail); return rv; } @@ -341,7 +357,7 @@ static struct delayed_work cache_cleaner; void sunrpc_init_cache_detail(struct cache_detail *cd) { - rwlock_init(&cd->hash_lock); + spin_lock_init(&cd->hash_lock); INIT_LIST_HEAD(&cd->queue); spin_lock(&cache_list_lock); cd->nextcheck = 0; @@ -361,11 +377,11 @@ void sunrpc_destroy_cache_detail(struct cache_detail *cd) { cache_purge(cd); spin_lock(&cache_list_lock); - write_lock(&cd->hash_lock); + spin_lock(&cd->hash_lock); if (current_detail == cd) current_detail = NULL; list_del_init(&cd->others); - write_unlock(&cd->hash_lock); + spin_unlock(&cd->hash_lock); spin_unlock(&cache_list_lock); if (list_empty(&cache_list)) { /* module must be being unloaded so its safe to kill the worker */ @@ -422,7 +438,7 @@ static int cache_clean(void) struct hlist_head *head; struct hlist_node *tmp; - write_lock(¤t_detail->hash_lock); + spin_lock(¤t_detail->hash_lock); /* Ok, now to clean this strand */ @@ -433,13 +449,13 @@ static int cache_clean(void) if (!cache_is_expired(current_detail, ch)) continue; - hlist_del_init(&ch->cache_list); + hlist_del_init_rcu(&ch->cache_list); current_detail->entries--; rv = 1; break; } - write_unlock(¤t_detail->hash_lock); + spin_unlock(¤t_detail->hash_lock); d = current_detail; if (!ch) current_index ++; @@ -494,9 +510,9 @@ void cache_purge(struct cache_detail *detail) struct hlist_node *tmp = NULL; int i = 0; - write_lock(&detail->hash_lock); + spin_lock(&detail->hash_lock); if (!detail->entries) { - write_unlock(&detail->hash_lock); + spin_unlock(&detail->hash_lock); return; } @@ -504,17 +520,17 @@ void cache_purge(struct cache_detail *detail) for (i = 0; i < detail->hash_size; i++) { head = &detail->hash_table[i]; hlist_for_each_entry_safe(ch, tmp, head, cache_list) { - hlist_del_init(&ch->cache_list); + hlist_del_init_rcu(&ch->cache_list); detail->entries--; set_bit(CACHE_CLEANED, &ch->flags); - write_unlock(&detail->hash_lock); + spin_unlock(&detail->hash_lock); cache_fresh_unlocked(ch, detail); cache_put(ch, detail); - write_lock(&detail->hash_lock); + spin_lock(&detail->hash_lock); } } - write_unlock(&detail->hash_lock); + spin_unlock(&detail->hash_lock); } EXPORT_SYMBOL_GPL(cache_purge); @@ -1289,21 +1305,19 @@ EXPORT_SYMBOL_GPL(qword_get); * get a header, then pass each real item in the cache */ -void *cache_seq_start(struct seq_file *m, loff_t *pos) - __acquires(cd->hash_lock) +static void *__cache_seq_start(struct seq_file *m, loff_t *pos) { loff_t n = *pos; unsigned int hash, entry; struct cache_head *ch; struct cache_detail *cd = m->private; - read_lock(&cd->hash_lock); if (!n--) return SEQ_START_TOKEN; hash = n >> 32; entry = n & ((1LL<<32) - 1); - hlist_for_each_entry(ch, &cd->hash_table[hash], cache_list) + hlist_for_each_entry_rcu(ch, &cd->hash_table[hash], cache_list) if (!entry--) return ch; n &= ~((1LL<<32) - 1); @@ -1315,12 +1329,12 @@ void *cache_seq_start(struct seq_file *m, loff_t *pos) if (hash >= cd->hash_size) return NULL; *pos = n+1; - return hlist_entry_safe(cd->hash_table[hash].first, + return hlist_entry_safe(rcu_dereference_raw( + hlist_first_rcu(&cd->hash_table[hash])), struct cache_head, cache_list); } -EXPORT_SYMBOL_GPL(cache_seq_start); -void *cache_seq_next(struct seq_file *m, void *p, loff_t *pos) +static void *cache_seq_next(struct seq_file *m, void *p, loff_t *pos) { struct cache_head *ch = p; int hash = (*pos >> 32); @@ -1333,7 +1347,8 @@ void *cache_seq_next(struct seq_file *m, void *p, loff_t *pos) *pos += 1LL<<32; } else { ++*pos; - return hlist_entry_safe(ch->cache_list.next, + return hlist_entry_safe(rcu_dereference_raw( + hlist_next_rcu(&ch->cache_list)), struct cache_head, cache_list); } *pos &= ~((1LL<<32) - 1); @@ -1345,18 +1360,32 @@ void *cache_seq_next(struct seq_file *m, void *p, loff_t *pos) if (hash >= cd->hash_size) return NULL; ++*pos; - return hlist_entry_safe(cd->hash_table[hash].first, + return hlist_entry_safe(rcu_dereference_raw( + hlist_first_rcu(&cd->hash_table[hash])), struct cache_head, cache_list); } EXPORT_SYMBOL_GPL(cache_seq_next); -void cache_seq_stop(struct seq_file *m, void *p) - __releases(cd->hash_lock) +void *cache_seq_start_rcu(struct seq_file *m, loff_t *pos) + __acquires(RCU) { - struct cache_detail *cd = m->private; - read_unlock(&cd->hash_lock); + rcu_read_lock(); + return __cache_seq_start(m, pos); +} +EXPORT_SYMBOL_GPL(cache_seq_start_rcu); + +void *cache_seq_next_rcu(struct seq_file *file, void *p, loff_t *pos) +{ + return cache_seq_next(file, p, pos); +} +EXPORT_SYMBOL_GPL(cache_seq_next_rcu); + +void cache_seq_stop_rcu(struct seq_file *m, void *p) + __releases(RCU) +{ + rcu_read_unlock(); } -EXPORT_SYMBOL_GPL(cache_seq_stop); +EXPORT_SYMBOL_GPL(cache_seq_stop_rcu); static int c_show(struct seq_file *m, void *p) { @@ -1384,9 +1413,9 @@ static int c_show(struct seq_file *m, void *p) } static const struct seq_operations cache_content_op = { - .start = cache_seq_start, - .next = cache_seq_next, - .stop = cache_seq_stop, + .start = cache_seq_start_rcu, + .next = cache_seq_next_rcu, + .stop = cache_seq_stop_rcu, .show = c_show, }; @@ -1844,13 +1873,13 @@ EXPORT_SYMBOL_GPL(sunrpc_cache_unregister_pipefs); void sunrpc_cache_unhash(struct cache_detail *cd, struct cache_head *h) { - write_lock(&cd->hash_lock); + spin_lock(&cd->hash_lock); if (!hlist_unhashed(&h->cache_list)){ - hlist_del_init(&h->cache_list); + hlist_del_init_rcu(&h->cache_list); cd->entries--; - write_unlock(&cd->hash_lock); + spin_unlock(&cd->hash_lock); cache_put(h, cd); } else - write_unlock(&cd->hash_lock); + spin_unlock(&cd->hash_lock); } EXPORT_SYMBOL_GPL(sunrpc_cache_unhash); diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index 8ea2f5fadd96..ae3b8145da35 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -61,6 +61,7 @@ static void call_start(struct rpc_task *task); static void call_reserve(struct rpc_task *task); static void call_reserveresult(struct rpc_task *task); static void call_allocate(struct rpc_task *task); +static void call_encode(struct rpc_task *task); static void call_decode(struct rpc_task *task); static void call_bind(struct rpc_task *task); static void call_bind_status(struct rpc_task *task); @@ -1137,10 +1138,10 @@ EXPORT_SYMBOL_GPL(rpc_call_async); struct rpc_task *rpc_run_bc_task(struct rpc_rqst *req) { struct rpc_task *task; - struct xdr_buf *xbufp = &req->rq_snd_buf; struct rpc_task_setup task_setup_data = { .callback_ops = &rpc_default_ops, - .flags = RPC_TASK_SOFTCONN, + .flags = RPC_TASK_SOFTCONN | + RPC_TASK_NO_RETRANS_TIMEOUT, }; dprintk("RPC: rpc_run_bc_task req= %p\n", req); @@ -1148,14 +1149,7 @@ struct rpc_task *rpc_run_bc_task(struct rpc_rqst *req) * Create an rpc_task to send the data */ task = rpc_new_task(&task_setup_data); - task->tk_rqstp = req; - - /* - * Set up the xdr_buf length. - * This also indicates that the buffer is XDR encoded already. - */ - xbufp->len = xbufp->head[0].iov_len + xbufp->page_len + - xbufp->tail[0].iov_len; + xprt_init_bc_request(req, task); task->tk_action = call_bc_transmit; atomic_inc(&task->tk_count); @@ -1558,7 +1552,6 @@ call_reserveresult(struct rpc_task *task) task->tk_status = 0; if (status >= 0) { if (task->tk_rqstp) { - xprt_request_init(task); task->tk_action = call_refresh; return; } @@ -1680,7 +1673,7 @@ call_allocate(struct rpc_task *task) dprint_status(task); task->tk_status = 0; - task->tk_action = call_bind; + task->tk_action = call_encode; if (req->rq_buffer) return; @@ -1721,22 +1714,15 @@ call_allocate(struct rpc_task *task) rpc_exit(task, -ERESTARTSYS); } -static inline int +static int rpc_task_need_encode(struct rpc_task *task) { - return task->tk_rqstp->rq_snd_buf.len == 0; + return test_bit(RPC_TASK_NEED_XMIT, &task->tk_runstate) == 0 && + (!(task->tk_flags & RPC_TASK_SENT) || + !(task->tk_flags & RPC_TASK_NO_RETRANS_TIMEOUT) || + xprt_request_need_retransmit(task)); } -static inline void -rpc_task_force_reencode(struct rpc_task *task) -{ - task->tk_rqstp->rq_snd_buf.len = 0; - task->tk_rqstp->rq_bytes_sent = 0; -} - -/* - * 3. Encode arguments of an RPC call - */ static void rpc_xdr_encode(struct rpc_task *task) { @@ -1752,6 +1738,7 @@ rpc_xdr_encode(struct rpc_task *task) xdr_buf_init(&req->rq_rcv_buf, req->rq_rbuffer, req->rq_rcvsize); + req->rq_bytes_sent = 0; p = rpc_encode_header(task); if (p == NULL) { @@ -1766,6 +1753,36 @@ rpc_xdr_encode(struct rpc_task *task) task->tk_status = rpcauth_wrap_req(task, encode, req, p, task->tk_msg.rpc_argp); + if (task->tk_status == 0) + xprt_request_prepare(req); +} + +/* + * 3. Encode arguments of an RPC call + */ +static void +call_encode(struct rpc_task *task) +{ + if (!rpc_task_need_encode(task)) + goto out; + /* Encode here so that rpcsec_gss can use correct sequence number. */ + rpc_xdr_encode(task); + /* Did the encode result in an error condition? */ + if (task->tk_status != 0) { + /* Was the error nonfatal? */ + if (task->tk_status == -EAGAIN || task->tk_status == -ENOMEM) + rpc_delay(task, HZ >> 4); + else + rpc_exit(task, task->tk_status); + return; + } + + /* Add task to reply queue before transmission to avoid races */ + if (rpc_reply_expected(task)) + xprt_request_enqueue_receive(task); + xprt_request_enqueue_transmit(task); +out: + task->tk_action = call_bind; } /* @@ -1947,43 +1964,16 @@ call_connect_status(struct rpc_task *task) static void call_transmit(struct rpc_task *task) { - int is_retrans = RPC_WAS_SENT(task); - dprint_status(task); - task->tk_action = call_status; - if (task->tk_status < 0) - return; - if (!xprt_prepare_transmit(task)) - return; - task->tk_action = call_transmit_status; - /* Encode here so that rpcsec_gss can use correct sequence number. */ - if (rpc_task_need_encode(task)) { - rpc_xdr_encode(task); - /* Did the encode result in an error condition? */ - if (task->tk_status != 0) { - /* Was the error nonfatal? */ - if (task->tk_status == -EAGAIN) - rpc_delay(task, HZ >> 4); - else - rpc_exit(task, task->tk_status); + task->tk_status = 0; + if (test_bit(RPC_TASK_NEED_XMIT, &task->tk_runstate)) { + if (!xprt_prepare_transmit(task)) return; - } + xprt_transmit(task); } - xprt_transmit(task); - if (task->tk_status < 0) - return; - if (is_retrans) - task->tk_client->cl_stats->rpcretrans++; - /* - * On success, ensure that we call xprt_end_transmit() before sleeping - * in order to allow access to the socket to other RPC requests. - */ - call_transmit_status(task); - if (rpc_reply_expected(task)) - return; - task->tk_action = rpc_exit_task; - rpc_wake_up_queued_task(&task->tk_rqstp->rq_xprt->pending, task); + task->tk_action = call_transmit_status; + xprt_end_transmit(task); } /* @@ -1999,19 +1989,17 @@ call_transmit_status(struct rpc_task *task) * test first. */ if (task->tk_status == 0) { - xprt_end_transmit(task); - rpc_task_force_reencode(task); + xprt_request_wait_receive(task); return; } switch (task->tk_status) { - case -EAGAIN: - case -ENOBUFS: - break; default: dprint_status(task); - xprt_end_transmit(task); - rpc_task_force_reencode(task); + break; + case -EBADMSG: + task->tk_status = 0; + task->tk_action = call_encode; break; /* * Special cases: if we've been waiting on the @@ -2019,6 +2007,14 @@ call_transmit_status(struct rpc_task *task) * socket just returned a connection error, * then hold onto the transport lock. */ + case -ENOBUFS: + rpc_delay(task, HZ>>2); + /* fall through */ + case -EBADSLT: + case -EAGAIN: + task->tk_action = call_transmit; + task->tk_status = 0; + break; case -ECONNREFUSED: case -EHOSTDOWN: case -ENETDOWN: @@ -2026,7 +2022,6 @@ call_transmit_status(struct rpc_task *task) case -ENETUNREACH: case -EPERM: if (RPC_IS_SOFTCONN(task)) { - xprt_end_transmit(task); if (!task->tk_msg.rpc_proc->p_proc) trace_xprt_ping(task->tk_xprt, task->tk_status); @@ -2039,7 +2034,7 @@ call_transmit_status(struct rpc_task *task) case -EADDRINUSE: case -ENOTCONN: case -EPIPE: - rpc_task_force_reencode(task); + break; } } @@ -2053,6 +2048,11 @@ call_bc_transmit(struct rpc_task *task) { struct rpc_rqst *req = task->tk_rqstp; + if (rpc_task_need_encode(task)) + xprt_request_enqueue_transmit(task); + if (!test_bit(RPC_TASK_NEED_XMIT, &task->tk_runstate)) + goto out_wakeup; + if (!xprt_prepare_transmit(task)) goto out_retry; @@ -2061,14 +2061,9 @@ call_bc_transmit(struct rpc_task *task) "error: %d\n", task->tk_status); goto out_done; } - if (req->rq_connect_cookie != req->rq_xprt->connect_cookie) - req->rq_bytes_sent = 0; xprt_transmit(task); - if (task->tk_status == -EAGAIN) - goto out_nospace; - xprt_end_transmit(task); dprint_status(task); switch (task->tk_status) { @@ -2084,6 +2079,8 @@ call_bc_transmit(struct rpc_task *task) case -ENOTCONN: case -EPIPE: break; + case -EAGAIN: + goto out_retry; case -ETIMEDOUT: /* * Problem reaching the server. Disconnect and let the @@ -2107,12 +2104,11 @@ call_bc_transmit(struct rpc_task *task) "error: %d\n", task->tk_status); break; } +out_wakeup: rpc_wake_up_queued_task(&req->rq_xprt->pending, task); out_done: task->tk_action = rpc_exit_task; return; -out_nospace: - req->rq_connect_cookie = req->rq_xprt->connect_cookie; out_retry: task->tk_status = 0; } @@ -2125,15 +2121,11 @@ static void call_status(struct rpc_task *task) { struct rpc_clnt *clnt = task->tk_client; - struct rpc_rqst *req = task->tk_rqstp; int status; if (!task->tk_msg.rpc_proc->p_proc) trace_xprt_ping(task->tk_xprt, task->tk_status); - if (req->rq_reply_bytes_recvd > 0 && !req->rq_bytes_sent) - task->tk_status = req->rq_reply_bytes_recvd; - dprint_status(task); status = task->tk_status; @@ -2173,13 +2165,8 @@ call_status(struct rpc_task *task) /* fall through */ case -EPIPE: case -ENOTCONN: - task->tk_action = call_bind; - break; - case -ENOBUFS: - rpc_delay(task, HZ>>2); - /* fall through */ case -EAGAIN: - task->tk_action = call_transmit; + task->tk_action = call_encode; break; case -EIO: /* shutdown or soft timeout */ @@ -2244,7 +2231,7 @@ call_timeout(struct rpc_task *task) rpcauth_invalcred(task); retry: - task->tk_action = call_bind; + task->tk_action = call_encode; task->tk_status = 0; } @@ -2261,6 +2248,11 @@ call_decode(struct rpc_task *task) dprint_status(task); + if (!decode) { + task->tk_action = rpc_exit_task; + return; + } + if (task->tk_flags & RPC_CALL_MAJORSEEN) { if (clnt->cl_chatty) { printk(KERN_NOTICE "%s: server %s OK\n", @@ -2283,7 +2275,7 @@ call_decode(struct rpc_task *task) if (req->rq_rcv_buf.len < 12) { if (!RPC_IS_SOFT(task)) { - task->tk_action = call_bind; + task->tk_action = call_encode; goto out_retry; } dprintk("RPC: %s: too small RPC reply size (%d bytes)\n", @@ -2298,13 +2290,11 @@ call_decode(struct rpc_task *task) goto out_retry; return; } - task->tk_action = rpc_exit_task; - if (decode) { - task->tk_status = rpcauth_unwrap_resp(task, decode, req, p, - task->tk_msg.rpc_resp); - } + task->tk_status = rpcauth_unwrap_resp(task, decode, req, p, + task->tk_msg.rpc_resp); + dprintk("RPC: %5u call_decode result %d\n", task->tk_pid, task->tk_status); return; @@ -2416,7 +2406,7 @@ rpc_verify_header(struct rpc_task *task) task->tk_garb_retry--; dprintk("RPC: %5u %s: retry garbled creds\n", task->tk_pid, __func__); - task->tk_action = call_bind; + task->tk_action = call_encode; goto out_retry; case RPC_AUTH_TOOWEAK: printk(KERN_NOTICE "RPC: server %s requires stronger " @@ -2485,7 +2475,7 @@ out_garbage: task->tk_garb_retry--; dprintk("RPC: %5u %s: retrying\n", task->tk_pid, __func__); - task->tk_action = call_bind; + task->tk_action = call_encode; out_retry: return ERR_PTR(-EAGAIN); } diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c index 3fe5d60ab0e2..57ca5bead1cb 100644 --- a/net/sunrpc/sched.c +++ b/net/sunrpc/sched.c @@ -99,65 +99,79 @@ __rpc_add_timer(struct rpc_wait_queue *queue, struct rpc_task *task) list_add(&task->u.tk_wait.timer_list, &queue->timer_list.list); } -static void rpc_rotate_queue_owner(struct rpc_wait_queue *queue) -{ - struct list_head *q = &queue->tasks[queue->priority]; - struct rpc_task *task; - - if (!list_empty(q)) { - task = list_first_entry(q, struct rpc_task, u.tk_wait.list); - if (task->tk_owner == queue->owner) - list_move_tail(&task->u.tk_wait.list, q); - } -} - static void rpc_set_waitqueue_priority(struct rpc_wait_queue *queue, int priority) { if (queue->priority != priority) { - /* Fairness: rotate the list when changing priority */ - rpc_rotate_queue_owner(queue); queue->priority = priority; + queue->nr = 1U << priority; } } -static void rpc_set_waitqueue_owner(struct rpc_wait_queue *queue, pid_t pid) -{ - queue->owner = pid; - queue->nr = RPC_BATCH_COUNT; -} - static void rpc_reset_waitqueue_priority(struct rpc_wait_queue *queue) { rpc_set_waitqueue_priority(queue, queue->maxpriority); - rpc_set_waitqueue_owner(queue, 0); } /* - * Add new request to a priority queue. + * Add a request to a queue list */ -static void __rpc_add_wait_queue_priority(struct rpc_wait_queue *queue, - struct rpc_task *task, - unsigned char queue_priority) +static void +__rpc_list_enqueue_task(struct list_head *q, struct rpc_task *task) { - struct list_head *q; struct rpc_task *t; - INIT_LIST_HEAD(&task->u.tk_wait.links); - if (unlikely(queue_priority > queue->maxpriority)) - queue_priority = queue->maxpriority; - if (queue_priority > queue->priority) - rpc_set_waitqueue_priority(queue, queue_priority); - q = &queue->tasks[queue_priority]; list_for_each_entry(t, q, u.tk_wait.list) { if (t->tk_owner == task->tk_owner) { - list_add_tail(&task->u.tk_wait.list, &t->u.tk_wait.links); + list_add_tail(&task->u.tk_wait.links, + &t->u.tk_wait.links); + /* Cache the queue head in task->u.tk_wait.list */ + task->u.tk_wait.list.next = q; + task->u.tk_wait.list.prev = NULL; return; } } + INIT_LIST_HEAD(&task->u.tk_wait.links); list_add_tail(&task->u.tk_wait.list, q); } /* + * Remove request from a queue list + */ +static void +__rpc_list_dequeue_task(struct rpc_task *task) +{ + struct list_head *q; + struct rpc_task *t; + + if (task->u.tk_wait.list.prev == NULL) { + list_del(&task->u.tk_wait.links); + return; + } + if (!list_empty(&task->u.tk_wait.links)) { + t = list_first_entry(&task->u.tk_wait.links, + struct rpc_task, + u.tk_wait.links); + /* Assume __rpc_list_enqueue_task() cached the queue head */ + q = t->u.tk_wait.list.next; + list_add_tail(&t->u.tk_wait.list, q); + list_del(&task->u.tk_wait.links); + } + list_del(&task->u.tk_wait.list); +} + +/* + * Add new request to a priority queue. + */ +static void __rpc_add_wait_queue_priority(struct rpc_wait_queue *queue, + struct rpc_task *task, + unsigned char queue_priority) +{ + if (unlikely(queue_priority > queue->maxpriority)) + queue_priority = queue->maxpriority; + __rpc_list_enqueue_task(&queue->tasks[queue_priority], task); +} + +/* * Add new request to wait queue. * * Swapper tasks always get inserted at the head of the queue. @@ -194,13 +208,7 @@ static void __rpc_add_wait_queue(struct rpc_wait_queue *queue, */ static void __rpc_remove_wait_queue_priority(struct rpc_task *task) { - struct rpc_task *t; - - if (!list_empty(&task->u.tk_wait.links)) { - t = list_entry(task->u.tk_wait.links.next, struct rpc_task, u.tk_wait.list); - list_move(&t->u.tk_wait.list, &task->u.tk_wait.list); - list_splice_init(&task->u.tk_wait.links, &t->u.tk_wait.links); - } + __rpc_list_dequeue_task(task); } /* @@ -212,7 +220,8 @@ static void __rpc_remove_wait_queue(struct rpc_wait_queue *queue, struct rpc_tas __rpc_disable_timer(queue, task); if (RPC_IS_PRIORITY(queue)) __rpc_remove_wait_queue_priority(task); - list_del(&task->u.tk_wait.list); + else + list_del(&task->u.tk_wait.list); queue->qlen--; dprintk("RPC: %5u removed from queue %p \"%s\"\n", task->tk_pid, queue, rpc_qname(queue)); @@ -440,14 +449,28 @@ static void __rpc_do_wake_up_task_on_wq(struct workqueue_struct *wq, /* * Wake up a queued task while the queue lock is being held */ -static void rpc_wake_up_task_on_wq_queue_locked(struct workqueue_struct *wq, - struct rpc_wait_queue *queue, struct rpc_task *task) +static struct rpc_task * +rpc_wake_up_task_on_wq_queue_action_locked(struct workqueue_struct *wq, + struct rpc_wait_queue *queue, struct rpc_task *task, + bool (*action)(struct rpc_task *, void *), void *data) { if (RPC_IS_QUEUED(task)) { smp_rmb(); - if (task->tk_waitqueue == queue) - __rpc_do_wake_up_task_on_wq(wq, queue, task); + if (task->tk_waitqueue == queue) { + if (action == NULL || action(task, data)) { + __rpc_do_wake_up_task_on_wq(wq, queue, task); + return task; + } + } } + return NULL; +} + +static void +rpc_wake_up_task_on_wq_queue_locked(struct workqueue_struct *wq, + struct rpc_wait_queue *queue, struct rpc_task *task) +{ + rpc_wake_up_task_on_wq_queue_action_locked(wq, queue, task, NULL, NULL); } /* @@ -465,6 +488,8 @@ void rpc_wake_up_queued_task_on_wq(struct workqueue_struct *wq, struct rpc_wait_queue *queue, struct rpc_task *task) { + if (!RPC_IS_QUEUED(task)) + return; spin_lock_bh(&queue->lock); rpc_wake_up_task_on_wq_queue_locked(wq, queue, task); spin_unlock_bh(&queue->lock); @@ -475,12 +500,48 @@ void rpc_wake_up_queued_task_on_wq(struct workqueue_struct *wq, */ void rpc_wake_up_queued_task(struct rpc_wait_queue *queue, struct rpc_task *task) { + if (!RPC_IS_QUEUED(task)) + return; spin_lock_bh(&queue->lock); rpc_wake_up_task_queue_locked(queue, task); spin_unlock_bh(&queue->lock); } EXPORT_SYMBOL_GPL(rpc_wake_up_queued_task); +static bool rpc_task_action_set_status(struct rpc_task *task, void *status) +{ + task->tk_status = *(int *)status; + return true; +} + +static void +rpc_wake_up_task_queue_set_status_locked(struct rpc_wait_queue *queue, + struct rpc_task *task, int status) +{ + rpc_wake_up_task_on_wq_queue_action_locked(rpciod_workqueue, queue, + task, rpc_task_action_set_status, &status); +} + +/** + * rpc_wake_up_queued_task_set_status - wake up a task and set task->tk_status + * @queue: pointer to rpc_wait_queue + * @task: pointer to rpc_task + * @status: integer error value + * + * If @task is queued on @queue, then it is woken up, and @task->tk_status is + * set to the value of @status. + */ +void +rpc_wake_up_queued_task_set_status(struct rpc_wait_queue *queue, + struct rpc_task *task, int status) +{ + if (!RPC_IS_QUEUED(task)) + return; + spin_lock_bh(&queue->lock); + rpc_wake_up_task_queue_set_status_locked(queue, task, status); + spin_unlock_bh(&queue->lock); +} + /* * Wake up the next task on a priority queue. */ @@ -493,17 +554,9 @@ static struct rpc_task *__rpc_find_next_queued_priority(struct rpc_wait_queue *q * Service a batch of tasks from a single owner. */ q = &queue->tasks[queue->priority]; - if (!list_empty(q)) { - task = list_entry(q->next, struct rpc_task, u.tk_wait.list); - if (queue->owner == task->tk_owner) { - if (--queue->nr) - goto out; - list_move_tail(&task->u.tk_wait.list, q); - } - /* - * Check if we need to switch queues. - */ - goto new_owner; + if (!list_empty(q) && --queue->nr) { + task = list_first_entry(q, struct rpc_task, u.tk_wait.list); + goto out; } /* @@ -515,7 +568,7 @@ static struct rpc_task *__rpc_find_next_queued_priority(struct rpc_wait_queue *q else q = q - 1; if (!list_empty(q)) { - task = list_entry(q->next, struct rpc_task, u.tk_wait.list); + task = list_first_entry(q, struct rpc_task, u.tk_wait.list); goto new_queue; } } while (q != &queue->tasks[queue->priority]); @@ -525,8 +578,6 @@ static struct rpc_task *__rpc_find_next_queued_priority(struct rpc_wait_queue *q new_queue: rpc_set_waitqueue_priority(queue, (unsigned int)(q - &queue->tasks[0])); -new_owner: - rpc_set_waitqueue_owner(queue, task->tk_owner); out: return task; } @@ -553,12 +604,9 @@ struct rpc_task *rpc_wake_up_first_on_wq(struct workqueue_struct *wq, queue, rpc_qname(queue)); spin_lock_bh(&queue->lock); task = __rpc_find_next_queued(queue); - if (task != NULL) { - if (func(task, data)) - rpc_wake_up_task_on_wq_queue_locked(wq, queue, task); - else - task = NULL; - } + if (task != NULL) + task = rpc_wake_up_task_on_wq_queue_action_locked(wq, queue, + task, func, data); spin_unlock_bh(&queue->lock); return task; diff --git a/net/sunrpc/socklib.c b/net/sunrpc/socklib.c index f217c348b341..9062967575c4 100644 --- a/net/sunrpc/socklib.c +++ b/net/sunrpc/socklib.c @@ -26,7 +26,8 @@ * Possibly called several times to iterate over an sk_buff and copy * data out of it. */ -size_t xdr_skb_read_bits(struct xdr_skb_reader *desc, void *to, size_t len) +static size_t +xdr_skb_read_bits(struct xdr_skb_reader *desc, void *to, size_t len) { if (len > desc->count) len = desc->count; @@ -36,7 +37,6 @@ size_t xdr_skb_read_bits(struct xdr_skb_reader *desc, void *to, size_t len) desc->offset += len; return len; } -EXPORT_SYMBOL_GPL(xdr_skb_read_bits); /** * xdr_skb_read_and_csum_bits - copy and checksum from skb to buffer @@ -69,7 +69,8 @@ static size_t xdr_skb_read_and_csum_bits(struct xdr_skb_reader *desc, void *to, * @copy_actor: virtual method for copying data * */ -ssize_t xdr_partial_copy_from_skb(struct xdr_buf *xdr, unsigned int base, struct xdr_skb_reader *desc, xdr_skb_read_actor copy_actor) +static ssize_t +xdr_partial_copy_from_skb(struct xdr_buf *xdr, unsigned int base, struct xdr_skb_reader *desc, xdr_skb_read_actor copy_actor) { struct page **ppage = xdr->pages; unsigned int len, pglen = xdr->page_len; @@ -104,7 +105,7 @@ ssize_t xdr_partial_copy_from_skb(struct xdr_buf *xdr, unsigned int base, struct /* ACL likes to be lazy in allocating pages - ACLs * are small by default but can get huge. */ - if (unlikely(*ppage == NULL)) { + if ((xdr->flags & XDRBUF_SPARSE_PAGES) && *ppage == NULL) { *ppage = alloc_page(GFP_ATOMIC); if (unlikely(*ppage == NULL)) { if (copied == 0) @@ -140,7 +141,6 @@ copy_tail: out: return copied; } -EXPORT_SYMBOL_GPL(xdr_partial_copy_from_skb); /** * csum_partial_copy_to_xdr - checksum and copy data diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c index 5185efb9027b..51d36230b6e3 100644 --- a/net/sunrpc/svc_xprt.c +++ b/net/sunrpc/svc_xprt.c @@ -171,7 +171,6 @@ void svc_xprt_init(struct net *net, struct svc_xprt_class *xcl, mutex_init(&xprt->xpt_mutex); spin_lock_init(&xprt->xpt_lock); set_bit(XPT_BUSY, &xprt->xpt_flags); - rpc_init_wait_queue(&xprt->xpt_bc_pending, "xpt_bc_pending"); xprt->xpt_net = get_net(net); strcpy(xprt->xpt_remotebuf, "uninitialized"); } @@ -895,7 +894,6 @@ int svc_send(struct svc_rqst *rqstp) else len = xprt->xpt_ops->xpo_sendto(rqstp); mutex_unlock(&xprt->xpt_mutex); - rpc_wake_up(&xprt->xpt_bc_pending); trace_svc_send(rqstp, len); svc_xprt_release(rqstp); @@ -989,7 +987,7 @@ static void call_xpt_users(struct svc_xprt *xprt) spin_lock(&xprt->xpt_lock); while (!list_empty(&xprt->xpt_users)) { u = list_first_entry(&xprt->xpt_users, struct svc_xpt_user, list); - list_del(&u->list); + list_del_init(&u->list); u->callback(u); } spin_unlock(&xprt->xpt_lock); diff --git a/net/sunrpc/svcauth.c b/net/sunrpc/svcauth.c index bb8db3cb8032..775b8c94265b 100644 --- a/net/sunrpc/svcauth.c +++ b/net/sunrpc/svcauth.c @@ -27,12 +27,32 @@ extern struct auth_ops svcauth_null; extern struct auth_ops svcauth_unix; -static DEFINE_SPINLOCK(authtab_lock); -static struct auth_ops *authtab[RPC_AUTH_MAXFLAVOR] = { - [0] = &svcauth_null, - [1] = &svcauth_unix, +static struct auth_ops __rcu *authtab[RPC_AUTH_MAXFLAVOR] = { + [RPC_AUTH_NULL] = (struct auth_ops __force __rcu *)&svcauth_null, + [RPC_AUTH_UNIX] = (struct auth_ops __force __rcu *)&svcauth_unix, }; +static struct auth_ops * +svc_get_auth_ops(rpc_authflavor_t flavor) +{ + struct auth_ops *aops; + + if (flavor >= RPC_AUTH_MAXFLAVOR) + return NULL; + rcu_read_lock(); + aops = rcu_dereference(authtab[flavor]); + if (aops != NULL && !try_module_get(aops->owner)) + aops = NULL; + rcu_read_unlock(); + return aops; +} + +static void +svc_put_auth_ops(struct auth_ops *aops) +{ + module_put(aops->owner); +} + int svc_authenticate(struct svc_rqst *rqstp, __be32 *authp) { @@ -45,14 +65,11 @@ svc_authenticate(struct svc_rqst *rqstp, __be32 *authp) dprintk("svc: svc_authenticate (%d)\n", flavor); - spin_lock(&authtab_lock); - if (flavor >= RPC_AUTH_MAXFLAVOR || !(aops = authtab[flavor]) || - !try_module_get(aops->owner)) { - spin_unlock(&authtab_lock); + aops = svc_get_auth_ops(flavor); + if (aops == NULL) { *authp = rpc_autherr_badcred; return SVC_DENIED; } - spin_unlock(&authtab_lock); rqstp->rq_auth_slack = 0; init_svc_cred(&rqstp->rq_cred); @@ -82,7 +99,7 @@ int svc_authorise(struct svc_rqst *rqstp) if (aops) { rv = aops->release(rqstp); - module_put(aops->owner); + svc_put_auth_ops(aops); } return rv; } @@ -90,13 +107,14 @@ int svc_authorise(struct svc_rqst *rqstp) int svc_auth_register(rpc_authflavor_t flavor, struct auth_ops *aops) { + struct auth_ops *old; int rv = -EINVAL; - spin_lock(&authtab_lock); - if (flavor < RPC_AUTH_MAXFLAVOR && authtab[flavor] == NULL) { - authtab[flavor] = aops; - rv = 0; + + if (flavor < RPC_AUTH_MAXFLAVOR) { + old = cmpxchg((struct auth_ops ** __force)&authtab[flavor], NULL, aops); + if (old == NULL || old == aops) + rv = 0; } - spin_unlock(&authtab_lock); return rv; } EXPORT_SYMBOL_GPL(svc_auth_register); @@ -104,10 +122,8 @@ EXPORT_SYMBOL_GPL(svc_auth_register); void svc_auth_unregister(rpc_authflavor_t flavor) { - spin_lock(&authtab_lock); if (flavor < RPC_AUTH_MAXFLAVOR) - authtab[flavor] = NULL; - spin_unlock(&authtab_lock); + rcu_assign_pointer(authtab[flavor], NULL); } EXPORT_SYMBOL_GPL(svc_auth_unregister); @@ -127,10 +143,11 @@ static struct hlist_head auth_domain_table[DN_HASHMAX]; static DEFINE_SPINLOCK(auth_domain_lock); static void auth_domain_release(struct kref *kref) + __releases(&auth_domain_lock) { struct auth_domain *dom = container_of(kref, struct auth_domain, ref); - hlist_del(&dom->hash); + hlist_del_rcu(&dom->hash); dom->flavour->domain_release(dom); spin_unlock(&auth_domain_lock); } @@ -159,7 +176,7 @@ auth_domain_lookup(char *name, struct auth_domain *new) } } if (new) - hlist_add_head(&new->hash, head); + hlist_add_head_rcu(&new->hash, head); spin_unlock(&auth_domain_lock); return new; } @@ -167,6 +184,21 @@ EXPORT_SYMBOL_GPL(auth_domain_lookup); struct auth_domain *auth_domain_find(char *name) { - return auth_domain_lookup(name, NULL); + struct auth_domain *hp; + struct hlist_head *head; + + head = &auth_domain_table[hash_str(name, DN_HASHBITS)]; + + rcu_read_lock(); + hlist_for_each_entry_rcu(hp, head, hash) { + if (strcmp(hp->name, name)==0) { + if (!kref_get_unless_zero(&hp->ref)) + hp = NULL; + rcu_read_unlock(); + return hp; + } + } + rcu_read_unlock(); + return NULL; } EXPORT_SYMBOL_GPL(auth_domain_find); diff --git a/net/sunrpc/svcauth_unix.c b/net/sunrpc/svcauth_unix.c index af7f28fb8102..fb9041b92f72 100644 --- a/net/sunrpc/svcauth_unix.c +++ b/net/sunrpc/svcauth_unix.c @@ -37,20 +37,26 @@ struct unix_domain { extern struct auth_ops svcauth_null; extern struct auth_ops svcauth_unix; -static void svcauth_unix_domain_release(struct auth_domain *dom) +static void svcauth_unix_domain_release_rcu(struct rcu_head *head) { + struct auth_domain *dom = container_of(head, struct auth_domain, rcu_head); struct unix_domain *ud = container_of(dom, struct unix_domain, h); kfree(dom->name); kfree(ud); } +static void svcauth_unix_domain_release(struct auth_domain *dom) +{ + call_rcu(&dom->rcu_head, svcauth_unix_domain_release_rcu); +} + struct auth_domain *unix_domain_find(char *name) { struct auth_domain *rv; struct unix_domain *new = NULL; - rv = auth_domain_lookup(name, NULL); + rv = auth_domain_find(name); while(1) { if (rv) { if (new && rv != &new->h) @@ -91,6 +97,7 @@ struct ip_map { char m_class[8]; /* e.g. "nfsd" */ struct in6_addr m_addr; struct unix_domain *m_client; + struct rcu_head m_rcu; }; static void ip_map_put(struct kref *kref) @@ -101,7 +108,7 @@ static void ip_map_put(struct kref *kref) if (test_bit(CACHE_VALID, &item->flags) && !test_bit(CACHE_NEGATIVE, &item->flags)) auth_domain_put(&im->m_client->h); - kfree(im); + kfree_rcu(im, m_rcu); } static inline int hash_ip6(const struct in6_addr *ip) @@ -280,9 +287,9 @@ static struct ip_map *__ip_map_lookup(struct cache_detail *cd, char *class, strcpy(ip.m_class, class); ip.m_addr = *addr; - ch = sunrpc_cache_lookup(cd, &ip.h, - hash_str(class, IP_HASHBITS) ^ - hash_ip6(addr)); + ch = sunrpc_cache_lookup_rcu(cd, &ip.h, + hash_str(class, IP_HASHBITS) ^ + hash_ip6(addr)); if (ch) return container_of(ch, struct ip_map, h); @@ -412,6 +419,7 @@ struct unix_gid { struct cache_head h; kuid_t uid; struct group_info *gi; + struct rcu_head rcu; }; static int unix_gid_hash(kuid_t uid) @@ -426,7 +434,7 @@ static void unix_gid_put(struct kref *kref) if (test_bit(CACHE_VALID, &item->flags) && !test_bit(CACHE_NEGATIVE, &item->flags)) put_group_info(ug->gi); - kfree(ug); + kfree_rcu(ug, rcu); } static int unix_gid_match(struct cache_head *corig, struct cache_head *cnew) @@ -619,7 +627,7 @@ static struct unix_gid *unix_gid_lookup(struct cache_detail *cd, kuid_t uid) struct cache_head *ch; ug.uid = uid; - ch = sunrpc_cache_lookup(cd, &ug.h, unix_gid_hash(uid)); + ch = sunrpc_cache_lookup_rcu(cd, &ug.h, unix_gid_hash(uid)); if (ch) return container_of(ch, struct unix_gid, h); else diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index 5445145e639c..3b525accaa68 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -325,59 +325,34 @@ static int svc_one_sock_name(struct svc_sock *svsk, char *buf, int remaining) /* * Generic recvfrom routine. */ -static int svc_recvfrom(struct svc_rqst *rqstp, struct kvec *iov, int nr, - int buflen) +static ssize_t svc_recvfrom(struct svc_rqst *rqstp, struct kvec *iov, + unsigned int nr, size_t buflen, unsigned int base) { struct svc_sock *svsk = container_of(rqstp->rq_xprt, struct svc_sock, sk_xprt); - struct msghdr msg = { - .msg_flags = MSG_DONTWAIT, - }; - int len; + struct msghdr msg = { NULL }; + ssize_t len; rqstp->rq_xprt_hlen = 0; clear_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags); iov_iter_kvec(&msg.msg_iter, READ | ITER_KVEC, iov, nr, buflen); - len = sock_recvmsg(svsk->sk_sock, &msg, msg.msg_flags); + if (base != 0) { + iov_iter_advance(&msg.msg_iter, base); + buflen -= base; + } + len = sock_recvmsg(svsk->sk_sock, &msg, MSG_DONTWAIT); /* If we read a full record, then assume there may be more * data to read (stream based sockets only!) */ if (len == buflen) set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags); - dprintk("svc: socket %p recvfrom(%p, %zu) = %d\n", + dprintk("svc: socket %p recvfrom(%p, %zu) = %zd\n", svsk, iov[0].iov_base, iov[0].iov_len, len); return len; } -static int svc_partial_recvfrom(struct svc_rqst *rqstp, - struct kvec *iov, int nr, - int buflen, unsigned int base) -{ - size_t save_iovlen; - void *save_iovbase; - unsigned int i; - int ret; - - if (base == 0) - return svc_recvfrom(rqstp, iov, nr, buflen); - - for (i = 0; i < nr; i++) { - if (iov[i].iov_len > base) - break; - base -= iov[i].iov_len; - } - save_iovlen = iov[i].iov_len; - save_iovbase = iov[i].iov_base; - iov[i].iov_len -= base; - iov[i].iov_base += base; - ret = svc_recvfrom(rqstp, &iov[i], nr - i, buflen); - iov[i].iov_len = save_iovlen; - iov[i].iov_base = save_iovbase; - return ret; -} - /* * Set socket snd and rcv buffer lengths */ @@ -962,7 +937,8 @@ static int svc_tcp_recv_record(struct svc_sock *svsk, struct svc_rqst *rqstp) want = sizeof(rpc_fraghdr) - svsk->sk_tcplen; iov.iov_base = ((char *) &svsk->sk_reclen) + svsk->sk_tcplen; iov.iov_len = want; - if ((len = svc_recvfrom(rqstp, &iov, 1, want)) < 0) + len = svc_recvfrom(rqstp, &iov, 1, want, 0); + if (len < 0) goto error; svsk->sk_tcplen += len; @@ -1004,7 +980,7 @@ static int receive_cb_reply(struct svc_sock *svsk, struct svc_rqst *rqstp) if (!bc_xprt) return -EAGAIN; - spin_lock(&bc_xprt->recv_lock); + spin_lock(&bc_xprt->queue_lock); req = xprt_lookup_rqst(bc_xprt, xid); if (!req) goto unlock_notfound; @@ -1022,7 +998,7 @@ static int receive_cb_reply(struct svc_sock *svsk, struct svc_rqst *rqstp) memcpy(dst->iov_base, src->iov_base, src->iov_len); xprt_complete_rqst(req->rq_task, rqstp->rq_arg.len); rqstp->rq_arg.len = 0; - spin_unlock(&bc_xprt->recv_lock); + spin_unlock(&bc_xprt->queue_lock); return 0; unlock_notfound: printk(KERN_NOTICE @@ -1031,7 +1007,7 @@ unlock_notfound: __func__, ntohl(calldir), bc_xprt, ntohl(xid)); unlock_eagain: - spin_unlock(&bc_xprt->recv_lock); + spin_unlock(&bc_xprt->queue_lock); return -EAGAIN; } @@ -1088,14 +1064,13 @@ static int svc_tcp_recvfrom(struct svc_rqst *rqstp) vec = rqstp->rq_vec; - pnum = copy_pages_to_kvecs(&vec[0], &rqstp->rq_pages[0], - svsk->sk_datalen + want); + pnum = copy_pages_to_kvecs(&vec[0], &rqstp->rq_pages[0], base + want); rqstp->rq_respages = &rqstp->rq_pages[pnum]; rqstp->rq_next_page = rqstp->rq_respages + 1; /* Now receive data */ - len = svc_partial_recvfrom(rqstp, vec, pnum, want, base); + len = svc_recvfrom(rqstp, vec, pnum, base + want, base); if (len >= 0) { svsk->sk_tcplen += len; svsk->sk_datalen += len; diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c index 30afbd236656..2bbb8d38d2bf 100644 --- a/net/sunrpc/xdr.c +++ b/net/sunrpc/xdr.c @@ -15,6 +15,7 @@ #include <linux/errno.h> #include <linux/sunrpc/xdr.h> #include <linux/sunrpc/msg_prot.h> +#include <linux/bvec.h> /* * XDR functions for basic NFS types @@ -128,6 +129,39 @@ xdr_terminate_string(struct xdr_buf *buf, const u32 len) } EXPORT_SYMBOL_GPL(xdr_terminate_string); +size_t +xdr_buf_pagecount(struct xdr_buf *buf) +{ + if (!buf->page_len) + return 0; + return (buf->page_base + buf->page_len + PAGE_SIZE - 1) >> PAGE_SHIFT; +} + +int +xdr_alloc_bvec(struct xdr_buf *buf, gfp_t gfp) +{ + size_t i, n = xdr_buf_pagecount(buf); + + if (n != 0 && buf->bvec == NULL) { + buf->bvec = kmalloc_array(n, sizeof(buf->bvec[0]), gfp); + if (!buf->bvec) + return -ENOMEM; + for (i = 0; i < n; i++) { + buf->bvec[i].bv_page = buf->pages[i]; + buf->bvec[i].bv_len = PAGE_SIZE; + buf->bvec[i].bv_offset = 0; + } + } + return 0; +} + +void +xdr_free_bvec(struct xdr_buf *buf) +{ + kfree(buf->bvec); + buf->bvec = NULL; +} + void xdr_inline_pages(struct xdr_buf *xdr, unsigned int offset, struct page **pages, unsigned int base, unsigned int len) diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index a8db2e3f8904..86bea4520c4d 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -68,8 +68,6 @@ static void xprt_init(struct rpc_xprt *xprt, struct net *net); static __be32 xprt_alloc_xid(struct rpc_xprt *xprt); static void xprt_connect_status(struct rpc_task *task); -static int __xprt_get_cong(struct rpc_xprt *, struct rpc_task *); -static void __xprt_put_cong(struct rpc_xprt *, struct rpc_rqst *); static void xprt_destroy(struct rpc_xprt *xprt); static DEFINE_SPINLOCK(xprt_list_lock); @@ -171,6 +169,17 @@ out: } EXPORT_SYMBOL_GPL(xprt_load_transport); +static void xprt_clear_locked(struct rpc_xprt *xprt) +{ + xprt->snd_task = NULL; + if (!test_bit(XPRT_CLOSE_WAIT, &xprt->state)) { + smp_mb__before_atomic(); + clear_bit(XPRT_LOCKED, &xprt->state); + smp_mb__after_atomic(); + } else + queue_work(xprtiod_workqueue, &xprt->task_cleanup); +} + /** * xprt_reserve_xprt - serialize write access to transports * @task: task that is requesting access to the transport @@ -183,44 +192,53 @@ EXPORT_SYMBOL_GPL(xprt_load_transport); int xprt_reserve_xprt(struct rpc_xprt *xprt, struct rpc_task *task) { struct rpc_rqst *req = task->tk_rqstp; - int priority; if (test_and_set_bit(XPRT_LOCKED, &xprt->state)) { if (task == xprt->snd_task) return 1; goto out_sleep; } + if (test_bit(XPRT_WRITE_SPACE, &xprt->state)) + goto out_unlock; xprt->snd_task = task; - if (req != NULL) - req->rq_ntrans++; return 1; +out_unlock: + xprt_clear_locked(xprt); out_sleep: dprintk("RPC: %5u failed to lock transport %p\n", task->tk_pid, xprt); - task->tk_timeout = 0; + task->tk_timeout = RPC_IS_SOFT(task) ? req->rq_timeout : 0; task->tk_status = -EAGAIN; - if (req == NULL) - priority = RPC_PRIORITY_LOW; - else if (!req->rq_ntrans) - priority = RPC_PRIORITY_NORMAL; - else - priority = RPC_PRIORITY_HIGH; - rpc_sleep_on_priority(&xprt->sending, task, NULL, priority); + rpc_sleep_on(&xprt->sending, task, NULL); return 0; } EXPORT_SYMBOL_GPL(xprt_reserve_xprt); -static void xprt_clear_locked(struct rpc_xprt *xprt) +static bool +xprt_need_congestion_window_wait(struct rpc_xprt *xprt) { - xprt->snd_task = NULL; - if (!test_bit(XPRT_CLOSE_WAIT, &xprt->state)) { - smp_mb__before_atomic(); - clear_bit(XPRT_LOCKED, &xprt->state); - smp_mb__after_atomic(); - } else - queue_work(xprtiod_workqueue, &xprt->task_cleanup); + return test_bit(XPRT_CWND_WAIT, &xprt->state); +} + +static void +xprt_set_congestion_window_wait(struct rpc_xprt *xprt) +{ + if (!list_empty(&xprt->xmit_queue)) { + /* Peek at head of queue to see if it can make progress */ + if (list_first_entry(&xprt->xmit_queue, struct rpc_rqst, + rq_xmit)->rq_cong) + return; + } + set_bit(XPRT_CWND_WAIT, &xprt->state); +} + +static void +xprt_test_and_clear_congestion_window_wait(struct rpc_xprt *xprt) +{ + if (!RPCXPRT_CONGESTED(xprt)) + clear_bit(XPRT_CWND_WAIT, &xprt->state); } /* @@ -230,11 +248,11 @@ static void xprt_clear_locked(struct rpc_xprt *xprt) * Same as xprt_reserve_xprt, but Van Jacobson congestion control is * integrated into the decision of whether a request is allowed to be * woken up and given access to the transport. + * Note that the lock is only granted if we know there are free slots. */ int xprt_reserve_xprt_cong(struct rpc_xprt *xprt, struct rpc_task *task) { struct rpc_rqst *req = task->tk_rqstp; - int priority; if (test_and_set_bit(XPRT_LOCKED, &xprt->state)) { if (task == xprt->snd_task) @@ -245,25 +263,19 @@ int xprt_reserve_xprt_cong(struct rpc_xprt *xprt, struct rpc_task *task) xprt->snd_task = task; return 1; } - if (__xprt_get_cong(xprt, task)) { + if (test_bit(XPRT_WRITE_SPACE, &xprt->state)) + goto out_unlock; + if (!xprt_need_congestion_window_wait(xprt)) { xprt->snd_task = task; - req->rq_ntrans++; return 1; } +out_unlock: xprt_clear_locked(xprt); out_sleep: - if (req) - __xprt_put_cong(xprt, req); dprintk("RPC: %5u failed to lock transport %p\n", task->tk_pid, xprt); - task->tk_timeout = 0; + task->tk_timeout = RPC_IS_SOFT(task) ? req->rq_timeout : 0; task->tk_status = -EAGAIN; - if (req == NULL) - priority = RPC_PRIORITY_LOW; - else if (!req->rq_ntrans) - priority = RPC_PRIORITY_NORMAL; - else - priority = RPC_PRIORITY_HIGH; - rpc_sleep_on_priority(&xprt->sending, task, NULL, priority); + rpc_sleep_on(&xprt->sending, task, NULL); return 0; } EXPORT_SYMBOL_GPL(xprt_reserve_xprt_cong); @@ -272,6 +284,8 @@ static inline int xprt_lock_write(struct rpc_xprt *xprt, struct rpc_task *task) { int retval; + if (test_bit(XPRT_LOCKED, &xprt->state) && xprt->snd_task == task) + return 1; spin_lock_bh(&xprt->transport_lock); retval = xprt->ops->reserve_xprt(xprt, task); spin_unlock_bh(&xprt->transport_lock); @@ -281,12 +295,8 @@ static inline int xprt_lock_write(struct rpc_xprt *xprt, struct rpc_task *task) static bool __xprt_lock_write_func(struct rpc_task *task, void *data) { struct rpc_xprt *xprt = data; - struct rpc_rqst *req; - req = task->tk_rqstp; xprt->snd_task = task; - if (req) - req->rq_ntrans++; return true; } @@ -294,53 +304,30 @@ static void __xprt_lock_write_next(struct rpc_xprt *xprt) { if (test_and_set_bit(XPRT_LOCKED, &xprt->state)) return; - + if (test_bit(XPRT_WRITE_SPACE, &xprt->state)) + goto out_unlock; if (rpc_wake_up_first_on_wq(xprtiod_workqueue, &xprt->sending, __xprt_lock_write_func, xprt)) return; +out_unlock: xprt_clear_locked(xprt); } -static bool __xprt_lock_write_cong_func(struct rpc_task *task, void *data) -{ - struct rpc_xprt *xprt = data; - struct rpc_rqst *req; - - req = task->tk_rqstp; - if (req == NULL) { - xprt->snd_task = task; - return true; - } - if (__xprt_get_cong(xprt, task)) { - xprt->snd_task = task; - req->rq_ntrans++; - return true; - } - return false; -} - static void __xprt_lock_write_next_cong(struct rpc_xprt *xprt) { if (test_and_set_bit(XPRT_LOCKED, &xprt->state)) return; - if (RPCXPRT_CONGESTED(xprt)) + if (test_bit(XPRT_WRITE_SPACE, &xprt->state)) + goto out_unlock; + if (xprt_need_congestion_window_wait(xprt)) goto out_unlock; if (rpc_wake_up_first_on_wq(xprtiod_workqueue, &xprt->sending, - __xprt_lock_write_cong_func, xprt)) + __xprt_lock_write_func, xprt)) return; out_unlock: xprt_clear_locked(xprt); } -static void xprt_task_clear_bytes_sent(struct rpc_task *task) -{ - if (task != NULL) { - struct rpc_rqst *req = task->tk_rqstp; - if (req != NULL) - req->rq_bytes_sent = 0; - } -} - /** * xprt_release_xprt - allow other requests to use a transport * @xprt: transport with other tasks potentially waiting @@ -351,7 +338,6 @@ static void xprt_task_clear_bytes_sent(struct rpc_task *task) void xprt_release_xprt(struct rpc_xprt *xprt, struct rpc_task *task) { if (xprt->snd_task == task) { - xprt_task_clear_bytes_sent(task); xprt_clear_locked(xprt); __xprt_lock_write_next(xprt); } @@ -369,7 +355,6 @@ EXPORT_SYMBOL_GPL(xprt_release_xprt); void xprt_release_xprt_cong(struct rpc_xprt *xprt, struct rpc_task *task) { if (xprt->snd_task == task) { - xprt_task_clear_bytes_sent(task); xprt_clear_locked(xprt); __xprt_lock_write_next_cong(xprt); } @@ -378,6 +363,8 @@ EXPORT_SYMBOL_GPL(xprt_release_xprt_cong); static inline void xprt_release_write(struct rpc_xprt *xprt, struct rpc_task *task) { + if (xprt->snd_task != task) + return; spin_lock_bh(&xprt->transport_lock); xprt->ops->release_xprt(xprt, task); spin_unlock_bh(&xprt->transport_lock); @@ -388,16 +375,16 @@ static inline void xprt_release_write(struct rpc_xprt *xprt, struct rpc_task *ta * overflowed. Put the task to sleep if this is the case. */ static int -__xprt_get_cong(struct rpc_xprt *xprt, struct rpc_task *task) +__xprt_get_cong(struct rpc_xprt *xprt, struct rpc_rqst *req) { - struct rpc_rqst *req = task->tk_rqstp; - if (req->rq_cong) return 1; dprintk("RPC: %5u xprt_cwnd_limited cong = %lu cwnd = %lu\n", - task->tk_pid, xprt->cong, xprt->cwnd); - if (RPCXPRT_CONGESTED(xprt)) + req->rq_task->tk_pid, xprt->cong, xprt->cwnd); + if (RPCXPRT_CONGESTED(xprt)) { + xprt_set_congestion_window_wait(xprt); return 0; + } req->rq_cong = 1; xprt->cong += RPC_CWNDSCALE; return 1; @@ -414,10 +401,32 @@ __xprt_put_cong(struct rpc_xprt *xprt, struct rpc_rqst *req) return; req->rq_cong = 0; xprt->cong -= RPC_CWNDSCALE; + xprt_test_and_clear_congestion_window_wait(xprt); __xprt_lock_write_next_cong(xprt); } /** + * xprt_request_get_cong - Request congestion control credits + * @xprt: pointer to transport + * @req: pointer to RPC request + * + * Useful for transports that require congestion control. + */ +bool +xprt_request_get_cong(struct rpc_xprt *xprt, struct rpc_rqst *req) +{ + bool ret = false; + + if (req->rq_cong) + return true; + spin_lock_bh(&xprt->transport_lock); + ret = __xprt_get_cong(xprt, req) != 0; + spin_unlock_bh(&xprt->transport_lock); + return ret; +} +EXPORT_SYMBOL_GPL(xprt_request_get_cong); + +/** * xprt_release_rqst_cong - housekeeping when request is complete * @task: RPC request that recently completed * @@ -431,6 +440,20 @@ void xprt_release_rqst_cong(struct rpc_task *task) } EXPORT_SYMBOL_GPL(xprt_release_rqst_cong); +/* + * Clear the congestion window wait flag and wake up the next + * entry on xprt->sending + */ +static void +xprt_clear_congestion_window_wait(struct rpc_xprt *xprt) +{ + if (test_and_clear_bit(XPRT_CWND_WAIT, &xprt->state)) { + spin_lock_bh(&xprt->transport_lock); + __xprt_lock_write_next_cong(xprt); + spin_unlock_bh(&xprt->transport_lock); + } +} + /** * xprt_adjust_cwnd - adjust transport congestion window * @xprt: pointer to xprt @@ -488,39 +511,46 @@ EXPORT_SYMBOL_GPL(xprt_wake_pending_tasks); /** * xprt_wait_for_buffer_space - wait for transport output buffer to clear - * @task: task to be put to sleep - * @action: function pointer to be executed after wait + * @xprt: transport * * Note that we only set the timer for the case of RPC_IS_SOFT(), since * we don't in general want to force a socket disconnection due to * an incomplete RPC call transmission. */ -void xprt_wait_for_buffer_space(struct rpc_task *task, rpc_action action) +void xprt_wait_for_buffer_space(struct rpc_xprt *xprt) { - struct rpc_rqst *req = task->tk_rqstp; - struct rpc_xprt *xprt = req->rq_xprt; - - task->tk_timeout = RPC_IS_SOFT(task) ? req->rq_timeout : 0; - rpc_sleep_on(&xprt->pending, task, action); + set_bit(XPRT_WRITE_SPACE, &xprt->state); } EXPORT_SYMBOL_GPL(xprt_wait_for_buffer_space); +static bool +xprt_clear_write_space_locked(struct rpc_xprt *xprt) +{ + if (test_and_clear_bit(XPRT_WRITE_SPACE, &xprt->state)) { + __xprt_lock_write_next(xprt); + dprintk("RPC: write space: waking waiting task on " + "xprt %p\n", xprt); + return true; + } + return false; +} + /** * xprt_write_space - wake the task waiting for transport output buffer space * @xprt: transport with waiting tasks * * Can be called in a soft IRQ context, so xprt_write_space never sleeps. */ -void xprt_write_space(struct rpc_xprt *xprt) +bool xprt_write_space(struct rpc_xprt *xprt) { + bool ret; + + if (!test_bit(XPRT_WRITE_SPACE, &xprt->state)) + return false; spin_lock_bh(&xprt->transport_lock); - if (xprt->snd_task) { - dprintk("RPC: write space: waking waiting task on " - "xprt %p\n", xprt); - rpc_wake_up_queued_task_on_wq(xprtiod_workqueue, - &xprt->pending, xprt->snd_task); - } + ret = xprt_clear_write_space_locked(xprt); spin_unlock_bh(&xprt->transport_lock); + return ret; } EXPORT_SYMBOL_GPL(xprt_write_space); @@ -631,6 +661,7 @@ void xprt_disconnect_done(struct rpc_xprt *xprt) dprintk("RPC: disconnected transport %p\n", xprt); spin_lock_bh(&xprt->transport_lock); xprt_clear_connected(xprt); + xprt_clear_write_space_locked(xprt); xprt_wake_pending_tasks(xprt, -EAGAIN); spin_unlock_bh(&xprt->transport_lock); } @@ -654,6 +685,22 @@ void xprt_force_disconnect(struct rpc_xprt *xprt) } EXPORT_SYMBOL_GPL(xprt_force_disconnect); +static unsigned int +xprt_connect_cookie(struct rpc_xprt *xprt) +{ + return READ_ONCE(xprt->connect_cookie); +} + +static bool +xprt_request_retransmit_after_disconnect(struct rpc_task *task) +{ + struct rpc_rqst *req = task->tk_rqstp; + struct rpc_xprt *xprt = req->rq_xprt; + + return req->rq_connect_cookie != xprt_connect_cookie(xprt) || + !xprt_connected(xprt); +} + /** * xprt_conditional_disconnect - force a transport to disconnect * @xprt: transport to disconnect @@ -692,7 +739,7 @@ static void xprt_schedule_autodisconnect(struct rpc_xprt *xprt) __must_hold(&xprt->transport_lock) { - if (list_empty(&xprt->recv) && xprt_has_timer(xprt)) + if (RB_EMPTY_ROOT(&xprt->recv_queue) && xprt_has_timer(xprt)) mod_timer(&xprt->timer, xprt->last_used + xprt->idle_timeout); } @@ -702,7 +749,7 @@ xprt_init_autodisconnect(struct timer_list *t) struct rpc_xprt *xprt = from_timer(xprt, t, timer); spin_lock(&xprt->transport_lock); - if (!list_empty(&xprt->recv)) + if (!RB_EMPTY_ROOT(&xprt->recv_queue)) goto out_abort; /* Reset xprt->last_used to avoid connect/autodisconnect cycling */ xprt->last_used = jiffies; @@ -726,7 +773,6 @@ bool xprt_lock_connect(struct rpc_xprt *xprt, goto out; if (xprt->snd_task != task) goto out; - xprt_task_clear_bytes_sent(task); xprt->snd_task = cookie; ret = true; out: @@ -772,7 +818,6 @@ void xprt_connect(struct rpc_task *task) xprt->ops->close(xprt); if (!xprt_connected(xprt)) { - task->tk_rqstp->rq_bytes_sent = 0; task->tk_timeout = task->tk_rqstp->rq_timeout; task->tk_rqstp->rq_connect_cookie = xprt->connect_cookie; rpc_sleep_on(&xprt->pending, task, xprt_connect_status); @@ -789,17 +834,11 @@ void xprt_connect(struct rpc_task *task) static void xprt_connect_status(struct rpc_task *task) { - struct rpc_xprt *xprt = task->tk_rqstp->rq_xprt; - - if (task->tk_status == 0) { - xprt->stat.connect_count++; - xprt->stat.connect_time += (long)jiffies - xprt->stat.connect_start; + switch (task->tk_status) { + case 0: dprintk("RPC: %5u xprt_connect_status: connection established\n", task->tk_pid); - return; - } - - switch (task->tk_status) { + break; case -ECONNREFUSED: case -ECONNRESET: case -ECONNABORTED: @@ -816,28 +855,97 @@ static void xprt_connect_status(struct rpc_task *task) default: dprintk("RPC: %5u xprt_connect_status: error %d connecting to " "server %s\n", task->tk_pid, -task->tk_status, - xprt->servername); + task->tk_rqstp->rq_xprt->servername); task->tk_status = -EIO; } } +enum xprt_xid_rb_cmp { + XID_RB_EQUAL, + XID_RB_LEFT, + XID_RB_RIGHT, +}; +static enum xprt_xid_rb_cmp +xprt_xid_cmp(__be32 xid1, __be32 xid2) +{ + if (xid1 == xid2) + return XID_RB_EQUAL; + if ((__force u32)xid1 < (__force u32)xid2) + return XID_RB_LEFT; + return XID_RB_RIGHT; +} + +static struct rpc_rqst * +xprt_request_rb_find(struct rpc_xprt *xprt, __be32 xid) +{ + struct rb_node *n = xprt->recv_queue.rb_node; + struct rpc_rqst *req; + + while (n != NULL) { + req = rb_entry(n, struct rpc_rqst, rq_recv); + switch (xprt_xid_cmp(xid, req->rq_xid)) { + case XID_RB_LEFT: + n = n->rb_left; + break; + case XID_RB_RIGHT: + n = n->rb_right; + break; + case XID_RB_EQUAL: + return req; + } + } + return NULL; +} + +static void +xprt_request_rb_insert(struct rpc_xprt *xprt, struct rpc_rqst *new) +{ + struct rb_node **p = &xprt->recv_queue.rb_node; + struct rb_node *n = NULL; + struct rpc_rqst *req; + + while (*p != NULL) { + n = *p; + req = rb_entry(n, struct rpc_rqst, rq_recv); + switch(xprt_xid_cmp(new->rq_xid, req->rq_xid)) { + case XID_RB_LEFT: + p = &n->rb_left; + break; + case XID_RB_RIGHT: + p = &n->rb_right; + break; + case XID_RB_EQUAL: + WARN_ON_ONCE(new != req); + return; + } + } + rb_link_node(&new->rq_recv, n, p); + rb_insert_color(&new->rq_recv, &xprt->recv_queue); +} + +static void +xprt_request_rb_remove(struct rpc_xprt *xprt, struct rpc_rqst *req) +{ + rb_erase(&req->rq_recv, &xprt->recv_queue); +} + /** * xprt_lookup_rqst - find an RPC request corresponding to an XID * @xprt: transport on which the original request was transmitted * @xid: RPC XID of incoming reply * - * Caller holds xprt->recv_lock. + * Caller holds xprt->queue_lock. */ struct rpc_rqst *xprt_lookup_rqst(struct rpc_xprt *xprt, __be32 xid) { struct rpc_rqst *entry; - list_for_each_entry(entry, &xprt->recv, rq_list) - if (entry->rq_xid == xid) { - trace_xprt_lookup_rqst(xprt, xid, 0); - entry->rq_rtt = ktime_sub(ktime_get(), entry->rq_xtime); - return entry; - } + entry = xprt_request_rb_find(xprt, xid); + if (entry != NULL) { + trace_xprt_lookup_rqst(xprt, xid, 0); + entry->rq_rtt = ktime_sub(ktime_get(), entry->rq_xtime); + return entry; + } dprintk("RPC: xprt_lookup_rqst did not find xid %08x\n", ntohl(xid)); @@ -847,16 +955,22 @@ struct rpc_rqst *xprt_lookup_rqst(struct rpc_xprt *xprt, __be32 xid) } EXPORT_SYMBOL_GPL(xprt_lookup_rqst); +static bool +xprt_is_pinned_rqst(struct rpc_rqst *req) +{ + return atomic_read(&req->rq_pin) != 0; +} + /** * xprt_pin_rqst - Pin a request on the transport receive list * @req: Request to pin * * Caller must ensure this is atomic with the call to xprt_lookup_rqst() - * so should be holding the xprt transport lock. + * so should be holding the xprt receive lock. */ void xprt_pin_rqst(struct rpc_rqst *req) { - set_bit(RPC_TASK_MSG_RECV, &req->rq_task->tk_runstate); + atomic_inc(&req->rq_pin); } EXPORT_SYMBOL_GPL(xprt_pin_rqst); @@ -864,38 +978,87 @@ EXPORT_SYMBOL_GPL(xprt_pin_rqst); * xprt_unpin_rqst - Unpin a request on the transport receive list * @req: Request to pin * - * Caller should be holding the xprt transport lock. + * Caller should be holding the xprt receive lock. */ void xprt_unpin_rqst(struct rpc_rqst *req) { - struct rpc_task *task = req->rq_task; - - clear_bit(RPC_TASK_MSG_RECV, &task->tk_runstate); - if (test_bit(RPC_TASK_MSG_RECV_WAIT, &task->tk_runstate)) - wake_up_bit(&task->tk_runstate, RPC_TASK_MSG_RECV); + if (!test_bit(RPC_TASK_MSG_PIN_WAIT, &req->rq_task->tk_runstate)) { + atomic_dec(&req->rq_pin); + return; + } + if (atomic_dec_and_test(&req->rq_pin)) + wake_up_var(&req->rq_pin); } EXPORT_SYMBOL_GPL(xprt_unpin_rqst); static void xprt_wait_on_pinned_rqst(struct rpc_rqst *req) -__must_hold(&req->rq_xprt->recv_lock) { - struct rpc_task *task = req->rq_task; + wait_var_event(&req->rq_pin, !xprt_is_pinned_rqst(req)); +} - if (task && test_bit(RPC_TASK_MSG_RECV, &task->tk_runstate)) { - spin_unlock(&req->rq_xprt->recv_lock); - set_bit(RPC_TASK_MSG_RECV_WAIT, &task->tk_runstate); - wait_on_bit(&task->tk_runstate, RPC_TASK_MSG_RECV, - TASK_UNINTERRUPTIBLE); - clear_bit(RPC_TASK_MSG_RECV_WAIT, &task->tk_runstate); - spin_lock(&req->rq_xprt->recv_lock); - } +static bool +xprt_request_data_received(struct rpc_task *task) +{ + return !test_bit(RPC_TASK_NEED_RECV, &task->tk_runstate) && + READ_ONCE(task->tk_rqstp->rq_reply_bytes_recvd) != 0; +} + +static bool +xprt_request_need_enqueue_receive(struct rpc_task *task, struct rpc_rqst *req) +{ + return !test_bit(RPC_TASK_NEED_RECV, &task->tk_runstate) && + READ_ONCE(task->tk_rqstp->rq_reply_bytes_recvd) == 0; +} + +/** + * xprt_request_enqueue_receive - Add an request to the receive queue + * @task: RPC task + * + */ +void +xprt_request_enqueue_receive(struct rpc_task *task) +{ + struct rpc_rqst *req = task->tk_rqstp; + struct rpc_xprt *xprt = req->rq_xprt; + + if (!xprt_request_need_enqueue_receive(task, req)) + return; + spin_lock(&xprt->queue_lock); + + /* Update the softirq receive buffer */ + memcpy(&req->rq_private_buf, &req->rq_rcv_buf, + sizeof(req->rq_private_buf)); + + /* Add request to the receive list */ + xprt_request_rb_insert(xprt, req); + set_bit(RPC_TASK_NEED_RECV, &task->tk_runstate); + spin_unlock(&xprt->queue_lock); + + xprt_reset_majortimeo(req); + /* Turn off autodisconnect */ + del_singleshot_timer_sync(&xprt->timer); +} + +/** + * xprt_request_dequeue_receive_locked - Remove a request from the receive queue + * @task: RPC task + * + * Caller must hold xprt->queue_lock. + */ +static void +xprt_request_dequeue_receive_locked(struct rpc_task *task) +{ + struct rpc_rqst *req = task->tk_rqstp; + + if (test_and_clear_bit(RPC_TASK_NEED_RECV, &task->tk_runstate)) + xprt_request_rb_remove(req->rq_xprt, req); } /** * xprt_update_rtt - Update RPC RTT statistics * @task: RPC request that recently completed * - * Caller holds xprt->recv_lock. + * Caller holds xprt->queue_lock. */ void xprt_update_rtt(struct rpc_task *task) { @@ -917,7 +1080,7 @@ EXPORT_SYMBOL_GPL(xprt_update_rtt); * @task: RPC request that recently completed * @copied: actual number of bytes received from the transport * - * Caller holds xprt->recv_lock. + * Caller holds xprt->queue_lock. */ void xprt_complete_rqst(struct rpc_task *task, int copied) { @@ -930,12 +1093,12 @@ void xprt_complete_rqst(struct rpc_task *task, int copied) xprt->stat.recvs++; - list_del_init(&req->rq_list); req->rq_private_buf.len = copied; /* Ensure all writes are done before we update */ /* req->rq_reply_bytes_recvd */ smp_wmb(); req->rq_reply_bytes_recvd = copied; + xprt_request_dequeue_receive_locked(task); rpc_wake_up_queued_task(&xprt->pending, task); } EXPORT_SYMBOL_GPL(xprt_complete_rqst); @@ -957,6 +1120,172 @@ static void xprt_timer(struct rpc_task *task) } /** + * xprt_request_wait_receive - wait for the reply to an RPC request + * @task: RPC task about to send a request + * + */ +void xprt_request_wait_receive(struct rpc_task *task) +{ + struct rpc_rqst *req = task->tk_rqstp; + struct rpc_xprt *xprt = req->rq_xprt; + + if (!test_bit(RPC_TASK_NEED_RECV, &task->tk_runstate)) + return; + /* + * Sleep on the pending queue if we're expecting a reply. + * The spinlock ensures atomicity between the test of + * req->rq_reply_bytes_recvd, and the call to rpc_sleep_on(). + */ + spin_lock(&xprt->queue_lock); + if (test_bit(RPC_TASK_NEED_RECV, &task->tk_runstate)) { + xprt->ops->set_retrans_timeout(task); + rpc_sleep_on(&xprt->pending, task, xprt_timer); + /* + * Send an extra queue wakeup call if the + * connection was dropped in case the call to + * rpc_sleep_on() raced. + */ + if (xprt_request_retransmit_after_disconnect(task)) + rpc_wake_up_queued_task_set_status(&xprt->pending, + task, -ENOTCONN); + } + spin_unlock(&xprt->queue_lock); +} + +static bool +xprt_request_need_enqueue_transmit(struct rpc_task *task, struct rpc_rqst *req) +{ + return !test_bit(RPC_TASK_NEED_XMIT, &task->tk_runstate); +} + +/** + * xprt_request_enqueue_transmit - queue a task for transmission + * @task: pointer to rpc_task + * + * Add a task to the transmission queue. + */ +void +xprt_request_enqueue_transmit(struct rpc_task *task) +{ + struct rpc_rqst *pos, *req = task->tk_rqstp; + struct rpc_xprt *xprt = req->rq_xprt; + + if (xprt_request_need_enqueue_transmit(task, req)) { + spin_lock(&xprt->queue_lock); + /* + * Requests that carry congestion control credits are added + * to the head of the list to avoid starvation issues. + */ + if (req->rq_cong) { + xprt_clear_congestion_window_wait(xprt); + list_for_each_entry(pos, &xprt->xmit_queue, rq_xmit) { + if (pos->rq_cong) + continue; + /* Note: req is added _before_ pos */ + list_add_tail(&req->rq_xmit, &pos->rq_xmit); + INIT_LIST_HEAD(&req->rq_xmit2); + goto out; + } + } else if (RPC_IS_SWAPPER(task)) { + list_for_each_entry(pos, &xprt->xmit_queue, rq_xmit) { + if (pos->rq_cong || pos->rq_bytes_sent) + continue; + if (RPC_IS_SWAPPER(pos->rq_task)) + continue; + /* Note: req is added _before_ pos */ + list_add_tail(&req->rq_xmit, &pos->rq_xmit); + INIT_LIST_HEAD(&req->rq_xmit2); + goto out; + } + } else { + list_for_each_entry(pos, &xprt->xmit_queue, rq_xmit) { + if (pos->rq_task->tk_owner != task->tk_owner) + continue; + list_add_tail(&req->rq_xmit2, &pos->rq_xmit2); + INIT_LIST_HEAD(&req->rq_xmit); + goto out; + } + } + list_add_tail(&req->rq_xmit, &xprt->xmit_queue); + INIT_LIST_HEAD(&req->rq_xmit2); +out: + set_bit(RPC_TASK_NEED_XMIT, &task->tk_runstate); + spin_unlock(&xprt->queue_lock); + } +} + +/** + * xprt_request_dequeue_transmit_locked - remove a task from the transmission queue + * @task: pointer to rpc_task + * + * Remove a task from the transmission queue + * Caller must hold xprt->queue_lock + */ +static void +xprt_request_dequeue_transmit_locked(struct rpc_task *task) +{ + struct rpc_rqst *req = task->tk_rqstp; + + if (!test_and_clear_bit(RPC_TASK_NEED_XMIT, &task->tk_runstate)) + return; + if (!list_empty(&req->rq_xmit)) { + list_del(&req->rq_xmit); + if (!list_empty(&req->rq_xmit2)) { + struct rpc_rqst *next = list_first_entry(&req->rq_xmit2, + struct rpc_rqst, rq_xmit2); + list_del(&req->rq_xmit2); + list_add_tail(&next->rq_xmit, &next->rq_xprt->xmit_queue); + } + } else + list_del(&req->rq_xmit2); +} + +/** + * xprt_request_dequeue_transmit - remove a task from the transmission queue + * @task: pointer to rpc_task + * + * Remove a task from the transmission queue + */ +static void +xprt_request_dequeue_transmit(struct rpc_task *task) +{ + struct rpc_rqst *req = task->tk_rqstp; + struct rpc_xprt *xprt = req->rq_xprt; + + spin_lock(&xprt->queue_lock); + xprt_request_dequeue_transmit_locked(task); + spin_unlock(&xprt->queue_lock); +} + +/** + * xprt_request_prepare - prepare an encoded request for transport + * @req: pointer to rpc_rqst + * + * Calls into the transport layer to do whatever is needed to prepare + * the request for transmission or receive. + */ +void +xprt_request_prepare(struct rpc_rqst *req) +{ + struct rpc_xprt *xprt = req->rq_xprt; + + if (xprt->ops->prepare_request) + xprt->ops->prepare_request(req); +} + +/** + * xprt_request_need_retransmit - Test if a task needs retransmission + * @task: pointer to rpc_task + * + * Test for whether a connection breakage requires the task to retransmit + */ +bool +xprt_request_need_retransmit(struct rpc_task *task) +{ + return xprt_request_retransmit_after_disconnect(task); +} + +/** * xprt_prepare_transmit - reserve the transport before sending a request * @task: RPC task about to send a request * @@ -965,32 +1294,18 @@ bool xprt_prepare_transmit(struct rpc_task *task) { struct rpc_rqst *req = task->tk_rqstp; struct rpc_xprt *xprt = req->rq_xprt; - bool ret = false; dprintk("RPC: %5u xprt_prepare_transmit\n", task->tk_pid); - spin_lock_bh(&xprt->transport_lock); - if (!req->rq_bytes_sent) { - if (req->rq_reply_bytes_recvd) { - task->tk_status = req->rq_reply_bytes_recvd; - goto out_unlock; - } - if ((task->tk_flags & RPC_TASK_NO_RETRANS_TIMEOUT) - && xprt_connected(xprt) - && req->rq_connect_cookie == xprt->connect_cookie) { - xprt->ops->set_retrans_timeout(task); - rpc_sleep_on(&xprt->pending, task, xprt_timer); - goto out_unlock; - } - } - if (!xprt->ops->reserve_xprt(xprt, task)) { - task->tk_status = -EAGAIN; - goto out_unlock; + if (!xprt_lock_write(xprt, task)) { + /* Race breaker: someone may have transmitted us */ + if (!test_bit(RPC_TASK_NEED_XMIT, &task->tk_runstate)) + rpc_wake_up_queued_task_set_status(&xprt->sending, + task, 0); + return false; + } - ret = true; -out_unlock: - spin_unlock_bh(&xprt->transport_lock); - return ret; + return true; } void xprt_end_transmit(struct rpc_task *task) @@ -999,54 +1314,62 @@ void xprt_end_transmit(struct rpc_task *task) } /** - * xprt_transmit - send an RPC request on a transport - * @task: controlling RPC task + * xprt_request_transmit - send an RPC request on a transport + * @req: pointer to request to transmit + * @snd_task: RPC task that owns the transport lock * - * We have to copy the iovec because sendmsg fiddles with its contents. + * This performs the transmission of a single request. + * Note that if the request is not the same as snd_task, then it + * does need to be pinned. + * Returns '0' on success. */ -void xprt_transmit(struct rpc_task *task) +static int +xprt_request_transmit(struct rpc_rqst *req, struct rpc_task *snd_task) { - struct rpc_rqst *req = task->tk_rqstp; - struct rpc_xprt *xprt = req->rq_xprt; + struct rpc_xprt *xprt = req->rq_xprt; + struct rpc_task *task = req->rq_task; unsigned int connect_cookie; + int is_retrans = RPC_WAS_SENT(task); int status; dprintk("RPC: %5u xprt_transmit(%u)\n", task->tk_pid, req->rq_slen); - if (!req->rq_reply_bytes_recvd) { - if (list_empty(&req->rq_list) && rpc_reply_expected(task)) { - /* - * Add to the list only if we're expecting a reply - */ - /* Update the softirq receive buffer */ - memcpy(&req->rq_private_buf, &req->rq_rcv_buf, - sizeof(req->rq_private_buf)); - /* Add request to the receive list */ - spin_lock(&xprt->recv_lock); - list_add_tail(&req->rq_list, &xprt->recv); - spin_unlock(&xprt->recv_lock); - xprt_reset_majortimeo(req); - /* Turn off autodisconnect */ - del_singleshot_timer_sync(&xprt->timer); + if (!req->rq_bytes_sent) { + if (xprt_request_data_received(task)) { + status = 0; + goto out_dequeue; } - } else if (!req->rq_bytes_sent) - return; + /* Verify that our message lies in the RPCSEC_GSS window */ + if (rpcauth_xmit_need_reencode(task)) { + status = -EBADMSG; + goto out_dequeue; + } + } + + /* + * Update req->rq_ntrans before transmitting to avoid races with + * xprt_update_rtt(), which needs to know that it is recording a + * reply to the first transmission. + */ + req->rq_ntrans++; connect_cookie = xprt->connect_cookie; - status = xprt->ops->send_request(task); + status = xprt->ops->send_request(req); trace_xprt_transmit(xprt, req->rq_xid, status); if (status != 0) { - task->tk_status = status; - return; + req->rq_ntrans--; + return status; } + + if (is_retrans) + task->tk_client->cl_stats->rpcretrans++; + xprt_inject_disconnect(xprt); dprintk("RPC: %5u xmit complete\n", task->tk_pid); task->tk_flags |= RPC_TASK_SENT; spin_lock_bh(&xprt->transport_lock); - xprt->ops->set_retrans_timeout(task); - xprt->stat.sends++; xprt->stat.req_u += xprt->stat.sends - xprt->stat.recvs; xprt->stat.bklog_u += xprt->backlog.qlen; @@ -1055,25 +1378,49 @@ void xprt_transmit(struct rpc_task *task) spin_unlock_bh(&xprt->transport_lock); req->rq_connect_cookie = connect_cookie; - if (rpc_reply_expected(task) && !READ_ONCE(req->rq_reply_bytes_recvd)) { - /* - * Sleep on the pending queue if we're expecting a reply. - * The spinlock ensures atomicity between the test of - * req->rq_reply_bytes_recvd, and the call to rpc_sleep_on(). - */ - spin_lock(&xprt->recv_lock); - if (!req->rq_reply_bytes_recvd) { - rpc_sleep_on(&xprt->pending, task, xprt_timer); - /* - * Send an extra queue wakeup call if the - * connection was dropped in case the call to - * rpc_sleep_on() raced. - */ - if (!xprt_connected(xprt)) - xprt_wake_pending_tasks(xprt, -ENOTCONN); - } - spin_unlock(&xprt->recv_lock); +out_dequeue: + xprt_request_dequeue_transmit(task); + rpc_wake_up_queued_task_set_status(&xprt->sending, task, status); + return status; +} + +/** + * xprt_transmit - send an RPC request on a transport + * @task: controlling RPC task + * + * Attempts to drain the transmit queue. On exit, either the transport + * signalled an error that needs to be handled before transmission can + * resume, or @task finished transmitting, and detected that it already + * received a reply. + */ +void +xprt_transmit(struct rpc_task *task) +{ + struct rpc_rqst *next, *req = task->tk_rqstp; + struct rpc_xprt *xprt = req->rq_xprt; + int status; + + spin_lock(&xprt->queue_lock); + while (!list_empty(&xprt->xmit_queue)) { + next = list_first_entry(&xprt->xmit_queue, + struct rpc_rqst, rq_xmit); + xprt_pin_rqst(next); + spin_unlock(&xprt->queue_lock); + status = xprt_request_transmit(next, task); + if (status == -EBADMSG && next != req) + status = 0; + cond_resched(); + spin_lock(&xprt->queue_lock); + xprt_unpin_rqst(next); + if (status == 0) { + if (!xprt_request_data_received(task) || + test_bit(RPC_TASK_NEED_XMIT, &task->tk_runstate)) + continue; + } else if (test_bit(RPC_TASK_NEED_XMIT, &task->tk_runstate)) + task->tk_status = status; + break; } + spin_unlock(&xprt->queue_lock); } static void xprt_add_backlog(struct rpc_xprt *xprt, struct rpc_task *task) @@ -1170,20 +1517,6 @@ out_init_req: } EXPORT_SYMBOL_GPL(xprt_alloc_slot); -void xprt_lock_and_alloc_slot(struct rpc_xprt *xprt, struct rpc_task *task) -{ - /* Note: grabbing the xprt_lock_write() ensures that we throttle - * new slot allocation if the transport is congested (i.e. when - * reconnecting a stream transport or when out of socket write - * buffer space). - */ - if (xprt_lock_write(xprt, task)) { - xprt_alloc_slot(xprt, task); - xprt_release_write(xprt, task); - } -} -EXPORT_SYMBOL_GPL(xprt_lock_and_alloc_slot); - void xprt_free_slot(struct rpc_xprt *xprt, struct rpc_rqst *req) { spin_lock(&xprt->reserve_lock); @@ -1250,6 +1583,60 @@ void xprt_free(struct rpc_xprt *xprt) } EXPORT_SYMBOL_GPL(xprt_free); +static void +xprt_init_connect_cookie(struct rpc_rqst *req, struct rpc_xprt *xprt) +{ + req->rq_connect_cookie = xprt_connect_cookie(xprt) - 1; +} + +static __be32 +xprt_alloc_xid(struct rpc_xprt *xprt) +{ + __be32 xid; + + spin_lock(&xprt->reserve_lock); + xid = (__force __be32)xprt->xid++; + spin_unlock(&xprt->reserve_lock); + return xid; +} + +static void +xprt_init_xid(struct rpc_xprt *xprt) +{ + xprt->xid = prandom_u32(); +} + +static void +xprt_request_init(struct rpc_task *task) +{ + struct rpc_xprt *xprt = task->tk_xprt; + struct rpc_rqst *req = task->tk_rqstp; + + req->rq_timeout = task->tk_client->cl_timeout->to_initval; + req->rq_task = task; + req->rq_xprt = xprt; + req->rq_buffer = NULL; + req->rq_xid = xprt_alloc_xid(xprt); + xprt_init_connect_cookie(req, xprt); + req->rq_bytes_sent = 0; + req->rq_snd_buf.len = 0; + req->rq_snd_buf.buflen = 0; + req->rq_rcv_buf.len = 0; + req->rq_rcv_buf.buflen = 0; + req->rq_release_snd_buf = NULL; + xprt_reset_majortimeo(req); + dprintk("RPC: %5u reserved req %p xid %08x\n", task->tk_pid, + req, ntohl(req->rq_xid)); +} + +static void +xprt_do_reserve(struct rpc_xprt *xprt, struct rpc_task *task) +{ + xprt->ops->alloc_slot(xprt, task); + if (task->tk_rqstp != NULL) + xprt_request_init(task); +} + /** * xprt_reserve - allocate an RPC request slot * @task: RPC task requesting a slot allocation @@ -1269,7 +1656,7 @@ void xprt_reserve(struct rpc_task *task) task->tk_timeout = 0; task->tk_status = -EAGAIN; if (!xprt_throttle_congested(xprt, task)) - xprt->ops->alloc_slot(xprt, task); + xprt_do_reserve(xprt, task); } /** @@ -1291,45 +1678,29 @@ void xprt_retry_reserve(struct rpc_task *task) task->tk_timeout = 0; task->tk_status = -EAGAIN; - xprt->ops->alloc_slot(xprt, task); -} - -static inline __be32 xprt_alloc_xid(struct rpc_xprt *xprt) -{ - __be32 xid; - - spin_lock(&xprt->reserve_lock); - xid = (__force __be32)xprt->xid++; - spin_unlock(&xprt->reserve_lock); - return xid; + xprt_do_reserve(xprt, task); } -static inline void xprt_init_xid(struct rpc_xprt *xprt) -{ - xprt->xid = prandom_u32(); -} - -void xprt_request_init(struct rpc_task *task) +static void +xprt_request_dequeue_all(struct rpc_task *task, struct rpc_rqst *req) { - struct rpc_xprt *xprt = task->tk_xprt; - struct rpc_rqst *req = task->tk_rqstp; + struct rpc_xprt *xprt = req->rq_xprt; - INIT_LIST_HEAD(&req->rq_list); - req->rq_timeout = task->tk_client->cl_timeout->to_initval; - req->rq_task = task; - req->rq_xprt = xprt; - req->rq_buffer = NULL; - req->rq_xid = xprt_alloc_xid(xprt); - req->rq_connect_cookie = xprt->connect_cookie - 1; - req->rq_bytes_sent = 0; - req->rq_snd_buf.len = 0; - req->rq_snd_buf.buflen = 0; - req->rq_rcv_buf.len = 0; - req->rq_rcv_buf.buflen = 0; - req->rq_release_snd_buf = NULL; - xprt_reset_majortimeo(req); - dprintk("RPC: %5u reserved req %p xid %08x\n", task->tk_pid, - req, ntohl(req->rq_xid)); + if (test_bit(RPC_TASK_NEED_XMIT, &task->tk_runstate) || + test_bit(RPC_TASK_NEED_RECV, &task->tk_runstate) || + xprt_is_pinned_rqst(req)) { + spin_lock(&xprt->queue_lock); + xprt_request_dequeue_transmit_locked(task); + xprt_request_dequeue_receive_locked(task); + while (xprt_is_pinned_rqst(req)) { + set_bit(RPC_TASK_MSG_PIN_WAIT, &task->tk_runstate); + spin_unlock(&xprt->queue_lock); + xprt_wait_on_pinned_rqst(req); + spin_lock(&xprt->queue_lock); + clear_bit(RPC_TASK_MSG_PIN_WAIT, &task->tk_runstate); + } + spin_unlock(&xprt->queue_lock); + } } /** @@ -1345,8 +1716,7 @@ void xprt_release(struct rpc_task *task) if (req == NULL) { if (task->tk_client) { xprt = task->tk_xprt; - if (xprt->snd_task == task) - xprt_release_write(xprt, task); + xprt_release_write(xprt, task); } return; } @@ -1356,12 +1726,7 @@ void xprt_release(struct rpc_task *task) task->tk_ops->rpc_count_stats(task, task->tk_calldata); else if (task->tk_client) rpc_count_iostats(task, task->tk_client->cl_metrics); - spin_lock(&xprt->recv_lock); - if (!list_empty(&req->rq_list)) { - list_del_init(&req->rq_list); - xprt_wait_on_pinned_rqst(req); - } - spin_unlock(&xprt->recv_lock); + xprt_request_dequeue_all(task, req); spin_lock_bh(&xprt->transport_lock); xprt->ops->release_xprt(xprt, task); if (xprt->ops->release_request) @@ -1372,6 +1737,7 @@ void xprt_release(struct rpc_task *task) if (req->rq_buffer) xprt->ops->buf_free(task); xprt_inject_disconnect(xprt); + xdr_free_bvec(&req->rq_rcv_buf); if (req->rq_cred != NULL) put_rpccred(req->rq_cred); task->tk_rqstp = NULL; @@ -1385,16 +1751,36 @@ void xprt_release(struct rpc_task *task) xprt_free_bc_request(req); } +#ifdef CONFIG_SUNRPC_BACKCHANNEL +void +xprt_init_bc_request(struct rpc_rqst *req, struct rpc_task *task) +{ + struct xdr_buf *xbufp = &req->rq_snd_buf; + + task->tk_rqstp = req; + req->rq_task = task; + xprt_init_connect_cookie(req, req->rq_xprt); + /* + * Set up the xdr_buf length. + * This also indicates that the buffer is XDR encoded already. + */ + xbufp->len = xbufp->head[0].iov_len + xbufp->page_len + + xbufp->tail[0].iov_len; + req->rq_bytes_sent = 0; +} +#endif + static void xprt_init(struct rpc_xprt *xprt, struct net *net) { kref_init(&xprt->kref); spin_lock_init(&xprt->transport_lock); spin_lock_init(&xprt->reserve_lock); - spin_lock_init(&xprt->recv_lock); + spin_lock_init(&xprt->queue_lock); INIT_LIST_HEAD(&xprt->free); - INIT_LIST_HEAD(&xprt->recv); + xprt->recv_queue = RB_ROOT; + INIT_LIST_HEAD(&xprt->xmit_queue); #if defined(CONFIG_SUNRPC_BACKCHANNEL) spin_lock_init(&xprt->bc_pa_lock); INIT_LIST_HEAD(&xprt->bc_pa_list); @@ -1407,7 +1793,7 @@ static void xprt_init(struct rpc_xprt *xprt, struct net *net) rpc_init_wait_queue(&xprt->binding, "xprt_binding"); rpc_init_wait_queue(&xprt->pending, "xprt_pending"); - rpc_init_priority_wait_queue(&xprt->sending, "xprt_sending"); + rpc_init_wait_queue(&xprt->sending, "xprt_sending"); rpc_init_priority_wait_queue(&xprt->backlog, "xprt_backlog"); xprt_init_xid(xprt); diff --git a/net/sunrpc/xprtrdma/backchannel.c b/net/sunrpc/xprtrdma/backchannel.c index 90adeff4c06b..e5b367a3e517 100644 --- a/net/sunrpc/xprtrdma/backchannel.c +++ b/net/sunrpc/xprtrdma/backchannel.c @@ -51,12 +51,11 @@ static int rpcrdma_bc_setup_reqs(struct rpcrdma_xprt *r_xprt, rqst = &req->rl_slot; rqst->rq_xprt = xprt; - INIT_LIST_HEAD(&rqst->rq_list); INIT_LIST_HEAD(&rqst->rq_bc_list); __set_bit(RPC_BC_PA_IN_USE, &rqst->rq_bc_pa_state); - spin_lock_bh(&xprt->bc_pa_lock); + spin_lock(&xprt->bc_pa_lock); list_add(&rqst->rq_bc_pa_list, &xprt->bc_pa_list); - spin_unlock_bh(&xprt->bc_pa_lock); + spin_unlock(&xprt->bc_pa_lock); size = r_xprt->rx_data.inline_rsize; rb = rpcrdma_alloc_regbuf(size, DMA_TO_DEVICE, GFP_KERNEL); @@ -201,6 +200,9 @@ int xprt_rdma_bc_send_reply(struct rpc_rqst *rqst) if (!xprt_connected(rqst->rq_xprt)) goto drop_connection; + if (!xprt_request_get_cong(rqst->rq_xprt, rqst)) + return -EBADSLT; + rc = rpcrdma_bc_marshal_reply(rqst); if (rc < 0) goto failed_marshal; @@ -228,16 +230,16 @@ void xprt_rdma_bc_destroy(struct rpc_xprt *xprt, unsigned int reqs) struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); struct rpc_rqst *rqst, *tmp; - spin_lock_bh(&xprt->bc_pa_lock); + spin_lock(&xprt->bc_pa_lock); list_for_each_entry_safe(rqst, tmp, &xprt->bc_pa_list, rq_bc_pa_list) { list_del(&rqst->rq_bc_pa_list); - spin_unlock_bh(&xprt->bc_pa_lock); + spin_unlock(&xprt->bc_pa_lock); rpcrdma_bc_free_rqst(r_xprt, rqst); - spin_lock_bh(&xprt->bc_pa_lock); + spin_lock(&xprt->bc_pa_lock); } - spin_unlock_bh(&xprt->bc_pa_lock); + spin_unlock(&xprt->bc_pa_lock); } /** @@ -255,9 +257,9 @@ void xprt_rdma_bc_free_rqst(struct rpc_rqst *rqst) rpcrdma_recv_buffer_put(req->rl_reply); req->rl_reply = NULL; - spin_lock_bh(&xprt->bc_pa_lock); + spin_lock(&xprt->bc_pa_lock); list_add_tail(&rqst->rq_bc_pa_list, &xprt->bc_pa_list); - spin_unlock_bh(&xprt->bc_pa_lock); + spin_unlock(&xprt->bc_pa_lock); } /** diff --git a/net/sunrpc/xprtrdma/fmr_ops.c b/net/sunrpc/xprtrdma/fmr_ops.c index 0f7c465d9a5a..7f5632cd5a48 100644 --- a/net/sunrpc/xprtrdma/fmr_ops.c +++ b/net/sunrpc/xprtrdma/fmr_ops.c @@ -49,46 +49,7 @@ fmr_is_supported(struct rpcrdma_ia *ia) return true; } -static int -fmr_op_init_mr(struct rpcrdma_ia *ia, struct rpcrdma_mr *mr) -{ - static struct ib_fmr_attr fmr_attr = { - .max_pages = RPCRDMA_MAX_FMR_SGES, - .max_maps = 1, - .page_shift = PAGE_SHIFT - }; - - mr->fmr.fm_physaddrs = kcalloc(RPCRDMA_MAX_FMR_SGES, - sizeof(u64), GFP_KERNEL); - if (!mr->fmr.fm_physaddrs) - goto out_free; - - mr->mr_sg = kcalloc(RPCRDMA_MAX_FMR_SGES, - sizeof(*mr->mr_sg), GFP_KERNEL); - if (!mr->mr_sg) - goto out_free; - - sg_init_table(mr->mr_sg, RPCRDMA_MAX_FMR_SGES); - - mr->fmr.fm_mr = ib_alloc_fmr(ia->ri_pd, RPCRDMA_FMR_ACCESS_FLAGS, - &fmr_attr); - if (IS_ERR(mr->fmr.fm_mr)) - goto out_fmr_err; - - INIT_LIST_HEAD(&mr->mr_list); - return 0; - -out_fmr_err: - dprintk("RPC: %s: ib_alloc_fmr returned %ld\n", __func__, - PTR_ERR(mr->fmr.fm_mr)); - -out_free: - kfree(mr->mr_sg); - kfree(mr->fmr.fm_physaddrs); - return -ENOMEM; -} - -static int +static void __fmr_unmap(struct rpcrdma_mr *mr) { LIST_HEAD(l); @@ -97,13 +58,16 @@ __fmr_unmap(struct rpcrdma_mr *mr) list_add(&mr->fmr.fm_mr->list, &l); rc = ib_unmap_fmr(&l); list_del(&mr->fmr.fm_mr->list); - return rc; + if (rc) + pr_err("rpcrdma: final ib_unmap_fmr for %p failed %i\n", + mr, rc); } +/* Release an MR. + */ static void fmr_op_release_mr(struct rpcrdma_mr *mr) { - LIST_HEAD(unmap_list); int rc; kfree(mr->fmr.fm_physaddrs); @@ -112,10 +76,7 @@ fmr_op_release_mr(struct rpcrdma_mr *mr) /* In case this one was left mapped, try to unmap it * to prevent dealloc_fmr from failing with EBUSY */ - rc = __fmr_unmap(mr); - if (rc) - pr_err("rpcrdma: final ib_unmap_fmr for %p failed %i\n", - mr, rc); + __fmr_unmap(mr); rc = ib_dealloc_fmr(mr->fmr.fm_mr); if (rc) @@ -125,40 +86,68 @@ fmr_op_release_mr(struct rpcrdma_mr *mr) kfree(mr); } -/* Reset of a single FMR. +/* MRs are dynamically allocated, so simply clean up and release the MR. + * A replacement MR will subsequently be allocated on demand. */ static void -fmr_op_recover_mr(struct rpcrdma_mr *mr) +fmr_mr_recycle_worker(struct work_struct *work) { + struct rpcrdma_mr *mr = container_of(work, struct rpcrdma_mr, mr_recycle); struct rpcrdma_xprt *r_xprt = mr->mr_xprt; - int rc; - /* ORDER: invalidate first */ - rc = __fmr_unmap(mr); - if (rc) - goto out_release; - - /* ORDER: then DMA unmap */ - rpcrdma_mr_unmap_and_put(mr); + trace_xprtrdma_mr_recycle(mr); - r_xprt->rx_stats.mrs_recovered++; - return; - -out_release: - pr_err("rpcrdma: FMR reset failed (%d), %p released\n", rc, mr); - r_xprt->rx_stats.mrs_orphaned++; - - trace_xprtrdma_dma_unmap(mr); + trace_xprtrdma_mr_unmap(mr); ib_dma_unmap_sg(r_xprt->rx_ia.ri_device, mr->mr_sg, mr->mr_nents, mr->mr_dir); spin_lock(&r_xprt->rx_buf.rb_mrlock); list_del(&mr->mr_all); + r_xprt->rx_stats.mrs_recycled++; spin_unlock(&r_xprt->rx_buf.rb_mrlock); - fmr_op_release_mr(mr); } +static int +fmr_op_init_mr(struct rpcrdma_ia *ia, struct rpcrdma_mr *mr) +{ + static struct ib_fmr_attr fmr_attr = { + .max_pages = RPCRDMA_MAX_FMR_SGES, + .max_maps = 1, + .page_shift = PAGE_SHIFT + }; + + mr->fmr.fm_physaddrs = kcalloc(RPCRDMA_MAX_FMR_SGES, + sizeof(u64), GFP_KERNEL); + if (!mr->fmr.fm_physaddrs) + goto out_free; + + mr->mr_sg = kcalloc(RPCRDMA_MAX_FMR_SGES, + sizeof(*mr->mr_sg), GFP_KERNEL); + if (!mr->mr_sg) + goto out_free; + + sg_init_table(mr->mr_sg, RPCRDMA_MAX_FMR_SGES); + + mr->fmr.fm_mr = ib_alloc_fmr(ia->ri_pd, RPCRDMA_FMR_ACCESS_FLAGS, + &fmr_attr); + if (IS_ERR(mr->fmr.fm_mr)) + goto out_fmr_err; + + INIT_LIST_HEAD(&mr->mr_list); + INIT_WORK(&mr->mr_recycle, fmr_mr_recycle_worker); + return 0; + +out_fmr_err: + dprintk("RPC: %s: ib_alloc_fmr returned %ld\n", __func__, + PTR_ERR(mr->fmr.fm_mr)); + +out_free: + kfree(mr->mr_sg); + kfree(mr->fmr.fm_physaddrs); + return -ENOMEM; +} + /* On success, sets: * ep->rep_attr.cap.max_send_wr * ep->rep_attr.cap.max_recv_wr @@ -187,6 +176,7 @@ fmr_op_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep, ia->ri_max_segs = max_t(unsigned int, 1, RPCRDMA_MAX_DATA_SEGS / RPCRDMA_MAX_FMR_SGES); + ia->ri_max_segs += 2; /* segments for head and tail buffers */ return 0; } @@ -244,7 +234,7 @@ fmr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, mr->mr_sg, i, mr->mr_dir); if (!mr->mr_nents) goto out_dmamap_err; - trace_xprtrdma_dma_map(mr); + trace_xprtrdma_mr_map(mr); for (i = 0, dma_pages = mr->fmr.fm_physaddrs; i < mr->mr_nents; i++) dma_pages[i] = sg_dma_address(&mr->mr_sg[i]); @@ -305,13 +295,13 @@ fmr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct list_head *mrs) list_for_each_entry(mr, mrs, mr_list) { dprintk("RPC: %s: unmapping fmr %p\n", __func__, &mr->fmr); - trace_xprtrdma_localinv(mr); + trace_xprtrdma_mr_localinv(mr); list_add_tail(&mr->fmr.fm_mr->list, &unmap_list); } r_xprt->rx_stats.local_inv_needed++; rc = ib_unmap_fmr(&unmap_list); if (rc) - goto out_reset; + goto out_release; /* ORDER: Now DMA unmap all of the req's MRs, and return * them to the free MW list. @@ -324,13 +314,13 @@ fmr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct list_head *mrs) return; -out_reset: +out_release: pr_err("rpcrdma: ib_unmap_fmr failed (%i)\n", rc); while (!list_empty(mrs)) { mr = rpcrdma_mr_pop(mrs); list_del(&mr->fmr.fm_mr->list); - fmr_op_recover_mr(mr); + rpcrdma_mr_recycle(mr); } } @@ -338,7 +328,6 @@ const struct rpcrdma_memreg_ops rpcrdma_fmr_memreg_ops = { .ro_map = fmr_op_map, .ro_send = fmr_op_send, .ro_unmap_sync = fmr_op_unmap_sync, - .ro_recover_mr = fmr_op_recover_mr, .ro_open = fmr_op_open, .ro_maxpages = fmr_op_maxpages, .ro_init_mr = fmr_op_init_mr, diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c index 1bb00dd6ccdb..fc6378cc0c1c 100644 --- a/net/sunrpc/xprtrdma/frwr_ops.c +++ b/net/sunrpc/xprtrdma/frwr_ops.c @@ -97,6 +97,44 @@ out_not_supported: return false; } +static void +frwr_op_release_mr(struct rpcrdma_mr *mr) +{ + int rc; + + rc = ib_dereg_mr(mr->frwr.fr_mr); + if (rc) + pr_err("rpcrdma: final ib_dereg_mr for %p returned %i\n", + mr, rc); + kfree(mr->mr_sg); + kfree(mr); +} + +/* MRs are dynamically allocated, so simply clean up and release the MR. + * A replacement MR will subsequently be allocated on demand. + */ +static void +frwr_mr_recycle_worker(struct work_struct *work) +{ + struct rpcrdma_mr *mr = container_of(work, struct rpcrdma_mr, mr_recycle); + enum rpcrdma_frwr_state state = mr->frwr.fr_state; + struct rpcrdma_xprt *r_xprt = mr->mr_xprt; + + trace_xprtrdma_mr_recycle(mr); + + if (state != FRWR_FLUSHED_LI) { + trace_xprtrdma_mr_unmap(mr); + ib_dma_unmap_sg(r_xprt->rx_ia.ri_device, + mr->mr_sg, mr->mr_nents, mr->mr_dir); + } + + spin_lock(&r_xprt->rx_buf.rb_mrlock); + list_del(&mr->mr_all); + r_xprt->rx_stats.mrs_recycled++; + spin_unlock(&r_xprt->rx_buf.rb_mrlock); + frwr_op_release_mr(mr); +} + static int frwr_op_init_mr(struct rpcrdma_ia *ia, struct rpcrdma_mr *mr) { @@ -113,6 +151,7 @@ frwr_op_init_mr(struct rpcrdma_ia *ia, struct rpcrdma_mr *mr) goto out_list_err; INIT_LIST_HEAD(&mr->mr_list); + INIT_WORK(&mr->mr_recycle, frwr_mr_recycle_worker); sg_init_table(mr->mr_sg, depth); init_completion(&frwr->fr_linv_done); return 0; @@ -131,79 +170,6 @@ out_list_err: return rc; } -static void -frwr_op_release_mr(struct rpcrdma_mr *mr) -{ - int rc; - - rc = ib_dereg_mr(mr->frwr.fr_mr); - if (rc) - pr_err("rpcrdma: final ib_dereg_mr for %p returned %i\n", - mr, rc); - kfree(mr->mr_sg); - kfree(mr); -} - -static int -__frwr_mr_reset(struct rpcrdma_ia *ia, struct rpcrdma_mr *mr) -{ - struct rpcrdma_frwr *frwr = &mr->frwr; - int rc; - - rc = ib_dereg_mr(frwr->fr_mr); - if (rc) { - pr_warn("rpcrdma: ib_dereg_mr status %d, frwr %p orphaned\n", - rc, mr); - return rc; - } - - frwr->fr_mr = ib_alloc_mr(ia->ri_pd, ia->ri_mrtype, - ia->ri_max_frwr_depth); - if (IS_ERR(frwr->fr_mr)) { - pr_warn("rpcrdma: ib_alloc_mr status %ld, frwr %p orphaned\n", - PTR_ERR(frwr->fr_mr), mr); - return PTR_ERR(frwr->fr_mr); - } - - dprintk("RPC: %s: recovered FRWR %p\n", __func__, frwr); - frwr->fr_state = FRWR_IS_INVALID; - return 0; -} - -/* Reset of a single FRWR. Generate a fresh rkey by replacing the MR. - */ -static void -frwr_op_recover_mr(struct rpcrdma_mr *mr) -{ - enum rpcrdma_frwr_state state = mr->frwr.fr_state; - struct rpcrdma_xprt *r_xprt = mr->mr_xprt; - struct rpcrdma_ia *ia = &r_xprt->rx_ia; - int rc; - - rc = __frwr_mr_reset(ia, mr); - if (state != FRWR_FLUSHED_LI) { - trace_xprtrdma_dma_unmap(mr); - ib_dma_unmap_sg(ia->ri_device, - mr->mr_sg, mr->mr_nents, mr->mr_dir); - } - if (rc) - goto out_release; - - rpcrdma_mr_put(mr); - r_xprt->rx_stats.mrs_recovered++; - return; - -out_release: - pr_err("rpcrdma: FRWR reset failed %d, %p released\n", rc, mr); - r_xprt->rx_stats.mrs_orphaned++; - - spin_lock(&r_xprt->rx_buf.rb_mrlock); - list_del(&mr->mr_all); - spin_unlock(&r_xprt->rx_buf.rb_mrlock); - - frwr_op_release_mr(mr); -} - /* On success, sets: * ep->rep_attr.cap.max_send_wr * ep->rep_attr.cap.max_recv_wr @@ -276,6 +242,7 @@ frwr_op_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep, ia->ri_max_segs = max_t(unsigned int, 1, RPCRDMA_MAX_DATA_SEGS / ia->ri_max_frwr_depth); + ia->ri_max_segs += 2; /* segments for head and tail buffers */ return 0; } @@ -384,7 +351,7 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, mr = NULL; do { if (mr) - rpcrdma_mr_defer_recovery(mr); + rpcrdma_mr_recycle(mr); mr = rpcrdma_mr_get(r_xprt); if (!mr) return ERR_PTR(-EAGAIN); @@ -417,7 +384,7 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, mr->mr_nents = ib_dma_map_sg(ia->ri_device, mr->mr_sg, i, mr->mr_dir); if (!mr->mr_nents) goto out_dmamap_err; - trace_xprtrdma_dma_map(mr); + trace_xprtrdma_mr_map(mr); ibmr = frwr->fr_mr; n = ib_map_mr_sg(ibmr, mr->mr_sg, mr->mr_nents, NULL, PAGE_SIZE); @@ -451,7 +418,7 @@ out_dmamap_err: out_mapmr_err: pr_err("rpcrdma: failed to map mr %p (%d/%d)\n", frwr->fr_mr, n, mr->mr_nents); - rpcrdma_mr_defer_recovery(mr); + rpcrdma_mr_recycle(mr); return ERR_PTR(-EIO); } @@ -499,7 +466,7 @@ frwr_op_reminv(struct rpcrdma_rep *rep, struct list_head *mrs) list_for_each_entry(mr, mrs, mr_list) if (mr->mr_handle == rep->rr_inv_rkey) { list_del_init(&mr->mr_list); - trace_xprtrdma_remoteinv(mr); + trace_xprtrdma_mr_remoteinv(mr); mr->frwr.fr_state = FRWR_IS_INVALID; rpcrdma_mr_unmap_and_put(mr); break; /* only one invalidated MR per RPC */ @@ -536,7 +503,7 @@ frwr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct list_head *mrs) mr->frwr.fr_state = FRWR_IS_INVALID; frwr = &mr->frwr; - trace_xprtrdma_localinv(mr); + trace_xprtrdma_mr_localinv(mr); frwr->fr_cqe.done = frwr_wc_localinv; last = &frwr->fr_invwr; @@ -570,7 +537,7 @@ frwr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct list_head *mrs) if (bad_wr != first) wait_for_completion(&frwr->fr_linv_done); if (rc) - goto reset_mrs; + goto out_release; /* ORDER: Now DMA unmap all of the MRs, and return * them to the free MR list. @@ -582,22 +549,21 @@ unmap: } return; -reset_mrs: +out_release: pr_err("rpcrdma: FRWR invalidate ib_post_send returned %i\n", rc); - /* Find and reset the MRs in the LOCAL_INV WRs that did not + /* Unmap and release the MRs in the LOCAL_INV WRs that did not * get posted. */ while (bad_wr) { frwr = container_of(bad_wr, struct rpcrdma_frwr, fr_invwr); mr = container_of(frwr, struct rpcrdma_mr, frwr); - - __frwr_mr_reset(ia, mr); - bad_wr = bad_wr->next; + + list_del(&mr->mr_list); + frwr_op_release_mr(mr); } - goto unmap; } const struct rpcrdma_memreg_ops rpcrdma_frwr_memreg_ops = { @@ -605,7 +571,6 @@ const struct rpcrdma_memreg_ops rpcrdma_frwr_memreg_ops = { .ro_send = frwr_op_send, .ro_reminv = frwr_op_reminv, .ro_unmap_sync = frwr_op_unmap_sync, - .ro_recover_mr = frwr_op_recover_mr, .ro_open = frwr_op_open, .ro_maxpages = frwr_op_maxpages, .ro_init_mr = frwr_op_init_mr, diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c index c8ae983c6cc0..9f53e0240035 100644 --- a/net/sunrpc/xprtrdma/rpc_rdma.c +++ b/net/sunrpc/xprtrdma/rpc_rdma.c @@ -71,7 +71,6 @@ static unsigned int rpcrdma_max_call_header_size(unsigned int maxsegs) size = RPCRDMA_HDRLEN_MIN; /* Maximum Read list size */ - maxsegs += 2; /* segment for head and tail buffers */ size = maxsegs * rpcrdma_readchunk_maxsz * sizeof(__be32); /* Minimal Read chunk size */ @@ -97,7 +96,6 @@ static unsigned int rpcrdma_max_reply_header_size(unsigned int maxsegs) size = RPCRDMA_HDRLEN_MIN; /* Maximum Write list size */ - maxsegs += 2; /* segment for head and tail buffers */ size = sizeof(__be32); /* segment count */ size += maxsegs * rpcrdma_segment_maxsz * sizeof(__be32); size += sizeof(__be32); /* list discriminator */ @@ -805,7 +803,7 @@ rpcrdma_marshal_req(struct rpcrdma_xprt *r_xprt, struct rpc_rqst *rqst) struct rpcrdma_mr *mr; mr = rpcrdma_mr_pop(&req->rl_registered); - rpcrdma_mr_defer_recovery(mr); + rpcrdma_mr_recycle(mr); } /* This implementation supports the following combinations @@ -866,7 +864,7 @@ rpcrdma_marshal_req(struct rpcrdma_xprt *r_xprt, struct rpc_rqst *rqst) out_err: switch (ret) { case -EAGAIN: - xprt_wait_for_buffer_space(rqst->rq_task, NULL); + xprt_wait_for_buffer_space(rqst->rq_xprt); break; case -ENOBUFS: break; @@ -1216,7 +1214,6 @@ void rpcrdma_complete_rqst(struct rpcrdma_rep *rep) struct rpcrdma_xprt *r_xprt = rep->rr_rxprt; struct rpc_xprt *xprt = &r_xprt->rx_xprt; struct rpc_rqst *rqst = rep->rr_rqst; - unsigned long cwnd; int status; xprt->reestablish_timeout = 0; @@ -1238,15 +1235,10 @@ void rpcrdma_complete_rqst(struct rpcrdma_rep *rep) goto out_badheader; out: - spin_lock(&xprt->recv_lock); - cwnd = xprt->cwnd; - xprt->cwnd = r_xprt->rx_buf.rb_credits << RPC_CWNDSHIFT; - if (xprt->cwnd > cwnd) - xprt_release_rqst_cong(rqst->rq_task); - + spin_lock(&xprt->queue_lock); xprt_complete_rqst(rqst->rq_task, status); xprt_unpin_rqst(rqst); - spin_unlock(&xprt->recv_lock); + spin_unlock(&xprt->queue_lock); return; /* If the incoming reply terminated a pending RPC, the next @@ -1345,19 +1337,23 @@ void rpcrdma_reply_handler(struct rpcrdma_rep *rep) /* Match incoming rpcrdma_rep to an rpcrdma_req to * get context for handling any incoming chunks. */ - spin_lock(&xprt->recv_lock); + spin_lock(&xprt->queue_lock); rqst = xprt_lookup_rqst(xprt, rep->rr_xid); if (!rqst) goto out_norqst; xprt_pin_rqst(rqst); + spin_unlock(&xprt->queue_lock); if (credits == 0) credits = 1; /* don't deadlock */ else if (credits > buf->rb_max_requests) credits = buf->rb_max_requests; - buf->rb_credits = credits; - - spin_unlock(&xprt->recv_lock); + if (buf->rb_credits != credits) { + spin_lock_bh(&xprt->transport_lock); + buf->rb_credits = credits; + xprt->cwnd = credits << RPC_CWNDSHIFT; + spin_unlock_bh(&xprt->transport_lock); + } req = rpcr_to_rdmar(rqst); req->rl_reply = rep; @@ -1378,7 +1374,7 @@ out_badversion: * is corrupt. */ out_norqst: - spin_unlock(&xprt->recv_lock); + spin_unlock(&xprt->queue_lock); trace_xprtrdma_reply_rqst(rep); goto repost; diff --git a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c index a68180090554..f3c147d70286 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c +++ b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c @@ -5,8 +5,6 @@ * Support for backward direction RPCs on RPC/RDMA (server-side). */ -#include <linux/module.h> - #include <linux/sunrpc/svc_rdma.h> #include "xprt_rdma.h" @@ -32,7 +30,6 @@ int svc_rdma_handle_bc_reply(struct rpc_xprt *xprt, __be32 *rdma_resp, struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); struct kvec *dst, *src = &rcvbuf->head[0]; struct rpc_rqst *req; - unsigned long cwnd; u32 credits; size_t len; __be32 xid; @@ -56,7 +53,7 @@ int svc_rdma_handle_bc_reply(struct rpc_xprt *xprt, __be32 *rdma_resp, if (src->iov_len < 24) goto out_shortreply; - spin_lock(&xprt->recv_lock); + spin_lock(&xprt->queue_lock); req = xprt_lookup_rqst(xprt, xid); if (!req) goto out_notfound; @@ -66,6 +63,8 @@ int svc_rdma_handle_bc_reply(struct rpc_xprt *xprt, __be32 *rdma_resp, if (dst->iov_len < len) goto out_unlock; memcpy(dst->iov_base, p, len); + xprt_pin_rqst(req); + spin_unlock(&xprt->queue_lock); credits = be32_to_cpup(rdma_resp + 2); if (credits == 0) @@ -74,19 +73,17 @@ int svc_rdma_handle_bc_reply(struct rpc_xprt *xprt, __be32 *rdma_resp, credits = r_xprt->rx_buf.rb_bc_max_requests; spin_lock_bh(&xprt->transport_lock); - cwnd = xprt->cwnd; xprt->cwnd = credits << RPC_CWNDSHIFT; - if (xprt->cwnd > cwnd) - xprt_release_rqst_cong(req->rq_task); spin_unlock_bh(&xprt->transport_lock); - + spin_lock(&xprt->queue_lock); ret = 0; xprt_complete_rqst(req->rq_task, rcvbuf->len); + xprt_unpin_rqst(req); rcvbuf->len = 0; out_unlock: - spin_unlock(&xprt->recv_lock); + spin_unlock(&xprt->queue_lock); out: return ret; @@ -215,9 +212,8 @@ drop_connection: * connection. */ static int -xprt_rdma_bc_send_request(struct rpc_task *task) +xprt_rdma_bc_send_request(struct rpc_rqst *rqst) { - struct rpc_rqst *rqst = task->tk_rqstp; struct svc_xprt *sxprt = rqst->rq_xprt->bc_xprt; struct svcxprt_rdma *rdma; int ret; @@ -225,12 +221,7 @@ xprt_rdma_bc_send_request(struct rpc_task *task) dprintk("svcrdma: sending bc call with xid: %08x\n", be32_to_cpu(rqst->rq_xid)); - if (!mutex_trylock(&sxprt->xpt_mutex)) { - rpc_sleep_on(&sxprt->xpt_bc_pending, task, NULL); - if (!mutex_trylock(&sxprt->xpt_mutex)) - return -EAGAIN; - rpc_wake_up_queued_task(&sxprt->xpt_bc_pending, task); - } + mutex_lock(&sxprt->xpt_mutex); ret = -ENOTCONN; rdma = container_of(sxprt, struct svcxprt_rdma, sc_xprt); @@ -248,6 +239,7 @@ static void xprt_rdma_bc_close(struct rpc_xprt *xprt) { dprintk("svcrdma: %s: xprt %p\n", __func__, xprt); + xprt->cwnd = RPC_CWNDSHIFT; } static void @@ -256,7 +248,6 @@ xprt_rdma_bc_put(struct rpc_xprt *xprt) dprintk("svcrdma: %s: xprt %p\n", __func__, xprt); xprt_free(xprt); - module_put(THIS_MODULE); } static const struct rpc_xprt_ops xprt_rdma_bc_procs = { @@ -328,20 +319,9 @@ xprt_setup_rdma_bc(struct xprt_create *args) args->bc_xprt->xpt_bc_xprt = xprt; xprt->bc_xprt = args->bc_xprt; - if (!try_module_get(THIS_MODULE)) - goto out_fail; - /* Final put for backchannel xprt is in __svc_rdma_free */ xprt_get(xprt); return xprt; - -out_fail: - xprt_rdma_free_addresses(xprt); - args->bc_xprt->xpt_bc_xprt = NULL; - args->bc_xprt->xpt_bc_xps = NULL; - xprt_put(xprt); - xprt_free(xprt); - return ERR_PTR(-EINVAL); } struct xprt_class xprt_rdma_bc = { diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c index 2848cafd4a17..2f7ec8912f49 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_transport.c +++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c @@ -475,10 +475,12 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) /* Qualify the transport resource defaults with the * capabilities of this particular device */ - newxprt->sc_max_send_sges = dev->attrs.max_send_sge; - /* transport hdr, head iovec, one page list entry, tail iovec */ - if (newxprt->sc_max_send_sges < 4) { - pr_err("svcrdma: too few Send SGEs available (%d)\n", + /* Transport header, head iovec, tail iovec */ + newxprt->sc_max_send_sges = 3; + /* Add one SGE per page list entry */ + newxprt->sc_max_send_sges += svcrdma_max_req_size / PAGE_SIZE; + if (newxprt->sc_max_send_sges > dev->attrs.max_send_sge) { + pr_err("svcrdma: too few Send SGEs available (%d needed)\n", newxprt->sc_max_send_sges); goto errout; } diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c index 143ce2579ba9..ae2a83828953 100644 --- a/net/sunrpc/xprtrdma/transport.c +++ b/net/sunrpc/xprtrdma/transport.c @@ -225,69 +225,59 @@ xprt_rdma_free_addresses(struct rpc_xprt *xprt) } } -void -rpcrdma_conn_func(struct rpcrdma_ep *ep) -{ - schedule_delayed_work(&ep->rep_connect_worker, 0); -} - -void -rpcrdma_connect_worker(struct work_struct *work) -{ - struct rpcrdma_ep *ep = - container_of(work, struct rpcrdma_ep, rep_connect_worker.work); - struct rpcrdma_xprt *r_xprt = - container_of(ep, struct rpcrdma_xprt, rx_ep); - struct rpc_xprt *xprt = &r_xprt->rx_xprt; - - spin_lock_bh(&xprt->transport_lock); - if (ep->rep_connected > 0) { - if (!xprt_test_and_set_connected(xprt)) - xprt_wake_pending_tasks(xprt, 0); - } else { - if (xprt_test_and_clear_connected(xprt)) - xprt_wake_pending_tasks(xprt, -ENOTCONN); - } - spin_unlock_bh(&xprt->transport_lock); -} - +/** + * xprt_rdma_connect_worker - establish connection in the background + * @work: worker thread context + * + * Requester holds the xprt's send lock to prevent activity on this + * transport while a fresh connection is being established. RPC tasks + * sleep on the xprt's pending queue waiting for connect to complete. + */ static void xprt_rdma_connect_worker(struct work_struct *work) { struct rpcrdma_xprt *r_xprt = container_of(work, struct rpcrdma_xprt, rx_connect_worker.work); struct rpc_xprt *xprt = &r_xprt->rx_xprt; - int rc = 0; - - xprt_clear_connected(xprt); + int rc; rc = rpcrdma_ep_connect(&r_xprt->rx_ep, &r_xprt->rx_ia); - if (rc) - xprt_wake_pending_tasks(xprt, rc); - xprt_clear_connecting(xprt); + if (r_xprt->rx_ep.rep_connected > 0) { + if (!xprt_test_and_set_connected(xprt)) { + xprt->stat.connect_count++; + xprt->stat.connect_time += (long)jiffies - + xprt->stat.connect_start; + xprt_wake_pending_tasks(xprt, -EAGAIN); + } + } else { + if (xprt_test_and_clear_connected(xprt)) + xprt_wake_pending_tasks(xprt, rc); + } } +/** + * xprt_rdma_inject_disconnect - inject a connection fault + * @xprt: transport context + * + * If @xprt is connected, disconnect it to simulate spurious connection + * loss. + */ static void xprt_rdma_inject_disconnect(struct rpc_xprt *xprt) { - struct rpcrdma_xprt *r_xprt = container_of(xprt, struct rpcrdma_xprt, - rx_xprt); + struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); trace_xprtrdma_inject_dsc(r_xprt); rdma_disconnect(r_xprt->rx_ia.ri_id); } -/* - * xprt_rdma_destroy +/** + * xprt_rdma_destroy - Full tear down of transport + * @xprt: doomed transport context * - * Destroy the xprt. - * Free all memory associated with the object, including its own. - * NOTE: none of the *destroy methods free memory for their top-level - * objects, even though they may have allocated it (they do free - * private memory). It's up to the caller to handle it. In this - * case (RDMA transport), all structure memory is inlined with the - * struct rpcrdma_xprt. + * Caller guarantees there will be no more calls to us with + * this @xprt. */ static void xprt_rdma_destroy(struct rpc_xprt *xprt) @@ -298,8 +288,6 @@ xprt_rdma_destroy(struct rpc_xprt *xprt) cancel_delayed_work_sync(&r_xprt->rx_connect_worker); - xprt_clear_connected(xprt); - rpcrdma_ep_destroy(&r_xprt->rx_ep, &r_xprt->rx_ia); rpcrdma_buffer_destroy(&r_xprt->rx_buf); rpcrdma_ia_close(&r_xprt->rx_ia); @@ -442,11 +430,12 @@ out1: } /** - * xprt_rdma_close - Close down RDMA connection - * @xprt: generic transport to be closed + * xprt_rdma_close - close a transport connection + * @xprt: transport context * - * Called during transport shutdown reconnect, or device - * removal. Caller holds the transport's write lock. + * Called during transport shutdown, reconnect, or device removal. + * Caller holds @xprt's send lock to prevent activity on this + * transport while the connection is torn down. */ static void xprt_rdma_close(struct rpc_xprt *xprt) @@ -468,6 +457,12 @@ xprt_rdma_close(struct rpc_xprt *xprt) xprt->reestablish_timeout = 0; xprt_disconnect_done(xprt); rpcrdma_ep_disconnect(ep, ia); + + /* Prepare @xprt for the next connection by reinitializing + * its credit grant to one (see RFC 8166, Section 3.3.3). + */ + r_xprt->rx_buf.rb_credits = 1; + xprt->cwnd = RPC_CWNDSHIFT; } /** @@ -519,6 +514,12 @@ xprt_rdma_timer(struct rpc_xprt *xprt, struct rpc_task *task) xprt_force_disconnect(xprt); } +/** + * xprt_rdma_connect - try to establish a transport connection + * @xprt: transport state + * @task: RPC scheduler context + * + */ static void xprt_rdma_connect(struct rpc_xprt *xprt, struct rpc_task *task) { @@ -638,13 +639,6 @@ rpcrdma_get_recvbuf(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, * 0: Success; rq_buffer points to RPC buffer to use * ENOMEM: Out of memory, call again later * EIO: A permanent error occurred, do not retry - * - * The RDMA allocate/free functions need the task structure as a place - * to hide the struct rpcrdma_req, which is necessary for the actual - * send/recv sequence. - * - * xprt_rdma_allocate provides buffers that are already mapped for - * DMA, and a local DMA lkey is provided for each. */ static int xprt_rdma_allocate(struct rpc_task *task) @@ -693,7 +687,7 @@ xprt_rdma_free(struct rpc_task *task) /** * xprt_rdma_send_request - marshal and send an RPC request - * @task: RPC task with an RPC message in rq_snd_buf + * @rqst: RPC message in rq_snd_buf * * Caller holds the transport's write lock. * @@ -706,9 +700,8 @@ xprt_rdma_free(struct rpc_task *task) * sent. Do not try to send this message again. */ static int -xprt_rdma_send_request(struct rpc_task *task) +xprt_rdma_send_request(struct rpc_rqst *rqst) { - struct rpc_rqst *rqst = task->tk_rqstp; struct rpc_xprt *xprt = rqst->rq_xprt; struct rpcrdma_req *req = rpcr_to_rdmar(rqst); struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); @@ -722,6 +715,9 @@ xprt_rdma_send_request(struct rpc_task *task) if (!xprt_connected(xprt)) goto drop_connection; + if (!xprt_request_get_cong(xprt, rqst)) + return -EBADSLT; + rc = rpcrdma_marshal_req(r_xprt, rqst); if (rc < 0) goto failed_marshal; @@ -741,7 +737,7 @@ xprt_rdma_send_request(struct rpc_task *task) /* An RPC with no reply will throw off credit accounting, * so drop the connection to reset the credit grant. */ - if (!rpc_reply_expected(task)) + if (!rpc_reply_expected(rqst->rq_task)) goto drop_connection; return 0; @@ -766,7 +762,7 @@ void xprt_rdma_print_stats(struct rpc_xprt *xprt, struct seq_file *seq) 0, /* need a local port? */ xprt->stat.bind_count, xprt->stat.connect_count, - xprt->stat.connect_time, + xprt->stat.connect_time / HZ, idle_time, xprt->stat.sends, xprt->stat.recvs, @@ -786,7 +782,7 @@ void xprt_rdma_print_stats(struct rpc_xprt *xprt, struct seq_file *seq) r_xprt->rx_stats.bad_reply_count, r_xprt->rx_stats.nomsg_call_count); seq_printf(seq, "%lu %lu %lu %lu %lu %lu\n", - r_xprt->rx_stats.mrs_recovered, + r_xprt->rx_stats.mrs_recycled, r_xprt->rx_stats.mrs_orphaned, r_xprt->rx_stats.mrs_allocated, r_xprt->rx_stats.local_inv_needed, diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index 956a5ea47b58..3ddba94c939f 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -108,20 +108,48 @@ rpcrdma_destroy_wq(void) } } +/** + * rpcrdma_disconnect_worker - Force a disconnect + * @work: endpoint to be disconnected + * + * Provider callbacks can possibly run in an IRQ context. This function + * is invoked in a worker thread to guarantee that disconnect wake-up + * calls are always done in process context. + */ +static void +rpcrdma_disconnect_worker(struct work_struct *work) +{ + struct rpcrdma_ep *ep = container_of(work, struct rpcrdma_ep, + rep_disconnect_worker.work); + struct rpcrdma_xprt *r_xprt = + container_of(ep, struct rpcrdma_xprt, rx_ep); + + xprt_force_disconnect(&r_xprt->rx_xprt); +} + +/** + * rpcrdma_qp_event_handler - Handle one QP event (error notification) + * @event: details of the event + * @context: ep that owns QP where event occurred + * + * Called from the RDMA provider (device driver) possibly in an interrupt + * context. + */ static void -rpcrdma_qp_async_error_upcall(struct ib_event *event, void *context) +rpcrdma_qp_event_handler(struct ib_event *event, void *context) { struct rpcrdma_ep *ep = context; struct rpcrdma_xprt *r_xprt = container_of(ep, struct rpcrdma_xprt, rx_ep); - trace_xprtrdma_qp_error(r_xprt, event); - pr_err("rpcrdma: %s on device %s ep %p\n", - ib_event_msg(event->event), event->device->name, context); + trace_xprtrdma_qp_event(r_xprt, event); + pr_err("rpcrdma: %s on device %s connected to %s:%s\n", + ib_event_msg(event->event), event->device->name, + rpcrdma_addrstr(r_xprt), rpcrdma_portstr(r_xprt)); if (ep->rep_connected == 1) { ep->rep_connected = -EIO; - rpcrdma_conn_func(ep); + schedule_delayed_work(&ep->rep_disconnect_worker, 0); wake_up_all(&ep->rep_connect_wait); } } @@ -219,38 +247,48 @@ rpcrdma_update_connect_private(struct rpcrdma_xprt *r_xprt, rpcrdma_set_max_header_sizes(r_xprt); } +/** + * rpcrdma_cm_event_handler - Handle RDMA CM events + * @id: rdma_cm_id on which an event has occurred + * @event: details of the event + * + * Called with @id's mutex held. Returns 1 if caller should + * destroy @id, otherwise 0. + */ static int -rpcrdma_conn_upcall(struct rdma_cm_id *id, struct rdma_cm_event *event) +rpcrdma_cm_event_handler(struct rdma_cm_id *id, struct rdma_cm_event *event) { - struct rpcrdma_xprt *xprt = id->context; - struct rpcrdma_ia *ia = &xprt->rx_ia; - struct rpcrdma_ep *ep = &xprt->rx_ep; - int connstate = 0; + struct rpcrdma_xprt *r_xprt = id->context; + struct rpcrdma_ia *ia = &r_xprt->rx_ia; + struct rpcrdma_ep *ep = &r_xprt->rx_ep; + struct rpc_xprt *xprt = &r_xprt->rx_xprt; + + might_sleep(); - trace_xprtrdma_conn_upcall(xprt, event); + trace_xprtrdma_cm_event(r_xprt, event); switch (event->event) { case RDMA_CM_EVENT_ADDR_RESOLVED: case RDMA_CM_EVENT_ROUTE_RESOLVED: ia->ri_async_rc = 0; complete(&ia->ri_done); - break; + return 0; case RDMA_CM_EVENT_ADDR_ERROR: ia->ri_async_rc = -EPROTO; complete(&ia->ri_done); - break; + return 0; case RDMA_CM_EVENT_ROUTE_ERROR: ia->ri_async_rc = -ENETUNREACH; complete(&ia->ri_done); - break; + return 0; case RDMA_CM_EVENT_DEVICE_REMOVAL: #if IS_ENABLED(CONFIG_SUNRPC_DEBUG) pr_info("rpcrdma: removing device %s for %s:%s\n", ia->ri_device->name, - rpcrdma_addrstr(xprt), rpcrdma_portstr(xprt)); + rpcrdma_addrstr(r_xprt), rpcrdma_portstr(r_xprt)); #endif set_bit(RPCRDMA_IAF_REMOVING, &ia->ri_flags); ep->rep_connected = -ENODEV; - xprt_force_disconnect(&xprt->rx_xprt); + xprt_force_disconnect(xprt); wait_for_completion(&ia->ri_remove_done); ia->ri_id = NULL; @@ -258,41 +296,40 @@ rpcrdma_conn_upcall(struct rdma_cm_id *id, struct rdma_cm_event *event) /* Return 1 to ensure the core destroys the id. */ return 1; case RDMA_CM_EVENT_ESTABLISHED: - ++xprt->rx_xprt.connect_cookie; - connstate = 1; - rpcrdma_update_connect_private(xprt, &event->param.conn); - goto connected; + ++xprt->connect_cookie; + ep->rep_connected = 1; + rpcrdma_update_connect_private(r_xprt, &event->param.conn); + wake_up_all(&ep->rep_connect_wait); + break; case RDMA_CM_EVENT_CONNECT_ERROR: - connstate = -ENOTCONN; - goto connected; + ep->rep_connected = -ENOTCONN; + goto disconnected; case RDMA_CM_EVENT_UNREACHABLE: - connstate = -ENETUNREACH; - goto connected; + ep->rep_connected = -ENETUNREACH; + goto disconnected; case RDMA_CM_EVENT_REJECTED: dprintk("rpcrdma: connection to %s:%s rejected: %s\n", - rpcrdma_addrstr(xprt), rpcrdma_portstr(xprt), + rpcrdma_addrstr(r_xprt), rpcrdma_portstr(r_xprt), rdma_reject_msg(id, event->status)); - connstate = -ECONNREFUSED; + ep->rep_connected = -ECONNREFUSED; if (event->status == IB_CM_REJ_STALE_CONN) - connstate = -EAGAIN; - goto connected; + ep->rep_connected = -EAGAIN; + goto disconnected; case RDMA_CM_EVENT_DISCONNECTED: - ++xprt->rx_xprt.connect_cookie; - connstate = -ECONNABORTED; -connected: - ep->rep_connected = connstate; - rpcrdma_conn_func(ep); + ++xprt->connect_cookie; + ep->rep_connected = -ECONNABORTED; +disconnected: + xprt_force_disconnect(xprt); wake_up_all(&ep->rep_connect_wait); - /*FALLTHROUGH*/ + break; default: - dprintk("RPC: %s: %s:%s on %s/%s (ep 0x%p): %s\n", - __func__, - rpcrdma_addrstr(xprt), rpcrdma_portstr(xprt), - ia->ri_device->name, ia->ri_ops->ro_displayname, - ep, rdma_event_msg(event->event)); break; } + dprintk("RPC: %s: %s:%s on %s/%s: %s\n", __func__, + rpcrdma_addrstr(r_xprt), rpcrdma_portstr(r_xprt), + ia->ri_device->name, ia->ri_ops->ro_displayname, + rdma_event_msg(event->event)); return 0; } @@ -308,7 +345,7 @@ rpcrdma_create_id(struct rpcrdma_xprt *xprt, struct rpcrdma_ia *ia) init_completion(&ia->ri_done); init_completion(&ia->ri_remove_done); - id = rdma_create_id(xprt->rx_xprt.xprt_net, rpcrdma_conn_upcall, + id = rdma_create_id(xprt->rx_xprt.xprt_net, rpcrdma_cm_event_handler, xprt, RDMA_PS_TCP, IB_QPT_RC); if (IS_ERR(id)) { rc = PTR_ERR(id); @@ -519,7 +556,7 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia, if (rc) return rc; - ep->rep_attr.event_handler = rpcrdma_qp_async_error_upcall; + ep->rep_attr.event_handler = rpcrdma_qp_event_handler; ep->rep_attr.qp_context = ep; ep->rep_attr.srq = NULL; ep->rep_attr.cap.max_send_sge = max_sge; @@ -542,7 +579,8 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia, cdata->max_requests >> 2); ep->rep_send_count = ep->rep_send_batch; init_waitqueue_head(&ep->rep_connect_wait); - INIT_DELAYED_WORK(&ep->rep_connect_worker, rpcrdma_connect_worker); + INIT_DELAYED_WORK(&ep->rep_disconnect_worker, + rpcrdma_disconnect_worker); sendcq = ib_alloc_cq(ia->ri_device, NULL, ep->rep_attr.cap.max_send_wr + 1, @@ -615,7 +653,7 @@ out1: void rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia) { - cancel_delayed_work_sync(&ep->rep_connect_worker); + cancel_delayed_work_sync(&ep->rep_disconnect_worker); if (ia->ri_id && ia->ri_id->qp) { rpcrdma_ep_disconnect(ep, ia); @@ -728,6 +766,7 @@ rpcrdma_ep_connect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia) { struct rpcrdma_xprt *r_xprt = container_of(ia, struct rpcrdma_xprt, rx_ia); + struct rpc_xprt *xprt = &r_xprt->rx_xprt; int rc; retry: @@ -754,6 +793,8 @@ retry: } ep->rep_connected = 0; + xprt_clear_connected(xprt); + rpcrdma_post_recvs(r_xprt, true); rc = rdma_connect(ia->ri_id, &ep->rep_remote_cma); @@ -877,7 +918,6 @@ static int rpcrdma_sendctxs_create(struct rpcrdma_xprt *r_xprt) sc->sc_xprt = r_xprt; buf->rb_sc_ctxs[i] = sc; } - buf->rb_flags = 0; return 0; @@ -978,39 +1018,6 @@ rpcrdma_sendctx_put_locked(struct rpcrdma_sendctx *sc) } static void -rpcrdma_mr_recovery_worker(struct work_struct *work) -{ - struct rpcrdma_buffer *buf = container_of(work, struct rpcrdma_buffer, - rb_recovery_worker.work); - struct rpcrdma_mr *mr; - - spin_lock(&buf->rb_recovery_lock); - while (!list_empty(&buf->rb_stale_mrs)) { - mr = rpcrdma_mr_pop(&buf->rb_stale_mrs); - spin_unlock(&buf->rb_recovery_lock); - - trace_xprtrdma_recover_mr(mr); - mr->mr_xprt->rx_ia.ri_ops->ro_recover_mr(mr); - - spin_lock(&buf->rb_recovery_lock); - } - spin_unlock(&buf->rb_recovery_lock); -} - -void -rpcrdma_mr_defer_recovery(struct rpcrdma_mr *mr) -{ - struct rpcrdma_xprt *r_xprt = mr->mr_xprt; - struct rpcrdma_buffer *buf = &r_xprt->rx_buf; - - spin_lock(&buf->rb_recovery_lock); - rpcrdma_mr_push(mr, &buf->rb_stale_mrs); - spin_unlock(&buf->rb_recovery_lock); - - schedule_delayed_work(&buf->rb_recovery_worker, 0); -} - -static void rpcrdma_mrs_create(struct rpcrdma_xprt *r_xprt) { struct rpcrdma_buffer *buf = &r_xprt->rx_buf; @@ -1019,7 +1026,7 @@ rpcrdma_mrs_create(struct rpcrdma_xprt *r_xprt) LIST_HEAD(free); LIST_HEAD(all); - for (count = 0; count < 3; count++) { + for (count = 0; count < ia->ri_max_segs; count++) { struct rpcrdma_mr *mr; int rc; @@ -1138,18 +1145,15 @@ rpcrdma_buffer_create(struct rpcrdma_xprt *r_xprt) struct rpcrdma_buffer *buf = &r_xprt->rx_buf; int i, rc; + buf->rb_flags = 0; buf->rb_max_requests = r_xprt->rx_data.max_requests; buf->rb_bc_srv_max_requests = 0; spin_lock_init(&buf->rb_mrlock); spin_lock_init(&buf->rb_lock); - spin_lock_init(&buf->rb_recovery_lock); INIT_LIST_HEAD(&buf->rb_mrs); INIT_LIST_HEAD(&buf->rb_all); - INIT_LIST_HEAD(&buf->rb_stale_mrs); INIT_DELAYED_WORK(&buf->rb_refresh_worker, rpcrdma_mr_refresh_worker); - INIT_DELAYED_WORK(&buf->rb_recovery_worker, - rpcrdma_mr_recovery_worker); rpcrdma_mrs_create(r_xprt); @@ -1233,7 +1237,6 @@ rpcrdma_mrs_destroy(struct rpcrdma_buffer *buf) void rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf) { - cancel_delayed_work_sync(&buf->rb_recovery_worker); cancel_delayed_work_sync(&buf->rb_refresh_worker); rpcrdma_sendctxs_destroy(buf); @@ -1326,7 +1329,7 @@ rpcrdma_mr_unmap_and_put(struct rpcrdma_mr *mr) { struct rpcrdma_xprt *r_xprt = mr->mr_xprt; - trace_xprtrdma_dma_unmap(mr); + trace_xprtrdma_mr_unmap(mr); ib_dma_unmap_sg(r_xprt->rx_ia.ri_device, mr->mr_sg, mr->mr_nents, mr->mr_dir); __rpcrdma_mr_put(&r_xprt->rx_buf, mr); @@ -1518,9 +1521,11 @@ rpcrdma_post_recvs(struct rpcrdma_xprt *r_xprt, bool temp) struct ib_recv_wr *wr, *bad_wr; int needed, count, rc; + rc = 0; + count = 0; needed = buf->rb_credits + (buf->rb_bc_srv_max_requests << 1); if (buf->rb_posted_receives > needed) - return; + goto out; needed -= buf->rb_posted_receives; count = 0; @@ -1556,7 +1561,7 @@ rpcrdma_post_recvs(struct rpcrdma_xprt *r_xprt, bool temp) --needed; } if (!count) - return; + goto out; rc = ib_post_recv(r_xprt->rx_ia.ri_id->qp, wr, (const struct ib_recv_wr **)&bad_wr); @@ -1570,5 +1575,6 @@ rpcrdma_post_recvs(struct rpcrdma_xprt *r_xprt, bool temp) } } buf->rb_posted_receives += count; +out: trace_xprtrdma_post_recvs(r_xprt, count, rc); } diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index 2ca14f7c2d51..a13ccb643ce0 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -101,7 +101,7 @@ struct rpcrdma_ep { wait_queue_head_t rep_connect_wait; struct rpcrdma_connect_private rep_cm_private; struct rdma_conn_param rep_remote_cma; - struct delayed_work rep_connect_worker; + struct delayed_work rep_disconnect_worker; }; /* Pre-allocate extra Work Requests for handling backward receives @@ -280,6 +280,7 @@ struct rpcrdma_mr { u32 mr_handle; u32 mr_length; u64 mr_offset; + struct work_struct mr_recycle; struct list_head mr_all; }; @@ -411,9 +412,6 @@ struct rpcrdma_buffer { u32 rb_bc_max_requests; - spinlock_t rb_recovery_lock; /* protect rb_stale_mrs */ - struct list_head rb_stale_mrs; - struct delayed_work rb_recovery_worker; struct delayed_work rb_refresh_worker; }; #define rdmab_to_ia(b) (&container_of((b), struct rpcrdma_xprt, rx_buf)->rx_ia) @@ -452,7 +450,7 @@ struct rpcrdma_stats { unsigned long hardway_register_count; unsigned long failed_marshal_count; unsigned long bad_reply_count; - unsigned long mrs_recovered; + unsigned long mrs_recycled; unsigned long mrs_orphaned; unsigned long mrs_allocated; unsigned long empty_sendctx_q; @@ -481,7 +479,6 @@ struct rpcrdma_memreg_ops { struct list_head *mrs); void (*ro_unmap_sync)(struct rpcrdma_xprt *, struct list_head *); - void (*ro_recover_mr)(struct rpcrdma_mr *mr); int (*ro_open)(struct rpcrdma_ia *, struct rpcrdma_ep *, struct rpcrdma_create_data_internal *); @@ -559,7 +556,6 @@ int rpcrdma_ep_create(struct rpcrdma_ep *, struct rpcrdma_ia *, struct rpcrdma_create_data_internal *); void rpcrdma_ep_destroy(struct rpcrdma_ep *, struct rpcrdma_ia *); int rpcrdma_ep_connect(struct rpcrdma_ep *, struct rpcrdma_ia *); -void rpcrdma_conn_func(struct rpcrdma_ep *ep); void rpcrdma_ep_disconnect(struct rpcrdma_ep *, struct rpcrdma_ia *); int rpcrdma_ep_post(struct rpcrdma_ia *, struct rpcrdma_ep *, @@ -578,7 +574,12 @@ struct rpcrdma_sendctx *rpcrdma_sendctx_get_locked(struct rpcrdma_buffer *buf); struct rpcrdma_mr *rpcrdma_mr_get(struct rpcrdma_xprt *r_xprt); void rpcrdma_mr_put(struct rpcrdma_mr *mr); void rpcrdma_mr_unmap_and_put(struct rpcrdma_mr *mr); -void rpcrdma_mr_defer_recovery(struct rpcrdma_mr *mr); + +static inline void +rpcrdma_mr_recycle(struct rpcrdma_mr *mr) +{ + schedule_work(&mr->mr_recycle); +} struct rpcrdma_req *rpcrdma_buffer_get(struct rpcrdma_buffer *); void rpcrdma_buffer_put(struct rpcrdma_req *); @@ -652,7 +653,6 @@ static inline void rpcrdma_set_xdrlen(struct xdr_buf *xdr, size_t len) extern unsigned int xprt_rdma_max_inline_read; void xprt_rdma_format_addresses(struct rpc_xprt *xprt, struct sockaddr *sap); void xprt_rdma_free_addresses(struct rpc_xprt *xprt); -void rpcrdma_connect_worker(struct work_struct *work); void xprt_rdma_print_stats(struct rpc_xprt *xprt, struct seq_file *seq); int xprt_rdma_init(void); void xprt_rdma_cleanup(void); diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 6b7539c0466e..1b51e04d3566 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -47,13 +47,13 @@ #include <net/checksum.h> #include <net/udp.h> #include <net/tcp.h> +#include <linux/bvec.h> +#include <linux/uio.h> #include <trace/events/sunrpc.h> #include "sunrpc.h" -#define RPC_TCP_READ_CHUNK_SZ (3*512*1024) - static void xs_close(struct rpc_xprt *xprt); static void xs_tcp_set_socket_timeouts(struct rpc_xprt *xprt, struct socket *sock); @@ -129,7 +129,7 @@ static struct ctl_table xs_tunables_table[] = { .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = &xprt_min_resvport_limit, - .extra2 = &xprt_max_resvport + .extra2 = &xprt_max_resvport_limit }, { .procname = "max_resvport", @@ -137,7 +137,7 @@ static struct ctl_table xs_tunables_table[] = { .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &xprt_min_resvport, + .extra1 = &xprt_min_resvport_limit, .extra2 = &xprt_max_resvport_limit }, { @@ -325,6 +325,362 @@ static void xs_free_peer_addresses(struct rpc_xprt *xprt) } } +static size_t +xs_alloc_sparse_pages(struct xdr_buf *buf, size_t want, gfp_t gfp) +{ + size_t i,n; + + if (!(buf->flags & XDRBUF_SPARSE_PAGES)) + return want; + if (want > buf->page_len) + want = buf->page_len; + n = (buf->page_base + want + PAGE_SIZE - 1) >> PAGE_SHIFT; + for (i = 0; i < n; i++) { + if (buf->pages[i]) + continue; + buf->bvec[i].bv_page = buf->pages[i] = alloc_page(gfp); + if (!buf->pages[i]) { + buf->page_len = (i * PAGE_SIZE) - buf->page_base; + return buf->page_len; + } + } + return want; +} + +static ssize_t +xs_sock_recvmsg(struct socket *sock, struct msghdr *msg, int flags, size_t seek) +{ + ssize_t ret; + if (seek != 0) + iov_iter_advance(&msg->msg_iter, seek); + ret = sock_recvmsg(sock, msg, flags); + return ret > 0 ? ret + seek : ret; +} + +static ssize_t +xs_read_kvec(struct socket *sock, struct msghdr *msg, int flags, + struct kvec *kvec, size_t count, size_t seek) +{ + iov_iter_kvec(&msg->msg_iter, READ | ITER_KVEC, kvec, 1, count); + return xs_sock_recvmsg(sock, msg, flags, seek); +} + +static ssize_t +xs_read_bvec(struct socket *sock, struct msghdr *msg, int flags, + struct bio_vec *bvec, unsigned long nr, size_t count, + size_t seek) +{ + iov_iter_bvec(&msg->msg_iter, READ | ITER_BVEC, bvec, nr, count); + return xs_sock_recvmsg(sock, msg, flags, seek); +} + +static ssize_t +xs_read_discard(struct socket *sock, struct msghdr *msg, int flags, + size_t count) +{ + struct kvec kvec = { 0 }; + return xs_read_kvec(sock, msg, flags | MSG_TRUNC, &kvec, count, 0); +} + +static ssize_t +xs_read_xdr_buf(struct socket *sock, struct msghdr *msg, int flags, + struct xdr_buf *buf, size_t count, size_t seek, size_t *read) +{ + size_t want, seek_init = seek, offset = 0; + ssize_t ret; + + if (seek < buf->head[0].iov_len) { + want = min_t(size_t, count, buf->head[0].iov_len); + ret = xs_read_kvec(sock, msg, flags, &buf->head[0], want, seek); + if (ret <= 0) + goto sock_err; + offset += ret; + if (offset == count || msg->msg_flags & (MSG_EOR|MSG_TRUNC)) + goto out; + if (ret != want) + goto eagain; + seek = 0; + } else { + seek -= buf->head[0].iov_len; + offset += buf->head[0].iov_len; + } + if (seek < buf->page_len) { + want = xs_alloc_sparse_pages(buf, + min_t(size_t, count - offset, buf->page_len), + GFP_NOWAIT); + ret = xs_read_bvec(sock, msg, flags, buf->bvec, + xdr_buf_pagecount(buf), + want + buf->page_base, + seek + buf->page_base); + if (ret <= 0) + goto sock_err; + offset += ret - buf->page_base; + if (offset == count || msg->msg_flags & (MSG_EOR|MSG_TRUNC)) + goto out; + if (ret != want) + goto eagain; + seek = 0; + } else { + seek -= buf->page_len; + offset += buf->page_len; + } + if (seek < buf->tail[0].iov_len) { + want = min_t(size_t, count - offset, buf->tail[0].iov_len); + ret = xs_read_kvec(sock, msg, flags, &buf->tail[0], want, seek); + if (ret <= 0) + goto sock_err; + offset += ret; + if (offset == count || msg->msg_flags & (MSG_EOR|MSG_TRUNC)) + goto out; + if (ret != want) + goto eagain; + } else + offset += buf->tail[0].iov_len; + ret = -EMSGSIZE; + msg->msg_flags |= MSG_TRUNC; +out: + *read = offset - seek_init; + return ret; +eagain: + ret = -EAGAIN; + goto out; +sock_err: + offset += seek; + goto out; +} + +static void +xs_read_header(struct sock_xprt *transport, struct xdr_buf *buf) +{ + if (!transport->recv.copied) { + if (buf->head[0].iov_len >= transport->recv.offset) + memcpy(buf->head[0].iov_base, + &transport->recv.xid, + transport->recv.offset); + transport->recv.copied = transport->recv.offset; + } +} + +static bool +xs_read_stream_request_done(struct sock_xprt *transport) +{ + return transport->recv.fraghdr & cpu_to_be32(RPC_LAST_STREAM_FRAGMENT); +} + +static ssize_t +xs_read_stream_request(struct sock_xprt *transport, struct msghdr *msg, + int flags, struct rpc_rqst *req) +{ + struct xdr_buf *buf = &req->rq_private_buf; + size_t want, read; + ssize_t ret; + + xs_read_header(transport, buf); + + want = transport->recv.len - transport->recv.offset; + ret = xs_read_xdr_buf(transport->sock, msg, flags, buf, + transport->recv.copied + want, transport->recv.copied, + &read); + transport->recv.offset += read; + transport->recv.copied += read; + if (transport->recv.offset == transport->recv.len) { + if (xs_read_stream_request_done(transport)) + msg->msg_flags |= MSG_EOR; + return transport->recv.copied; + } + + switch (ret) { + case -EMSGSIZE: + return transport->recv.copied; + case 0: + return -ESHUTDOWN; + default: + if (ret < 0) + return ret; + } + return -EAGAIN; +} + +static size_t +xs_read_stream_headersize(bool isfrag) +{ + if (isfrag) + return sizeof(__be32); + return 3 * sizeof(__be32); +} + +static ssize_t +xs_read_stream_header(struct sock_xprt *transport, struct msghdr *msg, + int flags, size_t want, size_t seek) +{ + struct kvec kvec = { + .iov_base = &transport->recv.fraghdr, + .iov_len = want, + }; + return xs_read_kvec(transport->sock, msg, flags, &kvec, want, seek); +} + +#if defined(CONFIG_SUNRPC_BACKCHANNEL) +static ssize_t +xs_read_stream_call(struct sock_xprt *transport, struct msghdr *msg, int flags) +{ + struct rpc_xprt *xprt = &transport->xprt; + struct rpc_rqst *req; + ssize_t ret; + + /* Look up and lock the request corresponding to the given XID */ + req = xprt_lookup_bc_request(xprt, transport->recv.xid); + if (!req) { + printk(KERN_WARNING "Callback slot table overflowed\n"); + return -ESHUTDOWN; + } + + ret = xs_read_stream_request(transport, msg, flags, req); + if (msg->msg_flags & (MSG_EOR|MSG_TRUNC)) + xprt_complete_bc_request(req, ret); + + return ret; +} +#else /* CONFIG_SUNRPC_BACKCHANNEL */ +static ssize_t +xs_read_stream_call(struct sock_xprt *transport, struct msghdr *msg, int flags) +{ + return -ESHUTDOWN; +} +#endif /* CONFIG_SUNRPC_BACKCHANNEL */ + +static ssize_t +xs_read_stream_reply(struct sock_xprt *transport, struct msghdr *msg, int flags) +{ + struct rpc_xprt *xprt = &transport->xprt; + struct rpc_rqst *req; + ssize_t ret = 0; + + /* Look up and lock the request corresponding to the given XID */ + spin_lock(&xprt->queue_lock); + req = xprt_lookup_rqst(xprt, transport->recv.xid); + if (!req) { + msg->msg_flags |= MSG_TRUNC; + goto out; + } + xprt_pin_rqst(req); + spin_unlock(&xprt->queue_lock); + + ret = xs_read_stream_request(transport, msg, flags, req); + + spin_lock(&xprt->queue_lock); + if (msg->msg_flags & (MSG_EOR|MSG_TRUNC)) + xprt_complete_rqst(req->rq_task, ret); + xprt_unpin_rqst(req); +out: + spin_unlock(&xprt->queue_lock); + return ret; +} + +static ssize_t +xs_read_stream(struct sock_xprt *transport, int flags) +{ + struct msghdr msg = { 0 }; + size_t want, read = 0; + ssize_t ret = 0; + + if (transport->recv.len == 0) { + want = xs_read_stream_headersize(transport->recv.copied != 0); + ret = xs_read_stream_header(transport, &msg, flags, want, + transport->recv.offset); + if (ret <= 0) + goto out_err; + transport->recv.offset = ret; + if (ret != want) { + ret = -EAGAIN; + goto out_err; + } + transport->recv.len = be32_to_cpu(transport->recv.fraghdr) & + RPC_FRAGMENT_SIZE_MASK; + transport->recv.offset -= sizeof(transport->recv.fraghdr); + read = ret; + } + + switch (be32_to_cpu(transport->recv.calldir)) { + case RPC_CALL: + ret = xs_read_stream_call(transport, &msg, flags); + break; + case RPC_REPLY: + ret = xs_read_stream_reply(transport, &msg, flags); + } + if (msg.msg_flags & MSG_TRUNC) { + transport->recv.calldir = cpu_to_be32(-1); + transport->recv.copied = -1; + } + if (ret < 0) + goto out_err; + read += ret; + if (transport->recv.offset < transport->recv.len) { + ret = xs_read_discard(transport->sock, &msg, flags, + transport->recv.len - transport->recv.offset); + if (ret <= 0) + goto out_err; + transport->recv.offset += ret; + read += ret; + if (transport->recv.offset != transport->recv.len) + return -EAGAIN; + } + if (xs_read_stream_request_done(transport)) { + trace_xs_stream_read_request(transport); + transport->recv.copied = 0; + } + transport->recv.offset = 0; + transport->recv.len = 0; + return read; +out_err: + switch (ret) { + case 0: + case -ESHUTDOWN: + xprt_force_disconnect(&transport->xprt); + return -ESHUTDOWN; + } + return ret; +} + +static void xs_stream_data_receive(struct sock_xprt *transport) +{ + size_t read = 0; + ssize_t ret = 0; + + mutex_lock(&transport->recv_mutex); + if (transport->sock == NULL) + goto out; + clear_bit(XPRT_SOCK_DATA_READY, &transport->sock_state); + for (;;) { + ret = xs_read_stream(transport, MSG_DONTWAIT); + if (ret <= 0) + break; + read += ret; + cond_resched(); + } +out: + mutex_unlock(&transport->recv_mutex); + trace_xs_stream_read_data(&transport->xprt, ret, read); +} + +static void xs_stream_data_receive_workfn(struct work_struct *work) +{ + struct sock_xprt *transport = + container_of(work, struct sock_xprt, recv_worker); + xs_stream_data_receive(transport); +} + +static void +xs_stream_reset_connect(struct sock_xprt *transport) +{ + transport->recv.offset = 0; + transport->recv.len = 0; + transport->recv.copied = 0; + transport->xmit.offset = 0; + transport->xprt.stat.connect_count++; + transport->xprt.stat.connect_start = jiffies; +} + #define XS_SENDMSG_FLAGS (MSG_DONTWAIT | MSG_NOSIGNAL) static int xs_send_kvec(struct socket *sock, struct sockaddr *addr, int addrlen, struct kvec *vec, unsigned int base, int more) @@ -440,28 +796,21 @@ out: return err; } -static void xs_nospace_callback(struct rpc_task *task) -{ - struct sock_xprt *transport = container_of(task->tk_rqstp->rq_xprt, struct sock_xprt, xprt); - - transport->inet->sk_write_pending--; -} - /** - * xs_nospace - place task on wait queue if transmit was incomplete - * @task: task to put to sleep + * xs_nospace - handle transmit was incomplete + * @req: pointer to RPC request * */ -static int xs_nospace(struct rpc_task *task) +static int xs_nospace(struct rpc_rqst *req) { - struct rpc_rqst *req = task->tk_rqstp; struct rpc_xprt *xprt = req->rq_xprt; struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt); struct sock *sk = transport->inet; int ret = -EAGAIN; dprintk("RPC: %5u xmit incomplete (%u left of %u)\n", - task->tk_pid, req->rq_slen - req->rq_bytes_sent, + req->rq_task->tk_pid, + req->rq_slen - transport->xmit.offset, req->rq_slen); /* Protect against races with write_space */ @@ -471,7 +820,7 @@ static int xs_nospace(struct rpc_task *task) if (xprt_connected(xprt)) { /* wait for more buffer space */ sk->sk_write_pending++; - xprt_wait_for_buffer_space(task, xs_nospace_callback); + xprt_wait_for_buffer_space(xprt); } else ret = -ENOTCONN; @@ -491,6 +840,22 @@ static int xs_nospace(struct rpc_task *task) return ret; } +static void +xs_stream_prepare_request(struct rpc_rqst *req) +{ + req->rq_task->tk_status = xdr_alloc_bvec(&req->rq_rcv_buf, GFP_NOIO); +} + +/* + * Determine if the previous message in the stream was aborted before it + * could complete transmission. + */ +static bool +xs_send_request_was_aborted(struct sock_xprt *transport, struct rpc_rqst *req) +{ + return transport->xmit.offset != 0 && req->rq_bytes_sent == 0; +} + /* * Construct a stream transport record marker in @buf. */ @@ -503,7 +868,7 @@ static inline void xs_encode_stream_record_marker(struct xdr_buf *buf) /** * xs_local_send_request - write an RPC request to an AF_LOCAL socket - * @task: RPC task that manages the state of an RPC request + * @req: pointer to RPC request * * Return values: * 0: The request has been sent @@ -512,9 +877,8 @@ static inline void xs_encode_stream_record_marker(struct xdr_buf *buf) * ENOTCONN: Caller needs to invoke connect logic then call again * other: Some other error occured, the request was not sent */ -static int xs_local_send_request(struct rpc_task *task) +static int xs_local_send_request(struct rpc_rqst *req) { - struct rpc_rqst *req = task->tk_rqstp; struct rpc_xprt *xprt = req->rq_xprt; struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt); @@ -522,25 +886,34 @@ static int xs_local_send_request(struct rpc_task *task) int status; int sent = 0; + /* Close the stream if the previous transmission was incomplete */ + if (xs_send_request_was_aborted(transport, req)) { + xs_close(xprt); + return -ENOTCONN; + } + xs_encode_stream_record_marker(&req->rq_snd_buf); xs_pktdump("packet data:", req->rq_svec->iov_base, req->rq_svec->iov_len); req->rq_xtime = ktime_get(); - status = xs_sendpages(transport->sock, NULL, 0, xdr, req->rq_bytes_sent, + status = xs_sendpages(transport->sock, NULL, 0, xdr, + transport->xmit.offset, true, &sent); dprintk("RPC: %s(%u) = %d\n", - __func__, xdr->len - req->rq_bytes_sent, status); + __func__, xdr->len - transport->xmit.offset, status); if (status == -EAGAIN && sock_writeable(transport->inet)) status = -ENOBUFS; if (likely(sent > 0) || status == 0) { - req->rq_bytes_sent += sent; - req->rq_xmit_bytes_sent += sent; + transport->xmit.offset += sent; + req->rq_bytes_sent = transport->xmit.offset; if (likely(req->rq_bytes_sent >= req->rq_slen)) { + req->rq_xmit_bytes_sent += transport->xmit.offset; req->rq_bytes_sent = 0; + transport->xmit.offset = 0; return 0; } status = -EAGAIN; @@ -550,7 +923,7 @@ static int xs_local_send_request(struct rpc_task *task) case -ENOBUFS: break; case -EAGAIN: - status = xs_nospace(task); + status = xs_nospace(req); break; default: dprintk("RPC: sendmsg returned unrecognized error %d\n", @@ -566,7 +939,7 @@ static int xs_local_send_request(struct rpc_task *task) /** * xs_udp_send_request - write an RPC request to a UDP socket - * @task: address of RPC task that manages the state of an RPC request + * @req: pointer to RPC request * * Return values: * 0: The request has been sent @@ -575,9 +948,8 @@ static int xs_local_send_request(struct rpc_task *task) * ENOTCONN: Caller needs to invoke connect logic then call again * other: Some other error occurred, the request was not sent */ -static int xs_udp_send_request(struct rpc_task *task) +static int xs_udp_send_request(struct rpc_rqst *req) { - struct rpc_rqst *req = task->tk_rqstp; struct rpc_xprt *xprt = req->rq_xprt; struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt); struct xdr_buf *xdr = &req->rq_snd_buf; @@ -590,12 +962,16 @@ static int xs_udp_send_request(struct rpc_task *task) if (!xprt_bound(xprt)) return -ENOTCONN; + + if (!xprt_request_get_cong(xprt, req)) + return -EBADSLT; + req->rq_xtime = ktime_get(); status = xs_sendpages(transport->sock, xs_addr(xprt), xprt->addrlen, - xdr, req->rq_bytes_sent, true, &sent); + xdr, 0, true, &sent); dprintk("RPC: xs_udp_send_request(%u) = %d\n", - xdr->len - req->rq_bytes_sent, status); + xdr->len, status); /* firewall is blocking us, don't return -EAGAIN or we end up looping */ if (status == -EPERM) @@ -619,7 +995,7 @@ process_status: /* Should we call xs_close() here? */ break; case -EAGAIN: - status = xs_nospace(task); + status = xs_nospace(req); break; case -ENETUNREACH: case -ENOBUFS: @@ -639,7 +1015,7 @@ process_status: /** * xs_tcp_send_request - write an RPC request to a TCP socket - * @task: address of RPC task that manages the state of an RPC request + * @req: pointer to RPC request * * Return values: * 0: The request has been sent @@ -651,9 +1027,8 @@ process_status: * XXX: In the case of soft timeouts, should we eventually give up * if sendmsg is not able to make progress? */ -static int xs_tcp_send_request(struct rpc_task *task) +static int xs_tcp_send_request(struct rpc_rqst *req) { - struct rpc_rqst *req = task->tk_rqstp; struct rpc_xprt *xprt = req->rq_xprt; struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt); struct xdr_buf *xdr = &req->rq_snd_buf; @@ -662,6 +1037,13 @@ static int xs_tcp_send_request(struct rpc_task *task) int status; int sent; + /* Close the stream if the previous transmission was incomplete */ + if (xs_send_request_was_aborted(transport, req)) { + if (transport->sock != NULL) + kernel_sock_shutdown(transport->sock, SHUT_RDWR); + return -ENOTCONN; + } + xs_encode_stream_record_marker(&req->rq_snd_buf); xs_pktdump("packet data:", @@ -671,7 +1053,7 @@ static int xs_tcp_send_request(struct rpc_task *task) * completes while the socket holds a reference to the pages, * then we may end up resending corrupted data. */ - if (task->tk_flags & RPC_TASK_SENT) + if (req->rq_task->tk_flags & RPC_TASK_SENT) zerocopy = false; if (test_bit(XPRT_SOCK_UPD_TIMEOUT, &transport->sock_state)) @@ -684,17 +1066,20 @@ static int xs_tcp_send_request(struct rpc_task *task) while (1) { sent = 0; status = xs_sendpages(transport->sock, NULL, 0, xdr, - req->rq_bytes_sent, zerocopy, &sent); + transport->xmit.offset, + zerocopy, &sent); dprintk("RPC: xs_tcp_send_request(%u) = %d\n", - xdr->len - req->rq_bytes_sent, status); + xdr->len - transport->xmit.offset, status); /* If we've sent the entire packet, immediately * reset the count of bytes sent. */ - req->rq_bytes_sent += sent; - req->rq_xmit_bytes_sent += sent; + transport->xmit.offset += sent; + req->rq_bytes_sent = transport->xmit.offset; if (likely(req->rq_bytes_sent >= req->rq_slen)) { + req->rq_xmit_bytes_sent += transport->xmit.offset; req->rq_bytes_sent = 0; + transport->xmit.offset = 0; return 0; } @@ -732,7 +1117,7 @@ static int xs_tcp_send_request(struct rpc_task *task) /* Should we call xs_close() here? */ break; case -EAGAIN: - status = xs_nospace(task); + status = xs_nospace(req); break; case -ECONNRESET: case -ECONNREFUSED: @@ -749,35 +1134,6 @@ static int xs_tcp_send_request(struct rpc_task *task) return status; } -/** - * xs_tcp_release_xprt - clean up after a tcp transmission - * @xprt: transport - * @task: rpc task - * - * This cleans up if an error causes us to abort the transmission of a request. - * In this case, the socket may need to be reset in order to avoid confusing - * the server. - */ -static void xs_tcp_release_xprt(struct rpc_xprt *xprt, struct rpc_task *task) -{ - struct rpc_rqst *req; - - if (task != xprt->snd_task) - return; - if (task == NULL) - goto out_release; - req = task->tk_rqstp; - if (req == NULL) - goto out_release; - if (req->rq_bytes_sent == 0) - goto out_release; - if (req->rq_bytes_sent == req->rq_snd_buf.len) - goto out_release; - set_bit(XPRT_CLOSE_WAIT, &xprt->state); -out_release: - xprt_release_xprt(xprt, task); -} - static void xs_save_old_callbacks(struct sock_xprt *transport, struct sock *sk) { transport->old_data_ready = sk->sk_data_ready; @@ -921,114 +1277,6 @@ static void xs_destroy(struct rpc_xprt *xprt) module_put(THIS_MODULE); } -static int xs_local_copy_to_xdr(struct xdr_buf *xdr, struct sk_buff *skb) -{ - struct xdr_skb_reader desc = { - .skb = skb, - .offset = sizeof(rpc_fraghdr), - .count = skb->len - sizeof(rpc_fraghdr), - }; - - if (xdr_partial_copy_from_skb(xdr, 0, &desc, xdr_skb_read_bits) < 0) - return -1; - if (desc.count) - return -1; - return 0; -} - -/** - * xs_local_data_read_skb - * @xprt: transport - * @sk: socket - * @skb: skbuff - * - * Currently this assumes we can read the whole reply in a single gulp. - */ -static void xs_local_data_read_skb(struct rpc_xprt *xprt, - struct sock *sk, - struct sk_buff *skb) -{ - struct rpc_task *task; - struct rpc_rqst *rovr; - int repsize, copied; - u32 _xid; - __be32 *xp; - - repsize = skb->len - sizeof(rpc_fraghdr); - if (repsize < 4) { - dprintk("RPC: impossible RPC reply size %d\n", repsize); - return; - } - - /* Copy the XID from the skb... */ - xp = skb_header_pointer(skb, sizeof(rpc_fraghdr), sizeof(_xid), &_xid); - if (xp == NULL) - return; - - /* Look up and lock the request corresponding to the given XID */ - spin_lock(&xprt->recv_lock); - rovr = xprt_lookup_rqst(xprt, *xp); - if (!rovr) - goto out_unlock; - xprt_pin_rqst(rovr); - spin_unlock(&xprt->recv_lock); - task = rovr->rq_task; - - copied = rovr->rq_private_buf.buflen; - if (copied > repsize) - copied = repsize; - - if (xs_local_copy_to_xdr(&rovr->rq_private_buf, skb)) { - dprintk("RPC: sk_buff copy failed\n"); - spin_lock(&xprt->recv_lock); - goto out_unpin; - } - - spin_lock(&xprt->recv_lock); - xprt_complete_rqst(task, copied); -out_unpin: - xprt_unpin_rqst(rovr); - out_unlock: - spin_unlock(&xprt->recv_lock); -} - -static void xs_local_data_receive(struct sock_xprt *transport) -{ - struct sk_buff *skb; - struct sock *sk; - int err; - -restart: - mutex_lock(&transport->recv_mutex); - sk = transport->inet; - if (sk == NULL) - goto out; - for (;;) { - skb = skb_recv_datagram(sk, 0, 1, &err); - if (skb != NULL) { - xs_local_data_read_skb(&transport->xprt, sk, skb); - skb_free_datagram(sk, skb); - continue; - } - if (!test_and_clear_bit(XPRT_SOCK_DATA_READY, &transport->sock_state)) - break; - if (need_resched()) { - mutex_unlock(&transport->recv_mutex); - cond_resched(); - goto restart; - } - } -out: - mutex_unlock(&transport->recv_mutex); -} - -static void xs_local_data_receive_workfn(struct work_struct *work) -{ - struct sock_xprt *transport = - container_of(work, struct sock_xprt, recv_worker); - xs_local_data_receive(transport); -} - /** * xs_udp_data_read_skb - receive callback for UDP sockets * @xprt: transport @@ -1058,13 +1306,13 @@ static void xs_udp_data_read_skb(struct rpc_xprt *xprt, return; /* Look up and lock the request corresponding to the given XID */ - spin_lock(&xprt->recv_lock); + spin_lock(&xprt->queue_lock); rovr = xprt_lookup_rqst(xprt, *xp); if (!rovr) goto out_unlock; xprt_pin_rqst(rovr); xprt_update_rtt(rovr->rq_task); - spin_unlock(&xprt->recv_lock); + spin_unlock(&xprt->queue_lock); task = rovr->rq_task; if ((copied = rovr->rq_private_buf.buflen) > repsize) @@ -1072,7 +1320,7 @@ static void xs_udp_data_read_skb(struct rpc_xprt *xprt, /* Suck it into the iovec, verify checksum if not done by hw. */ if (csum_partial_copy_to_xdr(&rovr->rq_private_buf, skb)) { - spin_lock(&xprt->recv_lock); + spin_lock(&xprt->queue_lock); __UDPX_INC_STATS(sk, UDP_MIB_INERRORS); goto out_unpin; } @@ -1081,13 +1329,13 @@ static void xs_udp_data_read_skb(struct rpc_xprt *xprt, spin_lock_bh(&xprt->transport_lock); xprt_adjust_cwnd(xprt, task, copied); spin_unlock_bh(&xprt->transport_lock); - spin_lock(&xprt->recv_lock); + spin_lock(&xprt->queue_lock); xprt_complete_rqst(task, copied); __UDPX_INC_STATS(sk, UDP_MIB_INDATAGRAMS); out_unpin: xprt_unpin_rqst(rovr); out_unlock: - spin_unlock(&xprt->recv_lock); + spin_unlock(&xprt->queue_lock); } static void xs_udp_data_receive(struct sock_xprt *transport) @@ -1096,25 +1344,18 @@ static void xs_udp_data_receive(struct sock_xprt *transport) struct sock *sk; int err; -restart: mutex_lock(&transport->recv_mutex); sk = transport->inet; if (sk == NULL) goto out; + clear_bit(XPRT_SOCK_DATA_READY, &transport->sock_state); for (;;) { skb = skb_recv_udp(sk, 0, 1, &err); - if (skb != NULL) { - xs_udp_data_read_skb(&transport->xprt, sk, skb); - consume_skb(skb); - continue; - } - if (!test_and_clear_bit(XPRT_SOCK_DATA_READY, &transport->sock_state)) + if (skb == NULL) break; - if (need_resched()) { - mutex_unlock(&transport->recv_mutex); - cond_resched(); - goto restart; - } + xs_udp_data_read_skb(&transport->xprt, sk, skb); + consume_skb(skb); + cond_resched(); } out: mutex_unlock(&transport->recv_mutex); @@ -1163,263 +1404,7 @@ static void xs_tcp_force_close(struct rpc_xprt *xprt) xprt_force_disconnect(xprt); } -static inline void xs_tcp_read_fraghdr(struct rpc_xprt *xprt, struct xdr_skb_reader *desc) -{ - struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt); - size_t len, used; - char *p; - - p = ((char *) &transport->tcp_fraghdr) + transport->tcp_offset; - len = sizeof(transport->tcp_fraghdr) - transport->tcp_offset; - used = xdr_skb_read_bits(desc, p, len); - transport->tcp_offset += used; - if (used != len) - return; - - transport->tcp_reclen = ntohl(transport->tcp_fraghdr); - if (transport->tcp_reclen & RPC_LAST_STREAM_FRAGMENT) - transport->tcp_flags |= TCP_RCV_LAST_FRAG; - else - transport->tcp_flags &= ~TCP_RCV_LAST_FRAG; - transport->tcp_reclen &= RPC_FRAGMENT_SIZE_MASK; - - transport->tcp_flags &= ~TCP_RCV_COPY_FRAGHDR; - transport->tcp_offset = 0; - - /* Sanity check of the record length */ - if (unlikely(transport->tcp_reclen < 8)) { - dprintk("RPC: invalid TCP record fragment length\n"); - xs_tcp_force_close(xprt); - return; - } - dprintk("RPC: reading TCP record fragment of length %d\n", - transport->tcp_reclen); -} - -static void xs_tcp_check_fraghdr(struct sock_xprt *transport) -{ - if (transport->tcp_offset == transport->tcp_reclen) { - transport->tcp_flags |= TCP_RCV_COPY_FRAGHDR; - transport->tcp_offset = 0; - if (transport->tcp_flags & TCP_RCV_LAST_FRAG) { - transport->tcp_flags &= ~TCP_RCV_COPY_DATA; - transport->tcp_flags |= TCP_RCV_COPY_XID; - transport->tcp_copied = 0; - } - } -} - -static inline void xs_tcp_read_xid(struct sock_xprt *transport, struct xdr_skb_reader *desc) -{ - size_t len, used; - char *p; - - len = sizeof(transport->tcp_xid) - transport->tcp_offset; - dprintk("RPC: reading XID (%zu bytes)\n", len); - p = ((char *) &transport->tcp_xid) + transport->tcp_offset; - used = xdr_skb_read_bits(desc, p, len); - transport->tcp_offset += used; - if (used != len) - return; - transport->tcp_flags &= ~TCP_RCV_COPY_XID; - transport->tcp_flags |= TCP_RCV_READ_CALLDIR; - transport->tcp_copied = 4; - dprintk("RPC: reading %s XID %08x\n", - (transport->tcp_flags & TCP_RPC_REPLY) ? "reply for" - : "request with", - ntohl(transport->tcp_xid)); - xs_tcp_check_fraghdr(transport); -} - -static inline void xs_tcp_read_calldir(struct sock_xprt *transport, - struct xdr_skb_reader *desc) -{ - size_t len, used; - u32 offset; - char *p; - - /* - * We want transport->tcp_offset to be 8 at the end of this routine - * (4 bytes for the xid and 4 bytes for the call/reply flag). - * When this function is called for the first time, - * transport->tcp_offset is 4 (after having already read the xid). - */ - offset = transport->tcp_offset - sizeof(transport->tcp_xid); - len = sizeof(transport->tcp_calldir) - offset; - dprintk("RPC: reading CALL/REPLY flag (%zu bytes)\n", len); - p = ((char *) &transport->tcp_calldir) + offset; - used = xdr_skb_read_bits(desc, p, len); - transport->tcp_offset += used; - if (used != len) - return; - transport->tcp_flags &= ~TCP_RCV_READ_CALLDIR; - /* - * We don't yet have the XDR buffer, so we will write the calldir - * out after we get the buffer from the 'struct rpc_rqst' - */ - switch (ntohl(transport->tcp_calldir)) { - case RPC_REPLY: - transport->tcp_flags |= TCP_RCV_COPY_CALLDIR; - transport->tcp_flags |= TCP_RCV_COPY_DATA; - transport->tcp_flags |= TCP_RPC_REPLY; - break; - case RPC_CALL: - transport->tcp_flags |= TCP_RCV_COPY_CALLDIR; - transport->tcp_flags |= TCP_RCV_COPY_DATA; - transport->tcp_flags &= ~TCP_RPC_REPLY; - break; - default: - dprintk("RPC: invalid request message type\n"); - xs_tcp_force_close(&transport->xprt); - } - xs_tcp_check_fraghdr(transport); -} - -static inline void xs_tcp_read_common(struct rpc_xprt *xprt, - struct xdr_skb_reader *desc, - struct rpc_rqst *req) -{ - struct sock_xprt *transport = - container_of(xprt, struct sock_xprt, xprt); - struct xdr_buf *rcvbuf; - size_t len; - ssize_t r; - - rcvbuf = &req->rq_private_buf; - - if (transport->tcp_flags & TCP_RCV_COPY_CALLDIR) { - /* - * Save the RPC direction in the XDR buffer - */ - memcpy(rcvbuf->head[0].iov_base + transport->tcp_copied, - &transport->tcp_calldir, - sizeof(transport->tcp_calldir)); - transport->tcp_copied += sizeof(transport->tcp_calldir); - transport->tcp_flags &= ~TCP_RCV_COPY_CALLDIR; - } - - len = desc->count; - if (len > transport->tcp_reclen - transport->tcp_offset) - desc->count = transport->tcp_reclen - transport->tcp_offset; - r = xdr_partial_copy_from_skb(rcvbuf, transport->tcp_copied, - desc, xdr_skb_read_bits); - - if (desc->count) { - /* Error when copying to the receive buffer, - * usually because we weren't able to allocate - * additional buffer pages. All we can do now - * is turn off TCP_RCV_COPY_DATA, so the request - * will not receive any additional updates, - * and time out. - * Any remaining data from this record will - * be discarded. - */ - transport->tcp_flags &= ~TCP_RCV_COPY_DATA; - dprintk("RPC: XID %08x truncated request\n", - ntohl(transport->tcp_xid)); - dprintk("RPC: xprt = %p, tcp_copied = %lu, " - "tcp_offset = %u, tcp_reclen = %u\n", - xprt, transport->tcp_copied, - transport->tcp_offset, transport->tcp_reclen); - return; - } - - transport->tcp_copied += r; - transport->tcp_offset += r; - desc->count = len - r; - - dprintk("RPC: XID %08x read %zd bytes\n", - ntohl(transport->tcp_xid), r); - dprintk("RPC: xprt = %p, tcp_copied = %lu, tcp_offset = %u, " - "tcp_reclen = %u\n", xprt, transport->tcp_copied, - transport->tcp_offset, transport->tcp_reclen); - - if (transport->tcp_copied == req->rq_private_buf.buflen) - transport->tcp_flags &= ~TCP_RCV_COPY_DATA; - else if (transport->tcp_offset == transport->tcp_reclen) { - if (transport->tcp_flags & TCP_RCV_LAST_FRAG) - transport->tcp_flags &= ~TCP_RCV_COPY_DATA; - } -} - -/* - * Finds the request corresponding to the RPC xid and invokes the common - * tcp read code to read the data. - */ -static inline int xs_tcp_read_reply(struct rpc_xprt *xprt, - struct xdr_skb_reader *desc) -{ - struct sock_xprt *transport = - container_of(xprt, struct sock_xprt, xprt); - struct rpc_rqst *req; - - dprintk("RPC: read reply XID %08x\n", ntohl(transport->tcp_xid)); - - /* Find and lock the request corresponding to this xid */ - spin_lock(&xprt->recv_lock); - req = xprt_lookup_rqst(xprt, transport->tcp_xid); - if (!req) { - dprintk("RPC: XID %08x request not found!\n", - ntohl(transport->tcp_xid)); - spin_unlock(&xprt->recv_lock); - return -1; - } - xprt_pin_rqst(req); - spin_unlock(&xprt->recv_lock); - - xs_tcp_read_common(xprt, desc, req); - - spin_lock(&xprt->recv_lock); - if (!(transport->tcp_flags & TCP_RCV_COPY_DATA)) - xprt_complete_rqst(req->rq_task, transport->tcp_copied); - xprt_unpin_rqst(req); - spin_unlock(&xprt->recv_lock); - return 0; -} - #if defined(CONFIG_SUNRPC_BACKCHANNEL) -/* - * Obtains an rpc_rqst previously allocated and invokes the common - * tcp read code to read the data. The result is placed in the callback - * queue. - * If we're unable to obtain the rpc_rqst we schedule the closing of the - * connection and return -1. - */ -static int xs_tcp_read_callback(struct rpc_xprt *xprt, - struct xdr_skb_reader *desc) -{ - struct sock_xprt *transport = - container_of(xprt, struct sock_xprt, xprt); - struct rpc_rqst *req; - - /* Look up the request corresponding to the given XID */ - req = xprt_lookup_bc_request(xprt, transport->tcp_xid); - if (req == NULL) { - printk(KERN_WARNING "Callback slot table overflowed\n"); - xprt_force_disconnect(xprt); - return -1; - } - - dprintk("RPC: read callback XID %08x\n", ntohl(req->rq_xid)); - xs_tcp_read_common(xprt, desc, req); - - if (!(transport->tcp_flags & TCP_RCV_COPY_DATA)) - xprt_complete_bc_request(req, transport->tcp_copied); - - return 0; -} - -static inline int _xs_tcp_read_data(struct rpc_xprt *xprt, - struct xdr_skb_reader *desc) -{ - struct sock_xprt *transport = - container_of(xprt, struct sock_xprt, xprt); - - return (transport->tcp_flags & TCP_RPC_REPLY) ? - xs_tcp_read_reply(xprt, desc) : - xs_tcp_read_callback(xprt, desc); -} - static int xs_tcp_bc_up(struct svc_serv *serv, struct net *net) { int ret; @@ -1435,145 +1420,8 @@ static size_t xs_tcp_bc_maxpayload(struct rpc_xprt *xprt) { return PAGE_SIZE; } -#else -static inline int _xs_tcp_read_data(struct rpc_xprt *xprt, - struct xdr_skb_reader *desc) -{ - return xs_tcp_read_reply(xprt, desc); -} #endif /* CONFIG_SUNRPC_BACKCHANNEL */ -/* - * Read data off the transport. This can be either an RPC_CALL or an - * RPC_REPLY. Relay the processing to helper functions. - */ -static void xs_tcp_read_data(struct rpc_xprt *xprt, - struct xdr_skb_reader *desc) -{ - struct sock_xprt *transport = - container_of(xprt, struct sock_xprt, xprt); - - if (_xs_tcp_read_data(xprt, desc) == 0) - xs_tcp_check_fraghdr(transport); - else { - /* - * The transport_lock protects the request handling. - * There's no need to hold it to update the tcp_flags. - */ - transport->tcp_flags &= ~TCP_RCV_COPY_DATA; - } -} - -static inline void xs_tcp_read_discard(struct sock_xprt *transport, struct xdr_skb_reader *desc) -{ - size_t len; - - len = transport->tcp_reclen - transport->tcp_offset; - if (len > desc->count) - len = desc->count; - desc->count -= len; - desc->offset += len; - transport->tcp_offset += len; - dprintk("RPC: discarded %zu bytes\n", len); - xs_tcp_check_fraghdr(transport); -} - -static int xs_tcp_data_recv(read_descriptor_t *rd_desc, struct sk_buff *skb, unsigned int offset, size_t len) -{ - struct rpc_xprt *xprt = rd_desc->arg.data; - struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt); - struct xdr_skb_reader desc = { - .skb = skb, - .offset = offset, - .count = len, - }; - size_t ret; - - dprintk("RPC: xs_tcp_data_recv started\n"); - do { - trace_xs_tcp_data_recv(transport); - /* Read in a new fragment marker if necessary */ - /* Can we ever really expect to get completely empty fragments? */ - if (transport->tcp_flags & TCP_RCV_COPY_FRAGHDR) { - xs_tcp_read_fraghdr(xprt, &desc); - continue; - } - /* Read in the xid if necessary */ - if (transport->tcp_flags & TCP_RCV_COPY_XID) { - xs_tcp_read_xid(transport, &desc); - continue; - } - /* Read in the call/reply flag */ - if (transport->tcp_flags & TCP_RCV_READ_CALLDIR) { - xs_tcp_read_calldir(transport, &desc); - continue; - } - /* Read in the request data */ - if (transport->tcp_flags & TCP_RCV_COPY_DATA) { - xs_tcp_read_data(xprt, &desc); - continue; - } - /* Skip over any trailing bytes on short reads */ - xs_tcp_read_discard(transport, &desc); - } while (desc.count); - ret = len - desc.count; - if (ret < rd_desc->count) - rd_desc->count -= ret; - else - rd_desc->count = 0; - trace_xs_tcp_data_recv(transport); - dprintk("RPC: xs_tcp_data_recv done\n"); - return ret; -} - -static void xs_tcp_data_receive(struct sock_xprt *transport) -{ - struct rpc_xprt *xprt = &transport->xprt; - struct sock *sk; - read_descriptor_t rd_desc = { - .arg.data = xprt, - }; - unsigned long total = 0; - int read = 0; - -restart: - mutex_lock(&transport->recv_mutex); - sk = transport->inet; - if (sk == NULL) - goto out; - - /* We use rd_desc to pass struct xprt to xs_tcp_data_recv */ - for (;;) { - rd_desc.count = RPC_TCP_READ_CHUNK_SZ; - lock_sock(sk); - read = tcp_read_sock(sk, &rd_desc, xs_tcp_data_recv); - if (rd_desc.count != 0 || read < 0) { - clear_bit(XPRT_SOCK_DATA_READY, &transport->sock_state); - release_sock(sk); - break; - } - release_sock(sk); - total += read; - if (need_resched()) { - mutex_unlock(&transport->recv_mutex); - cond_resched(); - goto restart; - } - } - if (test_bit(XPRT_SOCK_DATA_READY, &transport->sock_state)) - queue_work(xprtiod_workqueue, &transport->recv_worker); -out: - mutex_unlock(&transport->recv_mutex); - trace_xs_tcp_data_ready(xprt, read, total); -} - -static void xs_tcp_data_receive_workfn(struct work_struct *work) -{ - struct sock_xprt *transport = - container_of(work, struct sock_xprt, recv_worker); - xs_tcp_data_receive(transport); -} - /** * xs_tcp_state_change - callback to handle TCP socket state changes * @sk: socket whose state has changed @@ -1600,17 +1448,13 @@ static void xs_tcp_state_change(struct sock *sk) case TCP_ESTABLISHED: spin_lock(&xprt->transport_lock); if (!xprt_test_and_set_connected(xprt)) { - - /* Reset TCP record info */ - transport->tcp_offset = 0; - transport->tcp_reclen = 0; - transport->tcp_copied = 0; - transport->tcp_flags = - TCP_RCV_COPY_FRAGHDR | TCP_RCV_COPY_XID; xprt->connect_cookie++; clear_bit(XPRT_SOCK_CONNECTING, &transport->sock_state); xprt_clear_connecting(xprt); + xprt->stat.connect_count++; + xprt->stat.connect_time += (long)jiffies - + xprt->stat.connect_start; xprt_wake_pending_tasks(xprt, -EAGAIN); } spin_unlock(&xprt->transport_lock); @@ -1675,7 +1519,8 @@ static void xs_write_space(struct sock *sk) if (!wq || test_and_clear_bit(SOCKWQ_ASYNC_NOSPACE, &wq->flags) == 0) goto out; - xprt_write_space(xprt); + if (xprt_write_space(xprt)) + sk->sk_write_pending--; out: rcu_read_unlock(); } @@ -1773,11 +1618,17 @@ static void xs_udp_timer(struct rpc_xprt *xprt, struct rpc_task *task) spin_unlock_bh(&xprt->transport_lock); } -static unsigned short xs_get_random_port(void) +static int xs_get_random_port(void) { - unsigned short range = xprt_max_resvport - xprt_min_resvport + 1; - unsigned short rand = (unsigned short) prandom_u32() % range; - return rand + xprt_min_resvport; + unsigned short min = xprt_min_resvport, max = xprt_max_resvport; + unsigned short range; + unsigned short rand; + + if (max < min) + return -EADDRINUSE; + range = max - min + 1; + rand = (unsigned short) prandom_u32() % range; + return rand + min; } /** @@ -1833,9 +1684,9 @@ static void xs_set_srcport(struct sock_xprt *transport, struct socket *sock) transport->srcport = xs_sock_getport(sock); } -static unsigned short xs_get_srcport(struct sock_xprt *transport) +static int xs_get_srcport(struct sock_xprt *transport) { - unsigned short port = transport->srcport; + int port = transport->srcport; if (port == 0 && transport->xprt.resvport) port = xs_get_random_port(); @@ -1856,7 +1707,7 @@ static int xs_bind(struct sock_xprt *transport, struct socket *sock) { struct sockaddr_storage myaddr; int err, nloop = 0; - unsigned short port = xs_get_srcport(transport); + int port = xs_get_srcport(transport); unsigned short last; /* @@ -1874,8 +1725,8 @@ static int xs_bind(struct sock_xprt *transport, struct socket *sock) * transport->xprt.resvport == 1) xs_get_srcport above will * ensure that port is non-zero and we will bind as needed. */ - if (port == 0) - return 0; + if (port <= 0) + return port; memcpy(&myaddr, &transport->srcaddr, transport->xprt.addrlen); do { @@ -2028,9 +1879,8 @@ static int xs_local_finish_connecting(struct rpc_xprt *xprt, write_unlock_bh(&sk->sk_callback_lock); } - /* Tell the socket layer to start connecting... */ - xprt->stat.connect_count++; - xprt->stat.connect_start = jiffies; + xs_stream_reset_connect(transport); + return kernel_connect(sock, xs_addr(xprt), xprt->addrlen, 0); } @@ -2062,6 +1912,9 @@ static int xs_local_setup_socket(struct sock_xprt *transport) case 0: dprintk("RPC: xprt %p connected to %s\n", xprt, xprt->address_strings[RPC_DISPLAY_ADDR]); + xprt->stat.connect_count++; + xprt->stat.connect_time += (long)jiffies - + xprt->stat.connect_start; xprt_set_connected(xprt); case -ENOBUFS: break; @@ -2386,9 +2239,10 @@ static int xs_tcp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock) xs_set_memalloc(xprt); + /* Reset TCP record info */ + xs_stream_reset_connect(transport); + /* Tell the socket layer to start connecting... */ - xprt->stat.connect_count++; - xprt->stat.connect_start = jiffies; set_bit(XPRT_SOCK_CONNECTING, &transport->sock_state); ret = kernel_connect(sock, xs_addr(xprt), xprt->addrlen, O_NONBLOCK); switch (ret) { @@ -2561,7 +2415,7 @@ static void xs_local_print_stats(struct rpc_xprt *xprt, struct seq_file *seq) "%llu %llu %lu %llu %llu\n", xprt->stat.bind_count, xprt->stat.connect_count, - xprt->stat.connect_time, + xprt->stat.connect_time / HZ, idle_time, xprt->stat.sends, xprt->stat.recvs, @@ -2616,7 +2470,7 @@ static void xs_tcp_print_stats(struct rpc_xprt *xprt, struct seq_file *seq) transport->srcport, xprt->stat.bind_count, xprt->stat.connect_count, - xprt->stat.connect_time, + xprt->stat.connect_time / HZ, idle_time, xprt->stat.sends, xprt->stat.recvs, @@ -2704,9 +2558,8 @@ static int bc_sendto(struct rpc_rqst *req) /* * The send routine. Borrows from svc_send */ -static int bc_send_request(struct rpc_task *task) +static int bc_send_request(struct rpc_rqst *req) { - struct rpc_rqst *req = task->tk_rqstp; struct svc_xprt *xprt; int len; @@ -2720,12 +2573,7 @@ static int bc_send_request(struct rpc_task *task) * Grab the mutex to serialize data as the connection is shared * with the fore channel */ - if (!mutex_trylock(&xprt->xpt_mutex)) { - rpc_sleep_on(&xprt->xpt_bc_pending, task, NULL); - if (!mutex_trylock(&xprt->xpt_mutex)) - return -EAGAIN; - rpc_wake_up_queued_task(&xprt->xpt_bc_pending, task); - } + mutex_lock(&xprt->xpt_mutex); if (test_bit(XPT_DEAD, &xprt->xpt_flags)) len = -ENOTCONN; else @@ -2761,7 +2609,7 @@ static void bc_destroy(struct rpc_xprt *xprt) static const struct rpc_xprt_ops xs_local_ops = { .reserve_xprt = xprt_reserve_xprt, - .release_xprt = xs_tcp_release_xprt, + .release_xprt = xprt_release_xprt, .alloc_slot = xprt_alloc_slot, .free_slot = xprt_free_slot, .rpcbind = xs_local_rpcbind, @@ -2769,6 +2617,7 @@ static const struct rpc_xprt_ops xs_local_ops = { .connect = xs_local_connect, .buf_alloc = rpc_malloc, .buf_free = rpc_free, + .prepare_request = xs_stream_prepare_request, .send_request = xs_local_send_request, .set_retrans_timeout = xprt_set_retrans_timeout_def, .close = xs_close, @@ -2803,14 +2652,15 @@ static const struct rpc_xprt_ops xs_udp_ops = { static const struct rpc_xprt_ops xs_tcp_ops = { .reserve_xprt = xprt_reserve_xprt, - .release_xprt = xs_tcp_release_xprt, - .alloc_slot = xprt_lock_and_alloc_slot, + .release_xprt = xprt_release_xprt, + .alloc_slot = xprt_alloc_slot, .free_slot = xprt_free_slot, .rpcbind = rpcb_getport_async, .set_port = xs_set_port, .connect = xs_connect, .buf_alloc = rpc_malloc, .buf_free = rpc_free, + .prepare_request = xs_stream_prepare_request, .send_request = xs_tcp_send_request, .set_retrans_timeout = xprt_set_retrans_timeout_def, .close = xs_tcp_shutdown, @@ -2952,9 +2802,8 @@ static struct rpc_xprt *xs_setup_local(struct xprt_create *args) xprt->ops = &xs_local_ops; xprt->timeout = &xs_local_default_timeout; - INIT_WORK(&transport->recv_worker, xs_local_data_receive_workfn); - INIT_DELAYED_WORK(&transport->connect_worker, - xs_dummy_setup_socket); + INIT_WORK(&transport->recv_worker, xs_stream_data_receive_workfn); + INIT_DELAYED_WORK(&transport->connect_worker, xs_dummy_setup_socket); switch (sun->sun_family) { case AF_LOCAL: @@ -3106,7 +2955,7 @@ static struct rpc_xprt *xs_setup_tcp(struct xprt_create *args) xprt->connect_timeout = xprt->timeout->to_initval * (xprt->timeout->to_retries + 1); - INIT_WORK(&transport->recv_worker, xs_tcp_data_receive_workfn); + INIT_WORK(&transport->recv_worker, xs_stream_data_receive_workfn); INIT_DELAYED_WORK(&transport->connect_worker, xs_tcp_setup_socket); switch (addr->sa_family) { @@ -3317,12 +3166,8 @@ static int param_set_uint_minmax(const char *val, static int param_set_portnr(const char *val, const struct kernel_param *kp) { - if (kp->arg == &xprt_min_resvport) - return param_set_uint_minmax(val, kp, - RPC_MIN_RESVPORT, - xprt_max_resvport); return param_set_uint_minmax(val, kp, - xprt_min_resvport, + RPC_MIN_RESVPORT, RPC_MAX_RESVPORT); } diff --git a/net/xfrm/Kconfig b/net/xfrm/Kconfig index 4a9ee2d83158..140270a13d54 100644 --- a/net/xfrm/Kconfig +++ b/net/xfrm/Kconfig @@ -8,7 +8,6 @@ config XFRM config XFRM_OFFLOAD bool - depends on XFRM config XFRM_ALGO tristate diff --git a/net/xfrm/xfrm_hash.c b/net/xfrm/xfrm_hash.c index 2ad33ce1ea17..eca8d84d99bf 100644 --- a/net/xfrm/xfrm_hash.c +++ b/net/xfrm/xfrm_hash.c @@ -6,7 +6,7 @@ #include <linux/kernel.h> #include <linux/mm.h> -#include <linux/bootmem.h> +#include <linux/memblock.h> #include <linux/vmalloc.h> #include <linux/slab.h> #include <linux/xfrm.h> |