diff options
Diffstat (limited to 'net')
89 files changed, 1462 insertions, 773 deletions
diff --git a/net/9p/client.c b/net/9p/client.c index 9186550d77a6..0004cbaac4a4 100644 --- a/net/9p/client.c +++ b/net/9p/client.c @@ -415,9 +415,17 @@ static void p9_free_req(struct p9_client *c, struct p9_req_t *r) * req: request received * */ -void p9_client_cb(struct p9_client *c, struct p9_req_t *req) +void p9_client_cb(struct p9_client *c, struct p9_req_t *req, int status) { p9_debug(P9_DEBUG_MUX, " tag %d\n", req->tc->tag); + + /* + * This barrier is needed to make sure any change made to req before + * the other thread wakes up will indeed be seen by the waiting side. + */ + smp_wmb(); + req->status = status; + wake_up(req->wq); p9_debug(P9_DEBUG_MUX, "wakeup: %d\n", req->tc->tag); } @@ -655,16 +663,13 @@ static int p9_client_flush(struct p9_client *c, struct p9_req_t *oldreq) if (IS_ERR(req)) return PTR_ERR(req); - /* * if we haven't received a response for oldreq, * remove it from the list */ - if (oldreq->status == REQ_STATUS_FLSH) { - spin_lock(&c->lock); - list_del(&oldreq->req_list); - spin_unlock(&c->lock); - } + if (oldreq->status == REQ_STATUS_SENT) + if (c->trans_mod->cancelled) + c->trans_mod->cancelled(c, oldreq); p9_free_req(c, req); return 0; @@ -751,6 +756,12 @@ again: err = wait_event_interruptible(*req->wq, req->status >= REQ_STATUS_RCVD); + /* + * Make sure our req is coherent with regard to updates in other + * threads - echoes to wmb() in the callback + */ + smp_rmb(); + if ((err == -ERESTARTSYS) && (c->status == Connected) && (type == P9_TFLUSH)) { sigpending = 1; diff --git a/net/9p/trans_fd.c b/net/9p/trans_fd.c index b7bd7f2961bf..80d08f6664cb 100644 --- a/net/9p/trans_fd.c +++ b/net/9p/trans_fd.c @@ -66,20 +66,6 @@ struct p9_fd_opts { int privport; }; -/** - * struct p9_trans_fd - transport state - * @rd: reference to file to read from - * @wr: reference of file to write to - * @conn: connection state reference - * - */ - -struct p9_trans_fd { - struct file *rd; - struct file *wr; - struct p9_conn *conn; -}; - /* * Option Parsing (code inspired by NFS code) * - a little lazy - parse all fd-transport options @@ -159,6 +145,20 @@ struct p9_conn { unsigned long wsched; }; +/** + * struct p9_trans_fd - transport state + * @rd: reference to file to read from + * @wr: reference of file to write to + * @conn: connection state reference + * + */ + +struct p9_trans_fd { + struct file *rd; + struct file *wr; + struct p9_conn conn; +}; + static void p9_poll_workfn(struct work_struct *work); static DEFINE_SPINLOCK(p9_poll_lock); @@ -212,15 +212,9 @@ static void p9_conn_cancel(struct p9_conn *m, int err) m->err = err; list_for_each_entry_safe(req, rtmp, &m->req_list, req_list) { - req->status = REQ_STATUS_ERROR; - if (!req->t_err) - req->t_err = err; list_move(&req->req_list, &cancel_list); } list_for_each_entry_safe(req, rtmp, &m->unsent_req_list, req_list) { - req->status = REQ_STATUS_ERROR; - if (!req->t_err) - req->t_err = err; list_move(&req->req_list, &cancel_list); } spin_unlock_irqrestore(&m->client->lock, flags); @@ -228,7 +222,9 @@ static void p9_conn_cancel(struct p9_conn *m, int err) list_for_each_entry_safe(req, rtmp, &cancel_list, req_list) { p9_debug(P9_DEBUG_ERROR, "call back req %p\n", req); list_del(&req->req_list); - p9_client_cb(m->client, req); + if (!req->t_err) + req->t_err = err; + p9_client_cb(m->client, req, REQ_STATUS_ERROR); } } @@ -302,6 +298,7 @@ static void p9_read_work(struct work_struct *work) { int n, err; struct p9_conn *m; + int status = REQ_STATUS_ERROR; m = container_of(work, struct p9_conn, rq); @@ -348,8 +345,7 @@ static void p9_read_work(struct work_struct *work) "mux %p pkt: size: %d bytes tag: %d\n", m, n, tag); m->req = p9_tag_lookup(m->client, tag); - if (!m->req || (m->req->status != REQ_STATUS_SENT && - m->req->status != REQ_STATUS_FLSH)) { + if (!m->req || (m->req->status != REQ_STATUS_SENT)) { p9_debug(P9_DEBUG_ERROR, "Unexpected packet tag %d\n", tag); err = -EIO; @@ -375,10 +371,10 @@ static void p9_read_work(struct work_struct *work) p9_debug(P9_DEBUG_TRANS, "got new packet\n"); spin_lock(&m->client->lock); if (m->req->status != REQ_STATUS_ERROR) - m->req->status = REQ_STATUS_RCVD; + status = REQ_STATUS_RCVD; list_del(&m->req->req_list); spin_unlock(&m->client->lock); - p9_client_cb(m->client, m->req); + p9_client_cb(m->client, m->req, status); m->rbuf = NULL; m->rpos = 0; m->rsize = 0; @@ -573,21 +569,19 @@ p9_pollwait(struct file *filp, wait_queue_head_t *wait_address, poll_table *p) } /** - * p9_conn_create - allocate and initialize the per-session mux data + * p9_conn_create - initialize the per-session mux data * @client: client instance * * Note: Creates the polling task if this is the first session. */ -static struct p9_conn *p9_conn_create(struct p9_client *client) +static void p9_conn_create(struct p9_client *client) { int n; - struct p9_conn *m; + struct p9_trans_fd *ts = client->trans; + struct p9_conn *m = &ts->conn; p9_debug(P9_DEBUG_TRANS, "client %p msize %d\n", client, client->msize); - m = kzalloc(sizeof(struct p9_conn), GFP_KERNEL); - if (!m) - return ERR_PTR(-ENOMEM); INIT_LIST_HEAD(&m->mux_list); m->client = client; @@ -609,8 +603,6 @@ static struct p9_conn *p9_conn_create(struct p9_client *client) p9_debug(P9_DEBUG_TRANS, "mux %p can write\n", m); set_bit(Wpending, &m->wsched); } - - return m; } /** @@ -669,7 +661,7 @@ static int p9_fd_request(struct p9_client *client, struct p9_req_t *req) { int n; struct p9_trans_fd *ts = client->trans; - struct p9_conn *m = ts->conn; + struct p9_conn *m = &ts->conn; p9_debug(P9_DEBUG_TRANS, "mux %p task %p tcall %p id %d\n", m, current, req->tc, req->tc->id); @@ -704,14 +696,26 @@ static int p9_fd_cancel(struct p9_client *client, struct p9_req_t *req) list_del(&req->req_list); req->status = REQ_STATUS_FLSHD; ret = 0; - } else if (req->status == REQ_STATUS_SENT) - req->status = REQ_STATUS_FLSH; - + } spin_unlock(&client->lock); return ret; } +static int p9_fd_cancelled(struct p9_client *client, struct p9_req_t *req) +{ + p9_debug(P9_DEBUG_TRANS, "client %p req %p\n", client, req); + + /* we haven't received a response for oldreq, + * remove it from the list. + */ + spin_lock(&client->lock); + list_del(&req->req_list); + spin_unlock(&client->lock); + + return 0; +} + /** * parse_opts - parse mount options into p9_fd_opts structure * @params: options string passed from mount @@ -780,7 +784,7 @@ static int parse_opts(char *params, struct p9_fd_opts *opts) static int p9_fd_open(struct p9_client *client, int rfd, int wfd) { - struct p9_trans_fd *ts = kmalloc(sizeof(struct p9_trans_fd), + struct p9_trans_fd *ts = kzalloc(sizeof(struct p9_trans_fd), GFP_KERNEL); if (!ts) return -ENOMEM; @@ -806,9 +810,8 @@ static int p9_socket_open(struct p9_client *client, struct socket *csocket) { struct p9_trans_fd *p; struct file *file; - int ret; - p = kmalloc(sizeof(struct p9_trans_fd), GFP_KERNEL); + p = kzalloc(sizeof(struct p9_trans_fd), GFP_KERNEL); if (!p) return -ENOMEM; @@ -829,20 +832,12 @@ static int p9_socket_open(struct p9_client *client, struct socket *csocket) p->rd->f_flags |= O_NONBLOCK; - p->conn = p9_conn_create(client); - if (IS_ERR(p->conn)) { - ret = PTR_ERR(p->conn); - p->conn = NULL; - kfree(p); - sockfd_put(csocket); - sockfd_put(csocket); - return ret; - } + p9_conn_create(client); return 0; } /** - * p9_mux_destroy - cancels all pending requests and frees mux resources + * p9_mux_destroy - cancels all pending requests of mux * @m: mux to destroy * */ @@ -859,7 +854,6 @@ static void p9_conn_destroy(struct p9_conn *m) p9_conn_cancel(m, -ECONNRESET); m->client = NULL; - kfree(m); } /** @@ -881,7 +875,7 @@ static void p9_fd_close(struct p9_client *client) client->status = Disconnected; - p9_conn_destroy(ts->conn); + p9_conn_destroy(&ts->conn); if (ts->rd) fput(ts->rd); @@ -1033,14 +1027,7 @@ p9_fd_create(struct p9_client *client, const char *addr, char *args) return err; p = (struct p9_trans_fd *) client->trans; - p->conn = p9_conn_create(client); - if (IS_ERR(p->conn)) { - err = PTR_ERR(p->conn); - p->conn = NULL; - fput(p->rd); - fput(p->wr); - return err; - } + p9_conn_create(client); return 0; } @@ -1053,6 +1040,7 @@ static struct p9_trans_module p9_tcp_trans = { .close = p9_fd_close, .request = p9_fd_request, .cancel = p9_fd_cancel, + .cancelled = p9_fd_cancelled, .owner = THIS_MODULE, }; @@ -1064,6 +1052,7 @@ static struct p9_trans_module p9_unix_trans = { .close = p9_fd_close, .request = p9_fd_request, .cancel = p9_fd_cancel, + .cancelled = p9_fd_cancelled, .owner = THIS_MODULE, }; @@ -1075,6 +1064,7 @@ static struct p9_trans_module p9_fd_trans = { .close = p9_fd_close, .request = p9_fd_request, .cancel = p9_fd_cancel, + .cancelled = p9_fd_cancelled, .owner = THIS_MODULE, }; diff --git a/net/9p/trans_rdma.c b/net/9p/trans_rdma.c index 8f68df5d2973..14ad43b5cf89 100644 --- a/net/9p/trans_rdma.c +++ b/net/9p/trans_rdma.c @@ -193,6 +193,8 @@ static int parse_opts(char *params, struct p9_rdma_opts *opts) if (!*p) continue; token = match_token(p, tokens, args); + if (token == Opt_err) + continue; r = match_int(&args[0], &option); if (r < 0) { p9_debug(P9_DEBUG_ERROR, @@ -305,8 +307,7 @@ handle_recv(struct p9_client *client, struct p9_trans_rdma *rdma, } req->rc = c->rc; - req->status = REQ_STATUS_RCVD; - p9_client_cb(client, req); + p9_client_cb(client, req, REQ_STATUS_RCVD); return; @@ -511,6 +512,11 @@ dont_need_post_recv: goto send_error; } + /* Mark request as `sent' *before* we actually send it, + * because doing if after could erase the REQ_STATUS_RCVD + * status in case of a very fast reply. + */ + req->status = REQ_STATUS_SENT; err = ib_post_send(rdma->qp, &wr, &bad_wr); if (err) goto send_error; @@ -520,6 +526,7 @@ dont_need_post_recv: /* Handle errors that happened during or while preparing the send: */ send_error: + req->status = REQ_STATUS_ERROR; kfree(c); p9_debug(P9_DEBUG_ERROR, "Error %d in rdma_request()\n", err); @@ -582,12 +589,24 @@ static struct p9_trans_rdma *alloc_rdma(struct p9_rdma_opts *opts) return rdma; } -/* its not clear to me we can do anything after send has been posted */ static int rdma_cancel(struct p9_client *client, struct p9_req_t *req) { + /* Nothing to do here. + * We will take care of it (if we have to) in rdma_cancelled() + */ return 1; } +/* A request has been fully flushed without a reply. + * That means we have posted one buffer in excess. + */ +static int rdma_cancelled(struct p9_client *client, struct p9_req_t *req) +{ + struct p9_trans_rdma *rdma = client->trans; + atomic_inc(&rdma->excess_rc); + return 0; +} + /** * trans_create_rdma - Transport method for creating atransport instance * @client: client instance @@ -721,6 +740,7 @@ static struct p9_trans_module p9_rdma_trans = { .close = rdma_close, .request = rdma_request, .cancel = rdma_cancel, + .cancelled = rdma_cancelled, }; /** diff --git a/net/9p/trans_virtio.c b/net/9p/trans_virtio.c index ac2666c1d011..6940d8fe8971 100644 --- a/net/9p/trans_virtio.c +++ b/net/9p/trans_virtio.c @@ -164,8 +164,7 @@ static void req_done(struct virtqueue *vq) p9_debug(P9_DEBUG_TRANS, ": rc %p\n", rc); p9_debug(P9_DEBUG_TRANS, ": lookup tag %d\n", rc->tag); req = p9_tag_lookup(chan->client, rc->tag); - req->status = REQ_STATUS_RCVD; - p9_client_cb(chan->client, req); + p9_client_cb(chan->client, req, REQ_STATUS_RCVD); } } diff --git a/net/atm/clip.c b/net/atm/clip.c index 8215f7cb170b..ba291ce4bdff 100644 --- a/net/atm/clip.c +++ b/net/atm/clip.c @@ -68,7 +68,7 @@ static int to_atmarpd(enum atmarp_ctrl_type type, int itf, __be32 ip) sk = sk_atm(atmarpd); skb_queue_tail(&sk->sk_receive_queue, skb); - sk->sk_data_ready(sk, skb->len); + sk->sk_data_ready(sk); return 0; } diff --git a/net/atm/lec.c b/net/atm/lec.c index 5a2f602d07e1..4c5b8ba0f84f 100644 --- a/net/atm/lec.c +++ b/net/atm/lec.c @@ -152,7 +152,7 @@ static void lec_handle_bridge(struct sk_buff *skb, struct net_device *dev) atm_force_charge(priv->lecd, skb2->truesize); sk = sk_atm(priv->lecd); skb_queue_tail(&sk->sk_receive_queue, skb2); - sk->sk_data_ready(sk, skb2->len); + sk->sk_data_ready(sk); } } #endif /* defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE) */ @@ -447,7 +447,7 @@ static int lec_atm_send(struct atm_vcc *vcc, struct sk_buff *skb) atm_force_charge(priv->lecd, skb2->truesize); sk = sk_atm(priv->lecd); skb_queue_tail(&sk->sk_receive_queue, skb2); - sk->sk_data_ready(sk, skb2->len); + sk->sk_data_ready(sk); } } #endif /* defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE) */ @@ -530,13 +530,13 @@ send_to_lecd(struct lec_priv *priv, atmlec_msg_type type, atm_force_charge(priv->lecd, skb->truesize); sk = sk_atm(priv->lecd); skb_queue_tail(&sk->sk_receive_queue, skb); - sk->sk_data_ready(sk, skb->len); + sk->sk_data_ready(sk); if (data != NULL) { pr_debug("about to send %d bytes of data\n", data->len); atm_force_charge(priv->lecd, data->truesize); skb_queue_tail(&sk->sk_receive_queue, data); - sk->sk_data_ready(sk, skb->len); + sk->sk_data_ready(sk); } return 0; @@ -616,7 +616,7 @@ static void lec_push(struct atm_vcc *vcc, struct sk_buff *skb) pr_debug("%s: To daemon\n", dev->name); skb_queue_tail(&sk->sk_receive_queue, skb); - sk->sk_data_ready(sk, skb->len); + sk->sk_data_ready(sk); } else { /* Data frame, queue to protocol handlers */ struct lec_arp_table *entry; unsigned char *src, *dst; diff --git a/net/atm/mpc.c b/net/atm/mpc.c index 91dc58f1124d..e8e0e7a8a23d 100644 --- a/net/atm/mpc.c +++ b/net/atm/mpc.c @@ -706,7 +706,7 @@ static void mpc_push(struct atm_vcc *vcc, struct sk_buff *skb) dprintk("(%s) control packet arrived\n", dev->name); /* Pass control packets to daemon */ skb_queue_tail(&sk->sk_receive_queue, skb); - sk->sk_data_ready(sk, skb->len); + sk->sk_data_ready(sk); return; } @@ -992,7 +992,7 @@ int msg_to_mpoad(struct k_message *mesg, struct mpoa_client *mpc) sk = sk_atm(mpc->mpoad_vcc); skb_queue_tail(&sk->sk_receive_queue, skb); - sk->sk_data_ready(sk, skb->len); + sk->sk_data_ready(sk); return 0; } @@ -1273,7 +1273,7 @@ static void purge_egress_shortcut(struct atm_vcc *vcc, eg_cache_entry *entry) sk = sk_atm(vcc); skb_queue_tail(&sk->sk_receive_queue, skb); - sk->sk_data_ready(sk, skb->len); + sk->sk_data_ready(sk); dprintk("exiting\n"); } diff --git a/net/atm/raw.c b/net/atm/raw.c index b4f7b9ff3c74..2e17e97a7a8b 100644 --- a/net/atm/raw.c +++ b/net/atm/raw.c @@ -25,7 +25,7 @@ static void atm_push_raw(struct atm_vcc *vcc, struct sk_buff *skb) struct sock *sk = sk_atm(vcc); skb_queue_tail(&sk->sk_receive_queue, skb); - sk->sk_data_ready(sk, skb->len); + sk->sk_data_ready(sk); } } diff --git a/net/atm/signaling.c b/net/atm/signaling.c index 4176887e72eb..523bce72f698 100644 --- a/net/atm/signaling.c +++ b/net/atm/signaling.c @@ -51,7 +51,7 @@ static void sigd_put_skb(struct sk_buff *skb) #endif atm_force_charge(sigd, skb->truesize); skb_queue_tail(&sk_atm(sigd)->sk_receive_queue, skb); - sk_atm(sigd)->sk_data_ready(sk_atm(sigd), skb->len); + sk_atm(sigd)->sk_data_ready(sk_atm(sigd)); } static void modify_qos(struct atm_vcc *vcc, struct atmsvc_msg *msg) diff --git a/net/ax25/ax25_in.c b/net/ax25/ax25_in.c index 96f4cab3a2f9..7ed8ab724819 100644 --- a/net/ax25/ax25_in.c +++ b/net/ax25/ax25_in.c @@ -422,7 +422,7 @@ static int ax25_rcv(struct sk_buff *skb, struct net_device *dev, if (sk) { if (!sock_flag(sk, SOCK_DEAD)) - sk->sk_data_ready(sk, skb->len); + sk->sk_data_ready(sk); sock_put(sk); } else { free: diff --git a/net/bluetooth/l2cap_sock.c b/net/bluetooth/l2cap_sock.c index f59e00c2daa9..ef5e5b04f34f 100644 --- a/net/bluetooth/l2cap_sock.c +++ b/net/bluetooth/l2cap_sock.c @@ -1271,7 +1271,7 @@ static void l2cap_sock_teardown_cb(struct l2cap_chan *chan, int err) if (parent) { bt_accept_unlink(sk); - parent->sk_data_ready(parent, 0); + parent->sk_data_ready(parent); } else { sk->sk_state_change(sk); } @@ -1327,7 +1327,7 @@ static void l2cap_sock_ready_cb(struct l2cap_chan *chan) sk->sk_state_change(sk); if (parent) - parent->sk_data_ready(parent, 0); + parent->sk_data_ready(parent); release_sock(sk); } @@ -1340,7 +1340,7 @@ static void l2cap_sock_defer_cb(struct l2cap_chan *chan) parent = bt_sk(sk)->parent; if (parent) - parent->sk_data_ready(parent, 0); + parent->sk_data_ready(parent); release_sock(sk); } diff --git a/net/bluetooth/rfcomm/core.c b/net/bluetooth/rfcomm/core.c index 633cceeb943e..cf620260affa 100644 --- a/net/bluetooth/rfcomm/core.c +++ b/net/bluetooth/rfcomm/core.c @@ -186,9 +186,9 @@ static void rfcomm_l2state_change(struct sock *sk) rfcomm_schedule(); } -static void rfcomm_l2data_ready(struct sock *sk, int bytes) +static void rfcomm_l2data_ready(struct sock *sk) { - BT_DBG("%p bytes %d", sk, bytes); + BT_DBG("%p", sk); rfcomm_schedule(); } diff --git a/net/bluetooth/rfcomm/sock.c b/net/bluetooth/rfcomm/sock.c index eabd25ab5ad9..c603a5eb4720 100644 --- a/net/bluetooth/rfcomm/sock.c +++ b/net/bluetooth/rfcomm/sock.c @@ -54,7 +54,7 @@ static void rfcomm_sk_data_ready(struct rfcomm_dlc *d, struct sk_buff *skb) atomic_add(skb->len, &sk->sk_rmem_alloc); skb_queue_tail(&sk->sk_receive_queue, skb); - sk->sk_data_ready(sk, skb->len); + sk->sk_data_ready(sk); if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) rfcomm_dlc_throttle(d); @@ -84,7 +84,7 @@ static void rfcomm_sk_state_change(struct rfcomm_dlc *d, int err) sock_set_flag(sk, SOCK_ZAPPED); bt_accept_unlink(sk); } - parent->sk_data_ready(parent, 0); + parent->sk_data_ready(parent); } else { if (d->state == BT_CONNECTED) rfcomm_session_getaddr(d->session, diff --git a/net/bluetooth/sco.c b/net/bluetooth/sco.c index ab1e6fcca4c5..c06dbd3938e8 100644 --- a/net/bluetooth/sco.c +++ b/net/bluetooth/sco.c @@ -1024,7 +1024,7 @@ static void sco_conn_ready(struct sco_conn *conn) sk->sk_state = BT_CONNECTED; /* Wake up parent */ - parent->sk_data_ready(parent, 1); + parent->sk_data_ready(parent); bh_unlock_sock(parent); diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c index d0cca3c65f01..7985deaff52f 100644 --- a/net/bridge/br_input.c +++ b/net/bridge/br_input.c @@ -73,7 +73,7 @@ int br_handle_frame_finish(struct sk_buff *skb) goto drop; if (!br_allowed_ingress(p->br, nbp_get_vlan_info(p), skb, &vid)) - goto drop; + goto out; /* insert into forwarding database after filtering to avoid spoofing */ br = p->br; diff --git a/net/bridge/br_vlan.c b/net/bridge/br_vlan.c index 91510712c7a7..4a3716102789 100644 --- a/net/bridge/br_vlan.c +++ b/net/bridge/br_vlan.c @@ -170,7 +170,7 @@ bool br_allowed_ingress(struct net_bridge *br, struct net_port_vlans *v, * rejected. */ if (!v) - return false; + goto drop; /* If vlan tx offload is disabled on bridge device and frame was * sent from vlan device on the bridge device, it does not have @@ -193,7 +193,7 @@ bool br_allowed_ingress(struct net_bridge *br, struct net_port_vlans *v, * vlan untagged or priority-tagged traffic belongs to. */ if (pvid == VLAN_N_VID) - return false; + goto drop; /* PVID is set on this port. Any untagged or priority-tagged * ingress frame is considered to belong to this vlan. @@ -216,7 +216,8 @@ bool br_allowed_ingress(struct net_bridge *br, struct net_port_vlans *v, /* Frame had a valid vlan tag. See if vlan is allowed */ if (test_bit(*vid, v->vlan_bitmap)) return true; - +drop: + kfree_skb(skb); return false; } diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c index 0e474b13463b..1059ed3bc255 100644 --- a/net/bridge/netfilter/ebtables.c +++ b/net/bridge/netfilter/ebtables.c @@ -1044,10 +1044,9 @@ static int do_replace_finish(struct net *net, struct ebt_replace *repl, if (repl->num_counters && copy_to_user(repl->counters, counterstmp, repl->num_counters * sizeof(struct ebt_counter))) { - ret = -EFAULT; + /* Silent error, can't fail, new table is already in place */ + net_warn_ratelimited("ebtables: counters copy to user failed while replacing table\n"); } - else - ret = 0; /* decrease module count and free resources */ EBT_ENTRY_ITERATE(table->entries, table->entries_size, diff --git a/net/caif/caif_socket.c b/net/caif/caif_socket.c index d6be3edb7a43..e8437094d15f 100644 --- a/net/caif/caif_socket.c +++ b/net/caif/caif_socket.c @@ -124,7 +124,6 @@ static void caif_flow_ctrl(struct sock *sk, int mode) static int caif_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) { int err; - int skb_len; unsigned long flags; struct sk_buff_head *list = &sk->sk_receive_queue; struct caifsock *cf_sk = container_of(sk, struct caifsock, sk); @@ -153,14 +152,13 @@ static int caif_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) * may be freed by other threads of control pulling packets * from the queue. */ - skb_len = skb->len; spin_lock_irqsave(&list->lock, flags); if (!sock_flag(sk, SOCK_DEAD)) __skb_queue_tail(list, skb); spin_unlock_irqrestore(&list->lock, flags); if (!sock_flag(sk, SOCK_DEAD)) - sk->sk_data_ready(sk, skb_len); + sk->sk_data_ready(sk); else kfree_skb(skb); return 0; diff --git a/net/ceph/crush/mapper.c b/net/ceph/crush/mapper.c index b703790b4e44..a1ef53c04415 100644 --- a/net/ceph/crush/mapper.c +++ b/net/ceph/crush/mapper.c @@ -292,10 +292,12 @@ static int is_out(const struct crush_map *map, * @outpos: our position in that vector * @tries: number of attempts to make * @recurse_tries: number of attempts to have recursive chooseleaf make - * @local_tries: localized retries - * @local_fallback_tries: localized fallback retries + * @local_retries: localized retries + * @local_fallback_retries: localized fallback retries * @recurse_to_leaf: true if we want one device under each item of given type (chooseleaf instead of choose) + * @vary_r: pass r to recursive calls * @out2: second output vector for leaf items (if @recurse_to_leaf) + * @parent_r: r value passed from the parent */ static int crush_choose_firstn(const struct crush_map *map, struct crush_bucket *bucket, @@ -304,10 +306,12 @@ static int crush_choose_firstn(const struct crush_map *map, int *out, int outpos, unsigned int tries, unsigned int recurse_tries, - unsigned int local_tries, - unsigned int local_fallback_tries, + unsigned int local_retries, + unsigned int local_fallback_retries, int recurse_to_leaf, - int *out2) + unsigned int vary_r, + int *out2, + int parent_r) { int rep; unsigned int ftotal, flocal; @@ -319,8 +323,11 @@ static int crush_choose_firstn(const struct crush_map *map, int itemtype; int collide, reject; - dprintk("CHOOSE%s bucket %d x %d outpos %d numrep %d\n", recurse_to_leaf ? "_LEAF" : "", - bucket->id, x, outpos, numrep); + dprintk("CHOOSE%s bucket %d x %d outpos %d numrep %d tries %d recurse_tries %d local_retries %d local_fallback_retries %d parent_r %d\n", + recurse_to_leaf ? "_LEAF" : "", + bucket->id, x, outpos, numrep, + tries, recurse_tries, local_retries, local_fallback_retries, + parent_r); for (rep = outpos; rep < numrep; rep++) { /* keep trying until we get a non-out, non-colliding item */ @@ -335,7 +342,7 @@ static int crush_choose_firstn(const struct crush_map *map, do { collide = 0; retry_bucket = 0; - r = rep; + r = rep + parent_r; /* r' = r + f_total */ r += ftotal; @@ -344,9 +351,9 @@ static int crush_choose_firstn(const struct crush_map *map, reject = 1; goto reject; } - if (local_fallback_tries > 0 && + if (local_fallback_retries > 0 && flocal >= (in->size>>1) && - flocal > local_fallback_tries) + flocal > local_fallback_retries) item = bucket_perm_choose(in, x, r); else item = crush_bucket_choose(in, x, r); @@ -387,16 +394,23 @@ static int crush_choose_firstn(const struct crush_map *map, reject = 0; if (!collide && recurse_to_leaf) { if (item < 0) { + int sub_r; + if (vary_r) + sub_r = r >> (vary_r-1); + else + sub_r = 0; if (crush_choose_firstn(map, map->buckets[-1-item], weight, weight_max, x, outpos+1, 0, out2, outpos, recurse_tries, 0, - local_tries, - local_fallback_tries, + local_retries, + local_fallback_retries, 0, - NULL) <= outpos) + vary_r, + NULL, + sub_r) <= outpos) /* didn't get leaf */ reject = 1; } else { @@ -420,14 +434,14 @@ reject: ftotal++; flocal++; - if (collide && flocal <= local_tries) + if (collide && flocal <= local_retries) /* retry locally a few times */ retry_bucket = 1; - else if (local_fallback_tries > 0 && - flocal <= in->size + local_fallback_tries) + else if (local_fallback_retries > 0 && + flocal <= in->size + local_fallback_retries) /* exhaustive bucket search */ retry_bucket = 1; - else if (ftotal <= tries) + else if (ftotal < tries) /* then retry descent */ retry_descent = 1; else @@ -640,10 +654,20 @@ int crush_do_rule(const struct crush_map *map, __u32 step; int i, j; int numrep; - int choose_tries = map->choose_total_tries; - int choose_local_tries = map->choose_local_tries; - int choose_local_fallback_tries = map->choose_local_fallback_tries; + /* + * the original choose_total_tries value was off by one (it + * counted "retries" and not "tries"). add one. + */ + int choose_tries = map->choose_total_tries + 1; int choose_leaf_tries = 0; + /* + * the local tries values were counted as "retries", though, + * and need no adjustment + */ + int choose_local_retries = map->choose_local_tries; + int choose_local_fallback_retries = map->choose_local_fallback_tries; + + int vary_r = map->chooseleaf_vary_r; if ((__u32)ruleno >= map->max_rules) { dprintk(" bad ruleno %d\n", ruleno); @@ -676,13 +700,18 @@ int crush_do_rule(const struct crush_map *map, break; case CRUSH_RULE_SET_CHOOSE_LOCAL_TRIES: - if (curstep->arg1 > 0) - choose_local_tries = curstep->arg1; + if (curstep->arg1 >= 0) + choose_local_retries = curstep->arg1; break; case CRUSH_RULE_SET_CHOOSE_LOCAL_FALLBACK_TRIES: - if (curstep->arg1 > 0) - choose_local_fallback_tries = curstep->arg1; + if (curstep->arg1 >= 0) + choose_local_fallback_retries = curstep->arg1; + break; + + case CRUSH_RULE_SET_CHOOSELEAF_VARY_R: + if (curstep->arg1 >= 0) + vary_r = curstep->arg1; break; case CRUSH_RULE_CHOOSELEAF_FIRSTN: @@ -734,10 +763,12 @@ int crush_do_rule(const struct crush_map *map, o+osize, j, choose_tries, recurse_tries, - choose_local_tries, - choose_local_fallback_tries, + choose_local_retries, + choose_local_fallback_retries, recurse_to_leaf, - c+osize); + vary_r, + c+osize, + 0); } else { crush_choose_indep( map, diff --git a/net/ceph/debugfs.c b/net/ceph/debugfs.c index 258a382e75ed..10421a4b76f8 100644 --- a/net/ceph/debugfs.c +++ b/net/ceph/debugfs.c @@ -53,34 +53,55 @@ static int osdmap_show(struct seq_file *s, void *p) { int i; struct ceph_client *client = s->private; + struct ceph_osdmap *map = client->osdc.osdmap; struct rb_node *n; - if (client->osdc.osdmap == NULL) + if (map == NULL) return 0; - seq_printf(s, "epoch %d\n", client->osdc.osdmap->epoch); + + seq_printf(s, "epoch %d\n", map->epoch); seq_printf(s, "flags%s%s\n", - (client->osdc.osdmap->flags & CEPH_OSDMAP_NEARFULL) ? - " NEARFULL" : "", - (client->osdc.osdmap->flags & CEPH_OSDMAP_FULL) ? - " FULL" : ""); - for (n = rb_first(&client->osdc.osdmap->pg_pools); n; n = rb_next(n)) { + (map->flags & CEPH_OSDMAP_NEARFULL) ? " NEARFULL" : "", + (map->flags & CEPH_OSDMAP_FULL) ? " FULL" : ""); + + for (n = rb_first(&map->pg_pools); n; n = rb_next(n)) { struct ceph_pg_pool_info *pool = rb_entry(n, struct ceph_pg_pool_info, node); - seq_printf(s, "pg_pool %llu pg_num %d / %d\n", - (unsigned long long)pool->id, pool->pg_num, - pool->pg_num_mask); + + seq_printf(s, "pool %lld pg_num %u (%d) read_tier %lld write_tier %lld\n", + pool->id, pool->pg_num, pool->pg_num_mask, + pool->read_tier, pool->write_tier); } - for (i = 0; i < client->osdc.osdmap->max_osd; i++) { - struct ceph_entity_addr *addr = - &client->osdc.osdmap->osd_addr[i]; - int state = client->osdc.osdmap->osd_state[i]; + for (i = 0; i < map->max_osd; i++) { + struct ceph_entity_addr *addr = &map->osd_addr[i]; + int state = map->osd_state[i]; char sb[64]; - seq_printf(s, "\tosd%d\t%s\t%3d%%\t(%s)\n", + seq_printf(s, "osd%d\t%s\t%3d%%\t(%s)\t%3d%%\n", i, ceph_pr_addr(&addr->in_addr), - ((client->osdc.osdmap->osd_weight[i]*100) >> 16), - ceph_osdmap_state_str(sb, sizeof(sb), state)); + ((map->osd_weight[i]*100) >> 16), + ceph_osdmap_state_str(sb, sizeof(sb), state), + ((ceph_get_primary_affinity(map, i)*100) >> 16)); + } + for (n = rb_first(&map->pg_temp); n; n = rb_next(n)) { + struct ceph_pg_mapping *pg = + rb_entry(n, struct ceph_pg_mapping, node); + + seq_printf(s, "pg_temp %llu.%x [", pg->pgid.pool, + pg->pgid.seed); + for (i = 0; i < pg->pg_temp.len; i++) + seq_printf(s, "%s%d", (i == 0 ? "" : ","), + pg->pg_temp.osds[i]); + seq_printf(s, "]\n"); } + for (n = rb_first(&map->primary_temp); n; n = rb_next(n)) { + struct ceph_pg_mapping *pg = + rb_entry(n, struct ceph_pg_mapping, node); + + seq_printf(s, "primary_temp %llu.%x %d\n", pg->pgid.pool, + pg->pgid.seed, pg->primary_temp.osd); + } + return 0; } diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index 30efc5c18622..dac7f9b98687 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -383,7 +383,7 @@ static void con_sock_state_closed(struct ceph_connection *con) */ /* data available on socket, or listen socket received a connect */ -static void ceph_sock_data_ready(struct sock *sk, int count_unused) +static void ceph_sock_data_ready(struct sock *sk) { struct ceph_connection *con = sk->sk_user_data; if (atomic_read(&con->msgr->stopping)) { @@ -919,6 +919,9 @@ static bool ceph_msg_data_pages_advance(struct ceph_msg_data_cursor *cursor, if (!bytes || cursor->page_offset) return false; /* more bytes to process in the current page */ + if (!cursor->resid) + return false; /* no more data */ + /* Move on to the next page; offset is already at 0 */ BUG_ON(cursor->page_index >= cursor->page_count); @@ -1004,6 +1007,9 @@ static bool ceph_msg_data_pagelist_advance(struct ceph_msg_data_cursor *cursor, if (!bytes || cursor->offset & ~PAGE_MASK) return false; /* more bytes to process in the current page */ + if (!cursor->resid) + return false; /* no more data */ + /* Move on to the next page */ BUG_ON(list_is_last(&cursor->page->lru, &pagelist->head)); diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 82750f915865..b0dfce77656a 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -436,6 +436,7 @@ static bool osd_req_opcode_valid(u16 opcode) case CEPH_OSD_OP_OMAPCLEAR: case CEPH_OSD_OP_OMAPRMKEYS: case CEPH_OSD_OP_OMAP_CMP: + case CEPH_OSD_OP_SETALLOCHINT: case CEPH_OSD_OP_CLONERANGE: case CEPH_OSD_OP_ASSERT_SRC_VERSION: case CEPH_OSD_OP_SRC_CMPXATTR: @@ -591,6 +592,26 @@ void osd_req_op_watch_init(struct ceph_osd_request *osd_req, } EXPORT_SYMBOL(osd_req_op_watch_init); +void osd_req_op_alloc_hint_init(struct ceph_osd_request *osd_req, + unsigned int which, + u64 expected_object_size, + u64 expected_write_size) +{ + struct ceph_osd_req_op *op = _osd_req_op_init(osd_req, which, + CEPH_OSD_OP_SETALLOCHINT); + + op->alloc_hint.expected_object_size = expected_object_size; + op->alloc_hint.expected_write_size = expected_write_size; + + /* + * CEPH_OSD_OP_SETALLOCHINT op is advisory and therefore deemed + * not worth a feature bit. Set FAILOK per-op flag to make + * sure older osds don't trip over an unsupported opcode. + */ + op->flags |= CEPH_OSD_OP_FLAG_FAILOK; +} +EXPORT_SYMBOL(osd_req_op_alloc_hint_init); + static void ceph_osdc_msg_data_add(struct ceph_msg *msg, struct ceph_osd_data *osd_data) { @@ -681,6 +702,12 @@ static u64 osd_req_encode_op(struct ceph_osd_request *req, dst->watch.ver = cpu_to_le64(src->watch.ver); dst->watch.flag = src->watch.flag; break; + case CEPH_OSD_OP_SETALLOCHINT: + dst->alloc_hint.expected_object_size = + cpu_to_le64(src->alloc_hint.expected_object_size); + dst->alloc_hint.expected_write_size = + cpu_to_le64(src->alloc_hint.expected_write_size); + break; default: pr_err("unsupported osd opcode %s\n", ceph_osd_op_name(src->op)); @@ -688,7 +715,9 @@ static u64 osd_req_encode_op(struct ceph_osd_request *req, return 0; } + dst->op = cpu_to_le16(src->op); + dst->flags = cpu_to_le32(src->flags); dst->payload_len = cpu_to_le32(src->payload_len); return request_data_len; @@ -1304,7 +1333,7 @@ static int __map_request(struct ceph_osd_client *osdc, { struct ceph_pg pgid; int acting[CEPH_PG_MAX_SIZE]; - int o = -1, num = 0; + int num, o; int err; bool was_paused; @@ -1317,11 +1346,9 @@ static int __map_request(struct ceph_osd_client *osdc, } req->r_pgid = pgid; - err = ceph_calc_pg_acting(osdc->osdmap, pgid, acting); - if (err > 0) { - o = acting[0]; - num = err; - } + num = ceph_calc_pg_acting(osdc->osdmap, pgid, acting, &o); + if (num < 0) + num = 0; was_paused = req->r_paused; req->r_paused = __req_should_be_paused(osdc, req); @@ -2033,7 +2060,7 @@ void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg) int skipped_map = 0; dout("taking full map %u len %d\n", epoch, maplen); - newmap = osdmap_decode(&p, p+maplen); + newmap = ceph_osdmap_decode(&p, p+maplen); if (IS_ERR(newmap)) { err = PTR_ERR(newmap); goto bad; diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index aade4a5c1c07..e632b5a52f5b 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c @@ -343,7 +343,7 @@ bad: /* * rbtree of pg_mapping for handling pg_temp (explicit mapping of pgid - * to a set of osds) + * to a set of osds) and primary_temp (explicit primary setting) */ static int pgid_cmp(struct ceph_pg l, struct ceph_pg r) { @@ -506,7 +506,7 @@ static void __remove_pg_pool(struct rb_root *root, struct ceph_pg_pool_info *pi) kfree(pi); } -static int __decode_pool(void **p, void *end, struct ceph_pg_pool_info *pi) +static int decode_pool(void **p, void *end, struct ceph_pg_pool_info *pi) { u8 ev, cv; unsigned len, num; @@ -587,7 +587,7 @@ bad: return -EINVAL; } -static int __decode_pool_names(void **p, void *end, struct ceph_osdmap *map) +static int decode_pool_names(void **p, void *end, struct ceph_osdmap *map) { struct ceph_pg_pool_info *pi; u32 num, len; @@ -633,6 +633,13 @@ void ceph_osdmap_destroy(struct ceph_osdmap *map) rb_erase(&pg->node, &map->pg_temp); kfree(pg); } + while (!RB_EMPTY_ROOT(&map->primary_temp)) { + struct ceph_pg_mapping *pg = + rb_entry(rb_first(&map->primary_temp), + struct ceph_pg_mapping, node); + rb_erase(&pg->node, &map->primary_temp); + kfree(pg); + } while (!RB_EMPTY_ROOT(&map->pg_pools)) { struct ceph_pg_pool_info *pi = rb_entry(rb_first(&map->pg_pools), @@ -642,186 +649,516 @@ void ceph_osdmap_destroy(struct ceph_osdmap *map) kfree(map->osd_state); kfree(map->osd_weight); kfree(map->osd_addr); + kfree(map->osd_primary_affinity); kfree(map); } /* - * adjust max osd value. reallocate arrays. + * Adjust max_osd value, (re)allocate arrays. + * + * The new elements are properly initialized. */ static int osdmap_set_max_osd(struct ceph_osdmap *map, int max) { u8 *state; - struct ceph_entity_addr *addr; u32 *weight; + struct ceph_entity_addr *addr; + int i; - state = kcalloc(max, sizeof(*state), GFP_NOFS); - addr = kcalloc(max, sizeof(*addr), GFP_NOFS); - weight = kcalloc(max, sizeof(*weight), GFP_NOFS); - if (state == NULL || addr == NULL || weight == NULL) { + state = krealloc(map->osd_state, max*sizeof(*state), GFP_NOFS); + weight = krealloc(map->osd_weight, max*sizeof(*weight), GFP_NOFS); + addr = krealloc(map->osd_addr, max*sizeof(*addr), GFP_NOFS); + if (!state || !weight || !addr) { kfree(state); - kfree(addr); kfree(weight); + kfree(addr); + return -ENOMEM; } - /* copy old? */ - if (map->osd_state) { - memcpy(state, map->osd_state, map->max_osd*sizeof(*state)); - memcpy(addr, map->osd_addr, map->max_osd*sizeof(*addr)); - memcpy(weight, map->osd_weight, map->max_osd*sizeof(*weight)); - kfree(map->osd_state); - kfree(map->osd_addr); - kfree(map->osd_weight); + for (i = map->max_osd; i < max; i++) { + state[i] = 0; + weight[i] = CEPH_OSD_OUT; + memset(addr + i, 0, sizeof(*addr)); } map->osd_state = state; map->osd_weight = weight; map->osd_addr = addr; + + if (map->osd_primary_affinity) { + u32 *affinity; + + affinity = krealloc(map->osd_primary_affinity, + max*sizeof(*affinity), GFP_NOFS); + if (!affinity) + return -ENOMEM; + + for (i = map->max_osd; i < max; i++) + affinity[i] = CEPH_OSD_DEFAULT_PRIMARY_AFFINITY; + + map->osd_primary_affinity = affinity; + } + map->max_osd = max; + return 0; } +#define OSDMAP_WRAPPER_COMPAT_VER 7 +#define OSDMAP_CLIENT_DATA_COMPAT_VER 1 + /* - * decode a full map. + * Return 0 or error. On success, *v is set to 0 for old (v6) osdmaps, + * to struct_v of the client_data section for new (v7 and above) + * osdmaps. */ -struct ceph_osdmap *osdmap_decode(void **p, void *end) +static int get_osdmap_client_data_v(void **p, void *end, + const char *prefix, u8 *v) { - struct ceph_osdmap *map; - u16 version; - u32 len, max, i; - int err = -EINVAL; - void *start = *p; - struct ceph_pg_pool_info *pi; + u8 struct_v; + + ceph_decode_8_safe(p, end, struct_v, e_inval); + if (struct_v >= 7) { + u8 struct_compat; + + ceph_decode_8_safe(p, end, struct_compat, e_inval); + if (struct_compat > OSDMAP_WRAPPER_COMPAT_VER) { + pr_warning("got v %d cv %d > %d of %s ceph_osdmap\n", + struct_v, struct_compat, + OSDMAP_WRAPPER_COMPAT_VER, prefix); + return -EINVAL; + } + *p += 4; /* ignore wrapper struct_len */ + + ceph_decode_8_safe(p, end, struct_v, e_inval); + ceph_decode_8_safe(p, end, struct_compat, e_inval); + if (struct_compat > OSDMAP_CLIENT_DATA_COMPAT_VER) { + pr_warning("got v %d cv %d > %d of %s ceph_osdmap client data\n", + struct_v, struct_compat, + OSDMAP_CLIENT_DATA_COMPAT_VER, prefix); + return -EINVAL; + } + *p += 4; /* ignore client data struct_len */ + } else { + u16 version; + + *p -= 1; + ceph_decode_16_safe(p, end, version, e_inval); + if (version < 6) { + pr_warning("got v %d < 6 of %s ceph_osdmap\n", version, + prefix); + return -EINVAL; + } - dout("osdmap_decode %p to %p len %d\n", *p, end, (int)(end - *p)); + /* old osdmap enconding */ + struct_v = 0; + } - map = kzalloc(sizeof(*map), GFP_NOFS); - if (map == NULL) - return ERR_PTR(-ENOMEM); - map->pg_temp = RB_ROOT; + *v = struct_v; + return 0; - ceph_decode_16_safe(p, end, version, bad); - if (version > 6) { - pr_warning("got unknown v %d > 6 of osdmap\n", version); - goto bad; +e_inval: + return -EINVAL; +} + +static int __decode_pools(void **p, void *end, struct ceph_osdmap *map, + bool incremental) +{ + u32 n; + + ceph_decode_32_safe(p, end, n, e_inval); + while (n--) { + struct ceph_pg_pool_info *pi; + u64 pool; + int ret; + + ceph_decode_64_safe(p, end, pool, e_inval); + + pi = __lookup_pg_pool(&map->pg_pools, pool); + if (!incremental || !pi) { + pi = kzalloc(sizeof(*pi), GFP_NOFS); + if (!pi) + return -ENOMEM; + + pi->id = pool; + + ret = __insert_pg_pool(&map->pg_pools, pi); + if (ret) { + kfree(pi); + return ret; + } + } + + ret = decode_pool(p, end, pi); + if (ret) + return ret; } - if (version < 6) { - pr_warning("got old v %d < 6 of osdmap\n", version); - goto bad; + + return 0; + +e_inval: + return -EINVAL; +} + +static int decode_pools(void **p, void *end, struct ceph_osdmap *map) +{ + return __decode_pools(p, end, map, false); +} + +static int decode_new_pools(void **p, void *end, struct ceph_osdmap *map) +{ + return __decode_pools(p, end, map, true); +} + +static int __decode_pg_temp(void **p, void *end, struct ceph_osdmap *map, + bool incremental) +{ + u32 n; + + ceph_decode_32_safe(p, end, n, e_inval); + while (n--) { + struct ceph_pg pgid; + u32 len, i; + int ret; + + ret = ceph_decode_pgid(p, end, &pgid); + if (ret) + return ret; + + ceph_decode_32_safe(p, end, len, e_inval); + + ret = __remove_pg_mapping(&map->pg_temp, pgid); + BUG_ON(!incremental && ret != -ENOENT); + + if (!incremental || len > 0) { + struct ceph_pg_mapping *pg; + + ceph_decode_need(p, end, len*sizeof(u32), e_inval); + + if (len > (UINT_MAX - sizeof(*pg)) / sizeof(u32)) + return -EINVAL; + + pg = kzalloc(sizeof(*pg) + len*sizeof(u32), GFP_NOFS); + if (!pg) + return -ENOMEM; + + pg->pgid = pgid; + pg->pg_temp.len = len; + for (i = 0; i < len; i++) + pg->pg_temp.osds[i] = ceph_decode_32(p); + + ret = __insert_pg_mapping(pg, &map->pg_temp); + if (ret) { + kfree(pg); + return ret; + } + } } - ceph_decode_need(p, end, 2*sizeof(u64)+6*sizeof(u32), bad); + return 0; + +e_inval: + return -EINVAL; +} + +static int decode_pg_temp(void **p, void *end, struct ceph_osdmap *map) +{ + return __decode_pg_temp(p, end, map, false); +} + +static int decode_new_pg_temp(void **p, void *end, struct ceph_osdmap *map) +{ + return __decode_pg_temp(p, end, map, true); +} + +static int __decode_primary_temp(void **p, void *end, struct ceph_osdmap *map, + bool incremental) +{ + u32 n; + + ceph_decode_32_safe(p, end, n, e_inval); + while (n--) { + struct ceph_pg pgid; + u32 osd; + int ret; + + ret = ceph_decode_pgid(p, end, &pgid); + if (ret) + return ret; + + ceph_decode_32_safe(p, end, osd, e_inval); + + ret = __remove_pg_mapping(&map->primary_temp, pgid); + BUG_ON(!incremental && ret != -ENOENT); + + if (!incremental || osd != (u32)-1) { + struct ceph_pg_mapping *pg; + + pg = kzalloc(sizeof(*pg), GFP_NOFS); + if (!pg) + return -ENOMEM; + + pg->pgid = pgid; + pg->primary_temp.osd = osd; + + ret = __insert_pg_mapping(pg, &map->primary_temp); + if (ret) { + kfree(pg); + return ret; + } + } + } + + return 0; + +e_inval: + return -EINVAL; +} + +static int decode_primary_temp(void **p, void *end, struct ceph_osdmap *map) +{ + return __decode_primary_temp(p, end, map, false); +} + +static int decode_new_primary_temp(void **p, void *end, + struct ceph_osdmap *map) +{ + return __decode_primary_temp(p, end, map, true); +} + +u32 ceph_get_primary_affinity(struct ceph_osdmap *map, int osd) +{ + BUG_ON(osd >= map->max_osd); + + if (!map->osd_primary_affinity) + return CEPH_OSD_DEFAULT_PRIMARY_AFFINITY; + + return map->osd_primary_affinity[osd]; +} + +static int set_primary_affinity(struct ceph_osdmap *map, int osd, u32 aff) +{ + BUG_ON(osd >= map->max_osd); + + if (!map->osd_primary_affinity) { + int i; + + map->osd_primary_affinity = kmalloc(map->max_osd*sizeof(u32), + GFP_NOFS); + if (!map->osd_primary_affinity) + return -ENOMEM; + + for (i = 0; i < map->max_osd; i++) + map->osd_primary_affinity[i] = + CEPH_OSD_DEFAULT_PRIMARY_AFFINITY; + } + + map->osd_primary_affinity[osd] = aff; + + return 0; +} + +static int decode_primary_affinity(void **p, void *end, + struct ceph_osdmap *map) +{ + u32 len, i; + + ceph_decode_32_safe(p, end, len, e_inval); + if (len == 0) { + kfree(map->osd_primary_affinity); + map->osd_primary_affinity = NULL; + return 0; + } + if (len != map->max_osd) + goto e_inval; + + ceph_decode_need(p, end, map->max_osd*sizeof(u32), e_inval); + + for (i = 0; i < map->max_osd; i++) { + int ret; + + ret = set_primary_affinity(map, i, ceph_decode_32(p)); + if (ret) + return ret; + } + + return 0; + +e_inval: + return -EINVAL; +} + +static int decode_new_primary_affinity(void **p, void *end, + struct ceph_osdmap *map) +{ + u32 n; + + ceph_decode_32_safe(p, end, n, e_inval); + while (n--) { + u32 osd, aff; + int ret; + + ceph_decode_32_safe(p, end, osd, e_inval); + ceph_decode_32_safe(p, end, aff, e_inval); + + ret = set_primary_affinity(map, osd, aff); + if (ret) + return ret; + + pr_info("osd%d primary-affinity 0x%x\n", osd, aff); + } + + return 0; + +e_inval: + return -EINVAL; +} + +/* + * decode a full map. + */ +static int osdmap_decode(void **p, void *end, struct ceph_osdmap *map) +{ + u8 struct_v; + u32 epoch = 0; + void *start = *p; + u32 max; + u32 len, i; + int err; + + dout("%s %p to %p len %d\n", __func__, *p, end, (int)(end - *p)); + + err = get_osdmap_client_data_v(p, end, "full", &struct_v); + if (err) + goto bad; + + /* fsid, epoch, created, modified */ + ceph_decode_need(p, end, sizeof(map->fsid) + sizeof(u32) + + sizeof(map->created) + sizeof(map->modified), e_inval); ceph_decode_copy(p, &map->fsid, sizeof(map->fsid)); - map->epoch = ceph_decode_32(p); + epoch = map->epoch = ceph_decode_32(p); ceph_decode_copy(p, &map->created, sizeof(map->created)); ceph_decode_copy(p, &map->modified, sizeof(map->modified)); - ceph_decode_32_safe(p, end, max, bad); - while (max--) { - ceph_decode_need(p, end, 8 + 2, bad); - err = -ENOMEM; - pi = kzalloc(sizeof(*pi), GFP_NOFS); - if (!pi) - goto bad; - pi->id = ceph_decode_64(p); - err = __decode_pool(p, end, pi); - if (err < 0) { - kfree(pi); - goto bad; - } - __insert_pg_pool(&map->pg_pools, pi); - } + /* pools */ + err = decode_pools(p, end, map); + if (err) + goto bad; - err = __decode_pool_names(p, end, map); - if (err < 0) { - dout("fail to decode pool names"); + /* pool_name */ + err = decode_pool_names(p, end, map); + if (err) goto bad; - } - ceph_decode_32_safe(p, end, map->pool_max, bad); + ceph_decode_32_safe(p, end, map->pool_max, e_inval); - ceph_decode_32_safe(p, end, map->flags, bad); + ceph_decode_32_safe(p, end, map->flags, e_inval); - max = ceph_decode_32(p); + /* max_osd */ + ceph_decode_32_safe(p, end, max, e_inval); /* (re)alloc osd arrays */ err = osdmap_set_max_osd(map, max); - if (err < 0) + if (err) goto bad; - dout("osdmap_decode max_osd = %d\n", map->max_osd); - /* osds */ - err = -EINVAL; + /* osd_state, osd_weight, osd_addrs->client_addr */ ceph_decode_need(p, end, 3*sizeof(u32) + map->max_osd*(1 + sizeof(*map->osd_weight) + - sizeof(*map->osd_addr)), bad); - *p += 4; /* skip length field (should match max) */ + sizeof(*map->osd_addr)), e_inval); + + if (ceph_decode_32(p) != map->max_osd) + goto e_inval; + ceph_decode_copy(p, map->osd_state, map->max_osd); - *p += 4; /* skip length field (should match max) */ + if (ceph_decode_32(p) != map->max_osd) + goto e_inval; + for (i = 0; i < map->max_osd; i++) map->osd_weight[i] = ceph_decode_32(p); - *p += 4; /* skip length field (should match max) */ + if (ceph_decode_32(p) != map->max_osd) + goto e_inval; + ceph_decode_copy(p, map->osd_addr, map->max_osd*sizeof(*map->osd_addr)); for (i = 0; i < map->max_osd; i++) ceph_decode_addr(&map->osd_addr[i]); /* pg_temp */ - ceph_decode_32_safe(p, end, len, bad); - for (i = 0; i < len; i++) { - int n, j; - struct ceph_pg pgid; - struct ceph_pg_mapping *pg; + err = decode_pg_temp(p, end, map); + if (err) + goto bad; - err = ceph_decode_pgid(p, end, &pgid); + /* primary_temp */ + if (struct_v >= 1) { + err = decode_primary_temp(p, end, map); if (err) goto bad; - ceph_decode_need(p, end, sizeof(u32), bad); - n = ceph_decode_32(p); - err = -EINVAL; - if (n > (UINT_MAX - sizeof(*pg)) / sizeof(u32)) - goto bad; - ceph_decode_need(p, end, n * sizeof(u32), bad); - err = -ENOMEM; - pg = kmalloc(sizeof(*pg) + n*sizeof(u32), GFP_NOFS); - if (!pg) - goto bad; - pg->pgid = pgid; - pg->len = n; - for (j = 0; j < n; j++) - pg->osds[j] = ceph_decode_32(p); + } - err = __insert_pg_mapping(pg, &map->pg_temp); + /* primary_affinity */ + if (struct_v >= 2) { + err = decode_primary_affinity(p, end, map); if (err) goto bad; - dout(" added pg_temp %lld.%x len %d\n", pgid.pool, pgid.seed, - len); + } else { + /* XXX can this happen? */ + kfree(map->osd_primary_affinity); + map->osd_primary_affinity = NULL; } /* crush */ - ceph_decode_32_safe(p, end, len, bad); - dout("osdmap_decode crush len %d from off 0x%x\n", len, - (int)(*p - start)); - ceph_decode_need(p, end, len, bad); - map->crush = crush_decode(*p, end); - *p += len; + ceph_decode_32_safe(p, end, len, e_inval); + map->crush = crush_decode(*p, min(*p + len, end)); if (IS_ERR(map->crush)) { err = PTR_ERR(map->crush); map->crush = NULL; goto bad; } + *p += len; - /* ignore the rest of the map */ + /* ignore the rest */ *p = end; - dout("osdmap_decode done %p %p\n", *p, end); - return map; + dout("full osdmap epoch %d max_osd %d\n", map->epoch, map->max_osd); + return 0; +e_inval: + err = -EINVAL; bad: - dout("osdmap_decode fail err %d\n", err); - ceph_osdmap_destroy(map); - return ERR_PTR(err); + pr_err("corrupt full osdmap (%d) epoch %d off %d (%p of %p-%p)\n", + err, epoch, (int)(*p - start), *p, start, end); + print_hex_dump(KERN_DEBUG, "osdmap: ", + DUMP_PREFIX_OFFSET, 16, 1, + start, end - start, true); + return err; +} + +/* + * Allocate and decode a full map. + */ +struct ceph_osdmap *ceph_osdmap_decode(void **p, void *end) +{ + struct ceph_osdmap *map; + int ret; + + map = kzalloc(sizeof(*map), GFP_NOFS); + if (!map) + return ERR_PTR(-ENOMEM); + + map->pg_temp = RB_ROOT; + map->primary_temp = RB_ROOT; + mutex_init(&map->crush_scratch_mutex); + + ret = osdmap_decode(p, end, map); + if (ret) { + ceph_osdmap_destroy(map); + return ERR_PTR(ret); + } + + return map; } /* @@ -840,17 +1177,18 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, __s64 new_pool_max; __s32 new_flags, max; void *start = *p; - int err = -EINVAL; - u16 version; + int err; + u8 struct_v; + + dout("%s %p to %p len %d\n", __func__, *p, end, (int)(end - *p)); - ceph_decode_16_safe(p, end, version, bad); - if (version != 6) { - pr_warning("got unknown v %d != 6 of inc osdmap\n", version); + err = get_osdmap_client_data_v(p, end, "inc", &struct_v); + if (err) goto bad; - } - ceph_decode_need(p, end, sizeof(fsid)+sizeof(modified)+2*sizeof(u32), - bad); + /* fsid, epoch, modified, new_pool_max, new_flags */ + ceph_decode_need(p, end, sizeof(fsid) + sizeof(u32) + sizeof(modified) + + sizeof(u64) + sizeof(u32), e_inval); ceph_decode_copy(p, &fsid, sizeof(fsid)); epoch = ceph_decode_32(p); BUG_ON(epoch != map->epoch+1); @@ -859,21 +1197,22 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, new_flags = ceph_decode_32(p); /* full map? */ - ceph_decode_32_safe(p, end, len, bad); + ceph_decode_32_safe(p, end, len, e_inval); if (len > 0) { dout("apply_incremental full map len %d, %p to %p\n", len, *p, end); - return osdmap_decode(p, min(*p+len, end)); + return ceph_osdmap_decode(p, min(*p+len, end)); } /* new crush? */ - ceph_decode_32_safe(p, end, len, bad); + ceph_decode_32_safe(p, end, len, e_inval); if (len > 0) { - dout("apply_incremental new crush map len %d, %p to %p\n", - len, *p, end); newcrush = crush_decode(*p, min(*p+len, end)); - if (IS_ERR(newcrush)) - return ERR_CAST(newcrush); + if (IS_ERR(newcrush)) { + err = PTR_ERR(newcrush); + newcrush = NULL; + goto bad; + } *p += len; } @@ -883,13 +1222,11 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, if (new_pool_max >= 0) map->pool_max = new_pool_max; - ceph_decode_need(p, end, 5*sizeof(u32), bad); - /* new max? */ - max = ceph_decode_32(p); + ceph_decode_32_safe(p, end, max, e_inval); if (max >= 0) { err = osdmap_set_max_osd(map, max); - if (err < 0) + if (err) goto bad; } @@ -902,51 +1239,34 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, newcrush = NULL; } - /* new_pool */ - ceph_decode_32_safe(p, end, len, bad); - while (len--) { - struct ceph_pg_pool_info *pi; + /* new_pools */ + err = decode_new_pools(p, end, map); + if (err) + goto bad; - ceph_decode_64_safe(p, end, pool, bad); - pi = __lookup_pg_pool(&map->pg_pools, pool); - if (!pi) { - pi = kzalloc(sizeof(*pi), GFP_NOFS); - if (!pi) { - err = -ENOMEM; - goto bad; - } - pi->id = pool; - __insert_pg_pool(&map->pg_pools, pi); - } - err = __decode_pool(p, end, pi); - if (err < 0) - goto bad; - } - if (version >= 5) { - err = __decode_pool_names(p, end, map); - if (err < 0) - goto bad; - } + /* new_pool_names */ + err = decode_pool_names(p, end, map); + if (err) + goto bad; /* old_pool */ - ceph_decode_32_safe(p, end, len, bad); + ceph_decode_32_safe(p, end, len, e_inval); while (len--) { struct ceph_pg_pool_info *pi; - ceph_decode_64_safe(p, end, pool, bad); + ceph_decode_64_safe(p, end, pool, e_inval); pi = __lookup_pg_pool(&map->pg_pools, pool); if (pi) __remove_pg_pool(&map->pg_pools, pi); } /* new_up */ - err = -EINVAL; - ceph_decode_32_safe(p, end, len, bad); + ceph_decode_32_safe(p, end, len, e_inval); while (len--) { u32 osd; struct ceph_entity_addr addr; - ceph_decode_32_safe(p, end, osd, bad); - ceph_decode_copy_safe(p, end, &addr, sizeof(addr), bad); + ceph_decode_32_safe(p, end, osd, e_inval); + ceph_decode_copy_safe(p, end, &addr, sizeof(addr), e_inval); ceph_decode_addr(&addr); pr_info("osd%d up\n", osd); BUG_ON(osd >= map->max_osd); @@ -955,11 +1275,11 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, } /* new_state */ - ceph_decode_32_safe(p, end, len, bad); + ceph_decode_32_safe(p, end, len, e_inval); while (len--) { u32 osd; u8 xorstate; - ceph_decode_32_safe(p, end, osd, bad); + ceph_decode_32_safe(p, end, osd, e_inval); xorstate = **(u8 **)p; (*p)++; /* clean flag */ if (xorstate == 0) @@ -971,10 +1291,10 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, } /* new_weight */ - ceph_decode_32_safe(p, end, len, bad); + ceph_decode_32_safe(p, end, len, e_inval); while (len--) { u32 osd, off; - ceph_decode_need(p, end, sizeof(u32)*2, bad); + ceph_decode_need(p, end, sizeof(u32)*2, e_inval); osd = ceph_decode_32(p); off = ceph_decode_32(p); pr_info("osd%d weight 0x%x %s\n", osd, off, @@ -985,56 +1305,35 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, } /* new_pg_temp */ - ceph_decode_32_safe(p, end, len, bad); - while (len--) { - struct ceph_pg_mapping *pg; - int j; - struct ceph_pg pgid; - u32 pglen; + err = decode_new_pg_temp(p, end, map); + if (err) + goto bad; - err = ceph_decode_pgid(p, end, &pgid); + /* new_primary_temp */ + if (struct_v >= 1) { + err = decode_new_primary_temp(p, end, map); if (err) goto bad; - ceph_decode_need(p, end, sizeof(u32), bad); - pglen = ceph_decode_32(p); - if (pglen) { - ceph_decode_need(p, end, pglen*sizeof(u32), bad); - - /* removing existing (if any) */ - (void) __remove_pg_mapping(&map->pg_temp, pgid); + } - /* insert */ - err = -EINVAL; - if (pglen > (UINT_MAX - sizeof(*pg)) / sizeof(u32)) - goto bad; - err = -ENOMEM; - pg = kmalloc(sizeof(*pg) + sizeof(u32)*pglen, GFP_NOFS); - if (!pg) - goto bad; - pg->pgid = pgid; - pg->len = pglen; - for (j = 0; j < pglen; j++) - pg->osds[j] = ceph_decode_32(p); - err = __insert_pg_mapping(pg, &map->pg_temp); - if (err) { - kfree(pg); - goto bad; - } - dout(" added pg_temp %lld.%x len %d\n", pgid.pool, - pgid.seed, pglen); - } else { - /* remove */ - __remove_pg_mapping(&map->pg_temp, pgid); - } + /* new_primary_affinity */ + if (struct_v >= 2) { + err = decode_new_primary_affinity(p, end, map); + if (err) + goto bad; } /* ignore the rest */ *p = end; + + dout("inc osdmap epoch %d max_osd %d\n", map->epoch, map->max_osd); return map; +e_inval: + err = -EINVAL; bad: - pr_err("corrupt inc osdmap epoch %d off %d (%p of %p-%p)\n", - epoch, (int)(*p - start), *p, start, end); + pr_err("corrupt inc osdmap (%d) epoch %d off %d (%p of %p-%p)\n", + err, epoch, (int)(*p - start), *p, start, end); print_hex_dump(KERN_DEBUG, "osdmap: ", DUMP_PREFIX_OFFSET, 16, 1, start, end - start, true); @@ -1142,61 +1441,249 @@ int ceph_oloc_oid_to_pg(struct ceph_osdmap *osdmap, } EXPORT_SYMBOL(ceph_oloc_oid_to_pg); -static int crush_do_rule_ary(const struct crush_map *map, int ruleno, int x, - int *result, int result_max, - const __u32 *weight, int weight_max) +static int do_crush(struct ceph_osdmap *map, int ruleno, int x, + int *result, int result_max, + const __u32 *weight, int weight_max) { - int scratch[result_max * 3]; + int r; - return crush_do_rule(map, ruleno, x, result, result_max, - weight, weight_max, scratch); + BUG_ON(result_max > CEPH_PG_MAX_SIZE); + + mutex_lock(&map->crush_scratch_mutex); + r = crush_do_rule(map->crush, ruleno, x, result, result_max, + weight, weight_max, map->crush_scratch_ary); + mutex_unlock(&map->crush_scratch_mutex); + + return r; } /* - * Calculate raw osd vector for the given pgid. Return pointer to osd - * array, or NULL on failure. + * Calculate raw (crush) set for given pgid. + * + * Return raw set length, or error. */ -static int *calc_pg_raw(struct ceph_osdmap *osdmap, struct ceph_pg pgid, - int *osds, int *num) +static int pg_to_raw_osds(struct ceph_osdmap *osdmap, + struct ceph_pg_pool_info *pool, + struct ceph_pg pgid, u32 pps, int *osds) { - struct ceph_pg_mapping *pg; - struct ceph_pg_pool_info *pool; int ruleno; - int r; - u32 pps; + int len; - pool = __lookup_pg_pool(&osdmap->pg_pools, pgid.pool); - if (!pool) - return NULL; + /* crush */ + ruleno = crush_find_rule(osdmap->crush, pool->crush_ruleset, + pool->type, pool->size); + if (ruleno < 0) { + pr_err("no crush rule: pool %lld ruleset %d type %d size %d\n", + pgid.pool, pool->crush_ruleset, pool->type, + pool->size); + return -ENOENT; + } - /* pg_temp? */ + len = do_crush(osdmap, ruleno, pps, osds, + min_t(int, pool->size, CEPH_PG_MAX_SIZE), + osdmap->osd_weight, osdmap->max_osd); + if (len < 0) { + pr_err("error %d from crush rule %d: pool %lld ruleset %d type %d size %d\n", + len, ruleno, pgid.pool, pool->crush_ruleset, + pool->type, pool->size); + return len; + } + + return len; +} + +/* + * Given raw set, calculate up set and up primary. + * + * Return up set length. *primary is set to up primary osd id, or -1 + * if up set is empty. + */ +static int raw_to_up_osds(struct ceph_osdmap *osdmap, + struct ceph_pg_pool_info *pool, + int *osds, int len, int *primary) +{ + int up_primary = -1; + int i; + + if (ceph_can_shift_osds(pool)) { + int removed = 0; + + for (i = 0; i < len; i++) { + if (ceph_osd_is_down(osdmap, osds[i])) { + removed++; + continue; + } + if (removed) + osds[i - removed] = osds[i]; + } + + len -= removed; + if (len > 0) + up_primary = osds[0]; + } else { + for (i = len - 1; i >= 0; i--) { + if (ceph_osd_is_down(osdmap, osds[i])) + osds[i] = CRUSH_ITEM_NONE; + else + up_primary = osds[i]; + } + } + + *primary = up_primary; + return len; +} + +static void apply_primary_affinity(struct ceph_osdmap *osdmap, u32 pps, + struct ceph_pg_pool_info *pool, + int *osds, int len, int *primary) +{ + int i; + int pos = -1; + + /* + * Do we have any non-default primary_affinity values for these + * osds? + */ + if (!osdmap->osd_primary_affinity) + return; + + for (i = 0; i < len; i++) { + if (osds[i] != CRUSH_ITEM_NONE && + osdmap->osd_primary_affinity[i] != + CEPH_OSD_DEFAULT_PRIMARY_AFFINITY) { + break; + } + } + if (i == len) + return; + + /* + * Pick the primary. Feed both the seed (for the pg) and the + * osd into the hash/rng so that a proportional fraction of an + * osd's pgs get rejected as primary. + */ + for (i = 0; i < len; i++) { + int osd; + u32 aff; + + osd = osds[i]; + if (osd == CRUSH_ITEM_NONE) + continue; + + aff = osdmap->osd_primary_affinity[osd]; + if (aff < CEPH_OSD_MAX_PRIMARY_AFFINITY && + (crush_hash32_2(CRUSH_HASH_RJENKINS1, + pps, osd) >> 16) >= aff) { + /* + * We chose not to use this primary. Note it + * anyway as a fallback in case we don't pick + * anyone else, but keep looking. + */ + if (pos < 0) + pos = i; + } else { + pos = i; + break; + } + } + if (pos < 0) + return; + + *primary = osds[pos]; + + if (ceph_can_shift_osds(pool) && pos > 0) { + /* move the new primary to the front */ + for (i = pos; i > 0; i--) + osds[i] = osds[i - 1]; + osds[0] = *primary; + } +} + +/* + * Given up set, apply pg_temp and primary_temp mappings. + * + * Return acting set length. *primary is set to acting primary osd id, + * or -1 if acting set is empty. + */ +static int apply_temps(struct ceph_osdmap *osdmap, + struct ceph_pg_pool_info *pool, struct ceph_pg pgid, + int *osds, int len, int *primary) +{ + struct ceph_pg_mapping *pg; + int temp_len; + int temp_primary; + int i; + + /* raw_pg -> pg */ pgid.seed = ceph_stable_mod(pgid.seed, pool->pg_num, pool->pg_num_mask); + + /* pg_temp? */ pg = __lookup_pg_mapping(&osdmap->pg_temp, pgid); if (pg) { - *num = pg->len; - return pg->osds; + temp_len = 0; + temp_primary = -1; + + for (i = 0; i < pg->pg_temp.len; i++) { + if (ceph_osd_is_down(osdmap, pg->pg_temp.osds[i])) { + if (ceph_can_shift_osds(pool)) + continue; + else + osds[temp_len++] = CRUSH_ITEM_NONE; + } else { + osds[temp_len++] = pg->pg_temp.osds[i]; + } + } + + /* apply pg_temp's primary */ + for (i = 0; i < temp_len; i++) { + if (osds[i] != CRUSH_ITEM_NONE) { + temp_primary = osds[i]; + break; + } + } + } else { + temp_len = len; + temp_primary = *primary; } - /* crush */ - ruleno = crush_find_rule(osdmap->crush, pool->crush_ruleset, - pool->type, pool->size); - if (ruleno < 0) { - pr_err("no crush rule pool %lld ruleset %d type %d size %d\n", - pgid.pool, pool->crush_ruleset, pool->type, - pool->size); - return NULL; + /* primary_temp? */ + pg = __lookup_pg_mapping(&osdmap->primary_temp, pgid); + if (pg) + temp_primary = pg->primary_temp.osd; + + *primary = temp_primary; + return temp_len; +} + +/* + * Calculate acting set for given pgid. + * + * Return acting set length, or error. *primary is set to acting + * primary osd id, or -1 if acting set is empty or on error. + */ +int ceph_calc_pg_acting(struct ceph_osdmap *osdmap, struct ceph_pg pgid, + int *osds, int *primary) +{ + struct ceph_pg_pool_info *pool; + u32 pps; + int len; + + pool = __lookup_pg_pool(&osdmap->pg_pools, pgid.pool); + if (!pool) { + *primary = -1; + return -ENOENT; } if (pool->flags & CEPH_POOL_FLAG_HASHPSPOOL) { - /* hash pool id and seed sothat pool PGs do not overlap */ + /* hash pool id and seed so that pool PGs do not overlap */ pps = crush_hash32_2(CRUSH_HASH_RJENKINS1, ceph_stable_mod(pgid.seed, pool->pgp_num, pool->pgp_num_mask), pgid.pool); } else { /* - * legacy ehavior: add ps and pool together. this is + * legacy behavior: add ps and pool together. this is * not a great approach because the PGs from each pool * will overlap on top of each other: 0.5 == 1.4 == * 2.3 == ... @@ -1205,38 +1692,20 @@ static int *calc_pg_raw(struct ceph_osdmap *osdmap, struct ceph_pg pgid, pool->pgp_num_mask) + (unsigned)pgid.pool; } - r = crush_do_rule_ary(osdmap->crush, ruleno, pps, - osds, min_t(int, pool->size, *num), - osdmap->osd_weight, osdmap->max_osd); - if (r < 0) { - pr_err("error %d from crush rule: pool %lld ruleset %d type %d" - " size %d\n", r, pgid.pool, pool->crush_ruleset, - pool->type, pool->size); - return NULL; + + len = pg_to_raw_osds(osdmap, pool, pgid, pps, osds); + if (len < 0) { + *primary = -1; + return len; } - *num = r; - return osds; -} -/* - * Return acting set for given pgid. - */ -int ceph_calc_pg_acting(struct ceph_osdmap *osdmap, struct ceph_pg pgid, - int *acting) -{ - int rawosds[CEPH_PG_MAX_SIZE], *osds; - int i, o, num = CEPH_PG_MAX_SIZE; + len = raw_to_up_osds(osdmap, pool, osds, len, primary); - osds = calc_pg_raw(osdmap, pgid, rawosds, &num); - if (!osds) - return -1; + apply_primary_affinity(osdmap, pps, pool, osds, len, primary); - /* primary is first up osd */ - o = 0; - for (i = 0; i < num; i++) - if (ceph_osd_is_up(osdmap, osds[i])) - acting[o++] = osds[i]; - return o; + len = apply_temps(osdmap, pool, pgid, osds, len, primary); + + return len; } /* @@ -1244,17 +1713,11 @@ int ceph_calc_pg_acting(struct ceph_osdmap *osdmap, struct ceph_pg pgid, */ int ceph_calc_pg_primary(struct ceph_osdmap *osdmap, struct ceph_pg pgid) { - int rawosds[CEPH_PG_MAX_SIZE], *osds; - int i, num = CEPH_PG_MAX_SIZE; + int osds[CEPH_PG_MAX_SIZE]; + int primary; - osds = calc_pg_raw(osdmap, pgid, rawosds, &num); - if (!osds) - return -1; + ceph_calc_pg_acting(osdmap, pgid, osds, &primary); - /* primary is first up osd */ - for (i = 0; i < num; i++) - if (ceph_osd_is_up(osdmap, osds[i])) - return osds[i]; - return -1; + return primary; } EXPORT_SYMBOL(ceph_calc_pg_primary); diff --git a/net/core/dev.c b/net/core/dev.c index 757063420ce0..14dac0654f28 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4043,6 +4043,7 @@ static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb) skb->vlan_tci = 0; skb->dev = napi->dev; skb->skb_iif = 0; + skb->truesize = SKB_TRUESIZE(skb_end_offset(skb)); napi->skb = skb; } @@ -4588,8 +4589,7 @@ void *netdev_lower_get_next_private(struct net_device *dev, if (&lower->list == &dev->adj_list.lower) return NULL; - if (iter) - *iter = lower->list.next; + *iter = lower->list.next; return lower->private; } @@ -4617,8 +4617,7 @@ void *netdev_lower_get_next_private_rcu(struct net_device *dev, if (&lower->list == &dev->adj_list.lower) return NULL; - if (iter) - *iter = &lower->list; + *iter = &lower->list; return lower->private; } @@ -5696,6 +5695,13 @@ static netdev_features_t netdev_fix_features(struct net_device *dev, } } +#ifdef CONFIG_NET_RX_BUSY_POLL + if (dev->netdev_ops->ndo_busy_poll) + features |= NETIF_F_BUSY_POLL; + else +#endif + features &= ~NETIF_F_BUSY_POLL; + return features; } diff --git a/net/core/ethtool.c b/net/core/ethtool.c index 30071dec287a..640ba0e5831c 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -97,6 +97,7 @@ static const char netdev_features_strings[NETDEV_FEATURE_COUNT][ETH_GSTRING_LEN] [NETIF_F_RXFCS_BIT] = "rx-fcs", [NETIF_F_RXALL_BIT] = "rx-all", [NETIF_F_HW_L2FW_DOFFLOAD_BIT] = "l2-fwd-offload", + [NETIF_F_BUSY_POLL_BIT] = "busy-poll", }; static int ethtool_get_features(struct net_device *dev, void __user *useraddr) diff --git a/net/core/filter.c b/net/core/filter.c index 765556ba32ef..e08b3822c72a 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -295,43 +295,43 @@ select_insn: (*(s64 *) &A) >>= K; CONT; BPF_ALU64_BPF_MOD_BPF_X: + if (unlikely(X == 0)) + return 0; tmp = A; - if (X) - A = do_div(tmp, X); + A = do_div(tmp, X); CONT; BPF_ALU_BPF_MOD_BPF_X: + if (unlikely(X == 0)) + return 0; tmp = (u32) A; - if (X) - A = do_div(tmp, (u32) X); + A = do_div(tmp, (u32) X); CONT; BPF_ALU64_BPF_MOD_BPF_K: tmp = A; - if (K) - A = do_div(tmp, K); + A = do_div(tmp, K); CONT; BPF_ALU_BPF_MOD_BPF_K: tmp = (u32) A; - if (K) - A = do_div(tmp, (u32) K); + A = do_div(tmp, (u32) K); CONT; BPF_ALU64_BPF_DIV_BPF_X: - if (X) - do_div(A, X); + if (unlikely(X == 0)) + return 0; + do_div(A, X); CONT; BPF_ALU_BPF_DIV_BPF_X: + if (unlikely(X == 0)) + return 0; tmp = (u32) A; - if (X) - do_div(tmp, (u32) X); + do_div(tmp, (u32) X); A = (u32) tmp; CONT; BPF_ALU64_BPF_DIV_BPF_K: - if (K) - do_div(A, K); + do_div(A, K); CONT; BPF_ALU_BPF_DIV_BPF_K: tmp = (u32) A; - if (K) - do_div(tmp, (u32) K); + do_div(tmp, (u32) K); A = (u32) tmp; CONT; BPF_ALU_BPF_END_BPF_TO_BE: diff --git a/net/core/flow.c b/net/core/flow.c index 31cfb365e0c6..a0348fde1fdf 100644 --- a/net/core/flow.c +++ b/net/core/flow.c @@ -455,6 +455,8 @@ int flow_cache_init(struct net *net) if (!fc->percpu) return -ENOMEM; + cpu_notifier_register_begin(); + for_each_online_cpu(i) { if (flow_cache_cpu_prepare(fc, i)) goto err; @@ -462,7 +464,9 @@ int flow_cache_init(struct net *net) fc->hotcpu_notifier = (struct notifier_block){ .notifier_call = flow_cache_cpu, }; - register_hotcpu_notifier(&fc->hotcpu_notifier); + __register_hotcpu_notifier(&fc->hotcpu_notifier); + + cpu_notifier_register_done(); setup_timer(&fc->rnd_timer, flow_cache_new_hashrnd, (unsigned long) fc); @@ -478,6 +482,8 @@ err: fcp->hash_table = NULL; } + cpu_notifier_register_done(); + free_percpu(fc->percpu); fc->percpu = NULL; diff --git a/net/core/pktgen.c b/net/core/pktgen.c index d0dac57291af..0304f981f7ff 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -3338,9 +3338,11 @@ static void pktgen_xmit(struct pktgen_dev *pkt_dev) queue_map = skb_get_queue_mapping(pkt_dev->skb); txq = netdev_get_tx_queue(odev, queue_map); - __netif_tx_lock_bh(txq); + local_bh_disable(); - if (unlikely(netif_xmit_frozen_or_stopped(txq))) { + HARD_TX_LOCK(odev, txq, smp_processor_id()); + + if (unlikely(netif_xmit_frozen_or_drv_stopped(txq))) { ret = NETDEV_TX_BUSY; pkt_dev->last_ok = 0; goto unlock; @@ -3374,7 +3376,9 @@ static void pktgen_xmit(struct pktgen_dev *pkt_dev) pkt_dev->last_ok = 0; } unlock: - __netif_tx_unlock_bh(txq); + HARD_TX_UNLOCK(odev, txq); + + local_bh_enable(); /* If pkt_dev->count is zero, then run forever */ if ((pkt_dev->count != 0) && (pkt_dev->sofar >= pkt_dev->count)) { diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 30c7d35dd862..1b62343f5837 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -3458,8 +3458,6 @@ static void sock_rmem_free(struct sk_buff *skb) */ int sock_queue_err_skb(struct sock *sk, struct sk_buff *skb) { - int len = skb->len; - if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >= (unsigned int)sk->sk_rcvbuf) return -ENOMEM; @@ -3474,7 +3472,7 @@ int sock_queue_err_skb(struct sock *sk, struct sk_buff *skb) skb_queue_tail(&sk->sk_error_queue, skb); if (!sock_flag(sk, SOCK_DEAD)) - sk->sk_data_ready(sk, len); + sk->sk_data_ready(sk); return 0; } EXPORT_SYMBOL(sock_queue_err_skb); @@ -3937,12 +3935,14 @@ EXPORT_SYMBOL_GPL(skb_scrub_packet); unsigned int skb_gso_transport_seglen(const struct sk_buff *skb) { const struct skb_shared_info *shinfo = skb_shinfo(skb); - unsigned int hdr_len; if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))) - hdr_len = tcp_hdrlen(skb); - else - hdr_len = sizeof(struct udphdr); - return hdr_len + shinfo->gso_size; + return tcp_hdrlen(skb) + shinfo->gso_size; + + /* UFO sets gso_size to the size of the fragmentation + * payload, i.e. the size of the L4 (UDP) header is already + * accounted for. + */ + return shinfo->gso_size; } EXPORT_SYMBOL_GPL(skb_gso_transport_seglen); diff --git a/net/core/sock.c b/net/core/sock.c index c0fc6bdad1e3..b4fff008136f 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -428,7 +428,7 @@ int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) spin_unlock_irqrestore(&list->lock, flags); if (!sock_flag(sk, SOCK_DEAD)) - sk->sk_data_ready(sk, skb_len); + sk->sk_data_ready(sk); return 0; } EXPORT_SYMBOL(sock_queue_rcv_skb); @@ -2196,7 +2196,7 @@ static void sock_def_error_report(struct sock *sk) rcu_read_unlock(); } -static void sock_def_readable(struct sock *sk, int len) +static void sock_def_readable(struct sock *sk) { struct socket_wq *wq; diff --git a/net/dccp/input.c b/net/dccp/input.c index 14cdafad7a90..3c8ec7d4a34e 100644 --- a/net/dccp/input.c +++ b/net/dccp/input.c @@ -28,7 +28,7 @@ static void dccp_enqueue_skb(struct sock *sk, struct sk_buff *skb) __skb_pull(skb, dccp_hdr(skb)->dccph_doff * 4); __skb_queue_tail(&sk->sk_receive_queue, skb); skb_set_owner_r(skb, sk); - sk->sk_data_ready(sk, 0); + sk->sk_data_ready(sk); } static void dccp_fin(struct sock *sk, struct sk_buff *skb) diff --git a/net/dccp/minisocks.c b/net/dccp/minisocks.c index 9e2f78bc1553..c69eb9c4fbb8 100644 --- a/net/dccp/minisocks.c +++ b/net/dccp/minisocks.c @@ -237,7 +237,7 @@ int dccp_child_process(struct sock *parent, struct sock *child, /* Wakeup parent, send SIGIO */ if (state == DCCP_RESPOND && child->sk_state != state) - parent->sk_data_ready(parent, 0); + parent->sk_data_ready(parent); } else { /* Alas, it is possible again, because we do lookup * in main socket hash table and lock on listening diff --git a/net/decnet/dn_nsp_in.c b/net/decnet/dn_nsp_in.c index c344163e6ac0..fe5f01485d33 100644 --- a/net/decnet/dn_nsp_in.c +++ b/net/decnet/dn_nsp_in.c @@ -585,7 +585,6 @@ out: static __inline__ int dn_queue_skb(struct sock *sk, struct sk_buff *skb, int sig, struct sk_buff_head *queue) { int err; - int skb_len; /* Cast skb->rcvbuf to unsigned... It's pointless, but reduces number of warnings when compiling with -W --ANK @@ -600,12 +599,11 @@ static __inline__ int dn_queue_skb(struct sock *sk, struct sk_buff *skb, int sig if (err) goto out; - skb_len = skb->len; skb_set_owner_r(skb, sk); skb_queue_tail(queue, skb); if (!sock_flag(sk, SOCK_DEAD)) - sk->sk_data_ready(sk, skb_len); + sk->sk_data_ready(sk); out: return err; } diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index ec4f762efda5..94213c891565 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -463,6 +463,7 @@ static const struct net_device_ops ipgre_netdev_ops = { static void ipgre_tunnel_setup(struct net_device *dev) { dev->netdev_ops = &ipgre_netdev_ops; + dev->type = ARPHRD_IPGRE; ip_tunnel_setup(dev, ipgre_net_id); } @@ -501,7 +502,6 @@ static int ipgre_tunnel_init(struct net_device *dev) memcpy(dev->dev_addr, &iph->saddr, 4); memcpy(dev->broadcast, &iph->daddr, 4); - dev->type = ARPHRD_IPGRE; dev->flags = IFF_NOARP; dev->priv_flags &= ~IFF_XMIT_DST_RELEASE; dev->addr_len = 4; diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c index 687ddef4e574..afcee51b90ed 100644 --- a/net/ipv4/ip_vti.c +++ b/net/ipv4/ip_vti.c @@ -337,6 +337,7 @@ static const struct net_device_ops vti_netdev_ops = { static void vti_tunnel_setup(struct net_device *dev) { dev->netdev_ops = &vti_netdev_ops; + dev->type = ARPHRD_TUNNEL; ip_tunnel_setup(dev, vti_net_id); } @@ -348,7 +349,6 @@ static int vti_tunnel_init(struct net_device *dev) memcpy(dev->dev_addr, &iph->saddr, 4); memcpy(dev->broadcast, &iph->daddr, 4); - dev->type = ARPHRD_TUNNEL; dev->hard_header_len = LL_MAX_HEADER + sizeof(struct iphdr); dev->mtu = ETH_DATA_LEN; dev->flags = IFF_NOARP; diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c index 59da7cde0724..f95b6f93814b 100644 --- a/net/ipv4/netfilter/arp_tables.c +++ b/net/ipv4/netfilter/arp_tables.c @@ -1044,8 +1044,10 @@ static int __do_replace(struct net *net, const char *name, xt_free_table_info(oldinfo); if (copy_to_user(counters_ptr, counters, - sizeof(struct xt_counters) * num_counters) != 0) - ret = -EFAULT; + sizeof(struct xt_counters) * num_counters) != 0) { + /* Silent error, can't fail, new table is already in place */ + net_warn_ratelimited("arptables: counters copy to user failed while replacing table\n"); + } vfree(counters); xt_table_unlock(t); return ret; diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index 718dfbd30cbe..99e810f84671 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -1231,8 +1231,10 @@ __do_replace(struct net *net, const char *name, unsigned int valid_hooks, xt_free_table_info(oldinfo); if (copy_to_user(counters_ptr, counters, - sizeof(struct xt_counters) * num_counters) != 0) - ret = -EFAULT; + sizeof(struct xt_counters) * num_counters) != 0) { + /* Silent error, can't fail, new table is already in place */ + net_warn_ratelimited("iptables: counters copy to user failed while replacing table\n"); + } vfree(counters); xt_table_unlock(t); return ret; diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 1be9e990514d..34d094cadb11 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -188,7 +188,7 @@ const __u8 ip_tos2prio[16] = { EXPORT_SYMBOL(ip_tos2prio); static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat); -#define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field) +#define RT_CACHE_STAT_INC(field) raw_cpu_inc(rt_cache_stat.field) #ifdef CONFIG_PROC_FS static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index e1661f46fd19..d6b46eb2f94c 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -4413,7 +4413,7 @@ queue_and_out: if (eaten > 0) kfree_skb_partial(skb, fragstolen); if (!sock_flag(sk, SOCK_DEAD)) - sk->sk_data_ready(sk, 0); + sk->sk_data_ready(sk); return; } @@ -4914,7 +4914,7 @@ static void tcp_urg(struct sock *sk, struct sk_buff *skb, const struct tcphdr *t BUG(); tp->urg_data = TCP_URG_VALID | tmp; if (!sock_flag(sk, SOCK_DEAD)) - sk->sk_data_ready(sk, 0); + sk->sk_data_ready(sk); } } } @@ -5000,11 +5000,11 @@ static bool tcp_dma_try_early_copy(struct sock *sk, struct sk_buff *skb, (tcp_flag_word(tcp_hdr(skb)) & TCP_FLAG_PSH) || (atomic_read(&sk->sk_rmem_alloc) > (sk->sk_rcvbuf >> 1))) { tp->ucopy.wakeup = 1; - sk->sk_data_ready(sk, 0); + sk->sk_data_ready(sk); } } else if (chunk > 0) { tp->ucopy.wakeup = 1; - sk->sk_data_ready(sk, 0); + sk->sk_data_ready(sk); } out: return copied_early; @@ -5275,7 +5275,7 @@ no_ack: #endif if (eaten) kfree_skb_partial(skb, fragstolen); - sk->sk_data_ready(sk, 0); + sk->sk_data_ready(sk); return; } } diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 6379894ec210..438f3b95143d 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1434,7 +1434,7 @@ static int tcp_v4_conn_req_fastopen(struct sock *sk, tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq; tp->syn_data_acked = 1; } - sk->sk_data_ready(sk, 0); + sk->sk_data_ready(sk); bh_unlock_sock(child); sock_put(child); WARN_ON(req->sk == NULL); diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index ca788ada5bd3..05c1b155251d 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -745,7 +745,7 @@ int tcp_child_process(struct sock *parent, struct sock *child, skb->len); /* Wakeup parent, send SIGIO */ if (state == TCP_SYN_RECV && child->sk_state != state) - parent->sk_data_ready(parent, 0); + parent->sk_data_ready(parent); } else { /* Alas, it is possible again, because we do lookup * in main socket hash table and lock on listening diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c index 710238f58aa9..e080fbbbc0e5 100644 --- a/net/ipv6/netfilter/ip6_tables.c +++ b/net/ipv6/netfilter/ip6_tables.c @@ -1241,8 +1241,10 @@ __do_replace(struct net *net, const char *name, unsigned int valid_hooks, xt_free_table_info(oldinfo); if (copy_to_user(counters_ptr, counters, - sizeof(struct xt_counters) * num_counters) != 0) - ret = -EFAULT; + sizeof(struct xt_counters) * num_counters) != 0) { + /* Silent error, can't fail, new table is already in place */ + net_warn_ratelimited("ip6tables: counters copy to user failed while replacing table\n"); + } vfree(counters); xt_table_unlock(t); return ret; diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 5ca56cee2dae..e289830ed6e3 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -798,7 +798,7 @@ static void tcp_v6_send_response(struct sk_buff *skb, u32 seq, u32 ack, u32 win, __tcp_v6_send_check(buff, &fl6.saddr, &fl6.daddr); fl6.flowi6_proto = IPPROTO_TCP; - if (rt6_need_strict(&fl6.daddr) || !oif) + if (rt6_need_strict(&fl6.daddr) && !oif) fl6.flowi6_oif = inet6_iif(skb); else fl6.flowi6_oif = oif; diff --git a/net/iucv/af_iucv.c b/net/iucv/af_iucv.c index a5e03119107a..01e77b0ae075 100644 --- a/net/iucv/af_iucv.c +++ b/net/iucv/af_iucv.c @@ -1757,7 +1757,7 @@ static int iucv_callback_connreq(struct iucv_path *path, /* Wake up accept */ nsk->sk_state = IUCV_CONNECTED; - sk->sk_data_ready(sk, 1); + sk->sk_data_ready(sk); err = 0; fail: bh_unlock_sock(sk); @@ -1968,7 +1968,7 @@ static int afiucv_hs_callback_syn(struct sock *sk, struct sk_buff *skb) if (!err) { iucv_accept_enqueue(sk, nsk); nsk->sk_state = IUCV_CONNECTED; - sk->sk_data_ready(sk, 1); + sk->sk_data_ready(sk); } else iucv_sock_kill(nsk); bh_unlock_sock(sk); diff --git a/net/iucv/iucv.c b/net/iucv/iucv.c index cd5b8ec9be04..da787930df0a 100644 --- a/net/iucv/iucv.c +++ b/net/iucv/iucv.c @@ -621,6 +621,42 @@ static void iucv_disable(void) put_online_cpus(); } +static void free_iucv_data(int cpu) +{ + kfree(iucv_param_irq[cpu]); + iucv_param_irq[cpu] = NULL; + kfree(iucv_param[cpu]); + iucv_param[cpu] = NULL; + kfree(iucv_irq_data[cpu]); + iucv_irq_data[cpu] = NULL; +} + +static int alloc_iucv_data(int cpu) +{ + /* Note: GFP_DMA used to get memory below 2G */ + iucv_irq_data[cpu] = kmalloc_node(sizeof(struct iucv_irq_data), + GFP_KERNEL|GFP_DMA, cpu_to_node(cpu)); + if (!iucv_irq_data[cpu]) + goto out_free; + + /* Allocate parameter blocks. */ + iucv_param[cpu] = kmalloc_node(sizeof(union iucv_param), + GFP_KERNEL|GFP_DMA, cpu_to_node(cpu)); + if (!iucv_param[cpu]) + goto out_free; + + iucv_param_irq[cpu] = kmalloc_node(sizeof(union iucv_param), + GFP_KERNEL|GFP_DMA, cpu_to_node(cpu)); + if (!iucv_param_irq[cpu]) + goto out_free; + + return 0; + +out_free: + free_iucv_data(cpu); + return -ENOMEM; +} + static int iucv_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) { @@ -630,38 +666,14 @@ static int iucv_cpu_notify(struct notifier_block *self, switch (action) { case CPU_UP_PREPARE: case CPU_UP_PREPARE_FROZEN: - iucv_irq_data[cpu] = kmalloc_node(sizeof(struct iucv_irq_data), - GFP_KERNEL|GFP_DMA, cpu_to_node(cpu)); - if (!iucv_irq_data[cpu]) - return notifier_from_errno(-ENOMEM); - - iucv_param[cpu] = kmalloc_node(sizeof(union iucv_param), - GFP_KERNEL|GFP_DMA, cpu_to_node(cpu)); - if (!iucv_param[cpu]) { - kfree(iucv_irq_data[cpu]); - iucv_irq_data[cpu] = NULL; + if (alloc_iucv_data(cpu)) return notifier_from_errno(-ENOMEM); - } - iucv_param_irq[cpu] = kmalloc_node(sizeof(union iucv_param), - GFP_KERNEL|GFP_DMA, cpu_to_node(cpu)); - if (!iucv_param_irq[cpu]) { - kfree(iucv_param[cpu]); - iucv_param[cpu] = NULL; - kfree(iucv_irq_data[cpu]); - iucv_irq_data[cpu] = NULL; - return notifier_from_errno(-ENOMEM); - } break; case CPU_UP_CANCELED: case CPU_UP_CANCELED_FROZEN: case CPU_DEAD: case CPU_DEAD_FROZEN: - kfree(iucv_param_irq[cpu]); - iucv_param_irq[cpu] = NULL; - kfree(iucv_param[cpu]); - iucv_param[cpu] = NULL; - kfree(iucv_irq_data[cpu]); - iucv_irq_data[cpu] = NULL; + free_iucv_data(cpu); break; case CPU_ONLINE: case CPU_ONLINE_FROZEN: @@ -2016,7 +2028,7 @@ static int __init iucv_init(void) rc = iucv_query_maxconn(); if (rc) goto out_ctl; - rc = register_external_interrupt(0x4000, iucv_external_interrupt); + rc = register_external_irq(EXT_IRQ_IUCV, iucv_external_interrupt); if (rc) goto out_ctl; iucv_root = root_device_register("iucv"); @@ -2025,33 +2037,20 @@ static int __init iucv_init(void) goto out_int; } - for_each_online_cpu(cpu) { - /* Note: GFP_DMA used to get memory below 2G */ - iucv_irq_data[cpu] = kmalloc_node(sizeof(struct iucv_irq_data), - GFP_KERNEL|GFP_DMA, cpu_to_node(cpu)); - if (!iucv_irq_data[cpu]) { - rc = -ENOMEM; - goto out_free; - } + cpu_notifier_register_begin(); - /* Allocate parameter blocks. */ - iucv_param[cpu] = kmalloc_node(sizeof(union iucv_param), - GFP_KERNEL|GFP_DMA, cpu_to_node(cpu)); - if (!iucv_param[cpu]) { - rc = -ENOMEM; - goto out_free; - } - iucv_param_irq[cpu] = kmalloc_node(sizeof(union iucv_param), - GFP_KERNEL|GFP_DMA, cpu_to_node(cpu)); - if (!iucv_param_irq[cpu]) { + for_each_online_cpu(cpu) { + if (alloc_iucv_data(cpu)) { rc = -ENOMEM; goto out_free; } - } - rc = register_hotcpu_notifier(&iucv_cpu_notifier); + rc = __register_hotcpu_notifier(&iucv_cpu_notifier); if (rc) goto out_free; + + cpu_notifier_register_done(); + rc = register_reboot_notifier(&iucv_reboot_notifier); if (rc) goto out_cpu; @@ -2069,19 +2068,17 @@ static int __init iucv_init(void) out_reboot: unregister_reboot_notifier(&iucv_reboot_notifier); out_cpu: - unregister_hotcpu_notifier(&iucv_cpu_notifier); + cpu_notifier_register_begin(); + __unregister_hotcpu_notifier(&iucv_cpu_notifier); out_free: - for_each_possible_cpu(cpu) { - kfree(iucv_param_irq[cpu]); - iucv_param_irq[cpu] = NULL; - kfree(iucv_param[cpu]); - iucv_param[cpu] = NULL; - kfree(iucv_irq_data[cpu]); - iucv_irq_data[cpu] = NULL; - } + for_each_possible_cpu(cpu) + free_iucv_data(cpu); + + cpu_notifier_register_done(); + root_device_unregister(iucv_root); out_int: - unregister_external_interrupt(0x4000, iucv_external_interrupt); + unregister_external_irq(EXT_IRQ_IUCV, iucv_external_interrupt); out_ctl: ctl_clear_bit(0, 1); out: @@ -2105,18 +2102,14 @@ static void __exit iucv_exit(void) kfree(p); spin_unlock_irq(&iucv_queue_lock); unregister_reboot_notifier(&iucv_reboot_notifier); - unregister_hotcpu_notifier(&iucv_cpu_notifier); - for_each_possible_cpu(cpu) { - kfree(iucv_param_irq[cpu]); - iucv_param_irq[cpu] = NULL; - kfree(iucv_param[cpu]); - iucv_param[cpu] = NULL; - kfree(iucv_irq_data[cpu]); - iucv_irq_data[cpu] = NULL; - } + cpu_notifier_register_begin(); + __unregister_hotcpu_notifier(&iucv_cpu_notifier); + for_each_possible_cpu(cpu) + free_iucv_data(cpu); + cpu_notifier_register_done(); root_device_unregister(iucv_root); bus_unregister(&iucv_bus); - unregister_external_interrupt(0x4000, iucv_external_interrupt); + unregister_external_irq(EXT_IRQ_IUCV, iucv_external_interrupt); } subsys_initcall(iucv_init); diff --git a/net/key/af_key.c b/net/key/af_key.c index e72589a8400d..f3c83073afc4 100644 --- a/net/key/af_key.c +++ b/net/key/af_key.c @@ -205,7 +205,7 @@ static int pfkey_broadcast_one(struct sk_buff *skb, struct sk_buff **skb2, if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf) { skb_set_owner_r(*skb2, sk); skb_queue_tail(&sk->sk_receive_queue, *skb2); - sk->sk_data_ready(sk, (*skb2)->len); + sk->sk_data_ready(sk); *skb2 = NULL; err = 0; } diff --git a/net/l2tp/l2tp_ppp.c b/net/l2tp/l2tp_ppp.c index d276e2d4a589..950909f04ee6 100644 --- a/net/l2tp/l2tp_ppp.c +++ b/net/l2tp/l2tp_ppp.c @@ -753,9 +753,9 @@ static int pppol2tp_connect(struct socket *sock, struct sockaddr *uservaddr, session->deref = pppol2tp_session_sock_put; /* If PMTU discovery was enabled, use the MTU that was discovered */ - dst = sk_dst_get(sk); + dst = sk_dst_get(tunnel->sock); if (dst != NULL) { - u32 pmtu = dst_mtu(__sk_dst_get(sk)); + u32 pmtu = dst_mtu(__sk_dst_get(tunnel->sock)); if (pmtu != 0) session->mtu = session->mru = pmtu - PPPOL2TP_HEADER_OVERHEAD; diff --git a/net/mac802154/mib.c b/net/mac802154/mib.c index 153bd1ddbfbb..f0991f2344d4 100644 --- a/net/mac802154/mib.c +++ b/net/mac802154/mib.c @@ -26,7 +26,6 @@ #include <net/mac802154.h> #include <net/ieee802154_netdev.h> #include <net/wpan-phy.h> -#include <net/ieee802154_netdev.h> #include "mac802154.h" diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 33045a562297..3fd159db9f06 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -152,8 +152,8 @@ nf_tables_chain_type_lookup(const struct nft_af_info *afi, #ifdef CONFIG_MODULES if (autoload) { nfnl_unlock(NFNL_SUBSYS_NFTABLES); - request_module("nft-chain-%u-%*.s", afi->family, - nla_len(nla)-1, (const char *)nla_data(nla)); + request_module("nft-chain-%u-%.*s", afi->family, + nla_len(nla), (const char *)nla_data(nla)); nfnl_lock(NFNL_SUBSYS_NFTABLES); type = __nf_tables_chain_type_lookup(afi->family, nla); if (type != NULL) @@ -1946,7 +1946,8 @@ static const struct nft_set_ops *nft_select_set_ops(const struct nlattr * const static const struct nla_policy nft_set_policy[NFTA_SET_MAX + 1] = { [NFTA_SET_TABLE] = { .type = NLA_STRING }, - [NFTA_SET_NAME] = { .type = NLA_STRING }, + [NFTA_SET_NAME] = { .type = NLA_STRING, + .len = IFNAMSIZ - 1 }, [NFTA_SET_FLAGS] = { .type = NLA_U32 }, [NFTA_SET_KEY_TYPE] = { .type = NLA_U32 }, [NFTA_SET_KEY_LEN] = { .type = NLA_U32 }, diff --git a/net/netfilter/xt_cgroup.c b/net/netfilter/xt_cgroup.c index 9a8e77e7f8d4..f4e833005320 100644 --- a/net/netfilter/xt_cgroup.c +++ b/net/netfilter/xt_cgroup.c @@ -54,7 +54,8 @@ static struct xt_match cgroup_mt_reg __read_mostly = { .matchsize = sizeof(struct xt_cgroup_info), .me = THIS_MODULE, .hooks = (1 << NF_INET_LOCAL_OUT) | - (1 << NF_INET_POST_ROUTING), + (1 << NF_INET_POST_ROUTING) | + (1 << NF_INET_LOCAL_IN), }; static int __init cgroup_mt_init(void) diff --git a/net/netfilter/xt_connlimit.c b/net/netfilter/xt_connlimit.c index 458464e7bd7a..fbc66bb250d5 100644 --- a/net/netfilter/xt_connlimit.c +++ b/net/netfilter/xt_connlimit.c @@ -32,8 +32,14 @@ #include <net/netfilter/nf_conntrack_tuple.h> #include <net/netfilter/nf_conntrack_zones.h> -#define CONNLIMIT_SLOTS 32 -#define CONNLIMIT_LOCK_SLOTS 32 +#define CONNLIMIT_SLOTS 256U + +#ifdef CONFIG_LOCKDEP +#define CONNLIMIT_LOCK_SLOTS 8U +#else +#define CONNLIMIT_LOCK_SLOTS 256U +#endif + #define CONNLIMIT_GC_MAX_NODES 8 /* we will save the tuples of all connections we care about */ @@ -49,10 +55,11 @@ struct xt_connlimit_rb { union nf_inet_addr addr; /* search key */ }; +static spinlock_t xt_connlimit_locks[CONNLIMIT_LOCK_SLOTS] __cacheline_aligned_in_smp; + struct xt_connlimit_data { struct rb_root climit_root4[CONNLIMIT_SLOTS]; struct rb_root climit_root6[CONNLIMIT_SLOTS]; - spinlock_t locks[CONNLIMIT_LOCK_SLOTS]; }; static u_int32_t connlimit_rnd __read_mostly; @@ -297,11 +304,11 @@ static int count_them(struct net *net, root = &data->climit_root4[hash]; } - spin_lock_bh(&data->locks[hash % CONNLIMIT_LOCK_SLOTS]); + spin_lock_bh(&xt_connlimit_locks[hash % CONNLIMIT_LOCK_SLOTS]); count = count_tree(net, root, tuple, addr, mask, family); - spin_unlock_bh(&data->locks[hash % CONNLIMIT_LOCK_SLOTS]); + spin_unlock_bh(&xt_connlimit_locks[hash % CONNLIMIT_LOCK_SLOTS]); return count; } @@ -377,9 +384,6 @@ static int connlimit_mt_check(const struct xt_mtchk_param *par) return -ENOMEM; } - for (i = 0; i < ARRAY_SIZE(info->data->locks); ++i) - spin_lock_init(&info->data->locks[i]); - for (i = 0; i < ARRAY_SIZE(info->data->climit_root4); ++i) info->data->climit_root4[i] = RB_ROOT; for (i = 0; i < ARRAY_SIZE(info->data->climit_root6); ++i) @@ -435,11 +439,14 @@ static struct xt_match connlimit_mt_reg __read_mostly = { static int __init connlimit_mt_init(void) { - int ret; + int ret, i; BUILD_BUG_ON(CONNLIMIT_LOCK_SLOTS > CONNLIMIT_SLOTS); BUILD_BUG_ON((CONNLIMIT_SLOTS % CONNLIMIT_LOCK_SLOTS) != 0); + for (i = 0; i < CONNLIMIT_LOCK_SLOTS; ++i) + spin_lock_init(&xt_connlimit_locks[i]); + connlimit_conn_cachep = kmem_cache_create("xt_connlimit_conn", sizeof(struct xt_connlimit_conn), 0, 0, NULL); diff --git a/net/netfilter/xt_osf.c b/net/netfilter/xt_osf.c index 7174611bd672..c529161cdbf8 100644 --- a/net/netfilter/xt_osf.c +++ b/net/netfilter/xt_osf.c @@ -422,4 +422,6 @@ module_exit(xt_osf_fini); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Evgeniy Polyakov <zbr@ioremap.net>"); MODULE_DESCRIPTION("Passive OS fingerprint matching."); +MODULE_ALIAS("ipt_osf"); +MODULE_ALIAS("ip6t_osf"); MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_OSF); diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index c2d585c4f7c5..894cda0206bb 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -1653,7 +1653,7 @@ static int __netlink_sendskb(struct sock *sk, struct sk_buff *skb) else #endif /* CONFIG_NETLINK_MMAP */ skb_queue_tail(&sk->sk_receive_queue, skb); - sk->sk_data_ready(sk, len); + sk->sk_data_ready(sk); return len; } @@ -2394,7 +2394,7 @@ out: return err ? : copied; } -static void netlink_data_ready(struct sock *sk, int len) +static void netlink_data_ready(struct sock *sk) { BUG(); } diff --git a/net/netrom/af_netrom.c b/net/netrom/af_netrom.c index b74aa0755521..ede50d197e10 100644 --- a/net/netrom/af_netrom.c +++ b/net/netrom/af_netrom.c @@ -1011,7 +1011,7 @@ int nr_rx_frame(struct sk_buff *skb, struct net_device *dev) skb_queue_head(&sk->sk_receive_queue, skb); if (!sock_flag(sk, SOCK_DEAD)) - sk->sk_data_ready(sk, skb->len); + sk->sk_data_ready(sk); bh_unlock_sock(sk); diff --git a/net/nfc/llcp_core.c b/net/nfc/llcp_core.c index b486f12ae243..b4671958fcf9 100644 --- a/net/nfc/llcp_core.c +++ b/net/nfc/llcp_core.c @@ -976,7 +976,7 @@ static void nfc_llcp_recv_connect(struct nfc_llcp_local *local, new_sk->sk_state = LLCP_CONNECTED; /* Wake the listening processes */ - parent->sk_data_ready(parent, 0); + parent->sk_data_ready(parent); /* Send CC */ nfc_llcp_send_cc(new_sock); diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 01039d2b1695..b85c67ccb797 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -261,7 +261,7 @@ static int packet_direct_xmit(struct sk_buff *skb) local_bh_disable(); HARD_TX_LOCK(dev, txq, smp_processor_id()); - if (!netif_xmit_frozen_or_stopped(txq)) { + if (!netif_xmit_frozen_or_drv_stopped(txq)) { ret = ops->ndo_start_xmit(skb, dev); if (ret == NETDEV_TX_OK) txq_trans_update(txq); @@ -275,6 +275,7 @@ static int packet_direct_xmit(struct sk_buff *skb) return ret; drop: + atomic_long_inc(&dev->tx_dropped); kfree_skb(skb); return NET_XMIT_DROP; } @@ -1847,7 +1848,7 @@ static int packet_rcv(struct sk_buff *skb, struct net_device *dev, skb->dropcount = atomic_read(&sk->sk_drops); __skb_queue_tail(&sk->sk_receive_queue, skb); spin_unlock(&sk->sk_receive_queue.lock); - sk->sk_data_ready(sk, skb->len); + sk->sk_data_ready(sk); return 0; drop_n_acct: @@ -2053,7 +2054,7 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, else prb_clear_blk_fill_status(&po->rx_ring); - sk->sk_data_ready(sk, 0); + sk->sk_data_ready(sk); drop_n_restore: if (skb_head != skb->data && skb_shared(skb)) { @@ -2068,7 +2069,7 @@ ring_is_full: po->stats.stats1.tp_drops++; spin_unlock(&sk->sk_receive_queue.lock); - sk->sk_data_ready(sk, 0); + sk->sk_data_ready(sk); kfree_skb(copy_skb); goto drop_n_restore; } diff --git a/net/phonet/pep-gprs.c b/net/phonet/pep-gprs.c index a2fba7edfd1f..66dc65e7c6a1 100644 --- a/net/phonet/pep-gprs.c +++ b/net/phonet/pep-gprs.c @@ -37,7 +37,7 @@ struct gprs_dev { struct sock *sk; void (*old_state_change)(struct sock *); - void (*old_data_ready)(struct sock *, int); + void (*old_data_ready)(struct sock *); void (*old_write_space)(struct sock *); struct net_device *dev; @@ -146,7 +146,7 @@ drop: return err; } -static void gprs_data_ready(struct sock *sk, int len) +static void gprs_data_ready(struct sock *sk) { struct gprs_dev *gp = sk->sk_user_data; struct sk_buff *skb; diff --git a/net/phonet/pep.c b/net/phonet/pep.c index e77411735de8..70a547ea5177 100644 --- a/net/phonet/pep.c +++ b/net/phonet/pep.c @@ -462,10 +462,9 @@ out: queue: skb->dev = NULL; skb_set_owner_r(skb, sk); - err = skb->len; skb_queue_tail(queue, skb); if (!sock_flag(sk, SOCK_DEAD)) - sk->sk_data_ready(sk, err); + sk->sk_data_ready(sk); return NET_RX_SUCCESS; } @@ -587,10 +586,9 @@ static int pipe_handler_do_rcv(struct sock *sk, struct sk_buff *skb) pn->rx_credits--; skb->dev = NULL; skb_set_owner_r(skb, sk); - err = skb->len; skb_queue_tail(&sk->sk_receive_queue, skb); if (!sock_flag(sk, SOCK_DEAD)) - sk->sk_data_ready(sk, err); + sk->sk_data_ready(sk); return NET_RX_SUCCESS; case PNS_PEP_CONNECT_RESP: @@ -698,7 +696,7 @@ static int pep_do_rcv(struct sock *sk, struct sk_buff *skb) skb_queue_head(&sk->sk_receive_queue, skb); sk_acceptq_added(sk); if (!sock_flag(sk, SOCK_DEAD)) - sk->sk_data_ready(sk, 0); + sk->sk_data_ready(sk); return NET_RX_SUCCESS; case PNS_PEP_DISCONNECT_REQ: diff --git a/net/rds/tcp.h b/net/rds/tcp.h index 9cf2927d0021..65637491f728 100644 --- a/net/rds/tcp.h +++ b/net/rds/tcp.h @@ -61,12 +61,12 @@ void rds_tcp_state_change(struct sock *sk); /* tcp_listen.c */ int rds_tcp_listen_init(void); void rds_tcp_listen_stop(void); -void rds_tcp_listen_data_ready(struct sock *sk, int bytes); +void rds_tcp_listen_data_ready(struct sock *sk); /* tcp_recv.c */ int rds_tcp_recv_init(void); void rds_tcp_recv_exit(void); -void rds_tcp_data_ready(struct sock *sk, int bytes); +void rds_tcp_data_ready(struct sock *sk); int rds_tcp_recv(struct rds_connection *conn); void rds_tcp_inc_free(struct rds_incoming *inc); int rds_tcp_inc_copy_to_user(struct rds_incoming *inc, struct iovec *iov, diff --git a/net/rds/tcp_listen.c b/net/rds/tcp_listen.c index 7787537e9c2e..4e638f851185 100644 --- a/net/rds/tcp_listen.c +++ b/net/rds/tcp_listen.c @@ -108,9 +108,9 @@ static void rds_tcp_accept_worker(struct work_struct *work) cond_resched(); } -void rds_tcp_listen_data_ready(struct sock *sk, int bytes) +void rds_tcp_listen_data_ready(struct sock *sk) { - void (*ready)(struct sock *sk, int bytes); + void (*ready)(struct sock *sk); rdsdebug("listen data ready sk %p\n", sk); @@ -132,7 +132,7 @@ void rds_tcp_listen_data_ready(struct sock *sk, int bytes) out: read_unlock(&sk->sk_callback_lock); - ready(sk, bytes); + ready(sk); } int rds_tcp_listen_init(void) diff --git a/net/rds/tcp_recv.c b/net/rds/tcp_recv.c index 4fac4f2bb9dc..9ae6e0a264ec 100644 --- a/net/rds/tcp_recv.c +++ b/net/rds/tcp_recv.c @@ -314,13 +314,13 @@ int rds_tcp_recv(struct rds_connection *conn) return ret; } -void rds_tcp_data_ready(struct sock *sk, int bytes) +void rds_tcp_data_ready(struct sock *sk) { - void (*ready)(struct sock *sk, int bytes); + void (*ready)(struct sock *sk); struct rds_connection *conn; struct rds_tcp_connection *tc; - rdsdebug("data ready sk %p bytes %d\n", sk, bytes); + rdsdebug("data ready sk %p\n", sk); read_lock(&sk->sk_callback_lock); conn = sk->sk_user_data; @@ -337,7 +337,7 @@ void rds_tcp_data_ready(struct sock *sk, int bytes) queue_delayed_work(rds_wq, &conn->c_recv_w, 0); out: read_unlock(&sk->sk_callback_lock); - ready(sk, bytes); + ready(sk); } int rds_tcp_recv_init(void) diff --git a/net/rose/af_rose.c b/net/rose/af_rose.c index c2cca2ee6aef..8451c8cdc9de 100644 --- a/net/rose/af_rose.c +++ b/net/rose/af_rose.c @@ -1041,7 +1041,7 @@ int rose_rx_call_request(struct sk_buff *skb, struct net_device *dev, struct ros rose_start_heartbeat(make); if (!sock_flag(sk, SOCK_DEAD)) - sk->sk_data_ready(sk, skb->len); + sk->sk_data_ready(sk); return 1; } diff --git a/net/rxrpc/ar-input.c b/net/rxrpc/ar-input.c index 73742647c135..63b21e580de9 100644 --- a/net/rxrpc/ar-input.c +++ b/net/rxrpc/ar-input.c @@ -113,7 +113,7 @@ int rxrpc_queue_rcv_skb(struct rxrpc_call *call, struct sk_buff *skb, spin_unlock_bh(&sk->sk_receive_queue.lock); if (!sock_flag(sk, SOCK_DEAD)) - sk->sk_data_ready(sk, skb_len); + sk->sk_data_ready(sk); } skb = NULL; } else { @@ -632,14 +632,14 @@ cant_find_conn: * handle data received on the local endpoint * - may be called in interrupt context */ -void rxrpc_data_ready(struct sock *sk, int count) +void rxrpc_data_ready(struct sock *sk) { struct rxrpc_skb_priv *sp; struct rxrpc_local *local; struct sk_buff *skb; int ret; - _enter("%p, %d", sk, count); + _enter("%p", sk); ASSERT(!irqs_disabled()); diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index c831d44b0841..ba9fd36d3f15 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -518,7 +518,7 @@ void rxrpc_UDP_error_handler(struct work_struct *); */ extern const char *rxrpc_pkts[]; -void rxrpc_data_ready(struct sock *, int); +void rxrpc_data_ready(struct sock *); int rxrpc_queue_rcv_skb(struct rxrpc_call *, struct sk_buff *, bool, bool); void rxrpc_fast_process_packet(struct rxrpc_call *, struct sk_buff *); diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 981aaf8b6ace..e13519e9df80 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -6593,6 +6593,46 @@ static void __sctp_write_space(struct sctp_association *asoc) } } +static void sctp_wake_up_waiters(struct sock *sk, + struct sctp_association *asoc) +{ + struct sctp_association *tmp = asoc; + + /* We do accounting for the sndbuf space per association, + * so we only need to wake our own association. + */ + if (asoc->ep->sndbuf_policy) + return __sctp_write_space(asoc); + + /* If association goes down and is just flushing its + * outq, then just normally notify others. + */ + if (asoc->base.dead) + return sctp_write_space(sk); + + /* Accounting for the sndbuf space is per socket, so we + * need to wake up others, try to be fair and in case of + * other associations, let them have a go first instead + * of just doing a sctp_write_space() call. + * + * Note that we reach sctp_wake_up_waiters() only when + * associations free up queued chunks, thus we are under + * lock and the list of associations on a socket is + * guaranteed not to change. + */ + for (tmp = list_next_entry(tmp, asocs); 1; + tmp = list_next_entry(tmp, asocs)) { + /* Manually skip the head element. */ + if (&tmp->asocs == &((sctp_sk(sk))->ep->asocs)) + continue; + /* Wake up association. */ + __sctp_write_space(tmp); + /* We've reached the end. */ + if (tmp == asoc) + break; + } +} + /* Do accounting for the sndbuf space. * Decrement the used sndbuf space of the corresponding association by the * data size which was just transmitted(freed). @@ -6620,7 +6660,7 @@ static void sctp_wfree(struct sk_buff *skb) sk_mem_uncharge(sk, skb->truesize); sock_wfree(skb); - __sctp_write_space(asoc); + sctp_wake_up_waiters(sk, asoc); sctp_association_put(asoc); } @@ -6705,7 +6745,7 @@ do_nonblock: goto out; } -void sctp_data_ready(struct sock *sk, int len) +void sctp_data_ready(struct sock *sk) { struct socket_wq *wq; diff --git a/net/sctp/ulpqueue.c b/net/sctp/ulpqueue.c index 5dc94117e9d4..7144eb6a1b95 100644 --- a/net/sctp/ulpqueue.c +++ b/net/sctp/ulpqueue.c @@ -259,7 +259,7 @@ int sctp_ulpq_tail_event(struct sctp_ulpq *ulpq, struct sctp_ulpevent *event) sctp_ulpq_clear_pd(ulpq); if (queue == &sk->sk_receive_queue) - sk->sk_data_ready(sk, 0); + sk->sk_data_ready(sk); return 1; out_free: @@ -1135,5 +1135,5 @@ void sctp_ulpq_abort_pd(struct sctp_ulpq *ulpq, gfp_t gfp) /* If there is data waiting, send it up the socket now. */ if (sctp_ulpq_clear_pd(ulpq) || ev) - sk->sk_data_ready(sk, 0); + sk->sk_data_ready(sk); } diff --git a/net/sunrpc/Kconfig b/net/sunrpc/Kconfig index 241b54f30204..0754d0f466d2 100644 --- a/net/sunrpc/Kconfig +++ b/net/sunrpc/Kconfig @@ -9,19 +9,6 @@ config SUNRPC_BACKCHANNEL bool depends on SUNRPC -config SUNRPC_XPRT_RDMA - tristate - depends on SUNRPC && INFINIBAND && INFINIBAND_ADDR_TRANS - default SUNRPC && INFINIBAND - help - This option allows the NFS client and server to support - an RDMA-enabled transport. - - To compile RPC client RDMA transport support as a module, - choose M here: the module will be called xprtrdma. - - If unsure, say N. - config SUNRPC_SWAP bool depends on SUNRPC @@ -57,3 +44,29 @@ config SUNRPC_DEBUG but makes troubleshooting NFS issues significantly harder. If unsure, say Y. + +config SUNRPC_XPRT_RDMA_CLIENT + tristate "RPC over RDMA Client Support" + depends on SUNRPC && INFINIBAND && INFINIBAND_ADDR_TRANS + default SUNRPC && INFINIBAND + help + This option allows the NFS client to support an RDMA-enabled + transport. + + To compile RPC client RDMA transport support as a module, + choose M here: the module will be called xprtrdma. + + If unsure, say N. + +config SUNRPC_XPRT_RDMA_SERVER + tristate "RPC over RDMA Server Support" + depends on SUNRPC && INFINIBAND && INFINIBAND_ADDR_TRANS + default SUNRPC && INFINIBAND + help + This option allows the NFS server to support an RDMA-enabled + transport. + + To compile RPC server RDMA transport support as a module, + choose M here: the module will be called svcrdma. + + If unsure, say N. diff --git a/net/sunrpc/Makefile b/net/sunrpc/Makefile index 8209a0411bca..e5a7a1cac8f3 100644 --- a/net/sunrpc/Makefile +++ b/net/sunrpc/Makefile @@ -5,7 +5,8 @@ obj-$(CONFIG_SUNRPC) += sunrpc.o obj-$(CONFIG_SUNRPC_GSS) += auth_gss/ -obj-$(CONFIG_SUNRPC_XPRT_RDMA) += xprtrdma/ + +obj-y += xprtrdma/ sunrpc-y := clnt.o xprt.o socklib.o xprtsock.o sched.o \ auth.o auth_null.o auth_unix.o auth_generic.o \ diff --git a/net/sunrpc/backchannel_rqst.c b/net/sunrpc/backchannel_rqst.c index e860d4f7ed2a..3513d559bc45 100644 --- a/net/sunrpc/backchannel_rqst.c +++ b/net/sunrpc/backchannel_rqst.c @@ -212,39 +212,23 @@ out: } EXPORT_SYMBOL_GPL(xprt_destroy_backchannel); -/* - * One or more rpc_rqst structure have been preallocated during the - * backchannel setup. Buffer space for the send and private XDR buffers - * has been preallocated as well. Use xprt_alloc_bc_request to allocate - * to this request. Use xprt_free_bc_request to return it. - * - * We know that we're called in soft interrupt context, grab the spin_lock - * since there is no need to grab the bottom half spin_lock. - * - * Return an available rpc_rqst, otherwise NULL if non are available. - */ -struct rpc_rqst *xprt_alloc_bc_request(struct rpc_xprt *xprt) +static struct rpc_rqst *xprt_alloc_bc_request(struct rpc_xprt *xprt, __be32 xid) { - struct rpc_rqst *req; + struct rpc_rqst *req = NULL; dprintk("RPC: allocate a backchannel request\n"); - spin_lock(&xprt->bc_pa_lock); - if (!list_empty(&xprt->bc_pa_list)) { - req = list_first_entry(&xprt->bc_pa_list, struct rpc_rqst, - rq_bc_pa_list); - list_del(&req->rq_bc_pa_list); - } else { - req = NULL; - } - spin_unlock(&xprt->bc_pa_lock); + if (list_empty(&xprt->bc_pa_list)) + goto not_found; - if (req != NULL) { - set_bit(RPC_BC_PA_IN_USE, &req->rq_bc_pa_state); - req->rq_reply_bytes_recvd = 0; - req->rq_bytes_sent = 0; - memcpy(&req->rq_private_buf, &req->rq_rcv_buf, + req = list_first_entry(&xprt->bc_pa_list, struct rpc_rqst, + rq_bc_pa_list); + req->rq_reply_bytes_recvd = 0; + req->rq_bytes_sent = 0; + memcpy(&req->rq_private_buf, &req->rq_rcv_buf, sizeof(req->rq_private_buf)); - } + req->rq_xid = xid; + req->rq_connect_cookie = xprt->connect_cookie; +not_found: dprintk("RPC: backchannel req=%p\n", req); return req; } @@ -259,6 +243,7 @@ void xprt_free_bc_request(struct rpc_rqst *req) dprintk("RPC: free backchannel req=%p\n", req); + req->rq_connect_cookie = xprt->connect_cookie - 1; smp_mb__before_clear_bit(); WARN_ON_ONCE(!test_bit(RPC_BC_PA_IN_USE, &req->rq_bc_pa_state)); clear_bit(RPC_BC_PA_IN_USE, &req->rq_bc_pa_state); @@ -281,7 +266,57 @@ void xprt_free_bc_request(struct rpc_rqst *req) * may be reused by a new callback request. */ spin_lock_bh(&xprt->bc_pa_lock); - list_add(&req->rq_bc_pa_list, &xprt->bc_pa_list); + list_add_tail(&req->rq_bc_pa_list, &xprt->bc_pa_list); spin_unlock_bh(&xprt->bc_pa_lock); } +/* + * One or more rpc_rqst structure have been preallocated during the + * backchannel setup. Buffer space for the send and private XDR buffers + * has been preallocated as well. Use xprt_alloc_bc_request to allocate + * to this request. Use xprt_free_bc_request to return it. + * + * We know that we're called in soft interrupt context, grab the spin_lock + * since there is no need to grab the bottom half spin_lock. + * + * Return an available rpc_rqst, otherwise NULL if non are available. + */ +struct rpc_rqst *xprt_lookup_bc_request(struct rpc_xprt *xprt, __be32 xid) +{ + struct rpc_rqst *req; + + spin_lock(&xprt->bc_pa_lock); + list_for_each_entry(req, &xprt->bc_pa_list, rq_bc_pa_list) { + if (req->rq_connect_cookie != xprt->connect_cookie) + continue; + if (req->rq_xid == xid) + goto found; + } + req = xprt_alloc_bc_request(xprt, xid); +found: + spin_unlock(&xprt->bc_pa_lock); + return req; +} + +/* + * Add callback request to callback list. The callback + * service sleeps on the sv_cb_waitq waiting for new + * requests. Wake it up after adding enqueing the + * request. + */ +void xprt_complete_bc_request(struct rpc_rqst *req, uint32_t copied) +{ + struct rpc_xprt *xprt = req->rq_xprt; + struct svc_serv *bc_serv = xprt->bc_serv; + + req->rq_private_buf.len = copied; + set_bit(RPC_BC_PA_IN_USE, &req->rq_bc_pa_state); + + dprintk("RPC: add callback request to list\n"); + spin_lock(&bc_serv->sv_cb_lock); + list_del(&req->rq_bc_pa_list); + list_add(&req->rq_bc_list, &bc_serv->sv_cb_list); + wake_up(&bc_serv->sv_cb_waitq); + spin_unlock(&bc_serv->sv_cb_lock); +} + diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index 0edada973434..2e6ab10734f6 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -438,6 +438,38 @@ out_no_rpciod: return ERR_PTR(err); } +struct rpc_clnt *rpc_create_xprt(struct rpc_create_args *args, + struct rpc_xprt *xprt) +{ + struct rpc_clnt *clnt = NULL; + + clnt = rpc_new_client(args, xprt, NULL); + if (IS_ERR(clnt)) + return clnt; + + if (!(args->flags & RPC_CLNT_CREATE_NOPING)) { + int err = rpc_ping(clnt); + if (err != 0) { + rpc_shutdown_client(clnt); + return ERR_PTR(err); + } + } + + clnt->cl_softrtry = 1; + if (args->flags & RPC_CLNT_CREATE_HARDRTRY) + clnt->cl_softrtry = 0; + + if (args->flags & RPC_CLNT_CREATE_AUTOBIND) + clnt->cl_autobind = 1; + if (args->flags & RPC_CLNT_CREATE_DISCRTRY) + clnt->cl_discrtry = 1; + if (!(args->flags & RPC_CLNT_CREATE_QUIET)) + clnt->cl_chatty = 1; + + return clnt; +} +EXPORT_SYMBOL_GPL(rpc_create_xprt); + /** * rpc_create - create an RPC client and transport with one call * @args: rpc_clnt create argument structure @@ -451,7 +483,6 @@ out_no_rpciod: struct rpc_clnt *rpc_create(struct rpc_create_args *args) { struct rpc_xprt *xprt; - struct rpc_clnt *clnt; struct xprt_create xprtargs = { .net = args->net, .ident = args->protocol, @@ -515,30 +546,7 @@ struct rpc_clnt *rpc_create(struct rpc_create_args *args) if (args->flags & RPC_CLNT_CREATE_NONPRIVPORT) xprt->resvport = 0; - clnt = rpc_new_client(args, xprt, NULL); - if (IS_ERR(clnt)) - return clnt; - - if (!(args->flags & RPC_CLNT_CREATE_NOPING)) { - int err = rpc_ping(clnt); - if (err != 0) { - rpc_shutdown_client(clnt); - return ERR_PTR(err); - } - } - - clnt->cl_softrtry = 1; - if (args->flags & RPC_CLNT_CREATE_HARDRTRY) - clnt->cl_softrtry = 0; - - if (args->flags & RPC_CLNT_CREATE_AUTOBIND) - clnt->cl_autobind = 1; - if (args->flags & RPC_CLNT_CREATE_DISCRTRY) - clnt->cl_discrtry = 1; - if (!(args->flags & RPC_CLNT_CREATE_QUIET)) - clnt->cl_chatty = 1; - - return clnt; + return rpc_create_xprt(args, xprt); } EXPORT_SYMBOL_GPL(rpc_create); @@ -1363,6 +1371,7 @@ rpc_restart_call_prepare(struct rpc_task *task) if (RPC_ASSASSINATED(task)) return 0; task->tk_action = call_start; + task->tk_status = 0; if (task->tk_ops->rpc_call_prepare != NULL) task->tk_action = rpc_prepare_task; return 1; @@ -1379,6 +1388,7 @@ rpc_restart_call(struct rpc_task *task) if (RPC_ASSASSINATED(task)) return 0; task->tk_action = call_start; + task->tk_status = 0; return 1; } EXPORT_SYMBOL_GPL(rpc_restart_call); @@ -1728,9 +1738,7 @@ call_bind_status(struct rpc_task *task) case -EPROTONOSUPPORT: dprintk("RPC: %5u remote rpcbind version unavailable, retrying\n", task->tk_pid); - task->tk_status = 0; - task->tk_action = call_bind; - return; + goto retry_timeout; case -ECONNREFUSED: /* connection problems */ case -ECONNRESET: case -ECONNABORTED: @@ -1756,6 +1764,7 @@ call_bind_status(struct rpc_task *task) return; retry_timeout: + task->tk_status = 0; task->tk_action = call_timeout; } @@ -1798,21 +1807,19 @@ call_connect_status(struct rpc_task *task) trace_rpc_connect_status(task, status); task->tk_status = 0; switch (status) { - /* if soft mounted, test if we've timed out */ - case -ETIMEDOUT: - task->tk_action = call_timeout; - return; case -ECONNREFUSED: case -ECONNRESET: case -ECONNABORTED: case -ENETUNREACH: case -EHOSTUNREACH: - /* retry with existing socket, after a delay */ - rpc_delay(task, 3*HZ); if (RPC_IS_SOFTCONN(task)) break; + /* retry with existing socket, after a delay */ + rpc_delay(task, 3*HZ); case -EAGAIN: - task->tk_action = call_bind; + /* Check for timeouts before looping back to call_bind */ + case -ETIMEDOUT: + task->tk_action = call_timeout; return; case 0: clnt->cl_stats->netreconn++; @@ -2007,6 +2014,10 @@ call_status(struct rpc_task *task) case -EHOSTDOWN: case -EHOSTUNREACH: case -ENETUNREACH: + if (RPC_IS_SOFTCONN(task)) { + rpc_exit(task, status); + break; + } /* * Delay any retries for 3 seconds, then handle as if it * were a timeout. diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c index ff3cc4bf4b24..25578afe1548 100644 --- a/net/sunrpc/sched.c +++ b/net/sunrpc/sched.c @@ -637,7 +637,8 @@ static void __rpc_queue_timer_fn(unsigned long ptr) static void __rpc_atrun(struct rpc_task *task) { - task->tk_status = 0; + if (task->tk_status == -ETIMEDOUT) + task->tk_status = 0; } /* diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index b6e59f0a9475..43bcb4699d69 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -60,7 +60,7 @@ static struct svc_sock *svc_setup_socket(struct svc_serv *, struct socket *, int flags); -static void svc_udp_data_ready(struct sock *, int); +static void svc_udp_data_ready(struct sock *); static int svc_udp_recvfrom(struct svc_rqst *); static int svc_udp_sendto(struct svc_rqst *); static void svc_sock_detach(struct svc_xprt *); @@ -403,14 +403,14 @@ static void svc_sock_setbufsize(struct socket *sock, unsigned int snd, /* * INET callback when data has been received on the socket. */ -static void svc_udp_data_ready(struct sock *sk, int count) +static void svc_udp_data_ready(struct sock *sk) { struct svc_sock *svsk = (struct svc_sock *)sk->sk_user_data; wait_queue_head_t *wq = sk_sleep(sk); if (svsk) { - dprintk("svc: socket %p(inet %p), count=%d, busy=%d\n", - svsk, sk, count, + dprintk("svc: socket %p(inet %p), busy=%d\n", + svsk, sk, test_bit(XPT_BUSY, &svsk->sk_xprt.xpt_flags)); set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags); svc_xprt_enqueue(&svsk->sk_xprt); @@ -731,7 +731,7 @@ static void svc_udp_init(struct svc_sock *svsk, struct svc_serv *serv) * A data_ready event on a listening socket means there's a connection * pending. Do not use state_change as a substitute for it. */ -static void svc_tcp_listen_data_ready(struct sock *sk, int count_unused) +static void svc_tcp_listen_data_ready(struct sock *sk) { struct svc_sock *svsk = (struct svc_sock *)sk->sk_user_data; wait_queue_head_t *wq; @@ -783,7 +783,7 @@ static void svc_tcp_state_change(struct sock *sk) wake_up_interruptible_all(wq); } -static void svc_tcp_data_ready(struct sock *sk, int count) +static void svc_tcp_data_ready(struct sock *sk) { struct svc_sock *svsk = (struct svc_sock *)sk->sk_user_data; wait_queue_head_t *wq = sk_sleep(sk); @@ -1397,6 +1397,22 @@ static struct svc_sock *svc_setup_socket(struct svc_serv *serv, return svsk; } +bool svc_alien_sock(struct net *net, int fd) +{ + int err; + struct socket *sock = sockfd_lookup(fd, &err); + bool ret = false; + + if (!sock) + goto out; + if (sock_net(sock->sk) != net) + ret = true; + sockfd_put(sock); +out: + return ret; +} +EXPORT_SYMBOL_GPL(svc_alien_sock); + /** * svc_addsock - add a listener socket to an RPC service * @serv: pointer to RPC service to which to add a new listener diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c index 1504bb11e4f3..dd97ba3c4456 100644 --- a/net/sunrpc/xdr.c +++ b/net/sunrpc/xdr.c @@ -833,8 +833,20 @@ xdr_buf_from_iov(struct kvec *iov, struct xdr_buf *buf) } EXPORT_SYMBOL_GPL(xdr_buf_from_iov); -/* Sets subbuf to the portion of buf of length len beginning base bytes - * from the start of buf. Returns -1 if base of length are out of bounds. */ +/** + * xdr_buf_subsegment - set subbuf to a portion of buf + * @buf: an xdr buffer + * @subbuf: the result buffer + * @base: beginning of range in bytes + * @len: length of range in bytes + * + * sets @subbuf to an xdr buffer representing the portion of @buf of + * length @len starting at offset @base. + * + * @buf and @subbuf may be pointers to the same struct xdr_buf. + * + * Returns -1 if base of length are out of bounds. + */ int xdr_buf_subsegment(struct xdr_buf *buf, struct xdr_buf *subbuf, unsigned int base, unsigned int len) @@ -847,9 +859,8 @@ xdr_buf_subsegment(struct xdr_buf *buf, struct xdr_buf *subbuf, len -= subbuf->head[0].iov_len; base = 0; } else { - subbuf->head[0].iov_base = NULL; - subbuf->head[0].iov_len = 0; base -= buf->head[0].iov_len; + subbuf->head[0].iov_len = 0; } if (base < buf->page_len) { @@ -871,9 +882,8 @@ xdr_buf_subsegment(struct xdr_buf *buf, struct xdr_buf *subbuf, len -= subbuf->tail[0].iov_len; base = 0; } else { - subbuf->tail[0].iov_base = NULL; - subbuf->tail[0].iov_len = 0; base -= buf->tail[0].iov_len; + subbuf->tail[0].iov_len = 0; } if (base || len) diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index 7d4df99f761f..d173f79947c6 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -1383,15 +1383,3 @@ void xprt_put(struct rpc_xprt *xprt) if (atomic_dec_and_test(&xprt->count)) xprt_destroy(xprt); } - -/** - * xprt_get - return a reference to an RPC transport. - * @xprt: pointer to the transport - * - */ -struct rpc_xprt *xprt_get(struct rpc_xprt *xprt) -{ - if (atomic_inc_not_zero(&xprt->count)) - return xprt; - return NULL; -} diff --git a/net/sunrpc/xprtrdma/Makefile b/net/sunrpc/xprtrdma/Makefile index 5a8f268bdd30..da5136fd5694 100644 --- a/net/sunrpc/xprtrdma/Makefile +++ b/net/sunrpc/xprtrdma/Makefile @@ -1,8 +1,8 @@ -obj-$(CONFIG_SUNRPC_XPRT_RDMA) += xprtrdma.o +obj-$(CONFIG_SUNRPC_XPRT_RDMA_CLIENT) += xprtrdma.o xprtrdma-y := transport.o rpc_rdma.o verbs.o -obj-$(CONFIG_SUNRPC_XPRT_RDMA) += svcrdma.o +obj-$(CONFIG_SUNRPC_XPRT_RDMA_SERVER) += svcrdma.o svcrdma-y := svc_rdma.o svc_rdma_transport.o \ svc_rdma_marshal.o svc_rdma_sendto.o svc_rdma_recvfrom.o diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c index e03725bfe2b8..96ead526b125 100644 --- a/net/sunrpc/xprtrdma/rpc_rdma.c +++ b/net/sunrpc/xprtrdma/rpc_rdma.c @@ -649,9 +649,7 @@ rpcrdma_inline_fixup(struct rpc_rqst *rqst, char *srcp, int copy_len, int pad) break; page_base = 0; } - rqst->rq_rcv_buf.page_len = olen - copy_len; - } else - rqst->rq_rcv_buf.page_len = 0; + } if (copy_len && rqst->rq_rcv_buf.tail[0].iov_len) { curlen = copy_len; diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c index 0ce75524ed21..8d904e4eef15 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c +++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c @@ -90,6 +90,7 @@ static void rdma_build_arg_xdr(struct svc_rqst *rqstp, sge_no++; } rqstp->rq_respages = &rqstp->rq_pages[sge_no]; + rqstp->rq_next_page = rqstp->rq_respages + 1; /* We should never run out of SGE because the limit is defined to * support the max allowed RPC data length @@ -169,6 +170,7 @@ static int map_read_chunks(struct svcxprt_rdma *xprt, */ head->arg.pages[page_no] = rqstp->rq_arg.pages[page_no]; rqstp->rq_respages = &rqstp->rq_arg.pages[page_no+1]; + rqstp->rq_next_page = rqstp->rq_respages + 1; byte_count -= sge_bytes; ch_bytes -= sge_bytes; @@ -276,6 +278,7 @@ static int fast_reg_read_chunks(struct svcxprt_rdma *xprt, /* rq_respages points one past arg pages */ rqstp->rq_respages = &rqstp->rq_arg.pages[page_no]; + rqstp->rq_next_page = rqstp->rq_respages + 1; /* Create the reply and chunk maps */ offset = 0; @@ -520,13 +523,6 @@ next_sge: for (ch_no = 0; &rqstp->rq_pages[ch_no] < rqstp->rq_respages; ch_no++) rqstp->rq_pages[ch_no] = NULL; - /* - * Detach res pages. If svc_release sees any it will attempt to - * put them. - */ - while (rqstp->rq_next_page != rqstp->rq_respages) - *(--rqstp->rq_next_page) = NULL; - return err; } @@ -550,7 +546,7 @@ static int rdma_read_complete(struct svc_rqst *rqstp, /* rq_respages starts after the last arg page */ rqstp->rq_respages = &rqstp->rq_arg.pages[page_no]; - rqstp->rq_next_page = &rqstp->rq_arg.pages[page_no]; + rqstp->rq_next_page = rqstp->rq_respages + 1; /* Rebuild rq_arg head and tail. */ rqstp->rq_arg.head[0] = head->arg.head[0]; diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c index c1d124dc772b..7e024a51617e 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c +++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c @@ -265,6 +265,7 @@ static dma_addr_t dma_map_xdr(struct svcxprt_rdma *xprt, xdr_off -= xdr->head[0].iov_len; if (xdr_off < xdr->page_len) { /* This offset is in the page list */ + xdr_off += xdr->page_base; page = xdr->pages[xdr_off >> PAGE_SHIFT]; xdr_off &= ~PAGE_MASK; } else { @@ -625,6 +626,7 @@ static int send_reply(struct svcxprt_rdma *rdma, if (page_no+1 >= sge_no) ctxt->sge[page_no+1].length = 0; } + rqstp->rq_next_page = rqstp->rq_respages + 1; BUG_ON(sge_no > rdma->sc_max_sge); memset(&send_wr, 0, sizeof send_wr); ctxt->wr_op = IB_WR_SEND; diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c index 62e4f9bcc387..25688fa2207f 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_transport.c +++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c @@ -477,8 +477,7 @@ struct page *svc_rdma_get_page(void) while ((page = alloc_page(GFP_KERNEL)) == NULL) { /* If we can't get memory, wait a bit and try again */ - printk(KERN_INFO "svcrdma: out of memory...retrying in 1000 " - "jiffies.\n"); + printk(KERN_INFO "svcrdma: out of memory...retrying in 1s\n"); schedule_timeout_uninterruptible(msecs_to_jiffies(1000)); } return page; diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c index 285dc0884115..1eb9c468d0c9 100644 --- a/net/sunrpc/xprtrdma/transport.c +++ b/net/sunrpc/xprtrdma/transport.c @@ -733,7 +733,7 @@ static void __exit xprt_rdma_cleanup(void) { int rc; - dprintk(KERN_INFO "RPCRDMA Module Removed, deregister RPC RDMA transport\n"); + dprintk("RPCRDMA Module Removed, deregister RPC RDMA transport\n"); #ifdef RPC_DEBUG if (sunrpc_table_header) { unregister_sysctl_table(sunrpc_table_header); @@ -755,14 +755,14 @@ static int __init xprt_rdma_init(void) if (rc) return rc; - dprintk(KERN_INFO "RPCRDMA Module Init, register RPC RDMA transport\n"); + dprintk("RPCRDMA Module Init, register RPC RDMA transport\n"); - dprintk(KERN_INFO "Defaults:\n"); - dprintk(KERN_INFO "\tSlots %d\n" + dprintk("Defaults:\n"); + dprintk("\tSlots %d\n" "\tMaxInlineRead %d\n\tMaxInlineWrite %d\n", xprt_rdma_slot_table_entries, xprt_rdma_max_inline_read, xprt_rdma_max_inline_write); - dprintk(KERN_INFO "\tPadding %d\n\tMemreg %d\n", + dprintk("\tPadding %d\n\tMemreg %d\n", xprt_rdma_inline_write_padding, xprt_rdma_memreg_strategy); #ifdef RPC_DEBUG diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 0addefca8e77..25a3dcf15cae 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -254,7 +254,7 @@ struct sock_xprt { /* * Saved socket callback addresses */ - void (*old_data_ready)(struct sock *, int); + void (*old_data_ready)(struct sock *); void (*old_state_change)(struct sock *); void (*old_write_space)(struct sock *); void (*old_error_report)(struct sock *); @@ -909,6 +909,12 @@ static void xs_tcp_close(struct rpc_xprt *xprt) xs_tcp_shutdown(xprt); } +static void xs_xprt_free(struct rpc_xprt *xprt) +{ + xs_free_peer_addresses(xprt); + xprt_free(xprt); +} + /** * xs_destroy - prepare to shutdown a transport * @xprt: doomed transport @@ -919,8 +925,7 @@ static void xs_destroy(struct rpc_xprt *xprt) dprintk("RPC: xs_destroy xprt %p\n", xprt); xs_close(xprt); - xs_free_peer_addresses(xprt); - xprt_free(xprt); + xs_xprt_free(xprt); module_put(THIS_MODULE); } @@ -946,7 +951,7 @@ static int xs_local_copy_to_xdr(struct xdr_buf *xdr, struct sk_buff *skb) * * Currently this assumes we can read the whole reply in a single gulp. */ -static void xs_local_data_ready(struct sock *sk, int len) +static void xs_local_data_ready(struct sock *sk) { struct rpc_task *task; struct rpc_xprt *xprt; @@ -1009,7 +1014,7 @@ static void xs_local_data_ready(struct sock *sk, int len) * @len: how much data to read * */ -static void xs_udp_data_ready(struct sock *sk, int len) +static void xs_udp_data_ready(struct sock *sk) { struct rpc_task *task; struct rpc_xprt *xprt; @@ -1306,41 +1311,29 @@ static inline int xs_tcp_read_reply(struct rpc_xprt *xprt, * If we're unable to obtain the rpc_rqst we schedule the closing of the * connection and return -1. */ -static inline int xs_tcp_read_callback(struct rpc_xprt *xprt, +static int xs_tcp_read_callback(struct rpc_xprt *xprt, struct xdr_skb_reader *desc) { struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt); struct rpc_rqst *req; - req = xprt_alloc_bc_request(xprt); + /* Look up and lock the request corresponding to the given XID */ + spin_lock(&xprt->transport_lock); + req = xprt_lookup_bc_request(xprt, transport->tcp_xid); if (req == NULL) { + spin_unlock(&xprt->transport_lock); printk(KERN_WARNING "Callback slot table overflowed\n"); xprt_force_disconnect(xprt); return -1; } - req->rq_xid = transport->tcp_xid; dprintk("RPC: read callback XID %08x\n", ntohl(req->rq_xid)); xs_tcp_read_common(xprt, desc, req); - if (!(transport->tcp_flags & TCP_RCV_COPY_DATA)) { - struct svc_serv *bc_serv = xprt->bc_serv; - - /* - * Add callback request to callback list. The callback - * service sleeps on the sv_cb_waitq waiting for new - * requests. Wake it up after adding enqueing the - * request. - */ - dprintk("RPC: add callback request to list\n"); - spin_lock(&bc_serv->sv_cb_lock); - list_add(&req->rq_bc_list, &bc_serv->sv_cb_list); - spin_unlock(&bc_serv->sv_cb_lock); - wake_up(&bc_serv->sv_cb_waitq); - } - - req->rq_private_buf.len = transport->tcp_copied; + if (!(transport->tcp_flags & TCP_RCV_COPY_DATA)) + xprt_complete_bc_request(req, transport->tcp_copied); + spin_unlock(&xprt->transport_lock); return 0; } @@ -1444,7 +1437,7 @@ static int xs_tcp_data_recv(read_descriptor_t *rd_desc, struct sk_buff *skb, uns * @bytes: how much data to read * */ -static void xs_tcp_data_ready(struct sock *sk, int bytes) +static void xs_tcp_data_ready(struct sock *sk) { struct rpc_xprt *xprt; read_descriptor_t rd_desc; @@ -2544,6 +2537,10 @@ static void bc_close(struct rpc_xprt *xprt) static void bc_destroy(struct rpc_xprt *xprt) { + dprintk("RPC: bc_destroy xprt %p\n", xprt); + + xs_xprt_free(xprt); + module_put(THIS_MODULE); } static struct rpc_xprt_ops xs_local_ops = { @@ -2744,7 +2741,7 @@ static struct rpc_xprt *xs_setup_local(struct xprt_create *args) return xprt; ret = ERR_PTR(-EINVAL); out_err: - xprt_free(xprt); + xs_xprt_free(xprt); return ret; } @@ -2822,7 +2819,7 @@ static struct rpc_xprt *xs_setup_udp(struct xprt_create *args) return xprt; ret = ERR_PTR(-EINVAL); out_err: - xprt_free(xprt); + xs_xprt_free(xprt); return ret; } @@ -2897,12 +2894,11 @@ static struct rpc_xprt *xs_setup_tcp(struct xprt_create *args) xprt->address_strings[RPC_DISPLAY_ADDR], xprt->address_strings[RPC_DISPLAY_PROTO]); - if (try_module_get(THIS_MODULE)) return xprt; ret = ERR_PTR(-EINVAL); out_err: - xprt_free(xprt); + xs_xprt_free(xprt); return ret; } @@ -2919,15 +2915,6 @@ static struct rpc_xprt *xs_setup_bc_tcp(struct xprt_create *args) struct svc_sock *bc_sock; struct rpc_xprt *ret; - if (args->bc_xprt->xpt_bc_xprt) { - /* - * This server connection already has a backchannel - * transport; we can't create a new one, as we wouldn't - * be able to match replies based on xid any more. So, - * reuse the already-existing one: - */ - return args->bc_xprt->xpt_bc_xprt; - } xprt = xs_setup_xprt(args, xprt_tcp_slot_table_entries, xprt_tcp_slot_table_entries); if (IS_ERR(xprt)) @@ -2985,13 +2972,14 @@ static struct rpc_xprt *xs_setup_bc_tcp(struct xprt_create *args) */ xprt_set_connected(xprt); - if (try_module_get(THIS_MODULE)) return xprt; + + args->bc_xprt->xpt_bc_xprt = NULL; xprt_put(xprt); ret = ERR_PTR(-EINVAL); out_err: - xprt_free(xprt); + xs_xprt_free(xprt); return ret; } diff --git a/net/tipc/net.c b/net/tipc/net.c index 0374a817631e..4c564eb69e1a 100644 --- a/net/tipc/net.c +++ b/net/tipc/net.c @@ -182,6 +182,8 @@ void tipc_net_start(u32 addr) tipc_bclink_init(); write_unlock_bh(&tipc_net_lock); + tipc_nametbl_publish(TIPC_CFG_SRV, tipc_own_addr, tipc_own_addr, + TIPC_ZONE_SCOPE, 0, tipc_own_addr); pr_info("Started in network mode\n"); pr_info("Own node address %s, network identity %u\n", tipc_addr_string_fill(addr_string, tipc_own_addr), tipc_net_id); @@ -192,6 +194,7 @@ void tipc_net_stop(void) if (!tipc_own_addr) return; + tipc_nametbl_withdraw(TIPC_CFG_SRV, tipc_own_addr, 0, tipc_own_addr); write_lock_bh(&tipc_net_lock); tipc_bearer_stop(); tipc_bclink_stop(); diff --git a/net/tipc/server.c b/net/tipc/server.c index 646a930eefbf..a538a02f869b 100644 --- a/net/tipc/server.c +++ b/net/tipc/server.c @@ -119,7 +119,7 @@ static struct tipc_conn *tipc_conn_lookup(struct tipc_server *s, int conid) return con; } -static void sock_data_ready(struct sock *sk, int unused) +static void sock_data_ready(struct sock *sk) { struct tipc_conn *con; @@ -297,7 +297,7 @@ static int tipc_accept_from_sock(struct tipc_conn *con) newcon->usr_data = s->tipc_conn_new(newcon->conid); /* Wake up receive process in case of 'SYN+' message */ - newsock->sk->sk_data_ready(newsock->sk, 0); + newsock->sk->sk_data_ready(newsock->sk); return ret; } diff --git a/net/tipc/socket.c b/net/tipc/socket.c index 29b7f26a12cf..3c0256962f7d 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -45,7 +45,7 @@ #define CONN_TIMEOUT_DEFAULT 8000 /* default connect timeout = 8s */ static int backlog_rcv(struct sock *sk, struct sk_buff *skb); -static void tipc_data_ready(struct sock *sk, int len); +static void tipc_data_ready(struct sock *sk); static void tipc_write_space(struct sock *sk); static int tipc_release(struct socket *sock); static int tipc_accept(struct socket *sock, struct socket *new_sock, int flags); @@ -301,7 +301,6 @@ static int tipc_release(struct socket *sock) struct tipc_sock *tsk; struct tipc_port *port; struct sk_buff *buf; - int res; /* * Exit if socket isn't fully initialized (occurs when a failed accept() @@ -349,7 +348,7 @@ static int tipc_release(struct socket *sock) sock_put(sk); sock->sk = NULL; - return res; + return 0; } /** @@ -1249,7 +1248,7 @@ static void tipc_write_space(struct sock *sk) * @sk: socket * @len: the length of messages */ -static void tipc_data_ready(struct sock *sk, int len) +static void tipc_data_ready(struct sock *sk) { struct socket_wq *wq; @@ -1411,7 +1410,7 @@ static u32 filter_rcv(struct sock *sk, struct sk_buff *buf) __skb_queue_tail(&sk->sk_receive_queue, buf); skb_set_owner_r(buf, sk); - sk->sk_data_ready(sk, 0); + sk->sk_data_ready(sk); return TIPC_OK; } diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 94404f19f9de..bb7e8ba821f4 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -1217,7 +1217,7 @@ restart: __skb_queue_tail(&other->sk_receive_queue, skb); spin_unlock(&other->sk_receive_queue.lock); unix_state_unlock(other); - other->sk_data_ready(other, 0); + other->sk_data_ready(other); sock_put(other); return 0; @@ -1600,7 +1600,7 @@ restart: if (max_level > unix_sk(other)->recursion_level) unix_sk(other)->recursion_level = max_level; unix_state_unlock(other); - other->sk_data_ready(other, len); + other->sk_data_ready(other); sock_put(other); scm_destroy(siocb->scm); return len; @@ -1706,7 +1706,7 @@ static int unix_stream_sendmsg(struct kiocb *kiocb, struct socket *sock, if (max_level > unix_sk(other)->recursion_level) unix_sk(other)->recursion_level = max_level; unix_state_unlock(other); - other->sk_data_ready(other, size); + other->sk_data_ready(other); sent += size; } diff --git a/net/vmw_vsock/vmci_transport_notify.c b/net/vmw_vsock/vmci_transport_notify.c index 9a730744e7bc..9b7f207f2bee 100644 --- a/net/vmw_vsock/vmci_transport_notify.c +++ b/net/vmw_vsock/vmci_transport_notify.c @@ -315,7 +315,7 @@ vmci_transport_handle_wrote(struct sock *sk, struct vsock_sock *vsk = vsock_sk(sk); PKT_FIELD(vsk, sent_waiting_read) = false; #endif - sk->sk_data_ready(sk, 0); + sk->sk_data_ready(sk); } static void vmci_transport_notify_pkt_socket_init(struct sock *sk) diff --git a/net/vmw_vsock/vmci_transport_notify_qstate.c b/net/vmw_vsock/vmci_transport_notify_qstate.c index 622bd7aa1016..dc9c7929a2f9 100644 --- a/net/vmw_vsock/vmci_transport_notify_qstate.c +++ b/net/vmw_vsock/vmci_transport_notify_qstate.c @@ -92,7 +92,7 @@ vmci_transport_handle_wrote(struct sock *sk, bool bottom_half, struct sockaddr_vm *dst, struct sockaddr_vm *src) { - sk->sk_data_ready(sk, 0); + sk->sk_data_ready(sk); } static void vsock_block_update_write_window(struct sock *sk) @@ -290,7 +290,7 @@ vmci_transport_notify_pkt_recv_post_dequeue( /* See the comment in * vmci_transport_notify_pkt_send_post_enqueue(). */ - sk->sk_data_ready(sk, 0); + sk->sk_data_ready(sk); } return err; diff --git a/net/x25/af_x25.c b/net/x25/af_x25.c index 6177479c7de9..5ad4418ef093 100644 --- a/net/x25/af_x25.c +++ b/net/x25/af_x25.c @@ -1064,7 +1064,7 @@ int x25_rx_call_request(struct sk_buff *skb, struct x25_neigh *nb, x25_start_heartbeat(make); if (!sock_flag(sk, SOCK_DEAD)) - sk->sk_data_ready(sk, skb->len); + sk->sk_data_ready(sk); rc = 1; sock_put(sk); out: diff --git a/net/x25/x25_in.c b/net/x25/x25_in.c index d1b0dc79bb6f..7ac50098a375 100644 --- a/net/x25/x25_in.c +++ b/net/x25/x25_in.c @@ -79,7 +79,7 @@ static int x25_queue_rx_frame(struct sock *sk, struct sk_buff *skb, int more) skb_set_owner_r(skbn, sk); skb_queue_tail(&sk->sk_receive_queue, skbn); if (!sock_flag(sk, SOCK_DEAD)) - sk->sk_data_ready(sk, skbn->len); + sk->sk_data_ready(sk); return 0; } |