diff options
Diffstat (limited to 'net')
144 files changed, 2256 insertions, 2340 deletions
diff --git a/net/802/garp.c b/net/802/garp.c index 941f2a324d3a..c1df2dad8c6b 100644 --- a/net/802/garp.c +++ b/net/802/garp.c @@ -346,8 +346,8 @@ int garp_request_join(const struct net_device *dev, const struct garp_application *appl, const void *data, u8 len, u8 type) { - struct garp_port *port = dev->garp_port; - struct garp_applicant *app = port->applicants[appl->type]; + struct garp_port *port = rtnl_dereference(dev->garp_port); + struct garp_applicant *app = rtnl_dereference(port->applicants[appl->type]); struct garp_attr *attr; spin_lock_bh(&app->lock); @@ -366,8 +366,8 @@ void garp_request_leave(const struct net_device *dev, const struct garp_application *appl, const void *data, u8 len, u8 type) { - struct garp_port *port = dev->garp_port; - struct garp_applicant *app = port->applicants[appl->type]; + struct garp_port *port = rtnl_dereference(dev->garp_port); + struct garp_applicant *app = rtnl_dereference(port->applicants[appl->type]); struct garp_attr *attr; spin_lock_bh(&app->lock); @@ -546,11 +546,11 @@ static int garp_init_port(struct net_device *dev) static void garp_release_port(struct net_device *dev) { - struct garp_port *port = dev->garp_port; + struct garp_port *port = rtnl_dereference(dev->garp_port); unsigned int i; for (i = 0; i <= GARP_APPLICATION_MAX; i++) { - if (port->applicants[i]) + if (rtnl_dereference(port->applicants[i])) return; } rcu_assign_pointer(dev->garp_port, NULL); @@ -565,7 +565,7 @@ int garp_init_applicant(struct net_device *dev, struct garp_application *appl) ASSERT_RTNL(); - if (!dev->garp_port) { + if (!rtnl_dereference(dev->garp_port)) { err = garp_init_port(dev); if (err < 0) goto err1; @@ -601,8 +601,8 @@ EXPORT_SYMBOL_GPL(garp_init_applicant); void garp_uninit_applicant(struct net_device *dev, struct garp_application *appl) { - struct garp_port *port = dev->garp_port; - struct garp_applicant *app = port->applicants[appl->type]; + struct garp_port *port = rtnl_dereference(dev->garp_port); + struct garp_applicant *app = rtnl_dereference(port->applicants[appl->type]); ASSERT_RTNL(); diff --git a/net/802/stp.c b/net/802/stp.c index 53c8f77f0ccd..978c30b1b36b 100644 --- a/net/802/stp.c +++ b/net/802/stp.c @@ -21,8 +21,8 @@ #define GARP_ADDR_MAX 0x2F #define GARP_ADDR_RANGE (GARP_ADDR_MAX - GARP_ADDR_MIN) -static const struct stp_proto *garp_protos[GARP_ADDR_RANGE + 1] __read_mostly; -static const struct stp_proto *stp_proto __read_mostly; +static const struct stp_proto __rcu *garp_protos[GARP_ADDR_RANGE + 1] __read_mostly; +static const struct stp_proto __rcu *stp_proto __read_mostly; static struct llc_sap *sap __read_mostly; static unsigned int sap_registered; diff --git a/net/8021q/vlan.c b/net/8021q/vlan.c index 05b867e43757..52077ca22072 100644 --- a/net/8021q/vlan.c +++ b/net/8021q/vlan.c @@ -112,7 +112,7 @@ void unregister_vlan_dev(struct net_device *dev, struct list_head *head) ASSERT_RTNL(); - grp = real_dev->vlgrp; + grp = rtnl_dereference(real_dev->vlgrp); BUG_ON(!grp); /* Take it out of our own structures, but be sure to interlock with @@ -177,7 +177,7 @@ int register_vlan_dev(struct net_device *dev) struct vlan_group *grp, *ngrp = NULL; int err; - grp = real_dev->vlgrp; + grp = rtnl_dereference(real_dev->vlgrp); if (!grp) { ngrp = grp = vlan_group_alloc(real_dev); if (!grp) @@ -385,7 +385,7 @@ static int vlan_device_event(struct notifier_block *unused, unsigned long event, dev->netdev_ops->ndo_vlan_rx_add_vid(dev, 0); } - grp = dev->vlgrp; + grp = rtnl_dereference(dev->vlgrp); if (!grp) goto out; diff --git a/net/9p/client.c b/net/9p/client.c index 83bf0541d66f..a848bca9fbff 100644 --- a/net/9p/client.c +++ b/net/9p/client.c @@ -450,32 +450,43 @@ static int p9_check_errors(struct p9_client *c, struct p9_req_t *req) return err; } - if (type == P9_RERROR) { + if (type == P9_RERROR || type == P9_RLERROR) { int ecode; - char *ename; - err = p9pdu_readf(req->rc, c->proto_version, "s?d", - &ename, &ecode); - if (err) { - P9_DPRINTK(P9_DEBUG_ERROR, "couldn't parse error%d\n", - err); - return err; - } + if (!p9_is_proto_dotl(c)) { + char *ename; - if (p9_is_proto_dotu(c) || - p9_is_proto_dotl(c)) - err = -ecode; + err = p9pdu_readf(req->rc, c->proto_version, "s?d", + &ename, &ecode); + if (err) + goto out_err; + + if (p9_is_proto_dotu(c)) + err = -ecode; + + if (!err || !IS_ERR_VALUE(err)) { + err = p9_errstr2errno(ename, strlen(ename)); + + P9_DPRINTK(P9_DEBUG_9P, "<<< RERROR (%d) %s\n", -ecode, ename); - if (!err || !IS_ERR_VALUE(err)) - err = p9_errstr2errno(ename, strlen(ename)); + kfree(ename); + } + } else { + err = p9pdu_readf(req->rc, c->proto_version, "d", &ecode); + err = -ecode; - P9_DPRINTK(P9_DEBUG_9P, "<<< RERROR (%d) %s\n", -ecode, ename); + P9_DPRINTK(P9_DEBUG_9P, "<<< RLERROR (%d)\n", -ecode); + } - kfree(ename); } else err = 0; return err; + +out_err: + P9_DPRINTK(P9_DEBUG_ERROR, "couldn't parse error%d\n", err); + + return err; } /** @@ -568,11 +579,14 @@ p9_client_rpc(struct p9_client *c, int8_t type, const char *fmt, ...) va_start(ap, fmt); err = p9pdu_vwritef(req->tc, c->proto_version, fmt, ap); va_end(ap); + if (err) + goto reterr; p9pdu_finalize(req->tc); err = c->trans_mod->request(c, req); if (err < 0) { - c->status = Disconnected; + if (err != -ERESTARTSYS) + c->status = Disconnected; goto reterr; } @@ -1151,12 +1165,44 @@ int p9_client_link(struct p9_fid *dfid, struct p9_fid *oldfid, char *newname) } EXPORT_SYMBOL(p9_client_link); +int p9_client_fsync(struct p9_fid *fid, int datasync) +{ + int err; + struct p9_client *clnt; + struct p9_req_t *req; + + P9_DPRINTK(P9_DEBUG_9P, ">>> TFSYNC fid %d datasync:%d\n", + fid->fid, datasync); + err = 0; + clnt = fid->clnt; + + req = p9_client_rpc(clnt, P9_TFSYNC, "dd", fid->fid, datasync); + if (IS_ERR(req)) { + err = PTR_ERR(req); + goto error; + } + + P9_DPRINTK(P9_DEBUG_9P, "<<< RFSYNC fid %d\n", fid->fid); + + p9_free_req(clnt, req); + +error: + return err; +} +EXPORT_SYMBOL(p9_client_fsync); + int p9_client_clunk(struct p9_fid *fid) { int err; struct p9_client *clnt; struct p9_req_t *req; + if (!fid) { + P9_EPRINTK(KERN_WARNING, "Trying to clunk with NULL fid\n"); + dump_stack(); + return 0; + } + P9_DPRINTK(P9_DEBUG_9P, ">>> TCLUNK fid %d\n", fid->fid); err = 0; clnt = fid->clnt; @@ -1240,16 +1286,13 @@ p9_client_read(struct p9_fid *fid, char *data, char __user *udata, u64 offset, if (data) { memmove(data, dataptr, count); - } - - if (udata) { + } else { err = copy_to_user(udata, dataptr, count); if (err) { err = -EFAULT; goto free_and_error; } } - p9_free_req(clnt, req); return count; @@ -1761,3 +1804,96 @@ error: } EXPORT_SYMBOL(p9_client_mkdir_dotl); + +int p9_client_lock_dotl(struct p9_fid *fid, struct p9_flock *flock, u8 *status) +{ + int err; + struct p9_client *clnt; + struct p9_req_t *req; + + err = 0; + clnt = fid->clnt; + P9_DPRINTK(P9_DEBUG_9P, ">>> TLOCK fid %d type %i flags %d " + "start %lld length %lld proc_id %d client_id %s\n", + fid->fid, flock->type, flock->flags, flock->start, + flock->length, flock->proc_id, flock->client_id); + + req = p9_client_rpc(clnt, P9_TLOCK, "dbdqqds", fid->fid, flock->type, + flock->flags, flock->start, flock->length, + flock->proc_id, flock->client_id); + + if (IS_ERR(req)) + return PTR_ERR(req); + + err = p9pdu_readf(req->rc, clnt->proto_version, "b", status); + if (err) { + p9pdu_dump(1, req->rc); + goto error; + } + P9_DPRINTK(P9_DEBUG_9P, "<<< RLOCK status %i\n", *status); +error: + p9_free_req(clnt, req); + return err; + +} +EXPORT_SYMBOL(p9_client_lock_dotl); + +int p9_client_getlock_dotl(struct p9_fid *fid, struct p9_getlock *glock) +{ + int err; + struct p9_client *clnt; + struct p9_req_t *req; + + err = 0; + clnt = fid->clnt; + P9_DPRINTK(P9_DEBUG_9P, ">>> TGETLOCK fid %d, type %i start %lld " + "length %lld proc_id %d client_id %s\n", fid->fid, glock->type, + glock->start, glock->length, glock->proc_id, glock->client_id); + + req = p9_client_rpc(clnt, P9_TGETLOCK, "dbqqds", fid->fid, glock->type, + glock->start, glock->length, glock->proc_id, glock->client_id); + + if (IS_ERR(req)) + return PTR_ERR(req); + + err = p9pdu_readf(req->rc, clnt->proto_version, "bqqds", &glock->type, + &glock->start, &glock->length, &glock->proc_id, + &glock->client_id); + if (err) { + p9pdu_dump(1, req->rc); + goto error; + } + P9_DPRINTK(P9_DEBUG_9P, "<<< RGETLOCK type %i start %lld length %lld " + "proc_id %d client_id %s\n", glock->type, glock->start, + glock->length, glock->proc_id, glock->client_id); +error: + p9_free_req(clnt, req); + return err; +} +EXPORT_SYMBOL(p9_client_getlock_dotl); + +int p9_client_readlink(struct p9_fid *fid, char **target) +{ + int err; + struct p9_client *clnt; + struct p9_req_t *req; + + err = 0; + clnt = fid->clnt; + P9_DPRINTK(P9_DEBUG_9P, ">>> TREADLINK fid %d\n", fid->fid); + + req = p9_client_rpc(clnt, P9_TREADLINK, "d", fid->fid); + if (IS_ERR(req)) + return PTR_ERR(req); + + err = p9pdu_readf(req->rc, clnt->proto_version, "s", target); + if (err) { + p9pdu_dump(1, req->rc); + goto error; + } + P9_DPRINTK(P9_DEBUG_9P, "<<< RREADLINK target %s\n", *target); +error: + p9_free_req(clnt, req); + return err; +} +EXPORT_SYMBOL(p9_client_readlink); diff --git a/net/9p/protocol.c b/net/9p/protocol.c index 3acd3afb20c8..45c15f491401 100644 --- a/net/9p/protocol.c +++ b/net/9p/protocol.c @@ -122,9 +122,8 @@ static size_t pdu_write_u(struct p9_fcall *pdu, const char __user *udata, size_t size) { size_t len = MIN(pdu->capacity - pdu->size, size); - int err = copy_from_user(&pdu->sdata[pdu->size], udata, len); - if (err) - printk(KERN_WARNING "pdu_write_u returning: %d\n", err); + if (copy_from_user(&pdu->sdata[pdu->size], udata, len)) + len = 0; pdu->size += len; return size - len; diff --git a/net/9p/trans_virtio.c b/net/9p/trans_virtio.c index b88515936e4b..c8f3f72ab20e 100644 --- a/net/9p/trans_virtio.c +++ b/net/9p/trans_virtio.c @@ -75,6 +75,8 @@ struct virtio_chan { struct p9_client *client; struct virtio_device *vdev; struct virtqueue *vq; + int ring_bufs_avail; + wait_queue_head_t *vc_wq; /* Scatterlist: can be too big for stack. */ struct scatterlist sg[VIRTQUEUE_NUM]; @@ -134,16 +136,30 @@ static void req_done(struct virtqueue *vq) struct p9_fcall *rc; unsigned int len; struct p9_req_t *req; + unsigned long flags; P9_DPRINTK(P9_DEBUG_TRANS, ": request done\n"); - while ((rc = virtqueue_get_buf(chan->vq, &len)) != NULL) { - P9_DPRINTK(P9_DEBUG_TRANS, ": rc %p\n", rc); - P9_DPRINTK(P9_DEBUG_TRANS, ": lookup tag %d\n", rc->tag); - req = p9_tag_lookup(chan->client, rc->tag); - req->status = REQ_STATUS_RCVD; - p9_client_cb(chan->client, req); - } + do { + spin_lock_irqsave(&chan->lock, flags); + rc = virtqueue_get_buf(chan->vq, &len); + + if (rc != NULL) { + if (!chan->ring_bufs_avail) { + chan->ring_bufs_avail = 1; + wake_up(chan->vc_wq); + } + spin_unlock_irqrestore(&chan->lock, flags); + P9_DPRINTK(P9_DEBUG_TRANS, ": rc %p\n", rc); + P9_DPRINTK(P9_DEBUG_TRANS, ": lookup tag %d\n", + rc->tag); + req = p9_tag_lookup(chan->client, rc->tag); + req->status = REQ_STATUS_RCVD; + p9_client_cb(chan->client, req); + } else { + spin_unlock_irqrestore(&chan->lock, flags); + } + } while (rc != NULL); } /** @@ -199,23 +215,43 @@ p9_virtio_request(struct p9_client *client, struct p9_req_t *req) int in, out; struct virtio_chan *chan = client->trans; char *rdata = (char *)req->rc+sizeof(struct p9_fcall); + unsigned long flags; + int err; P9_DPRINTK(P9_DEBUG_TRANS, "9p debug: virtio request\n"); +req_retry: + req->status = REQ_STATUS_SENT; + + spin_lock_irqsave(&chan->lock, flags); out = pack_sg_list(chan->sg, 0, VIRTQUEUE_NUM, req->tc->sdata, req->tc->size); in = pack_sg_list(chan->sg, out, VIRTQUEUE_NUM-out, rdata, client->msize); - req->status = REQ_STATUS_SENT; - - if (virtqueue_add_buf(chan->vq, chan->sg, out, in, req->tc) < 0) { - P9_DPRINTK(P9_DEBUG_TRANS, - "9p debug: virtio rpc add_buf returned failure"); - return -EIO; + err = virtqueue_add_buf(chan->vq, chan->sg, out, in, req->tc); + if (err < 0) { + if (err == -ENOSPC) { + chan->ring_bufs_avail = 0; + spin_unlock_irqrestore(&chan->lock, flags); + err = wait_event_interruptible(*chan->vc_wq, + chan->ring_bufs_avail); + if (err == -ERESTARTSYS) + return err; + + P9_DPRINTK(P9_DEBUG_TRANS, "9p:Retry virtio request\n"); + goto req_retry; + } else { + spin_unlock_irqrestore(&chan->lock, flags); + P9_DPRINTK(P9_DEBUG_TRANS, + "9p debug: " + "virtio rpc add_buf returned failure"); + return -EIO; + } } virtqueue_kick(chan->vq); + spin_unlock_irqrestore(&chan->lock, flags); P9_DPRINTK(P9_DEBUG_TRANS, "9p debug: virtio request kicked\n"); return 0; @@ -290,14 +326,23 @@ static int p9_virtio_probe(struct virtio_device *vdev) chan->tag_len = tag_len; err = sysfs_create_file(&(vdev->dev.kobj), &dev_attr_mount_tag.attr); if (err) { - kfree(tag); - goto out_free_vq; + goto out_free_tag; } + chan->vc_wq = kmalloc(sizeof(wait_queue_head_t), GFP_KERNEL); + if (!chan->vc_wq) { + err = -ENOMEM; + goto out_free_tag; + } + init_waitqueue_head(chan->vc_wq); + chan->ring_bufs_avail = 1; + mutex_lock(&virtio_9p_lock); list_add_tail(&chan->chan_list, &virtio_chan_list); mutex_unlock(&virtio_9p_lock); return 0; +out_free_tag: + kfree(tag); out_free_vq: vdev->config->del_vqs(vdev); kfree(chan); @@ -371,6 +416,7 @@ static void p9_virtio_remove(struct virtio_device *vdev) mutex_unlock(&virtio_9p_lock); sysfs_remove_file(&(vdev->dev.kobj), &dev_attr_mount_tag.attr); kfree(chan->tag); + kfree(chan->vc_wq); kfree(chan); } diff --git a/net/ax25/af_ax25.c b/net/ax25/af_ax25.c index 26eaebf4aaa9..bb86d2932394 100644 --- a/net/ax25/af_ax25.c +++ b/net/ax25/af_ax25.c @@ -1392,6 +1392,7 @@ static int ax25_getname(struct socket *sock, struct sockaddr *uaddr, ax25_cb *ax25; int err = 0; + memset(fsa, 0, sizeof(fsa)); lock_sock(sk); ax25 = ax25_sk(sk); @@ -1403,7 +1404,6 @@ static int ax25_getname(struct socket *sock, struct sockaddr *uaddr, fsa->fsa_ax25.sax25_family = AF_AX25; fsa->fsa_ax25.sax25_call = ax25->dest_addr; - fsa->fsa_ax25.sax25_ndigis = 0; if (ax25->digipeat != NULL) { ndigi = ax25->digipeat->ndigi; diff --git a/net/caif/caif_config_util.c b/net/caif/caif_config_util.c index 76ae68303d3a..d522d8c1703e 100644 --- a/net/caif/caif_config_util.c +++ b/net/caif/caif_config_util.c @@ -16,11 +16,18 @@ int connect_req_to_link_param(struct cfcnfg *cnfg, { struct dev_info *dev_info; enum cfcnfg_phy_preference pref; + int res; + memset(l, 0, sizeof(*l)); - l->priority = s->priority; + /* In caif protocol low value is high priority */ + l->priority = CAIF_PRIO_MAX - s->priority + 1; - if (s->link_name[0] != '\0') - l->phyid = cfcnfg_get_named(cnfg, s->link_name); + if (s->ifindex != 0){ + res = cfcnfg_get_id_from_ifi(cnfg, s->ifindex); + if (res < 0) + return res; + l->phyid = res; + } else { switch (s->link_selector) { case CAIF_LINK_HIGH_BANDW: diff --git a/net/caif/caif_dev.c b/net/caif/caif_dev.c index b99369a055d1..a42a408306e4 100644 --- a/net/caif/caif_dev.c +++ b/net/caif/caif_dev.c @@ -307,6 +307,8 @@ static int caif_device_notify(struct notifier_block *me, unsigned long what, case NETDEV_UNREGISTER: caifd = caif_get(dev); + if (caifd == NULL) + break; netdev_info(dev, "unregister\n"); atomic_set(&caifd->state, what); caif_device_destroy(dev); diff --git a/net/caif/caif_socket.c b/net/caif/caif_socket.c index 2eca2dd0000f..1bf0cf503796 100644 --- a/net/caif/caif_socket.c +++ b/net/caif/caif_socket.c @@ -716,8 +716,7 @@ static int setsockopt(struct socket *sock, { struct sock *sk = sock->sk; struct caifsock *cf_sk = container_of(sk, struct caifsock, sk); - int prio, linksel; - struct ifreq ifreq; + int linksel; if (cf_sk->sk.sk_socket->state != SS_UNCONNECTED) return -ENOPROTOOPT; @@ -735,33 +734,6 @@ static int setsockopt(struct socket *sock, release_sock(&cf_sk->sk); return 0; - case SO_PRIORITY: - if (lvl != SOL_SOCKET) - goto bad_sol; - if (ol < sizeof(int)) - return -EINVAL; - if (copy_from_user(&prio, ov, sizeof(int))) - return -EINVAL; - lock_sock(&(cf_sk->sk)); - cf_sk->conn_req.priority = prio; - release_sock(&cf_sk->sk); - return 0; - - case SO_BINDTODEVICE: - if (lvl != SOL_SOCKET) - goto bad_sol; - if (ol < sizeof(struct ifreq)) - return -EINVAL; - if (copy_from_user(&ifreq, ov, sizeof(ifreq))) - return -EFAULT; - lock_sock(&(cf_sk->sk)); - strncpy(cf_sk->conn_req.link_name, ifreq.ifr_name, - sizeof(cf_sk->conn_req.link_name)); - cf_sk->conn_req.link_name - [sizeof(cf_sk->conn_req.link_name)-1] = 0; - release_sock(&cf_sk->sk); - return 0; - case CAIFSO_REQ_PARAM: if (lvl != SOL_CAIF) goto bad_sol; @@ -880,6 +852,18 @@ static int caif_connect(struct socket *sock, struct sockaddr *uaddr, sock->state = SS_CONNECTING; sk->sk_state = CAIF_CONNECTING; + /* Check priority value comming from socket */ + /* if priority value is out of range it will be ajusted */ + if (cf_sk->sk.sk_priority > CAIF_PRIO_MAX) + cf_sk->conn_req.priority = CAIF_PRIO_MAX; + else if (cf_sk->sk.sk_priority < CAIF_PRIO_MIN) + cf_sk->conn_req.priority = CAIF_PRIO_MIN; + else + cf_sk->conn_req.priority = cf_sk->sk.sk_priority; + + /*ifindex = id of the interface.*/ + cf_sk->conn_req.ifindex = cf_sk->sk.sk_bound_dev_if; + dbfs_atomic_inc(&cnt.num_connect_req); cf_sk->layer.receive = caif_sktrecv_cb; err = caif_connect_client(&cf_sk->conn_req, @@ -905,6 +889,7 @@ static int caif_connect(struct socket *sock, struct sockaddr *uaddr, cf_sk->maxframe = mtu - (headroom + tailroom); if (cf_sk->maxframe < 1) { pr_warn("CAIF Interface MTU too small (%d)\n", dev->mtu); + err = -ENODEV; goto out; } @@ -1142,7 +1127,7 @@ static int caif_create(struct net *net, struct socket *sock, int protocol, set_rx_flow_on(cf_sk); /* Set default options on configuration */ - cf_sk->conn_req.priority = CAIF_PRIO_NORMAL; + cf_sk->sk.sk_priority= CAIF_PRIO_NORMAL; cf_sk->conn_req.link_selector = CAIF_LINK_LOW_LATENCY; cf_sk->conn_req.protocol = protocol; /* Increase the number of sockets created. */ diff --git a/net/caif/cfcnfg.c b/net/caif/cfcnfg.c index 41adafd18914..21ede141018a 100644 --- a/net/caif/cfcnfg.c +++ b/net/caif/cfcnfg.c @@ -173,18 +173,15 @@ static struct cfcnfg_phyinfo *cfcnfg_get_phyinfo(struct cfcnfg *cnfg, return NULL; } -int cfcnfg_get_named(struct cfcnfg *cnfg, char *name) + +int cfcnfg_get_id_from_ifi(struct cfcnfg *cnfg, int ifi) { int i; - - /* Try to match with specified name */ - for (i = 0; i < MAX_PHY_LAYERS; i++) { - if (cnfg->phy_layers[i].frm_layer != NULL - && strcmp(cnfg->phy_layers[i].phy_layer->name, - name) == 0) - return cnfg->phy_layers[i].frm_layer->id; - } - return 0; + for (i = 0; i < MAX_PHY_LAYERS; i++) + if (cnfg->phy_layers[i].frm_layer != NULL && + cnfg->phy_layers[i].ifindex == ifi) + return i; + return -ENODEV; } int cfcnfg_disconn_adapt_layer(struct cfcnfg *cnfg, struct cflayer *adap_layer) diff --git a/net/caif/cfctrl.c b/net/caif/cfctrl.c index 08f267a109aa..3cd8f978e309 100644 --- a/net/caif/cfctrl.c +++ b/net/caif/cfctrl.c @@ -361,11 +361,10 @@ void cfctrl_cancel_req(struct cflayer *layr, struct cflayer *adap_layer) struct cfctrl_request_info *p, *tmp; struct cfctrl *ctrl = container_obj(layr); spin_lock(&ctrl->info_list_lock); - pr_warn("enter\n"); list_for_each_entry_safe(p, tmp, &ctrl->list, list) { if (p->client_layer == adap_layer) { - pr_warn("cancel req :%d\n", p->sequence_no); + pr_debug("cancel req :%d\n", p->sequence_no); list_del(&p->list); kfree(p); } diff --git a/net/caif/cfdbgl.c b/net/caif/cfdbgl.c index 496fda9ac66f..11a2af4c162a 100644 --- a/net/caif/cfdbgl.c +++ b/net/caif/cfdbgl.c @@ -12,6 +12,8 @@ #include <net/caif/cfsrvl.h> #include <net/caif/cfpkt.h> +#define container_obj(layr) ((struct cfsrvl *) layr) + static int cfdbgl_receive(struct cflayer *layr, struct cfpkt *pkt); static int cfdbgl_transmit(struct cflayer *layr, struct cfpkt *pkt); @@ -38,5 +40,17 @@ static int cfdbgl_receive(struct cflayer *layr, struct cfpkt *pkt) static int cfdbgl_transmit(struct cflayer *layr, struct cfpkt *pkt) { + struct cfsrvl *service = container_obj(layr); + struct caif_payload_info *info; + int ret; + + if (!cfsrvl_ready(service, &ret)) + return ret; + + /* Add info for MUX-layer to route the packet out */ + info = cfpkt_info(pkt); + info->channel_id = service->layer.id; + info->dev_info = &service->dev_info; + return layr->dn->transmit(layr->dn, pkt); } diff --git a/net/caif/cfrfml.c b/net/caif/cfrfml.c index bde8481e8d25..e2fb5fa75795 100644 --- a/net/caif/cfrfml.c +++ b/net/caif/cfrfml.c @@ -193,7 +193,7 @@ out: static int cfrfml_transmit_segment(struct cfrfml *rfml, struct cfpkt *pkt) { - caif_assert(cfpkt_getlen(pkt) >= rfml->fragment_size); + caif_assert(cfpkt_getlen(pkt) < rfml->fragment_size); /* Add info for MUX-layer to route the packet out. */ cfpkt_info(pkt)->channel_id = rfml->serv.layer.id; diff --git a/net/can/bcm.c b/net/can/bcm.c index 08ffe9e4be20..6faa8256e10c 100644 --- a/net/can/bcm.c +++ b/net/can/bcm.c @@ -125,7 +125,7 @@ struct bcm_sock { struct list_head tx_ops; unsigned long dropped_usr_msgs; struct proc_dir_entry *bcm_proc_read; - char procname [9]; /* pointer printed in ASCII with \0 */ + char procname [20]; /* pointer printed in ASCII with \0 */ }; static inline struct bcm_sock *bcm_sk(const struct sock *sk) diff --git a/net/ceph/Makefile b/net/ceph/Makefile index aab1cabb8035..5f19415ec9c0 100644 --- a/net/ceph/Makefile +++ b/net/ceph/Makefile @@ -1,9 +1,6 @@ # # Makefile for CEPH filesystem. # - -ifneq ($(KERNELRELEASE),) - obj-$(CONFIG_CEPH_LIB) += libceph.o libceph-objs := ceph_common.o messenger.o msgpool.o buffer.o pagelist.o \ @@ -16,22 +13,3 @@ libceph-objs := ceph_common.o messenger.o msgpool.o buffer.o pagelist.o \ ceph_fs.o ceph_strings.o ceph_hash.o \ pagevec.o -else -#Otherwise we were called directly from the command -# line; invoke the kernel build system. - -KERNELDIR ?= /lib/modules/$(shell uname -r)/build -PWD := $(shell pwd) - -default: all - -all: - $(MAKE) -C $(KERNELDIR) M=$(PWD) CONFIG_CEPH_LIB=m modules - -modules_install: - $(MAKE) -C $(KERNELDIR) M=$(PWD) CONFIG_CEPH_LIB=m modules_install - -clean: - $(MAKE) -C $(KERNELDIR) M=$(PWD) clean - -endif diff --git a/net/ceph/buffer.c b/net/ceph/buffer.c index 53d8abfa25d5..bf3e6a13c215 100644 --- a/net/ceph/buffer.c +++ b/net/ceph/buffer.c @@ -19,7 +19,7 @@ struct ceph_buffer *ceph_buffer_new(size_t len, gfp_t gfp) if (b->vec.iov_base) { b->is_vmalloc = false; } else { - b->vec.iov_base = __vmalloc(len, gfp, PAGE_KERNEL); + b->vec.iov_base = __vmalloc(len, gfp | __GFP_HIGHMEM, PAGE_KERNEL); if (!b->vec.iov_base) { kfree(b); return NULL; diff --git a/net/compat.c b/net/compat.c index 63d260e81472..3649d5895361 100644 --- a/net/compat.c +++ b/net/compat.c @@ -41,10 +41,12 @@ static inline int iov_from_user_compat_to_kern(struct iovec *kiov, compat_size_t len; if (get_user(len, &uiov32->iov_len) || - get_user(buf, &uiov32->iov_base)) { - tot_len = -EFAULT; - break; - } + get_user(buf, &uiov32->iov_base)) + return -EFAULT; + + if (len > INT_MAX - tot_len) + len = INT_MAX - tot_len; + tot_len += len; kiov->iov_base = compat_ptr(buf); kiov->iov_len = (__kernel_size_t) len; diff --git a/net/core/dev.c b/net/core/dev.c index 78b5a89b0f40..0dd54a69dace 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1685,10 +1685,10 @@ EXPORT_SYMBOL(netif_device_attach); static bool can_checksum_protocol(unsigned long features, __be16 protocol) { - return ((features & NETIF_F_GEN_CSUM) || - ((features & NETIF_F_IP_CSUM) && + return ((features & NETIF_F_NO_CSUM) || + ((features & NETIF_F_V4_CSUM) && protocol == htons(ETH_P_IP)) || - ((features & NETIF_F_IPV6_CSUM) && + ((features & NETIF_F_V6_CSUM) && protocol == htons(ETH_P_IPV6)) || ((features & NETIF_F_FCOE_CRC) && protocol == htons(ETH_P_FCOE))); @@ -1696,22 +1696,18 @@ static bool can_checksum_protocol(unsigned long features, __be16 protocol) static bool dev_can_checksum(struct net_device *dev, struct sk_buff *skb) { + __be16 protocol = skb->protocol; int features = dev->features; - if (vlan_tx_tag_present(skb)) + if (vlan_tx_tag_present(skb)) { features &= dev->vlan_features; - - if (can_checksum_protocol(features, skb->protocol)) - return true; - - if (skb->protocol == htons(ETH_P_8021Q)) { + } else if (protocol == htons(ETH_P_8021Q)) { struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data; - if (can_checksum_protocol(dev->features & dev->vlan_features, - veh->h_vlan_encapsulated_proto)) - return true; + protocol = veh->h_vlan_encapsulated_proto; + features &= dev->vlan_features; } - return false; + return can_checksum_protocol(features, protocol); } /** @@ -2135,7 +2131,7 @@ static struct netdev_queue *dev_pick_tx(struct net_device *dev, } else { struct sock *sk = skb->sk; queue_index = sk_tx_queue_get(sk); - if (queue_index < 0) { + if (queue_index < 0 || queue_index >= dev->real_num_tx_queues) { queue_index = 0; if (dev->real_num_tx_queues > 1) @@ -2213,7 +2209,7 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, } static DEFINE_PER_CPU(int, xmit_recursion); -#define RECURSION_LIMIT 3 +#define RECURSION_LIMIT 10 /** * dev_queue_xmit - transmit a buffer @@ -2413,7 +2409,7 @@ EXPORT_SYMBOL(__skb_get_rxhash); #ifdef CONFIG_RPS /* One global table that all flow-based protocols share. */ -struct rps_sock_flow_table *rps_sock_flow_table __read_mostly; +struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly; EXPORT_SYMBOL(rps_sock_flow_table); /* @@ -2425,7 +2421,7 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb, struct rps_dev_flow **rflowp) { struct netdev_rx_queue *rxqueue; - struct rps_map *map = NULL; + struct rps_map *map; struct rps_dev_flow_table *flow_table; struct rps_sock_flow_table *sock_flow_table; int cpu = -1; @@ -2444,15 +2440,15 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb, } else rxqueue = dev->_rx; - if (rxqueue->rps_map) { - map = rcu_dereference(rxqueue->rps_map); - if (map && map->len == 1) { + map = rcu_dereference(rxqueue->rps_map); + if (map) { + if (map->len == 1) { tcpu = map->cpus[0]; if (cpu_online(tcpu)) cpu = tcpu; goto done; } - } else if (!rxqueue->rps_flow_table) { + } else if (!rcu_dereference_raw(rxqueue->rps_flow_table)) { goto done; } @@ -5416,7 +5412,7 @@ void netdev_run_todo(void) /* paranoia */ BUG_ON(netdev_refcnt_read(dev)); WARN_ON(rcu_dereference_raw(dev->ip_ptr)); - WARN_ON(dev->ip6_ptr); + WARN_ON(rcu_dereference_raw(dev->ip6_ptr)); WARN_ON(dev->dn_ptr); if (dev->destructor) diff --git a/net/core/dst.c b/net/core/dst.c index 8abe628b79f1..b99c7c7ffce2 100644 --- a/net/core/dst.c +++ b/net/core/dst.c @@ -370,6 +370,7 @@ static int dst_dev_event(struct notifier_block *this, unsigned long event, static struct notifier_block dst_dev_notifier = { .notifier_call = dst_dev_event, + .priority = -10, /* must be called after other network notifiers */ }; void __init dst_init(void) diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c index 1bc3f253ba6c..82a4369ae150 100644 --- a/net/core/fib_rules.c +++ b/net/core/fib_rules.c @@ -351,12 +351,12 @@ static int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg) list_for_each_entry(r, &ops->rules_list, list) { if (r->pref == rule->target) { - rule->ctarget = r; + RCU_INIT_POINTER(rule->ctarget, r); break; } } - if (rule->ctarget == NULL) + if (rcu_dereference_protected(rule->ctarget, 1) == NULL) unresolved = 1; } else if (rule->action == FR_ACT_GOTO) goto errout_free; @@ -373,6 +373,11 @@ static int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg) fib_rule_get(rule); + if (last) + list_add_rcu(&rule->list, &last->list); + else + list_add_rcu(&rule->list, &ops->rules_list); + if (ops->unresolved_rules) { /* * There are unresolved goto rules in the list, check if @@ -381,7 +386,7 @@ static int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg) list_for_each_entry(r, &ops->rules_list, list) { if (r->action == FR_ACT_GOTO && r->target == rule->pref) { - BUG_ON(r->ctarget != NULL); + BUG_ON(rtnl_dereference(r->ctarget) != NULL); rcu_assign_pointer(r->ctarget, rule); if (--ops->unresolved_rules == 0) break; @@ -395,11 +400,6 @@ static int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg) if (unresolved) ops->unresolved_rules++; - if (last) - list_add_rcu(&rule->list, &last->list); - else - list_add_rcu(&rule->list, &ops->rules_list); - notify_rule_change(RTM_NEWRULE, rule, ops, nlh, NETLINK_CB(skb).pid); flush_route_cache(ops); rules_ops_put(ops); @@ -487,7 +487,7 @@ static int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg) */ if (ops->nr_goto_rules > 0) { list_for_each_entry(tmp, &ops->rules_list, list) { - if (tmp->ctarget == rule) { + if (rtnl_dereference(tmp->ctarget) == rule) { rcu_assign_pointer(tmp->ctarget, NULL); ops->unresolved_rules++; } @@ -545,7 +545,8 @@ static int fib_nl_fill_rule(struct sk_buff *skb, struct fib_rule *rule, frh->action = rule->action; frh->flags = rule->flags; - if (rule->action == FR_ACT_GOTO && rule->ctarget == NULL) + if (rule->action == FR_ACT_GOTO && + rcu_dereference_raw(rule->ctarget) == NULL) frh->flags |= FIB_RULE_UNRESOLVED; if (rule->iifname[0]) { diff --git a/net/core/filter.c b/net/core/filter.c index 7adf50352918..ae21a0d3c4a2 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -89,8 +89,8 @@ int sk_filter(struct sock *sk, struct sk_buff *skb) rcu_read_lock_bh(); filter = rcu_dereference_bh(sk->sk_filter); if (filter) { - unsigned int pkt_len = sk_run_filter(skb, filter->insns, - filter->len); + unsigned int pkt_len = sk_run_filter(skb, filter->insns, filter->len); + err = pkt_len ? pskb_trim(skb, pkt_len) : -EPERM; } rcu_read_unlock_bh(); @@ -112,39 +112,41 @@ EXPORT_SYMBOL(sk_filter); */ unsigned int sk_run_filter(struct sk_buff *skb, struct sock_filter *filter, int flen) { - struct sock_filter *fentry; /* We walk down these */ void *ptr; u32 A = 0; /* Accumulator */ u32 X = 0; /* Index Register */ u32 mem[BPF_MEMWORDS]; /* Scratch Memory Store */ + unsigned long memvalid = 0; u32 tmp; int k; int pc; + BUILD_BUG_ON(BPF_MEMWORDS > BITS_PER_LONG); /* * Process array of filter instructions. */ for (pc = 0; pc < flen; pc++) { - fentry = &filter[pc]; + const struct sock_filter *fentry = &filter[pc]; + u32 f_k = fentry->k; switch (fentry->code) { case BPF_S_ALU_ADD_X: A += X; continue; case BPF_S_ALU_ADD_K: - A += fentry->k; + A += f_k; continue; case BPF_S_ALU_SUB_X: A -= X; continue; case BPF_S_ALU_SUB_K: - A -= fentry->k; + A -= f_k; continue; case BPF_S_ALU_MUL_X: A *= X; continue; case BPF_S_ALU_MUL_K: - A *= fentry->k; + A *= f_k; continue; case BPF_S_ALU_DIV_X: if (X == 0) @@ -152,49 +154,49 @@ unsigned int sk_run_filter(struct sk_buff *skb, struct sock_filter *filter, int A /= X; continue; case BPF_S_ALU_DIV_K: - A /= fentry->k; + A /= f_k; continue; case BPF_S_ALU_AND_X: A &= X; continue; case BPF_S_ALU_AND_K: - A &= fentry->k; + A &= f_k; continue; case BPF_S_ALU_OR_X: A |= X; continue; case BPF_S_ALU_OR_K: - A |= fentry->k; + A |= f_k; continue; case BPF_S_ALU_LSH_X: A <<= X; continue; case BPF_S_ALU_LSH_K: - A <<= fentry->k; + A <<= f_k; continue; case BPF_S_ALU_RSH_X: A >>= X; continue; case BPF_S_ALU_RSH_K: - A >>= fentry->k; + A >>= f_k; continue; case BPF_S_ALU_NEG: A = -A; continue; case BPF_S_JMP_JA: - pc += fentry->k; + pc += f_k; continue; case BPF_S_JMP_JGT_K: - pc += (A > fentry->k) ? fentry->jt : fentry->jf; + pc += (A > f_k) ? fentry->jt : fentry->jf; continue; case BPF_S_JMP_JGE_K: - pc += (A >= fentry->k) ? fentry->jt : fentry->jf; + pc += (A >= f_k) ? fentry->jt : fentry->jf; continue; case BPF_S_JMP_JEQ_K: - pc += (A == fentry->k) ? fentry->jt : fentry->jf; + pc += (A == f_k) ? fentry->jt : fentry->jf; continue; case BPF_S_JMP_JSET_K: - pc += (A & fentry->k) ? fentry->jt : fentry->jf; + pc += (A & f_k) ? fentry->jt : fentry->jf; continue; case BPF_S_JMP_JGT_X: pc += (A > X) ? fentry->jt : fentry->jf; @@ -209,7 +211,7 @@ unsigned int sk_run_filter(struct sk_buff *skb, struct sock_filter *filter, int pc += (A & X) ? fentry->jt : fentry->jf; continue; case BPF_S_LD_W_ABS: - k = fentry->k; + k = f_k; load_w: ptr = load_pointer(skb, k, 4, &tmp); if (ptr != NULL) { @@ -218,7 +220,7 @@ load_w: } break; case BPF_S_LD_H_ABS: - k = fentry->k; + k = f_k; load_h: ptr = load_pointer(skb, k, 2, &tmp); if (ptr != NULL) { @@ -227,7 +229,7 @@ load_h: } break; case BPF_S_LD_B_ABS: - k = fentry->k; + k = f_k; load_b: ptr = load_pointer(skb, k, 1, &tmp); if (ptr != NULL) { @@ -242,32 +244,34 @@ load_b: X = skb->len; continue; case BPF_S_LD_W_IND: - k = X + fentry->k; + k = X + f_k; goto load_w; case BPF_S_LD_H_IND: - k = X + fentry->k; + k = X + f_k; goto load_h; case BPF_S_LD_B_IND: - k = X + fentry->k; + k = X + f_k; goto load_b; case BPF_S_LDX_B_MSH: - ptr = load_pointer(skb, fentry->k, 1, &tmp); + ptr = load_pointer(skb, f_k, 1, &tmp); if (ptr != NULL) { X = (*(u8 *)ptr & 0xf) << 2; continue; } return 0; case BPF_S_LD_IMM: - A = fentry->k; + A = f_k; continue; case BPF_S_LDX_IMM: - X = fentry->k; + X = f_k; continue; case BPF_S_LD_MEM: - A = mem[fentry->k]; + A = (memvalid & (1UL << f_k)) ? + mem[f_k] : 0; continue; case BPF_S_LDX_MEM: - X = mem[fentry->k]; + X = (memvalid & (1UL << f_k)) ? + mem[f_k] : 0; continue; case BPF_S_MISC_TAX: X = A; @@ -276,14 +280,16 @@ load_b: A = X; continue; case BPF_S_RET_K: - return fentry->k; + return f_k; case BPF_S_RET_A: return A; case BPF_S_ST: - mem[fentry->k] = A; + memvalid |= 1UL << f_k; + mem[f_k] = A; continue; case BPF_S_STX: - mem[fentry->k] = X; + memvalid |= 1UL << f_k; + mem[f_k] = X; continue; default: WARN_ON(1); @@ -583,23 +589,16 @@ int sk_chk_filter(struct sock_filter *filter, int flen) EXPORT_SYMBOL(sk_chk_filter); /** - * sk_filter_rcu_release: Release a socket filter by rcu_head + * sk_filter_release_rcu - Release a socket filter by rcu_head * @rcu: rcu_head that contains the sk_filter to free */ -static void sk_filter_rcu_release(struct rcu_head *rcu) +void sk_filter_release_rcu(struct rcu_head *rcu) { struct sk_filter *fp = container_of(rcu, struct sk_filter, rcu); - sk_filter_release(fp); -} - -static void sk_filter_delayed_uncharge(struct sock *sk, struct sk_filter *fp) -{ - unsigned int size = sk_filter_len(fp); - - atomic_sub(size, &sk->sk_omem_alloc); - call_rcu_bh(&fp->rcu, sk_filter_rcu_release); + kfree(fp); } +EXPORT_SYMBOL(sk_filter_release_rcu); /** * sk_attach_filter - attach a socket filter @@ -643,7 +642,7 @@ int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk) rcu_assign_pointer(sk->sk_filter, fp); if (old_fp) - sk_filter_delayed_uncharge(sk, old_fp); + sk_filter_uncharge(sk, old_fp); return 0; } EXPORT_SYMBOL_GPL(sk_attach_filter); @@ -657,7 +656,7 @@ int sk_detach_filter(struct sock *sk) sock_owned_by_user(sk)); if (filter) { rcu_assign_pointer(sk->sk_filter, NULL); - sk_filter_delayed_uncharge(sk, filter); + sk_filter_uncharge(sk, filter); ret = 0; } return ret; diff --git a/net/core/iovec.c b/net/core/iovec.c index 72aceb1fe4fa..c40f27e7d208 100644 --- a/net/core/iovec.c +++ b/net/core/iovec.c @@ -35,10 +35,9 @@ * in any case. */ -long verify_iovec(struct msghdr *m, struct iovec *iov, struct sockaddr *address, int mode) +int verify_iovec(struct msghdr *m, struct iovec *iov, struct sockaddr *address, int mode) { - int size, ct; - long err; + int size, ct, err; if (m->msg_namelen) { if (mode == VERIFY_READ) { @@ -62,14 +61,13 @@ long verify_iovec(struct msghdr *m, struct iovec *iov, struct sockaddr *address, err = 0; for (ct = 0; ct < m->msg_iovlen; ct++) { - err += iov[ct].iov_len; - /* - * Goal is not to verify user data, but to prevent returning - * negative value, which is interpreted as errno. - * Overflow is still possible, but it is harmless. - */ - if (err < 0) - return -EMSGSIZE; + size_t len = iov[ct].iov_len; + + if (len > INT_MAX - err) { + len = INT_MAX - err; + iov[ct].iov_len = len; + } + err += len; } return err; diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index b143173e3eb2..7f902cad10f8 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -598,7 +598,8 @@ static ssize_t store_rps_map(struct netdev_rx_queue *queue, } spin_lock(&rps_map_lock); - old_map = queue->rps_map; + old_map = rcu_dereference_protected(queue->rps_map, + lockdep_is_held(&rps_map_lock)); rcu_assign_pointer(queue->rps_map, map); spin_unlock(&rps_map_lock); @@ -677,7 +678,8 @@ static ssize_t store_rps_dev_flow_table_cnt(struct netdev_rx_queue *queue, table = NULL; spin_lock(&rps_dev_flow_lock); - old_table = queue->rps_flow_table; + old_table = rcu_dereference_protected(queue->rps_flow_table, + lockdep_is_held(&rps_dev_flow_lock)); rcu_assign_pointer(queue->rps_flow_table, table); spin_unlock(&rps_dev_flow_lock); @@ -705,16 +707,26 @@ static void rx_queue_release(struct kobject *kobj) { struct netdev_rx_queue *queue = to_rx_queue(kobj); struct netdev_rx_queue *first = queue->first; + struct rps_map *map; + struct rps_dev_flow_table *flow_table; - if (queue->rps_map) - call_rcu(&queue->rps_map->rcu, rps_map_release); - if (queue->rps_flow_table) - call_rcu(&queue->rps_flow_table->rcu, - rps_dev_flow_table_release); + map = rcu_dereference_raw(queue->rps_map); + if (map) { + RCU_INIT_POINTER(queue->rps_map, NULL); + call_rcu(&map->rcu, rps_map_release); + } + + flow_table = rcu_dereference_raw(queue->rps_flow_table); + if (flow_table) { + RCU_INIT_POINTER(queue->rps_flow_table, NULL); + call_rcu(&flow_table->rcu, rps_dev_flow_table_release); + } if (atomic_dec_and_test(&first->count)) kfree(first); + else + memset(kobj, 0, sizeof(*kobj)); } static struct kobj_type rx_queue_ktype = { diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index c988e685433a..3f860261c5ee 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -42,7 +42,9 @@ static int net_assign_generic(struct net *net, int id, void *data) BUG_ON(!mutex_is_locked(&net_mutex)); BUG_ON(id == 0); - ng = old_ng = net->gen; + old_ng = rcu_dereference_protected(net->gen, + lockdep_is_held(&net_mutex)); + ng = old_ng; if (old_ng->len >= id) goto assign; diff --git a/net/core/pktgen.c b/net/core/pktgen.c index 2c0df0f95b3d..33bc3823ac6f 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -771,10 +771,10 @@ done: static unsigned long num_arg(const char __user * user_buffer, unsigned long maxlen, unsigned long *num) { - int i = 0; + int i; *num = 0; - for (; i < maxlen; i++) { + for (i = 0; i < maxlen; i++) { char c; if (get_user(c, &user_buffer[i])) return -EFAULT; @@ -789,9 +789,9 @@ static unsigned long num_arg(const char __user * user_buffer, static int strn_len(const char __user * user_buffer, unsigned int maxlen) { - int i = 0; + int i; - for (; i < maxlen; i++) { + for (i = 0; i < maxlen; i++) { char c; if (get_user(c, &user_buffer[i])) return -EFAULT; @@ -846,7 +846,7 @@ static ssize_t pktgen_if_write(struct file *file, { struct seq_file *seq = file->private_data; struct pktgen_dev *pkt_dev = seq->private; - int i = 0, max, len; + int i, max, len; char name[16], valstr[32]; unsigned long value = 0; char *pg_result = NULL; @@ -860,13 +860,13 @@ static ssize_t pktgen_if_write(struct file *file, return -EINVAL; } - max = count - i; - tmp = count_trail_chars(&user_buffer[i], max); + max = count; + tmp = count_trail_chars(user_buffer, max); if (tmp < 0) { pr_warning("illegal format\n"); return tmp; } - i += tmp; + i = tmp; /* Read variable name */ @@ -887,10 +887,11 @@ static ssize_t pktgen_if_write(struct file *file, i += len; if (debug) { - char tb[count + 1]; - if (copy_from_user(tb, user_buffer, count)) + size_t copy = min_t(size_t, count, 1023); + char tb[copy + 1]; + if (copy_from_user(tb, user_buffer, copy)) return -EFAULT; - tb[count] = 0; + tb[copy] = 0; printk(KERN_DEBUG "pktgen: %s,%lu buffer -:%s:-\n", name, (unsigned long)count, tb); } @@ -1764,7 +1765,7 @@ static ssize_t pktgen_thread_write(struct file *file, { struct seq_file *seq = file->private_data; struct pktgen_thread *t = seq->private; - int i = 0, max, len, ret; + int i, max, len, ret; char name[40]; char *pg_result; @@ -1773,12 +1774,12 @@ static ssize_t pktgen_thread_write(struct file *file, return -EINVAL; } - max = count - i; - len = count_trail_chars(&user_buffer[i], max); + max = count; + len = count_trail_chars(user_buffer, max); if (len < 0) return len; - i += len; + i = len; /* Read variable name */ @@ -1975,7 +1976,7 @@ static struct net_device *pktgen_dev_get_by_name(struct pktgen_dev *pkt_dev, const char *ifname) { char b[IFNAMSIZ+5]; - int i = 0; + int i; for (i = 0; ifname[i] != '@'; i++) { if (i == IFNAMSIZ) @@ -2519,8 +2520,8 @@ static void free_SAs(struct pktgen_dev *pkt_dev) { if (pkt_dev->cflows) { /* let go of the SAs if we have them */ - int i = 0; - for (; i < pkt_dev->cflows; i++) { + int i; + for (i = 0; i < pkt_dev->cflows; i++) { struct xfrm_state *x = pkt_dev->flows[i].x; if (x) { xfrm_state_put(x); @@ -2611,8 +2612,8 @@ static struct sk_buff *fill_packet_ipv4(struct net_device *odev, /* Update any of the values, used when we're incrementing various * fields. */ - queue_map = pkt_dev->cur_queue_map; mod_cur_headers(pkt_dev); + queue_map = pkt_dev->cur_queue_map; datalen = (odev->hard_header_len + 16) & ~0xf; @@ -2975,8 +2976,8 @@ static struct sk_buff *fill_packet_ipv6(struct net_device *odev, /* Update any of the values, used when we're incrementing various * fields. */ - queue_map = pkt_dev->cur_queue_map; mod_cur_headers(pkt_dev); + queue_map = pkt_dev->cur_queue_map; skb = __netdev_alloc_skb(odev, pkt_dev->cur_pkt_size + 64 diff --git a/net/core/request_sock.c b/net/core/request_sock.c index 7552495aff7a..fceeb37d7161 100644 --- a/net/core/request_sock.c +++ b/net/core/request_sock.c @@ -45,9 +45,7 @@ int reqsk_queue_alloc(struct request_sock_queue *queue, nr_table_entries = roundup_pow_of_two(nr_table_entries + 1); lopt_size += nr_table_entries * sizeof(struct request_sock *); if (lopt_size > PAGE_SIZE) - lopt = __vmalloc(lopt_size, - GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO, - PAGE_KERNEL); + lopt = vzalloc(lopt_size); else lopt = kzalloc(lopt_size, GFP_KERNEL); if (lopt == NULL) diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 8121268ddbdd..841c287ef40a 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -347,16 +347,17 @@ static size_t rtnl_link_get_size(const struct net_device *dev) if (!ops) return 0; - size = nlmsg_total_size(sizeof(struct nlattr)) + /* IFLA_LINKINFO */ - nlmsg_total_size(strlen(ops->kind) + 1); /* IFLA_INFO_KIND */ + size = nla_total_size(sizeof(struct nlattr)) + /* IFLA_LINKINFO */ + nla_total_size(strlen(ops->kind) + 1); /* IFLA_INFO_KIND */ if (ops->get_size) /* IFLA_INFO_DATA + nested data */ - size += nlmsg_total_size(sizeof(struct nlattr)) + + size += nla_total_size(sizeof(struct nlattr)) + ops->get_size(dev); if (ops->get_xstats_size) - size += ops->get_xstats_size(dev); /* IFLA_INFO_XSTATS */ + /* IFLA_INFO_XSTATS */ + size += nla_total_size(ops->get_xstats_size(dev)); return size; } diff --git a/net/core/sock.c b/net/core/sock.c index 11db43632df8..fb6080111461 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1225,7 +1225,7 @@ struct sock *sk_clone(const struct sock *sk, const gfp_t priority) sock_reset_flag(newsk, SOCK_DONE); skb_queue_head_init(&newsk->sk_error_queue); - filter = newsk->sk_filter; + filter = rcu_dereference_protected(newsk->sk_filter, 1); if (filter != NULL) sk_filter_charge(newsk, filter); @@ -1653,10 +1653,10 @@ int __sk_mem_schedule(struct sock *sk, int size, int kind) { struct proto *prot = sk->sk_prot; int amt = sk_mem_pages(size); - int allocated; + long allocated; sk->sk_forward_alloc += amt * SK_MEM_QUANTUM; - allocated = atomic_add_return(amt, prot->memory_allocated); + allocated = atomic_long_add_return(amt, prot->memory_allocated); /* Under limit. */ if (allocated <= prot->sysctl_mem[0]) { @@ -1714,7 +1714,7 @@ suppress_allocation: /* Alas. Undo changes. */ sk->sk_forward_alloc -= amt * SK_MEM_QUANTUM; - atomic_sub(amt, prot->memory_allocated); + atomic_long_sub(amt, prot->memory_allocated); return 0; } EXPORT_SYMBOL(__sk_mem_schedule); @@ -1727,12 +1727,12 @@ void __sk_mem_reclaim(struct sock *sk) { struct proto *prot = sk->sk_prot; - atomic_sub(sk->sk_forward_alloc >> SK_MEM_QUANTUM_SHIFT, + atomic_long_sub(sk->sk_forward_alloc >> SK_MEM_QUANTUM_SHIFT, prot->memory_allocated); sk->sk_forward_alloc &= SK_MEM_QUANTUM - 1; if (prot->memory_pressure && *prot->memory_pressure && - (atomic_read(prot->memory_allocated) < prot->sysctl_mem[0])) + (atomic_long_read(prot->memory_allocated) < prot->sysctl_mem[0])) *prot->memory_pressure = 0; } EXPORT_SYMBOL(__sk_mem_reclaim); @@ -2452,12 +2452,12 @@ static char proto_method_implemented(const void *method) static void proto_seq_printf(struct seq_file *seq, struct proto *proto) { - seq_printf(seq, "%-9s %4u %6d %6d %-3s %6u %-3s %-10s " + seq_printf(seq, "%-9s %4u %6d %6ld %-3s %6u %-3s %-10s " "%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n", proto->name, proto->obj_size, sock_prot_inuse_get(seq_file_net(seq), proto), - proto->memory_allocated != NULL ? atomic_read(proto->memory_allocated) : -1, + proto->memory_allocated != NULL ? atomic_long_read(proto->memory_allocated) : -1L, proto->memory_pressure != NULL ? *proto->memory_pressure ? "yes" : "no" : "NI", proto->max_header, proto->slab == NULL ? "no" : "yes", diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index 01eee5d984be..385b6095fdc4 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -34,7 +34,8 @@ static int rps_sock_flow_sysctl(ctl_table *table, int write, mutex_lock(&sock_flow_mutex); - orig_sock_table = rps_sock_flow_table; + orig_sock_table = rcu_dereference_protected(rps_sock_flow_table, + lockdep_is_held(&sock_flow_mutex)); size = orig_size = orig_sock_table ? orig_sock_table->mask + 1 : 0; ret = proc_dointvec(&tmp, write, buffer, lenp, ppos); diff --git a/net/dccp/ccid.h b/net/dccp/ccid.h index 117fb093dcaf..75c3582a7678 100644 --- a/net/dccp/ccid.h +++ b/net/dccp/ccid.h @@ -134,13 +134,41 @@ static inline int ccid_get_current_tx_ccid(struct dccp_sock *dp) extern void ccid_hc_rx_delete(struct ccid *ccid, struct sock *sk); extern void ccid_hc_tx_delete(struct ccid *ccid, struct sock *sk); +/* + * Congestion control of queued data packets via CCID decision. + * + * The TX CCID performs its congestion-control by indicating whether and when a + * queued packet may be sent, using the return code of ccid_hc_tx_send_packet(). + * The following modes are supported via the symbolic constants below: + * - timer-based pacing (CCID returns a delay value in milliseconds); + * - autonomous dequeueing (CCID internally schedules dccps_xmitlet). + */ + +enum ccid_dequeueing_decision { + CCID_PACKET_SEND_AT_ONCE = 0x00000, /* "green light": no delay */ + CCID_PACKET_DELAY_MAX = 0x0FFFF, /* maximum delay in msecs */ + CCID_PACKET_DELAY = 0x10000, /* CCID msec-delay mode */ + CCID_PACKET_WILL_DEQUEUE_LATER = 0x20000, /* CCID autonomous mode */ + CCID_PACKET_ERR = 0xF0000, /* error condition */ +}; + +static inline int ccid_packet_dequeue_eval(const int return_code) +{ + if (return_code < 0) + return CCID_PACKET_ERR; + if (return_code == 0) + return CCID_PACKET_SEND_AT_ONCE; + if (return_code <= CCID_PACKET_DELAY_MAX) + return CCID_PACKET_DELAY; + return return_code; +} + static inline int ccid_hc_tx_send_packet(struct ccid *ccid, struct sock *sk, struct sk_buff *skb) { - int rc = 0; if (ccid->ccid_ops->ccid_hc_tx_send_packet != NULL) - rc = ccid->ccid_ops->ccid_hc_tx_send_packet(sk, skb); - return rc; + return ccid->ccid_ops->ccid_hc_tx_send_packet(sk, skb); + return CCID_PACKET_SEND_AT_ONCE; } static inline void ccid_hc_tx_packet_sent(struct ccid *ccid, struct sock *sk, diff --git a/net/dccp/ccids/ccid2.c b/net/dccp/ccids/ccid2.c index d850e291f87c..6576eae9e779 100644 --- a/net/dccp/ccids/ccid2.c +++ b/net/dccp/ccids/ccid2.c @@ -78,12 +78,9 @@ static int ccid2_hc_tx_alloc_seq(struct ccid2_hc_tx_sock *hc) static int ccid2_hc_tx_send_packet(struct sock *sk, struct sk_buff *skb) { - struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk); - - if (hc->tx_pipe < hc->tx_cwnd) - return 0; - - return 1; /* XXX CCID should dequeue when ready instead of polling */ + if (ccid2_cwnd_network_limited(ccid2_hc_tx_sk(sk))) + return CCID_PACKET_WILL_DEQUEUE_LATER; + return CCID_PACKET_SEND_AT_ONCE; } static void ccid2_change_l_ack_ratio(struct sock *sk, u32 val) @@ -115,6 +112,7 @@ static void ccid2_hc_tx_rto_expire(unsigned long data) { struct sock *sk = (struct sock *)data; struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk); + const bool sender_was_blocked = ccid2_cwnd_network_limited(hc); bh_lock_sock(sk); if (sock_owned_by_user(sk)) { @@ -129,8 +127,6 @@ static void ccid2_hc_tx_rto_expire(unsigned long data) if (hc->tx_rto > DCCP_RTO_MAX) hc->tx_rto = DCCP_RTO_MAX; - sk_reset_timer(sk, &hc->tx_rtotimer, jiffies + hc->tx_rto); - /* adjust pipe, cwnd etc */ hc->tx_ssthresh = hc->tx_cwnd / 2; if (hc->tx_ssthresh < 2) @@ -146,6 +142,12 @@ static void ccid2_hc_tx_rto_expire(unsigned long data) hc->tx_rpseq = 0; hc->tx_rpdupack = -1; ccid2_change_l_ack_ratio(sk, 1); + + /* if we were blocked before, we may now send cwnd=1 packet */ + if (sender_was_blocked) + tasklet_schedule(&dccp_sk(sk)->dccps_xmitlet); + /* restart backed-off timer */ + sk_reset_timer(sk, &hc->tx_rtotimer, jiffies + hc->tx_rto); out: bh_unlock_sock(sk); sock_put(sk); @@ -434,6 +436,7 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) { struct dccp_sock *dp = dccp_sk(sk); struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk); + const bool sender_was_blocked = ccid2_cwnd_network_limited(hc); u64 ackno, seqno; struct ccid2_seq *seqp; unsigned char *vector; @@ -631,6 +634,10 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) sk_stop_timer(sk, &hc->tx_rtotimer); else sk_reset_timer(sk, &hc->tx_rtotimer, jiffies + hc->tx_rto); + + /* check if incoming Acks allow pending packets to be sent */ + if (sender_was_blocked && !ccid2_cwnd_network_limited(hc)) + tasklet_schedule(&dccp_sk(sk)->dccps_xmitlet); } static int ccid2_hc_tx_init(struct ccid *ccid, struct sock *sk) diff --git a/net/dccp/ccids/ccid2.h b/net/dccp/ccids/ccid2.h index 9731c2dc1487..25cb6b216eda 100644 --- a/net/dccp/ccids/ccid2.h +++ b/net/dccp/ccids/ccid2.h @@ -81,6 +81,11 @@ struct ccid2_hc_tx_sock { u64 tx_high_ack; }; +static inline bool ccid2_cwnd_network_limited(struct ccid2_hc_tx_sock *hc) +{ + return hc->tx_pipe >= hc->tx_cwnd; +} + struct ccid2_hc_rx_sock { int rx_data; }; diff --git a/net/dccp/ccids/ccid3.c b/net/dccp/ccids/ccid3.c index 3060a60ed5ab..3d604e1349c0 100644 --- a/net/dccp/ccids/ccid3.c +++ b/net/dccp/ccids/ccid3.c @@ -268,11 +268,11 @@ out: sock_put(sk); } -/* - * returns - * > 0: delay (in msecs) that should pass before actually sending - * = 0: can send immediately - * < 0: error condition; do not send packet +/** + * ccid3_hc_tx_send_packet - Delay-based dequeueing of TX packets + * @skb: next packet candidate to send on @sk + * This function uses the convention of ccid_packet_dequeue_eval() and + * returns a millisecond-delay value between 0 and t_mbi = 64000 msec. */ static int ccid3_hc_tx_send_packet(struct sock *sk, struct sk_buff *skb) { @@ -348,7 +348,7 @@ static int ccid3_hc_tx_send_packet(struct sock *sk, struct sk_buff *skb) /* set the nominal send time for the next following packet */ hc->tx_t_nom = ktime_add_us(hc->tx_t_nom, hc->tx_t_ipi); - return 0; + return CCID_PACKET_SEND_AT_ONCE; } static void ccid3_hc_tx_packet_sent(struct sock *sk, unsigned int len) diff --git a/net/dccp/dccp.h b/net/dccp/dccp.h index 3eb264b60823..a8ed459508b2 100644 --- a/net/dccp/dccp.h +++ b/net/dccp/dccp.h @@ -243,8 +243,9 @@ extern void dccp_reqsk_send_ack(struct sock *sk, struct sk_buff *skb, extern void dccp_send_sync(struct sock *sk, const u64 seq, const enum dccp_pkt_type pkt_type); -extern void dccp_write_xmit(struct sock *sk, int block); -extern void dccp_write_space(struct sock *sk); +extern void dccp_write_xmit(struct sock *sk); +extern void dccp_write_space(struct sock *sk); +extern void dccp_flush_write_queue(struct sock *sk, long *time_budget); extern void dccp_init_xmit_timers(struct sock *sk); static inline void dccp_clear_xmit_timers(struct sock *sk) diff --git a/net/dccp/input.c b/net/dccp/input.c index 265985370fa1..e424a09e83f6 100644 --- a/net/dccp/input.c +++ b/net/dccp/input.c @@ -239,7 +239,8 @@ static int dccp_check_seqno(struct sock *sk, struct sk_buff *skb) dccp_update_gsr(sk, seqno); if (dh->dccph_type != DCCP_PKT_SYNC && - (ackno != DCCP_PKT_WITHOUT_ACK_SEQ)) + ackno != DCCP_PKT_WITHOUT_ACK_SEQ && + after48(ackno, dp->dccps_gar)) dp->dccps_gar = ackno; } else { unsigned long now = jiffies; diff --git a/net/dccp/output.c b/net/dccp/output.c index a988fe9ffcba..45b91853f5ae 100644 --- a/net/dccp/output.c +++ b/net/dccp/output.c @@ -209,108 +209,150 @@ void dccp_write_space(struct sock *sk) } /** - * dccp_wait_for_ccid - Wait for ccid to tell us we can send a packet + * dccp_wait_for_ccid - Await CCID send permission * @sk: socket to wait for - * @skb: current skb to pass on for waiting - * @delay: sleep timeout in milliseconds (> 0) - * This function is called by default when the socket is closed, and - * when a non-zero linger time is set on the socket. For consistency + * @delay: timeout in jiffies + * This is used by CCIDs which need to delay the send time in process context. */ -static int dccp_wait_for_ccid(struct sock *sk, struct sk_buff *skb, int delay) +static int dccp_wait_for_ccid(struct sock *sk, unsigned long delay) { - struct dccp_sock *dp = dccp_sk(sk); DEFINE_WAIT(wait); - unsigned long jiffdelay; - int rc; + long remaining; + + prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); + sk->sk_write_pending++; + release_sock(sk); + + remaining = schedule_timeout(delay); + + lock_sock(sk); + sk->sk_write_pending--; + finish_wait(sk_sleep(sk), &wait); + + if (signal_pending(current) || sk->sk_err) + return -1; + return remaining; +} + +/** + * dccp_xmit_packet - Send data packet under control of CCID + * Transmits next-queued payload and informs CCID to account for the packet. + */ +static void dccp_xmit_packet(struct sock *sk) +{ + int err, len; + struct dccp_sock *dp = dccp_sk(sk); + struct sk_buff *skb = skb_dequeue(&sk->sk_write_queue); - do { - dccp_pr_debug("delayed send by %d msec\n", delay); - jiffdelay = msecs_to_jiffies(delay); + if (unlikely(skb == NULL)) + return; + len = skb->len; - prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); + if (sk->sk_state == DCCP_PARTOPEN) { + const u32 cur_mps = dp->dccps_mss_cache - DCCP_FEATNEG_OVERHEAD; + /* + * See 8.1.5 - Handshake Completion. + * + * For robustness we resend Confirm options until the client has + * entered OPEN. During the initial feature negotiation, the MPS + * is smaller than usual, reduced by the Change/Confirm options. + */ + if (!list_empty(&dp->dccps_featneg) && len > cur_mps) { + DCCP_WARN("Payload too large (%d) for featneg.\n", len); + dccp_send_ack(sk); + dccp_feat_list_purge(&dp->dccps_featneg); + } - sk->sk_write_pending++; - release_sock(sk); - schedule_timeout(jiffdelay); - lock_sock(sk); - sk->sk_write_pending--; + inet_csk_schedule_ack(sk); + inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK, + inet_csk(sk)->icsk_rto, + DCCP_RTO_MAX); + DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_DATAACK; + } else if (dccp_ack_pending(sk)) { + DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_DATAACK; + } else { + DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_DATA; + } + + err = dccp_transmit_skb(sk, skb); + if (err) + dccp_pr_debug("transmit_skb() returned err=%d\n", err); + /* + * Register this one as sent even if an error occurred. To the remote + * end a local packet drop is indistinguishable from network loss, i.e. + * any local drop will eventually be reported via receiver feedback. + */ + ccid_hc_tx_packet_sent(dp->dccps_hc_tx_ccid, sk, len); +} - if (sk->sk_err) - goto do_error; - if (signal_pending(current)) - goto do_interrupted; +/** + * dccp_flush_write_queue - Drain queue at end of connection + * Since dccp_sendmsg queues packets without waiting for them to be sent, it may + * happen that the TX queue is not empty at the end of a connection. We give the + * HC-sender CCID a grace period of up to @time_budget jiffies. If this function + * returns with a non-empty write queue, it will be purged later. + */ +void dccp_flush_write_queue(struct sock *sk, long *time_budget) +{ + struct dccp_sock *dp = dccp_sk(sk); + struct sk_buff *skb; + long delay, rc; + while (*time_budget > 0 && (skb = skb_peek(&sk->sk_write_queue))) { rc = ccid_hc_tx_send_packet(dp->dccps_hc_tx_ccid, sk, skb); - } while ((delay = rc) > 0); -out: - finish_wait(sk_sleep(sk), &wait); - return rc; - -do_error: - rc = -EPIPE; - goto out; -do_interrupted: - rc = -EINTR; - goto out; + + switch (ccid_packet_dequeue_eval(rc)) { + case CCID_PACKET_WILL_DEQUEUE_LATER: + /* + * If the CCID determines when to send, the next sending + * time is unknown or the CCID may not even send again + * (e.g. remote host crashes or lost Ack packets). + */ + DCCP_WARN("CCID did not manage to send all packets\n"); + return; + case CCID_PACKET_DELAY: + delay = msecs_to_jiffies(rc); + if (delay > *time_budget) + return; + rc = dccp_wait_for_ccid(sk, delay); + if (rc < 0) + return; + *time_budget -= (delay - rc); + /* check again if we can send now */ + break; + case CCID_PACKET_SEND_AT_ONCE: + dccp_xmit_packet(sk); + break; + case CCID_PACKET_ERR: + skb_dequeue(&sk->sk_write_queue); + kfree_skb(skb); + dccp_pr_debug("packet discarded due to err=%ld\n", rc); + } + } } -void dccp_write_xmit(struct sock *sk, int block) +void dccp_write_xmit(struct sock *sk) { struct dccp_sock *dp = dccp_sk(sk); struct sk_buff *skb; while ((skb = skb_peek(&sk->sk_write_queue))) { - int err = ccid_hc_tx_send_packet(dp->dccps_hc_tx_ccid, sk, skb); - - if (err > 0) { - if (!block) { - sk_reset_timer(sk, &dp->dccps_xmit_timer, - msecs_to_jiffies(err)+jiffies); - break; - } else - err = dccp_wait_for_ccid(sk, skb, err); - if (err && err != -EINTR) - DCCP_BUG("err=%d after dccp_wait_for_ccid", err); - } + int rc = ccid_hc_tx_send_packet(dp->dccps_hc_tx_ccid, sk, skb); - skb_dequeue(&sk->sk_write_queue); - if (err == 0) { - struct dccp_skb_cb *dcb = DCCP_SKB_CB(skb); - const int len = skb->len; - - if (sk->sk_state == DCCP_PARTOPEN) { - const u32 cur_mps = dp->dccps_mss_cache - DCCP_FEATNEG_OVERHEAD; - /* - * See 8.1.5 - Handshake Completion. - * - * For robustness we resend Confirm options until the client has - * entered OPEN. During the initial feature negotiation, the MPS - * is smaller than usual, reduced by the Change/Confirm options. - */ - if (!list_empty(&dp->dccps_featneg) && len > cur_mps) { - DCCP_WARN("Payload too large (%d) for featneg.\n", len); - dccp_send_ack(sk); - dccp_feat_list_purge(&dp->dccps_featneg); - } - - inet_csk_schedule_ack(sk); - inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK, - inet_csk(sk)->icsk_rto, - DCCP_RTO_MAX); - dcb->dccpd_type = DCCP_PKT_DATAACK; - } else if (dccp_ack_pending(sk)) - dcb->dccpd_type = DCCP_PKT_DATAACK; - else - dcb->dccpd_type = DCCP_PKT_DATA; - - err = dccp_transmit_skb(sk, skb); - ccid_hc_tx_packet_sent(dp->dccps_hc_tx_ccid, sk, len); - if (err) - DCCP_BUG("err=%d after ccid_hc_tx_packet_sent", - err); - } else { - dccp_pr_debug("packet discarded due to err=%d\n", err); + switch (ccid_packet_dequeue_eval(rc)) { + case CCID_PACKET_WILL_DEQUEUE_LATER: + return; + case CCID_PACKET_DELAY: + sk_reset_timer(sk, &dp->dccps_xmit_timer, + jiffies + msecs_to_jiffies(rc)); + return; + case CCID_PACKET_SEND_AT_ONCE: + dccp_xmit_packet(sk); + break; + case CCID_PACKET_ERR: + skb_dequeue(&sk->sk_write_queue); kfree_skb(skb); + dccp_pr_debug("packet discarded due to err=%d\n", rc); } } } @@ -622,7 +664,6 @@ void dccp_send_close(struct sock *sk, const int active) DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_CLOSE; if (active) { - dccp_write_xmit(sk, 1); dccp_skb_entail(sk, skb); dccp_transmit_skb(sk, skb_clone(skb, prio)); /* diff --git a/net/dccp/proto.c b/net/dccp/proto.c index 7e5fc04eb6d1..ef343d53fcea 100644 --- a/net/dccp/proto.c +++ b/net/dccp/proto.c @@ -726,7 +726,13 @@ int dccp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, goto out_discard; skb_queue_tail(&sk->sk_write_queue, skb); - dccp_write_xmit(sk,0); + /* + * The xmit_timer is set if the TX CCID is rate-based and will expire + * when congestion control permits to release further packets into the + * network. Window-based CCIDs do not use this timer. + */ + if (!timer_pending(&dp->dccps_xmit_timer)) + dccp_write_xmit(sk); out_release: release_sock(sk); return rc ? : len; @@ -951,9 +957,22 @@ void dccp_close(struct sock *sk, long timeout) /* Check zero linger _after_ checking for unread data. */ sk->sk_prot->disconnect(sk, 0); } else if (sk->sk_state != DCCP_CLOSED) { + /* + * Normal connection termination. May need to wait if there are + * still packets in the TX queue that are delayed by the CCID. + */ + dccp_flush_write_queue(sk, &timeout); dccp_terminate_connection(sk); } + /* + * Flush write queue. This may be necessary in several cases: + * - we have been closed by the peer but still have application data; + * - abortive termination (unread data or zero linger time), + * - normal termination but queue could not be flushed within time limit + */ + __skb_queue_purge(&sk->sk_write_queue); + sk_stream_wait_close(sk, timeout); adjudge_to_death: diff --git a/net/dccp/timer.c b/net/dccp/timer.c index 1a9aa05d4dc4..7587870b7040 100644 --- a/net/dccp/timer.c +++ b/net/dccp/timer.c @@ -237,32 +237,35 @@ out: sock_put(sk); } -/* Transmit-delay timer: used by the CCIDs to delay actual send time */ -static void dccp_write_xmit_timer(unsigned long data) +/** + * dccp_write_xmitlet - Workhorse for CCID packet dequeueing interface + * See the comments above %ccid_dequeueing_decision for supported modes. + */ +static void dccp_write_xmitlet(unsigned long data) { struct sock *sk = (struct sock *)data; - struct dccp_sock *dp = dccp_sk(sk); bh_lock_sock(sk); if (sock_owned_by_user(sk)) - sk_reset_timer(sk, &dp->dccps_xmit_timer, jiffies+1); + sk_reset_timer(sk, &dccp_sk(sk)->dccps_xmit_timer, jiffies + 1); else - dccp_write_xmit(sk, 0); + dccp_write_xmit(sk); bh_unlock_sock(sk); - sock_put(sk); } -static void dccp_init_write_xmit_timer(struct sock *sk) +static void dccp_write_xmit_timer(unsigned long data) { - struct dccp_sock *dp = dccp_sk(sk); - - setup_timer(&dp->dccps_xmit_timer, dccp_write_xmit_timer, - (unsigned long)sk); + dccp_write_xmitlet(data); + sock_put((struct sock *)data); } void dccp_init_xmit_timers(struct sock *sk) { - dccp_init_write_xmit_timer(sk); + struct dccp_sock *dp = dccp_sk(sk); + + tasklet_init(&dp->dccps_xmitlet, dccp_write_xmitlet, (unsigned long)sk); + setup_timer(&dp->dccps_xmit_timer, dccp_write_xmit_timer, + (unsigned long)sk); inet_csk_init_xmit_timers(sk, &dccp_write_timer, &dccp_delack_timer, &dccp_keepalive_timer); } diff --git a/net/decnet/af_decnet.c b/net/decnet/af_decnet.c index d6b93d19790f..6f97268ed85f 100644 --- a/net/decnet/af_decnet.c +++ b/net/decnet/af_decnet.c @@ -155,7 +155,7 @@ static const struct proto_ops dn_proto_ops; static DEFINE_RWLOCK(dn_hash_lock); static struct hlist_head dn_sk_hash[DN_SK_HASH_SIZE]; static struct hlist_head dn_wild_sk; -static atomic_t decnet_memory_allocated; +static atomic_long_t decnet_memory_allocated; static int __dn_setsockopt(struct socket *sock, int level, int optname, char __user *optval, unsigned int optlen, int flags); static int __dn_getsockopt(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen, int flags); @@ -1556,6 +1556,8 @@ static int __dn_getsockopt(struct socket *sock, int level,int optname, char __us if (r_len > sizeof(struct linkinfo_dn)) r_len = sizeof(struct linkinfo_dn); + memset(&link, 0, sizeof(link)); + switch(sock->state) { case SS_CONNECTING: link.idn_linkstate = LL_CONNECTING; diff --git a/net/decnet/sysctl_net_decnet.c b/net/decnet/sysctl_net_decnet.c index be3eb8e23288..28f8b5e5f73b 100644 --- a/net/decnet/sysctl_net_decnet.c +++ b/net/decnet/sysctl_net_decnet.c @@ -38,7 +38,7 @@ int decnet_log_martians = 1; int decnet_no_fc_max_cwnd = NSP_MIN_WINDOW; /* Reasonable defaults, I hope, based on tcp's defaults */ -int sysctl_decnet_mem[3] = { 768 << 3, 1024 << 3, 1536 << 3 }; +long sysctl_decnet_mem[3] = { 768 << 3, 1024 << 3, 1536 << 3 }; int sysctl_decnet_wmem[3] = { 4 * 1024, 16 * 1024, 128 * 1024 }; int sysctl_decnet_rmem[3] = { 4 * 1024, 87380, 87380 * 2 }; @@ -324,7 +324,7 @@ static ctl_table dn_table[] = { .data = &sysctl_decnet_mem, .maxlen = sizeof(sysctl_decnet_mem), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = proc_doulongvec_minmax }, { .procname = "decnet_rmem", diff --git a/net/econet/af_econet.c b/net/econet/af_econet.c index f8c1ae4b41f0..13992e1d2726 100644 --- a/net/econet/af_econet.c +++ b/net/econet/af_econet.c @@ -31,6 +31,7 @@ #include <linux/skbuff.h> #include <linux/udp.h> #include <linux/slab.h> +#include <linux/vmalloc.h> #include <net/sock.h> #include <net/inet_common.h> #include <linux/stat.h> @@ -276,12 +277,12 @@ static int econet_sendmsg(struct kiocb *iocb, struct socket *sock, #endif #ifdef CONFIG_ECONET_AUNUDP struct msghdr udpmsg; - struct iovec iov[msg->msg_iovlen+1]; + struct iovec iov[2]; struct aunhdr ah; struct sockaddr_in udpdest; __kernel_size_t size; - int i; mm_segment_t oldfs; + char *userbuf; #endif /* @@ -297,23 +298,14 @@ static int econet_sendmsg(struct kiocb *iocb, struct socket *sock, mutex_lock(&econet_mutex); - if (saddr == NULL) { - struct econet_sock *eo = ec_sk(sk); - - addr.station = eo->station; - addr.net = eo->net; - port = eo->port; - cb = eo->cb; - } else { - if (msg->msg_namelen < sizeof(struct sockaddr_ec)) { - mutex_unlock(&econet_mutex); - return -EINVAL; - } - addr.station = saddr->addr.station; - addr.net = saddr->addr.net; - port = saddr->port; - cb = saddr->cb; - } + if (saddr == NULL || msg->msg_namelen < sizeof(struct sockaddr_ec)) { + mutex_unlock(&econet_mutex); + return -EINVAL; + } + addr.station = saddr->addr.station; + addr.net = saddr->addr.net; + port = saddr->port; + cb = saddr->cb; /* Look for a device with the right network number. */ dev = net2dev_map[addr.net]; @@ -328,17 +320,17 @@ static int econet_sendmsg(struct kiocb *iocb, struct socket *sock, } } - if (len + 15 > dev->mtu) { - mutex_unlock(&econet_mutex); - return -EMSGSIZE; - } - if (dev->type == ARPHRD_ECONET) { /* Real hardware Econet. We're not worthy etc. */ #ifdef CONFIG_ECONET_NATIVE unsigned short proto = 0; int res; + if (len + 15 > dev->mtu) { + mutex_unlock(&econet_mutex); + return -EMSGSIZE; + } + dev_hold(dev); skb = sock_alloc_send_skb(sk, len+LL_ALLOCATED_SPACE(dev), @@ -351,7 +343,6 @@ static int econet_sendmsg(struct kiocb *iocb, struct socket *sock, eb = (struct ec_cb *)&skb->cb; - /* BUG: saddr may be NULL */ eb->cookie = saddr->cookie; eb->sec = *saddr; eb->sent = ec_tx_done; @@ -415,6 +406,11 @@ static int econet_sendmsg(struct kiocb *iocb, struct socket *sock, return -ENETDOWN; /* No socket - can't send */ } + if (len > 32768) { + err = -E2BIG; + goto error; + } + /* Make up a UDP datagram and hand it off to some higher intellect. */ memset(&udpdest, 0, sizeof(udpdest)); @@ -446,36 +442,26 @@ static int econet_sendmsg(struct kiocb *iocb, struct socket *sock, /* tack our header on the front of the iovec */ size = sizeof(struct aunhdr); - /* - * XXX: that is b0rken. We can't mix userland and kernel pointers - * in iovec, since on a lot of platforms copy_from_user() will - * *not* work with the kernel and userland ones at the same time, - * regardless of what we do with set_fs(). And we are talking about - * econet-over-ethernet here, so "it's only ARM anyway" doesn't - * apply. Any suggestions on fixing that code? -- AV - */ iov[0].iov_base = (void *)&ah; iov[0].iov_len = size; - for (i = 0; i < msg->msg_iovlen; i++) { - void __user *base = msg->msg_iov[i].iov_base; - size_t iov_len = msg->msg_iov[i].iov_len; - /* Check it now since we switch to KERNEL_DS later. */ - if (!access_ok(VERIFY_READ, base, iov_len)) { - mutex_unlock(&econet_mutex); - return -EFAULT; - } - iov[i+1].iov_base = base; - iov[i+1].iov_len = iov_len; - size += iov_len; + + userbuf = vmalloc(len); + if (userbuf == NULL) { + err = -ENOMEM; + goto error; } + iov[1].iov_base = userbuf; + iov[1].iov_len = len; + err = memcpy_fromiovec(userbuf, msg->msg_iov, len); + if (err) + goto error_free_buf; + /* Get a skbuff (no data, just holds our cb information) */ if ((skb = sock_alloc_send_skb(sk, 0, msg->msg_flags & MSG_DONTWAIT, - &err)) == NULL) { - mutex_unlock(&econet_mutex); - return err; - } + &err)) == NULL) + goto error_free_buf; eb = (struct ec_cb *)&skb->cb; @@ -491,7 +477,7 @@ static int econet_sendmsg(struct kiocb *iocb, struct socket *sock, udpmsg.msg_name = (void *)&udpdest; udpmsg.msg_namelen = sizeof(udpdest); udpmsg.msg_iov = &iov[0]; - udpmsg.msg_iovlen = msg->msg_iovlen + 1; + udpmsg.msg_iovlen = 2; udpmsg.msg_control = NULL; udpmsg.msg_controllen = 0; udpmsg.msg_flags=0; @@ -499,9 +485,13 @@ static int econet_sendmsg(struct kiocb *iocb, struct socket *sock, oldfs = get_fs(); set_fs(KERNEL_DS); /* More privs :-) */ err = sock_sendmsg(udpsock, &udpmsg, size); set_fs(oldfs); + +error_free_buf: + vfree(userbuf); #else err = -EPROTOTYPE; #endif + error: mutex_unlock(&econet_mutex); return err; @@ -671,6 +661,9 @@ static int ec_dev_ioctl(struct socket *sock, unsigned int cmd, void __user *arg) err = 0; switch (cmd) { case SIOCSIFADDR: + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + edev = dev->ec_ptr; if (edev == NULL) { /* Magic up a new one. */ diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 36e27c2107de..eb6f69a8f27a 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -1052,7 +1052,7 @@ static void ip_fib_net_exit(struct net *net) hlist_for_each_entry_safe(tb, node, tmp, head, tb_hlist) { hlist_del(node); fib_table_flush(tb); - kfree(tb); + fib_free_table(tb); } } kfree(net->ipv4.fib_table_hash); diff --git a/net/ipv4/fib_hash.c b/net/ipv4/fib_hash.c index 43e1c594ce8f..b3acb0417b21 100644 --- a/net/ipv4/fib_hash.c +++ b/net/ipv4/fib_hash.c @@ -120,11 +120,12 @@ static inline void fn_rebuild_zone(struct fn_zone *fz, struct fib_node *f; hlist_for_each_entry_safe(f, node, n, &old_ht[i], fn_hash) { - struct hlist_head __rcu *new_head; + struct hlist_head *new_head; hlist_del_rcu(&f->fn_hash); - new_head = &fz->fz_hash[fn_hash(f->fn_key, fz)]; + new_head = rcu_dereference_protected(fz->fz_hash, 1) + + fn_hash(f->fn_key, fz); hlist_add_head_rcu(&f->fn_hash, new_head); } } @@ -179,8 +180,8 @@ static void fn_rehash_zone(struct fn_zone *fz) memcpy(&nfz, fz, sizeof(nfz)); write_seqlock_bh(&fz->fz_lock); - old_ht = fz->fz_hash; - nfz.fz_hash = ht; + old_ht = rcu_dereference_protected(fz->fz_hash, 1); + RCU_INIT_POINTER(nfz.fz_hash, ht); nfz.fz_hashmask = new_hashmask; nfz.fz_divisor = new_divisor; fn_rebuild_zone(&nfz, old_ht, old_divisor); @@ -236,7 +237,7 @@ fn_new_zone(struct fn_hash *table, int z) seqlock_init(&fz->fz_lock); fz->fz_divisor = z ? EMBEDDED_HASH_SIZE : 1; fz->fz_hashmask = fz->fz_divisor - 1; - fz->fz_hash = fz->fz_embedded_hash; + RCU_INIT_POINTER(fz->fz_hash, fz->fz_embedded_hash); fz->fz_order = z; fz->fz_revorder = 32 - z; fz->fz_mask = inet_make_mask(z); @@ -272,7 +273,7 @@ int fib_table_lookup(struct fib_table *tb, for (fz = rcu_dereference(t->fn_zone_list); fz != NULL; fz = rcu_dereference(fz->fz_next)) { - struct hlist_head __rcu *head; + struct hlist_head *head; struct hlist_node *node; struct fib_node *f; __be32 k; @@ -282,7 +283,7 @@ int fib_table_lookup(struct fib_table *tb, seq = read_seqbegin(&fz->fz_lock); k = fz_key(flp->fl4_dst, fz); - head = &fz->fz_hash[fn_hash(k, fz)]; + head = rcu_dereference(fz->fz_hash) + fn_hash(k, fz); hlist_for_each_entry_rcu(f, node, head, fn_hash) { if (f->fn_key != k) continue; @@ -311,6 +312,7 @@ void fib_table_select_default(struct fib_table *tb, struct fib_info *last_resort; struct fn_hash *t = (struct fn_hash *)tb->tb_data; struct fn_zone *fz = t->fn_zones[0]; + struct hlist_head *head; if (fz == NULL) return; @@ -320,7 +322,8 @@ void fib_table_select_default(struct fib_table *tb, order = -1; rcu_read_lock(); - hlist_for_each_entry_rcu(f, node, &fz->fz_hash[0], fn_hash) { + head = rcu_dereference(fz->fz_hash); + hlist_for_each_entry_rcu(f, node, head, fn_hash) { struct fib_alias *fa; list_for_each_entry_rcu(fa, &f->fn_alias, fa_list) { @@ -374,7 +377,7 @@ out: /* Insert node F to FZ. */ static inline void fib_insert_node(struct fn_zone *fz, struct fib_node *f) { - struct hlist_head *head = &fz->fz_hash[fn_hash(f->fn_key, fz)]; + struct hlist_head *head = rtnl_dereference(fz->fz_hash) + fn_hash(f->fn_key, fz); hlist_add_head_rcu(&f->fn_hash, head); } @@ -382,7 +385,7 @@ static inline void fib_insert_node(struct fn_zone *fz, struct fib_node *f) /* Return the node in FZ matching KEY. */ static struct fib_node *fib_find_node(struct fn_zone *fz, __be32 key) { - struct hlist_head *head = &fz->fz_hash[fn_hash(key, fz)]; + struct hlist_head *head = rtnl_dereference(fz->fz_hash) + fn_hash(key, fz); struct hlist_node *node; struct fib_node *f; @@ -662,7 +665,7 @@ int fib_table_delete(struct fib_table *tb, struct fib_config *cfg) static int fn_flush_list(struct fn_zone *fz, int idx) { - struct hlist_head *head = &fz->fz_hash[idx]; + struct hlist_head *head = rtnl_dereference(fz->fz_hash) + idx; struct hlist_node *node, *n; struct fib_node *f; int found = 0; @@ -713,6 +716,24 @@ int fib_table_flush(struct fib_table *tb) return found; } +void fib_free_table(struct fib_table *tb) +{ + struct fn_hash *table = (struct fn_hash *) tb->tb_data; + struct fn_zone *fz, *next; + + next = table->fn_zone_list; + while (next != NULL) { + fz = next; + next = fz->fz_next; + + if (fz->fz_hash != fz->fz_embedded_hash) + fz_hash_free(fz->fz_hash, fz->fz_divisor); + + kfree(fz); + } + + kfree(tb); +} static inline int fn_hash_dump_bucket(struct sk_buff *skb, struct netlink_callback *cb, @@ -761,14 +782,15 @@ fn_hash_dump_zone(struct sk_buff *skb, struct netlink_callback *cb, struct fn_zone *fz) { int h, s_h; + struct hlist_head *head = rcu_dereference(fz->fz_hash); - if (fz->fz_hash == NULL) + if (head == NULL) return skb->len; s_h = cb->args[3]; for (h = s_h; h < fz->fz_divisor; h++) { - if (hlist_empty(&fz->fz_hash[h])) + if (hlist_empty(head + h)) continue; - if (fn_hash_dump_bucket(skb, cb, tb, fz, &fz->fz_hash[h]) < 0) { + if (fn_hash_dump_bucket(skb, cb, tb, fz, head + h) < 0) { cb->args[3] = h; return -1; } @@ -872,7 +894,7 @@ static struct fib_alias *fib_get_first(struct seq_file *seq) if (!iter->zone->fz_nent) continue; - iter->hash_head = iter->zone->fz_hash; + iter->hash_head = rcu_dereference(iter->zone->fz_hash); maxslot = iter->zone->fz_divisor; for (iter->bucket = 0; iter->bucket < maxslot; @@ -957,7 +979,7 @@ static struct fib_alias *fib_get_next(struct seq_file *seq) goto out; iter->bucket = 0; - iter->hash_head = iter->zone->fz_hash; + iter->hash_head = rcu_dereference(iter->zone->fz_hash); hlist_for_each_entry(fn, node, iter->hash_head, fn_hash) { list_for_each_entry(fa, &fn->fn_alias, fa_list) { diff --git a/net/ipv4/fib_lookup.h b/net/ipv4/fib_lookup.h index a29edf2219c8..c079cc0ec651 100644 --- a/net/ipv4/fib_lookup.h +++ b/net/ipv4/fib_lookup.h @@ -47,11 +47,8 @@ extern int fib_detect_death(struct fib_info *fi, int order, static inline void fib_result_assign(struct fib_result *res, struct fib_info *fi) { - if (res->fi != NULL) - fib_info_put(res->fi); + /* we used to play games with refcounts, but we now use RCU */ res->fi = fi; - if (fi != NULL) - atomic_inc(&fi->fib_clntref); } #endif /* _FIB_LOOKUP_H */ diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index b14450895102..0f280348e0fd 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -365,7 +365,7 @@ static struct tnode *tnode_alloc(size_t size) if (size <= PAGE_SIZE) return kzalloc(size, GFP_KERNEL); else - return __vmalloc(size, GFP_KERNEL | __GFP_ZERO, PAGE_KERNEL); + return vzalloc(size); } static void __tnode_vfree(struct work_struct *arg) @@ -1797,6 +1797,11 @@ int fib_table_flush(struct fib_table *tb) return found; } +void fib_free_table(struct fib_table *tb) +{ + kfree(tb); +} + void fib_table_select_default(struct fib_table *tb, const struct flowi *flp, struct fib_result *res) diff --git a/net/ipv4/gre.c b/net/ipv4/gre.c index caea6885fdbd..c6933f2ea310 100644 --- a/net/ipv4/gre.c +++ b/net/ipv4/gre.c @@ -22,7 +22,7 @@ #include <net/gre.h> -static const struct gre_protocol *gre_proto[GREPROTO_MAX] __read_mostly; +static const struct gre_protocol __rcu *gre_proto[GREPROTO_MAX] __read_mostly; static DEFINE_SPINLOCK(gre_proto_lock); int gre_add_protocol(const struct gre_protocol *proto, u8 version) @@ -51,7 +51,8 @@ int gre_del_protocol(const struct gre_protocol *proto, u8 version) goto err_out; spin_lock(&gre_proto_lock); - if (gre_proto[version] != proto) + if (rcu_dereference_protected(gre_proto[version], + lockdep_is_held(&gre_proto_lock)) != proto) goto err_out_unlock; rcu_assign_pointer(gre_proto[version], NULL); spin_unlock(&gre_proto_lock); diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 96bc7f9475a3..e5d1a44bcbdf 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -569,6 +569,9 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info) /* No need to clone since we're just using its address. */ rt2 = rt; + if (!fl.nl_u.ip4_u.saddr) + fl.nl_u.ip4_u.saddr = rt->rt_src; + err = xfrm_lookup(net, (struct dst_entry **)&rt, &fl, NULL, 0); switch (err) { case 0: diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index c8877c6c7216..3c53c2d89e3b 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -2306,10 +2306,8 @@ void ip_mc_drop_socket(struct sock *sk) in_dev = inetdev_by_index(net, iml->multi.imr_ifindex); (void) ip_mc_leave_src(sk, iml, in_dev); - if (in_dev != NULL) { + if (in_dev != NULL) ip_mc_dec_group(in_dev, iml->multi.imr_multiaddr.s_addr); - in_dev_put(in_dev); - } /* decrease mem now to avoid the memleak warning */ atomic_sub(sizeof(*iml), &sk->sk_omem_alloc); call_rcu(&iml->rcu, ip_mc_socklist_reclaim); diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index ba8042665849..2ada17129fce 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -490,9 +490,11 @@ static int inet_csk_diag_dump(struct sock *sk, { struct inet_diag_req *r = NLMSG_DATA(cb->nlh); - if (cb->nlh->nlmsg_len > 4 + NLMSG_SPACE(sizeof(*r))) { + if (nlmsg_attrlen(cb->nlh, sizeof(*r))) { struct inet_diag_entry entry; - struct rtattr *bc = (struct rtattr *)(r + 1); + const struct nlattr *bc = nlmsg_find_attr(cb->nlh, + sizeof(*r), + INET_DIAG_REQ_BYTECODE); struct inet_sock *inet = inet_sk(sk); entry.family = sk->sk_family; @@ -512,7 +514,7 @@ static int inet_csk_diag_dump(struct sock *sk, entry.dport = ntohs(inet->inet_dport); entry.userlocks = sk->sk_userlocks; - if (!inet_diag_bc_run(RTA_DATA(bc), RTA_PAYLOAD(bc), &entry)) + if (!inet_diag_bc_run(nla_data(bc), nla_len(bc), &entry)) return 0; } @@ -527,9 +529,11 @@ static int inet_twsk_diag_dump(struct inet_timewait_sock *tw, { struct inet_diag_req *r = NLMSG_DATA(cb->nlh); - if (cb->nlh->nlmsg_len > 4 + NLMSG_SPACE(sizeof(*r))) { + if (nlmsg_attrlen(cb->nlh, sizeof(*r))) { struct inet_diag_entry entry; - struct rtattr *bc = (struct rtattr *)(r + 1); + const struct nlattr *bc = nlmsg_find_attr(cb->nlh, + sizeof(*r), + INET_DIAG_REQ_BYTECODE); entry.family = tw->tw_family; #if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) @@ -548,7 +552,7 @@ static int inet_twsk_diag_dump(struct inet_timewait_sock *tw, entry.dport = ntohs(tw->tw_dport); entry.userlocks = 0; - if (!inet_diag_bc_run(RTA_DATA(bc), RTA_PAYLOAD(bc), &entry)) + if (!inet_diag_bc_run(nla_data(bc), nla_len(bc), &entry)) return 0; } @@ -618,7 +622,7 @@ static int inet_diag_dump_reqs(struct sk_buff *skb, struct sock *sk, struct inet_diag_req *r = NLMSG_DATA(cb->nlh); struct inet_connection_sock *icsk = inet_csk(sk); struct listen_sock *lopt; - struct rtattr *bc = NULL; + const struct nlattr *bc = NULL; struct inet_sock *inet = inet_sk(sk); int j, s_j; int reqnum, s_reqnum; @@ -638,8 +642,9 @@ static int inet_diag_dump_reqs(struct sk_buff *skb, struct sock *sk, if (!lopt || !lopt->qlen) goto out; - if (cb->nlh->nlmsg_len > 4 + NLMSG_SPACE(sizeof(*r))) { - bc = (struct rtattr *)(r + 1); + if (nlmsg_attrlen(cb->nlh, sizeof(*r))) { + bc = nlmsg_find_attr(cb->nlh, sizeof(*r), + INET_DIAG_REQ_BYTECODE); entry.sport = inet->inet_num; entry.userlocks = sk->sk_userlocks; } @@ -672,8 +677,8 @@ static int inet_diag_dump_reqs(struct sk_buff *skb, struct sock *sk, &ireq->rmt_addr; entry.dport = ntohs(ireq->rmt_port); - if (!inet_diag_bc_run(RTA_DATA(bc), - RTA_PAYLOAD(bc), &entry)) + if (!inet_diag_bc_run(nla_data(bc), + nla_len(bc), &entry)) continue; } diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index 1b344f30b463..3c0369a3a663 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -133,8 +133,7 @@ int __inet_inherit_port(struct sock *sk, struct sock *child) } } } - sk_add_bind_node(child, &tb->owners); - inet_csk(child)->icsk_bind_hash = tb; + inet_bind_hash(child, tb, port); spin_unlock(&head->lock); return 0; diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c index 9ffa24b9a804..9e94d7cf4f8a 100644 --- a/net/ipv4/inetpeer.c +++ b/net/ipv4/inetpeer.c @@ -72,18 +72,19 @@ static struct kmem_cache *peer_cachep __read_mostly; #define node_height(x) x->avl_height #define peer_avl_empty ((struct inet_peer *)&peer_fake_node) +#define peer_avl_empty_rcu ((struct inet_peer __rcu __force *)&peer_fake_node) static const struct inet_peer peer_fake_node = { - .avl_left = peer_avl_empty, - .avl_right = peer_avl_empty, + .avl_left = peer_avl_empty_rcu, + .avl_right = peer_avl_empty_rcu, .avl_height = 0 }; static struct { - struct inet_peer *root; + struct inet_peer __rcu *root; spinlock_t lock; int total; } peers = { - .root = peer_avl_empty, + .root = peer_avl_empty_rcu, .lock = __SPIN_LOCK_UNLOCKED(peers.lock), .total = 0, }; @@ -156,11 +157,14 @@ static void unlink_from_unused(struct inet_peer *p) */ #define lookup(_daddr, _stack) \ ({ \ - struct inet_peer *u, **v; \ + struct inet_peer *u; \ + struct inet_peer __rcu **v; \ \ stackptr = _stack; \ *stackptr++ = &peers.root; \ - for (u = peers.root; u != peer_avl_empty; ) { \ + for (u = rcu_dereference_protected(peers.root, \ + lockdep_is_held(&peers.lock)); \ + u != peer_avl_empty; ) { \ if (_daddr == u->v4daddr) \ break; \ if ((__force __u32)_daddr < (__force __u32)u->v4daddr) \ @@ -168,7 +172,8 @@ static void unlink_from_unused(struct inet_peer *p) else \ v = &u->avl_right; \ *stackptr++ = v; \ - u = *v; \ + u = rcu_dereference_protected(*v, \ + lockdep_is_held(&peers.lock)); \ } \ u; \ }) @@ -209,13 +214,17 @@ static struct inet_peer *lookup_rcu_bh(__be32 daddr) /* Called with local BH disabled and the pool lock held. */ #define lookup_rightempty(start) \ ({ \ - struct inet_peer *u, **v; \ + struct inet_peer *u; \ + struct inet_peer __rcu **v; \ *stackptr++ = &start->avl_left; \ v = &start->avl_left; \ - for (u = *v; u->avl_right != peer_avl_empty; ) { \ + for (u = rcu_dereference_protected(*v, \ + lockdep_is_held(&peers.lock)); \ + u->avl_right != peer_avl_empty_rcu; ) { \ v = &u->avl_right; \ *stackptr++ = v; \ - u = *v; \ + u = rcu_dereference_protected(*v, \ + lockdep_is_held(&peers.lock)); \ } \ u; \ }) @@ -224,74 +233,86 @@ static struct inet_peer *lookup_rcu_bh(__be32 daddr) * Variable names are the proof of operation correctness. * Look into mm/map_avl.c for more detail description of the ideas. */ -static void peer_avl_rebalance(struct inet_peer **stack[], - struct inet_peer ***stackend) +static void peer_avl_rebalance(struct inet_peer __rcu **stack[], + struct inet_peer __rcu ***stackend) { - struct inet_peer **nodep, *node, *l, *r; + struct inet_peer __rcu **nodep; + struct inet_peer *node, *l, *r; int lh, rh; while (stackend > stack) { nodep = *--stackend; - node = *nodep; - l = node->avl_left; - r = node->avl_right; + node = rcu_dereference_protected(*nodep, + lockdep_is_held(&peers.lock)); + l = rcu_dereference_protected(node->avl_left, + lockdep_is_held(&peers.lock)); + r = rcu_dereference_protected(node->avl_right, + lockdep_is_held(&peers.lock)); lh = node_height(l); rh = node_height(r); if (lh > rh + 1) { /* l: RH+2 */ struct inet_peer *ll, *lr, *lrl, *lrr; int lrh; - ll = l->avl_left; - lr = l->avl_right; + ll = rcu_dereference_protected(l->avl_left, + lockdep_is_held(&peers.lock)); + lr = rcu_dereference_protected(l->avl_right, + lockdep_is_held(&peers.lock)); lrh = node_height(lr); if (lrh <= node_height(ll)) { /* ll: RH+1 */ - node->avl_left = lr; /* lr: RH or RH+1 */ - node->avl_right = r; /* r: RH */ + RCU_INIT_POINTER(node->avl_left, lr); /* lr: RH or RH+1 */ + RCU_INIT_POINTER(node->avl_right, r); /* r: RH */ node->avl_height = lrh + 1; /* RH+1 or RH+2 */ - l->avl_left = ll; /* ll: RH+1 */ - l->avl_right = node; /* node: RH+1 or RH+2 */ + RCU_INIT_POINTER(l->avl_left, ll); /* ll: RH+1 */ + RCU_INIT_POINTER(l->avl_right, node); /* node: RH+1 or RH+2 */ l->avl_height = node->avl_height + 1; - *nodep = l; + RCU_INIT_POINTER(*nodep, l); } else { /* ll: RH, lr: RH+1 */ - lrl = lr->avl_left; /* lrl: RH or RH-1 */ - lrr = lr->avl_right; /* lrr: RH or RH-1 */ - node->avl_left = lrr; /* lrr: RH or RH-1 */ - node->avl_right = r; /* r: RH */ + lrl = rcu_dereference_protected(lr->avl_left, + lockdep_is_held(&peers.lock)); /* lrl: RH or RH-1 */ + lrr = rcu_dereference_protected(lr->avl_right, + lockdep_is_held(&peers.lock)); /* lrr: RH or RH-1 */ + RCU_INIT_POINTER(node->avl_left, lrr); /* lrr: RH or RH-1 */ + RCU_INIT_POINTER(node->avl_right, r); /* r: RH */ node->avl_height = rh + 1; /* node: RH+1 */ - l->avl_left = ll; /* ll: RH */ - l->avl_right = lrl; /* lrl: RH or RH-1 */ + RCU_INIT_POINTER(l->avl_left, ll); /* ll: RH */ + RCU_INIT_POINTER(l->avl_right, lrl); /* lrl: RH or RH-1 */ l->avl_height = rh + 1; /* l: RH+1 */ - lr->avl_left = l; /* l: RH+1 */ - lr->avl_right = node; /* node: RH+1 */ + RCU_INIT_POINTER(lr->avl_left, l); /* l: RH+1 */ + RCU_INIT_POINTER(lr->avl_right, node); /* node: RH+1 */ lr->avl_height = rh + 2; - *nodep = lr; + RCU_INIT_POINTER(*nodep, lr); } } else if (rh > lh + 1) { /* r: LH+2 */ struct inet_peer *rr, *rl, *rlr, *rll; int rlh; - rr = r->avl_right; - rl = r->avl_left; + rr = rcu_dereference_protected(r->avl_right, + lockdep_is_held(&peers.lock)); + rl = rcu_dereference_protected(r->avl_left, + lockdep_is_held(&peers.lock)); rlh = node_height(rl); if (rlh <= node_height(rr)) { /* rr: LH+1 */ - node->avl_right = rl; /* rl: LH or LH+1 */ - node->avl_left = l; /* l: LH */ + RCU_INIT_POINTER(node->avl_right, rl); /* rl: LH or LH+1 */ + RCU_INIT_POINTER(node->avl_left, l); /* l: LH */ node->avl_height = rlh + 1; /* LH+1 or LH+2 */ - r->avl_right = rr; /* rr: LH+1 */ - r->avl_left = node; /* node: LH+1 or LH+2 */ + RCU_INIT_POINTER(r->avl_right, rr); /* rr: LH+1 */ + RCU_INIT_POINTER(r->avl_left, node); /* node: LH+1 or LH+2 */ r->avl_height = node->avl_height + 1; - *nodep = r; + RCU_INIT_POINTER(*nodep, r); } else { /* rr: RH, rl: RH+1 */ - rlr = rl->avl_right; /* rlr: LH or LH-1 */ - rll = rl->avl_left; /* rll: LH or LH-1 */ - node->avl_right = rll; /* rll: LH or LH-1 */ - node->avl_left = l; /* l: LH */ + rlr = rcu_dereference_protected(rl->avl_right, + lockdep_is_held(&peers.lock)); /* rlr: LH or LH-1 */ + rll = rcu_dereference_protected(rl->avl_left, + lockdep_is_held(&peers.lock)); /* rll: LH or LH-1 */ + RCU_INIT_POINTER(node->avl_right, rll); /* rll: LH or LH-1 */ + RCU_INIT_POINTER(node->avl_left, l); /* l: LH */ node->avl_height = lh + 1; /* node: LH+1 */ - r->avl_right = rr; /* rr: LH */ - r->avl_left = rlr; /* rlr: LH or LH-1 */ + RCU_INIT_POINTER(r->avl_right, rr); /* rr: LH */ + RCU_INIT_POINTER(r->avl_left, rlr); /* rlr: LH or LH-1 */ r->avl_height = lh + 1; /* r: LH+1 */ - rl->avl_right = r; /* r: LH+1 */ - rl->avl_left = node; /* node: LH+1 */ + RCU_INIT_POINTER(rl->avl_right, r); /* r: LH+1 */ + RCU_INIT_POINTER(rl->avl_left, node); /* node: LH+1 */ rl->avl_height = lh + 2; - *nodep = rl; + RCU_INIT_POINTER(*nodep, rl); } } else { node->avl_height = (lh > rh ? lh : rh) + 1; @@ -303,10 +324,10 @@ static void peer_avl_rebalance(struct inet_peer **stack[], #define link_to_pool(n) \ do { \ n->avl_height = 1; \ - n->avl_left = peer_avl_empty; \ - n->avl_right = peer_avl_empty; \ - smp_wmb(); /* lockless readers can catch us now */ \ - **--stackptr = n; \ + n->avl_left = peer_avl_empty_rcu; \ + n->avl_right = peer_avl_empty_rcu; \ + /* lockless readers can catch us now */ \ + rcu_assign_pointer(**--stackptr, n); \ peer_avl_rebalance(stack, stackptr); \ } while (0) @@ -330,24 +351,25 @@ static void unlink_from_pool(struct inet_peer *p) * We use refcnt=-1 to alert lockless readers this entry is deleted. */ if (atomic_cmpxchg(&p->refcnt, 1, -1) == 1) { - struct inet_peer **stack[PEER_MAXDEPTH]; - struct inet_peer ***stackptr, ***delp; + struct inet_peer __rcu **stack[PEER_MAXDEPTH]; + struct inet_peer __rcu ***stackptr, ***delp; if (lookup(p->v4daddr, stack) != p) BUG(); delp = stackptr - 1; /* *delp[0] == p */ - if (p->avl_left == peer_avl_empty) { + if (p->avl_left == peer_avl_empty_rcu) { *delp[0] = p->avl_right; --stackptr; } else { /* look for a node to insert instead of p */ struct inet_peer *t; t = lookup_rightempty(p); - BUG_ON(*stackptr[-1] != t); + BUG_ON(rcu_dereference_protected(*stackptr[-1], + lockdep_is_held(&peers.lock)) != t); **--stackptr = t->avl_left; /* t is removed, t->v4daddr > x->v4daddr for any * x in p->avl_left subtree. * Put t in the old place of p. */ - *delp[0] = t; + RCU_INIT_POINTER(*delp[0], t); t->avl_left = p->avl_left; t->avl_right = p->avl_right; t->avl_height = p->avl_height; @@ -414,7 +436,7 @@ static int cleanup_once(unsigned long ttl) struct inet_peer *inet_getpeer(__be32 daddr, int create) { struct inet_peer *p; - struct inet_peer **stack[PEER_MAXDEPTH], ***stackptr; + struct inet_peer __rcu **stack[PEER_MAXDEPTH], ***stackptr; /* Look up for the address quickly, lockless. * Because of a concurrent writer, we might not find an existing entry. diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index d0ffcbe369b7..70ff77f02eee 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -1072,6 +1072,7 @@ ipgre_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd) break; } ipgre_tunnel_unlink(ign, t); + synchronize_net(); t->parms.iph.saddr = p.iph.saddr; t->parms.iph.daddr = p.iph.daddr; t->parms.i_key = p.i_key; @@ -1324,7 +1325,6 @@ static void ipgre_fb_tunnel_init(struct net_device *dev) { struct ip_tunnel *tunnel = netdev_priv(dev); struct iphdr *iph = &tunnel->parms.iph; - struct ipgre_net *ign = net_generic(dev_net(dev), ipgre_net_id); tunnel->dev = dev; strcpy(tunnel->parms.name, dev->name); @@ -1335,7 +1335,6 @@ static void ipgre_fb_tunnel_init(struct net_device *dev) tunnel->hlen = sizeof(struct iphdr) + 4; dev_hold(dev); - rcu_assign_pointer(ign->tunnels_wc[0], tunnel); } @@ -1382,10 +1381,12 @@ static int __net_init ipgre_init_net(struct net *net) if ((err = register_netdev(ign->fb_tunnel_dev))) goto err_reg_dev; + rcu_assign_pointer(ign->tunnels_wc[0], + netdev_priv(ign->fb_tunnel_dev)); return 0; err_reg_dev: - free_netdev(ign->fb_tunnel_dev); + ipgre_dev_free(ign->fb_tunnel_dev); err_alloc_dev: return err; } diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index 64b70ad162e3..3948c86e59ca 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -238,7 +238,7 @@ int ip_cmsg_send(struct net *net, struct msghdr *msg, struct ipcm_cookie *ipc) but receiver should be enough clever f.e. to forward mtrace requests, sent to multicast group to reach destination designated router. */ -struct ip_ra_chain *ip_ra_chain; +struct ip_ra_chain __rcu *ip_ra_chain; static DEFINE_SPINLOCK(ip_ra_lock); @@ -253,7 +253,8 @@ static void ip_ra_destroy_rcu(struct rcu_head *head) int ip_ra_control(struct sock *sk, unsigned char on, void (*destructor)(struct sock *)) { - struct ip_ra_chain *ra, *new_ra, **rap; + struct ip_ra_chain *ra, *new_ra; + struct ip_ra_chain __rcu **rap; if (sk->sk_type != SOCK_RAW || inet_sk(sk)->inet_num == IPPROTO_RAW) return -EINVAL; @@ -261,7 +262,10 @@ int ip_ra_control(struct sock *sk, unsigned char on, new_ra = on ? kmalloc(sizeof(*new_ra), GFP_KERNEL) : NULL; spin_lock_bh(&ip_ra_lock); - for (rap = &ip_ra_chain; (ra = *rap) != NULL; rap = &ra->next) { + for (rap = &ip_ra_chain; + (ra = rcu_dereference_protected(*rap, + lockdep_is_held(&ip_ra_lock))) != NULL; + rap = &ra->next) { if (ra->sk == sk) { if (on) { spin_unlock_bh(&ip_ra_lock); diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c index e9b816e6cd73..cd300aaee78f 100644 --- a/net/ipv4/ipip.c +++ b/net/ipv4/ipip.c @@ -676,6 +676,7 @@ ipip_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd) } t = netdev_priv(dev); ipip_tunnel_unlink(ipn, t); + synchronize_net(); t->parms.iph.saddr = p.iph.saddr; t->parms.iph.daddr = p.iph.daddr; memcpy(dev->dev_addr, &p.iph.saddr, 4); diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c index 3cad2591ace0..3fac340a28d5 100644 --- a/net/ipv4/netfilter/arp_tables.c +++ b/net/ipv4/netfilter/arp_tables.c @@ -927,6 +927,7 @@ static int get_info(struct net *net, void __user *user, private = &tmp; } #endif + memset(&info, 0, sizeof(info)); info.valid_hooks = t->valid_hooks; memcpy(info.hook_entry, private->hook_entry, sizeof(info.hook_entry)); diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index d31b007a6d80..a846d633b3b6 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -1124,6 +1124,7 @@ static int get_info(struct net *net, void __user *user, private = &tmp; } #endif + memset(&info, 0, sizeof(info)); info.valid_hooks = t->valid_hooks; memcpy(info.hook_entry, private->hook_entry, sizeof(info.hook_entry)); diff --git a/net/ipv4/netfilter/nf_nat_core.c b/net/ipv4/netfilter/nf_nat_core.c index 295c97431e43..c04787ce1a71 100644 --- a/net/ipv4/netfilter/nf_nat_core.c +++ b/net/ipv4/netfilter/nf_nat_core.c @@ -47,26 +47,6 @@ __nf_nat_proto_find(u_int8_t protonum) return rcu_dereference(nf_nat_protos[protonum]); } -static const struct nf_nat_protocol * -nf_nat_proto_find_get(u_int8_t protonum) -{ - const struct nf_nat_protocol *p; - - rcu_read_lock(); - p = __nf_nat_proto_find(protonum); - if (!try_module_get(p->me)) - p = &nf_nat_unknown_protocol; - rcu_read_unlock(); - - return p; -} - -static void -nf_nat_proto_put(const struct nf_nat_protocol *p) -{ - module_put(p->me); -} - /* We keep an extra hash for each conntrack, for fast searching. */ static inline unsigned int hash_by_src(const struct net *net, u16 zone, @@ -588,6 +568,26 @@ static struct nf_ct_ext_type nat_extend __read_mostly = { #include <linux/netfilter/nfnetlink.h> #include <linux/netfilter/nfnetlink_conntrack.h> +static const struct nf_nat_protocol * +nf_nat_proto_find_get(u_int8_t protonum) +{ + const struct nf_nat_protocol *p; + + rcu_read_lock(); + p = __nf_nat_proto_find(protonum); + if (!try_module_get(p->me)) + p = &nf_nat_unknown_protocol; + rcu_read_unlock(); + + return p; +} + +static void +nf_nat_proto_put(const struct nf_nat_protocol *p) +{ + module_put(p->me); +} + static const struct nla_policy protonat_nla_policy[CTA_PROTONAT_MAX+1] = { [CTA_PROTONAT_PORT_MIN] = { .type = NLA_U16 }, [CTA_PROTONAT_PORT_MAX] = { .type = NLA_U16 }, diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index 4ae1f203f7cb..1b48eb1ed453 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -59,13 +59,13 @@ static int sockstat_seq_show(struct seq_file *seq, void *v) local_bh_enable(); socket_seq_show(seq); - seq_printf(seq, "TCP: inuse %d orphan %d tw %d alloc %d mem %d\n", + seq_printf(seq, "TCP: inuse %d orphan %d tw %d alloc %d mem %ld\n", sock_prot_inuse_get(net, &tcp_prot), orphans, tcp_death_row.tw_count, sockets, - atomic_read(&tcp_memory_allocated)); - seq_printf(seq, "UDP: inuse %d mem %d\n", + atomic_long_read(&tcp_memory_allocated)); + seq_printf(seq, "UDP: inuse %d mem %ld\n", sock_prot_inuse_get(net, &udp_prot), - atomic_read(&udp_memory_allocated)); + atomic_long_read(&udp_memory_allocated)); seq_printf(seq, "UDPLITE: inuse %d\n", sock_prot_inuse_get(net, &udplite_prot)); seq_printf(seq, "RAW: inuse %d\n", diff --git a/net/ipv4/protocol.c b/net/ipv4/protocol.c index 65699c24411c..9ae5c01cd0b2 100644 --- a/net/ipv4/protocol.c +++ b/net/ipv4/protocol.c @@ -28,7 +28,7 @@ #include <linux/spinlock.h> #include <net/protocol.h> -const struct net_protocol *inet_protos[MAX_INET_PROTOS] __read_mostly; +const struct net_protocol __rcu *inet_protos[MAX_INET_PROTOS] __read_mostly; /* * Add a protocol handler to the hash tables @@ -38,7 +38,8 @@ int inet_add_protocol(const struct net_protocol *prot, unsigned char protocol) { int hash = protocol & (MAX_INET_PROTOS - 1); - return !cmpxchg(&inet_protos[hash], NULL, prot) ? 0 : -1; + return !cmpxchg((const struct net_protocol **)&inet_protos[hash], + NULL, prot) ? 0 : -1; } EXPORT_SYMBOL(inet_add_protocol); @@ -50,7 +51,8 @@ int inet_del_protocol(const struct net_protocol *prot, unsigned char protocol) { int ret, hash = protocol & (MAX_INET_PROTOS - 1); - ret = (cmpxchg(&inet_protos[hash], prot, NULL) == prot) ? 0 : -1; + ret = (cmpxchg((const struct net_protocol **)&inet_protos[hash], + prot, NULL) == prot) ? 0 : -1; synchronize_net(); diff --git a/net/ipv4/route.c b/net/ipv4/route.c index d6cb2bfcd8e1..987bf9adb318 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -198,7 +198,7 @@ const __u8 ip_tos2prio[16] = { */ struct rt_hash_bucket { - struct rtable *chain; + struct rtable __rcu *chain; }; #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \ @@ -280,7 +280,7 @@ static struct rtable *rt_cache_get_first(struct seq_file *seq) struct rtable *r = NULL; for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) { - if (!rt_hash_table[st->bucket].chain) + if (!rcu_dereference_raw(rt_hash_table[st->bucket].chain)) continue; rcu_read_lock_bh(); r = rcu_dereference_bh(rt_hash_table[st->bucket].chain); @@ -300,17 +300,17 @@ static struct rtable *__rt_cache_get_next(struct seq_file *seq, { struct rt_cache_iter_state *st = seq->private; - r = r->dst.rt_next; + r = rcu_dereference_bh(r->dst.rt_next); while (!r) { rcu_read_unlock_bh(); do { if (--st->bucket < 0) return NULL; - } while (!rt_hash_table[st->bucket].chain); + } while (!rcu_dereference_raw(rt_hash_table[st->bucket].chain)); rcu_read_lock_bh(); - r = rt_hash_table[st->bucket].chain; + r = rcu_dereference_bh(rt_hash_table[st->bucket].chain); } - return rcu_dereference_bh(r); + return r; } static struct rtable *rt_cache_get_next(struct seq_file *seq, @@ -721,19 +721,23 @@ static void rt_do_flush(int process_context) for (i = 0; i <= rt_hash_mask; i++) { if (process_context && need_resched()) cond_resched(); - rth = rt_hash_table[i].chain; + rth = rcu_dereference_raw(rt_hash_table[i].chain); if (!rth) continue; spin_lock_bh(rt_hash_lock_addr(i)); #ifdef CONFIG_NET_NS { - struct rtable ** prev, * p; + struct rtable __rcu **prev; + struct rtable *p; - rth = rt_hash_table[i].chain; + rth = rcu_dereference_protected(rt_hash_table[i].chain, + lockdep_is_held(rt_hash_lock_addr(i))); /* defer releasing the head of the list after spin_unlock */ - for (tail = rth; tail; tail = tail->dst.rt_next) + for (tail = rth; tail; + tail = rcu_dereference_protected(tail->dst.rt_next, + lockdep_is_held(rt_hash_lock_addr(i)))) if (!rt_is_expired(tail)) break; if (rth != tail) @@ -741,8 +745,12 @@ static void rt_do_flush(int process_context) /* call rt_free on entries after the tail requiring flush */ prev = &rt_hash_table[i].chain; - for (p = *prev; p; p = next) { - next = p->dst.rt_next; + for (p = rcu_dereference_protected(*prev, + lockdep_is_held(rt_hash_lock_addr(i))); + p != NULL; + p = next) { + next = rcu_dereference_protected(p->dst.rt_next, + lockdep_is_held(rt_hash_lock_addr(i))); if (!rt_is_expired(p)) { prev = &p->dst.rt_next; } else { @@ -752,14 +760,15 @@ static void rt_do_flush(int process_context) } } #else - rth = rt_hash_table[i].chain; - rt_hash_table[i].chain = NULL; + rth = rcu_dereference_protected(rt_hash_table[i].chain, + lockdep_is_held(rt_hash_lock_addr(i))); + rcu_assign_pointer(rt_hash_table[i].chain, NULL); tail = NULL; #endif spin_unlock_bh(rt_hash_lock_addr(i)); for (; rth != tail; rth = next) { - next = rth->dst.rt_next; + next = rcu_dereference_protected(rth->dst.rt_next, 1); rt_free(rth); } } @@ -790,7 +799,7 @@ static int has_noalias(const struct rtable *head, const struct rtable *rth) while (aux != rth) { if (compare_hash_inputs(&aux->fl, &rth->fl)) return 0; - aux = aux->dst.rt_next; + aux = rcu_dereference_protected(aux->dst.rt_next, 1); } return ONE; } @@ -799,7 +808,8 @@ static void rt_check_expire(void) { static unsigned int rover; unsigned int i = rover, goal; - struct rtable *rth, **rthp; + struct rtable *rth; + struct rtable __rcu **rthp; unsigned long samples = 0; unsigned long sum = 0, sum2 = 0; unsigned long delta; @@ -825,11 +835,12 @@ static void rt_check_expire(void) samples++; - if (*rthp == NULL) + if (rcu_dereference_raw(*rthp) == NULL) continue; length = 0; spin_lock_bh(rt_hash_lock_addr(i)); - while ((rth = *rthp) != NULL) { + while ((rth = rcu_dereference_protected(*rthp, + lockdep_is_held(rt_hash_lock_addr(i)))) != NULL) { prefetch(rth->dst.rt_next); if (rt_is_expired(rth)) { *rthp = rth->dst.rt_next; @@ -941,7 +952,8 @@ static int rt_garbage_collect(struct dst_ops *ops) static unsigned long last_gc; static int rover; static int equilibrium; - struct rtable *rth, **rthp; + struct rtable *rth; + struct rtable __rcu **rthp; unsigned long now = jiffies; int goal; int entries = dst_entries_get_fast(&ipv4_dst_ops); @@ -995,7 +1007,8 @@ static int rt_garbage_collect(struct dst_ops *ops) k = (k + 1) & rt_hash_mask; rthp = &rt_hash_table[k].chain; spin_lock_bh(rt_hash_lock_addr(k)); - while ((rth = *rthp) != NULL) { + while ((rth = rcu_dereference_protected(*rthp, + lockdep_is_held(rt_hash_lock_addr(k)))) != NULL) { if (!rt_is_expired(rth) && !rt_may_expire(rth, tmo, expire)) { tmo >>= 1; @@ -1071,7 +1084,7 @@ static int slow_chain_length(const struct rtable *head) while (rth) { length += has_noalias(head, rth); - rth = rth->dst.rt_next; + rth = rcu_dereference_protected(rth->dst.rt_next, 1); } return length >> FRACT_BITS; } @@ -1079,9 +1092,9 @@ static int slow_chain_length(const struct rtable *head) static int rt_intern_hash(unsigned hash, struct rtable *rt, struct rtable **rp, struct sk_buff *skb, int ifindex) { - struct rtable *rth, **rthp; + struct rtable *rth, *cand; + struct rtable __rcu **rthp, **candp; unsigned long now; - struct rtable *cand, **candp; u32 min_score; int chain_length; int attempts = !in_softirq(); @@ -1128,7 +1141,8 @@ restart: rthp = &rt_hash_table[hash].chain; spin_lock_bh(rt_hash_lock_addr(hash)); - while ((rth = *rthp) != NULL) { + while ((rth = rcu_dereference_protected(*rthp, + lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) { if (rt_is_expired(rth)) { *rthp = rth->dst.rt_next; rt_free(rth); @@ -1324,12 +1338,14 @@ EXPORT_SYMBOL(__ip_select_ident); static void rt_del(unsigned hash, struct rtable *rt) { - struct rtable **rthp, *aux; + struct rtable __rcu **rthp; + struct rtable *aux; rthp = &rt_hash_table[hash].chain; spin_lock_bh(rt_hash_lock_addr(hash)); ip_rt_put(rt); - while ((aux = *rthp) != NULL) { + while ((aux = rcu_dereference_protected(*rthp, + lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) { if (aux == rt || rt_is_expired(aux)) { *rthp = aux->dst.rt_next; rt_free(aux); @@ -1346,7 +1362,8 @@ void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw, { int i, k; struct in_device *in_dev = __in_dev_get_rcu(dev); - struct rtable *rth, **rthp; + struct rtable *rth; + struct rtable __rcu **rthp; __be32 skeys[2] = { saddr, 0 }; int ikeys[2] = { dev->ifindex, 0 }; struct netevent_redirect netevent; @@ -1379,7 +1396,7 @@ void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw, unsigned hash = rt_hash(daddr, skeys[i], ikeys[k], rt_genid(net)); - rthp=&rt_hash_table[hash].chain; + rthp = &rt_hash_table[hash].chain; while ((rth = rcu_dereference(*rthp)) != NULL) { struct rtable *rt; diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index d96c1da4b17c..1b4ec21497a4 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -26,6 +26,8 @@ static int zero; static int tcp_retr1_max = 255; static int ip_local_port_range_min[] = { 1, 1 }; static int ip_local_port_range_max[] = { 65535, 65535 }; +static int tcp_adv_win_scale_min = -31; +static int tcp_adv_win_scale_max = 31; /* Update system visible IP port range */ static void set_local_port_range(int range[2]) @@ -398,7 +400,7 @@ static struct ctl_table ipv4_table[] = { .data = &sysctl_tcp_mem, .maxlen = sizeof(sysctl_tcp_mem), .mode = 0644, - .proc_handler = proc_dointvec + .proc_handler = proc_doulongvec_minmax }, { .procname = "tcp_wmem", @@ -426,7 +428,9 @@ static struct ctl_table ipv4_table[] = { .data = &sysctl_tcp_adv_win_scale, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = proc_dointvec + .proc_handler = proc_dointvec_minmax, + .extra1 = &tcp_adv_win_scale_min, + .extra2 = &tcp_adv_win_scale_max, }, { .procname = "tcp_tw_reuse", @@ -602,8 +606,7 @@ static struct ctl_table ipv4_table[] = { .data = &sysctl_udp_mem, .maxlen = sizeof(sysctl_udp_mem), .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &zero + .proc_handler = proc_doulongvec_minmax, }, { .procname = "udp_rmem_min", diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 1664a0590bb8..f15c36a706ec 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -282,7 +282,7 @@ int sysctl_tcp_fin_timeout __read_mostly = TCP_FIN_TIMEOUT; struct percpu_counter tcp_orphan_count; EXPORT_SYMBOL_GPL(tcp_orphan_count); -int sysctl_tcp_mem[3] __read_mostly; +long sysctl_tcp_mem[3] __read_mostly; int sysctl_tcp_wmem[3] __read_mostly; int sysctl_tcp_rmem[3] __read_mostly; @@ -290,7 +290,7 @@ EXPORT_SYMBOL(sysctl_tcp_mem); EXPORT_SYMBOL(sysctl_tcp_rmem); EXPORT_SYMBOL(sysctl_tcp_wmem); -atomic_t tcp_memory_allocated; /* Current allocated memory. */ +atomic_long_t tcp_memory_allocated; /* Current allocated memory. */ EXPORT_SYMBOL(tcp_memory_allocated); /* @@ -2246,7 +2246,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level, /* Values greater than interface MTU won't take effect. However * at the point when this call is done we typically don't yet * know which interface is going to be used */ - if (val < 8 || val > MAX_TCP_WINDOW) { + if (val < TCP_MIN_MSS || val > MAX_TCP_WINDOW) { err = -EINVAL; break; } diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 3357f69e353d..6d8ab1c4efc3 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -259,8 +259,11 @@ static void tcp_fixup_sndbuf(struct sock *sk) int sndmem = tcp_sk(sk)->rx_opt.mss_clamp + MAX_TCP_HEADER + 16 + sizeof(struct sk_buff); - if (sk->sk_sndbuf < 3 * sndmem) - sk->sk_sndbuf = min(3 * sndmem, sysctl_tcp_wmem[2]); + if (sk->sk_sndbuf < 3 * sndmem) { + sk->sk_sndbuf = 3 * sndmem; + if (sk->sk_sndbuf > sysctl_tcp_wmem[2]) + sk->sk_sndbuf = sysctl_tcp_wmem[2]; + } } /* 2. Tuning advertised window (window_clamp, rcv_ssthresh) @@ -396,7 +399,7 @@ static void tcp_clamp_window(struct sock *sk) if (sk->sk_rcvbuf < sysctl_tcp_rmem[2] && !(sk->sk_userlocks & SOCK_RCVBUF_LOCK) && !tcp_memory_pressure && - atomic_read(&tcp_memory_allocated) < sysctl_tcp_mem[0]) { + atomic_long_read(&tcp_memory_allocated) < sysctl_tcp_mem[0]) { sk->sk_rcvbuf = min(atomic_read(&sk->sk_rmem_alloc), sysctl_tcp_rmem[2]); } @@ -4861,7 +4864,7 @@ static int tcp_should_expand_sndbuf(struct sock *sk) return 0; /* If we are under soft global TCP memory pressure, do not expand. */ - if (atomic_read(&tcp_memory_allocated) >= sysctl_tcp_mem[0]) + if (atomic_long_read(&tcp_memory_allocated) >= sysctl_tcp_mem[0]) return 0; /* If we filled the congestion window, do not expand. */ diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 8f8527d41682..e13da6de1fc7 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -415,6 +415,9 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info) !icsk->icsk_backoff) break; + if (sock_owned_by_user(sk)) + break; + icsk->icsk_backoff--; inet_csk(sk)->icsk_rto = __tcp_set_rto(tp) << icsk->icsk_backoff; @@ -429,11 +432,6 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info) if (remaining) { inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, remaining, TCP_RTO_MAX); - } else if (sock_owned_by_user(sk)) { - /* RTO revert clocked out retransmission, - * but socket is locked. Will defer. */ - inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, - HZ/20, TCP_RTO_MAX); } else { /* RTO revert clocked out retransmission. * Will retransmit now */ @@ -2045,7 +2043,9 @@ get_req: } get_sk: sk_nulls_for_each_from(sk, node) { - if (sk->sk_family == st->family && net_eq(sock_net(sk), net)) { + if (!net_eq(sock_net(sk), net)) + continue; + if (sk->sk_family == st->family) { cur = sk; goto out; } diff --git a/net/ipv4/tunnel4.c b/net/ipv4/tunnel4.c index 9a17bd2a0a37..ac3b3ee4b07c 100644 --- a/net/ipv4/tunnel4.c +++ b/net/ipv4/tunnel4.c @@ -14,27 +14,32 @@ #include <net/protocol.h> #include <net/xfrm.h> -static struct xfrm_tunnel *tunnel4_handlers __read_mostly; -static struct xfrm_tunnel *tunnel64_handlers __read_mostly; +static struct xfrm_tunnel __rcu *tunnel4_handlers __read_mostly; +static struct xfrm_tunnel __rcu *tunnel64_handlers __read_mostly; static DEFINE_MUTEX(tunnel4_mutex); -static inline struct xfrm_tunnel **fam_handlers(unsigned short family) +static inline struct xfrm_tunnel __rcu **fam_handlers(unsigned short family) { return (family == AF_INET) ? &tunnel4_handlers : &tunnel64_handlers; } int xfrm4_tunnel_register(struct xfrm_tunnel *handler, unsigned short family) { - struct xfrm_tunnel **pprev; + struct xfrm_tunnel __rcu **pprev; + struct xfrm_tunnel *t; + int ret = -EEXIST; int priority = handler->priority; mutex_lock(&tunnel4_mutex); - for (pprev = fam_handlers(family); *pprev; pprev = &(*pprev)->next) { - if ((*pprev)->priority > priority) + for (pprev = fam_handlers(family); + (t = rcu_dereference_protected(*pprev, + lockdep_is_held(&tunnel4_mutex))) != NULL; + pprev = &t->next) { + if (t->priority > priority) break; - if ((*pprev)->priority == priority) + if (t->priority == priority) goto err; } @@ -52,13 +57,17 @@ EXPORT_SYMBOL(xfrm4_tunnel_register); int xfrm4_tunnel_deregister(struct xfrm_tunnel *handler, unsigned short family) { - struct xfrm_tunnel **pprev; + struct xfrm_tunnel __rcu **pprev; + struct xfrm_tunnel *t; int ret = -ENOENT; mutex_lock(&tunnel4_mutex); - for (pprev = fam_handlers(family); *pprev; pprev = &(*pprev)->next) { - if (*pprev == handler) { + for (pprev = fam_handlers(family); + (t = rcu_dereference_protected(*pprev, + lockdep_is_held(&tunnel4_mutex))) != NULL; + pprev = &t->next) { + if (t == handler) { *pprev = handler->next; ret = 0; break; diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index b3f7e8cf18ac..5e0a3a582a59 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -110,7 +110,7 @@ struct udp_table udp_table __read_mostly; EXPORT_SYMBOL(udp_table); -int sysctl_udp_mem[3] __read_mostly; +long sysctl_udp_mem[3] __read_mostly; EXPORT_SYMBOL(sysctl_udp_mem); int sysctl_udp_rmem_min __read_mostly; @@ -119,7 +119,7 @@ EXPORT_SYMBOL(sysctl_udp_rmem_min); int sysctl_udp_wmem_min __read_mostly; EXPORT_SYMBOL(sysctl_udp_wmem_min); -atomic_t udp_memory_allocated; +atomic_long_t udp_memory_allocated; EXPORT_SYMBOL(udp_memory_allocated); #define MAX_UDP_PORTS 65536 @@ -1413,7 +1413,7 @@ int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) } } - if (sk->sk_filter) { + if (rcu_dereference_raw(sk->sk_filter)) { if (udp_lib_checksum_complete(skb)) goto drop; } diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index ec7a91d9e865..23cc8e1ce8d4 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -98,7 +98,11 @@ #endif #define INFINITY_LIFE_TIME 0xFFFFFFFF -#define TIME_DELTA(a, b) ((unsigned long)((long)(a) - (long)(b))) + +static inline u32 cstamp_delta(unsigned long cstamp) +{ + return (cstamp - INITIAL_JIFFIES) * 100UL / HZ; +} #define ADDRCONF_TIMER_FUZZ_MINUS (HZ > 50 ? HZ/50 : 1) #define ADDRCONF_TIMER_FUZZ (HZ / 4) @@ -836,7 +840,7 @@ static int ipv6_create_tempaddr(struct inet6_ifaddr *ifp, struct inet6_ifaddr *i { struct inet6_dev *idev = ifp->idev; struct in6_addr addr, *tmpaddr; - unsigned long tmp_prefered_lft, tmp_valid_lft, tmp_cstamp, tmp_tstamp; + unsigned long tmp_prefered_lft, tmp_valid_lft, tmp_cstamp, tmp_tstamp, age; unsigned long regen_advance; int tmp_plen; int ret = 0; @@ -886,12 +890,13 @@ retry: goto out; } memcpy(&addr.s6_addr[8], idev->rndid, 8); + age = (jiffies - ifp->tstamp) / HZ; tmp_valid_lft = min_t(__u32, ifp->valid_lft, - idev->cnf.temp_valid_lft); + idev->cnf.temp_valid_lft + age); tmp_prefered_lft = min_t(__u32, ifp->prefered_lft, - idev->cnf.temp_prefered_lft - + idev->cnf.temp_prefered_lft + age - idev->cnf.max_desync_factor); tmp_plen = ifp->prefix_len; max_addresses = idev->cnf.max_addresses; @@ -1426,8 +1431,10 @@ void addrconf_dad_failure(struct inet6_ifaddr *ifp) { struct inet6_dev *idev = ifp->idev; - if (addrconf_dad_end(ifp)) + if (addrconf_dad_end(ifp)) { + in6_ifa_put(ifp); return; + } if (net_ratelimit()) printk(KERN_INFO "%s: IPv6 duplicate address %pI6c detected!\n", @@ -2021,10 +2028,11 @@ ok: ipv6_ifa_notify(0, ift); } - if (create && in6_dev->cnf.use_tempaddr > 0) { + if ((create || list_empty(&in6_dev->tempaddr_list)) && in6_dev->cnf.use_tempaddr > 0) { /* * When a new public address is created as described in [ADDRCONF], - * also create a new temporary address. + * also create a new temporary address. Also create a temporary + * address if it's enabled but no temporary address currently exists. */ read_unlock_bh(&in6_dev->lock); ipv6_create_tempaddr(ifp, NULL); @@ -2736,10 +2744,6 @@ static int addrconf_ifdown(struct net_device *dev, int how) /* Flag it for later restoration when link comes up */ ifa->flags |= IFA_F_TENTATIVE; ifa->state = INET6_IFADDR_STATE_DAD; - - write_unlock_bh(&idev->lock); - - in6_ifa_hold(ifa); } else { list_del(&ifa->if_list); @@ -2754,19 +2758,15 @@ static int addrconf_ifdown(struct net_device *dev, int how) ifa->state = INET6_IFADDR_STATE_DEAD; spin_unlock_bh(&ifa->state_lock); - if (state == INET6_IFADDR_STATE_DEAD) - goto put_ifa; - } - - __ipv6_ifa_notify(RTM_DELADDR, ifa); - if (ifa->state == INET6_IFADDR_STATE_DEAD) - atomic_notifier_call_chain(&inet6addr_chain, - NETDEV_DOWN, ifa); - -put_ifa: - in6_ifa_put(ifa); + if (state != INET6_IFADDR_STATE_DEAD) { + __ipv6_ifa_notify(RTM_DELADDR, ifa); + atomic_notifier_call_chain(&inet6addr_chain, + NETDEV_DOWN, ifa); + } - write_lock_bh(&idev->lock); + in6_ifa_put(ifa); + write_lock_bh(&idev->lock); + } } list_splice(&keep_list, &idev->addr_list); @@ -3448,10 +3448,8 @@ static int put_cacheinfo(struct sk_buff *skb, unsigned long cstamp, { struct ifa_cacheinfo ci; - ci.cstamp = (u32)(TIME_DELTA(cstamp, INITIAL_JIFFIES) / HZ * 100 - + TIME_DELTA(cstamp, INITIAL_JIFFIES) % HZ * 100 / HZ); - ci.tstamp = (u32)(TIME_DELTA(tstamp, INITIAL_JIFFIES) / HZ * 100 - + TIME_DELTA(tstamp, INITIAL_JIFFIES) % HZ * 100 / HZ); + ci.cstamp = cstamp_delta(cstamp); + ci.tstamp = cstamp_delta(tstamp); ci.ifa_prefered = preferred; ci.ifa_valid = valid; @@ -3802,8 +3800,10 @@ static inline void ipv6_store_devconf(struct ipv6_devconf *cnf, array[DEVCONF_AUTOCONF] = cnf->autoconf; array[DEVCONF_DAD_TRANSMITS] = cnf->dad_transmits; array[DEVCONF_RTR_SOLICITS] = cnf->rtr_solicits; - array[DEVCONF_RTR_SOLICIT_INTERVAL] = cnf->rtr_solicit_interval; - array[DEVCONF_RTR_SOLICIT_DELAY] = cnf->rtr_solicit_delay; + array[DEVCONF_RTR_SOLICIT_INTERVAL] = + jiffies_to_msecs(cnf->rtr_solicit_interval); + array[DEVCONF_RTR_SOLICIT_DELAY] = + jiffies_to_msecs(cnf->rtr_solicit_delay); array[DEVCONF_FORCE_MLD_VERSION] = cnf->force_mld_version; #ifdef CONFIG_IPV6_PRIVACY array[DEVCONF_USE_TEMPADDR] = cnf->use_tempaddr; @@ -3817,7 +3817,8 @@ static inline void ipv6_store_devconf(struct ipv6_devconf *cnf, array[DEVCONF_ACCEPT_RA_PINFO] = cnf->accept_ra_pinfo; #ifdef CONFIG_IPV6_ROUTER_PREF array[DEVCONF_ACCEPT_RA_RTR_PREF] = cnf->accept_ra_rtr_pref; - array[DEVCONF_RTR_PROBE_INTERVAL] = cnf->rtr_probe_interval; + array[DEVCONF_RTR_PROBE_INTERVAL] = + jiffies_to_msecs(cnf->rtr_probe_interval); #ifdef CONFIG_IPV6_ROUTE_INFO array[DEVCONF_ACCEPT_RA_RT_INFO_MAX_PLEN] = cnf->accept_ra_rt_info_max_plen; #endif @@ -3933,10 +3934,9 @@ static int inet6_fill_ifinfo(struct sk_buff *skb, struct inet6_dev *idev, NLA_PUT_U32(skb, IFLA_INET6_FLAGS, idev->if_flags); ci.max_reasm_len = IPV6_MAXPLEN; - ci.tstamp = (__u32)(TIME_DELTA(idev->tstamp, INITIAL_JIFFIES) / HZ * 100 - + TIME_DELTA(idev->tstamp, INITIAL_JIFFIES) % HZ * 100 / HZ); - ci.reachable_time = idev->nd_parms->reachable_time; - ci.retrans_time = idev->nd_parms->retrans_time; + ci.tstamp = cstamp_delta(idev->tstamp); + ci.reachable_time = jiffies_to_msecs(idev->nd_parms->reachable_time); + ci.retrans_time = jiffies_to_msecs(idev->nd_parms->retrans_time); NLA_PUT(skb, IFLA_INET6_CACHEINFO, sizeof(ci), &ci); nla = nla_reserve(skb, IFLA_INET6_CONF, DEVCONF_MAX * sizeof(s32)); diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index c2c0f89397b1..70e891a20fb9 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -1175,6 +1175,8 @@ static void ip6_tnl_link_config(struct ip6_tnl *t) sizeof (struct ipv6hdr); dev->mtu = rt->rt6i_dev->mtu - sizeof (struct ipv6hdr); + if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT)) + dev->mtu-=8; if (dev->mtu < IPV6_MIN_MTU) dev->mtu = IPV6_MIN_MTU; @@ -1284,6 +1286,7 @@ ip6_tnl_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) t = netdev_priv(dev); ip6_tnl_unlink(ip6n, t); + synchronize_net(); err = ip6_tnl_change(t, &p); ip6_tnl_link(ip6n, t); netdev_state_change(dev); @@ -1362,15 +1365,21 @@ static const struct net_device_ops ip6_tnl_netdev_ops = { static void ip6_tnl_dev_setup(struct net_device *dev) { + struct ip6_tnl *t; + dev->netdev_ops = &ip6_tnl_netdev_ops; dev->destructor = ip6_dev_free; dev->type = ARPHRD_TUNNEL6; dev->hard_header_len = LL_MAX_HEADER + sizeof (struct ipv6hdr); dev->mtu = ETH_DATA_LEN - sizeof (struct ipv6hdr); + t = netdev_priv(dev); + if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT)) + dev->mtu-=8; dev->flags |= IFF_NOARP; dev->addr_len = sizeof(struct in6_addr); dev->features |= NETIF_F_NETNS_LOCAL; + dev->priv_flags &= ~IFF_XMIT_DST_RELEASE; } diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c index 0553867a317f..d1770e061c08 100644 --- a/net/ipv6/ipv6_sockglue.c +++ b/net/ipv6/ipv6_sockglue.c @@ -343,6 +343,10 @@ static int do_ipv6_setsockopt(struct sock *sk, int level, int optname, break; case IPV6_TRANSPARENT: + if (!capable(CAP_NET_ADMIN)) { + retv = -EPERM; + break; + } if (optlen < sizeof(int)) goto e_inval; /* we don't have a separate transparent bit for IPV6 we use the one in the IPv4 socket */ diff --git a/net/ipv6/netfilter/Kconfig b/net/ipv6/netfilter/Kconfig index 44d2eeac089b..448464844a25 100644 --- a/net/ipv6/netfilter/Kconfig +++ b/net/ipv6/netfilter/Kconfig @@ -5,10 +5,15 @@ menu "IPv6: Netfilter Configuration" depends on INET && IPV6 && NETFILTER +config NF_DEFRAG_IPV6 + tristate + default n + config NF_CONNTRACK_IPV6 tristate "IPv6 connection tracking support" depends on INET && IPV6 && NF_CONNTRACK default m if NETFILTER_ADVANCED=n + select NF_DEFRAG_IPV6 ---help--- Connection tracking keeps a record of what packets have passed through your machine, in order to figure out how they are related diff --git a/net/ipv6/netfilter/Makefile b/net/ipv6/netfilter/Makefile index 3f8e4a3d83ce..0a432c9b0795 100644 --- a/net/ipv6/netfilter/Makefile +++ b/net/ipv6/netfilter/Makefile @@ -12,11 +12,14 @@ obj-$(CONFIG_IP6_NF_SECURITY) += ip6table_security.o # objects for l3 independent conntrack nf_conntrack_ipv6-objs := nf_conntrack_l3proto_ipv6.o nf_conntrack_proto_icmpv6.o -nf_defrag_ipv6-objs := nf_defrag_ipv6_hooks.o nf_conntrack_reasm.o # l3 independent conntrack obj-$(CONFIG_NF_CONNTRACK_IPV6) += nf_conntrack_ipv6.o nf_defrag_ipv6.o +# defrag +nf_defrag_ipv6-objs := nf_defrag_ipv6_hooks.o nf_conntrack_reasm.o +obj-$(CONFIG_NF_DEFRAG_IPV6) += nf_defrag_ipv6.o + # matches obj-$(CONFIG_IP6_NF_MATCH_AH) += ip6t_ah.o obj-$(CONFIG_IP6_NF_MATCH_EUI64) += ip6t_eui64.o diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c index 51df035897e7..455582384ece 100644 --- a/net/ipv6/netfilter/ip6_tables.c +++ b/net/ipv6/netfilter/ip6_tables.c @@ -1137,6 +1137,7 @@ static int get_info(struct net *net, void __user *user, private = &tmp; } #endif + memset(&info, 0, sizeof(info)); info.valid_hooks = t->valid_hooks; memcpy(info.hook_entry, private->hook_entry, sizeof(info.hook_entry)); diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c index 489d71b844ac..79d43aa8fa8d 100644 --- a/net/ipv6/netfilter/nf_conntrack_reasm.c +++ b/net/ipv6/netfilter/nf_conntrack_reasm.c @@ -286,7 +286,7 @@ found: /* Check for overlap with preceding fragment. */ if (prev && - (NFCT_FRAG6_CB(prev)->offset + prev->len) - offset > 0) + (NFCT_FRAG6_CB(prev)->offset + prev->len) > offset) goto discard_fq; /* Look for overlap with succeeding segment. */ @@ -625,21 +625,24 @@ int nf_ct_frag6_init(void) inet_frags_init_net(&nf_init_frags); inet_frags_init(&nf_frags); +#ifdef CONFIG_SYSCTL nf_ct_frag6_sysctl_header = register_sysctl_paths(nf_net_netfilter_sysctl_path, nf_ct_frag6_sysctl_table); if (!nf_ct_frag6_sysctl_header) { inet_frags_fini(&nf_frags); return -ENOMEM; } +#endif return 0; } void nf_ct_frag6_cleanup(void) { +#ifdef CONFIG_SYSCTL unregister_sysctl_table(nf_ct_frag6_sysctl_header); nf_ct_frag6_sysctl_header = NULL; - +#endif inet_frags_fini(&nf_frags); nf_init_frags.low_thresh = 0; diff --git a/net/ipv6/proc.c b/net/ipv6/proc.c index d082eaeefa25..24b3558b8e67 100644 --- a/net/ipv6/proc.c +++ b/net/ipv6/proc.c @@ -126,6 +126,8 @@ static const struct snmp_mib snmp6_udp6_list[] = { SNMP_MIB_ITEM("Udp6NoPorts", UDP_MIB_NOPORTS), SNMP_MIB_ITEM("Udp6InErrors", UDP_MIB_INERRORS), SNMP_MIB_ITEM("Udp6OutDatagrams", UDP_MIB_OUTDATAGRAMS), + SNMP_MIB_ITEM("Udp6RcvbufErrors", UDP_MIB_RCVBUFERRORS), + SNMP_MIB_ITEM("Udp6SndbufErrors", UDP_MIB_SNDBUFERRORS), SNMP_MIB_SENTINEL }; @@ -134,6 +136,8 @@ static const struct snmp_mib snmp6_udplite6_list[] = { SNMP_MIB_ITEM("UdpLite6NoPorts", UDP_MIB_NOPORTS), SNMP_MIB_ITEM("UdpLite6InErrors", UDP_MIB_INERRORS), SNMP_MIB_ITEM("UdpLite6OutDatagrams", UDP_MIB_OUTDATAGRAMS), + SNMP_MIB_ITEM("UdpLite6RcvbufErrors", UDP_MIB_RCVBUFERRORS), + SNMP_MIB_ITEM("UdpLite6SndbufErrors", UDP_MIB_SNDBUFERRORS), SNMP_MIB_SENTINEL }; diff --git a/net/ipv6/protocol.c b/net/ipv6/protocol.c index 9bb936ae2452..9a7978fdc02a 100644 --- a/net/ipv6/protocol.c +++ b/net/ipv6/protocol.c @@ -25,13 +25,14 @@ #include <linux/spinlock.h> #include <net/protocol.h> -const struct inet6_protocol *inet6_protos[MAX_INET_PROTOS] __read_mostly; +const struct inet6_protocol __rcu *inet6_protos[MAX_INET_PROTOS] __read_mostly; int inet6_add_protocol(const struct inet6_protocol *prot, unsigned char protocol) { int hash = protocol & (MAX_INET_PROTOS - 1); - return !cmpxchg(&inet6_protos[hash], NULL, prot) ? 0 : -1; + return !cmpxchg((const struct inet6_protocol **)&inet6_protos[hash], + NULL, prot) ? 0 : -1; } EXPORT_SYMBOL(inet6_add_protocol); @@ -43,7 +44,8 @@ int inet6_del_protocol(const struct inet6_protocol *prot, unsigned char protocol { int ret, hash = protocol & (MAX_INET_PROTOS - 1); - ret = (cmpxchg(&inet6_protos[hash], prot, NULL) == prot) ? 0 : -1; + ret = (cmpxchg((const struct inet6_protocol **)&inet6_protos[hash], + prot, NULL) == prot) ? 0 : -1; synchronize_net(); diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index 45e6efb7f171..86c39526ba5e 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -373,7 +373,7 @@ void raw6_icmp_error(struct sk_buff *skb, int nexthdr, static inline int rawv6_rcv_skb(struct sock * sk, struct sk_buff * skb) { - if ((raw6_sk(sk)->checksum || sk->sk_filter) && + if ((raw6_sk(sk)->checksum || rcu_dereference_raw(sk->sk_filter)) && skb_checksum_complete(skb)) { atomic_inc(&sk->sk_drops); kfree_skb(skb); diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c index c7ba3149633f..0f2766453759 100644 --- a/net/ipv6/reassembly.c +++ b/net/ipv6/reassembly.c @@ -349,7 +349,7 @@ found: /* Check for overlap with preceding fragment. */ if (prev && - (FRAG6_CB(prev)->offset + prev->len) - offset > 0) + (FRAG6_CB(prev)->offset + prev->len) > offset) goto discard_fq; /* Look for overlap with succeeding segment. */ diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 25661f968f3f..96455ffb76fb 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -1945,8 +1945,12 @@ struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev, struct rt6_info *rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops); struct neighbour *neigh; - if (rt == NULL) + if (rt == NULL) { + if (net_ratelimit()) + pr_warning("IPv6: Maximum number of routes reached," + " consider increasing route/max_size.\n"); return ERR_PTR(-ENOMEM); + } dev_hold(net->loopback_dev); in6_dev_hold(idev); @@ -2741,6 +2745,7 @@ static void __net_exit ip6_route_net_exit(struct net *net) kfree(net->ipv6.ip6_prohibit_entry); kfree(net->ipv6.ip6_blk_hole_entry); #endif + dst_entries_destroy(&net->ipv6.ip6_dst_ops); } static struct pernet_operations ip6_route_net_ops = { @@ -2832,5 +2837,6 @@ void ip6_route_cleanup(void) xfrm6_fini(); fib6_gc_cleanup(); unregister_pernet_subsys(&ip6_route_net_ops); + dst_entries_destroy(&ip6_dst_blackhole_ops); kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep); } diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c index 367a6cc584cc..8c4d00c7cd2b 100644 --- a/net/ipv6/sit.c +++ b/net/ipv6/sit.c @@ -606,8 +606,9 @@ static int ipip6_rcv(struct sk_buff *skb) return 0; } - icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0); + /* no tunnel matched, let upstream know, ipsec may handle it */ rcu_read_unlock(); + return 1; out: kfree_skb(skb); return 0; @@ -963,6 +964,7 @@ ipip6_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd) } t = netdev_priv(dev); ipip6_tunnel_unlink(sitn, t); + synchronize_net(); t->parms.iph.saddr = p.iph.saddr; t->parms.iph.daddr = p.iph.daddr; memcpy(dev->dev_addr, &p.iph.saddr, 4); diff --git a/net/ipv6/tunnel6.c b/net/ipv6/tunnel6.c index d9864725d0c6..4f3cec12aa85 100644 --- a/net/ipv6/tunnel6.c +++ b/net/ipv6/tunnel6.c @@ -30,23 +30,26 @@ #include <net/protocol.h> #include <net/xfrm.h> -static struct xfrm6_tunnel *tunnel6_handlers __read_mostly; -static struct xfrm6_tunnel *tunnel46_handlers __read_mostly; +static struct xfrm6_tunnel __rcu *tunnel6_handlers __read_mostly; +static struct xfrm6_tunnel __rcu *tunnel46_handlers __read_mostly; static DEFINE_MUTEX(tunnel6_mutex); int xfrm6_tunnel_register(struct xfrm6_tunnel *handler, unsigned short family) { - struct xfrm6_tunnel **pprev; + struct xfrm6_tunnel __rcu **pprev; + struct xfrm6_tunnel *t; int ret = -EEXIST; int priority = handler->priority; mutex_lock(&tunnel6_mutex); for (pprev = (family == AF_INET6) ? &tunnel6_handlers : &tunnel46_handlers; - *pprev; pprev = &(*pprev)->next) { - if ((*pprev)->priority > priority) + (t = rcu_dereference_protected(*pprev, + lockdep_is_held(&tunnel6_mutex))) != NULL; + pprev = &t->next) { + if (t->priority > priority) break; - if ((*pprev)->priority == priority) + if (t->priority == priority) goto err; } @@ -65,14 +68,17 @@ EXPORT_SYMBOL(xfrm6_tunnel_register); int xfrm6_tunnel_deregister(struct xfrm6_tunnel *handler, unsigned short family) { - struct xfrm6_tunnel **pprev; + struct xfrm6_tunnel __rcu **pprev; + struct xfrm6_tunnel *t; int ret = -ENOENT; mutex_lock(&tunnel6_mutex); for (pprev = (family == AF_INET6) ? &tunnel6_handlers : &tunnel46_handlers; - *pprev; pprev = &(*pprev)->next) { - if (*pprev == handler) { + (t = rcu_dereference_protected(*pprev, + lockdep_is_held(&tunnel6_mutex))) != NULL; + pprev = &t->next) { + if (t == handler) { *pprev = handler->next; ret = 0; break; diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index c84dad432114..91def93bec85 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -527,7 +527,7 @@ int udpv6_queue_rcv_skb(struct sock * sk, struct sk_buff *skb) } } - if (sk->sk_filter) { + if (rcu_dereference_raw(sk->sk_filter)) { if (udp_lib_checksum_complete(skb)) goto drop; } diff --git a/net/irda/irttp.c b/net/irda/irttp.c index 285761e77d90..f6054f9ccbe3 100644 --- a/net/irda/irttp.c +++ b/net/irda/irttp.c @@ -550,22 +550,30 @@ EXPORT_SYMBOL(irttp_close_tsap); */ int irttp_udata_request(struct tsap_cb *self, struct sk_buff *skb) { + int ret; + IRDA_ASSERT(self != NULL, return -1;); IRDA_ASSERT(self->magic == TTP_TSAP_MAGIC, return -1;); IRDA_ASSERT(skb != NULL, return -1;); IRDA_DEBUG(4, "%s()\n", __func__); + /* Take shortcut on zero byte packets */ + if (skb->len == 0) { + ret = 0; + goto err; + } + /* Check that nothing bad happens */ - if ((skb->len == 0) || (!self->connected)) { - IRDA_DEBUG(1, "%s(), No data, or not connected\n", - __func__); + if (!self->connected) { + IRDA_WARNING("%s(), Not connected\n", __func__); + ret = -ENOTCONN; goto err; } if (skb->len > self->max_seg_size) { - IRDA_DEBUG(1, "%s(), UData is too large for IrLAP!\n", - __func__); + IRDA_ERROR("%s(), UData is too large for IrLAP!\n", __func__); + ret = -EMSGSIZE; goto err; } @@ -576,7 +584,7 @@ int irttp_udata_request(struct tsap_cb *self, struct sk_buff *skb) err: dev_kfree_skb(skb); - return -1; + return ret; } EXPORT_SYMBOL(irttp_udata_request); @@ -599,9 +607,15 @@ int irttp_data_request(struct tsap_cb *self, struct sk_buff *skb) IRDA_DEBUG(2, "%s() : queue len = %d\n", __func__, skb_queue_len(&self->tx_queue)); + /* Take shortcut on zero byte packets */ + if (skb->len == 0) { + ret = 0; + goto err; + } + /* Check that nothing bad happens */ - if ((skb->len == 0) || (!self->connected)) { - IRDA_WARNING("%s: No data, or not connected\n", __func__); + if (!self->connected) { + IRDA_WARNING("%s: Not connected\n", __func__); ret = -ENOTCONN; goto err; } diff --git a/net/iucv/iucv.c b/net/iucv/iucv.c index 499c045d6910..f7db676de77d 100644 --- a/net/iucv/iucv.c +++ b/net/iucv/iucv.c @@ -1798,7 +1798,8 @@ static void iucv_work_fn(struct work_struct *work) * Handles external interrupts coming in from CP. * Places the interrupt buffer on a queue and schedules iucv_tasklet_fn(). */ -static void iucv_external_interrupt(u16 code) +static void iucv_external_interrupt(unsigned int ext_int_code, + unsigned int param32, unsigned long param64) { struct iucv_irq_data *p; struct iucv_irq_list *work; diff --git a/net/l2tp/l2tp_core.c b/net/l2tp/l2tp_core.c index 1712af1c7b3f..c64ce0a0bb03 100644 --- a/net/l2tp/l2tp_core.c +++ b/net/l2tp/l2tp_core.c @@ -111,6 +111,10 @@ struct l2tp_net { spinlock_t l2tp_session_hlist_lock; }; +static void l2tp_session_set_header_len(struct l2tp_session *session, int version); +static void l2tp_tunnel_free(struct l2tp_tunnel *tunnel); +static void l2tp_tunnel_closeall(struct l2tp_tunnel *tunnel); + static inline struct l2tp_net *l2tp_pernet(struct net *net) { BUG_ON(!net); @@ -118,6 +122,34 @@ static inline struct l2tp_net *l2tp_pernet(struct net *net) return net_generic(net, l2tp_net_id); } + +/* Tunnel reference counts. Incremented per session that is added to + * the tunnel. + */ +static inline void l2tp_tunnel_inc_refcount_1(struct l2tp_tunnel *tunnel) +{ + atomic_inc(&tunnel->ref_count); +} + +static inline void l2tp_tunnel_dec_refcount_1(struct l2tp_tunnel *tunnel) +{ + if (atomic_dec_and_test(&tunnel->ref_count)) + l2tp_tunnel_free(tunnel); +} +#ifdef L2TP_REFCNT_DEBUG +#define l2tp_tunnel_inc_refcount(_t) do { \ + printk(KERN_DEBUG "l2tp_tunnel_inc_refcount: %s:%d %s: cnt=%d\n", __func__, __LINE__, (_t)->name, atomic_read(&_t->ref_count)); \ + l2tp_tunnel_inc_refcount_1(_t); \ + } while (0) +#define l2tp_tunnel_dec_refcount(_t) do { \ + printk(KERN_DEBUG "l2tp_tunnel_dec_refcount: %s:%d %s: cnt=%d\n", __func__, __LINE__, (_t)->name, atomic_read(&_t->ref_count)); \ + l2tp_tunnel_dec_refcount_1(_t); \ + } while (0) +#else +#define l2tp_tunnel_inc_refcount(t) l2tp_tunnel_inc_refcount_1(t) +#define l2tp_tunnel_dec_refcount(t) l2tp_tunnel_dec_refcount_1(t) +#endif + /* Session hash global list for L2TPv3. * The session_id SHOULD be random according to RFC3931, but several * L2TP implementations use incrementing session_ids. So we do a real @@ -699,8 +731,8 @@ EXPORT_SYMBOL(l2tp_recv_common); * Returns 1 if the packet was not a good data packet and could not be * forwarded. All such packets are passed up to userspace to deal with. */ -int l2tp_udp_recv_core(struct l2tp_tunnel *tunnel, struct sk_buff *skb, - int (*payload_hook)(struct sk_buff *skb)) +static int l2tp_udp_recv_core(struct l2tp_tunnel *tunnel, struct sk_buff *skb, + int (*payload_hook)(struct sk_buff *skb)) { struct l2tp_session *session = NULL; unsigned char *ptr, *optr; @@ -812,7 +844,6 @@ error: return 1; } -EXPORT_SYMBOL_GPL(l2tp_udp_recv_core); /* UDP encapsulation receive handler. See net/ipv4/udp.c. * Return codes: @@ -922,7 +953,8 @@ static int l2tp_build_l2tpv3_header(struct l2tp_session *session, void *buf) return bufp - optr; } -int l2tp_xmit_core(struct l2tp_session *session, struct sk_buff *skb, size_t data_len) +static int l2tp_xmit_core(struct l2tp_session *session, struct sk_buff *skb, + size_t data_len) { struct l2tp_tunnel *tunnel = session->tunnel; unsigned int len = skb->len; @@ -970,7 +1002,6 @@ int l2tp_xmit_core(struct l2tp_session *session, struct sk_buff *skb, size_t dat return 0; } -EXPORT_SYMBOL_GPL(l2tp_xmit_core); /* Automatically called when the skb is freed. */ @@ -1089,7 +1120,7 @@ EXPORT_SYMBOL_GPL(l2tp_xmit_skb); * The tunnel context is deleted only when all session sockets have been * closed. */ -void l2tp_tunnel_destruct(struct sock *sk) +static void l2tp_tunnel_destruct(struct sock *sk) { struct l2tp_tunnel *tunnel; @@ -1128,11 +1159,10 @@ void l2tp_tunnel_destruct(struct sock *sk) end: return; } -EXPORT_SYMBOL(l2tp_tunnel_destruct); /* When the tunnel is closed, all the attached sessions need to go too. */ -void l2tp_tunnel_closeall(struct l2tp_tunnel *tunnel) +static void l2tp_tunnel_closeall(struct l2tp_tunnel *tunnel) { int hash; struct hlist_node *walk; @@ -1193,12 +1223,11 @@ again: } write_unlock_bh(&tunnel->hlist_lock); } -EXPORT_SYMBOL_GPL(l2tp_tunnel_closeall); /* Really kill the tunnel. * Come here only when all sessions have been cleared from the tunnel. */ -void l2tp_tunnel_free(struct l2tp_tunnel *tunnel) +static void l2tp_tunnel_free(struct l2tp_tunnel *tunnel) { struct l2tp_net *pn = l2tp_pernet(tunnel->l2tp_net); @@ -1217,7 +1246,6 @@ void l2tp_tunnel_free(struct l2tp_tunnel *tunnel) atomic_dec(&l2tp_tunnel_count); kfree(tunnel); } -EXPORT_SYMBOL_GPL(l2tp_tunnel_free); /* Create a socket for the tunnel, if one isn't set up by * userspace. This is used for static tunnels where there is no @@ -1512,7 +1540,7 @@ EXPORT_SYMBOL_GPL(l2tp_session_delete); /* We come here whenever a session's send_seq, cookie_len or * l2specific_len parameters are set. */ -void l2tp_session_set_header_len(struct l2tp_session *session, int version) +static void l2tp_session_set_header_len(struct l2tp_session *session, int version) { if (version == L2TP_HDR_VER_2) { session->hdr_len = 6; @@ -1525,7 +1553,6 @@ void l2tp_session_set_header_len(struct l2tp_session *session, int version) } } -EXPORT_SYMBOL_GPL(l2tp_session_set_header_len); struct l2tp_session *l2tp_session_create(int priv_size, struct l2tp_tunnel *tunnel, u32 session_id, u32 peer_session_id, struct l2tp_session_cfg *cfg) { diff --git a/net/l2tp/l2tp_core.h b/net/l2tp/l2tp_core.h index f0f318edd3f1..a16a48e79fab 100644 --- a/net/l2tp/l2tp_core.h +++ b/net/l2tp/l2tp_core.h @@ -231,48 +231,15 @@ extern int l2tp_tunnel_create(struct net *net, int fd, int version, u32 tunnel_i extern int l2tp_tunnel_delete(struct l2tp_tunnel *tunnel); extern struct l2tp_session *l2tp_session_create(int priv_size, struct l2tp_tunnel *tunnel, u32 session_id, u32 peer_session_id, struct l2tp_session_cfg *cfg); extern int l2tp_session_delete(struct l2tp_session *session); -extern void l2tp_tunnel_free(struct l2tp_tunnel *tunnel); extern void l2tp_session_free(struct l2tp_session *session); extern void l2tp_recv_common(struct l2tp_session *session, struct sk_buff *skb, unsigned char *ptr, unsigned char *optr, u16 hdrflags, int length, int (*payload_hook)(struct sk_buff *skb)); -extern int l2tp_udp_recv_core(struct l2tp_tunnel *tunnel, struct sk_buff *skb, int (*payload_hook)(struct sk_buff *skb)); extern int l2tp_udp_encap_recv(struct sock *sk, struct sk_buff *skb); -extern int l2tp_xmit_core(struct l2tp_session *session, struct sk_buff *skb, size_t data_len); extern int l2tp_xmit_skb(struct l2tp_session *session, struct sk_buff *skb, int hdr_len); -extern void l2tp_tunnel_destruct(struct sock *sk); -extern void l2tp_tunnel_closeall(struct l2tp_tunnel *tunnel); -extern void l2tp_session_set_header_len(struct l2tp_session *session, int version); extern int l2tp_nl_register_ops(enum l2tp_pwtype pw_type, const struct l2tp_nl_cmd_ops *ops); extern void l2tp_nl_unregister_ops(enum l2tp_pwtype pw_type); -/* Tunnel reference counts. Incremented per session that is added to - * the tunnel. - */ -static inline void l2tp_tunnel_inc_refcount_1(struct l2tp_tunnel *tunnel) -{ - atomic_inc(&tunnel->ref_count); -} - -static inline void l2tp_tunnel_dec_refcount_1(struct l2tp_tunnel *tunnel) -{ - if (atomic_dec_and_test(&tunnel->ref_count)) - l2tp_tunnel_free(tunnel); -} -#ifdef L2TP_REFCNT_DEBUG -#define l2tp_tunnel_inc_refcount(_t) do { \ - printk(KERN_DEBUG "l2tp_tunnel_inc_refcount: %s:%d %s: cnt=%d\n", __func__, __LINE__, (_t)->name, atomic_read(&_t->ref_count)); \ - l2tp_tunnel_inc_refcount_1(_t); \ - } while (0) -#define l2tp_tunnel_dec_refcount(_t) do { \ - printk(KERN_DEBUG "l2tp_tunnel_dec_refcount: %s:%d %s: cnt=%d\n", __func__, __LINE__, (_t)->name, atomic_read(&_t->ref_count)); \ - l2tp_tunnel_dec_refcount_1(_t); \ - } while (0) -#else -#define l2tp_tunnel_inc_refcount(t) l2tp_tunnel_inc_refcount_1(t) -#define l2tp_tunnel_dec_refcount(t) l2tp_tunnel_dec_refcount_1(t) -#endif - /* Session reference counts. Incremented when code obtains a reference * to a session. */ diff --git a/net/l2tp/l2tp_debugfs.c b/net/l2tp/l2tp_debugfs.c index 104ec3b283d4..b8dbae82fab8 100644 --- a/net/l2tp/l2tp_debugfs.c +++ b/net/l2tp/l2tp_debugfs.c @@ -249,7 +249,7 @@ static int l2tp_dfs_seq_open(struct inode *inode, struct file *file) struct seq_file *seq; int rc = -ENOMEM; - pd = kzalloc(GFP_KERNEL, sizeof(*pd)); + pd = kzalloc(sizeof(*pd), GFP_KERNEL); if (pd == NULL) goto out; diff --git a/net/l2tp/l2tp_ip.c b/net/l2tp/l2tp_ip.c index 1c770c0644d1..0bf6a59545ab 100644 --- a/net/l2tp/l2tp_ip.c +++ b/net/l2tp/l2tp_ip.c @@ -576,7 +576,7 @@ out: return copied; } -struct proto l2tp_ip_prot = { +static struct proto l2tp_ip_prot = { .name = "L2TP/IP", .owner = THIS_MODULE, .init = l2tp_ip_open, diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig index 43288259f4a1..1534f2b44caf 100644 --- a/net/netfilter/Kconfig +++ b/net/netfilter/Kconfig @@ -525,6 +525,7 @@ config NETFILTER_XT_TARGET_TPROXY depends on NETFILTER_XTABLES depends on NETFILTER_ADVANCED select NF_DEFRAG_IPV4 + select NF_DEFRAG_IPV6 if IP6_NF_IPTABLES help This option adds a `TPROXY' target, which is somewhat similar to REDIRECT. It can only be used in the mangle table and is useful @@ -927,6 +928,7 @@ config NETFILTER_XT_MATCH_SOCKET depends on NETFILTER_ADVANCED depends on !NF_CONNTRACK || NF_CONNTRACK select NF_DEFRAG_IPV4 + select NF_DEFRAG_IPV6 if IP6_NF_IPTABLES help This option adds a `socket' match, which can be used to match packets for which a TCP or UDP socket lookup finds a valid socket. diff --git a/net/netfilter/ipvs/Kconfig b/net/netfilter/ipvs/Kconfig index a22dac227055..70bd1d0774c6 100644 --- a/net/netfilter/ipvs/Kconfig +++ b/net/netfilter/ipvs/Kconfig @@ -4,6 +4,7 @@ menuconfig IP_VS tristate "IP virtual server support" depends on NET && INET && NETFILTER + depends on (NF_CONNTRACK || NF_CONNTRACK=n) ---help--- IP Virtual Server support will let you build a high-performance virtual server based on cluster of two or more real servers. This diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 1eacf8d9966a..27a5ea6b6a0f 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -1312,7 +1312,8 @@ void *nf_ct_alloc_hashtable(unsigned int *sizep, int *vmalloced, int nulls) if (!hash) { *vmalloced = 1; printk(KERN_WARNING "nf_conntrack: falling back to vmalloc.\n"); - hash = __vmalloc(sz, GFP_KERNEL | __GFP_ZERO, PAGE_KERNEL); + hash = __vmalloc(sz, GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO, + PAGE_KERNEL); } if (hash && nulls) diff --git a/net/netfilter/nf_conntrack_proto.c b/net/netfilter/nf_conntrack_proto.c index ed6d92958023..dc7bb74110df 100644 --- a/net/netfilter/nf_conntrack_proto.c +++ b/net/netfilter/nf_conntrack_proto.c @@ -292,6 +292,12 @@ int nf_conntrack_l4proto_register(struct nf_conntrack_l4proto *l4proto) for (i = 0; i < MAX_NF_CT_PROTO; i++) proto_array[i] = &nf_conntrack_l4proto_generic; + + /* Before making proto_array visible to lockless readers, + * we must make sure its content is committed to memory. + */ + smp_wmb(); + nf_ct_protos[l4proto->l3proto] = proto_array; } else if (nf_ct_protos[l4proto->l3proto][l4proto->l4proto] != &nf_conntrack_l4proto_generic) { diff --git a/net/netfilter/xt_TPROXY.c b/net/netfilter/xt_TPROXY.c index 19c482caf30b..640678f47a2a 100644 --- a/net/netfilter/xt_TPROXY.c +++ b/net/netfilter/xt_TPROXY.c @@ -21,7 +21,9 @@ #include <linux/netfilter_ipv4/ip_tables.h> #include <net/netfilter/ipv4/nf_defrag_ipv4.h> -#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) + +#if defined(CONFIG_IP6_NF_IPTABLES) || defined(CONFIG_IP6_NF_IPTABLES_MODULE) +#define XT_TPROXY_HAVE_IPV6 1 #include <net/if_inet6.h> #include <net/addrconf.h> #include <linux/netfilter_ipv6/ip6_tables.h> @@ -172,7 +174,7 @@ tproxy_tg4_v1(struct sk_buff *skb, const struct xt_action_param *par) return tproxy_tg4(skb, tgi->laddr.ip, tgi->lport, tgi->mark_mask, tgi->mark_value); } -#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) +#ifdef XT_TPROXY_HAVE_IPV6 static inline const struct in6_addr * tproxy_laddr6(struct sk_buff *skb, const struct in6_addr *user_laddr, @@ -372,7 +374,7 @@ static struct xt_target tproxy_tg_reg[] __read_mostly = { .hooks = 1 << NF_INET_PRE_ROUTING, .me = THIS_MODULE, }, -#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) +#ifdef XT_TPROXY_HAVE_IPV6 { .name = "TPROXY", .family = NFPROTO_IPV6, @@ -391,7 +393,7 @@ static struct xt_target tproxy_tg_reg[] __read_mostly = { static int __init tproxy_tg_init(void) { nf_defrag_ipv4_enable(); -#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) +#ifdef XT_TPROXY_HAVE_IPV6 nf_defrag_ipv6_enable(); #endif diff --git a/net/netfilter/xt_socket.c b/net/netfilter/xt_socket.c index 2dbd4c857735..00d6ae838303 100644 --- a/net/netfilter/xt_socket.c +++ b/net/netfilter/xt_socket.c @@ -14,7 +14,6 @@ #include <linux/skbuff.h> #include <linux/netfilter/x_tables.h> #include <linux/netfilter_ipv4/ip_tables.h> -#include <linux/netfilter_ipv6/ip6_tables.h> #include <net/tcp.h> #include <net/udp.h> #include <net/icmp.h> @@ -22,7 +21,12 @@ #include <net/inet_sock.h> #include <net/netfilter/nf_tproxy_core.h> #include <net/netfilter/ipv4/nf_defrag_ipv4.h> + +#if defined(CONFIG_IP6_NF_IPTABLES) || defined(CONFIG_IP6_NF_IPTABLES_MODULE) +#define XT_SOCKET_HAVE_IPV6 1 +#include <linux/netfilter_ipv6/ip6_tables.h> #include <net/netfilter/ipv6/nf_defrag_ipv6.h> +#endif #include <linux/netfilter/xt_socket.h> @@ -186,12 +190,12 @@ socket_mt4_v1(const struct sk_buff *skb, struct xt_action_param *par) return socket_match(skb, par, par->matchinfo); } -#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) +#ifdef XT_SOCKET_HAVE_IPV6 static int extract_icmp6_fields(const struct sk_buff *skb, unsigned int outside_hdrlen, - u8 *protocol, + int *protocol, struct in6_addr **raddr, struct in6_addr **laddr, __be16 *rport, @@ -248,8 +252,7 @@ socket_mt6_v1(const struct sk_buff *skb, struct xt_action_param *par) struct sock *sk; struct in6_addr *daddr, *saddr; __be16 dport, sport; - int thoff; - u8 tproto; + int thoff, tproto; const struct xt_socket_mtinfo1 *info = (struct xt_socket_mtinfo1 *) par->matchinfo; tproto = ipv6_find_hdr(skb, &thoff, -1, NULL); @@ -301,7 +304,7 @@ socket_mt6_v1(const struct sk_buff *skb, struct xt_action_param *par) sk = NULL; } - pr_debug("proto %hhu %pI6:%hu -> %pI6:%hu " + pr_debug("proto %hhd %pI6:%hu -> %pI6:%hu " "(orig %pI6:%hu) sock %p\n", tproto, saddr, ntohs(sport), daddr, ntohs(dport), @@ -331,7 +334,7 @@ static struct xt_match socket_mt_reg[] __read_mostly = { (1 << NF_INET_LOCAL_IN), .me = THIS_MODULE, }, -#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) +#ifdef XT_SOCKET_HAVE_IPV6 { .name = "socket", .revision = 1, @@ -348,7 +351,7 @@ static struct xt_match socket_mt_reg[] __read_mostly = { static int __init socket_mt_init(void) { nf_defrag_ipv4_enable(); -#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) +#ifdef XT_SOCKET_HAVE_IPV6 nf_defrag_ipv6_enable(); #endif diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index cd96ed3ccee4..478181d53c55 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -83,9 +83,9 @@ struct netlink_sock { struct module *module; }; -struct listeners_rcu_head { - struct rcu_head rcu_head; - void *ptr; +struct listeners { + struct rcu_head rcu; + unsigned long masks[0]; }; #define NETLINK_KERNEL_SOCKET 0x1 @@ -119,7 +119,7 @@ struct nl_pid_hash { struct netlink_table { struct nl_pid_hash hash; struct hlist_head mc_list; - unsigned long *listeners; + struct listeners __rcu *listeners; unsigned int nl_nonroot; unsigned int groups; struct mutex *cb_mutex; @@ -338,7 +338,7 @@ netlink_update_listeners(struct sock *sk) if (i < NLGRPLONGS(nlk_sk(sk)->ngroups)) mask |= nlk_sk(sk)->groups[i]; } - tbl->listeners[i] = mask; + tbl->listeners->masks[i] = mask; } /* this function is only called with the netlink table "grabbed", which * makes sure updates are visible before bind or setsockopt return. */ @@ -936,7 +936,7 @@ EXPORT_SYMBOL(netlink_unicast); int netlink_has_listeners(struct sock *sk, unsigned int group) { int res = 0; - unsigned long *listeners; + struct listeners *listeners; BUG_ON(!netlink_is_kernel(sk)); @@ -944,7 +944,7 @@ int netlink_has_listeners(struct sock *sk, unsigned int group) listeners = rcu_dereference(nl_table[sk->sk_protocol].listeners); if (group - 1 < nl_table[sk->sk_protocol].groups) - res = test_bit(group - 1, listeners); + res = test_bit(group - 1, listeners->masks); rcu_read_unlock(); @@ -1498,7 +1498,7 @@ netlink_kernel_create(struct net *net, int unit, unsigned int groups, struct socket *sock; struct sock *sk; struct netlink_sock *nlk; - unsigned long *listeners = NULL; + struct listeners *listeners = NULL; BUG_ON(!nl_table); @@ -1523,8 +1523,7 @@ netlink_kernel_create(struct net *net, int unit, unsigned int groups, if (groups < 32) groups = 32; - listeners = kzalloc(NLGRPSZ(groups) + sizeof(struct listeners_rcu_head), - GFP_KERNEL); + listeners = kzalloc(sizeof(*listeners) + NLGRPSZ(groups), GFP_KERNEL); if (!listeners) goto out_sock_release; @@ -1541,7 +1540,7 @@ netlink_kernel_create(struct net *net, int unit, unsigned int groups, netlink_table_grab(); if (!nl_table[unit].registered) { nl_table[unit].groups = groups; - nl_table[unit].listeners = listeners; + rcu_assign_pointer(nl_table[unit].listeners, listeners); nl_table[unit].cb_mutex = cb_mutex; nl_table[unit].module = module; nl_table[unit].registered = 1; @@ -1572,43 +1571,28 @@ netlink_kernel_release(struct sock *sk) EXPORT_SYMBOL(netlink_kernel_release); -static void netlink_free_old_listeners(struct rcu_head *rcu_head) +static void listeners_free_rcu(struct rcu_head *head) { - struct listeners_rcu_head *lrh; - - lrh = container_of(rcu_head, struct listeners_rcu_head, rcu_head); - kfree(lrh->ptr); + kfree(container_of(head, struct listeners, rcu)); } int __netlink_change_ngroups(struct sock *sk, unsigned int groups) { - unsigned long *listeners, *old = NULL; - struct listeners_rcu_head *old_rcu_head; + struct listeners *new, *old; struct netlink_table *tbl = &nl_table[sk->sk_protocol]; if (groups < 32) groups = 32; if (NLGRPSZ(tbl->groups) < NLGRPSZ(groups)) { - listeners = kzalloc(NLGRPSZ(groups) + - sizeof(struct listeners_rcu_head), - GFP_ATOMIC); - if (!listeners) + new = kzalloc(sizeof(*new) + NLGRPSZ(groups), GFP_ATOMIC); + if (!new) return -ENOMEM; - old = tbl->listeners; - memcpy(listeners, old, NLGRPSZ(tbl->groups)); - rcu_assign_pointer(tbl->listeners, listeners); - /* - * Free the old memory after an RCU grace period so we - * don't leak it. We use call_rcu() here in order to be - * able to call this function from atomic contexts. The - * allocation of this memory will have reserved enough - * space for struct listeners_rcu_head at the end. - */ - old_rcu_head = (void *)(tbl->listeners + - NLGRPLONGS(tbl->groups)); - old_rcu_head->ptr = old; - call_rcu(&old_rcu_head->rcu_head, netlink_free_old_listeners); + old = rcu_dereference_raw(tbl->listeners); + memcpy(new->masks, old->masks, NLGRPSZ(tbl->groups)); + rcu_assign_pointer(tbl->listeners, new); + + call_rcu(&old->rcu, listeners_free_rcu); } tbl->groups = groups; @@ -2104,18 +2088,17 @@ static void __net_exit netlink_net_exit(struct net *net) static void __init netlink_add_usersock_entry(void) { - unsigned long *listeners; + struct listeners *listeners; int groups = 32; - listeners = kzalloc(NLGRPSZ(groups) + sizeof(struct listeners_rcu_head), - GFP_KERNEL); + listeners = kzalloc(sizeof(*listeners) + NLGRPSZ(groups), GFP_KERNEL); if (!listeners) - panic("netlink_add_usersock_entry: Cannot allocate listneres\n"); + panic("netlink_add_usersock_entry: Cannot allocate listeners\n"); netlink_table_grab(); nl_table[NETLINK_USERSOCK].groups = groups; - nl_table[NETLINK_USERSOCK].listeners = listeners; + rcu_assign_pointer(nl_table[NETLINK_USERSOCK].listeners, listeners); nl_table[NETLINK_USERSOCK].module = THIS_MODULE; nl_table[NETLINK_USERSOCK].registered = 1; diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 3616f27b9d46..8298e676f5a0 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -1610,9 +1610,11 @@ static int packet_recvmsg(struct kiocb *iocb, struct socket *sock, err = -EINVAL; vnet_hdr_len = sizeof(vnet_hdr); - if ((len -= vnet_hdr_len) < 0) + if (len < vnet_hdr_len) goto out_free; + len -= vnet_hdr_len; + if (skb_is_gso(skb)) { struct skb_shared_info *sinfo = skb_shinfo(skb); @@ -1719,7 +1721,7 @@ static int packet_getname_spkt(struct socket *sock, struct sockaddr *uaddr, rcu_read_lock(); dev = dev_get_by_index_rcu(sock_net(sk), pkt_sk(sk)->ifindex); if (dev) - strlcpy(uaddr->sa_data, dev->name, 15); + strncpy(uaddr->sa_data, dev->name, 14); else memset(uaddr->sa_data, 0, 14); rcu_read_unlock(); @@ -1742,6 +1744,7 @@ static int packet_getname(struct socket *sock, struct sockaddr *uaddr, sll->sll_family = AF_PACKET; sll->sll_ifindex = po->ifindex; sll->sll_protocol = po->num; + sll->sll_pkttype = 0; rcu_read_lock(); dev = dev_get_by_index_rcu(sock_net(sk), po->ifindex); if (dev) { diff --git a/net/rds/loop.c b/net/rds/loop.c index c390156b426f..aeec1d483b17 100644 --- a/net/rds/loop.c +++ b/net/rds/loop.c @@ -134,8 +134,12 @@ static int rds_loop_conn_alloc(struct rds_connection *conn, gfp_t gfp) static void rds_loop_conn_free(void *arg) { struct rds_loop_connection *lc = arg; + unsigned long flags; + rdsdebug("lc %p\n", lc); + spin_lock_irqsave(&loop_conns_lock, flags); list_del(&lc->loop_node); + spin_unlock_irqrestore(&loop_conns_lock, flags); kfree(lc); } diff --git a/net/rds/message.c b/net/rds/message.c index a84545dae370..1fd3d29023d7 100644 --- a/net/rds/message.c +++ b/net/rds/message.c @@ -224,6 +224,9 @@ struct scatterlist *rds_message_alloc_sgs(struct rds_message *rm, int nents) WARN_ON(rm->m_used_sgs + nents > rm->m_total_sgs); WARN_ON(!nents); + if (rm->m_used_sgs + nents > rm->m_total_sgs) + return NULL; + sg_ret = &sg_first[rm->m_used_sgs]; sg_init_table(sg_ret, nents); rm->m_used_sgs += nents; @@ -246,6 +249,10 @@ struct rds_message *rds_message_map_pages(unsigned long *page_addrs, unsigned in rm->m_inc.i_hdr.h_len = cpu_to_be32(total_len); rm->data.op_nents = ceil(total_len, PAGE_SIZE); rm->data.op_sg = rds_message_alloc_sgs(rm, num_sgs); + if (!rm->data.op_sg) { + rds_message_put(rm); + return ERR_PTR(-ENOMEM); + } for (i = 0; i < rm->data.op_nents; ++i) { sg_set_page(&rm->data.op_sg[i], diff --git a/net/rds/rdma.c b/net/rds/rdma.c index 1a41debca1ce..4e37c1cbe8b2 100644 --- a/net/rds/rdma.c +++ b/net/rds/rdma.c @@ -479,13 +479,38 @@ void rds_atomic_free_op(struct rm_atomic_op *ao) /* - * Count the number of pages needed to describe an incoming iovec. + * Count the number of pages needed to describe an incoming iovec array. */ -static int rds_rdma_pages(struct rds_rdma_args *args) +static int rds_rdma_pages(struct rds_iovec iov[], int nr_iovecs) +{ + int tot_pages = 0; + unsigned int nr_pages; + unsigned int i; + + /* figure out the number of pages in the vector */ + for (i = 0; i < nr_iovecs; i++) { + nr_pages = rds_pages_in_vec(&iov[i]); + if (nr_pages == 0) + return -EINVAL; + + tot_pages += nr_pages; + + /* + * nr_pages for one entry is limited to (UINT_MAX>>PAGE_SHIFT)+1, + * so tot_pages cannot overflow without first going negative. + */ + if (tot_pages < 0) + return -EINVAL; + } + + return tot_pages; +} + +int rds_rdma_extra_size(struct rds_rdma_args *args) { struct rds_iovec vec; struct rds_iovec __user *local_vec; - unsigned int tot_pages = 0; + int tot_pages = 0; unsigned int nr_pages; unsigned int i; @@ -502,14 +527,16 @@ static int rds_rdma_pages(struct rds_rdma_args *args) return -EINVAL; tot_pages += nr_pages; - } - return tot_pages; -} + /* + * nr_pages for one entry is limited to (UINT_MAX>>PAGE_SHIFT)+1, + * so tot_pages cannot overflow without first going negative. + */ + if (tot_pages < 0) + return -EINVAL; + } -int rds_rdma_extra_size(struct rds_rdma_args *args) -{ - return rds_rdma_pages(args) * sizeof(struct scatterlist); + return tot_pages * sizeof(struct scatterlist); } /* @@ -520,13 +547,12 @@ int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm, struct cmsghdr *cmsg) { struct rds_rdma_args *args; - struct rds_iovec vec; struct rm_rdma_op *op = &rm->rdma; int nr_pages; unsigned int nr_bytes; struct page **pages = NULL; - struct rds_iovec __user *local_vec; - unsigned int nr; + struct rds_iovec iovstack[UIO_FASTIOV], *iovs = iovstack; + int iov_size; unsigned int i, j; int ret = 0; @@ -541,14 +567,31 @@ int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm, goto out; } - if (args->nr_local > (u64)UINT_MAX) { + if (args->nr_local > UIO_MAXIOV) { ret = -EMSGSIZE; goto out; } - nr_pages = rds_rdma_pages(args); - if (nr_pages < 0) + /* Check whether to allocate the iovec area */ + iov_size = args->nr_local * sizeof(struct rds_iovec); + if (args->nr_local > UIO_FASTIOV) { + iovs = sock_kmalloc(rds_rs_to_sk(rs), iov_size, GFP_KERNEL); + if (!iovs) { + ret = -ENOMEM; + goto out; + } + } + + if (copy_from_user(iovs, (struct rds_iovec __user *)(unsigned long) args->local_vec_addr, iov_size)) { + ret = -EFAULT; + goto out; + } + + nr_pages = rds_rdma_pages(iovs, args->nr_local); + if (nr_pages < 0) { + ret = -EINVAL; goto out; + } pages = kcalloc(nr_pages, sizeof(struct page *), GFP_KERNEL); if (!pages) { @@ -564,6 +607,10 @@ int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm, op->op_recverr = rs->rs_recverr; WARN_ON(!nr_pages); op->op_sg = rds_message_alloc_sgs(rm, nr_pages); + if (!op->op_sg) { + ret = -ENOMEM; + goto out; + } if (op->op_notify || op->op_recverr) { /* We allocate an uninitialized notifier here, because @@ -597,50 +644,40 @@ int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm, (unsigned long long)args->remote_vec.addr, op->op_rkey); - local_vec = (struct rds_iovec __user *)(unsigned long) args->local_vec_addr; - for (i = 0; i < args->nr_local; i++) { - if (copy_from_user(&vec, &local_vec[i], - sizeof(struct rds_iovec))) { - ret = -EFAULT; - goto out; - } - - nr = rds_pages_in_vec(&vec); - if (nr == 0) { - ret = -EINVAL; - goto out; - } + struct rds_iovec *iov = &iovs[i]; + /* don't need to check, rds_rdma_pages() verified nr will be +nonzero */ + unsigned int nr = rds_pages_in_vec(iov); - rs->rs_user_addr = vec.addr; - rs->rs_user_bytes = vec.bytes; + rs->rs_user_addr = iov->addr; + rs->rs_user_bytes = iov->bytes; /* If it's a WRITE operation, we want to pin the pages for reading. * If it's a READ operation, we need to pin the pages for writing. */ - ret = rds_pin_pages(vec.addr, nr, pages, !op->op_write); + ret = rds_pin_pages(iov->addr, nr, pages, !op->op_write); if (ret < 0) goto out; - rdsdebug("RDS: nr_bytes %u nr %u vec.bytes %llu vec.addr %llx\n", - nr_bytes, nr, vec.bytes, vec.addr); + rdsdebug("RDS: nr_bytes %u nr %u iov->bytes %llu iov->addr %llx\n", + nr_bytes, nr, iov->bytes, iov->addr); - nr_bytes += vec.bytes; + nr_bytes += iov->bytes; for (j = 0; j < nr; j++) { - unsigned int offset = vec.addr & ~PAGE_MASK; + unsigned int offset = iov->addr & ~PAGE_MASK; struct scatterlist *sg; sg = &op->op_sg[op->op_nents + j]; sg_set_page(sg, pages[j], - min_t(unsigned int, vec.bytes, PAGE_SIZE - offset), + min_t(unsigned int, iov->bytes, PAGE_SIZE - offset), offset); - rdsdebug("RDS: sg->offset %x sg->len %x vec.addr %llx vec.bytes %llu\n", - sg->offset, sg->length, vec.addr, vec.bytes); + rdsdebug("RDS: sg->offset %x sg->len %x iov->addr %llx iov->bytes %llu\n", + sg->offset, sg->length, iov->addr, iov->bytes); - vec.addr += sg->length; - vec.bytes -= sg->length; + iov->addr += sg->length; + iov->bytes -= sg->length; } op->op_nents += nr; @@ -655,13 +692,14 @@ int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm, } op->op_bytes = nr_bytes; - ret = 0; out: + if (iovs != iovstack) + sock_kfree_s(rds_rs_to_sk(rs), iovs, iov_size); kfree(pages); if (ret) rds_rdma_free_op(op); - - rds_stats_inc(s_send_rdma); + else + rds_stats_inc(s_send_rdma); return ret; } @@ -773,6 +811,10 @@ int rds_cmsg_atomic(struct rds_sock *rs, struct rds_message *rm, rm->atomic.op_active = 1; rm->atomic.op_recverr = rs->rs_recverr; rm->atomic.op_sg = rds_message_alloc_sgs(rm, 1); + if (!rm->atomic.op_sg) { + ret = -ENOMEM; + goto err; + } /* verify 8 byte-aligned */ if (args->local_addr & 0x7) { diff --git a/net/rds/send.c b/net/rds/send.c index 0bc9db17a87d..35b9c2e9caf1 100644 --- a/net/rds/send.c +++ b/net/rds/send.c @@ -973,6 +973,10 @@ int rds_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg, /* Attach data to the rm */ if (payload_len) { rm->data.op_sg = rds_message_alloc_sgs(rm, ceil(payload_len, PAGE_SIZE)); + if (!rm->data.op_sg) { + ret = -ENOMEM; + goto out; + } ret = rds_message_copy_from_user(rm, msg->msg_iov, payload_len); if (ret) goto out; diff --git a/net/rds/tcp.c b/net/rds/tcp.c index 08a8c6cf2d10..8e0a32001c90 100644 --- a/net/rds/tcp.c +++ b/net/rds/tcp.c @@ -221,7 +221,13 @@ static int rds_tcp_conn_alloc(struct rds_connection *conn, gfp_t gfp) static void rds_tcp_conn_free(void *arg) { struct rds_tcp_connection *tc = arg; + unsigned long flags; rdsdebug("freeing tc %p\n", tc); + + spin_lock_irqsave(&rds_tcp_conn_lock, flags); + list_del(&tc->t_tcp_node); + spin_unlock_irqrestore(&rds_tcp_conn_lock, flags); + kmem_cache_free(rds_tcp_conn_slab, tc); } diff --git a/net/sched/cls_basic.c b/net/sched/cls_basic.c index efd4f95fd050..f23d9155b1ef 100644 --- a/net/sched/cls_basic.c +++ b/net/sched/cls_basic.c @@ -268,6 +268,10 @@ static int basic_dump(struct tcf_proto *tp, unsigned long fh, goto nla_put_failure; nla_nest_end(skb, nest); + + if (tcf_exts_dump_stats(skb, &f->exts, &basic_ext_map) < 0) + goto nla_put_failure; + return skb->len; nla_put_failure: diff --git a/net/sched/cls_cgroup.c b/net/sched/cls_cgroup.c index 37dff78e9cb1..d49c40fb7e09 100644 --- a/net/sched/cls_cgroup.c +++ b/net/sched/cls_cgroup.c @@ -34,8 +34,6 @@ struct cgroup_subsys net_cls_subsys = { .populate = cgrp_populate, #ifdef CONFIG_NET_CLS_CGROUP .subsys_id = net_cls_subsys_id, -#else -#define net_cls_subsys_id net_cls_subsys.subsys_id #endif .module = THIS_MODULE, }; diff --git a/net/sched/em_text.c b/net/sched/em_text.c index 763253257411..ea8f566e720c 100644 --- a/net/sched/em_text.c +++ b/net/sched/em_text.c @@ -103,7 +103,8 @@ retry: static void em_text_destroy(struct tcf_proto *tp, struct tcf_ematch *m) { - textsearch_destroy(EM_TEXT_PRIV(m)->config); + if (EM_TEXT_PRIV(m) && EM_TEXT_PRIV(m)->config) + textsearch_destroy(EM_TEXT_PRIV(m)->config); } static int em_text_dump(struct sk_buff *skb, struct tcf_ematch *m) diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c index 1ef29c74d85e..e58f9476f29c 100644 --- a/net/sctp/protocol.c +++ b/net/sctp/protocol.c @@ -92,7 +92,7 @@ static struct sctp_af *sctp_af_v6_specific; struct kmem_cache *sctp_chunk_cachep __read_mostly; struct kmem_cache *sctp_bucket_cachep __read_mostly; -int sysctl_sctp_mem[3]; +long sysctl_sctp_mem[3]; int sysctl_sctp_rmem[3]; int sysctl_sctp_wmem[3]; diff --git a/net/sctp/socket.c b/net/sctp/socket.c index e34ca9cc1167..6bd554323a34 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -111,12 +111,12 @@ static void sctp_sock_migrate(struct sock *, struct sock *, static char *sctp_hmac_alg = SCTP_COOKIE_HMAC_ALG; extern struct kmem_cache *sctp_bucket_cachep; -extern int sysctl_sctp_mem[3]; +extern long sysctl_sctp_mem[3]; extern int sysctl_sctp_rmem[3]; extern int sysctl_sctp_wmem[3]; static int sctp_memory_pressure; -static atomic_t sctp_memory_allocated; +static atomic_long_t sctp_memory_allocated; struct percpu_counter sctp_sockets_allocated; static void sctp_enter_memory_pressure(struct sock *sk) diff --git a/net/sctp/sysctl.c b/net/sctp/sysctl.c index 832590bbe0c0..50cb57f0919e 100644 --- a/net/sctp/sysctl.c +++ b/net/sctp/sysctl.c @@ -54,7 +54,7 @@ static int sack_timer_max = 500; static int addr_scope_max = 3; /* check sctp_scope_policy_t in include/net/sctp/constants.h for max entries */ static int rwnd_scale_max = 16; -extern int sysctl_sctp_mem[3]; +extern long sysctl_sctp_mem[3]; extern int sysctl_sctp_rmem[3]; extern int sysctl_sctp_wmem[3]; @@ -203,7 +203,7 @@ static ctl_table sctp_table[] = { .data = &sysctl_sctp_mem, .maxlen = sizeof(sysctl_sctp_mem), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = proc_doulongvec_minmax }, { .procname = "sctp_rmem", diff --git a/net/socket.c b/net/socket.c index abf3e2561521..3ca2fd9e3720 100644 --- a/net/socket.c +++ b/net/socket.c @@ -305,19 +305,17 @@ static const struct super_operations sockfs_ops = { .statfs = simple_statfs, }; -static int sockfs_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data, - struct vfsmount *mnt) +static struct dentry *sockfs_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) { - return get_sb_pseudo(fs_type, "socket:", &sockfs_ops, SOCKFS_MAGIC, - mnt); + return mount_pseudo(fs_type, "socket:", &sockfs_ops, SOCKFS_MAGIC); } static struct vfsmount *sock_mnt __read_mostly; static struct file_system_type sock_fs_type = { .name = "sockfs", - .get_sb = sockfs_get_sb, + .mount = sockfs_mount, .kill_sb = kill_anon_super, }; @@ -377,7 +375,7 @@ static int sock_alloc_file(struct socket *sock, struct file **f, int flags) &socket_file_ops); if (unlikely(!file)) { /* drop dentry, keep inode */ - atomic_inc(&path.dentry->d_inode->i_count); + ihold(path.dentry->d_inode); path_put(&path); put_unused_fd(fd); return -ENFILE; @@ -480,6 +478,7 @@ static struct socket *sock_alloc(void) sock = SOCKET_I(inode); kmemcheck_annotate_bitfield(sock, type); + inode->i_ino = get_next_ino(); inode->i_mode = S_IFSOCK | S_IRWXUGO; inode->i_uid = current_fsuid(); inode->i_gid = current_fsgid(); @@ -1145,7 +1144,7 @@ call_kill: } EXPORT_SYMBOL(sock_wake_async); -static int __sock_create(struct net *net, int family, int type, int protocol, +int __sock_create(struct net *net, int family, int type, int protocol, struct socket **res, int kern) { int err; @@ -1257,6 +1256,7 @@ out_release: rcu_read_unlock(); goto out_sock_release; } +EXPORT_SYMBOL(__sock_create); int sock_create(int family, int type, int protocol, struct socket **res) { @@ -1652,6 +1652,8 @@ SYSCALL_DEFINE6(sendto, int, fd, void __user *, buff, size_t, len, struct iovec iov; int fput_needed; + if (len > INT_MAX) + len = INT_MAX; sock = sockfd_lookup_light(fd, &err, &fput_needed); if (!sock) goto out; @@ -1709,6 +1711,8 @@ SYSCALL_DEFINE6(recvfrom, int, fd, void __user *, ubuf, size_t, size, int err, err2; int fput_needed; + if (size > INT_MAX) + size = INT_MAX; sock = sockfd_lookup_light(fd, &err, &fput_needed); if (!sock) goto out; diff --git a/net/sunrpc/Kconfig b/net/sunrpc/Kconfig index 3376d7657185..8873fd8ddacd 100644 --- a/net/sunrpc/Kconfig +++ b/net/sunrpc/Kconfig @@ -36,22 +36,3 @@ config RPCSEC_GSS_KRB5 Kerberos support should be installed. If unsure, say Y. - -config RPCSEC_GSS_SPKM3 - tristate "Secure RPC: SPKM3 mechanism (EXPERIMENTAL)" - depends on SUNRPC && EXPERIMENTAL - select SUNRPC_GSS - select CRYPTO - select CRYPTO_MD5 - select CRYPTO_DES - select CRYPTO_CAST5 - select CRYPTO_CBC - help - Choose Y here to enable Secure RPC using the SPKM3 public key - GSS-API mechanism (RFC 2025). - - Secure RPC calls with SPKM3 require an auxiliary userspace - daemon which may be found in the Linux nfs-utils package - available from http://linux-nfs.org/. - - If unsure, say N. diff --git a/net/sunrpc/auth.c b/net/sunrpc/auth.c index e9eaaf7d43c1..afe67849269f 100644 --- a/net/sunrpc/auth.c +++ b/net/sunrpc/auth.c @@ -595,7 +595,7 @@ rpcauth_unwrap_resp(struct rpc_task *task, kxdrproc_t decode, void *rqstp, int rpcauth_refreshcred(struct rpc_task *task) { - struct rpc_cred *cred = task->tk_rqstp->rq_cred; + struct rpc_cred *cred; int err; cred = task->tk_rqstp->rq_cred; @@ -658,7 +658,7 @@ out1: return err; } -void __exit rpcauth_remove_module(void) +void rpcauth_remove_module(void) { rpc_destroy_authunix(); rpc_destroy_generic_auth(); diff --git a/net/sunrpc/auth_generic.c b/net/sunrpc/auth_generic.c index 43162bb3b78f..e010a015d996 100644 --- a/net/sunrpc/auth_generic.c +++ b/net/sunrpc/auth_generic.c @@ -158,7 +158,7 @@ int __init rpc_init_generic_auth(void) return rpcauth_init_credcache(&generic_auth); } -void __exit rpc_destroy_generic_auth(void) +void rpc_destroy_generic_auth(void) { rpcauth_destroy_credcache(&generic_auth); } diff --git a/net/sunrpc/auth_gss/Makefile b/net/sunrpc/auth_gss/Makefile index 74a231735f67..7350d86a32ee 100644 --- a/net/sunrpc/auth_gss/Makefile +++ b/net/sunrpc/auth_gss/Makefile @@ -11,8 +11,3 @@ obj-$(CONFIG_RPCSEC_GSS_KRB5) += rpcsec_gss_krb5.o rpcsec_gss_krb5-objs := gss_krb5_mech.o gss_krb5_seal.o gss_krb5_unseal.o \ gss_krb5_seqnum.o gss_krb5_wrap.o gss_krb5_crypto.o gss_krb5_keys.o - -obj-$(CONFIG_RPCSEC_GSS_SPKM3) += rpcsec_gss_spkm3.o - -rpcsec_gss_spkm3-objs := gss_spkm3_mech.o gss_spkm3_seal.o gss_spkm3_unseal.o \ - gss_spkm3_token.o diff --git a/net/sunrpc/auth_gss/gss_krb5_mech.c b/net/sunrpc/auth_gss/gss_krb5_mech.c index 778e5dfc5144..f375decc024b 100644 --- a/net/sunrpc/auth_gss/gss_krb5_mech.c +++ b/net/sunrpc/auth_gss/gss_krb5_mech.c @@ -427,7 +427,7 @@ static int context_derive_keys_rc4(struct krb5_ctx *ctx) { struct crypto_hash *hmac; - char sigkeyconstant[] = "signaturekey"; + static const char sigkeyconstant[] = "signaturekey"; int slen = strlen(sigkeyconstant) + 1; /* include null terminator */ struct hash_desc desc; struct scatterlist sg[1]; diff --git a/net/sunrpc/auth_gss/gss_spkm3_mech.c b/net/sunrpc/auth_gss/gss_spkm3_mech.c deleted file mode 100644 index adade3d313f2..000000000000 --- a/net/sunrpc/auth_gss/gss_spkm3_mech.c +++ /dev/null @@ -1,247 +0,0 @@ -/* - * linux/net/sunrpc/gss_spkm3_mech.c - * - * Copyright (c) 2003 The Regents of the University of Michigan. - * All rights reserved. - * - * Andy Adamson <andros@umich.edu> - * J. Bruce Fields <bfields@umich.edu> - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. Neither the name of the University nor the names of its - * contributors may be used to endorse or promote products derived - * from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED - * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR - * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF - * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING - * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - */ - -#include <linux/err.h> -#include <linux/module.h> -#include <linux/init.h> -#include <linux/types.h> -#include <linux/slab.h> -#include <linux/sunrpc/auth.h> -#include <linux/in.h> -#include <linux/sunrpc/svcauth_gss.h> -#include <linux/sunrpc/gss_spkm3.h> -#include <linux/sunrpc/xdr.h> -#include <linux/crypto.h> - -#ifdef RPC_DEBUG -# define RPCDBG_FACILITY RPCDBG_AUTH -#endif - -static const void * -simple_get_bytes(const void *p, const void *end, void *res, int len) -{ - const void *q = (const void *)((const char *)p + len); - if (unlikely(q > end || q < p)) - return ERR_PTR(-EFAULT); - memcpy(res, p, len); - return q; -} - -static const void * -simple_get_netobj(const void *p, const void *end, struct xdr_netobj *res) -{ - const void *q; - unsigned int len; - p = simple_get_bytes(p, end, &len, sizeof(len)); - if (IS_ERR(p)) - return p; - res->len = len; - if (len == 0) { - res->data = NULL; - return p; - } - q = (const void *)((const char *)p + len); - if (unlikely(q > end || q < p)) - return ERR_PTR(-EFAULT); - res->data = kmemdup(p, len, GFP_NOFS); - if (unlikely(res->data == NULL)) - return ERR_PTR(-ENOMEM); - return q; -} - -static int -gss_import_sec_context_spkm3(const void *p, size_t len, - struct gss_ctx *ctx_id, - gfp_t gfp_mask) -{ - const void *end = (const void *)((const char *)p + len); - struct spkm3_ctx *ctx; - int version; - - if (!(ctx = kzalloc(sizeof(*ctx), gfp_mask))) - goto out_err; - - p = simple_get_bytes(p, end, &version, sizeof(version)); - if (IS_ERR(p)) - goto out_err_free_ctx; - if (version != 1) { - dprintk("RPC: unknown spkm3 token format: " - "obsolete nfs-utils?\n"); - p = ERR_PTR(-EINVAL); - goto out_err_free_ctx; - } - - p = simple_get_netobj(p, end, &ctx->ctx_id); - if (IS_ERR(p)) - goto out_err_free_ctx; - - p = simple_get_bytes(p, end, &ctx->endtime, sizeof(ctx->endtime)); - if (IS_ERR(p)) - goto out_err_free_ctx_id; - - p = simple_get_netobj(p, end, &ctx->mech_used); - if (IS_ERR(p)) - goto out_err_free_ctx_id; - - p = simple_get_bytes(p, end, &ctx->ret_flags, sizeof(ctx->ret_flags)); - if (IS_ERR(p)) - goto out_err_free_mech; - - p = simple_get_netobj(p, end, &ctx->conf_alg); - if (IS_ERR(p)) - goto out_err_free_mech; - - p = simple_get_netobj(p, end, &ctx->derived_conf_key); - if (IS_ERR(p)) - goto out_err_free_conf_alg; - - p = simple_get_netobj(p, end, &ctx->intg_alg); - if (IS_ERR(p)) - goto out_err_free_conf_key; - - p = simple_get_netobj(p, end, &ctx->derived_integ_key); - if (IS_ERR(p)) - goto out_err_free_intg_alg; - - if (p != end) { - p = ERR_PTR(-EFAULT); - goto out_err_free_intg_key; - } - - ctx_id->internal_ctx_id = ctx; - - dprintk("RPC: Successfully imported new spkm context.\n"); - return 0; - -out_err_free_intg_key: - kfree(ctx->derived_integ_key.data); -out_err_free_intg_alg: - kfree(ctx->intg_alg.data); -out_err_free_conf_key: - kfree(ctx->derived_conf_key.data); -out_err_free_conf_alg: - kfree(ctx->conf_alg.data); -out_err_free_mech: - kfree(ctx->mech_used.data); -out_err_free_ctx_id: - kfree(ctx->ctx_id.data); -out_err_free_ctx: - kfree(ctx); -out_err: - return PTR_ERR(p); -} - -static void -gss_delete_sec_context_spkm3(void *internal_ctx) -{ - struct spkm3_ctx *sctx = internal_ctx; - - kfree(sctx->derived_integ_key.data); - kfree(sctx->intg_alg.data); - kfree(sctx->derived_conf_key.data); - kfree(sctx->conf_alg.data); - kfree(sctx->mech_used.data); - kfree(sctx->ctx_id.data); - kfree(sctx); -} - -static u32 -gss_verify_mic_spkm3(struct gss_ctx *ctx, - struct xdr_buf *signbuf, - struct xdr_netobj *checksum) -{ - u32 maj_stat = 0; - struct spkm3_ctx *sctx = ctx->internal_ctx_id; - - maj_stat = spkm3_read_token(sctx, checksum, signbuf, SPKM_MIC_TOK); - - dprintk("RPC: gss_verify_mic_spkm3 returning %d\n", maj_stat); - return maj_stat; -} - -static u32 -gss_get_mic_spkm3(struct gss_ctx *ctx, - struct xdr_buf *message_buffer, - struct xdr_netobj *message_token) -{ - u32 err = 0; - struct spkm3_ctx *sctx = ctx->internal_ctx_id; - - err = spkm3_make_token(sctx, message_buffer, - message_token, SPKM_MIC_TOK); - dprintk("RPC: gss_get_mic_spkm3 returning %d\n", err); - return err; -} - -static const struct gss_api_ops gss_spkm3_ops = { - .gss_import_sec_context = gss_import_sec_context_spkm3, - .gss_get_mic = gss_get_mic_spkm3, - .gss_verify_mic = gss_verify_mic_spkm3, - .gss_delete_sec_context = gss_delete_sec_context_spkm3, -}; - -static struct pf_desc gss_spkm3_pfs[] = { - {RPC_AUTH_GSS_SPKM, RPC_GSS_SVC_NONE, "spkm3"}, - {RPC_AUTH_GSS_SPKMI, RPC_GSS_SVC_INTEGRITY, "spkm3i"}, -}; - -static struct gss_api_mech gss_spkm3_mech = { - .gm_name = "spkm3", - .gm_owner = THIS_MODULE, - .gm_oid = {7, "\053\006\001\005\005\001\003"}, - .gm_ops = &gss_spkm3_ops, - .gm_pf_num = ARRAY_SIZE(gss_spkm3_pfs), - .gm_pfs = gss_spkm3_pfs, -}; - -static int __init init_spkm3_module(void) -{ - int status; - - status = gss_mech_register(&gss_spkm3_mech); - if (status) - printk("Failed to register spkm3 gss mechanism!\n"); - return status; -} - -static void __exit cleanup_spkm3_module(void) -{ - gss_mech_unregister(&gss_spkm3_mech); -} - -MODULE_LICENSE("GPL"); -module_init(init_spkm3_module); -module_exit(cleanup_spkm3_module); diff --git a/net/sunrpc/auth_gss/gss_spkm3_seal.c b/net/sunrpc/auth_gss/gss_spkm3_seal.c deleted file mode 100644 index 5a3a65a0e2b4..000000000000 --- a/net/sunrpc/auth_gss/gss_spkm3_seal.c +++ /dev/null @@ -1,186 +0,0 @@ -/* - * linux/net/sunrpc/gss_spkm3_seal.c - * - * Copyright (c) 2003 The Regents of the University of Michigan. - * All rights reserved. - * - * Andy Adamson <andros@umich.edu> - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. Neither the name of the University nor the names of its - * contributors may be used to endorse or promote products derived - * from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED - * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR - * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF - * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING - * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - */ - -#include <linux/types.h> -#include <linux/jiffies.h> -#include <linux/sunrpc/gss_spkm3.h> -#include <linux/random.h> -#include <linux/crypto.h> -#include <linux/pagemap.h> -#include <linux/scatterlist.h> -#include <linux/sunrpc/xdr.h> - -#ifdef RPC_DEBUG -# define RPCDBG_FACILITY RPCDBG_AUTH -#endif - -const struct xdr_netobj hmac_md5_oid = { 8, "\x2B\x06\x01\x05\x05\x08\x01\x01"}; -const struct xdr_netobj cast5_cbc_oid = {9, "\x2A\x86\x48\x86\xF6\x7D\x07\x42\x0A"}; - -/* - * spkm3_make_token() - * - * Only SPKM_MIC_TOK with md5 intg-alg is supported - */ - -u32 -spkm3_make_token(struct spkm3_ctx *ctx, - struct xdr_buf * text, struct xdr_netobj * token, - int toktype) -{ - s32 checksum_type; - char tokhdrbuf[25]; - char cksumdata[16]; - struct xdr_netobj md5cksum = {.len = 0, .data = cksumdata}; - struct xdr_netobj mic_hdr = {.len = 0, .data = tokhdrbuf}; - int tokenlen = 0; - unsigned char *ptr; - s32 now; - int ctxelen = 0, ctxzbit = 0; - int md5elen = 0, md5zbit = 0; - - now = jiffies; - - if (ctx->ctx_id.len != 16) { - dprintk("RPC: spkm3_make_token BAD ctx_id.len %d\n", - ctx->ctx_id.len); - goto out_err; - } - - if (!g_OID_equal(&ctx->intg_alg, &hmac_md5_oid)) { - dprintk("RPC: gss_spkm3_seal: unsupported I-ALG " - "algorithm. only support hmac-md5 I-ALG.\n"); - goto out_err; - } else - checksum_type = CKSUMTYPE_HMAC_MD5; - - if (!g_OID_equal(&ctx->conf_alg, &cast5_cbc_oid)) { - dprintk("RPC: gss_spkm3_seal: unsupported C-ALG " - "algorithm\n"); - goto out_err; - } - - if (toktype == SPKM_MIC_TOK) { - /* Calculate checksum over the mic-header */ - asn1_bitstring_len(&ctx->ctx_id, &ctxelen, &ctxzbit); - spkm3_mic_header(&mic_hdr.data, &mic_hdr.len, ctx->ctx_id.data, - ctxelen, ctxzbit); - if (make_spkm3_checksum(checksum_type, &ctx->derived_integ_key, - (char *)mic_hdr.data, mic_hdr.len, - text, 0, &md5cksum)) - goto out_err; - - asn1_bitstring_len(&md5cksum, &md5elen, &md5zbit); - tokenlen = 10 + ctxelen + 1 + md5elen + 1; - - /* Create token header using generic routines */ - token->len = g_token_size(&ctx->mech_used, tokenlen + 2); - - ptr = token->data; - g_make_token_header(&ctx->mech_used, tokenlen + 2, &ptr); - - spkm3_make_mic_token(&ptr, tokenlen, &mic_hdr, &md5cksum, md5elen, md5zbit); - } else if (toktype == SPKM_WRAP_TOK) { /* Not Supported */ - dprintk("RPC: gss_spkm3_seal: SPKM_WRAP_TOK " - "not supported\n"); - goto out_err; - } - - /* XXX need to implement sequence numbers, and ctx->expired */ - - return GSS_S_COMPLETE; -out_err: - token->data = NULL; - token->len = 0; - return GSS_S_FAILURE; -} - -static int -spkm3_checksummer(struct scatterlist *sg, void *data) -{ - struct hash_desc *desc = data; - - return crypto_hash_update(desc, sg, sg->length); -} - -/* checksum the plaintext data and hdrlen bytes of the token header */ -s32 -make_spkm3_checksum(s32 cksumtype, struct xdr_netobj *key, char *header, - unsigned int hdrlen, struct xdr_buf *body, - unsigned int body_offset, struct xdr_netobj *cksum) -{ - char *cksumname; - struct hash_desc desc; /* XXX add to ctx? */ - struct scatterlist sg[1]; - int err; - - switch (cksumtype) { - case CKSUMTYPE_HMAC_MD5: - cksumname = "hmac(md5)"; - break; - default: - dprintk("RPC: spkm3_make_checksum:" - " unsupported checksum %d", cksumtype); - return GSS_S_FAILURE; - } - - if (key->data == NULL || key->len <= 0) return GSS_S_FAILURE; - - desc.tfm = crypto_alloc_hash(cksumname, 0, CRYPTO_ALG_ASYNC); - if (IS_ERR(desc.tfm)) - return GSS_S_FAILURE; - cksum->len = crypto_hash_digestsize(desc.tfm); - desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP; - - err = crypto_hash_setkey(desc.tfm, key->data, key->len); - if (err) - goto out; - - err = crypto_hash_init(&desc); - if (err) - goto out; - - sg_init_one(sg, header, hdrlen); - crypto_hash_update(&desc, sg, sg->length); - - xdr_process_buf(body, body_offset, body->len - body_offset, - spkm3_checksummer, &desc); - crypto_hash_final(&desc, cksum->data); - -out: - crypto_free_hash(desc.tfm); - - return err ? GSS_S_FAILURE : 0; -} diff --git a/net/sunrpc/auth_gss/gss_spkm3_token.c b/net/sunrpc/auth_gss/gss_spkm3_token.c deleted file mode 100644 index a99825d7caa0..000000000000 --- a/net/sunrpc/auth_gss/gss_spkm3_token.c +++ /dev/null @@ -1,267 +0,0 @@ -/* - * linux/net/sunrpc/gss_spkm3_token.c - * - * Copyright (c) 2003 The Regents of the University of Michigan. - * All rights reserved. - * - * Andy Adamson <andros@umich.edu> - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. Neither the name of the University nor the names of its - * contributors may be used to endorse or promote products derived - * from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED - * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR - * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF - * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING - * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - */ - -#include <linux/types.h> -#include <linux/slab.h> -#include <linux/jiffies.h> -#include <linux/sunrpc/gss_spkm3.h> -#include <linux/random.h> -#include <linux/crypto.h> - -#ifdef RPC_DEBUG -# define RPCDBG_FACILITY RPCDBG_AUTH -#endif - -/* - * asn1_bitstring_len() - * - * calculate the asn1 bitstring length of the xdr_netobject - */ -void -asn1_bitstring_len(struct xdr_netobj *in, int *enclen, int *zerobits) -{ - int i, zbit = 0,elen = in->len; - char *ptr; - - ptr = &in->data[in->len -1]; - - /* count trailing 0's */ - for(i = in->len; i > 0; i--) { - if (*ptr == 0) { - ptr--; - elen--; - } else - break; - } - - /* count number of 0 bits in final octet */ - ptr = &in->data[elen - 1]; - for(i = 0; i < 8; i++) { - short mask = 0x01; - - if (!((mask << i) & *ptr)) - zbit++; - else - break; - } - *enclen = elen; - *zerobits = zbit; -} - -/* - * decode_asn1_bitstring() - * - * decode a bitstring into a buffer of the expected length. - * enclen = bit string length - * explen = expected length (define in rfc) - */ -int -decode_asn1_bitstring(struct xdr_netobj *out, char *in, int enclen, int explen) -{ - if (!(out->data = kzalloc(explen,GFP_NOFS))) - return 0; - out->len = explen; - memcpy(out->data, in, enclen); - return 1; -} - -/* - * SPKMInnerContextToken choice SPKM_MIC asn1 token layout - * - * contextid is always 16 bytes plain data. max asn1 bitstring len = 17. - * - * tokenlen = pos[0] to end of token (max pos[45] with MD5 cksum) - * - * pos value - * ---------- - * [0] a4 SPKM-MIC tag - * [1] ?? innertoken length (max 44) - * - * - * tok_hdr piece of checksum data starts here - * - * the maximum mic-header len = 9 + 17 = 26 - * mic-header - * ---------- - * [2] 30 SEQUENCE tag - * [3] ?? mic-header length: (max 23) = TokenID + ContextID - * - * TokenID - all fields constant and can be hardcoded - * ------- - * [4] 02 Type 2 - * [5] 02 Length 2 - * [6][7] 01 01 TokenID (SPKM_MIC_TOK) - * - * ContextID - encoded length not constant, calculated - * --------- - * [8] 03 Type 3 - * [9] ?? encoded length - * [10] ?? ctxzbit - * [11] contextid - * - * mic_header piece of checksum data ends here. - * - * int-cksum - encoded length not constant, calculated - * --------- - * [??] 03 Type 3 - * [??] ?? encoded length - * [??] ?? md5zbit - * [??] int-cksum (NID_md5 = 16) - * - * maximum SPKM-MIC innercontext token length = - * 10 + encoded contextid_size(17 max) + 2 + encoded - * cksum_size (17 maxfor NID_md5) = 46 - */ - -/* - * spkm3_mic_header() - * - * Prepare the SPKM_MIC_TOK mic-header for check-sum calculation - * elen: 16 byte context id asn1 bitstring encoded length - */ -void -spkm3_mic_header(unsigned char **hdrbuf, unsigned int *hdrlen, unsigned char *ctxdata, int elen, int zbit) -{ - char *hptr = *hdrbuf; - char *top = *hdrbuf; - - *(u8 *)hptr++ = 0x30; - *(u8 *)hptr++ = elen + 7; /* on the wire header length */ - - /* tokenid */ - *(u8 *)hptr++ = 0x02; - *(u8 *)hptr++ = 0x02; - *(u8 *)hptr++ = 0x01; - *(u8 *)hptr++ = 0x01; - - /* coniextid */ - *(u8 *)hptr++ = 0x03; - *(u8 *)hptr++ = elen + 1; /* add 1 to include zbit */ - *(u8 *)hptr++ = zbit; - memcpy(hptr, ctxdata, elen); - hptr += elen; - *hdrlen = hptr - top; -} - -/* - * spkm3_mic_innercontext_token() - * - * *tokp points to the beginning of the SPKM_MIC token described - * in rfc 2025, section 3.2.1: - * - * toklen is the inner token length - */ -void -spkm3_make_mic_token(unsigned char **tokp, int toklen, struct xdr_netobj *mic_hdr, struct xdr_netobj *md5cksum, int md5elen, int md5zbit) -{ - unsigned char *ict = *tokp; - - *(u8 *)ict++ = 0xa4; - *(u8 *)ict++ = toklen; - memcpy(ict, mic_hdr->data, mic_hdr->len); - ict += mic_hdr->len; - - *(u8 *)ict++ = 0x03; - *(u8 *)ict++ = md5elen + 1; /* add 1 to include zbit */ - *(u8 *)ict++ = md5zbit; - memcpy(ict, md5cksum->data, md5elen); -} - -u32 -spkm3_verify_mic_token(unsigned char **tokp, int *mic_hdrlen, unsigned char **cksum) -{ - struct xdr_netobj spkm3_ctx_id = {.len =0, .data = NULL}; - unsigned char *ptr = *tokp; - int ctxelen; - u32 ret = GSS_S_DEFECTIVE_TOKEN; - - /* spkm3 innercontext token preamble */ - if ((ptr[0] != 0xa4) || (ptr[2] != 0x30)) { - dprintk("RPC: BAD SPKM ictoken preamble\n"); - goto out; - } - - *mic_hdrlen = ptr[3]; - - /* token type */ - if ((ptr[4] != 0x02) || (ptr[5] != 0x02)) { - dprintk("RPC: BAD asn1 SPKM3 token type\n"); - goto out; - } - - /* only support SPKM_MIC_TOK */ - if((ptr[6] != 0x01) || (ptr[7] != 0x01)) { - dprintk("RPC: ERROR unsupported SPKM3 token\n"); - goto out; - } - - /* contextid */ - if (ptr[8] != 0x03) { - dprintk("RPC: BAD SPKM3 asn1 context-id type\n"); - goto out; - } - - ctxelen = ptr[9]; - if (ctxelen > 17) { /* length includes asn1 zbit octet */ - dprintk("RPC: BAD SPKM3 contextid len %d\n", ctxelen); - goto out; - } - - /* ignore ptr[10] */ - - if(!decode_asn1_bitstring(&spkm3_ctx_id, &ptr[11], ctxelen - 1, 16)) - goto out; - - /* - * in the current implementation: the optional int-alg is not present - * so the default int-alg (md5) is used the optional snd-seq field is - * also not present - */ - - if (*mic_hdrlen != 6 + ctxelen) { - dprintk("RPC: BAD SPKM_ MIC_TOK header len %d: we only " - "support default int-alg (should be absent) " - "and do not support snd-seq\n", *mic_hdrlen); - goto out; - } - /* checksum */ - *cksum = (&ptr[10] + ctxelen); /* ctxelen includes ptr[10] */ - - ret = GSS_S_COMPLETE; -out: - kfree(spkm3_ctx_id.data); - return ret; -} - diff --git a/net/sunrpc/auth_gss/gss_spkm3_unseal.c b/net/sunrpc/auth_gss/gss_spkm3_unseal.c deleted file mode 100644 index cc21ee860bb6..000000000000 --- a/net/sunrpc/auth_gss/gss_spkm3_unseal.c +++ /dev/null @@ -1,127 +0,0 @@ -/* - * linux/net/sunrpc/gss_spkm3_unseal.c - * - * Copyright (c) 2003 The Regents of the University of Michigan. - * All rights reserved. - * - * Andy Adamson <andros@umich.edu> - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. Neither the name of the University nor the names of its - * contributors may be used to endorse or promote products derived - * from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED - * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR - * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF - * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING - * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - */ - -#include <linux/types.h> -#include <linux/slab.h> -#include <linux/jiffies.h> -#include <linux/sunrpc/gss_spkm3.h> -#include <linux/crypto.h> - -#ifdef RPC_DEBUG -# define RPCDBG_FACILITY RPCDBG_AUTH -#endif - -/* - * spkm3_read_token() - * - * only SPKM_MIC_TOK with md5 intg-alg is supported - */ -u32 -spkm3_read_token(struct spkm3_ctx *ctx, - struct xdr_netobj *read_token, /* checksum */ - struct xdr_buf *message_buffer, /* signbuf */ - int toktype) -{ - s32 checksum_type; - s32 code; - struct xdr_netobj wire_cksum = {.len =0, .data = NULL}; - char cksumdata[16]; - struct xdr_netobj md5cksum = {.len = 0, .data = cksumdata}; - unsigned char *ptr = (unsigned char *)read_token->data; - unsigned char *cksum; - int bodysize, md5elen; - int mic_hdrlen; - u32 ret = GSS_S_DEFECTIVE_TOKEN; - - if (g_verify_token_header((struct xdr_netobj *) &ctx->mech_used, - &bodysize, &ptr, read_token->len)) - goto out; - - /* decode the token */ - - if (toktype != SPKM_MIC_TOK) { - dprintk("RPC: BAD SPKM3 token type: %d\n", toktype); - goto out; - } - - if ((ret = spkm3_verify_mic_token(&ptr, &mic_hdrlen, &cksum))) - goto out; - - if (*cksum++ != 0x03) { - dprintk("RPC: spkm3_read_token BAD checksum type\n"); - goto out; - } - md5elen = *cksum++; - cksum++; /* move past the zbit */ - - if (!decode_asn1_bitstring(&wire_cksum, cksum, md5elen - 1, 16)) - goto out; - - /* HARD CODED FOR MD5 */ - - /* compute the checksum of the message. - * ptr + 2 = start of header piece of checksum - * mic_hdrlen + 2 = length of header piece of checksum - */ - ret = GSS_S_DEFECTIVE_TOKEN; - if (!g_OID_equal(&ctx->intg_alg, &hmac_md5_oid)) { - dprintk("RPC: gss_spkm3_seal: unsupported I-ALG " - "algorithm\n"); - goto out; - } - - checksum_type = CKSUMTYPE_HMAC_MD5; - - code = make_spkm3_checksum(checksum_type, - &ctx->derived_integ_key, ptr + 2, mic_hdrlen + 2, - message_buffer, 0, &md5cksum); - - if (code) - goto out; - - ret = GSS_S_BAD_SIG; - code = memcmp(md5cksum.data, wire_cksum.data, wire_cksum.len); - if (code) { - dprintk("RPC: bad MIC checksum\n"); - goto out; - } - - - /* XXX: need to add expiration and sequencing */ - ret = GSS_S_COMPLETE; -out: - kfree(wire_cksum.data); - return ret; -} diff --git a/net/sunrpc/auth_gss/svcauth_gss.c b/net/sunrpc/auth_gss/svcauth_gss.c index cc385b3a59c2..dec2a6fc7c12 100644 --- a/net/sunrpc/auth_gss/svcauth_gss.c +++ b/net/sunrpc/auth_gss/svcauth_gss.c @@ -964,7 +964,7 @@ svcauth_gss_set_client(struct svc_rqst *rqstp) if (rqstp->rq_gssclient == NULL) return SVC_DENIED; stat = svcauth_unix_set_client(rqstp); - if (stat == SVC_DROP) + if (stat == SVC_DROP || stat == SVC_CLOSE) return stat; return SVC_OK; } @@ -1018,7 +1018,7 @@ static int svcauth_gss_handle_init(struct svc_rqst *rqstp, return SVC_DENIED; memset(&rsikey, 0, sizeof(rsikey)); if (dup_netobj(&rsikey.in_handle, &gc->gc_ctx)) - return SVC_DROP; + return SVC_CLOSE; *authp = rpc_autherr_badverf; if (svc_safe_getnetobj(argv, &tmpobj)) { kfree(rsikey.in_handle.data); @@ -1026,38 +1026,35 @@ static int svcauth_gss_handle_init(struct svc_rqst *rqstp, } if (dup_netobj(&rsikey.in_token, &tmpobj)) { kfree(rsikey.in_handle.data); - return SVC_DROP; + return SVC_CLOSE; } /* Perform upcall, or find upcall result: */ rsip = rsi_lookup(&rsikey); rsi_free(&rsikey); if (!rsip) - return SVC_DROP; - switch (cache_check(&rsi_cache, &rsip->h, &rqstp->rq_chandle)) { - case -EAGAIN: - case -ETIMEDOUT: - case -ENOENT: + return SVC_CLOSE; + if (cache_check(&rsi_cache, &rsip->h, &rqstp->rq_chandle) < 0) /* No upcall result: */ - return SVC_DROP; - case 0: - ret = SVC_DROP; - /* Got an answer to the upcall; use it: */ - if (gss_write_init_verf(rqstp, rsip)) - goto out; - if (resv->iov_len + 4 > PAGE_SIZE) - goto out; - svc_putnl(resv, RPC_SUCCESS); - if (svc_safe_putnetobj(resv, &rsip->out_handle)) - goto out; - if (resv->iov_len + 3 * 4 > PAGE_SIZE) - goto out; - svc_putnl(resv, rsip->major_status); - svc_putnl(resv, rsip->minor_status); - svc_putnl(resv, GSS_SEQ_WIN); - if (svc_safe_putnetobj(resv, &rsip->out_token)) - goto out; - } + return SVC_CLOSE; + + ret = SVC_CLOSE; + /* Got an answer to the upcall; use it: */ + if (gss_write_init_verf(rqstp, rsip)) + goto out; + if (resv->iov_len + 4 > PAGE_SIZE) + goto out; + svc_putnl(resv, RPC_SUCCESS); + if (svc_safe_putnetobj(resv, &rsip->out_handle)) + goto out; + if (resv->iov_len + 3 * 4 > PAGE_SIZE) + goto out; + svc_putnl(resv, rsip->major_status); + svc_putnl(resv, rsip->minor_status); + svc_putnl(resv, GSS_SEQ_WIN); + if (svc_safe_putnetobj(resv, &rsip->out_token)) + goto out; + ret = SVC_COMPLETE; out: cache_put(&rsip->h, &rsi_cache); diff --git a/net/sunrpc/cache.c b/net/sunrpc/cache.c index 7dce81a926c5..e433e7580e27 100644 --- a/net/sunrpc/cache.c +++ b/net/sunrpc/cache.c @@ -33,15 +33,16 @@ #include <linux/sunrpc/cache.h> #include <linux/sunrpc/stats.h> #include <linux/sunrpc/rpc_pipe_fs.h> +#include "netns.h" #define RPCDBG_FACILITY RPCDBG_CACHE -static int cache_defer_req(struct cache_req *req, struct cache_head *item); +static void cache_defer_req(struct cache_req *req, struct cache_head *item); static void cache_revisit_request(struct cache_head *item); static void cache_init(struct cache_head *h) { - time_t now = get_seconds(); + time_t now = seconds_since_boot(); h->next = NULL; h->flags = 0; kref_init(&h->ref); @@ -51,7 +52,7 @@ static void cache_init(struct cache_head *h) static inline int cache_is_expired(struct cache_detail *detail, struct cache_head *h) { - return (h->expiry_time < get_seconds()) || + return (h->expiry_time < seconds_since_boot()) || (detail->flush_time > h->last_refresh); } @@ -126,7 +127,7 @@ static void cache_dequeue(struct cache_detail *detail, struct cache_head *ch); static void cache_fresh_locked(struct cache_head *head, time_t expiry) { head->expiry_time = expiry; - head->last_refresh = get_seconds(); + head->last_refresh = seconds_since_boot(); set_bit(CACHE_VALID, &head->flags); } @@ -237,7 +238,7 @@ int cache_check(struct cache_detail *detail, /* now see if we want to start an upcall */ refresh_age = (h->expiry_time - h->last_refresh); - age = get_seconds() - h->last_refresh; + age = seconds_since_boot() - h->last_refresh; if (rqstp == NULL) { if (rv == -EAGAIN) @@ -252,7 +253,7 @@ int cache_check(struct cache_detail *detail, cache_revisit_request(h); if (rv == -EAGAIN) { set_bit(CACHE_NEGATIVE, &h->flags); - cache_fresh_locked(h, get_seconds()+CACHE_NEW_EXPIRY); + cache_fresh_locked(h, seconds_since_boot()+CACHE_NEW_EXPIRY); cache_fresh_unlocked(h, detail); rv = -ENOENT; } @@ -267,7 +268,8 @@ int cache_check(struct cache_detail *detail, } if (rv == -EAGAIN) { - if (cache_defer_req(rqstp, h) < 0) { + cache_defer_req(rqstp, h); + if (!test_bit(CACHE_PENDING, &h->flags)) { /* Request is not deferred */ rv = cache_is_valid(detail, h); if (rv == -EAGAIN) @@ -387,11 +389,11 @@ static int cache_clean(void) return -1; } current_detail = list_entry(next, struct cache_detail, others); - if (current_detail->nextcheck > get_seconds()) + if (current_detail->nextcheck > seconds_since_boot()) current_index = current_detail->hash_size; else { current_index = 0; - current_detail->nextcheck = get_seconds()+30*60; + current_detail->nextcheck = seconds_since_boot()+30*60; } } @@ -476,7 +478,7 @@ EXPORT_SYMBOL_GPL(cache_flush); void cache_purge(struct cache_detail *detail) { detail->flush_time = LONG_MAX; - detail->nextcheck = get_seconds(); + detail->nextcheck = seconds_since_boot(); cache_flush(); detail->flush_time = 1; } @@ -505,81 +507,155 @@ EXPORT_SYMBOL_GPL(cache_purge); static DEFINE_SPINLOCK(cache_defer_lock); static LIST_HEAD(cache_defer_list); -static struct list_head cache_defer_hash[DFR_HASHSIZE]; +static struct hlist_head cache_defer_hash[DFR_HASHSIZE]; static int cache_defer_cnt; -static int cache_defer_req(struct cache_req *req, struct cache_head *item) +static void __unhash_deferred_req(struct cache_deferred_req *dreq) +{ + hlist_del_init(&dreq->hash); + if (!list_empty(&dreq->recent)) { + list_del_init(&dreq->recent); + cache_defer_cnt--; + } +} + +static void __hash_deferred_req(struct cache_deferred_req *dreq, struct cache_head *item) { - struct cache_deferred_req *dreq, *discard; int hash = DFR_HASH(item); - if (cache_defer_cnt >= DFR_MAX) { - /* too much in the cache, randomly drop this one, - * or continue and drop the oldest below - */ - if (net_random()&1) - return -ENOMEM; - } - dreq = req->defer(req); - if (dreq == NULL) - return -ENOMEM; + INIT_LIST_HEAD(&dreq->recent); + hlist_add_head(&dreq->hash, &cache_defer_hash[hash]); +} + +static void setup_deferral(struct cache_deferred_req *dreq, + struct cache_head *item, + int count_me) +{ dreq->item = item; spin_lock(&cache_defer_lock); - list_add(&dreq->recent, &cache_defer_list); - - if (cache_defer_hash[hash].next == NULL) - INIT_LIST_HEAD(&cache_defer_hash[hash]); - list_add(&dreq->hash, &cache_defer_hash[hash]); + __hash_deferred_req(dreq, item); - /* it is in, now maybe clean up */ - discard = NULL; - if (++cache_defer_cnt > DFR_MAX) { - discard = list_entry(cache_defer_list.prev, - struct cache_deferred_req, recent); - list_del_init(&discard->recent); - list_del_init(&discard->hash); - cache_defer_cnt--; + if (count_me) { + cache_defer_cnt++; + list_add(&dreq->recent, &cache_defer_list); } + spin_unlock(&cache_defer_lock); +} + +struct thread_deferred_req { + struct cache_deferred_req handle; + struct completion completion; +}; + +static void cache_restart_thread(struct cache_deferred_req *dreq, int too_many) +{ + struct thread_deferred_req *dr = + container_of(dreq, struct thread_deferred_req, handle); + complete(&dr->completion); +} + +static void cache_wait_req(struct cache_req *req, struct cache_head *item) +{ + struct thread_deferred_req sleeper; + struct cache_deferred_req *dreq = &sleeper.handle; + + sleeper.completion = COMPLETION_INITIALIZER_ONSTACK(sleeper.completion); + dreq->revisit = cache_restart_thread; + + setup_deferral(dreq, item, 0); + + if (!test_bit(CACHE_PENDING, &item->flags) || + wait_for_completion_interruptible_timeout( + &sleeper.completion, req->thread_wait) <= 0) { + /* The completion wasn't completed, so we need + * to clean up + */ + spin_lock(&cache_defer_lock); + if (!hlist_unhashed(&sleeper.handle.hash)) { + __unhash_deferred_req(&sleeper.handle); + spin_unlock(&cache_defer_lock); + } else { + /* cache_revisit_request already removed + * this from the hash table, but hasn't + * called ->revisit yet. It will very soon + * and we need to wait for it. + */ + spin_unlock(&cache_defer_lock); + wait_for_completion(&sleeper.completion); + } + } +} + +static void cache_limit_defers(void) +{ + /* Make sure we haven't exceed the limit of allowed deferred + * requests. + */ + struct cache_deferred_req *discard = NULL; + + if (cache_defer_cnt <= DFR_MAX) + return; + + spin_lock(&cache_defer_lock); + + /* Consider removing either the first or the last */ + if (cache_defer_cnt > DFR_MAX) { + if (net_random() & 1) + discard = list_entry(cache_defer_list.next, + struct cache_deferred_req, recent); + else + discard = list_entry(cache_defer_list.prev, + struct cache_deferred_req, recent); + __unhash_deferred_req(discard); + } + spin_unlock(&cache_defer_lock); if (discard) - /* there was one too many */ discard->revisit(discard, 1); +} - if (!test_bit(CACHE_PENDING, &item->flags)) { - /* must have just been validated... */ - cache_revisit_request(item); - return -EAGAIN; +static void cache_defer_req(struct cache_req *req, struct cache_head *item) +{ + struct cache_deferred_req *dreq; + + if (req->thread_wait) { + cache_wait_req(req, item); + if (!test_bit(CACHE_PENDING, &item->flags)) + return; } - return 0; + dreq = req->defer(req); + if (dreq == NULL) + return; + setup_deferral(dreq, item, 1); + if (!test_bit(CACHE_PENDING, &item->flags)) + /* Bit could have been cleared before we managed to + * set up the deferral, so need to revisit just in case + */ + cache_revisit_request(item); + + cache_limit_defers(); } static void cache_revisit_request(struct cache_head *item) { struct cache_deferred_req *dreq; struct list_head pending; - - struct list_head *lp; + struct hlist_node *lp, *tmp; int hash = DFR_HASH(item); INIT_LIST_HEAD(&pending); spin_lock(&cache_defer_lock); - lp = cache_defer_hash[hash].next; - if (lp) { - while (lp != &cache_defer_hash[hash]) { - dreq = list_entry(lp, struct cache_deferred_req, hash); - lp = lp->next; - if (dreq->item == item) { - list_del_init(&dreq->hash); - list_move(&dreq->recent, &pending); - cache_defer_cnt--; - } + hlist_for_each_entry_safe(dreq, lp, tmp, &cache_defer_hash[hash], hash) + if (dreq->item == item) { + __unhash_deferred_req(dreq); + list_add(&dreq->recent, &pending); } - } + spin_unlock(&cache_defer_lock); while (!list_empty(&pending)) { @@ -600,9 +676,8 @@ void cache_clean_deferred(void *owner) list_for_each_entry_safe(dreq, tmp, &cache_defer_list, recent) { if (dreq->owner == owner) { - list_del_init(&dreq->hash); - list_move(&dreq->recent, &pending); - cache_defer_cnt--; + __unhash_deferred_req(dreq); + list_add(&dreq->recent, &pending); } } spin_unlock(&cache_defer_lock); @@ -901,7 +976,7 @@ static int cache_release(struct inode *inode, struct file *filp, filp->private_data = NULL; kfree(rp); - cd->last_close = get_seconds(); + cd->last_close = seconds_since_boot(); atomic_dec(&cd->readers); } module_put(cd->owner); @@ -1014,6 +1089,23 @@ static void warn_no_listener(struct cache_detail *detail) } } +static bool cache_listeners_exist(struct cache_detail *detail) +{ + if (atomic_read(&detail->readers)) + return true; + if (detail->last_close == 0) + /* This cache was never opened */ + return false; + if (detail->last_close < seconds_since_boot() - 30) + /* + * We allow for the possibility that someone might + * restart a userspace daemon without restarting the + * server; but after 30 seconds, we give up. + */ + return false; + return true; +} + /* * register an upcall request to user-space and queue it up for read() by the * upcall daemon. @@ -1032,10 +1124,9 @@ int sunrpc_cache_pipe_upcall(struct cache_detail *detail, struct cache_head *h, char *bp; int len; - if (atomic_read(&detail->readers) == 0 && - detail->last_close < get_seconds() - 30) { - warn_no_listener(detail); - return -EINVAL; + if (!cache_listeners_exist(detail)) { + warn_no_listener(detail); + return -EINVAL; } buf = kmalloc(PAGE_SIZE, GFP_KERNEL); @@ -1094,13 +1185,19 @@ int qword_get(char **bpp, char *dest, int bufsize) if (bp[0] == '\\' && bp[1] == 'x') { /* HEX STRING */ bp += 2; - while (isxdigit(bp[0]) && isxdigit(bp[1]) && len < bufsize) { - int byte = isdigit(*bp) ? *bp-'0' : toupper(*bp)-'A'+10; - bp++; - byte <<= 4; - byte |= isdigit(*bp) ? *bp-'0' : toupper(*bp)-'A'+10; - *dest++ = byte; - bp++; + while (len < bufsize) { + int h, l; + + h = hex_to_bin(bp[0]); + if (h < 0) + break; + + l = hex_to_bin(bp[1]); + if (l < 0) + break; + + *dest++ = (h << 4) | l; + bp += 2; len++; } } else { @@ -1218,7 +1315,8 @@ static int c_show(struct seq_file *m, void *p) ifdebug(CACHE) seq_printf(m, "# expiry=%ld refcnt=%d flags=%lx\n", - cp->expiry_time, atomic_read(&cp->ref.refcount), cp->flags); + convert_to_wallclock(cp->expiry_time), + atomic_read(&cp->ref.refcount), cp->flags); cache_get(cp); if (cache_check(cd, cp, NULL)) /* cache_check does a cache_put on failure */ @@ -1284,7 +1382,7 @@ static ssize_t read_flush(struct file *file, char __user *buf, unsigned long p = *ppos; size_t len; - sprintf(tbuf, "%lu\n", cd->flush_time); + sprintf(tbuf, "%lu\n", convert_to_wallclock(cd->flush_time)); len = strlen(tbuf); if (p >= len) return 0; @@ -1302,19 +1400,20 @@ static ssize_t write_flush(struct file *file, const char __user *buf, struct cache_detail *cd) { char tbuf[20]; - char *ep; - long flushtime; + char *bp, *ep; + if (*ppos || count > sizeof(tbuf)-1) return -EINVAL; if (copy_from_user(tbuf, buf, count)) return -EFAULT; tbuf[count] = 0; - flushtime = simple_strtoul(tbuf, &ep, 0); + simple_strtoul(tbuf, &ep, 0); if (*ep && *ep != '\n') return -EINVAL; - cd->flush_time = flushtime; - cd->nextcheck = get_seconds(); + bp = tbuf; + cd->flush_time = get_expiry(&bp); + cd->nextcheck = seconds_since_boot(); cache_flush(); *ppos += count; @@ -1438,8 +1537,10 @@ static const struct file_operations cache_flush_operations_procfs = { .llseek = no_llseek, }; -static void remove_cache_proc_entries(struct cache_detail *cd) +static void remove_cache_proc_entries(struct cache_detail *cd, struct net *net) { + struct sunrpc_net *sn; + if (cd->u.procfs.proc_ent == NULL) return; if (cd->u.procfs.flush_ent) @@ -1449,15 +1550,18 @@ static void remove_cache_proc_entries(struct cache_detail *cd) if (cd->u.procfs.content_ent) remove_proc_entry("content", cd->u.procfs.proc_ent); cd->u.procfs.proc_ent = NULL; - remove_proc_entry(cd->name, proc_net_rpc); + sn = net_generic(net, sunrpc_net_id); + remove_proc_entry(cd->name, sn->proc_net_rpc); } #ifdef CONFIG_PROC_FS -static int create_cache_proc_entries(struct cache_detail *cd) +static int create_cache_proc_entries(struct cache_detail *cd, struct net *net) { struct proc_dir_entry *p; + struct sunrpc_net *sn; - cd->u.procfs.proc_ent = proc_mkdir(cd->name, proc_net_rpc); + sn = net_generic(net, sunrpc_net_id); + cd->u.procfs.proc_ent = proc_mkdir(cd->name, sn->proc_net_rpc); if (cd->u.procfs.proc_ent == NULL) goto out_nomem; cd->u.procfs.channel_ent = NULL; @@ -1488,11 +1592,11 @@ static int create_cache_proc_entries(struct cache_detail *cd) } return 0; out_nomem: - remove_cache_proc_entries(cd); + remove_cache_proc_entries(cd, net); return -ENOMEM; } #else /* CONFIG_PROC_FS */ -static int create_cache_proc_entries(struct cache_detail *cd) +static int create_cache_proc_entries(struct cache_detail *cd, struct net *net) { return 0; } @@ -1503,23 +1607,33 @@ void __init cache_initialize(void) INIT_DELAYED_WORK_DEFERRABLE(&cache_cleaner, do_cache_clean); } -int cache_register(struct cache_detail *cd) +int cache_register_net(struct cache_detail *cd, struct net *net) { int ret; sunrpc_init_cache_detail(cd); - ret = create_cache_proc_entries(cd); + ret = create_cache_proc_entries(cd, net); if (ret) sunrpc_destroy_cache_detail(cd); return ret; } + +int cache_register(struct cache_detail *cd) +{ + return cache_register_net(cd, &init_net); +} EXPORT_SYMBOL_GPL(cache_register); -void cache_unregister(struct cache_detail *cd) +void cache_unregister_net(struct cache_detail *cd, struct net *net) { - remove_cache_proc_entries(cd); + remove_cache_proc_entries(cd, net); sunrpc_destroy_cache_detail(cd); } + +void cache_unregister(struct cache_detail *cd) +{ + cache_unregister_net(cd, &init_net); +} EXPORT_SYMBOL_GPL(cache_unregister); static ssize_t cache_read_pipefs(struct file *filp, char __user *buf, diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index fa5549079d79..9dab9573be41 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -284,6 +284,7 @@ struct rpc_clnt *rpc_create(struct rpc_create_args *args) struct rpc_xprt *xprt; struct rpc_clnt *clnt; struct xprt_create xprtargs = { + .net = args->net, .ident = args->protocol, .srcaddr = args->saddress, .dstaddr = args->address, @@ -1675,7 +1676,7 @@ rpc_verify_header(struct rpc_task *task) rpcauth_invalcred(task); /* Ensure we obtain a new XID! */ xprt_release(task); - task->tk_action = call_refresh; + task->tk_action = call_reserve; goto out_retry; case RPC_AUTH_BADCRED: case RPC_AUTH_BADVERF: diff --git a/net/sunrpc/netns.h b/net/sunrpc/netns.h new file mode 100644 index 000000000000..d013bf211cae --- /dev/null +++ b/net/sunrpc/netns.h @@ -0,0 +1,19 @@ +#ifndef __SUNRPC_NETNS_H__ +#define __SUNRPC_NETNS_H__ + +#include <net/net_namespace.h> +#include <net/netns/generic.h> + +struct cache_detail; + +struct sunrpc_net { + struct proc_dir_entry *proc_net_rpc; + struct cache_detail *ip_map_cache; +}; + +extern int sunrpc_net_id; + +int ip_map_cache_create(struct net *); +void ip_map_cache_destroy(struct net *); + +#endif diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c index 52f252432144..10a17a37ec4e 100644 --- a/net/sunrpc/rpc_pipe.c +++ b/net/sunrpc/rpc_pipe.c @@ -28,7 +28,7 @@ #include <linux/sunrpc/rpc_pipe_fs.h> #include <linux/sunrpc/cache.h> -static struct vfsmount *rpc_mount __read_mostly; +static struct vfsmount *rpc_mnt __read_mostly; static int rpc_mount_count; static struct file_system_type rpc_pipe_fs_type; @@ -417,16 +417,16 @@ struct vfsmount *rpc_get_mount(void) { int err; - err = simple_pin_fs(&rpc_pipe_fs_type, &rpc_mount, &rpc_mount_count); + err = simple_pin_fs(&rpc_pipe_fs_type, &rpc_mnt, &rpc_mount_count); if (err != 0) return ERR_PTR(err); - return rpc_mount; + return rpc_mnt; } EXPORT_SYMBOL_GPL(rpc_get_mount); void rpc_put_mount(void) { - simple_release_fs(&rpc_mount, &rpc_mount_count); + simple_release_fs(&rpc_mnt, &rpc_mount_count); } EXPORT_SYMBOL_GPL(rpc_put_mount); @@ -445,6 +445,7 @@ rpc_get_inode(struct super_block *sb, umode_t mode) struct inode *inode = new_inode(sb); if (!inode) return NULL; + inode->i_ino = get_next_ino(); inode->i_mode = mode; inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; switch(mode & S_IFMT) { @@ -1017,17 +1018,17 @@ rpc_fill_super(struct super_block *sb, void *data, int silent) return 0; } -static int -rpc_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data, struct vfsmount *mnt) +static struct dentry * +rpc_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) { - return get_sb_single(fs_type, flags, data, rpc_fill_super, mnt); + return mount_single(fs_type, flags, data, rpc_fill_super); } static struct file_system_type rpc_pipe_fs_type = { .owner = THIS_MODULE, .name = "rpc_pipefs", - .get_sb = rpc_get_sb, + .mount = rpc_mount, .kill_sb = kill_litter_super, }; diff --git a/net/sunrpc/rpcb_clnt.c b/net/sunrpc/rpcb_clnt.c index dac219a56ae1..fa6d7ca2c851 100644 --- a/net/sunrpc/rpcb_clnt.c +++ b/net/sunrpc/rpcb_clnt.c @@ -177,6 +177,7 @@ static DEFINE_MUTEX(rpcb_create_local_mutex); static int rpcb_create_local(void) { struct rpc_create_args args = { + .net = &init_net, .protocol = XPRT_TRANSPORT_TCP, .address = (struct sockaddr *)&rpcb_inaddr_loopback, .addrsize = sizeof(rpcb_inaddr_loopback), @@ -211,8 +212,9 @@ static int rpcb_create_local(void) */ clnt4 = rpc_bind_new_program(clnt, &rpcb_program, RPCBVERS_4); if (IS_ERR(clnt4)) { - dprintk("RPC: failed to create local rpcbind v4 " - "cleint (errno %ld).\n", PTR_ERR(clnt4)); + dprintk("RPC: failed to bind second program to " + "rpcbind v4 client (errno %ld).\n", + PTR_ERR(clnt4)); clnt4 = NULL; } @@ -228,6 +230,7 @@ static struct rpc_clnt *rpcb_create(char *hostname, struct sockaddr *srvaddr, size_t salen, int proto, u32 version) { struct rpc_create_args args = { + .net = &init_net, .protocol = proto, .address = srvaddr, .addrsize = salen, @@ -247,7 +250,7 @@ static struct rpc_clnt *rpcb_create(char *hostname, struct sockaddr *srvaddr, ((struct sockaddr_in6 *)srvaddr)->sin6_port = htons(RPCBIND_PORT); break; default: - return NULL; + return ERR_PTR(-EAFNOSUPPORT); } return rpc_create(&args); @@ -475,57 +478,6 @@ int rpcb_v4_register(const u32 program, const u32 version, return -EAFNOSUPPORT; } -/** - * rpcb_getport_sync - obtain the port for an RPC service on a given host - * @sin: address of remote peer - * @prog: RPC program number to bind - * @vers: RPC version number to bind - * @prot: transport protocol to use to make this request - * - * Return value is the requested advertised port number, - * or a negative errno value. - * - * Called from outside the RPC client in a synchronous task context. - * Uses default timeout parameters specified by underlying transport. - * - * XXX: Needs to support IPv6 - */ -int rpcb_getport_sync(struct sockaddr_in *sin, u32 prog, u32 vers, int prot) -{ - struct rpcbind_args map = { - .r_prog = prog, - .r_vers = vers, - .r_prot = prot, - .r_port = 0, - }; - struct rpc_message msg = { - .rpc_proc = &rpcb_procedures2[RPCBPROC_GETPORT], - .rpc_argp = &map, - .rpc_resp = &map, - }; - struct rpc_clnt *rpcb_clnt; - int status; - - dprintk("RPC: %s(%pI4, %u, %u, %d)\n", - __func__, &sin->sin_addr.s_addr, prog, vers, prot); - - rpcb_clnt = rpcb_create(NULL, (struct sockaddr *)sin, - sizeof(*sin), prot, RPCBVERS_2); - if (IS_ERR(rpcb_clnt)) - return PTR_ERR(rpcb_clnt); - - status = rpc_call_sync(rpcb_clnt, &msg, 0); - rpc_shutdown_client(rpcb_clnt); - - if (status >= 0) { - if (map.r_port != 0) - return map.r_port; - status = -EACCES; - } - return status; -} -EXPORT_SYMBOL_GPL(rpcb_getport_sync); - static struct rpc_task *rpcb_call_async(struct rpc_clnt *rpcb_clnt, struct rpcbind_args *map, struct rpc_procinfo *proc) { struct rpc_message msg = { diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c index aa5dbda6608c..243fc09b164e 100644 --- a/net/sunrpc/sched.c +++ b/net/sunrpc/sched.c @@ -908,7 +908,7 @@ static int rpciod_start(void) * Create the rpciod thread and wait for it to start. */ dprintk("RPC: creating workqueue rpciod\n"); - wq = create_workqueue("rpciod"); + wq = alloc_workqueue("rpciod", WQ_RESCUER, 0); rpciod_workqueue = wq; return rpciod_workqueue != NULL; } diff --git a/net/sunrpc/stats.c b/net/sunrpc/stats.c index ea1046f3f9a3..f71a73107ae9 100644 --- a/net/sunrpc/stats.c +++ b/net/sunrpc/stats.c @@ -22,11 +22,10 @@ #include <linux/sunrpc/clnt.h> #include <linux/sunrpc/svcsock.h> #include <linux/sunrpc/metrics.h> -#include <net/net_namespace.h> -#define RPCDBG_FACILITY RPCDBG_MISC +#include "netns.h" -struct proc_dir_entry *proc_net_rpc = NULL; +#define RPCDBG_FACILITY RPCDBG_MISC /* * Get RPC client stats @@ -218,10 +217,11 @@ EXPORT_SYMBOL_GPL(rpc_print_iostats); static inline struct proc_dir_entry * do_register(const char *name, void *data, const struct file_operations *fops) { - rpc_proc_init(); - dprintk("RPC: registering /proc/net/rpc/%s\n", name); + struct sunrpc_net *sn; - return proc_create_data(name, 0, proc_net_rpc, fops, data); + dprintk("RPC: registering /proc/net/rpc/%s\n", name); + sn = net_generic(&init_net, sunrpc_net_id); + return proc_create_data(name, 0, sn->proc_net_rpc, fops, data); } struct proc_dir_entry * @@ -234,7 +234,10 @@ EXPORT_SYMBOL_GPL(rpc_proc_register); void rpc_proc_unregister(const char *name) { - remove_proc_entry(name, proc_net_rpc); + struct sunrpc_net *sn; + + sn = net_generic(&init_net, sunrpc_net_id); + remove_proc_entry(name, sn->proc_net_rpc); } EXPORT_SYMBOL_GPL(rpc_proc_unregister); @@ -248,25 +251,29 @@ EXPORT_SYMBOL_GPL(svc_proc_register); void svc_proc_unregister(const char *name) { - remove_proc_entry(name, proc_net_rpc); + struct sunrpc_net *sn; + + sn = net_generic(&init_net, sunrpc_net_id); + remove_proc_entry(name, sn->proc_net_rpc); } EXPORT_SYMBOL_GPL(svc_proc_unregister); -void -rpc_proc_init(void) +int rpc_proc_init(struct net *net) { + struct sunrpc_net *sn; + dprintk("RPC: registering /proc/net/rpc\n"); - if (!proc_net_rpc) - proc_net_rpc = proc_mkdir("rpc", init_net.proc_net); + sn = net_generic(net, sunrpc_net_id); + sn->proc_net_rpc = proc_mkdir("rpc", net->proc_net); + if (sn->proc_net_rpc == NULL) + return -ENOMEM; + + return 0; } -void -rpc_proc_exit(void) +void rpc_proc_exit(struct net *net) { dprintk("RPC: unregistering /proc/net/rpc\n"); - if (proc_net_rpc) { - proc_net_rpc = NULL; - remove_proc_entry("rpc", init_net.proc_net); - } + remove_proc_entry("rpc", net->proc_net); } diff --git a/net/sunrpc/sunrpc_syms.c b/net/sunrpc/sunrpc_syms.c index c0d085013a2b..9d0809160994 100644 --- a/net/sunrpc/sunrpc_syms.c +++ b/net/sunrpc/sunrpc_syms.c @@ -22,7 +22,44 @@ #include <linux/sunrpc/rpc_pipe_fs.h> #include <linux/sunrpc/xprtsock.h> -extern struct cache_detail ip_map_cache, unix_gid_cache; +#include "netns.h" + +int sunrpc_net_id; + +static __net_init int sunrpc_init_net(struct net *net) +{ + int err; + + err = rpc_proc_init(net); + if (err) + goto err_proc; + + err = ip_map_cache_create(net); + if (err) + goto err_ipmap; + + return 0; + +err_ipmap: + rpc_proc_exit(net); +err_proc: + return err; +} + +static __net_exit void sunrpc_exit_net(struct net *net) +{ + ip_map_cache_destroy(net); + rpc_proc_exit(net); +} + +static struct pernet_operations sunrpc_net_ops = { + .init = sunrpc_init_net, + .exit = sunrpc_exit_net, + .id = &sunrpc_net_id, + .size = sizeof(struct sunrpc_net), +}; + +extern struct cache_detail unix_gid_cache; extern void cleanup_rpcb_clnt(void); @@ -38,18 +75,22 @@ init_sunrpc(void) err = rpcauth_init_module(); if (err) goto out3; + + cache_initialize(); + + err = register_pernet_subsys(&sunrpc_net_ops); + if (err) + goto out4; #ifdef RPC_DEBUG rpc_register_sysctl(); #endif -#ifdef CONFIG_PROC_FS - rpc_proc_init(); -#endif - cache_initialize(); - cache_register(&ip_map_cache); cache_register(&unix_gid_cache); svc_init_xprt_sock(); /* svc sock transport */ init_socket_xprt(); /* clnt sock transport */ return 0; + +out4: + rpcauth_remove_module(); out3: rpc_destroy_mempool(); out2: @@ -67,14 +108,11 @@ cleanup_sunrpc(void) svc_cleanup_xprt_sock(); unregister_rpc_pipefs(); rpc_destroy_mempool(); - cache_unregister(&ip_map_cache); cache_unregister(&unix_gid_cache); + unregister_pernet_subsys(&sunrpc_net_ops); #ifdef RPC_DEBUG rpc_unregister_sysctl(); #endif -#ifdef CONFIG_PROC_FS - rpc_proc_exit(); -#endif rcu_barrier(); /* Wait for completion of call_rcu()'s */ } MODULE_LICENSE("GPL"); diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index d9017d64597e..6359c42c4941 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@ -1055,6 +1055,9 @@ svc_process_common(struct svc_rqst *rqstp, struct kvec *argv, struct kvec *resv) goto err_bad; case SVC_DENIED: goto err_bad_auth; + case SVC_CLOSE: + if (test_bit(XPT_TEMP, &rqstp->rq_xprt->xpt_flags)) + svc_close_xprt(rqstp->rq_xprt); case SVC_DROP: goto dropit; case SVC_COMPLETE: diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c index cbc084939dd8..c82fe739fbdc 100644 --- a/net/sunrpc/svc_xprt.c +++ b/net/sunrpc/svc_xprt.c @@ -100,16 +100,14 @@ EXPORT_SYMBOL_GPL(svc_unreg_xprt_class); */ int svc_print_xprts(char *buf, int maxlen) { - struct list_head *le; + struct svc_xprt_class *xcl; char tmpstr[80]; int len = 0; buf[0] = '\0'; spin_lock(&svc_xprt_class_lock); - list_for_each(le, &svc_xprt_class_list) { + list_for_each_entry(xcl, &svc_xprt_class_list, xcl_list) { int slen; - struct svc_xprt_class *xcl = - list_entry(le, struct svc_xprt_class, xcl_list); sprintf(tmpstr, "%s %d\n", xcl->xcl_name, xcl->xcl_max_payload); slen = strlen(tmpstr); @@ -128,9 +126,9 @@ static void svc_xprt_free(struct kref *kref) struct svc_xprt *xprt = container_of(kref, struct svc_xprt, xpt_ref); struct module *owner = xprt->xpt_class->xcl_owner; - if (test_bit(XPT_CACHE_AUTH, &xprt->xpt_flags) && - xprt->xpt_auth_cache != NULL) - svcauth_unix_info_release(xprt->xpt_auth_cache); + if (test_bit(XPT_CACHE_AUTH, &xprt->xpt_flags)) + svcauth_unix_info_release(xprt); + put_net(xprt->xpt_net); xprt->xpt_ops->xpo_free(xprt); module_put(owner); } @@ -156,15 +154,18 @@ void svc_xprt_init(struct svc_xprt_class *xcl, struct svc_xprt *xprt, INIT_LIST_HEAD(&xprt->xpt_list); INIT_LIST_HEAD(&xprt->xpt_ready); INIT_LIST_HEAD(&xprt->xpt_deferred); + INIT_LIST_HEAD(&xprt->xpt_users); mutex_init(&xprt->xpt_mutex); spin_lock_init(&xprt->xpt_lock); set_bit(XPT_BUSY, &xprt->xpt_flags); rpc_init_wait_queue(&xprt->xpt_bc_pending, "xpt_bc_pending"); + xprt->xpt_net = get_net(&init_net); } EXPORT_SYMBOL_GPL(svc_xprt_init); static struct svc_xprt *__svc_xpo_create(struct svc_xprt_class *xcl, struct svc_serv *serv, + struct net *net, const int family, const unsigned short port, int flags) @@ -199,12 +200,12 @@ static struct svc_xprt *__svc_xpo_create(struct svc_xprt_class *xcl, return ERR_PTR(-EAFNOSUPPORT); } - return xcl->xcl_ops->xpo_create(serv, sap, len, flags); + return xcl->xcl_ops->xpo_create(serv, net, sap, len, flags); } int svc_create_xprt(struct svc_serv *serv, const char *xprt_name, - const int family, const unsigned short port, - int flags) + struct net *net, const int family, + const unsigned short port, int flags) { struct svc_xprt_class *xcl; @@ -220,7 +221,7 @@ int svc_create_xprt(struct svc_serv *serv, const char *xprt_name, goto err; spin_unlock(&svc_xprt_class_lock); - newxprt = __svc_xpo_create(xcl, serv, family, port, flags); + newxprt = __svc_xpo_create(xcl, serv, net, family, port, flags); if (IS_ERR(newxprt)) { module_put(xcl->xcl_owner); return PTR_ERR(newxprt); @@ -329,12 +330,6 @@ void svc_xprt_enqueue(struct svc_xprt *xprt) "svc_xprt_enqueue: " "threads and transports both waiting??\n"); - if (test_bit(XPT_DEAD, &xprt->xpt_flags)) { - /* Don't enqueue dead transports */ - dprintk("svc: transport %p is dead, not enqueued\n", xprt); - goto out_unlock; - } - pool->sp_stats.packets++; /* Mark transport as busy. It will remain in this state until @@ -651,6 +646,11 @@ int svc_recv(struct svc_rqst *rqstp, long timeout) if (signalled() || kthread_should_stop()) return -EINTR; + /* Normally we will wait up to 5 seconds for any required + * cache information to be provided. + */ + rqstp->rq_chandle.thread_wait = 5*HZ; + spin_lock_bh(&pool->sp_lock); xprt = svc_xprt_dequeue(pool); if (xprt) { @@ -658,6 +658,12 @@ int svc_recv(struct svc_rqst *rqstp, long timeout) svc_xprt_get(xprt); rqstp->rq_reserved = serv->sv_max_mesg; atomic_add(rqstp->rq_reserved, &xprt->xpt_reserved); + + /* As there is a shortage of threads and this request + * had to be queued, don't allow the thread to wait so + * long for cache updates. + */ + rqstp->rq_chandle.thread_wait = 1*HZ; } else { /* No data pending. Go to sleep */ svc_thread_enqueue(pool, rqstp); @@ -868,6 +874,19 @@ static void svc_age_temp_xprts(unsigned long closure) mod_timer(&serv->sv_temptimer, jiffies + svc_conn_age_period * HZ); } +static void call_xpt_users(struct svc_xprt *xprt) +{ + struct svc_xpt_user *u; + + spin_lock(&xprt->xpt_lock); + while (!list_empty(&xprt->xpt_users)) { + u = list_first_entry(&xprt->xpt_users, struct svc_xpt_user, list); + list_del(&u->list); + u->callback(u); + } + spin_unlock(&xprt->xpt_lock); +} + /* * Remove a dead transport */ @@ -878,7 +897,7 @@ void svc_delete_xprt(struct svc_xprt *xprt) /* Only do this once */ if (test_and_set_bit(XPT_DEAD, &xprt->xpt_flags)) - return; + BUG(); dprintk("svc: svc_delete_xprt(%p)\n", xprt); xprt->xpt_ops->xpo_detach(xprt); @@ -900,6 +919,7 @@ void svc_delete_xprt(struct svc_xprt *xprt) while ((dr = svc_deferred_dequeue(xprt)) != NULL) kfree(dr); + call_xpt_users(xprt); svc_xprt_put(xprt); } @@ -910,10 +930,7 @@ void svc_close_xprt(struct svc_xprt *xprt) /* someone else will have to effect the close */ return; - svc_xprt_get(xprt); svc_delete_xprt(xprt); - clear_bit(XPT_BUSY, &xprt->xpt_flags); - svc_xprt_put(xprt); } EXPORT_SYMBOL_GPL(svc_close_xprt); diff --git a/net/sunrpc/svcauth_unix.c b/net/sunrpc/svcauth_unix.c index 207311610988..560677d187f1 100644 --- a/net/sunrpc/svcauth_unix.c +++ b/net/sunrpc/svcauth_unix.c @@ -18,6 +18,8 @@ #include <linux/sunrpc/clnt.h> +#include "netns.h" + /* * AUTHUNIX and AUTHNULL credentials are both handled here. * AUTHNULL is treated just like AUTHUNIX except that the uid/gid @@ -92,7 +94,6 @@ struct ip_map { struct unix_domain *m_client; int m_add_change; }; -static struct cache_head *ip_table[IP_HASHMAX]; static void ip_map_put(struct kref *kref) { @@ -178,8 +179,8 @@ static int ip_map_upcall(struct cache_detail *cd, struct cache_head *h) return sunrpc_cache_pipe_upcall(cd, h, ip_map_request); } -static struct ip_map *ip_map_lookup(char *class, struct in6_addr *addr); -static int ip_map_update(struct ip_map *ipm, struct unix_domain *udom, time_t expiry); +static struct ip_map *__ip_map_lookup(struct cache_detail *cd, char *class, struct in6_addr *addr); +static int __ip_map_update(struct cache_detail *cd, struct ip_map *ipm, struct unix_domain *udom, time_t expiry); static int ip_map_parse(struct cache_detail *cd, char *mesg, int mlen) @@ -219,10 +220,9 @@ static int ip_map_parse(struct cache_detail *cd, switch (address.sa.sa_family) { case AF_INET: /* Form a mapped IPv4 address in sin6 */ - memset(&sin6, 0, sizeof(sin6)); sin6.sin6_family = AF_INET6; - sin6.sin6_addr.s6_addr32[2] = htonl(0xffff); - sin6.sin6_addr.s6_addr32[3] = address.s4.sin_addr.s_addr; + ipv6_addr_set_v4mapped(address.s4.sin_addr.s_addr, + &sin6.sin6_addr); break; #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) case AF_INET6: @@ -249,9 +249,9 @@ static int ip_map_parse(struct cache_detail *cd, dom = NULL; /* IPv6 scope IDs are ignored for now */ - ipmp = ip_map_lookup(class, &sin6.sin6_addr); + ipmp = __ip_map_lookup(cd, class, &sin6.sin6_addr); if (ipmp) { - err = ip_map_update(ipmp, + err = __ip_map_update(cd, ipmp, container_of(dom, struct unix_domain, h), expiry); } else @@ -294,29 +294,15 @@ static int ip_map_show(struct seq_file *m, } -struct cache_detail ip_map_cache = { - .owner = THIS_MODULE, - .hash_size = IP_HASHMAX, - .hash_table = ip_table, - .name = "auth.unix.ip", - .cache_put = ip_map_put, - .cache_upcall = ip_map_upcall, - .cache_parse = ip_map_parse, - .cache_show = ip_map_show, - .match = ip_map_match, - .init = ip_map_init, - .update = update, - .alloc = ip_map_alloc, -}; - -static struct ip_map *ip_map_lookup(char *class, struct in6_addr *addr) +static struct ip_map *__ip_map_lookup(struct cache_detail *cd, char *class, + struct in6_addr *addr) { struct ip_map ip; struct cache_head *ch; strcpy(ip.m_class, class); ipv6_addr_copy(&ip.m_addr, addr); - ch = sunrpc_cache_lookup(&ip_map_cache, &ip.h, + ch = sunrpc_cache_lookup(cd, &ip.h, hash_str(class, IP_HASHBITS) ^ hash_ip6(*addr)); @@ -326,7 +312,17 @@ static struct ip_map *ip_map_lookup(char *class, struct in6_addr *addr) return NULL; } -static int ip_map_update(struct ip_map *ipm, struct unix_domain *udom, time_t expiry) +static inline struct ip_map *ip_map_lookup(struct net *net, char *class, + struct in6_addr *addr) +{ + struct sunrpc_net *sn; + + sn = net_generic(net, sunrpc_net_id); + return __ip_map_lookup(sn->ip_map_cache, class, addr); +} + +static int __ip_map_update(struct cache_detail *cd, struct ip_map *ipm, + struct unix_domain *udom, time_t expiry) { struct ip_map ip; struct cache_head *ch; @@ -344,17 +340,25 @@ static int ip_map_update(struct ip_map *ipm, struct unix_domain *udom, time_t ex ip.m_add_change++; } ip.h.expiry_time = expiry; - ch = sunrpc_cache_update(&ip_map_cache, - &ip.h, &ipm->h, + ch = sunrpc_cache_update(cd, &ip.h, &ipm->h, hash_str(ipm->m_class, IP_HASHBITS) ^ hash_ip6(ipm->m_addr)); if (!ch) return -ENOMEM; - cache_put(ch, &ip_map_cache); + cache_put(ch, cd); return 0; } -int auth_unix_add_addr(struct in6_addr *addr, struct auth_domain *dom) +static inline int ip_map_update(struct net *net, struct ip_map *ipm, + struct unix_domain *udom, time_t expiry) +{ + struct sunrpc_net *sn; + + sn = net_generic(net, sunrpc_net_id); + return __ip_map_update(sn->ip_map_cache, ipm, udom, expiry); +} + +int auth_unix_add_addr(struct net *net, struct in6_addr *addr, struct auth_domain *dom) { struct unix_domain *udom; struct ip_map *ipmp; @@ -362,10 +366,10 @@ int auth_unix_add_addr(struct in6_addr *addr, struct auth_domain *dom) if (dom->flavour != &svcauth_unix) return -EINVAL; udom = container_of(dom, struct unix_domain, h); - ipmp = ip_map_lookup("nfsd", addr); + ipmp = ip_map_lookup(net, "nfsd", addr); if (ipmp) - return ip_map_update(ipmp, udom, NEVER); + return ip_map_update(net, ipmp, udom, NEVER); else return -ENOMEM; } @@ -383,16 +387,18 @@ int auth_unix_forget_old(struct auth_domain *dom) } EXPORT_SYMBOL_GPL(auth_unix_forget_old); -struct auth_domain *auth_unix_lookup(struct in6_addr *addr) +struct auth_domain *auth_unix_lookup(struct net *net, struct in6_addr *addr) { struct ip_map *ipm; struct auth_domain *rv; + struct sunrpc_net *sn; - ipm = ip_map_lookup("nfsd", addr); + sn = net_generic(net, sunrpc_net_id); + ipm = ip_map_lookup(net, "nfsd", addr); if (!ipm) return NULL; - if (cache_check(&ip_map_cache, &ipm->h, NULL)) + if (cache_check(sn->ip_map_cache, &ipm->h, NULL)) return NULL; if ((ipm->m_client->addr_changes - ipm->m_add_change) >0) { @@ -403,22 +409,29 @@ struct auth_domain *auth_unix_lookup(struct in6_addr *addr) rv = &ipm->m_client->h; kref_get(&rv->ref); } - cache_put(&ipm->h, &ip_map_cache); + cache_put(&ipm->h, sn->ip_map_cache); return rv; } EXPORT_SYMBOL_GPL(auth_unix_lookup); void svcauth_unix_purge(void) { - cache_purge(&ip_map_cache); + struct net *net; + + for_each_net(net) { + struct sunrpc_net *sn; + + sn = net_generic(net, sunrpc_net_id); + cache_purge(sn->ip_map_cache); + } } EXPORT_SYMBOL_GPL(svcauth_unix_purge); static inline struct ip_map * -ip_map_cached_get(struct svc_rqst *rqstp) +ip_map_cached_get(struct svc_xprt *xprt) { struct ip_map *ipm = NULL; - struct svc_xprt *xprt = rqstp->rq_xprt; + struct sunrpc_net *sn; if (test_bit(XPT_CACHE_AUTH, &xprt->xpt_flags)) { spin_lock(&xprt->xpt_lock); @@ -430,9 +443,10 @@ ip_map_cached_get(struct svc_rqst *rqstp) * remembered, e.g. by a second mount from the * same IP address. */ + sn = net_generic(xprt->xpt_net, sunrpc_net_id); xprt->xpt_auth_cache = NULL; spin_unlock(&xprt->xpt_lock); - cache_put(&ipm->h, &ip_map_cache); + cache_put(&ipm->h, sn->ip_map_cache); return NULL; } cache_get(&ipm->h); @@ -443,10 +457,8 @@ ip_map_cached_get(struct svc_rqst *rqstp) } static inline void -ip_map_cached_put(struct svc_rqst *rqstp, struct ip_map *ipm) +ip_map_cached_put(struct svc_xprt *xprt, struct ip_map *ipm) { - struct svc_xprt *xprt = rqstp->rq_xprt; - if (test_bit(XPT_CACHE_AUTH, &xprt->xpt_flags)) { spin_lock(&xprt->xpt_lock); if (xprt->xpt_auth_cache == NULL) { @@ -456,15 +468,26 @@ ip_map_cached_put(struct svc_rqst *rqstp, struct ip_map *ipm) } spin_unlock(&xprt->xpt_lock); } - if (ipm) - cache_put(&ipm->h, &ip_map_cache); + if (ipm) { + struct sunrpc_net *sn; + + sn = net_generic(xprt->xpt_net, sunrpc_net_id); + cache_put(&ipm->h, sn->ip_map_cache); + } } void -svcauth_unix_info_release(void *info) +svcauth_unix_info_release(struct svc_xprt *xpt) { - struct ip_map *ipm = info; - cache_put(&ipm->h, &ip_map_cache); + struct ip_map *ipm; + + ipm = xpt->xpt_auth_cache; + if (ipm != NULL) { + struct sunrpc_net *sn; + + sn = net_generic(xpt->xpt_net, sunrpc_net_id); + cache_put(&ipm->h, sn->ip_map_cache); + } } /**************************************************************************** @@ -674,6 +697,8 @@ static struct group_info *unix_gid_find(uid_t uid, struct svc_rqst *rqstp) switch (ret) { case -ENOENT: return ERR_PTR(-ENOENT); + case -ETIMEDOUT: + return ERR_PTR(-ESHUTDOWN); case 0: gi = get_group_info(ug->gi); cache_put(&ug->h, &unix_gid_cache); @@ -691,6 +716,9 @@ svcauth_unix_set_client(struct svc_rqst *rqstp) struct ip_map *ipm; struct group_info *gi; struct svc_cred *cred = &rqstp->rq_cred; + struct svc_xprt *xprt = rqstp->rq_xprt; + struct net *net = xprt->xpt_net; + struct sunrpc_net *sn = net_generic(net, sunrpc_net_id); switch (rqstp->rq_addr.ss_family) { case AF_INET: @@ -709,26 +737,27 @@ svcauth_unix_set_client(struct svc_rqst *rqstp) if (rqstp->rq_proc == 0) return SVC_OK; - ipm = ip_map_cached_get(rqstp); + ipm = ip_map_cached_get(xprt); if (ipm == NULL) - ipm = ip_map_lookup(rqstp->rq_server->sv_program->pg_class, + ipm = __ip_map_lookup(sn->ip_map_cache, rqstp->rq_server->sv_program->pg_class, &sin6->sin6_addr); if (ipm == NULL) return SVC_DENIED; - switch (cache_check(&ip_map_cache, &ipm->h, &rqstp->rq_chandle)) { + switch (cache_check(sn->ip_map_cache, &ipm->h, &rqstp->rq_chandle)) { default: BUG(); - case -EAGAIN: case -ETIMEDOUT: + return SVC_CLOSE; + case -EAGAIN: return SVC_DROP; case -ENOENT: return SVC_DENIED; case 0: rqstp->rq_client = &ipm->m_client->h; kref_get(&rqstp->rq_client->ref); - ip_map_cached_put(rqstp, ipm); + ip_map_cached_put(xprt, ipm); break; } @@ -736,6 +765,8 @@ svcauth_unix_set_client(struct svc_rqst *rqstp) switch (PTR_ERR(gi)) { case -EAGAIN: return SVC_DROP; + case -ESHUTDOWN: + return SVC_CLOSE; case -ENOENT: break; default: @@ -776,7 +807,7 @@ svcauth_null_accept(struct svc_rqst *rqstp, __be32 *authp) cred->cr_gid = (gid_t) -1; cred->cr_group_info = groups_alloc(0); if (cred->cr_group_info == NULL) - return SVC_DROP; /* kmalloc failure - client must retry */ + return SVC_CLOSE; /* kmalloc failure - client must retry */ /* Put NULL verifier */ svc_putnl(resv, RPC_AUTH_NULL); @@ -840,7 +871,7 @@ svcauth_unix_accept(struct svc_rqst *rqstp, __be32 *authp) goto badcred; cred->cr_group_info = groups_alloc(slen); if (cred->cr_group_info == NULL) - return SVC_DROP; + return SVC_CLOSE; for (i = 0; i < slen; i++) GROUP_AT(cred->cr_group_info, i) = svc_getnl(argv); if (svc_getu32(argv) != htonl(RPC_AUTH_NULL) || svc_getu32(argv) != 0) { @@ -886,3 +917,56 @@ struct auth_ops svcauth_unix = { .set_client = svcauth_unix_set_client, }; +int ip_map_cache_create(struct net *net) +{ + int err = -ENOMEM; + struct cache_detail *cd; + struct cache_head **tbl; + struct sunrpc_net *sn = net_generic(net, sunrpc_net_id); + + cd = kzalloc(sizeof(struct cache_detail), GFP_KERNEL); + if (cd == NULL) + goto err_cd; + + tbl = kzalloc(IP_HASHMAX * sizeof(struct cache_head *), GFP_KERNEL); + if (tbl == NULL) + goto err_tbl; + + cd->owner = THIS_MODULE, + cd->hash_size = IP_HASHMAX, + cd->hash_table = tbl, + cd->name = "auth.unix.ip", + cd->cache_put = ip_map_put, + cd->cache_upcall = ip_map_upcall, + cd->cache_parse = ip_map_parse, + cd->cache_show = ip_map_show, + cd->match = ip_map_match, + cd->init = ip_map_init, + cd->update = update, + cd->alloc = ip_map_alloc, + + err = cache_register_net(cd, net); + if (err) + goto err_reg; + + sn->ip_map_cache = cd; + return 0; + +err_reg: + kfree(tbl); +err_tbl: + kfree(cd); +err_cd: + return err; +} + +void ip_map_cache_destroy(struct net *net) +{ + struct sunrpc_net *sn; + + sn = net_generic(net, sunrpc_net_id); + cache_purge(sn->ip_map_cache); + cache_unregister_net(sn->ip_map_cache, net); + kfree(sn->ip_map_cache->hash_table); + kfree(sn->ip_map_cache); +} diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index 7e534dd09077..07919e16be3e 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -64,7 +64,8 @@ static void svc_tcp_sock_detach(struct svc_xprt *); static void svc_sock_free(struct svc_xprt *); static struct svc_xprt *svc_create_socket(struct svc_serv *, int, - struct sockaddr *, int, int); + struct net *, struct sockaddr *, + int, int); #ifdef CONFIG_DEBUG_LOCK_ALLOC static struct lock_class_key svc_key[2]; static struct lock_class_key svc_slock_key[2]; @@ -657,10 +658,11 @@ static struct svc_xprt *svc_udp_accept(struct svc_xprt *xprt) } static struct svc_xprt *svc_udp_create(struct svc_serv *serv, + struct net *net, struct sockaddr *sa, int salen, int flags) { - return svc_create_socket(serv, IPPROTO_UDP, sa, salen, flags); + return svc_create_socket(serv, IPPROTO_UDP, net, sa, salen, flags); } static struct svc_xprt_ops svc_udp_ops = { @@ -1133,9 +1135,6 @@ static int svc_tcp_sendto(struct svc_rqst *rqstp) reclen = htonl(0x80000000|((xbufp->len ) - 4)); memcpy(xbufp->head[0].iov_base, &reclen, 4); - if (test_bit(XPT_DEAD, &rqstp->rq_xprt->xpt_flags)) - return -ENOTCONN; - sent = svc_sendto(rqstp, &rqstp->rq_res); if (sent != xbufp->len) { printk(KERN_NOTICE @@ -1178,10 +1177,11 @@ static int svc_tcp_has_wspace(struct svc_xprt *xprt) } static struct svc_xprt *svc_tcp_create(struct svc_serv *serv, + struct net *net, struct sockaddr *sa, int salen, int flags) { - return svc_create_socket(serv, IPPROTO_TCP, sa, salen, flags); + return svc_create_socket(serv, IPPROTO_TCP, net, sa, salen, flags); } static struct svc_xprt_ops svc_tcp_ops = { @@ -1258,19 +1258,13 @@ void svc_sock_update_bufs(struct svc_serv *serv) * The number of server threads has changed. Update * rcvbuf and sndbuf accordingly on all sockets */ - struct list_head *le; + struct svc_sock *svsk; spin_lock_bh(&serv->sv_lock); - list_for_each(le, &serv->sv_permsocks) { - struct svc_sock *svsk = - list_entry(le, struct svc_sock, sk_xprt.xpt_list); + list_for_each_entry(svsk, &serv->sv_permsocks, sk_xprt.xpt_list) set_bit(XPT_CHNGBUF, &svsk->sk_xprt.xpt_flags); - } - list_for_each(le, &serv->sv_tempsocks) { - struct svc_sock *svsk = - list_entry(le, struct svc_sock, sk_xprt.xpt_list); + list_for_each_entry(svsk, &serv->sv_tempsocks, sk_xprt.xpt_list) set_bit(XPT_CHNGBUF, &svsk->sk_xprt.xpt_flags); - } spin_unlock_bh(&serv->sv_lock); } EXPORT_SYMBOL_GPL(svc_sock_update_bufs); @@ -1385,6 +1379,7 @@ EXPORT_SYMBOL_GPL(svc_addsock); */ static struct svc_xprt *svc_create_socket(struct svc_serv *serv, int protocol, + struct net *net, struct sockaddr *sin, int len, int flags) { @@ -1421,7 +1416,7 @@ static struct svc_xprt *svc_create_socket(struct svc_serv *serv, return ERR_PTR(-EINVAL); } - error = sock_create_kern(family, type, protocol, &sock); + error = __sock_create(net, family, type, protocol, &sock, 1); if (error < 0) return ERR_PTR(error); diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c index a1f82a87d34d..cd9e841e7492 100644 --- a/net/sunrpc/xdr.c +++ b/net/sunrpc/xdr.c @@ -111,6 +111,23 @@ xdr_decode_string_inplace(__be32 *p, char **sp, } EXPORT_SYMBOL_GPL(xdr_decode_string_inplace); +/** + * xdr_terminate_string - '\0'-terminate a string residing in an xdr_buf + * @buf: XDR buffer where string resides + * @len: length of string, in bytes + * + */ +void +xdr_terminate_string(struct xdr_buf *buf, const u32 len) +{ + char *kaddr; + + kaddr = kmap_atomic(buf->pages[0], KM_USER0); + kaddr[buf->page_base + len] = '\0'; + kunmap_atomic(kaddr, KM_USER0); +} +EXPORT_SYMBOL(xdr_terminate_string); + void xdr_encode_pages(struct xdr_buf *xdr, struct page **pages, unsigned int base, unsigned int len) @@ -395,24 +412,29 @@ xdr_shrink_pagelen(struct xdr_buf *buf, size_t len) { struct kvec *tail; size_t copy; - char *p; unsigned int pglen = buf->page_len; + unsigned int tailbuf_len; tail = buf->tail; BUG_ON (len > pglen); + tailbuf_len = buf->buflen - buf->head->iov_len - buf->page_len; + /* Shift the tail first */ - if (tail->iov_len != 0) { - p = (char *)tail->iov_base + len; + if (tailbuf_len != 0) { + unsigned int free_space = tailbuf_len - tail->iov_len; + + if (len < free_space) + free_space = len; + tail->iov_len += free_space; + + copy = len; if (tail->iov_len > len) { - copy = tail->iov_len - len; - memmove(p, tail->iov_base, copy); + char *p = (char *)tail->iov_base + len; + memmove(p, tail->iov_base, tail->iov_len - len); } else - buf->buflen -= len; - /* Copy from the inlined pages into the tail */ - copy = len; - if (copy > tail->iov_len) copy = tail->iov_len; + /* Copy from the inlined pages into the tail */ _copy_from_pages((char *)tail->iov_base, buf->pages, buf->page_base + pglen - len, copy); @@ -551,6 +573,27 @@ void xdr_init_decode(struct xdr_stream *xdr, struct xdr_buf *buf, __be32 *p) EXPORT_SYMBOL_GPL(xdr_init_decode); /** + * xdr_inline_peek - Allow read-ahead in the XDR data stream + * @xdr: pointer to xdr_stream struct + * @nbytes: number of bytes of data to decode + * + * Check if the input buffer is long enough to enable us to decode + * 'nbytes' more bytes of data starting at the current position. + * If so return the current pointer without updating the current + * pointer position. + */ +__be32 * xdr_inline_peek(struct xdr_stream *xdr, size_t nbytes) +{ + __be32 *p = xdr->p; + __be32 *q = p + XDR_QUADLEN(nbytes); + + if (unlikely(q > xdr->end || q < p)) + return NULL; + return p; +} +EXPORT_SYMBOL_GPL(xdr_inline_peek); + +/** * xdr_inline_decode - Retrieve non-page XDR data to decode * @xdr: pointer to xdr_stream struct * @nbytes: number of bytes of data to decode diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index 970fb00f388c..4c8f18aff7c3 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -199,8 +199,6 @@ int xprt_reserve_xprt(struct rpc_task *task) if (test_and_set_bit(XPRT_LOCKED, &xprt->state)) { if (task == xprt->snd_task) return 1; - if (task == NULL) - return 0; goto out_sleep; } xprt->snd_task = task; @@ -757,13 +755,11 @@ static void xprt_connect_status(struct rpc_task *task) */ struct rpc_rqst *xprt_lookup_rqst(struct rpc_xprt *xprt, __be32 xid) { - struct list_head *pos; + struct rpc_rqst *entry; - list_for_each(pos, &xprt->recv) { - struct rpc_rqst *entry = list_entry(pos, struct rpc_rqst, rq_list); + list_for_each_entry(entry, &xprt->recv, rq_list) if (entry->rq_xid == xid) return entry; - } dprintk("RPC: xprt_lookup_rqst did not find xid %08x\n", ntohl(xid)); @@ -962,6 +958,37 @@ static void xprt_free_slot(struct rpc_xprt *xprt, struct rpc_rqst *req) spin_unlock(&xprt->reserve_lock); } +struct rpc_xprt *xprt_alloc(struct net *net, int size, int max_req) +{ + struct rpc_xprt *xprt; + + xprt = kzalloc(size, GFP_KERNEL); + if (xprt == NULL) + goto out; + + xprt->max_reqs = max_req; + xprt->slot = kcalloc(max_req, sizeof(struct rpc_rqst), GFP_KERNEL); + if (xprt->slot == NULL) + goto out_free; + + xprt->xprt_net = get_net(net); + return xprt; + +out_free: + kfree(xprt); +out: + return NULL; +} +EXPORT_SYMBOL_GPL(xprt_alloc); + +void xprt_free(struct rpc_xprt *xprt) +{ + put_net(xprt->xprt_net); + kfree(xprt->slot); + kfree(xprt); +} +EXPORT_SYMBOL_GPL(xprt_free); + /** * xprt_reserve - allocate an RPC request slot * @task: RPC task requesting a slot allocation diff --git a/net/sunrpc/xprtrdma/svc_rdma.c b/net/sunrpc/xprtrdma/svc_rdma.c index d718b8fa9525..09af4fab1a45 100644 --- a/net/sunrpc/xprtrdma/svc_rdma.c +++ b/net/sunrpc/xprtrdma/svc_rdma.c @@ -43,6 +43,7 @@ #include <linux/slab.h> #include <linux/fs.h> #include <linux/sysctl.h> +#include <linux/workqueue.h> #include <linux/sunrpc/clnt.h> #include <linux/sunrpc/sched.h> #include <linux/sunrpc/svc_rdma.h> @@ -74,6 +75,8 @@ atomic_t rdma_stat_sq_prod; struct kmem_cache *svc_rdma_map_cachep; struct kmem_cache *svc_rdma_ctxt_cachep; +struct workqueue_struct *svc_rdma_wq; + /* * This function implements reading and resetting an atomic_t stat * variable through read/write to a proc file. Any write to the file @@ -231,7 +234,7 @@ static ctl_table svcrdma_root_table[] = { void svc_rdma_cleanup(void) { dprintk("SVCRDMA Module Removed, deregister RPC RDMA transport\n"); - flush_scheduled_work(); + destroy_workqueue(svc_rdma_wq); if (svcrdma_table_header) { unregister_sysctl_table(svcrdma_table_header); svcrdma_table_header = NULL; @@ -249,6 +252,11 @@ int svc_rdma_init(void) dprintk("\tsq_depth : %d\n", svcrdma_max_requests * RPCRDMA_SQ_DEPTH_MULT); dprintk("\tmax_inline : %d\n", svcrdma_max_req_size); + + svc_rdma_wq = alloc_workqueue("svc_rdma", 0, 0); + if (!svc_rdma_wq) + return -ENOMEM; + if (!svcrdma_table_header) svcrdma_table_header = register_sysctl_table(svcrdma_root_table); @@ -283,6 +291,7 @@ int svc_rdma_init(void) kmem_cache_destroy(svc_rdma_map_cachep); err0: unregister_sysctl_table(svcrdma_table_header); + destroy_workqueue(svc_rdma_wq); return -ENOMEM; } MODULE_AUTHOR("Tom Tucker <tom@opengridcomputing.com>"); diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c index 0194de814933..df67211c4baf 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c +++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c @@ -263,9 +263,9 @@ static int fast_reg_read_chunks(struct svcxprt_rdma *xprt, frmr->page_list_len = PAGE_ALIGN(byte_count) >> PAGE_SHIFT; for (page_no = 0; page_no < frmr->page_list_len; page_no++) { frmr->page_list->page_list[page_no] = - ib_dma_map_single(xprt->sc_cm_id->device, - page_address(rqstp->rq_arg.pages[page_no]), - PAGE_SIZE, DMA_FROM_DEVICE); + ib_dma_map_page(xprt->sc_cm_id->device, + rqstp->rq_arg.pages[page_no], 0, + PAGE_SIZE, DMA_FROM_DEVICE); if (ib_dma_mapping_error(xprt->sc_cm_id->device, frmr->page_list->page_list[page_no])) goto fatal_err; @@ -309,17 +309,21 @@ static int rdma_set_ctxt_sge(struct svcxprt_rdma *xprt, int count) { int i; + unsigned long off; ctxt->count = count; ctxt->direction = DMA_FROM_DEVICE; for (i = 0; i < count; i++) { ctxt->sge[i].length = 0; /* in case map fails */ if (!frmr) { + BUG_ON(0 == virt_to_page(vec[i].iov_base)); + off = (unsigned long)vec[i].iov_base & ~PAGE_MASK; ctxt->sge[i].addr = - ib_dma_map_single(xprt->sc_cm_id->device, - vec[i].iov_base, - vec[i].iov_len, - DMA_FROM_DEVICE); + ib_dma_map_page(xprt->sc_cm_id->device, + virt_to_page(vec[i].iov_base), + off, + vec[i].iov_len, + DMA_FROM_DEVICE); if (ib_dma_mapping_error(xprt->sc_cm_id->device, ctxt->sge[i].addr)) return -EINVAL; @@ -491,6 +495,7 @@ next_sge: printk(KERN_ERR "svcrdma: Error %d posting RDMA_READ\n", err); set_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags); + svc_rdma_unmap_dma(ctxt); svc_rdma_put_context(ctxt, 0); goto out; } diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c index b15e1ebb2bfa..249a835b703f 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c +++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c @@ -70,8 +70,8 @@ * on extra page for the RPCRMDA header. */ static int fast_reg_xdr(struct svcxprt_rdma *xprt, - struct xdr_buf *xdr, - struct svc_rdma_req_map *vec) + struct xdr_buf *xdr, + struct svc_rdma_req_map *vec) { int sge_no; u32 sge_bytes; @@ -96,21 +96,25 @@ static int fast_reg_xdr(struct svcxprt_rdma *xprt, vec->count = 2; sge_no++; - /* Build the FRMR */ + /* Map the XDR head */ frmr->kva = frva; frmr->direction = DMA_TO_DEVICE; frmr->access_flags = 0; frmr->map_len = PAGE_SIZE; frmr->page_list_len = 1; + page_off = (unsigned long)xdr->head[0].iov_base & ~PAGE_MASK; frmr->page_list->page_list[page_no] = - ib_dma_map_single(xprt->sc_cm_id->device, - (void *)xdr->head[0].iov_base, - PAGE_SIZE, DMA_TO_DEVICE); + ib_dma_map_page(xprt->sc_cm_id->device, + virt_to_page(xdr->head[0].iov_base), + page_off, + PAGE_SIZE - page_off, + DMA_TO_DEVICE); if (ib_dma_mapping_error(xprt->sc_cm_id->device, frmr->page_list->page_list[page_no])) goto fatal_err; atomic_inc(&xprt->sc_dma_used); + /* Map the XDR page list */ page_off = xdr->page_base; page_bytes = xdr->page_len + page_off; if (!page_bytes) @@ -128,9 +132,9 @@ static int fast_reg_xdr(struct svcxprt_rdma *xprt, page_bytes -= sge_bytes; frmr->page_list->page_list[page_no] = - ib_dma_map_single(xprt->sc_cm_id->device, - page_address(page), - PAGE_SIZE, DMA_TO_DEVICE); + ib_dma_map_page(xprt->sc_cm_id->device, + page, page_off, + sge_bytes, DMA_TO_DEVICE); if (ib_dma_mapping_error(xprt->sc_cm_id->device, frmr->page_list->page_list[page_no])) goto fatal_err; @@ -166,8 +170,10 @@ static int fast_reg_xdr(struct svcxprt_rdma *xprt, vec->sge[sge_no].iov_base = frva + frmr->map_len + page_off; frmr->page_list->page_list[page_no] = - ib_dma_map_single(xprt->sc_cm_id->device, va, PAGE_SIZE, - DMA_TO_DEVICE); + ib_dma_map_page(xprt->sc_cm_id->device, virt_to_page(va), + page_off, + PAGE_SIZE, + DMA_TO_DEVICE); if (ib_dma_mapping_error(xprt->sc_cm_id->device, frmr->page_list->page_list[page_no])) goto fatal_err; @@ -245,6 +251,35 @@ static int map_xdr(struct svcxprt_rdma *xprt, return 0; } +static dma_addr_t dma_map_xdr(struct svcxprt_rdma *xprt, + struct xdr_buf *xdr, + u32 xdr_off, size_t len, int dir) +{ + struct page *page; + dma_addr_t dma_addr; + if (xdr_off < xdr->head[0].iov_len) { + /* This offset is in the head */ + xdr_off += (unsigned long)xdr->head[0].iov_base & ~PAGE_MASK; + page = virt_to_page(xdr->head[0].iov_base); + } else { + xdr_off -= xdr->head[0].iov_len; + if (xdr_off < xdr->page_len) { + /* This offset is in the page list */ + page = xdr->pages[xdr_off >> PAGE_SHIFT]; + xdr_off &= ~PAGE_MASK; + } else { + /* This offset is in the tail */ + xdr_off -= xdr->page_len; + xdr_off += (unsigned long) + xdr->tail[0].iov_base & ~PAGE_MASK; + page = virt_to_page(xdr->tail[0].iov_base); + } + } + dma_addr = ib_dma_map_page(xprt->sc_cm_id->device, page, xdr_off, + min_t(size_t, PAGE_SIZE, len), dir); + return dma_addr; +} + /* Assumptions: * - We are using FRMR * - or - @@ -293,10 +328,9 @@ static int send_write(struct svcxprt_rdma *xprt, struct svc_rqst *rqstp, sge[sge_no].length = sge_bytes; if (!vec->frmr) { sge[sge_no].addr = - ib_dma_map_single(xprt->sc_cm_id->device, - (void *) - vec->sge[xdr_sge_no].iov_base + sge_off, - sge_bytes, DMA_TO_DEVICE); + dma_map_xdr(xprt, &rqstp->rq_res, xdr_off, + sge_bytes, DMA_TO_DEVICE); + xdr_off += sge_bytes; if (ib_dma_mapping_error(xprt->sc_cm_id->device, sge[sge_no].addr)) goto err; @@ -333,6 +367,8 @@ static int send_write(struct svcxprt_rdma *xprt, struct svc_rqst *rqstp, goto err; return 0; err: + svc_rdma_unmap_dma(ctxt); + svc_rdma_put_frmr(xprt, vec->frmr); svc_rdma_put_context(ctxt, 0); /* Fatal error, close transport */ return -EIO; @@ -494,7 +530,8 @@ static int send_reply_chunks(struct svcxprt_rdma *xprt, * In all three cases, this function prepares the RPCRDMA header in * sge[0], the 'type' parameter indicates the type to place in the * RPCRDMA header, and the 'byte_count' field indicates how much of - * the XDR to include in this RDMA_SEND. + * the XDR to include in this RDMA_SEND. NB: The offset of the payload + * to send is zero in the XDR. */ static int send_reply(struct svcxprt_rdma *rdma, struct svc_rqst *rqstp, @@ -536,23 +573,24 @@ static int send_reply(struct svcxprt_rdma *rdma, ctxt->sge[0].lkey = rdma->sc_dma_lkey; ctxt->sge[0].length = svc_rdma_xdr_get_reply_hdr_len(rdma_resp); ctxt->sge[0].addr = - ib_dma_map_single(rdma->sc_cm_id->device, page_address(page), - ctxt->sge[0].length, DMA_TO_DEVICE); + ib_dma_map_page(rdma->sc_cm_id->device, page, 0, + ctxt->sge[0].length, DMA_TO_DEVICE); if (ib_dma_mapping_error(rdma->sc_cm_id->device, ctxt->sge[0].addr)) goto err; atomic_inc(&rdma->sc_dma_used); ctxt->direction = DMA_TO_DEVICE; - /* Determine how many of our SGE are to be transmitted */ + /* Map the payload indicated by 'byte_count' */ for (sge_no = 1; byte_count && sge_no < vec->count; sge_no++) { + int xdr_off = 0; sge_bytes = min_t(size_t, vec->sge[sge_no].iov_len, byte_count); byte_count -= sge_bytes; if (!vec->frmr) { ctxt->sge[sge_no].addr = - ib_dma_map_single(rdma->sc_cm_id->device, - vec->sge[sge_no].iov_base, - sge_bytes, DMA_TO_DEVICE); + dma_map_xdr(rdma, &rqstp->rq_res, xdr_off, + sge_bytes, DMA_TO_DEVICE); + xdr_off += sge_bytes; if (ib_dma_mapping_error(rdma->sc_cm_id->device, ctxt->sge[sge_no].addr)) goto err; diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c index edea15a54e51..9df1eadc912a 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_transport.c +++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c @@ -45,6 +45,7 @@ #include <linux/sched.h> #include <linux/slab.h> #include <linux/spinlock.h> +#include <linux/workqueue.h> #include <rdma/ib_verbs.h> #include <rdma/rdma_cm.h> #include <linux/sunrpc/svc_rdma.h> @@ -52,6 +53,7 @@ #define RPCDBG_FACILITY RPCDBG_SVCXPRT static struct svc_xprt *svc_rdma_create(struct svc_serv *serv, + struct net *net, struct sockaddr *sa, int salen, int flags); static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt); @@ -89,6 +91,9 @@ struct svc_xprt_class svc_rdma_class = { /* WR context cache. Created in svc_rdma.c */ extern struct kmem_cache *svc_rdma_ctxt_cachep; +/* Workqueue created in svc_rdma.c */ +extern struct workqueue_struct *svc_rdma_wq; + struct svc_rdma_op_ctxt *svc_rdma_get_context(struct svcxprt_rdma *xprt) { struct svc_rdma_op_ctxt *ctxt; @@ -120,7 +125,7 @@ void svc_rdma_unmap_dma(struct svc_rdma_op_ctxt *ctxt) */ if (ctxt->sge[i].lkey == xprt->sc_dma_lkey) { atomic_dec(&xprt->sc_dma_used); - ib_dma_unmap_single(xprt->sc_cm_id->device, + ib_dma_unmap_page(xprt->sc_cm_id->device, ctxt->sge[i].addr, ctxt->sge[i].length, ctxt->direction); @@ -502,8 +507,8 @@ int svc_rdma_post_recv(struct svcxprt_rdma *xprt) BUG_ON(sge_no >= xprt->sc_max_sge); page = svc_rdma_get_page(); ctxt->pages[sge_no] = page; - pa = ib_dma_map_single(xprt->sc_cm_id->device, - page_address(page), PAGE_SIZE, + pa = ib_dma_map_page(xprt->sc_cm_id->device, + page, 0, PAGE_SIZE, DMA_FROM_DEVICE); if (ib_dma_mapping_error(xprt->sc_cm_id->device, pa)) goto err_put_ctxt; @@ -511,9 +516,9 @@ int svc_rdma_post_recv(struct svcxprt_rdma *xprt) ctxt->sge[sge_no].addr = pa; ctxt->sge[sge_no].length = PAGE_SIZE; ctxt->sge[sge_no].lkey = xprt->sc_dma_lkey; + ctxt->count = sge_no + 1; buflen += PAGE_SIZE; } - ctxt->count = sge_no; recv_wr.next = NULL; recv_wr.sg_list = &ctxt->sge[0]; recv_wr.num_sge = ctxt->count; @@ -529,6 +534,7 @@ int svc_rdma_post_recv(struct svcxprt_rdma *xprt) return ret; err_put_ctxt: + svc_rdma_unmap_dma(ctxt); svc_rdma_put_context(ctxt, 1); return -ENOMEM; } @@ -670,6 +676,7 @@ static int rdma_cma_handler(struct rdma_cm_id *cma_id, * Create a listening RDMA service endpoint. */ static struct svc_xprt *svc_rdma_create(struct svc_serv *serv, + struct net *net, struct sockaddr *sa, int salen, int flags) { @@ -798,8 +805,8 @@ static void frmr_unmap_dma(struct svcxprt_rdma *xprt, if (ib_dma_mapping_error(frmr->mr->device, addr)) continue; atomic_dec(&xprt->sc_dma_used); - ib_dma_unmap_single(frmr->mr->device, addr, PAGE_SIZE, - frmr->direction); + ib_dma_unmap_page(frmr->mr->device, addr, PAGE_SIZE, + frmr->direction); } } @@ -1184,7 +1191,7 @@ static void svc_rdma_free(struct svc_xprt *xprt) struct svcxprt_rdma *rdma = container_of(xprt, struct svcxprt_rdma, sc_xprt); INIT_WORK(&rdma->sc_work, __svc_rdma_free); - schedule_work(&rdma->sc_work); + queue_work(svc_rdma_wq, &rdma->sc_work); } static int svc_rdma_has_wspace(struct svc_xprt *xprt) @@ -1274,7 +1281,7 @@ int svc_rdma_send(struct svcxprt_rdma *xprt, struct ib_send_wr *wr) atomic_read(&xprt->sc_sq_count) < xprt->sc_sq_depth); if (test_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags)) - return 0; + return -ENOTCONN; continue; } /* Take a transport ref for each WR posted */ @@ -1306,7 +1313,6 @@ void svc_rdma_send_error(struct svcxprt_rdma *xprt, struct rpcrdma_msg *rmsgp, enum rpcrdma_errcode err) { struct ib_send_wr err_wr; - struct ib_sge sge; struct page *p; struct svc_rdma_op_ctxt *ctxt; u32 *va; @@ -1319,26 +1325,27 @@ void svc_rdma_send_error(struct svcxprt_rdma *xprt, struct rpcrdma_msg *rmsgp, /* XDR encode error */ length = svc_rdma_xdr_encode_error(xprt, rmsgp, err, va); + ctxt = svc_rdma_get_context(xprt); + ctxt->direction = DMA_FROM_DEVICE; + ctxt->count = 1; + ctxt->pages[0] = p; + /* Prepare SGE for local address */ - sge.addr = ib_dma_map_single(xprt->sc_cm_id->device, - page_address(p), PAGE_SIZE, DMA_FROM_DEVICE); - if (ib_dma_mapping_error(xprt->sc_cm_id->device, sge.addr)) { + ctxt->sge[0].addr = ib_dma_map_page(xprt->sc_cm_id->device, + p, 0, length, DMA_FROM_DEVICE); + if (ib_dma_mapping_error(xprt->sc_cm_id->device, ctxt->sge[0].addr)) { put_page(p); return; } atomic_inc(&xprt->sc_dma_used); - sge.lkey = xprt->sc_dma_lkey; - sge.length = length; - - ctxt = svc_rdma_get_context(xprt); - ctxt->count = 1; - ctxt->pages[0] = p; + ctxt->sge[0].lkey = xprt->sc_dma_lkey; + ctxt->sge[0].length = length; /* Prepare SEND WR */ memset(&err_wr, 0, sizeof err_wr); ctxt->wr_op = IB_WR_SEND; err_wr.wr_id = (unsigned long)ctxt; - err_wr.sg_list = &sge; + err_wr.sg_list = ctxt->sge; err_wr.num_sge = 1; err_wr.opcode = IB_WR_SEND; err_wr.send_flags = IB_SEND_SIGNALED; @@ -1348,9 +1355,7 @@ void svc_rdma_send_error(struct svcxprt_rdma *xprt, struct rpcrdma_msg *rmsgp, if (ret) { dprintk("svcrdma: Error %d posting send for protocol error\n", ret); - ib_dma_unmap_single(xprt->sc_cm_id->device, - sge.addr, PAGE_SIZE, - DMA_FROM_DEVICE); + svc_rdma_unmap_dma(ctxt); svc_rdma_put_context(ctxt, 1); } } diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c index a85e866a77f7..0867070bb5ca 100644 --- a/net/sunrpc/xprtrdma/transport.c +++ b/net/sunrpc/xprtrdma/transport.c @@ -237,8 +237,7 @@ xprt_rdma_destroy(struct rpc_xprt *xprt) dprintk("RPC: %s: called\n", __func__); - cancel_delayed_work(&r_xprt->rdma_connect); - flush_scheduled_work(); + cancel_delayed_work_sync(&r_xprt->rdma_connect); xprt_clear_connected(xprt); @@ -251,9 +250,7 @@ xprt_rdma_destroy(struct rpc_xprt *xprt) xprt_rdma_free_addresses(xprt); - kfree(xprt->slot); - xprt->slot = NULL; - kfree(xprt); + xprt_free(xprt); dprintk("RPC: %s: returning\n", __func__); @@ -285,23 +282,14 @@ xprt_setup_rdma(struct xprt_create *args) return ERR_PTR(-EBADF); } - xprt = kzalloc(sizeof(struct rpcrdma_xprt), GFP_KERNEL); + xprt = xprt_alloc(args->net, sizeof(struct rpcrdma_xprt), + xprt_rdma_slot_table_entries); if (xprt == NULL) { dprintk("RPC: %s: couldn't allocate rpcrdma_xprt\n", __func__); return ERR_PTR(-ENOMEM); } - xprt->max_reqs = xprt_rdma_slot_table_entries; - xprt->slot = kcalloc(xprt->max_reqs, - sizeof(struct rpc_rqst), GFP_KERNEL); - if (xprt->slot == NULL) { - dprintk("RPC: %s: couldn't allocate %d slots\n", - __func__, xprt->max_reqs); - kfree(xprt); - return ERR_PTR(-ENOMEM); - } - /* 60 second timeout, no retries */ xprt->timeout = &xprt_rdma_default_timeout; xprt->bind_timeout = (60U * HZ); @@ -410,8 +398,7 @@ out3: out2: rpcrdma_ia_close(&new_xprt->rx_ia); out1: - kfree(xprt->slot); - kfree(xprt); + xprt_free(xprt); return ERR_PTR(rc); } @@ -460,7 +447,7 @@ xprt_rdma_connect(struct rpc_task *task) } else { schedule_delayed_work(&r_xprt->rdma_connect, 0); if (!RPC_IS_ASYNC(task)) - flush_scheduled_work(); + flush_delayed_work(&r_xprt->rdma_connect); } } diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index fe9306bf10cc..dfcab5ac65af 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -774,8 +774,7 @@ static void xs_destroy(struct rpc_xprt *xprt) xs_close(xprt); xs_free_peer_addresses(xprt); - kfree(xprt->slot); - kfree(xprt); + xprt_free(xprt); module_put(THIS_MODULE); } @@ -1516,7 +1515,7 @@ static void xs_set_port(struct rpc_xprt *xprt, unsigned short port) xs_update_peer_port(xprt); } -static unsigned short xs_get_srcport(struct sock_xprt *transport, struct socket *sock) +static unsigned short xs_get_srcport(struct sock_xprt *transport) { unsigned short port = transport->srcport; @@ -1525,7 +1524,7 @@ static unsigned short xs_get_srcport(struct sock_xprt *transport, struct socket return port; } -static unsigned short xs_next_srcport(struct sock_xprt *transport, struct socket *sock, unsigned short port) +static unsigned short xs_next_srcport(struct sock_xprt *transport, unsigned short port) { if (transport->srcport != 0) transport->srcport = 0; @@ -1535,23 +1534,18 @@ static unsigned short xs_next_srcport(struct sock_xprt *transport, struct socket return xprt_max_resvport; return --port; } - -static int xs_bind4(struct sock_xprt *transport, struct socket *sock) +static int xs_bind(struct sock_xprt *transport, struct socket *sock) { - struct sockaddr_in myaddr = { - .sin_family = AF_INET, - }; - struct sockaddr_in *sa; + struct sockaddr_storage myaddr; int err, nloop = 0; - unsigned short port = xs_get_srcport(transport, sock); + unsigned short port = xs_get_srcport(transport); unsigned short last; - sa = (struct sockaddr_in *)&transport->srcaddr; - myaddr.sin_addr = sa->sin_addr; + memcpy(&myaddr, &transport->srcaddr, transport->xprt.addrlen); do { - myaddr.sin_port = htons(port); - err = kernel_bind(sock, (struct sockaddr *) &myaddr, - sizeof(myaddr)); + rpc_set_port((struct sockaddr *)&myaddr, port); + err = kernel_bind(sock, (struct sockaddr *)&myaddr, + transport->xprt.addrlen); if (port == 0) break; if (err == 0) { @@ -1559,48 +1553,23 @@ static int xs_bind4(struct sock_xprt *transport, struct socket *sock) break; } last = port; - port = xs_next_srcport(transport, sock, port); + port = xs_next_srcport(transport, port); if (port > last) nloop++; } while (err == -EADDRINUSE && nloop != 2); - dprintk("RPC: %s %pI4:%u: %s (%d)\n", - __func__, &myaddr.sin_addr, - port, err ? "failed" : "ok", err); - return err; -} - -static int xs_bind6(struct sock_xprt *transport, struct socket *sock) -{ - struct sockaddr_in6 myaddr = { - .sin6_family = AF_INET6, - }; - struct sockaddr_in6 *sa; - int err, nloop = 0; - unsigned short port = xs_get_srcport(transport, sock); - unsigned short last; - sa = (struct sockaddr_in6 *)&transport->srcaddr; - myaddr.sin6_addr = sa->sin6_addr; - do { - myaddr.sin6_port = htons(port); - err = kernel_bind(sock, (struct sockaddr *) &myaddr, - sizeof(myaddr)); - if (port == 0) - break; - if (err == 0) { - transport->srcport = port; - break; - } - last = port; - port = xs_next_srcport(transport, sock, port); - if (port > last) - nloop++; - } while (err == -EADDRINUSE && nloop != 2); - dprintk("RPC: xs_bind6 %pI6:%u: %s (%d)\n", - &myaddr.sin6_addr, port, err ? "failed" : "ok", err); + if (myaddr.ss_family == AF_INET) + dprintk("RPC: %s %pI4:%u: %s (%d)\n", __func__, + &((struct sockaddr_in *)&myaddr)->sin_addr, + port, err ? "failed" : "ok", err); + else + dprintk("RPC: %s %pI6:%u: %s (%d)\n", __func__, + &((struct sockaddr_in6 *)&myaddr)->sin6_addr, + port, err ? "failed" : "ok", err); return err; } + #ifdef CONFIG_DEBUG_LOCK_ALLOC static struct lock_class_key xs_key[2]; static struct lock_class_key xs_slock_key[2]; @@ -1622,6 +1591,18 @@ static inline void xs_reclassify_socket6(struct socket *sock) sock_lock_init_class_and_name(sk, "slock-AF_INET6-RPC", &xs_slock_key[1], "sk_lock-AF_INET6-RPC", &xs_key[1]); } + +static inline void xs_reclassify_socket(int family, struct socket *sock) +{ + switch (family) { + case AF_INET: + xs_reclassify_socket4(sock); + break; + case AF_INET6: + xs_reclassify_socket6(sock); + break; + } +} #else static inline void xs_reclassify_socket4(struct socket *sock) { @@ -1630,8 +1611,36 @@ static inline void xs_reclassify_socket4(struct socket *sock) static inline void xs_reclassify_socket6(struct socket *sock) { } + +static inline void xs_reclassify_socket(int family, struct socket *sock) +{ +} #endif +static struct socket *xs_create_sock(struct rpc_xprt *xprt, + struct sock_xprt *transport, int family, int type, int protocol) +{ + struct socket *sock; + int err; + + err = __sock_create(xprt->xprt_net, family, type, protocol, &sock, 1); + if (err < 0) { + dprintk("RPC: can't create %d transport socket (%d).\n", + protocol, -err); + goto out; + } + xs_reclassify_socket(family, sock); + + if (xs_bind(transport, sock)) { + sock_release(sock); + goto out; + } + + return sock; +out: + return ERR_PTR(err); +} + static void xs_udp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock) { struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt); @@ -1661,82 +1670,23 @@ static void xs_udp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock) xs_udp_do_set_buffer_size(xprt); } -/** - * xs_udp_connect_worker4 - set up a UDP socket - * @work: RPC transport to connect - * - * Invoked by a work queue tasklet. - */ -static void xs_udp_connect_worker4(struct work_struct *work) +static void xs_udp_setup_socket(struct work_struct *work) { struct sock_xprt *transport = container_of(work, struct sock_xprt, connect_worker.work); struct rpc_xprt *xprt = &transport->xprt; struct socket *sock = transport->sock; - int err, status = -EIO; + int status = -EIO; if (xprt->shutdown) goto out; /* Start by resetting any existing state */ xs_reset_transport(transport); - - err = sock_create_kern(PF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock); - if (err < 0) { - dprintk("RPC: can't create UDP transport socket (%d).\n", -err); + sock = xs_create_sock(xprt, transport, + xs_addr(xprt)->sa_family, SOCK_DGRAM, IPPROTO_UDP); + if (IS_ERR(sock)) goto out; - } - xs_reclassify_socket4(sock); - - if (xs_bind4(transport, sock)) { - sock_release(sock); - goto out; - } - - dprintk("RPC: worker connecting xprt %p via %s to " - "%s (port %s)\n", xprt, - xprt->address_strings[RPC_DISPLAY_PROTO], - xprt->address_strings[RPC_DISPLAY_ADDR], - xprt->address_strings[RPC_DISPLAY_PORT]); - - xs_udp_finish_connecting(xprt, sock); - status = 0; -out: - xprt_clear_connecting(xprt); - xprt_wake_pending_tasks(xprt, status); -} - -/** - * xs_udp_connect_worker6 - set up a UDP socket - * @work: RPC transport to connect - * - * Invoked by a work queue tasklet. - */ -static void xs_udp_connect_worker6(struct work_struct *work) -{ - struct sock_xprt *transport = - container_of(work, struct sock_xprt, connect_worker.work); - struct rpc_xprt *xprt = &transport->xprt; - struct socket *sock = transport->sock; - int err, status = -EIO; - - if (xprt->shutdown) - goto out; - - /* Start by resetting any existing state */ - xs_reset_transport(transport); - - err = sock_create_kern(PF_INET6, SOCK_DGRAM, IPPROTO_UDP, &sock); - if (err < 0) { - dprintk("RPC: can't create UDP transport socket (%d).\n", -err); - goto out; - } - xs_reclassify_socket6(sock); - - if (xs_bind6(transport, sock) < 0) { - sock_release(sock); - goto out; - } dprintk("RPC: worker connecting xprt %p via %s to " "%s (port %s)\n", xprt, @@ -1755,12 +1705,12 @@ out: * We need to preserve the port number so the reply cache on the server can * find our cached RPC replies when we get around to reconnecting. */ -static void xs_abort_connection(struct rpc_xprt *xprt, struct sock_xprt *transport) +static void xs_abort_connection(struct sock_xprt *transport) { int result; struct sockaddr any; - dprintk("RPC: disconnecting xprt %p to reuse port\n", xprt); + dprintk("RPC: disconnecting xprt %p to reuse port\n", transport); /* * Disconnect the transport socket by doing a connect operation @@ -1770,13 +1720,13 @@ static void xs_abort_connection(struct rpc_xprt *xprt, struct sock_xprt *transpo any.sa_family = AF_UNSPEC; result = kernel_connect(transport->sock, &any, sizeof(any), 0); if (!result) - xs_sock_mark_closed(xprt); + xs_sock_mark_closed(&transport->xprt); else dprintk("RPC: AF_UNSPEC connect return code %d\n", result); } -static void xs_tcp_reuse_connection(struct rpc_xprt *xprt, struct sock_xprt *transport) +static void xs_tcp_reuse_connection(struct sock_xprt *transport) { unsigned int state = transport->inet->sk_state; @@ -1799,7 +1749,7 @@ static void xs_tcp_reuse_connection(struct rpc_xprt *xprt, struct sock_xprt *tra "sk_shutdown set to %d\n", __func__, transport->inet->sk_shutdown); } - xs_abort_connection(xprt, transport); + xs_abort_connection(transport); } static int xs_tcp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock) @@ -1852,12 +1802,12 @@ static int xs_tcp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock) * * Invoked by a work queue tasklet. */ -static void xs_tcp_setup_socket(struct rpc_xprt *xprt, - struct sock_xprt *transport, - struct socket *(*create_sock)(struct rpc_xprt *, - struct sock_xprt *)) +static void xs_tcp_setup_socket(struct work_struct *work) { + struct sock_xprt *transport = + container_of(work, struct sock_xprt, connect_worker.work); struct socket *sock = transport->sock; + struct rpc_xprt *xprt = &transport->xprt; int status = -EIO; if (xprt->shutdown) @@ -1865,7 +1815,8 @@ static void xs_tcp_setup_socket(struct rpc_xprt *xprt, if (!sock) { clear_bit(XPRT_CONNECTION_ABORT, &xprt->state); - sock = create_sock(xprt, transport); + sock = xs_create_sock(xprt, transport, + xs_addr(xprt)->sa_family, SOCK_STREAM, IPPROTO_TCP); if (IS_ERR(sock)) { status = PTR_ERR(sock); goto out; @@ -1876,7 +1827,7 @@ static void xs_tcp_setup_socket(struct rpc_xprt *xprt, abort_and_exit = test_and_clear_bit(XPRT_CONNECTION_ABORT, &xprt->state); /* "close" the socket, preserving the local port */ - xs_tcp_reuse_connection(xprt, transport); + xs_tcp_reuse_connection(transport); if (abort_and_exit) goto out_eagain; @@ -1925,84 +1876,6 @@ out: xprt_wake_pending_tasks(xprt, status); } -static struct socket *xs_create_tcp_sock4(struct rpc_xprt *xprt, - struct sock_xprt *transport) -{ - struct socket *sock; - int err; - - /* start from scratch */ - err = sock_create_kern(PF_INET, SOCK_STREAM, IPPROTO_TCP, &sock); - if (err < 0) { - dprintk("RPC: can't create TCP transport socket (%d).\n", - -err); - goto out_err; - } - xs_reclassify_socket4(sock); - - if (xs_bind4(transport, sock) < 0) { - sock_release(sock); - goto out_err; - } - return sock; -out_err: - return ERR_PTR(-EIO); -} - -/** - * xs_tcp_connect_worker4 - connect a TCP socket to a remote endpoint - * @work: RPC transport to connect - * - * Invoked by a work queue tasklet. - */ -static void xs_tcp_connect_worker4(struct work_struct *work) -{ - struct sock_xprt *transport = - container_of(work, struct sock_xprt, connect_worker.work); - struct rpc_xprt *xprt = &transport->xprt; - - xs_tcp_setup_socket(xprt, transport, xs_create_tcp_sock4); -} - -static struct socket *xs_create_tcp_sock6(struct rpc_xprt *xprt, - struct sock_xprt *transport) -{ - struct socket *sock; - int err; - - /* start from scratch */ - err = sock_create_kern(PF_INET6, SOCK_STREAM, IPPROTO_TCP, &sock); - if (err < 0) { - dprintk("RPC: can't create TCP transport socket (%d).\n", - -err); - goto out_err; - } - xs_reclassify_socket6(sock); - - if (xs_bind6(transport, sock) < 0) { - sock_release(sock); - goto out_err; - } - return sock; -out_err: - return ERR_PTR(-EIO); -} - -/** - * xs_tcp_connect_worker6 - connect a TCP socket to a remote endpoint - * @work: RPC transport to connect - * - * Invoked by a work queue tasklet. - */ -static void xs_tcp_connect_worker6(struct work_struct *work) -{ - struct sock_xprt *transport = - container_of(work, struct sock_xprt, connect_worker.work); - struct rpc_xprt *xprt = &transport->xprt; - - xs_tcp_setup_socket(xprt, transport, xs_create_tcp_sock6); -} - /** * xs_connect - connect a socket to a remote endpoint * @task: address of RPC task that manages state of connect request @@ -2262,6 +2135,31 @@ static struct rpc_xprt_ops bc_tcp_ops = { .print_stats = xs_tcp_print_stats, }; +static int xs_init_anyaddr(const int family, struct sockaddr *sap) +{ + static const struct sockaddr_in sin = { + .sin_family = AF_INET, + .sin_addr.s_addr = htonl(INADDR_ANY), + }; + static const struct sockaddr_in6 sin6 = { + .sin6_family = AF_INET6, + .sin6_addr = IN6ADDR_ANY_INIT, + }; + + switch (family) { + case AF_INET: + memcpy(sap, &sin, sizeof(sin)); + break; + case AF_INET6: + memcpy(sap, &sin6, sizeof(sin6)); + break; + default: + dprintk("RPC: %s: Bad address family\n", __func__); + return -EAFNOSUPPORT; + } + return 0; +} + static struct rpc_xprt *xs_setup_xprt(struct xprt_create *args, unsigned int slot_table_size) { @@ -2273,27 +2171,25 @@ static struct rpc_xprt *xs_setup_xprt(struct xprt_create *args, return ERR_PTR(-EBADF); } - new = kzalloc(sizeof(*new), GFP_KERNEL); - if (new == NULL) { + xprt = xprt_alloc(args->net, sizeof(*new), slot_table_size); + if (xprt == NULL) { dprintk("RPC: xs_setup_xprt: couldn't allocate " "rpc_xprt\n"); return ERR_PTR(-ENOMEM); } - xprt = &new->xprt; - - xprt->max_reqs = slot_table_size; - xprt->slot = kcalloc(xprt->max_reqs, sizeof(struct rpc_rqst), GFP_KERNEL); - if (xprt->slot == NULL) { - kfree(xprt); - dprintk("RPC: xs_setup_xprt: couldn't allocate slot " - "table\n"); - return ERR_PTR(-ENOMEM); - } + new = container_of(xprt, struct sock_xprt, xprt); memcpy(&xprt->addr, args->dstaddr, args->addrlen); xprt->addrlen = args->addrlen; if (args->srcaddr) memcpy(&new->srcaddr, args->srcaddr, args->addrlen); + else { + int err; + err = xs_init_anyaddr(args->dstaddr->sa_family, + (struct sockaddr *)&new->srcaddr); + if (err != 0) + return ERR_PTR(err); + } return xprt; } @@ -2341,7 +2237,7 @@ static struct rpc_xprt *xs_setup_udp(struct xprt_create *args) xprt_set_bound(xprt); INIT_DELAYED_WORK(&transport->connect_worker, - xs_udp_connect_worker4); + xs_udp_setup_socket); xs_format_peer_addresses(xprt, "udp", RPCBIND_NETID_UDP); break; case AF_INET6: @@ -2349,7 +2245,7 @@ static struct rpc_xprt *xs_setup_udp(struct xprt_create *args) xprt_set_bound(xprt); INIT_DELAYED_WORK(&transport->connect_worker, - xs_udp_connect_worker6); + xs_udp_setup_socket); xs_format_peer_addresses(xprt, "udp", RPCBIND_NETID_UDP6); break; default: @@ -2371,8 +2267,7 @@ static struct rpc_xprt *xs_setup_udp(struct xprt_create *args) return xprt; ret = ERR_PTR(-EINVAL); out_err: - kfree(xprt->slot); - kfree(xprt); + xprt_free(xprt); return ret; } @@ -2416,7 +2311,7 @@ static struct rpc_xprt *xs_setup_tcp(struct xprt_create *args) xprt_set_bound(xprt); INIT_DELAYED_WORK(&transport->connect_worker, - xs_tcp_connect_worker4); + xs_tcp_setup_socket); xs_format_peer_addresses(xprt, "tcp", RPCBIND_NETID_TCP); break; case AF_INET6: @@ -2424,7 +2319,7 @@ static struct rpc_xprt *xs_setup_tcp(struct xprt_create *args) xprt_set_bound(xprt); INIT_DELAYED_WORK(&transport->connect_worker, - xs_tcp_connect_worker6); + xs_tcp_setup_socket); xs_format_peer_addresses(xprt, "tcp", RPCBIND_NETID_TCP6); break; default: @@ -2447,8 +2342,7 @@ static struct rpc_xprt *xs_setup_tcp(struct xprt_create *args) return xprt; ret = ERR_PTR(-EINVAL); out_err: - kfree(xprt->slot); - kfree(xprt); + xprt_free(xprt); return ret; } @@ -2507,15 +2401,10 @@ static struct rpc_xprt *xs_setup_bc_tcp(struct xprt_create *args) goto out_err; } - if (xprt_bound(xprt)) - dprintk("RPC: set up xprt to %s (port %s) via %s\n", - xprt->address_strings[RPC_DISPLAY_ADDR], - xprt->address_strings[RPC_DISPLAY_PORT], - xprt->address_strings[RPC_DISPLAY_PROTO]); - else - dprintk("RPC: set up xprt to %s (autobind) via %s\n", - xprt->address_strings[RPC_DISPLAY_ADDR], - xprt->address_strings[RPC_DISPLAY_PROTO]); + dprintk("RPC: set up xprt to %s (port %s) via %s\n", + xprt->address_strings[RPC_DISPLAY_ADDR], + xprt->address_strings[RPC_DISPLAY_PORT], + xprt->address_strings[RPC_DISPLAY_PROTO]); /* * Since we don't want connections for the backchannel, we set @@ -2528,8 +2417,7 @@ static struct rpc_xprt *xs_setup_bc_tcp(struct xprt_create *args) return xprt; ret = ERR_PTR(-EINVAL); out_err: - kfree(xprt->slot); - kfree(xprt); + xprt_free(xprt); return ret; } diff --git a/net/tipc/socket.c b/net/tipc/socket.c index 33217fc3d697..e9f0d5004483 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -396,6 +396,7 @@ static int get_name(struct socket *sock, struct sockaddr *uaddr, struct sockaddr_tipc *addr = (struct sockaddr_tipc *)uaddr; struct tipc_sock *tsock = tipc_sk(sock->sk); + memset(addr, 0, sizeof(*addr)); if (peer) { if ((sock->state != SS_CONNECTED) && ((peer != 2) || (sock->state != SS_DISCONNECTING))) diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 0ebc777a6660..2268e6798124 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -117,7 +117,7 @@ static struct hlist_head unix_socket_table[UNIX_HASH_SIZE + 1]; static DEFINE_SPINLOCK(unix_table_lock); -static atomic_t unix_nr_socks = ATOMIC_INIT(0); +static atomic_long_t unix_nr_socks; #define unix_sockets_unbound (&unix_socket_table[UNIX_HASH_SIZE]) @@ -360,13 +360,13 @@ static void unix_sock_destructor(struct sock *sk) if (u->addr) unix_release_addr(u->addr); - atomic_dec(&unix_nr_socks); + atomic_long_dec(&unix_nr_socks); local_bh_disable(); sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); local_bh_enable(); #ifdef UNIX_REFCNT_DEBUG - printk(KERN_DEBUG "UNIX %p is destroyed, %d are still alive.\n", sk, - atomic_read(&unix_nr_socks)); + printk(KERN_DEBUG "UNIX %p is destroyed, %ld are still alive.\n", sk, + atomic_long_read(&unix_nr_socks)); #endif } @@ -606,8 +606,8 @@ static struct sock *unix_create1(struct net *net, struct socket *sock) struct sock *sk = NULL; struct unix_sock *u; - atomic_inc(&unix_nr_socks); - if (atomic_read(&unix_nr_socks) > 2 * get_max_files()) + atomic_long_inc(&unix_nr_socks); + if (atomic_long_read(&unix_nr_socks) > 2 * get_max_files()) goto out; sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_proto); @@ -632,7 +632,7 @@ static struct sock *unix_create1(struct net *net, struct socket *sock) unix_insert_socket(unix_sockets_unbound, sk); out: if (sk == NULL) - atomic_dec(&unix_nr_socks); + atomic_long_dec(&unix_nr_socks); else { local_bh_disable(); sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); @@ -1343,9 +1343,25 @@ static void unix_destruct_scm(struct sk_buff *skb) sock_wfree(skb); } +#define MAX_RECURSION_LEVEL 4 + static int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb) { int i; + unsigned char max_level = 0; + int unix_sock_count = 0; + + for (i = scm->fp->count - 1; i >= 0; i--) { + struct sock *sk = unix_get_socket(scm->fp->fp[i]); + + if (sk) { + unix_sock_count++; + max_level = max(max_level, + unix_sk(sk)->recursion_level); + } + } + if (unlikely(max_level > MAX_RECURSION_LEVEL)) + return -ETOOMANYREFS; /* * Need to duplicate file references for the sake of garbage @@ -1356,9 +1372,11 @@ static int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb) if (!UNIXCB(skb).fp) return -ENOMEM; - for (i = scm->fp->count-1; i >= 0; i--) - unix_inflight(scm->fp->fp[i]); - return 0; + if (unix_sock_count) { + for (i = scm->fp->count - 1; i >= 0; i--) + unix_inflight(scm->fp->fp[i]); + } + return max_level; } static int unix_scm_to_skb(struct scm_cookie *scm, struct sk_buff *skb, bool send_fds) @@ -1393,6 +1411,7 @@ static int unix_dgram_sendmsg(struct kiocb *kiocb, struct socket *sock, struct sk_buff *skb; long timeo; struct scm_cookie tmp_scm; + int max_level; if (NULL == siocb->scm) siocb->scm = &tmp_scm; @@ -1431,8 +1450,9 @@ static int unix_dgram_sendmsg(struct kiocb *kiocb, struct socket *sock, goto out; err = unix_scm_to_skb(siocb->scm, skb, true); - if (err) + if (err < 0) goto out_free; + max_level = err + 1; unix_get_secdata(siocb->scm, skb); skb_reset_transport_header(skb); @@ -1514,6 +1534,8 @@ restart: if (sock_flag(other, SOCK_RCVTSTAMP)) __net_timestamp(skb); skb_queue_tail(&other->sk_receive_queue, skb); + if (max_level > unix_sk(other)->recursion_level) + unix_sk(other)->recursion_level = max_level; unix_state_unlock(other); other->sk_data_ready(other, len); sock_put(other); @@ -1544,6 +1566,7 @@ static int unix_stream_sendmsg(struct kiocb *kiocb, struct socket *sock, int sent = 0; struct scm_cookie tmp_scm; bool fds_sent = false; + int max_level; if (NULL == siocb->scm) siocb->scm = &tmp_scm; @@ -1607,10 +1630,11 @@ static int unix_stream_sendmsg(struct kiocb *kiocb, struct socket *sock, /* Only send the fds in the first buffer */ err = unix_scm_to_skb(siocb->scm, skb, !fds_sent); - if (err) { + if (err < 0) { kfree_skb(skb); goto out_err; } + max_level = err + 1; fds_sent = true; err = memcpy_fromiovec(skb_put(skb, size), msg->msg_iov, size); @@ -1626,6 +1650,8 @@ static int unix_stream_sendmsg(struct kiocb *kiocb, struct socket *sock, goto pipe_err_free; skb_queue_tail(&other->sk_receive_queue, skb); + if (max_level > unix_sk(other)->recursion_level) + unix_sk(other)->recursion_level = max_level; unix_state_unlock(other); other->sk_data_ready(other, size); sent += size; @@ -1845,6 +1871,7 @@ static int unix_stream_recvmsg(struct kiocb *iocb, struct socket *sock, unix_state_lock(sk); skb = skb_dequeue(&sk->sk_receive_queue); if (skb == NULL) { + unix_sk(sk)->recursion_level = 0; if (copied >= target) goto unlock; diff --git a/net/unix/garbage.c b/net/unix/garbage.c index c8df6fda0b1f..f89f83bf828e 100644 --- a/net/unix/garbage.c +++ b/net/unix/garbage.c @@ -96,7 +96,7 @@ static DECLARE_WAIT_QUEUE_HEAD(unix_gc_wait); unsigned int unix_tot_inflight; -static struct sock *unix_get_socket(struct file *filp) +struct sock *unix_get_socket(struct file *filp) { struct sock *u_sock = NULL; struct inode *inode = filp->f_path.dentry->d_inode; @@ -259,9 +259,16 @@ static void inc_inflight_move_tail(struct unix_sock *u) } static bool gc_in_progress = false; +#define UNIX_INFLIGHT_TRIGGER_GC 16000 void wait_for_unix_gc(void) { + /* + * If number of inflight sockets is insane, + * force a garbage collect right now. + */ + if (unix_tot_inflight > UNIX_INFLIGHT_TRIGGER_GC && !gc_in_progress) + unix_gc(); wait_event(unix_gc_wait, gc_in_progress == false); } diff --git a/net/x25/x25_facilities.c b/net/x25/x25_facilities.c index 771bab00754b..55187c8f6420 100644 --- a/net/x25/x25_facilities.c +++ b/net/x25/x25_facilities.c @@ -61,6 +61,8 @@ int x25_parse_facilities(struct sk_buff *skb, struct x25_facilities *facilities, while (len > 0) { switch (*p & X25_FAC_CLASS_MASK) { case X25_FAC_CLASS_A: + if (len < 2) + return 0; switch (*p) { case X25_FAC_REVERSE: if((p[1] & 0x81) == 0x81) { @@ -104,6 +106,8 @@ int x25_parse_facilities(struct sk_buff *skb, struct x25_facilities *facilities, len -= 2; break; case X25_FAC_CLASS_B: + if (len < 3) + return 0; switch (*p) { case X25_FAC_PACKET_SIZE: facilities->pacsize_in = p[1]; @@ -125,6 +129,8 @@ int x25_parse_facilities(struct sk_buff *skb, struct x25_facilities *facilities, len -= 3; break; case X25_FAC_CLASS_C: + if (len < 4) + return 0; printk(KERN_DEBUG "X.25: unknown facility %02X, " "values %02X, %02X, %02X\n", p[0], p[1], p[2], p[3]); @@ -132,26 +138,26 @@ int x25_parse_facilities(struct sk_buff *skb, struct x25_facilities *facilities, len -= 4; break; case X25_FAC_CLASS_D: + if (len < p[1] + 2) + return 0; switch (*p) { case X25_FAC_CALLING_AE: - if (p[1] > X25_MAX_DTE_FACIL_LEN) - break; + if (p[1] > X25_MAX_DTE_FACIL_LEN || p[1] <= 1) + return 0; dte_facs->calling_len = p[2]; memcpy(dte_facs->calling_ae, &p[3], p[1] - 1); *vc_fac_mask |= X25_MASK_CALLING_AE; break; case X25_FAC_CALLED_AE: - if (p[1] > X25_MAX_DTE_FACIL_LEN) - break; + if (p[1] > X25_MAX_DTE_FACIL_LEN || p[1] <= 1) + return 0; dte_facs->called_len = p[2]; memcpy(dte_facs->called_ae, &p[3], p[1] - 1); *vc_fac_mask |= X25_MASK_CALLED_AE; break; default: printk(KERN_DEBUG "X.25: unknown facility %02X," - "length %d, values %02X, %02X, " - "%02X, %02X\n", - p[0], p[1], p[2], p[3], p[4], p[5]); + "length %d\n", p[0], p[1]); break; } len -= p[1] + 2; diff --git a/net/x25/x25_in.c b/net/x25/x25_in.c index 63178961efac..f729f022be69 100644 --- a/net/x25/x25_in.c +++ b/net/x25/x25_in.c @@ -119,6 +119,8 @@ static int x25_state1_machine(struct sock *sk, struct sk_buff *skb, int frametyp &x25->vc_facil_mask); if (len > 0) skb_pull(skb, len); + else + return -1; /* * Copy any Call User Data. */ diff --git a/net/xfrm/xfrm_hash.c b/net/xfrm/xfrm_hash.c index a2023ec52329..1e98bc0fe0a5 100644 --- a/net/xfrm/xfrm_hash.c +++ b/net/xfrm/xfrm_hash.c @@ -19,7 +19,7 @@ struct hlist_head *xfrm_hash_alloc(unsigned int sz) if (sz <= PAGE_SIZE) n = kzalloc(sz, GFP_KERNEL); else if (hashdist) - n = __vmalloc(sz, GFP_KERNEL | __GFP_ZERO, PAGE_KERNEL); + n = vzalloc(sz); else n = (struct hlist_head *) __get_free_pages(GFP_KERNEL | __GFP_NOWARN | __GFP_ZERO, |