From e0ab53deaa91293a7958d63d5a2cf4c5645ad6f0 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 27 Jul 2006 17:22:50 -0400 Subject: RPC: Ensure that we disconnect TCP socket when client requests error out If we're part way through transmitting a TCP request, and the client errors, then we need to disconnect and reconnect the TCP socket in order to avoid confusing the server. Signed-off-by: Trond Myklebust (cherry picked from 031a50c8b9ea82616abd4a4e18021a25848941ce commit) --- net/sunrpc/clnt.c | 52 ++++++++++++++++++++++++++++----------------------- net/sunrpc/xprt.c | 21 +++------------------ net/sunrpc/xprtsock.c | 29 +++++++++++++++++++++++++++- 3 files changed, 60 insertions(+), 42 deletions(-) (limited to 'net') diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index 4ba271f892c8..d6409e757219 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -921,26 +921,43 @@ call_transmit(struct rpc_task *task) task->tk_status = xprt_prepare_transmit(task); if (task->tk_status != 0) return; + task->tk_action = call_transmit_status; /* Encode here so that rpcsec_gss can use correct sequence number. */ if (rpc_task_need_encode(task)) { - task->tk_rqstp->rq_bytes_sent = 0; + BUG_ON(task->tk_rqstp->rq_bytes_sent != 0); call_encode(task); /* Did the encode result in an error condition? */ if (task->tk_status != 0) - goto out_nosend; + return; } - task->tk_action = call_transmit_status; xprt_transmit(task); if (task->tk_status < 0) return; - if (!task->tk_msg.rpc_proc->p_decode) { - task->tk_action = rpc_exit_task; - rpc_wake_up_task(task); - } - return; -out_nosend: - /* release socket write lock before attempting to handle error */ - xprt_abort_transmit(task); + /* + * On success, ensure that we call xprt_end_transmit() before sleeping + * in order to allow access to the socket to other RPC requests. + */ + call_transmit_status(task); + if (task->tk_msg.rpc_proc->p_decode != NULL) + return; + task->tk_action = rpc_exit_task; + rpc_wake_up_task(task); +} + +/* + * 5a. Handle cleanup after a transmission + */ +static void +call_transmit_status(struct rpc_task *task) +{ + task->tk_action = call_status; + /* + * Special case: if we've been waiting on the socket's write_space() + * callback, then don't call xprt_end_transmit(). + */ + if (task->tk_status == -EAGAIN) + return; + xprt_end_transmit(task); rpc_task_force_reencode(task); } @@ -992,18 +1009,7 @@ call_status(struct rpc_task *task) } /* - * 6a. Handle transmission errors. - */ -static void -call_transmit_status(struct rpc_task *task) -{ - if (task->tk_status != -EAGAIN) - rpc_task_force_reencode(task); - call_status(task); -} - -/* - * 6b. Handle RPC timeout + * 6a. Handle RPC timeout * We do not release the request slot, so we keep using the * same XID for all retransmits. */ diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index 313b68d892c6..e8c2bc4977f3 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -707,12 +707,9 @@ out_unlock: return err; } -void -xprt_abort_transmit(struct rpc_task *task) +void xprt_end_transmit(struct rpc_task *task) { - struct rpc_xprt *xprt = task->tk_xprt; - - xprt_release_write(xprt, task); + xprt_release_write(task->tk_xprt, task); } /** @@ -761,8 +758,6 @@ void xprt_transmit(struct rpc_task *task) task->tk_status = -ENOTCONN; else if (!req->rq_received) rpc_sleep_on(&xprt->pending, task, NULL, xprt_timer); - - xprt->ops->release_xprt(xprt, task); spin_unlock_bh(&xprt->transport_lock); return; } @@ -772,18 +767,8 @@ void xprt_transmit(struct rpc_task *task) * schedq, and being picked up by a parallel run of rpciod(). */ task->tk_status = status; - - switch (status) { - case -ECONNREFUSED: + if (status == -ECONNREFUSED) rpc_sleep_on(&xprt->sending, task, NULL, NULL); - case -EAGAIN: - case -ENOTCONN: - return; - default: - break; - } - xprt_release_write(xprt, task); - return; } static inline void do_xprt_reserve(struct rpc_task *task) diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index ee678ed13b6f..441bd53f5eca 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -413,6 +413,33 @@ static int xs_tcp_send_request(struct rpc_task *task) return status; } +/** + * xs_tcp_release_xprt - clean up after a tcp transmission + * @xprt: transport + * @task: rpc task + * + * This cleans up if an error causes us to abort the transmission of a request. + * In this case, the socket may need to be reset in order to avoid confusing + * the server. + */ +static void xs_tcp_release_xprt(struct rpc_xprt *xprt, struct rpc_task *task) +{ + struct rpc_rqst *req; + + if (task != xprt->snd_task) + return; + if (task == NULL) + goto out_release; + req = task->tk_rqstp; + if (req->rq_bytes_sent == 0) + goto out_release; + if (req->rq_bytes_sent == req->rq_snd_buf.len) + goto out_release; + set_bit(XPRT_CLOSE_WAIT, &task->tk_xprt->state); +out_release: + xprt_release_xprt(xprt, task); +} + /** * xs_close - close a socket * @xprt: transport @@ -1250,7 +1277,7 @@ static struct rpc_xprt_ops xs_udp_ops = { static struct rpc_xprt_ops xs_tcp_ops = { .reserve_xprt = xprt_reserve_xprt, - .release_xprt = xprt_release_xprt, + .release_xprt = xs_tcp_release_xprt, .set_port = xs_set_port, .connect = xs_connect, .buf_alloc = rpc_malloc, -- cgit v1.2.3 From 5c3e985a2c1908aa97221d3806f85ce7e2fbfa88 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sat, 29 Jul 2006 17:37:40 -0400 Subject: SUNRPC: Fix obvious refcounting bugs in rpc_pipefs. Doh! Signed-off-by: Trond Myklebust (cherry picked from 496f408f2f0e7ee5481a7c2222189be6c4f5aa6c commit) --- net/sunrpc/rpc_pipe.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c index dc6cb93c8830..a3bd2db2e024 100644 --- a/net/sunrpc/rpc_pipe.c +++ b/net/sunrpc/rpc_pipe.c @@ -667,10 +667,11 @@ rpc_mkdir(char *path, struct rpc_clnt *rpc_client) RPCAUTH_info, RPCAUTH_EOF); if (error) goto err_depopulate; + dget(dentry); out: mutex_unlock(&dir->i_mutex); rpc_release_path(&nd); - return dget(dentry); + return dentry; err_depopulate: rpc_depopulate(dentry); __rpc_rmdir(dir, dentry); @@ -731,10 +732,11 @@ rpc_mkpipe(char *path, void *private, struct rpc_pipe_ops *ops, int flags) rpci->flags = flags; rpci->ops = ops; inode_dir_notify(dir, DN_CREATE); + dget(dentry); out: mutex_unlock(&dir->i_mutex); rpc_release_path(&nd); - return dget(dentry); + return dentry; err_dput: dput(dentry); dentry = ERR_PTR(-ENOMEM); -- cgit v1.2.3 From bea1b42e1bb184cb75e6bbd95c83e4478dde4ab9 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Thu, 3 Aug 2006 16:24:02 -0700 Subject: [BRIDGE]: netlink status fix Fix code that passes back netlink status messages about bridge changes. Submitted by Aji_Srinivas@emc.com Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- net/bridge/br_netlink.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c index 06abb6634f5b..53086fb75089 100644 --- a/net/bridge/br_netlink.c +++ b/net/bridge/br_netlink.c @@ -85,7 +85,7 @@ void br_ifinfo_notify(int event, struct net_bridge_port *port) goto err_out; err = br_fill_ifinfo(skb, port, current->pid, 0, event, 0); - if (err) + if (err < 0) goto err_kfree; NETLINK_CB(skb).dst_group = RTNLGRP_LINK; -- cgit v1.2.3 From b9e2cc0f0e47ad351349156018ef8a365e9c6d25 Mon Sep 17 00:00:00 2001 From: Jamal Hadi Salim Date: Thu, 3 Aug 2006 16:36:51 -0700 Subject: [PKT_SCHED]: Return ENOENT if qdisc module is unavailable Return ENOENT if qdisc module is unavailable Signed-off-by: Jamal Hadi Salim Signed-off-by: David S. Miller --- net/sched/sch_api.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index c7844bacbbcb..a19eff12cf78 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -430,7 +430,7 @@ qdisc_create(struct net_device *dev, u32 handle, struct rtattr **tca, int *errp) } #endif - err = -EINVAL; + err = -ENOENT; if (ops == NULL) goto err_out; -- cgit v1.2.3 From 30a584d944fbd599d4a8f470f75bf7af1a15b466 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Thu, 3 Aug 2006 16:38:49 -0700 Subject: [LLX]: SOCK_DGRAM interface fixes The datagram interface of LLC is broken in a couple of ways. These were discovered when trying to use it to build an out-of-kernel version of STP. First it didn't pass the source address of the received packet in recvfrom(). It needs to copy the source address of received LLC packets into the socket control block. At the same time fix a security issue because there was uninitialized data leakage. Every recvfrom call was just copying out old data. Second, LLC should not merge multiple packets in one receive call on datagram sockets. LLC should preserve packet boundaries on SOCK_DGRAM. This fix goes against the old historical comments about UNIX98 semantics but without this fix SOCK_DGRAM is broken and useless. So either ANK's interpretation was incorect or UNIX98 standard was wrong. Signed-off-by: Stephen Hemminger Acked-by: Arnaldo Carvalho de Melo Signed-off-by: David S. Miller --- net/llc/af_llc.c | 20 ++++++++------------ net/llc/llc_sap.c | 4 ++-- 2 files changed, 10 insertions(+), 14 deletions(-) (limited to 'net') diff --git a/net/llc/af_llc.c b/net/llc/af_llc.c index d6cfe84d521b..2652ead96c64 100644 --- a/net/llc/af_llc.c +++ b/net/llc/af_llc.c @@ -784,24 +784,20 @@ static int llc_ui_recvmsg(struct kiocb *iocb, struct socket *sock, copied += used; len -= used; - if (used + offset < skb->len) - continue; - if (!(flags & MSG_PEEK)) { sk_eat_skb(sk, skb, 0); *seq = 0; } + + /* For non stream protcols we get one packet per recvmsg call */ + if (sk->sk_type != SOCK_STREAM) + goto copy_uaddr; + + /* Partial read */ + if (used + offset < skb->len) + continue; } while (len > 0); - /* - * According to UNIX98, msg_name/msg_namelen are ignored - * on connected socket. -ANK - * But... af_llc still doesn't have separate sets of methods for - * SOCK_DGRAM and SOCK_STREAM :-( So we have to do this test, will - * eventually fix this tho :-) -acme - */ - if (sk->sk_type == SOCK_DGRAM) - goto copy_uaddr; out: release_sock(sk); return copied; diff --git a/net/llc/llc_sap.c b/net/llc/llc_sap.c index 20c4eb5c1ac6..42eb0c3a9780 100644 --- a/net/llc/llc_sap.c +++ b/net/llc/llc_sap.c @@ -51,10 +51,10 @@ void llc_save_primitive(struct sock *sk, struct sk_buff* skb, u8 prim) { struct sockaddr_llc *addr; - if (skb->sk->sk_type == SOCK_STREAM) /* See UNIX98 */ - return; /* save primitive for use by the user. */ addr = llc_ui_skb_cb(skb); + + memset(addr, 0, sizeof(*addr)); addr->sllc_family = sk->sk_family; addr->sllc_arphrd = skb->dev->type; addr->sllc_test = prim == LLC_TEST_PRIM; -- cgit v1.2.3 From d254bcdbf2199d9e2a52dbe4592e79ef3a456096 Mon Sep 17 00:00:00 2001 From: Ilpo Järvinen Date: Fri, 4 Aug 2006 16:57:42 -0700 Subject: [TCP]: Fixes IW > 2 cases when TCP is application limited MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Whenever a transfer is application limited, we are allowed at least initial window worth of data per window unless cwnd is previously less than that. Signed-off-by: Ilpo Järvinen Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 738dad9f7d49..104af5d5bcbc 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -3541,7 +3541,8 @@ void tcp_cwnd_application_limited(struct sock *sk) if (inet_csk(sk)->icsk_ca_state == TCP_CA_Open && sk->sk_socket && !test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) { /* Limited by application or receiver window. */ - u32 win_used = max(tp->snd_cwnd_used, 2U); + u32 init_win = tcp_init_cwnd(tp, __sk_dst_get(sk)); + u32 win_used = max(tp->snd_cwnd_used, init_win); if (win_used < tp->snd_cwnd) { tp->snd_ssthresh = tcp_current_ssthresh(sk); tp->snd_cwnd = (tp->snd_cwnd + win_used) >> 1; -- cgit v1.2.3 From 558e10a57db10de355ee97712d2b6df49e9b7849 Mon Sep 17 00:00:00 2001 From: Diego Calleja Date: Sat, 5 Aug 2006 21:15:58 -0700 Subject: [LAPB]: Fix windowsize check In bug #6954, Norbert Reinartz reported the following issue: "Function lapb_setparms() in file net/lapb/lapb_iface.c checks if the given parameters are valid. If the given window size is in the range of 8 .. 127, lapb_setparms() fails and returns an error value of LAPB_INVALUE, even if bit LAPB_EXTENDED in parms->mode is set. If bit LAPB_EXTENDED in parms->mode is set and the window size is in the range of 8 .. 127, the first check "(parms->mode & LAPB_EXTENDED)" results true and the second check "(parms->window < 1 || parms->window > 127)" results false. Both checks in conjunction result to false, thus the third check "(parms->window < 1 || parms->window > 7)" is done by fault. This third check results true, so that we leave lapb_setparms() by 'goto out_put'. Seems that this bug doesn't cause any problems, because lapb_setparms() isn't used to change the default values of LAPB. We are using kernel lapb in our software project and also change the default parameters of lapb, so we found this bug" He also pasted a fix, that I've transformated into a patch: Signed-off-by: Diego Calleja Signed-off-by: David S. Miller --- net/lapb/lapb_iface.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/lapb/lapb_iface.c b/net/lapb/lapb_iface.c index d504eed416f6..7e6bc41eeb21 100644 --- a/net/lapb/lapb_iface.c +++ b/net/lapb/lapb_iface.c @@ -238,11 +238,13 @@ int lapb_setparms(struct net_device *dev, struct lapb_parms_struct *parms) goto out_put; if (lapb->state == LAPB_STATE_0) { - if (((parms->mode & LAPB_EXTENDED) && - (parms->window < 1 || parms->window > 127)) || - (parms->window < 1 || parms->window > 7)) - goto out_put; - + if (parms->mode & LAPB_EXTENDED) { + if (parms->window < 1 || parms->window > 127) + goto out_put; + } else { + if (parms->window < 1 || parms->window > 7) + goto out_put; + } lapb->mode = parms->mode; lapb->window = parms->window; } -- cgit v1.2.3 From 2f34931fdc78b4895553aaa33748939cc7697c99 Mon Sep 17 00:00:00 2001 From: Neil Brown Date: Sat, 5 Aug 2006 12:14:29 -0700 Subject: [PATCH] knfsd: fix race related problem when adding items to and svcrpc auth cache If we don't find the item we are lookng for, we allocate a new one, and then grab the lock again and search to see if it has been added while we did the alloc. If it had been added we need to 'cache_put' the newly created item that we are never going to use. But as it hasn't been initialised properly, putting it can cause an oops. So move the ->init call earlier to that it will always be fully initilised if we have to put it. Thanks to Philipp Matthias Hahn for reporting the problem. Signed-off-by: Neil Brown Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- net/sunrpc/cache.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/sunrpc/cache.c b/net/sunrpc/cache.c index 7026b0866b7b..00cb388ece03 100644 --- a/net/sunrpc/cache.c +++ b/net/sunrpc/cache.c @@ -71,7 +71,12 @@ struct cache_head *sunrpc_cache_lookup(struct cache_detail *detail, new = detail->alloc(); if (!new) return NULL; + /* must fully initialise 'new', else + * we might get lose if we need to + * cache_put it soon. + */ cache_init(new); + detail->init(new, key); write_lock(&detail->hash_lock); @@ -85,7 +90,6 @@ struct cache_head *sunrpc_cache_lookup(struct cache_detail *detail, return tmp; } } - detail->init(new, key); new->next = *head; *head = new; detail->entries++; -- cgit v1.2.3 From 7b2e497a06c0e93719fda88820e057b635e8fae2 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 7 Aug 2006 16:09:04 -0700 Subject: [NET]: Assign skb->dev in netdev_alloc_skb Signed-off-by: Christoph Hellwig Signed-off-by: David S. Miller --- net/core/skbuff.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 022d8894c11d..c54f3664bce5 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -268,8 +268,10 @@ struct sk_buff *__netdev_alloc_skb(struct net_device *dev, struct sk_buff *skb; skb = alloc_skb(length + NET_SKB_PAD, gfp_mask); - if (likely(skb)) + if (likely(skb)) { skb_reserve(skb, NET_SKB_PAD); + skb->dev = dev; + } return skb; } -- cgit v1.2.3 From 8b5cc5ef40c83c6ea4c90b203bb2c8b17edfa11b Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Mon, 7 Aug 2006 20:09:20 -0700 Subject: [IPX]: Header length validation needed This patch will linearize and check there is enough data. It handles the pprop case as well as avoiding a whole audit of the routing code. Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- net/ipx/af_ipx.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipx/af_ipx.c b/net/ipx/af_ipx.c index aa34ff4b707c..c13e86b14f69 100644 --- a/net/ipx/af_ipx.c +++ b/net/ipx/af_ipx.c @@ -1646,7 +1646,8 @@ static int ipx_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_ty ipx_pktsize = ntohs(ipx->ipx_pktsize); /* Too small or invalid header? */ - if (ipx_pktsize < sizeof(struct ipxhdr) || ipx_pktsize > skb->len) + if (ipx_pktsize < sizeof(struct ipxhdr) || + !pskb_may_pull(skb, ipx_pktsize)) goto drop; if (ipx->ipx_checksum != IPX_NO_CHECKSUM && -- cgit v1.2.3 From 8d1502de27c46b365b5c86e17d173083d3d6c9ac Mon Sep 17 00:00:00 2001 From: Kirill Korotaev Date: Mon, 7 Aug 2006 20:44:22 -0700 Subject: [IPV4]: Limit rt cache size properly. From: Kirill Korotaev During OpenVZ stress testing we found that UDP traffic with random src can generate too much excessive rt hash growing leading finally to OOM and kernel panics. It was found that for 4GB i686 system (having 1048576 total pages and 225280 normal zone pages) kernel allocates the following route hash: syslog: IP route cache hash table entries: 262144 (order: 8, 1048576 bytes) => ip_rt_max_size = 4194304 entries, i.e. max rt size is 4194304 * 256b = 1Gb of RAM > normal_zone Attached the patch which removes HASH_HIGHMEM flag from alloc_large_system_hash() call. Signed-off-by: David S. Miller --- net/ipv4/route.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 19bd49d69d9f..b873cbcdd0b8 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -3157,7 +3157,7 @@ int __init ip_rt_init(void) rhash_entries, (num_physpages >= 128 * 1024) ? 15 : 17, - HASH_HIGHMEM, + 0, &rt_hash_log, &rt_hash_mask, 0); -- cgit v1.2.3 From aaf580601ff244df82324fff380ed6740f27ef03 Mon Sep 17 00:00:00 2001 From: Chen-Li Tien Date: Mon, 7 Aug 2006 20:49:07 -0700 Subject: [PKTGEN]: Fix oops when used with balance-tlb bonding Signed-off-by: Chen-Li Tien Signed-off-by: David S. Miller --- net/core/pktgen.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/core/pktgen.c b/net/core/pktgen.c index 67ed14ddabd2..b1743375eb6d 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -2149,6 +2149,8 @@ static struct sk_buff *fill_packet_ipv4(struct net_device *odev, skb->mac.raw = ((u8 *) iph) - 14 - pkt_dev->nr_labels*sizeof(u32); skb->dev = odev; skb->pkt_type = PACKET_HOST; + skb->nh.iph = iph; + skb->h.uh = udph; if (pkt_dev->nfrags <= 0) pgh = (struct pktgen_hdr *)skb_put(skb, datalen); -- cgit v1.2.3 From 69d8c28c9578ce78b3dc1b9be36926d962282898 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Mon, 7 Aug 2006 20:52:10 -0700 Subject: [PKTGEN]: Make sure skb->{nh,h} are initialized in fill_packet_ipv6() too. Mirror the bug fix from fill_packet_ipv4() Signed-off-by: David S. Miller --- net/core/pktgen.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/core/pktgen.c b/net/core/pktgen.c index b1743375eb6d..6a7320b39ed0 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -2462,6 +2462,8 @@ static struct sk_buff *fill_packet_ipv6(struct net_device *odev, skb->protocol = protocol; skb->dev = odev; skb->pkt_type = PACKET_HOST; + skb->nh.ipv6h = iph; + skb->h.uh = udph; if (pkt_dev->nfrags <= 0) pgh = (struct pktgen_hdr *)skb_put(skb, datalen); -- cgit v1.2.3 From bd37a088596ccdb2b2dd3299e25e333bca7a9a34 Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Mon, 7 Aug 2006 21:04:15 -0700 Subject: [TCP]: SNMPv2 tcpOutSegs counter error Do not count retransmitted segments. Signed-off-by: Wei Yongjun Signed-off-by: David S. Miller --- net/ipv4/tcp_output.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 5c08ea20a18d..507adefbc17c 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -466,7 +466,8 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, if (skb->len != tcp_header_size) tcp_event_data_sent(tp, skb, sk); - TCP_INC_STATS(TCP_MIB_OUTSEGS); + if (after(tcb->end_seq, tp->snd_nxt) || tcb->seq == tcb->end_seq) + TCP_INC_STATS(TCP_MIB_OUTSEGS); err = icsk->icsk_af_ops->queue_xmit(skb, 0); if (likely(err <= 0)) @@ -2157,10 +2158,9 @@ int tcp_connect(struct sock *sk) skb_shinfo(buff)->gso_size = 0; skb_shinfo(buff)->gso_type = 0; buff->csum = 0; + tp->snd_nxt = tp->write_seq; TCP_SKB_CB(buff)->seq = tp->write_seq++; TCP_SKB_CB(buff)->end_seq = tp->write_seq; - tp->snd_nxt = tp->write_seq; - tp->pushed_seq = tp->write_seq; /* Send it off. */ TCP_SKB_CB(buff)->when = tcp_time_stamp; @@ -2170,6 +2170,12 @@ int tcp_connect(struct sock *sk) sk_charge_skb(sk, buff); tp->packets_out += tcp_skb_pcount(buff); tcp_transmit_skb(sk, buff, 1, GFP_KERNEL); + + /* We change tp->snd_nxt after the tcp_transmit_skb() call + * in order to make this packet get counted in tcpOutSegs. + */ + tp->snd_nxt = tp->write_seq; + tp->pushed_seq = tp->write_seq; TCP_INC_STATS(TCP_MIB_ACTIVEOPENS); /* Timer for repeating the SYN until an answer. */ -- cgit v1.2.3 From 70f8e78e150425b01c1099087ad3decacf7e4ccf Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 8 Aug 2006 16:47:37 -0700 Subject: [RTNETLINK]: Fix IFLA_ADDRESS handling. The ->set_mac_address handlers expect a pointer to a sockaddr which contains the MAC address, whereas IFLA_ADDRESS provides just the MAC address itself. So whip up a sockaddr to wrap around the netlink attribute for the ->set_mac_address call. Signed-off-by: David S. Miller --- net/core/rtnetlink.c | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 20e5bb73f147..30cc1ba6ed5c 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -394,6 +394,9 @@ static int do_setlink(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) } if (ida[IFLA_ADDRESS - 1]) { + struct sockaddr *sa; + int len; + if (!dev->set_mac_address) { err = -EOPNOTSUPP; goto out; @@ -405,7 +408,17 @@ static int do_setlink(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) if (ida[IFLA_ADDRESS - 1]->rta_len != RTA_LENGTH(dev->addr_len)) goto out; - err = dev->set_mac_address(dev, RTA_DATA(ida[IFLA_ADDRESS - 1])); + len = sizeof(sa_family_t) + dev->addr_len; + sa = kmalloc(len, GFP_KERNEL); + if (!sa) { + err = -ENOMEM; + goto out; + } + sa->sa_family = dev->type; + memcpy(sa->sa_data, RTA_DATA(ida[IFLA_ADDRESS - 1]), + dev->addr_len); + err = dev->set_mac_address(dev, sa); + kfree(sa); if (err) goto out; send_addr_notify = 1; -- cgit v1.2.3 From 7b1ba8de569460894efa892457af7a37c0d574f9 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Tue, 8 Aug 2006 16:48:51 -0700 Subject: [IPX]: Another nonlinear receive fix Need to check some more cases in IPX receive. If the skb is purely fragments, the IPX header needs to be extracted. The function pskb_may_pull() may in theory invalidate all the pointers in the skb, so references to ipx header must be refreshed. Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- net/ipx/af_ipx.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipx/af_ipx.c b/net/ipx/af_ipx.c index c13e86b14f69..401964204866 100644 --- a/net/ipx/af_ipx.c +++ b/net/ipx/af_ipx.c @@ -1642,14 +1642,17 @@ static int ipx_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_ty if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL) goto out; - ipx = ipx_hdr(skb); - ipx_pktsize = ntohs(ipx->ipx_pktsize); + if (!pskb_may_pull(skb, sizeof(struct ipxhdr))) + goto drop; + + ipx_pktsize = ntohs(ipxhdr(skb)->ipx_pktsize); /* Too small or invalid header? */ if (ipx_pktsize < sizeof(struct ipxhdr) || !pskb_may_pull(skb, ipx_pktsize)) goto drop; + ipx = ipx_hdr(skb); if (ipx->ipx_checksum != IPX_NO_CHECKSUM && ipx->ipx_checksum != ipx_cksum(ipx, ipx_pktsize)) goto drop; -- cgit v1.2.3 From 7c91767a6b701543c93ebcd611dab61deff3dad1 Mon Sep 17 00:00:00 2001 From: Dmitry Mishin Date: Wed, 9 Aug 2006 02:25:54 -0700 Subject: [NET]: add_timer -> mod_timer() in dst_run_gc() Patch from Dmitry Mishin : Replace add_timer() by mod_timer() in dst_run_gc in order to avoid BUG message. CPU1 CPU2 dst_run_gc() entered dst_run_gc() entered spin_lock(&dst_lock) ..... del_timer(&dst_gc_timer) fail to get lock .... mod_timer() <--- puts timer back to the list add_timer(&dst_gc_timer) <--- BUG because timer is in list already. Found during OpenVZ internal testing. At first we thought that it is OpenVZ specific as we added dst_run_gc(0) call in dst_dev_event(), but as Alexey pointed to me it is possible to trigger this condition in mainstream kernel. F.e. timer has fired on CPU2, but the handler was preeempted by an irq before dst_lock is tried. Meanwhile, someone on CPU1 adds an entry to gc list and starts the timer. If CPU2 was preempted long enough, this timer can expire simultaneously with resuming timer handler on CPU1, arriving exactly to the situation described. Signed-off-by: Dmitry Mishin Signed-off-by: Kirill Korotaev Signed-off-by: Alexey Kuznetsov Signed-off-by: David S. Miller --- net/core/dst.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'net') diff --git a/net/core/dst.c b/net/core/dst.c index 470c05bc4cb2..1a5e49da0e77 100644 --- a/net/core/dst.c +++ b/net/core/dst.c @@ -95,12 +95,11 @@ static void dst_run_gc(unsigned long dummy) dst_gc_timer_inc = DST_GC_INC; dst_gc_timer_expires = DST_GC_MIN; } - dst_gc_timer.expires = jiffies + dst_gc_timer_expires; #if RT_CACHE_DEBUG >= 2 printk("dst_total: %d/%d %ld\n", atomic_read(&dst_total), delayed, dst_gc_timer_expires); #endif - add_timer(&dst_gc_timer); + mod_timer(&dst_gc_timer, jiffies + dst_gc_timer_expires); out: spin_unlock(&dst_lock); -- cgit v1.2.3 From 06aebfb7faa13258af5230ff3d1587ece6c0250e Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Wed, 9 Aug 2006 16:52:04 -0700 Subject: [IPV6]: The ifa lock is a BH lock The ifa lock is expected to be taken in BH context (by addrconf timers) so we must disable BH when accessing it from user context. Signed-off-by: Herbert Xu Signed-off-by: David S. Miller --- net/ipv6/addrconf.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 8ea1e36bf8eb..0c5042e7380d 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -1909,11 +1909,11 @@ static int inet6_addr_add(int ifindex, struct in6_addr *pfx, int plen, ifp = ipv6_add_addr(idev, pfx, plen, scope, ifa_flags); if (!IS_ERR(ifp)) { - spin_lock(&ifp->lock); + spin_lock_bh(&ifp->lock); ifp->valid_lft = valid_lft; ifp->prefered_lft = prefered_lft; ifp->tstamp = jiffies; - spin_unlock(&ifp->lock); + spin_unlock_bh(&ifp->lock); addrconf_dad_start(ifp, 0); in6_ifa_put(ifp); -- cgit v1.2.3 From fff642570dc47ab76491fe81ee6599269c4eb13e Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Wed, 9 Aug 2006 17:36:15 -0700 Subject: [IPX]: Fix typo, ipxhdr() --> ipx_hdr() Noticed by Dave Jones. Signed-off-by: David S. Miller --- net/ipx/af_ipx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipx/af_ipx.c b/net/ipx/af_ipx.c index 401964204866..bef3f61569f7 100644 --- a/net/ipx/af_ipx.c +++ b/net/ipx/af_ipx.c @@ -1645,7 +1645,7 @@ static int ipx_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_ty if (!pskb_may_pull(skb, sizeof(struct ipxhdr))) goto drop; - ipx_pktsize = ntohs(ipxhdr(skb)->ipx_pktsize); + ipx_pktsize = ntohs(ipx_hdr(skb)->ipx_pktsize); /* Too small or invalid header? */ if (ipx_pktsize < sizeof(struct ipxhdr) || -- cgit v1.2.3 From 18b6fe64d4d1f6e0a2c71429a5e5074f43e29203 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Sun, 13 Aug 2006 18:05:09 -0700 Subject: [TCP]: Fix botched memory leak fix to tcpprobe_read(). Somehow I clobbered James's original fix and only my subsequent compiler warning change went in for that changeset. Get the real fix in there. Noticed by Jesper Juhl. Signed-off-by: David S. Miller --- net/ipv4/tcp_probe.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/tcp_probe.c b/net/ipv4/tcp_probe.c index b3435324b573..dab37d2f65fc 100644 --- a/net/ipv4/tcp_probe.c +++ b/net/ipv4/tcp_probe.c @@ -130,11 +130,12 @@ static ssize_t tcpprobe_read(struct file *file, char __user *buf, error = wait_event_interruptible(tcpw.wait, __kfifo_len(tcpw.fifo) != 0); if (error) - return error; + goto out_free; cnt = kfifo_get(tcpw.fifo, tbuf, len); error = copy_to_user(buf, tbuf, cnt); +out_free: vfree(tbuf); return error ? error : cnt; -- cgit v1.2.3 From 97c802a113989800430a981b6f36b14c62163d37 Mon Sep 17 00:00:00 2001 From: Phil Oester Date: Sun, 13 Aug 2006 18:05:35 -0700 Subject: [NETFILTER]: xt_string: fix negation The xt_string match is broken with ! negation. This resolves a portion of netfilter bugzilla #497. Signed-off-by: Phil Oester Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller --- net/netfilter/xt_string.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/netfilter/xt_string.c b/net/netfilter/xt_string.c index d8e3891b5f8b..275330fcdaaa 100644 --- a/net/netfilter/xt_string.c +++ b/net/netfilter/xt_string.c @@ -37,7 +37,7 @@ static int match(const struct sk_buff *skb, return (skb_find_text((struct sk_buff *)skb, conf->from_offset, conf->to_offset, conf->config, &state) - != UINT_MAX) && !conf->invert; + != UINT_MAX) ^ conf->invert; } #define STRING_TEXT_PRIV(m) ((struct xt_string_info *) m) -- cgit v1.2.3 From 1c7628bd7a458faf7c96ef521f6d3a5ea9b106b8 Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Sun, 13 Aug 2006 18:06:02 -0700 Subject: [NETFILTER]: xt_hashlimit: fix limit off-by-one Hashlimit doesn't account for the first packet, which is inconsistent with the limit match. Reported by ryan.castellucci@gmail.com, netfilter bugzilla #500. Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller --- net/ipv4/netfilter/ipt_hashlimit.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/ipv4/netfilter/ipt_hashlimit.c b/net/ipv4/netfilter/ipt_hashlimit.c index 6b662449e825..3bd2368e1fc9 100644 --- a/net/ipv4/netfilter/ipt_hashlimit.c +++ b/net/ipv4/netfilter/ipt_hashlimit.c @@ -454,15 +454,12 @@ hashlimit_match(const struct sk_buff *skb, dh->rateinfo.credit_cap = user2credits(hinfo->cfg.avg * hinfo->cfg.burst); dh->rateinfo.cost = user2credits(hinfo->cfg.avg); - - spin_unlock_bh(&hinfo->lock); - return 1; + } else { + /* update expiration timeout */ + dh->expires = now + msecs_to_jiffies(hinfo->cfg.expire); + rateinfo_recalc(dh, now); } - /* update expiration timeout */ - dh->expires = now + msecs_to_jiffies(hinfo->cfg.expire); - - rateinfo_recalc(dh, now); if (dh->rateinfo.credit >= dh->rateinfo.cost) { /* We're underlimit. */ dh->rateinfo.credit -= dh->rateinfo.cost; -- cgit v1.2.3 From d49c73c729e2ef644558a1f441c044bfacdc9744 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Sun, 13 Aug 2006 18:55:53 -0700 Subject: [IPSEC]: Validate properly in xfrm_dst_check() If dst->obsolete is -1, this is a signal from the bundle creator that we want the XFRM dst and the dsts that it references to be validated on every use. I misunderstood this intention when I changed xfrm_dst_check() to always return NULL. Now, when we purge a dst entry, by running dst_free() on it. This will set the dst->obsolete to a positive integer, and we want to return NULL in that case so that the socket does a relookup for the route. Thus, if dst->obsolete<0, let stale_bundle() validate the state, else always return NULL. In general, we need to do things more intelligently here because we flush too much state during rule changes. Herbert Xu has some ideas wherein the key manager gives us some help in this area. We can also use smarter state management algorithms inside of the kernel as well. Signed-off-by: David S. Miller --- net/xfrm/xfrm_policy.c | 27 ++++++++++++++++++++++++--- 1 file changed, 24 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index f35bc676128c..3da67ca2c3ce 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -1134,12 +1134,33 @@ int __xfrm_route_forward(struct sk_buff *skb, unsigned short family) } EXPORT_SYMBOL(__xfrm_route_forward); +/* Optimize later using cookies and generation ids. */ + static struct dst_entry *xfrm_dst_check(struct dst_entry *dst, u32 cookie) { - /* If it is marked obsolete, which is how we even get here, - * then we have purged it from the policy bundle list and we - * did that for a good reason. + /* Code (such as __xfrm4_bundle_create()) sets dst->obsolete + * to "-1" to force all XFRM destinations to get validated by + * dst_ops->check on every use. We do this because when a + * normal route referenced by an XFRM dst is obsoleted we do + * not go looking around for all parent referencing XFRM dsts + * so that we can invalidate them. It is just too much work. + * Instead we make the checks here on every use. For example: + * + * XFRM dst A --> IPv4 dst X + * + * X is the "xdst->route" of A (X is also the "dst->path" of A + * in this example). If X is marked obsolete, "A" will not + * notice. That's what we are validating here via the + * stale_bundle() check. + * + * When a policy's bundle is pruned, we dst_free() the XFRM + * dst which causes it's ->obsolete field to be set to a + * positive non-zero integer. If an XFRM dst has been pruned + * like this, we want to force a new route lookup. */ + if (dst->obsolete < 0 && !stale_bundle(dst)) + return dst; + return NULL; } -- cgit v1.2.3 From 7ee66fcb94cb8be77d5f34cce7d315d11759f9c1 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Sun, 13 Aug 2006 18:56:26 -0700 Subject: [LLC]: multicast receive device match Fix from Aji_Srinivas@emc.com, STP packets are incorrectly received on all LLC datagram sockets, whichever interface they are bound to. The llc_sap datagram receive logic sends packets with a unicast destination MAC to one socket bound to that SAP and MAC, and multicast packets to all sockets bound to that SAP. STP packets are multicast, and we do need to know on which interface they were received. Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- net/llc/llc_sap.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net') diff --git a/net/llc/llc_sap.c b/net/llc/llc_sap.c index 42eb0c3a9780..61cb8cf7d153 100644 --- a/net/llc/llc_sap.c +++ b/net/llc/llc_sap.c @@ -330,6 +330,9 @@ static void llc_sap_mcast(struct llc_sap *sap, if (llc->laddr.lsap != laddr->lsap) continue; + if (llc->dev != skb->dev) + continue; + skb1 = skb_clone(skb, GFP_ATOMIC); if (!skb1) break; -- cgit v1.2.3 From 0eff66e625306a794ecba4b29ed12f7a147ce219 Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Sun, 13 Aug 2006 18:57:28 -0700 Subject: [NETFILTER]: {arp,ip,ip6}_tables: proper error recovery in init path Neither of {arp,ip,ip6}_tables cleans up behind itself when something goes wrong during initialization. Noticed by Rennie deGraaf Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller --- net/ipv4/netfilter/arp_tables.c | 27 ++++++++++++++++++++------- net/ipv4/netfilter/ip_tables.c | 33 +++++++++++++++++++++++++-------- net/ipv6/netfilter/ip6_tables.c | 34 +++++++++++++++++++++++++--------- 3 files changed, 70 insertions(+), 24 deletions(-) (limited to 'net') diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c index 80c73ca90116..df4854cf598b 100644 --- a/net/ipv4/netfilter/arp_tables.c +++ b/net/ipv4/netfilter/arp_tables.c @@ -1170,21 +1170,34 @@ static int __init arp_tables_init(void) { int ret; - xt_proto_init(NF_ARP); + ret = xt_proto_init(NF_ARP); + if (ret < 0) + goto err1; /* Noone else will be downing sem now, so we won't sleep */ - xt_register_target(&arpt_standard_target); - xt_register_target(&arpt_error_target); + ret = xt_register_target(&arpt_standard_target); + if (ret < 0) + goto err2; + ret = xt_register_target(&arpt_error_target); + if (ret < 0) + goto err3; /* Register setsockopt */ ret = nf_register_sockopt(&arpt_sockopts); - if (ret < 0) { - duprintf("Unable to register sockopts.\n"); - return ret; - } + if (ret < 0) + goto err4; printk("arp_tables: (C) 2002 David S. Miller\n"); return 0; + +err4: + xt_unregister_target(&arpt_error_target); +err3: + xt_unregister_target(&arpt_standard_target); +err2: + xt_proto_fini(NF_ARP); +err1: + return ret; } static void __exit arp_tables_fini(void) diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index fc5bdd5eb7d3..f316ff5fd8a6 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -2239,22 +2239,39 @@ static int __init ip_tables_init(void) { int ret; - xt_proto_init(AF_INET); + ret = xt_proto_init(AF_INET); + if (ret < 0) + goto err1; /* Noone else will be downing sem now, so we won't sleep */ - xt_register_target(&ipt_standard_target); - xt_register_target(&ipt_error_target); - xt_register_match(&icmp_matchstruct); + ret = xt_register_target(&ipt_standard_target); + if (ret < 0) + goto err2; + ret = xt_register_target(&ipt_error_target); + if (ret < 0) + goto err3; + ret = xt_register_match(&icmp_matchstruct); + if (ret < 0) + goto err4; /* Register setsockopt */ ret = nf_register_sockopt(&ipt_sockopts); - if (ret < 0) { - duprintf("Unable to register sockopts.\n"); - return ret; - } + if (ret < 0) + goto err5; printk("ip_tables: (C) 2000-2006 Netfilter Core Team\n"); return 0; + +err5: + xt_unregister_match(&icmp_matchstruct); +err4: + xt_unregister_target(&ipt_error_target); +err3: + xt_unregister_target(&ipt_standard_target); +err2: + xt_proto_fini(AF_INET); +err1: + return ret; } static void __exit ip_tables_fini(void) diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c index f26898b00347..c9d6b23cd3f7 100644 --- a/net/ipv6/netfilter/ip6_tables.c +++ b/net/ipv6/netfilter/ip6_tables.c @@ -1398,23 +1398,39 @@ static int __init ip6_tables_init(void) { int ret; - xt_proto_init(AF_INET6); + ret = xt_proto_init(AF_INET6); + if (ret < 0) + goto err1; /* Noone else will be downing sem now, so we won't sleep */ - xt_register_target(&ip6t_standard_target); - xt_register_target(&ip6t_error_target); - xt_register_match(&icmp6_matchstruct); + ret = xt_register_target(&ip6t_standard_target); + if (ret < 0) + goto err2; + ret = xt_register_target(&ip6t_error_target); + if (ret < 0) + goto err3; + ret = xt_register_match(&icmp6_matchstruct); + if (ret < 0) + goto err4; /* Register setsockopt */ ret = nf_register_sockopt(&ip6t_sockopts); - if (ret < 0) { - duprintf("Unable to register sockopts.\n"); - xt_proto_fini(AF_INET6); - return ret; - } + if (ret < 0) + goto err5; printk("ip6_tables: (C) 2000-2006 Netfilter Core Team\n"); return 0; + +err5: + xt_unregister_match(&icmp6_matchstruct); +err4: + xt_unregister_target(&ip6t_error_target); +err3: + xt_unregister_target(&ip6t_standard_target); +err2: + xt_proto_fini(AF_INET6); +err1: + return ret; } static void __exit ip6_tables_fini(void) -- cgit v1.2.3 From dcb7cd97f133f7cfbd181149a1e60215a869f895 Mon Sep 17 00:00:00 2001 From: Mark Huang Date: Sun, 13 Aug 2006 18:57:54 -0700 Subject: [NETFILTER]: ulog: fix panic on SMP kernels Fix kernel panic on various SMP machines. The culprit is a null ub->skb in ulog_send(). If ulog_timer() has already been scheduled on one CPU and is spinning on the lock, and ipt_ulog_packet() flushes the queue on another CPU by calling ulog_send() right before it exits, there will be no skbuff when ulog_timer() acquires the lock and calls ulog_send(). Cancelling the timer in ulog_send() doesn't help because it has already been scheduled and is running on the first CPU. Similar problem exists in ebt_ulog.c and nfnetlink_log.c. Signed-off-by: Mark Huang Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller --- net/bridge/netfilter/ebt_ulog.c | 3 +++ net/ipv4/netfilter/ipt_ULOG.c | 5 +++++ net/netfilter/nfnetlink_log.c | 3 +++ 3 files changed, 11 insertions(+) (limited to 'net') diff --git a/net/bridge/netfilter/ebt_ulog.c b/net/bridge/netfilter/ebt_ulog.c index 02693a230dc1..9f950db3b76f 100644 --- a/net/bridge/netfilter/ebt_ulog.c +++ b/net/bridge/netfilter/ebt_ulog.c @@ -74,6 +74,9 @@ static void ulog_send(unsigned int nlgroup) if (timer_pending(&ub->timer)) del_timer(&ub->timer); + if (!ub->skb) + return; + /* last nlmsg needs NLMSG_DONE */ if (ub->qlen > 1) ub->lastnlh->nlmsg_type = NLMSG_DONE; diff --git a/net/ipv4/netfilter/ipt_ULOG.c b/net/ipv4/netfilter/ipt_ULOG.c index d7dd7fe7051c..d46fd677fa11 100644 --- a/net/ipv4/netfilter/ipt_ULOG.c +++ b/net/ipv4/netfilter/ipt_ULOG.c @@ -115,6 +115,11 @@ static void ulog_send(unsigned int nlgroupnum) del_timer(&ub->timer); } + if (!ub->skb) { + DEBUGP("ipt_ULOG: ulog_send: nothing to send\n"); + return; + } + /* last nlmsg needs NLMSG_DONE */ if (ub->qlen > 1) ub->lastnlh->nlmsg_type = NLMSG_DONE; diff --git a/net/netfilter/nfnetlink_log.c b/net/netfilter/nfnetlink_log.c index 61cdda4e5d3b..b59d3b2bde21 100644 --- a/net/netfilter/nfnetlink_log.c +++ b/net/netfilter/nfnetlink_log.c @@ -366,6 +366,9 @@ __nfulnl_send(struct nfulnl_instance *inst) if (timer_pending(&inst->timer)) del_timer(&inst->timer); + if (!inst->skb) + return 0; + if (inst->qlen > 1) inst->lastnlh->nlmsg_type = NLMSG_DONE; -- cgit v1.2.3 From e9fa4f7bd291c29a785666e2fa5a9cf3241ee6c3 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Sun, 13 Aug 2006 20:12:58 -0700 Subject: [INET]: Use pskb_trim_unique when trimming paged unique skbs The IPv4/IPv6 datagram output path was using skb_trim to trim paged packets because they know that the packet has not been cloned yet (since the packet hasn't been given to anything else in the system). This broke because skb_trim no longer allows paged packets to be trimmed. Paged packets must be given to one of the pskb_trim functions instead. This patch adds a new pskb_trim_unique function to cover the IPv4/IPv6 datagram output path scenario and replaces the corresponding skb_trim calls with it. Signed-off-by: Herbert Xu Signed-off-by: David S. Miller --- include/linux/skbuff.h | 15 +++++++++++++++ net/ipv4/ip_output.c | 4 ++-- net/ipv6/ip6_output.c | 2 +- 3 files changed, 18 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 3573ba9a2555..755e9cddac47 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -1039,6 +1039,21 @@ static inline int pskb_trim(struct sk_buff *skb, unsigned int len) return (len < skb->len) ? __pskb_trim(skb, len) : 0; } +/** + * pskb_trim_unique - remove end from a paged unique (not cloned) buffer + * @skb: buffer to alter + * @len: new length + * + * This is identical to pskb_trim except that the caller knows that + * the skb is not cloned so we should never get an error due to out- + * of-memory. + */ +static inline void pskb_trim_unique(struct sk_buff *skb, unsigned int len) +{ + int err = pskb_trim(skb, len); + BUG_ON(err); +} + /** * skb_orphan - orphan a buffer * @skb: buffer to orphan diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 9bf307a29783..4c20f5546893 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -947,7 +947,7 @@ alloc_new_skb: skb_prev->csum = csum_sub(skb_prev->csum, skb->csum); data += fraggap; - skb_trim(skb_prev, maxfraglen); + pskb_trim_unique(skb_prev, maxfraglen); } copy = datalen - transhdrlen - fraggap; @@ -1142,7 +1142,7 @@ ssize_t ip_append_page(struct sock *sk, struct page *page, data, fraggap, 0); skb_prev->csum = csum_sub(skb_prev->csum, skb->csum); - skb_trim(skb_prev, maxfraglen); + pskb_trim_unique(skb_prev, maxfraglen); } /* diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 69451af6abe7..4fb47a252913 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -1095,7 +1095,7 @@ alloc_new_skb: skb_prev->csum = csum_sub(skb_prev->csum, skb->csum); data += fraggap; - skb_trim(skb_prev, maxfraglen); + pskb_trim_unique(skb_prev, maxfraglen); } copy = datalen - transhdrlen - fraggap; if (copy < 0) { -- cgit v1.2.3