From e0ab53deaa91293a7958d63d5a2cf4c5645ad6f0 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Thu, 27 Jul 2006 17:22:50 -0400
Subject: RPC: Ensure that we disconnect TCP socket when client requests error
 out

If we're part way through transmitting a TCP request, and the client
errors, then we need to disconnect and reconnect the TCP socket in order to
avoid confusing the server.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
(cherry picked from 031a50c8b9ea82616abd4a4e18021a25848941ce commit)
---
 net/sunrpc/clnt.c     | 52 ++++++++++++++++++++++++++++-----------------------
 net/sunrpc/xprt.c     | 21 +++------------------
 net/sunrpc/xprtsock.c | 29 +++++++++++++++++++++++++++-
 3 files changed, 60 insertions(+), 42 deletions(-)

(limited to 'net')

diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index 4ba271f892c8..d6409e757219 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -921,26 +921,43 @@ call_transmit(struct rpc_task *task)
 	task->tk_status = xprt_prepare_transmit(task);
 	if (task->tk_status != 0)
 		return;
+	task->tk_action = call_transmit_status;
 	/* Encode here so that rpcsec_gss can use correct sequence number. */
 	if (rpc_task_need_encode(task)) {
-		task->tk_rqstp->rq_bytes_sent = 0;
+		BUG_ON(task->tk_rqstp->rq_bytes_sent != 0);
 		call_encode(task);
 		/* Did the encode result in an error condition? */
 		if (task->tk_status != 0)
-			goto out_nosend;
+			return;
 	}
-	task->tk_action = call_transmit_status;
 	xprt_transmit(task);
 	if (task->tk_status < 0)
 		return;
-	if (!task->tk_msg.rpc_proc->p_decode) {
-		task->tk_action = rpc_exit_task;
-		rpc_wake_up_task(task);
-	}
-	return;
-out_nosend:
-	/* release socket write lock before attempting to handle error */
-	xprt_abort_transmit(task);
+	/*
+	 * On success, ensure that we call xprt_end_transmit() before sleeping
+	 * in order to allow access to the socket to other RPC requests.
+	 */
+	call_transmit_status(task);
+	if (task->tk_msg.rpc_proc->p_decode != NULL)
+		return;
+	task->tk_action = rpc_exit_task;
+	rpc_wake_up_task(task);
+}
+
+/*
+ * 5a.	Handle cleanup after a transmission
+ */
+static void
+call_transmit_status(struct rpc_task *task)
+{
+	task->tk_action = call_status;
+	/*
+	 * Special case: if we've been waiting on the socket's write_space()
+	 * callback, then don't call xprt_end_transmit().
+	 */
+	if (task->tk_status == -EAGAIN)
+		return;
+	xprt_end_transmit(task);
 	rpc_task_force_reencode(task);
 }
 
@@ -992,18 +1009,7 @@ call_status(struct rpc_task *task)
 }
 
 /*
- * 6a.	Handle transmission errors.
- */
-static void
-call_transmit_status(struct rpc_task *task)
-{
-	if (task->tk_status != -EAGAIN)
-		rpc_task_force_reencode(task);
-	call_status(task);
-}
-
-/*
- * 6b.	Handle RPC timeout
+ * 6a.	Handle RPC timeout
  * 	We do not release the request slot, so we keep using the
  *	same XID for all retransmits.
  */
diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c
index 313b68d892c6..e8c2bc4977f3 100644
--- a/net/sunrpc/xprt.c
+++ b/net/sunrpc/xprt.c
@@ -707,12 +707,9 @@ out_unlock:
 	return err;
 }
 
-void
-xprt_abort_transmit(struct rpc_task *task)
+void xprt_end_transmit(struct rpc_task *task)
 {
-	struct rpc_xprt	*xprt = task->tk_xprt;
-
-	xprt_release_write(xprt, task);
+	xprt_release_write(task->tk_xprt, task);
 }
 
 /**
@@ -761,8 +758,6 @@ void xprt_transmit(struct rpc_task *task)
 			task->tk_status = -ENOTCONN;
 		else if (!req->rq_received)
 			rpc_sleep_on(&xprt->pending, task, NULL, xprt_timer);
-
-		xprt->ops->release_xprt(xprt, task);
 		spin_unlock_bh(&xprt->transport_lock);
 		return;
 	}
@@ -772,18 +767,8 @@ void xprt_transmit(struct rpc_task *task)
 	 *	 schedq, and being picked up by a parallel run of rpciod().
 	 */
 	task->tk_status = status;
-
-	switch (status) {
-	case -ECONNREFUSED:
+	if (status == -ECONNREFUSED)
 		rpc_sleep_on(&xprt->sending, task, NULL, NULL);
-	case -EAGAIN:
-	case -ENOTCONN:
-		return;
-	default:
-		break;
-	}
-	xprt_release_write(xprt, task);
-	return;
 }
 
 static inline void do_xprt_reserve(struct rpc_task *task)
diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index ee678ed13b6f..441bd53f5eca 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -413,6 +413,33 @@ static int xs_tcp_send_request(struct rpc_task *task)
 	return status;
 }
 
+/**
+ * xs_tcp_release_xprt - clean up after a tcp transmission
+ * @xprt: transport
+ * @task: rpc task
+ *
+ * This cleans up if an error causes us to abort the transmission of a request.
+ * In this case, the socket may need to be reset in order to avoid confusing
+ * the server.
+ */
+static void xs_tcp_release_xprt(struct rpc_xprt *xprt, struct rpc_task *task)
+{
+	struct rpc_rqst *req;
+
+	if (task != xprt->snd_task)
+		return;
+	if (task == NULL)
+		goto out_release;
+	req = task->tk_rqstp;
+	if (req->rq_bytes_sent == 0)
+		goto out_release;
+	if (req->rq_bytes_sent == req->rq_snd_buf.len)
+		goto out_release;
+	set_bit(XPRT_CLOSE_WAIT, &task->tk_xprt->state);
+out_release:
+	xprt_release_xprt(xprt, task);
+}
+
 /**
  * xs_close - close a socket
  * @xprt: transport
@@ -1250,7 +1277,7 @@ static struct rpc_xprt_ops xs_udp_ops = {
 
 static struct rpc_xprt_ops xs_tcp_ops = {
 	.reserve_xprt		= xprt_reserve_xprt,
-	.release_xprt		= xprt_release_xprt,
+	.release_xprt		= xs_tcp_release_xprt,
 	.set_port		= xs_set_port,
 	.connect		= xs_connect,
 	.buf_alloc		= rpc_malloc,
-- 
cgit v1.2.3


From 5c3e985a2c1908aa97221d3806f85ce7e2fbfa88 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Sat, 29 Jul 2006 17:37:40 -0400
Subject: SUNRPC: Fix obvious refcounting bugs in rpc_pipefs.

Doh!

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
(cherry picked from 496f408f2f0e7ee5481a7c2222189be6c4f5aa6c commit)
---
 net/sunrpc/rpc_pipe.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c
index dc6cb93c8830..a3bd2db2e024 100644
--- a/net/sunrpc/rpc_pipe.c
+++ b/net/sunrpc/rpc_pipe.c
@@ -667,10 +667,11 @@ rpc_mkdir(char *path, struct rpc_clnt *rpc_client)
 			RPCAUTH_info, RPCAUTH_EOF);
 	if (error)
 		goto err_depopulate;
+	dget(dentry);
 out:
 	mutex_unlock(&dir->i_mutex);
 	rpc_release_path(&nd);
-	return dget(dentry);
+	return dentry;
 err_depopulate:
 	rpc_depopulate(dentry);
 	__rpc_rmdir(dir, dentry);
@@ -731,10 +732,11 @@ rpc_mkpipe(char *path, void *private, struct rpc_pipe_ops *ops, int flags)
 	rpci->flags = flags;
 	rpci->ops = ops;
 	inode_dir_notify(dir, DN_CREATE);
+	dget(dentry);
 out:
 	mutex_unlock(&dir->i_mutex);
 	rpc_release_path(&nd);
-	return dget(dentry);
+	return dentry;
 err_dput:
 	dput(dentry);
 	dentry = ERR_PTR(-ENOMEM);
-- 
cgit v1.2.3


From bea1b42e1bb184cb75e6bbd95c83e4478dde4ab9 Mon Sep 17 00:00:00 2001
From: Stephen Hemminger <shemminger@osdl.org>
Date: Thu, 3 Aug 2006 16:24:02 -0700
Subject: [BRIDGE]: netlink status fix

Fix code that passes back netlink status messages about
bridge changes. Submitted by Aji_Srinivas@emc.com

Signed-off-by: Stephen Hemminger <shemminger@osdl.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/bridge/br_netlink.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c
index 06abb6634f5b..53086fb75089 100644
--- a/net/bridge/br_netlink.c
+++ b/net/bridge/br_netlink.c
@@ -85,7 +85,7 @@ void br_ifinfo_notify(int event, struct net_bridge_port *port)
 		goto err_out;
 
 	err = br_fill_ifinfo(skb, port, current->pid, 0, event, 0);
-	if (err)
+	if (err < 0)
 		goto err_kfree;
 
 	NETLINK_CB(skb).dst_group = RTNLGRP_LINK;
-- 
cgit v1.2.3


From b9e2cc0f0e47ad351349156018ef8a365e9c6d25 Mon Sep 17 00:00:00 2001
From: Jamal Hadi Salim <hadi@cyberus.ca>
Date: Thu, 3 Aug 2006 16:36:51 -0700
Subject: [PKT_SCHED]: Return ENOENT if qdisc module is unavailable

Return ENOENT if qdisc module is unavailable

Signed-off-by: Jamal Hadi Salim <hadi@cyberus.ca>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/sch_api.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
index c7844bacbbcb..a19eff12cf78 100644
--- a/net/sched/sch_api.c
+++ b/net/sched/sch_api.c
@@ -430,7 +430,7 @@ qdisc_create(struct net_device *dev, u32 handle, struct rtattr **tca, int *errp)
 	}
 #endif
 
-	err = -EINVAL;
+	err = -ENOENT;
 	if (ops == NULL)
 		goto err_out;
 
-- 
cgit v1.2.3


From 30a584d944fbd599d4a8f470f75bf7af1a15b466 Mon Sep 17 00:00:00 2001
From: Stephen Hemminger <shemminger@osdl.org>
Date: Thu, 3 Aug 2006 16:38:49 -0700
Subject: [LLX]: SOCK_DGRAM interface fixes

The datagram interface of LLC is broken in a couple of ways.
These were discovered when trying to use it to build an out-of-kernel
version of STP.

First it didn't pass the source address of the received packet
in recvfrom(). It needs to copy the source address of received LLC packets
into the socket control block. At the same time fix a security issue
because there was uninitialized data leakage. Every recvfrom call
was just copying out old data.

Second, LLC should not merge multiple packets in one receive call
on datagram sockets. LLC should preserve packet boundaries on
SOCK_DGRAM.

This fix goes against the old historical comments about UNIX98 semantics
but without this fix SOCK_DGRAM is broken and useless. So either ANK's
interpretation was incorect or UNIX98 standard was wrong.

Signed-off-by: Stephen Hemminger <shemminger@osdl.org>
Acked-by: Arnaldo Carvalho de Melo <acme@mandriva.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/llc/af_llc.c  | 20 ++++++++------------
 net/llc/llc_sap.c |  4 ++--
 2 files changed, 10 insertions(+), 14 deletions(-)

(limited to 'net')

diff --git a/net/llc/af_llc.c b/net/llc/af_llc.c
index d6cfe84d521b..2652ead96c64 100644
--- a/net/llc/af_llc.c
+++ b/net/llc/af_llc.c
@@ -784,24 +784,20 @@ static int llc_ui_recvmsg(struct kiocb *iocb, struct socket *sock,
 		copied += used;
 		len -= used;
 
-		if (used + offset < skb->len)
-			continue;
-
 		if (!(flags & MSG_PEEK)) {
 			sk_eat_skb(sk, skb, 0);
 			*seq = 0;
 		}
+
+		/* For non stream protcols we get one packet per recvmsg call */
+		if (sk->sk_type != SOCK_STREAM)
+			goto copy_uaddr;
+
+		/* Partial read */
+		if (used + offset < skb->len)
+			continue;
 	} while (len > 0);
 
-	/* 
-	 * According to UNIX98, msg_name/msg_namelen are ignored
-	 * on connected socket. -ANK
-	 * But... af_llc still doesn't have separate sets of methods for
-	 * SOCK_DGRAM and SOCK_STREAM :-( So we have to do this test, will
-	 * eventually fix this tho :-) -acme
-	 */
-	if (sk->sk_type == SOCK_DGRAM)
-		goto copy_uaddr;
 out:
 	release_sock(sk);
 	return copied;
diff --git a/net/llc/llc_sap.c b/net/llc/llc_sap.c
index 20c4eb5c1ac6..42eb0c3a9780 100644
--- a/net/llc/llc_sap.c
+++ b/net/llc/llc_sap.c
@@ -51,10 +51,10 @@ void llc_save_primitive(struct sock *sk, struct sk_buff* skb, u8 prim)
 {
 	struct sockaddr_llc *addr;
 
-	if (skb->sk->sk_type == SOCK_STREAM) /* See UNIX98 */
-		return;
        /* save primitive for use by the user. */
 	addr		  = llc_ui_skb_cb(skb);
+
+	memset(addr, 0, sizeof(*addr));
 	addr->sllc_family = sk->sk_family;
 	addr->sllc_arphrd = skb->dev->type;
 	addr->sllc_test   = prim == LLC_TEST_PRIM;
-- 
cgit v1.2.3


From d254bcdbf2199d9e2a52dbe4592e79ef3a456096 Mon Sep 17 00:00:00 2001
From: Ilpo Järvinen <ilpo.jarvinen@helsinki.fi>
Date: Fri, 4 Aug 2006 16:57:42 -0700
Subject: [TCP]: Fixes IW > 2 cases when TCP is application limited
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Whenever a transfer is application limited, we are allowed at least
initial window worth of data per window unless cwnd is previously
less than that.

Signed-off-by: Ilpo Järvinen <ilpo.jarvinen@helsinki.fi>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_input.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 738dad9f7d49..104af5d5bcbc 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -3541,7 +3541,8 @@ void tcp_cwnd_application_limited(struct sock *sk)
 	if (inet_csk(sk)->icsk_ca_state == TCP_CA_Open &&
 	    sk->sk_socket && !test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) {
 		/* Limited by application or receiver window. */
-		u32 win_used = max(tp->snd_cwnd_used, 2U);
+		u32 init_win = tcp_init_cwnd(tp, __sk_dst_get(sk));
+		u32 win_used = max(tp->snd_cwnd_used, init_win);
 		if (win_used < tp->snd_cwnd) {
 			tp->snd_ssthresh = tcp_current_ssthresh(sk);
 			tp->snd_cwnd = (tp->snd_cwnd + win_used) >> 1;
-- 
cgit v1.2.3


From 558e10a57db10de355ee97712d2b6df49e9b7849 Mon Sep 17 00:00:00 2001
From: Diego Calleja <diegocg@gmail.com>
Date: Sat, 5 Aug 2006 21:15:58 -0700
Subject: [LAPB]: Fix windowsize check

In bug #6954, Norbert Reinartz reported the following issue:

"Function lapb_setparms() in file net/lapb/lapb_iface.c checks if the given
parameters are valid. If the given window size is in the range of 8 .. 127,
lapb_setparms() fails and returns an error value of LAPB_INVALUE, even if bit
LAPB_EXTENDED in parms->mode is set.
If bit LAPB_EXTENDED in parms->mode is set and the window size is in the range
of 8 .. 127, the first check "(parms->mode & LAPB_EXTENDED)" results true  and
the second check "(parms->window < 1 || parms->window > 127)" results false.
Both checks in conjunction result to false, thus the third check "(parms->window
< 1 || parms->window > 7)" is done by fault.
This third check results true, so that we leave lapb_setparms() by 'goto out_put'.
Seems that this bug doesn't cause any problems, because lapb_setparms() isn't
used to change the default values of LAPB. We are using kernel lapb in our
software project and also change the default parameters of lapb, so we found
this bug"

He also pasted a fix, that I've transformated into a patch:

Signed-off-by: Diego Calleja <diegocg@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/lapb/lapb_iface.c | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

(limited to 'net')

diff --git a/net/lapb/lapb_iface.c b/net/lapb/lapb_iface.c
index d504eed416f6..7e6bc41eeb21 100644
--- a/net/lapb/lapb_iface.c
+++ b/net/lapb/lapb_iface.c
@@ -238,11 +238,13 @@ int lapb_setparms(struct net_device *dev, struct lapb_parms_struct *parms)
 		goto out_put;
 
 	if (lapb->state == LAPB_STATE_0) {
-		if (((parms->mode & LAPB_EXTENDED) &&
-		     (parms->window < 1 || parms->window > 127)) ||
-		    (parms->window < 1 || parms->window > 7))
-			goto out_put;
-
+		if (parms->mode & LAPB_EXTENDED) {
+			if (parms->window < 1 || parms->window > 127)
+				goto out_put;
+		} else {
+			if (parms->window < 1 || parms->window > 7)
+				goto out_put;
+		}
 		lapb->mode    = parms->mode;
 		lapb->window  = parms->window;
 	}
-- 
cgit v1.2.3


From 2f34931fdc78b4895553aaa33748939cc7697c99 Mon Sep 17 00:00:00 2001
From: Neil Brown <neilb@suse.de>
Date: Sat, 5 Aug 2006 12:14:29 -0700
Subject: [PATCH] knfsd: fix race related problem when adding items to and
 svcrpc auth cache

If we don't find the item we are lookng for, we allocate a new one, and
then grab the lock again and search to see if it has been added while we
did the alloc.  If it had been added we need to 'cache_put' the newly
created item that we are never going to use.  But as it hasn't been
initialised properly, putting it can cause an oops.

So move the ->init call earlier to that it will always be fully initilised
if we have to put it.

Thanks to Philipp Matthias Hahn <pmhahn@svs.Informatik.Uni-Oldenburg.de>
for reporting the problem.

Signed-off-by: Neil Brown <neilb@suse.de>
Cc: <stable@kernel.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 net/sunrpc/cache.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/sunrpc/cache.c b/net/sunrpc/cache.c
index 7026b0866b7b..00cb388ece03 100644
--- a/net/sunrpc/cache.c
+++ b/net/sunrpc/cache.c
@@ -71,7 +71,12 @@ struct cache_head *sunrpc_cache_lookup(struct cache_detail *detail,
 	new = detail->alloc();
 	if (!new)
 		return NULL;
+	/* must fully initialise 'new', else
+	 * we might get lose if we need to
+	 * cache_put it soon.
+	 */
 	cache_init(new);
+	detail->init(new, key);
 
 	write_lock(&detail->hash_lock);
 
@@ -85,7 +90,6 @@ struct cache_head *sunrpc_cache_lookup(struct cache_detail *detail,
 			return tmp;
 		}
 	}
-	detail->init(new, key);
 	new->next = *head;
 	*head = new;
 	detail->entries++;
-- 
cgit v1.2.3


From 7b2e497a06c0e93719fda88820e057b635e8fae2 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 7 Aug 2006 16:09:04 -0700
Subject: [NET]: Assign skb->dev in netdev_alloc_skb

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/skbuff.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 022d8894c11d..c54f3664bce5 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -268,8 +268,10 @@ struct sk_buff *__netdev_alloc_skb(struct net_device *dev,
 	struct sk_buff *skb;
 
 	skb = alloc_skb(length + NET_SKB_PAD, gfp_mask);
-	if (likely(skb))
+	if (likely(skb)) {
 		skb_reserve(skb, NET_SKB_PAD);
+		skb->dev = dev;
+	}
 	return skb;
 }
 
-- 
cgit v1.2.3


From 8b5cc5ef40c83c6ea4c90b203bb2c8b17edfa11b Mon Sep 17 00:00:00 2001
From: Stephen Hemminger <shemminger@osdl.org>
Date: Mon, 7 Aug 2006 20:09:20 -0700
Subject: [IPX]: Header length validation needed

This patch will linearize and check there is enough data.
It handles the pprop case as well as avoiding a whole audit of
the routing code.

Signed-off-by: Stephen Hemminger <shemminger@osdl.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipx/af_ipx.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/ipx/af_ipx.c b/net/ipx/af_ipx.c
index aa34ff4b707c..c13e86b14f69 100644
--- a/net/ipx/af_ipx.c
+++ b/net/ipx/af_ipx.c
@@ -1646,7 +1646,8 @@ static int ipx_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_ty
 	ipx_pktsize	= ntohs(ipx->ipx_pktsize);
 	
 	/* Too small or invalid header? */
-	if (ipx_pktsize < sizeof(struct ipxhdr) || ipx_pktsize > skb->len)
+	if (ipx_pktsize < sizeof(struct ipxhdr) ||
+	    !pskb_may_pull(skb, ipx_pktsize))
 		goto drop;
                         
 	if (ipx->ipx_checksum != IPX_NO_CHECKSUM &&
-- 
cgit v1.2.3


From 8d1502de27c46b365b5c86e17d173083d3d6c9ac Mon Sep 17 00:00:00 2001
From: Kirill Korotaev <dev@sw.ru>
Date: Mon, 7 Aug 2006 20:44:22 -0700
Subject: [IPV4]: Limit rt cache size properly.

From: Kirill Korotaev <dev@sw.ru>

During OpenVZ stress testing we found that UDP traffic with random src
can generate too much excessive rt hash growing leading finally to OOM
and kernel panics.

It was found that for 4GB i686 system (having 1048576 total pages and
  225280 normal zone pages) kernel allocates the following route hash:
syslog: IP route cache hash table entries: 262144 (order: 8, 1048576
bytes) => ip_rt_max_size = 4194304 entries, i.e.  max rt size is
4194304 * 256b = 1Gb of RAM > normal_zone

Attached the patch which removes HASH_HIGHMEM flag from
alloc_large_system_hash() call.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/route.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 19bd49d69d9f..b873cbcdd0b8 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -3157,7 +3157,7 @@ int __init ip_rt_init(void)
 					rhash_entries,
 					(num_physpages >= 128 * 1024) ?
 					15 : 17,
-					HASH_HIGHMEM,
+					0,
 					&rt_hash_log,
 					&rt_hash_mask,
 					0);
-- 
cgit v1.2.3


From aaf580601ff244df82324fff380ed6740f27ef03 Mon Sep 17 00:00:00 2001
From: Chen-Li Tien <cltien@gmail.com>
Date: Mon, 7 Aug 2006 20:49:07 -0700
Subject: [PKTGEN]: Fix oops when used with balance-tlb bonding

Signed-off-by: Chen-Li Tien <cltien@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/pktgen.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'net')

diff --git a/net/core/pktgen.c b/net/core/pktgen.c
index 67ed14ddabd2..b1743375eb6d 100644
--- a/net/core/pktgen.c
+++ b/net/core/pktgen.c
@@ -2149,6 +2149,8 @@ static struct sk_buff *fill_packet_ipv4(struct net_device *odev,
 	skb->mac.raw = ((u8 *) iph) - 14 - pkt_dev->nr_labels*sizeof(u32);
 	skb->dev = odev;
 	skb->pkt_type = PACKET_HOST;
+	skb->nh.iph = iph;
+	skb->h.uh = udph;
 
 	if (pkt_dev->nfrags <= 0)
 		pgh = (struct pktgen_hdr *)skb_put(skb, datalen);
-- 
cgit v1.2.3


From 69d8c28c9578ce78b3dc1b9be36926d962282898 Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@sunset.davemloft.net>
Date: Mon, 7 Aug 2006 20:52:10 -0700
Subject: [PKTGEN]: Make sure skb->{nh,h} are initialized in fill_packet_ipv6()
 too.

Mirror the bug fix from fill_packet_ipv4()

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/pktgen.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'net')

diff --git a/net/core/pktgen.c b/net/core/pktgen.c
index b1743375eb6d..6a7320b39ed0 100644
--- a/net/core/pktgen.c
+++ b/net/core/pktgen.c
@@ -2462,6 +2462,8 @@ static struct sk_buff *fill_packet_ipv6(struct net_device *odev,
 	skb->protocol = protocol;
 	skb->dev = odev;
 	skb->pkt_type = PACKET_HOST;
+	skb->nh.ipv6h = iph;
+	skb->h.uh = udph;
 
 	if (pkt_dev->nfrags <= 0)
 		pgh = (struct pktgen_hdr *)skb_put(skb, datalen);
-- 
cgit v1.2.3


From bd37a088596ccdb2b2dd3299e25e333bca7a9a34 Mon Sep 17 00:00:00 2001
From: Wei Yongjun <yjwei@nanjing-fnst.com>
Date: Mon, 7 Aug 2006 21:04:15 -0700
Subject: [TCP]: SNMPv2 tcpOutSegs counter error

Do not count retransmitted segments.

Signed-off-by: Wei Yongjun <yjwei@nanjing-fnst.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_output.c | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 5c08ea20a18d..507adefbc17c 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -466,7 +466,8 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
 	if (skb->len != tcp_header_size)
 		tcp_event_data_sent(tp, skb, sk);
 
-	TCP_INC_STATS(TCP_MIB_OUTSEGS);
+	if (after(tcb->end_seq, tp->snd_nxt) || tcb->seq == tcb->end_seq)
+		TCP_INC_STATS(TCP_MIB_OUTSEGS);
 
 	err = icsk->icsk_af_ops->queue_xmit(skb, 0);
 	if (likely(err <= 0))
@@ -2157,10 +2158,9 @@ int tcp_connect(struct sock *sk)
 	skb_shinfo(buff)->gso_size = 0;
 	skb_shinfo(buff)->gso_type = 0;
 	buff->csum = 0;
+	tp->snd_nxt = tp->write_seq;
 	TCP_SKB_CB(buff)->seq = tp->write_seq++;
 	TCP_SKB_CB(buff)->end_seq = tp->write_seq;
-	tp->snd_nxt = tp->write_seq;
-	tp->pushed_seq = tp->write_seq;
 
 	/* Send it off. */
 	TCP_SKB_CB(buff)->when = tcp_time_stamp;
@@ -2170,6 +2170,12 @@ int tcp_connect(struct sock *sk)
 	sk_charge_skb(sk, buff);
 	tp->packets_out += tcp_skb_pcount(buff);
 	tcp_transmit_skb(sk, buff, 1, GFP_KERNEL);
+
+	/* We change tp->snd_nxt after the tcp_transmit_skb() call
+	 * in order to make this packet get counted in tcpOutSegs.
+	 */
+	tp->snd_nxt = tp->write_seq;
+	tp->pushed_seq = tp->write_seq;
 	TCP_INC_STATS(TCP_MIB_ACTIVEOPENS);
 
 	/* Timer for repeating the SYN until an answer. */
-- 
cgit v1.2.3


From 70f8e78e150425b01c1099087ad3decacf7e4ccf Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@sunset.davemloft.net>
Date: Tue, 8 Aug 2006 16:47:37 -0700
Subject: [RTNETLINK]: Fix IFLA_ADDRESS handling.

The ->set_mac_address handlers expect a pointer to a
sockaddr which contains the MAC address, whereas
IFLA_ADDRESS provides just the MAC address itself.

So whip up a sockaddr to wrap around the netlink
attribute for the ->set_mac_address call.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/rtnetlink.c | 15 ++++++++++++++-
 1 file changed, 14 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 20e5bb73f147..30cc1ba6ed5c 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -394,6 +394,9 @@ static int do_setlink(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
 	}
 
 	if (ida[IFLA_ADDRESS - 1]) {
+		struct sockaddr *sa;
+		int len;
+
 		if (!dev->set_mac_address) {
 			err = -EOPNOTSUPP;
 			goto out;
@@ -405,7 +408,17 @@ static int do_setlink(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
 		if (ida[IFLA_ADDRESS - 1]->rta_len != RTA_LENGTH(dev->addr_len))
 			goto out;
 
-		err = dev->set_mac_address(dev, RTA_DATA(ida[IFLA_ADDRESS - 1]));
+		len = sizeof(sa_family_t) + dev->addr_len;
+		sa = kmalloc(len, GFP_KERNEL);
+		if (!sa) {
+			err = -ENOMEM;
+			goto out;
+		}
+		sa->sa_family = dev->type;
+		memcpy(sa->sa_data, RTA_DATA(ida[IFLA_ADDRESS - 1]),
+		       dev->addr_len);
+		err = dev->set_mac_address(dev, sa);
+		kfree(sa);
 		if (err)
 			goto out;
 		send_addr_notify = 1;
-- 
cgit v1.2.3


From 7b1ba8de569460894efa892457af7a37c0d574f9 Mon Sep 17 00:00:00 2001
From: Stephen Hemminger <shemminger@osdl.org>
Date: Tue, 8 Aug 2006 16:48:51 -0700
Subject: [IPX]: Another nonlinear receive fix

Need to check some more cases in IPX receive.  If the skb is purely
fragments, the IPX header needs to be extracted. The function
pskb_may_pull() may in theory invalidate all the pointers in the skb,
so references to ipx header must be refreshed.

Signed-off-by: Stephen Hemminger <shemminger@osdl.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipx/af_ipx.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/ipx/af_ipx.c b/net/ipx/af_ipx.c
index c13e86b14f69..401964204866 100644
--- a/net/ipx/af_ipx.c
+++ b/net/ipx/af_ipx.c
@@ -1642,14 +1642,17 @@ static int ipx_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_ty
 	if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL)
 		goto out;
 
-	ipx		= ipx_hdr(skb);
-	ipx_pktsize	= ntohs(ipx->ipx_pktsize);
+	if (!pskb_may_pull(skb, sizeof(struct ipxhdr)))
+		goto drop;
+
+	ipx_pktsize = ntohs(ipxhdr(skb)->ipx_pktsize);
 	
 	/* Too small or invalid header? */
 	if (ipx_pktsize < sizeof(struct ipxhdr) ||
 	    !pskb_may_pull(skb, ipx_pktsize))
 		goto drop;
                         
+	ipx = ipx_hdr(skb);
 	if (ipx->ipx_checksum != IPX_NO_CHECKSUM &&
 	   ipx->ipx_checksum != ipx_cksum(ipx, ipx_pktsize))
 		goto drop;
-- 
cgit v1.2.3


From 7c91767a6b701543c93ebcd611dab61deff3dad1 Mon Sep 17 00:00:00 2001
From: Dmitry Mishin <dim@openvz.org>
Date: Wed, 9 Aug 2006 02:25:54 -0700
Subject: [NET]: add_timer -> mod_timer() in dst_run_gc()

Patch from Dmitry Mishin <dim@openvz.org>:

Replace add_timer() by mod_timer() in dst_run_gc
in order to avoid BUG message.

       CPU1                            CPU2
dst_run_gc()  entered           dst_run_gc() entered
spin_lock(&dst_lock)                   .....
del_timer(&dst_gc_timer)         fail to get lock
       ....                         mod_timer() <--- puts
                                                 timer back
                                                 to the list
add_timer(&dst_gc_timer) <--- BUG because timer is in list already.

Found during OpenVZ internal testing.

At first we thought that it is OpenVZ specific as we
added dst_run_gc(0) call in dst_dev_event(),
but as Alexey pointed to me it is possible to trigger
this condition in mainstream kernel.

F.e. timer has fired on CPU2, but the handler was preeempted
by an irq before dst_lock is tried.
Meanwhile, someone on CPU1 adds an entry to gc list and
starts the timer.
If CPU2 was preempted long enough, this timer can expire
simultaneously with resuming timer handler on CPU1, arriving
exactly to the situation described.

Signed-off-by: Dmitry Mishin <dim@openvz.org>
Signed-off-by: Kirill Korotaev <dev@openvz.org>
Signed-off-by: Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dst.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/core/dst.c b/net/core/dst.c
index 470c05bc4cb2..1a5e49da0e77 100644
--- a/net/core/dst.c
+++ b/net/core/dst.c
@@ -95,12 +95,11 @@ static void dst_run_gc(unsigned long dummy)
 		dst_gc_timer_inc = DST_GC_INC;
 		dst_gc_timer_expires = DST_GC_MIN;
 	}
-	dst_gc_timer.expires = jiffies + dst_gc_timer_expires;
 #if RT_CACHE_DEBUG >= 2
 	printk("dst_total: %d/%d %ld\n",
 	       atomic_read(&dst_total), delayed,  dst_gc_timer_expires);
 #endif
-	add_timer(&dst_gc_timer);
+	mod_timer(&dst_gc_timer, jiffies + dst_gc_timer_expires);
 
 out:
 	spin_unlock(&dst_lock);
-- 
cgit v1.2.3


From 06aebfb7faa13258af5230ff3d1587ece6c0250e Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Wed, 9 Aug 2006 16:52:04 -0700
Subject: [IPV6]: The ifa lock is a BH lock

The ifa lock is expected to be taken in BH context (by addrconf timers)
so we must disable BH when accessing it from user context.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/addrconf.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 8ea1e36bf8eb..0c5042e7380d 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -1909,11 +1909,11 @@ static int inet6_addr_add(int ifindex, struct in6_addr *pfx, int plen,
 	ifp = ipv6_add_addr(idev, pfx, plen, scope, ifa_flags);
 
 	if (!IS_ERR(ifp)) {
-		spin_lock(&ifp->lock);
+		spin_lock_bh(&ifp->lock);
 		ifp->valid_lft = valid_lft;
 		ifp->prefered_lft = prefered_lft;
 		ifp->tstamp = jiffies;
-		spin_unlock(&ifp->lock);
+		spin_unlock_bh(&ifp->lock);
 
 		addrconf_dad_start(ifp, 0);
 		in6_ifa_put(ifp);
-- 
cgit v1.2.3


From fff642570dc47ab76491fe81ee6599269c4eb13e Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@sunset.davemloft.net>
Date: Wed, 9 Aug 2006 17:36:15 -0700
Subject: [IPX]: Fix typo, ipxhdr() --> ipx_hdr()

Noticed by Dave Jones.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipx/af_ipx.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/ipx/af_ipx.c b/net/ipx/af_ipx.c
index 401964204866..bef3f61569f7 100644
--- a/net/ipx/af_ipx.c
+++ b/net/ipx/af_ipx.c
@@ -1645,7 +1645,7 @@ static int ipx_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_ty
 	if (!pskb_may_pull(skb, sizeof(struct ipxhdr)))
 		goto drop;
 
-	ipx_pktsize = ntohs(ipxhdr(skb)->ipx_pktsize);
+	ipx_pktsize = ntohs(ipx_hdr(skb)->ipx_pktsize);
 	
 	/* Too small or invalid header? */
 	if (ipx_pktsize < sizeof(struct ipxhdr) ||
-- 
cgit v1.2.3


From 18b6fe64d4d1f6e0a2c71429a5e5074f43e29203 Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Sun, 13 Aug 2006 18:05:09 -0700
Subject: [TCP]: Fix botched memory leak fix to tcpprobe_read().

Somehow I clobbered James's original fix and only my
subsequent compiler warning change went in for that
changeset.

Get the real fix in there.

Noticed by Jesper Juhl.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_probe.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/ipv4/tcp_probe.c b/net/ipv4/tcp_probe.c
index b3435324b573..dab37d2f65fc 100644
--- a/net/ipv4/tcp_probe.c
+++ b/net/ipv4/tcp_probe.c
@@ -130,11 +130,12 @@ static ssize_t tcpprobe_read(struct file *file, char __user *buf,
 	error = wait_event_interruptible(tcpw.wait,
 					 __kfifo_len(tcpw.fifo) != 0);
 	if (error)
-		return error;
+		goto out_free;
 
 	cnt = kfifo_get(tcpw.fifo, tbuf, len);
 	error = copy_to_user(buf, tbuf, cnt);
 
+out_free:
 	vfree(tbuf);
 
 	return error ? error : cnt;
-- 
cgit v1.2.3


From 97c802a113989800430a981b6f36b14c62163d37 Mon Sep 17 00:00:00 2001
From: Phil Oester <kernel@linuxace.com>
Date: Sun, 13 Aug 2006 18:05:35 -0700
Subject: [NETFILTER]: xt_string: fix negation

The xt_string match is broken with ! negation.
This resolves a portion of netfilter bugzilla #497.

Signed-off-by: Phil Oester <kernel@linuxace.com>
Signed-off-by: Patrick McHardy <kaber@trash.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/netfilter/xt_string.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/netfilter/xt_string.c b/net/netfilter/xt_string.c
index d8e3891b5f8b..275330fcdaaa 100644
--- a/net/netfilter/xt_string.c
+++ b/net/netfilter/xt_string.c
@@ -37,7 +37,7 @@ static int match(const struct sk_buff *skb,
 
 	return (skb_find_text((struct sk_buff *)skb, conf->from_offset, 
 			     conf->to_offset, conf->config, &state) 
-			     != UINT_MAX) && !conf->invert;
+			     != UINT_MAX) ^ conf->invert;
 }
 
 #define STRING_TEXT_PRIV(m) ((struct xt_string_info *) m)
-- 
cgit v1.2.3


From 1c7628bd7a458faf7c96ef521f6d3a5ea9b106b8 Mon Sep 17 00:00:00 2001
From: Patrick McHardy <kaber@trash.net>
Date: Sun, 13 Aug 2006 18:06:02 -0700
Subject: [NETFILTER]: xt_hashlimit: fix limit off-by-one

Hashlimit doesn't account for the first packet, which is inconsistent
with the limit match.

Reported by ryan.castellucci@gmail.com, netfilter bugzilla #500.

Signed-off-by: Patrick McHardy <kaber@trash.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/netfilter/ipt_hashlimit.c | 11 ++++-------
 1 file changed, 4 insertions(+), 7 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/netfilter/ipt_hashlimit.c b/net/ipv4/netfilter/ipt_hashlimit.c
index 6b662449e825..3bd2368e1fc9 100644
--- a/net/ipv4/netfilter/ipt_hashlimit.c
+++ b/net/ipv4/netfilter/ipt_hashlimit.c
@@ -454,15 +454,12 @@ hashlimit_match(const struct sk_buff *skb,
 		dh->rateinfo.credit_cap = user2credits(hinfo->cfg.avg * 
 							hinfo->cfg.burst);
 		dh->rateinfo.cost = user2credits(hinfo->cfg.avg);
-
-		spin_unlock_bh(&hinfo->lock);
-		return 1;
+	} else {
+		/* update expiration timeout */
+		dh->expires = now + msecs_to_jiffies(hinfo->cfg.expire);
+		rateinfo_recalc(dh, now);
 	}
 
-	/* update expiration timeout */
-	dh->expires = now + msecs_to_jiffies(hinfo->cfg.expire);
-
-	rateinfo_recalc(dh, now);
 	if (dh->rateinfo.credit >= dh->rateinfo.cost) {
 		/* We're underlimit. */
 		dh->rateinfo.credit -= dh->rateinfo.cost;
-- 
cgit v1.2.3


From d49c73c729e2ef644558a1f441c044bfacdc9744 Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@sunset.davemloft.net>
Date: Sun, 13 Aug 2006 18:55:53 -0700
Subject: [IPSEC]: Validate properly in xfrm_dst_check()

If dst->obsolete is -1, this is a signal from the
bundle creator that we want the XFRM dst and the
dsts that it references to be validated on every
use.

I misunderstood this intention when I changed
xfrm_dst_check() to always return NULL.

Now, when we purge a dst entry, by running dst_free()
on it.  This will set the dst->obsolete to a positive
integer, and we want to return NULL in that case so
that the socket does a relookup for the route.

Thus, if dst->obsolete<0, let stale_bundle() validate
the state, else always return NULL.

In general, we need to do things more intelligently
here because we flush too much state during rule
changes.  Herbert Xu has some ideas wherein the key
manager gives us some help in this area.  We can also
use smarter state management algorithms inside of
the kernel as well.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/xfrm/xfrm_policy.c | 27 ++++++++++++++++++++++++---
 1 file changed, 24 insertions(+), 3 deletions(-)

(limited to 'net')

diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index f35bc676128c..3da67ca2c3ce 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -1134,12 +1134,33 @@ int __xfrm_route_forward(struct sk_buff *skb, unsigned short family)
 }
 EXPORT_SYMBOL(__xfrm_route_forward);
 
+/* Optimize later using cookies and generation ids. */
+
 static struct dst_entry *xfrm_dst_check(struct dst_entry *dst, u32 cookie)
 {
-	/* If it is marked obsolete, which is how we even get here,
-	 * then we have purged it from the policy bundle list and we
-	 * did that for a good reason.
+	/* Code (such as __xfrm4_bundle_create()) sets dst->obsolete
+	 * to "-1" to force all XFRM destinations to get validated by
+	 * dst_ops->check on every use.  We do this because when a
+	 * normal route referenced by an XFRM dst is obsoleted we do
+	 * not go looking around for all parent referencing XFRM dsts
+	 * so that we can invalidate them.  It is just too much work.
+	 * Instead we make the checks here on every use.  For example:
+	 *
+	 *	XFRM dst A --> IPv4 dst X
+	 *
+	 * X is the "xdst->route" of A (X is also the "dst->path" of A
+	 * in this example).  If X is marked obsolete, "A" will not
+	 * notice.  That's what we are validating here via the
+	 * stale_bundle() check.
+	 *
+	 * When a policy's bundle is pruned, we dst_free() the XFRM
+	 * dst which causes it's ->obsolete field to be set to a
+	 * positive non-zero integer.  If an XFRM dst has been pruned
+	 * like this, we want to force a new route lookup.
 	 */
+	if (dst->obsolete < 0 && !stale_bundle(dst))
+		return dst;
+
 	return NULL;
 }
 
-- 
cgit v1.2.3


From 7ee66fcb94cb8be77d5f34cce7d315d11759f9c1 Mon Sep 17 00:00:00 2001
From: Stephen Hemminger <shemminger@osdl.org>
Date: Sun, 13 Aug 2006 18:56:26 -0700
Subject: [LLC]: multicast receive device match

Fix from Aji_Srinivas@emc.com, STP packets are incorrectly received on
all LLC datagram sockets, whichever interface they are bound to.  The
llc_sap datagram receive logic sends packets with a unicast
destination MAC to one socket bound to that SAP and MAC, and multicast
packets to all sockets bound to that SAP. STP packets are multicast,
and we do need to know on which interface they were received.

Signed-off-by: Stephen Hemminger <shemminger@osdl.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/llc/llc_sap.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'net')

diff --git a/net/llc/llc_sap.c b/net/llc/llc_sap.c
index 42eb0c3a9780..61cb8cf7d153 100644
--- a/net/llc/llc_sap.c
+++ b/net/llc/llc_sap.c
@@ -330,6 +330,9 @@ static void llc_sap_mcast(struct llc_sap *sap,
 		if (llc->laddr.lsap != laddr->lsap)
 			continue;
 
+		if (llc->dev != skb->dev)
+			continue;
+
 		skb1 = skb_clone(skb, GFP_ATOMIC);
 		if (!skb1)
 			break;
-- 
cgit v1.2.3


From 0eff66e625306a794ecba4b29ed12f7a147ce219 Mon Sep 17 00:00:00 2001
From: Patrick McHardy <kaber@trash.net>
Date: Sun, 13 Aug 2006 18:57:28 -0700
Subject: [NETFILTER]: {arp,ip,ip6}_tables: proper error recovery in init path

Neither of {arp,ip,ip6}_tables cleans up behind itself when something goes
wrong during initialization.

Noticed by Rennie deGraaf <degraaf@cpsc.ucalgary.ca>

Signed-off-by: Patrick McHardy <kaber@trash.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/netfilter/arp_tables.c | 27 ++++++++++++++++++++-------
 net/ipv4/netfilter/ip_tables.c  | 33 +++++++++++++++++++++++++--------
 net/ipv6/netfilter/ip6_tables.c | 34 +++++++++++++++++++++++++---------
 3 files changed, 70 insertions(+), 24 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c
index 80c73ca90116..df4854cf598b 100644
--- a/net/ipv4/netfilter/arp_tables.c
+++ b/net/ipv4/netfilter/arp_tables.c
@@ -1170,21 +1170,34 @@ static int __init arp_tables_init(void)
 {
 	int ret;
 
-	xt_proto_init(NF_ARP);
+	ret = xt_proto_init(NF_ARP);
+	if (ret < 0)
+		goto err1;
 
 	/* Noone else will be downing sem now, so we won't sleep */
-	xt_register_target(&arpt_standard_target);
-	xt_register_target(&arpt_error_target);
+	ret = xt_register_target(&arpt_standard_target);
+	if (ret < 0)
+		goto err2;
+	ret = xt_register_target(&arpt_error_target);
+	if (ret < 0)
+		goto err3;
 
 	/* Register setsockopt */
 	ret = nf_register_sockopt(&arpt_sockopts);
-	if (ret < 0) {
-		duprintf("Unable to register sockopts.\n");
-		return ret;
-	}
+	if (ret < 0)
+		goto err4;
 
 	printk("arp_tables: (C) 2002 David S. Miller\n");
 	return 0;
+
+err4:
+	xt_unregister_target(&arpt_error_target);
+err3:
+	xt_unregister_target(&arpt_standard_target);
+err2:
+	xt_proto_fini(NF_ARP);
+err1:
+	return ret;
 }
 
 static void __exit arp_tables_fini(void)
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index fc5bdd5eb7d3..f316ff5fd8a6 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -2239,22 +2239,39 @@ static int __init ip_tables_init(void)
 {
 	int ret;
 
-	xt_proto_init(AF_INET);
+	ret = xt_proto_init(AF_INET);
+	if (ret < 0)
+		goto err1;
 
 	/* Noone else will be downing sem now, so we won't sleep */
-	xt_register_target(&ipt_standard_target);
-	xt_register_target(&ipt_error_target);
-	xt_register_match(&icmp_matchstruct);
+	ret = xt_register_target(&ipt_standard_target);
+	if (ret < 0)
+		goto err2;
+	ret = xt_register_target(&ipt_error_target);
+	if (ret < 0)
+		goto err3;
+	ret = xt_register_match(&icmp_matchstruct);
+	if (ret < 0)
+		goto err4;
 
 	/* Register setsockopt */
 	ret = nf_register_sockopt(&ipt_sockopts);
-	if (ret < 0) {
-		duprintf("Unable to register sockopts.\n");
-		return ret;
-	}
+	if (ret < 0)
+		goto err5;
 
 	printk("ip_tables: (C) 2000-2006 Netfilter Core Team\n");
 	return 0;
+
+err5:
+	xt_unregister_match(&icmp_matchstruct);
+err4:
+	xt_unregister_target(&ipt_error_target);
+err3:
+	xt_unregister_target(&ipt_standard_target);
+err2:
+	xt_proto_fini(AF_INET);
+err1:
+	return ret;
 }
 
 static void __exit ip_tables_fini(void)
diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c
index f26898b00347..c9d6b23cd3f7 100644
--- a/net/ipv6/netfilter/ip6_tables.c
+++ b/net/ipv6/netfilter/ip6_tables.c
@@ -1398,23 +1398,39 @@ static int __init ip6_tables_init(void)
 {
 	int ret;
 
-	xt_proto_init(AF_INET6);
+	ret = xt_proto_init(AF_INET6);
+	if (ret < 0)
+		goto err1;
 
 	/* Noone else will be downing sem now, so we won't sleep */
-	xt_register_target(&ip6t_standard_target);
-	xt_register_target(&ip6t_error_target);
-	xt_register_match(&icmp6_matchstruct);
+	ret = xt_register_target(&ip6t_standard_target);
+	if (ret < 0)
+		goto err2;
+	ret = xt_register_target(&ip6t_error_target);
+	if (ret < 0)
+		goto err3;
+	ret = xt_register_match(&icmp6_matchstruct);
+	if (ret < 0)
+		goto err4;
 
 	/* Register setsockopt */
 	ret = nf_register_sockopt(&ip6t_sockopts);
-	if (ret < 0) {
-		duprintf("Unable to register sockopts.\n");
-		xt_proto_fini(AF_INET6);
-		return ret;
-	}
+	if (ret < 0)
+		goto err5;
 
 	printk("ip6_tables: (C) 2000-2006 Netfilter Core Team\n");
 	return 0;
+
+err5:
+	xt_unregister_match(&icmp6_matchstruct);
+err4:
+	xt_unregister_target(&ip6t_error_target);
+err3:
+	xt_unregister_target(&ip6t_standard_target);
+err2:
+	xt_proto_fini(AF_INET6);
+err1:
+	return ret;
 }
 
 static void __exit ip6_tables_fini(void)
-- 
cgit v1.2.3


From dcb7cd97f133f7cfbd181149a1e60215a869f895 Mon Sep 17 00:00:00 2001
From: Mark Huang <mlhuang@cs.princeton.edu>
Date: Sun, 13 Aug 2006 18:57:54 -0700
Subject: [NETFILTER]: ulog: fix panic on SMP kernels

Fix kernel panic on various SMP machines. The culprit is a null
ub->skb in ulog_send(). If ulog_timer() has already been scheduled on
one CPU and is spinning on the lock, and ipt_ulog_packet() flushes the
queue on another CPU by calling ulog_send() right before it exits,
there will be no skbuff when ulog_timer() acquires the lock and calls
ulog_send(). Cancelling the timer in ulog_send() doesn't help because
it has already been scheduled and is running on the first CPU.

Similar problem exists in ebt_ulog.c and nfnetlink_log.c.

Signed-off-by: Mark Huang <mlhuang@cs.princeton.edu>
Signed-off-by: Patrick McHardy <kaber@trash.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/bridge/netfilter/ebt_ulog.c | 3 +++
 net/ipv4/netfilter/ipt_ULOG.c   | 5 +++++
 net/netfilter/nfnetlink_log.c   | 3 +++
 3 files changed, 11 insertions(+)

(limited to 'net')

diff --git a/net/bridge/netfilter/ebt_ulog.c b/net/bridge/netfilter/ebt_ulog.c
index 02693a230dc1..9f950db3b76f 100644
--- a/net/bridge/netfilter/ebt_ulog.c
+++ b/net/bridge/netfilter/ebt_ulog.c
@@ -74,6 +74,9 @@ static void ulog_send(unsigned int nlgroup)
 	if (timer_pending(&ub->timer))
 		del_timer(&ub->timer);
 
+	if (!ub->skb)
+		return;
+
 	/* last nlmsg needs NLMSG_DONE */
 	if (ub->qlen > 1)
 		ub->lastnlh->nlmsg_type = NLMSG_DONE;
diff --git a/net/ipv4/netfilter/ipt_ULOG.c b/net/ipv4/netfilter/ipt_ULOG.c
index d7dd7fe7051c..d46fd677fa11 100644
--- a/net/ipv4/netfilter/ipt_ULOG.c
+++ b/net/ipv4/netfilter/ipt_ULOG.c
@@ -115,6 +115,11 @@ static void ulog_send(unsigned int nlgroupnum)
 		del_timer(&ub->timer);
 	}
 
+	if (!ub->skb) {
+		DEBUGP("ipt_ULOG: ulog_send: nothing to send\n");
+		return;
+	}
+
 	/* last nlmsg needs NLMSG_DONE */
 	if (ub->qlen > 1)
 		ub->lastnlh->nlmsg_type = NLMSG_DONE;
diff --git a/net/netfilter/nfnetlink_log.c b/net/netfilter/nfnetlink_log.c
index 61cdda4e5d3b..b59d3b2bde21 100644
--- a/net/netfilter/nfnetlink_log.c
+++ b/net/netfilter/nfnetlink_log.c
@@ -366,6 +366,9 @@ __nfulnl_send(struct nfulnl_instance *inst)
 	if (timer_pending(&inst->timer))
 		del_timer(&inst->timer);
 
+	if (!inst->skb)
+		return 0;
+
 	if (inst->qlen > 1)
 		inst->lastnlh->nlmsg_type = NLMSG_DONE;
 
-- 
cgit v1.2.3


From e9fa4f7bd291c29a785666e2fa5a9cf3241ee6c3 Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Sun, 13 Aug 2006 20:12:58 -0700
Subject: [INET]: Use pskb_trim_unique when trimming paged unique skbs

The IPv4/IPv6 datagram output path was using skb_trim to trim paged
packets because they know that the packet has not been cloned yet
(since the packet hasn't been given to anything else in the system).

This broke because skb_trim no longer allows paged packets to be
trimmed.  Paged packets must be given to one of the pskb_trim functions
instead.

This patch adds a new pskb_trim_unique function to cover the IPv4/IPv6
datagram output path scenario and replaces the corresponding skb_trim
calls with it.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skbuff.h | 15 +++++++++++++++
 net/ipv4/ip_output.c   |  4 ++--
 net/ipv6/ip6_output.c  |  2 +-
 3 files changed, 18 insertions(+), 3 deletions(-)

(limited to 'net')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 3573ba9a2555..755e9cddac47 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -1039,6 +1039,21 @@ static inline int pskb_trim(struct sk_buff *skb, unsigned int len)
 	return (len < skb->len) ? __pskb_trim(skb, len) : 0;
 }
 
+/**
+ *	pskb_trim_unique - remove end from a paged unique (not cloned) buffer
+ *	@skb: buffer to alter
+ *	@len: new length
+ *
+ *	This is identical to pskb_trim except that the caller knows that
+ *	the skb is not cloned so we should never get an error due to out-
+ *	of-memory.
+ */
+static inline void pskb_trim_unique(struct sk_buff *skb, unsigned int len)
+{
+	int err = pskb_trim(skb, len);
+	BUG_ON(err);
+}
+
 /**
  *	skb_orphan - orphan a buffer
  *	@skb: buffer to orphan
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 9bf307a29783..4c20f5546893 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -947,7 +947,7 @@ alloc_new_skb:
 				skb_prev->csum = csum_sub(skb_prev->csum,
 							  skb->csum);
 				data += fraggap;
-				skb_trim(skb_prev, maxfraglen);
+				pskb_trim_unique(skb_prev, maxfraglen);
 			}
 
 			copy = datalen - transhdrlen - fraggap;
@@ -1142,7 +1142,7 @@ ssize_t	ip_append_page(struct sock *sk, struct page *page,
 					data, fraggap, 0);
 				skb_prev->csum = csum_sub(skb_prev->csum,
 							  skb->csum);
-				skb_trim(skb_prev, maxfraglen);
+				pskb_trim_unique(skb_prev, maxfraglen);
 			}
 
 			/*
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index 69451af6abe7..4fb47a252913 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -1095,7 +1095,7 @@ alloc_new_skb:
 				skb_prev->csum = csum_sub(skb_prev->csum,
 							  skb->csum);
 				data += fraggap;
-				skb_trim(skb_prev, maxfraglen);
+				pskb_trim_unique(skb_prev, maxfraglen);
 			}
 			copy = datalen - transhdrlen - fraggap;
 			if (copy < 0) {
-- 
cgit v1.2.3