From 91c66c6893a3e2bb8a88a30cb76007d5d49d32c9 Mon Sep 17 00:00:00 2001 From: Jesper Juhl Date: Fri, 29 Jul 2011 16:38:49 +0200 Subject: netfilter: ip_queue: Fix small leak in ipq_build_packet_message() ipq_build_packet_message() in net/ipv4/netfilter/ip_queue.c and net/ipv6/netfilter/ip6_queue.c contain a small potential mem leak as far as I can tell. We allocate memory for 'skb' with alloc_skb() annd then call nlh = NLMSG_PUT(skb, 0, 0, IPQM_PACKET, size - sizeof(*nlh)); NLMSG_PUT is a macro NLMSG_PUT(skb, pid, seq, type, len) \ NLMSG_NEW(skb, pid, seq, type, len, 0) that expands to NLMSG_NEW, which is also a macro which expands to: NLMSG_NEW(skb, pid, seq, type, len, flags) \ ({ if (unlikely(skb_tailroom(skb) < (int)NLMSG_SPACE(len))) \ goto nlmsg_failure; \ __nlmsg_put(skb, pid, seq, type, len, flags); }) If we take the true branch of the 'if' statement and 'goto nlmsg_failure', then we'll, at that point, return from ipq_build_packet_message() without having assigned 'skb' to anything and we'll leak the memory we allocated for it when it goes out of scope. Fix this by placing a 'kfree(skb)' at 'nlmsg_failure'. I admit that I do not know how likely this to actually happen or even if there's something that guarantees that it will never happen - I'm not that familiar with this code, but if that is so, I've not been able to spot it. Signed-off-by: Jesper Juhl Signed-off-by: Patrick McHardy --- net/ipv4/netfilter/ip_queue.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net/ipv4') diff --git a/net/ipv4/netfilter/ip_queue.c b/net/ipv4/netfilter/ip_queue.c index 5c9b9d963918..48f7d5b4ff37 100644 --- a/net/ipv4/netfilter/ip_queue.c +++ b/net/ipv4/netfilter/ip_queue.c @@ -218,6 +218,7 @@ ipq_build_packet_message(struct nf_queue_entry *entry, int *errp) return skb; nlmsg_failure: + kfree_skb(skb); *errp = -EINVAL; printk(KERN_ERR "ip_queue: error creating packet message\n"); return NULL; -- cgit v1.2.3 From d547f727df86059104af2234804fdd538e112015 Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Sun, 7 Aug 2011 22:20:20 -0700 Subject: ipv4: fix the reusing of routing cache entries compare_keys and ip_route_input_common rely on rt_oif for distinguishing of input and output routes with same keys values. But sometimes the input route has also same hash chain (keyed by iif != 0) with the output routes (keyed by orig_oif=0). Problem visible if running with small number of rhash_entries. Fix them to use rt_route_iif instead. By this way input route can not be returned to users that request output route. The patch fixes the ip_rt_bug errors that were reported in ip_local_out context, mostly for 255.255.255.255 destinations. Signed-off-by: Julian Anastasov Signed-off-by: David S. Miller --- net/ipv4/route.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net/ipv4') diff --git a/net/ipv4/route.c b/net/ipv4/route.c index e3dec1c9f09d..cb7efe0567f0 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -731,6 +731,7 @@ static inline int compare_keys(struct rtable *rt1, struct rtable *rt2) ((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) | (rt1->rt_mark ^ rt2->rt_mark) | (rt1->rt_key_tos ^ rt2->rt_key_tos) | + (rt1->rt_route_iif ^ rt2->rt_route_iif) | (rt1->rt_oif ^ rt2->rt_oif) | (rt1->rt_iif ^ rt2->rt_iif)) == 0; } @@ -2321,8 +2322,8 @@ int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr, if ((((__force u32)rth->rt_key_dst ^ (__force u32)daddr) | ((__force u32)rth->rt_key_src ^ (__force u32)saddr) | (rth->rt_iif ^ iif) | - rth->rt_oif | (rth->rt_key_tos ^ tos)) == 0 && + rt_is_input_route(rth) && rth->rt_mark == skb->mark && net_eq(dev_net(rth->dst.dev), net) && !rt_is_expired(rth)) { -- cgit v1.2.3 From dd23198e58cd35259dd09e8892bbdb90f1d57748 Mon Sep 17 00:00:00 2001 From: Daniel Baluta Date: Sun, 7 Aug 2011 22:31:07 -0700 Subject: ipv4: Fix ip_getsockopt for IP_PKTOPTIONS IP_PKTOPTIONS is broken for 32-bit applications running in COMPAT mode on 64-bit kernels. This happens because msghdr's msg_flags field is always set to zero. When running in COMPAT mode this should be set to MSG_CMSG_COMPAT instead. Signed-off-by: Tiberiu Szocs-Mihai Signed-off-by: Daniel Baluta Signed-off-by: David S. Miller --- net/ipv4/ip_sockglue.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index ab0c9efd1efa..8905e92f896a 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -1067,7 +1067,7 @@ EXPORT_SYMBOL(compat_ip_setsockopt); */ static int do_ip_getsockopt(struct sock *sk, int level, int optname, - char __user *optval, int __user *optlen) + char __user *optval, int __user *optlen, unsigned flags) { struct inet_sock *inet = inet_sk(sk); int val; @@ -1240,7 +1240,7 @@ static int do_ip_getsockopt(struct sock *sk, int level, int optname, msg.msg_control = optval; msg.msg_controllen = len; - msg.msg_flags = 0; + msg.msg_flags = flags; if (inet->cmsg_flags & IP_CMSG_PKTINFO) { struct in_pktinfo info; @@ -1294,7 +1294,7 @@ int ip_getsockopt(struct sock *sk, int level, { int err; - err = do_ip_getsockopt(sk, level, optname, optval, optlen); + err = do_ip_getsockopt(sk, level, optname, optval, optlen, 0); #ifdef CONFIG_NETFILTER /* we need to exclude all possible ENOPROTOOPTs except default case */ if (err == -ENOPROTOOPT && optname != IP_PKTOPTIONS && @@ -1327,7 +1327,8 @@ int compat_ip_getsockopt(struct sock *sk, int level, int optname, return compat_mc_getsockopt(sk, level, optname, optval, optlen, ip_getsockopt); - err = do_ip_getsockopt(sk, level, optname, optval, optlen); + err = do_ip_getsockopt(sk, level, optname, optval, optlen, + MSG_CMSG_COMPAT); #ifdef CONFIG_NETFILTER /* we need to exclude all possible ENOPROTOOPTs except default case */ -- cgit v1.2.3 From 797fd3913abf2f7036003ab8d3d019cbea41affd Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Sun, 7 Aug 2011 09:11:00 +0000 Subject: netfilter: TCP and raw fix for ip_route_me_harder TCP in some cases uses different global (raw) socket to send RST and ACK. The transparent flag is not set there. Currently, it is a problem for rerouting after the previous change. Fix it by simplifying the checks in ip_route_me_harder and use FLOWI_FLAG_ANYSRC even for sockets. It looks safe because the initial routing allowed this source address to be used and now we just have to make sure the packet is rerouted. As a side effect this also allows rerouting for normal raw sockets that use spoofed source addresses which was not possible even before we eliminated the ip_route_input call. Signed-off-by: Julian Anastasov Signed-off-by: David S. Miller --- net/ipv4/netfilter.c | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/netfilter.c b/net/ipv4/netfilter.c index 2e97e3ec1eb7..929b27bdeb79 100644 --- a/net/ipv4/netfilter.c +++ b/net/ipv4/netfilter.c @@ -18,17 +18,15 @@ int ip_route_me_harder(struct sk_buff *skb, unsigned addr_type) struct rtable *rt; struct flowi4 fl4 = {}; __be32 saddr = iph->saddr; - __u8 flags = 0; + __u8 flags = skb->sk ? inet_sk_flowi_flags(skb->sk) : 0; unsigned int hh_len; - if (!skb->sk && addr_type != RTN_LOCAL) { - if (addr_type == RTN_UNSPEC) - addr_type = inet_addr_type(net, saddr); - if (addr_type == RTN_LOCAL || addr_type == RTN_UNICAST) - flags |= FLOWI_FLAG_ANYSRC; - else - saddr = 0; - } + if (addr_type == RTN_UNSPEC) + addr_type = inet_addr_type(net, saddr); + if (addr_type == RTN_LOCAL || addr_type == RTN_UNICAST) + flags |= FLOWI_FLAG_ANYSRC; + else + saddr = 0; /* some non-standard hacks like ipt_REJECT.c:send_reset() can cause * packets with foreign saddr to appear on the NF_INET_LOCAL_OUT hook. @@ -38,7 +36,7 @@ int ip_route_me_harder(struct sk_buff *skb, unsigned addr_type) fl4.flowi4_tos = RT_TOS(iph->tos); fl4.flowi4_oif = skb->sk ? skb->sk->sk_bound_dev_if : 0; fl4.flowi4_mark = skb->mark; - fl4.flowi4_flags = skb->sk ? inet_sk_flowi_flags(skb->sk) : flags; + fl4.flowi4_flags = flags; rt = ip_route_output_key(net, &fl4); if (IS_ERR(rt)) return -1; -- cgit v1.2.3 From 47670b767b1593433b516df7798df03f858278be Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Sun, 7 Aug 2011 09:16:09 +0000 Subject: ipv4: route non-local sources for raw socket The raw sockets can provide source address for routing but their privileges are not considered. We can provide non-local source address, make sure the FLOWI_FLAG_ANYSRC flag is set if socket has privileges for this, i.e. based on hdrincl (IP_HDRINCL) and transparent flags. Signed-off-by: Julian Anastasov Signed-off-by: David S. Miller --- include/net/inet_sock.h | 2 +- net/ipv4/raw.c | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) (limited to 'net/ipv4') diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h index caaff5f5f39f..b897d6e6d0a5 100644 --- a/include/net/inet_sock.h +++ b/include/net/inet_sock.h @@ -238,7 +238,7 @@ static inline __u8 inet_sk_flowi_flags(const struct sock *sk) { __u8 flags = 0; - if (inet_sk(sk)->transparent) + if (inet_sk(sk)->transparent || inet_sk(sk)->hdrincl) flags |= FLOWI_FLAG_ANYSRC; if (sk->sk_protocol == IPPROTO_TCP) flags |= FLOWI_FLAG_PRECOW_METRICS; diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index 1457acb39cec..61714bd52925 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -563,7 +563,8 @@ static int raw_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, flowi4_init_output(&fl4, ipc.oif, sk->sk_mark, tos, RT_SCOPE_UNIVERSE, inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol, - FLOWI_FLAG_CAN_SLEEP, daddr, saddr, 0, 0); + inet_sk_flowi_flags(sk) | FLOWI_FLAG_CAN_SLEEP, + daddr, saddr, 0, 0); if (!inet->hdrincl) { err = raw_probe_proto_opt(&fl4, msg); -- cgit v1.2.3 From d52fbfc9e5c7bb0b0dbc256edf17dee170ce839d Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Sun, 7 Aug 2011 10:17:22 +0000 Subject: ipv4: use dst with ref during bcast/mcast loopback Make sure skb dst has reference when moving to another context. Currently, I don't see protocols that can hit it when sending broadcasts/multicasts to loopback using noref dsts, so it is just a precaution. Signed-off-by: Julian Anastasov Signed-off-by: David S. Miller --- net/ipv4/ip_output.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net/ipv4') diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 77d3eded665a..8c6563361ab5 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -122,6 +122,7 @@ static int ip_dev_loopback_xmit(struct sk_buff *newskb) newskb->pkt_type = PACKET_LOOPBACK; newskb->ip_summed = CHECKSUM_UNNECESSARY; WARN_ON(!skb_dst(newskb)); + skb_dst_force(newskb); netif_rx_ni(newskb); return 0; } -- cgit v1.2.3 From f0e3d0689da401f7d1981c2777a714ba295ea5ff Mon Sep 17 00:00:00 2001 From: Mike Waychison Date: Wed, 10 Aug 2011 21:59:57 -0700 Subject: tcp: initialize variable ecn_ok in syncookies path Using a gcc 4.4.3, warnings are emitted for a possibly uninitialized use of ecn_ok. This can happen if cookie_check_timestamp() returns due to not having seen a timestamp. Defaulting to ecn off seems like a reasonable thing to do in this case, so initialized ecn_ok to false. Signed-off-by: Mike Waychison Signed-off-by: David S. Miller --- net/ipv4/syncookies.c | 2 +- net/ipv6/syncookies.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index 92bb9434b338..3bc5c8f7c71b 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c @@ -276,7 +276,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb, int mss; struct rtable *rt; __u8 rcv_wscale; - bool ecn_ok; + bool ecn_ok = false; if (!sysctl_tcp_syncookies || !th->ack || th->rst) goto out; diff --git a/net/ipv6/syncookies.c b/net/ipv6/syncookies.c index 89d5bf806222..ac838965ff34 100644 --- a/net/ipv6/syncookies.c +++ b/net/ipv6/syncookies.c @@ -165,7 +165,7 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb) int mss; struct dst_entry *dst; __u8 rcv_wscale; - bool ecn_ok; + bool ecn_ok = false; if (!sysctl_tcp_syncookies || !th->ack || th->rst) goto out; -- cgit v1.2.3 From 97a804102021431fa6fa33c21c85df762b0f5cb9 Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Tue, 9 Aug 2011 04:01:16 +0000 Subject: ipv4: some rt_iif -> rt_route_iif conversions As rt_iif represents input device even for packets coming from loopback with output route, it is not an unique key specific to input routes. Now rt_route_iif has such role, it was fl.iif in 2.6.38, so better to change the checks at some places to save CPU cycles and to restore 2.6.38 semantics. compare_keys: - input routes: only rt_route_iif matters, rt_iif is same - output routes: only rt_oif matters, rt_iif is not used for matching in __ip_route_output_key - now we are back to 2.6.38 state ip_route_input_common: - matching rt_route_iif implies input route - compared to 2.6.38 we eliminated one rth->fl.oif check because it was not needed even for 2.6.38 compare_hash_inputs: Only the change here is not an optimization, it has effect only for output routes. I assume I'm restoring the original intention to ignore oif, it was using fl.iif - now we are back to 2.6.38 state Signed-off-by: Julian Anastasov Signed-off-by: David S. Miller --- net/ipv4/route.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/route.c b/net/ipv4/route.c index cb7efe0567f0..075212e41b83 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -722,7 +722,7 @@ static inline bool compare_hash_inputs(const struct rtable *rt1, { return ((((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) | ((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) | - (rt1->rt_iif ^ rt2->rt_iif)) == 0); + (rt1->rt_route_iif ^ rt2->rt_route_iif)) == 0); } static inline int compare_keys(struct rtable *rt1, struct rtable *rt2) @@ -732,8 +732,7 @@ static inline int compare_keys(struct rtable *rt1, struct rtable *rt2) (rt1->rt_mark ^ rt2->rt_mark) | (rt1->rt_key_tos ^ rt2->rt_key_tos) | (rt1->rt_route_iif ^ rt2->rt_route_iif) | - (rt1->rt_oif ^ rt2->rt_oif) | - (rt1->rt_iif ^ rt2->rt_iif)) == 0; + (rt1->rt_oif ^ rt2->rt_oif)) == 0; } static inline int compare_netns(struct rtable *rt1, struct rtable *rt2) @@ -2321,9 +2320,8 @@ int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr, rth = rcu_dereference(rth->dst.rt_next)) { if ((((__force u32)rth->rt_key_dst ^ (__force u32)daddr) | ((__force u32)rth->rt_key_src ^ (__force u32)saddr) | - (rth->rt_iif ^ iif) | + (rth->rt_route_iif ^ iif) | (rth->rt_key_tos ^ tos)) == 0 && - rt_is_input_route(rth) && rth->rt_mark == skb->mark && net_eq(dev_net(rth->dst.dev), net) && !rt_is_expired(rth)) { -- cgit v1.2.3 From e05c4ad3ed874ee4f5e2c969e55d318ec654332c Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Tue, 23 Aug 2011 22:54:37 +0000 Subject: mcast: Fix source address selection for multicast listener report Should check use count of include mode filter instead of total number of include mode filters. Signed-off-by: Zheng Yan Signed-off-by: David S. Miller --- net/ipv4/igmp.c | 2 +- net/ipv6/mcast.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index 283c0a26e03f..d577199eabd5 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -767,7 +767,7 @@ static int igmp_xmarksources(struct ip_mc_list *pmc, int nsrcs, __be32 *srcs) break; for (i=0; isfcount[MCAST_INCLUDE] || + if (psf->sf_count[MCAST_INCLUDE] || pmc->sfcount[MCAST_EXCLUDE] != psf->sf_count[MCAST_EXCLUDE]) continue; diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c index 3e6ebcdb4779..ee7839f4d6e3 100644 --- a/net/ipv6/mcast.c +++ b/net/ipv6/mcast.c @@ -1059,7 +1059,7 @@ static int mld_xmarksources(struct ifmcaddr6 *pmc, int nsrcs, break; for (i=0; imca_sfcount[MCAST_INCLUDE] || + if (psf->sf_count[MCAST_INCLUDE] || pmc->mca_sfcount[MCAST_EXCLUDE] != psf->sf_count[MCAST_EXCLUDE]) continue; -- cgit v1.2.3 From c6675233f9015d3c0460c8aab53ed9b99d915c64 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 30 Aug 2011 15:01:20 +0200 Subject: netfilter: nf_queue: reject NF_STOLEN verdicts from userspace A userspace listener may send (bogus) NF_STOLEN verdict, which causes skb leak. This problem was previously fixed via 64507fdbc29c3a622180378210ecea8659b14e40 (netfilter: nf_queue: fix NF_STOLEN skb leak) but this had to be reverted because NF_STOLEN can also be returned by a netfilter hook when iterating the rules in nf_reinject. Reject userspace NF_STOLEN verdict, as suggested by Michal Miroslaw. This is complementary to commit fad54440438a7c231a6ae347738423cbabc936d9 (netfilter: avoid double free in nf_reinject). Cc: Julian Anastasov Cc: Eric Dumazet Signed-off-by: Florian Westphal Signed-off-by: Patrick McHardy --- net/ipv4/netfilter/ip_queue.c | 11 ++++------- net/ipv6/netfilter/ip6_queue.c | 11 ++++------- net/netfilter/nfnetlink_queue.c | 4 ++-- 3 files changed, 10 insertions(+), 16 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/netfilter/ip_queue.c b/net/ipv4/netfilter/ip_queue.c index 48f7d5b4ff37..e59aabd0eae4 100644 --- a/net/ipv4/netfilter/ip_queue.c +++ b/net/ipv4/netfilter/ip_queue.c @@ -314,7 +314,7 @@ ipq_set_verdict(struct ipq_verdict_msg *vmsg, unsigned int len) { struct nf_queue_entry *entry; - if (vmsg->value > NF_MAX_VERDICT) + if (vmsg->value > NF_MAX_VERDICT || vmsg->value == NF_STOLEN) return -EINVAL; entry = ipq_find_dequeue_entry(vmsg->id); @@ -359,12 +359,9 @@ ipq_receive_peer(struct ipq_peer_msg *pmsg, break; case IPQM_VERDICT: - if (pmsg->msg.verdict.value > NF_MAX_VERDICT) - status = -EINVAL; - else - status = ipq_set_verdict(&pmsg->msg.verdict, - len - sizeof(*pmsg)); - break; + status = ipq_set_verdict(&pmsg->msg.verdict, + len - sizeof(*pmsg)); + break; default: status = -EINVAL; } diff --git a/net/ipv6/netfilter/ip6_queue.c b/net/ipv6/netfilter/ip6_queue.c index 87b243a25afa..e63c3972a739 100644 --- a/net/ipv6/netfilter/ip6_queue.c +++ b/net/ipv6/netfilter/ip6_queue.c @@ -314,7 +314,7 @@ ipq_set_verdict(struct ipq_verdict_msg *vmsg, unsigned int len) { struct nf_queue_entry *entry; - if (vmsg->value > NF_MAX_VERDICT) + if (vmsg->value > NF_MAX_VERDICT || vmsg->value == NF_STOLEN) return -EINVAL; entry = ipq_find_dequeue_entry(vmsg->id); @@ -359,12 +359,9 @@ ipq_receive_peer(struct ipq_peer_msg *pmsg, break; case IPQM_VERDICT: - if (pmsg->msg.verdict.value > NF_MAX_VERDICT) - status = -EINVAL; - else - status = ipq_set_verdict(&pmsg->msg.verdict, - len - sizeof(*pmsg)); - break; + status = ipq_set_verdict(&pmsg->msg.verdict, + len - sizeof(*pmsg)); + break; default: status = -EINVAL; } diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c index 00bd475eab4b..a80b0cb03f17 100644 --- a/net/netfilter/nfnetlink_queue.c +++ b/net/netfilter/nfnetlink_queue.c @@ -646,8 +646,8 @@ verdicthdr_get(const struct nlattr * const nfqa[]) return NULL; vhdr = nla_data(nfqa[NFQA_VERDICT_HDR]); - verdict = ntohl(vhdr->verdict); - if ((verdict & NF_VERDICT_MASK) > NF_MAX_VERDICT) + verdict = ntohl(vhdr->verdict) & NF_VERDICT_MASK; + if (verdict > NF_MAX_VERDICT || verdict == NF_STOLEN) return NULL; return vhdr; } -- cgit v1.2.3 From 29c486df6a208432b370bd4be99ae1369ede28d8 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 30 Aug 2011 18:57:00 -0400 Subject: net: ipv4: relax AF_INET check in bind() commit d0733d2e29b65 (Check for mistakenly passed in non-IPv4 address) added regression on legacy apps that use bind() with AF_UNSPEC family. Relax the check, but make sure the bind() is done on INADDR_ANY addresses, as AF_UNSPEC has probably no sane meaning for other addresses. Bugzilla reference : https://bugzilla.kernel.org/show_bug.cgi?id=42012 Signed-off-by: Eric Dumazet Reported-and-bisected-by: Rene Meier CC: Marcus Meissner Signed-off-by: David S. Miller --- net/ipv4/af_inet.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'net/ipv4') diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 1b745d412cf6..dd2b9478ddd1 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -466,8 +466,13 @@ int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) goto out; if (addr->sin_family != AF_INET) { + /* Compatibility games : accept AF_UNSPEC (mapped to AF_INET) + * only if s_addr is INADDR_ANY. + */ err = -EAFNOSUPPORT; - goto out; + if (addr->sin_family != AF_UNSPEC || + addr->sin_addr.s_addr != htonl(INADDR_ANY)) + goto out; } chk_addr_ret = inet_addr_type(sock_net(sk), addr->sin_addr.s_addr); -- cgit v1.2.3 From 946cedccbd7387488d2cee5da92cdfeb28d2e670 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 30 Aug 2011 03:21:44 +0000 Subject: tcp: Change possible SYN flooding messages "Possible SYN flooding on port xxxx " messages can fill logs on servers. Change logic to log the message only once per listener, and add two new SNMP counters to track : TCPReqQFullDoCookies : number of times a SYNCOOKIE was replied to client TCPReqQFullDrop : number of times a SYN request was dropped because syncookies were not enabled. Based on a prior patch from Tom Herbert, and suggestions from David. Signed-off-by: Eric Dumazet CC: Tom Herbert Signed-off-by: David S. Miller --- include/linux/snmp.h | 2 ++ include/net/request_sock.h | 3 ++- include/net/tcp.h | 3 +++ net/ipv4/proc.c | 2 ++ net/ipv4/tcp_ipv4.c | 49 ++++++++++++++++++++++++++-------------------- net/ipv6/tcp_ipv6.c | 31 +++-------------------------- 6 files changed, 40 insertions(+), 50 deletions(-) (limited to 'net/ipv4') diff --git a/include/linux/snmp.h b/include/linux/snmp.h index 12b2b18e50c1..e16557a357e5 100644 --- a/include/linux/snmp.h +++ b/include/linux/snmp.h @@ -231,6 +231,8 @@ enum LINUX_MIB_TCPDEFERACCEPTDROP, LINUX_MIB_IPRPFILTER, /* IP Reverse Path Filter (rp_filter) */ LINUX_MIB_TCPTIMEWAITOVERFLOW, /* TCPTimeWaitOverflow */ + LINUX_MIB_TCPREQQFULLDOCOOKIES, /* TCPReqQFullDoCookies */ + LINUX_MIB_TCPREQQFULLDROP, /* TCPReqQFullDrop */ __LINUX_MIB_MAX }; diff --git a/include/net/request_sock.h b/include/net/request_sock.h index 99e6e19b57c2..4c0766e201e3 100644 --- a/include/net/request_sock.h +++ b/include/net/request_sock.h @@ -96,7 +96,8 @@ extern int sysctl_max_syn_backlog; */ struct listen_sock { u8 max_qlen_log; - /* 3 bytes hole, try to use */ + u8 synflood_warned; + /* 2 bytes hole, try to use */ int qlen; int qlen_young; int clock_hand; diff --git a/include/net/tcp.h b/include/net/tcp.h index 149a415d1e0a..e9b48b094683 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -460,6 +460,9 @@ extern int tcp_write_wakeup(struct sock *); extern void tcp_send_fin(struct sock *sk); extern void tcp_send_active_reset(struct sock *sk, gfp_t priority); extern int tcp_send_synack(struct sock *); +extern int tcp_syn_flood_action(struct sock *sk, + const struct sk_buff *skb, + const char *proto); extern void tcp_push_one(struct sock *, unsigned int mss_now); extern void tcp_send_ack(struct sock *sk); extern void tcp_send_delayed_ack(struct sock *sk); diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index b14ec7d03b6e..4bfad5da94f4 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -254,6 +254,8 @@ static const struct snmp_mib snmp4_net_list[] = { SNMP_MIB_ITEM("TCPDeferAcceptDrop", LINUX_MIB_TCPDEFERACCEPTDROP), SNMP_MIB_ITEM("IPReversePathFilter", LINUX_MIB_IPRPFILTER), SNMP_MIB_ITEM("TCPTimeWaitOverflow", LINUX_MIB_TCPTIMEWAITOVERFLOW), + SNMP_MIB_ITEM("TCPReqQFullDoCookies", LINUX_MIB_TCPREQQFULLDOCOOKIES), + SNMP_MIB_ITEM("TCPReqQFullDrop", LINUX_MIB_TCPREQQFULLDROP), SNMP_MIB_SENTINEL }; diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 1c12b8ec849d..c34f01513945 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -808,20 +808,38 @@ static void tcp_v4_reqsk_destructor(struct request_sock *req) kfree(inet_rsk(req)->opt); } -static void syn_flood_warning(const struct sk_buff *skb) +/* + * Return 1 if a syncookie should be sent + */ +int tcp_syn_flood_action(struct sock *sk, + const struct sk_buff *skb, + const char *proto) { - const char *msg; + const char *msg = "Dropping request"; + int want_cookie = 0; + struct listen_sock *lopt; + + #ifdef CONFIG_SYN_COOKIES - if (sysctl_tcp_syncookies) + if (sysctl_tcp_syncookies) { msg = "Sending cookies"; - else + want_cookie = 1; + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPREQQFULLDOCOOKIES); + } else #endif - msg = "Dropping request"; + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPREQQFULLDROP); - pr_info("TCP: Possible SYN flooding on port %d. %s.\n", - ntohs(tcp_hdr(skb)->dest), msg); + lopt = inet_csk(sk)->icsk_accept_queue.listen_opt; + if (!lopt->synflood_warned) { + lopt->synflood_warned = 1; + pr_info("%s: Possible SYN flooding on port %d. %s. " + " Check SNMP counters.\n", + proto, ntohs(tcp_hdr(skb)->dest), msg); + } + return want_cookie; } +EXPORT_SYMBOL(tcp_syn_flood_action); /* * Save and compile IPv4 options into the request_sock if needed. @@ -1235,11 +1253,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) __be32 saddr = ip_hdr(skb)->saddr; __be32 daddr = ip_hdr(skb)->daddr; __u32 isn = TCP_SKB_CB(skb)->when; -#ifdef CONFIG_SYN_COOKIES int want_cookie = 0; -#else -#define want_cookie 0 /* Argh, why doesn't gcc optimize this :( */ -#endif /* Never answer to SYNs send to broadcast or multicast */ if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) @@ -1250,14 +1264,9 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) * evidently real one. */ if (inet_csk_reqsk_queue_is_full(sk) && !isn) { - if (net_ratelimit()) - syn_flood_warning(skb); -#ifdef CONFIG_SYN_COOKIES - if (sysctl_tcp_syncookies) { - want_cookie = 1; - } else -#endif - goto drop; + want_cookie = tcp_syn_flood_action(sk, skb, "TCP"); + if (!want_cookie) + goto drop; } /* Accept backlog is full. If we have already queued enough @@ -1303,9 +1312,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) while (l-- > 0) *c++ ^= *hash_location++; -#ifdef CONFIG_SYN_COOKIES want_cookie = 0; /* not our kind of cookie */ -#endif tmp_ext.cookie_out_never = 0; /* false */ tmp_ext.cookie_plus = tmp_opt.cookie_plus; } else if (!tp->rx_opt.cookie_in_always) { diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index d1fb63f4aeb7..3c9fa618b69d 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -531,20 +531,6 @@ static int tcp_v6_rtx_synack(struct sock *sk, struct request_sock *req, return tcp_v6_send_synack(sk, req, rvp); } -static inline void syn_flood_warning(struct sk_buff *skb) -{ -#ifdef CONFIG_SYN_COOKIES - if (sysctl_tcp_syncookies) - printk(KERN_INFO - "TCPv6: Possible SYN flooding on port %d. " - "Sending cookies.\n", ntohs(tcp_hdr(skb)->dest)); - else -#endif - printk(KERN_INFO - "TCPv6: Possible SYN flooding on port %d. " - "Dropping request.\n", ntohs(tcp_hdr(skb)->dest)); -} - static void tcp_v6_reqsk_destructor(struct request_sock *req) { kfree_skb(inet6_rsk(req)->pktopts); @@ -1179,11 +1165,7 @@ static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb) struct tcp_sock *tp = tcp_sk(sk); __u32 isn = TCP_SKB_CB(skb)->when; struct dst_entry *dst = NULL; -#ifdef CONFIG_SYN_COOKIES int want_cookie = 0; -#else -#define want_cookie 0 -#endif if (skb->protocol == htons(ETH_P_IP)) return tcp_v4_conn_request(sk, skb); @@ -1192,14 +1174,9 @@ static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb) goto drop; if (inet_csk_reqsk_queue_is_full(sk) && !isn) { - if (net_ratelimit()) - syn_flood_warning(skb); -#ifdef CONFIG_SYN_COOKIES - if (sysctl_tcp_syncookies) - want_cookie = 1; - else -#endif - goto drop; + want_cookie = tcp_syn_flood_action(sk, skb, "TCPv6"); + if (!want_cookie) + goto drop; } if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1) @@ -1249,9 +1226,7 @@ static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb) while (l-- > 0) *c++ ^= *hash_location++; -#ifdef CONFIG_SYN_COOKIES want_cookie = 0; /* not our kind of cookie */ -#endif tmp_ext.cookie_out_never = 0; /* false */ tmp_ext.cookie_plus = tmp_opt.cookie_plus; } else if (!tp->rx_opt.cookie_in_always) { -- cgit v1.2.3 From 19c1ea14c930db5e9c0cd7c3c6f4d01457dfcd69 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Sun, 4 Sep 2011 20:24:20 +0000 Subject: ipv4: Fix fib_info->fib_metrics leak Commit 4670994d(net,rcu: convert call_rcu(fc_rport_free_rcu) to kfree_rcu()) introduced a memory leak. This patch reverts it. Signed-off-by: Zheng Yan Signed-off-by: David S. Miller --- net/ipv4/fib_semantics.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'net/ipv4') diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index 33e2c35b74b7..80106d89d548 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -142,6 +142,14 @@ const struct fib_prop fib_props[RTN_MAX + 1] = { }; /* Release a nexthop info record */ +static void free_fib_info_rcu(struct rcu_head *head) +{ + struct fib_info *fi = container_of(head, struct fib_info, rcu); + + if (fi->fib_metrics != (u32 *) dst_default_metrics) + kfree(fi->fib_metrics); + kfree(fi); +} void free_fib_info(struct fib_info *fi) { @@ -156,7 +164,7 @@ void free_fib_info(struct fib_info *fi) } endfor_nexthops(fi); fib_info_cnt--; release_net(fi->fib_net); - kfree_rcu(fi, rcu); + call_rcu(&fi->rcu, free_fib_info_rcu); } void fib_release_info(struct fib_info *fi) -- cgit v1.2.3 From f779b2d60ab95c17f1e025778ed0df3ec2f05d75 Mon Sep 17 00:00:00 2001 From: Zheng Yan Date: Sun, 18 Sep 2011 22:37:34 -0400 Subject: tcp: fix validation of D-SACK D-SACK is allowed to reside below snd_una. But the corresponding check in tcp_is_sackblock_valid() is the exact opposite. It looks like a typo. Signed-off-by: Zheng Yan Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index ea0d2183df4b..21fab3edb92c 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -1124,7 +1124,7 @@ static int tcp_is_sackblock_valid(struct tcp_sock *tp, int is_dsack, return 0; /* ...Then it's D-SACK, and must reside below snd_una completely */ - if (!after(end_seq, tp->snd_una)) + if (after(end_seq, tp->snd_una)) return 0; if (!before(start_seq, tp->undo_marker)) -- cgit v1.2.3 From 260fcbeb1ae9e768a44c9925338fbacb0d7e5ba9 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Thu, 29 Sep 2011 17:10:10 +0000 Subject: tcp: properly handle md5sig_pool references tcp_v4_clear_md5_list() assumes that multiple tcp md5sig peers only hold one reference to md5sig_pool. but tcp_v4_md5_do_add() increases use count of md5sig_pool for each peer. This patch makes tcp_v4_md5_do_add() only increases use count for the first tcp md5sig peer. Signed-off-by: Zheng Yan Signed-off-by: David S. Miller --- net/ipv4/tcp_ipv4.c | 11 +++++++---- net/ipv6/tcp_ipv6.c | 8 +++++--- 2 files changed, 12 insertions(+), 7 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index c34f01513945..7963e03f1068 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -927,18 +927,21 @@ int tcp_v4_md5_do_add(struct sock *sk, __be32 addr, } sk_nocaps_add(sk, NETIF_F_GSO_MASK); } - if (tcp_alloc_md5sig_pool(sk) == NULL) { + + md5sig = tp->md5sig_info; + if (md5sig->entries4 == 0 && + tcp_alloc_md5sig_pool(sk) == NULL) { kfree(newkey); return -ENOMEM; } - md5sig = tp->md5sig_info; if (md5sig->alloced4 == md5sig->entries4) { keys = kmalloc((sizeof(*keys) * (md5sig->entries4 + 1)), GFP_ATOMIC); if (!keys) { kfree(newkey); - tcp_free_md5sig_pool(); + if (md5sig->entries4 == 0) + tcp_free_md5sig_pool(); return -ENOMEM; } @@ -982,6 +985,7 @@ int tcp_v4_md5_do_del(struct sock *sk, __be32 addr) kfree(tp->md5sig_info->keys4); tp->md5sig_info->keys4 = NULL; tp->md5sig_info->alloced4 = 0; + tcp_free_md5sig_pool(); } else if (tp->md5sig_info->entries4 != i) { /* Need to do some manipulation */ memmove(&tp->md5sig_info->keys4[i], @@ -989,7 +993,6 @@ int tcp_v4_md5_do_del(struct sock *sk, __be32 addr) (tp->md5sig_info->entries4 - i) * sizeof(struct tcp4_md5sig_key)); } - tcp_free_md5sig_pool(); return 0; } } diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 79cc6469508d..7b8fc5794352 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -591,7 +591,8 @@ static int tcp_v6_md5_do_add(struct sock *sk, const struct in6_addr *peer, } sk_nocaps_add(sk, NETIF_F_GSO_MASK); } - if (tcp_alloc_md5sig_pool(sk) == NULL) { + if (tp->md5sig_info->entries6 == 0 && + tcp_alloc_md5sig_pool(sk) == NULL) { kfree(newkey); return -ENOMEM; } @@ -600,8 +601,9 @@ static int tcp_v6_md5_do_add(struct sock *sk, const struct in6_addr *peer, (tp->md5sig_info->entries6 + 1)), GFP_ATOMIC); if (!keys) { - tcp_free_md5sig_pool(); kfree(newkey); + if (tp->md5sig_info->entries6 == 0) + tcp_free_md5sig_pool(); return -ENOMEM; } @@ -647,6 +649,7 @@ static int tcp_v6_md5_do_del(struct sock *sk, const struct in6_addr *peer) kfree(tp->md5sig_info->keys6); tp->md5sig_info->keys6 = NULL; tp->md5sig_info->alloced6 = 0; + tcp_free_md5sig_pool(); } else { /* shrink the database */ if (tp->md5sig_info->entries6 != i) @@ -655,7 +658,6 @@ static int tcp_v6_md5_do_del(struct sock *sk, const struct in6_addr *peer) (tp->md5sig_info->entries6 - i) * sizeof (tp->md5sig_info->keys6[0])); } - tcp_free_md5sig_pool(); return 0; } } -- cgit v1.2.3 From 1e5289e121372a3494402b1b131b41bfe1cf9b7f Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Sun, 2 Oct 2011 04:21:50 +0000 Subject: tcp: properly update lost_cnt_hint during shifting lost_skb_hint is used by tcp_mark_head_lost() to mark the first unhandled skb. lost_cnt_hint is the number of packets or sacked packets before the lost_skb_hint; When shifting a skb that is before the lost_skb_hint, if tcp_is_fack() is ture, the skb has already been counted in the lost_cnt_hint; if tcp_is_fack() is false, tcp_sacktag_one() will increase the lost_cnt_hint. So tcp_shifted_skb() does not need to adjust the lost_cnt_hint by itself. When shifting a skb that is equal to lost_skb_hint, the shifted packets will not be counted by tcp_mark_head_lost(). So tcp_shifted_skb() should adjust the lost_cnt_hint even tcp_is_fack(tp) is true. Signed-off-by: Zheng Yan Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 21fab3edb92c..d73aab3fbfc0 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -1389,9 +1389,7 @@ static int tcp_shifted_skb(struct sock *sk, struct sk_buff *skb, BUG_ON(!pcount); - /* Tweak before seqno plays */ - if (!tcp_is_fack(tp) && tcp_is_sack(tp) && tp->lost_skb_hint && - !before(TCP_SKB_CB(tp->lost_skb_hint)->seq, TCP_SKB_CB(skb)->seq)) + if (skb == tp->lost_skb_hint) tp->lost_cnt_hint += pcount; TCP_SKB_CB(prev)->end_seq += shifted; -- cgit v1.2.3