summaryrefslogtreecommitdiff
path: root/net/ipv4
diff options
context:
space:
mode:
Diffstat (limited to 'net/ipv4')
-rw-r--r--net/ipv4/af_inet.c52
-rw-r--r--net/ipv4/arp.c41
-rw-r--r--net/ipv4/devinet.c6
-rw-r--r--net/ipv4/fib_frontend.c6
-rw-r--r--net/ipv4/fib_trie.c12
-rw-r--r--net/ipv4/gre.c22
-rw-r--r--net/ipv4/icmp.c14
-rw-r--r--net/ipv4/inet_diag.c2
-rw-r--r--net/ipv4/inet_lro.c74
-rw-r--r--net/ipv4/inetpeer.c293
-rw-r--r--net/ipv4/ip_fragment.c5
-rw-r--r--net/ipv4/ip_gre.c2
-rw-r--r--net/ipv4/ip_input.c4
-rw-r--r--net/ipv4/ip_output.c16
-rw-r--r--net/ipv4/ipconfig.c75
-rw-r--r--net/ipv4/ipmr.c5
-rw-r--r--net/ipv4/netfilter/ipt_CLUSTERIP.c26
-rw-r--r--net/ipv4/netfilter/nf_defrag_ipv4.c2
-rw-r--r--net/ipv4/netfilter/nf_nat_snmp_basic.c210
-rw-r--r--net/ipv4/netfilter/nf_nat_standalone.c2
-rw-r--r--net/ipv4/raw.c36
-rw-r--r--net/ipv4/route.c96
-rw-r--r--net/ipv4/syncookies.c1
-rw-r--r--net/ipv4/sysctl_net_ipv4.c14
-rw-r--r--net/ipv4/tcp_input.c46
-rw-r--r--net/ipv4/tcp_ipv4.c11
-rw-r--r--net/ipv4/tcp_minisocks.c6
-rw-r--r--net/ipv4/udp.c2
-rw-r--r--net/ipv4/xfrm4_policy.c2
29 files changed, 450 insertions, 633 deletions
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index ef1528af7abf..1b745d412cf6 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -1440,11 +1440,11 @@ EXPORT_SYMBOL_GPL(inet_ctl_sock_create);
unsigned long snmp_fold_field(void __percpu *mib[], int offt)
{
unsigned long res = 0;
- int i;
+ int i, j;
for_each_possible_cpu(i) {
- res += *(((unsigned long *) per_cpu_ptr(mib[0], i)) + offt);
- res += *(((unsigned long *) per_cpu_ptr(mib[1], i)) + offt);
+ for (j = 0; j < SNMP_ARRAY_SZ; j++)
+ res += *(((unsigned long *) per_cpu_ptr(mib[j], i)) + offt);
}
return res;
}
@@ -1458,28 +1458,19 @@ u64 snmp_fold_field64(void __percpu *mib[], int offt, size_t syncp_offset)
int cpu;
for_each_possible_cpu(cpu) {
- void *bhptr, *userptr;
+ void *bhptr;
struct u64_stats_sync *syncp;
- u64 v_bh, v_user;
+ u64 v;
unsigned int start;
- /* first mib used by softirq context, we must use _bh() accessors */
- bhptr = per_cpu_ptr(SNMP_STAT_BHPTR(mib), cpu);
+ bhptr = per_cpu_ptr(mib[0], cpu);
syncp = (struct u64_stats_sync *)(bhptr + syncp_offset);
do {
start = u64_stats_fetch_begin_bh(syncp);
- v_bh = *(((u64 *) bhptr) + offt);
+ v = *(((u64 *) bhptr) + offt);
} while (u64_stats_fetch_retry_bh(syncp, start));
- /* second mib used in USER context */
- userptr = per_cpu_ptr(SNMP_STAT_USRPTR(mib), cpu);
- syncp = (struct u64_stats_sync *)(userptr + syncp_offset);
- do {
- start = u64_stats_fetch_begin(syncp);
- v_user = *(((u64 *) userptr) + offt);
- } while (u64_stats_fetch_retry(syncp, start));
-
- res += v_bh + v_user;
+ res += v;
}
return res;
}
@@ -1491,25 +1482,28 @@ int snmp_mib_init(void __percpu *ptr[2], size_t mibsize, size_t align)
BUG_ON(ptr == NULL);
ptr[0] = __alloc_percpu(mibsize, align);
if (!ptr[0])
- goto err0;
+ return -ENOMEM;
+#if SNMP_ARRAY_SZ == 2
ptr[1] = __alloc_percpu(mibsize, align);
- if (!ptr[1])
- goto err1;
+ if (!ptr[1]) {
+ free_percpu(ptr[0]);
+ ptr[0] = NULL;
+ return -ENOMEM;
+ }
+#endif
return 0;
-err1:
- free_percpu(ptr[0]);
- ptr[0] = NULL;
-err0:
- return -ENOMEM;
}
EXPORT_SYMBOL_GPL(snmp_mib_init);
-void snmp_mib_free(void __percpu *ptr[2])
+void snmp_mib_free(void __percpu *ptr[SNMP_ARRAY_SZ])
{
+ int i;
+
BUG_ON(ptr == NULL);
- free_percpu(ptr[0]);
- free_percpu(ptr[1]);
- ptr[0] = ptr[1] = NULL;
+ for (i = 0; i < SNMP_ARRAY_SZ; i++) {
+ free_percpu(ptr[i]);
+ ptr[i] = NULL;
+ }
}
EXPORT_SYMBOL_GPL(snmp_mib_free);
diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c
index 1b74d3b64371..96a164aa1367 100644
--- a/net/ipv4/arp.c
+++ b/net/ipv4/arp.c
@@ -97,7 +97,6 @@
#include <linux/init.h>
#include <linux/net.h>
#include <linux/rcupdate.h>
-#include <linux/jhash.h>
#include <linux/slab.h>
#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
@@ -139,8 +138,6 @@ static const struct neigh_ops arp_generic_ops = {
.error_report = arp_error_report,
.output = neigh_resolve_output,
.connected_output = neigh_connected_output,
- .hh_output = dev_queue_xmit,
- .queue_xmit = dev_queue_xmit,
};
static const struct neigh_ops arp_hh_ops = {
@@ -149,16 +146,12 @@ static const struct neigh_ops arp_hh_ops = {
.error_report = arp_error_report,
.output = neigh_resolve_output,
.connected_output = neigh_resolve_output,
- .hh_output = dev_queue_xmit,
- .queue_xmit = dev_queue_xmit,
};
static const struct neigh_ops arp_direct_ops = {
.family = AF_INET,
- .output = dev_queue_xmit,
- .connected_output = dev_queue_xmit,
- .hh_output = dev_queue_xmit,
- .queue_xmit = dev_queue_xmit,
+ .output = neigh_direct_output,
+ .connected_output = neigh_direct_output,
};
static const struct neigh_ops arp_broken_ops = {
@@ -167,8 +160,6 @@ static const struct neigh_ops arp_broken_ops = {
.error_report = arp_error_report,
.output = neigh_compat_output,
.connected_output = neigh_compat_output,
- .hh_output = dev_queue_xmit,
- .queue_xmit = dev_queue_xmit,
};
struct neigh_table arp_tbl = {
@@ -232,7 +223,7 @@ static u32 arp_hash(const void *pkey,
const struct net_device *dev,
__u32 hash_rnd)
{
- return jhash_2words(*(u32 *)pkey, dev->ifindex, hash_rnd);
+ return arp_hashfn(*(u32 *)pkey, dev, hash_rnd);
}
static int arp_constructor(struct neighbour *neigh)
@@ -259,7 +250,7 @@ static int arp_constructor(struct neighbour *neigh)
if (!dev->header_ops) {
neigh->nud_state = NUD_NOARP;
neigh->ops = &arp_direct_ops;
- neigh->output = neigh->ops->queue_xmit;
+ neigh->output = neigh_direct_output;
} else {
/* Good devices (checked by reading texts, but only Ethernet is
tested)
@@ -518,30 +509,6 @@ EXPORT_SYMBOL(arp_find);
/* END OF OBSOLETE FUNCTIONS */
-int arp_bind_neighbour(struct dst_entry *dst)
-{
- struct net_device *dev = dst->dev;
- struct neighbour *n = dst->neighbour;
-
- if (dev == NULL)
- return -EINVAL;
- if (n == NULL) {
- __be32 nexthop = ((struct rtable *)dst)->rt_gateway;
- if (dev->flags & (IFF_LOOPBACK | IFF_POINTOPOINT))
- nexthop = 0;
- n = __neigh_lookup_errno(
-#if defined(CONFIG_ATM_CLIP) || defined(CONFIG_ATM_CLIP_MODULE)
- dev->type == ARPHRD_ATM ?
- clip_tbl_hook :
-#endif
- &arp_tbl, &nexthop, dev);
- if (IS_ERR(n))
- return PTR_ERR(n);
- dst->neighbour = n;
- }
- return 0;
-}
-
/*
* Check if we can use proxy ARP for this path
*/
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index 0d4a184af16f..37b3c188d8b3 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -1833,8 +1833,8 @@ void __init devinet_init(void)
rtnl_af_register(&inet_af_ops);
- rtnl_register(PF_INET, RTM_NEWADDR, inet_rtm_newaddr, NULL);
- rtnl_register(PF_INET, RTM_DELADDR, inet_rtm_deladdr, NULL);
- rtnl_register(PF_INET, RTM_GETADDR, NULL, inet_dump_ifaddr);
+ rtnl_register(PF_INET, RTM_NEWADDR, inet_rtm_newaddr, NULL, NULL);
+ rtnl_register(PF_INET, RTM_DELADDR, inet_rtm_deladdr, NULL, NULL);
+ rtnl_register(PF_INET, RTM_GETADDR, NULL, inet_dump_ifaddr, NULL);
}
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 22524716fe70..92fc5f69f5da 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -1124,9 +1124,9 @@ static struct pernet_operations fib_net_ops = {
void __init ip_fib_init(void)
{
- rtnl_register(PF_INET, RTM_NEWROUTE, inet_rtm_newroute, NULL);
- rtnl_register(PF_INET, RTM_DELROUTE, inet_rtm_delroute, NULL);
- rtnl_register(PF_INET, RTM_GETROUTE, NULL, inet_dump_fib);
+ rtnl_register(PF_INET, RTM_NEWROUTE, inet_rtm_newroute, NULL, NULL);
+ rtnl_register(PF_INET, RTM_DELROUTE, inet_rtm_delroute, NULL, NULL);
+ rtnl_register(PF_INET, RTM_GETROUTE, NULL, inet_dump_fib, NULL);
register_pernet_subsys(&fib_net_ops);
register_netdevice_notifier(&fib_netdev_notifier);
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 58c25ea5a5c1..de9e2978476f 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -110,9 +110,10 @@ struct leaf {
struct leaf_info {
struct hlist_node hlist;
- struct rcu_head rcu;
int plen;
+ u32 mask_plen; /* ntohl(inet_make_mask(plen)) */
struct list_head falh;
+ struct rcu_head rcu;
};
struct tnode {
@@ -451,6 +452,7 @@ static struct leaf_info *leaf_info_new(int plen)
struct leaf_info *li = kmalloc(sizeof(struct leaf_info), GFP_KERNEL);
if (li) {
li->plen = plen;
+ li->mask_plen = ntohl(inet_make_mask(plen));
INIT_LIST_HEAD(&li->falh);
}
return li;
@@ -1359,10 +1361,8 @@ static int check_leaf(struct fib_table *tb, struct trie *t, struct leaf *l,
hlist_for_each_entry_rcu(li, node, hhead, hlist) {
struct fib_alias *fa;
- int plen = li->plen;
- __be32 mask = inet_make_mask(plen);
- if (l->key != (key & ntohl(mask)))
+ if (l->key != (key & li->mask_plen))
continue;
list_for_each_entry_rcu(fa, &li->falh, fa_list) {
@@ -1394,7 +1394,7 @@ static int check_leaf(struct fib_table *tb, struct trie *t, struct leaf *l,
#ifdef CONFIG_IP_FIB_TRIE_STATS
t->stats.semantic_match_passed++;
#endif
- res->prefixlen = plen;
+ res->prefixlen = li->plen;
res->nh_sel = nhsel;
res->type = fa->fa_type;
res->scope = fa->fa_info->fib_scope;
@@ -1402,7 +1402,7 @@ static int check_leaf(struct fib_table *tb, struct trie *t, struct leaf *l,
res->table = tb;
res->fa_head = &li->falh;
if (!(fib_flags & FIB_LOOKUP_NOREF))
- atomic_inc(&res->fi->fib_clntref);
+ atomic_inc(&fi->fib_clntref);
return 0;
}
}
diff --git a/net/ipv4/gre.c b/net/ipv4/gre.c
index c6933f2ea310..dbfc21de3479 100644
--- a/net/ipv4/gre.c
+++ b/net/ipv4/gre.c
@@ -15,8 +15,8 @@
#include <linux/kmod.h>
#include <linux/skbuff.h>
#include <linux/in.h>
+#include <linux/ip.h>
#include <linux/netdevice.h>
-#include <linux/version.h>
#include <linux/spinlock.h>
#include <net/protocol.h>
#include <net/gre.h>
@@ -97,27 +97,17 @@ drop:
static void gre_err(struct sk_buff *skb, u32 info)
{
const struct gre_protocol *proto;
- u8 ver;
-
- if (!pskb_may_pull(skb, 12))
- goto drop;
+ const struct iphdr *iph = (const struct iphdr *)skb->data;
+ u8 ver = skb->data[(iph->ihl<<2) + 1]&0x7f;
- ver = skb->data[1]&0x7f;
if (ver >= GREPROTO_MAX)
- goto drop;
+ return;
rcu_read_lock();
proto = rcu_dereference(gre_proto[ver]);
- if (!proto || !proto->err_handler)
- goto drop_unlock;
- proto->err_handler(skb, info);
- rcu_read_unlock();
- return;
-
-drop_unlock:
+ if (proto && proto->err_handler)
+ proto->err_handler(skb, info);
rcu_read_unlock();
-drop:
- kfree_skb(skb);
}
static const struct net_protocol net_gre_protocol = {
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index 5395e45dcce6..23ef31baa1af 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -380,6 +380,7 @@ static struct rtable *icmp_route_lookup(struct net *net,
struct icmp_bxm *param)
{
struct rtable *rt, *rt2;
+ struct flowi4 fl4_dec;
int err;
memset(fl4, 0, sizeof(*fl4));
@@ -408,19 +409,19 @@ static struct rtable *icmp_route_lookup(struct net *net,
} else
return rt;
- err = xfrm_decode_session_reverse(skb_in, flowi4_to_flowi(fl4), AF_INET);
+ err = xfrm_decode_session_reverse(skb_in, flowi4_to_flowi(&fl4_dec), AF_INET);
if (err)
goto relookup_failed;
- if (inet_addr_type(net, fl4->saddr) == RTN_LOCAL) {
- rt2 = __ip_route_output_key(net, fl4);
+ if (inet_addr_type(net, fl4_dec.saddr) == RTN_LOCAL) {
+ rt2 = __ip_route_output_key(net, &fl4_dec);
if (IS_ERR(rt2))
err = PTR_ERR(rt2);
} else {
struct flowi4 fl4_2 = {};
unsigned long orefdst;
- fl4_2.daddr = fl4->saddr;
+ fl4_2.daddr = fl4_dec.saddr;
rt2 = ip_route_output_key(net, &fl4_2);
if (IS_ERR(rt2)) {
err = PTR_ERR(rt2);
@@ -428,7 +429,7 @@ static struct rtable *icmp_route_lookup(struct net *net,
}
/* Ugh! */
orefdst = skb_in->_skb_refdst; /* save old refdst */
- err = ip_route_input(skb_in, fl4->daddr, fl4->saddr,
+ err = ip_route_input(skb_in, fl4_dec.daddr, fl4_dec.saddr,
RT_TOS(tos), rt2->dst.dev);
dst_release(&rt2->dst);
@@ -440,10 +441,11 @@ static struct rtable *icmp_route_lookup(struct net *net,
goto relookup_failed;
rt2 = (struct rtable *) xfrm_lookup(net, &rt2->dst,
- flowi4_to_flowi(fl4), NULL,
+ flowi4_to_flowi(&fl4_dec), NULL,
XFRM_LOOKUP_ICMP);
if (!IS_ERR(rt2)) {
dst_release(&rt->dst);
+ memcpy(fl4, &fl4_dec, sizeof(*fl4));
rt = rt2;
} else if (PTR_ERR(rt2) == -EPERM) {
if (rt)
diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
index 3267d3898437..389a2e6a17fd 100644
--- a/net/ipv4/inet_diag.c
+++ b/net/ipv4/inet_diag.c
@@ -869,7 +869,7 @@ static int inet_diag_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
}
return netlink_dump_start(idiagnl, skb, nlh,
- inet_diag_dump, NULL);
+ inet_diag_dump, NULL, 0);
}
return inet_diag_get_exact(skb, nlh);
diff --git a/net/ipv4/inet_lro.c b/net/ipv4/inet_lro.c
index 85a0f75dae64..ef7ae6049a51 100644
--- a/net/ipv4/inet_lro.c
+++ b/net/ipv4/inet_lro.c
@@ -146,8 +146,7 @@ static __wsum lro_tcp_data_csum(struct iphdr *iph, struct tcphdr *tcph, int len)
}
static void lro_init_desc(struct net_lro_desc *lro_desc, struct sk_buff *skb,
- struct iphdr *iph, struct tcphdr *tcph,
- u16 vlan_tag, struct vlan_group *vgrp)
+ struct iphdr *iph, struct tcphdr *tcph)
{
int nr_frags;
__be32 *ptr;
@@ -173,8 +172,6 @@ static void lro_init_desc(struct net_lro_desc *lro_desc, struct sk_buff *skb,
}
lro_desc->mss = tcp_data_len;
- lro_desc->vgrp = vgrp;
- lro_desc->vlan_tag = vlan_tag;
lro_desc->active = 1;
lro_desc->data_csum = lro_tcp_data_csum(iph, tcph,
@@ -309,29 +306,17 @@ static void lro_flush(struct net_lro_mgr *lro_mgr,
skb_shinfo(lro_desc->parent)->gso_size = lro_desc->mss;
- if (lro_desc->vgrp) {
- if (lro_mgr->features & LRO_F_NAPI)
- vlan_hwaccel_receive_skb(lro_desc->parent,
- lro_desc->vgrp,
- lro_desc->vlan_tag);
- else
- vlan_hwaccel_rx(lro_desc->parent,
- lro_desc->vgrp,
- lro_desc->vlan_tag);
-
- } else {
- if (lro_mgr->features & LRO_F_NAPI)
- netif_receive_skb(lro_desc->parent);
- else
- netif_rx(lro_desc->parent);
- }
+ if (lro_mgr->features & LRO_F_NAPI)
+ netif_receive_skb(lro_desc->parent);
+ else
+ netif_rx(lro_desc->parent);
LRO_INC_STATS(lro_mgr, flushed);
lro_clear_desc(lro_desc);
}
static int __lro_proc_skb(struct net_lro_mgr *lro_mgr, struct sk_buff *skb,
- struct vlan_group *vgrp, u16 vlan_tag, void *priv)
+ void *priv)
{
struct net_lro_desc *lro_desc;
struct iphdr *iph;
@@ -360,7 +345,7 @@ static int __lro_proc_skb(struct net_lro_mgr *lro_mgr, struct sk_buff *skb,
goto out;
skb->ip_summed = lro_mgr->ip_summed_aggr;
- lro_init_desc(lro_desc, skb, iph, tcph, vlan_tag, vgrp);
+ lro_init_desc(lro_desc, skb, iph, tcph);
LRO_INC_STATS(lro_mgr, aggregated);
return 0;
}
@@ -433,8 +418,7 @@ static struct sk_buff *lro_gen_skb(struct net_lro_mgr *lro_mgr,
static struct sk_buff *__lro_proc_segment(struct net_lro_mgr *lro_mgr,
struct skb_frag_struct *frags,
int len, int true_size,
- struct vlan_group *vgrp,
- u16 vlan_tag, void *priv, __wsum sum)
+ void *priv, __wsum sum)
{
struct net_lro_desc *lro_desc;
struct iphdr *iph;
@@ -480,7 +464,7 @@ static struct sk_buff *__lro_proc_segment(struct net_lro_mgr *lro_mgr,
tcph = (void *)((u8 *)skb->data + vlan_hdr_len
+ IP_HDR_LEN(iph));
- lro_init_desc(lro_desc, skb, iph, tcph, 0, NULL);
+ lro_init_desc(lro_desc, skb, iph, tcph);
LRO_INC_STATS(lro_mgr, aggregated);
return NULL;
}
@@ -514,7 +498,7 @@ void lro_receive_skb(struct net_lro_mgr *lro_mgr,
struct sk_buff *skb,
void *priv)
{
- if (__lro_proc_skb(lro_mgr, skb, NULL, 0, priv)) {
+ if (__lro_proc_skb(lro_mgr, skb, priv)) {
if (lro_mgr->features & LRO_F_NAPI)
netif_receive_skb(skb);
else
@@ -523,29 +507,13 @@ void lro_receive_skb(struct net_lro_mgr *lro_mgr,
}
EXPORT_SYMBOL(lro_receive_skb);
-void lro_vlan_hwaccel_receive_skb(struct net_lro_mgr *lro_mgr,
- struct sk_buff *skb,
- struct vlan_group *vgrp,
- u16 vlan_tag,
- void *priv)
-{
- if (__lro_proc_skb(lro_mgr, skb, vgrp, vlan_tag, priv)) {
- if (lro_mgr->features & LRO_F_NAPI)
- vlan_hwaccel_receive_skb(skb, vgrp, vlan_tag);
- else
- vlan_hwaccel_rx(skb, vgrp, vlan_tag);
- }
-}
-EXPORT_SYMBOL(lro_vlan_hwaccel_receive_skb);
-
void lro_receive_frags(struct net_lro_mgr *lro_mgr,
struct skb_frag_struct *frags,
int len, int true_size, void *priv, __wsum sum)
{
struct sk_buff *skb;
- skb = __lro_proc_segment(lro_mgr, frags, len, true_size, NULL, 0,
- priv, sum);
+ skb = __lro_proc_segment(lro_mgr, frags, len, true_size, priv, sum);
if (!skb)
return;
@@ -556,26 +524,6 @@ void lro_receive_frags(struct net_lro_mgr *lro_mgr,
}
EXPORT_SYMBOL(lro_receive_frags);
-void lro_vlan_hwaccel_receive_frags(struct net_lro_mgr *lro_mgr,
- struct skb_frag_struct *frags,
- int len, int true_size,
- struct vlan_group *vgrp,
- u16 vlan_tag, void *priv, __wsum sum)
-{
- struct sk_buff *skb;
-
- skb = __lro_proc_segment(lro_mgr, frags, len, true_size, vgrp,
- vlan_tag, priv, sum);
- if (!skb)
- return;
-
- if (lro_mgr->features & LRO_F_NAPI)
- vlan_hwaccel_receive_skb(skb, vgrp, vlan_tag);
- else
- vlan_hwaccel_rx(skb, vgrp, vlan_tag);
-}
-EXPORT_SYMBOL(lro_vlan_hwaccel_receive_frags);
-
void lro_flush_all(struct net_lro_mgr *lro_mgr)
{
int i;
diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c
index ce616d92cc54..e38213817d0a 100644
--- a/net/ipv4/inetpeer.c
+++ b/net/ipv4/inetpeer.c
@@ -54,15 +54,11 @@
* 1. Nodes may appear in the tree only with the pool lock held.
* 2. Nodes may disappear from the tree only with the pool lock held
* AND reference count being 0.
- * 3. Nodes appears and disappears from unused node list only under
- * "inet_peer_unused_lock".
- * 4. Global variable peer_total is modified under the pool lock.
- * 5. struct inet_peer fields modification:
+ * 3. Global variable peer_total is modified under the pool lock.
+ * 4. struct inet_peer fields modification:
* avl_left, avl_right, avl_parent, avl_height: pool lock
- * unused: unused node list lock
* refcnt: atomically against modifications on other CPU;
* usually under some other lock to prevent node disappearing
- * dtime: unused node list lock
* daddr: unchangeable
* ip_id_count: atomic value (no lock needed)
*/
@@ -104,19 +100,6 @@ int inet_peer_threshold __read_mostly = 65536 + 128; /* start to throw entries m
* aggressively at this stage */
int inet_peer_minttl __read_mostly = 120 * HZ; /* TTL under high load: 120 sec */
int inet_peer_maxttl __read_mostly = 10 * 60 * HZ; /* usual time to live: 10 min */
-int inet_peer_gc_mintime __read_mostly = 10 * HZ;
-int inet_peer_gc_maxtime __read_mostly = 120 * HZ;
-
-static struct {
- struct list_head list;
- spinlock_t lock;
-} unused_peers = {
- .list = LIST_HEAD_INIT(unused_peers.list),
- .lock = __SPIN_LOCK_UNLOCKED(unused_peers.lock),
-};
-
-static void peer_check_expire(unsigned long dummy);
-static DEFINE_TIMER(peer_periodic_timer, peer_check_expire, 0, 0);
/* Called from ip_output.c:ip_init */
@@ -142,21 +125,6 @@ void __init inet_initpeers(void)
0, SLAB_HWCACHE_ALIGN | SLAB_PANIC,
NULL);
- /* All the timers, started at system startup tend
- to synchronize. Perturb it a bit.
- */
- peer_periodic_timer.expires = jiffies
- + net_random() % inet_peer_gc_maxtime
- + inet_peer_gc_maxtime;
- add_timer(&peer_periodic_timer);
-}
-
-/* Called with or without local BH being disabled. */
-static void unlink_from_unused(struct inet_peer *p)
-{
- spin_lock_bh(&unused_peers.lock);
- list_del_init(&p->unused);
- spin_unlock_bh(&unused_peers.lock);
}
static int addr_compare(const struct inetpeer_addr *a,
@@ -203,20 +171,6 @@ static int addr_compare(const struct inetpeer_addr *a,
u; \
})
-static bool atomic_add_unless_return(atomic_t *ptr, int a, int u, int *newv)
-{
- int cur, old = atomic_read(ptr);
-
- while (old != u) {
- *newv = old + a;
- cur = atomic_cmpxchg(ptr, old, *newv);
- if (cur == old)
- return true;
- old = cur;
- }
- return false;
-}
-
/*
* Called with rcu_read_lock()
* Because we hold no lock against a writer, its quite possible we fall
@@ -225,8 +179,7 @@ static bool atomic_add_unless_return(atomic_t *ptr, int a, int u, int *newv)
* We exit from this function if number of links exceeds PEER_MAXDEPTH
*/
static struct inet_peer *lookup_rcu(const struct inetpeer_addr *daddr,
- struct inet_peer_base *base,
- int *newrefcnt)
+ struct inet_peer_base *base)
{
struct inet_peer *u = rcu_dereference(base->root);
int count = 0;
@@ -235,11 +188,9 @@ static struct inet_peer *lookup_rcu(const struct inetpeer_addr *daddr,
int cmp = addr_compare(daddr, &u->daddr);
if (cmp == 0) {
/* Before taking a reference, check if this entry was
- * deleted, unlink_from_pool() sets refcnt=-1 to make
- * distinction between an unused entry (refcnt=0) and
- * a freed one.
+ * deleted (refcnt=-1)
*/
- if (!atomic_add_unless_return(&u->refcnt, 1, -1, newrefcnt))
+ if (!atomic_add_unless(&u->refcnt, 1, -1))
u = NULL;
return u;
}
@@ -366,137 +317,99 @@ static void inetpeer_free_rcu(struct rcu_head *head)
kmem_cache_free(peer_cachep, container_of(head, struct inet_peer, rcu));
}
-/* May be called with local BH enabled. */
static void unlink_from_pool(struct inet_peer *p, struct inet_peer_base *base,
struct inet_peer __rcu **stack[PEER_MAXDEPTH])
{
- int do_free;
-
- do_free = 0;
-
- write_seqlock_bh(&base->lock);
- /* Check the reference counter. It was artificially incremented by 1
- * in cleanup() function to prevent sudden disappearing. If we can
- * atomically (because of lockless readers) take this last reference,
- * it's safe to remove the node and free it later.
- * We use refcnt=-1 to alert lockless readers this entry is deleted.
- */
- if (atomic_cmpxchg(&p->refcnt, 1, -1) == 1) {
- struct inet_peer __rcu ***stackptr, ***delp;
- if (lookup(&p->daddr, stack, base) != p)
- BUG();
- delp = stackptr - 1; /* *delp[0] == p */
- if (p->avl_left == peer_avl_empty_rcu) {
- *delp[0] = p->avl_right;
- --stackptr;
- } else {
- /* look for a node to insert instead of p */
- struct inet_peer *t;
- t = lookup_rightempty(p, base);
- BUG_ON(rcu_deref_locked(*stackptr[-1], base) != t);
- **--stackptr = t->avl_left;
- /* t is removed, t->daddr > x->daddr for any
- * x in p->avl_left subtree.
- * Put t in the old place of p. */
- RCU_INIT_POINTER(*delp[0], t);
- t->avl_left = p->avl_left;
- t->avl_right = p->avl_right;
- t->avl_height = p->avl_height;
- BUG_ON(delp[1] != &p->avl_left);
- delp[1] = &t->avl_left; /* was &p->avl_left */
- }
- peer_avl_rebalance(stack, stackptr, base);
- base->total--;
- do_free = 1;
+ struct inet_peer __rcu ***stackptr, ***delp;
+
+ if (lookup(&p->daddr, stack, base) != p)
+ BUG();
+ delp = stackptr - 1; /* *delp[0] == p */
+ if (p->avl_left == peer_avl_empty_rcu) {
+ *delp[0] = p->avl_right;
+ --stackptr;
+ } else {
+ /* look for a node to insert instead of p */
+ struct inet_peer *t;
+ t = lookup_rightempty(p, base);
+ BUG_ON(rcu_deref_locked(*stackptr[-1], base) != t);
+ **--stackptr = t->avl_left;
+ /* t is removed, t->daddr > x->daddr for any
+ * x in p->avl_left subtree.
+ * Put t in the old place of p. */
+ RCU_INIT_POINTER(*delp[0], t);
+ t->avl_left = p->avl_left;
+ t->avl_right = p->avl_right;
+ t->avl_height = p->avl_height;
+ BUG_ON(delp[1] != &p->avl_left);
+ delp[1] = &t->avl_left; /* was &p->avl_left */
}
- write_sequnlock_bh(&base->lock);
-
- if (do_free)
- call_rcu(&p->rcu, inetpeer_free_rcu);
- else
- /* The node is used again. Decrease the reference counter
- * back. The loop "cleanup -> unlink_from_unused
- * -> unlink_from_pool -> putpeer -> link_to_unused
- * -> cleanup (for the same node)"
- * doesn't really exist because the entry will have a
- * recent deletion time and will not be cleaned again soon.
- */
- inet_putpeer(p);
+ peer_avl_rebalance(stack, stackptr, base);
+ base->total--;
+ call_rcu(&p->rcu, inetpeer_free_rcu);
}
static struct inet_peer_base *family_to_base(int family)
{
- return (family == AF_INET ? &v4_peers : &v6_peers);
-}
-
-static struct inet_peer_base *peer_to_base(struct inet_peer *p)
-{
- return family_to_base(p->daddr.family);
+ return family == AF_INET ? &v4_peers : &v6_peers;
}
-/* May be called with local BH enabled. */
-static int cleanup_once(unsigned long ttl, struct inet_peer __rcu **stack[PEER_MAXDEPTH])
+/* perform garbage collect on all items stacked during a lookup */
+static int inet_peer_gc(struct inet_peer_base *base,
+ struct inet_peer __rcu **stack[PEER_MAXDEPTH],
+ struct inet_peer __rcu ***stackptr)
{
- struct inet_peer *p = NULL;
-
- /* Remove the first entry from the list of unused nodes. */
- spin_lock_bh(&unused_peers.lock);
- if (!list_empty(&unused_peers.list)) {
- __u32 delta;
-
- p = list_first_entry(&unused_peers.list, struct inet_peer, unused);
- delta = (__u32)jiffies - p->dtime;
+ struct inet_peer *p, *gchead = NULL;
+ __u32 delta, ttl;
+ int cnt = 0;
- if (delta < ttl) {
- /* Do not prune fresh entries. */
- spin_unlock_bh(&unused_peers.lock);
- return -1;
+ if (base->total >= inet_peer_threshold)
+ ttl = 0; /* be aggressive */
+ else
+ ttl = inet_peer_maxttl
+ - (inet_peer_maxttl - inet_peer_minttl) / HZ *
+ base->total / inet_peer_threshold * HZ;
+ stackptr--; /* last stack slot is peer_avl_empty */
+ while (stackptr > stack) {
+ stackptr--;
+ p = rcu_deref_locked(**stackptr, base);
+ if (atomic_read(&p->refcnt) == 0) {
+ smp_rmb();
+ delta = (__u32)jiffies - p->dtime;
+ if (delta >= ttl &&
+ atomic_cmpxchg(&p->refcnt, 0, -1) == 0) {
+ p->gc_next = gchead;
+ gchead = p;
+ }
}
-
- list_del_init(&p->unused);
-
- /* Grab an extra reference to prevent node disappearing
- * before unlink_from_pool() call. */
- atomic_inc(&p->refcnt);
}
- spin_unlock_bh(&unused_peers.lock);
-
- if (p == NULL)
- /* It means that the total number of USED entries has
- * grown over inet_peer_threshold. It shouldn't really
- * happen because of entry limits in route cache. */
- return -1;
-
- unlink_from_pool(p, peer_to_base(p), stack);
- return 0;
+ while ((p = gchead) != NULL) {
+ gchead = p->gc_next;
+ cnt++;
+ unlink_from_pool(p, base, stack);
+ }
+ return cnt;
}
-/* Called with or without local BH being disabled. */
-struct inet_peer *inet_getpeer(struct inetpeer_addr *daddr, int create)
+struct inet_peer *inet_getpeer(const struct inetpeer_addr *daddr, int create)
{
struct inet_peer __rcu **stack[PEER_MAXDEPTH], ***stackptr;
struct inet_peer_base *base = family_to_base(daddr->family);
struct inet_peer *p;
unsigned int sequence;
- int invalidated, newrefcnt = 0;
+ int invalidated, gccnt = 0;
- /* Look up for the address quickly, lockless.
+ /* Attempt a lockless lookup first.
* Because of a concurrent writer, we might not find an existing entry.
*/
rcu_read_lock();
sequence = read_seqbegin(&base->lock);
- p = lookup_rcu(daddr, base, &newrefcnt);
+ p = lookup_rcu(daddr, base);
invalidated = read_seqretry(&base->lock, sequence);
rcu_read_unlock();
- if (p) {
-found: /* The existing node has been found.
- * Remove the entry from unused list if it was there.
- */
- if (newrefcnt == 1)
- unlink_from_unused(p);
+ if (p)
return p;
- }
/* If no writer did a change during our lookup, we can return early. */
if (!create && !invalidated)
@@ -506,18 +419,27 @@ found: /* The existing node has been found.
* At least, nodes should be hot in our cache.
*/
write_seqlock_bh(&base->lock);
+relookup:
p = lookup(daddr, stack, base);
if (p != peer_avl_empty) {
- newrefcnt = atomic_inc_return(&p->refcnt);
+ atomic_inc(&p->refcnt);
write_sequnlock_bh(&base->lock);
- goto found;
+ return p;
+ }
+ if (!gccnt) {
+ gccnt = inet_peer_gc(base, stack, stackptr);
+ if (gccnt && create)
+ goto relookup;
}
p = create ? kmem_cache_alloc(peer_cachep, GFP_ATOMIC) : NULL;
if (p) {
p->daddr = *daddr;
atomic_set(&p->refcnt, 1);
atomic_set(&p->rid, 0);
- atomic_set(&p->ip_id_count, secure_ip_id(daddr->addr.a4));
+ atomic_set(&p->ip_id_count,
+ (daddr->family == AF_INET) ?
+ secure_ip_id(daddr->addr.a4) :
+ secure_ipv6_id(daddr->addr.a6));
p->tcp_ts_stamp = 0;
p->metrics[RTAX_LOCK-1] = INETPEER_METRICS_NEW;
p->rate_tokens = 0;
@@ -525,7 +447,6 @@ found: /* The existing node has been found.
p->pmtu_expires = 0;
p->pmtu_orig = 0;
memset(&p->redirect_learned, 0, sizeof(p->redirect_learned));
- INIT_LIST_HEAD(&p->unused);
/* Link the node. */
@@ -534,63 +455,15 @@ found: /* The existing node has been found.
}
write_sequnlock_bh(&base->lock);
- if (base->total >= inet_peer_threshold)
- /* Remove one less-recently-used entry. */
- cleanup_once(0, stack);
-
return p;
}
-
-static int compute_total(void)
-{
- return v4_peers.total + v6_peers.total;
-}
EXPORT_SYMBOL_GPL(inet_getpeer);
-/* Called with local BH disabled. */
-static void peer_check_expire(unsigned long dummy)
-{
- unsigned long now = jiffies;
- int ttl, total;
- struct inet_peer __rcu **stack[PEER_MAXDEPTH];
-
- total = compute_total();
- if (total >= inet_peer_threshold)
- ttl = inet_peer_minttl;
- else
- ttl = inet_peer_maxttl
- - (inet_peer_maxttl - inet_peer_minttl) / HZ *
- total / inet_peer_threshold * HZ;
- while (!cleanup_once(ttl, stack)) {
- if (jiffies != now)
- break;
- }
-
- /* Trigger the timer after inet_peer_gc_mintime .. inet_peer_gc_maxtime
- * interval depending on the total number of entries (more entries,
- * less interval). */
- total = compute_total();
- if (total >= inet_peer_threshold)
- peer_periodic_timer.expires = jiffies + inet_peer_gc_mintime;
- else
- peer_periodic_timer.expires = jiffies
- + inet_peer_gc_maxtime
- - (inet_peer_gc_maxtime - inet_peer_gc_mintime) / HZ *
- total / inet_peer_threshold * HZ;
- add_timer(&peer_periodic_timer);
-}
-
void inet_putpeer(struct inet_peer *p)
{
- local_bh_disable();
-
- if (atomic_dec_and_lock(&p->refcnt, &unused_peers.lock)) {
- list_add_tail(&p->unused, &unused_peers.list);
- p->dtime = (__u32)jiffies;
- spin_unlock(&unused_peers.lock);
- }
-
- local_bh_enable();
+ p->dtime = (__u32)jiffies;
+ smp_mb__before_atomic_dec();
+ atomic_dec(&p->refcnt);
}
EXPORT_SYMBOL_GPL(inet_putpeer);
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index 0ad6035f6366..0e0ab98abc6f 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -261,8 +261,9 @@ static void ip_expire(unsigned long arg)
* Only an end host needs to send an ICMP
* "Fragment Reassembly Timeout" message, per RFC792.
*/
- if (qp->user == IP_DEFRAG_CONNTRACK_IN &&
- skb_rtable(head)->rt_type != RTN_LOCAL)
+ if (qp->user == IP_DEFRAG_AF_PACKET ||
+ (qp->user == IP_DEFRAG_CONNTRACK_IN &&
+ skb_rtable(head)->rt_type != RTN_LOCAL))
goto out_rcu_unlock;
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index 8871067560db..d7bb94c48345 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -731,9 +731,9 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev
}
#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
else if (skb->protocol == htons(ETH_P_IPV6)) {
+ struct neighbour *neigh = dst_get_neighbour(skb_dst(skb));
const struct in6_addr *addr6;
int addr_type;
- struct neighbour *neigh = skb_dst(skb)->neighbour;
if (neigh == NULL)
goto tx_error;
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
index c8f48efc5fd3..073a9b01c40c 100644
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -165,7 +165,7 @@ int ip_call_ra_chain(struct sk_buff *skb)
(!sk->sk_bound_dev_if ||
sk->sk_bound_dev_if == dev->ifindex) &&
net_eq(sock_net(sk), dev_net(dev))) {
- if (ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)) {
+ if (ip_is_fragment(ip_hdr(skb))) {
if (ip_defrag(skb, IP_DEFRAG_CALL_RA_CHAIN))
return 1;
}
@@ -256,7 +256,7 @@ int ip_local_deliver(struct sk_buff *skb)
* Reassemble IP fragments.
*/
- if (ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)) {
+ if (ip_is_fragment(ip_hdr(skb))) {
if (ip_defrag(skb, IP_DEFRAG_LOCAL_DELIVER))
return 0;
}
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 84f26e8e6c60..ccaaa851ab42 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -182,6 +182,7 @@ static inline int ip_finish_output2(struct sk_buff *skb)
struct rtable *rt = (struct rtable *)dst;
struct net_device *dev = dst->dev;
unsigned int hh_len = LL_RESERVED_SPACE(dev);
+ struct neighbour *neigh;
if (rt->rt_type == RTN_MULTICAST) {
IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUTMCAST, skb->len);
@@ -203,10 +204,9 @@ static inline int ip_finish_output2(struct sk_buff *skb)
skb = skb2;
}
- if (dst->hh)
- return neigh_hh_output(dst->hh, skb);
- else if (dst->neighbour)
- return dst->neighbour->output(skb);
+ neigh = dst_get_neighbour(dst);
+ if (neigh)
+ return neigh_output(neigh, skb);
if (net_ratelimit())
printk(KERN_DEBUG "ip_finish_output2: No header cache and no neighbour!\n");
@@ -489,7 +489,7 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
if (first_len - hlen > mtu ||
((first_len - hlen) & 7) ||
- (iph->frag_off & htons(IP_MF|IP_OFFSET)) ||
+ ip_is_fragment(iph) ||
skb_cloned(skb))
goto slow_path;
@@ -734,7 +734,7 @@ static inline int ip_ufo_append_data(struct sock *sk,
int getfrag(void *from, char *to, int offset, int len,
int odd, struct sk_buff *skb),
void *from, int length, int hh_len, int fragheaderlen,
- int transhdrlen, int mtu, unsigned int flags)
+ int transhdrlen, int maxfraglen, unsigned int flags)
{
struct sk_buff *skb;
int err;
@@ -767,7 +767,7 @@ static inline int ip_ufo_append_data(struct sock *sk,
skb->csum = 0;
/* specify the length of each IP datagram fragment */
- skb_shinfo(skb)->gso_size = mtu - fragheaderlen;
+ skb_shinfo(skb)->gso_size = maxfraglen - fragheaderlen;
skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
__skb_queue_tail(queue, skb);
}
@@ -831,7 +831,7 @@ static int __ip_append_data(struct sock *sk,
(rt->dst.dev->features & NETIF_F_UFO) && !rt->dst.header_len) {
err = ip_ufo_append_data(sk, queue, getfrag, from, length,
hh_len, fragheaderlen, transhdrlen,
- mtu, flags);
+ maxfraglen, flags);
if (err)
goto error;
return 0;
diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c
index ab7e5542c1cf..472a8c4f1dc0 100644
--- a/net/ipv4/ipconfig.c
+++ b/net/ipv4/ipconfig.c
@@ -861,41 +861,44 @@ static void __init ic_do_bootp_ext(u8 *ext)
#endif
switch (*ext++) {
- case 1: /* Subnet mask */
- if (ic_netmask == NONE)
- memcpy(&ic_netmask, ext+1, 4);
- break;
- case 3: /* Default gateway */
- if (ic_gateway == NONE)
- memcpy(&ic_gateway, ext+1, 4);
- break;
- case 6: /* DNS server */
- servers= *ext/4;
- if (servers > CONF_NAMESERVERS_MAX)
- servers = CONF_NAMESERVERS_MAX;
- for (i = 0; i < servers; i++) {
- if (ic_nameservers[i] == NONE)
- memcpy(&ic_nameservers[i], ext+1+4*i, 4);
- }
- break;
- case 12: /* Host name */
- ic_bootp_string(utsname()->nodename, ext+1, *ext, __NEW_UTS_LEN);
- ic_host_name_set = 1;
- break;
- case 15: /* Domain name (DNS) */
- ic_bootp_string(ic_domain, ext+1, *ext, sizeof(ic_domain));
- break;
- case 17: /* Root path */
- if (!root_server_path[0])
- ic_bootp_string(root_server_path, ext+1, *ext, sizeof(root_server_path));
- break;
- case 26: /* Interface MTU */
- memcpy(&mtu, ext+1, sizeof(mtu));
- ic_dev_mtu = ntohs(mtu);
- break;
- case 40: /* NIS Domain name (_not_ DNS) */
- ic_bootp_string(utsname()->domainname, ext+1, *ext, __NEW_UTS_LEN);
- break;
+ case 1: /* Subnet mask */
+ if (ic_netmask == NONE)
+ memcpy(&ic_netmask, ext+1, 4);
+ break;
+ case 3: /* Default gateway */
+ if (ic_gateway == NONE)
+ memcpy(&ic_gateway, ext+1, 4);
+ break;
+ case 6: /* DNS server */
+ servers= *ext/4;
+ if (servers > CONF_NAMESERVERS_MAX)
+ servers = CONF_NAMESERVERS_MAX;
+ for (i = 0; i < servers; i++) {
+ if (ic_nameservers[i] == NONE)
+ memcpy(&ic_nameservers[i], ext+1+4*i, 4);
+ }
+ break;
+ case 12: /* Host name */
+ ic_bootp_string(utsname()->nodename, ext+1, *ext,
+ __NEW_UTS_LEN);
+ ic_host_name_set = 1;
+ break;
+ case 15: /* Domain name (DNS) */
+ ic_bootp_string(ic_domain, ext+1, *ext, sizeof(ic_domain));
+ break;
+ case 17: /* Root path */
+ if (!root_server_path[0])
+ ic_bootp_string(root_server_path, ext+1, *ext,
+ sizeof(root_server_path));
+ break;
+ case 26: /* Interface MTU */
+ memcpy(&mtu, ext+1, sizeof(mtu));
+ ic_dev_mtu = ntohs(mtu);
+ break;
+ case 40: /* NIS Domain name (_not_ DNS) */
+ ic_bootp_string(utsname()->domainname, ext+1, *ext,
+ __NEW_UTS_LEN);
+ break;
}
}
@@ -932,7 +935,7 @@ static int __init ic_bootp_recv(struct sk_buff *skb, struct net_device *dev, str
goto drop;
/* Fragments are not supported */
- if (h->frag_off & htons(IP_OFFSET | IP_MF)) {
+ if (ip_is_fragment(h)) {
if (net_ratelimit())
printk(KERN_ERR "DHCP/BOOTP: Ignoring fragmented "
"reply.\n");
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index 30a7763c400e..58e879157976 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -1796,7 +1796,7 @@ static struct mr_table *ipmr_rt_fib_lookup(struct net *net, struct sk_buff *skb)
struct flowi4 fl4 = {
.daddr = iph->daddr,
.saddr = iph->saddr,
- .flowi4_tos = iph->tos,
+ .flowi4_tos = RT_TOS(iph->tos),
.flowi4_oif = rt->rt_oif,
.flowi4_iif = rt->rt_iif,
.flowi4_mark = rt->rt_mark,
@@ -2544,7 +2544,8 @@ int __init ip_mr_init(void)
goto add_proto_fail;
}
#endif
- rtnl_register(RTNL_FAMILY_IPMR, RTM_GETROUTE, NULL, ipmr_rtm_dumproute);
+ rtnl_register(RTNL_FAMILY_IPMR, RTM_GETROUTE,
+ NULL, ipmr_rtm_dumproute, NULL);
return 0;
#ifdef CONFIG_IP_PIMSM_V2
diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c
index 5c9e97c79017..db8d22db425f 100644
--- a/net/ipv4/netfilter/ipt_CLUSTERIP.c
+++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c
@@ -317,19 +317,19 @@ clusterip_tg(struct sk_buff *skb, const struct xt_action_param *par)
hash = clusterip_hashfn(skb, cipinfo->config);
switch (ctinfo) {
- case IP_CT_NEW:
- ct->mark = hash;
- break;
- case IP_CT_RELATED:
- case IP_CT_RELATED_REPLY:
- /* FIXME: we don't handle expectations at the
- * moment. they can arrive on a different node than
- * the master connection (e.g. FTP passive mode) */
- case IP_CT_ESTABLISHED:
- case IP_CT_ESTABLISHED_REPLY:
- break;
- default:
- break;
+ case IP_CT_NEW:
+ ct->mark = hash;
+ break;
+ case IP_CT_RELATED:
+ case IP_CT_RELATED_REPLY:
+ /* FIXME: we don't handle expectations at the moment.
+ * They can arrive on a different node than
+ * the master connection (e.g. FTP passive mode) */
+ case IP_CT_ESTABLISHED:
+ case IP_CT_ESTABLISHED_REPLY:
+ break;
+ default: /* Prevent gcc warnings */
+ break;
}
#ifdef DEBUG
diff --git a/net/ipv4/netfilter/nf_defrag_ipv4.c b/net/ipv4/netfilter/nf_defrag_ipv4.c
index f3a9b42b16c6..9bb1b8a37a22 100644
--- a/net/ipv4/netfilter/nf_defrag_ipv4.c
+++ b/net/ipv4/netfilter/nf_defrag_ipv4.c
@@ -82,7 +82,7 @@ static unsigned int ipv4_conntrack_defrag(unsigned int hooknum,
#endif
#endif
/* Gather fragments. */
- if (ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)) {
+ if (ip_is_fragment(ip_hdr(skb))) {
enum ip_defrag_users user = nf_ct_defrag_user(hooknum, skb);
if (nf_ct_ipv4_gather_frags(skb, user))
return NF_STOLEN;
diff --git a/net/ipv4/netfilter/nf_nat_snmp_basic.c b/net/ipv4/netfilter/nf_nat_snmp_basic.c
index 8812a02078ab..076b7c8c4aa4 100644
--- a/net/ipv4/netfilter/nf_nat_snmp_basic.c
+++ b/net/ipv4/netfilter/nf_nat_snmp_basic.c
@@ -719,117 +719,115 @@ static unsigned char snmp_object_decode(struct asn1_ctx *ctx,
l = 0;
switch (type) {
- case SNMP_INTEGER:
- len = sizeof(long);
- if (!asn1_long_decode(ctx, end, &l)) {
- kfree(id);
- return 0;
- }
- *obj = kmalloc(sizeof(struct snmp_object) + len,
- GFP_ATOMIC);
- if (*obj == NULL) {
- kfree(id);
- if (net_ratelimit())
- pr_notice("OOM in bsalg (%d)\n", __LINE__);
- return 0;
- }
- (*obj)->syntax.l[0] = l;
- break;
- case SNMP_OCTETSTR:
- case SNMP_OPAQUE:
- if (!asn1_octets_decode(ctx, end, &p, &len)) {
- kfree(id);
- return 0;
- }
- *obj = kmalloc(sizeof(struct snmp_object) + len,
- GFP_ATOMIC);
- if (*obj == NULL) {
- kfree(p);
- kfree(id);
- if (net_ratelimit())
- pr_notice("OOM in bsalg (%d)\n", __LINE__);
- return 0;
- }
- memcpy((*obj)->syntax.c, p, len);
+ case SNMP_INTEGER:
+ len = sizeof(long);
+ if (!asn1_long_decode(ctx, end, &l)) {
+ kfree(id);
+ return 0;
+ }
+ *obj = kmalloc(sizeof(struct snmp_object) + len, GFP_ATOMIC);
+ if (*obj == NULL) {
+ kfree(id);
+ if (net_ratelimit())
+ pr_notice("OOM in bsalg (%d)\n", __LINE__);
+ return 0;
+ }
+ (*obj)->syntax.l[0] = l;
+ break;
+ case SNMP_OCTETSTR:
+ case SNMP_OPAQUE:
+ if (!asn1_octets_decode(ctx, end, &p, &len)) {
+ kfree(id);
+ return 0;
+ }
+ *obj = kmalloc(sizeof(struct snmp_object) + len, GFP_ATOMIC);
+ if (*obj == NULL) {
kfree(p);
- break;
- case SNMP_NULL:
- case SNMP_NOSUCHOBJECT:
- case SNMP_NOSUCHINSTANCE:
- case SNMP_ENDOFMIBVIEW:
- len = 0;
- *obj = kmalloc(sizeof(struct snmp_object), GFP_ATOMIC);
- if (*obj == NULL) {
- kfree(id);
- if (net_ratelimit())
- pr_notice("OOM in bsalg (%d)\n", __LINE__);
- return 0;
- }
- if (!asn1_null_decode(ctx, end)) {
- kfree(id);
- kfree(*obj);
- *obj = NULL;
- return 0;
- }
- break;
- case SNMP_OBJECTID:
- if (!asn1_oid_decode(ctx, end, (unsigned long **)&lp, &len)) {
- kfree(id);
- return 0;
- }
- len *= sizeof(unsigned long);
- *obj = kmalloc(sizeof(struct snmp_object) + len, GFP_ATOMIC);
- if (*obj == NULL) {
- kfree(lp);
- kfree(id);
- if (net_ratelimit())
- pr_notice("OOM in bsalg (%d)\n", __LINE__);
- return 0;
- }
- memcpy((*obj)->syntax.ul, lp, len);
+ kfree(id);
+ if (net_ratelimit())
+ pr_notice("OOM in bsalg (%d)\n", __LINE__);
+ return 0;
+ }
+ memcpy((*obj)->syntax.c, p, len);
+ kfree(p);
+ break;
+ case SNMP_NULL:
+ case SNMP_NOSUCHOBJECT:
+ case SNMP_NOSUCHINSTANCE:
+ case SNMP_ENDOFMIBVIEW:
+ len = 0;
+ *obj = kmalloc(sizeof(struct snmp_object), GFP_ATOMIC);
+ if (*obj == NULL) {
+ kfree(id);
+ if (net_ratelimit())
+ pr_notice("OOM in bsalg (%d)\n", __LINE__);
+ return 0;
+ }
+ if (!asn1_null_decode(ctx, end)) {
+ kfree(id);
+ kfree(*obj);
+ *obj = NULL;
+ return 0;
+ }
+ break;
+ case SNMP_OBJECTID:
+ if (!asn1_oid_decode(ctx, end, (unsigned long **)&lp, &len)) {
+ kfree(id);
+ return 0;
+ }
+ len *= sizeof(unsigned long);
+ *obj = kmalloc(sizeof(struct snmp_object) + len, GFP_ATOMIC);
+ if (*obj == NULL) {
kfree(lp);
- break;
- case SNMP_IPADDR:
- if (!asn1_octets_decode(ctx, end, &p, &len)) {
- kfree(id);
- return 0;
- }
- if (len != 4) {
- kfree(p);
- kfree(id);
- return 0;
- }
- *obj = kmalloc(sizeof(struct snmp_object) + len, GFP_ATOMIC);
- if (*obj == NULL) {
- kfree(p);
- kfree(id);
- if (net_ratelimit())
- pr_notice("OOM in bsalg (%d)\n", __LINE__);
- return 0;
- }
- memcpy((*obj)->syntax.uc, p, len);
+ kfree(id);
+ if (net_ratelimit())
+ pr_notice("OOM in bsalg (%d)\n", __LINE__);
+ return 0;
+ }
+ memcpy((*obj)->syntax.ul, lp, len);
+ kfree(lp);
+ break;
+ case SNMP_IPADDR:
+ if (!asn1_octets_decode(ctx, end, &p, &len)) {
+ kfree(id);
+ return 0;
+ }
+ if (len != 4) {
kfree(p);
- break;
- case SNMP_COUNTER:
- case SNMP_GAUGE:
- case SNMP_TIMETICKS:
- len = sizeof(unsigned long);
- if (!asn1_ulong_decode(ctx, end, &ul)) {
- kfree(id);
- return 0;
- }
- *obj = kmalloc(sizeof(struct snmp_object) + len, GFP_ATOMIC);
- if (*obj == NULL) {
- kfree(id);
- if (net_ratelimit())
- pr_notice("OOM in bsalg (%d)\n", __LINE__);
- return 0;
- }
- (*obj)->syntax.ul[0] = ul;
- break;
- default:
kfree(id);
return 0;
+ }
+ *obj = kmalloc(sizeof(struct snmp_object) + len, GFP_ATOMIC);
+ if (*obj == NULL) {
+ kfree(p);
+ kfree(id);
+ if (net_ratelimit())
+ pr_notice("OOM in bsalg (%d)\n", __LINE__);
+ return 0;
+ }
+ memcpy((*obj)->syntax.uc, p, len);
+ kfree(p);
+ break;
+ case SNMP_COUNTER:
+ case SNMP_GAUGE:
+ case SNMP_TIMETICKS:
+ len = sizeof(unsigned long);
+ if (!asn1_ulong_decode(ctx, end, &ul)) {
+ kfree(id);
+ return 0;
+ }
+ *obj = kmalloc(sizeof(struct snmp_object) + len, GFP_ATOMIC);
+ if (*obj == NULL) {
+ kfree(id);
+ if (net_ratelimit())
+ pr_notice("OOM in bsalg (%d)\n", __LINE__);
+ return 0;
+ }
+ (*obj)->syntax.ul[0] = ul;
+ break;
+ default:
+ kfree(id);
+ return 0;
}
(*obj)->syntax_len = len;
diff --git a/net/ipv4/netfilter/nf_nat_standalone.c b/net/ipv4/netfilter/nf_nat_standalone.c
index 483b76d042da..a6e606e84820 100644
--- a/net/ipv4/netfilter/nf_nat_standalone.c
+++ b/net/ipv4/netfilter/nf_nat_standalone.c
@@ -88,7 +88,7 @@ nf_nat_fn(unsigned int hooknum,
/* We never see fragments: conntrack defrags on pre-routing
and local-out, and nf_nat_out protects post-routing. */
- NF_CT_ASSERT(!(ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)));
+ NF_CT_ASSERT(!ip_is_fragment(ip_hdr(skb)));
ct = nf_ct_get(skb, &ctinfo);
/* Can't track? It's not due to stress, or conntrack would
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index c9893d43242e..08526786dc39 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -825,28 +825,28 @@ static int compat_raw_getsockopt(struct sock *sk, int level, int optname,
static int raw_ioctl(struct sock *sk, int cmd, unsigned long arg)
{
switch (cmd) {
- case SIOCOUTQ: {
- int amount = sk_wmem_alloc_get(sk);
+ case SIOCOUTQ: {
+ int amount = sk_wmem_alloc_get(sk);
- return put_user(amount, (int __user *)arg);
- }
- case SIOCINQ: {
- struct sk_buff *skb;
- int amount = 0;
-
- spin_lock_bh(&sk->sk_receive_queue.lock);
- skb = skb_peek(&sk->sk_receive_queue);
- if (skb != NULL)
- amount = skb->len;
- spin_unlock_bh(&sk->sk_receive_queue.lock);
- return put_user(amount, (int __user *)arg);
- }
+ return put_user(amount, (int __user *)arg);
+ }
+ case SIOCINQ: {
+ struct sk_buff *skb;
+ int amount = 0;
+
+ spin_lock_bh(&sk->sk_receive_queue.lock);
+ skb = skb_peek(&sk->sk_receive_queue);
+ if (skb != NULL)
+ amount = skb->len;
+ spin_unlock_bh(&sk->sk_receive_queue.lock);
+ return put_user(amount, (int __user *)arg);
+ }
- default:
+ default:
#ifdef CONFIG_IP_MROUTE
- return ipmr_ioctl(sk, cmd, (void __user *)arg);
+ return ipmr_ioctl(sk, cmd, (void __user *)arg);
#else
- return -ENOIOCTLCMD;
+ return -ENOIOCTLCMD;
#endif
}
}
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index aa13ef105110..1730689f560e 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -108,6 +108,7 @@
#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#endif
+#include <net/atmclip.h>
#define RT_FL_TOS(oldflp4) \
((u32)(oldflp4->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))
@@ -184,6 +185,8 @@ static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
return p;
}
+static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, const void *daddr);
+
static struct dst_ops ipv4_dst_ops = {
.family = AF_INET,
.protocol = cpu_to_be16(ETH_P_IP),
@@ -198,6 +201,7 @@ static struct dst_ops ipv4_dst_ops = {
.link_failure = ipv4_link_failure,
.update_pmtu = ip_rt_update_pmtu,
.local_out = __ip_local_out,
+ .neigh_lookup = ipv4_neigh_lookup,
};
#define ECN_OR_COST(class) TC_PRIO_##class
@@ -411,8 +415,10 @@ static int rt_cache_seq_show(struct seq_file *seq, void *v)
"HHUptod\tSpecDst");
else {
struct rtable *r = v;
+ struct neighbour *n;
int len;
+ n = dst_get_neighbour(&r->dst);
seq_printf(seq, "%s\t%08X\t%08X\t%8X\t%d\t%u\t%d\t"
"%08X\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n",
r->dst.dev ? r->dst.dev->name : "*",
@@ -425,9 +431,8 @@ static int rt_cache_seq_show(struct seq_file *seq, void *v)
(int)((dst_metric(&r->dst, RTAX_RTT) >> 3) +
dst_metric(&r->dst, RTAX_RTTVAR)),
r->rt_key_tos,
- r->dst.hh ? atomic_read(&r->dst.hh->hh_refcnt) : -1,
- r->dst.hh ? (r->dst.hh->hh_output ==
- dev_queue_xmit) : 0,
+ -1,
+ (n && (n->nud_state & NUD_CONNECTED)) ? 1 : 0,
r->rt_spec_dst, &len);
seq_printf(seq, "%*s\n", 127 - len, "");
@@ -1006,6 +1011,37 @@ static int slow_chain_length(const struct rtable *head)
return length >> FRACT_BITS;
}
+static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, const void *daddr)
+{
+ struct neigh_table *tbl = &arp_tbl;
+ static const __be32 inaddr_any = 0;
+ struct net_device *dev = dst->dev;
+ const __be32 *pkey = daddr;
+ struct neighbour *n;
+
+#if defined(CONFIG_ATM_CLIP) || defined(CONFIG_ATM_CLIP_MODULE)
+ if (dev->type == ARPHRD_ATM)
+ tbl = clip_tbl_hook;
+#endif
+ if (dev->flags & (IFF_LOOPBACK | IFF_POINTOPOINT))
+ pkey = &inaddr_any;
+
+ n = __ipv4_neigh_lookup(tbl, dev, *(__force u32 *)pkey);
+ if (n)
+ return n;
+ return neigh_create(tbl, pkey, dev);
+}
+
+static int rt_bind_neighbour(struct rtable *rt)
+{
+ struct neighbour *n = ipv4_neigh_lookup(&rt->dst, &rt->rt_gateway);
+ if (IS_ERR(n))
+ return PTR_ERR(n);
+ dst_set_neighbour(&rt->dst, n);
+
+ return 0;
+}
+
static struct rtable *rt_intern_hash(unsigned hash, struct rtable *rt,
struct sk_buff *skb, int ifindex)
{
@@ -1042,7 +1078,7 @@ restart:
rt->dst.flags |= DST_NOCACHE;
if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
- int err = arp_bind_neighbour(&rt->dst);
+ int err = rt_bind_neighbour(rt);
if (err) {
if (net_ratelimit())
printk(KERN_WARNING
@@ -1138,7 +1174,7 @@ restart:
route or unicast forwarding path.
*/
if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
- int err = arp_bind_neighbour(&rt->dst);
+ int err = rt_bind_neighbour(rt);
if (err) {
spin_unlock_bh(rt_hash_lock_addr(hash));
@@ -1439,20 +1475,20 @@ static int ip_error(struct sk_buff *skb)
int code;
switch (rt->dst.error) {
- case EINVAL:
- default:
- goto out;
- case EHOSTUNREACH:
- code = ICMP_HOST_UNREACH;
- break;
- case ENETUNREACH:
- code = ICMP_NET_UNREACH;
- IP_INC_STATS_BH(dev_net(rt->dst.dev),
- IPSTATS_MIB_INNOROUTES);
- break;
- case EACCES:
- code = ICMP_PKT_FILTERED;
- break;
+ case EINVAL:
+ default:
+ goto out;
+ case EHOSTUNREACH:
+ code = ICMP_HOST_UNREACH;
+ break;
+ case ENETUNREACH:
+ code = ICMP_NET_UNREACH;
+ IP_INC_STATS_BH(dev_net(rt->dst.dev),
+ IPSTATS_MIB_INNOROUTES);
+ break;
+ case EACCES:
+ code = ICMP_PKT_FILTERED;
+ break;
}
if (!rt->peer)
@@ -1592,23 +1628,24 @@ static int check_peer_redir(struct dst_entry *dst, struct inet_peer *peer)
{
struct rtable *rt = (struct rtable *) dst;
__be32 orig_gw = rt->rt_gateway;
+ struct neighbour *n;
dst_confirm(&rt->dst);
- neigh_release(rt->dst.neighbour);
- rt->dst.neighbour = NULL;
+ neigh_release(dst_get_neighbour(&rt->dst));
+ dst_set_neighbour(&rt->dst, NULL);
rt->rt_gateway = peer->redirect_learned.a4;
- if (arp_bind_neighbour(&rt->dst) ||
- !(rt->dst.neighbour->nud_state & NUD_VALID)) {
- if (rt->dst.neighbour)
- neigh_event_send(rt->dst.neighbour, NULL);
+ rt_bind_neighbour(rt);
+ n = dst_get_neighbour(&rt->dst);
+ if (!n || !(n->nud_state & NUD_VALID)) {
+ if (n)
+ neigh_event_send(n, NULL);
rt->rt_gateway = orig_gw;
return -EAGAIN;
} else {
rt->rt_flags |= RTCF_REDIRECTED;
- call_netevent_notifiers(NETEVENT_NEIGH_UPDATE,
- rt->dst.neighbour);
+ call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
}
return 0;
}
@@ -1703,7 +1740,7 @@ void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
memset(&fl4, 0, sizeof(fl4));
fl4.daddr = iph->daddr;
fl4.saddr = iph->saddr;
- fl4.flowi4_tos = iph->tos;
+ fl4.flowi4_tos = RT_TOS(iph->tos);
fl4.flowi4_oif = rt->dst.dev->ifindex;
fl4.flowi4_iif = skb->dev->ifindex;
fl4.flowi4_mark = skb->mark;
@@ -2708,6 +2745,7 @@ static struct dst_ops ipv4_dst_blackhole_ops = {
.default_advmss = ipv4_default_advmss,
.update_pmtu = ipv4_rt_blackhole_update_pmtu,
.cow_metrics = ipv4_rt_blackhole_cow_metrics,
+ .neigh_lookup = ipv4_neigh_lookup,
};
struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
@@ -3303,7 +3341,7 @@ int __init ip_rt_init(void)
xfrm_init();
xfrm4_init(ip_rt_max_size);
#endif
- rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL);
+ rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, NULL);
#ifdef CONFIG_SYSCTL
register_pernet_subsys(&sysctl_route_ops);
diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
index 26461492a847..92bb9434b338 100644
--- a/net/ipv4/syncookies.c
+++ b/net/ipv4/syncookies.c
@@ -316,6 +316,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,
ireq->wscale_ok = tcp_opt.wscale_ok;
ireq->tstamp_ok = tcp_opt.saw_tstamp;
req->ts_recent = tcp_opt.saw_tstamp ? tcp_opt.rcv_tsval : 0;
+ treq->snt_synack = tcp_opt.saw_tstamp ? tcp_opt.rcv_tsecr : 0;
/* We throwed the options of the initial SYN away, so we hope
* the ACK carries the same options again (see RFC1122 4.2.3.8)
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 57d0752e239a..69fd7201129a 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -398,20 +398,6 @@ static struct ctl_table ipv4_table[] = {
.proc_handler = proc_dointvec_jiffies,
},
{
- .procname = "inet_peer_gc_mintime",
- .data = &inet_peer_gc_mintime,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec_jiffies,
- },
- {
- .procname = "inet_peer_gc_maxtime",
- .data = &inet_peer_gc_maxtime,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec_jiffies,
- },
- {
.procname = "tcp_orphan_retries",
.data = &sysctl_tcp_orphan_retries,
.maxlen = sizeof(int),
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index bef9f04c22ba..ea0d2183df4b 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -880,6 +880,11 @@ static void tcp_init_metrics(struct sock *sk)
tp->snd_ssthresh = dst_metric(dst, RTAX_SSTHRESH);
if (tp->snd_ssthresh > tp->snd_cwnd_clamp)
tp->snd_ssthresh = tp->snd_cwnd_clamp;
+ } else {
+ /* ssthresh may have been reduced unnecessarily during.
+ * 3WHS. Restore it back to its initial default.
+ */
+ tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
}
if (dst_metric(dst, RTAX_REORDERING) &&
tp->reordering != dst_metric(dst, RTAX_REORDERING)) {
@@ -887,10 +892,7 @@ static void tcp_init_metrics(struct sock *sk)
tp->reordering = dst_metric(dst, RTAX_REORDERING);
}
- if (dst_metric(dst, RTAX_RTT) == 0)
- goto reset;
-
- if (!tp->srtt && dst_metric_rtt(dst, RTAX_RTT) < (TCP_TIMEOUT_INIT << 3))
+ if (dst_metric(dst, RTAX_RTT) == 0 || tp->srtt == 0)
goto reset;
/* Initial rtt is determined from SYN,SYN-ACK.
@@ -916,19 +918,26 @@ static void tcp_init_metrics(struct sock *sk)
tp->mdev_max = tp->rttvar = max(tp->mdev, tcp_rto_min(sk));
}
tcp_set_rto(sk);
- if (inet_csk(sk)->icsk_rto < TCP_TIMEOUT_INIT && !tp->rx_opt.saw_tstamp) {
reset:
- /* Play conservative. If timestamps are not
- * supported, TCP will fail to recalculate correct
- * rtt, if initial rto is too small. FORGET ALL AND RESET!
+ if (tp->srtt == 0) {
+ /* RFC2988bis: We've failed to get a valid RTT sample from
+ * 3WHS. This is most likely due to retransmission,
+ * including spurious one. Reset the RTO back to 3secs
+ * from the more aggressive 1sec to avoid more spurious
+ * retransmission.
*/
- if (!tp->rx_opt.saw_tstamp && tp->srtt) {
- tp->srtt = 0;
- tp->mdev = tp->mdev_max = tp->rttvar = TCP_TIMEOUT_INIT;
- inet_csk(sk)->icsk_rto = TCP_TIMEOUT_INIT;
- }
+ tp->mdev = tp->mdev_max = tp->rttvar = TCP_TIMEOUT_FALLBACK;
+ inet_csk(sk)->icsk_rto = TCP_TIMEOUT_FALLBACK;
}
- tp->snd_cwnd = tcp_init_cwnd(tp, dst);
+ /* Cut cwnd down to 1 per RFC5681 if SYN or SYN-ACK has been
+ * retransmitted. In light of RFC2988bis' more aggressive 1sec
+ * initRTO, we only reset cwnd when more than 1 SYN/SYN-ACK
+ * retransmission has occurred.
+ */
+ if (tp->total_retrans > 1)
+ tp->snd_cwnd = 1;
+ else
+ tp->snd_cwnd = tcp_init_cwnd(tp, dst);
tp->snd_cwnd_stamp = tcp_time_stamp;
}
@@ -3112,12 +3121,13 @@ static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked, int flag)
tcp_xmit_retransmit_queue(sk);
}
-static void tcp_valid_rtt_meas(struct sock *sk, u32 seq_rtt)
+void tcp_valid_rtt_meas(struct sock *sk, u32 seq_rtt)
{
tcp_rtt_estimator(sk, seq_rtt);
tcp_set_rto(sk);
inet_csk(sk)->icsk_backoff = 0;
}
+EXPORT_SYMBOL(tcp_valid_rtt_meas);
/* Read draft-ietf-tcplw-high-performance before mucking
* with this code. (Supersedes RFC1323)
@@ -5806,12 +5816,6 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
tp->rx_opt.snd_wscale;
tcp_init_wl(tp, TCP_SKB_CB(skb)->seq);
- /* tcp_ack considers this ACK as duplicate
- * and does not calculate rtt.
- * Force it here.
- */
- tcp_ack_update_rtt(sk, 0, 0);
-
if (tp->rx_opt.tstamp_ok)
tp->advmss -= TCPOLEN_TSTAMP_ALIGNED;
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 708dc203b034..955b8e65b69e 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -429,8 +429,8 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
break;
icsk->icsk_backoff--;
- inet_csk(sk)->icsk_rto = __tcp_set_rto(tp) <<
- icsk->icsk_backoff;
+ inet_csk(sk)->icsk_rto = (tp->srtt ? __tcp_set_rto(tp) :
+ TCP_TIMEOUT_INIT) << icsk->icsk_backoff;
tcp_bound_rto(sk);
skb = tcp_write_queue_head(sk);
@@ -1384,6 +1384,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
isn = tcp_v4_init_sequence(skb);
}
tcp_rsk(req)->snt_isn = isn;
+ tcp_rsk(req)->snt_synack = tcp_time_stamp;
if (tcp_v4_send_synack(sk, dst, req,
(struct request_values *)&tmp_ext) ||
@@ -1458,6 +1459,10 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
newtp->advmss = tcp_sk(sk)->rx_opt.user_mss;
tcp_initialize_rcv_mss(newsk);
+ if (tcp_rsk(req)->snt_synack)
+ tcp_valid_rtt_meas(newsk,
+ tcp_time_stamp - tcp_rsk(req)->snt_synack);
+ newtp->total_retrans = req->retrans;
#ifdef CONFIG_TCP_MD5SIG
/* Copy over the MD5 key from the original socket */
@@ -1855,7 +1860,7 @@ static int tcp_v4_init_sock(struct sock *sk)
* algorithms that we must have the following bandaid to talk
* efficiently to them. -DaveM
*/
- tp->snd_cwnd = 2;
+ tp->snd_cwnd = TCP_INIT_CWND;
/* See draft-stevens-tcpca-spec-01 for discussion of the
* initialization of these values.
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index 80b1f80759ab..d2fe4e06b472 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -486,7 +486,7 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
* algorithms that we must have the following bandaid to talk
* efficiently to them. -DaveM
*/
- newtp->snd_cwnd = 2;
+ newtp->snd_cwnd = TCP_INIT_CWND;
newtp->snd_cwnd_cnt = 0;
newtp->bytes_acked = 0;
@@ -720,6 +720,10 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPDEFERACCEPTDROP);
return NULL;
}
+ if (tmp_opt.saw_tstamp && tmp_opt.rcv_tsecr)
+ tcp_rsk(req)->snt_synack = tmp_opt.rcv_tsecr;
+ else if (req->retrans) /* don't take RTT sample if retrans && ~TS */
+ tcp_rsk(req)->snt_synack = 0;
/* OK, ACK is valid, create big socket and
* feed this segment to it. It will repeat all
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 198f75b7bdd3..1b5a19340a95 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -105,6 +105,7 @@
#include <net/route.h>
#include <net/checksum.h>
#include <net/xfrm.h>
+#include <trace/events/udp.h>
#include "udp_impl.h"
struct udp_table udp_table __read_mostly;
@@ -1366,6 +1367,7 @@ static int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
is_udplite);
UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_INERRORS, is_udplite);
kfree_skb(skb);
+ trace_udp_fail_queue_rcv_skb(rc, sk);
return -1;
}
diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c
index 981e43eaf704..fc5368ad2b0d 100644
--- a/net/ipv4/xfrm4_policy.c
+++ b/net/ipv4/xfrm4_policy.c
@@ -117,7 +117,7 @@ _decode_session4(struct sk_buff *skb, struct flowi *fl, int reverse)
memset(fl4, 0, sizeof(struct flowi4));
fl4->flowi4_mark = skb->mark;
- if (!(iph->frag_off & htons(IP_MF | IP_OFFSET))) {
+ if (!ip_is_fragment(iph)) {
switch (iph->protocol) {
case IPPROTO_UDP:
case IPPROTO_UDPLITE: