summaryrefslogtreecommitdiff
path: root/net/ipv4
diff options
context:
space:
mode:
Diffstat (limited to 'net/ipv4')
-rw-r--r--net/ipv4/Kconfig7
-rw-r--r--net/ipv4/Makefile3
-rw-r--r--net/ipv4/af_inet.c46
-rw-r--r--net/ipv4/ah4.c17
-rw-r--r--net/ipv4/arp.c163
-rw-r--r--net/ipv4/cipso_ipv4.c2
-rw-r--r--net/ipv4/datagram.c2
-rw-r--r--net/ipv4/devinet.c415
-rw-r--r--net/ipv4/esp4.c33
-rw-r--r--net/ipv4/fib_frontend.c271
-rw-r--r--net/ipv4/fib_hash.c126
-rw-r--r--net/ipv4/fib_lookup.h16
-rw-r--r--net/ipv4/fib_rules.c81
-rw-r--r--net/ipv4/fib_semantics.c64
-rw-r--r--net/ipv4/fib_trie.c866
-rw-r--r--net/ipv4/icmp.c130
-rw-r--r--net/ipv4/igmp.c56
-rw-r--r--net/ipv4/inet_connection_sock.c19
-rw-r--r--net/ipv4/inet_diag.c2
-rw-r--r--net/ipv4/inet_fragment.c85
-rw-r--r--net/ipv4/inet_hashtables.c87
-rw-r--r--net/ipv4/inet_timewait_sock.c23
-rw-r--r--net/ipv4/ip_forward.c2
-rw-r--r--net/ipv4/ip_fragment.c202
-rw-r--r--net/ipv4/ip_gre.c136
-rw-r--r--net/ipv4/ip_input.c22
-rw-r--r--net/ipv4/ip_options.c4
-rw-r--r--net/ipv4/ip_output.c55
-rw-r--r--net/ipv4/ip_sockglue.c2
-rw-r--r--net/ipv4/ipcomp.c20
-rw-r--r--net/ipv4/ipconfig.c27
-rw-r--r--net/ipv4/ipip.c72
-rw-r--r--net/ipv4/ipmr.c35
-rw-r--r--net/ipv4/ipvs/ip_vs_app.c9
-rw-r--r--net/ipv4/ipvs/ip_vs_conn.c73
-rw-r--r--net/ipv4/ipvs/ip_vs_core.c116
-rw-r--r--net/ipv4/ipvs/ip_vs_ctl.c39
-rw-r--r--net/ipv4/ipvs/ip_vs_est.c4
-rw-r--r--net/ipv4/ipvs/ip_vs_lblc.c36
-rw-r--r--net/ipv4/ipvs/ip_vs_lblcr.c36
-rw-r--r--net/ipv4/ipvs/ip_vs_proto.c2
-rw-r--r--net/ipv4/ipvs/ip_vs_proto_esp.c16
-rw-r--r--net/ipv4/ipvs/ip_vs_sched.c1
-rw-r--r--net/ipv4/ipvs/ip_vs_sync.c34
-rw-r--r--net/ipv4/ipvs/ip_vs_xmit.c16
-rw-r--r--net/ipv4/netfilter.c43
-rw-r--r--net/ipv4/netfilter/Kconfig85
-rw-r--r--net/ipv4/netfilter/Makefile5
-rw-r--r--net/ipv4/netfilter/arp_tables.c989
-rw-r--r--net/ipv4/netfilter/arptable_filter.c2
-rw-r--r--net/ipv4/netfilter/ip_queue.c210
-rw-r--r--net/ipv4/netfilter/ip_tables.c493
-rw-r--r--net/ipv4/netfilter/ipt_CLUSTERIP.c55
-rw-r--r--net/ipv4/netfilter/ipt_ECN.c43
-rw-r--r--net/ipv4/netfilter/ipt_LOG.c45
-rw-r--r--net/ipv4/netfilter/ipt_MASQUERADE.c43
-rw-r--r--net/ipv4/netfilter/ipt_NETMAP.c52
-rw-r--r--net/ipv4/netfilter/ipt_REDIRECT.c47
-rw-r--r--net/ipv4/netfilter/ipt_REJECT.c155
-rw-r--r--net/ipv4/netfilter/ipt_SAME.c179
-rw-r--r--net/ipv4/netfilter/ipt_TOS.c87
-rw-r--r--net/ipv4/netfilter/ipt_TTL.c40
-rw-r--r--net/ipv4/netfilter/ipt_ULOG.c52
-rw-r--r--net/ipv4/netfilter/ipt_addrtype.c115
-rw-r--r--net/ipv4/netfilter/ipt_ah.c39
-rw-r--r--net/ipv4/netfilter/ipt_ecn.c35
-rw-r--r--net/ipv4/netfilter/ipt_iprange.c79
-rw-r--r--net/ipv4/netfilter/ipt_owner.c92
-rw-r--r--net/ipv4/netfilter/ipt_recent.c41
-rw-r--r--net/ipv4/netfilter/ipt_tos.c55
-rw-r--r--net/ipv4/netfilter/ipt_ttl.c26
-rw-r--r--net/ipv4/netfilter/iptable_filter.c24
-rw-r--r--net/ipv4/netfilter/iptable_mangle.c42
-rw-r--r--net/ipv4/netfilter/iptable_raw.c16
-rw-r--r--net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c37
-rw-r--r--net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c5
-rw-r--r--net/ipv4/netfilter/nf_conntrack_proto_icmp.c30
-rw-r--r--net/ipv4/netfilter/nf_nat_core.c68
-rw-r--r--net/ipv4/netfilter/nf_nat_h323.c26
-rw-r--r--net/ipv4/netfilter/nf_nat_helper.c29
-rw-r--r--net/ipv4/netfilter/nf_nat_pptp.c6
-rw-r--r--net/ipv4/netfilter/nf_nat_proto_gre.c3
-rw-r--r--net/ipv4/netfilter/nf_nat_proto_icmp.c6
-rw-r--r--net/ipv4/netfilter/nf_nat_proto_tcp.c6
-rw-r--r--net/ipv4/netfilter/nf_nat_proto_udp.c8
-rw-r--r--net/ipv4/netfilter/nf_nat_proto_unknown.c2
-rw-r--r--net/ipv4/netfilter/nf_nat_rule.c38
-rw-r--r--net/ipv4/netfilter/nf_nat_sip.c6
-rw-r--r--net/ipv4/netfilter/nf_nat_snmp_basic.c2
-rw-r--r--net/ipv4/netfilter/nf_nat_standalone.c22
-rw-r--r--net/ipv4/proc.c15
-rw-r--r--net/ipv4/raw.c216
-rw-r--r--net/ipv4/route.c427
-rw-r--r--net/ipv4/syncookies.c2
-rw-r--r--net/ipv4/sysctl_net_ipv4.c167
-rw-r--r--net/ipv4/tcp.c223
-rw-r--r--net/ipv4/tcp_bic.c3
-rw-r--r--net/ipv4/tcp_cong.c23
-rw-r--r--net/ipv4/tcp_cubic.c3
-rw-r--r--net/ipv4/tcp_highspeed.c3
-rw-r--r--net/ipv4/tcp_htcp.c3
-rw-r--r--net/ipv4/tcp_hybla.c5
-rw-r--r--net/ipv4/tcp_illinois.c3
-rw-r--r--net/ipv4/tcp_input.c1281
-rw-r--r--net/ipv4/tcp_ipv4.c8
-rw-r--r--net/ipv4/tcp_lp.c4
-rw-r--r--net/ipv4/tcp_output.c651
-rw-r--r--net/ipv4/tcp_scalable.c3
-rw-r--r--net/ipv4/tcp_timer.c43
-rw-r--r--net/ipv4/tcp_vegas.c7
-rw-r--r--net/ipv4/tcp_veno.c7
-rw-r--r--net/ipv4/tcp_yeah.c3
-rw-r--r--net/ipv4/udp.c106
-rw-r--r--net/ipv4/udplite.c2
-rw-r--r--net/ipv4/xfrm4_input.c134
-rw-r--r--net/ipv4/xfrm4_mode_beet.c62
-rw-r--r--net/ipv4/xfrm4_mode_tunnel.c93
-rw-r--r--net/ipv4/xfrm4_output.c95
-rw-r--r--net/ipv4/xfrm4_policy.c219
-rw-r--r--net/ipv4/xfrm4_state.c20
120 files changed, 5822 insertions, 4965 deletions
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index 9f9fd2c6f6e2..24e2b7294bf8 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -85,6 +85,13 @@ endchoice
config IP_FIB_HASH
def_bool ASK_IP_FIB_HASH || !IP_ADVANCED_ROUTER
+config IP_FIB_TRIE_STATS
+ bool "FIB TRIE statistics"
+ depends on IP_FIB_TRIE
+ ---help---
+ Keep track of statistics on structure of FIB TRIE table.
+ Useful for testing and measuring TRIE performance.
+
config IP_MULTIPLE_TABLES
bool "IP: policy routing"
depends on IP_ADVANCED_ROUTER
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index 93fe3966805d..ad40ef3f9ebc 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -10,9 +10,10 @@ obj-y := route.o inetpeer.o protocol.o \
tcp_minisocks.o tcp_cong.o \
datagram.o raw.o udp.o udplite.o \
arp.o icmp.o devinet.o af_inet.o igmp.o \
- sysctl_net_ipv4.o fib_frontend.o fib_semantics.o \
+ fib_frontend.o fib_semantics.o \
inet_fragment.o
+obj-$(CONFIG_SYSCTL) += sysctl_net_ipv4.o
obj-$(CONFIG_IP_FIB_HASH) += fib_hash.o
obj-$(CONFIG_IP_FIB_TRIE) += fib_trie.o
obj-$(CONFIG_PROC_FS) += proc.o
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index d2f22e74b267..09ca5293d08f 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -126,6 +126,10 @@ extern void ip_mc_drop_socket(struct sock *sk);
static struct list_head inetsw[SOCK_MAX];
static DEFINE_SPINLOCK(inetsw_lock);
+struct ipv4_config ipv4_config;
+
+EXPORT_SYMBOL(ipv4_config);
+
/* New destruction routine */
void inet_sock_destruct(struct sock *sk)
@@ -135,6 +139,8 @@ void inet_sock_destruct(struct sock *sk)
__skb_queue_purge(&sk->sk_receive_queue);
__skb_queue_purge(&sk->sk_error_queue);
+ sk_mem_reclaim(sk);
+
if (sk->sk_type == SOCK_STREAM && sk->sk_state != TCP_CLOSE) {
printk("Attempt to release TCP socket in state %d %p\n",
sk->sk_state, sk);
@@ -440,7 +446,7 @@ int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
if (addr_len < sizeof(struct sockaddr_in))
goto out;
- chk_addr_ret = inet_addr_type(addr->sin_addr.s_addr);
+ chk_addr_ret = inet_addr_type(&init_net, addr->sin_addr.s_addr);
/* Not specified by any standard per-se, however it breaks too
* many applications when removed. It is unfortunate since
@@ -789,12 +795,12 @@ int inet_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
case SIOCADDRT:
case SIOCDELRT:
case SIOCRTMSG:
- err = ip_rt_ioctl(cmd, (void __user *)arg);
+ err = ip_rt_ioctl(sk->sk_net, cmd, (void __user *)arg);
break;
case SIOCDARP:
case SIOCGARP:
case SIOCSARP:
- err = arp_ioctl(cmd, (void __user *)arg);
+ err = arp_ioctl(sk->sk_net, cmd, (void __user *)arg);
break;
case SIOCGIFADDR:
case SIOCSIFADDR:
@@ -838,6 +844,7 @@ const struct proto_ops inet_stream_ops = {
.recvmsg = sock_common_recvmsg,
.mmap = sock_no_mmap,
.sendpage = tcp_sendpage,
+ .splice_read = tcp_splice_read,
#ifdef CONFIG_COMPAT
.compat_setsockopt = compat_sock_common_setsockopt,
.compat_getsockopt = compat_sock_common_getsockopt,
@@ -1106,7 +1113,7 @@ int inet_sk_rebuild_header(struct sock *sk)
};
security_sk_classify_flow(sk, &fl);
- err = ip_route_output_flow(&rt, &fl, sk, 0);
+ err = ip_route_output_flow(&init_net, &rt, &fl, sk, 0);
}
if (!err)
sk_setup_caps(sk, &rt->u.dst);
@@ -1237,7 +1244,7 @@ unsigned long snmp_fold_field(void *mib[], int offt)
}
EXPORT_SYMBOL_GPL(snmp_fold_field);
-int snmp_mib_init(void *ptr[2], size_t mibsize, size_t mibalign)
+int snmp_mib_init(void *ptr[2], size_t mibsize)
{
BUG_ON(ptr == NULL);
ptr[0] = __alloc_percpu(mibsize);
@@ -1286,37 +1293,31 @@ static struct net_protocol udp_protocol = {
static struct net_protocol icmp_protocol = {
.handler = icmp_rcv,
+ .no_policy = 1,
};
static int __init init_ipv4_mibs(void)
{
if (snmp_mib_init((void **)net_statistics,
- sizeof(struct linux_mib),
- __alignof__(struct linux_mib)) < 0)
+ sizeof(struct linux_mib)) < 0)
goto err_net_mib;
if (snmp_mib_init((void **)ip_statistics,
- sizeof(struct ipstats_mib),
- __alignof__(struct ipstats_mib)) < 0)
+ sizeof(struct ipstats_mib)) < 0)
goto err_ip_mib;
if (snmp_mib_init((void **)icmp_statistics,
- sizeof(struct icmp_mib),
- __alignof__(struct icmp_mib)) < 0)
+ sizeof(struct icmp_mib)) < 0)
goto err_icmp_mib;
if (snmp_mib_init((void **)icmpmsg_statistics,
- sizeof(struct icmpmsg_mib),
- __alignof__(struct icmpmsg_mib)) < 0)
+ sizeof(struct icmpmsg_mib)) < 0)
goto err_icmpmsg_mib;
if (snmp_mib_init((void **)tcp_statistics,
- sizeof(struct tcp_mib),
- __alignof__(struct tcp_mib)) < 0)
+ sizeof(struct tcp_mib)) < 0)
goto err_tcp_mib;
if (snmp_mib_init((void **)udp_statistics,
- sizeof(struct udp_mib),
- __alignof__(struct udp_mib)) < 0)
+ sizeof(struct udp_mib)) < 0)
goto err_udp_mib;
if (snmp_mib_init((void **)udplite_statistics,
- sizeof(struct udp_mib),
- __alignof__(struct udp_mib)) < 0)
+ sizeof(struct udp_mib)) < 0)
goto err_udplite_mib;
tcp_mib_init();
@@ -1418,6 +1419,9 @@ static int __init inet_init(void)
/* Setup TCP slab cache for open requests. */
tcp_init();
+ /* Setup UDP memory threshold */
+ udp_init();
+
/* Add UDP-Lite (RFC 3828) */
udplite4_register();
@@ -1471,15 +1475,11 @@ static int __init ipv4_proc_init(void)
goto out_tcp;
if (udp4_proc_init())
goto out_udp;
- if (fib_proc_init())
- goto out_fib;
if (ip_misc_proc_init())
goto out_misc;
out:
return rc;
out_misc:
- fib_proc_exit();
-out_fib:
udp4_proc_exit();
out_udp:
tcp4_proc_exit();
diff --git a/net/ipv4/ah4.c b/net/ipv4/ah4.c
index 5fc346d8b566..d76803a3dcae 100644
--- a/net/ipv4/ah4.c
+++ b/net/ipv4/ah4.c
@@ -169,6 +169,8 @@ static int ah_input(struct xfrm_state *x, struct sk_buff *skb)
if (ip_clear_mutable_options(iph, &dummy))
goto out;
}
+
+ spin_lock(&x->lock);
{
u8 auth_data[MAX_AH_AUTH_LEN];
@@ -176,13 +178,16 @@ static int ah_input(struct xfrm_state *x, struct sk_buff *skb)
skb_push(skb, ihl);
err = ah_mac_digest(ahp, skb, ah->auth_data);
if (err)
- goto out;
- err = -EINVAL;
- if (memcmp(ahp->work_icv, auth_data, ahp->icv_trunc_len)) {
- x->stats.integrity_failed++;
- goto out;
- }
+ goto unlock;
+ if (memcmp(ahp->work_icv, auth_data, ahp->icv_trunc_len))
+ err = -EBADMSG;
}
+unlock:
+ spin_unlock(&x->lock);
+
+ if (err)
+ goto out;
+
skb->network_header += ah_hlen;
memcpy(skb_network_header(skb), work_buf, ihl);
skb->transport_header = skb->network_header;
diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c
index 54a76b8b803a..5976c598cc4b 100644
--- a/net/ipv4/arp.c
+++ b/net/ipv4/arp.c
@@ -235,8 +235,6 @@ static int arp_constructor(struct neighbour *neigh)
struct in_device *in_dev;
struct neigh_parms *parms;
- neigh->type = inet_addr_type(addr);
-
rcu_read_lock();
in_dev = __in_dev_get_rcu(dev);
if (in_dev == NULL) {
@@ -244,6 +242,8 @@ static int arp_constructor(struct neighbour *neigh)
return -EINVAL;
}
+ neigh->type = inet_addr_type(&init_net, addr);
+
parms = in_dev->arp_parms;
__neigh_parms_put(neigh->parms);
neigh->parms = neigh_parms_clone(parms);
@@ -341,14 +341,14 @@ static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb)
switch (IN_DEV_ARP_ANNOUNCE(in_dev)) {
default:
case 0: /* By default announce any local IP */
- if (skb && inet_addr_type(ip_hdr(skb)->saddr) == RTN_LOCAL)
+ if (skb && inet_addr_type(&init_net, ip_hdr(skb)->saddr) == RTN_LOCAL)
saddr = ip_hdr(skb)->saddr;
break;
case 1: /* Restrict announcements of saddr in same subnet */
if (!skb)
break;
saddr = ip_hdr(skb)->saddr;
- if (inet_addr_type(saddr) == RTN_LOCAL) {
+ if (inet_addr_type(&init_net, saddr) == RTN_LOCAL) {
/* saddr should be known to target */
if (inet_addr_onlink(in_dev, target, saddr))
break;
@@ -382,8 +382,7 @@ static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb)
read_unlock_bh(&neigh->lock);
}
-static int arp_ignore(struct in_device *in_dev, struct net_device *dev,
- __be32 sip, __be32 tip)
+static int arp_ignore(struct in_device *in_dev, __be32 sip, __be32 tip)
{
int scope;
@@ -403,7 +402,6 @@ static int arp_ignore(struct in_device *in_dev, struct net_device *dev,
case 3: /* Do not reply for scope host addresses */
sip = 0;
scope = RT_SCOPE_LINK;
- dev = NULL;
break;
case 4: /* Reserved */
case 5:
@@ -415,7 +413,7 @@ static int arp_ignore(struct in_device *in_dev, struct net_device *dev,
default:
return 0;
}
- return !inet_confirm_addr(dev, sip, tip, scope);
+ return !inet_confirm_addr(in_dev, sip, tip, scope);
}
static int arp_filter(__be32 sip, __be32 tip, struct net_device *dev)
@@ -426,7 +424,7 @@ static int arp_filter(__be32 sip, __be32 tip, struct net_device *dev)
int flag = 0;
/*unsigned long now; */
- if (ip_route_output_key(&rt, &fl) < 0)
+ if (ip_route_output_key(&init_net, &rt, &fl) < 0)
return 1;
if (rt->u.dst.dev != dev) {
NET_INC_STATS_BH(LINUX_MIB_ARPFILTER);
@@ -479,7 +477,7 @@ int arp_find(unsigned char *haddr, struct sk_buff *skb)
paddr = ((struct rtable*)skb->dst)->rt_gateway;
- if (arp_set_predefined(inet_addr_type(paddr), haddr, paddr, dev))
+ if (arp_set_predefined(inet_addr_type(&init_net, paddr), haddr, paddr, dev))
return 0;
n = __neigh_lookup(&arp_tbl, &paddr, dev, 1);
@@ -777,7 +775,7 @@ static int arp_process(struct sk_buff *skb)
* Check for bad requests for 127.x.x.x and requests for multicast
* addresses. If this is one such, delete it.
*/
- if (LOOPBACK(tip) || MULTICAST(tip))
+ if (ipv4_is_loopback(tip) || ipv4_is_multicast(tip))
goto out;
/*
@@ -806,8 +804,8 @@ static int arp_process(struct sk_buff *skb)
/* Special case: IPv4 duplicate address detection packet (RFC2131) */
if (sip == 0) {
if (arp->ar_op == htons(ARPOP_REQUEST) &&
- inet_addr_type(tip) == RTN_LOCAL &&
- !arp_ignore(in_dev,dev,sip,tip))
+ inet_addr_type(&init_net, tip) == RTN_LOCAL &&
+ !arp_ignore(in_dev, sip, tip))
arp_send(ARPOP_REPLY, ETH_P_ARP, sip, dev, tip, sha,
dev->dev_addr, sha);
goto out;
@@ -825,7 +823,7 @@ static int arp_process(struct sk_buff *skb)
int dont_send = 0;
if (!dont_send)
- dont_send |= arp_ignore(in_dev,dev,sip,tip);
+ dont_send |= arp_ignore(in_dev,sip,tip);
if (!dont_send && IN_DEV_ARPFILTER(in_dev))
dont_send |= arp_filter(sip,tip,dev);
if (!dont_send)
@@ -835,9 +833,8 @@ static int arp_process(struct sk_buff *skb)
}
goto out;
} else if (IN_DEV_FORWARD(in_dev)) {
- if ((rt->rt_flags&RTCF_DNAT) ||
- (addr_type == RTN_UNICAST && rt->u.dst.dev != dev &&
- (arp_fwd_proxy(in_dev, rt) || pneigh_lookup(&arp_tbl, &tip, dev, 0)))) {
+ if (addr_type == RTN_UNICAST && rt->u.dst.dev != dev &&
+ (arp_fwd_proxy(in_dev, rt) || pneigh_lookup(&arp_tbl, &init_net, &tip, dev, 0))) {
n = neigh_event_ns(&arp_tbl, sha, &sip, dev);
if (n)
neigh_release(n);
@@ -860,14 +857,14 @@ static int arp_process(struct sk_buff *skb)
n = __neigh_lookup(&arp_tbl, &sip, dev, 0);
- if (IPV4_DEVCONF_ALL(ARP_ACCEPT)) {
+ if (IPV4_DEVCONF_ALL(dev->nd_net, ARP_ACCEPT)) {
/* Unsolicited ARP is not accepted by default.
It is possible, that this option should be enabled for some
devices (strip is candidate)
*/
if (n == NULL &&
arp->ar_op == htons(ARPOP_REPLY) &&
- inet_addr_type(sip) == RTN_UNICAST)
+ inet_addr_type(&init_net, sip) == RTN_UNICAST)
n = __neigh_lookup(&arp_tbl, &sip, dev, 1);
}
@@ -952,44 +949,60 @@ out_of_mem:
* Set (create) an ARP cache entry.
*/
-static int arp_req_set(struct arpreq *r, struct net_device * dev)
+static int arp_req_set_proxy(struct net *net, struct net_device *dev, int on)
{
- __be32 ip = ((struct sockaddr_in *) &r->arp_pa)->sin_addr.s_addr;
+ if (dev == NULL) {
+ IPV4_DEVCONF_ALL(net, PROXY_ARP) = on;
+ return 0;
+ }
+ if (__in_dev_get_rtnl(dev)) {
+ IN_DEV_CONF_SET(__in_dev_get_rtnl(dev), PROXY_ARP, on);
+ return 0;
+ }
+ return -ENXIO;
+}
+
+static int arp_req_set_public(struct net *net, struct arpreq *r,
+ struct net_device *dev)
+{
+ __be32 ip = ((struct sockaddr_in *)&r->arp_pa)->sin_addr.s_addr;
+ __be32 mask = ((struct sockaddr_in *)&r->arp_netmask)->sin_addr.s_addr;
+
+ if (mask && mask != htonl(0xFFFFFFFF))
+ return -EINVAL;
+ if (!dev && (r->arp_flags & ATF_COM)) {
+ dev = dev_getbyhwaddr(net, r->arp_ha.sa_family,
+ r->arp_ha.sa_data);
+ if (!dev)
+ return -ENODEV;
+ }
+ if (mask) {
+ if (pneigh_lookup(&arp_tbl, net, &ip, dev, 1) == NULL)
+ return -ENOBUFS;
+ return 0;
+ }
+
+ return arp_req_set_proxy(net, dev, 1);
+}
+
+static int arp_req_set(struct net *net, struct arpreq *r,
+ struct net_device * dev)
+{
+ __be32 ip;
struct neighbour *neigh;
int err;
- if (r->arp_flags&ATF_PUBL) {
- __be32 mask = ((struct sockaddr_in *) &r->arp_netmask)->sin_addr.s_addr;
- if (mask && mask != htonl(0xFFFFFFFF))
- return -EINVAL;
- if (!dev && (r->arp_flags & ATF_COM)) {
- dev = dev_getbyhwaddr(&init_net, r->arp_ha.sa_family, r->arp_ha.sa_data);
- if (!dev)
- return -ENODEV;
- }
- if (mask) {
- if (pneigh_lookup(&arp_tbl, &ip, dev, 1) == NULL)
- return -ENOBUFS;
- return 0;
- }
- if (dev == NULL) {
- IPV4_DEVCONF_ALL(PROXY_ARP) = 1;
- return 0;
- }
- if (__in_dev_get_rtnl(dev)) {
- IN_DEV_CONF_SET(__in_dev_get_rtnl(dev), PROXY_ARP, 1);
- return 0;
- }
- return -ENXIO;
- }
+ if (r->arp_flags & ATF_PUBL)
+ return arp_req_set_public(net, r, dev);
+ ip = ((struct sockaddr_in *)&r->arp_pa)->sin_addr.s_addr;
if (r->arp_flags & ATF_PERM)
r->arp_flags |= ATF_COM;
if (dev == NULL) {
struct flowi fl = { .nl_u = { .ip4_u = { .daddr = ip,
.tos = RTO_ONLINK } } };
struct rtable * rt;
- if ((err = ip_route_output_key(&rt, &fl)) != 0)
+ if ((err = ip_route_output_key(net, &rt, &fl)) != 0)
return err;
dev = rt->u.dst.dev;
ip_rt_put(rt);
@@ -1066,37 +1079,37 @@ static int arp_req_get(struct arpreq *r, struct net_device *dev)
return err;
}
-static int arp_req_delete(struct arpreq *r, struct net_device * dev)
+static int arp_req_delete_public(struct net *net, struct arpreq *r,
+ struct net_device *dev)
+{
+ __be32 ip = ((struct sockaddr_in *) &r->arp_pa)->sin_addr.s_addr;
+ __be32 mask = ((struct sockaddr_in *)&r->arp_netmask)->sin_addr.s_addr;
+
+ if (mask == htonl(0xFFFFFFFF))
+ return pneigh_delete(&arp_tbl, net, &ip, dev);
+
+ if (mask)
+ return -EINVAL;
+
+ return arp_req_set_proxy(net, dev, 0);
+}
+
+static int arp_req_delete(struct net *net, struct arpreq *r,
+ struct net_device * dev)
{
int err;
- __be32 ip = ((struct sockaddr_in *)&r->arp_pa)->sin_addr.s_addr;
+ __be32 ip;
struct neighbour *neigh;
- if (r->arp_flags & ATF_PUBL) {
- __be32 mask =
- ((struct sockaddr_in *)&r->arp_netmask)->sin_addr.s_addr;
- if (mask == htonl(0xFFFFFFFF))
- return pneigh_delete(&arp_tbl, &ip, dev);
- if (mask == 0) {
- if (dev == NULL) {
- IPV4_DEVCONF_ALL(PROXY_ARP) = 0;
- return 0;
- }
- if (__in_dev_get_rtnl(dev)) {
- IN_DEV_CONF_SET(__in_dev_get_rtnl(dev),
- PROXY_ARP, 0);
- return 0;
- }
- return -ENXIO;
- }
- return -EINVAL;
- }
+ if (r->arp_flags & ATF_PUBL)
+ return arp_req_delete_public(net, r, dev);
+ ip = ((struct sockaddr_in *)&r->arp_pa)->sin_addr.s_addr;
if (dev == NULL) {
struct flowi fl = { .nl_u = { .ip4_u = { .daddr = ip,
.tos = RTO_ONLINK } } };
struct rtable * rt;
- if ((err = ip_route_output_key(&rt, &fl)) != 0)
+ if ((err = ip_route_output_key(net, &rt, &fl)) != 0)
return err;
dev = rt->u.dst.dev;
ip_rt_put(rt);
@@ -1119,7 +1132,7 @@ static int arp_req_delete(struct arpreq *r, struct net_device * dev)
* Handle an ARP layer I/O control request.
*/
-int arp_ioctl(unsigned int cmd, void __user *arg)
+int arp_ioctl(struct net *net, unsigned int cmd, void __user *arg)
{
int err;
struct arpreq r;
@@ -1151,7 +1164,7 @@ int arp_ioctl(unsigned int cmd, void __user *arg)
rtnl_lock();
if (r.arp_dev[0]) {
err = -ENODEV;
- if ((dev = __dev_get_by_name(&init_net, r.arp_dev)) == NULL)
+ if ((dev = __dev_get_by_name(net, r.arp_dev)) == NULL)
goto out;
/* Mmmm... It is wrong... ARPHRD_NETROM==0 */
@@ -1167,10 +1180,10 @@ int arp_ioctl(unsigned int cmd, void __user *arg)
switch (cmd) {
case SIOCDARP:
- err = arp_req_delete(&r, dev);
+ err = arp_req_delete(net, &r, dev);
break;
case SIOCSARP:
- err = arp_req_set(&r, dev);
+ err = arp_req_set(net, &r, dev);
break;
case SIOCGARP:
err = arp_req_get(&r, dev);
@@ -1359,8 +1372,8 @@ static const struct seq_operations arp_seq_ops = {
static int arp_seq_open(struct inode *inode, struct file *file)
{
- return seq_open_private(file, &arp_seq_ops,
- sizeof(struct neigh_seq_state));
+ return seq_open_net(inode, file, &arp_seq_ops,
+ sizeof(struct neigh_seq_state));
}
static const struct file_operations arp_seq_fops = {
@@ -1368,7 +1381,7 @@ static const struct file_operations arp_seq_fops = {
.open = arp_seq_open,
.read = seq_read,
.llseek = seq_lseek,
- .release = seq_release_private,
+ .release = seq_release_net,
};
static int __init arp_proc_init(void)
diff --git a/net/ipv4/cipso_ipv4.c b/net/ipv4/cipso_ipv4.c
index f18e88bc86ec..d4dc4eb48d95 100644
--- a/net/ipv4/cipso_ipv4.c
+++ b/net/ipv4/cipso_ipv4.c
@@ -63,7 +63,7 @@ struct cipso_v4_domhsh_entry {
* probably be turned into a hash table or something similar so we
* can do quick lookups. */
static DEFINE_SPINLOCK(cipso_v4_doi_list_lock);
-static struct list_head cipso_v4_doi_list = LIST_HEAD_INIT(cipso_v4_doi_list);
+static LIST_HEAD(cipso_v4_doi_list);
/* Label mapping cache */
int cipso_v4_cache_enabled = 1;
diff --git a/net/ipv4/datagram.c b/net/ipv4/datagram.c
index 0301dd468cf4..0c0c73f368ce 100644
--- a/net/ipv4/datagram.c
+++ b/net/ipv4/datagram.c
@@ -40,7 +40,7 @@ int ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
oif = sk->sk_bound_dev_if;
saddr = inet->saddr;
- if (MULTICAST(usin->sin_addr.s_addr)) {
+ if (ipv4_is_multicast(usin->sin_addr.s_addr)) {
if (!oif)
oif = inet->mc_index;
if (!saddr)
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index b42f74617bac..21f71bf912d5 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -62,6 +62,7 @@
#include <net/route.h>
#include <net/ip_fib.h>
#include <net/rtnetlink.h>
+#include <net/net_namespace.h>
struct ipv4_devconf ipv4_devconf = {
.data = {
@@ -82,7 +83,8 @@ static struct ipv4_devconf ipv4_devconf_dflt = {
},
};
-#define IPV4_DEVCONF_DFLT(attr) IPV4_DEVCONF(ipv4_devconf_dflt, attr)
+#define IPV4_DEVCONF_DFLT(net, attr) \
+ IPV4_DEVCONF((*net->ipv4.devconf_dflt), attr)
static const struct nla_policy ifa_ipv4_policy[IFA_MAX+1] = {
[IFA_LOCAL] = { .type = NLA_U32 },
@@ -98,9 +100,15 @@ static BLOCKING_NOTIFIER_HEAD(inetaddr_chain);
static void inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap,
int destroy);
#ifdef CONFIG_SYSCTL
-static void devinet_sysctl_register(struct in_device *in_dev,
- struct ipv4_devconf *p);
-static void devinet_sysctl_unregister(struct ipv4_devconf *p);
+static void devinet_sysctl_register(struct in_device *idev);
+static void devinet_sysctl_unregister(struct in_device *idev);
+#else
+static inline void devinet_sysctl_register(struct in_device *idev)
+{
+}
+static inline void devinet_sysctl_unregister(struct in_device *idev)
+{
+}
#endif
/* Locks all the inet devices. */
@@ -157,24 +165,18 @@ static struct in_device *inetdev_init(struct net_device *dev)
if (!in_dev)
goto out;
INIT_RCU_HEAD(&in_dev->rcu_head);
- memcpy(&in_dev->cnf, &ipv4_devconf_dflt, sizeof(in_dev->cnf));
+ memcpy(&in_dev->cnf, dev->nd_net->ipv4.devconf_dflt,
+ sizeof(in_dev->cnf));
in_dev->cnf.sysctl = NULL;
in_dev->dev = dev;
if ((in_dev->arp_parms = neigh_parms_alloc(dev, &arp_tbl)) == NULL)
goto out_kfree;
/* Reference in_dev->dev */
dev_hold(dev);
-#ifdef CONFIG_SYSCTL
- neigh_sysctl_register(dev, in_dev->arp_parms, NET_IPV4,
- NET_IPV4_NEIGH, "ipv4", NULL, NULL);
-#endif
-
/* Account for reference dev->ip_ptr (below) */
in_dev_hold(in_dev);
-#ifdef CONFIG_SYSCTL
- devinet_sysctl_register(in_dev, &in_dev->cnf);
-#endif
+ devinet_sysctl_register(in_dev);
ip_mc_init_dev(in_dev);
if (dev->flags & IFF_UP)
ip_mc_up(in_dev);
@@ -213,15 +215,9 @@ static void inetdev_destroy(struct in_device *in_dev)
inet_free_ifa(ifa);
}
-#ifdef CONFIG_SYSCTL
- devinet_sysctl_unregister(&in_dev->cnf);
-#endif
-
dev->ip_ptr = NULL;
-#ifdef CONFIG_SYSCTL
- neigh_sysctl_unregister(in_dev->arp_parms);
-#endif
+ devinet_sysctl_unregister(in_dev);
neigh_parms_release(&arp_tbl, in_dev->arp_parms);
arp_ifdown(dev);
@@ -408,17 +404,17 @@ static int inet_set_ifa(struct net_device *dev, struct in_ifaddr *ifa)
in_dev_hold(in_dev);
ifa->ifa_dev = in_dev;
}
- if (LOOPBACK(ifa->ifa_local))
+ if (ipv4_is_loopback(ifa->ifa_local))
ifa->ifa_scope = RT_SCOPE_HOST;
return inet_insert_ifa(ifa);
}
-struct in_device *inetdev_by_index(int ifindex)
+struct in_device *inetdev_by_index(struct net *net, int ifindex)
{
struct net_device *dev;
struct in_device *in_dev = NULL;
read_lock(&dev_base_lock);
- dev = __dev_get_by_index(&init_net, ifindex);
+ dev = __dev_get_by_index(net, ifindex);
if (dev)
in_dev = in_dev_get(dev);
read_unlock(&dev_base_lock);
@@ -441,6 +437,7 @@ struct in_ifaddr *inet_ifa_byprefix(struct in_device *in_dev, __be32 prefix,
static int inet_rtm_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
{
+ struct net *net = skb->sk->sk_net;
struct nlattr *tb[IFA_MAX+1];
struct in_device *in_dev;
struct ifaddrmsg *ifm;
@@ -449,12 +446,15 @@ static int inet_rtm_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg
ASSERT_RTNL();
+ if (net != &init_net)
+ return -EINVAL;
+
err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFA_MAX, ifa_ipv4_policy);
if (err < 0)
goto errout;
ifm = nlmsg_data(nlh);
- in_dev = inetdev_by_index(ifm->ifa_index);
+ in_dev = inetdev_by_index(net, ifm->ifa_index);
if (in_dev == NULL) {
err = -ENODEV;
goto errout;
@@ -560,10 +560,14 @@ errout:
static int inet_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
{
+ struct net *net = skb->sk->sk_net;
struct in_ifaddr *ifa;
ASSERT_RTNL();
+ if (net != &init_net)
+ return -EINVAL;
+
ifa = rtm_to_ifaddr(nlh);
if (IS_ERR(ifa))
return PTR_ERR(ifa);
@@ -579,7 +583,7 @@ static __inline__ int inet_abc_len(__be32 addr)
{
int rc = -1; /* Something else, probably a multicast. */
- if (ZERONET(addr))
+ if (ipv4_is_zeronet(addr))
rc = 0;
else {
__u32 haddr = ntohl(addr);
@@ -964,28 +968,25 @@ static __be32 confirm_addr_indev(struct in_device *in_dev, __be32 dst,
/*
* Confirm that local IP address exists using wildcards:
- * - dev: only on this interface, 0=any interface
+ * - in_dev: only on this interface, 0=any interface
* - dst: only in the same subnet as dst, 0=any dst
* - local: address, 0=autoselect the local address
* - scope: maximum allowed scope value for the local address
*/
-__be32 inet_confirm_addr(const struct net_device *dev, __be32 dst, __be32 local, int scope)
+__be32 inet_confirm_addr(struct in_device *in_dev,
+ __be32 dst, __be32 local, int scope)
{
__be32 addr = 0;
- struct in_device *in_dev;
-
- if (dev) {
- rcu_read_lock();
- if ((in_dev = __in_dev_get_rcu(dev)))
- addr = confirm_addr_indev(in_dev, dst, local, scope);
- rcu_read_unlock();
+ struct net_device *dev;
+ struct net *net;
- return addr;
- }
+ if (scope != RT_SCOPE_LINK)
+ return confirm_addr_indev(in_dev, dst, local, scope);
+ net = in_dev->dev->nd_net;
read_lock(&dev_base_lock);
rcu_read_lock();
- for_each_netdev(&init_net, dev) {
+ for_each_netdev(net, dev) {
if ((in_dev = __in_dev_get_rcu(dev))) {
addr = confirm_addr_indev(in_dev, dst, local, scope);
if (addr)
@@ -1106,13 +1107,8 @@ static int inetdev_event(struct notifier_block *this, unsigned long event,
*/
inetdev_changename(dev, in_dev);
-#ifdef CONFIG_SYSCTL
- devinet_sysctl_unregister(&in_dev->cnf);
- neigh_sysctl_unregister(in_dev->arp_parms);
- neigh_sysctl_register(dev, in_dev->arp_parms, NET_IPV4,
- NET_IPV4_NEIGH, "ipv4", NULL, NULL);
- devinet_sysctl_register(in_dev, &in_dev->cnf);
-#endif
+ devinet_sysctl_unregister(in_dev);
+ devinet_sysctl_register(in_dev);
break;
}
out:
@@ -1174,12 +1170,16 @@ nla_put_failure:
static int inet_dump_ifaddr(struct sk_buff *skb, struct netlink_callback *cb)
{
+ struct net *net = skb->sk->sk_net;
int idx, ip_idx;
struct net_device *dev;
struct in_device *in_dev;
struct in_ifaddr *ifa;
int s_ip_idx, s_idx = cb->args[0];
+ if (net != &init_net)
+ return 0;
+
s_ip_idx = ip_idx = cb->args[1];
idx = 0;
for_each_netdev(&init_net, dev) {
@@ -1228,28 +1228,50 @@ static void rtmsg_ifa(int event, struct in_ifaddr* ifa, struct nlmsghdr *nlh,
kfree_skb(skb);
goto errout;
}
- err = rtnl_notify(skb, pid, RTNLGRP_IPV4_IFADDR, nlh, GFP_KERNEL);
+ err = rtnl_notify(skb, &init_net, pid, RTNLGRP_IPV4_IFADDR, nlh, GFP_KERNEL);
errout:
if (err < 0)
- rtnl_set_sk_err(RTNLGRP_IPV4_IFADDR, err);
+ rtnl_set_sk_err(&init_net, RTNLGRP_IPV4_IFADDR, err);
}
#ifdef CONFIG_SYSCTL
-static void devinet_copy_dflt_conf(int i)
+static void devinet_copy_dflt_conf(struct net *net, int i)
{
struct net_device *dev;
read_lock(&dev_base_lock);
- for_each_netdev(&init_net, dev) {
+ for_each_netdev(net, dev) {
struct in_device *in_dev;
rcu_read_lock();
in_dev = __in_dev_get_rcu(dev);
if (in_dev && !test_bit(i, in_dev->cnf.state))
- in_dev->cnf.data[i] = ipv4_devconf_dflt.data[i];
+ in_dev->cnf.data[i] = net->ipv4.devconf_dflt->data[i];
+ rcu_read_unlock();
+ }
+ read_unlock(&dev_base_lock);
+}
+
+static void inet_forward_change(struct net *net)
+{
+ struct net_device *dev;
+ int on = IPV4_DEVCONF_ALL(net, FORWARDING);
+
+ IPV4_DEVCONF_ALL(net, ACCEPT_REDIRECTS) = !on;
+ IPV4_DEVCONF_DFLT(net, FORWARDING) = on;
+
+ read_lock(&dev_base_lock);
+ for_each_netdev(net, dev) {
+ struct in_device *in_dev;
+ rcu_read_lock();
+ in_dev = __in_dev_get_rcu(dev);
+ if (in_dev)
+ IN_DEV_CONF_SET(in_dev, FORWARDING, on);
rcu_read_unlock();
}
read_unlock(&dev_base_lock);
+
+ rt_cache_flush(0);
}
static int devinet_conf_proc(ctl_table *ctl, int write,
@@ -1260,12 +1282,13 @@ static int devinet_conf_proc(ctl_table *ctl, int write,
if (write) {
struct ipv4_devconf *cnf = ctl->extra1;
+ struct net *net = ctl->extra2;
int i = (int *)ctl->data - cnf->data;
set_bit(i, cnf->state);
- if (cnf == &ipv4_devconf_dflt)
- devinet_copy_dflt_conf(i);
+ if (cnf == net->ipv4.devconf_dflt)
+ devinet_copy_dflt_conf(net, i);
}
return ret;
@@ -1276,6 +1299,7 @@ static int devinet_conf_sysctl(ctl_table *table, int __user *name, int nlen,
void __user *newval, size_t newlen)
{
struct ipv4_devconf *cnf;
+ struct net *net;
int *valp = table->data;
int new;
int i;
@@ -1311,38 +1335,17 @@ static int devinet_conf_sysctl(ctl_table *table, int __user *name, int nlen,
*valp = new;
cnf = table->extra1;
+ net = table->extra2;
i = (int *)table->data - cnf->data;
set_bit(i, cnf->state);
- if (cnf == &ipv4_devconf_dflt)
- devinet_copy_dflt_conf(i);
+ if (cnf == net->ipv4.devconf_dflt)
+ devinet_copy_dflt_conf(net, i);
return 1;
}
-void inet_forward_change(void)
-{
- struct net_device *dev;
- int on = IPV4_DEVCONF_ALL(FORWARDING);
-
- IPV4_DEVCONF_ALL(ACCEPT_REDIRECTS) = !on;
- IPV4_DEVCONF_DFLT(FORWARDING) = on;
-
- read_lock(&dev_base_lock);
- for_each_netdev(&init_net, dev) {
- struct in_device *in_dev;
- rcu_read_lock();
- in_dev = __in_dev_get_rcu(dev);
- if (in_dev)
- IN_DEV_CONF_SET(in_dev, FORWARDING, on);
- rcu_read_unlock();
- }
- read_unlock(&dev_base_lock);
-
- rt_cache_flush(0);
-}
-
static int devinet_sysctl_forward(ctl_table *ctl, int write,
struct file* filp, void __user *buffer,
size_t *lenp, loff_t *ppos)
@@ -1352,9 +1355,11 @@ static int devinet_sysctl_forward(ctl_table *ctl, int write,
int ret = proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
if (write && *valp != val) {
- if (valp == &IPV4_DEVCONF_ALL(FORWARDING))
- inet_forward_change();
- else if (valp != &IPV4_DEVCONF_DFLT(FORWARDING))
+ struct net *net = ctl->extra2;
+
+ if (valp == &IPV4_DEVCONF_ALL(net, FORWARDING))
+ inet_forward_change(net);
+ else if (valp != &IPV4_DEVCONF_DFLT(net, FORWARDING))
rt_cache_flush(0);
}
@@ -1419,11 +1424,8 @@ int ipv4_doint_and_flush_strategy(ctl_table *table, int __user *name, int nlen,
static struct devinet_sysctl_table {
struct ctl_table_header *sysctl_header;
- ctl_table devinet_vars[__NET_IPV4_CONF_MAX];
- ctl_table devinet_dev[2];
- ctl_table devinet_conf_dir[2];
- ctl_table devinet_proto_dir[2];
- ctl_table devinet_root_dir[2];
+ struct ctl_table devinet_vars[__NET_IPV4_CONF_MAX];
+ char *dev_name;
} devinet_sysctl = {
.devinet_vars = {
DEVINET_SYSCTL_COMPLEX_ENTRY(FORWARDING, "forwarding",
@@ -1455,62 +1457,32 @@ static struct devinet_sysctl_table {
DEVINET_SYSCTL_FLUSHING_ENTRY(PROMOTE_SECONDARIES,
"promote_secondaries"),
},
- .devinet_dev = {
- {
- .ctl_name = NET_PROTO_CONF_ALL,
- .procname = "all",
- .mode = 0555,
- .child = devinet_sysctl.devinet_vars,
- },
- },
- .devinet_conf_dir = {
- {
- .ctl_name = NET_IPV4_CONF,
- .procname = "conf",
- .mode = 0555,
- .child = devinet_sysctl.devinet_dev,
- },
- },
- .devinet_proto_dir = {
- {
- .ctl_name = NET_IPV4,
- .procname = "ipv4",
- .mode = 0555,
- .child = devinet_sysctl.devinet_conf_dir,
- },
- },
- .devinet_root_dir = {
- {
- .ctl_name = CTL_NET,
- .procname = "net",
- .mode = 0555,
- .child = devinet_sysctl.devinet_proto_dir,
- },
- },
};
-static void devinet_sysctl_register(struct in_device *in_dev,
- struct ipv4_devconf *p)
+static int __devinet_sysctl_register(struct net *net, char *dev_name,
+ int ctl_name, struct ipv4_devconf *p)
{
int i;
- struct net_device *dev = in_dev ? in_dev->dev : NULL;
- struct devinet_sysctl_table *t = kmemdup(&devinet_sysctl, sizeof(*t),
- GFP_KERNEL);
- char *dev_name = NULL;
+ struct devinet_sysctl_table *t;
+#define DEVINET_CTL_PATH_DEV 3
+
+ struct ctl_path devinet_ctl_path[] = {
+ { .procname = "net", .ctl_name = CTL_NET, },
+ { .procname = "ipv4", .ctl_name = NET_IPV4, },
+ { .procname = "conf", .ctl_name = NET_IPV4_CONF, },
+ { /* to be set */ },
+ { },
+ };
+
+ t = kmemdup(&devinet_sysctl, sizeof(*t), GFP_KERNEL);
if (!t)
- return;
+ goto out;
+
for (i = 0; i < ARRAY_SIZE(t->devinet_vars) - 1; i++) {
t->devinet_vars[i].data += (char *)p - (char *)&ipv4_devconf;
t->devinet_vars[i].extra1 = p;
- }
-
- if (dev) {
- dev_name = dev->name;
- t->devinet_dev[0].ctl_name = dev->ifindex;
- } else {
- dev_name = "default";
- t->devinet_dev[0].ctl_name = NET_PROTO_CONF_DEFAULT;
+ t->devinet_vars[i].extra2 = net;
}
/*
@@ -1518,56 +1490,183 @@ static void devinet_sysctl_register(struct in_device *in_dev,
* by sysctl and we wouldn't want anyone to change it under our feet
* (see SIOCSIFNAME).
*/
- dev_name = kstrdup(dev_name, GFP_KERNEL);
- if (!dev_name)
- goto free;
+ t->dev_name = kstrdup(dev_name, GFP_KERNEL);
+ if (!t->dev_name)
+ goto free;
- t->devinet_dev[0].procname = dev_name;
- t->devinet_dev[0].child = t->devinet_vars;
- t->devinet_conf_dir[0].child = t->devinet_dev;
- t->devinet_proto_dir[0].child = t->devinet_conf_dir;
- t->devinet_root_dir[0].child = t->devinet_proto_dir;
+ devinet_ctl_path[DEVINET_CTL_PATH_DEV].procname = t->dev_name;
+ devinet_ctl_path[DEVINET_CTL_PATH_DEV].ctl_name = ctl_name;
- t->sysctl_header = register_sysctl_table(t->devinet_root_dir);
+ t->sysctl_header = register_net_sysctl_table(net, devinet_ctl_path,
+ t->devinet_vars);
if (!t->sysctl_header)
- goto free_procname;
+ goto free_procname;
p->sysctl = t;
- return;
+ return 0;
- /* error path */
- free_procname:
- kfree(dev_name);
- free:
+free_procname:
+ kfree(t->dev_name);
+free:
kfree(t);
- return;
+out:
+ return -ENOBUFS;
}
-static void devinet_sysctl_unregister(struct ipv4_devconf *p)
+static void __devinet_sysctl_unregister(struct ipv4_devconf *cnf)
+{
+ struct devinet_sysctl_table *t = cnf->sysctl;
+
+ if (t == NULL)
+ return;
+
+ cnf->sysctl = NULL;
+ unregister_sysctl_table(t->sysctl_header);
+ kfree(t->dev_name);
+ kfree(t);
+}
+
+static void devinet_sysctl_register(struct in_device *idev)
+{
+ neigh_sysctl_register(idev->dev, idev->arp_parms, NET_IPV4,
+ NET_IPV4_NEIGH, "ipv4", NULL, NULL);
+ __devinet_sysctl_register(idev->dev->nd_net, idev->dev->name,
+ idev->dev->ifindex, &idev->cnf);
+}
+
+static void devinet_sysctl_unregister(struct in_device *idev)
+{
+ __devinet_sysctl_unregister(&idev->cnf);
+ neigh_sysctl_unregister(idev->arp_parms);
+}
+
+static struct ctl_table ctl_forward_entry[] = {
+ {
+ .ctl_name = NET_IPV4_FORWARD,
+ .procname = "ip_forward",
+ .data = &ipv4_devconf.data[
+ NET_IPV4_CONF_FORWARDING - 1],
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = devinet_sysctl_forward,
+ .strategy = devinet_conf_sysctl,
+ .extra1 = &ipv4_devconf,
+ .extra2 = &init_net,
+ },
+ { },
+};
+
+static __net_initdata struct ctl_path net_ipv4_path[] = {
+ { .procname = "net", .ctl_name = CTL_NET, },
+ { .procname = "ipv4", .ctl_name = NET_IPV4, },
+ { },
+};
+#endif
+
+static __net_init int devinet_init_net(struct net *net)
{
- if (p->sysctl) {
- struct devinet_sysctl_table *t = p->sysctl;
- p->sysctl = NULL;
- unregister_sysctl_table(t->sysctl_header);
- kfree(t->devinet_dev[0].procname);
- kfree(t);
+ int err;
+ struct ipv4_devconf *all, *dflt;
+#ifdef CONFIG_SYSCTL
+ struct ctl_table *tbl = ctl_forward_entry;
+ struct ctl_table_header *forw_hdr;
+#endif
+
+ err = -ENOMEM;
+ all = &ipv4_devconf;
+ dflt = &ipv4_devconf_dflt;
+
+ if (net != &init_net) {
+ all = kmemdup(all, sizeof(ipv4_devconf), GFP_KERNEL);
+ if (all == NULL)
+ goto err_alloc_all;
+
+ dflt = kmemdup(dflt, sizeof(ipv4_devconf_dflt), GFP_KERNEL);
+ if (dflt == NULL)
+ goto err_alloc_dflt;
+
+#ifdef CONFIG_SYSCTL
+ tbl = kmemdup(tbl, sizeof(ctl_forward_entry), GFP_KERNEL);
+ if (tbl == NULL)
+ goto err_alloc_ctl;
+
+ tbl[0].data = &all->data[NET_IPV4_CONF_FORWARDING - 1];
+ tbl[0].extra1 = all;
+ tbl[0].extra2 = net;
+#endif
}
+
+#ifdef CONFIG_SYSCTL
+ err = __devinet_sysctl_register(net, "all",
+ NET_PROTO_CONF_ALL, all);
+ if (err < 0)
+ goto err_reg_all;
+
+ err = __devinet_sysctl_register(net, "default",
+ NET_PROTO_CONF_DEFAULT, dflt);
+ if (err < 0)
+ goto err_reg_dflt;
+
+ err = -ENOMEM;
+ forw_hdr = register_net_sysctl_table(net, net_ipv4_path, tbl);
+ if (forw_hdr == NULL)
+ goto err_reg_ctl;
+ net->ipv4.forw_hdr = forw_hdr;
+#endif
+
+ net->ipv4.devconf_all = all;
+ net->ipv4.devconf_dflt = dflt;
+ return 0;
+
+#ifdef CONFIG_SYSCTL
+err_reg_ctl:
+ __devinet_sysctl_unregister(dflt);
+err_reg_dflt:
+ __devinet_sysctl_unregister(all);
+err_reg_all:
+ if (tbl != ctl_forward_entry)
+ kfree(tbl);
+err_alloc_ctl:
+#endif
+ if (dflt != &ipv4_devconf_dflt)
+ kfree(dflt);
+err_alloc_dflt:
+ if (all != &ipv4_devconf)
+ kfree(all);
+err_alloc_all:
+ return err;
}
+
+static __net_exit void devinet_exit_net(struct net *net)
+{
+#ifdef CONFIG_SYSCTL
+ struct ctl_table *tbl;
+
+ tbl = net->ipv4.forw_hdr->ctl_table_arg;
+ unregister_net_sysctl_table(net->ipv4.forw_hdr);
+ __devinet_sysctl_unregister(net->ipv4.devconf_dflt);
+ __devinet_sysctl_unregister(net->ipv4.devconf_all);
+ kfree(tbl);
#endif
+ kfree(net->ipv4.devconf_dflt);
+ kfree(net->ipv4.devconf_all);
+}
+
+static __net_initdata struct pernet_operations devinet_ops = {
+ .init = devinet_init_net,
+ .exit = devinet_exit_net,
+};
void __init devinet_init(void)
{
+ register_pernet_subsys(&devinet_ops);
+
register_gifconf(PF_INET, inet_gifconf);
register_netdevice_notifier(&ip_netdev_notifier);
rtnl_register(PF_INET, RTM_NEWADDR, inet_rtm_newaddr, NULL);
rtnl_register(PF_INET, RTM_DELADDR, inet_rtm_deladdr, NULL);
rtnl_register(PF_INET, RTM_GETADDR, NULL, inet_dump_ifaddr);
-#ifdef CONFIG_SYSCTL
- devinet_sysctl.sysctl_header =
- register_sysctl_table(devinet_sysctl.devinet_root_dir);
- devinet_sysctl_register(NULL, &ipv4_devconf_dflt);
-#endif
}
EXPORT_SYMBOL(in_dev_finish_destroy);
diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c
index 1738113268bc..28ea5c77ca23 100644
--- a/net/ipv4/esp4.c
+++ b/net/ipv4/esp4.c
@@ -163,7 +163,7 @@ static int esp_input(struct xfrm_state *x, struct sk_buff *skb)
u8 nexthdr[2];
struct scatterlist *sg;
int padlen;
- int err;
+ int err = -EINVAL;
if (!pskb_may_pull(skb, sizeof(*esph)))
goto out;
@@ -171,28 +171,31 @@ static int esp_input(struct xfrm_state *x, struct sk_buff *skb)
if (elen <= 0 || (elen & (blksize-1)))
goto out;
+ if ((err = skb_cow_data(skb, 0, &trailer)) < 0)
+ goto out;
+ nfrags = err;
+
+ skb->ip_summed = CHECKSUM_NONE;
+
+ spin_lock(&x->lock);
+
/* If integrity check is required, do this. */
if (esp->auth.icv_full_len) {
u8 sum[alen];
err = esp_mac_digest(esp, skb, 0, skb->len - alen);
if (err)
- goto out;
+ goto unlock;
if (skb_copy_bits(skb, skb->len - alen, sum, alen))
BUG();
if (unlikely(memcmp(esp->auth.work_icv, sum, alen))) {
- x->stats.integrity_failed++;
- goto out;
+ err = -EBADMSG;
+ goto unlock;
}
}
- if ((nfrags = skb_cow_data(skb, 0, &trailer)) < 0)
- goto out;
-
- skb->ip_summed = CHECKSUM_NONE;
-
esph = (struct ip_esp_hdr *)skb->data;
/* Get ivec. This can be wrong, check against another impls. */
@@ -202,9 +205,10 @@ static int esp_input(struct xfrm_state *x, struct sk_buff *skb)
sg = &esp->sgbuf[0];
if (unlikely(nfrags > ESP_NUM_FAST_SG)) {
+ err = -ENOMEM;
sg = kmalloc(sizeof(struct scatterlist)*nfrags, GFP_ATOMIC);
if (!sg)
- goto out;
+ goto unlock;
}
sg_init_table(sg, nfrags);
skb_to_sgvec(skb, sg,
@@ -213,12 +217,17 @@ static int esp_input(struct xfrm_state *x, struct sk_buff *skb)
err = crypto_blkcipher_decrypt(&desc, sg, sg, elen);
if (unlikely(sg != &esp->sgbuf[0]))
kfree(sg);
+
+unlock:
+ spin_unlock(&x->lock);
+
if (unlikely(err))
- return err;
+ goto out;
if (skb_copy_bits(skb, skb->len-alen-2, nexthdr, 2))
BUG();
+ err = -EINVAL;
padlen = nexthdr[0];
if (padlen+2 >= elen)
goto out;
@@ -276,7 +285,7 @@ static int esp_input(struct xfrm_state *x, struct sk_buff *skb)
return nexthdr[1];
out:
- return -EINVAL;
+ return err;
}
static u32 esp4_get_mtu(struct xfrm_state *x, int mtu)
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 97abf934d185..d28261826bc2 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -47,59 +47,65 @@
#include <net/ip_fib.h>
#include <net/rtnetlink.h>
-#define FFprint(a...) printk(KERN_DEBUG a)
+#ifndef CONFIG_IP_MULTIPLE_TABLES
-static struct sock *fibnl;
+static int __net_init fib4_rules_init(struct net *net)
+{
+ struct fib_table *local_table, *main_table;
-#ifndef CONFIG_IP_MULTIPLE_TABLES
+ local_table = fib_hash_table(RT_TABLE_LOCAL);
+ if (local_table == NULL)
+ return -ENOMEM;
-struct fib_table *ip_fib_local_table;
-struct fib_table *ip_fib_main_table;
+ main_table = fib_hash_table(RT_TABLE_MAIN);
+ if (main_table == NULL)
+ goto fail;
-#define FIB_TABLE_HASHSZ 1
-static struct hlist_head fib_table_hash[FIB_TABLE_HASHSZ];
+ hlist_add_head_rcu(&local_table->tb_hlist,
+ &net->ipv4.fib_table_hash[TABLE_LOCAL_INDEX]);
+ hlist_add_head_rcu(&main_table->tb_hlist,
+ &net->ipv4.fib_table_hash[TABLE_MAIN_INDEX]);
+ return 0;
-static void __init fib4_rules_init(void)
-{
- ip_fib_local_table = fib_hash_init(RT_TABLE_LOCAL);
- hlist_add_head_rcu(&ip_fib_local_table->tb_hlist, &fib_table_hash[0]);
- ip_fib_main_table = fib_hash_init(RT_TABLE_MAIN);
- hlist_add_head_rcu(&ip_fib_main_table->tb_hlist, &fib_table_hash[0]);
+fail:
+ kfree(local_table);
+ return -ENOMEM;
}
#else
-#define FIB_TABLE_HASHSZ 256
-static struct hlist_head fib_table_hash[FIB_TABLE_HASHSZ];
-
-struct fib_table *fib_new_table(u32 id)
+struct fib_table *fib_new_table(struct net *net, u32 id)
{
struct fib_table *tb;
unsigned int h;
if (id == 0)
id = RT_TABLE_MAIN;
- tb = fib_get_table(id);
+ tb = fib_get_table(net, id);
if (tb)
return tb;
- tb = fib_hash_init(id);
+
+ tb = fib_hash_table(id);
if (!tb)
return NULL;
h = id & (FIB_TABLE_HASHSZ - 1);
- hlist_add_head_rcu(&tb->tb_hlist, &fib_table_hash[h]);
+ hlist_add_head_rcu(&tb->tb_hlist, &net->ipv4.fib_table_hash[h]);
return tb;
}
-struct fib_table *fib_get_table(u32 id)
+struct fib_table *fib_get_table(struct net *net, u32 id)
{
struct fib_table *tb;
struct hlist_node *node;
+ struct hlist_head *head;
unsigned int h;
if (id == 0)
id = RT_TABLE_MAIN;
h = id & (FIB_TABLE_HASHSZ - 1);
+
rcu_read_lock();
- hlist_for_each_entry_rcu(tb, node, &fib_table_hash[h], tb_hlist) {
+ head = &net->ipv4.fib_table_hash[h];
+ hlist_for_each_entry_rcu(tb, node, head, tb_hlist) {
if (tb->tb_id == id) {
rcu_read_unlock();
return tb;
@@ -110,15 +116,32 @@ struct fib_table *fib_get_table(u32 id)
}
#endif /* CONFIG_IP_MULTIPLE_TABLES */
-static void fib_flush(void)
+void fib_select_default(struct net *net,
+ const struct flowi *flp, struct fib_result *res)
+{
+ struct fib_table *tb;
+ int table = RT_TABLE_MAIN;
+#ifdef CONFIG_IP_MULTIPLE_TABLES
+ if (res->r == NULL || res->r->action != FR_ACT_TO_TBL)
+ return;
+ table = res->r->table;
+#endif
+ tb = fib_get_table(net, table);
+ if (FIB_RES_GW(*res) && FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
+ tb->tb_select_default(tb, flp, res);
+}
+
+static void fib_flush(struct net *net)
{
int flushed = 0;
struct fib_table *tb;
struct hlist_node *node;
+ struct hlist_head *head;
unsigned int h;
for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
- hlist_for_each_entry(tb, node, &fib_table_hash[h], tb_hlist)
+ head = &net->ipv4.fib_table_hash[h];
+ hlist_for_each_entry(tb, node, head, tb_hlist)
flushed += tb->tb_flush(tb);
}
@@ -130,7 +153,7 @@ static void fib_flush(void)
* Find the first device with a given source address.
*/
-struct net_device * ip_dev_find(__be32 addr)
+struct net_device * ip_dev_find(struct net *net, __be32 addr)
{
struct flowi fl = { .nl_u = { .ip4_u = { .daddr = addr } } };
struct fib_result res;
@@ -141,7 +164,7 @@ struct net_device * ip_dev_find(__be32 addr)
res.r = NULL;
#endif
- local_table = fib_get_table(RT_TABLE_LOCAL);
+ local_table = fib_get_table(net, RT_TABLE_LOCAL);
if (!local_table || local_table->tb_lookup(local_table, &fl, &res))
return NULL;
if (res.type != RTN_LOCAL)
@@ -155,33 +178,51 @@ out:
return dev;
}
-unsigned inet_addr_type(__be32 addr)
+/*
+ * Find address type as if only "dev" was present in the system. If
+ * on_dev is NULL then all interfaces are taken into consideration.
+ */
+static inline unsigned __inet_dev_addr_type(struct net *net,
+ const struct net_device *dev,
+ __be32 addr)
{
struct flowi fl = { .nl_u = { .ip4_u = { .daddr = addr } } };
struct fib_result res;
unsigned ret = RTN_BROADCAST;
struct fib_table *local_table;
- if (ZERONET(addr) || BADCLASS(addr))
+ if (ipv4_is_zeronet(addr) || ipv4_is_lbcast(addr))
return RTN_BROADCAST;
- if (MULTICAST(addr))
+ if (ipv4_is_multicast(addr))
return RTN_MULTICAST;
#ifdef CONFIG_IP_MULTIPLE_TABLES
res.r = NULL;
#endif
- local_table = fib_get_table(RT_TABLE_LOCAL);
+ local_table = fib_get_table(net, RT_TABLE_LOCAL);
if (local_table) {
ret = RTN_UNICAST;
if (!local_table->tb_lookup(local_table, &fl, &res)) {
- ret = res.type;
+ if (!dev || dev == res.fi->fib_dev)
+ ret = res.type;
fib_res_put(&res);
}
}
return ret;
}
+unsigned int inet_addr_type(struct net *net, __be32 addr)
+{
+ return __inet_dev_addr_type(net, NULL, addr);
+}
+
+unsigned int inet_dev_addr_type(struct net *net, const struct net_device *dev,
+ __be32 addr)
+{
+ return __inet_dev_addr_type(net, dev, addr);
+}
+
/* Given (packet source, input interface) and optional (dst, oif, tos):
- (main) check, that source is valid i.e. not broadcast or our local
address.
@@ -202,6 +243,7 @@ int fib_validate_source(__be32 src, __be32 dst, u8 tos, int oif,
struct fib_result res;
int no_addr, rpf;
int ret;
+ struct net *net;
no_addr = rpf = 0;
rcu_read_lock();
@@ -215,7 +257,8 @@ int fib_validate_source(__be32 src, __be32 dst, u8 tos, int oif,
if (in_dev == NULL)
goto e_inval;
- if (fib_lookup(&fl, &res))
+ net = dev->nd_net;
+ if (fib_lookup(net, &fl, &res))
goto last_resort;
if (res.type != RTN_UNICAST)
goto e_inval_res;
@@ -239,7 +282,7 @@ int fib_validate_source(__be32 src, __be32 dst, u8 tos, int oif,
fl.oif = dev->ifindex;
ret = 0;
- if (fib_lookup(&fl, &res) == 0) {
+ if (fib_lookup(net, &fl, &res) == 0) {
if (res.type == RTN_UNICAST) {
*spec_dst = FIB_RES_PREFSRC(res);
ret = FIB_RES_NH(res).nh_scope >= RT_SCOPE_HOST;
@@ -278,13 +321,14 @@ static int put_rtax(struct nlattr *mx, int len, int type, u32 value)
return len + nla_total_size(4);
}
-static int rtentry_to_fib_config(int cmd, struct rtentry *rt,
+static int rtentry_to_fib_config(struct net *net, int cmd, struct rtentry *rt,
struct fib_config *cfg)
{
__be32 addr;
int plen;
memset(cfg, 0, sizeof(*cfg));
+ cfg->fc_nlinfo.nl_net = net;
if (rt->rt_dst.sa_family != AF_INET)
return -EAFNOSUPPORT;
@@ -345,7 +389,7 @@ static int rtentry_to_fib_config(int cmd, struct rtentry *rt,
colon = strchr(devname, ':');
if (colon)
*colon = 0;
- dev = __dev_get_by_name(&init_net, devname);
+ dev = __dev_get_by_name(net, devname);
if (!dev)
return -ENODEV;
cfg->fc_oif = dev->ifindex;
@@ -368,7 +412,7 @@ static int rtentry_to_fib_config(int cmd, struct rtentry *rt,
if (rt->rt_gateway.sa_family == AF_INET && addr) {
cfg->fc_gw = addr;
if (rt->rt_flags & RTF_GATEWAY &&
- inet_addr_type(addr) == RTN_UNICAST)
+ inet_addr_type(net, addr) == RTN_UNICAST)
cfg->fc_scope = RT_SCOPE_UNIVERSE;
}
@@ -409,7 +453,7 @@ static int rtentry_to_fib_config(int cmd, struct rtentry *rt,
* Handle IP routing ioctl calls. These are used to manipulate the routing tables
*/
-int ip_rt_ioctl(unsigned int cmd, void __user *arg)
+int ip_rt_ioctl(struct net *net, unsigned int cmd, void __user *arg)
{
struct fib_config cfg;
struct rtentry rt;
@@ -425,18 +469,18 @@ int ip_rt_ioctl(unsigned int cmd, void __user *arg)
return -EFAULT;
rtnl_lock();
- err = rtentry_to_fib_config(cmd, &rt, &cfg);
+ err = rtentry_to_fib_config(net, cmd, &rt, &cfg);
if (err == 0) {
struct fib_table *tb;
if (cmd == SIOCDELRT) {
- tb = fib_get_table(cfg.fc_table);
+ tb = fib_get_table(net, cfg.fc_table);
if (tb)
err = tb->tb_delete(tb, &cfg);
else
err = -ESRCH;
} else {
- tb = fib_new_table(cfg.fc_table);
+ tb = fib_new_table(net, cfg.fc_table);
if (tb)
err = tb->tb_insert(tb, &cfg);
else
@@ -466,8 +510,8 @@ const struct nla_policy rtm_ipv4_policy[RTA_MAX+1] = {
[RTA_FLOW] = { .type = NLA_U32 },
};
-static int rtm_to_fib_config(struct sk_buff *skb, struct nlmsghdr *nlh,
- struct fib_config *cfg)
+static int rtm_to_fib_config(struct net *net, struct sk_buff *skb,
+ struct nlmsghdr *nlh, struct fib_config *cfg)
{
struct nlattr *attr;
int err, remaining;
@@ -491,6 +535,7 @@ static int rtm_to_fib_config(struct sk_buff *skb, struct nlmsghdr *nlh,
cfg->fc_nlinfo.pid = NETLINK_CB(skb).pid;
cfg->fc_nlinfo.nlh = nlh;
+ cfg->fc_nlinfo.nl_net = net;
if (cfg->fc_type > RTN_MAX) {
err = -EINVAL;
@@ -538,15 +583,16 @@ errout:
static int inet_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
{
+ struct net *net = skb->sk->sk_net;
struct fib_config cfg;
struct fib_table *tb;
int err;
- err = rtm_to_fib_config(skb, nlh, &cfg);
+ err = rtm_to_fib_config(net, skb, nlh, &cfg);
if (err < 0)
goto errout;
- tb = fib_get_table(cfg.fc_table);
+ tb = fib_get_table(net, cfg.fc_table);
if (tb == NULL) {
err = -ESRCH;
goto errout;
@@ -559,15 +605,16 @@ errout:
static int inet_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
{
+ struct net *net = skb->sk->sk_net;
struct fib_config cfg;
struct fib_table *tb;
int err;
- err = rtm_to_fib_config(skb, nlh, &cfg);
+ err = rtm_to_fib_config(net, skb, nlh, &cfg);
if (err < 0)
goto errout;
- tb = fib_new_table(cfg.fc_table);
+ tb = fib_new_table(net, cfg.fc_table);
if (tb == NULL) {
err = -ENOBUFS;
goto errout;
@@ -580,10 +627,12 @@ errout:
static int inet_dump_fib(struct sk_buff *skb, struct netlink_callback *cb)
{
+ struct net *net = skb->sk->sk_net;
unsigned int h, s_h;
unsigned int e = 0, s_e;
struct fib_table *tb;
struct hlist_node *node;
+ struct hlist_head *head;
int dumped = 0;
if (nlmsg_len(cb->nlh) >= sizeof(struct rtmsg) &&
@@ -595,7 +644,8 @@ static int inet_dump_fib(struct sk_buff *skb, struct netlink_callback *cb)
for (h = s_h; h < FIB_TABLE_HASHSZ; h++, s_e = 0) {
e = 0;
- hlist_for_each_entry(tb, node, &fib_table_hash[h], tb_hlist) {
+ head = &net->ipv4.fib_table_hash[h];
+ hlist_for_each_entry(tb, node, head, tb_hlist) {
if (e < s_e)
goto next;
if (dumped)
@@ -624,6 +674,7 @@ out:
static void fib_magic(int cmd, int type, __be32 dst, int dst_len, struct in_ifaddr *ifa)
{
+ struct net *net = ifa->ifa_dev->dev->nd_net;
struct fib_table *tb;
struct fib_config cfg = {
.fc_protocol = RTPROT_KERNEL,
@@ -633,12 +684,15 @@ static void fib_magic(int cmd, int type, __be32 dst, int dst_len, struct in_ifad
.fc_prefsrc = ifa->ifa_local,
.fc_oif = ifa->ifa_dev->dev->ifindex,
.fc_nlflags = NLM_F_CREATE | NLM_F_APPEND,
+ .fc_nlinfo = {
+ .nl_net = net,
+ },
};
if (type == RTN_UNICAST)
- tb = fib_new_table(RT_TABLE_MAIN);
+ tb = fib_new_table(net, RT_TABLE_MAIN);
else
- tb = fib_new_table(RT_TABLE_LOCAL);
+ tb = fib_new_table(net, RT_TABLE_LOCAL);
if (tb == NULL)
return;
@@ -668,7 +722,7 @@ void fib_add_ifaddr(struct in_ifaddr *ifa)
if (ifa->ifa_flags&IFA_F_SECONDARY) {
prim = inet_ifa_byprefix(in_dev, prefix, mask);
if (prim == NULL) {
- printk(KERN_DEBUG "fib_add_ifaddr: bug: prim == NULL\n");
+ printk(KERN_WARNING "fib_add_ifaddr: bug: prim == NULL\n");
return;
}
}
@@ -682,7 +736,7 @@ void fib_add_ifaddr(struct in_ifaddr *ifa)
if (ifa->ifa_broadcast && ifa->ifa_broadcast != htonl(0xFFFFFFFF))
fib_magic(RTM_NEWROUTE, RTN_BROADCAST, ifa->ifa_broadcast, 32, prim);
- if (!ZERONET(prefix) && !(ifa->ifa_flags&IFA_F_SECONDARY) &&
+ if (!ipv4_is_zeronet(prefix) && !(ifa->ifa_flags&IFA_F_SECONDARY) &&
(prefix != addr || ifa->ifa_prefixlen < 32)) {
fib_magic(RTM_NEWROUTE, dev->flags&IFF_LOOPBACK ? RTN_LOCAL :
RTN_UNICAST, prefix, ifa->ifa_prefixlen, prim);
@@ -715,7 +769,7 @@ static void fib_del_ifaddr(struct in_ifaddr *ifa)
else {
prim = inet_ifa_byprefix(in_dev, any, ifa->ifa_mask);
if (prim == NULL) {
- printk(KERN_DEBUG "fib_del_ifaddr: bug: prim == NULL\n");
+ printk(KERN_WARNING "fib_del_ifaddr: bug: prim == NULL\n");
return;
}
}
@@ -747,7 +801,7 @@ static void fib_del_ifaddr(struct in_ifaddr *ifa)
fib_magic(RTM_DELROUTE, RTN_LOCAL, ifa->ifa_local, 32, prim);
/* Check, that this local address finally disappeared. */
- if (inet_addr_type(ifa->ifa_local) != RTN_LOCAL) {
+ if (inet_addr_type(dev->nd_net, ifa->ifa_local) != RTN_LOCAL) {
/* And the last, but not the least thing.
We must flush stray FIB entries.
@@ -755,7 +809,7 @@ static void fib_del_ifaddr(struct in_ifaddr *ifa)
for stray nexthop entries, then ignite fib_flush.
*/
if (fib_sync_down(ifa->ifa_local, NULL, 0))
- fib_flush();
+ fib_flush(dev->nd_net);
}
}
#undef LOCAL_OK
@@ -797,11 +851,13 @@ static void nl_fib_lookup(struct fib_result_nl *frn, struct fib_table *tb )
static void nl_fib_input(struct sk_buff *skb)
{
+ struct net *net;
struct fib_result_nl *frn;
struct nlmsghdr *nlh;
struct fib_table *tb;
u32 pid;
+ net = skb->sk->sk_net;
nlh = nlmsg_hdr(skb);
if (skb->len < NLMSG_SPACE(0) || skb->len < nlh->nlmsg_len ||
nlh->nlmsg_len < NLMSG_LENGTH(sizeof(*frn)))
@@ -813,26 +869,37 @@ static void nl_fib_input(struct sk_buff *skb)
nlh = nlmsg_hdr(skb);
frn = (struct fib_result_nl *) NLMSG_DATA(nlh);
- tb = fib_get_table(frn->tb_id_in);
+ tb = fib_get_table(net, frn->tb_id_in);
nl_fib_lookup(frn, tb);
pid = NETLINK_CB(skb).pid; /* pid of sending process */
NETLINK_CB(skb).pid = 0; /* from kernel */
NETLINK_CB(skb).dst_group = 0; /* unicast */
- netlink_unicast(fibnl, skb, pid, MSG_DONTWAIT);
+ netlink_unicast(net->ipv4.fibnl, skb, pid, MSG_DONTWAIT);
+}
+
+static int nl_fib_lookup_init(struct net *net)
+{
+ struct sock *sk;
+ sk = netlink_kernel_create(net, NETLINK_FIB_LOOKUP, 0,
+ nl_fib_input, NULL, THIS_MODULE);
+ if (sk == NULL)
+ return -EAFNOSUPPORT;
+ net->ipv4.fibnl = sk;
+ return 0;
}
-static void nl_fib_lookup_init(void)
+static void nl_fib_lookup_exit(struct net *net)
{
- fibnl = netlink_kernel_create(&init_net, NETLINK_FIB_LOOKUP, 0,
- nl_fib_input, NULL, THIS_MODULE);
+ netlink_kernel_release(net->ipv4.fibnl);
+ net->ipv4.fibnl = NULL;
}
static void fib_disable_ip(struct net_device *dev, int force)
{
if (fib_sync_down(0, dev, force))
- fib_flush();
+ fib_flush(dev->nd_net);
rt_cache_flush(0);
arp_ifdown(dev);
}
@@ -869,9 +936,6 @@ static int fib_netdev_event(struct notifier_block *this, unsigned long event, vo
struct net_device *dev = ptr;
struct in_device *in_dev = __in_dev_get_rtnl(dev);
- if (dev->nd_net != &init_net)
- return NOTIFY_DONE;
-
if (event == NETDEV_UNREGISTER) {
fib_disable_ip(dev, 2);
return NOTIFY_DONE;
@@ -909,23 +973,92 @@ static struct notifier_block fib_netdev_notifier = {
.notifier_call =fib_netdev_event,
};
-void __init ip_fib_init(void)
+static int __net_init ip_fib_net_init(struct net *net)
{
unsigned int i;
+ net->ipv4.fib_table_hash = kzalloc(
+ sizeof(struct hlist_head)*FIB_TABLE_HASHSZ, GFP_KERNEL);
+ if (net->ipv4.fib_table_hash == NULL)
+ return -ENOMEM;
+
for (i = 0; i < FIB_TABLE_HASHSZ; i++)
- INIT_HLIST_HEAD(&fib_table_hash[i]);
+ INIT_HLIST_HEAD(&net->ipv4.fib_table_hash[i]);
- fib4_rules_init();
+ return fib4_rules_init(net);
+}
- register_netdevice_notifier(&fib_netdev_notifier);
- register_inetaddr_notifier(&fib_inetaddr_notifier);
- nl_fib_lookup_init();
+static void __net_exit ip_fib_net_exit(struct net *net)
+{
+ unsigned int i;
+
+#ifdef CONFIG_IP_MULTIPLE_TABLES
+ fib4_rules_exit(net);
+#endif
+ for (i = 0; i < FIB_TABLE_HASHSZ; i++) {
+ struct fib_table *tb;
+ struct hlist_head *head;
+ struct hlist_node *node, *tmp;
+
+ head = &net->ipv4.fib_table_hash[i];
+ hlist_for_each_entry_safe(tb, node, tmp, head, tb_hlist) {
+ hlist_del(node);
+ tb->tb_flush(tb);
+ kfree(tb);
+ }
+ }
+ kfree(net->ipv4.fib_table_hash);
+}
+
+static int __net_init fib_net_init(struct net *net)
+{
+ int error;
+
+ error = ip_fib_net_init(net);
+ if (error < 0)
+ goto out;
+ error = nl_fib_lookup_init(net);
+ if (error < 0)
+ goto out_nlfl;
+ error = fib_proc_init(net);
+ if (error < 0)
+ goto out_proc;
+out:
+ return error;
+
+out_proc:
+ nl_fib_lookup_exit(net);
+out_nlfl:
+ ip_fib_net_exit(net);
+ goto out;
+}
+
+static void __net_exit fib_net_exit(struct net *net)
+{
+ fib_proc_exit(net);
+ nl_fib_lookup_exit(net);
+ ip_fib_net_exit(net);
+}
+
+static struct pernet_operations fib_net_ops = {
+ .init = fib_net_init,
+ .exit = fib_net_exit,
+};
+
+void __init ip_fib_init(void)
+{
rtnl_register(PF_INET, RTM_NEWROUTE, inet_rtm_newroute, NULL);
rtnl_register(PF_INET, RTM_DELROUTE, inet_rtm_delroute, NULL);
rtnl_register(PF_INET, RTM_GETROUTE, NULL, inet_dump_fib);
+
+ register_pernet_subsys(&fib_net_ops);
+ register_netdevice_notifier(&fib_netdev_notifier);
+ register_inetaddr_notifier(&fib_inetaddr_notifier);
+
+ fib_hash_init();
}
EXPORT_SYMBOL(inet_addr_type);
+EXPORT_SYMBOL(inet_dev_addr_type);
EXPORT_SYMBOL(ip_dev_find);
diff --git a/net/ipv4/fib_hash.c b/net/ipv4/fib_hash.c
index 0dfee27cfbcd..a15b2f1b2721 100644
--- a/net/ipv4/fib_hash.c
+++ b/net/ipv4/fib_hash.c
@@ -52,6 +52,7 @@ struct fib_node {
struct hlist_node fn_hash;
struct list_head fn_alias;
__be32 fn_key;
+ struct fib_alias fn_embedded_alias;
};
struct fn_zone {
@@ -102,10 +103,10 @@ static struct hlist_head *fz_hash_alloc(int divisor)
unsigned long size = divisor * sizeof(struct hlist_head);
if (size <= PAGE_SIZE) {
- return kmalloc(size, GFP_KERNEL);
+ return kzalloc(size, GFP_KERNEL);
} else {
return (struct hlist_head *)
- __get_free_pages(GFP_KERNEL, get_order(size));
+ __get_free_pages(GFP_KERNEL | __GFP_ZERO, get_order(size));
}
}
@@ -168,14 +169,13 @@ static void fn_rehash_zone(struct fn_zone *fz)
new_hashmask = (new_divisor - 1);
#if RT_CACHE_DEBUG >= 2
- printk("fn_rehash_zone: hash for zone %d grows from %d\n", fz->fz_order, old_divisor);
+ printk(KERN_DEBUG "fn_rehash_zone: hash for zone %d grows from %d\n",
+ fz->fz_order, old_divisor);
#endif
ht = fz_hash_alloc(new_divisor);
if (ht) {
- memset(ht, 0, new_divisor * sizeof(struct hlist_head));
-
write_lock_bh(&fib_hash_lock);
old_ht = fz->fz_hash;
fz->fz_hash = ht;
@@ -194,10 +194,13 @@ static inline void fn_free_node(struct fib_node * f)
kmem_cache_free(fn_hash_kmem, f);
}
-static inline void fn_free_alias(struct fib_alias *fa)
+static inline void fn_free_alias(struct fib_alias *fa, struct fib_node *f)
{
fib_release_info(fa->fa_info);
- kmem_cache_free(fn_alias_kmem, fa);
+ if (fa == &f->fn_embedded_alias)
+ fa->fa_info = NULL;
+ else
+ kmem_cache_free(fn_alias_kmem, fa);
}
static struct fn_zone *
@@ -219,7 +222,6 @@ fn_new_zone(struct fn_hash *table, int z)
kfree(fz);
return NULL;
}
- memset(fz->fz_hash, 0, fz->fz_divisor * sizeof(struct hlist_head *));
fz->fz_order = z;
fz->fz_mask = inet_make_mask(z);
@@ -275,8 +277,6 @@ out:
return err;
}
-static int fn_hash_last_dflt=-1;
-
static void
fn_hash_select_default(struct fib_table *tb, const struct flowi *flp, struct fib_result *res)
{
@@ -317,12 +317,9 @@ fn_hash_select_default(struct fib_table *tb, const struct flowi *flp, struct fib
if (next_fi != res->fi)
break;
} else if (!fib_detect_death(fi, order, &last_resort,
- &last_idx, &fn_hash_last_dflt)) {
- if (res->fi)
- fib_info_put(res->fi);
- res->fi = fi;
- atomic_inc(&fi->fib_clntref);
- fn_hash_last_dflt = order;
+ &last_idx, tb->tb_default)) {
+ fib_result_assign(res, fi);
+ tb->tb_default = order;
goto out;
}
fi = next_fi;
@@ -331,27 +328,20 @@ fn_hash_select_default(struct fib_table *tb, const struct flowi *flp, struct fib
}
if (order <= 0 || fi == NULL) {
- fn_hash_last_dflt = -1;
+ tb->tb_default = -1;
goto out;
}
- if (!fib_detect_death(fi, order, &last_resort, &last_idx, &fn_hash_last_dflt)) {
- if (res->fi)
- fib_info_put(res->fi);
- res->fi = fi;
- atomic_inc(&fi->fib_clntref);
- fn_hash_last_dflt = order;
+ if (!fib_detect_death(fi, order, &last_resort, &last_idx,
+ tb->tb_default)) {
+ fib_result_assign(res, fi);
+ tb->tb_default = order;
goto out;
}
- if (last_idx >= 0) {
- if (res->fi)
- fib_info_put(res->fi);
- res->fi = last_resort;
- if (last_resort)
- atomic_inc(&last_resort->fib_clntref);
- }
- fn_hash_last_dflt = last_idx;
+ if (last_idx >= 0)
+ fib_result_assign(res, last_resort);
+ tb->tb_default = last_idx;
out:
read_unlock(&fib_hash_lock);
}
@@ -490,15 +480,12 @@ static int fn_hash_insert(struct fib_table *tb, struct fib_config *cfg)
goto out;
err = -ENOBUFS;
- new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
- if (new_fa == NULL)
- goto out;
new_f = NULL;
if (!f) {
- new_f = kmem_cache_alloc(fn_hash_kmem, GFP_KERNEL);
+ new_f = kmem_cache_zalloc(fn_hash_kmem, GFP_KERNEL);
if (new_f == NULL)
- goto out_free_new_fa;
+ goto out;
INIT_HLIST_NODE(&new_f->fn_hash);
INIT_LIST_HEAD(&new_f->fn_alias);
@@ -506,6 +493,12 @@ static int fn_hash_insert(struct fib_table *tb, struct fib_config *cfg)
f = new_f;
}
+ new_fa = &f->fn_embedded_alias;
+ if (new_fa->fa_info != NULL) {
+ new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
+ if (new_fa == NULL)
+ goto out_free_new_f;
+ }
new_fa->fa_info = fi;
new_fa->fa_tos = tos;
new_fa->fa_type = cfg->fc_type;
@@ -532,8 +525,8 @@ static int fn_hash_insert(struct fib_table *tb, struct fib_config *cfg)
&cfg->fc_nlinfo, 0);
return 0;
-out_free_new_fa:
- kmem_cache_free(fn_alias_kmem, new_fa);
+out_free_new_f:
+ kmem_cache_free(fn_hash_kmem, new_f);
out:
fib_release_info(fi);
return err;
@@ -609,7 +602,7 @@ static int fn_hash_delete(struct fib_table *tb, struct fib_config *cfg)
if (fa->fa_state & FA_S_ACCESSED)
rt_cache_flush(-1);
- fn_free_alias(fa);
+ fn_free_alias(fa, f);
if (kill_fn) {
fn_free_node(f);
fz->fz_nent--;
@@ -645,7 +638,7 @@ static int fn_flush_list(struct fn_zone *fz, int idx)
fib_hash_genid++;
write_unlock_bh(&fib_hash_lock);
- fn_free_alias(fa);
+ fn_free_alias(fa, f);
found++;
}
}
@@ -761,25 +754,19 @@ static int fn_hash_dump(struct fib_table *tb, struct sk_buff *skb, struct netlin
return skb->len;
}
-#ifdef CONFIG_IP_MULTIPLE_TABLES
-struct fib_table * fib_hash_init(u32 id)
-#else
-struct fib_table * __init fib_hash_init(u32 id)
-#endif
+void __init fib_hash_init(void)
{
- struct fib_table *tb;
+ fn_hash_kmem = kmem_cache_create("ip_fib_hash", sizeof(struct fib_node),
+ 0, SLAB_PANIC, NULL);
- if (fn_hash_kmem == NULL)
- fn_hash_kmem = kmem_cache_create("ip_fib_hash",
- sizeof(struct fib_node),
- 0, SLAB_HWCACHE_ALIGN,
- NULL);
+ fn_alias_kmem = kmem_cache_create("ip_fib_alias", sizeof(struct fib_alias),
+ 0, SLAB_PANIC, NULL);
- if (fn_alias_kmem == NULL)
- fn_alias_kmem = kmem_cache_create("ip_fib_alias",
- sizeof(struct fib_alias),
- 0, SLAB_HWCACHE_ALIGN,
- NULL);
+}
+
+struct fib_table *fib_hash_table(u32 id)
+{
+ struct fib_table *tb;
tb = kmalloc(sizeof(struct fib_table) + sizeof(struct fn_hash),
GFP_KERNEL);
@@ -787,6 +774,7 @@ struct fib_table * __init fib_hash_init(u32 id)
return NULL;
tb->tb_id = id;
+ tb->tb_default = -1;
tb->tb_lookup = fn_hash_lookup;
tb->tb_insert = fn_hash_insert;
tb->tb_delete = fn_hash_delete;
@@ -801,6 +789,7 @@ struct fib_table * __init fib_hash_init(u32 id)
#ifdef CONFIG_PROC_FS
struct fib_iter_state {
+ struct seq_net_private p;
struct fn_zone *zone;
int bucket;
struct hlist_head *hash_head;
@@ -814,7 +803,11 @@ struct fib_iter_state {
static struct fib_alias *fib_get_first(struct seq_file *seq)
{
struct fib_iter_state *iter = seq->private;
- struct fn_hash *table = (struct fn_hash *) ip_fib_main_table->tb_data;
+ struct fib_table *main_table;
+ struct fn_hash *table;
+
+ main_table = fib_get_table(iter->p.net, RT_TABLE_MAIN);
+ table = (struct fn_hash *)main_table->tb_data;
iter->bucket = 0;
iter->hash_head = NULL;
@@ -949,11 +942,13 @@ static struct fib_alias *fib_get_idx(struct seq_file *seq, loff_t pos)
}
static void *fib_seq_start(struct seq_file *seq, loff_t *pos)
+ __acquires(fib_hash_lock)
{
+ struct fib_iter_state *iter = seq->private;
void *v = NULL;
read_lock(&fib_hash_lock);
- if (ip_fib_main_table)
+ if (fib_get_table(iter->p.net, RT_TABLE_MAIN))
v = *pos ? fib_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
return v;
}
@@ -965,6 +960,7 @@ static void *fib_seq_next(struct seq_file *seq, void *v, loff_t *pos)
}
static void fib_seq_stop(struct seq_file *seq, void *v)
+ __releases(fib_hash_lock)
{
read_unlock(&fib_hash_lock);
}
@@ -1040,8 +1036,8 @@ static const struct seq_operations fib_seq_ops = {
static int fib_seq_open(struct inode *inode, struct file *file)
{
- return seq_open_private(file, &fib_seq_ops,
- sizeof(struct fib_iter_state));
+ return seq_open_net(inode, file, &fib_seq_ops,
+ sizeof(struct fib_iter_state));
}
static const struct file_operations fib_seq_fops = {
@@ -1049,18 +1045,18 @@ static const struct file_operations fib_seq_fops = {
.open = fib_seq_open,
.read = seq_read,
.llseek = seq_lseek,
- .release = seq_release_private,
+ .release = seq_release_net,
};
-int __init fib_proc_init(void)
+int __net_init fib_proc_init(struct net *net)
{
- if (!proc_net_fops_create(&init_net, "route", S_IRUGO, &fib_seq_fops))
+ if (!proc_net_fops_create(net, "route", S_IRUGO, &fib_seq_fops))
return -ENOMEM;
return 0;
}
-void __init fib_proc_exit(void)
+void __net_exit fib_proc_exit(struct net *net)
{
- proc_net_remove(&init_net, "route");
+ proc_net_remove(net, "route");
}
#endif /* CONFIG_PROC_FS */
diff --git a/net/ipv4/fib_lookup.h b/net/ipv4/fib_lookup.h
index eef9eec17e0c..2c1623d2768b 100644
--- a/net/ipv4/fib_lookup.h
+++ b/net/ipv4/fib_lookup.h
@@ -7,12 +7,14 @@
struct fib_alias {
struct list_head fa_list;
- struct rcu_head rcu;
struct fib_info *fa_info;
u8 fa_tos;
u8 fa_type;
u8 fa_scope;
u8 fa_state;
+#ifdef CONFIG_IP_FIB_TRIE
+ struct rcu_head rcu;
+#endif
};
#define FA_S_ACCESSED 0x01
@@ -36,6 +38,16 @@ extern struct fib_alias *fib_find_alias(struct list_head *fah,
u8 tos, u32 prio);
extern int fib_detect_death(struct fib_info *fi, int order,
struct fib_info **last_resort,
- int *last_idx, int *dflt);
+ int *last_idx, int dflt);
+
+static inline void fib_result_assign(struct fib_result *res,
+ struct fib_info *fi)
+{
+ if (res->fi != NULL)
+ fib_info_put(res->fi);
+ res->fi = fi;
+ if (fi != NULL)
+ atomic_inc(&fi->fib_clntref);
+}
#endif /* _FIB_LOOKUP_H */
diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c
index a0ada3a8d8dd..19274d01afa4 100644
--- a/net/ipv4/fib_rules.c
+++ b/net/ipv4/fib_rules.c
@@ -32,8 +32,6 @@
#include <net/ip_fib.h>
#include <net/fib_rules.h>
-static struct fib_rules_ops fib4_rules_ops;
-
struct fib4_rule
{
struct fib_rule common;
@@ -56,14 +54,14 @@ u32 fib_rules_tclass(struct fib_result *res)
}
#endif
-int fib_lookup(struct flowi *flp, struct fib_result *res)
+int fib_lookup(struct net *net, struct flowi *flp, struct fib_result *res)
{
struct fib_lookup_arg arg = {
.result = res,
};
int err;
- err = fib_rules_lookup(&fib4_rules_ops, flp, 0, &arg);
+ err = fib_rules_lookup(net->ipv4.rules_ops, flp, 0, &arg);
res->r = arg.rule;
return err;
@@ -93,7 +91,7 @@ static int fib4_rule_action(struct fib_rule *rule, struct flowi *flp,
goto errout;
}
- if ((tbl = fib_get_table(rule->table)) == NULL)
+ if ((tbl = fib_get_table(rule->fr_net, rule->table)) == NULL)
goto errout;
err = tbl->tb_lookup(tbl, flp, (struct fib_result *) arg->result);
@@ -104,16 +102,6 @@ errout:
}
-void fib_select_default(const struct flowi *flp, struct fib_result *res)
-{
- if (res->r && res->r->action == FR_ACT_TO_TBL &&
- FIB_RES_GW(*res) && FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK) {
- struct fib_table *tb;
- if ((tb = fib_get_table(res->r->table)) != NULL)
- tb->tb_select_default(tb, flp, res);
- }
-}
-
static int fib4_rule_match(struct fib_rule *rule, struct flowi *fl, int flags)
{
struct fib4_rule *r = (struct fib4_rule *) rule;
@@ -130,13 +118,13 @@ static int fib4_rule_match(struct fib_rule *rule, struct flowi *fl, int flags)
return 1;
}
-static struct fib_table *fib_empty_table(void)
+static struct fib_table *fib_empty_table(struct net *net)
{
u32 id;
for (id = 1; id <= RT_TABLE_MAX; id++)
- if (fib_get_table(id) == NULL)
- return fib_new_table(id);
+ if (fib_get_table(net, id) == NULL)
+ return fib_new_table(net, id);
return NULL;
}
@@ -149,6 +137,7 @@ static int fib4_rule_configure(struct fib_rule *rule, struct sk_buff *skb,
struct nlmsghdr *nlh, struct fib_rule_hdr *frh,
struct nlattr **tb)
{
+ struct net *net = skb->sk->sk_net;
int err = -EINVAL;
struct fib4_rule *rule4 = (struct fib4_rule *) rule;
@@ -159,7 +148,7 @@ static int fib4_rule_configure(struct fib_rule *rule, struct sk_buff *skb,
if (rule->action == FR_ACT_TO_TBL) {
struct fib_table *table;
- table = fib_empty_table();
+ table = fib_empty_table(net);
if (table == NULL) {
err = -ENOBUFS;
goto errout;
@@ -245,14 +234,14 @@ nla_put_failure:
return -ENOBUFS;
}
-static u32 fib4_rule_default_pref(void)
+static u32 fib4_rule_default_pref(struct fib_rules_ops *ops)
{
struct list_head *pos;
struct fib_rule *rule;
- if (!list_empty(&fib4_rules_ops.rules_list)) {
- pos = fib4_rules_ops.rules_list.next;
- if (pos->next != &fib4_rules_ops.rules_list) {
+ if (!list_empty(&ops->rules_list)) {
+ pos = ops->rules_list.next;
+ if (pos->next != &ops->rules_list) {
rule = list_entry(pos->next, struct fib_rule, list);
if (rule->pref)
return rule->pref - 1;
@@ -274,7 +263,7 @@ static void fib4_rule_flush_cache(void)
rt_cache_flush(-1);
}
-static struct fib_rules_ops fib4_rules_ops = {
+static struct fib_rules_ops fib4_rules_ops_template = {
.family = AF_INET,
.rule_size = sizeof(struct fib4_rule),
.addr_size = sizeof(u32),
@@ -288,31 +277,53 @@ static struct fib_rules_ops fib4_rules_ops = {
.flush_cache = fib4_rule_flush_cache,
.nlgroup = RTNLGRP_IPV4_RULE,
.policy = fib4_rule_policy,
- .rules_list = LIST_HEAD_INIT(fib4_rules_ops.rules_list),
.owner = THIS_MODULE,
};
-static int __init fib_default_rules_init(void)
+static int fib_default_rules_init(struct fib_rules_ops *ops)
{
int err;
- err = fib_default_rule_add(&fib4_rules_ops, 0,
- RT_TABLE_LOCAL, FIB_RULE_PERMANENT);
+ err = fib_default_rule_add(ops, 0, RT_TABLE_LOCAL, FIB_RULE_PERMANENT);
if (err < 0)
return err;
- err = fib_default_rule_add(&fib4_rules_ops, 0x7FFE,
- RT_TABLE_MAIN, 0);
+ err = fib_default_rule_add(ops, 0x7FFE, RT_TABLE_MAIN, 0);
if (err < 0)
return err;
- err = fib_default_rule_add(&fib4_rules_ops, 0x7FFF,
- RT_TABLE_DEFAULT, 0);
+ err = fib_default_rule_add(ops, 0x7FFF, RT_TABLE_DEFAULT, 0);
if (err < 0)
return err;
return 0;
}
-void __init fib4_rules_init(void)
+int __net_init fib4_rules_init(struct net *net)
+{
+ int err;
+ struct fib_rules_ops *ops;
+
+ ops = kmemdup(&fib4_rules_ops_template, sizeof(*ops), GFP_KERNEL);
+ if (ops == NULL)
+ return -ENOMEM;
+ INIT_LIST_HEAD(&ops->rules_list);
+ ops->fro_net = net;
+
+ fib_rules_register(ops);
+
+ err = fib_default_rules_init(ops);
+ if (err < 0)
+ goto fail;
+ net->ipv4.rules_ops = ops;
+ return 0;
+
+fail:
+ /* also cleans all rules already added */
+ fib_rules_unregister(ops);
+ kfree(ops);
+ return err;
+}
+
+void __net_exit fib4_rules_exit(struct net *net)
{
- BUG_ON(fib_default_rules_init());
- fib_rules_register(&fib4_rules_ops);
+ fib_rules_unregister(net->ipv4.rules_ops);
+ kfree(net->ipv4.rules_ops);
}
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index 1351a2617dce..c7912866d987 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -47,8 +47,6 @@
#include "fib_lookup.h"
-#define FSprintk(a...)
-
static DEFINE_SPINLOCK(fib_info_lock);
static struct hlist_head *fib_info_hash;
static struct hlist_head *fib_info_laddrhash;
@@ -145,7 +143,7 @@ static const struct
void free_fib_info(struct fib_info *fi)
{
if (fi->fib_dead == 0) {
- printk("Freeing alive fib_info %p\n", fi);
+ printk(KERN_WARNING "Freeing alive fib_info %p\n", fi);
return;
}
change_nexthops(fi) {
@@ -196,6 +194,15 @@ static __inline__ int nh_comp(const struct fib_info *fi, const struct fib_info *
return 0;
}
+static inline unsigned int fib_devindex_hashfn(unsigned int val)
+{
+ unsigned int mask = DEVINDEX_HASHSIZE - 1;
+
+ return (val ^
+ (val >> DEVINDEX_HASHBITS) ^
+ (val >> (DEVINDEX_HASHBITS * 2))) & mask;
+}
+
static inline unsigned int fib_info_hashfn(const struct fib_info *fi)
{
unsigned int mask = (fib_hash_size - 1);
@@ -204,6 +211,9 @@ static inline unsigned int fib_info_hashfn(const struct fib_info *fi)
val ^= fi->fib_protocol;
val ^= (__force u32)fi->fib_prefsrc;
val ^= fi->fib_priority;
+ for_nexthops(fi) {
+ val ^= fib_devindex_hashfn(nh->nh_oif);
+ } endfor_nexthops(fi)
return (val ^ (val >> 7) ^ (val >> 12)) & mask;
}
@@ -234,15 +244,6 @@ static struct fib_info *fib_find_info(const struct fib_info *nfi)
return NULL;
}
-static inline unsigned int fib_devindex_hashfn(unsigned int val)
-{
- unsigned int mask = DEVINDEX_HASHSIZE - 1;
-
- return (val ^
- (val >> DEVINDEX_HASHBITS) ^
- (val >> (DEVINDEX_HASHBITS * 2))) & mask;
-}
-
/* Check, that the gateway is already configured.
Used only by redirect accept routine.
*/
@@ -320,11 +321,11 @@ void rtmsg_fib(int event, __be32 key, struct fib_alias *fa,
kfree_skb(skb);
goto errout;
}
- err = rtnl_notify(skb, info->pid, RTNLGRP_IPV4_ROUTE,
+ err = rtnl_notify(skb, info->nl_net, info->pid, RTNLGRP_IPV4_ROUTE,
info->nlh, GFP_KERNEL);
errout:
if (err < 0)
- rtnl_set_sk_err(RTNLGRP_IPV4_ROUTE, err);
+ rtnl_set_sk_err(info->nl_net, RTNLGRP_IPV4_ROUTE, err);
}
/* Return the first fib alias matching TOS with
@@ -346,7 +347,7 @@ struct fib_alias *fib_find_alias(struct list_head *fah, u8 tos, u32 prio)
}
int fib_detect_death(struct fib_info *fi, int order,
- struct fib_info **last_resort, int *last_idx, int *dflt)
+ struct fib_info **last_resort, int *last_idx, int dflt)
{
struct neighbour *n;
int state = NUD_NONE;
@@ -358,10 +359,10 @@ int fib_detect_death(struct fib_info *fi, int order,
}
if (state==NUD_REACHABLE)
return 0;
- if ((state&NUD_VALID) && order != *dflt)
+ if ((state&NUD_VALID) && order != dflt)
return 0;
if ((state&NUD_VALID) ||
- (*last_idx<0 && order > *dflt)) {
+ (*last_idx<0 && order > dflt)) {
*last_resort = fi;
*last_idx = order;
}
@@ -518,7 +519,9 @@ static int fib_check_nh(struct fib_config *cfg, struct fib_info *fi,
struct fib_nh *nh)
{
int err;
+ struct net *net;
+ net = cfg->fc_nlinfo.nl_net;
if (nh->nh_gw) {
struct fib_result res;
@@ -531,9 +534,9 @@ static int fib_check_nh(struct fib_config *cfg, struct fib_info *fi,
if (cfg->fc_scope >= RT_SCOPE_LINK)
return -EINVAL;
- if (inet_addr_type(nh->nh_gw) != RTN_UNICAST)
+ if (inet_addr_type(net, nh->nh_gw) != RTN_UNICAST)
return -EINVAL;
- if ((dev = __dev_get_by_index(&init_net, nh->nh_oif)) == NULL)
+ if ((dev = __dev_get_by_index(net, nh->nh_oif)) == NULL)
return -ENODEV;
if (!(dev->flags&IFF_UP))
return -ENETDOWN;
@@ -556,7 +559,7 @@ static int fib_check_nh(struct fib_config *cfg, struct fib_info *fi,
/* It is not necessary, but requires a bit of thinking */
if (fl.fl4_scope < RT_SCOPE_LINK)
fl.fl4_scope = RT_SCOPE_LINK;
- if ((err = fib_lookup(&fl, &res)) != 0)
+ if ((err = fib_lookup(net, &fl, &res)) != 0)
return err;
}
err = -EINVAL;
@@ -580,7 +583,7 @@ out:
if (nh->nh_flags&(RTNH_F_PERVASIVE|RTNH_F_ONLINK))
return -EINVAL;
- in_dev = inetdev_by_index(nh->nh_oif);
+ in_dev = inetdev_by_index(net, nh->nh_oif);
if (in_dev == NULL)
return -ENODEV;
if (!(in_dev->dev->flags&IFF_UP)) {
@@ -605,10 +608,10 @@ static inline unsigned int fib_laddr_hashfn(__be32 val)
static struct hlist_head *fib_hash_alloc(int bytes)
{
if (bytes <= PAGE_SIZE)
- return kmalloc(bytes, GFP_KERNEL);
+ return kzalloc(bytes, GFP_KERNEL);
else
return (struct hlist_head *)
- __get_free_pages(GFP_KERNEL, get_order(bytes));
+ __get_free_pages(GFP_KERNEL | __GFP_ZERO, get_order(bytes));
}
static void fib_hash_free(struct hlist_head *hash, int bytes)
@@ -712,12 +715,8 @@ struct fib_info *fib_create_info(struct fib_config *cfg)
if (!new_info_hash || !new_laddrhash) {
fib_hash_free(new_info_hash, bytes);
fib_hash_free(new_laddrhash, bytes);
- } else {
- memset(new_info_hash, 0, bytes);
- memset(new_laddrhash, 0, bytes);
-
+ } else
fib_hash_move(new_info_hash, new_laddrhash, new_size);
- }
if (!fib_hash_size)
goto failure;
@@ -799,7 +798,8 @@ struct fib_info *fib_create_info(struct fib_config *cfg)
if (nhs != 1 || nh->nh_gw)
goto err_inval;
nh->nh_scope = RT_SCOPE_NOWHERE;
- nh->nh_dev = dev_get_by_index(&init_net, fi->fib_nh->nh_oif);
+ nh->nh_dev = dev_get_by_index(cfg->fc_nlinfo.nl_net,
+ fi->fib_nh->nh_oif);
err = -ENODEV;
if (nh->nh_dev == NULL)
goto failure;
@@ -813,7 +813,8 @@ struct fib_info *fib_create_info(struct fib_config *cfg)
if (fi->fib_prefsrc) {
if (cfg->fc_type != RTN_LOCAL || !cfg->fc_dst ||
fi->fib_prefsrc != cfg->fc_dst)
- if (inet_addr_type(fi->fib_prefsrc) != RTN_LOCAL)
+ if (inet_addr_type(cfg->fc_nlinfo.nl_net,
+ fi->fib_prefsrc) != RTN_LOCAL)
goto err_inval;
}
@@ -914,7 +915,8 @@ int fib_semantic_match(struct list_head *head, const struct flowi *flp,
continue;
default:
- printk(KERN_DEBUG "impossible 102\n");
+ printk(KERN_WARNING "fib_semantic_match bad type %#x\n",
+ fa->fa_type);
return -EINVAL;
}
}
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 1010b469d7d3..f2f47033f31f 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -82,7 +82,6 @@
#include <net/ip_fib.h>
#include "fib_lookup.h"
-#undef CONFIG_IP_FIB_TRIE_STATS
#define MAX_STAT_DEPTH 32
#define KEYLENGTH (8*sizeof(t_key))
@@ -98,13 +97,13 @@ typedef unsigned int t_key;
#define IS_LEAF(n) (n->parent & T_LEAF)
struct node {
- t_key key;
unsigned long parent;
+ t_key key;
};
struct leaf {
- t_key key;
unsigned long parent;
+ t_key key;
struct hlist_head list;
struct rcu_head rcu;
};
@@ -117,12 +116,12 @@ struct leaf_info {
};
struct tnode {
- t_key key;
unsigned long parent;
- unsigned short pos:5; /* 2log(KEYLENGTH) bits needed */
- unsigned short bits:5; /* 2log(KEYLENGTH) bits needed */
- unsigned short full_children; /* KEYLENGTH bits needed */
- unsigned short empty_children; /* KEYLENGTH bits needed */
+ t_key key;
+ unsigned char pos; /* 2log(KEYLENGTH) bits needed */
+ unsigned char bits; /* 2log(KEYLENGTH) bits needed */
+ unsigned int full_children; /* KEYLENGTH bits needed */
+ unsigned int empty_children; /* KEYLENGTH bits needed */
struct rcu_head rcu;
struct node *child[0];
};
@@ -144,6 +143,7 @@ struct trie_stat {
unsigned int tnodes;
unsigned int leaves;
unsigned int nullpointers;
+ unsigned int prefixes;
unsigned int nodesizes[MAX_STAT_DEPTH];
};
@@ -152,25 +152,28 @@ struct trie {
#ifdef CONFIG_IP_FIB_TRIE_STATS
struct trie_use_stats stats;
#endif
- int size;
- unsigned int revision;
};
static void put_child(struct trie *t, struct tnode *tn, int i, struct node *n);
-static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, int wasfull);
+static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n,
+ int wasfull);
static struct node *resize(struct trie *t, struct tnode *tn);
static struct tnode *inflate(struct trie *t, struct tnode *tn);
static struct tnode *halve(struct trie *t, struct tnode *tn);
static void tnode_free(struct tnode *tn);
static struct kmem_cache *fn_alias_kmem __read_mostly;
-static struct trie *trie_local = NULL, *trie_main = NULL;
+static struct kmem_cache *trie_leaf_kmem __read_mostly;
static inline struct tnode *node_parent(struct node *node)
{
- struct tnode *ret;
+ return (struct tnode *)(node->parent & ~NODE_TYPE_MASK);
+}
+
+static inline struct tnode *node_parent_rcu(struct node *node)
+{
+ struct tnode *ret = node_parent(node);
- ret = (struct tnode *)(node->parent & ~NODE_TYPE_MASK);
return rcu_dereference(ret);
}
@@ -180,13 +183,18 @@ static inline void node_set_parent(struct node *node, struct tnode *ptr)
(unsigned long)ptr | NODE_TYPE(node));
}
-/* rcu_read_lock needs to be hold by caller from readside */
+static inline struct node *tnode_get_child(struct tnode *tn, unsigned int i)
+{
+ BUG_ON(i >= 1U << tn->bits);
+
+ return tn->child[i];
+}
-static inline struct node *tnode_get_child(struct tnode *tn, int i)
+static inline struct node *tnode_get_child_rcu(struct tnode *tn, unsigned int i)
{
- BUG_ON(i >= 1 << tn->bits);
+ struct node *ret = tnode_get_child(tn, i);
- return rcu_dereference(tn->child[i]);
+ return rcu_dereference(ret);
}
static inline int tnode_child_length(const struct tnode *tn)
@@ -300,10 +308,10 @@ static inline void check_tnode(const struct tnode *tn)
WARN_ON(tn && tn->pos+tn->bits > 32);
}
-static int halve_threshold = 25;
-static int inflate_threshold = 50;
-static int halve_threshold_root = 8;
-static int inflate_threshold_root = 15;
+static const int halve_threshold = 25;
+static const int inflate_threshold = 50;
+static const int halve_threshold_root = 8;
+static const int inflate_threshold_root = 15;
static void __alias_free_mem(struct rcu_head *head)
@@ -319,7 +327,8 @@ static inline void alias_free_mem_rcu(struct fib_alias *fa)
static void __leaf_free_rcu(struct rcu_head *head)
{
- kfree(container_of(head, struct leaf, rcu));
+ struct leaf *l = container_of(head, struct leaf, rcu);
+ kmem_cache_free(trie_leaf_kmem, l);
}
static void __leaf_info_free_rcu(struct rcu_head *head)
@@ -332,12 +341,12 @@ static inline void free_leaf_info(struct leaf_info *leaf)
call_rcu(&leaf->rcu, __leaf_info_free_rcu);
}
-static struct tnode *tnode_alloc(unsigned int size)
+static struct tnode *tnode_alloc(size_t size)
{
struct page *pages;
if (size <= PAGE_SIZE)
- return kcalloc(size, 1, GFP_KERNEL);
+ return kzalloc(size, GFP_KERNEL);
pages = alloc_pages(GFP_KERNEL|__GFP_ZERO, get_order(size));
if (!pages)
@@ -349,8 +358,8 @@ static struct tnode *tnode_alloc(unsigned int size)
static void __tnode_free_rcu(struct rcu_head *head)
{
struct tnode *tn = container_of(head, struct tnode, rcu);
- unsigned int size = sizeof(struct tnode) +
- (1 << tn->bits) * sizeof(struct node *);
+ size_t size = sizeof(struct tnode) +
+ (sizeof(struct node *) << tn->bits);
if (size <= PAGE_SIZE)
kfree(tn);
@@ -369,7 +378,7 @@ static inline void tnode_free(struct tnode *tn)
static struct leaf *leaf_new(void)
{
- struct leaf *l = kmalloc(sizeof(struct leaf), GFP_KERNEL);
+ struct leaf *l = kmem_cache_alloc(trie_leaf_kmem, GFP_KERNEL);
if (l) {
l->parent = T_LEAF;
INIT_HLIST_HEAD(&l->list);
@@ -387,14 +396,12 @@ static struct leaf_info *leaf_info_new(int plen)
return li;
}
-static struct tnode* tnode_new(t_key key, int pos, int bits)
+static struct tnode *tnode_new(t_key key, int pos, int bits)
{
- int nchildren = 1<<bits;
- int sz = sizeof(struct tnode) + nchildren * sizeof(struct node *);
+ size_t sz = sizeof(struct tnode) + (sizeof(struct node *) << bits);
struct tnode *tn = tnode_alloc(sz);
if (tn) {
- memset(tn, 0, sz);
tn->parent = T_TNODE;
tn->pos = pos;
tn->bits = bits;
@@ -403,8 +410,8 @@ static struct tnode* tnode_new(t_key key, int pos, int bits)
tn->empty_children = 1<<bits;
}
- pr_debug("AT %p s=%u %u\n", tn, (unsigned int) sizeof(struct tnode),
- (unsigned int) (sizeof(struct node) * 1<<bits));
+ pr_debug("AT %p s=%u %lu\n", tn, (unsigned int) sizeof(struct tnode),
+ (unsigned long) (sizeof(struct node) << bits));
return tn;
}
@@ -421,7 +428,8 @@ static inline int tnode_full(const struct tnode *tn, const struct node *n)
return ((struct tnode *) n)->pos == tn->pos + tn->bits;
}
-static inline void put_child(struct trie *t, struct tnode *tn, int i, struct node *n)
+static inline void put_child(struct trie *t, struct tnode *tn, int i,
+ struct node *n)
{
tnode_put_child_reorg(tn, i, n, -1);
}
@@ -431,14 +439,14 @@ static inline void put_child(struct trie *t, struct tnode *tn, int i, struct nod
* Update the value of full_children and empty_children.
*/
-static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, int wasfull)
+static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n,
+ int wasfull)
{
struct node *chi = tn->child[i];
int isfull;
BUG_ON(i >= 1<<tn->bits);
-
/* update emptyChildren */
if (n == NULL && chi != NULL)
tn->empty_children++;
@@ -571,11 +579,13 @@ static struct node *resize(struct trie *t, struct tnode *tn)
err = 0;
max_resize = 10;
while ((tn->full_children > 0 && max_resize-- &&
- 50 * (tn->full_children + tnode_child_length(tn) - tn->empty_children) >=
- inflate_threshold_use * tnode_child_length(tn))) {
+ 50 * (tn->full_children + tnode_child_length(tn)
+ - tn->empty_children)
+ >= inflate_threshold_use * tnode_child_length(tn))) {
old_tn = tn;
tn = inflate(t, tn);
+
if (IS_ERR(tn)) {
tn = old_tn;
#ifdef CONFIG_IP_FIB_TRIE_STATS
@@ -587,11 +597,13 @@ static struct node *resize(struct trie *t, struct tnode *tn)
if (max_resize < 0) {
if (!tn->parent)
- printk(KERN_WARNING "Fix inflate_threshold_root. Now=%d size=%d bits\n",
- inflate_threshold_root, tn->bits);
+ pr_warning("Fix inflate_threshold_root."
+ " Now=%d size=%d bits\n",
+ inflate_threshold_root, tn->bits);
else
- printk(KERN_WARNING "Fix inflate_threshold. Now=%d size=%d bits\n",
- inflate_threshold, tn->bits);
+ pr_warning("Fix inflate_threshold."
+ " Now=%d size=%d bits\n",
+ inflate_threshold, tn->bits);
}
check_tnode(tn);
@@ -628,11 +640,13 @@ static struct node *resize(struct trie *t, struct tnode *tn)
if (max_resize < 0) {
if (!tn->parent)
- printk(KERN_WARNING "Fix halve_threshold_root. Now=%d size=%d bits\n",
- halve_threshold_root, tn->bits);
+ pr_warning("Fix halve_threshold_root."
+ " Now=%d size=%d bits\n",
+ halve_threshold_root, tn->bits);
else
- printk(KERN_WARNING "Fix halve_threshold. Now=%d size=%d bits\n",
- halve_threshold, tn->bits);
+ pr_warning("Fix halve_threshold."
+ " Now=%d size=%d bits\n",
+ halve_threshold, tn->bits);
}
/* Only one child remains */
@@ -656,7 +670,6 @@ static struct node *resize(struct trie *t, struct tnode *tn)
static struct tnode *inflate(struct trie *t, struct tnode *tn)
{
- struct tnode *inode;
struct tnode *oldtnode = tn;
int olen = tnode_child_length(tn);
int i;
@@ -676,8 +689,9 @@ static struct tnode *inflate(struct trie *t, struct tnode *tn)
*/
for (i = 0; i < olen; i++) {
- struct tnode *inode = (struct tnode *) tnode_get_child(oldtnode, i);
+ struct tnode *inode;
+ inode = (struct tnode *) tnode_get_child(oldtnode, i);
if (inode &&
IS_TNODE(inode) &&
inode->pos == oldtnode->pos + oldtnode->bits &&
@@ -704,6 +718,7 @@ static struct tnode *inflate(struct trie *t, struct tnode *tn)
}
for (i = 0; i < olen; i++) {
+ struct tnode *inode;
struct node *node = tnode_get_child(oldtnode, i);
struct tnode *left, *right;
int size, j;
@@ -716,8 +731,9 @@ static struct tnode *inflate(struct trie *t, struct tnode *tn)
if (IS_LEAF(node) || ((struct tnode *) node)->pos >
tn->pos + tn->bits - 1) {
- if (tkey_extract_bits(node->key, oldtnode->pos + oldtnode->bits,
- 1) == 0)
+ if (tkey_extract_bits(node->key,
+ oldtnode->pos + oldtnode->bits,
+ 1) == 0)
put_child(t, tn, 2*i, node);
else
put_child(t, tn, 2*i+1, node);
@@ -877,19 +893,6 @@ nomem:
}
}
-static void trie_init(struct trie *t)
-{
- if (!t)
- return;
-
- t->size = 0;
- rcu_assign_pointer(t->trie, NULL);
- t->revision = 0;
-#ifdef CONFIG_IP_FIB_TRIE_STATS
- memset(&t->stats, 0, sizeof(struct trie_use_stats));
-#endif
-}
-
/* readside must use rcu_read_lock currently dump routines
via get_fa_head and dump */
@@ -906,7 +909,7 @@ static struct leaf_info *find_leaf_info(struct leaf *l, int plen)
return NULL;
}
-static inline struct list_head * get_fa_head(struct leaf *l, int plen)
+static inline struct list_head *get_fa_head(struct leaf *l, int plen)
{
struct leaf_info *li = find_leaf_info(l, plen);
@@ -956,7 +959,10 @@ fib_find_node(struct trie *t, u32 key)
if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
pos = tn->pos + tn->bits;
- n = tnode_get_child(tn, tkey_extract_bits(key, tn->pos, tn->bits));
+ n = tnode_get_child_rcu(tn,
+ tkey_extract_bits(key,
+ tn->pos,
+ tn->bits));
} else
break;
}
@@ -977,8 +983,10 @@ static struct node *trie_rebalance(struct trie *t, struct tnode *tn)
while (tn != NULL && (tp = node_parent((struct node *)tn)) != NULL) {
cindex = tkey_extract_bits(key, tp->pos, tp->bits);
wasfull = tnode_full(tp, tnode_get_child(tp, cindex));
- tn = (struct tnode *) resize (t, (struct tnode *)tn);
- tnode_put_child_reorg((struct tnode *)tp, cindex,(struct node*)tn, wasfull);
+ tn = (struct tnode *) resize(t, (struct tnode *)tn);
+
+ tnode_put_child_reorg((struct tnode *)tp, cindex,
+ (struct node *)tn, wasfull);
tp = node_parent((struct node *) tn);
if (!tp)
@@ -988,15 +996,14 @@ static struct node *trie_rebalance(struct trie *t, struct tnode *tn)
/* Handle last (top) tnode */
if (IS_TNODE(tn))
- tn = (struct tnode*) resize(t, (struct tnode *)tn);
+ tn = (struct tnode *)resize(t, (struct tnode *)tn);
- return (struct node*) tn;
+ return (struct node *)tn;
}
/* only used from updater-side */
-static struct list_head *
-fib_insert_node(struct trie *t, int *err, u32 key, int plen)
+static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen)
{
int pos, newpos;
struct tnode *tp = NULL, *tn = NULL;
@@ -1036,7 +1043,10 @@ fib_insert_node(struct trie *t, int *err, u32 key, int plen)
if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
tp = tn;
pos = tn->pos + tn->bits;
- n = tnode_get_child(tn, tkey_extract_bits(key, tn->pos, tn->bits));
+ n = tnode_get_child(tn,
+ tkey_extract_bits(key,
+ tn->pos,
+ tn->bits));
BUG_ON(n && node_parent(n) != tn);
} else
@@ -1054,34 +1064,27 @@ fib_insert_node(struct trie *t, int *err, u32 key, int plen)
/* Case 1: n is a leaf. Compare prefixes */
if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key)) {
- struct leaf *l = (struct leaf *) n;
-
+ l = (struct leaf *) n;
li = leaf_info_new(plen);
- if (!li) {
- *err = -ENOMEM;
- goto err;
- }
+ if (!li)
+ return NULL;
fa_head = &li->falh;
insert_leaf_info(&l->list, li);
goto done;
}
- t->size++;
l = leaf_new();
- if (!l) {
- *err = -ENOMEM;
- goto err;
- }
+ if (!l)
+ return NULL;
l->key = key;
li = leaf_info_new(plen);
if (!li) {
tnode_free((struct tnode *) l);
- *err = -ENOMEM;
- goto err;
+ return NULL;
}
fa_head = &li->falh;
@@ -1117,8 +1120,7 @@ fib_insert_node(struct trie *t, int *err, u32 key, int plen)
if (!tn) {
free_leaf_info(li);
tnode_free((struct tnode *) l);
- *err = -ENOMEM;
- goto err;
+ return NULL;
}
node_set_parent((struct node *)tn, tp);
@@ -1129,23 +1131,23 @@ fib_insert_node(struct trie *t, int *err, u32 key, int plen)
if (tp) {
cindex = tkey_extract_bits(key, tp->pos, tp->bits);
- put_child(t, (struct tnode *)tp, cindex, (struct node *)tn);
+ put_child(t, (struct tnode *)tp, cindex,
+ (struct node *)tn);
} else {
- rcu_assign_pointer(t->trie, (struct node *)tn); /* First tnode */
+ rcu_assign_pointer(t->trie, (struct node *)tn);
tp = tn;
}
}
if (tp && tp->pos + tp->bits > 32)
- printk(KERN_WARNING "fib_trie tp=%p pos=%d, bits=%d, key=%0x plen=%d\n",
- tp, tp->pos, tp->bits, key, plen);
+ pr_warning("fib_trie"
+ " tp=%p pos=%d, bits=%d, key=%0x plen=%d\n",
+ tp, tp->pos, tp->bits, key, plen);
/* Rebalance the trie */
rcu_assign_pointer(t->trie, trie_rebalance(t, tp));
done:
- t->revision++;
-err:
return fa_head;
}
@@ -1253,10 +1255,10 @@ static int fn_trie_insert(struct fib_table *tb, struct fib_config *cfg)
break;
if (fa->fa_type == cfg->fc_type &&
fa->fa_scope == cfg->fc_scope &&
- fa->fa_info == fi) {
+ fa->fa_info == fi)
goto out;
- }
}
+
if (!(cfg->fc_nlflags & NLM_F_APPEND))
fa = fa_orig;
}
@@ -1279,10 +1281,11 @@ static int fn_trie_insert(struct fib_table *tb, struct fib_config *cfg)
*/
if (!fa_head) {
- err = 0;
- fa_head = fib_insert_node(t, &err, key, plen);
- if (err)
+ fa_head = fib_insert_node(t, key, plen);
+ if (unlikely(!fa_head)) {
+ err = -ENOMEM;
goto out_free_new_fa;
+ }
}
list_add_tail_rcu(&new_fa->fa_list,
@@ -1302,40 +1305,41 @@ err:
return err;
}
-
/* should be called with rcu_read_lock */
-static inline int check_leaf(struct trie *t, struct leaf *l,
- t_key key, int *plen, const struct flowi *flp,
- struct fib_result *res)
+static int check_leaf(struct trie *t, struct leaf *l,
+ t_key key, const struct flowi *flp,
+ struct fib_result *res)
{
- int err, i;
- __be32 mask;
struct leaf_info *li;
struct hlist_head *hhead = &l->list;
struct hlist_node *node;
hlist_for_each_entry_rcu(li, node, hhead, hlist) {
- i = li->plen;
- mask = inet_make_mask(i);
+ int err;
+ int plen = li->plen;
+ __be32 mask = inet_make_mask(plen);
+
if (l->key != (key & ntohl(mask)))
continue;
- if ((err = fib_semantic_match(&li->falh, flp, res, htonl(l->key), mask, i)) <= 0) {
- *plen = i;
+ err = fib_semantic_match(&li->falh, flp, res,
+ htonl(l->key), mask, plen);
+
#ifdef CONFIG_IP_FIB_TRIE_STATS
+ if (err <= 0)
t->stats.semantic_match_passed++;
+ else
+ t->stats.semantic_match_miss++;
#endif
- return err;
- }
-#ifdef CONFIG_IP_FIB_TRIE_STATS
- t->stats.semantic_match_miss++;
-#endif
+ if (err <= 0)
+ return plen;
}
- return 1;
+
+ return -1;
}
-static int
-fn_trie_lookup(struct fib_table *tb, const struct flowi *flp, struct fib_result *res)
+static int fn_trie_lookup(struct fib_table *tb, const struct flowi *flp,
+ struct fib_result *res)
{
struct trie *t = (struct trie *) tb->tb_data;
int plen, ret = 0;
@@ -1362,10 +1366,13 @@ fn_trie_lookup(struct fib_table *tb, const struct flowi *flp, struct fib_result
/* Just a leaf? */
if (IS_LEAF(n)) {
- if ((ret = check_leaf(t, (struct leaf *)n, key, &plen, flp, res)) <= 0)
- goto found;
- goto failed;
+ plen = check_leaf(t, (struct leaf *)n, key, flp, res);
+ if (plen < 0)
+ goto failed;
+ ret = 0;
+ goto found;
}
+
pn = (struct tnode *) n;
chopped_off = 0;
@@ -1387,14 +1394,14 @@ fn_trie_lookup(struct fib_table *tb, const struct flowi *flp, struct fib_result
}
if (IS_LEAF(n)) {
- if ((ret = check_leaf(t, (struct leaf *)n, key, &plen, flp, res)) <= 0)
- goto found;
- else
+ plen = check_leaf(t, (struct leaf *)n, key, flp, res);
+ if (plen < 0)
goto backtrace;
+
+ ret = 0;
+ goto found;
}
-#define HL_OPTIMIZE
-#ifdef HL_OPTIMIZE
cn = (struct tnode *)n;
/*
@@ -1423,12 +1430,13 @@ fn_trie_lookup(struct fib_table *tb, const struct flowi *flp, struct fib_result
* *are* zero.
*/
- /* NOTA BENE: CHECKING ONLY SKIPPED BITS FOR THE NEW NODE HERE */
+ /* NOTA BENE: Checking only skipped bits
+ for the new node here */
if (current_prefix_length < pos+bits) {
if (tkey_extract_bits(cn->key, current_prefix_length,
- cn->pos - current_prefix_length) != 0 ||
- !(cn->child[0]))
+ cn->pos - current_prefix_length)
+ || !(cn->child[0]))
goto backtrace;
}
@@ -1451,14 +1459,17 @@ fn_trie_lookup(struct fib_table *tb, const struct flowi *flp, struct fib_result
* new tnode's key.
*/
- /* Note: We aren't very concerned about the piece of the key
- * that precede pn->pos+pn->bits, since these have already been
- * checked. The bits after cn->pos aren't checked since these are
- * by definition "unknown" at this point. Thus, what we want to
- * see is if we are about to enter the "prefix matching" state,
- * and in that case verify that the skipped bits that will prevail
- * throughout this subtree are zero, as they have to be if we are
- * to find a matching prefix.
+ /*
+ * Note: We aren't very concerned about the piece of
+ * the key that precede pn->pos+pn->bits, since these
+ * have already been checked. The bits after cn->pos
+ * aren't checked since these are by definition
+ * "unknown" at this point. Thus, what we want to see
+ * is if we are about to enter the "prefix matching"
+ * state, and in that case verify that the skipped
+ * bits that will prevail throughout this subtree are
+ * zero, as they have to be if we are to find a
+ * matching prefix.
*/
node_prefix = mask_pfx(cn->key, cn->pos);
@@ -1466,13 +1477,15 @@ fn_trie_lookup(struct fib_table *tb, const struct flowi *flp, struct fib_result
pref_mismatch = key_prefix^node_prefix;
mp = 0;
- /* In short: If skipped bits in this node do not match the search
- * key, enter the "prefix matching" state.directly.
+ /*
+ * In short: If skipped bits in this node do not match
+ * the search key, enter the "prefix matching"
+ * state.directly.
*/
if (pref_mismatch) {
while (!(pref_mismatch & (1<<(KEYLENGTH-1)))) {
mp++;
- pref_mismatch = pref_mismatch <<1;
+ pref_mismatch = pref_mismatch << 1;
}
key_prefix = tkey_extract_bits(cn->key, mp, cn->pos-mp);
@@ -1482,7 +1495,7 @@ fn_trie_lookup(struct fib_table *tb, const struct flowi *flp, struct fib_result
if (current_prefix_length >= cn->pos)
current_prefix_length = mp;
}
-#endif
+
pn = (struct tnode *)n; /* Descend */
chopped_off = 0;
continue;
@@ -1491,12 +1504,14 @@ backtrace:
chopped_off++;
/* As zero don't change the child key (cindex) */
- while ((chopped_off <= pn->bits) && !(cindex & (1<<(chopped_off-1))))
+ while ((chopped_off <= pn->bits)
+ && !(cindex & (1<<(chopped_off-1))))
chopped_off++;
/* Decrease current_... with bits chopped off */
if (current_prefix_length > pn->pos + pn->bits - chopped_off)
- current_prefix_length = pn->pos + pn->bits - chopped_off;
+ current_prefix_length = pn->pos + pn->bits
+ - chopped_off;
/*
* Either we do the actual chop off according or if we have
@@ -1528,52 +1543,23 @@ found:
return ret;
}
-/* only called from updater side */
-static int trie_leaf_remove(struct trie *t, t_key key)
+/*
+ * Remove the leaf and return parent.
+ */
+static void trie_leaf_remove(struct trie *t, struct leaf *l)
{
- t_key cindex;
- struct tnode *tp = NULL;
- struct node *n = t->trie;
- struct leaf *l;
-
- pr_debug("entering trie_leaf_remove(%p)\n", n);
-
- /* Note that in the case skipped bits, those bits are *not* checked!
- * When we finish this, we will have NULL or a T_LEAF, and the
- * T_LEAF may or may not match our key.
- */
-
- while (n != NULL && IS_TNODE(n)) {
- struct tnode *tn = (struct tnode *) n;
- check_tnode(tn);
- n = tnode_get_child(tn ,tkey_extract_bits(key, tn->pos, tn->bits));
-
- BUG_ON(n && node_parent(n) != tn);
- }
- l = (struct leaf *) n;
-
- if (!n || !tkey_equals(l->key, key))
- return 0;
-
- /*
- * Key found.
- * Remove the leaf and rebalance the tree
- */
-
- t->revision++;
- t->size--;
+ struct tnode *tp = node_parent((struct node *) l);
- tp = node_parent(n);
- tnode_free((struct tnode *) n);
+ pr_debug("entering trie_leaf_remove(%p)\n", l);
if (tp) {
- cindex = tkey_extract_bits(key, tp->pos, tp->bits);
+ t_key cindex = tkey_extract_bits(l->key, tp->pos, tp->bits);
put_child(t, (struct tnode *)tp, cindex, NULL);
rcu_assign_pointer(t->trie, trie_rebalance(t, tp));
} else
rcu_assign_pointer(t->trie, NULL);
- return 1;
+ tnode_free((struct tnode *) l);
}
/*
@@ -1651,7 +1637,7 @@ static int fn_trie_delete(struct fib_table *tb, struct fib_config *cfg)
}
if (hlist_empty(&l->list))
- trie_leaf_remove(t, key);
+ trie_leaf_remove(t, l);
if (fa->fa_state & FA_S_ACCESSED)
rt_cache_flush(-1);
@@ -1697,64 +1683,64 @@ static int trie_flush_leaf(struct trie *t, struct leaf *l)
return found;
}
-/* rcu_read_lock needs to be hold by caller from readside */
-
-static struct leaf *nextleaf(struct trie *t, struct leaf *thisleaf)
+/*
+ * Scan for the next right leaf starting at node p->child[idx]
+ * Since we have back pointer, no recursion necessary.
+ */
+static struct leaf *leaf_walk_rcu(struct tnode *p, struct node *c)
{
- struct node *c = (struct node *) thisleaf;
- struct tnode *p;
- int idx;
- struct node *trie = rcu_dereference(t->trie);
-
- if (c == NULL) {
- if (trie == NULL)
- return NULL;
-
- if (IS_LEAF(trie)) /* trie w. just a leaf */
- return (struct leaf *) trie;
-
- p = (struct tnode*) trie; /* Start */
- } else
- p = node_parent(c);
-
- while (p) {
- int pos, last;
+ do {
+ t_key idx;
- /* Find the next child of the parent */
if (c)
- pos = 1 + tkey_extract_bits(c->key, p->pos, p->bits);
+ idx = tkey_extract_bits(c->key, p->pos, p->bits) + 1;
else
- pos = 0;
-
- last = 1 << p->bits;
- for (idx = pos; idx < last ; idx++) {
- c = rcu_dereference(p->child[idx]);
+ idx = 0;
+ while (idx < 1u << p->bits) {
+ c = tnode_get_child_rcu(p, idx++);
if (!c)
continue;
- /* Decend if tnode */
- while (IS_TNODE(c)) {
- p = (struct tnode *) c;
- idx = 0;
-
- /* Rightmost non-NULL branch */
- if (p && IS_TNODE(p))
- while (!(c = rcu_dereference(p->child[idx]))
- && idx < (1<<p->bits)) idx++;
-
- /* Done with this tnode? */
- if (idx >= (1 << p->bits) || !c)
- goto up;
+ if (IS_LEAF(c)) {
+ prefetch(p->child[idx]);
+ return (struct leaf *) c;
}
- return (struct leaf *) c;
+
+ /* Rescan start scanning in new node */
+ p = (struct tnode *) c;
+ idx = 0;
}
-up:
- /* No more children go up one step */
+
+ /* Node empty, walk back up to parent */
c = (struct node *) p;
- p = node_parent(c);
- }
- return NULL; /* Ready. Root of trie */
+ } while ( (p = node_parent_rcu(c)) != NULL);
+
+ return NULL; /* Root of trie */
+}
+
+static struct leaf *trie_firstleaf(struct trie *t)
+{
+ struct tnode *n = (struct tnode *) rcu_dereference(t->trie);
+
+ if (!n)
+ return NULL;
+
+ if (IS_LEAF(n)) /* trie is just a leaf */
+ return (struct leaf *) n;
+
+ return leaf_walk_rcu(n, NULL);
+}
+
+static struct leaf *trie_nextleaf(struct leaf *l)
+{
+ struct node *c = (struct node *) l;
+ struct tnode *p = node_parent(c);
+
+ if (!p)
+ return NULL; /* trie with just one leaf */
+
+ return leaf_walk_rcu(p, c);
}
/*
@@ -1763,30 +1749,27 @@ up:
static int fn_trie_flush(struct fib_table *tb)
{
struct trie *t = (struct trie *) tb->tb_data;
- struct leaf *ll = NULL, *l = NULL;
- int found = 0, h;
-
- t->revision++;
+ struct leaf *l, *ll = NULL;
+ int found = 0;
- for (h = 0; (l = nextleaf(t, l)) != NULL; h++) {
+ for (l = trie_firstleaf(t); l; l = trie_nextleaf(l)) {
found += trie_flush_leaf(t, l);
if (ll && hlist_empty(&ll->list))
- trie_leaf_remove(t, ll->key);
+ trie_leaf_remove(t, ll);
ll = l;
}
if (ll && hlist_empty(&ll->list))
- trie_leaf_remove(t, ll->key);
+ trie_leaf_remove(t, ll);
pr_debug("trie_flush found=%d\n", found);
return found;
}
-static int trie_last_dflt = -1;
-
-static void
-fn_trie_select_default(struct fib_table *tb, const struct flowi *flp, struct fib_result *res)
+static void fn_trie_select_default(struct fib_table *tb,
+ const struct flowi *flp,
+ struct fib_result *res)
{
struct trie *t = (struct trie *) tb->tb_data;
int order, last_idx;
@@ -1831,48 +1814,38 @@ fn_trie_select_default(struct fib_table *tb, const struct flowi *flp, struct fib
if (next_fi != res->fi)
break;
} else if (!fib_detect_death(fi, order, &last_resort,
- &last_idx, &trie_last_dflt)) {
- if (res->fi)
- fib_info_put(res->fi);
- res->fi = fi;
- atomic_inc(&fi->fib_clntref);
- trie_last_dflt = order;
+ &last_idx, tb->tb_default)) {
+ fib_result_assign(res, fi);
+ tb->tb_default = order;
goto out;
}
fi = next_fi;
order++;
}
if (order <= 0 || fi == NULL) {
- trie_last_dflt = -1;
+ tb->tb_default = -1;
goto out;
}
- if (!fib_detect_death(fi, order, &last_resort, &last_idx, &trie_last_dflt)) {
- if (res->fi)
- fib_info_put(res->fi);
- res->fi = fi;
- atomic_inc(&fi->fib_clntref);
- trie_last_dflt = order;
+ if (!fib_detect_death(fi, order, &last_resort, &last_idx,
+ tb->tb_default)) {
+ fib_result_assign(res, fi);
+ tb->tb_default = order;
goto out;
}
- if (last_idx >= 0) {
- if (res->fi)
- fib_info_put(res->fi);
- res->fi = last_resort;
- if (last_resort)
- atomic_inc(&last_resort->fib_clntref);
- }
- trie_last_dflt = last_idx;
- out:;
+ if (last_idx >= 0)
+ fib_result_assign(res, last_resort);
+ tb->tb_default = last_idx;
+out:
rcu_read_unlock();
}
-static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah, struct fib_table *tb,
+static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah,
+ struct fib_table *tb,
struct sk_buff *skb, struct netlink_callback *cb)
{
int i, s_i;
struct fib_alias *fa;
-
__be32 xkey = htonl(key);
s_i = cb->args[4];
@@ -1885,7 +1858,6 @@ static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah, struct fi
i++;
continue;
}
- BUG_ON(!fa->fa_info);
if (fib_dump_info(skb, NETLINK_CB(cb->skb).pid,
cb->nlh->nlmsg_seq,
@@ -1896,7 +1868,7 @@ static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah, struct fi
xkey,
plen,
fa->fa_tos,
- fa->fa_info, 0) < 0) {
+ fa->fa_info, NLM_F_MULTI) < 0) {
cb->args[4] = i;
return -1;
}
@@ -1906,109 +1878,118 @@ static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah, struct fi
return skb->len;
}
-static int fn_trie_dump_plen(struct trie *t, int plen, struct fib_table *tb, struct sk_buff *skb,
- struct netlink_callback *cb)
+static int fn_trie_dump_leaf(struct leaf *l, struct fib_table *tb,
+ struct sk_buff *skb, struct netlink_callback *cb)
{
- int h, s_h;
- struct list_head *fa_head;
- struct leaf *l = NULL;
+ struct leaf_info *li;
+ struct hlist_node *node;
+ int i, s_i;
- s_h = cb->args[3];
+ s_i = cb->args[3];
+ i = 0;
- for (h = 0; (l = nextleaf(t, l)) != NULL; h++) {
- if (h < s_h)
+ /* rcu_read_lock is hold by caller */
+ hlist_for_each_entry_rcu(li, node, &l->list, hlist) {
+ if (i < s_i) {
+ i++;
continue;
- if (h > s_h)
- memset(&cb->args[4], 0,
- sizeof(cb->args) - 4*sizeof(cb->args[0]));
-
- fa_head = get_fa_head(l, plen);
+ }
- if (!fa_head)
- continue;
+ if (i > s_i)
+ cb->args[4] = 0;
- if (list_empty(fa_head))
+ if (list_empty(&li->falh))
continue;
- if (fn_trie_dump_fa(l->key, plen, fa_head, tb, skb, cb)<0) {
- cb->args[3] = h;
+ if (fn_trie_dump_fa(l->key, li->plen, &li->falh, tb, skb, cb) < 0) {
+ cb->args[3] = i;
return -1;
}
+ i++;
}
- cb->args[3] = h;
+
+ cb->args[3] = i;
return skb->len;
}
-static int fn_trie_dump(struct fib_table *tb, struct sk_buff *skb, struct netlink_callback *cb)
+static int fn_trie_dump(struct fib_table *tb, struct sk_buff *skb,
+ struct netlink_callback *cb)
{
- int m, s_m;
+ struct leaf *l;
struct trie *t = (struct trie *) tb->tb_data;
-
- s_m = cb->args[2];
+ t_key key = cb->args[2];
rcu_read_lock();
- for (m = 0; m <= 32; m++) {
- if (m < s_m)
- continue;
- if (m > s_m)
- memset(&cb->args[3], 0,
- sizeof(cb->args) - 3*sizeof(cb->args[0]));
+ /* Dump starting at last key.
+ * Note: 0.0.0.0/0 (ie default) is first key.
+ */
+ if (!key)
+ l = trie_firstleaf(t);
+ else {
+ l = fib_find_node(t, key);
+ if (!l) {
+ /* The table changed during the dump, rather than
+ * giving partial data, just make application retry.
+ */
+ rcu_read_unlock();
+ return -EBUSY;
+ }
+ }
- if (fn_trie_dump_plen(t, 32-m, tb, skb, cb)<0) {
- cb->args[2] = m;
- goto out;
+ while (l) {
+ cb->args[2] = l->key;
+ if (fn_trie_dump_leaf(l, tb, skb, cb) < 0) {
+ rcu_read_unlock();
+ return -1;
}
+
+ l = trie_nextleaf(l);
+ memset(&cb->args[3], 0,
+ sizeof(cb->args) - 3*sizeof(cb->args[0]));
}
rcu_read_unlock();
- cb->args[2] = m;
+
return skb->len;
-out:
- rcu_read_unlock();
- return -1;
}
-/* Fix more generic FIB names for init later */
+void __init fib_hash_init(void)
+{
+ fn_alias_kmem = kmem_cache_create("ip_fib_alias",
+ sizeof(struct fib_alias),
+ 0, SLAB_PANIC, NULL);
-#ifdef CONFIG_IP_MULTIPLE_TABLES
-struct fib_table * fib_hash_init(u32 id)
-#else
-struct fib_table * __init fib_hash_init(u32 id)
-#endif
+ trie_leaf_kmem = kmem_cache_create("ip_fib_trie",
+ max(sizeof(struct leaf),
+ sizeof(struct leaf_info)),
+ 0, SLAB_PANIC, NULL);
+}
+
+
+/* Fix more generic FIB names for init later */
+struct fib_table *fib_hash_table(u32 id)
{
struct fib_table *tb;
struct trie *t;
- if (fn_alias_kmem == NULL)
- fn_alias_kmem = kmem_cache_create("ip_fib_alias",
- sizeof(struct fib_alias),
- 0, SLAB_HWCACHE_ALIGN,
- NULL);
-
tb = kmalloc(sizeof(struct fib_table) + sizeof(struct trie),
GFP_KERNEL);
if (tb == NULL)
return NULL;
tb->tb_id = id;
+ tb->tb_default = -1;
tb->tb_lookup = fn_trie_lookup;
tb->tb_insert = fn_trie_insert;
tb->tb_delete = fn_trie_delete;
tb->tb_flush = fn_trie_flush;
tb->tb_select_default = fn_trie_select_default;
tb->tb_dump = fn_trie_dump;
- memset(tb->tb_data, 0, sizeof(struct trie));
t = (struct trie *) tb->tb_data;
-
- trie_init(t);
-
- if (id == RT_TABLE_LOCAL)
- trie_local = t;
- else if (id == RT_TABLE_MAIN)
- trie_main = t;
+ memset(t, 0, sizeof(*t));
if (id == RT_TABLE_LOCAL)
- printk(KERN_INFO "IPv4 FIB: Using LC-trie version %s\n", VERSION);
+ pr_info("IPv4 FIB: Using LC-trie version %s\n", VERSION);
return tb;
}
@@ -2016,6 +1997,8 @@ struct fib_table * __init fib_hash_init(u32 id)
#ifdef CONFIG_PROC_FS
/* Depth first Trie walk iterator */
struct fib_trie_iter {
+ struct seq_net_private p;
+ struct trie *trie_local, *trie_main;
struct tnode *tnode;
struct trie *trie;
unsigned index;
@@ -2036,7 +2019,7 @@ static struct node *fib_trie_get_next(struct fib_trie_iter *iter)
iter->tnode, iter->index, iter->depth);
rescan:
while (cindex < (1<<tn->bits)) {
- struct node *n = tnode_get_child(tn, cindex);
+ struct node *n = tnode_get_child_rcu(tn, cindex);
if (n) {
if (IS_LEAF(n)) {
@@ -2055,7 +2038,7 @@ rescan:
}
/* Current node exhausted, pop back up */
- p = node_parent((struct node *)tn);
+ p = node_parent_rcu((struct node *)tn);
if (p) {
cindex = tkey_extract_bits(tn->key, p->pos, p->bits)+1;
tn = p;
@@ -2108,10 +2091,17 @@ static void trie_collect_stats(struct trie *t, struct trie_stat *s)
for (n = fib_trie_get_first(&iter, t); n;
n = fib_trie_get_next(&iter)) {
if (IS_LEAF(n)) {
+ struct leaf *l = (struct leaf *)n;
+ struct leaf_info *li;
+ struct hlist_node *tmp;
+
s->leaves++;
s->totdepth += iter.depth;
if (iter.depth > s->maxdepth)
s->maxdepth = iter.depth;
+
+ hlist_for_each_entry_rcu(li, tmp, &l->list, hlist)
+ ++s->prefixes;
} else {
const struct tnode *tn = (const struct tnode *) n;
int i;
@@ -2140,13 +2130,17 @@ static void trie_show_stats(struct seq_file *seq, struct trie_stat *stat)
else
avdepth = 0;
- seq_printf(seq, "\tAver depth: %d.%02d\n", avdepth / 100, avdepth % 100 );
+ seq_printf(seq, "\tAver depth: %u.%02d\n",
+ avdepth / 100, avdepth % 100);
seq_printf(seq, "\tMax depth: %u\n", stat->maxdepth);
seq_printf(seq, "\tLeaves: %u\n", stat->leaves);
-
bytes = sizeof(struct leaf) * stat->leaves;
- seq_printf(seq, "\tInternal nodes: %d\n\t", stat->tnodes);
+
+ seq_printf(seq, "\tPrefixes: %u\n", stat->prefixes);
+ bytes += sizeof(struct leaf_info) * stat->prefixes;
+
+ seq_printf(seq, "\tInternal nodes: %u\n\t", stat->tnodes);
bytes += sizeof(struct tnode) * stat->tnodes;
max = MAX_STAT_DEPTH;
@@ -2156,60 +2150,89 @@ static void trie_show_stats(struct seq_file *seq, struct trie_stat *stat)
pointers = 0;
for (i = 1; i <= max; i++)
if (stat->nodesizes[i] != 0) {
- seq_printf(seq, " %d: %d", i, stat->nodesizes[i]);
+ seq_printf(seq, " %u: %u", i, stat->nodesizes[i]);
pointers += (1<<i) * stat->nodesizes[i];
}
seq_putc(seq, '\n');
- seq_printf(seq, "\tPointers: %d\n", pointers);
+ seq_printf(seq, "\tPointers: %u\n", pointers);
bytes += sizeof(struct node *) * pointers;
- seq_printf(seq, "Null ptrs: %d\n", stat->nullpointers);
- seq_printf(seq, "Total size: %d kB\n", (bytes + 1023) / 1024);
+ seq_printf(seq, "Null ptrs: %u\n", stat->nullpointers);
+ seq_printf(seq, "Total size: %u kB\n", (bytes + 1023) / 1024);
+}
#ifdef CONFIG_IP_FIB_TRIE_STATS
- seq_printf(seq, "Counters:\n---------\n");
- seq_printf(seq,"gets = %d\n", t->stats.gets);
- seq_printf(seq,"backtracks = %d\n", t->stats.backtrack);
- seq_printf(seq,"semantic match passed = %d\n", t->stats.semantic_match_passed);
- seq_printf(seq,"semantic match miss = %d\n", t->stats.semantic_match_miss);
- seq_printf(seq,"null node hit= %d\n", t->stats.null_node_hit);
- seq_printf(seq,"skipped node resize = %d\n", t->stats.resize_node_skipped);
-#ifdef CLEAR_STATS
- memset(&(t->stats), 0, sizeof(t->stats));
-#endif
+static void trie_show_usage(struct seq_file *seq,
+ const struct trie_use_stats *stats)
+{
+ seq_printf(seq, "\nCounters:\n---------\n");
+ seq_printf(seq, "gets = %u\n", stats->gets);
+ seq_printf(seq, "backtracks = %u\n", stats->backtrack);
+ seq_printf(seq, "semantic match passed = %u\n",
+ stats->semantic_match_passed);
+ seq_printf(seq, "semantic match miss = %u\n",
+ stats->semantic_match_miss);
+ seq_printf(seq, "null node hit= %u\n", stats->null_node_hit);
+ seq_printf(seq, "skipped node resize = %u\n\n",
+ stats->resize_node_skipped);
+}
#endif /* CONFIG_IP_FIB_TRIE_STATS */
+
+static void fib_trie_show(struct seq_file *seq, const char *name,
+ struct trie *trie)
+{
+ struct trie_stat stat;
+
+ trie_collect_stats(trie, &stat);
+ seq_printf(seq, "%s:\n", name);
+ trie_show_stats(seq, &stat);
+#ifdef CONFIG_IP_FIB_TRIE_STATS
+ trie_show_usage(seq, &trie->stats);
+#endif
}
static int fib_triestat_seq_show(struct seq_file *seq, void *v)
{
- struct trie_stat *stat;
-
- stat = kmalloc(sizeof(*stat), GFP_KERNEL);
- if (!stat)
- return -ENOMEM;
+ struct net *net = (struct net *)seq->private;
+ struct fib_table *tb;
- seq_printf(seq, "Basic info: size of leaf: %Zd bytes, size of tnode: %Zd bytes.\n",
+ seq_printf(seq,
+ "Basic info: size of leaf:"
+ " %Zd bytes, size of tnode: %Zd bytes.\n",
sizeof(struct leaf), sizeof(struct tnode));
- if (trie_local) {
- seq_printf(seq, "Local:\n");
- trie_collect_stats(trie_local, stat);
- trie_show_stats(seq, stat);
- }
+ tb = fib_get_table(net, RT_TABLE_LOCAL);
+ if (tb)
+ fib_trie_show(seq, "Local", (struct trie *) tb->tb_data);
- if (trie_main) {
- seq_printf(seq, "Main:\n");
- trie_collect_stats(trie_main, stat);
- trie_show_stats(seq, stat);
- }
- kfree(stat);
+ tb = fib_get_table(net, RT_TABLE_MAIN);
+ if (tb)
+ fib_trie_show(seq, "Main", (struct trie *) tb->tb_data);
return 0;
}
static int fib_triestat_seq_open(struct inode *inode, struct file *file)
{
- return single_open(file, fib_triestat_seq_show, NULL);
+ int err;
+ struct net *net;
+
+ net = get_proc_net(inode);
+ if (net == NULL)
+ return -ENXIO;
+ err = single_open(file, fib_triestat_seq_show, net);
+ if (err < 0) {
+ put_net(net);
+ return err;
+ }
+ return 0;
+}
+
+static int fib_triestat_seq_release(struct inode *ino, struct file *f)
+{
+ struct seq_file *seq = f->private_data;
+ put_net(seq->private);
+ return single_release(ino, f);
}
static const struct file_operations fib_triestat_fops = {
@@ -2217,7 +2240,7 @@ static const struct file_operations fib_triestat_fops = {
.open = fib_triestat_seq_open,
.read = seq_read,
.llseek = seq_lseek,
- .release = single_release,
+ .release = fib_triestat_seq_release,
};
static struct node *fib_trie_get_idx(struct fib_trie_iter *iter,
@@ -2226,13 +2249,13 @@ static struct node *fib_trie_get_idx(struct fib_trie_iter *iter,
loff_t idx = 0;
struct node *n;
- for (n = fib_trie_get_first(iter, trie_local);
+ for (n = fib_trie_get_first(iter, iter->trie_local);
n; ++idx, n = fib_trie_get_next(iter)) {
if (pos == idx)
return n;
}
- for (n = fib_trie_get_first(iter, trie_main);
+ for (n = fib_trie_get_first(iter, iter->trie_main);
n; ++idx, n = fib_trie_get_next(iter)) {
if (pos == idx)
return n;
@@ -2241,11 +2264,25 @@ static struct node *fib_trie_get_idx(struct fib_trie_iter *iter,
}
static void *fib_trie_seq_start(struct seq_file *seq, loff_t *pos)
+ __acquires(RCU)
{
+ struct fib_trie_iter *iter = seq->private;
+ struct fib_table *tb;
+
+ if (!iter->trie_local) {
+ tb = fib_get_table(iter->p.net, RT_TABLE_LOCAL);
+ if (tb)
+ iter->trie_local = (struct trie *) tb->tb_data;
+ }
+ if (!iter->trie_main) {
+ tb = fib_get_table(iter->p.net, RT_TABLE_MAIN);
+ if (tb)
+ iter->trie_main = (struct trie *) tb->tb_data;
+ }
rcu_read_lock();
if (*pos == 0)
return SEQ_START_TOKEN;
- return fib_trie_get_idx(seq->private, *pos - 1);
+ return fib_trie_get_idx(iter, *pos - 1);
}
static void *fib_trie_seq_next(struct seq_file *seq, void *v, loff_t *pos)
@@ -2263,13 +2300,14 @@ static void *fib_trie_seq_next(struct seq_file *seq, void *v, loff_t *pos)
return v;
/* continue scan in next trie */
- if (iter->trie == trie_local)
- return fib_trie_get_first(iter, trie_main);
+ if (iter->trie == iter->trie_local)
+ return fib_trie_get_first(iter, iter->trie_main);
return NULL;
}
static void fib_trie_seq_stop(struct seq_file *seq, void *v)
+ __releases(RCU)
{
rcu_read_unlock();
}
@@ -2279,10 +2317,8 @@ static void seq_indent(struct seq_file *seq, int n)
while (n-- > 0) seq_puts(seq, " ");
}
-static inline const char *rtn_scope(enum rt_scope_t s)
+static inline const char *rtn_scope(char *buf, size_t len, enum rt_scope_t s)
{
- static char buf[32];
-
switch (s) {
case RT_SCOPE_UNIVERSE: return "universe";
case RT_SCOPE_SITE: return "site";
@@ -2290,7 +2326,7 @@ static inline const char *rtn_scope(enum rt_scope_t s)
case RT_SCOPE_HOST: return "host";
case RT_SCOPE_NOWHERE: return "nowhere";
default:
- snprintf(buf, sizeof(buf), "scope=%d", s);
+ snprintf(buf, len, "scope=%d", s);
return buf;
}
}
@@ -2310,13 +2346,11 @@ static const char *rtn_type_names[__RTN_MAX] = {
[RTN_XRESOLVE] = "XRESOLVE",
};
-static inline const char *rtn_type(unsigned t)
+static inline const char *rtn_type(char *buf, size_t len, unsigned t)
{
- static char buf[32];
-
if (t < __RTN_MAX && rtn_type_names[t])
return rtn_type_names[t];
- snprintf(buf, sizeof(buf), "type %d", t);
+ snprintf(buf, len, "type %u", t);
return buf;
}
@@ -2329,8 +2363,8 @@ static int fib_trie_seq_show(struct seq_file *seq, void *v)
if (v == SEQ_START_TOKEN)
return 0;
- if (!node_parent(n)) {
- if (iter->trie == trie_local)
+ if (!node_parent_rcu(n)) {
+ if (iter->trie == iter->trie_local)
seq_puts(seq, "<local>:\n");
else
seq_puts(seq, "<main>:\n");
@@ -2347,25 +2381,29 @@ static int fib_trie_seq_show(struct seq_file *seq, void *v)
} else {
struct leaf *l = (struct leaf *) n;
- int i;
+ struct leaf_info *li;
+ struct hlist_node *node;
__be32 val = htonl(l->key);
seq_indent(seq, iter->depth);
seq_printf(seq, " |-- %d.%d.%d.%d\n", NIPQUAD(val));
- for (i = 32; i >= 0; i--) {
- struct leaf_info *li = find_leaf_info(l, i);
- if (li) {
- struct fib_alias *fa;
- list_for_each_entry_rcu(fa, &li->falh, fa_list) {
- seq_indent(seq, iter->depth+1);
- seq_printf(seq, " /%d %s %s", i,
- rtn_scope(fa->fa_scope),
- rtn_type(fa->fa_type));
- if (fa->fa_tos)
- seq_printf(seq, "tos =%d\n",
- fa->fa_tos);
- seq_putc(seq, '\n');
- }
+
+ hlist_for_each_entry_rcu(li, node, &l->list, hlist) {
+ struct fib_alias *fa;
+
+ list_for_each_entry_rcu(fa, &li->falh, fa_list) {
+ char buf1[32], buf2[32];
+
+ seq_indent(seq, iter->depth+1);
+ seq_printf(seq, " /%d %s %s", li->plen,
+ rtn_scope(buf1, sizeof(buf1),
+ fa->fa_scope),
+ rtn_type(buf2, sizeof(buf2),
+ fa->fa_type));
+ if (fa->fa_tos)
+ seq_printf(seq, "tos =%d\n",
+ fa->fa_tos);
+ seq_putc(seq, '\n');
}
}
}
@@ -2382,8 +2420,8 @@ static const struct seq_operations fib_trie_seq_ops = {
static int fib_trie_seq_open(struct inode *inode, struct file *file)
{
- return seq_open_private(file, &fib_trie_seq_ops,
- sizeof(struct fib_trie_iter));
+ return seq_open_net(inode, file, &fib_trie_seq_ops,
+ sizeof(struct fib_trie_iter));
}
static const struct file_operations fib_trie_fops = {
@@ -2391,7 +2429,7 @@ static const struct file_operations fib_trie_fops = {
.open = fib_trie_seq_open,
.read = seq_read,
.llseek = seq_lseek,
- .release = seq_release_private,
+ .release = seq_release_net,
};
static unsigned fib_flag_trans(int type, __be32 mask, const struct fib_info *fi)
@@ -2419,8 +2457,8 @@ static int fib_route_seq_show(struct seq_file *seq, void *v)
{
const struct fib_trie_iter *iter = seq->private;
struct leaf *l = v;
- int i;
- char bf[128];
+ struct leaf_info *li;
+ struct hlist_node *node;
if (v == SEQ_START_TOKEN) {
seq_printf(seq, "%-127s\n", "Iface\tDestination\tGateway "
@@ -2429,25 +2467,23 @@ static int fib_route_seq_show(struct seq_file *seq, void *v)
return 0;
}
- if (iter->trie == trie_local)
+ if (iter->trie == iter->trie_local)
return 0;
+
if (IS_TNODE(l))
return 0;
- for (i=32; i>=0; i--) {
- struct leaf_info *li = find_leaf_info(l, i);
+ hlist_for_each_entry_rcu(li, node, &l->list, hlist) {
struct fib_alias *fa;
__be32 mask, prefix;
- if (!li)
- continue;
-
mask = inet_make_mask(li->plen);
prefix = htonl(l->key);
list_for_each_entry_rcu(fa, &li->falh, fa_list) {
const struct fib_info *fi = fa->fa_info;
unsigned flags = fib_flag_trans(fa->fa_type, mask, fi);
+ char bf[128];
if (fa->fa_type == RTN_BROADCAST
|| fa->fa_type == RTN_MULTICAST)
@@ -2461,7 +2497,8 @@ static int fib_route_seq_show(struct seq_file *seq, void *v)
fi->fib_nh->nh_gw, flags, 0, 0,
fi->fib_priority,
mask,
- (fi->fib_advmss ? fi->fib_advmss + 40 : 0),
+ (fi->fib_advmss ?
+ fi->fib_advmss + 40 : 0),
fi->fib_window,
fi->fib_rtt >> 3);
else
@@ -2486,8 +2523,8 @@ static const struct seq_operations fib_route_seq_ops = {
static int fib_route_seq_open(struct inode *inode, struct file *file)
{
- return seq_open_private(file, &fib_route_seq_ops,
- sizeof(struct fib_trie_iter));
+ return seq_open_net(inode, file, &fib_route_seq_ops,
+ sizeof(struct fib_trie_iter));
}
static const struct file_operations fib_route_fops = {
@@ -2495,35 +2532,36 @@ static const struct file_operations fib_route_fops = {
.open = fib_route_seq_open,
.read = seq_read,
.llseek = seq_lseek,
- .release = seq_release_private,
+ .release = seq_release_net,
};
-int __init fib_proc_init(void)
+int __net_init fib_proc_init(struct net *net)
{
- if (!proc_net_fops_create(&init_net, "fib_trie", S_IRUGO, &fib_trie_fops))
+ if (!proc_net_fops_create(net, "fib_trie", S_IRUGO, &fib_trie_fops))
goto out1;
- if (!proc_net_fops_create(&init_net, "fib_triestat", S_IRUGO, &fib_triestat_fops))
+ if (!proc_net_fops_create(net, "fib_triestat", S_IRUGO,
+ &fib_triestat_fops))
goto out2;
- if (!proc_net_fops_create(&init_net, "route", S_IRUGO, &fib_route_fops))
+ if (!proc_net_fops_create(net, "route", S_IRUGO, &fib_route_fops))
goto out3;
return 0;
out3:
- proc_net_remove(&init_net, "fib_triestat");
+ proc_net_remove(net, "fib_triestat");
out2:
- proc_net_remove(&init_net, "fib_trie");
+ proc_net_remove(net, "fib_trie");
out1:
return -ENOMEM;
}
-void __init fib_proc_exit(void)
+void __net_exit fib_proc_exit(struct net *net)
{
- proc_net_remove(&init_net, "fib_trie");
- proc_net_remove(&init_net, "fib_triestat");
- proc_net_remove(&init_net, "route");
+ proc_net_remove(net, "fib_trie");
+ proc_net_remove(net, "fib_triestat");
+ proc_net_remove(net, "route");
}
#endif /* CONFIG_PROC_FS */
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index 82baea026484..a7321a82df6d 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -92,6 +92,7 @@
#include <asm/system.h>
#include <asm/uaccess.h>
#include <net/checksum.h>
+#include <net/xfrm.h>
/*
* Build xmit assembly blocks
@@ -231,7 +232,7 @@ static const struct icmp_control icmp_pointers[NR_ICMP_TYPES+1];
static DEFINE_PER_CPU(struct socket *, __icmp_socket) = NULL;
#define icmp_socket __get_cpu_var(__icmp_socket)
-static __inline__ int icmp_xmit_lock(void)
+static inline int icmp_xmit_lock(void)
{
local_bh_disable();
@@ -245,7 +246,7 @@ static __inline__ int icmp_xmit_lock(void)
return 0;
}
-static void icmp_xmit_unlock(void)
+static inline void icmp_xmit_unlock(void)
{
spin_unlock_bh(&icmp_socket->sk->sk_lock.slock);
}
@@ -274,18 +275,19 @@ static void icmp_xmit_unlock(void)
#define XRLIM_BURST_FACTOR 6
int xrlim_allow(struct dst_entry *dst, int timeout)
{
- unsigned long now;
+ unsigned long now, token = dst->rate_tokens;
int rc = 0;
now = jiffies;
- dst->rate_tokens += now - dst->rate_last;
+ token += now - dst->rate_last;
dst->rate_last = now;
- if (dst->rate_tokens > XRLIM_BURST_FACTOR * timeout)
- dst->rate_tokens = XRLIM_BURST_FACTOR * timeout;
- if (dst->rate_tokens >= timeout) {
- dst->rate_tokens -= timeout;
+ if (token > XRLIM_BURST_FACTOR * timeout)
+ token = XRLIM_BURST_FACTOR * timeout;
+ if (token >= timeout) {
+ token -= timeout;
rc = 1;
}
+ dst->rate_tokens = token;
return rc;
}
@@ -403,7 +405,7 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb)
.tos = RT_TOS(ip_hdr(skb)->tos) } },
.proto = IPPROTO_ICMP };
security_skb_classify_flow(skb, &fl);
- if (ip_route_output_key(&rt, &fl))
+ if (ip_route_output_key(rt->u.dst.dev->nd_net, &rt, &fl))
goto out_unlock;
}
if (icmpv4_xrlim_allow(rt, icmp_param->data.icmph.type,
@@ -435,9 +437,11 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)
struct ipcm_cookie ipc;
__be32 saddr;
u8 tos;
+ struct net *net;
if (!rt)
goto out;
+ net = rt->u.dst.dev->nd_net;
/*
* Find the original header. It is expected to be valid, of course.
@@ -513,7 +517,7 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)
struct net_device *dev = NULL;
if (rt->fl.iif && sysctl_icmp_errors_use_inbound_ifaddr)
- dev = dev_get_by_index(&init_net, rt->fl.iif);
+ dev = dev_get_by_index(net, rt->fl.iif);
if (dev) {
saddr = inet_select_addr(dev, 0, RT_SCOPE_LINK);
@@ -563,11 +567,71 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)
}
}
};
+ int err;
+ struct rtable *rt2;
+
security_skb_classify_flow(skb_in, &fl);
- if (ip_route_output_key(&rt, &fl))
+ if (__ip_route_output_key(net, &rt, &fl))
+ goto out_unlock;
+
+ /* No need to clone since we're just using its address. */
+ rt2 = rt;
+
+ err = xfrm_lookup((struct dst_entry **)&rt, &fl, NULL, 0);
+ switch (err) {
+ case 0:
+ if (rt != rt2)
+ goto route_done;
+ break;
+ case -EPERM:
+ rt = NULL;
+ break;
+ default:
+ goto out_unlock;
+ }
+
+ if (xfrm_decode_session_reverse(skb_in, &fl, AF_INET))
+ goto out_unlock;
+
+ if (inet_addr_type(net, fl.fl4_src) == RTN_LOCAL)
+ err = __ip_route_output_key(net, &rt2, &fl);
+ else {
+ struct flowi fl2 = {};
+ struct dst_entry *odst;
+
+ fl2.fl4_dst = fl.fl4_src;
+ if (ip_route_output_key(net, &rt2, &fl2))
+ goto out_unlock;
+
+ /* Ugh! */
+ odst = skb_in->dst;
+ err = ip_route_input(skb_in, fl.fl4_dst, fl.fl4_src,
+ RT_TOS(tos), rt2->u.dst.dev);
+
+ dst_release(&rt2->u.dst);
+ rt2 = (struct rtable *)skb_in->dst;
+ skb_in->dst = odst;
+ }
+
+ if (err)
+ goto out_unlock;
+
+ err = xfrm_lookup((struct dst_entry **)&rt2, &fl, NULL,
+ XFRM_LOOKUP_ICMP);
+ if (err == -ENOENT) {
+ if (!rt)
+ goto out_unlock;
+ goto route_done;
+ }
+
+ dst_release(&rt->u.dst);
+ rt = rt2;
+
+ if (err)
goto out_unlock;
}
+route_done:
if (!icmpv4_xrlim_allow(rt, type, code))
goto ende;
@@ -603,8 +667,10 @@ static void icmp_unreach(struct sk_buff *skb)
struct icmphdr *icmph;
int hash, protocol;
struct net_protocol *ipprot;
- struct sock *raw_sk;
u32 info = 0;
+ struct net *net;
+
+ net = skb->dst->dev->nd_net;
/*
* Incomplete header ?
@@ -635,7 +701,7 @@ static void icmp_unreach(struct sk_buff *skb)
"and DF set.\n",
NIPQUAD(iph->daddr));
} else {
- info = ip_rt_frag_needed(iph,
+ info = ip_rt_frag_needed(net, iph,
ntohs(icmph->un.frag.mtu));
if (!info)
goto out;
@@ -673,7 +739,7 @@ static void icmp_unreach(struct sk_buff *skb)
*/
if (!sysctl_icmp_ignore_bogus_error_responses &&
- inet_addr_type(iph->daddr) == RTN_BROADCAST) {
+ inet_addr_type(net, iph->daddr) == RTN_BROADCAST) {
if (net_ratelimit())
printk(KERN_WARNING "%u.%u.%u.%u sent an invalid ICMP "
"type %u, code %u "
@@ -697,21 +763,9 @@ static void icmp_unreach(struct sk_buff *skb)
/*
* Deliver ICMP message to raw sockets. Pretty useless feature?
*/
+ raw_icmp_error(skb, protocol, info);
- /* Note: See raw.c and net/raw.h, RAWV4_HTABLE_SIZE==MAX_INET_PROTOS */
hash = protocol & (MAX_INET_PROTOS - 1);
- read_lock(&raw_v4_lock);
- if ((raw_sk = sk_head(&raw_v4_htable[hash])) != NULL) {
- while ((raw_sk = __raw_v4_lookup(raw_sk, protocol, iph->daddr,
- iph->saddr,
- skb->dev->ifindex)) != NULL) {
- raw_err(raw_sk, skb, info);
- raw_sk = sk_next(raw_sk);
- iph = (struct iphdr *)skb->data;
- }
- }
- read_unlock(&raw_v4_lock);
-
rcu_read_lock();
ipprot = rcu_dereference(inet_protos[hash]);
if (ipprot && ipprot->err_handler)
@@ -929,6 +983,25 @@ int icmp_rcv(struct sk_buff *skb)
struct icmphdr *icmph;
struct rtable *rt = (struct rtable *)skb->dst;
+ if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
+ int nh;
+
+ if (!(skb->sp && skb->sp->xvec[skb->sp->len - 1]->props.flags &
+ XFRM_STATE_ICMP))
+ goto drop;
+
+ if (!pskb_may_pull(skb, sizeof(*icmph) + sizeof(struct iphdr)))
+ goto drop;
+
+ nh = skb_network_offset(skb);
+ skb_set_network_header(skb, sizeof(*icmph));
+
+ if (!xfrm4_policy_check_reverse(NULL, XFRM_POLICY_IN, skb))
+ goto drop;
+
+ skb_set_network_header(skb, nh);
+ }
+
ICMP_INC_STATS_BH(ICMP_MIB_INMSGS);
switch (skb->ip_summed) {
@@ -942,8 +1015,7 @@ int icmp_rcv(struct sk_buff *skb)
goto error;
}
- if (!pskb_pull(skb, sizeof(struct icmphdr)))
- goto error;
+ __skb_pull(skb, sizeof(*icmph));
icmph = icmp_hdr(skb);
diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
index 7dbc282d4f9f..994648be80ab 100644
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -130,12 +130,12 @@
*/
#define IGMP_V1_SEEN(in_dev) \
- (IPV4_DEVCONF_ALL(FORCE_IGMP_VERSION) == 1 || \
+ (IPV4_DEVCONF_ALL(in_dev->dev->nd_net, FORCE_IGMP_VERSION) == 1 || \
IN_DEV_CONF_GET((in_dev), FORCE_IGMP_VERSION) == 1 || \
((in_dev)->mr_v1_seen && \
time_before(jiffies, (in_dev)->mr_v1_seen)))
#define IGMP_V2_SEEN(in_dev) \
- (IPV4_DEVCONF_ALL(FORCE_IGMP_VERSION) == 2 || \
+ (IPV4_DEVCONF_ALL(in_dev->dev->nd_net, FORCE_IGMP_VERSION) == 2 || \
IN_DEV_CONF_GET((in_dev), FORCE_IGMP_VERSION) == 2 || \
((in_dev)->mr_v2_seen && \
time_before(jiffies, (in_dev)->mr_v2_seen)))
@@ -301,7 +301,7 @@ static struct sk_buff *igmpv3_newpack(struct net_device *dev, int size)
.nl_u = { .ip4_u = {
.daddr = IGMPV3_ALL_MCR } },
.proto = IPPROTO_IGMP };
- if (ip_route_output_key(&rt, &fl)) {
+ if (ip_route_output_key(&init_net, &rt, &fl)) {
kfree_skb(skb);
return NULL;
}
@@ -349,17 +349,12 @@ static struct sk_buff *igmpv3_newpack(struct net_device *dev, int size)
static int igmpv3_sendpack(struct sk_buff *skb)
{
- struct iphdr *pip = ip_hdr(skb);
struct igmphdr *pig = igmp_hdr(skb);
- const int iplen = skb->tail - skb->network_header;
const int igmplen = skb->tail - skb->transport_header;
- pip->tot_len = htons(iplen);
- ip_send_check(pip);
pig->csum = ip_compute_csum(igmp_hdr(skb), igmplen);
- return NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, skb->dev,
- dst_output);
+ return ip_local_out(skb);
}
static int grec_size(struct ip_mc_list *pmc, int type, int gdel, int sdel)
@@ -650,7 +645,7 @@ static int igmp_send_report(struct in_device *in_dev, struct ip_mc_list *pmc,
struct flowi fl = { .oif = dev->ifindex,
.nl_u = { .ip4_u = { .daddr = dst } },
.proto = IPPROTO_IGMP };
- if (ip_route_output_key(&rt, &fl))
+ if (ip_route_output_key(&init_net, &rt, &fl))
return -1;
}
if (rt->rt_src == 0) {
@@ -680,13 +675,11 @@ static int igmp_send_report(struct in_device *in_dev, struct ip_mc_list *pmc,
iph->daddr = dst;
iph->saddr = rt->rt_src;
iph->protocol = IPPROTO_IGMP;
- iph->tot_len = htons(IGMP_SIZE);
ip_select_ident(iph, &rt->u.dst, NULL);
((u8*)&iph[1])[0] = IPOPT_RA;
((u8*)&iph[1])[1] = 4;
((u8*)&iph[1])[2] = 0;
((u8*)&iph[1])[3] = 0;
- ip_send_check(iph);
ih = (struct igmphdr *)skb_put(skb, sizeof(struct igmphdr));
ih->type=type;
@@ -695,8 +688,7 @@ static int igmp_send_report(struct in_device *in_dev, struct ip_mc_list *pmc,
ih->group=group;
ih->csum=ip_compute_csum((void *)ih, sizeof(struct igmphdr));
- return NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, rt->u.dst.dev,
- dst_output);
+ return ip_local_out(skb);
}
static void igmp_gq_timer_expire(unsigned long data)
@@ -1234,9 +1226,7 @@ void ip_mc_inc_group(struct in_device *in_dev, __be32 addr)
spin_lock_init(&im->lock);
#ifdef CONFIG_IP_MULTICAST
im->tm_running=0;
- init_timer(&im->timer);
- im->timer.data=(unsigned long)im;
- im->timer.function=&igmp_timer_expire;
+ setup_timer(&im->timer, &igmp_timer_expire, (unsigned long)im);
im->unsolicit_count = IGMP_Unsolicited_Report_Count;
im->reporter = 0;
im->gsquery = 0;
@@ -1338,13 +1328,11 @@ void ip_mc_init_dev(struct in_device *in_dev)
in_dev->mc_tomb = NULL;
#ifdef CONFIG_IP_MULTICAST
in_dev->mr_gq_running = 0;
- init_timer(&in_dev->mr_gq_timer);
- in_dev->mr_gq_timer.data=(unsigned long) in_dev;
- in_dev->mr_gq_timer.function=&igmp_gq_timer_expire;
+ setup_timer(&in_dev->mr_gq_timer, igmp_gq_timer_expire,
+ (unsigned long)in_dev);
in_dev->mr_ifc_count = 0;
- init_timer(&in_dev->mr_ifc_timer);
- in_dev->mr_ifc_timer.data=(unsigned long) in_dev;
- in_dev->mr_ifc_timer.function=&igmp_ifc_timer_expire;
+ setup_timer(&in_dev->mr_ifc_timer, igmp_ifc_timer_expire,
+ (unsigned long)in_dev);
in_dev->mr_qrv = IGMP_Unsolicited_Report_Count;
#endif
@@ -1401,19 +1389,19 @@ static struct in_device * ip_mc_find_dev(struct ip_mreqn *imr)
struct in_device *idev = NULL;
if (imr->imr_ifindex) {
- idev = inetdev_by_index(imr->imr_ifindex);
+ idev = inetdev_by_index(&init_net, imr->imr_ifindex);
if (idev)
__in_dev_put(idev);
return idev;
}
if (imr->imr_address.s_addr) {
- dev = ip_dev_find(imr->imr_address.s_addr);
+ dev = ip_dev_find(&init_net, imr->imr_address.s_addr);
if (!dev)
return NULL;
dev_put(dev);
}
- if (!dev && !ip_route_output_key(&rt, &fl)) {
+ if (!dev && !ip_route_output_key(&init_net, &rt, &fl)) {
dev = rt->u.dst.dev;
ip_rt_put(rt);
}
@@ -1754,7 +1742,7 @@ int ip_mc_join_group(struct sock *sk , struct ip_mreqn *imr)
int ifindex;
int count = 0;
- if (!MULTICAST(addr))
+ if (!ipv4_is_multicast(addr))
return -EINVAL;
rtnl_lock();
@@ -1867,7 +1855,7 @@ int ip_mc_source(int add, int omode, struct sock *sk, struct
int leavegroup = 0;
int i, j, rv;
- if (!MULTICAST(addr))
+ if (!ipv4_is_multicast(addr))
return -EINVAL;
rtnl_lock();
@@ -1997,7 +1985,7 @@ int ip_mc_msfilter(struct sock *sk, struct ip_msfilter *msf, int ifindex)
struct ip_sf_socklist *newpsl, *psl;
int leavegroup = 0;
- if (!MULTICAST(addr))
+ if (!ipv4_is_multicast(addr))
return -EINVAL;
if (msf->imsf_fmode != MCAST_INCLUDE &&
msf->imsf_fmode != MCAST_EXCLUDE)
@@ -2080,7 +2068,7 @@ int ip_mc_msfget(struct sock *sk, struct ip_msfilter *msf,
struct inet_sock *inet = inet_sk(sk);
struct ip_sf_socklist *psl;
- if (!MULTICAST(addr))
+ if (!ipv4_is_multicast(addr))
return -EINVAL;
rtnl_lock();
@@ -2142,7 +2130,7 @@ int ip_mc_gsfget(struct sock *sk, struct group_filter *gsf,
if (psin->sin_family != AF_INET)
return -EINVAL;
addr = psin->sin_addr.s_addr;
- if (!MULTICAST(addr))
+ if (!ipv4_is_multicast(addr))
return -EINVAL;
rtnl_lock();
@@ -2192,7 +2180,7 @@ int ip_mc_sf_allow(struct sock *sk, __be32 loc_addr, __be32 rmt_addr, int dif)
struct ip_sf_socklist *psl;
int i;
- if (!MULTICAST(loc_addr))
+ if (!ipv4_is_multicast(loc_addr))
return 1;
for (pmc=inet->mc_list; pmc; pmc=pmc->next) {
@@ -2234,7 +2222,7 @@ void ip_mc_drop_socket(struct sock *sk)
struct in_device *in_dev;
inet->mc_list = iml->next;
- in_dev = inetdev_by_index(iml->multi.imr_ifindex);
+ in_dev = inetdev_by_index(&init_net, iml->multi.imr_ifindex);
(void) ip_mc_leave_src(sk, iml, in_dev);
if (in_dev != NULL) {
ip_mc_dec_group(in_dev, iml->multi.imr_multiaddr.s_addr);
@@ -2341,6 +2329,7 @@ static struct ip_mc_list *igmp_mc_get_idx(struct seq_file *seq, loff_t pos)
}
static void *igmp_mc_seq_start(struct seq_file *seq, loff_t *pos)
+ __acquires(dev_base_lock)
{
read_lock(&dev_base_lock);
return *pos ? igmp_mc_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
@@ -2358,6 +2347,7 @@ static void *igmp_mc_seq_next(struct seq_file *seq, void *v, loff_t *pos)
}
static void igmp_mc_seq_stop(struct seq_file *seq, void *v)
+ __releases(dev_base_lock)
{
struct igmp_mc_iter_state *state = igmp_mc_seq_private(seq);
if (likely(state->in_dev != NULL)) {
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index 8fb6ca23700a..7801cceb2d1b 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -277,18 +277,11 @@ void inet_csk_init_xmit_timers(struct sock *sk,
{
struct inet_connection_sock *icsk = inet_csk(sk);
- init_timer(&icsk->icsk_retransmit_timer);
- init_timer(&icsk->icsk_delack_timer);
- init_timer(&sk->sk_timer);
-
- icsk->icsk_retransmit_timer.function = retransmit_handler;
- icsk->icsk_delack_timer.function = delack_handler;
- sk->sk_timer.function = keepalive_handler;
-
- icsk->icsk_retransmit_timer.data =
- icsk->icsk_delack_timer.data =
- sk->sk_timer.data = (unsigned long)sk;
-
+ setup_timer(&icsk->icsk_retransmit_timer, retransmit_handler,
+ (unsigned long)sk);
+ setup_timer(&icsk->icsk_delack_timer, delack_handler,
+ (unsigned long)sk);
+ setup_timer(&sk->sk_timer, keepalive_handler, (unsigned long)sk);
icsk->icsk_pending = icsk->icsk_ack.pending = 0;
}
@@ -340,7 +333,7 @@ struct dst_entry* inet_csk_route_req(struct sock *sk,
.dport = ireq->rmt_port } } };
security_req_classify_flow(req, &fl);
- if (ip_route_output_flow(&rt, &fl, sk, 0)) {
+ if (ip_route_output_flow(&init_net, &rt, &fl, sk, 0)) {
IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
return NULL;
}
diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
index e468e7a7aac4..605ed2cd7972 100644
--- a/net/ipv4/inet_diag.c
+++ b/net/ipv4/inet_diag.c
@@ -935,7 +935,7 @@ out_free_table:
static void __exit inet_diag_exit(void)
{
- sock_release(idiagnl->sk_socket);
+ netlink_kernel_release(idiagnl);
kfree(inet_diag_table);
}
diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c
index e15e04fc6661..724d69aed031 100644
--- a/net/ipv4/inet_fragment.c
+++ b/net/ipv4/inet_fragment.c
@@ -47,7 +47,7 @@ static void inet_frag_secret_rebuild(unsigned long dummy)
}
write_unlock(&f->lock);
- mod_timer(&f->secret_timer, now + f->ctl->secret_interval);
+ mod_timer(&f->secret_timer, now + f->secret_interval);
}
void inet_frags_init(struct inet_frags *f)
@@ -57,35 +57,45 @@ void inet_frags_init(struct inet_frags *f)
for (i = 0; i < INETFRAGS_HASHSZ; i++)
INIT_HLIST_HEAD(&f->hash[i]);
- INIT_LIST_HEAD(&f->lru_list);
rwlock_init(&f->lock);
f->rnd = (u32) ((num_physpages ^ (num_physpages>>7)) ^
(jiffies ^ (jiffies >> 6)));
- f->nqueues = 0;
- atomic_set(&f->mem, 0);
-
- init_timer(&f->secret_timer);
- f->secret_timer.function = inet_frag_secret_rebuild;
- f->secret_timer.data = (unsigned long)f;
- f->secret_timer.expires = jiffies + f->ctl->secret_interval;
+ setup_timer(&f->secret_timer, inet_frag_secret_rebuild,
+ (unsigned long)f);
+ f->secret_timer.expires = jiffies + f->secret_interval;
add_timer(&f->secret_timer);
}
EXPORT_SYMBOL(inet_frags_init);
+void inet_frags_init_net(struct netns_frags *nf)
+{
+ nf->nqueues = 0;
+ atomic_set(&nf->mem, 0);
+ INIT_LIST_HEAD(&nf->lru_list);
+}
+EXPORT_SYMBOL(inet_frags_init_net);
+
void inet_frags_fini(struct inet_frags *f)
{
del_timer(&f->secret_timer);
}
EXPORT_SYMBOL(inet_frags_fini);
+void inet_frags_exit_net(struct netns_frags *nf, struct inet_frags *f)
+{
+ nf->low_thresh = 0;
+ inet_frag_evictor(nf, f);
+}
+EXPORT_SYMBOL(inet_frags_exit_net);
+
static inline void fq_unlink(struct inet_frag_queue *fq, struct inet_frags *f)
{
write_lock(&f->lock);
hlist_del(&fq->list);
list_del(&fq->lru_list);
- f->nqueues--;
+ fq->net->nqueues--;
write_unlock(&f->lock);
}
@@ -103,13 +113,13 @@ void inet_frag_kill(struct inet_frag_queue *fq, struct inet_frags *f)
EXPORT_SYMBOL(inet_frag_kill);
-static inline void frag_kfree_skb(struct inet_frags *f, struct sk_buff *skb,
- int *work)
+static inline void frag_kfree_skb(struct netns_frags *nf, struct inet_frags *f,
+ struct sk_buff *skb, int *work)
{
if (work)
*work -= skb->truesize;
- atomic_sub(skb->truesize, &f->mem);
+ atomic_sub(skb->truesize, &nf->mem);
if (f->skb_free)
f->skb_free(skb);
kfree_skb(skb);
@@ -119,22 +129,24 @@ void inet_frag_destroy(struct inet_frag_queue *q, struct inet_frags *f,
int *work)
{
struct sk_buff *fp;
+ struct netns_frags *nf;
BUG_TRAP(q->last_in & COMPLETE);
BUG_TRAP(del_timer(&q->timer) == 0);
/* Release all fragment data. */
fp = q->fragments;
+ nf = q->net;
while (fp) {
struct sk_buff *xp = fp->next;
- frag_kfree_skb(f, fp, work);
+ frag_kfree_skb(nf, f, fp, work);
fp = xp;
}
if (work)
*work -= f->qsize;
- atomic_sub(f->qsize, &f->mem);
+ atomic_sub(f->qsize, &nf->mem);
if (f->destructor)
f->destructor(q);
@@ -143,20 +155,20 @@ void inet_frag_destroy(struct inet_frag_queue *q, struct inet_frags *f,
}
EXPORT_SYMBOL(inet_frag_destroy);
-int inet_frag_evictor(struct inet_frags *f)
+int inet_frag_evictor(struct netns_frags *nf, struct inet_frags *f)
{
struct inet_frag_queue *q;
int work, evicted = 0;
- work = atomic_read(&f->mem) - f->ctl->low_thresh;
+ work = atomic_read(&nf->mem) - nf->low_thresh;
while (work > 0) {
read_lock(&f->lock);
- if (list_empty(&f->lru_list)) {
+ if (list_empty(&nf->lru_list)) {
read_unlock(&f->lock);
break;
}
- q = list_first_entry(&f->lru_list,
+ q = list_first_entry(&nf->lru_list,
struct inet_frag_queue, lru_list);
atomic_inc(&q->refcnt);
read_unlock(&f->lock);
@@ -175,8 +187,9 @@ int inet_frag_evictor(struct inet_frags *f)
}
EXPORT_SYMBOL(inet_frag_evictor);
-static struct inet_frag_queue *inet_frag_intern(struct inet_frag_queue *qp_in,
- struct inet_frags *f, unsigned int hash, void *arg)
+static struct inet_frag_queue *inet_frag_intern(struct netns_frags *nf,
+ struct inet_frag_queue *qp_in, struct inet_frags *f,
+ unsigned int hash, void *arg)
{
struct inet_frag_queue *qp;
#ifdef CONFIG_SMP
@@ -190,7 +203,7 @@ static struct inet_frag_queue *inet_frag_intern(struct inet_frag_queue *qp_in,
* promoted read lock to write lock.
*/
hlist_for_each_entry(qp, n, &f->hash[hash], list) {
- if (f->match(qp, arg)) {
+ if (qp->net == nf && f->match(qp, arg)) {
atomic_inc(&qp->refcnt);
write_unlock(&f->lock);
qp_in->last_in |= COMPLETE;
@@ -200,18 +213,19 @@ static struct inet_frag_queue *inet_frag_intern(struct inet_frag_queue *qp_in,
}
#endif
qp = qp_in;
- if (!mod_timer(&qp->timer, jiffies + f->ctl->timeout))
+ if (!mod_timer(&qp->timer, jiffies + nf->timeout))
atomic_inc(&qp->refcnt);
atomic_inc(&qp->refcnt);
hlist_add_head(&qp->list, &f->hash[hash]);
- list_add_tail(&qp->lru_list, &f->lru_list);
- f->nqueues++;
+ list_add_tail(&qp->lru_list, &nf->lru_list);
+ nf->nqueues++;
write_unlock(&f->lock);
return qp;
}
-static struct inet_frag_queue *inet_frag_alloc(struct inet_frags *f, void *arg)
+static struct inet_frag_queue *inet_frag_alloc(struct netns_frags *nf,
+ struct inet_frags *f, void *arg)
{
struct inet_frag_queue *q;
@@ -220,35 +234,36 @@ static struct inet_frag_queue *inet_frag_alloc(struct inet_frags *f, void *arg)
return NULL;
f->constructor(q, arg);
- atomic_add(f->qsize, &f->mem);
+ atomic_add(f->qsize, &nf->mem);
setup_timer(&q->timer, f->frag_expire, (unsigned long)q);
spin_lock_init(&q->lock);
atomic_set(&q->refcnt, 1);
+ q->net = nf;
return q;
}
-static struct inet_frag_queue *inet_frag_create(struct inet_frags *f,
- void *arg, unsigned int hash)
+static struct inet_frag_queue *inet_frag_create(struct netns_frags *nf,
+ struct inet_frags *f, void *arg, unsigned int hash)
{
struct inet_frag_queue *q;
- q = inet_frag_alloc(f, arg);
+ q = inet_frag_alloc(nf, f, arg);
if (q == NULL)
return NULL;
- return inet_frag_intern(q, f, hash, arg);
+ return inet_frag_intern(nf, q, f, hash, arg);
}
-struct inet_frag_queue *inet_frag_find(struct inet_frags *f, void *key,
- unsigned int hash)
+struct inet_frag_queue *inet_frag_find(struct netns_frags *nf,
+ struct inet_frags *f, void *key, unsigned int hash)
{
struct inet_frag_queue *q;
struct hlist_node *n;
read_lock(&f->lock);
hlist_for_each_entry(q, n, &f->hash[hash], list) {
- if (f->match(q, key)) {
+ if (q->net == nf && f->match(q, key)) {
atomic_inc(&q->refcnt);
read_unlock(&f->lock);
return q;
@@ -256,6 +271,6 @@ struct inet_frag_queue *inet_frag_find(struct inet_frags *f, void *key,
}
read_unlock(&f->lock);
- return inet_frag_create(f, key, hash);
+ return inet_frag_create(nf, f, key, hash);
}
EXPORT_SYMBOL(inet_frag_find);
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index 67704da04fc4..619c63c6948a 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -96,6 +96,7 @@ EXPORT_SYMBOL(inet_put_port);
* exclusive lock release). It should be ifdefed really.
*/
void inet_listen_wlock(struct inet_hashinfo *hashinfo)
+ __acquires(hashinfo->lhash_lock)
{
write_lock(&hashinfo->lhash_lock);
@@ -190,6 +191,44 @@ sherry_cache:
}
EXPORT_SYMBOL_GPL(__inet_lookup_listener);
+struct sock * __inet_lookup_established(struct inet_hashinfo *hashinfo,
+ const __be32 saddr, const __be16 sport,
+ const __be32 daddr, const u16 hnum,
+ const int dif)
+{
+ INET_ADDR_COOKIE(acookie, saddr, daddr)
+ const __portpair ports = INET_COMBINED_PORTS(sport, hnum);
+ struct sock *sk;
+ const struct hlist_node *node;
+ /* Optimize here for direct hit, only listening connections can
+ * have wildcards anyways.
+ */
+ unsigned int hash = inet_ehashfn(daddr, hnum, saddr, sport);
+ struct inet_ehash_bucket *head = inet_ehash_bucket(hashinfo, hash);
+ rwlock_t *lock = inet_ehash_lockp(hashinfo, hash);
+
+ prefetch(head->chain.first);
+ read_lock(lock);
+ sk_for_each(sk, node, &head->chain) {
+ if (INET_MATCH(sk, hash, acookie, saddr, daddr, ports, dif))
+ goto hit; /* You sunk my battleship! */
+ }
+
+ /* Must check for a TIME_WAIT'er before going to listener hash. */
+ sk_for_each(sk, node, &head->twchain) {
+ if (INET_TW_MATCH(sk, hash, acookie, saddr, daddr, ports, dif))
+ goto hit;
+ }
+ sk = NULL;
+out:
+ read_unlock(lock);
+ return sk;
+hit:
+ sock_hold(sk);
+ goto out;
+}
+EXPORT_SYMBOL_GPL(__inet_lookup_established);
+
/* called with local bh disabled */
static int __inet_check_established(struct inet_timewait_death_row *death_row,
struct sock *sk, __u16 lport,
@@ -239,7 +278,7 @@ unique:
sk->sk_hash = hash;
BUG_TRAP(sk_unhashed(sk));
__sk_add_node(sk, &head->chain);
- sock_prot_inc_use(sk->sk_prot);
+ sock_prot_inuse_add(sk->sk_prot, 1);
write_unlock(lock);
if (twp) {
@@ -267,6 +306,48 @@ static inline u32 inet_sk_port_offset(const struct sock *sk)
inet->dport);
}
+void __inet_hash_nolisten(struct inet_hashinfo *hashinfo, struct sock *sk)
+{
+ struct hlist_head *list;
+ rwlock_t *lock;
+ struct inet_ehash_bucket *head;
+
+ BUG_TRAP(sk_unhashed(sk));
+
+ sk->sk_hash = inet_sk_ehashfn(sk);
+ head = inet_ehash_bucket(hashinfo, sk->sk_hash);
+ list = &head->chain;
+ lock = inet_ehash_lockp(hashinfo, sk->sk_hash);
+
+ write_lock(lock);
+ __sk_add_node(sk, list);
+ sock_prot_inuse_add(sk->sk_prot, 1);
+ write_unlock(lock);
+}
+EXPORT_SYMBOL_GPL(__inet_hash_nolisten);
+
+void __inet_hash(struct inet_hashinfo *hashinfo, struct sock *sk)
+{
+ struct hlist_head *list;
+ rwlock_t *lock;
+
+ if (sk->sk_state != TCP_LISTEN) {
+ __inet_hash_nolisten(hashinfo, sk);
+ return;
+ }
+
+ BUG_TRAP(sk_unhashed(sk));
+ list = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)];
+ lock = &hashinfo->lhash_lock;
+
+ inet_listen_wlock(hashinfo);
+ __sk_add_node(sk, list);
+ sock_prot_inuse_add(sk->sk_prot, 1);
+ write_unlock(lock);
+ wake_up(&hashinfo->lhash_wait);
+}
+EXPORT_SYMBOL_GPL(__inet_hash);
+
/*
* Bind a port for a connect operation and hash it.
*/
@@ -334,7 +415,7 @@ ok:
inet_bind_hash(sk, tb, port);
if (sk_unhashed(sk)) {
inet_sk(sk)->sport = htons(port);
- __inet_hash(hinfo, sk, 0);
+ __inet_hash_nolisten(hinfo, sk);
}
spin_unlock(&head->lock);
@@ -351,7 +432,7 @@ ok:
tb = inet_csk(sk)->icsk_bind_hash;
spin_lock_bh(&head->lock);
if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) {
- __inet_hash(hinfo, sk, 0);
+ __inet_hash_nolisten(hinfo, sk);
spin_unlock_bh(&head->lock);
return 0;
} else {
diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c
index a60b99e0ebdc..876169f3a528 100644
--- a/net/ipv4/inet_timewait_sock.c
+++ b/net/ipv4/inet_timewait_sock.c
@@ -48,6 +48,21 @@ static void __inet_twsk_kill(struct inet_timewait_sock *tw,
inet_twsk_put(tw);
}
+void inet_twsk_put(struct inet_timewait_sock *tw)
+{
+ if (atomic_dec_and_test(&tw->tw_refcnt)) {
+ struct module *owner = tw->tw_prot->owner;
+ twsk_destructor((struct sock *)tw);
+#ifdef SOCK_REFCNT_DEBUG
+ printk(KERN_DEBUG "%s timewait_sock %p released\n",
+ tw->tw_prot->name, tw);
+#endif
+ kmem_cache_free(tw->tw_prot->twsk_prot->twsk_slab, tw);
+ module_put(owner);
+ }
+}
+EXPORT_SYMBOL_GPL(inet_twsk_put);
+
/*
* Enter the time wait state. This is called with locally disabled BH.
* Essentially we whip up a timewait bucket, copy the relevant info into it
@@ -76,7 +91,7 @@ void __inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk,
/* Step 2: Remove SK from established hash. */
if (__sk_del_node_init(sk))
- sock_prot_dec_use(sk->sk_prot);
+ sock_prot_inuse_add(sk->sk_prot, -1);
/* Step 3: Hash TW into TIMEWAIT chain. */
inet_twsk_add_node(tw, &ehead->twchain);
@@ -194,16 +209,14 @@ out:
EXPORT_SYMBOL_GPL(inet_twdr_hangman);
-extern void twkill_slots_invalid(void);
-
void inet_twdr_twkill_work(struct work_struct *work)
{
struct inet_timewait_death_row *twdr =
container_of(work, struct inet_timewait_death_row, twkill_work);
int i;
- if ((INET_TWDR_TWKILL_SLOTS - 1) > (sizeof(twdr->thread_slots) * 8))
- twkill_slots_invalid();
+ BUILD_BUG_ON((INET_TWDR_TWKILL_SLOTS - 1) >
+ (sizeof(twdr->thread_slots) * 8));
while (twdr->thread_slots) {
spin_lock_bh(&twdr->death_lock);
diff --git a/net/ipv4/ip_forward.c b/net/ipv4/ip_forward.c
index 877da3ed52e2..0b3b328d82db 100644
--- a/net/ipv4/ip_forward.c
+++ b/net/ipv4/ip_forward.c
@@ -110,7 +110,7 @@ int ip_forward(struct sk_buff *skb)
skb->priority = rt_tos2priority(iph->tos);
- return NF_HOOK(PF_INET, NF_IP_FORWARD, skb, skb->dev, rt->u.dst.dev,
+ return NF_HOOK(PF_INET, NF_INET_FORWARD, skb, skb->dev, rt->u.dst.dev,
ip_forward_finish);
sr_failed:
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index 2143bf30597a..a2e92f9709db 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -50,7 +50,7 @@
* as well. Or notify me, at least. --ANK
*/
-int sysctl_ipfrag_max_dist __read_mostly = 64;
+static int sysctl_ipfrag_max_dist __read_mostly = 64;
struct ipfrag_skb_cb
{
@@ -74,35 +74,16 @@ struct ipq {
struct inet_peer *peer;
};
-struct inet_frags_ctl ip4_frags_ctl __read_mostly = {
- /*
- * Fragment cache limits. We will commit 256K at one time. Should we
- * cross that limit we will prune down to 192K. This should cope with
- * even the most extreme cases without allowing an attacker to
- * measurably harm machine performance.
- */
- .high_thresh = 256 * 1024,
- .low_thresh = 192 * 1024,
-
- /*
- * Important NOTE! Fragment queue must be destroyed before MSL expires.
- * RFC791 is wrong proposing to prolongate timer each fragment arrival
- * by TTL.
- */
- .timeout = IP_FRAG_TIME,
- .secret_interval = 10 * 60 * HZ,
-};
-
static struct inet_frags ip4_frags;
-int ip_frag_nqueues(void)
+int ip_frag_nqueues(struct net *net)
{
- return ip4_frags.nqueues;
+ return net->ipv4.frags.nqueues;
}
-int ip_frag_mem(void)
+int ip_frag_mem(struct net *net)
{
- return atomic_read(&ip4_frags.mem);
+ return atomic_read(&net->ipv4.frags.mem);
}
static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
@@ -142,11 +123,12 @@ static int ip4_frag_match(struct inet_frag_queue *q, void *a)
}
/* Memory Tracking Functions. */
-static __inline__ void frag_kfree_skb(struct sk_buff *skb, int *work)
+static __inline__ void frag_kfree_skb(struct netns_frags *nf,
+ struct sk_buff *skb, int *work)
{
if (work)
*work -= skb->truesize;
- atomic_sub(skb->truesize, &ip4_frags.mem);
+ atomic_sub(skb->truesize, &nf->mem);
kfree_skb(skb);
}
@@ -192,11 +174,11 @@ static void ipq_kill(struct ipq *ipq)
/* Memory limiting on fragments. Evictor trashes the oldest
* fragment queue until we are back under the threshold.
*/
-static void ip_evictor(void)
+static void ip_evictor(struct net *net)
{
int evicted;
- evicted = inet_frag_evictor(&ip4_frags);
+ evicted = inet_frag_evictor(&net->ipv4.frags, &ip4_frags);
if (evicted)
IP_ADD_STATS_BH(IPSTATS_MIB_REASMFAILS, evicted);
}
@@ -236,7 +218,7 @@ out:
/* Find the correct entry in the "incomplete datagrams" queue for
* this IP datagram, and create new one, if nothing is found.
*/
-static inline struct ipq *ip_find(struct iphdr *iph, u32 user)
+static inline struct ipq *ip_find(struct net *net, struct iphdr *iph, u32 user)
{
struct inet_frag_queue *q;
struct ip4_create_arg arg;
@@ -246,7 +228,7 @@ static inline struct ipq *ip_find(struct iphdr *iph, u32 user)
arg.user = user;
hash = ipqhashfn(iph->id, iph->saddr, iph->daddr, iph->protocol);
- q = inet_frag_find(&ip4_frags, &arg, hash);
+ q = inet_frag_find(&net->ipv4.frags, &ip4_frags, &arg, hash);
if (q == NULL)
goto out_nomem;
@@ -286,7 +268,7 @@ static int ip_frag_reinit(struct ipq *qp)
{
struct sk_buff *fp;
- if (!mod_timer(&qp->q.timer, jiffies + ip4_frags_ctl.timeout)) {
+ if (!mod_timer(&qp->q.timer, jiffies + qp->q.net->timeout)) {
atomic_inc(&qp->q.refcnt);
return -ETIMEDOUT;
}
@@ -294,7 +276,7 @@ static int ip_frag_reinit(struct ipq *qp)
fp = qp->q.fragments;
do {
struct sk_buff *xp = fp->next;
- frag_kfree_skb(fp, NULL);
+ frag_kfree_skb(qp->q.net, fp, NULL);
fp = xp;
} while (fp);
@@ -431,7 +413,7 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
qp->q.fragments = next;
qp->q.meat -= free_it->len;
- frag_kfree_skb(free_it, NULL);
+ frag_kfree_skb(qp->q.net, free_it, NULL);
}
}
@@ -451,7 +433,7 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
}
qp->q.stamp = skb->tstamp;
qp->q.meat += skb->len;
- atomic_add(skb->truesize, &ip4_frags.mem);
+ atomic_add(skb->truesize, &qp->q.net->mem);
if (offset == 0)
qp->q.last_in |= FIRST_IN;
@@ -459,7 +441,7 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
return ip_frag_reasm(qp, prev, dev);
write_lock(&ip4_frags.lock);
- list_move_tail(&qp->q.lru_list, &ip4_frags.lru_list);
+ list_move_tail(&qp->q.lru_list, &qp->q.net->lru_list);
write_unlock(&ip4_frags.lock);
return -EINPROGRESS;
@@ -534,12 +516,12 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
head->len -= clone->len;
clone->csum = 0;
clone->ip_summed = head->ip_summed;
- atomic_add(clone->truesize, &ip4_frags.mem);
+ atomic_add(clone->truesize, &qp->q.net->mem);
}
skb_shinfo(head)->frag_list = head->next;
skb_push(head, head->data - skb_network_header(head));
- atomic_sub(head->truesize, &ip4_frags.mem);
+ atomic_sub(head->truesize, &qp->q.net->mem);
for (fp=head->next; fp; fp = fp->next) {
head->data_len += fp->len;
@@ -549,7 +531,7 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
else if (head->ip_summed == CHECKSUM_COMPLETE)
head->csum = csum_add(head->csum, fp->csum);
head->truesize += fp->truesize;
- atomic_sub(fp->truesize, &ip4_frags.mem);
+ atomic_sub(fp->truesize, &qp->q.net->mem);
}
head->next = NULL;
@@ -582,15 +564,17 @@ out_fail:
int ip_defrag(struct sk_buff *skb, u32 user)
{
struct ipq *qp;
+ struct net *net;
IP_INC_STATS_BH(IPSTATS_MIB_REASMREQDS);
+ net = skb->dev->nd_net;
/* Start by cleaning up the memory. */
- if (atomic_read(&ip4_frags.mem) > ip4_frags_ctl.high_thresh)
- ip_evictor();
+ if (atomic_read(&net->ipv4.frags.mem) > net->ipv4.frags.high_thresh)
+ ip_evictor(net);
/* Lookup (or create) queue header */
- if ((qp = ip_find(ip_hdr(skb), user)) != NULL) {
+ if ((qp = ip_find(net, ip_hdr(skb), user)) != NULL) {
int ret;
spin_lock(&qp->q.lock);
@@ -607,9 +591,142 @@ int ip_defrag(struct sk_buff *skb, u32 user)
return -ENOMEM;
}
+#ifdef CONFIG_SYSCTL
+static int zero;
+
+static struct ctl_table ip4_frags_ctl_table[] = {
+ {
+ .ctl_name = NET_IPV4_IPFRAG_HIGH_THRESH,
+ .procname = "ipfrag_high_thresh",
+ .data = &init_net.ipv4.frags.high_thresh,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec
+ },
+ {
+ .ctl_name = NET_IPV4_IPFRAG_LOW_THRESH,
+ .procname = "ipfrag_low_thresh",
+ .data = &init_net.ipv4.frags.low_thresh,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec
+ },
+ {
+ .ctl_name = NET_IPV4_IPFRAG_TIME,
+ .procname = "ipfrag_time",
+ .data = &init_net.ipv4.frags.timeout,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec_jiffies,
+ .strategy = &sysctl_jiffies
+ },
+ {
+ .ctl_name = NET_IPV4_IPFRAG_SECRET_INTERVAL,
+ .procname = "ipfrag_secret_interval",
+ .data = &ip4_frags.secret_interval,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec_jiffies,
+ .strategy = &sysctl_jiffies
+ },
+ {
+ .procname = "ipfrag_max_dist",
+ .data = &sysctl_ipfrag_max_dist,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec_minmax,
+ .extra1 = &zero
+ },
+ { }
+};
+
+static int ip4_frags_ctl_register(struct net *net)
+{
+ struct ctl_table *table;
+ struct ctl_table_header *hdr;
+
+ table = ip4_frags_ctl_table;
+ if (net != &init_net) {
+ table = kmemdup(table, sizeof(ip4_frags_ctl_table), GFP_KERNEL);
+ if (table == NULL)
+ goto err_alloc;
+
+ table[0].data = &net->ipv4.frags.high_thresh;
+ table[1].data = &net->ipv4.frags.low_thresh;
+ table[2].data = &net->ipv4.frags.timeout;
+ table[3].mode &= ~0222;
+ table[4].mode &= ~0222;
+ }
+
+ hdr = register_net_sysctl_table(net, net_ipv4_ctl_path, table);
+ if (hdr == NULL)
+ goto err_reg;
+
+ net->ipv4.frags_hdr = hdr;
+ return 0;
+
+err_reg:
+ if (net != &init_net)
+ kfree(table);
+err_alloc:
+ return -ENOMEM;
+}
+
+static void ip4_frags_ctl_unregister(struct net *net)
+{
+ struct ctl_table *table;
+
+ table = net->ipv4.frags_hdr->ctl_table_arg;
+ unregister_net_sysctl_table(net->ipv4.frags_hdr);
+ kfree(table);
+}
+#else
+static inline int ip4_frags_ctl_register(struct net *net)
+{
+ return 0;
+}
+
+static inline void ip4_frags_ctl_unregister(struct net *net)
+{
+}
+#endif
+
+static int ipv4_frags_init_net(struct net *net)
+{
+ /*
+ * Fragment cache limits. We will commit 256K at one time. Should we
+ * cross that limit we will prune down to 192K. This should cope with
+ * even the most extreme cases without allowing an attacker to
+ * measurably harm machine performance.
+ */
+ net->ipv4.frags.high_thresh = 256 * 1024;
+ net->ipv4.frags.low_thresh = 192 * 1024;
+ /*
+ * Important NOTE! Fragment queue must be destroyed before MSL expires.
+ * RFC791 is wrong proposing to prolongate timer each fragment arrival
+ * by TTL.
+ */
+ net->ipv4.frags.timeout = IP_FRAG_TIME;
+
+ inet_frags_init_net(&net->ipv4.frags);
+
+ return ip4_frags_ctl_register(net);
+}
+
+static void ipv4_frags_exit_net(struct net *net)
+{
+ ip4_frags_ctl_unregister(net);
+ inet_frags_exit_net(&net->ipv4.frags, &ip4_frags);
+}
+
+static struct pernet_operations ip4_frags_ops = {
+ .init = ipv4_frags_init_net,
+ .exit = ipv4_frags_exit_net,
+};
+
void __init ipfrag_init(void)
{
- ip4_frags.ctl = &ip4_frags_ctl;
+ register_pernet_subsys(&ip4_frags_ops);
ip4_frags.hashfn = ip4_hashfn;
ip4_frags.constructor = ip4_frag_init;
ip4_frags.destructor = ip4_frag_free;
@@ -617,6 +734,7 @@ void __init ipfrag_init(void)
ip4_frags.qsize = sizeof(struct ipq);
ip4_frags.match = ip4_frag_match;
ip4_frags.frag_expire = ip_expire;
+ ip4_frags.secret_interval = 10 * 60 * HZ;
inet_frags_init(&ip4_frags);
}
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index 4b93f32de10d..63f691719353 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -176,7 +176,8 @@ static struct ip_tunnel * ipgre_tunnel_lookup(__be32 remote, __be32 local, __be3
}
for (t = tunnels_l[h1]; t; t = t->next) {
if (local == t->parms.iph.saddr ||
- (local == t->parms.iph.daddr && MULTICAST(local))) {
+ (local == t->parms.iph.daddr &&
+ ipv4_is_multicast(local))) {
if (t->parms.i_key == key && (t->dev->flags&IFF_UP))
return t;
}
@@ -201,7 +202,7 @@ static struct ip_tunnel **__ipgre_bucket(struct ip_tunnel_parm *parms)
if (local)
prio |= 1;
- if (remote && !MULTICAST(remote)) {
+ if (remote && !ipv4_is_multicast(remote)) {
prio |= 2;
h ^= HASH(remote);
}
@@ -367,7 +368,8 @@ static void ipgre_err(struct sk_buff *skb, u32 info)
read_lock(&ipgre_lock);
t = ipgre_tunnel_lookup(iph->daddr, iph->saddr, (flags&GRE_KEY) ? *(((__be32*)p) + (grehlen>>2) - 1) : 0);
- if (t == NULL || t->parms.iph.daddr == 0 || MULTICAST(t->parms.iph.daddr))
+ if (t == NULL || t->parms.iph.daddr == 0 ||
+ ipv4_is_multicast(t->parms.iph.daddr))
goto out;
if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED)
@@ -478,7 +480,7 @@ out:
fl.fl4_dst = eiph->saddr;
fl.fl4_tos = RT_TOS(eiph->tos);
fl.proto = IPPROTO_GRE;
- if (ip_route_output_key(&rt, &fl)) {
+ if (ip_route_output_key(&init_net, &rt, &fl)) {
kfree_skb(skb2);
return;
}
@@ -491,7 +493,7 @@ out:
fl.fl4_dst = eiph->daddr;
fl.fl4_src = eiph->saddr;
fl.fl4_tos = eiph->tos;
- if (ip_route_output_key(&rt, &fl) ||
+ if (ip_route_output_key(&init_net, &rt, &fl) ||
rt->u.dst.dev->type != ARPHRD_IPGRE) {
ip_rt_put(rt);
kfree_skb(skb2);
@@ -619,7 +621,7 @@ static int ipgre_rcv(struct sk_buff *skb)
skb_postpull_rcsum(skb, skb_transport_header(skb), offset);
skb->pkt_type = PACKET_HOST;
#ifdef CONFIG_NET_IPGRE_BROADCAST
- if (MULTICAST(iph->daddr)) {
+ if (ipv4_is_multicast(iph->daddr)) {
/* Looped back packet, drop it! */
if (((struct rtable*)skb->dst)->fl.iif == 0)
goto drop;
@@ -746,7 +748,7 @@ static int ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
.saddr = tiph->saddr,
.tos = RT_TOS(tos) } },
.proto = IPPROTO_GRE };
- if (ip_route_output_key(&rt, &fl)) {
+ if (ip_route_output_key(&init_net, &rt, &fl)) {
tunnel->stat.tx_carrier_errors++;
goto tx_error;
}
@@ -783,7 +785,8 @@ static int ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
struct rt6_info *rt6 = (struct rt6_info*)skb->dst;
if (rt6 && mtu < dst_mtu(skb->dst) && mtu >= IPV6_MIN_MTU) {
- if ((tunnel->parms.iph.daddr && !MULTICAST(tunnel->parms.iph.daddr)) ||
+ if ((tunnel->parms.iph.daddr &&
+ !ipv4_is_multicast(tunnel->parms.iph.daddr)) ||
rt6->rt6i_dst.plen == 128) {
rt6->rt6i_flags |= RTF_MODIFIED;
skb->dst->metrics[RTAX_MTU-1] = mtu;
@@ -896,6 +899,59 @@ tx_error:
return 0;
}
+static void ipgre_tunnel_bind_dev(struct net_device *dev)
+{
+ struct net_device *tdev = NULL;
+ struct ip_tunnel *tunnel;
+ struct iphdr *iph;
+ int hlen = LL_MAX_HEADER;
+ int mtu = ETH_DATA_LEN;
+ int addend = sizeof(struct iphdr) + 4;
+
+ tunnel = netdev_priv(dev);
+ iph = &tunnel->parms.iph;
+
+ /* Guess output device to choose reasonable mtu and hard_header_len */
+
+ if (iph->daddr) {
+ struct flowi fl = { .oif = tunnel->parms.link,
+ .nl_u = { .ip4_u =
+ { .daddr = iph->daddr,
+ .saddr = iph->saddr,
+ .tos = RT_TOS(iph->tos) } },
+ .proto = IPPROTO_GRE };
+ struct rtable *rt;
+ if (!ip_route_output_key(&init_net, &rt, &fl)) {
+ tdev = rt->u.dst.dev;
+ ip_rt_put(rt);
+ }
+ dev->flags |= IFF_POINTOPOINT;
+ }
+
+ if (!tdev && tunnel->parms.link)
+ tdev = __dev_get_by_index(&init_net, tunnel->parms.link);
+
+ if (tdev) {
+ hlen = tdev->hard_header_len;
+ mtu = tdev->mtu;
+ }
+ dev->iflink = tunnel->parms.link;
+
+ /* Precalculate GRE options length */
+ if (tunnel->parms.o_flags&(GRE_CSUM|GRE_KEY|GRE_SEQ)) {
+ if (tunnel->parms.o_flags&GRE_CSUM)
+ addend += 4;
+ if (tunnel->parms.o_flags&GRE_KEY)
+ addend += 4;
+ if (tunnel->parms.o_flags&GRE_SEQ)
+ addend += 4;
+ }
+ dev->hard_header_len = hlen + addend;
+ dev->mtu = mtu - addend;
+ tunnel->hlen = addend;
+
+}
+
static int
ipgre_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd)
{
@@ -956,7 +1012,7 @@ ipgre_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd)
t = netdev_priv(dev);
- if (MULTICAST(p.iph.daddr))
+ if (ipv4_is_multicast(p.iph.daddr))
nflags = IFF_BROADCAST;
else if (p.iph.daddr)
nflags = IFF_POINTOPOINT;
@@ -983,6 +1039,11 @@ ipgre_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd)
t->parms.iph.ttl = p.iph.ttl;
t->parms.iph.tos = p.iph.tos;
t->parms.iph.frag_off = p.iph.frag_off;
+ if (t->parms.link != p.link) {
+ t->parms.link = p.link;
+ ipgre_tunnel_bind_dev(dev);
+ netdev_state_change(dev);
+ }
}
if (copy_to_user(ifr->ifr_ifru.ifru_data, &t->parms, sizeof(p)))
err = -EFAULT;
@@ -1085,7 +1146,7 @@ static int ipgre_header(struct sk_buff *skb, struct net_device *dev,
memcpy(&iph->daddr, daddr, 4);
return t->hlen;
}
- if (iph->daddr && !MULTICAST(iph->daddr))
+ if (iph->daddr && !ipv4_is_multicast(iph->daddr))
return t->hlen;
return -t->hlen;
@@ -1108,7 +1169,7 @@ static int ipgre_open(struct net_device *dev)
{
struct ip_tunnel *t = netdev_priv(dev);
- if (MULTICAST(t->parms.iph.daddr)) {
+ if (ipv4_is_multicast(t->parms.iph.daddr)) {
struct flowi fl = { .oif = t->parms.link,
.nl_u = { .ip4_u =
{ .daddr = t->parms.iph.daddr,
@@ -1116,7 +1177,7 @@ static int ipgre_open(struct net_device *dev)
.tos = RT_TOS(t->parms.iph.tos) } },
.proto = IPPROTO_GRE };
struct rtable *rt;
- if (ip_route_output_key(&rt, &fl))
+ if (ip_route_output_key(&init_net, &rt, &fl))
return -EADDRNOTAVAIL;
dev = rt->u.dst.dev;
ip_rt_put(rt);
@@ -1131,8 +1192,9 @@ static int ipgre_open(struct net_device *dev)
static int ipgre_close(struct net_device *dev)
{
struct ip_tunnel *t = netdev_priv(dev);
- if (MULTICAST(t->parms.iph.daddr) && t->mlink) {
- struct in_device *in_dev = inetdev_by_index(t->mlink);
+ if (ipv4_is_multicast(t->parms.iph.daddr) && t->mlink) {
+ struct in_device *in_dev;
+ in_dev = inetdev_by_index(dev->nd_net, t->mlink);
if (in_dev) {
ip_mc_dec_group(in_dev, t->parms.iph.daddr);
in_dev_put(in_dev);
@@ -1162,12 +1224,8 @@ static void ipgre_tunnel_setup(struct net_device *dev)
static int ipgre_tunnel_init(struct net_device *dev)
{
- struct net_device *tdev = NULL;
struct ip_tunnel *tunnel;
struct iphdr *iph;
- int hlen = LL_MAX_HEADER;
- int mtu = ETH_DATA_LEN;
- int addend = sizeof(struct iphdr) + 4;
tunnel = netdev_priv(dev);
iph = &tunnel->parms.iph;
@@ -1178,25 +1236,11 @@ static int ipgre_tunnel_init(struct net_device *dev)
memcpy(dev->dev_addr, &tunnel->parms.iph.saddr, 4);
memcpy(dev->broadcast, &tunnel->parms.iph.daddr, 4);
- /* Guess output device to choose reasonable mtu and hard_header_len */
+ ipgre_tunnel_bind_dev(dev);
if (iph->daddr) {
- struct flowi fl = { .oif = tunnel->parms.link,
- .nl_u = { .ip4_u =
- { .daddr = iph->daddr,
- .saddr = iph->saddr,
- .tos = RT_TOS(iph->tos) } },
- .proto = IPPROTO_GRE };
- struct rtable *rt;
- if (!ip_route_output_key(&rt, &fl)) {
- tdev = rt->u.dst.dev;
- ip_rt_put(rt);
- }
-
- dev->flags |= IFF_POINTOPOINT;
-
#ifdef CONFIG_NET_IPGRE_BROADCAST
- if (MULTICAST(iph->daddr)) {
+ if (ipv4_is_multicast(iph->daddr)) {
if (!iph->saddr)
return -EINVAL;
dev->flags = IFF_BROADCAST;
@@ -1205,31 +1249,9 @@ static int ipgre_tunnel_init(struct net_device *dev)
dev->stop = ipgre_close;
}
#endif
- } else {
+ } else
dev->header_ops = &ipgre_header_ops;
- }
-
- if (!tdev && tunnel->parms.link)
- tdev = __dev_get_by_index(&init_net, tunnel->parms.link);
-
- if (tdev) {
- hlen = tdev->hard_header_len;
- mtu = tdev->mtu;
- }
- dev->iflink = tunnel->parms.link;
- /* Precalculate GRE options length */
- if (tunnel->parms.o_flags&(GRE_CSUM|GRE_KEY|GRE_SEQ)) {
- if (tunnel->parms.o_flags&GRE_CSUM)
- addend += 4;
- if (tunnel->parms.o_flags&GRE_KEY)
- addend += 4;
- if (tunnel->parms.o_flags&GRE_SEQ)
- addend += 4;
- }
- dev->hard_header_len = hlen + addend;
- dev->mtu = mtu - addend;
- tunnel->hlen = addend;
return 0;
}
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
index 168c871fcd79..65631391d479 100644
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -204,22 +204,14 @@ static int ip_local_deliver_finish(struct sk_buff *skb)
rcu_read_lock();
{
- /* Note: See raw.c and net/raw.h, RAWV4_HTABLE_SIZE==MAX_INET_PROTOS */
int protocol = ip_hdr(skb)->protocol;
- int hash;
- struct sock *raw_sk;
+ int hash, raw;
struct net_protocol *ipprot;
resubmit:
- hash = protocol & (MAX_INET_PROTOS - 1);
- raw_sk = sk_head(&raw_v4_htable[hash]);
-
- /* If there maybe a raw socket we must check - if not we
- * don't care less
- */
- if (raw_sk && !raw_v4_input(skb, ip_hdr(skb), hash))
- raw_sk = NULL;
+ raw = raw_local_deliver(skb, protocol);
+ hash = protocol & (MAX_INET_PROTOS - 1);
if ((ipprot = rcu_dereference(inet_protos[hash])) != NULL) {
int ret;
@@ -237,7 +229,7 @@ static int ip_local_deliver_finish(struct sk_buff *skb)
}
IP_INC_STATS_BH(IPSTATS_MIB_INDELIVERS);
} else {
- if (!raw_sk) {
+ if (!raw) {
if (xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
IP_INC_STATS_BH(IPSTATS_MIB_INUNKNOWNPROTOS);
icmp_send(skb, ICMP_DEST_UNREACH,
@@ -268,7 +260,7 @@ int ip_local_deliver(struct sk_buff *skb)
return 0;
}
- return NF_HOOK(PF_INET, NF_IP_LOCAL_IN, skb, skb->dev, NULL,
+ return NF_HOOK(PF_INET, NF_INET_LOCAL_IN, skb, skb->dev, NULL,
ip_local_deliver_finish);
}
@@ -347,7 +339,7 @@ static int ip_rcv_finish(struct sk_buff *skb)
#ifdef CONFIG_NET_CLS_ROUTE
if (unlikely(skb->dst->tclassid)) {
- struct ip_rt_acct *st = ip_rt_acct + 256*smp_processor_id();
+ struct ip_rt_acct *st = per_cpu_ptr(ip_rt_acct, smp_processor_id());
u32 idx = skb->dst->tclassid;
st[idx&0xFF].o_packets++;
st[idx&0xFF].o_bytes+=skb->len;
@@ -442,7 +434,7 @@ int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt,
/* Remove any debris in the socket control block */
memset(IPCB(skb), 0, sizeof(struct inet_skb_parm));
- return NF_HOOK(PF_INET, NF_IP_PRE_ROUTING, skb, dev, NULL,
+ return NF_HOOK(PF_INET, NF_INET_PRE_ROUTING, skb, dev, NULL,
ip_rcv_finish);
inhdr_error:
diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c
index 2f14745a9e1f..4d315158fd3c 100644
--- a/net/ipv4/ip_options.c
+++ b/net/ipv4/ip_options.c
@@ -151,7 +151,7 @@ int ip_options_echo(struct ip_options * dopt, struct sk_buff * skb)
__be32 addr;
memcpy(&addr, sptr+soffset-1, 4);
- if (inet_addr_type(addr) != RTN_LOCAL) {
+ if (inet_addr_type(&init_net, addr) != RTN_LOCAL) {
dopt->ts_needtime = 1;
soffset += 8;
}
@@ -400,7 +400,7 @@ int ip_options_compile(struct ip_options * opt, struct sk_buff * skb)
{
__be32 addr;
memcpy(&addr, &optptr[optptr[2]-1], 4);
- if (inet_addr_type(addr) == RTN_UNICAST)
+ if (inet_addr_type(&init_net, addr) == RTN_UNICAST)
break;
if (skb)
timeptr = (__be32*)&optptr[optptr[2]+3];
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index bc9e57550e86..18070ca65771 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -91,6 +91,28 @@ __inline__ void ip_send_check(struct iphdr *iph)
iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
}
+int __ip_local_out(struct sk_buff *skb)
+{
+ struct iphdr *iph = ip_hdr(skb);
+
+ iph->tot_len = htons(skb->len);
+ ip_send_check(iph);
+ return nf_hook(PF_INET, NF_INET_LOCAL_OUT, skb, NULL, skb->dst->dev,
+ dst_output);
+}
+
+int ip_local_out(struct sk_buff *skb)
+{
+ int err;
+
+ err = __ip_local_out(skb);
+ if (likely(err == 1))
+ err = dst_output(skb);
+
+ return err;
+}
+EXPORT_SYMBOL_GPL(ip_local_out);
+
/* dev_loopback_xmit for use with netfilter. */
static int ip_dev_loopback_xmit(struct sk_buff *newskb)
{
@@ -138,20 +160,17 @@ int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk,
iph->daddr = rt->rt_dst;
iph->saddr = rt->rt_src;
iph->protocol = sk->sk_protocol;
- iph->tot_len = htons(skb->len);
ip_select_ident(iph, &rt->u.dst, sk);
if (opt && opt->optlen) {
iph->ihl += opt->optlen>>2;
ip_options_build(skb, opt, daddr, rt, 0);
}
- ip_send_check(iph);
skb->priority = sk->sk_priority;
/* Send it out. */
- return NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, rt->u.dst.dev,
- dst_output);
+ return ip_local_out(skb);
}
EXPORT_SYMBOL_GPL(ip_build_and_send_pkt);
@@ -251,8 +270,8 @@ int ip_mc_output(struct sk_buff *skb)
) {
struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
if (newskb)
- NF_HOOK(PF_INET, NF_IP_POST_ROUTING, newskb, NULL,
- newskb->dev,
+ NF_HOOK(PF_INET, NF_INET_POST_ROUTING, newskb,
+ NULL, newskb->dev,
ip_dev_loopback_xmit);
}
@@ -267,11 +286,11 @@ int ip_mc_output(struct sk_buff *skb)
if (rt->rt_flags&RTCF_BROADCAST) {
struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
if (newskb)
- NF_HOOK(PF_INET, NF_IP_POST_ROUTING, newskb, NULL,
+ NF_HOOK(PF_INET, NF_INET_POST_ROUTING, newskb, NULL,
newskb->dev, ip_dev_loopback_xmit);
}
- return NF_HOOK_COND(PF_INET, NF_IP_POST_ROUTING, skb, NULL, skb->dev,
+ return NF_HOOK_COND(PF_INET, NF_INET_POST_ROUTING, skb, NULL, skb->dev,
ip_finish_output,
!(IPCB(skb)->flags & IPSKB_REROUTED));
}
@@ -285,7 +304,7 @@ int ip_output(struct sk_buff *skb)
skb->dev = dev;
skb->protocol = htons(ETH_P_IP);
- return NF_HOOK_COND(PF_INET, NF_IP_POST_ROUTING, skb, NULL, dev,
+ return NF_HOOK_COND(PF_INET, NF_INET_POST_ROUTING, skb, NULL, dev,
ip_finish_output,
!(IPCB(skb)->flags & IPSKB_REROUTED));
}
@@ -331,7 +350,7 @@ int ip_queue_xmit(struct sk_buff *skb, int ipfragok)
* itself out.
*/
security_sk_classify_flow(sk, &fl);
- if (ip_route_output_flow(&rt, &fl, sk, 0))
+ if (ip_route_output_flow(&init_net, &rt, &fl, sk, 0))
goto no_route;
}
sk_setup_caps(sk, &rt->u.dst);
@@ -347,7 +366,6 @@ packet_routed:
skb_reset_network_header(skb);
iph = ip_hdr(skb);
*((__be16 *)iph) = htons((4 << 12) | (5 << 8) | (inet->tos & 0xff));
- iph->tot_len = htons(skb->len);
if (ip_dont_fragment(sk, &rt->u.dst) && !ipfragok)
iph->frag_off = htons(IP_DF);
else
@@ -366,13 +384,9 @@ packet_routed:
ip_select_ident_more(iph, &rt->u.dst, sk,
(skb_shinfo(skb)->gso_segs ?: 1) - 1);
- /* Add an IP checksum. */
- ip_send_check(iph);
-
skb->priority = sk->sk_priority;
- return NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, rt->u.dst.dev,
- dst_output);
+ return ip_local_out(skb);
no_route:
IP_INC_STATS(IPSTATS_MIB_OUTNOROUTES);
@@ -1262,14 +1276,12 @@ int ip_push_pending_frames(struct sock *sk)
ip_options_build(skb, opt, inet->cork.addr, rt, 0);
}
iph->tos = inet->tos;
- iph->tot_len = htons(skb->len);
iph->frag_off = df;
ip_select_ident(iph, &rt->u.dst, sk);
iph->ttl = ttl;
iph->protocol = sk->sk_protocol;
iph->saddr = rt->rt_src;
iph->daddr = rt->rt_dst;
- ip_send_check(iph);
skb->priority = sk->sk_priority;
skb->dst = dst_clone(&rt->u.dst);
@@ -1279,8 +1291,7 @@ int ip_push_pending_frames(struct sock *sk)
skb_transport_header(skb))->type);
/* Netfilter gets whole the not fragmented skb. */
- err = NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL,
- skb->dst->dev, dst_output);
+ err = ip_local_out(skb);
if (err) {
if (err > 0)
err = inet->recverr ? net_xmit_errno(err) : 0;
@@ -1330,8 +1341,6 @@ static int ip_reply_glue_bits(void *dptr, char *to, int offset,
*
* Should run single threaded per socket because it uses the sock
* structure to pass arguments.
- *
- * LATER: switch from ip_build_xmit to ip_append_*
*/
void ip_send_reply(struct sock *sk, struct sk_buff *skb, struct ip_reply_arg *arg,
unsigned int len)
@@ -1370,7 +1379,7 @@ void ip_send_reply(struct sock *sk, struct sk_buff *skb, struct ip_reply_arg *ar
.dport = tcp_hdr(skb)->source } },
.proto = sk->sk_protocol };
security_skb_classify_flow(skb, &fl);
- if (ip_route_output_key(&rt, &fl))
+ if (ip_route_output_key(sk->sk_net, &rt, &fl))
return;
}
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index 82817e554363..754b0a5bbfe9 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -594,7 +594,7 @@ static int do_ip_setsockopt(struct sock *sk, int level,
err = 0;
break;
}
- dev = ip_dev_find(mreq.imr_address.s_addr);
+ dev = ip_dev_find(&init_net, mreq.imr_address.s_addr);
if (dev) {
mreq.imr_ifindex = dev->ifindex;
dev_put(dev);
diff --git a/net/ipv4/ipcomp.c b/net/ipv4/ipcomp.c
index 2c44a94c2135..f4af99ad8fdb 100644
--- a/net/ipv4/ipcomp.c
+++ b/net/ipv4/ipcomp.c
@@ -182,7 +182,6 @@ static void ipcomp4_err(struct sk_buff *skb, u32 info)
static struct xfrm_state *ipcomp_tunnel_create(struct xfrm_state *x)
{
struct xfrm_state *t;
- u8 mode = XFRM_MODE_TUNNEL;
t = xfrm_state_alloc();
if (t == NULL)
@@ -193,9 +192,7 @@ static struct xfrm_state *ipcomp_tunnel_create(struct xfrm_state *x)
t->id.daddr.a4 = x->id.daddr.a4;
memcpy(&t->sel, &x->sel, sizeof(t->sel));
t->props.family = AF_INET;
- if (x->props.mode == XFRM_MODE_BEET)
- mode = x->props.mode;
- t->props.mode = mode;
+ t->props.mode = x->props.mode;
t->props.saddr.a4 = x->props.saddr.a4;
t->props.flags = x->props.flags;
@@ -389,15 +386,22 @@ static int ipcomp_init_state(struct xfrm_state *x)
if (x->encap)
goto out;
+ x->props.header_len = 0;
+ switch (x->props.mode) {
+ case XFRM_MODE_TRANSPORT:
+ break;
+ case XFRM_MODE_TUNNEL:
+ x->props.header_len += sizeof(struct iphdr);
+ break;
+ default:
+ goto out;
+ }
+
err = -ENOMEM;
ipcd = kzalloc(sizeof(*ipcd), GFP_KERNEL);
if (!ipcd)
goto out;
- x->props.header_len = 0;
- if (x->props.mode == XFRM_MODE_TUNNEL)
- x->props.header_len += sizeof(struct iphdr);
-
mutex_lock(&ipcomp_resource_mutex);
if (!ipcomp_alloc_scratches())
goto error;
diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c
index b8f7763b2261..a52b5853aaa8 100644
--- a/net/ipv4/ipconfig.c
+++ b/net/ipv4/ipconfig.c
@@ -140,6 +140,9 @@ __be32 ic_servaddr = NONE; /* Boot server IP address */
__be32 root_server_addr = NONE; /* Address of NFS server */
u8 root_server_path[256] = { 0, }; /* Path to mount as root */
+/* vendor class identifier */
+static char vendor_class_identifier[253] __initdata;
+
/* Persistent data: */
static int ic_proto_used; /* Protocol used, if any */
@@ -299,7 +302,7 @@ static int __init ic_route_ioctl(unsigned int cmd, struct rtentry *arg)
mm_segment_t oldfs = get_fs();
set_fs(get_ds());
- res = ip_rt_ioctl(cmd, (void __user *) arg);
+ res = ip_rt_ioctl(&init_net, cmd, (void __user *) arg);
set_fs(oldfs);
return res;
}
@@ -588,6 +591,7 @@ ic_dhcp_init_options(u8 *options)
u8 mt = ((ic_servaddr == NONE)
? DHCPDISCOVER : DHCPREQUEST);
u8 *e = options;
+ int len;
#ifdef IPCONFIG_DEBUG
printk("DHCP: Sending message type %d\n", mt);
@@ -628,6 +632,16 @@ ic_dhcp_init_options(u8 *options)
*e++ = sizeof(ic_req_params);
memcpy(e, ic_req_params, sizeof(ic_req_params));
e += sizeof(ic_req_params);
+
+ if (*vendor_class_identifier) {
+ printk(KERN_INFO "DHCP: sending class identifier \"%s\"\n",
+ vendor_class_identifier);
+ *e++ = 60; /* Class-identifier */
+ len = strlen(vendor_class_identifier);
+ *e++ = len;
+ memcpy(e, vendor_class_identifier, len);
+ e += len;
+ }
}
*e++ = 255; /* End of the list */
@@ -1513,5 +1527,16 @@ static int __init nfsaddrs_config_setup(char *addrs)
return ip_auto_config_setup(addrs);
}
+static int __init vendor_class_identifier_setup(char *addrs)
+{
+ if (strlcpy(vendor_class_identifier, addrs,
+ sizeof(vendor_class_identifier))
+ >= sizeof(vendor_class_identifier))
+ printk(KERN_WARNING "DHCP: vendorclass too long, truncated to \"%s\"",
+ vendor_class_identifier);
+ return 1;
+}
+
__setup("ip=", ip_auto_config_setup);
__setup("nfsaddrs=", nfsaddrs_config_setup);
+__setup("dhcpclass=", vendor_class_identifier_setup);
diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c
index 8c2b2b0741da..da281581692c 100644
--- a/net/ipv4/ipip.c
+++ b/net/ipv4/ipip.c
@@ -405,7 +405,7 @@ out:
fl.fl4_daddr = eiph->saddr;
fl.fl4_tos = RT_TOS(eiph->tos);
fl.proto = IPPROTO_IPIP;
- if (ip_route_output_key(&rt, &key)) {
+ if (ip_route_output_key(&init_net, &rt, &key)) {
kfree_skb(skb2);
return 0;
}
@@ -418,7 +418,7 @@ out:
fl.fl4_daddr = eiph->daddr;
fl.fl4_src = eiph->saddr;
fl.fl4_tos = eiph->tos;
- if (ip_route_output_key(&rt, &fl) ||
+ if (ip_route_output_key(&init_net, &rt, &fl) ||
rt->u.dst.dev->type != ARPHRD_TUNNEL) {
ip_rt_put(rt);
kfree_skb(skb2);
@@ -547,7 +547,7 @@ static int ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
.saddr = tiph->saddr,
.tos = RT_TOS(tos) } },
.proto = IPPROTO_IPIP };
- if (ip_route_output_key(&rt, &fl)) {
+ if (ip_route_output_key(&init_net, &rt, &fl)) {
tunnel->stat.tx_carrier_errors++;
goto tx_error_icmp;
}
@@ -651,6 +651,40 @@ tx_error:
return 0;
}
+static void ipip_tunnel_bind_dev(struct net_device *dev)
+{
+ struct net_device *tdev = NULL;
+ struct ip_tunnel *tunnel;
+ struct iphdr *iph;
+
+ tunnel = netdev_priv(dev);
+ iph = &tunnel->parms.iph;
+
+ if (iph->daddr) {
+ struct flowi fl = { .oif = tunnel->parms.link,
+ .nl_u = { .ip4_u =
+ { .daddr = iph->daddr,
+ .saddr = iph->saddr,
+ .tos = RT_TOS(iph->tos) } },
+ .proto = IPPROTO_IPIP };
+ struct rtable *rt;
+ if (!ip_route_output_key(&init_net, &rt, &fl)) {
+ tdev = rt->u.dst.dev;
+ ip_rt_put(rt);
+ }
+ dev->flags |= IFF_POINTOPOINT;
+ }
+
+ if (!tdev && tunnel->parms.link)
+ tdev = __dev_get_by_index(&init_net, tunnel->parms.link);
+
+ if (tdev) {
+ dev->hard_header_len = tdev->hard_header_len + sizeof(struct iphdr);
+ dev->mtu = tdev->mtu - sizeof(struct iphdr);
+ }
+ dev->iflink = tunnel->parms.link;
+}
+
static int
ipip_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd)
{
@@ -723,6 +757,11 @@ ipip_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd)
t->parms.iph.ttl = p.iph.ttl;
t->parms.iph.tos = p.iph.tos;
t->parms.iph.frag_off = p.iph.frag_off;
+ if (t->parms.link != p.link) {
+ t->parms.link = p.link;
+ ipip_tunnel_bind_dev(dev);
+ netdev_state_change(dev);
+ }
}
if (copy_to_user(ifr->ifr_ifru.ifru_data, &t->parms, sizeof(p)))
err = -EFAULT;
@@ -791,12 +830,9 @@ static void ipip_tunnel_setup(struct net_device *dev)
static int ipip_tunnel_init(struct net_device *dev)
{
- struct net_device *tdev = NULL;
struct ip_tunnel *tunnel;
- struct iphdr *iph;
tunnel = netdev_priv(dev);
- iph = &tunnel->parms.iph;
tunnel->dev = dev;
strcpy(tunnel->parms.name, dev->name);
@@ -804,29 +840,7 @@ static int ipip_tunnel_init(struct net_device *dev)
memcpy(dev->dev_addr, &tunnel->parms.iph.saddr, 4);
memcpy(dev->broadcast, &tunnel->parms.iph.daddr, 4);
- if (iph->daddr) {
- struct flowi fl = { .oif = tunnel->parms.link,
- .nl_u = { .ip4_u =
- { .daddr = iph->daddr,
- .saddr = iph->saddr,
- .tos = RT_TOS(iph->tos) } },
- .proto = IPPROTO_IPIP };
- struct rtable *rt;
- if (!ip_route_output_key(&rt, &fl)) {
- tdev = rt->u.dst.dev;
- ip_rt_put(rt);
- }
- dev->flags |= IFF_POINTOPOINT;
- }
-
- if (!tdev && tunnel->parms.link)
- tdev = __dev_get_by_index(&init_net, tunnel->parms.link);
-
- if (tdev) {
- dev->hard_header_len = tdev->hard_header_len + sizeof(struct iphdr);
- dev->mtu = tdev->mtu - sizeof(struct iphdr);
- }
- dev->iflink = tunnel->parms.link;
+ ipip_tunnel_bind_dev(dev);
return 0;
}
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index 37bb497d92af..a94f52c207a7 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -141,7 +141,7 @@ struct net_device *ipmr_new_tunnel(struct vifctl *v)
p.iph.ihl = 5;
p.iph.protocol = IPPROTO_IPIP;
sprintf(p.name, "dvmrp%d", v->vifc_vifi);
- ifr.ifr_ifru.ifru_data = (void*)&p;
+ ifr.ifr_ifru.ifru_data = (__force void __user *)&p;
oldfs = get_fs(); set_fs(KERNEL_DS);
err = dev->do_ioctl(dev, &ifr, SIOCADDTUNNEL);
@@ -321,7 +321,7 @@ static void ipmr_destroy_unres(struct mfc_cache *c)
e->error = -ETIMEDOUT;
memset(&e->msg, 0, sizeof(e->msg));
- rtnl_unicast(skb, NETLINK_CB(skb).pid);
+ rtnl_unicast(skb, &init_net, NETLINK_CB(skb).pid);
} else
kfree_skb(skb);
}
@@ -423,7 +423,7 @@ static int vif_add(struct vifctl *vifc, int mrtsock)
return -ENOBUFS;
break;
case 0:
- dev = ip_dev_find(vifc->vifc_lcl_addr.s_addr);
+ dev = ip_dev_find(&init_net, vifc->vifc_lcl_addr.s_addr);
if (!dev)
return -EADDRNOTAVAIL;
dev_put(dev);
@@ -533,7 +533,7 @@ static void ipmr_cache_resolve(struct mfc_cache *uc, struct mfc_cache *c)
memset(&e->msg, 0, sizeof(e->msg));
}
- rtnl_unicast(skb, NETLINK_CB(skb).pid);
+ rtnl_unicast(skb, &init_net, NETLINK_CB(skb).pid);
} else
ip_mr_forward(skb, c, 0);
}
@@ -749,7 +749,7 @@ static int ipmr_mfc_add(struct mfcctl *mfc, int mrtsock)
return 0;
}
- if (!MULTICAST(mfc->mfcc_mcastgrp.s_addr))
+ if (!ipv4_is_multicast(mfc->mfcc_mcastgrp.s_addr))
return -EINVAL;
c=ipmr_cache_alloc();
@@ -849,7 +849,7 @@ static void mrtsock_destruct(struct sock *sk)
{
rtnl_lock();
if (sk == mroute_socket) {
- IPV4_DEVCONF_ALL(MC_FORWARDING)--;
+ IPV4_DEVCONF_ALL(sk->sk_net, MC_FORWARDING)--;
write_lock_bh(&mrt_lock);
mroute_socket=NULL;
@@ -898,7 +898,7 @@ int ip_mroute_setsockopt(struct sock *sk,int optname,char __user *optval,int opt
mroute_socket=sk;
write_unlock_bh(&mrt_lock);
- IPV4_DEVCONF_ALL(MC_FORWARDING)++;
+ IPV4_DEVCONF_ALL(sk->sk_net, MC_FORWARDING)++;
}
rtnl_unlock();
return ret;
@@ -954,10 +954,12 @@ int ip_mroute_setsockopt(struct sock *sk,int optname,char __user *optval,int opt
#ifdef CONFIG_IP_PIMSM
case MRT_PIM:
{
- int v, ret;
+ int v;
+
if (get_user(v,(int __user *)optval))
return -EFAULT;
- v = (v)?1:0;
+ v = (v) ? 1 : 0;
+
rtnl_lock();
ret = 0;
if (v != mroute_do_pim) {
@@ -1183,7 +1185,7 @@ static void ipmr_queue_xmit(struct sk_buff *skb, struct mfc_cache *c, int vifi)
.saddr = vif->local,
.tos = RT_TOS(iph->tos) } },
.proto = IPPROTO_IPIP };
- if (ip_route_output_key(&rt, &fl))
+ if (ip_route_output_key(&init_net, &rt, &fl))
goto out_free;
encap = sizeof(struct iphdr);
} else {
@@ -1192,7 +1194,7 @@ static void ipmr_queue_xmit(struct sk_buff *skb, struct mfc_cache *c, int vifi)
{ .daddr = iph->daddr,
.tos = RT_TOS(iph->tos) } },
.proto = IPPROTO_IPIP };
- if (ip_route_output_key(&rt, &fl))
+ if (ip_route_output_key(&init_net, &rt, &fl))
goto out_free;
}
@@ -1245,7 +1247,7 @@ static void ipmr_queue_xmit(struct sk_buff *skb, struct mfc_cache *c, int vifi)
* not mrouter) cannot join to more than one interface - it will
* result in receiving multiple packets.
*/
- NF_HOOK(PF_INET, NF_IP_FORWARD, skb, skb->dev, dev,
+ NF_HOOK(PF_INET, NF_INET_FORWARD, skb, skb->dev, dev,
ipmr_forward_finish);
return;
@@ -1461,7 +1463,7 @@ int pim_rcv_v1(struct sk_buff * skb)
b. packet is not a NULL-REGISTER
c. packet is not truncated
*/
- if (!MULTICAST(encap->daddr) ||
+ if (!ipv4_is_multicast(encap->daddr) ||
encap->tot_len == 0 ||
ntohs(encap->tot_len) + sizeof(*pim) > skb->len)
goto drop;
@@ -1517,7 +1519,7 @@ static int pim_rcv(struct sk_buff * skb)
/* check if the inner packet is destined to mcast group */
encap = (struct iphdr *)(skb_transport_header(skb) +
sizeof(struct pimreghdr));
- if (!MULTICAST(encap->daddr) ||
+ if (!ipv4_is_multicast(encap->daddr) ||
encap->tot_len == 0 ||
ntohs(encap->tot_len) + sizeof(*pim) > skb->len)
goto drop;
@@ -1659,6 +1661,7 @@ static struct vif_device *ipmr_vif_seq_idx(struct ipmr_vif_iter *iter,
}
static void *ipmr_vif_seq_start(struct seq_file *seq, loff_t *pos)
+ __acquires(mrt_lock)
{
read_lock(&mrt_lock);
return *pos ? ipmr_vif_seq_idx(seq->private, *pos - 1)
@@ -1682,6 +1685,7 @@ static void *ipmr_vif_seq_next(struct seq_file *seq, void *v, loff_t *pos)
}
static void ipmr_vif_seq_stop(struct seq_file *seq, void *v)
+ __releases(mrt_lock)
{
read_unlock(&mrt_lock);
}
@@ -1889,8 +1893,7 @@ void __init ip_mr_init(void)
sizeof(struct mfc_cache),
0, SLAB_HWCACHE_ALIGN|SLAB_PANIC,
NULL);
- init_timer(&ipmr_expire_timer);
- ipmr_expire_timer.function=ipmr_expire_process;
+ setup_timer(&ipmr_expire_timer, ipmr_expire_process, 0);
register_netdevice_notifier(&ip_mr_notifier);
#ifdef CONFIG_PROC_FS
proc_net_fops_create(&init_net, "ip_mr_vif", 0, &ipmr_vif_fops);
diff --git a/net/ipv4/ipvs/ip_vs_app.c b/net/ipv4/ipvs/ip_vs_app.c
index 664cb8e97c1c..535abe0c45e7 100644
--- a/net/ipv4/ipvs/ip_vs_app.c
+++ b/net/ipv4/ipvs/ip_vs_app.c
@@ -51,18 +51,13 @@ static DEFINE_MUTEX(__ip_vs_app_mutex);
*/
static inline int ip_vs_app_get(struct ip_vs_app *app)
{
- /* test and get the module atomically */
- if (app->module)
- return try_module_get(app->module);
- else
- return 1;
+ return try_module_get(app->module);
}
static inline void ip_vs_app_put(struct ip_vs_app *app)
{
- if (app->module)
- module_put(app->module);
+ module_put(app->module);
}
diff --git a/net/ipv4/ipvs/ip_vs_conn.c b/net/ipv4/ipvs/ip_vs_conn.c
index 0a9f3c37e18d..65f1ba112752 100644
--- a/net/ipv4/ipvs/ip_vs_conn.c
+++ b/net/ipv4/ipvs/ip_vs_conn.c
@@ -393,7 +393,15 @@ ip_vs_bind_dest(struct ip_vs_conn *cp, struct ip_vs_dest *dest)
atomic_inc(&dest->refcnt);
/* Bind with the destination and its corresponding transmitter */
- cp->flags |= atomic_read(&dest->conn_flags);
+ if ((cp->flags & IP_VS_CONN_F_SYNC) &&
+ (!(cp->flags & IP_VS_CONN_F_TEMPLATE)))
+ /* if the connection is not template and is created
+ * by sync, preserve the activity flag.
+ */
+ cp->flags |= atomic_read(&dest->conn_flags) &
+ (~IP_VS_CONN_F_INACTIVE);
+ else
+ cp->flags |= atomic_read(&dest->conn_flags);
cp->dest = dest;
IP_VS_DBG(7, "Bind-dest %s c:%u.%u.%u.%u:%d v:%u.%u.%u.%u:%d "
@@ -412,7 +420,11 @@ ip_vs_bind_dest(struct ip_vs_conn *cp, struct ip_vs_dest *dest)
/* It is a normal connection, so increase the inactive
connection counter because it is in TCP SYNRECV
state (inactive) or other protocol inacive state */
- atomic_inc(&dest->inactconns);
+ if ((cp->flags & IP_VS_CONN_F_SYNC) &&
+ (!(cp->flags & IP_VS_CONN_F_INACTIVE)))
+ atomic_inc(&dest->activeconns);
+ else
+ atomic_inc(&dest->inactconns);
} else {
/* It is a persistent connection/template, so increase
the peristent connection counter */
@@ -629,9 +641,7 @@ ip_vs_conn_new(int proto, __be32 caddr, __be16 cport, __be32 vaddr, __be16 vport
}
INIT_LIST_HEAD(&cp->c_list);
- init_timer(&cp->timer);
- cp->timer.data = (unsigned long)cp;
- cp->timer.function = ip_vs_conn_expire;
+ setup_timer(&cp->timer, ip_vs_conn_expire, (unsigned long)cp);
cp->protocol = proto;
cp->caddr = caddr;
cp->cport = cport;
@@ -783,6 +793,57 @@ static const struct file_operations ip_vs_conn_fops = {
.llseek = seq_lseek,
.release = seq_release,
};
+
+static const char *ip_vs_origin_name(unsigned flags)
+{
+ if (flags & IP_VS_CONN_F_SYNC)
+ return "SYNC";
+ else
+ return "LOCAL";
+}
+
+static int ip_vs_conn_sync_seq_show(struct seq_file *seq, void *v)
+{
+
+ if (v == SEQ_START_TOKEN)
+ seq_puts(seq,
+ "Pro FromIP FPrt ToIP TPrt DestIP DPrt State Origin Expires\n");
+ else {
+ const struct ip_vs_conn *cp = v;
+
+ seq_printf(seq,
+ "%-3s %08X %04X %08X %04X %08X %04X %-11s %-6s %7lu\n",
+ ip_vs_proto_name(cp->protocol),
+ ntohl(cp->caddr), ntohs(cp->cport),
+ ntohl(cp->vaddr), ntohs(cp->vport),
+ ntohl(cp->daddr), ntohs(cp->dport),
+ ip_vs_state_name(cp->protocol, cp->state),
+ ip_vs_origin_name(cp->flags),
+ (cp->timer.expires-jiffies)/HZ);
+ }
+ return 0;
+}
+
+static const struct seq_operations ip_vs_conn_sync_seq_ops = {
+ .start = ip_vs_conn_seq_start,
+ .next = ip_vs_conn_seq_next,
+ .stop = ip_vs_conn_seq_stop,
+ .show = ip_vs_conn_sync_seq_show,
+};
+
+static int ip_vs_conn_sync_open(struct inode *inode, struct file *file)
+{
+ return seq_open(file, &ip_vs_conn_sync_seq_ops);
+}
+
+static const struct file_operations ip_vs_conn_sync_fops = {
+ .owner = THIS_MODULE,
+ .open = ip_vs_conn_sync_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+
#endif
@@ -942,6 +1003,7 @@ int ip_vs_conn_init(void)
}
proc_net_fops_create(&init_net, "ip_vs_conn", 0, &ip_vs_conn_fops);
+ proc_net_fops_create(&init_net, "ip_vs_conn_sync", 0, &ip_vs_conn_sync_fops);
/* calculate the random value for connection hash */
get_random_bytes(&ip_vs_conn_rnd, sizeof(ip_vs_conn_rnd));
@@ -958,5 +1020,6 @@ void ip_vs_conn_cleanup(void)
/* Release the empty cache */
kmem_cache_destroy(ip_vs_conn_cachep);
proc_net_remove(&init_net, "ip_vs_conn");
+ proc_net_remove(&init_net, "ip_vs_conn_sync");
vfree(ip_vs_conn_tab);
}
diff --git a/net/ipv4/ipvs/ip_vs_core.c b/net/ipv4/ipvs/ip_vs_core.c
index 8fba20256f52..963981a9d501 100644
--- a/net/ipv4/ipvs/ip_vs_core.c
+++ b/net/ipv4/ipvs/ip_vs_core.c
@@ -423,7 +423,7 @@ int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb,
and the destination is RTN_UNICAST (and not local), then create
a cache_bypass connection entry */
if (sysctl_ip_vs_cache_bypass && svc->fwmark
- && (inet_addr_type(iph->daddr) == RTN_UNICAST)) {
+ && (inet_addr_type(&init_net, iph->daddr) == RTN_UNICAST)) {
int ret, cs;
struct ip_vs_conn *cp;
@@ -481,7 +481,7 @@ int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb,
/*
- * It is hooked before NF_IP_PRI_NAT_SRC at the NF_IP_POST_ROUTING
+ * It is hooked before NF_IP_PRI_NAT_SRC at the NF_INET_POST_ROUTING
* chain, and is used for VS/NAT.
* It detects packets for VS/NAT connections and sends the packets
* immediately. This can avoid that iptable_nat mangles the packets
@@ -679,7 +679,7 @@ static inline int is_tcp_reset(const struct sk_buff *skb)
}
/*
- * It is hooked at the NF_IP_FORWARD chain, used only for VS/NAT.
+ * It is hooked at the NF_INET_FORWARD chain, used only for VS/NAT.
* Check if outgoing packet belongs to the established ip_vs_conn,
* rewrite addresses of the packet and send it on its way...
*/
@@ -814,7 +814,7 @@ ip_vs_in_icmp(struct sk_buff *skb, int *related, unsigned int hooknum)
/* reassemble IP fragments */
if (ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)) {
- if (ip_vs_gather_frags(skb, hooknum == NF_IP_LOCAL_IN ?
+ if (ip_vs_gather_frags(skb, hooknum == NF_INET_LOCAL_IN ?
IP_DEFRAG_VS_IN : IP_DEFRAG_VS_FWD))
return NF_STOLEN;
}
@@ -1003,12 +1003,12 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb,
/*
- * It is hooked at the NF_IP_FORWARD chain, in order to catch ICMP
+ * It is hooked at the NF_INET_FORWARD chain, in order to catch ICMP
* related packets destined for 0.0.0.0/0.
* When fwmark-based virtual service is used, such as transparent
* cache cluster, TCP packets can be marked and routed to ip_vs_in,
* but ICMP destined for 0.0.0.0/0 cannot not be easily marked and
- * sent to ip_vs_in_icmp. So, catch them at the NF_IP_FORWARD chain
+ * sent to ip_vs_in_icmp. So, catch them at the NF_INET_FORWARD chain
* and send them to ip_vs_in_icmp.
*/
static unsigned int
@@ -1025,43 +1025,42 @@ ip_vs_forward_icmp(unsigned int hooknum, struct sk_buff *skb,
}
-/* After packet filtering, forward packet through VS/DR, VS/TUN,
- or VS/NAT(change destination), so that filtering rules can be
- applied to IPVS. */
-static struct nf_hook_ops ip_vs_in_ops = {
- .hook = ip_vs_in,
- .owner = THIS_MODULE,
- .pf = PF_INET,
- .hooknum = NF_IP_LOCAL_IN,
- .priority = 100,
-};
-
-/* After packet filtering, change source only for VS/NAT */
-static struct nf_hook_ops ip_vs_out_ops = {
- .hook = ip_vs_out,
- .owner = THIS_MODULE,
- .pf = PF_INET,
- .hooknum = NF_IP_FORWARD,
- .priority = 100,
-};
-
-/* After packet filtering (but before ip_vs_out_icmp), catch icmp
- destined for 0.0.0.0/0, which is for incoming IPVS connections */
-static struct nf_hook_ops ip_vs_forward_icmp_ops = {
- .hook = ip_vs_forward_icmp,
- .owner = THIS_MODULE,
- .pf = PF_INET,
- .hooknum = NF_IP_FORWARD,
- .priority = 99,
-};
-
-/* Before the netfilter connection tracking, exit from POST_ROUTING */
-static struct nf_hook_ops ip_vs_post_routing_ops = {
- .hook = ip_vs_post_routing,
- .owner = THIS_MODULE,
- .pf = PF_INET,
- .hooknum = NF_IP_POST_ROUTING,
- .priority = NF_IP_PRI_NAT_SRC-1,
+static struct nf_hook_ops ip_vs_ops[] __read_mostly = {
+ /* After packet filtering, forward packet through VS/DR, VS/TUN,
+ * or VS/NAT(change destination), so that filtering rules can be
+ * applied to IPVS. */
+ {
+ .hook = ip_vs_in,
+ .owner = THIS_MODULE,
+ .pf = PF_INET,
+ .hooknum = NF_INET_LOCAL_IN,
+ .priority = 100,
+ },
+ /* After packet filtering, change source only for VS/NAT */
+ {
+ .hook = ip_vs_out,
+ .owner = THIS_MODULE,
+ .pf = PF_INET,
+ .hooknum = NF_INET_FORWARD,
+ .priority = 100,
+ },
+ /* After packet filtering (but before ip_vs_out_icmp), catch icmp
+ * destined for 0.0.0.0/0, which is for incoming IPVS connections */
+ {
+ .hook = ip_vs_forward_icmp,
+ .owner = THIS_MODULE,
+ .pf = PF_INET,
+ .hooknum = NF_INET_FORWARD,
+ .priority = 99,
+ },
+ /* Before the netfilter connection tracking, exit from POST_ROUTING */
+ {
+ .hook = ip_vs_post_routing,
+ .owner = THIS_MODULE,
+ .pf = PF_INET,
+ .hooknum = NF_INET_POST_ROUTING,
+ .priority = NF_IP_PRI_NAT_SRC-1,
+ },
};
@@ -1092,37 +1091,15 @@ static int __init ip_vs_init(void)
goto cleanup_app;
}
- ret = nf_register_hook(&ip_vs_in_ops);
+ ret = nf_register_hooks(ip_vs_ops, ARRAY_SIZE(ip_vs_ops));
if (ret < 0) {
- IP_VS_ERR("can't register in hook.\n");
+ IP_VS_ERR("can't register hooks.\n");
goto cleanup_conn;
}
- ret = nf_register_hook(&ip_vs_out_ops);
- if (ret < 0) {
- IP_VS_ERR("can't register out hook.\n");
- goto cleanup_inops;
- }
- ret = nf_register_hook(&ip_vs_post_routing_ops);
- if (ret < 0) {
- IP_VS_ERR("can't register post_routing hook.\n");
- goto cleanup_outops;
- }
- ret = nf_register_hook(&ip_vs_forward_icmp_ops);
- if (ret < 0) {
- IP_VS_ERR("can't register forward_icmp hook.\n");
- goto cleanup_postroutingops;
- }
-
IP_VS_INFO("ipvs loaded.\n");
return ret;
- cleanup_postroutingops:
- nf_unregister_hook(&ip_vs_post_routing_ops);
- cleanup_outops:
- nf_unregister_hook(&ip_vs_out_ops);
- cleanup_inops:
- nf_unregister_hook(&ip_vs_in_ops);
cleanup_conn:
ip_vs_conn_cleanup();
cleanup_app:
@@ -1136,10 +1113,7 @@ static int __init ip_vs_init(void)
static void __exit ip_vs_cleanup(void)
{
- nf_unregister_hook(&ip_vs_forward_icmp_ops);
- nf_unregister_hook(&ip_vs_post_routing_ops);
- nf_unregister_hook(&ip_vs_out_ops);
- nf_unregister_hook(&ip_vs_in_ops);
+ nf_unregister_hooks(ip_vs_ops, ARRAY_SIZE(ip_vs_ops));
ip_vs_conn_cleanup();
ip_vs_app_cleanup();
ip_vs_protocol_cleanup();
diff --git a/net/ipv4/ipvs/ip_vs_ctl.c b/net/ipv4/ipvs/ip_vs_ctl.c
index 693d92490c11..94c5767c8e01 100644
--- a/net/ipv4/ipvs/ip_vs_ctl.c
+++ b/net/ipv4/ipvs/ip_vs_ctl.c
@@ -704,7 +704,7 @@ __ip_vs_update_dest(struct ip_vs_service *svc,
conn_flags = udest->conn_flags | IP_VS_CONN_F_INACTIVE;
/* check if local node and update the flags */
- if (inet_addr_type(udest->addr) == RTN_LOCAL) {
+ if (inet_addr_type(&init_net, udest->addr) == RTN_LOCAL) {
conn_flags = (conn_flags & ~IP_VS_CONN_F_FWD_MASK)
| IP_VS_CONN_F_LOCALNODE;
}
@@ -756,7 +756,7 @@ ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user *udest,
EnterFunction(2);
- atype = inet_addr_type(udest->addr);
+ atype = inet_addr_type(&init_net, udest->addr);
if (atype != RTN_LOCAL && atype != RTN_UNICAST)
return -EINVAL;
@@ -1591,34 +1591,13 @@ static struct ctl_table vs_vars[] = {
{ .ctl_name = 0 }
};
-static ctl_table vs_table[] = {
- {
- .procname = "vs",
- .mode = 0555,
- .child = vs_vars
- },
- { .ctl_name = 0 }
-};
-
-static ctl_table ipvs_ipv4_table[] = {
- {
- .ctl_name = NET_IPV4,
- .procname = "ipv4",
- .mode = 0555,
- .child = vs_table,
- },
- { .ctl_name = 0 }
-};
-
-static ctl_table vs_root_table[] = {
- {
- .ctl_name = CTL_NET,
- .procname = "net",
- .mode = 0555,
- .child = ipvs_ipv4_table,
- },
- { .ctl_name = 0 }
+struct ctl_path net_vs_ctl_path[] = {
+ { .procname = "net", .ctl_name = CTL_NET, },
+ { .procname = "ipv4", .ctl_name = NET_IPV4, },
+ { .procname = "vs", },
+ { }
};
+EXPORT_SYMBOL_GPL(net_vs_ctl_path);
static struct ctl_table_header * sysctl_header;
@@ -2345,7 +2324,7 @@ int ip_vs_control_init(void)
proc_net_fops_create(&init_net, "ip_vs", 0, &ip_vs_info_fops);
proc_net_fops_create(&init_net, "ip_vs_stats",0, &ip_vs_stats_fops);
- sysctl_header = register_sysctl_table(vs_root_table);
+ sysctl_header = register_sysctl_paths(net_vs_ctl_path, vs_vars);
/* Initialize ip_vs_svc_table, ip_vs_svc_fwm_table, ip_vs_rtable */
for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
diff --git a/net/ipv4/ipvs/ip_vs_est.c b/net/ipv4/ipvs/ip_vs_est.c
index 7d68b80c4c19..dfa0d713c801 100644
--- a/net/ipv4/ipvs/ip_vs_est.c
+++ b/net/ipv4/ipvs/ip_vs_est.c
@@ -18,6 +18,7 @@
#include <linux/slab.h>
#include <linux/types.h>
#include <linux/interrupt.h>
+#include <linux/sysctl.h>
#include <net/ip_vs.h>
@@ -146,9 +147,8 @@ int ip_vs_new_estimator(struct ip_vs_stats *stats)
write_lock_bh(&est_lock);
est->next = est_list;
if (est->next == NULL) {
- init_timer(&est_timer);
+ setup_timer(&est_timer, estimation_timer, 0);
est_timer.expires = jiffies + 2*HZ;
- est_timer.function = estimation_timer;
add_timer(&est_timer);
}
est_list = est;
diff --git a/net/ipv4/ipvs/ip_vs_lblc.c b/net/ipv4/ipvs/ip_vs_lblc.c
index ad89644ef5d2..3888642706ad 100644
--- a/net/ipv4/ipvs/ip_vs_lblc.c
+++ b/net/ipv4/ipvs/ip_vs_lblc.c
@@ -123,35 +123,6 @@ static ctl_table vs_vars_table[] = {
{ .ctl_name = 0 }
};
-static ctl_table vs_table[] = {
- {
- .procname = "vs",
- .mode = 0555,
- .child = vs_vars_table
- },
- { .ctl_name = 0 }
-};
-
-static ctl_table ipvs_ipv4_table[] = {
- {
- .ctl_name = NET_IPV4,
- .procname = "ipv4",
- .mode = 0555,
- .child = vs_table
- },
- { .ctl_name = 0 }
-};
-
-static ctl_table lblc_root_table[] = {
- {
- .ctl_name = CTL_NET,
- .procname = "net",
- .mode = 0555,
- .child = ipvs_ipv4_table
- },
- { .ctl_name = 0 }
-};
-
static struct ctl_table_header * sysctl_header;
/*
@@ -391,9 +362,8 @@ static int ip_vs_lblc_init_svc(struct ip_vs_service *svc)
/*
* Hook periodic timer for garbage collection
*/
- init_timer(&tbl->periodic_timer);
- tbl->periodic_timer.data = (unsigned long)tbl;
- tbl->periodic_timer.function = ip_vs_lblc_check_expire;
+ setup_timer(&tbl->periodic_timer, ip_vs_lblc_check_expire,
+ (unsigned long)tbl);
tbl->periodic_timer.expires = jiffies+CHECK_EXPIRE_INTERVAL;
add_timer(&tbl->periodic_timer);
@@ -583,7 +553,7 @@ static int __init ip_vs_lblc_init(void)
int ret;
INIT_LIST_HEAD(&ip_vs_lblc_scheduler.n_list);
- sysctl_header = register_sysctl_table(lblc_root_table);
+ sysctl_header = register_sysctl_paths(net_vs_ctl_path, vs_vars_table);
ret = register_ip_vs_scheduler(&ip_vs_lblc_scheduler);
if (ret)
unregister_sysctl_table(sysctl_header);
diff --git a/net/ipv4/ipvs/ip_vs_lblcr.c b/net/ipv4/ipvs/ip_vs_lblcr.c
index 2a5ed85a3352..daa260eb21cf 100644
--- a/net/ipv4/ipvs/ip_vs_lblcr.c
+++ b/net/ipv4/ipvs/ip_vs_lblcr.c
@@ -311,35 +311,6 @@ static ctl_table vs_vars_table[] = {
{ .ctl_name = 0 }
};
-static ctl_table vs_table[] = {
- {
- .procname = "vs",
- .mode = 0555,
- .child = vs_vars_table
- },
- { .ctl_name = 0 }
-};
-
-static ctl_table ipvs_ipv4_table[] = {
- {
- .ctl_name = NET_IPV4,
- .procname = "ipv4",
- .mode = 0555,
- .child = vs_table
- },
- { .ctl_name = 0 }
-};
-
-static ctl_table lblcr_root_table[] = {
- {
- .ctl_name = CTL_NET,
- .procname = "net",
- .mode = 0555,
- .child = ipvs_ipv4_table
- },
- { .ctl_name = 0 }
-};
-
static struct ctl_table_header * sysctl_header;
/*
@@ -575,9 +546,8 @@ static int ip_vs_lblcr_init_svc(struct ip_vs_service *svc)
/*
* Hook periodic timer for garbage collection
*/
- init_timer(&tbl->periodic_timer);
- tbl->periodic_timer.data = (unsigned long)tbl;
- tbl->periodic_timer.function = ip_vs_lblcr_check_expire;
+ setup_timer(&tbl->periodic_timer, ip_vs_lblcr_check_expire,
+ (unsigned long)tbl);
tbl->periodic_timer.expires = jiffies+CHECK_EXPIRE_INTERVAL;
add_timer(&tbl->periodic_timer);
@@ -772,7 +742,7 @@ static int __init ip_vs_lblcr_init(void)
int ret;
INIT_LIST_HEAD(&ip_vs_lblcr_scheduler.n_list);
- sysctl_header = register_sysctl_table(lblcr_root_table);
+ sysctl_header = register_sysctl_paths(net_vs_ctl_path, vs_vars_table);
ret = register_ip_vs_scheduler(&ip_vs_lblcr_scheduler);
if (ret)
unregister_sysctl_table(sysctl_header);
diff --git a/net/ipv4/ipvs/ip_vs_proto.c b/net/ipv4/ipvs/ip_vs_proto.c
index c0e11ec8f0f9..dde28a250d92 100644
--- a/net/ipv4/ipvs/ip_vs_proto.c
+++ b/net/ipv4/ipvs/ip_vs_proto.c
@@ -165,7 +165,7 @@ ip_vs_tcpudp_debug_packet(struct ip_vs_protocol *pp,
ih = skb_header_pointer(skb, offset, sizeof(_iph), &_iph);
if (ih == NULL)
sprintf(buf, "%s TRUNCATED", pp->name);
- else if (ih->frag_off & __constant_htons(IP_OFFSET))
+ else if (ih->frag_off & htons(IP_OFFSET))
sprintf(buf, "%s %u.%u.%u.%u->%u.%u.%u.%u frag",
pp->name, NIPQUAD(ih->saddr),
NIPQUAD(ih->daddr));
diff --git a/net/ipv4/ipvs/ip_vs_proto_esp.c b/net/ipv4/ipvs/ip_vs_proto_esp.c
index c36ccf057a19..aef0d3ee8e44 100644
--- a/net/ipv4/ipvs/ip_vs_proto_esp.c
+++ b/net/ipv4/ipvs/ip_vs_proto_esp.c
@@ -52,15 +52,15 @@ esp_conn_in_get(const struct sk_buff *skb,
if (likely(!inverse)) {
cp = ip_vs_conn_in_get(IPPROTO_UDP,
iph->saddr,
- __constant_htons(PORT_ISAKMP),
+ htons(PORT_ISAKMP),
iph->daddr,
- __constant_htons(PORT_ISAKMP));
+ htons(PORT_ISAKMP));
} else {
cp = ip_vs_conn_in_get(IPPROTO_UDP,
iph->daddr,
- __constant_htons(PORT_ISAKMP),
+ htons(PORT_ISAKMP),
iph->saddr,
- __constant_htons(PORT_ISAKMP));
+ htons(PORT_ISAKMP));
}
if (!cp) {
@@ -89,15 +89,15 @@ esp_conn_out_get(const struct sk_buff *skb, struct ip_vs_protocol *pp,
if (likely(!inverse)) {
cp = ip_vs_conn_out_get(IPPROTO_UDP,
iph->saddr,
- __constant_htons(PORT_ISAKMP),
+ htons(PORT_ISAKMP),
iph->daddr,
- __constant_htons(PORT_ISAKMP));
+ htons(PORT_ISAKMP));
} else {
cp = ip_vs_conn_out_get(IPPROTO_UDP,
iph->daddr,
- __constant_htons(PORT_ISAKMP),
+ htons(PORT_ISAKMP),
iph->saddr,
- __constant_htons(PORT_ISAKMP));
+ htons(PORT_ISAKMP));
}
if (!cp) {
diff --git a/net/ipv4/ipvs/ip_vs_sched.c b/net/ipv4/ipvs/ip_vs_sched.c
index 432235861908..121a32b1b756 100644
--- a/net/ipv4/ipvs/ip_vs_sched.c
+++ b/net/ipv4/ipvs/ip_vs_sched.c
@@ -24,6 +24,7 @@
#include <linux/interrupt.h>
#include <asm/string.h>
#include <linux/kmod.h>
+#include <linux/sysctl.h>
#include <net/ip_vs.h>
diff --git a/net/ipv4/ipvs/ip_vs_sync.c b/net/ipv4/ipvs/ip_vs_sync.c
index bd930efc18da..948378d0a755 100644
--- a/net/ipv4/ipvs/ip_vs_sync.c
+++ b/net/ipv4/ipvs/ip_vs_sync.c
@@ -305,10 +305,11 @@ static void ip_vs_process_message(const char *buffer, const size_t buflen)
p = (char *)buffer + sizeof(struct ip_vs_sync_mesg);
for (i=0; i<m->nr_conns; i++) {
- unsigned flags;
+ unsigned flags, state;
s = (struct ip_vs_sync_conn *)p;
- flags = ntohs(s->flags);
+ flags = ntohs(s->flags) | IP_VS_CONN_F_SYNC;
+ state = ntohs(s->state);
if (!(flags & IP_VS_CONN_F_TEMPLATE))
cp = ip_vs_conn_in_get(s->protocol,
s->caddr, s->cport,
@@ -326,6 +327,13 @@ static void ip_vs_process_message(const char *buffer, const size_t buflen)
dest = ip_vs_find_dest(s->daddr, s->dport,
s->vaddr, s->vport,
s->protocol);
+ /* Set the approprite ativity flag */
+ if (s->protocol == IPPROTO_TCP) {
+ if (state != IP_VS_TCP_S_ESTABLISHED)
+ flags |= IP_VS_CONN_F_INACTIVE;
+ else
+ flags &= ~IP_VS_CONN_F_INACTIVE;
+ }
cp = ip_vs_conn_new(s->protocol,
s->caddr, s->cport,
s->vaddr, s->vport,
@@ -337,7 +345,7 @@ static void ip_vs_process_message(const char *buffer, const size_t buflen)
IP_VS_ERR("ip_vs_conn_new failed\n");
return;
}
- cp->state = ntohs(s->state);
+ cp->state = state;
} else if (!cp->dest) {
dest = ip_vs_try_bind_dest(cp);
if (!dest) {
@@ -346,8 +354,22 @@ static void ip_vs_process_message(const char *buffer, const size_t buflen)
cp->flags = flags | IP_VS_CONN_F_HASHED;
} else
atomic_dec(&dest->refcnt);
- } /* Note that we don't touch its state and flags
- if it is a normal entry. */
+ } else if ((cp->dest) && (cp->protocol == IPPROTO_TCP) &&
+ (cp->state != state)) {
+ /* update active/inactive flag for the connection */
+ dest = cp->dest;
+ if (!(cp->flags & IP_VS_CONN_F_INACTIVE) &&
+ (state != IP_VS_TCP_S_ESTABLISHED)) {
+ atomic_dec(&dest->activeconns);
+ atomic_inc(&dest->inactconns);
+ cp->flags |= IP_VS_CONN_F_INACTIVE;
+ } else if ((cp->flags & IP_VS_CONN_F_INACTIVE) &&
+ (state == IP_VS_TCP_S_ESTABLISHED)) {
+ atomic_inc(&dest->activeconns);
+ atomic_dec(&dest->inactconns);
+ cp->flags &= ~IP_VS_CONN_F_INACTIVE;
+ }
+ }
if (flags & IP_VS_CONN_F_SEQ_MASK) {
opt = (struct ip_vs_sync_conn_options *)&s[1];
@@ -357,7 +379,7 @@ static void ip_vs_process_message(const char *buffer, const size_t buflen)
p += SIMPLE_CONN_SIZE;
atomic_set(&cp->in_pkts, sysctl_ip_vs_sync_threshold[0]);
- cp->state = ntohs(s->state);
+ cp->state = state;
pp = ip_vs_proto_get(s->protocol);
cp->timeout = pp->timeout_table[cp->state];
ip_vs_conn_put(cp);
diff --git a/net/ipv4/ipvs/ip_vs_xmit.c b/net/ipv4/ipvs/ip_vs_xmit.c
index 7c074e386c17..f63006caea03 100644
--- a/net/ipv4/ipvs/ip_vs_xmit.c
+++ b/net/ipv4/ipvs/ip_vs_xmit.c
@@ -16,8 +16,8 @@
*/
#include <linux/kernel.h>
-#include <linux/ip.h>
#include <linux/tcp.h> /* for tcphdr */
+#include <net/ip.h>
#include <net/tcp.h> /* for csum_tcpudp_magic */
#include <net/udp.h>
#include <net/icmp.h> /* for icmp_send */
@@ -59,7 +59,7 @@ __ip_vs_dst_check(struct ip_vs_dest *dest, u32 rtos, u32 cookie)
return dst;
}
-static inline struct rtable *
+static struct rtable *
__ip_vs_get_out_rt(struct ip_vs_conn *cp, u32 rtos)
{
struct rtable *rt; /* Route to the other host */
@@ -78,7 +78,7 @@ __ip_vs_get_out_rt(struct ip_vs_conn *cp, u32 rtos)
.tos = rtos, } },
};
- if (ip_route_output_key(&rt, &fl)) {
+ if (ip_route_output_key(&init_net, &rt, &fl)) {
spin_unlock(&dest->dst_lock);
IP_VS_DBG_RL("ip_route_output error, "
"dest: %u.%u.%u.%u\n",
@@ -101,7 +101,7 @@ __ip_vs_get_out_rt(struct ip_vs_conn *cp, u32 rtos)
.tos = rtos, } },
};
- if (ip_route_output_key(&rt, &fl)) {
+ if (ip_route_output_key(&init_net, &rt, &fl)) {
IP_VS_DBG_RL("ip_route_output error, dest: "
"%u.%u.%u.%u\n", NIPQUAD(cp->daddr));
return NULL;
@@ -129,7 +129,7 @@ ip_vs_dst_reset(struct ip_vs_dest *dest)
do { \
(skb)->ipvs_property = 1; \
skb_forward_csum(skb); \
- NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, (skb), NULL, \
+ NF_HOOK(PF_INET, NF_INET_LOCAL_OUT, (skb), NULL, \
(rt)->u.dst.dev, dst_output); \
} while (0)
@@ -170,7 +170,7 @@ ip_vs_bypass_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
EnterFunction(10);
- if (ip_route_output_key(&rt, &fl)) {
+ if (ip_route_output_key(&init_net, &rt, &fl)) {
IP_VS_DBG_RL("ip_vs_bypass_xmit(): ip_route_output error, "
"dest: %u.%u.%u.%u\n", NIPQUAD(iph->daddr));
goto tx_error_icmp;
@@ -406,14 +406,12 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
iph->daddr = rt->rt_dst;
iph->saddr = rt->rt_src;
iph->ttl = old_iph->ttl;
- iph->tot_len = htons(skb->len);
ip_select_ident(iph, &rt->u.dst, NULL);
- ip_send_check(iph);
/* Another hack: avoid icmp_send in ip_fragment */
skb->local_df = 1;
- IP_VS_XMIT(skb, rt);
+ ip_local_out(skb);
LeaveFunction(10);
diff --git a/net/ipv4/netfilter.c b/net/ipv4/netfilter.c
index 5539debf4973..9a904c6c0dc8 100644
--- a/net/ipv4/netfilter.c
+++ b/net/ipv4/netfilter.c
@@ -7,6 +7,7 @@
#include <net/route.h>
#include <net/xfrm.h>
#include <net/ip.h>
+#include <net/netfilter/nf_queue.h>
/* route_me_harder function, used by iptable_nat, iptable_mangle + ip_queue */
int ip_route_me_harder(struct sk_buff *skb, unsigned addr_type)
@@ -18,12 +19,12 @@ int ip_route_me_harder(struct sk_buff *skb, unsigned addr_type)
unsigned int hh_len;
unsigned int type;
- type = inet_addr_type(iph->saddr);
+ type = inet_addr_type(&init_net, iph->saddr);
if (addr_type == RTN_UNSPEC)
addr_type = type;
/* some non-standard hacks like ipt_REJECT.c:send_reset() can cause
- * packets with foreign saddr to appear on the NF_IP_LOCAL_OUT hook.
+ * packets with foreign saddr to appear on the NF_INET_LOCAL_OUT hook.
*/
if (addr_type == RTN_LOCAL) {
fl.nl_u.ip4_u.daddr = iph->daddr;
@@ -32,7 +33,7 @@ int ip_route_me_harder(struct sk_buff *skb, unsigned addr_type)
fl.nl_u.ip4_u.tos = RT_TOS(iph->tos);
fl.oif = skb->sk ? skb->sk->sk_bound_dev_if : 0;
fl.mark = skb->mark;
- if (ip_route_output_key(&rt, &fl) != 0)
+ if (ip_route_output_key(&init_net, &rt, &fl) != 0)
return -1;
/* Drop old route. */
@@ -42,7 +43,7 @@ int ip_route_me_harder(struct sk_buff *skb, unsigned addr_type)
/* non-local src, find valid iif to satisfy
* rp-filter when calling ip_route_input. */
fl.nl_u.ip4_u.daddr = iph->saddr;
- if (ip_route_output_key(&rt, &fl) != 0)
+ if (ip_route_output_key(&init_net, &rt, &fl) != 0)
return -1;
odst = skb->dst;
@@ -122,11 +123,12 @@ struct ip_rt_info {
u_int8_t tos;
};
-static void nf_ip_saveroute(const struct sk_buff *skb, struct nf_info *info)
+static void nf_ip_saveroute(const struct sk_buff *skb,
+ struct nf_queue_entry *entry)
{
- struct ip_rt_info *rt_info = nf_info_reroute(info);
+ struct ip_rt_info *rt_info = nf_queue_entry_reroute(entry);
- if (info->hook == NF_IP_LOCAL_OUT) {
+ if (entry->hook == NF_INET_LOCAL_OUT) {
const struct iphdr *iph = ip_hdr(skb);
rt_info->tos = iph->tos;
@@ -135,11 +137,12 @@ static void nf_ip_saveroute(const struct sk_buff *skb, struct nf_info *info)
}
}
-static int nf_ip_reroute(struct sk_buff *skb, const struct nf_info *info)
+static int nf_ip_reroute(struct sk_buff *skb,
+ const struct nf_queue_entry *entry)
{
- const struct ip_rt_info *rt_info = nf_info_reroute(info);
+ const struct ip_rt_info *rt_info = nf_queue_entry_reroute(entry);
- if (info->hook == NF_IP_LOCAL_OUT) {
+ if (entry->hook == NF_INET_LOCAL_OUT) {
const struct iphdr *iph = ip_hdr(skb);
if (!(iph->tos == rt_info->tos
@@ -158,7 +161,7 @@ __sum16 nf_ip_checksum(struct sk_buff *skb, unsigned int hook,
switch (skb->ip_summed) {
case CHECKSUM_COMPLETE:
- if (hook != NF_IP_PRE_ROUTING && hook != NF_IP_LOCAL_IN)
+ if (hook != NF_INET_PRE_ROUTING && hook != NF_INET_LOCAL_IN)
break;
if ((protocol == 0 && !csum_fold(skb->csum)) ||
!csum_tcpudp_magic(iph->saddr, iph->daddr,
@@ -182,9 +185,15 @@ __sum16 nf_ip_checksum(struct sk_buff *skb, unsigned int hook,
EXPORT_SYMBOL(nf_ip_checksum);
-static struct nf_afinfo nf_ip_afinfo = {
+static int nf_ip_route(struct dst_entry **dst, struct flowi *fl)
+{
+ return ip_route_output_key(&init_net, (struct rtable **)dst, fl);
+}
+
+static const struct nf_afinfo nf_ip_afinfo = {
.family = AF_INET,
.checksum = nf_ip_checksum,
+ .route = nf_ip_route,
.saveroute = nf_ip_saveroute,
.reroute = nf_ip_reroute,
.route_key_size = sizeof(struct ip_rt_info),
@@ -202,3 +211,13 @@ static void ipv4_netfilter_fini(void)
module_init(ipv4_netfilter_init);
module_exit(ipv4_netfilter_fini);
+
+#ifdef CONFIG_SYSCTL
+struct ctl_path nf_net_ipv4_netfilter_sysctl_path[] = {
+ { .procname = "net", .ctl_name = CTL_NET, },
+ { .procname = "ipv4", .ctl_name = NET_IPV4, },
+ { .procname = "netfilter", .ctl_name = NET_IPV4_NETFILTER, },
+ { }
+};
+EXPORT_SYMBOL_GPL(nf_net_ipv4_netfilter_sysctl_path);
+#endif /* CONFIG_SYSCTL */
diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig
index 9aca9c55687c..9a077cb24798 100644
--- a/net/ipv4/netfilter/Kconfig
+++ b/net/ipv4/netfilter/Kconfig
@@ -8,6 +8,7 @@ menu "IP: Netfilter Configuration"
config NF_CONNTRACK_IPV4
tristate "IPv4 connection tracking support (required for NAT)"
depends on NF_CONNTRACK
+ default m if NETFILTER_ADVANCED=n
---help---
Connection tracking keeps a record of what packets have passed
through your machine, in order to figure out how they are related
@@ -32,6 +33,7 @@ config NF_CONNTRACK_PROC_COMPAT
config IP_NF_QUEUE
tristate "IP Userspace queueing via NETLINK (OBSOLETE)"
+ depends on NETFILTER_ADVANCED
help
Netfilter has the ability to queue packets to user space: the
netlink device can be used to access them using this driver.
@@ -44,6 +46,7 @@ config IP_NF_QUEUE
config IP_NF_IPTABLES
tristate "IP tables support (required for filtering/masq/NAT)"
+ default m if NETFILTER_ADVANCED=n
select NETFILTER_XTABLES
help
iptables is a general, extensible packet identification framework.
@@ -54,27 +57,10 @@ config IP_NF_IPTABLES
To compile it as a module, choose M here. If unsure, say N.
# The matches.
-config IP_NF_MATCH_IPRANGE
- tristate "IP range match support"
- depends on IP_NF_IPTABLES
- help
- This option makes possible to match IP addresses against IP address
- ranges.
-
- To compile it as a module, choose M here. If unsure, say N.
-
-config IP_NF_MATCH_TOS
- tristate "TOS match support"
- depends on IP_NF_IPTABLES
- help
- TOS matching allows you to match packets based on the Type Of
- Service fields of the IP packet.
-
- To compile it as a module, choose M here. If unsure, say N.
-
config IP_NF_MATCH_RECENT
- tristate "recent match support"
+ tristate '"recent" match support'
depends on IP_NF_IPTABLES
+ depends on NETFILTER_ADVANCED
help
This match is used for creating one or many lists of recently
used addresses and then matching against that/those list(s).
@@ -85,8 +71,9 @@ config IP_NF_MATCH_RECENT
To compile it as a module, choose M here. If unsure, say N.
config IP_NF_MATCH_ECN
- tristate "ECN match support"
+ tristate '"ecn" match support'
depends on IP_NF_IPTABLES
+ depends on NETFILTER_ADVANCED
help
This option adds a `ECN' match, which allows you to match against
the IPv4 and TCP header ECN fields.
@@ -94,8 +81,9 @@ config IP_NF_MATCH_ECN
To compile it as a module, choose M here. If unsure, say N.
config IP_NF_MATCH_AH
- tristate "AH match support"
+ tristate '"ah" match support'
depends on IP_NF_IPTABLES
+ depends on NETFILTER_ADVANCED
help
This match extension allows you to match a range of SPIs
inside AH header of IPSec packets.
@@ -103,30 +91,23 @@ config IP_NF_MATCH_AH
To compile it as a module, choose M here. If unsure, say N.
config IP_NF_MATCH_TTL
- tristate "TTL match support"
+ tristate '"ttl" match support'
depends on IP_NF_IPTABLES
+ depends on NETFILTER_ADVANCED
help
This adds CONFIG_IP_NF_MATCH_TTL option, which enabled the user
to match packets by their TTL value.
To compile it as a module, choose M here. If unsure, say N.
-config IP_NF_MATCH_OWNER
- tristate "Owner match support"
- depends on IP_NF_IPTABLES
- help
- Packet owner matching allows you to match locally-generated packets
- based on who created them: the user, group, process or session.
-
- To compile it as a module, choose M here. If unsure, say N.
-
config IP_NF_MATCH_ADDRTYPE
- tristate 'address type match support'
+ tristate '"addrtype" address type match support'
depends on IP_NF_IPTABLES
+ depends on NETFILTER_ADVANCED
help
This option allows you to match what routing thinks of an address,
eg. UNICAST, LOCAL, BROADCAST, ...
-
+
If you want to compile it as a module, say M here and read
<file:Documentation/kbuild/modules.txt>. If unsure, say `N'.
@@ -134,6 +115,7 @@ config IP_NF_MATCH_ADDRTYPE
config IP_NF_FILTER
tristate "Packet filtering"
depends on IP_NF_IPTABLES
+ default m if NETFILTER_ADVANCED=n
help
Packet filtering defines a table `filter', which has a series of
rules for simple packet filtering at local input, forwarding and
@@ -144,6 +126,7 @@ config IP_NF_FILTER
config IP_NF_TARGET_REJECT
tristate "REJECT target support"
depends on IP_NF_FILTER
+ default m if NETFILTER_ADVANCED=n
help
The REJECT target allows a filtering rule to specify that an ICMP
error should be issued in response to an incoming packet, rather
@@ -154,6 +137,7 @@ config IP_NF_TARGET_REJECT
config IP_NF_TARGET_LOG
tristate "LOG target support"
depends on IP_NF_IPTABLES
+ default m if NETFILTER_ADVANCED=n
help
This option adds a `LOG' target, which allows you to create rules in
any iptables table which records the packet header to the syslog.
@@ -163,6 +147,7 @@ config IP_NF_TARGET_LOG
config IP_NF_TARGET_ULOG
tristate "ULOG target support"
depends on IP_NF_IPTABLES
+ default m if NETFILTER_ADVANCED=n
---help---
This option enables the old IPv4-only "ipt_ULOG" implementation
@@ -183,6 +168,7 @@ config IP_NF_TARGET_ULOG
config NF_NAT
tristate "Full NAT"
depends on IP_NF_IPTABLES && NF_CONNTRACK_IPV4
+ default m if NETFILTER_ADVANCED=n
help
The Full NAT option allows masquerading, port forwarding and other
forms of full Network Address Port Translation. It is controlled by
@@ -198,6 +184,7 @@ config NF_NAT_NEEDED
config IP_NF_TARGET_MASQUERADE
tristate "MASQUERADE target support"
depends on NF_NAT
+ default m if NETFILTER_ADVANCED=n
help
Masquerading is a special case of NAT: all outgoing connections are
changed to seem to come from a particular interface's address, and
@@ -210,6 +197,7 @@ config IP_NF_TARGET_MASQUERADE
config IP_NF_TARGET_REDIRECT
tristate "REDIRECT target support"
depends on NF_NAT
+ depends on NETFILTER_ADVANCED
help
REDIRECT is a special case of NAT: all incoming connections are
mapped onto the incoming interface's address, causing the packets to
@@ -221,6 +209,7 @@ config IP_NF_TARGET_REDIRECT
config IP_NF_TARGET_NETMAP
tristate "NETMAP target support"
depends on NF_NAT
+ depends on NETFILTER_ADVANCED
help
NETMAP is an implementation of static 1:1 NAT mapping of network
addresses. It maps the network address part, while keeping the host
@@ -229,18 +218,10 @@ config IP_NF_TARGET_NETMAP
To compile it as a module, choose M here. If unsure, say N.
-config IP_NF_TARGET_SAME
- tristate "SAME target support (OBSOLETE)"
- depends on NF_NAT
- help
- This option adds a `SAME' target, which works like the standard SNAT
- target, but attempts to give clients the same IP for all connections.
-
- To compile it as a module, choose M here. If unsure, say N.
-
config NF_NAT_SNMP_BASIC
- tristate "Basic SNMP-ALG support (EXPERIMENTAL)"
- depends on EXPERIMENTAL && NF_NAT
+ tristate "Basic SNMP-ALG support"
+ depends on NF_NAT
+ depends on NETFILTER_ADVANCED
---help---
This module implements an Application Layer Gateway (ALG) for
@@ -304,6 +285,7 @@ config NF_NAT_SIP
config IP_NF_MANGLE
tristate "Packet mangling"
depends on IP_NF_IPTABLES
+ default m if NETFILTER_ADVANCED=n
help
This option adds a `mangle' table to iptables: see the man page for
iptables(8). This table is used for various packet alterations
@@ -311,19 +293,10 @@ config IP_NF_MANGLE
To compile it as a module, choose M here. If unsure, say N.
-config IP_NF_TARGET_TOS
- tristate "TOS target support"
- depends on IP_NF_MANGLE
- help
- This option adds a `TOS' target, which allows you to create rules in
- the `mangle' table which alter the Type Of Service field of an IP
- packet prior to routing.
-
- To compile it as a module, choose M here. If unsure, say N.
-
config IP_NF_TARGET_ECN
tristate "ECN target support"
depends on IP_NF_MANGLE
+ depends on NETFILTER_ADVANCED
---help---
This option adds a `ECN' target, which can be used in the iptables mangle
table.
@@ -338,6 +311,7 @@ config IP_NF_TARGET_ECN
config IP_NF_TARGET_TTL
tristate 'TTL target support'
depends on IP_NF_MANGLE
+ depends on NETFILTER_ADVANCED
help
This option adds a `TTL' target, which enables the user to modify
the TTL value of the IP header.
@@ -353,6 +327,7 @@ config IP_NF_TARGET_CLUSTERIP
tristate "CLUSTERIP target support (EXPERIMENTAL)"
depends on IP_NF_MANGLE && EXPERIMENTAL
depends on NF_CONNTRACK_IPV4
+ depends on NETFILTER_ADVANCED
select NF_CONNTRACK_MARK
help
The CLUSTERIP target allows you to build load-balancing clusters of
@@ -365,6 +340,7 @@ config IP_NF_TARGET_CLUSTERIP
config IP_NF_RAW
tristate 'raw table support (required for NOTRACK/TRACE)'
depends on IP_NF_IPTABLES
+ depends on NETFILTER_ADVANCED
help
This option adds a `raw' table to iptables. This table is the very
first in the netfilter framework and hooks in at the PREROUTING
@@ -377,6 +353,7 @@ config IP_NF_RAW
config IP_NF_ARPTABLES
tristate "ARP tables support"
select NETFILTER_XTABLES
+ depends on NETFILTER_ADVANCED
help
arptables is a general, extensible packet identification framework.
The ARP packet filtering and mangling (manipulation)subsystems
diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile
index 7456833d6ade..0c7dc78a62e9 100644
--- a/net/ipv4/netfilter/Makefile
+++ b/net/ipv4/netfilter/Makefile
@@ -44,10 +44,7 @@ obj-$(CONFIG_IP_NF_RAW) += iptable_raw.o
obj-$(CONFIG_IP_NF_MATCH_ADDRTYPE) += ipt_addrtype.o
obj-$(CONFIG_IP_NF_MATCH_AH) += ipt_ah.o
obj-$(CONFIG_IP_NF_MATCH_ECN) += ipt_ecn.o
-obj-$(CONFIG_IP_NF_MATCH_IPRANGE) += ipt_iprange.o
-obj-$(CONFIG_IP_NF_MATCH_OWNER) += ipt_owner.o
obj-$(CONFIG_IP_NF_MATCH_RECENT) += ipt_recent.o
-obj-$(CONFIG_IP_NF_MATCH_TOS) += ipt_tos.o
obj-$(CONFIG_IP_NF_MATCH_TTL) += ipt_ttl.o
# targets
@@ -58,8 +55,6 @@ obj-$(CONFIG_IP_NF_TARGET_MASQUERADE) += ipt_MASQUERADE.o
obj-$(CONFIG_IP_NF_TARGET_NETMAP) += ipt_NETMAP.o
obj-$(CONFIG_IP_NF_TARGET_REDIRECT) += ipt_REDIRECT.o
obj-$(CONFIG_IP_NF_TARGET_REJECT) += ipt_REJECT.o
-obj-$(CONFIG_IP_NF_TARGET_SAME) += ipt_SAME.o
-obj-$(CONFIG_IP_NF_TARGET_TOS) += ipt_TOS.o
obj-$(CONFIG_IP_NF_TARGET_TTL) += ipt_TTL.o
obj-$(CONFIG_IP_NF_TARGET_ULOG) += ipt_ULOG.o
diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c
index 2909c92ecd99..b4a810c28ac8 100644
--- a/net/ipv4/netfilter/arp_tables.c
+++ b/net/ipv4/netfilter/arp_tables.c
@@ -19,9 +19,10 @@
#include <linux/proc_fs.h>
#include <linux/module.h>
#include <linux/init.h>
-
-#include <asm/uaccess.h>
#include <linux/mutex.h>
+#include <linux/err.h>
+#include <net/compat.h>
+#include <asm/uaccess.h>
#include <linux/netfilter/x_tables.h>
#include <linux/netfilter_arp/arp_tables.h>
@@ -83,7 +84,7 @@ static inline int arp_packet_match(const struct arphdr *arphdr,
__be32 src_ipaddr, tgt_ipaddr;
int i, ret;
-#define FWINV(bool,invflg) ((bool) ^ !!(arpinfo->invflags & invflg))
+#define FWINV(bool, invflg) ((bool) ^ !!(arpinfo->invflags & (invflg)))
if (FWINV((arphdr->ar_op & arpinfo->arpop_mask) != arpinfo->arpop,
ARPT_INV_ARPOP)) {
@@ -179,6 +180,7 @@ static inline int arp_packet_match(const struct arphdr *arphdr,
}
return 1;
+#undef FWINV
}
static inline int arp_checkentry(const struct arpt_arp *arp)
@@ -435,29 +437,9 @@ static int mark_source_chains(struct xt_table_info *newinfo,
return 1;
}
-static inline int standard_check(const struct arpt_entry_target *t,
- unsigned int max_offset)
-{
- /* Check standard info. */
- if (t->u.target_size
- != ARPT_ALIGN(sizeof(struct arpt_standard_target))) {
- duprintf("arpt_standard_check: target size %u != %Zu\n",
- t->u.target_size,
- ARPT_ALIGN(sizeof(struct arpt_standard_target)));
- return 0;
- }
-
- return 1;
-}
-
-static struct arpt_target arpt_standard_target;
-
-static inline int check_entry(struct arpt_entry *e, const char *name, unsigned int size,
- unsigned int *i)
+static inline int check_entry(struct arpt_entry *e, const char *name)
{
struct arpt_entry_target *t;
- struct arpt_target *target;
- int ret;
if (!arp_checkentry(&e->arp)) {
duprintf("arp_tables: arp check failed %p %s.\n", e, name);
@@ -471,35 +453,57 @@ static inline int check_entry(struct arpt_entry *e, const char *name, unsigned i
if (e->target_offset + t->u.target_size > e->next_offset)
return -EINVAL;
+ return 0;
+}
+
+static inline int check_target(struct arpt_entry *e, const char *name)
+{
+ struct arpt_entry_target *t;
+ struct arpt_target *target;
+ int ret;
+
+ t = arpt_get_target(e);
+ target = t->u.kernel.target;
+
+ ret = xt_check_target(target, NF_ARP, t->u.target_size - sizeof(*t),
+ name, e->comefrom, 0, 0);
+ if (!ret && t->u.kernel.target->checkentry
+ && !t->u.kernel.target->checkentry(name, e, target, t->data,
+ e->comefrom)) {
+ duprintf("arp_tables: check failed for `%s'.\n",
+ t->u.kernel.target->name);
+ ret = -EINVAL;
+ }
+ return ret;
+}
+
+static inline int
+find_check_entry(struct arpt_entry *e, const char *name, unsigned int size,
+ unsigned int *i)
+{
+ struct arpt_entry_target *t;
+ struct arpt_target *target;
+ int ret;
+
+ ret = check_entry(e, name);
+ if (ret)
+ return ret;
+
+ t = arpt_get_target(e);
target = try_then_request_module(xt_find_target(NF_ARP, t->u.user.name,
t->u.user.revision),
"arpt_%s", t->u.user.name);
if (IS_ERR(target) || !target) {
- duprintf("check_entry: `%s' not found\n", t->u.user.name);
+ duprintf("find_check_entry: `%s' not found\n", t->u.user.name);
ret = target ? PTR_ERR(target) : -ENOENT;
goto out;
}
t->u.kernel.target = target;
- ret = xt_check_target(target, NF_ARP, t->u.target_size - sizeof(*t),
- name, e->comefrom, 0, 0);
+ ret = check_target(e, name);
if (ret)
goto err;
- if (t->u.kernel.target == &arpt_standard_target) {
- if (!standard_check(t, size)) {
- ret = -EINVAL;
- goto err;
- }
- } else if (t->u.kernel.target->checkentry
- && !t->u.kernel.target->checkentry(name, e, target, t->data,
- e->comefrom)) {
- duprintf("arp_tables: check failed for `%s'.\n",
- t->u.kernel.target->name);
- ret = -EINVAL;
- goto err;
- }
-
(*i)++;
return 0;
err:
@@ -633,7 +637,7 @@ static int translate_table(const char *name,
/* Finally, each sanity check must pass */
i = 0;
ret = ARPT_ENTRY_ITERATE(entry0, newinfo->size,
- check_entry, name, size, &i);
+ find_check_entry, name, size, &i);
if (ret != 0) {
ARPT_ENTRY_ITERATE(entry0, newinfo->size,
@@ -704,16 +708,11 @@ static void get_counters(const struct xt_table_info *t,
}
}
-static int copy_entries_to_user(unsigned int total_size,
- struct arpt_table *table,
- void __user *userptr)
+static inline struct xt_counters *alloc_counters(struct arpt_table *table)
{
- unsigned int off, num, countersize;
- struct arpt_entry *e;
+ unsigned int countersize;
struct xt_counters *counters;
struct xt_table_info *private = table->private;
- int ret = 0;
- void *loc_cpu_entry;
/* We need atomic snapshot of counters: rest doesn't change
* (other than comefrom, which userspace doesn't care
@@ -723,13 +722,31 @@ static int copy_entries_to_user(unsigned int total_size,
counters = vmalloc_node(countersize, numa_node_id());
if (counters == NULL)
- return -ENOMEM;
+ return ERR_PTR(-ENOMEM);
/* First, sum counters... */
write_lock_bh(&table->lock);
get_counters(private, counters);
write_unlock_bh(&table->lock);
+ return counters;
+}
+
+static int copy_entries_to_user(unsigned int total_size,
+ struct arpt_table *table,
+ void __user *userptr)
+{
+ unsigned int off, num;
+ struct arpt_entry *e;
+ struct xt_counters *counters;
+ struct xt_table_info *private = table->private;
+ int ret = 0;
+ void *loc_cpu_entry;
+
+ counters = alloc_counters(table);
+ if (IS_ERR(counters))
+ return PTR_ERR(counters);
+
loc_cpu_entry = private->entries[raw_smp_processor_id()];
/* ... then copy entire thing ... */
if (copy_to_user(userptr, loc_cpu_entry, total_size) != 0) {
@@ -767,23 +784,159 @@ static int copy_entries_to_user(unsigned int total_size,
return ret;
}
-static int get_entries(const struct arpt_get_entries *entries,
- struct arpt_get_entries __user *uptr)
+#ifdef CONFIG_COMPAT
+static void compat_standard_from_user(void *dst, void *src)
+{
+ int v = *(compat_int_t *)src;
+
+ if (v > 0)
+ v += xt_compat_calc_jump(NF_ARP, v);
+ memcpy(dst, &v, sizeof(v));
+}
+
+static int compat_standard_to_user(void __user *dst, void *src)
{
+ compat_int_t cv = *(int *)src;
+
+ if (cv > 0)
+ cv -= xt_compat_calc_jump(NF_ARP, cv);
+ return copy_to_user(dst, &cv, sizeof(cv)) ? -EFAULT : 0;
+}
+
+static int compat_calc_entry(struct arpt_entry *e,
+ const struct xt_table_info *info,
+ void *base, struct xt_table_info *newinfo)
+{
+ struct arpt_entry_target *t;
+ unsigned int entry_offset;
+ int off, i, ret;
+
+ off = sizeof(struct arpt_entry) - sizeof(struct compat_arpt_entry);
+ entry_offset = (void *)e - base;
+
+ t = arpt_get_target(e);
+ off += xt_compat_target_offset(t->u.kernel.target);
+ newinfo->size -= off;
+ ret = xt_compat_add_offset(NF_ARP, entry_offset, off);
+ if (ret)
+ return ret;
+
+ for (i = 0; i < NF_ARP_NUMHOOKS; i++) {
+ if (info->hook_entry[i] &&
+ (e < (struct arpt_entry *)(base + info->hook_entry[i])))
+ newinfo->hook_entry[i] -= off;
+ if (info->underflow[i] &&
+ (e < (struct arpt_entry *)(base + info->underflow[i])))
+ newinfo->underflow[i] -= off;
+ }
+ return 0;
+}
+
+static int compat_table_info(const struct xt_table_info *info,
+ struct xt_table_info *newinfo)
+{
+ void *loc_cpu_entry;
+
+ if (!newinfo || !info)
+ return -EINVAL;
+
+ /* we dont care about newinfo->entries[] */
+ memcpy(newinfo, info, offsetof(struct xt_table_info, entries));
+ newinfo->initial_entries = 0;
+ loc_cpu_entry = info->entries[raw_smp_processor_id()];
+ return ARPT_ENTRY_ITERATE(loc_cpu_entry, info->size,
+ compat_calc_entry, info, loc_cpu_entry,
+ newinfo);
+}
+#endif
+
+static int get_info(void __user *user, int *len, int compat)
+{
+ char name[ARPT_TABLE_MAXNAMELEN];
+ struct arpt_table *t;
int ret;
+
+ if (*len != sizeof(struct arpt_getinfo)) {
+ duprintf("length %u != %Zu\n", *len,
+ sizeof(struct arpt_getinfo));
+ return -EINVAL;
+ }
+
+ if (copy_from_user(name, user, sizeof(name)) != 0)
+ return -EFAULT;
+
+ name[ARPT_TABLE_MAXNAMELEN-1] = '\0';
+#ifdef CONFIG_COMPAT
+ if (compat)
+ xt_compat_lock(NF_ARP);
+#endif
+ t = try_then_request_module(xt_find_table_lock(NF_ARP, name),
+ "arptable_%s", name);
+ if (t && !IS_ERR(t)) {
+ struct arpt_getinfo info;
+ struct xt_table_info *private = t->private;
+
+#ifdef CONFIG_COMPAT
+ if (compat) {
+ struct xt_table_info tmp;
+ ret = compat_table_info(private, &tmp);
+ xt_compat_flush_offsets(NF_ARP);
+ private = &tmp;
+ }
+#endif
+ info.valid_hooks = t->valid_hooks;
+ memcpy(info.hook_entry, private->hook_entry,
+ sizeof(info.hook_entry));
+ memcpy(info.underflow, private->underflow,
+ sizeof(info.underflow));
+ info.num_entries = private->number;
+ info.size = private->size;
+ strcpy(info.name, name);
+
+ if (copy_to_user(user, &info, *len) != 0)
+ ret = -EFAULT;
+ else
+ ret = 0;
+ xt_table_unlock(t);
+ module_put(t->me);
+ } else
+ ret = t ? PTR_ERR(t) : -ENOENT;
+#ifdef CONFIG_COMPAT
+ if (compat)
+ xt_compat_unlock(NF_ARP);
+#endif
+ return ret;
+}
+
+static int get_entries(struct arpt_get_entries __user *uptr, int *len)
+{
+ int ret;
+ struct arpt_get_entries get;
struct arpt_table *t;
- t = xt_find_table_lock(NF_ARP, entries->name);
+ if (*len < sizeof(get)) {
+ duprintf("get_entries: %u < %Zu\n", *len, sizeof(get));
+ return -EINVAL;
+ }
+ if (copy_from_user(&get, uptr, sizeof(get)) != 0)
+ return -EFAULT;
+ if (*len != sizeof(struct arpt_get_entries) + get.size) {
+ duprintf("get_entries: %u != %Zu\n", *len,
+ sizeof(struct arpt_get_entries) + get.size);
+ return -EINVAL;
+ }
+
+ t = xt_find_table_lock(NF_ARP, get.name);
if (t && !IS_ERR(t)) {
struct xt_table_info *private = t->private;
duprintf("t->private->number = %u\n",
private->number);
- if (entries->size == private->size)
+ if (get.size == private->size)
ret = copy_entries_to_user(private->size,
t, uptr->entrytable);
else {
duprintf("get_entries: I've got %u not %u!\n",
- private->size, entries->size);
+ private->size, get.size);
ret = -EINVAL;
}
module_put(t->me);
@@ -794,71 +947,41 @@ static int get_entries(const struct arpt_get_entries *entries,
return ret;
}
-static int do_replace(void __user *user, unsigned int len)
+static int __do_replace(const char *name, unsigned int valid_hooks,
+ struct xt_table_info *newinfo,
+ unsigned int num_counters,
+ void __user *counters_ptr)
{
int ret;
- struct arpt_replace tmp;
struct arpt_table *t;
- struct xt_table_info *newinfo, *oldinfo;
+ struct xt_table_info *oldinfo;
struct xt_counters *counters;
- void *loc_cpu_entry, *loc_cpu_old_entry;
-
- if (copy_from_user(&tmp, user, sizeof(tmp)) != 0)
- return -EFAULT;
+ void *loc_cpu_old_entry;
- /* Hack: Causes ipchains to give correct error msg --RR */
- if (len != sizeof(tmp) + tmp.size)
- return -ENOPROTOOPT;
-
- /* overflow check */
- if (tmp.size >= (INT_MAX - sizeof(struct xt_table_info)) / NR_CPUS -
- SMP_CACHE_BYTES)
- return -ENOMEM;
- if (tmp.num_counters >= INT_MAX / sizeof(struct xt_counters))
- return -ENOMEM;
-
- newinfo = xt_alloc_table_info(tmp.size);
- if (!newinfo)
- return -ENOMEM;
-
- /* choose the copy that is on our node/cpu */
- loc_cpu_entry = newinfo->entries[raw_smp_processor_id()];
- if (copy_from_user(loc_cpu_entry, user + sizeof(tmp),
- tmp.size) != 0) {
- ret = -EFAULT;
- goto free_newinfo;
- }
-
- counters = vmalloc(tmp.num_counters * sizeof(struct xt_counters));
+ ret = 0;
+ counters = vmalloc_node(num_counters * sizeof(struct xt_counters),
+ numa_node_id());
if (!counters) {
ret = -ENOMEM;
- goto free_newinfo;
+ goto out;
}
- ret = translate_table(tmp.name, tmp.valid_hooks,
- newinfo, loc_cpu_entry, tmp.size, tmp.num_entries,
- tmp.hook_entry, tmp.underflow);
- if (ret != 0)
- goto free_newinfo_counters;
-
- duprintf("arp_tables: Translated table\n");
-
- t = try_then_request_module(xt_find_table_lock(NF_ARP, tmp.name),
- "arptable_%s", tmp.name);
+ t = try_then_request_module(xt_find_table_lock(NF_ARP, name),
+ "arptable_%s", name);
if (!t || IS_ERR(t)) {
ret = t ? PTR_ERR(t) : -ENOENT;
goto free_newinfo_counters_untrans;
}
/* You lied! */
- if (tmp.valid_hooks != t->valid_hooks) {
+ if (valid_hooks != t->valid_hooks) {
duprintf("Valid hook crap: %08X vs %08X\n",
- tmp.valid_hooks, t->valid_hooks);
+ valid_hooks, t->valid_hooks);
ret = -EINVAL;
goto put_module;
}
- oldinfo = xt_replace_table(t, tmp.num_counters, newinfo, &ret);
+ oldinfo = xt_replace_table(t, num_counters, newinfo, &ret);
if (!oldinfo)
goto put_module;
@@ -876,11 +999,12 @@ static int do_replace(void __user *user, unsigned int len)
get_counters(oldinfo, counters);
/* Decrease module usage counts and free resource */
loc_cpu_old_entry = oldinfo->entries[raw_smp_processor_id()];
- ARPT_ENTRY_ITERATE(loc_cpu_old_entry, oldinfo->size, cleanup_entry,NULL);
+ ARPT_ENTRY_ITERATE(loc_cpu_old_entry, oldinfo->size, cleanup_entry,
+ NULL);
xt_free_table_info(oldinfo);
- if (copy_to_user(tmp.counters, counters,
- sizeof(struct xt_counters) * tmp.num_counters) != 0)
+ if (copy_to_user(counters_ptr, counters,
+ sizeof(struct xt_counters) * num_counters) != 0)
ret = -EFAULT;
vfree(counters);
xt_table_unlock(t);
@@ -890,9 +1014,53 @@ static int do_replace(void __user *user, unsigned int len)
module_put(t->me);
xt_table_unlock(t);
free_newinfo_counters_untrans:
- ARPT_ENTRY_ITERATE(loc_cpu_entry, newinfo->size, cleanup_entry, NULL);
- free_newinfo_counters:
vfree(counters);
+ out:
+ return ret;
+}
+
+static int do_replace(void __user *user, unsigned int len)
+{
+ int ret;
+ struct arpt_replace tmp;
+ struct xt_table_info *newinfo;
+ void *loc_cpu_entry;
+
+ if (copy_from_user(&tmp, user, sizeof(tmp)) != 0)
+ return -EFAULT;
+
+ /* overflow check */
+ if (tmp.num_counters >= INT_MAX / sizeof(struct xt_counters))
+ return -ENOMEM;
+
+ newinfo = xt_alloc_table_info(tmp.size);
+ if (!newinfo)
+ return -ENOMEM;
+
+ /* choose the copy that is on our node/cpu */
+ loc_cpu_entry = newinfo->entries[raw_smp_processor_id()];
+ if (copy_from_user(loc_cpu_entry, user + sizeof(tmp),
+ tmp.size) != 0) {
+ ret = -EFAULT;
+ goto free_newinfo;
+ }
+
+ ret = translate_table(tmp.name, tmp.valid_hooks,
+ newinfo, loc_cpu_entry, tmp.size, tmp.num_entries,
+ tmp.hook_entry, tmp.underflow);
+ if (ret != 0)
+ goto free_newinfo;
+
+ duprintf("arp_tables: Translated table\n");
+
+ ret = __do_replace(tmp.name, tmp.valid_hooks, newinfo,
+ tmp.num_counters, tmp.counters);
+ if (ret)
+ goto free_newinfo_untrans;
+ return 0;
+
+ free_newinfo_untrans:
+ ARPT_ENTRY_ITERATE(loc_cpu_entry, newinfo->size, cleanup_entry, NULL);
free_newinfo:
xt_free_table_info(newinfo);
return ret;
@@ -912,31 +1080,59 @@ static inline int add_counter_to_entry(struct arpt_entry *e,
return 0;
}
-static int do_add_counters(void __user *user, unsigned int len)
+static int do_add_counters(void __user *user, unsigned int len, int compat)
{
unsigned int i;
- struct xt_counters_info tmp, *paddc;
+ struct xt_counters_info tmp;
+ struct xt_counters *paddc;
+ unsigned int num_counters;
+ char *name;
+ int size;
+ void *ptmp;
struct arpt_table *t;
struct xt_table_info *private;
int ret = 0;
void *loc_cpu_entry;
+#ifdef CONFIG_COMPAT
+ struct compat_xt_counters_info compat_tmp;
- if (copy_from_user(&tmp, user, sizeof(tmp)) != 0)
+ if (compat) {
+ ptmp = &compat_tmp;
+ size = sizeof(struct compat_xt_counters_info);
+ } else
+#endif
+ {
+ ptmp = &tmp;
+ size = sizeof(struct xt_counters_info);
+ }
+
+ if (copy_from_user(ptmp, user, size) != 0)
return -EFAULT;
- if (len != sizeof(tmp) + tmp.num_counters*sizeof(struct xt_counters))
+#ifdef CONFIG_COMPAT
+ if (compat) {
+ num_counters = compat_tmp.num_counters;
+ name = compat_tmp.name;
+ } else
+#endif
+ {
+ num_counters = tmp.num_counters;
+ name = tmp.name;
+ }
+
+ if (len != size + num_counters * sizeof(struct xt_counters))
return -EINVAL;
- paddc = vmalloc(len);
+ paddc = vmalloc_node(len - size, numa_node_id());
if (!paddc)
return -ENOMEM;
- if (copy_from_user(paddc, user, len) != 0) {
+ if (copy_from_user(paddc, user + size, len - size) != 0) {
ret = -EFAULT;
goto free;
}
- t = xt_find_table_lock(NF_ARP, tmp.name);
+ t = xt_find_table_lock(NF_ARP, name);
if (!t || IS_ERR(t)) {
ret = t ? PTR_ERR(t) : -ENOENT;
goto free;
@@ -944,7 +1140,7 @@ static int do_add_counters(void __user *user, unsigned int len)
write_lock_bh(&t->lock);
private = t->private;
- if (private->number != tmp.num_counters) {
+ if (private->number != num_counters) {
ret = -EINVAL;
goto unlock_up_free;
}
@@ -955,7 +1151,7 @@ static int do_add_counters(void __user *user, unsigned int len)
ARPT_ENTRY_ITERATE(loc_cpu_entry,
private->size,
add_counter_to_entry,
- paddc->counters,
+ paddc,
&i);
unlock_up_free:
write_unlock_bh(&t->lock);
@@ -967,7 +1163,329 @@ static int do_add_counters(void __user *user, unsigned int len)
return ret;
}
-static int do_arpt_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)
+#ifdef CONFIG_COMPAT
+static inline int
+compat_release_entry(struct compat_arpt_entry *e, unsigned int *i)
+{
+ struct arpt_entry_target *t;
+
+ if (i && (*i)-- == 0)
+ return 1;
+
+ t = compat_arpt_get_target(e);
+ module_put(t->u.kernel.target->me);
+ return 0;
+}
+
+static inline int
+check_compat_entry_size_and_hooks(struct compat_arpt_entry *e,
+ struct xt_table_info *newinfo,
+ unsigned int *size,
+ unsigned char *base,
+ unsigned char *limit,
+ unsigned int *hook_entries,
+ unsigned int *underflows,
+ unsigned int *i,
+ const char *name)
+{
+ struct arpt_entry_target *t;
+ struct xt_target *target;
+ unsigned int entry_offset;
+ int ret, off, h;
+
+ duprintf("check_compat_entry_size_and_hooks %p\n", e);
+ if ((unsigned long)e % __alignof__(struct compat_arpt_entry) != 0
+ || (unsigned char *)e + sizeof(struct compat_arpt_entry) >= limit) {
+ duprintf("Bad offset %p, limit = %p\n", e, limit);
+ return -EINVAL;
+ }
+
+ if (e->next_offset < sizeof(struct compat_arpt_entry) +
+ sizeof(struct compat_xt_entry_target)) {
+ duprintf("checking: element %p size %u\n",
+ e, e->next_offset);
+ return -EINVAL;
+ }
+
+ /* For purposes of check_entry casting the compat entry is fine */
+ ret = check_entry((struct arpt_entry *)e, name);
+ if (ret)
+ return ret;
+
+ off = sizeof(struct arpt_entry) - sizeof(struct compat_arpt_entry);
+ entry_offset = (void *)e - (void *)base;
+
+ t = compat_arpt_get_target(e);
+ target = try_then_request_module(xt_find_target(NF_ARP,
+ t->u.user.name,
+ t->u.user.revision),
+ "arpt_%s", t->u.user.name);
+ if (IS_ERR(target) || !target) {
+ duprintf("check_compat_entry_size_and_hooks: `%s' not found\n",
+ t->u.user.name);
+ ret = target ? PTR_ERR(target) : -ENOENT;
+ goto out;
+ }
+ t->u.kernel.target = target;
+
+ off += xt_compat_target_offset(target);
+ *size += off;
+ ret = xt_compat_add_offset(NF_ARP, entry_offset, off);
+ if (ret)
+ goto release_target;
+
+ /* Check hooks & underflows */
+ for (h = 0; h < NF_ARP_NUMHOOKS; h++) {
+ if ((unsigned char *)e - base == hook_entries[h])
+ newinfo->hook_entry[h] = hook_entries[h];
+ if ((unsigned char *)e - base == underflows[h])
+ newinfo->underflow[h] = underflows[h];
+ }
+
+ /* Clear counters and comefrom */
+ memset(&e->counters, 0, sizeof(e->counters));
+ e->comefrom = 0;
+
+ (*i)++;
+ return 0;
+
+release_target:
+ module_put(t->u.kernel.target->me);
+out:
+ return ret;
+}
+
+static int
+compat_copy_entry_from_user(struct compat_arpt_entry *e, void **dstptr,
+ unsigned int *size, const char *name,
+ struct xt_table_info *newinfo, unsigned char *base)
+{
+ struct arpt_entry_target *t;
+ struct xt_target *target;
+ struct arpt_entry *de;
+ unsigned int origsize;
+ int ret, h;
+
+ ret = 0;
+ origsize = *size;
+ de = (struct arpt_entry *)*dstptr;
+ memcpy(de, e, sizeof(struct arpt_entry));
+ memcpy(&de->counters, &e->counters, sizeof(e->counters));
+
+ *dstptr += sizeof(struct arpt_entry);
+ *size += sizeof(struct arpt_entry) - sizeof(struct compat_arpt_entry);
+
+ de->target_offset = e->target_offset - (origsize - *size);
+ t = compat_arpt_get_target(e);
+ target = t->u.kernel.target;
+ xt_compat_target_from_user(t, dstptr, size);
+
+ de->next_offset = e->next_offset - (origsize - *size);
+ for (h = 0; h < NF_ARP_NUMHOOKS; h++) {
+ if ((unsigned char *)de - base < newinfo->hook_entry[h])
+ newinfo->hook_entry[h] -= origsize - *size;
+ if ((unsigned char *)de - base < newinfo->underflow[h])
+ newinfo->underflow[h] -= origsize - *size;
+ }
+ return ret;
+}
+
+static inline int compat_check_entry(struct arpt_entry *e, const char *name,
+ unsigned int *i)
+{
+ int ret;
+
+ ret = check_target(e, name);
+ if (ret)
+ return ret;
+
+ (*i)++;
+ return 0;
+}
+
+static int translate_compat_table(const char *name,
+ unsigned int valid_hooks,
+ struct xt_table_info **pinfo,
+ void **pentry0,
+ unsigned int total_size,
+ unsigned int number,
+ unsigned int *hook_entries,
+ unsigned int *underflows)
+{
+ unsigned int i, j;
+ struct xt_table_info *newinfo, *info;
+ void *pos, *entry0, *entry1;
+ unsigned int size;
+ int ret;
+
+ info = *pinfo;
+ entry0 = *pentry0;
+ size = total_size;
+ info->number = number;
+
+ /* Init all hooks to impossible value. */
+ for (i = 0; i < NF_ARP_NUMHOOKS; i++) {
+ info->hook_entry[i] = 0xFFFFFFFF;
+ info->underflow[i] = 0xFFFFFFFF;
+ }
+
+ duprintf("translate_compat_table: size %u\n", info->size);
+ j = 0;
+ xt_compat_lock(NF_ARP);
+ /* Walk through entries, checking offsets. */
+ ret = COMPAT_ARPT_ENTRY_ITERATE(entry0, total_size,
+ check_compat_entry_size_and_hooks,
+ info, &size, entry0,
+ entry0 + total_size,
+ hook_entries, underflows, &j, name);
+ if (ret != 0)
+ goto out_unlock;
+
+ ret = -EINVAL;
+ if (j != number) {
+ duprintf("translate_compat_table: %u not %u entries\n",
+ j, number);
+ goto out_unlock;
+ }
+
+ /* Check hooks all assigned */
+ for (i = 0; i < NF_ARP_NUMHOOKS; i++) {
+ /* Only hooks which are valid */
+ if (!(valid_hooks & (1 << i)))
+ continue;
+ if (info->hook_entry[i] == 0xFFFFFFFF) {
+ duprintf("Invalid hook entry %u %u\n",
+ i, hook_entries[i]);
+ goto out_unlock;
+ }
+ if (info->underflow[i] == 0xFFFFFFFF) {
+ duprintf("Invalid underflow %u %u\n",
+ i, underflows[i]);
+ goto out_unlock;
+ }
+ }
+
+ ret = -ENOMEM;
+ newinfo = xt_alloc_table_info(size);
+ if (!newinfo)
+ goto out_unlock;
+
+ newinfo->number = number;
+ for (i = 0; i < NF_ARP_NUMHOOKS; i++) {
+ newinfo->hook_entry[i] = info->hook_entry[i];
+ newinfo->underflow[i] = info->underflow[i];
+ }
+ entry1 = newinfo->entries[raw_smp_processor_id()];
+ pos = entry1;
+ size = total_size;
+ ret = COMPAT_ARPT_ENTRY_ITERATE(entry0, total_size,
+ compat_copy_entry_from_user,
+ &pos, &size, name, newinfo, entry1);
+ xt_compat_flush_offsets(NF_ARP);
+ xt_compat_unlock(NF_ARP);
+ if (ret)
+ goto free_newinfo;
+
+ ret = -ELOOP;
+ if (!mark_source_chains(newinfo, valid_hooks, entry1))
+ goto free_newinfo;
+
+ i = 0;
+ ret = ARPT_ENTRY_ITERATE(entry1, newinfo->size, compat_check_entry,
+ name, &i);
+ if (ret) {
+ j -= i;
+ COMPAT_ARPT_ENTRY_ITERATE_CONTINUE(entry0, newinfo->size, i,
+ compat_release_entry, &j);
+ ARPT_ENTRY_ITERATE(entry1, newinfo->size, cleanup_entry, &i);
+ xt_free_table_info(newinfo);
+ return ret;
+ }
+
+ /* And one copy for every other CPU */
+ for_each_possible_cpu(i)
+ if (newinfo->entries[i] && newinfo->entries[i] != entry1)
+ memcpy(newinfo->entries[i], entry1, newinfo->size);
+
+ *pinfo = newinfo;
+ *pentry0 = entry1;
+ xt_free_table_info(info);
+ return 0;
+
+free_newinfo:
+ xt_free_table_info(newinfo);
+out:
+ COMPAT_ARPT_ENTRY_ITERATE(entry0, total_size, compat_release_entry, &j);
+ return ret;
+out_unlock:
+ xt_compat_flush_offsets(NF_ARP);
+ xt_compat_unlock(NF_ARP);
+ goto out;
+}
+
+struct compat_arpt_replace {
+ char name[ARPT_TABLE_MAXNAMELEN];
+ u32 valid_hooks;
+ u32 num_entries;
+ u32 size;
+ u32 hook_entry[NF_ARP_NUMHOOKS];
+ u32 underflow[NF_ARP_NUMHOOKS];
+ u32 num_counters;
+ compat_uptr_t counters;
+ struct compat_arpt_entry entries[0];
+};
+
+static int compat_do_replace(void __user *user, unsigned int len)
+{
+ int ret;
+ struct compat_arpt_replace tmp;
+ struct xt_table_info *newinfo;
+ void *loc_cpu_entry;
+
+ if (copy_from_user(&tmp, user, sizeof(tmp)) != 0)
+ return -EFAULT;
+
+ /* overflow check */
+ if (tmp.size >= INT_MAX / num_possible_cpus())
+ return -ENOMEM;
+ if (tmp.num_counters >= INT_MAX / sizeof(struct xt_counters))
+ return -ENOMEM;
+
+ newinfo = xt_alloc_table_info(tmp.size);
+ if (!newinfo)
+ return -ENOMEM;
+
+ /* choose the copy that is on our node/cpu */
+ loc_cpu_entry = newinfo->entries[raw_smp_processor_id()];
+ if (copy_from_user(loc_cpu_entry, user + sizeof(tmp), tmp.size) != 0) {
+ ret = -EFAULT;
+ goto free_newinfo;
+ }
+
+ ret = translate_compat_table(tmp.name, tmp.valid_hooks,
+ &newinfo, &loc_cpu_entry, tmp.size,
+ tmp.num_entries, tmp.hook_entry,
+ tmp.underflow);
+ if (ret != 0)
+ goto free_newinfo;
+
+ duprintf("compat_do_replace: Translated table\n");
+
+ ret = __do_replace(tmp.name, tmp.valid_hooks, newinfo,
+ tmp.num_counters, compat_ptr(tmp.counters));
+ if (ret)
+ goto free_newinfo_untrans;
+ return 0;
+
+ free_newinfo_untrans:
+ ARPT_ENTRY_ITERATE(loc_cpu_entry, newinfo->size, cleanup_entry, NULL);
+ free_newinfo:
+ xt_free_table_info(newinfo);
+ return ret;
+}
+
+static int compat_do_arpt_set_ctl(struct sock *sk, int cmd, void __user *user,
+ unsigned int len)
{
int ret;
@@ -976,11 +1494,11 @@ static int do_arpt_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned
switch (cmd) {
case ARPT_SO_SET_REPLACE:
- ret = do_replace(user, len);
+ ret = compat_do_replace(user, len);
break;
case ARPT_SO_SET_ADD_COUNTERS:
- ret = do_add_counters(user, len);
+ ret = do_add_counters(user, len, 1);
break;
default:
@@ -991,74 +1509,190 @@ static int do_arpt_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned
return ret;
}
-static int do_arpt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
+static int compat_copy_entry_to_user(struct arpt_entry *e, void __user **dstptr,
+ compat_uint_t *size,
+ struct xt_counters *counters,
+ unsigned int *i)
{
+ struct arpt_entry_target *t;
+ struct compat_arpt_entry __user *ce;
+ u_int16_t target_offset, next_offset;
+ compat_uint_t origsize;
int ret;
- if (!capable(CAP_NET_ADMIN))
- return -EPERM;
+ ret = -EFAULT;
+ origsize = *size;
+ ce = (struct compat_arpt_entry __user *)*dstptr;
+ if (copy_to_user(ce, e, sizeof(struct arpt_entry)))
+ goto out;
- switch (cmd) {
- case ARPT_SO_GET_INFO: {
- char name[ARPT_TABLE_MAXNAMELEN];
- struct arpt_table *t;
+ if (copy_to_user(&ce->counters, &counters[*i], sizeof(counters[*i])))
+ goto out;
+
+ *dstptr += sizeof(struct compat_arpt_entry);
+ *size -= sizeof(struct arpt_entry) - sizeof(struct compat_arpt_entry);
- if (*len != sizeof(struct arpt_getinfo)) {
- duprintf("length %u != %Zu\n", *len,
- sizeof(struct arpt_getinfo));
+ target_offset = e->target_offset - (origsize - *size);
+
+ t = arpt_get_target(e);
+ ret = xt_compat_target_to_user(t, dstptr, size);
+ if (ret)
+ goto out;
+ ret = -EFAULT;
+ next_offset = e->next_offset - (origsize - *size);
+ if (put_user(target_offset, &ce->target_offset))
+ goto out;
+ if (put_user(next_offset, &ce->next_offset))
+ goto out;
+
+ (*i)++;
+ return 0;
+out:
+ return ret;
+}
+
+static int compat_copy_entries_to_user(unsigned int total_size,
+ struct arpt_table *table,
+ void __user *userptr)
+{
+ struct xt_counters *counters;
+ struct xt_table_info *private = table->private;
+ void __user *pos;
+ unsigned int size;
+ int ret = 0;
+ void *loc_cpu_entry;
+ unsigned int i = 0;
+
+ counters = alloc_counters(table);
+ if (IS_ERR(counters))
+ return PTR_ERR(counters);
+
+ /* choose the copy on our node/cpu */
+ loc_cpu_entry = private->entries[raw_smp_processor_id()];
+ pos = userptr;
+ size = total_size;
+ ret = ARPT_ENTRY_ITERATE(loc_cpu_entry, total_size,
+ compat_copy_entry_to_user,
+ &pos, &size, counters, &i);
+ vfree(counters);
+ return ret;
+}
+
+struct compat_arpt_get_entries {
+ char name[ARPT_TABLE_MAXNAMELEN];
+ compat_uint_t size;
+ struct compat_arpt_entry entrytable[0];
+};
+
+static int compat_get_entries(struct compat_arpt_get_entries __user *uptr,
+ int *len)
+{
+ int ret;
+ struct compat_arpt_get_entries get;
+ struct arpt_table *t;
+
+ if (*len < sizeof(get)) {
+ duprintf("compat_get_entries: %u < %zu\n", *len, sizeof(get));
+ return -EINVAL;
+ }
+ if (copy_from_user(&get, uptr, sizeof(get)) != 0)
+ return -EFAULT;
+ if (*len != sizeof(struct compat_arpt_get_entries) + get.size) {
+ duprintf("compat_get_entries: %u != %zu\n",
+ *len, sizeof(get) + get.size);
+ return -EINVAL;
+ }
+
+ xt_compat_lock(NF_ARP);
+ t = xt_find_table_lock(NF_ARP, get.name);
+ if (t && !IS_ERR(t)) {
+ struct xt_table_info *private = t->private;
+ struct xt_table_info info;
+
+ duprintf("t->private->number = %u\n", private->number);
+ ret = compat_table_info(private, &info);
+ if (!ret && get.size == info.size) {
+ ret = compat_copy_entries_to_user(private->size,
+ t, uptr->entrytable);
+ } else if (!ret) {
+ duprintf("compat_get_entries: I've got %u not %u!\n",
+ private->size, get.size);
ret = -EINVAL;
- break;
}
+ xt_compat_flush_offsets(NF_ARP);
+ module_put(t->me);
+ xt_table_unlock(t);
+ } else
+ ret = t ? PTR_ERR(t) : -ENOENT;
- if (copy_from_user(name, user, sizeof(name)) != 0) {
- ret = -EFAULT;
- break;
- }
- name[ARPT_TABLE_MAXNAMELEN-1] = '\0';
-
- t = try_then_request_module(xt_find_table_lock(NF_ARP, name),
- "arptable_%s", name);
- if (t && !IS_ERR(t)) {
- struct arpt_getinfo info;
- struct xt_table_info *private = t->private;
-
- info.valid_hooks = t->valid_hooks;
- memcpy(info.hook_entry, private->hook_entry,
- sizeof(info.hook_entry));
- memcpy(info.underflow, private->underflow,
- sizeof(info.underflow));
- info.num_entries = private->number;
- info.size = private->size;
- strcpy(info.name, name);
-
- if (copy_to_user(user, &info, *len) != 0)
- ret = -EFAULT;
- else
- ret = 0;
- xt_table_unlock(t);
- module_put(t->me);
- } else
- ret = t ? PTR_ERR(t) : -ENOENT;
+ xt_compat_unlock(NF_ARP);
+ return ret;
+}
+
+static int do_arpt_get_ctl(struct sock *, int, void __user *, int *);
+
+static int compat_do_arpt_get_ctl(struct sock *sk, int cmd, void __user *user,
+ int *len)
+{
+ int ret;
+
+ if (!capable(CAP_NET_ADMIN))
+ return -EPERM;
+
+ switch (cmd) {
+ case ARPT_SO_GET_INFO:
+ ret = get_info(user, len, 1);
+ break;
+ case ARPT_SO_GET_ENTRIES:
+ ret = compat_get_entries(user, len);
+ break;
+ default:
+ ret = do_arpt_get_ctl(sk, cmd, user, len);
}
- break;
+ return ret;
+}
+#endif
- case ARPT_SO_GET_ENTRIES: {
- struct arpt_get_entries get;
+static int do_arpt_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)
+{
+ int ret;
- if (*len < sizeof(get)) {
- duprintf("get_entries: %u < %Zu\n", *len, sizeof(get));
- ret = -EINVAL;
- } else if (copy_from_user(&get, user, sizeof(get)) != 0) {
- ret = -EFAULT;
- } else if (*len != sizeof(struct arpt_get_entries) + get.size) {
- duprintf("get_entries: %u != %Zu\n", *len,
- sizeof(struct arpt_get_entries) + get.size);
- ret = -EINVAL;
- } else
- ret = get_entries(&get, user);
+ if (!capable(CAP_NET_ADMIN))
+ return -EPERM;
+
+ switch (cmd) {
+ case ARPT_SO_SET_REPLACE:
+ ret = do_replace(user, len);
+ break;
+
+ case ARPT_SO_SET_ADD_COUNTERS:
+ ret = do_add_counters(user, len, 0);
break;
+
+ default:
+ duprintf("do_arpt_set_ctl: unknown request %i\n", cmd);
+ ret = -EINVAL;
}
+ return ret;
+}
+
+static int do_arpt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
+{
+ int ret;
+
+ if (!capable(CAP_NET_ADMIN))
+ return -EPERM;
+
+ switch (cmd) {
+ case ARPT_SO_GET_INFO:
+ ret = get_info(user, len, 0);
+ break;
+
+ case ARPT_SO_GET_ENTRIES:
+ ret = get_entries(user, len);
+ break;
+
case ARPT_SO_GET_REVISION_TARGET: {
struct xt_get_revision rev;
@@ -1090,7 +1724,7 @@ int arpt_register_table(struct arpt_table *table,
{
int ret;
struct xt_table_info *newinfo;
- static struct xt_table_info bootstrap
+ struct xt_table_info bootstrap
= { 0, 0, 0, { 0 }, { 0 }, { } };
void *loc_cpu_entry;
@@ -1144,6 +1778,11 @@ static struct arpt_target arpt_standard_target __read_mostly = {
.name = ARPT_STANDARD_TARGET,
.targetsize = sizeof(int),
.family = NF_ARP,
+#ifdef CONFIG_COMPAT
+ .compatsize = sizeof(compat_int_t),
+ .compat_from_user = compat_standard_from_user,
+ .compat_to_user = compat_standard_to_user,
+#endif
};
static struct arpt_target arpt_error_target __read_mostly = {
@@ -1158,9 +1797,15 @@ static struct nf_sockopt_ops arpt_sockopts = {
.set_optmin = ARPT_BASE_CTL,
.set_optmax = ARPT_SO_SET_MAX+1,
.set = do_arpt_set_ctl,
+#ifdef CONFIG_COMPAT
+ .compat_set = compat_do_arpt_set_ctl,
+#endif
.get_optmin = ARPT_BASE_CTL,
.get_optmax = ARPT_SO_GET_MAX+1,
.get = do_arpt_get_ctl,
+#ifdef CONFIG_COMPAT
+ .compat_get = compat_do_arpt_get_ctl,
+#endif
.owner = THIS_MODULE,
};
diff --git a/net/ipv4/netfilter/arptable_filter.c b/net/ipv4/netfilter/arptable_filter.c
index 302d3da5f696..7201511d54d2 100644
--- a/net/ipv4/netfilter/arptable_filter.c
+++ b/net/ipv4/netfilter/arptable_filter.c
@@ -64,7 +64,7 @@ static unsigned int arpt_hook(unsigned int hook,
return arpt_do_table(skb, hook, in, out, &packet_filter);
}
-static struct nf_hook_ops arpt_ops[] = {
+static struct nf_hook_ops arpt_ops[] __read_mostly = {
{
.hook = arpt_hook,
.owner = THIS_MODULE,
diff --git a/net/ipv4/netfilter/ip_queue.c b/net/ipv4/netfilter/ip_queue.c
index 14d64a383db1..5109839da222 100644
--- a/net/ipv4/netfilter/ip_queue.c
+++ b/net/ipv4/netfilter/ip_queue.c
@@ -28,19 +28,15 @@
#include <net/net_namespace.h>
#include <net/sock.h>
#include <net/route.h>
+#include <net/netfilter/nf_queue.h>
+#include <net/ip.h>
#define IPQ_QMAX_DEFAULT 1024
#define IPQ_PROC_FS_NAME "ip_queue"
#define NET_IPQ_QMAX 2088
#define NET_IPQ_QMAX_NAME "ip_queue_maxlen"
-struct ipq_queue_entry {
- struct list_head list;
- struct nf_info *info;
- struct sk_buff *skb;
-};
-
-typedef int (*ipq_cmpfn)(struct ipq_queue_entry *, unsigned long);
+typedef int (*ipq_cmpfn)(struct nf_queue_entry *, unsigned long);
static unsigned char copy_mode __read_mostly = IPQ_COPY_NONE;
static unsigned int queue_maxlen __read_mostly = IPQ_QMAX_DEFAULT;
@@ -54,76 +50,13 @@ static struct sock *ipqnl __read_mostly;
static LIST_HEAD(queue_list);
static DEFINE_MUTEX(ipqnl_mutex);
-static void
-ipq_issue_verdict(struct ipq_queue_entry *entry, int verdict)
-{
- /* TCP input path (and probably other bits) assume to be called
- * from softirq context, not from syscall, like ipq_issue_verdict is
- * called. TCP input path deadlocks with locks taken from timer
- * softirq, e.g. We therefore emulate this by local_bh_disable() */
-
- local_bh_disable();
- nf_reinject(entry->skb, entry->info, verdict);
- local_bh_enable();
-
- kfree(entry);
-}
-
static inline void
-__ipq_enqueue_entry(struct ipq_queue_entry *entry)
+__ipq_enqueue_entry(struct nf_queue_entry *entry)
{
- list_add(&entry->list, &queue_list);
+ list_add_tail(&entry->list, &queue_list);
queue_total++;
}
-/*
- * Find and return a queued entry matched by cmpfn, or return the last
- * entry if cmpfn is NULL.
- */
-static inline struct ipq_queue_entry *
-__ipq_find_entry(ipq_cmpfn cmpfn, unsigned long data)
-{
- struct list_head *p;
-
- list_for_each_prev(p, &queue_list) {
- struct ipq_queue_entry *entry = (struct ipq_queue_entry *)p;
-
- if (!cmpfn || cmpfn(entry, data))
- return entry;
- }
- return NULL;
-}
-
-static inline void
-__ipq_dequeue_entry(struct ipq_queue_entry *entry)
-{
- list_del(&entry->list);
- queue_total--;
-}
-
-static inline struct ipq_queue_entry *
-__ipq_find_dequeue_entry(ipq_cmpfn cmpfn, unsigned long data)
-{
- struct ipq_queue_entry *entry;
-
- entry = __ipq_find_entry(cmpfn, data);
- if (entry == NULL)
- return NULL;
-
- __ipq_dequeue_entry(entry);
- return entry;
-}
-
-
-static inline void
-__ipq_flush(int verdict)
-{
- struct ipq_queue_entry *entry;
-
- while ((entry = __ipq_find_dequeue_entry(NULL, 0)))
- ipq_issue_verdict(entry, verdict);
-}
-
static inline int
__ipq_set_mode(unsigned char mode, unsigned int range)
{
@@ -150,36 +83,64 @@ __ipq_set_mode(unsigned char mode, unsigned int range)
return status;
}
+static void __ipq_flush(ipq_cmpfn cmpfn, unsigned long data);
+
static inline void
__ipq_reset(void)
{
peer_pid = 0;
net_disable_timestamp();
__ipq_set_mode(IPQ_COPY_NONE, 0);
- __ipq_flush(NF_DROP);
+ __ipq_flush(NULL, 0);
}
-static struct ipq_queue_entry *
-ipq_find_dequeue_entry(ipq_cmpfn cmpfn, unsigned long data)
+static struct nf_queue_entry *
+ipq_find_dequeue_entry(unsigned long id)
{
- struct ipq_queue_entry *entry;
+ struct nf_queue_entry *entry = NULL, *i;
write_lock_bh(&queue_lock);
- entry = __ipq_find_dequeue_entry(cmpfn, data);
+
+ list_for_each_entry(i, &queue_list, list) {
+ if ((unsigned long)i == id) {
+ entry = i;
+ break;
+ }
+ }
+
+ if (entry) {
+ list_del(&entry->list);
+ queue_total--;
+ }
+
write_unlock_bh(&queue_lock);
return entry;
}
static void
-ipq_flush(int verdict)
+__ipq_flush(ipq_cmpfn cmpfn, unsigned long data)
+{
+ struct nf_queue_entry *entry, *next;
+
+ list_for_each_entry_safe(entry, next, &queue_list, list) {
+ if (!cmpfn || cmpfn(entry, data)) {
+ list_del(&entry->list);
+ queue_total--;
+ nf_reinject(entry, NF_DROP);
+ }
+ }
+}
+
+static void
+ipq_flush(ipq_cmpfn cmpfn, unsigned long data)
{
write_lock_bh(&queue_lock);
- __ipq_flush(verdict);
+ __ipq_flush(cmpfn, data);
write_unlock_bh(&queue_lock);
}
static struct sk_buff *
-ipq_build_packet_message(struct ipq_queue_entry *entry, int *errp)
+ipq_build_packet_message(struct nf_queue_entry *entry, int *errp)
{
sk_buff_data_t old_tail;
size_t size = 0;
@@ -236,20 +197,20 @@ ipq_build_packet_message(struct ipq_queue_entry *entry, int *errp)
pmsg->timestamp_sec = tv.tv_sec;
pmsg->timestamp_usec = tv.tv_usec;
pmsg->mark = entry->skb->mark;
- pmsg->hook = entry->info->hook;
+ pmsg->hook = entry->hook;
pmsg->hw_protocol = entry->skb->protocol;
- if (entry->info->indev)
- strcpy(pmsg->indev_name, entry->info->indev->name);
+ if (entry->indev)
+ strcpy(pmsg->indev_name, entry->indev->name);
else
pmsg->indev_name[0] = '\0';
- if (entry->info->outdev)
- strcpy(pmsg->outdev_name, entry->info->outdev->name);
+ if (entry->outdev)
+ strcpy(pmsg->outdev_name, entry->outdev->name);
else
pmsg->outdev_name[0] = '\0';
- if (entry->info->indev && entry->skb->dev) {
+ if (entry->indev && entry->skb->dev) {
pmsg->hw_type = entry->skb->dev->type;
pmsg->hw_addrlen = dev_parse_header(entry->skb,
pmsg->hw_addr);
@@ -271,28 +232,17 @@ nlmsg_failure:
}
static int
-ipq_enqueue_packet(struct sk_buff *skb, struct nf_info *info,
- unsigned int queuenum, void *data)
+ipq_enqueue_packet(struct nf_queue_entry *entry, unsigned int queuenum)
{
int status = -EINVAL;
struct sk_buff *nskb;
- struct ipq_queue_entry *entry;
if (copy_mode == IPQ_COPY_NONE)
return -EAGAIN;
- entry = kmalloc(sizeof(*entry), GFP_ATOMIC);
- if (entry == NULL) {
- printk(KERN_ERR "ip_queue: OOM in ipq_enqueue_packet()\n");
- return -ENOMEM;
- }
-
- entry->info = info;
- entry->skb = skb;
-
nskb = ipq_build_packet_message(entry, &status);
if (nskb == NULL)
- goto err_out_free;
+ return status;
write_lock_bh(&queue_lock);
@@ -326,14 +276,11 @@ err_out_free_nskb:
err_out_unlock:
write_unlock_bh(&queue_lock);
-
-err_out_free:
- kfree(entry);
return status;
}
static int
-ipq_mangle_ipv4(ipq_verdict_msg_t *v, struct ipq_queue_entry *e)
+ipq_mangle_ipv4(ipq_verdict_msg_t *v, struct nf_queue_entry *e)
{
int diff;
int err;
@@ -368,21 +315,15 @@ ipq_mangle_ipv4(ipq_verdict_msg_t *v, struct ipq_queue_entry *e)
return 0;
}
-static inline int
-id_cmp(struct ipq_queue_entry *e, unsigned long id)
-{
- return (id == (unsigned long )e);
-}
-
static int
ipq_set_verdict(struct ipq_verdict_msg *vmsg, unsigned int len)
{
- struct ipq_queue_entry *entry;
+ struct nf_queue_entry *entry;
if (vmsg->value > NF_MAX_VERDICT)
return -EINVAL;
- entry = ipq_find_dequeue_entry(id_cmp, vmsg->id);
+ entry = ipq_find_dequeue_entry(vmsg->id);
if (entry == NULL)
return -ENOENT;
else {
@@ -392,7 +333,7 @@ ipq_set_verdict(struct ipq_verdict_msg *vmsg, unsigned int len)
if (ipq_mangle_ipv4(vmsg, entry) < 0)
verdict = NF_DROP;
- ipq_issue_verdict(entry, verdict);
+ nf_reinject(entry, verdict);
return 0;
}
}
@@ -437,13 +378,13 @@ ipq_receive_peer(struct ipq_peer_msg *pmsg,
}
static int
-dev_cmp(struct ipq_queue_entry *entry, unsigned long ifindex)
+dev_cmp(struct nf_queue_entry *entry, unsigned long ifindex)
{
- if (entry->info->indev)
- if (entry->info->indev->ifindex == ifindex)
+ if (entry->indev)
+ if (entry->indev->ifindex == ifindex)
return 1;
- if (entry->info->outdev)
- if (entry->info->outdev->ifindex == ifindex)
+ if (entry->outdev)
+ if (entry->outdev->ifindex == ifindex)
return 1;
#ifdef CONFIG_BRIDGE_NETFILTER
if (entry->skb->nf_bridge) {
@@ -461,10 +402,7 @@ dev_cmp(struct ipq_queue_entry *entry, unsigned long ifindex)
static void
ipq_dev_drop(int ifindex)
{
- struct ipq_queue_entry *entry;
-
- while ((entry = ipq_find_dequeue_entry(dev_cmp, ifindex)) != NULL)
- ipq_issue_verdict(entry, NF_DROP);
+ ipq_flush(dev_cmp, ifindex);
}
#define RCV_SKB_FAIL(err) do { netlink_ack(skb, nlh, (err)); return; } while (0)
@@ -588,26 +526,6 @@ static ctl_table ipq_table[] = {
{ .ctl_name = 0 }
};
-static ctl_table ipq_dir_table[] = {
- {
- .ctl_name = NET_IPV4,
- .procname = "ipv4",
- .mode = 0555,
- .child = ipq_table
- },
- { .ctl_name = 0 }
-};
-
-static ctl_table ipq_root_table[] = {
- {
- .ctl_name = CTL_NET,
- .procname = "net",
- .mode = 0555,
- .child = ipq_dir_table
- },
- { .ctl_name = 0 }
-};
-
static int ip_queue_show(struct seq_file *m, void *v)
{
read_lock_bh(&queue_lock);
@@ -645,7 +563,7 @@ static const struct file_operations ip_queue_proc_fops = {
.owner = THIS_MODULE,
};
-static struct nf_queue_handler nfqh = {
+static const struct nf_queue_handler nfqh = {
.name = "ip_queue",
.outfn = &ipq_enqueue_packet,
};
@@ -673,7 +591,7 @@ static int __init ip_queue_init(void)
}
register_netdevice_notifier(&ipq_dev_notifier);
- ipq_sysctl_header = register_sysctl_table(ipq_root_table);
+ ipq_sysctl_header = register_sysctl_paths(net_ipv4_ctl_path, ipq_table);
status = nf_register_queue_handler(PF_INET, &nfqh);
if (status < 0) {
@@ -687,7 +605,7 @@ cleanup_sysctl:
unregister_netdevice_notifier(&ipq_dev_notifier);
proc_net_remove(&init_net, IPQ_PROC_FS_NAME);
cleanup_ipqnl:
- sock_release(ipqnl->sk_socket);
+ netlink_kernel_release(ipqnl);
mutex_lock(&ipqnl_mutex);
mutex_unlock(&ipqnl_mutex);
@@ -700,13 +618,13 @@ static void __exit ip_queue_fini(void)
{
nf_unregister_queue_handlers(&nfqh);
synchronize_net();
- ipq_flush(NF_DROP);
+ ipq_flush(NULL, 0);
unregister_sysctl_table(ipq_sysctl_header);
unregister_netdevice_notifier(&ipq_dev_notifier);
proc_net_remove(&init_net, IPQ_PROC_FS_NAME);
- sock_release(ipqnl->sk_socket);
+ netlink_kernel_release(ipqnl);
mutex_lock(&ipqnl_mutex);
mutex_unlock(&ipqnl_mutex);
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index b9b189c26208..982b7f986291 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -26,6 +26,7 @@
#include <linux/netfilter/x_tables.h>
#include <linux/netfilter_ipv4/ip_tables.h>
+#include <net/netfilter/nf_log.h>
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>");
@@ -74,7 +75,8 @@ do { \
Hence the start of any table is given by get_table() below. */
/* Returns whether matches rule or not. */
-static inline int
+/* Performance critical - called for every packet */
+static inline bool
ip_packet_match(const struct iphdr *ip,
const char *indev,
const char *outdev,
@@ -84,7 +86,7 @@ ip_packet_match(const struct iphdr *ip,
size_t i;
unsigned long ret;
-#define FWINV(bool,invflg) ((bool) ^ !!(ipinfo->invflags & invflg))
+#define FWINV(bool, invflg) ((bool) ^ !!(ipinfo->invflags & (invflg)))
if (FWINV((ip->saddr&ipinfo->smsk.s_addr) != ipinfo->src.s_addr,
IPT_INV_SRCIP)
@@ -102,7 +104,7 @@ ip_packet_match(const struct iphdr *ip,
NIPQUAD(ipinfo->dmsk.s_addr),
NIPQUAD(ipinfo->dst.s_addr),
ipinfo->invflags & IPT_INV_DSTIP ? " (INV)" : "");
- return 0;
+ return false;
}
/* Look for ifname matches; this should unroll nicely. */
@@ -116,7 +118,7 @@ ip_packet_match(const struct iphdr *ip,
dprintf("VIA in mismatch (%s vs %s).%s\n",
indev, ipinfo->iniface,
ipinfo->invflags&IPT_INV_VIA_IN ?" (INV)":"");
- return 0;
+ return false;
}
for (i = 0, ret = 0; i < IFNAMSIZ/sizeof(unsigned long); i++) {
@@ -129,7 +131,7 @@ ip_packet_match(const struct iphdr *ip,
dprintf("VIA out mismatch (%s vs %s).%s\n",
outdev, ipinfo->outiface,
ipinfo->invflags&IPT_INV_VIA_OUT ?" (INV)":"");
- return 0;
+ return false;
}
/* Check specific protocol */
@@ -138,7 +140,7 @@ ip_packet_match(const struct iphdr *ip,
dprintf("Packet protocol %hi does not match %hi.%s\n",
ip->protocol, ipinfo->proto,
ipinfo->invflags&IPT_INV_PROTO ? " (INV)":"");
- return 0;
+ return false;
}
/* If we have a fragment rule but the packet is not a fragment
@@ -146,13 +148,13 @@ ip_packet_match(const struct iphdr *ip,
if (FWINV((ipinfo->flags&IPT_F_FRAG) && !isfrag, IPT_INV_FRAG)) {
dprintf("Fragment rule but not fragment.%s\n",
ipinfo->invflags & IPT_INV_FRAG ? " (INV)" : "");
- return 0;
+ return false;
}
- return 1;
+ return true;
}
-static inline bool
+static bool
ip_checkentry(const struct ipt_ip *ip)
{
if (ip->flags & ~IPT_F_MASK) {
@@ -182,8 +184,9 @@ ipt_error(struct sk_buff *skb,
return NF_DROP;
}
-static inline
-bool do_match(struct ipt_entry_match *m,
+/* Performance critical - called for every packet */
+static inline bool
+do_match(struct ipt_entry_match *m,
const struct sk_buff *skb,
const struct net_device *in,
const struct net_device *out,
@@ -198,6 +201,7 @@ bool do_match(struct ipt_entry_match *m,
return false;
}
+/* Performance critical */
static inline struct ipt_entry *
get_entry(void *base, unsigned int offset)
{
@@ -205,6 +209,7 @@ get_entry(void *base, unsigned int offset)
}
/* All zeroes == unconditional rule. */
+/* Mildly perf critical (only if packet tracing is on) */
static inline int
unconditional(const struct ipt_ip *ip)
{
@@ -215,16 +220,17 @@ unconditional(const struct ipt_ip *ip)
return 0;
return 1;
+#undef FWINV
}
#if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \
defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE)
-static const char *hooknames[] = {
- [NF_IP_PRE_ROUTING] = "PREROUTING",
- [NF_IP_LOCAL_IN] = "INPUT",
- [NF_IP_FORWARD] = "FORWARD",
- [NF_IP_LOCAL_OUT] = "OUTPUT",
- [NF_IP_POST_ROUTING] = "POSTROUTING",
+static const char *const hooknames[] = {
+ [NF_INET_PRE_ROUTING] = "PREROUTING",
+ [NF_INET_LOCAL_IN] = "INPUT",
+ [NF_INET_FORWARD] = "FORWARD",
+ [NF_INET_LOCAL_OUT] = "OUTPUT",
+ [NF_INET_POST_ROUTING] = "POSTROUTING",
};
enum nf_ip_trace_comments {
@@ -233,7 +239,7 @@ enum nf_ip_trace_comments {
NF_IP_TRACE_COMMENT_POLICY,
};
-static const char *comments[] = {
+static const char *const comments[] = {
[NF_IP_TRACE_COMMENT_RULE] = "rule",
[NF_IP_TRACE_COMMENT_RETURN] = "return",
[NF_IP_TRACE_COMMENT_POLICY] = "policy",
@@ -249,6 +255,7 @@ static struct nf_loginfo trace_loginfo = {
},
};
+/* Mildly perf critical (only if packet tracing is on) */
static inline int
get_chainname_rulenum(struct ipt_entry *s, struct ipt_entry *e,
char *hookname, char **chainname,
@@ -465,10 +472,9 @@ mark_source_chains(struct xt_table_info *newinfo,
/* No recursion; use packet counter to save back ptrs (reset
to 0 as we leave), and comefrom to save source hook bitmask */
- for (hook = 0; hook < NF_IP_NUMHOOKS; hook++) {
+ for (hook = 0; hook < NF_INET_NUMHOOKS; hook++) {
unsigned int pos = newinfo->hook_entry[hook];
- struct ipt_entry *e
- = (struct ipt_entry *)(entry0 + pos);
+ struct ipt_entry *e = (struct ipt_entry *)(entry0 + pos);
if (!(valid_hooks & (1 << hook)))
continue;
@@ -481,13 +487,12 @@ mark_source_chains(struct xt_table_info *newinfo,
= (void *)ipt_get_target(e);
int visited = e->comefrom & (1 << hook);
- if (e->comefrom & (1 << NF_IP_NUMHOOKS)) {
+ if (e->comefrom & (1 << NF_INET_NUMHOOKS)) {
printk("iptables: loop hook %u pos %u %08X.\n",
hook, pos, e->comefrom);
return 0;
}
- e->comefrom
- |= ((1 << hook) | (1 << NF_IP_NUMHOOKS));
+ e->comefrom |= ((1 << hook) | (1 << NF_INET_NUMHOOKS));
/* Unconditional return/END. */
if ((e->target_offset == sizeof(struct ipt_entry)
@@ -507,10 +512,10 @@ mark_source_chains(struct xt_table_info *newinfo,
/* Return: backtrack through the last
big jump. */
do {
- e->comefrom ^= (1<<NF_IP_NUMHOOKS);
+ e->comefrom ^= (1<<NF_INET_NUMHOOKS);
#ifdef DEBUG_IP_FIREWALL_USER
if (e->comefrom
- & (1 << NF_IP_NUMHOOKS)) {
+ & (1 << NF_INET_NUMHOOKS)) {
duprintf("Back unset "
"on hook %u "
"rule %u\n",
@@ -567,7 +572,7 @@ mark_source_chains(struct xt_table_info *newinfo,
return 1;
}
-static inline int
+static int
cleanup_match(struct ipt_entry_match *m, unsigned int *i)
{
if (i && (*i)-- == 0)
@@ -579,7 +584,7 @@ cleanup_match(struct ipt_entry_match *m, unsigned int *i)
return 0;
}
-static inline int
+static int
check_entry(struct ipt_entry *e, const char *name)
{
struct ipt_entry_target *t;
@@ -589,7 +594,8 @@ check_entry(struct ipt_entry *e, const char *name)
return -EINVAL;
}
- if (e->target_offset + sizeof(struct ipt_entry_target) > e->next_offset)
+ if (e->target_offset + sizeof(struct ipt_entry_target) >
+ e->next_offset)
return -EINVAL;
t = ipt_get_target(e);
@@ -599,9 +605,10 @@ check_entry(struct ipt_entry *e, const char *name)
return 0;
}
-static inline int check_match(struct ipt_entry_match *m, const char *name,
- const struct ipt_ip *ip, unsigned int hookmask,
- unsigned int *i)
+static int
+check_match(struct ipt_entry_match *m, const char *name,
+ const struct ipt_ip *ip,
+ unsigned int hookmask, unsigned int *i)
{
struct xt_match *match;
int ret;
@@ -622,18 +629,18 @@ static inline int check_match(struct ipt_entry_match *m, const char *name,
return ret;
}
-static inline int
+static int
find_check_match(struct ipt_entry_match *m,
- const char *name,
- const struct ipt_ip *ip,
- unsigned int hookmask,
- unsigned int *i)
+ const char *name,
+ const struct ipt_ip *ip,
+ unsigned int hookmask,
+ unsigned int *i)
{
struct xt_match *match;
int ret;
match = try_then_request_module(xt_find_match(AF_INET, m->u.user.name,
- m->u.user.revision),
+ m->u.user.revision),
"ipt_%s", m->u.user.name);
if (IS_ERR(match) || !match) {
duprintf("find_check_match: `%s' not found\n", m->u.user.name);
@@ -651,7 +658,7 @@ err:
return ret;
}
-static inline int check_target(struct ipt_entry *e, const char *name)
+static int check_target(struct ipt_entry *e, const char *name)
{
struct ipt_entry_target *t;
struct xt_target *target;
@@ -663,8 +670,8 @@ static inline int check_target(struct ipt_entry *e, const char *name)
name, e->comefrom, e->ip.proto,
e->ip.invflags & IPT_INV_PROTO);
if (!ret && t->u.kernel.target->checkentry
- && !t->u.kernel.target->checkentry(name, e, target,
- t->data, e->comefrom)) {
+ && !t->u.kernel.target->checkentry(name, e, target, t->data,
+ e->comefrom)) {
duprintf("ip_tables: check failed for `%s'.\n",
t->u.kernel.target->name);
ret = -EINVAL;
@@ -672,9 +679,9 @@ static inline int check_target(struct ipt_entry *e, const char *name)
return ret;
}
-static inline int
+static int
find_check_entry(struct ipt_entry *e, const char *name, unsigned int size,
- unsigned int *i)
+ unsigned int *i)
{
struct ipt_entry_target *t;
struct xt_target *target;
@@ -687,14 +694,14 @@ find_check_entry(struct ipt_entry *e, const char *name, unsigned int size,
j = 0;
ret = IPT_MATCH_ITERATE(e, find_check_match, name, &e->ip,
- e->comefrom, &j);
+ e->comefrom, &j);
if (ret != 0)
goto cleanup_matches;
t = ipt_get_target(e);
target = try_then_request_module(xt_find_target(AF_INET,
- t->u.user.name,
- t->u.user.revision),
+ t->u.user.name,
+ t->u.user.revision),
"ipt_%s", t->u.user.name);
if (IS_ERR(target) || !target) {
duprintf("find_check_entry: `%s' not found\n", t->u.user.name);
@@ -716,7 +723,7 @@ find_check_entry(struct ipt_entry *e, const char *name, unsigned int size,
return ret;
}
-static inline int
+static int
check_entry_size_and_hooks(struct ipt_entry *e,
struct xt_table_info *newinfo,
unsigned char *base,
@@ -741,7 +748,7 @@ check_entry_size_and_hooks(struct ipt_entry *e,
}
/* Check hooks & underflows */
- for (h = 0; h < NF_IP_NUMHOOKS; h++) {
+ for (h = 0; h < NF_INET_NUMHOOKS; h++) {
if ((unsigned char *)e - base == hook_entries[h])
newinfo->hook_entry[h] = hook_entries[h];
if ((unsigned char *)e - base == underflows[h])
@@ -759,7 +766,7 @@ check_entry_size_and_hooks(struct ipt_entry *e,
return 0;
}
-static inline int
+static int
cleanup_entry(struct ipt_entry *e, unsigned int *i)
{
struct ipt_entry_target *t;
@@ -795,7 +802,7 @@ translate_table(const char *name,
newinfo->number = number;
/* Init all hooks to impossible value. */
- for (i = 0; i < NF_IP_NUMHOOKS; i++) {
+ for (i = 0; i < NF_INET_NUMHOOKS; i++) {
newinfo->hook_entry[i] = 0xFFFFFFFF;
newinfo->underflow[i] = 0xFFFFFFFF;
}
@@ -819,7 +826,7 @@ translate_table(const char *name,
}
/* Check hooks all assigned */
- for (i = 0; i < NF_IP_NUMHOOKS; i++) {
+ for (i = 0; i < NF_INET_NUMHOOKS; i++) {
/* Only hooks which are valid */
if (!(valid_hooks & (1 << i)))
continue;
@@ -915,7 +922,7 @@ get_counters(const struct xt_table_info *t,
}
}
-static inline struct xt_counters * alloc_counters(struct xt_table *table)
+static struct xt_counters * alloc_counters(struct xt_table *table)
{
unsigned int countersize;
struct xt_counters *counters;
@@ -959,7 +966,6 @@ copy_entries_to_user(unsigned int total_size,
* allowed to migrate to another cpu)
*/
loc_cpu_entry = private->entries[raw_smp_processor_id()];
- /* ... then copy entire thing ... */
if (copy_to_user(userptr, loc_cpu_entry, total_size) != 0) {
ret = -EFAULT;
goto free_counters;
@@ -1014,63 +1020,12 @@ copy_entries_to_user(unsigned int total_size,
}
#ifdef CONFIG_COMPAT
-struct compat_delta {
- struct compat_delta *next;
- unsigned int offset;
- short delta;
-};
-
-static struct compat_delta *compat_offsets = NULL;
-
-static int compat_add_offset(unsigned int offset, short delta)
-{
- struct compat_delta *tmp;
-
- tmp = kmalloc(sizeof(struct compat_delta), GFP_KERNEL);
- if (!tmp)
- return -ENOMEM;
- tmp->offset = offset;
- tmp->delta = delta;
- if (compat_offsets) {
- tmp->next = compat_offsets->next;
- compat_offsets->next = tmp;
- } else {
- compat_offsets = tmp;
- tmp->next = NULL;
- }
- return 0;
-}
-
-static void compat_flush_offsets(void)
-{
- struct compat_delta *tmp, *next;
-
- if (compat_offsets) {
- for(tmp = compat_offsets; tmp; tmp = next) {
- next = tmp->next;
- kfree(tmp);
- }
- compat_offsets = NULL;
- }
-}
-
-static short compat_calc_jump(unsigned int offset)
-{
- struct compat_delta *tmp;
- short delta;
-
- for(tmp = compat_offsets, delta = 0; tmp; tmp = tmp->next)
- if (tmp->offset < offset)
- delta += tmp->delta;
- return delta;
-}
-
static void compat_standard_from_user(void *dst, void *src)
{
int v = *(compat_int_t *)src;
if (v > 0)
- v += compat_calc_jump(v);
+ v += xt_compat_calc_jump(AF_INET, v);
memcpy(dst, &v, sizeof(v));
}
@@ -1079,64 +1034,61 @@ static int compat_standard_to_user(void __user *dst, void *src)
compat_int_t cv = *(int *)src;
if (cv > 0)
- cv -= compat_calc_jump(cv);
+ cv -= xt_compat_calc_jump(AF_INET, cv);
return copy_to_user(dst, &cv, sizeof(cv)) ? -EFAULT : 0;
}
static inline int
-compat_calc_match(struct ipt_entry_match *m, int * size)
+compat_calc_match(struct ipt_entry_match *m, int *size)
{
*size += xt_compat_match_offset(m->u.kernel.match);
return 0;
}
-static int compat_calc_entry(struct ipt_entry *e, struct xt_table_info *info,
- void *base, struct xt_table_info *newinfo)
+static int compat_calc_entry(struct ipt_entry *e,
+ const struct xt_table_info *info,
+ void *base, struct xt_table_info *newinfo)
{
struct ipt_entry_target *t;
unsigned int entry_offset;
int off, i, ret;
- off = 0;
+ off = sizeof(struct ipt_entry) - sizeof(struct compat_ipt_entry);
entry_offset = (void *)e - base;
IPT_MATCH_ITERATE(e, compat_calc_match, &off);
t = ipt_get_target(e);
off += xt_compat_target_offset(t->u.kernel.target);
newinfo->size -= off;
- ret = compat_add_offset(entry_offset, off);
+ ret = xt_compat_add_offset(AF_INET, entry_offset, off);
if (ret)
return ret;
- for (i = 0; i< NF_IP_NUMHOOKS; i++) {
- if (info->hook_entry[i] && (e < (struct ipt_entry *)
- (base + info->hook_entry[i])))
+ for (i = 0; i < NF_INET_NUMHOOKS; i++) {
+ if (info->hook_entry[i] &&
+ (e < (struct ipt_entry *)(base + info->hook_entry[i])))
newinfo->hook_entry[i] -= off;
- if (info->underflow[i] && (e < (struct ipt_entry *)
- (base + info->underflow[i])))
+ if (info->underflow[i] &&
+ (e < (struct ipt_entry *)(base + info->underflow[i])))
newinfo->underflow[i] -= off;
}
return 0;
}
-static int compat_table_info(struct xt_table_info *info,
- struct xt_table_info *newinfo)
+static int compat_table_info(const struct xt_table_info *info,
+ struct xt_table_info *newinfo)
{
void *loc_cpu_entry;
- int i;
if (!newinfo || !info)
return -EINVAL;
- memset(newinfo, 0, sizeof(struct xt_table_info));
- newinfo->size = info->size;
- newinfo->number = info->number;
- for (i = 0; i < NF_IP_NUMHOOKS; i++) {
- newinfo->hook_entry[i] = info->hook_entry[i];
- newinfo->underflow[i] = info->underflow[i];
- }
+ /* we dont care about newinfo->entries[] */
+ memcpy(newinfo, info, offsetof(struct xt_table_info, entries));
+ newinfo->initial_entries = 0;
loc_cpu_entry = info->entries[raw_smp_processor_id()];
return IPT_ENTRY_ITERATE(loc_cpu_entry, info->size,
- compat_calc_entry, info, loc_cpu_entry, newinfo);
+ compat_calc_entry, info, loc_cpu_entry,
+ newinfo);
}
#endif
@@ -1147,8 +1099,8 @@ static int get_info(void __user *user, int *len, int compat)
int ret;
if (*len != sizeof(struct ipt_getinfo)) {
- duprintf("length %u != %u\n", *len,
- (unsigned int)sizeof(struct ipt_getinfo));
+ duprintf("length %u != %zu\n", *len,
+ sizeof(struct ipt_getinfo));
return -EINVAL;
}
@@ -1161,7 +1113,7 @@ static int get_info(void __user *user, int *len, int compat)
xt_compat_lock(AF_INET);
#endif
t = try_then_request_module(xt_find_table_lock(AF_INET, name),
- "iptable_%s", name);
+ "iptable_%s", name);
if (t && !IS_ERR(t)) {
struct ipt_getinfo info;
struct xt_table_info *private = t->private;
@@ -1170,15 +1122,15 @@ static int get_info(void __user *user, int *len, int compat)
if (compat) {
struct xt_table_info tmp;
ret = compat_table_info(private, &tmp);
- compat_flush_offsets();
- private = &tmp;
+ xt_compat_flush_offsets(AF_INET);
+ private = &tmp;
}
#endif
info.valid_hooks = t->valid_hooks;
memcpy(info.hook_entry, private->hook_entry,
- sizeof(info.hook_entry));
+ sizeof(info.hook_entry));
memcpy(info.underflow, private->underflow,
- sizeof(info.underflow));
+ sizeof(info.underflow));
info.num_entries = private->number;
info.size = private->size;
strcpy(info.name, name);
@@ -1207,31 +1159,27 @@ get_entries(struct ipt_get_entries __user *uptr, int *len)
struct xt_table *t;
if (*len < sizeof(get)) {
- duprintf("get_entries: %u < %d\n", *len,
- (unsigned int)sizeof(get));
+ duprintf("get_entries: %u < %zu\n", *len, sizeof(get));
return -EINVAL;
}
if (copy_from_user(&get, uptr, sizeof(get)) != 0)
return -EFAULT;
if (*len != sizeof(struct ipt_get_entries) + get.size) {
- duprintf("get_entries: %u != %u\n", *len,
- (unsigned int)(sizeof(struct ipt_get_entries) +
- get.size));
+ duprintf("get_entries: %u != %zu\n",
+ *len, sizeof(get) + get.size);
return -EINVAL;
}
t = xt_find_table_lock(AF_INET, get.name);
if (t && !IS_ERR(t)) {
struct xt_table_info *private = t->private;
- duprintf("t->private->number = %u\n",
- private->number);
+ duprintf("t->private->number = %u\n", private->number);
if (get.size == private->size)
ret = copy_entries_to_user(private->size,
t, uptr->entrytable);
else {
duprintf("get_entries: I've got %u not %u!\n",
- private->size,
- get.size);
+ private->size, get.size);
ret = -EINVAL;
}
module_put(t->me);
@@ -1244,8 +1192,8 @@ get_entries(struct ipt_get_entries __user *uptr, int *len)
static int
__do_replace(const char *name, unsigned int valid_hooks,
- struct xt_table_info *newinfo, unsigned int num_counters,
- void __user *counters_ptr)
+ struct xt_table_info *newinfo, unsigned int num_counters,
+ void __user *counters_ptr)
{
int ret;
struct xt_table *t;
@@ -1293,7 +1241,8 @@ __do_replace(const char *name, unsigned int valid_hooks,
get_counters(oldinfo, counters);
/* Decrease module usage counts and free resource */
loc_cpu_old_entry = oldinfo->entries[raw_smp_processor_id()];
- IPT_ENTRY_ITERATE(loc_cpu_old_entry, oldinfo->size, cleanup_entry,NULL);
+ IPT_ENTRY_ITERATE(loc_cpu_old_entry, oldinfo->size, cleanup_entry,
+ NULL);
xt_free_table_info(oldinfo);
if (copy_to_user(counters_ptr, counters,
sizeof(struct xt_counters) * num_counters) != 0)
@@ -1322,14 +1271,7 @@ do_replace(void __user *user, unsigned int len)
if (copy_from_user(&tmp, user, sizeof(tmp)) != 0)
return -EFAULT;
- /* Hack: Causes ipchains to give correct error msg --RR */
- if (len != sizeof(tmp) + tmp.size)
- return -ENOPROTOOPT;
-
/* overflow check */
- if (tmp.size >= (INT_MAX - sizeof(struct xt_table_info)) / NR_CPUS -
- SMP_CACHE_BYTES)
- return -ENOMEM;
if (tmp.num_counters >= INT_MAX / sizeof(struct xt_counters))
return -ENOMEM;
@@ -1337,7 +1279,7 @@ do_replace(void __user *user, unsigned int len)
if (!newinfo)
return -ENOMEM;
- /* choose the copy that is our node/cpu */
+ /* choose the copy that is on our node/cpu */
loc_cpu_entry = newinfo->entries[raw_smp_processor_id()];
if (copy_from_user(loc_cpu_entry, user + sizeof(tmp),
tmp.size) != 0) {
@@ -1353,15 +1295,14 @@ do_replace(void __user *user, unsigned int len)
duprintf("ip_tables: Translated table\n");
- ret = __do_replace(tmp.name, tmp.valid_hooks,
- newinfo, tmp.num_counters,
- tmp.counters);
+ ret = __do_replace(tmp.name, tmp.valid_hooks, newinfo,
+ tmp.num_counters, tmp.counters);
if (ret)
goto free_newinfo_untrans;
return 0;
free_newinfo_untrans:
- IPT_ENTRY_ITERATE(loc_cpu_entry, newinfo->size, cleanup_entry,NULL);
+ IPT_ENTRY_ITERATE(loc_cpu_entry, newinfo->size, cleanup_entry, NULL);
free_newinfo:
xt_free_table_info(newinfo);
return ret;
@@ -1369,7 +1310,7 @@ do_replace(void __user *user, unsigned int len)
/* We're lazy, and add to the first CPU; overflow works its fey magic
* and everything is OK. */
-static inline int
+static int
add_counter_to_entry(struct ipt_entry *e,
const struct xt_counters addme[],
unsigned int *i)
@@ -1479,19 +1420,13 @@ struct compat_ipt_replace {
u32 valid_hooks;
u32 num_entries;
u32 size;
- u32 hook_entry[NF_IP_NUMHOOKS];
- u32 underflow[NF_IP_NUMHOOKS];
+ u32 hook_entry[NF_INET_NUMHOOKS];
+ u32 underflow[NF_INET_NUMHOOKS];
u32 num_counters;
compat_uptr_t counters; /* struct ipt_counters * */
struct compat_ipt_entry entries[0];
};
-static inline int compat_copy_match_to_user(struct ipt_entry_match *m,
- void __user **dstptr, compat_uint_t *size)
-{
- return xt_compat_match_to_user(m, dstptr, size);
-}
-
static int
compat_copy_entry_to_user(struct ipt_entry *e, void __user **dstptr,
compat_uint_t *size, struct xt_counters *counters,
@@ -1513,7 +1448,9 @@ compat_copy_entry_to_user(struct ipt_entry *e, void __user **dstptr,
goto out;
*dstptr += sizeof(struct compat_ipt_entry);
- ret = IPT_MATCH_ITERATE(e, compat_copy_match_to_user, dstptr, size);
+ *size -= sizeof(struct ipt_entry) - sizeof(struct compat_ipt_entry);
+
+ ret = IPT_MATCH_ITERATE(e, xt_compat_match_to_user, dstptr, size);
target_offset = e->target_offset - (origsize - *size);
if (ret)
goto out;
@@ -1534,21 +1471,21 @@ out:
return ret;
}
-static inline int
+static int
compat_find_calc_match(struct ipt_entry_match *m,
- const char *name,
- const struct ipt_ip *ip,
- unsigned int hookmask,
- int *size, int *i)
+ const char *name,
+ const struct ipt_ip *ip,
+ unsigned int hookmask,
+ int *size, int *i)
{
struct xt_match *match;
match = try_then_request_module(xt_find_match(AF_INET, m->u.user.name,
- m->u.user.revision),
+ m->u.user.revision),
"ipt_%s", m->u.user.name);
if (IS_ERR(match) || !match) {
duprintf("compat_check_calc_match: `%s' not found\n",
- m->u.user.name);
+ m->u.user.name);
return match ? PTR_ERR(match) : -ENOENT;
}
m->u.kernel.match = match;
@@ -1558,7 +1495,7 @@ compat_find_calc_match(struct ipt_entry_match *m,
return 0;
}
-static inline int
+static int
compat_release_match(struct ipt_entry_match *m, unsigned int *i)
{
if (i && (*i)-- == 0)
@@ -1568,8 +1505,8 @@ compat_release_match(struct ipt_entry_match *m, unsigned int *i)
return 0;
}
-static inline int
-compat_release_entry(struct ipt_entry *e, unsigned int *i)
+static int
+compat_release_entry(struct compat_ipt_entry *e, unsigned int *i)
{
struct ipt_entry_target *t;
@@ -1577,22 +1514,22 @@ compat_release_entry(struct ipt_entry *e, unsigned int *i)
return 1;
/* Cleanup all matches */
- IPT_MATCH_ITERATE(e, compat_release_match, NULL);
- t = ipt_get_target(e);
+ COMPAT_IPT_MATCH_ITERATE(e, compat_release_match, NULL);
+ t = compat_ipt_get_target(e);
module_put(t->u.kernel.target->me);
return 0;
}
-static inline int
-check_compat_entry_size_and_hooks(struct ipt_entry *e,
- struct xt_table_info *newinfo,
- unsigned int *size,
- unsigned char *base,
- unsigned char *limit,
- unsigned int *hook_entries,
- unsigned int *underflows,
- unsigned int *i,
- const char *name)
+static int
+check_compat_entry_size_and_hooks(struct compat_ipt_entry *e,
+ struct xt_table_info *newinfo,
+ unsigned int *size,
+ unsigned char *base,
+ unsigned char *limit,
+ unsigned int *hook_entries,
+ unsigned int *underflows,
+ unsigned int *i,
+ const char *name)
{
struct ipt_entry_target *t;
struct xt_target *target;
@@ -1607,32 +1544,33 @@ check_compat_entry_size_and_hooks(struct ipt_entry *e,
}
if (e->next_offset < sizeof(struct compat_ipt_entry) +
- sizeof(struct compat_xt_entry_target)) {
+ sizeof(struct compat_xt_entry_target)) {
duprintf("checking: element %p size %u\n",
e, e->next_offset);
return -EINVAL;
}
- ret = check_entry(e, name);
+ /* For purposes of check_entry casting the compat entry is fine */
+ ret = check_entry((struct ipt_entry *)e, name);
if (ret)
return ret;
- off = 0;
+ off = sizeof(struct ipt_entry) - sizeof(struct compat_ipt_entry);
entry_offset = (void *)e - (void *)base;
j = 0;
- ret = IPT_MATCH_ITERATE(e, compat_find_calc_match, name, &e->ip,
- e->comefrom, &off, &j);
+ ret = COMPAT_IPT_MATCH_ITERATE(e, compat_find_calc_match, name,
+ &e->ip, e->comefrom, &off, &j);
if (ret != 0)
goto release_matches;
- t = ipt_get_target(e);
+ t = compat_ipt_get_target(e);
target = try_then_request_module(xt_find_target(AF_INET,
- t->u.user.name,
- t->u.user.revision),
+ t->u.user.name,
+ t->u.user.revision),
"ipt_%s", t->u.user.name);
if (IS_ERR(target) || !target) {
duprintf("check_compat_entry_size_and_hooks: `%s' not found\n",
- t->u.user.name);
+ t->u.user.name);
ret = target ? PTR_ERR(target) : -ENOENT;
goto release_matches;
}
@@ -1640,12 +1578,12 @@ check_compat_entry_size_and_hooks(struct ipt_entry *e,
off += xt_compat_target_offset(target);
*size += off;
- ret = compat_add_offset(entry_offset, off);
+ ret = xt_compat_add_offset(AF_INET, entry_offset, off);
if (ret)
goto out;
/* Check hooks & underflows */
- for (h = 0; h < NF_IP_NUMHOOKS; h++) {
+ for (h = 0; h < NF_INET_NUMHOOKS; h++) {
if ((unsigned char *)e - base == hook_entries[h])
newinfo->hook_entry[h] = hook_entries[h];
if ((unsigned char *)e - base == underflows[h])
@@ -1653,7 +1591,7 @@ check_compat_entry_size_and_hooks(struct ipt_entry *e,
}
/* Clear counters and comefrom */
- e->counters = ((struct ipt_counters) { 0, 0 });
+ memset(&e->counters, 0, sizeof(e->counters));
e->comefrom = 0;
(*i)++;
@@ -1666,17 +1604,10 @@ release_matches:
return ret;
}
-static inline int compat_copy_match_from_user(struct ipt_entry_match *m,
- void **dstptr, compat_uint_t *size, const char *name,
- const struct ipt_ip *ip, unsigned int hookmask)
-{
- xt_compat_match_from_user(m, dstptr, size);
- return 0;
-}
-
-static int compat_copy_entry_from_user(struct ipt_entry *e, void **dstptr,
- unsigned int *size, const char *name,
- struct xt_table_info *newinfo, unsigned char *base)
+static int
+compat_copy_entry_from_user(struct compat_ipt_entry *e, void **dstptr,
+ unsigned int *size, const char *name,
+ struct xt_table_info *newinfo, unsigned char *base)
{
struct ipt_entry_target *t;
struct xt_target *target;
@@ -1688,19 +1619,22 @@ static int compat_copy_entry_from_user(struct ipt_entry *e, void **dstptr,
origsize = *size;
de = (struct ipt_entry *)*dstptr;
memcpy(de, e, sizeof(struct ipt_entry));
+ memcpy(&de->counters, &e->counters, sizeof(e->counters));
- *dstptr += sizeof(struct compat_ipt_entry);
- ret = IPT_MATCH_ITERATE(e, compat_copy_match_from_user, dstptr, size,
- name, &de->ip, de->comefrom);
+ *dstptr += sizeof(struct ipt_entry);
+ *size += sizeof(struct ipt_entry) - sizeof(struct compat_ipt_entry);
+
+ ret = COMPAT_IPT_MATCH_ITERATE(e, xt_compat_match_from_user,
+ dstptr, size);
if (ret)
return ret;
de->target_offset = e->target_offset - (origsize - *size);
- t = ipt_get_target(e);
+ t = compat_ipt_get_target(e);
target = t->u.kernel.target;
xt_compat_target_from_user(t, dstptr, size);
de->next_offset = e->next_offset - (origsize - *size);
- for (h = 0; h < NF_IP_NUMHOOKS; h++) {
+ for (h = 0; h < NF_INET_NUMHOOKS; h++) {
if ((unsigned char *)de - base < newinfo->hook_entry[h])
newinfo->hook_entry[h] -= origsize - *size;
if ((unsigned char *)de - base < newinfo->underflow[h])
@@ -1709,13 +1643,15 @@ static int compat_copy_entry_from_user(struct ipt_entry *e, void **dstptr,
return ret;
}
-static inline int compat_check_entry(struct ipt_entry *e, const char *name,
- unsigned int *i)
+static int
+compat_check_entry(struct ipt_entry *e, const char *name,
+ unsigned int *i)
{
int j, ret;
j = 0;
- ret = IPT_MATCH_ITERATE(e, check_match, name, &e->ip, e->comefrom, &j);
+ ret = IPT_MATCH_ITERATE(e, check_match, name, &e->ip,
+ e->comefrom, &j);
if (ret)
goto cleanup_matches;
@@ -1733,13 +1669,13 @@ static inline int compat_check_entry(struct ipt_entry *e, const char *name,
static int
translate_compat_table(const char *name,
- unsigned int valid_hooks,
- struct xt_table_info **pinfo,
- void **pentry0,
- unsigned int total_size,
- unsigned int number,
- unsigned int *hook_entries,
- unsigned int *underflows)
+ unsigned int valid_hooks,
+ struct xt_table_info **pinfo,
+ void **pentry0,
+ unsigned int total_size,
+ unsigned int number,
+ unsigned int *hook_entries,
+ unsigned int *underflows)
{
unsigned int i, j;
struct xt_table_info *newinfo, *info;
@@ -1753,7 +1689,7 @@ translate_compat_table(const char *name,
info->number = number;
/* Init all hooks to impossible value. */
- for (i = 0; i < NF_IP_NUMHOOKS; i++) {
+ for (i = 0; i < NF_INET_NUMHOOKS; i++) {
info->hook_entry[i] = 0xFFFFFFFF;
info->underflow[i] = 0xFFFFFFFF;
}
@@ -1762,11 +1698,11 @@ translate_compat_table(const char *name,
j = 0;
xt_compat_lock(AF_INET);
/* Walk through entries, checking offsets. */
- ret = IPT_ENTRY_ITERATE(entry0, total_size,
- check_compat_entry_size_and_hooks,
- info, &size, entry0,
- entry0 + total_size,
- hook_entries, underflows, &j, name);
+ ret = COMPAT_IPT_ENTRY_ITERATE(entry0, total_size,
+ check_compat_entry_size_and_hooks,
+ info, &size, entry0,
+ entry0 + total_size,
+ hook_entries, underflows, &j, name);
if (ret != 0)
goto out_unlock;
@@ -1778,7 +1714,7 @@ translate_compat_table(const char *name,
}
/* Check hooks all assigned */
- for (i = 0; i < NF_IP_NUMHOOKS; i++) {
+ for (i = 0; i < NF_INET_NUMHOOKS; i++) {
/* Only hooks which are valid */
if (!(valid_hooks & (1 << i)))
continue;
@@ -1800,17 +1736,17 @@ translate_compat_table(const char *name,
goto out_unlock;
newinfo->number = number;
- for (i = 0; i < NF_IP_NUMHOOKS; i++) {
+ for (i = 0; i < NF_INET_NUMHOOKS; i++) {
newinfo->hook_entry[i] = info->hook_entry[i];
newinfo->underflow[i] = info->underflow[i];
}
entry1 = newinfo->entries[raw_smp_processor_id()];
pos = entry1;
- size = total_size;
- ret = IPT_ENTRY_ITERATE(entry0, total_size,
- compat_copy_entry_from_user, &pos, &size,
- name, newinfo, entry1);
- compat_flush_offsets();
+ size = total_size;
+ ret = COMPAT_IPT_ENTRY_ITERATE(entry0, total_size,
+ compat_copy_entry_from_user,
+ &pos, &size, name, newinfo, entry1);
+ xt_compat_flush_offsets(AF_INET);
xt_compat_unlock(AF_INET);
if (ret)
goto free_newinfo;
@@ -1821,11 +1757,11 @@ translate_compat_table(const char *name,
i = 0;
ret = IPT_ENTRY_ITERATE(entry1, newinfo->size, compat_check_entry,
- name, &i);
+ name, &i);
if (ret) {
j -= i;
- IPT_ENTRY_ITERATE_CONTINUE(entry1, newinfo->size, i,
- compat_release_entry, &j);
+ COMPAT_IPT_ENTRY_ITERATE_CONTINUE(entry0, newinfo->size, i,
+ compat_release_entry, &j);
IPT_ENTRY_ITERATE(entry1, newinfo->size, cleanup_entry, &i);
xt_free_table_info(newinfo);
return ret;
@@ -1844,10 +1780,10 @@ translate_compat_table(const char *name,
free_newinfo:
xt_free_table_info(newinfo);
out:
- IPT_ENTRY_ITERATE(entry0, total_size, compat_release_entry, &j);
+ COMPAT_IPT_ENTRY_ITERATE(entry0, total_size, compat_release_entry, &j);
return ret;
out_unlock:
- compat_flush_offsets();
+ xt_compat_flush_offsets(AF_INET);
xt_compat_unlock(AF_INET);
goto out;
}
@@ -1863,13 +1799,8 @@ compat_do_replace(void __user *user, unsigned int len)
if (copy_from_user(&tmp, user, sizeof(tmp)) != 0)
return -EFAULT;
- /* Hack: Causes ipchains to give correct error msg --RR */
- if (len != sizeof(tmp) + tmp.size)
- return -ENOPROTOOPT;
-
/* overflow check */
- if (tmp.size >= (INT_MAX - sizeof(struct xt_table_info)) / NR_CPUS -
- SMP_CACHE_BYTES)
+ if (tmp.size >= INT_MAX / num_possible_cpus())
return -ENOMEM;
if (tmp.num_counters >= INT_MAX / sizeof(struct xt_counters))
return -ENOMEM;
@@ -1878,7 +1809,7 @@ compat_do_replace(void __user *user, unsigned int len)
if (!newinfo)
return -ENOMEM;
- /* choose the copy that is our node/cpu */
+ /* choose the copy that is on our node/cpu */
loc_cpu_entry = newinfo->entries[raw_smp_processor_id()];
if (copy_from_user(loc_cpu_entry, user + sizeof(tmp),
tmp.size) != 0) {
@@ -1887,22 +1818,22 @@ compat_do_replace(void __user *user, unsigned int len)
}
ret = translate_compat_table(tmp.name, tmp.valid_hooks,
- &newinfo, &loc_cpu_entry, tmp.size,
- tmp.num_entries, tmp.hook_entry, tmp.underflow);
+ &newinfo, &loc_cpu_entry, tmp.size,
+ tmp.num_entries, tmp.hook_entry,
+ tmp.underflow);
if (ret != 0)
goto free_newinfo;
duprintf("compat_do_replace: Translated table\n");
- ret = __do_replace(tmp.name, tmp.valid_hooks,
- newinfo, tmp.num_counters,
- compat_ptr(tmp.counters));
+ ret = __do_replace(tmp.name, tmp.valid_hooks, newinfo,
+ tmp.num_counters, compat_ptr(tmp.counters));
if (ret)
goto free_newinfo_untrans;
return 0;
free_newinfo_untrans:
- IPT_ENTRY_ITERATE(loc_cpu_entry, newinfo->size, cleanup_entry,NULL);
+ IPT_ENTRY_ITERATE(loc_cpu_entry, newinfo->size, cleanup_entry, NULL);
free_newinfo:
xt_free_table_info(newinfo);
return ret;
@@ -1910,7 +1841,7 @@ compat_do_replace(void __user *user, unsigned int len)
static int
compat_do_ipt_set_ctl(struct sock *sk, int cmd, void __user *user,
- unsigned int len)
+ unsigned int len)
{
int ret;
@@ -1934,15 +1865,15 @@ compat_do_ipt_set_ctl(struct sock *sk, int cmd, void __user *user,
return ret;
}
-struct compat_ipt_get_entries
-{
+struct compat_ipt_get_entries {
char name[IPT_TABLE_MAXNAMELEN];
compat_uint_t size;
struct compat_ipt_entry entrytable[0];
};
-static int compat_copy_entries_to_user(unsigned int total_size,
- struct xt_table *table, void __user *userptr)
+static int
+compat_copy_entries_to_user(unsigned int total_size, struct xt_table *table,
+ void __user *userptr)
{
struct xt_counters *counters;
struct xt_table_info *private = table->private;
@@ -1978,10 +1909,8 @@ compat_get_entries(struct compat_ipt_get_entries __user *uptr, int *len)
struct compat_ipt_get_entries get;
struct xt_table *t;
-
if (*len < sizeof(get)) {
- duprintf("compat_get_entries: %u < %u\n",
- *len, (unsigned int)sizeof(get));
+ duprintf("compat_get_entries: %u < %zu\n", *len, sizeof(get));
return -EINVAL;
}
@@ -1989,9 +1918,8 @@ compat_get_entries(struct compat_ipt_get_entries __user *uptr, int *len)
return -EFAULT;
if (*len != sizeof(struct compat_ipt_get_entries) + get.size) {
- duprintf("compat_get_entries: %u != %u\n", *len,
- (unsigned int)(sizeof(struct compat_ipt_get_entries) +
- get.size));
+ duprintf("compat_get_entries: %u != %zu\n",
+ *len, sizeof(get) + get.size);
return -EINVAL;
}
@@ -2000,19 +1928,17 @@ compat_get_entries(struct compat_ipt_get_entries __user *uptr, int *len)
if (t && !IS_ERR(t)) {
struct xt_table_info *private = t->private;
struct xt_table_info info;
- duprintf("t->private->number = %u\n",
- private->number);
+ duprintf("t->private->number = %u\n", private->number);
ret = compat_table_info(private, &info);
if (!ret && get.size == info.size) {
ret = compat_copy_entries_to_user(private->size,
- t, uptr->entrytable);
+ t, uptr->entrytable);
} else if (!ret) {
duprintf("compat_get_entries: I've got %u not %u!\n",
- private->size,
- get.size);
+ private->size, get.size);
ret = -EINVAL;
}
- compat_flush_offsets();
+ xt_compat_flush_offsets(AF_INET);
module_put(t->me);
xt_table_unlock(t);
} else
@@ -2047,7 +1973,7 @@ compat_do_ipt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
#endif
static int
-do_ipt_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)
+do_ipt_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)
{
int ret;
@@ -2126,7 +2052,7 @@ int ipt_register_table(struct xt_table *table, const struct ipt_replace *repl)
{
int ret;
struct xt_table_info *newinfo;
- static struct xt_table_info bootstrap
+ struct xt_table_info bootstrap
= { 0, 0, 0, { 0 }, { 0 }, { } };
void *loc_cpu_entry;
@@ -2134,9 +2060,7 @@ int ipt_register_table(struct xt_table *table, const struct ipt_replace *repl)
if (!newinfo)
return -ENOMEM;
- /* choose the copy on our node/cpu
- * but dont care of preemption
- */
+ /* choose the copy on our node/cpu, but dont care about preemption */
loc_cpu_entry = newinfo->entries[raw_smp_processor_id()];
memcpy(loc_cpu_entry, repl->entries, repl->size);
@@ -2178,7 +2102,8 @@ icmp_type_code_match(u_int8_t test_type, u_int8_t min_code, u_int8_t max_code,
u_int8_t type, u_int8_t code,
bool invert)
{
- return ((test_type == 0xFF) || (type == test_type && code >= min_code && code <= max_code))
+ return ((test_type == 0xFF) ||
+ (type == test_type && code >= min_code && code <= max_code))
^ invert;
}
@@ -2219,7 +2144,7 @@ icmp_match(const struct sk_buff *skb,
/* Called when user tries to insert an entry of this type. */
static bool
icmp_checkentry(const char *tablename,
- const void *info,
+ const void *entry,
const struct xt_match *match,
void *matchinfo,
unsigned int hook_mask)
@@ -2270,9 +2195,9 @@ static struct xt_match icmp_matchstruct __read_mostly = {
.name = "icmp",
.match = icmp_match,
.matchsize = sizeof(struct ipt_icmp),
+ .checkentry = icmp_checkentry,
.proto = IPPROTO_ICMP,
.family = AF_INET,
- .checkentry = icmp_checkentry,
};
static int __init ip_tables_init(void)
diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c
index 2f544dac72df..1b31f7d14d46 100644
--- a/net/ipv4/netfilter/ipt_CLUSTERIP.c
+++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c
@@ -32,7 +32,7 @@
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>");
-MODULE_DESCRIPTION("iptables target for CLUSTERIP");
+MODULE_DESCRIPTION("Xtables: CLUSTERIP target");
struct clusterip_config {
struct list_head list; /* list of all configs */
@@ -109,11 +109,9 @@ clusterip_config_entry_put(struct clusterip_config *c)
static struct clusterip_config *
__clusterip_config_find(__be32 clusterip)
{
- struct list_head *pos;
+ struct clusterip_config *c;
- list_for_each(pos, &clusterip_configs) {
- struct clusterip_config *c = list_entry(pos,
- struct clusterip_config, list);
+ list_for_each_entry(c, &clusterip_configs, list) {
if (c->clusterip == clusterip)
return c;
}
@@ -275,7 +273,7 @@ clusterip_hashfn(const struct sk_buff *skb,
}
/* node numbers are 1..n, not 0..n */
- return (hashval % config->num_total_nodes) + 1;
+ return (((u64)hashval * config->num_total_nodes) >> 32) + 1;
}
static inline int
@@ -289,12 +287,9 @@ clusterip_responsible(const struct clusterip_config *config, u_int32_t hash)
***********************************************************************/
static unsigned int
-target(struct sk_buff *skb,
- const struct net_device *in,
- const struct net_device *out,
- unsigned int hooknum,
- const struct xt_target *target,
- const void *targinfo)
+clusterip_tg(struct sk_buff *skb, const struct net_device *in,
+ const struct net_device *out, unsigned int hooknum,
+ const struct xt_target *target, const void *targinfo)
{
const struct ipt_clusterip_tgt_info *cipinfo = targinfo;
struct nf_conn *ct;
@@ -361,11 +356,9 @@ target(struct sk_buff *skb,
}
static bool
-checkentry(const char *tablename,
- const void *e_void,
- const struct xt_target *target,
- void *targinfo,
- unsigned int hook_mask)
+clusterip_tg_check(const char *tablename, const void *e_void,
+ const struct xt_target *target, void *targinfo,
+ unsigned int hook_mask)
{
struct ipt_clusterip_tgt_info *cipinfo = targinfo;
const struct ipt_entry *e = e_void;
@@ -421,7 +414,7 @@ checkentry(const char *tablename,
if (nf_ct_l3proto_try_module_get(target->family) < 0) {
printk(KERN_WARNING "can't load conntrack support for "
- "proto=%d\n", target->family);
+ "proto=%u\n", target->family);
return false;
}
@@ -429,7 +422,7 @@ checkentry(const char *tablename,
}
/* drop reference count of cluster config when rule is deleted */
-static void destroy(const struct xt_target *target, void *targinfo)
+static void clusterip_tg_destroy(const struct xt_target *target, void *targinfo)
{
struct ipt_clusterip_tgt_info *cipinfo = targinfo;
@@ -456,12 +449,12 @@ struct compat_ipt_clusterip_tgt_info
};
#endif /* CONFIG_COMPAT */
-static struct xt_target clusterip_tgt __read_mostly = {
+static struct xt_target clusterip_tg_reg __read_mostly = {
.name = "CLUSTERIP",
.family = AF_INET,
- .target = target,
- .checkentry = checkentry,
- .destroy = destroy,
+ .target = clusterip_tg,
+ .checkentry = clusterip_tg_check,
+ .destroy = clusterip_tg_destroy,
.targetsize = sizeof(struct ipt_clusterip_tgt_info),
#ifdef CONFIG_COMPAT
.compatsize = sizeof(struct compat_ipt_clusterip_tgt_info),
@@ -558,7 +551,7 @@ arp_mangle(unsigned int hook,
return NF_ACCEPT;
}
-static struct nf_hook_ops cip_arp_ops = {
+static struct nf_hook_ops cip_arp_ops __read_mostly = {
.hook = arp_mangle,
.pf = NF_ARP,
.hooknum = NF_ARP_OUT,
@@ -714,11 +707,11 @@ static const struct file_operations clusterip_proc_fops = {
#endif /* CONFIG_PROC_FS */
-static int __init ipt_clusterip_init(void)
+static int __init clusterip_tg_init(void)
{
int ret;
- ret = xt_register_target(&clusterip_tgt);
+ ret = xt_register_target(&clusterip_tg_reg);
if (ret < 0)
return ret;
@@ -744,11 +737,11 @@ cleanup_hook:
nf_unregister_hook(&cip_arp_ops);
#endif /* CONFIG_PROC_FS */
cleanup_target:
- xt_unregister_target(&clusterip_tgt);
+ xt_unregister_target(&clusterip_tg_reg);
return ret;
}
-static void __exit ipt_clusterip_fini(void)
+static void __exit clusterip_tg_exit(void)
{
printk(KERN_NOTICE "ClusterIP Version %s unloading\n",
CLUSTERIP_VERSION);
@@ -756,8 +749,8 @@ static void __exit ipt_clusterip_fini(void)
remove_proc_entry(clusterip_procdir->name, clusterip_procdir->parent);
#endif
nf_unregister_hook(&cip_arp_ops);
- xt_unregister_target(&clusterip_tgt);
+ xt_unregister_target(&clusterip_tg_reg);
}
-module_init(ipt_clusterip_init);
-module_exit(ipt_clusterip_fini);
+module_init(clusterip_tg_init);
+module_exit(clusterip_tg_exit);
diff --git a/net/ipv4/netfilter/ipt_ECN.c b/net/ipv4/netfilter/ipt_ECN.c
index add110060a22..21395bc2b27f 100644
--- a/net/ipv4/netfilter/ipt_ECN.c
+++ b/net/ipv4/netfilter/ipt_ECN.c
@@ -21,7 +21,7 @@
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>");
-MODULE_DESCRIPTION("iptables ECN modification module");
+MODULE_DESCRIPTION("Xtables: Explicit Congestion Notification (ECN) flag modification");
/* set ECT codepoint from IP header.
* return false if there was an error. */
@@ -38,7 +38,7 @@ set_ect_ip(struct sk_buff *skb, const struct ipt_ECN_info *einfo)
oldtos = iph->tos;
iph->tos &= ~IPT_ECN_IP_MASK;
iph->tos |= (einfo->ip_ect & IPT_ECN_IP_MASK);
- nf_csum_replace2(&iph->check, htons(oldtos), htons(iph->tos));
+ csum_replace2(&iph->check, htons(oldtos), htons(iph->tos));
}
return true;
}
@@ -71,18 +71,15 @@ set_ect_tcp(struct sk_buff *skb, const struct ipt_ECN_info *einfo)
if (einfo->operation & IPT_ECN_OP_SET_CWR)
tcph->cwr = einfo->proto.tcp.cwr;
- nf_proto_csum_replace2(&tcph->check, skb,
- oldval, ((__be16 *)tcph)[6], 0);
+ inet_proto_csum_replace2(&tcph->check, skb,
+ oldval, ((__be16 *)tcph)[6], 0);
return true;
}
static unsigned int
-target(struct sk_buff *skb,
- const struct net_device *in,
- const struct net_device *out,
- unsigned int hooknum,
- const struct xt_target *target,
- const void *targinfo)
+ecn_tg(struct sk_buff *skb, const struct net_device *in,
+ const struct net_device *out, unsigned int hooknum,
+ const struct xt_target *target, const void *targinfo)
{
const struct ipt_ECN_info *einfo = targinfo;
@@ -99,11 +96,9 @@ target(struct sk_buff *skb,
}
static bool
-checkentry(const char *tablename,
- const void *e_void,
- const struct xt_target *target,
- void *targinfo,
- unsigned int hook_mask)
+ecn_tg_check(const char *tablename, const void *e_void,
+ const struct xt_target *target, void *targinfo,
+ unsigned int hook_mask)
{
const struct ipt_ECN_info *einfo = (struct ipt_ECN_info *)targinfo;
const struct ipt_entry *e = e_void;
@@ -127,25 +122,25 @@ checkentry(const char *tablename,
return true;
}
-static struct xt_target ipt_ecn_reg __read_mostly = {
+static struct xt_target ecn_tg_reg __read_mostly = {
.name = "ECN",
.family = AF_INET,
- .target = target,
+ .target = ecn_tg,
.targetsize = sizeof(struct ipt_ECN_info),
.table = "mangle",
- .checkentry = checkentry,
+ .checkentry = ecn_tg_check,
.me = THIS_MODULE,
};
-static int __init ipt_ecn_init(void)
+static int __init ecn_tg_init(void)
{
- return xt_register_target(&ipt_ecn_reg);
+ return xt_register_target(&ecn_tg_reg);
}
-static void __exit ipt_ecn_fini(void)
+static void __exit ecn_tg_exit(void)
{
- xt_unregister_target(&ipt_ecn_reg);
+ xt_unregister_target(&ecn_tg_reg);
}
-module_init(ipt_ecn_init);
-module_exit(ipt_ecn_fini);
+module_init(ecn_tg_init);
+module_exit(ecn_tg_exit);
diff --git a/net/ipv4/netfilter/ipt_LOG.c b/net/ipv4/netfilter/ipt_LOG.c
index 4b5e8216a4e7..b38d7850f506 100644
--- a/net/ipv4/netfilter/ipt_LOG.c
+++ b/net/ipv4/netfilter/ipt_LOG.c
@@ -22,10 +22,11 @@
#include <linux/netfilter.h>
#include <linux/netfilter/x_tables.h>
#include <linux/netfilter_ipv4/ipt_LOG.h>
+#include <net/netfilter/nf_log.h>
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>");
-MODULE_DESCRIPTION("iptables syslog logging module");
+MODULE_DESCRIPTION("Xtables: IPv4 packet logging to syslog");
/* Use lock to serialize, so printks don't overlap */
static DEFINE_SPINLOCK(log_lock);
@@ -337,7 +338,9 @@ static void dump_packet(const struct nf_loginfo *info,
if ((logflags & IPT_LOG_UID) && !iphoff && skb->sk) {
read_lock_bh(&skb->sk->sk_callback_lock);
if (skb->sk->sk_socket && skb->sk->sk_socket->file)
- printk("UID=%u ", skb->sk->sk_socket->file->f_uid);
+ printk("UID=%u GID=%u",
+ skb->sk->sk_socket->file->f_uid,
+ skb->sk->sk_socket->file->f_gid);
read_unlock_bh(&skb->sk->sk_callback_lock);
}
@@ -418,12 +421,9 @@ ipt_log_packet(unsigned int pf,
}
static unsigned int
-ipt_log_target(struct sk_buff *skb,
- const struct net_device *in,
- const struct net_device *out,
- unsigned int hooknum,
- const struct xt_target *target,
- const void *targinfo)
+log_tg(struct sk_buff *skb, const struct net_device *in,
+ const struct net_device *out, unsigned int hooknum,
+ const struct xt_target *target, const void *targinfo)
{
const struct ipt_log_info *loginfo = targinfo;
struct nf_loginfo li;
@@ -437,11 +437,10 @@ ipt_log_target(struct sk_buff *skb,
return XT_CONTINUE;
}
-static bool ipt_log_checkentry(const char *tablename,
- const void *e,
- const struct xt_target *target,
- void *targinfo,
- unsigned int hook_mask)
+static bool
+log_tg_check(const char *tablename, const void *e,
+ const struct xt_target *target, void *targinfo,
+ unsigned int hook_mask)
{
const struct ipt_log_info *loginfo = targinfo;
@@ -457,37 +456,37 @@ static bool ipt_log_checkentry(const char *tablename,
return true;
}
-static struct xt_target ipt_log_reg __read_mostly = {
+static struct xt_target log_tg_reg __read_mostly = {
.name = "LOG",
.family = AF_INET,
- .target = ipt_log_target,
+ .target = log_tg,
.targetsize = sizeof(struct ipt_log_info),
- .checkentry = ipt_log_checkentry,
+ .checkentry = log_tg_check,
.me = THIS_MODULE,
};
-static struct nf_logger ipt_log_logger ={
+static const struct nf_logger ipt_log_logger ={
.name = "ipt_LOG",
.logfn = &ipt_log_packet,
.me = THIS_MODULE,
};
-static int __init ipt_log_init(void)
+static int __init log_tg_init(void)
{
int ret;
- ret = xt_register_target(&ipt_log_reg);
+ ret = xt_register_target(&log_tg_reg);
if (ret < 0)
return ret;
nf_log_register(PF_INET, &ipt_log_logger);
return 0;
}
-static void __exit ipt_log_fini(void)
+static void __exit log_tg_exit(void)
{
nf_log_unregister(&ipt_log_logger);
- xt_unregister_target(&ipt_log_reg);
+ xt_unregister_target(&log_tg_reg);
}
-module_init(ipt_log_init);
-module_exit(ipt_log_fini);
+module_init(log_tg_init);
+module_exit(log_tg_exit);
diff --git a/net/ipv4/netfilter/ipt_MASQUERADE.c b/net/ipv4/netfilter/ipt_MASQUERADE.c
index 44b516e7cb79..d80fee8327e4 100644
--- a/net/ipv4/netfilter/ipt_MASQUERADE.c
+++ b/net/ipv4/netfilter/ipt_MASQUERADE.c
@@ -25,18 +25,16 @@
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>");
-MODULE_DESCRIPTION("iptables MASQUERADE target module");
+MODULE_DESCRIPTION("Xtables: automatic-address SNAT");
/* Lock protects masq region inside conntrack */
static DEFINE_RWLOCK(masq_lock);
/* FIXME: Multiple targets. --RR */
static bool
-masquerade_check(const char *tablename,
- const void *e,
- const struct xt_target *target,
- void *targinfo,
- unsigned int hook_mask)
+masquerade_tg_check(const char *tablename, const void *e,
+ const struct xt_target *target, void *targinfo,
+ unsigned int hook_mask)
{
const struct nf_nat_multi_range_compat *mr = targinfo;
@@ -52,12 +50,9 @@ masquerade_check(const char *tablename,
}
static unsigned int
-masquerade_target(struct sk_buff *skb,
- const struct net_device *in,
- const struct net_device *out,
- unsigned int hooknum,
- const struct xt_target *target,
- const void *targinfo)
+masquerade_tg(struct sk_buff *skb, const struct net_device *in,
+ const struct net_device *out, unsigned int hooknum,
+ const struct xt_target *target, const void *targinfo)
{
struct nf_conn *ct;
struct nf_conn_nat *nat;
@@ -67,7 +62,7 @@ masquerade_target(struct sk_buff *skb,
const struct rtable *rt;
__be32 newsrc;
- NF_CT_ASSERT(hooknum == NF_IP_POST_ROUTING);
+ NF_CT_ASSERT(hooknum == NF_INET_POST_ROUTING);
ct = nf_ct_get(skb, &ctinfo);
nat = nfct_nat(ct);
@@ -100,7 +95,7 @@ masquerade_target(struct sk_buff *skb,
mr->range[0].min, mr->range[0].max });
/* Hand modified range to generic setup. */
- return nf_nat_setup_info(ct, &newrange, hooknum);
+ return nf_nat_setup_info(ct, &newrange, IP_NAT_MANIP_SRC);
}
static int
@@ -166,22 +161,22 @@ static struct notifier_block masq_inet_notifier = {
.notifier_call = masq_inet_event,
};
-static struct xt_target masquerade __read_mostly = {
+static struct xt_target masquerade_tg_reg __read_mostly = {
.name = "MASQUERADE",
.family = AF_INET,
- .target = masquerade_target,
+ .target = masquerade_tg,
.targetsize = sizeof(struct nf_nat_multi_range_compat),
.table = "nat",
- .hooks = 1 << NF_IP_POST_ROUTING,
- .checkentry = masquerade_check,
+ .hooks = 1 << NF_INET_POST_ROUTING,
+ .checkentry = masquerade_tg_check,
.me = THIS_MODULE,
};
-static int __init ipt_masquerade_init(void)
+static int __init masquerade_tg_init(void)
{
int ret;
- ret = xt_register_target(&masquerade);
+ ret = xt_register_target(&masquerade_tg_reg);
if (ret == 0) {
/* Register for device down reports */
@@ -193,12 +188,12 @@ static int __init ipt_masquerade_init(void)
return ret;
}
-static void __exit ipt_masquerade_fini(void)
+static void __exit masquerade_tg_exit(void)
{
- xt_unregister_target(&masquerade);
+ xt_unregister_target(&masquerade_tg_reg);
unregister_netdevice_notifier(&masq_dev_notifier);
unregister_inetaddr_notifier(&masq_inet_notifier);
}
-module_init(ipt_masquerade_init);
-module_exit(ipt_masquerade_fini);
+module_init(masquerade_tg_init);
+module_exit(masquerade_tg_exit);
diff --git a/net/ipv4/netfilter/ipt_NETMAP.c b/net/ipv4/netfilter/ipt_NETMAP.c
index f8699291e33d..6739abfd1521 100644
--- a/net/ipv4/netfilter/ipt_NETMAP.c
+++ b/net/ipv4/netfilter/ipt_NETMAP.c
@@ -20,14 +20,12 @@
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Svenning Soerensen <svenning@post5.tele.dk>");
-MODULE_DESCRIPTION("iptables 1:1 NAT mapping of IP networks target");
+MODULE_DESCRIPTION("Xtables: 1:1 NAT mapping of IPv4 subnets");
static bool
-check(const char *tablename,
- const void *e,
- const struct xt_target *target,
- void *targinfo,
- unsigned int hook_mask)
+netmap_tg_check(const char *tablename, const void *e,
+ const struct xt_target *target, void *targinfo,
+ unsigned int hook_mask)
{
const struct nf_nat_multi_range_compat *mr = targinfo;
@@ -43,12 +41,9 @@ check(const char *tablename,
}
static unsigned int
-target(struct sk_buff *skb,
- const struct net_device *in,
- const struct net_device *out,
- unsigned int hooknum,
- const struct xt_target *target,
- const void *targinfo)
+netmap_tg(struct sk_buff *skb, const struct net_device *in,
+ const struct net_device *out, unsigned int hooknum,
+ const struct xt_target *target, const void *targinfo)
{
struct nf_conn *ct;
enum ip_conntrack_info ctinfo;
@@ -56,14 +51,14 @@ target(struct sk_buff *skb,
const struct nf_nat_multi_range_compat *mr = targinfo;
struct nf_nat_range newrange;
- NF_CT_ASSERT(hooknum == NF_IP_PRE_ROUTING
- || hooknum == NF_IP_POST_ROUTING
- || hooknum == NF_IP_LOCAL_OUT);
+ NF_CT_ASSERT(hooknum == NF_INET_PRE_ROUTING
+ || hooknum == NF_INET_POST_ROUTING
+ || hooknum == NF_INET_LOCAL_OUT);
ct = nf_ct_get(skb, &ctinfo);
netmask = ~(mr->range[0].min_ip ^ mr->range[0].max_ip);
- if (hooknum == NF_IP_PRE_ROUTING || hooknum == NF_IP_LOCAL_OUT)
+ if (hooknum == NF_INET_PRE_ROUTING || hooknum == NF_INET_LOCAL_OUT)
new_ip = ip_hdr(skb)->daddr & ~netmask;
else
new_ip = ip_hdr(skb)->saddr & ~netmask;
@@ -75,30 +70,31 @@ target(struct sk_buff *skb,
mr->range[0].min, mr->range[0].max });
/* Hand modified range to generic setup. */
- return nf_nat_setup_info(ct, &newrange, hooknum);
+ return nf_nat_setup_info(ct, &newrange, HOOK2MANIP(hooknum));
}
-static struct xt_target target_module __read_mostly = {
+static struct xt_target netmap_tg_reg __read_mostly = {
.name = "NETMAP",
.family = AF_INET,
- .target = target,
+ .target = netmap_tg,
.targetsize = sizeof(struct nf_nat_multi_range_compat),
.table = "nat",
- .hooks = (1 << NF_IP_PRE_ROUTING) | (1 << NF_IP_POST_ROUTING) |
- (1 << NF_IP_LOCAL_OUT),
- .checkentry = check,
+ .hooks = (1 << NF_INET_PRE_ROUTING) |
+ (1 << NF_INET_POST_ROUTING) |
+ (1 << NF_INET_LOCAL_OUT),
+ .checkentry = netmap_tg_check,
.me = THIS_MODULE
};
-static int __init ipt_netmap_init(void)
+static int __init netmap_tg_init(void)
{
- return xt_register_target(&target_module);
+ return xt_register_target(&netmap_tg_reg);
}
-static void __exit ipt_netmap_fini(void)
+static void __exit netmap_tg_exit(void)
{
- xt_unregister_target(&target_module);
+ xt_unregister_target(&netmap_tg_reg);
}
-module_init(ipt_netmap_init);
-module_exit(ipt_netmap_fini);
+module_init(netmap_tg_init);
+module_exit(netmap_tg_exit);
diff --git a/net/ipv4/netfilter/ipt_REDIRECT.c b/net/ipv4/netfilter/ipt_REDIRECT.c
index f7cf7d61a2d4..5c6292449d13 100644
--- a/net/ipv4/netfilter/ipt_REDIRECT.c
+++ b/net/ipv4/netfilter/ipt_REDIRECT.c
@@ -23,15 +23,13 @@
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>");
-MODULE_DESCRIPTION("iptables REDIRECT target module");
+MODULE_DESCRIPTION("Xtables: Connection redirection to localhost");
/* FIXME: Take multiple ranges --RR */
static bool
-redirect_check(const char *tablename,
- const void *e,
- const struct xt_target *target,
- void *targinfo,
- unsigned int hook_mask)
+redirect_tg_check(const char *tablename, const void *e,
+ const struct xt_target *target, void *targinfo,
+ unsigned int hook_mask)
{
const struct nf_nat_multi_range_compat *mr = targinfo;
@@ -47,12 +45,9 @@ redirect_check(const char *tablename,
}
static unsigned int
-redirect_target(struct sk_buff *skb,
- const struct net_device *in,
- const struct net_device *out,
- unsigned int hooknum,
- const struct xt_target *target,
- const void *targinfo)
+redirect_tg(struct sk_buff *skb, const struct net_device *in,
+ const struct net_device *out, unsigned int hooknum,
+ const struct xt_target *target, const void *targinfo)
{
struct nf_conn *ct;
enum ip_conntrack_info ctinfo;
@@ -60,14 +55,14 @@ redirect_target(struct sk_buff *skb,
const struct nf_nat_multi_range_compat *mr = targinfo;
struct nf_nat_range newrange;
- NF_CT_ASSERT(hooknum == NF_IP_PRE_ROUTING
- || hooknum == NF_IP_LOCAL_OUT);
+ NF_CT_ASSERT(hooknum == NF_INET_PRE_ROUTING
+ || hooknum == NF_INET_LOCAL_OUT);
ct = nf_ct_get(skb, &ctinfo);
NF_CT_ASSERT(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED));
/* Local packets: make them go to loopback */
- if (hooknum == NF_IP_LOCAL_OUT)
+ if (hooknum == NF_INET_LOCAL_OUT)
newdst = htonl(0x7F000001);
else {
struct in_device *indev;
@@ -92,29 +87,29 @@ redirect_target(struct sk_buff *skb,
mr->range[0].min, mr->range[0].max });
/* Hand modified range to generic setup. */
- return nf_nat_setup_info(ct, &newrange, hooknum);
+ return nf_nat_setup_info(ct, &newrange, IP_NAT_MANIP_DST);
}
-static struct xt_target redirect_reg __read_mostly = {
+static struct xt_target redirect_tg_reg __read_mostly = {
.name = "REDIRECT",
.family = AF_INET,
- .target = redirect_target,
+ .target = redirect_tg,
.targetsize = sizeof(struct nf_nat_multi_range_compat),
.table = "nat",
- .hooks = (1 << NF_IP_PRE_ROUTING) | (1 << NF_IP_LOCAL_OUT),
- .checkentry = redirect_check,
+ .hooks = (1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_LOCAL_OUT),
+ .checkentry = redirect_tg_check,
.me = THIS_MODULE,
};
-static int __init ipt_redirect_init(void)
+static int __init redirect_tg_init(void)
{
- return xt_register_target(&redirect_reg);
+ return xt_register_target(&redirect_tg_reg);
}
-static void __exit ipt_redirect_fini(void)
+static void __exit redirect_tg_exit(void)
{
- xt_unregister_target(&redirect_reg);
+ xt_unregister_target(&redirect_tg_reg);
}
-module_init(ipt_redirect_init);
-module_exit(ipt_redirect_fini);
+module_init(redirect_tg_init);
+module_exit(redirect_tg_exit);
diff --git a/net/ipv4/netfilter/ipt_REJECT.c b/net/ipv4/netfilter/ipt_REJECT.c
index dcf4d21d5116..22606e2baa16 100644
--- a/net/ipv4/netfilter/ipt_REJECT.c
+++ b/net/ipv4/netfilter/ipt_REJECT.c
@@ -29,17 +29,14 @@
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>");
-MODULE_DESCRIPTION("iptables REJECT target module");
+MODULE_DESCRIPTION("Xtables: packet \"rejection\" target for IPv4");
/* Send RST reply */
static void send_reset(struct sk_buff *oldskb, int hook)
{
struct sk_buff *nskb;
- struct iphdr *niph;
+ struct iphdr *oiph, *niph;
struct tcphdr _otcph, *oth, *tcph;
- __be16 tmp_port;
- __be32 tmp_addr;
- int needs_ack;
unsigned int addr_type;
/* IP header checks: fragment. */
@@ -58,99 +55,73 @@ static void send_reset(struct sk_buff *oldskb, int hook)
/* Check checksum */
if (nf_ip_checksum(oldskb, hook, ip_hdrlen(oldskb), IPPROTO_TCP))
return;
+ oiph = ip_hdr(oldskb);
- /* We need a linear, writeable skb. We also need to expand
- headroom in case hh_len of incoming interface < hh_len of
- outgoing interface */
- nskb = skb_copy_expand(oldskb, LL_MAX_HEADER, skb_tailroom(oldskb),
- GFP_ATOMIC);
+ nskb = alloc_skb(sizeof(struct iphdr) + sizeof(struct tcphdr) +
+ LL_MAX_HEADER, GFP_ATOMIC);
if (!nskb)
return;
- /* This packet will not be the same as the other: clear nf fields */
- nf_reset(nskb);
- nskb->mark = 0;
- skb_init_secmark(nskb);
-
- skb_shinfo(nskb)->gso_size = 0;
- skb_shinfo(nskb)->gso_segs = 0;
- skb_shinfo(nskb)->gso_type = 0;
-
- tcph = (struct tcphdr *)(skb_network_header(nskb) + ip_hdrlen(nskb));
-
- /* Swap source and dest */
- niph = ip_hdr(nskb);
- tmp_addr = niph->saddr;
- niph->saddr = niph->daddr;
- niph->daddr = tmp_addr;
- tmp_port = tcph->source;
- tcph->source = tcph->dest;
- tcph->dest = tmp_port;
-
- /* Truncate to length (no data) */
- tcph->doff = sizeof(struct tcphdr)/4;
- skb_trim(nskb, ip_hdrlen(nskb) + sizeof(struct tcphdr));
- niph->tot_len = htons(nskb->len);
-
- if (tcph->ack) {
- needs_ack = 0;
+ skb_reserve(nskb, LL_MAX_HEADER);
+
+ skb_reset_network_header(nskb);
+ niph = (struct iphdr *)skb_put(nskb, sizeof(struct iphdr));
+ niph->version = 4;
+ niph->ihl = sizeof(struct iphdr) / 4;
+ niph->tos = 0;
+ niph->id = 0;
+ niph->frag_off = htons(IP_DF);
+ niph->protocol = IPPROTO_TCP;
+ niph->check = 0;
+ niph->saddr = oiph->daddr;
+ niph->daddr = oiph->saddr;
+
+ tcph = (struct tcphdr *)skb_put(nskb, sizeof(struct tcphdr));
+ memset(tcph, 0, sizeof(*tcph));
+ tcph->source = oth->dest;
+ tcph->dest = oth->source;
+ tcph->doff = sizeof(struct tcphdr) / 4;
+
+ if (oth->ack)
tcph->seq = oth->ack_seq;
- tcph->ack_seq = 0;
- } else {
- needs_ack = 1;
+ else {
tcph->ack_seq = htonl(ntohl(oth->seq) + oth->syn + oth->fin +
oldskb->len - ip_hdrlen(oldskb) -
(oth->doff << 2));
- tcph->seq = 0;
+ tcph->ack = 1;
}
- /* Reset flags */
- ((u_int8_t *)tcph)[13] = 0;
- tcph->rst = 1;
- tcph->ack = needs_ack;
-
- tcph->window = 0;
- tcph->urg_ptr = 0;
-
- /* Adjust TCP checksum */
- tcph->check = 0;
- tcph->check = tcp_v4_check(sizeof(struct tcphdr),
- niph->saddr, niph->daddr,
- csum_partial(tcph,
- sizeof(struct tcphdr), 0));
-
- /* Set DF, id = 0 */
- niph->frag_off = htons(IP_DF);
- niph->id = 0;
+ tcph->rst = 1;
+ tcph->check = tcp_v4_check(sizeof(struct tcphdr),
+ niph->saddr, niph->daddr,
+ csum_partial(tcph,
+ sizeof(struct tcphdr), 0));
addr_type = RTN_UNSPEC;
- if (hook != NF_IP_FORWARD
+ if (hook != NF_INET_FORWARD
#ifdef CONFIG_BRIDGE_NETFILTER
|| (nskb->nf_bridge && nskb->nf_bridge->mask & BRNF_BRIDGED)
#endif
)
addr_type = RTN_LOCAL;
+ /* ip_route_me_harder expects skb->dst to be set */
+ dst_hold(oldskb->dst);
+ nskb->dst = oldskb->dst;
+
if (ip_route_me_harder(nskb, addr_type))
goto free_nskb;
+ niph->ttl = dst_metric(nskb->dst, RTAX_HOPLIMIT);
nskb->ip_summed = CHECKSUM_NONE;
- /* Adjust IP TTL */
- niph->ttl = dst_metric(nskb->dst, RTAX_HOPLIMIT);
-
- /* Adjust IP checksum */
- niph->check = 0;
- niph->check = ip_fast_csum(skb_network_header(nskb), niph->ihl);
-
/* "Never happens" */
if (nskb->len > dst_mtu(nskb->dst))
goto free_nskb;
nf_ct_attach(nskb, oldskb);
- NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, nskb, NULL, nskb->dst->dev,
- dst_output);
+ ip_local_out(nskb);
return;
free_nskb:
@@ -162,20 +133,13 @@ static inline void send_unreach(struct sk_buff *skb_in, int code)
icmp_send(skb_in, ICMP_DEST_UNREACH, code, 0);
}
-static unsigned int reject(struct sk_buff *skb,
- const struct net_device *in,
- const struct net_device *out,
- unsigned int hooknum,
- const struct xt_target *target,
- const void *targinfo)
+static unsigned int
+reject_tg(struct sk_buff *skb, const struct net_device *in,
+ const struct net_device *out, unsigned int hooknum,
+ const struct xt_target *target, const void *targinfo)
{
const struct ipt_reject_info *reject = targinfo;
- /* Our naive response construction doesn't deal with IP
- options, and probably shouldn't try. */
- if (ip_hdrlen(skb) != sizeof(struct iphdr))
- return NF_DROP;
-
/* WARNING: This code causes reentry within iptables.
This means that the iptables jump stack is now crap. We
must return an absolute verdict. --RR */
@@ -211,11 +175,10 @@ static unsigned int reject(struct sk_buff *skb,
return NF_DROP;
}
-static bool check(const char *tablename,
- const void *e_void,
- const struct xt_target *target,
- void *targinfo,
- unsigned int hook_mask)
+static bool
+reject_tg_check(const char *tablename, const void *e_void,
+ const struct xt_target *target, void *targinfo,
+ unsigned int hook_mask)
{
const struct ipt_reject_info *rejinfo = targinfo;
const struct ipt_entry *e = e_void;
@@ -234,27 +197,27 @@ static bool check(const char *tablename,
return true;
}
-static struct xt_target ipt_reject_reg __read_mostly = {
+static struct xt_target reject_tg_reg __read_mostly = {
.name = "REJECT",
.family = AF_INET,
- .target = reject,
+ .target = reject_tg,
.targetsize = sizeof(struct ipt_reject_info),
.table = "filter",
- .hooks = (1 << NF_IP_LOCAL_IN) | (1 << NF_IP_FORWARD) |
- (1 << NF_IP_LOCAL_OUT),
- .checkentry = check,
+ .hooks = (1 << NF_INET_LOCAL_IN) | (1 << NF_INET_FORWARD) |
+ (1 << NF_INET_LOCAL_OUT),
+ .checkentry = reject_tg_check,
.me = THIS_MODULE,
};
-static int __init ipt_reject_init(void)
+static int __init reject_tg_init(void)
{
- return xt_register_target(&ipt_reject_reg);
+ return xt_register_target(&reject_tg_reg);
}
-static void __exit ipt_reject_fini(void)
+static void __exit reject_tg_exit(void)
{
- xt_unregister_target(&ipt_reject_reg);
+ xt_unregister_target(&reject_tg_reg);
}
-module_init(ipt_reject_init);
-module_exit(ipt_reject_fini);
+module_init(reject_tg_init);
+module_exit(reject_tg_exit);
diff --git a/net/ipv4/netfilter/ipt_SAME.c b/net/ipv4/netfilter/ipt_SAME.c
deleted file mode 100644
index 8988571436b8..000000000000
--- a/net/ipv4/netfilter/ipt_SAME.c
+++ /dev/null
@@ -1,179 +0,0 @@
-/* Same. Just like SNAT, only try to make the connections
- * between client A and server B always have the same source ip.
- *
- * (C) 2000 Paul `Rusty' Russell
- * (C) 2001 Martin Josefsson
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-#include <linux/types.h>
-#include <linux/ip.h>
-#include <linux/timer.h>
-#include <linux/module.h>
-#include <linux/netfilter.h>
-#include <linux/netdevice.h>
-#include <linux/if.h>
-#include <linux/inetdevice.h>
-#include <net/protocol.h>
-#include <net/checksum.h>
-#include <linux/netfilter_ipv4.h>
-#include <linux/netfilter/x_tables.h>
-#include <net/netfilter/nf_nat_rule.h>
-#include <linux/netfilter_ipv4/ipt_SAME.h>
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Martin Josefsson <gandalf@wlug.westbo.se>");
-MODULE_DESCRIPTION("iptables special SNAT module for consistent sourceip");
-
-static bool
-same_check(const char *tablename,
- const void *e,
- const struct xt_target *target,
- void *targinfo,
- unsigned int hook_mask)
-{
- unsigned int count, countess, rangeip, index = 0;
- struct ipt_same_info *mr = targinfo;
-
- mr->ipnum = 0;
-
- if (mr->rangesize < 1) {
- pr_debug("same_check: need at least one dest range.\n");
- return false;
- }
- if (mr->rangesize > IPT_SAME_MAX_RANGE) {
- pr_debug("same_check: too many ranges specified, maximum "
- "is %u ranges\n", IPT_SAME_MAX_RANGE);
- return false;
- }
- for (count = 0; count < mr->rangesize; count++) {
- if (ntohl(mr->range[count].min_ip) >
- ntohl(mr->range[count].max_ip)) {
- pr_debug("same_check: min_ip is larger than max_ip in "
- "range `%u.%u.%u.%u-%u.%u.%u.%u'.\n",
- NIPQUAD(mr->range[count].min_ip),
- NIPQUAD(mr->range[count].max_ip));
- return false;
- }
- if (!(mr->range[count].flags & IP_NAT_RANGE_MAP_IPS)) {
- pr_debug("same_check: bad MAP_IPS.\n");
- return false;
- }
- rangeip = (ntohl(mr->range[count].max_ip) -
- ntohl(mr->range[count].min_ip) + 1);
- mr->ipnum += rangeip;
-
- pr_debug("same_check: range %u, ipnum = %u\n", count, rangeip);
- }
- pr_debug("same_check: total ipaddresses = %u\n", mr->ipnum);
-
- mr->iparray = kmalloc((sizeof(u_int32_t) * mr->ipnum), GFP_KERNEL);
- if (!mr->iparray) {
- pr_debug("same_check: Couldn't allocate %Zu bytes "
- "for %u ipaddresses!\n",
- (sizeof(u_int32_t) * mr->ipnum), mr->ipnum);
- return false;
- }
- pr_debug("same_check: Allocated %Zu bytes for %u ipaddresses.\n",
- (sizeof(u_int32_t) * mr->ipnum), mr->ipnum);
-
- for (count = 0; count < mr->rangesize; count++) {
- for (countess = ntohl(mr->range[count].min_ip);
- countess <= ntohl(mr->range[count].max_ip);
- countess++) {
- mr->iparray[index] = countess;
- pr_debug("same_check: Added ipaddress `%u.%u.%u.%u' "
- "in index %u.\n", HIPQUAD(countess), index);
- index++;
- }
- }
- return true;
-}
-
-static void
-same_destroy(const struct xt_target *target, void *targinfo)
-{
- struct ipt_same_info *mr = targinfo;
-
- kfree(mr->iparray);
-
- pr_debug("same_destroy: Deallocated %Zu bytes for %u ipaddresses.\n",
- (sizeof(u_int32_t) * mr->ipnum), mr->ipnum);
-}
-
-static unsigned int
-same_target(struct sk_buff *skb,
- const struct net_device *in,
- const struct net_device *out,
- unsigned int hooknum,
- const struct xt_target *target,
- const void *targinfo)
-{
- struct nf_conn *ct;
- enum ip_conntrack_info ctinfo;
- u_int32_t tmpip, aindex;
- __be32 new_ip;
- const struct ipt_same_info *same = targinfo;
- struct nf_nat_range newrange;
- const struct nf_conntrack_tuple *t;
-
- NF_CT_ASSERT(hooknum == NF_IP_PRE_ROUTING ||
- hooknum == NF_IP_POST_ROUTING);
- ct = nf_ct_get(skb, &ctinfo);
-
- t = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple;
-
- /* Base new source on real src ip and optionally dst ip,
- giving some hope for consistency across reboots.
- Here we calculate the index in same->iparray which
- holds the ipaddress we should use */
-
- tmpip = ntohl(t->src.u3.ip);
-
- if (!(same->info & IPT_SAME_NODST))
- tmpip += ntohl(t->dst.u3.ip);
- aindex = tmpip % same->ipnum;
-
- new_ip = htonl(same->iparray[aindex]);
-
- pr_debug("ipt_SAME: src=%u.%u.%u.%u dst=%u.%u.%u.%u, "
- "new src=%u.%u.%u.%u\n",
- NIPQUAD(t->src.u3.ip), NIPQUAD(t->dst.u3.ip), NIPQUAD(new_ip));
-
- /* Transfer from original range. */
- newrange = ((struct nf_nat_range)
- { same->range[0].flags, new_ip, new_ip,
- /* FIXME: Use ports from correct range! */
- same->range[0].min, same->range[0].max });
-
- /* Hand modified range to generic setup. */
- return nf_nat_setup_info(ct, &newrange, hooknum);
-}
-
-static struct xt_target same_reg __read_mostly = {
- .name = "SAME",
- .family = AF_INET,
- .target = same_target,
- .targetsize = sizeof(struct ipt_same_info),
- .table = "nat",
- .hooks = (1 << NF_IP_PRE_ROUTING | 1 << NF_IP_POST_ROUTING),
- .checkentry = same_check,
- .destroy = same_destroy,
- .me = THIS_MODULE,
-};
-
-static int __init ipt_same_init(void)
-{
- return xt_register_target(&same_reg);
-}
-
-static void __exit ipt_same_fini(void)
-{
- xt_unregister_target(&same_reg);
-}
-
-module_init(ipt_same_init);
-module_exit(ipt_same_fini);
-
diff --git a/net/ipv4/netfilter/ipt_TOS.c b/net/ipv4/netfilter/ipt_TOS.c
deleted file mode 100644
index d4573baa7f27..000000000000
--- a/net/ipv4/netfilter/ipt_TOS.c
+++ /dev/null
@@ -1,87 +0,0 @@
-/* This is a module which is used for setting the TOS field of a packet. */
-
-/* (C) 1999-2001 Paul `Rusty' Russell
- * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-#include <linux/module.h>
-#include <linux/skbuff.h>
-#include <linux/ip.h>
-#include <net/checksum.h>
-
-#include <linux/netfilter/x_tables.h>
-#include <linux/netfilter_ipv4/ipt_TOS.h>
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>");
-MODULE_DESCRIPTION("iptables TOS mangling module");
-
-static unsigned int
-target(struct sk_buff *skb,
- const struct net_device *in,
- const struct net_device *out,
- unsigned int hooknum,
- const struct xt_target *target,
- const void *targinfo)
-{
- const struct ipt_tos_target_info *tosinfo = targinfo;
- struct iphdr *iph = ip_hdr(skb);
-
- if ((iph->tos & IPTOS_TOS_MASK) != tosinfo->tos) {
- __u8 oldtos;
- if (!skb_make_writable(skb, sizeof(struct iphdr)))
- return NF_DROP;
- iph = ip_hdr(skb);
- oldtos = iph->tos;
- iph->tos = (iph->tos & IPTOS_PREC_MASK) | tosinfo->tos;
- nf_csum_replace2(&iph->check, htons(oldtos), htons(iph->tos));
- }
- return XT_CONTINUE;
-}
-
-static bool
-checkentry(const char *tablename,
- const void *e_void,
- const struct xt_target *target,
- void *targinfo,
- unsigned int hook_mask)
-{
- const u_int8_t tos = ((struct ipt_tos_target_info *)targinfo)->tos;
-
- if (tos != IPTOS_LOWDELAY
- && tos != IPTOS_THROUGHPUT
- && tos != IPTOS_RELIABILITY
- && tos != IPTOS_MINCOST
- && tos != IPTOS_NORMALSVC) {
- printk(KERN_WARNING "TOS: bad tos value %#x\n", tos);
- return false;
- }
- return true;
-}
-
-static struct xt_target ipt_tos_reg __read_mostly = {
- .name = "TOS",
- .family = AF_INET,
- .target = target,
- .targetsize = sizeof(struct ipt_tos_target_info),
- .table = "mangle",
- .checkentry = checkentry,
- .me = THIS_MODULE,
-};
-
-static int __init ipt_tos_init(void)
-{
- return xt_register_target(&ipt_tos_reg);
-}
-
-static void __exit ipt_tos_fini(void)
-{
- xt_unregister_target(&ipt_tos_reg);
-}
-
-module_init(ipt_tos_init);
-module_exit(ipt_tos_fini);
diff --git a/net/ipv4/netfilter/ipt_TTL.c b/net/ipv4/netfilter/ipt_TTL.c
index c620a0527666..30eed65e7338 100644
--- a/net/ipv4/netfilter/ipt_TTL.c
+++ b/net/ipv4/netfilter/ipt_TTL.c
@@ -16,14 +16,13 @@
#include <linux/netfilter_ipv4/ipt_TTL.h>
MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>");
-MODULE_DESCRIPTION("IP tables TTL modification module");
+MODULE_DESCRIPTION("Xtables: IPv4 TTL field modification target");
MODULE_LICENSE("GPL");
static unsigned int
-ipt_ttl_target(struct sk_buff *skb,
- const struct net_device *in, const struct net_device *out,
- unsigned int hooknum, const struct xt_target *target,
- const void *targinfo)
+ttl_tg(struct sk_buff *skb, const struct net_device *in,
+ const struct net_device *out, unsigned int hooknum,
+ const struct xt_target *target, const void *targinfo)
{
struct iphdr *iph;
const struct ipt_TTL_info *info = targinfo;
@@ -54,19 +53,18 @@ ipt_ttl_target(struct sk_buff *skb,
}
if (new_ttl != iph->ttl) {
- nf_csum_replace2(&iph->check, htons(iph->ttl << 8),
- htons(new_ttl << 8));
+ csum_replace2(&iph->check, htons(iph->ttl << 8),
+ htons(new_ttl << 8));
iph->ttl = new_ttl;
}
return XT_CONTINUE;
}
-static bool ipt_ttl_checkentry(const char *tablename,
- const void *e,
- const struct xt_target *target,
- void *targinfo,
- unsigned int hook_mask)
+static bool
+ttl_tg_check(const char *tablename, const void *e,
+ const struct xt_target *target, void *targinfo,
+ unsigned int hook_mask)
{
const struct ipt_TTL_info *info = targinfo;
@@ -80,25 +78,25 @@ static bool ipt_ttl_checkentry(const char *tablename,
return true;
}
-static struct xt_target ipt_TTL __read_mostly = {
+static struct xt_target ttl_tg_reg __read_mostly = {
.name = "TTL",
.family = AF_INET,
- .target = ipt_ttl_target,
+ .target = ttl_tg,
.targetsize = sizeof(struct ipt_TTL_info),
.table = "mangle",
- .checkentry = ipt_ttl_checkentry,
+ .checkentry = ttl_tg_check,
.me = THIS_MODULE,
};
-static int __init ipt_ttl_init(void)
+static int __init ttl_tg_init(void)
{
- return xt_register_target(&ipt_TTL);
+ return xt_register_target(&ttl_tg_reg);
}
-static void __exit ipt_ttl_fini(void)
+static void __exit ttl_tg_exit(void)
{
- xt_unregister_target(&ipt_TTL);
+ xt_unregister_target(&ttl_tg_reg);
}
-module_init(ipt_ttl_init);
-module_exit(ipt_ttl_fini);
+module_init(ttl_tg_init);
+module_exit(ttl_tg_exit);
diff --git a/net/ipv4/netfilter/ipt_ULOG.c b/net/ipv4/netfilter/ipt_ULOG.c
index 212b830765a4..b192756c6d0d 100644
--- a/net/ipv4/netfilter/ipt_ULOG.c
+++ b/net/ipv4/netfilter/ipt_ULOG.c
@@ -43,13 +43,14 @@
#include <linux/netfilter.h>
#include <linux/netfilter/x_tables.h>
#include <linux/netfilter_ipv4/ipt_ULOG.h>
+#include <net/netfilter/nf_log.h>
#include <net/sock.h>
#include <linux/bitops.h>
#include <asm/unaligned.h>
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Harald Welte <laforge@gnumonks.org>");
-MODULE_DESCRIPTION("iptables userspace logging module");
+MODULE_DESCRIPTION("Xtables: packet logging to netlink using ULOG");
MODULE_ALIAS_NET_PF_PROTO(PF_NETLINK, NETLINK_NFLOG);
#define ULOG_NL_EVENT 111 /* Harald's favorite number */
@@ -279,12 +280,10 @@ alloc_failure:
spin_unlock_bh(&ulog_lock);
}
-static unsigned int ipt_ulog_target(struct sk_buff *skb,
- const struct net_device *in,
- const struct net_device *out,
- unsigned int hooknum,
- const struct xt_target *target,
- const void *targinfo)
+static unsigned int
+ulog_tg(struct sk_buff *skb, const struct net_device *in,
+ const struct net_device *out, unsigned int hooknum,
+ const struct xt_target *target, const void *targinfo)
{
struct ipt_ulog_info *loginfo = (struct ipt_ulog_info *) targinfo;
@@ -318,11 +317,10 @@ static void ipt_logfn(unsigned int pf,
ipt_ulog_packet(hooknum, skb, in, out, &loginfo, prefix);
}
-static bool ipt_ulog_checkentry(const char *tablename,
- const void *e,
- const struct xt_target *target,
- void *targinfo,
- unsigned int hookmask)
+static bool
+ulog_tg_check(const char *tablename, const void *e,
+ const struct xt_target *target, void *targinfo,
+ unsigned int hookmask)
{
const struct ipt_ulog_info *loginfo = targinfo;
@@ -347,7 +345,7 @@ struct compat_ipt_ulog_info {
char prefix[ULOG_PREFIX_LEN];
};
-static void compat_from_user(void *dst, void *src)
+static void ulog_tg_compat_from_user(void *dst, void *src)
{
const struct compat_ipt_ulog_info *cl = src;
struct ipt_ulog_info l = {
@@ -360,7 +358,7 @@ static void compat_from_user(void *dst, void *src)
memcpy(dst, &l, sizeof(l));
}
-static int compat_to_user(void __user *dst, void *src)
+static int ulog_tg_compat_to_user(void __user *dst, void *src)
{
const struct ipt_ulog_info *l = src;
struct compat_ipt_ulog_info cl = {
@@ -374,16 +372,16 @@ static int compat_to_user(void __user *dst, void *src)
}
#endif /* CONFIG_COMPAT */
-static struct xt_target ipt_ulog_reg __read_mostly = {
+static struct xt_target ulog_tg_reg __read_mostly = {
.name = "ULOG",
.family = AF_INET,
- .target = ipt_ulog_target,
+ .target = ulog_tg,
.targetsize = sizeof(struct ipt_ulog_info),
- .checkentry = ipt_ulog_checkentry,
+ .checkentry = ulog_tg_check,
#ifdef CONFIG_COMPAT
.compatsize = sizeof(struct compat_ipt_ulog_info),
- .compat_from_user = compat_from_user,
- .compat_to_user = compat_to_user,
+ .compat_from_user = ulog_tg_compat_from_user,
+ .compat_to_user = ulog_tg_compat_to_user,
#endif
.me = THIS_MODULE,
};
@@ -394,7 +392,7 @@ static struct nf_logger ipt_ulog_logger = {
.me = THIS_MODULE,
};
-static int __init ipt_ulog_init(void)
+static int __init ulog_tg_init(void)
{
int ret, i;
@@ -415,9 +413,9 @@ static int __init ipt_ulog_init(void)
if (!nflognl)
return -ENOMEM;
- ret = xt_register_target(&ipt_ulog_reg);
+ ret = xt_register_target(&ulog_tg_reg);
if (ret < 0) {
- sock_release(nflognl->sk_socket);
+ netlink_kernel_release(nflognl);
return ret;
}
if (nflog)
@@ -426,7 +424,7 @@ static int __init ipt_ulog_init(void)
return 0;
}
-static void __exit ipt_ulog_fini(void)
+static void __exit ulog_tg_exit(void)
{
ulog_buff_t *ub;
int i;
@@ -435,8 +433,8 @@ static void __exit ipt_ulog_fini(void)
if (nflog)
nf_log_unregister(&ipt_ulog_logger);
- xt_unregister_target(&ipt_ulog_reg);
- sock_release(nflognl->sk_socket);
+ xt_unregister_target(&ulog_tg_reg);
+ netlink_kernel_release(nflognl);
/* remove pending timers and free allocated skb's */
for (i = 0; i < ULOG_MAXNLGROUPS; i++) {
@@ -453,5 +451,5 @@ static void __exit ipt_ulog_fini(void)
}
}
-module_init(ipt_ulog_init);
-module_exit(ipt_ulog_fini);
+module_init(ulog_tg_init);
+module_exit(ulog_tg_exit);
diff --git a/net/ipv4/netfilter/ipt_addrtype.c b/net/ipv4/netfilter/ipt_addrtype.c
index 59f01f7ba6b4..49587a497229 100644
--- a/net/ipv4/netfilter/ipt_addrtype.c
+++ b/net/ipv4/netfilter/ipt_addrtype.c
@@ -2,6 +2,7 @@
* iptables module to match inet_addr_type() of an ip.
*
* Copyright (c) 2004 Patrick McHardy <kaber@trash.net>
+ * (C) 2007 Laszlo Attila Toth <panther@balabit.hu>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
@@ -20,47 +21,119 @@
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
-MODULE_DESCRIPTION("iptables addrtype match");
+MODULE_DESCRIPTION("Xtables: address type match for IPv4");
-static inline bool match_type(__be32 addr, u_int16_t mask)
+static inline bool match_type(const struct net_device *dev, __be32 addr,
+ u_int16_t mask)
{
- return !!(mask & (1 << inet_addr_type(addr)));
+ return !!(mask & (1 << inet_dev_addr_type(&init_net, dev, addr)));
}
-static bool match(const struct sk_buff *skb,
- const struct net_device *in, const struct net_device *out,
- const struct xt_match *match, const void *matchinfo,
- int offset, unsigned int protoff, bool *hotdrop)
+static bool
+addrtype_mt_v0(const struct sk_buff *skb, const struct net_device *in,
+ const struct net_device *out, const struct xt_match *match,
+ const void *matchinfo, int offset, unsigned int protoff,
+ bool *hotdrop)
{
const struct ipt_addrtype_info *info = matchinfo;
const struct iphdr *iph = ip_hdr(skb);
bool ret = true;
if (info->source)
- ret &= match_type(iph->saddr, info->source)^info->invert_source;
+ ret &= match_type(NULL, iph->saddr, info->source) ^
+ info->invert_source;
if (info->dest)
- ret &= match_type(iph->daddr, info->dest)^info->invert_dest;
+ ret &= match_type(NULL, iph->daddr, info->dest) ^
+ info->invert_dest;
return ret;
}
-static struct xt_match addrtype_match __read_mostly = {
- .name = "addrtype",
- .family = AF_INET,
- .match = match,
- .matchsize = sizeof(struct ipt_addrtype_info),
- .me = THIS_MODULE
+static bool
+addrtype_mt_v1(const struct sk_buff *skb, const struct net_device *in,
+ const struct net_device *out, const struct xt_match *match,
+ const void *matchinfo, int offset, unsigned int protoff,
+ bool *hotdrop)
+{
+ const struct ipt_addrtype_info_v1 *info = matchinfo;
+ const struct iphdr *iph = ip_hdr(skb);
+ const struct net_device *dev = NULL;
+ bool ret = true;
+
+ if (info->flags & IPT_ADDRTYPE_LIMIT_IFACE_IN)
+ dev = in;
+ else if (info->flags & IPT_ADDRTYPE_LIMIT_IFACE_OUT)
+ dev = out;
+
+ if (info->source)
+ ret &= match_type(dev, iph->saddr, info->source) ^
+ (info->flags & IPT_ADDRTYPE_INVERT_SOURCE);
+ if (ret && info->dest)
+ ret &= match_type(dev, iph->daddr, info->dest) ^
+ (info->flags & IPT_ADDRTYPE_INVERT_DEST);
+ return ret;
+}
+
+static bool
+addrtype_mt_checkentry_v1(const char *tablename, const void *ip_void,
+ const struct xt_match *match, void *matchinfo,
+ unsigned int hook_mask)
+{
+ struct ipt_addrtype_info_v1 *info = matchinfo;
+
+ if (info->flags & IPT_ADDRTYPE_LIMIT_IFACE_IN &&
+ info->flags & IPT_ADDRTYPE_LIMIT_IFACE_OUT) {
+ printk(KERN_ERR "ipt_addrtype: both incoming and outgoing "
+ "interface limitation cannot be selected\n");
+ return false;
+ }
+
+ if (hook_mask & (1 << NF_INET_PRE_ROUTING | 1 << NF_INET_LOCAL_IN) &&
+ info->flags & IPT_ADDRTYPE_LIMIT_IFACE_OUT) {
+ printk(KERN_ERR "ipt_addrtype: output interface limitation "
+ "not valid in PRE_ROUTING and INPUT\n");
+ return false;
+ }
+
+ if (hook_mask & (1 << NF_INET_POST_ROUTING | 1 << NF_INET_LOCAL_OUT) &&
+ info->flags & IPT_ADDRTYPE_LIMIT_IFACE_IN) {
+ printk(KERN_ERR "ipt_addrtype: input interface limitation "
+ "not valid in POST_ROUTING and OUTPUT\n");
+ return false;
+ }
+
+ return true;
+}
+
+static struct xt_match addrtype_mt_reg[] __read_mostly = {
+ {
+ .name = "addrtype",
+ .family = AF_INET,
+ .match = addrtype_mt_v0,
+ .matchsize = sizeof(struct ipt_addrtype_info),
+ .me = THIS_MODULE
+ },
+ {
+ .name = "addrtype",
+ .family = AF_INET,
+ .revision = 1,
+ .match = addrtype_mt_v1,
+ .checkentry = addrtype_mt_checkentry_v1,
+ .matchsize = sizeof(struct ipt_addrtype_info_v1),
+ .me = THIS_MODULE
+ }
};
-static int __init ipt_addrtype_init(void)
+static int __init addrtype_mt_init(void)
{
- return xt_register_match(&addrtype_match);
+ return xt_register_matches(addrtype_mt_reg,
+ ARRAY_SIZE(addrtype_mt_reg));
}
-static void __exit ipt_addrtype_fini(void)
+static void __exit addrtype_mt_exit(void)
{
- xt_unregister_match(&addrtype_match);
+ xt_unregister_matches(addrtype_mt_reg, ARRAY_SIZE(addrtype_mt_reg));
}
-module_init(ipt_addrtype_init);
-module_exit(ipt_addrtype_fini);
+module_init(addrtype_mt_init);
+module_exit(addrtype_mt_exit);
diff --git a/net/ipv4/netfilter/ipt_ah.c b/net/ipv4/netfilter/ipt_ah.c
index 61b017fd743c..e977989629c7 100644
--- a/net/ipv4/netfilter/ipt_ah.c
+++ b/net/ipv4/netfilter/ipt_ah.c
@@ -16,7 +16,7 @@
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Yon Uriarte <yon@astaro.de>");
-MODULE_DESCRIPTION("iptables AH SPI match module");
+MODULE_DESCRIPTION("Xtables: IPv4 IPsec-AH SPI match");
#ifdef DEBUG_CONNTRACK
#define duprintf(format, args...) printk(format , ## args)
@@ -37,14 +37,9 @@ spi_match(u_int32_t min, u_int32_t max, u_int32_t spi, bool invert)
}
static bool
-match(const struct sk_buff *skb,
- const struct net_device *in,
- const struct net_device *out,
- const struct xt_match *match,
- const void *matchinfo,
- int offset,
- unsigned int protoff,
- bool *hotdrop)
+ah_mt(const struct sk_buff *skb, const struct net_device *in,
+ const struct net_device *out, const struct xt_match *match,
+ const void *matchinfo, int offset, unsigned int protoff, bool *hotdrop)
{
struct ip_auth_hdr _ahdr;
const struct ip_auth_hdr *ah;
@@ -72,11 +67,9 @@ match(const struct sk_buff *skb,
/* Called when user tries to insert an entry of this type. */
static bool
-checkentry(const char *tablename,
- const void *ip_void,
- const struct xt_match *match,
- void *matchinfo,
- unsigned int hook_mask)
+ah_mt_check(const char *tablename, const void *ip_void,
+ const struct xt_match *match, void *matchinfo,
+ unsigned int hook_mask)
{
const struct ipt_ah *ahinfo = matchinfo;
@@ -88,25 +81,25 @@ checkentry(const char *tablename,
return true;
}
-static struct xt_match ah_match __read_mostly = {
+static struct xt_match ah_mt_reg __read_mostly = {
.name = "ah",
.family = AF_INET,
- .match = match,
+ .match = ah_mt,
.matchsize = sizeof(struct ipt_ah),
.proto = IPPROTO_AH,
- .checkentry = checkentry,
+ .checkentry = ah_mt_check,
.me = THIS_MODULE,
};
-static int __init ipt_ah_init(void)
+static int __init ah_mt_init(void)
{
- return xt_register_match(&ah_match);
+ return xt_register_match(&ah_mt_reg);
}
-static void __exit ipt_ah_fini(void)
+static void __exit ah_mt_exit(void)
{
- xt_unregister_match(&ah_match);
+ xt_unregister_match(&ah_mt_reg);
}
-module_init(ipt_ah_init);
-module_exit(ipt_ah_fini);
+module_init(ah_mt_init);
+module_exit(ah_mt_exit);
diff --git a/net/ipv4/netfilter/ipt_ecn.c b/net/ipv4/netfilter/ipt_ecn.c
index d6925c674069..749de8284ce5 100644
--- a/net/ipv4/netfilter/ipt_ecn.c
+++ b/net/ipv4/netfilter/ipt_ecn.c
@@ -19,7 +19,7 @@
#include <linux/netfilter_ipv4/ipt_ecn.h>
MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>");
-MODULE_DESCRIPTION("iptables ECN matching module");
+MODULE_DESCRIPTION("Xtables: Explicit Congestion Notification (ECN) flag match for IPv4");
MODULE_LICENSE("GPL");
static inline bool match_ip(const struct sk_buff *skb,
@@ -67,10 +67,10 @@ static inline bool match_tcp(const struct sk_buff *skb,
return true;
}
-static bool match(const struct sk_buff *skb,
- const struct net_device *in, const struct net_device *out,
- const struct xt_match *match, const void *matchinfo,
- int offset, unsigned int protoff, bool *hotdrop)
+static bool
+ecn_mt(const struct sk_buff *skb, const struct net_device *in,
+ const struct net_device *out, const struct xt_match *match,
+ const void *matchinfo, int offset, unsigned int protoff, bool *hotdrop)
{
const struct ipt_ecn_info *info = matchinfo;
@@ -88,9 +88,10 @@ static bool match(const struct sk_buff *skb,
return true;
}
-static bool checkentry(const char *tablename, const void *ip_void,
- const struct xt_match *match,
- void *matchinfo, unsigned int hook_mask)
+static bool
+ecn_mt_check(const char *tablename, const void *ip_void,
+ const struct xt_match *match, void *matchinfo,
+ unsigned int hook_mask)
{
const struct ipt_ecn_info *info = matchinfo;
const struct ipt_ip *ip = ip_void;
@@ -111,24 +112,24 @@ static bool checkentry(const char *tablename, const void *ip_void,
return true;
}
-static struct xt_match ecn_match __read_mostly = {
+static struct xt_match ecn_mt_reg __read_mostly = {
.name = "ecn",
.family = AF_INET,
- .match = match,
+ .match = ecn_mt,
.matchsize = sizeof(struct ipt_ecn_info),
- .checkentry = checkentry,
+ .checkentry = ecn_mt_check,
.me = THIS_MODULE,
};
-static int __init ipt_ecn_init(void)
+static int __init ecn_mt_init(void)
{
- return xt_register_match(&ecn_match);
+ return xt_register_match(&ecn_mt_reg);
}
-static void __exit ipt_ecn_fini(void)
+static void __exit ecn_mt_exit(void)
{
- xt_unregister_match(&ecn_match);
+ xt_unregister_match(&ecn_mt_reg);
}
-module_init(ipt_ecn_init);
-module_exit(ipt_ecn_fini);
+module_init(ecn_mt_init);
+module_exit(ecn_mt_exit);
diff --git a/net/ipv4/netfilter/ipt_iprange.c b/net/ipv4/netfilter/ipt_iprange.c
deleted file mode 100644
index 0106dc955a69..000000000000
--- a/net/ipv4/netfilter/ipt_iprange.c
+++ /dev/null
@@ -1,79 +0,0 @@
-/*
- * iptables module to match IP address ranges
- *
- * (C) 2003 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-#include <linux/module.h>
-#include <linux/skbuff.h>
-#include <linux/ip.h>
-#include <linux/netfilter/x_tables.h>
-#include <linux/netfilter_ipv4/ipt_iprange.h>
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>");
-MODULE_DESCRIPTION("iptables arbitrary IP range match module");
-
-static bool
-match(const struct sk_buff *skb,
- const struct net_device *in,
- const struct net_device *out,
- const struct xt_match *match,
- const void *matchinfo,
- int offset, unsigned int protoff, bool *hotdrop)
-{
- const struct ipt_iprange_info *info = matchinfo;
- const struct iphdr *iph = ip_hdr(skb);
-
- if (info->flags & IPRANGE_SRC) {
- if ((ntohl(iph->saddr) < ntohl(info->src.min_ip)
- || ntohl(iph->saddr) > ntohl(info->src.max_ip))
- ^ !!(info->flags & IPRANGE_SRC_INV)) {
- pr_debug("src IP %u.%u.%u.%u NOT in range %s"
- "%u.%u.%u.%u-%u.%u.%u.%u\n",
- NIPQUAD(iph->saddr),
- info->flags & IPRANGE_SRC_INV ? "(INV) " : "",
- NIPQUAD(info->src.min_ip),
- NIPQUAD(info->src.max_ip));
- return false;
- }
- }
- if (info->flags & IPRANGE_DST) {
- if ((ntohl(iph->daddr) < ntohl(info->dst.min_ip)
- || ntohl(iph->daddr) > ntohl(info->dst.max_ip))
- ^ !!(info->flags & IPRANGE_DST_INV)) {
- pr_debug("dst IP %u.%u.%u.%u NOT in range %s"
- "%u.%u.%u.%u-%u.%u.%u.%u\n",
- NIPQUAD(iph->daddr),
- info->flags & IPRANGE_DST_INV ? "(INV) " : "",
- NIPQUAD(info->dst.min_ip),
- NIPQUAD(info->dst.max_ip));
- return false;
- }
- }
- return true;
-}
-
-static struct xt_match iprange_match __read_mostly = {
- .name = "iprange",
- .family = AF_INET,
- .match = match,
- .matchsize = sizeof(struct ipt_iprange_info),
- .me = THIS_MODULE
-};
-
-static int __init ipt_iprange_init(void)
-{
- return xt_register_match(&iprange_match);
-}
-
-static void __exit ipt_iprange_fini(void)
-{
- xt_unregister_match(&iprange_match);
-}
-
-module_init(ipt_iprange_init);
-module_exit(ipt_iprange_fini);
diff --git a/net/ipv4/netfilter/ipt_owner.c b/net/ipv4/netfilter/ipt_owner.c
deleted file mode 100644
index b14e77da7a33..000000000000
--- a/net/ipv4/netfilter/ipt_owner.c
+++ /dev/null
@@ -1,92 +0,0 @@
-/* Kernel module to match various things tied to sockets associated with
- locally generated outgoing packets. */
-
-/* (C) 2000 Marc Boucher <marc@mbsi.ca>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-#include <linux/module.h>
-#include <linux/skbuff.h>
-#include <linux/file.h>
-#include <linux/rcupdate.h>
-#include <net/sock.h>
-
-#include <linux/netfilter_ipv4/ipt_owner.h>
-#include <linux/netfilter/x_tables.h>
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Marc Boucher <marc@mbsi.ca>");
-MODULE_DESCRIPTION("iptables owner match");
-
-static bool
-match(const struct sk_buff *skb,
- const struct net_device *in,
- const struct net_device *out,
- const struct xt_match *match,
- const void *matchinfo,
- int offset,
- unsigned int protoff,
- bool *hotdrop)
-{
- const struct ipt_owner_info *info = matchinfo;
-
- if (!skb->sk || !skb->sk->sk_socket || !skb->sk->sk_socket->file)
- return false;
-
- if(info->match & IPT_OWNER_UID) {
- if ((skb->sk->sk_socket->file->f_uid != info->uid) ^
- !!(info->invert & IPT_OWNER_UID))
- return false;
- }
-
- if(info->match & IPT_OWNER_GID) {
- if ((skb->sk->sk_socket->file->f_gid != info->gid) ^
- !!(info->invert & IPT_OWNER_GID))
- return false;
- }
-
- return true;
-}
-
-static bool
-checkentry(const char *tablename,
- const void *ip,
- const struct xt_match *match,
- void *matchinfo,
- unsigned int hook_mask)
-{
- const struct ipt_owner_info *info = matchinfo;
-
- if (info->match & (IPT_OWNER_PID|IPT_OWNER_SID|IPT_OWNER_COMM)) {
- printk("ipt_owner: pid, sid and command matching "
- "not supported anymore\n");
- return false;
- }
- return true;
-}
-
-static struct xt_match owner_match __read_mostly = {
- .name = "owner",
- .family = AF_INET,
- .match = match,
- .matchsize = sizeof(struct ipt_owner_info),
- .hooks = (1 << NF_IP_LOCAL_OUT) | (1 << NF_IP_POST_ROUTING),
- .checkentry = checkentry,
- .me = THIS_MODULE,
-};
-
-static int __init ipt_owner_init(void)
-{
- return xt_register_match(&owner_match);
-}
-
-static void __exit ipt_owner_fini(void)
-{
- xt_unregister_match(&owner_match);
-}
-
-module_init(ipt_owner_init);
-module_exit(ipt_owner_fini);
diff --git a/net/ipv4/netfilter/ipt_recent.c b/net/ipv4/netfilter/ipt_recent.c
index 11d39fb5f38b..e3154a99c08a 100644
--- a/net/ipv4/netfilter/ipt_recent.c
+++ b/net/ipv4/netfilter/ipt_recent.c
@@ -30,7 +30,7 @@
#include <linux/netfilter_ipv4/ipt_recent.h>
MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
-MODULE_DESCRIPTION("IP tables recently seen matching module");
+MODULE_DESCRIPTION("Xtables: \"recently-seen\" host matching for IPv4");
MODULE_LICENSE("GPL");
static unsigned int ip_list_tot = 100;
@@ -170,10 +170,10 @@ static void recent_table_flush(struct recent_table *t)
}
static bool
-ipt_recent_match(const struct sk_buff *skb,
- const struct net_device *in, const struct net_device *out,
- const struct xt_match *match, const void *matchinfo,
- int offset, unsigned int protoff, bool *hotdrop)
+recent_mt(const struct sk_buff *skb, const struct net_device *in,
+ const struct net_device *out, const struct xt_match *match,
+ const void *matchinfo, int offset, unsigned int protoff,
+ bool *hotdrop)
{
const struct ipt_recent_info *info = matchinfo;
struct recent_table *t;
@@ -236,9 +236,9 @@ out:
}
static bool
-ipt_recent_checkentry(const char *tablename, const void *ip,
- const struct xt_match *match, void *matchinfo,
- unsigned int hook_mask)
+recent_mt_check(const char *tablename, const void *ip,
+ const struct xt_match *match, void *matchinfo,
+ unsigned int hook_mask)
{
const struct ipt_recent_info *info = matchinfo;
struct recent_table *t;
@@ -293,8 +293,7 @@ out:
return ret;
}
-static void
-ipt_recent_destroy(const struct xt_match *match, void *matchinfo)
+static void recent_mt_destroy(const struct xt_match *match, void *matchinfo)
{
const struct ipt_recent_info *info = matchinfo;
struct recent_table *t;
@@ -455,17 +454,17 @@ static const struct file_operations recent_fops = {
};
#endif /* CONFIG_PROC_FS */
-static struct xt_match recent_match __read_mostly = {
+static struct xt_match recent_mt_reg __read_mostly = {
.name = "recent",
.family = AF_INET,
- .match = ipt_recent_match,
+ .match = recent_mt,
.matchsize = sizeof(struct ipt_recent_info),
- .checkentry = ipt_recent_checkentry,
- .destroy = ipt_recent_destroy,
+ .checkentry = recent_mt_check,
+ .destroy = recent_mt_destroy,
.me = THIS_MODULE,
};
-static int __init ipt_recent_init(void)
+static int __init recent_mt_init(void)
{
int err;
@@ -473,27 +472,27 @@ static int __init ipt_recent_init(void)
return -EINVAL;
ip_list_hash_size = 1 << fls(ip_list_tot);
- err = xt_register_match(&recent_match);
+ err = xt_register_match(&recent_mt_reg);
#ifdef CONFIG_PROC_FS
if (err)
return err;
proc_dir = proc_mkdir("ipt_recent", init_net.proc_net);
if (proc_dir == NULL) {
- xt_unregister_match(&recent_match);
+ xt_unregister_match(&recent_mt_reg);
err = -ENOMEM;
}
#endif
return err;
}
-static void __exit ipt_recent_exit(void)
+static void __exit recent_mt_exit(void)
{
BUG_ON(!list_empty(&tables));
- xt_unregister_match(&recent_match);
+ xt_unregister_match(&recent_mt_reg);
#ifdef CONFIG_PROC_FS
remove_proc_entry("ipt_recent", init_net.proc_net);
#endif
}
-module_init(ipt_recent_init);
-module_exit(ipt_recent_exit);
+module_init(recent_mt_init);
+module_exit(recent_mt_exit);
diff --git a/net/ipv4/netfilter/ipt_tos.c b/net/ipv4/netfilter/ipt_tos.c
deleted file mode 100644
index e740441c973d..000000000000
--- a/net/ipv4/netfilter/ipt_tos.c
+++ /dev/null
@@ -1,55 +0,0 @@
-/* Kernel module to match TOS values. */
-
-/* (C) 1999-2001 Paul `Rusty' Russell
- * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-#include <linux/ip.h>
-#include <linux/module.h>
-#include <linux/skbuff.h>
-
-#include <linux/netfilter_ipv4/ipt_tos.h>
-#include <linux/netfilter/x_tables.h>
-
-MODULE_LICENSE("GPL");
-MODULE_DESCRIPTION("iptables TOS match module");
-
-static bool
-match(const struct sk_buff *skb,
- const struct net_device *in,
- const struct net_device *out,
- const struct xt_match *match,
- const void *matchinfo,
- int offset,
- unsigned int protoff,
- bool *hotdrop)
-{
- const struct ipt_tos_info *info = matchinfo;
-
- return (ip_hdr(skb)->tos == info->tos) ^ info->invert;
-}
-
-static struct xt_match tos_match __read_mostly = {
- .name = "tos",
- .family = AF_INET,
- .match = match,
- .matchsize = sizeof(struct ipt_tos_info),
- .me = THIS_MODULE,
-};
-
-static int __init ipt_multiport_init(void)
-{
- return xt_register_match(&tos_match);
-}
-
-static void __exit ipt_multiport_fini(void)
-{
- xt_unregister_match(&tos_match);
-}
-
-module_init(ipt_multiport_init);
-module_exit(ipt_multiport_fini);
diff --git a/net/ipv4/netfilter/ipt_ttl.c b/net/ipv4/netfilter/ipt_ttl.c
index a439900a4ba5..e0b8caeb710c 100644
--- a/net/ipv4/netfilter/ipt_ttl.c
+++ b/net/ipv4/netfilter/ipt_ttl.c
@@ -15,13 +15,13 @@
#include <linux/netfilter/x_tables.h>
MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>");
-MODULE_DESCRIPTION("IP tables TTL matching module");
+MODULE_DESCRIPTION("Xtables: IPv4 TTL field match");
MODULE_LICENSE("GPL");
-static bool match(const struct sk_buff *skb,
- const struct net_device *in, const struct net_device *out,
- const struct xt_match *match, const void *matchinfo,
- int offset, unsigned int protoff, bool *hotdrop)
+static bool
+ttl_mt(const struct sk_buff *skb, const struct net_device *in,
+ const struct net_device *out, const struct xt_match *match,
+ const void *matchinfo, int offset, unsigned int protoff, bool *hotdrop)
{
const struct ipt_ttl_info *info = matchinfo;
const u8 ttl = ip_hdr(skb)->ttl;
@@ -44,23 +44,23 @@ static bool match(const struct sk_buff *skb,
return false;
}
-static struct xt_match ttl_match __read_mostly = {
+static struct xt_match ttl_mt_reg __read_mostly = {
.name = "ttl",
.family = AF_INET,
- .match = match,
+ .match = ttl_mt,
.matchsize = sizeof(struct ipt_ttl_info),
.me = THIS_MODULE,
};
-static int __init ipt_ttl_init(void)
+static int __init ttl_mt_init(void)
{
- return xt_register_match(&ttl_match);
+ return xt_register_match(&ttl_mt_reg);
}
-static void __exit ipt_ttl_fini(void)
+static void __exit ttl_mt_exit(void)
{
- xt_unregister_match(&ttl_match);
+ xt_unregister_match(&ttl_mt_reg);
}
-module_init(ipt_ttl_init);
-module_exit(ipt_ttl_fini);
+module_init(ttl_mt_init);
+module_exit(ttl_mt_exit);
diff --git a/net/ipv4/netfilter/iptable_filter.c b/net/ipv4/netfilter/iptable_filter.c
index ba3262c60437..29bb4f9fbda0 100644
--- a/net/ipv4/netfilter/iptable_filter.c
+++ b/net/ipv4/netfilter/iptable_filter.c
@@ -19,7 +19,9 @@ MODULE_LICENSE("GPL");
MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>");
MODULE_DESCRIPTION("iptables filter table");
-#define FILTER_VALID_HOOKS ((1 << NF_IP_LOCAL_IN) | (1 << NF_IP_FORWARD) | (1 << NF_IP_LOCAL_OUT))
+#define FILTER_VALID_HOOKS ((1 << NF_INET_LOCAL_IN) | \
+ (1 << NF_INET_FORWARD) | \
+ (1 << NF_INET_LOCAL_OUT))
static struct
{
@@ -33,14 +35,14 @@ static struct
.num_entries = 4,
.size = sizeof(struct ipt_standard) * 3 + sizeof(struct ipt_error),
.hook_entry = {
- [NF_IP_LOCAL_IN] = 0,
- [NF_IP_FORWARD] = sizeof(struct ipt_standard),
- [NF_IP_LOCAL_OUT] = sizeof(struct ipt_standard) * 2,
+ [NF_INET_LOCAL_IN] = 0,
+ [NF_INET_FORWARD] = sizeof(struct ipt_standard),
+ [NF_INET_LOCAL_OUT] = sizeof(struct ipt_standard) * 2,
},
.underflow = {
- [NF_IP_LOCAL_IN] = 0,
- [NF_IP_FORWARD] = sizeof(struct ipt_standard),
- [NF_IP_LOCAL_OUT] = sizeof(struct ipt_standard) * 2,
+ [NF_INET_LOCAL_IN] = 0,
+ [NF_INET_FORWARD] = sizeof(struct ipt_standard),
+ [NF_INET_LOCAL_OUT] = sizeof(struct ipt_standard) * 2,
},
},
.entries = {
@@ -89,26 +91,26 @@ ipt_local_out_hook(unsigned int hook,
return ipt_do_table(skb, hook, in, out, &packet_filter);
}
-static struct nf_hook_ops ipt_ops[] = {
+static struct nf_hook_ops ipt_ops[] __read_mostly = {
{
.hook = ipt_hook,
.owner = THIS_MODULE,
.pf = PF_INET,
- .hooknum = NF_IP_LOCAL_IN,
+ .hooknum = NF_INET_LOCAL_IN,
.priority = NF_IP_PRI_FILTER,
},
{
.hook = ipt_hook,
.owner = THIS_MODULE,
.pf = PF_INET,
- .hooknum = NF_IP_FORWARD,
+ .hooknum = NF_INET_FORWARD,
.priority = NF_IP_PRI_FILTER,
},
{
.hook = ipt_local_out_hook,
.owner = THIS_MODULE,
.pf = PF_INET,
- .hooknum = NF_IP_LOCAL_OUT,
+ .hooknum = NF_INET_LOCAL_OUT,
.priority = NF_IP_PRI_FILTER,
},
};
diff --git a/net/ipv4/netfilter/iptable_mangle.c b/net/ipv4/netfilter/iptable_mangle.c
index b4360a69d5ca..5c4be202430c 100644
--- a/net/ipv4/netfilter/iptable_mangle.c
+++ b/net/ipv4/netfilter/iptable_mangle.c
@@ -21,11 +21,11 @@ MODULE_LICENSE("GPL");
MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>");
MODULE_DESCRIPTION("iptables mangle table");
-#define MANGLE_VALID_HOOKS ((1 << NF_IP_PRE_ROUTING) | \
- (1 << NF_IP_LOCAL_IN) | \
- (1 << NF_IP_FORWARD) | \
- (1 << NF_IP_LOCAL_OUT) | \
- (1 << NF_IP_POST_ROUTING))
+#define MANGLE_VALID_HOOKS ((1 << NF_INET_PRE_ROUTING) | \
+ (1 << NF_INET_LOCAL_IN) | \
+ (1 << NF_INET_FORWARD) | \
+ (1 << NF_INET_LOCAL_OUT) | \
+ (1 << NF_INET_POST_ROUTING))
/* Ouch - five different hooks? Maybe this should be a config option..... -- BC */
static struct
@@ -40,18 +40,18 @@ static struct
.num_entries = 6,
.size = sizeof(struct ipt_standard) * 5 + sizeof(struct ipt_error),
.hook_entry = {
- [NF_IP_PRE_ROUTING] = 0,
- [NF_IP_LOCAL_IN] = sizeof(struct ipt_standard),
- [NF_IP_FORWARD] = sizeof(struct ipt_standard) * 2,
- [NF_IP_LOCAL_OUT] = sizeof(struct ipt_standard) * 3,
- [NF_IP_POST_ROUTING] = sizeof(struct ipt_standard) * 4,
+ [NF_INET_PRE_ROUTING] = 0,
+ [NF_INET_LOCAL_IN] = sizeof(struct ipt_standard),
+ [NF_INET_FORWARD] = sizeof(struct ipt_standard) * 2,
+ [NF_INET_LOCAL_OUT] = sizeof(struct ipt_standard) * 3,
+ [NF_INET_POST_ROUTING] = sizeof(struct ipt_standard) * 4,
},
.underflow = {
- [NF_IP_PRE_ROUTING] = 0,
- [NF_IP_LOCAL_IN] = sizeof(struct ipt_standard),
- [NF_IP_FORWARD] = sizeof(struct ipt_standard) * 2,
- [NF_IP_LOCAL_OUT] = sizeof(struct ipt_standard) * 3,
- [NF_IP_POST_ROUTING] = sizeof(struct ipt_standard) * 4,
+ [NF_INET_PRE_ROUTING] = 0,
+ [NF_INET_LOCAL_IN] = sizeof(struct ipt_standard),
+ [NF_INET_FORWARD] = sizeof(struct ipt_standard) * 2,
+ [NF_INET_LOCAL_OUT] = sizeof(struct ipt_standard) * 3,
+ [NF_INET_POST_ROUTING] = sizeof(struct ipt_standard) * 4,
},
},
.entries = {
@@ -128,40 +128,40 @@ ipt_local_hook(unsigned int hook,
return ret;
}
-static struct nf_hook_ops ipt_ops[] = {
+static struct nf_hook_ops ipt_ops[] __read_mostly = {
{
.hook = ipt_route_hook,
.owner = THIS_MODULE,
.pf = PF_INET,
- .hooknum = NF_IP_PRE_ROUTING,
+ .hooknum = NF_INET_PRE_ROUTING,
.priority = NF_IP_PRI_MANGLE,
},
{
.hook = ipt_route_hook,
.owner = THIS_MODULE,
.pf = PF_INET,
- .hooknum = NF_IP_LOCAL_IN,
+ .hooknum = NF_INET_LOCAL_IN,
.priority = NF_IP_PRI_MANGLE,
},
{
.hook = ipt_route_hook,
.owner = THIS_MODULE,
.pf = PF_INET,
- .hooknum = NF_IP_FORWARD,
+ .hooknum = NF_INET_FORWARD,
.priority = NF_IP_PRI_MANGLE,
},
{
.hook = ipt_local_hook,
.owner = THIS_MODULE,
.pf = PF_INET,
- .hooknum = NF_IP_LOCAL_OUT,
+ .hooknum = NF_INET_LOCAL_OUT,
.priority = NF_IP_PRI_MANGLE,
},
{
.hook = ipt_route_hook,
.owner = THIS_MODULE,
.pf = PF_INET,
- .hooknum = NF_IP_POST_ROUTING,
+ .hooknum = NF_INET_POST_ROUTING,
.priority = NF_IP_PRI_MANGLE,
},
};
diff --git a/net/ipv4/netfilter/iptable_raw.c b/net/ipv4/netfilter/iptable_raw.c
index f8678651250f..dc34aa274533 100644
--- a/net/ipv4/netfilter/iptable_raw.c
+++ b/net/ipv4/netfilter/iptable_raw.c
@@ -7,7 +7,7 @@
#include <linux/netfilter_ipv4/ip_tables.h>
#include <net/ip.h>
-#define RAW_VALID_HOOKS ((1 << NF_IP_PRE_ROUTING) | (1 << NF_IP_LOCAL_OUT))
+#define RAW_VALID_HOOKS ((1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_LOCAL_OUT))
static struct
{
@@ -21,12 +21,12 @@ static struct
.num_entries = 3,
.size = sizeof(struct ipt_standard) * 2 + sizeof(struct ipt_error),
.hook_entry = {
- [NF_IP_PRE_ROUTING] = 0,
- [NF_IP_LOCAL_OUT] = sizeof(struct ipt_standard)
+ [NF_INET_PRE_ROUTING] = 0,
+ [NF_INET_LOCAL_OUT] = sizeof(struct ipt_standard)
},
.underflow = {
- [NF_IP_PRE_ROUTING] = 0,
- [NF_IP_LOCAL_OUT] = sizeof(struct ipt_standard)
+ [NF_INET_PRE_ROUTING] = 0,
+ [NF_INET_LOCAL_OUT] = sizeof(struct ipt_standard)
},
},
.entries = {
@@ -74,18 +74,18 @@ ipt_local_hook(unsigned int hook,
}
/* 'raw' is the very first table. */
-static struct nf_hook_ops ipt_ops[] = {
+static struct nf_hook_ops ipt_ops[] __read_mostly = {
{
.hook = ipt_hook,
.pf = PF_INET,
- .hooknum = NF_IP_PRE_ROUTING,
+ .hooknum = NF_INET_PRE_ROUTING,
.priority = NF_IP_PRI_RAW,
.owner = THIS_MODULE,
},
{
.hook = ipt_local_hook,
.pf = PF_INET,
- .hooknum = NF_IP_LOCAL_OUT,
+ .hooknum = NF_INET_LOCAL_OUT,
.priority = NF_IP_PRI_RAW,
.owner = THIS_MODULE,
},
diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
index 910dae732a0f..ac3d61d8026e 100644
--- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
+++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
@@ -56,12 +56,6 @@ static int ipv4_print_tuple(struct seq_file *s,
NIPQUAD(tuple->dst.u3.ip));
}
-static int ipv4_print_conntrack(struct seq_file *s,
- const struct nf_conn *conntrack)
-{
- return 0;
-}
-
/* Returns new sk_buff, or NULL */
static int nf_ct_ipv4_gather_frags(struct sk_buff *skb, u_int32_t user)
{
@@ -150,7 +144,7 @@ static unsigned int ipv4_conntrack_defrag(unsigned int hooknum,
/* Gather fragments. */
if (ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)) {
if (nf_ct_ipv4_gather_frags(skb,
- hooknum == NF_IP_PRE_ROUTING ?
+ hooknum == NF_INET_PRE_ROUTING ?
IP_DEFRAG_CONNTRACK_IN :
IP_DEFRAG_CONNTRACK_OUT))
return NF_STOLEN;
@@ -185,61 +179,61 @@ static unsigned int ipv4_conntrack_local(unsigned int hooknum,
/* Connection tracking may drop packets, but never alters them, so
make it the first hook. */
-static struct nf_hook_ops ipv4_conntrack_ops[] = {
+static struct nf_hook_ops ipv4_conntrack_ops[] __read_mostly = {
{
.hook = ipv4_conntrack_defrag,
.owner = THIS_MODULE,
.pf = PF_INET,
- .hooknum = NF_IP_PRE_ROUTING,
+ .hooknum = NF_INET_PRE_ROUTING,
.priority = NF_IP_PRI_CONNTRACK_DEFRAG,
},
{
.hook = ipv4_conntrack_in,
.owner = THIS_MODULE,
.pf = PF_INET,
- .hooknum = NF_IP_PRE_ROUTING,
+ .hooknum = NF_INET_PRE_ROUTING,
.priority = NF_IP_PRI_CONNTRACK,
},
{
.hook = ipv4_conntrack_defrag,
.owner = THIS_MODULE,
.pf = PF_INET,
- .hooknum = NF_IP_LOCAL_OUT,
+ .hooknum = NF_INET_LOCAL_OUT,
.priority = NF_IP_PRI_CONNTRACK_DEFRAG,
},
{
.hook = ipv4_conntrack_local,
.owner = THIS_MODULE,
.pf = PF_INET,
- .hooknum = NF_IP_LOCAL_OUT,
+ .hooknum = NF_INET_LOCAL_OUT,
.priority = NF_IP_PRI_CONNTRACK,
},
{
.hook = ipv4_conntrack_help,
.owner = THIS_MODULE,
.pf = PF_INET,
- .hooknum = NF_IP_POST_ROUTING,
+ .hooknum = NF_INET_POST_ROUTING,
.priority = NF_IP_PRI_CONNTRACK_HELPER,
},
{
.hook = ipv4_conntrack_help,
.owner = THIS_MODULE,
.pf = PF_INET,
- .hooknum = NF_IP_LOCAL_IN,
+ .hooknum = NF_INET_LOCAL_IN,
.priority = NF_IP_PRI_CONNTRACK_HELPER,
},
{
.hook = ipv4_confirm,
.owner = THIS_MODULE,
.pf = PF_INET,
- .hooknum = NF_IP_POST_ROUTING,
+ .hooknum = NF_INET_POST_ROUTING,
.priority = NF_IP_PRI_CONNTRACK_CONFIRM,
},
{
.hook = ipv4_confirm,
.owner = THIS_MODULE,
.pf = PF_INET,
- .hooknum = NF_IP_LOCAL_IN,
+ .hooknum = NF_INET_LOCAL_IN,
.priority = NF_IP_PRI_CONNTRACK_CONFIRM,
},
};
@@ -363,10 +357,8 @@ getorigdst(struct sock *sk, int optval, void __user *user, int *len)
static int ipv4_tuple_to_nlattr(struct sk_buff *skb,
const struct nf_conntrack_tuple *tuple)
{
- NLA_PUT(skb, CTA_IP_V4_SRC, sizeof(u_int32_t),
- &tuple->src.u3.ip);
- NLA_PUT(skb, CTA_IP_V4_DST, sizeof(u_int32_t),
- &tuple->dst.u3.ip);
+ NLA_PUT_BE32(skb, CTA_IP_V4_SRC, tuple->src.u3.ip);
+ NLA_PUT_BE32(skb, CTA_IP_V4_DST, tuple->dst.u3.ip);
return 0;
nla_put_failure:
@@ -384,8 +376,8 @@ static int ipv4_nlattr_to_tuple(struct nlattr *tb[],
if (!tb[CTA_IP_V4_SRC] || !tb[CTA_IP_V4_DST])
return -EINVAL;
- t->src.u3.ip = *(__be32 *)nla_data(tb[CTA_IP_V4_SRC]);
- t->dst.u3.ip = *(__be32 *)nla_data(tb[CTA_IP_V4_DST]);
+ t->src.u3.ip = nla_get_be32(tb[CTA_IP_V4_SRC]);
+ t->dst.u3.ip = nla_get_be32(tb[CTA_IP_V4_DST]);
return 0;
}
@@ -405,7 +397,6 @@ struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv4 __read_mostly = {
.pkt_to_tuple = ipv4_pkt_to_tuple,
.invert_tuple = ipv4_invert_tuple,
.print_tuple = ipv4_print_tuple,
- .print_conntrack = ipv4_print_conntrack,
.get_l4proto = ipv4_get_l4proto,
#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE)
.tuple_to_nlattr = ipv4_tuple_to_nlattr,
diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c
index 741f3dfaa5a1..543c02b74c96 100644
--- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c
+++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c
@@ -121,10 +121,7 @@ static int ct_seq_show(struct seq_file *s, void *v)
? (long)(ct->timeout.expires - jiffies)/HZ : 0) != 0)
return -ENOSPC;
- if (l3proto->print_conntrack(s, ct))
- return -ENOSPC;
-
- if (l4proto->print_conntrack(s, ct))
+ if (l4proto->print_conntrack && l4proto->print_conntrack(s, ct))
return -ENOSPC;
if (print_tuple(s, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
diff --git a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
index adcbaf6d4299..4004a04c5510 100644
--- a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
+++ b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
@@ -18,6 +18,7 @@
#include <net/netfilter/nf_conntrack_tuple.h>
#include <net/netfilter/nf_conntrack_l4proto.h>
#include <net/netfilter/nf_conntrack_core.h>
+#include <net/netfilter/nf_log.h>
static unsigned long nf_ct_icmp_timeout __read_mostly = 30*HZ;
@@ -73,13 +74,6 @@ static int icmp_print_tuple(struct seq_file *s,
ntohs(tuple->src.u.icmp.id));
}
-/* Print out the private part of the conntrack. */
-static int icmp_print_conntrack(struct seq_file *s,
- const struct nf_conn *conntrack)
-{
- return 0;
-}
-
/* Returns verdict for packet, or -1 for invalid. */
static int icmp_packet(struct nf_conn *ct,
const struct sk_buff *skb,
@@ -128,7 +122,6 @@ static int icmp_new(struct nf_conn *conntrack,
return 1;
}
-extern struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv4;
/* Returns conntrack if it dealt with ICMP, and filled in skb fields */
static int
icmp_error_message(struct sk_buff *skb,
@@ -195,7 +188,7 @@ icmp_error(struct sk_buff *skb, unsigned int dataoff,
}
/* See ip_conntrack_proto_tcp.c */
- if (nf_conntrack_checksum && hooknum == NF_IP_PRE_ROUTING &&
+ if (nf_conntrack_checksum && hooknum == NF_INET_PRE_ROUTING &&
nf_ip_checksum(skb, hooknum, dataoff, 0)) {
if (LOG_INVALID(IPPROTO_ICMP))
nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL,
@@ -235,12 +228,9 @@ icmp_error(struct sk_buff *skb, unsigned int dataoff,
static int icmp_tuple_to_nlattr(struct sk_buff *skb,
const struct nf_conntrack_tuple *t)
{
- NLA_PUT(skb, CTA_PROTO_ICMP_ID, sizeof(u_int16_t),
- &t->src.u.icmp.id);
- NLA_PUT(skb, CTA_PROTO_ICMP_TYPE, sizeof(u_int8_t),
- &t->dst.u.icmp.type);
- NLA_PUT(skb, CTA_PROTO_ICMP_CODE, sizeof(u_int8_t),
- &t->dst.u.icmp.code);
+ NLA_PUT_BE16(skb, CTA_PROTO_ICMP_ID, t->src.u.icmp.id);
+ NLA_PUT_U8(skb, CTA_PROTO_ICMP_TYPE, t->dst.u.icmp.type);
+ NLA_PUT_U8(skb, CTA_PROTO_ICMP_CODE, t->dst.u.icmp.code);
return 0;
@@ -262,12 +252,9 @@ static int icmp_nlattr_to_tuple(struct nlattr *tb[],
|| !tb[CTA_PROTO_ICMP_ID])
return -EINVAL;
- tuple->dst.u.icmp.type =
- *(u_int8_t *)nla_data(tb[CTA_PROTO_ICMP_TYPE]);
- tuple->dst.u.icmp.code =
- *(u_int8_t *)nla_data(tb[CTA_PROTO_ICMP_CODE]);
- tuple->src.u.icmp.id =
- *(__be16 *)nla_data(tb[CTA_PROTO_ICMP_ID]);
+ tuple->dst.u.icmp.type = nla_get_u8(tb[CTA_PROTO_ICMP_TYPE]);
+ tuple->dst.u.icmp.code = nla_get_u8(tb[CTA_PROTO_ICMP_CODE]);
+ tuple->src.u.icmp.id = nla_get_be16(tb[CTA_PROTO_ICMP_ID]);
if (tuple->dst.u.icmp.type >= sizeof(invmap)
|| !invmap[tuple->dst.u.icmp.type])
@@ -315,7 +302,6 @@ struct nf_conntrack_l4proto nf_conntrack_l4proto_icmp __read_mostly =
.pkt_to_tuple = icmp_pkt_to_tuple,
.invert_tuple = icmp_invert_tuple,
.print_tuple = icmp_print_tuple,
- .print_conntrack = icmp_print_conntrack,
.packet = icmp_packet,
.new = icmp_new,
.error = icmp_error,
diff --git a/net/ipv4/netfilter/nf_nat_core.c b/net/ipv4/netfilter/nf_nat_core.c
index 86b465b176ba..e53ae1ef8f5e 100644
--- a/net/ipv4/netfilter/nf_nat_core.c
+++ b/net/ipv4/netfilter/nf_nat_core.c
@@ -33,27 +33,28 @@
static DEFINE_RWLOCK(nf_nat_lock);
-static struct nf_conntrack_l3proto *l3proto = NULL;
+static struct nf_conntrack_l3proto *l3proto __read_mostly;
/* Calculated at init based on memory size */
-static unsigned int nf_nat_htable_size;
+static unsigned int nf_nat_htable_size __read_mostly;
static int nf_nat_vmalloced;
-static struct hlist_head *bysource;
+static struct hlist_head *bysource __read_mostly;
#define MAX_IP_NAT_PROTO 256
-static struct nf_nat_protocol *nf_nat_protos[MAX_IP_NAT_PROTO];
+static const struct nf_nat_protocol *nf_nat_protos[MAX_IP_NAT_PROTO]
+ __read_mostly;
-static inline struct nf_nat_protocol *
+static inline const struct nf_nat_protocol *
__nf_nat_proto_find(u_int8_t protonum)
{
return rcu_dereference(nf_nat_protos[protonum]);
}
-struct nf_nat_protocol *
+const struct nf_nat_protocol *
nf_nat_proto_find_get(u_int8_t protonum)
{
- struct nf_nat_protocol *p;
+ const struct nf_nat_protocol *p;
rcu_read_lock();
p = __nf_nat_proto_find(protonum);
@@ -66,7 +67,7 @@ nf_nat_proto_find_get(u_int8_t protonum)
EXPORT_SYMBOL_GPL(nf_nat_proto_find_get);
void
-nf_nat_proto_put(struct nf_nat_protocol *p)
+nf_nat_proto_put(const struct nf_nat_protocol *p)
{
module_put(p->me);
}
@@ -76,10 +77,13 @@ EXPORT_SYMBOL_GPL(nf_nat_proto_put);
static inline unsigned int
hash_by_src(const struct nf_conntrack_tuple *tuple)
{
+ unsigned int hash;
+
/* Original src, to ensure we map it consistently if poss. */
- return jhash_3words((__force u32)tuple->src.u3.ip,
+ hash = jhash_3words((__force u32)tuple->src.u3.ip,
(__force u32)tuple->src.u.all,
- tuple->dst.protonum, 0) % nf_nat_htable_size;
+ tuple->dst.protonum, 0);
+ return ((u64)hash * nf_nat_htable_size) >> 32;
}
/* Is this tuple already taken? (not by us) */
@@ -105,7 +109,7 @@ static int
in_range(const struct nf_conntrack_tuple *tuple,
const struct nf_nat_range *range)
{
- struct nf_nat_protocol *proto;
+ const struct nf_nat_protocol *proto;
int ret = 0;
/* If we are supposed to map IPs, then we must be in the
@@ -210,12 +214,13 @@ find_best_ips_proto(struct nf_conntrack_tuple *tuple,
maxip = ntohl(range->max_ip);
j = jhash_2words((__force u32)tuple->src.u3.ip,
(__force u32)tuple->dst.u3.ip, 0);
- *var_ipp = htonl(minip + j % (maxip - minip + 1));
+ j = ((u64)j * (maxip - minip + 1)) >> 32;
+ *var_ipp = htonl(minip + j);
}
-/* Manipulate the tuple into the range given. For NF_IP_POST_ROUTING,
- * we change the source to map into the range. For NF_IP_PRE_ROUTING
- * and NF_IP_LOCAL_OUT, we change the destination to map into the
+/* Manipulate the tuple into the range given. For NF_INET_POST_ROUTING,
+ * we change the source to map into the range. For NF_INET_PRE_ROUTING
+ * and NF_INET_LOCAL_OUT, we change the destination to map into the
* range. It might not be possible to get a unique tuple, but we try.
* At worst (or if we race), we will end up with a final duplicate in
* __ip_conntrack_confirm and drop the packet. */
@@ -226,7 +231,7 @@ get_unique_tuple(struct nf_conntrack_tuple *tuple,
struct nf_conn *ct,
enum nf_nat_manip_type maniptype)
{
- struct nf_nat_protocol *proto;
+ const struct nf_nat_protocol *proto;
/* 1) If this srcip/proto/src-proto-part is currently mapped,
and that same mapping gives a unique tuple within the given
@@ -276,12 +281,11 @@ out:
unsigned int
nf_nat_setup_info(struct nf_conn *ct,
const struct nf_nat_range *range,
- unsigned int hooknum)
+ enum nf_nat_manip_type maniptype)
{
struct nf_conntrack_tuple curr_tuple, new_tuple;
struct nf_conn_nat *nat;
int have_to_hash = !(ct->status & IPS_NAT_DONE_MASK);
- enum nf_nat_manip_type maniptype = HOOK2MANIP(hooknum);
/* nat helper or nfctnetlink also setup binding */
nat = nfct_nat(ct);
@@ -293,10 +297,8 @@ nf_nat_setup_info(struct nf_conn *ct,
}
}
- NF_CT_ASSERT(hooknum == NF_IP_PRE_ROUTING ||
- hooknum == NF_IP_POST_ROUTING ||
- hooknum == NF_IP_LOCAL_IN ||
- hooknum == NF_IP_LOCAL_OUT);
+ NF_CT_ASSERT(maniptype == IP_NAT_MANIP_SRC ||
+ maniptype == IP_NAT_MANIP_DST);
BUG_ON(nf_nat_initialized(ct, maniptype));
/* What we've got will look like inverse of reply. Normally
@@ -355,7 +357,7 @@ manip_pkt(u_int16_t proto,
enum nf_nat_manip_type maniptype)
{
struct iphdr *iph;
- struct nf_nat_protocol *p;
+ const struct nf_nat_protocol *p;
if (!skb_make_writable(skb, iphdroff + sizeof(*iph)))
return 0;
@@ -372,10 +374,10 @@ manip_pkt(u_int16_t proto,
iph = (void *)skb->data + iphdroff;
if (maniptype == IP_NAT_MANIP_SRC) {
- nf_csum_replace4(&iph->check, iph->saddr, target->src.u3.ip);
+ csum_replace4(&iph->check, iph->saddr, target->src.u3.ip);
iph->saddr = target->src.u3.ip;
} else {
- nf_csum_replace4(&iph->check, iph->daddr, target->dst.u3.ip);
+ csum_replace4(&iph->check, iph->daddr, target->dst.u3.ip);
iph->daddr = target->dst.u3.ip;
}
return 1;
@@ -515,7 +517,7 @@ int nf_nat_icmp_reply_translation(struct nf_conn *ct,
EXPORT_SYMBOL_GPL(nf_nat_icmp_reply_translation);
/* Protocol registration. */
-int nf_nat_protocol_register(struct nf_nat_protocol *proto)
+int nf_nat_protocol_register(const struct nf_nat_protocol *proto)
{
int ret = 0;
@@ -532,7 +534,7 @@ int nf_nat_protocol_register(struct nf_nat_protocol *proto)
EXPORT_SYMBOL(nf_nat_protocol_register);
/* Noone stores the protocol anywhere; simply delete it. */
-void nf_nat_protocol_unregister(struct nf_nat_protocol *proto)
+void nf_nat_protocol_unregister(const struct nf_nat_protocol *proto)
{
write_lock_bh(&nf_nat_lock);
rcu_assign_pointer(nf_nat_protos[proto->protonum],
@@ -547,10 +549,8 @@ int
nf_nat_port_range_to_nlattr(struct sk_buff *skb,
const struct nf_nat_range *range)
{
- NLA_PUT(skb, CTA_PROTONAT_PORT_MIN, sizeof(__be16),
- &range->min.tcp.port);
- NLA_PUT(skb, CTA_PROTONAT_PORT_MAX, sizeof(__be16),
- &range->max.tcp.port);
+ NLA_PUT_BE16(skb, CTA_PROTONAT_PORT_MIN, range->min.tcp.port);
+ NLA_PUT_BE16(skb, CTA_PROTONAT_PORT_MAX, range->max.tcp.port);
return 0;
@@ -568,8 +568,7 @@ nf_nat_port_nlattr_to_range(struct nlattr *tb[], struct nf_nat_range *range)
if (tb[CTA_PROTONAT_PORT_MIN]) {
ret = 1;
- range->min.tcp.port =
- *(__be16 *)nla_data(tb[CTA_PROTONAT_PORT_MIN]);
+ range->min.tcp.port = nla_get_be16(tb[CTA_PROTONAT_PORT_MIN]);
}
if (!tb[CTA_PROTONAT_PORT_MAX]) {
@@ -577,8 +576,7 @@ nf_nat_port_nlattr_to_range(struct nlattr *tb[], struct nf_nat_range *range)
range->max.tcp.port = range->min.tcp.port;
} else {
ret = 1;
- range->max.tcp.port =
- *(__be16 *)nla_data(tb[CTA_PROTONAT_PORT_MAX]);
+ range->max.tcp.port = nla_get_be16(tb[CTA_PROTONAT_PORT_MAX]);
}
return ret;
diff --git a/net/ipv4/netfilter/nf_nat_h323.c b/net/ipv4/netfilter/nf_nat_h323.c
index 93e18ef114f2..a121989fdad7 100644
--- a/net/ipv4/netfilter/nf_nat_h323.c
+++ b/net/ipv4/netfilter/nf_nat_h323.c
@@ -76,7 +76,7 @@ static int set_addr(struct sk_buff *skb,
static int set_h225_addr(struct sk_buff *skb,
unsigned char **data, int dataoff,
TransportAddress *taddr,
- union nf_conntrack_address *addr, __be16 port)
+ union nf_inet_addr *addr, __be16 port)
{
return set_addr(skb, data, dataoff, taddr->ipAddress.ip,
addr->ip, port);
@@ -86,7 +86,7 @@ static int set_h225_addr(struct sk_buff *skb,
static int set_h245_addr(struct sk_buff *skb,
unsigned char **data, int dataoff,
H245_TransportAddress *taddr,
- union nf_conntrack_address *addr, __be16 port)
+ union nf_inet_addr *addr, __be16 port)
{
return set_addr(skb, data, dataoff,
taddr->unicastAddress.iPAddress.network,
@@ -103,7 +103,7 @@ static int set_sig_addr(struct sk_buff *skb, struct nf_conn *ct,
int dir = CTINFO2DIR(ctinfo);
int i;
__be16 port;
- union nf_conntrack_address addr;
+ union nf_inet_addr addr;
for (i = 0; i < count; i++) {
if (get_h225_addr(ct, *data, &taddr[i], &addr, &port)) {
@@ -155,7 +155,7 @@ static int set_ras_addr(struct sk_buff *skb, struct nf_conn *ct,
int dir = CTINFO2DIR(ctinfo);
int i;
__be16 port;
- union nf_conntrack_address addr;
+ union nf_inet_addr addr;
for (i = 0; i < count; i++) {
if (get_h225_addr(ct, *data, &taddr[i], &addr, &port) &&
@@ -389,18 +389,14 @@ static void ip_nat_q931_expect(struct nf_conn *new,
/* Change src to where master sends to */
range.flags = IP_NAT_RANGE_MAP_IPS;
range.min_ip = range.max_ip = new->tuplehash[!this->dir].tuple.src.u3.ip;
-
- /* hook doesn't matter, but it has to do source manip */
- nf_nat_setup_info(new, &range, NF_IP_POST_ROUTING);
+ nf_nat_setup_info(new, &range, IP_NAT_MANIP_SRC);
/* For DST manip, map port here to where it's expected. */
range.flags = (IP_NAT_RANGE_MAP_IPS | IP_NAT_RANGE_PROTO_SPECIFIED);
range.min = range.max = this->saved_proto;
range.min_ip = range.max_ip =
new->master->tuplehash[!this->dir].tuple.src.u3.ip;
-
- /* hook doesn't matter, but it has to do destination manip */
- nf_nat_setup_info(new, &range, NF_IP_PRE_ROUTING);
+ nf_nat_setup_info(new, &range, IP_NAT_MANIP_DST);
}
/****************************************************************************/
@@ -412,7 +408,7 @@ static int nat_q931(struct sk_buff *skb, struct nf_conn *ct,
struct nf_ct_h323_master *info = &nfct_help(ct)->help.ct_h323_info;
int dir = CTINFO2DIR(ctinfo);
u_int16_t nated_port = ntohs(port);
- union nf_conntrack_address addr;
+ union nf_inet_addr addr;
/* Set expectations for NAT */
exp->saved_proto.tcp.port = exp->tuple.dst.u.tcp.port;
@@ -479,17 +475,13 @@ static void ip_nat_callforwarding_expect(struct nf_conn *new,
/* Change src to where master sends to */
range.flags = IP_NAT_RANGE_MAP_IPS;
range.min_ip = range.max_ip = new->tuplehash[!this->dir].tuple.src.u3.ip;
-
- /* hook doesn't matter, but it has to do source manip */
- nf_nat_setup_info(new, &range, NF_IP_POST_ROUTING);
+ nf_nat_setup_info(new, &range, IP_NAT_MANIP_SRC);
/* For DST manip, map port here to where it's expected. */
range.flags = (IP_NAT_RANGE_MAP_IPS | IP_NAT_RANGE_PROTO_SPECIFIED);
range.min = range.max = this->saved_proto;
range.min_ip = range.max_ip = this->saved_ip;
-
- /* hook doesn't matter, but it has to do destination manip */
- nf_nat_setup_info(new, &range, NF_IP_PRE_ROUTING);
+ nf_nat_setup_info(new, &range, IP_NAT_MANIP_DST);
}
/****************************************************************************/
diff --git a/net/ipv4/netfilter/nf_nat_helper.c b/net/ipv4/netfilter/nf_nat_helper.c
index 8718da00ef2a..4c0232842e75 100644
--- a/net/ipv4/netfilter/nf_nat_helper.c
+++ b/net/ipv4/netfilter/nf_nat_helper.c
@@ -20,6 +20,7 @@
#include <linux/netfilter_ipv4.h>
#include <net/netfilter/nf_conntrack.h>
#include <net/netfilter/nf_conntrack_helper.h>
+#include <net/netfilter/nf_conntrack_ecache.h>
#include <net/netfilter/nf_conntrack_expect.h>
#include <net/netfilter/nf_nat.h>
#include <net/netfilter/nf_nat_protocol.h>
@@ -180,8 +181,8 @@ nf_nat_mangle_tcp_packet(struct sk_buff *skb,
datalen, 0));
}
} else
- nf_proto_csum_replace2(&tcph->check, skb,
- htons(oldlen), htons(datalen), 1);
+ inet_proto_csum_replace2(&tcph->check, skb,
+ htons(oldlen), htons(datalen), 1);
if (rep_len != match_len) {
set_bit(IPS_SEQ_ADJUST_BIT, &ct->status);
@@ -191,6 +192,8 @@ nf_nat_mangle_tcp_packet(struct sk_buff *skb,
/* Tell TCP window tracking about seq change */
nf_conntrack_tcp_update(skb, ip_hdrlen(skb),
ct, CTINFO2DIR(ctinfo));
+
+ nf_conntrack_event_cache(IPCT_NATSEQADJ, skb);
}
return 1;
}
@@ -270,8 +273,8 @@ nf_nat_mangle_udp_packet(struct sk_buff *skb,
udph->check = CSUM_MANGLED_0;
}
} else
- nf_proto_csum_replace2(&udph->check, skb,
- htons(oldlen), htons(datalen), 1);
+ inet_proto_csum_replace2(&udph->check, skb,
+ htons(oldlen), htons(datalen), 1);
return 1;
}
@@ -310,10 +313,10 @@ sack_adjust(struct sk_buff *skb,
ntohl(sack->start_seq), new_start_seq,
ntohl(sack->end_seq), new_end_seq);
- nf_proto_csum_replace4(&tcph->check, skb,
- sack->start_seq, new_start_seq, 0);
- nf_proto_csum_replace4(&tcph->check, skb,
- sack->end_seq, new_end_seq, 0);
+ inet_proto_csum_replace4(&tcph->check, skb,
+ sack->start_seq, new_start_seq, 0);
+ inet_proto_csum_replace4(&tcph->check, skb,
+ sack->end_seq, new_end_seq, 0);
sack->start_seq = new_start_seq;
sack->end_seq = new_end_seq;
sackoff += sizeof(*sack);
@@ -397,8 +400,8 @@ nf_nat_seq_adjust(struct sk_buff *skb,
else
newack = htonl(ntohl(tcph->ack_seq) - other_way->offset_before);
- nf_proto_csum_replace4(&tcph->check, skb, tcph->seq, newseq, 0);
- nf_proto_csum_replace4(&tcph->check, skb, tcph->ack_seq, newack, 0);
+ inet_proto_csum_replace4(&tcph->check, skb, tcph->seq, newseq, 0);
+ inet_proto_csum_replace4(&tcph->check, skb, tcph->ack_seq, newack, 0);
pr_debug("Adjusting sequence number from %u->%u, ack from %u->%u\n",
ntohl(tcph->seq), ntohl(newseq), ntohl(tcph->ack_seq),
@@ -430,15 +433,13 @@ void nf_nat_follow_master(struct nf_conn *ct,
range.flags = IP_NAT_RANGE_MAP_IPS;
range.min_ip = range.max_ip
= ct->master->tuplehash[!exp->dir].tuple.dst.u3.ip;
- /* hook doesn't matter, but it has to do source manip */
- nf_nat_setup_info(ct, &range, NF_IP_POST_ROUTING);
+ nf_nat_setup_info(ct, &range, IP_NAT_MANIP_SRC);
/* For DST manip, map port here to where it's expected. */
range.flags = (IP_NAT_RANGE_MAP_IPS | IP_NAT_RANGE_PROTO_SPECIFIED);
range.min = range.max = exp->saved_proto;
range.min_ip = range.max_ip
= ct->master->tuplehash[!exp->dir].tuple.src.u3.ip;
- /* hook doesn't matter, but it has to do destination manip */
- nf_nat_setup_info(ct, &range, NF_IP_PRE_ROUTING);
+ nf_nat_setup_info(ct, &range, IP_NAT_MANIP_DST);
}
EXPORT_SYMBOL(nf_nat_follow_master);
diff --git a/net/ipv4/netfilter/nf_nat_pptp.c b/net/ipv4/netfilter/nf_nat_pptp.c
index 6817e7995f35..e63b944a2ebb 100644
--- a/net/ipv4/netfilter/nf_nat_pptp.c
+++ b/net/ipv4/netfilter/nf_nat_pptp.c
@@ -93,8 +93,7 @@ static void pptp_nat_expected(struct nf_conn *ct,
range.flags |= IP_NAT_RANGE_PROTO_SPECIFIED;
range.min = range.max = exp->saved_proto;
}
- /* hook doesn't matter, but it has to do source manip */
- nf_nat_setup_info(ct, &range, NF_IP_POST_ROUTING);
+ nf_nat_setup_info(ct, &range, IP_NAT_MANIP_SRC);
/* For DST manip, map port here to where it's expected. */
range.flags = IP_NAT_RANGE_MAP_IPS;
@@ -104,8 +103,7 @@ static void pptp_nat_expected(struct nf_conn *ct,
range.flags |= IP_NAT_RANGE_PROTO_SPECIFIED;
range.min = range.max = exp->saved_proto;
}
- /* hook doesn't matter, but it has to do destination manip */
- nf_nat_setup_info(ct, &range, NF_IP_PRE_ROUTING);
+ nf_nat_setup_info(ct, &range, IP_NAT_MANIP_DST);
}
/* outbound packets == from PNS to PAC */
diff --git a/net/ipv4/netfilter/nf_nat_proto_gre.c b/net/ipv4/netfilter/nf_nat_proto_gre.c
index b820f9960356..9fa272e73113 100644
--- a/net/ipv4/netfilter/nf_nat_proto_gre.c
+++ b/net/ipv4/netfilter/nf_nat_proto_gre.c
@@ -135,9 +135,10 @@ gre_manip_pkt(struct sk_buff *skb, unsigned int iphdroff,
return 1;
}
-static struct nf_nat_protocol gre __read_mostly = {
+static const struct nf_nat_protocol gre = {
.name = "GRE",
.protonum = IPPROTO_GRE,
+ .me = THIS_MODULE,
.manip_pkt = gre_manip_pkt,
.in_range = gre_in_range,
.unique_tuple = gre_unique_tuple,
diff --git a/net/ipv4/netfilter/nf_nat_proto_icmp.c b/net/ipv4/netfilter/nf_nat_proto_icmp.c
index b9fc724388fc..a0e44c953cb6 100644
--- a/net/ipv4/netfilter/nf_nat_proto_icmp.c
+++ b/net/ipv4/netfilter/nf_nat_proto_icmp.c
@@ -65,13 +65,13 @@ icmp_manip_pkt(struct sk_buff *skb,
return 0;
hdr = (struct icmphdr *)(skb->data + hdroff);
- nf_proto_csum_replace2(&hdr->checksum, skb,
- hdr->un.echo.id, tuple->src.u.icmp.id, 0);
+ inet_proto_csum_replace2(&hdr->checksum, skb,
+ hdr->un.echo.id, tuple->src.u.icmp.id, 0);
hdr->un.echo.id = tuple->src.u.icmp.id;
return 1;
}
-struct nf_nat_protocol nf_nat_protocol_icmp = {
+const struct nf_nat_protocol nf_nat_protocol_icmp = {
.name = "ICMP",
.protonum = IPPROTO_ICMP,
.me = THIS_MODULE,
diff --git a/net/ipv4/netfilter/nf_nat_proto_tcp.c b/net/ipv4/netfilter/nf_nat_proto_tcp.c
index 6bab2e184455..da23e9fbe679 100644
--- a/net/ipv4/netfilter/nf_nat_proto_tcp.c
+++ b/net/ipv4/netfilter/nf_nat_proto_tcp.c
@@ -132,12 +132,12 @@ tcp_manip_pkt(struct sk_buff *skb,
if (hdrsize < sizeof(*hdr))
return 1;
- nf_proto_csum_replace4(&hdr->check, skb, oldip, newip, 1);
- nf_proto_csum_replace2(&hdr->check, skb, oldport, newport, 0);
+ inet_proto_csum_replace4(&hdr->check, skb, oldip, newip, 1);
+ inet_proto_csum_replace2(&hdr->check, skb, oldport, newport, 0);
return 1;
}
-struct nf_nat_protocol nf_nat_protocol_tcp = {
+const struct nf_nat_protocol nf_nat_protocol_tcp = {
.name = "TCP",
.protonum = IPPROTO_TCP,
.me = THIS_MODULE,
diff --git a/net/ipv4/netfilter/nf_nat_proto_udp.c b/net/ipv4/netfilter/nf_nat_proto_udp.c
index cbf1a61e2908..10df4db078af 100644
--- a/net/ipv4/netfilter/nf_nat_proto_udp.c
+++ b/net/ipv4/netfilter/nf_nat_proto_udp.c
@@ -117,9 +117,9 @@ udp_manip_pkt(struct sk_buff *skb,
portptr = &hdr->dest;
}
if (hdr->check || skb->ip_summed == CHECKSUM_PARTIAL) {
- nf_proto_csum_replace4(&hdr->check, skb, oldip, newip, 1);
- nf_proto_csum_replace2(&hdr->check, skb, *portptr, newport,
- 0);
+ inet_proto_csum_replace4(&hdr->check, skb, oldip, newip, 1);
+ inet_proto_csum_replace2(&hdr->check, skb, *portptr, newport,
+ 0);
if (!hdr->check)
hdr->check = CSUM_MANGLED_0;
}
@@ -127,7 +127,7 @@ udp_manip_pkt(struct sk_buff *skb,
return 1;
}
-struct nf_nat_protocol nf_nat_protocol_udp = {
+const struct nf_nat_protocol nf_nat_protocol_udp = {
.name = "UDP",
.protonum = IPPROTO_UDP,
.me = THIS_MODULE,
diff --git a/net/ipv4/netfilter/nf_nat_proto_unknown.c b/net/ipv4/netfilter/nf_nat_proto_unknown.c
index cfd2742e9706..a26efeb073cb 100644
--- a/net/ipv4/netfilter/nf_nat_proto_unknown.c
+++ b/net/ipv4/netfilter/nf_nat_proto_unknown.c
@@ -45,7 +45,7 @@ unknown_manip_pkt(struct sk_buff *skb,
return 1;
}
-struct nf_nat_protocol nf_nat_unknown_protocol = {
+const struct nf_nat_protocol nf_nat_unknown_protocol = {
.name = "unknown",
/* .me isn't set: getting a ref to this cannot fail. */
.manip_pkt = unknown_manip_pkt,
diff --git a/net/ipv4/netfilter/nf_nat_rule.c b/net/ipv4/netfilter/nf_nat_rule.c
index 46b25ab5f78b..519182269e76 100644
--- a/net/ipv4/netfilter/nf_nat_rule.c
+++ b/net/ipv4/netfilter/nf_nat_rule.c
@@ -24,7 +24,9 @@
#include <net/netfilter/nf_nat_core.h>
#include <net/netfilter/nf_nat_rule.h>
-#define NAT_VALID_HOOKS ((1<<NF_IP_PRE_ROUTING) | (1<<NF_IP_POST_ROUTING) | (1<<NF_IP_LOCAL_OUT))
+#define NAT_VALID_HOOKS ((1 << NF_INET_PRE_ROUTING) | \
+ (1 << NF_INET_POST_ROUTING) | \
+ (1 << NF_INET_LOCAL_OUT))
static struct
{
@@ -38,14 +40,14 @@ static struct
.num_entries = 4,
.size = sizeof(struct ipt_standard) * 3 + sizeof(struct ipt_error),
.hook_entry = {
- [NF_IP_PRE_ROUTING] = 0,
- [NF_IP_POST_ROUTING] = sizeof(struct ipt_standard),
- [NF_IP_LOCAL_OUT] = sizeof(struct ipt_standard) * 2
+ [NF_INET_PRE_ROUTING] = 0,
+ [NF_INET_POST_ROUTING] = sizeof(struct ipt_standard),
+ [NF_INET_LOCAL_OUT] = sizeof(struct ipt_standard) * 2
},
.underflow = {
- [NF_IP_PRE_ROUTING] = 0,
- [NF_IP_POST_ROUTING] = sizeof(struct ipt_standard),
- [NF_IP_LOCAL_OUT] = sizeof(struct ipt_standard) * 2
+ [NF_INET_PRE_ROUTING] = 0,
+ [NF_INET_POST_ROUTING] = sizeof(struct ipt_standard),
+ [NF_INET_LOCAL_OUT] = sizeof(struct ipt_standard) * 2
},
},
.entries = {
@@ -76,7 +78,7 @@ static unsigned int ipt_snat_target(struct sk_buff *skb,
enum ip_conntrack_info ctinfo;
const struct nf_nat_multi_range_compat *mr = targinfo;
- NF_CT_ASSERT(hooknum == NF_IP_POST_ROUTING);
+ NF_CT_ASSERT(hooknum == NF_INET_POST_ROUTING);
ct = nf_ct_get(skb, &ctinfo);
@@ -85,7 +87,7 @@ static unsigned int ipt_snat_target(struct sk_buff *skb,
ctinfo == IP_CT_RELATED + IP_CT_IS_REPLY));
NF_CT_ASSERT(out);
- return nf_nat_setup_info(ct, &mr->range[0], hooknum);
+ return nf_nat_setup_info(ct, &mr->range[0], IP_NAT_MANIP_SRC);
}
/* Before 2.6.11 we did implicit source NAT if required. Warn about change. */
@@ -95,7 +97,7 @@ static void warn_if_extra_mangle(__be32 dstip, __be32 srcip)
struct flowi fl = { .nl_u = { .ip4_u = { .daddr = dstip } } };
struct rtable *rt;
- if (ip_route_output_key(&rt, &fl) != 0)
+ if (ip_route_output_key(&init_net, &rt, &fl) != 0)
return;
if (rt->rt_src != srcip && !warned) {
@@ -118,20 +120,20 @@ static unsigned int ipt_dnat_target(struct sk_buff *skb,
enum ip_conntrack_info ctinfo;
const struct nf_nat_multi_range_compat *mr = targinfo;
- NF_CT_ASSERT(hooknum == NF_IP_PRE_ROUTING ||
- hooknum == NF_IP_LOCAL_OUT);
+ NF_CT_ASSERT(hooknum == NF_INET_PRE_ROUTING ||
+ hooknum == NF_INET_LOCAL_OUT);
ct = nf_ct_get(skb, &ctinfo);
/* Connection must be valid and new. */
NF_CT_ASSERT(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED));
- if (hooknum == NF_IP_LOCAL_OUT &&
+ if (hooknum == NF_INET_LOCAL_OUT &&
mr->range[0].flags & IP_NAT_RANGE_MAP_IPS)
warn_if_extra_mangle(ip_hdr(skb)->daddr,
mr->range[0].min_ip);
- return nf_nat_setup_info(ct, &mr->range[0], hooknum);
+ return nf_nat_setup_info(ct, &mr->range[0], IP_NAT_MANIP_DST);
}
static bool ipt_snat_checkentry(const char *tablename,
@@ -182,7 +184,7 @@ alloc_null_binding(struct nf_conn *ct, unsigned int hooknum)
pr_debug("Allocating NULL binding for %p (%u.%u.%u.%u)\n",
ct, NIPQUAD(ip));
- return nf_nat_setup_info(ct, &range, hooknum);
+ return nf_nat_setup_info(ct, &range, HOOK2MANIP(hooknum));
}
unsigned int
@@ -201,7 +203,7 @@ alloc_null_binding_confirmed(struct nf_conn *ct, unsigned int hooknum)
pr_debug("Allocating NULL binding for confirmed %p (%u.%u.%u.%u)\n",
ct, NIPQUAD(ip));
- return nf_nat_setup_info(ct, &range, hooknum);
+ return nf_nat_setup_info(ct, &range, HOOK2MANIP(hooknum));
}
int nf_nat_rule_find(struct sk_buff *skb,
@@ -227,7 +229,7 @@ static struct xt_target ipt_snat_reg __read_mostly = {
.target = ipt_snat_target,
.targetsize = sizeof(struct nf_nat_multi_range_compat),
.table = "nat",
- .hooks = 1 << NF_IP_POST_ROUTING,
+ .hooks = 1 << NF_INET_POST_ROUTING,
.checkentry = ipt_snat_checkentry,
.family = AF_INET,
};
@@ -237,7 +239,7 @@ static struct xt_target ipt_dnat_reg __read_mostly = {
.target = ipt_dnat_target,
.targetsize = sizeof(struct nf_nat_multi_range_compat),
.table = "nat",
- .hooks = (1 << NF_IP_PRE_ROUTING) | (1 << NF_IP_LOCAL_OUT),
+ .hooks = (1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_LOCAL_OUT),
.checkentry = ipt_dnat_checkentry,
.family = AF_INET,
};
diff --git a/net/ipv4/netfilter/nf_nat_sip.c b/net/ipv4/netfilter/nf_nat_sip.c
index 8996ccb757db..606a170bf4ca 100644
--- a/net/ipv4/netfilter/nf_nat_sip.c
+++ b/net/ipv4/netfilter/nf_nat_sip.c
@@ -228,15 +228,13 @@ static void ip_nat_sdp_expect(struct nf_conn *ct,
range.flags = IP_NAT_RANGE_MAP_IPS;
range.min_ip = range.max_ip
= ct->master->tuplehash[!exp->dir].tuple.dst.u3.ip;
- /* hook doesn't matter, but it has to do source manip */
- nf_nat_setup_info(ct, &range, NF_IP_POST_ROUTING);
+ nf_nat_setup_info(ct, &range, IP_NAT_MANIP_SRC);
/* For DST manip, map port here to where it's expected. */
range.flags = (IP_NAT_RANGE_MAP_IPS | IP_NAT_RANGE_PROTO_SPECIFIED);
range.min = range.max = exp->saved_proto;
range.min_ip = range.max_ip = exp->saved_ip;
- /* hook doesn't matter, but it has to do destination manip */
- nf_nat_setup_info(ct, &range, NF_IP_PRE_ROUTING);
+ nf_nat_setup_info(ct, &range, IP_NAT_MANIP_DST);
}
/* So, this packet has hit the connection tracking matching code.
diff --git a/net/ipv4/netfilter/nf_nat_snmp_basic.c b/net/ipv4/netfilter/nf_nat_snmp_basic.c
index 03709d6b4b06..07f2a49926d4 100644
--- a/net/ipv4/netfilter/nf_nat_snmp_basic.c
+++ b/net/ipv4/netfilter/nf_nat_snmp_basic.c
@@ -60,7 +60,7 @@ MODULE_ALIAS("ip_nat_snmp_basic");
#define SNMP_PORT 161
#define SNMP_TRAP_PORT 162
-#define NOCT1(n) (*(u8 *)n)
+#define NOCT1(n) (*(u8 *)(n))
static int debug;
static DEFINE_SPINLOCK(snmp_lock);
diff --git a/net/ipv4/netfilter/nf_nat_standalone.c b/net/ipv4/netfilter/nf_nat_standalone.c
index 7db76ea9af91..99b2c788d5a8 100644
--- a/net/ipv4/netfilter/nf_nat_standalone.c
+++ b/net/ipv4/netfilter/nf_nat_standalone.c
@@ -137,7 +137,7 @@ nf_nat_fn(unsigned int hooknum,
if (unlikely(nf_ct_is_confirmed(ct)))
/* NAT module was loaded late */
ret = alloc_null_binding_confirmed(ct, hooknum);
- else if (hooknum == NF_IP_LOCAL_IN)
+ else if (hooknum == NF_INET_LOCAL_IN)
/* LOCAL_IN hook doesn't have a chain! */
ret = alloc_null_binding(ct, hooknum);
else
@@ -273,13 +273,13 @@ nf_nat_adjust(unsigned int hooknum,
/* We must be after connection tracking and before packet filtering. */
-static struct nf_hook_ops nf_nat_ops[] = {
+static struct nf_hook_ops nf_nat_ops[] __read_mostly = {
/* Before packet filtering, change destination */
{
.hook = nf_nat_in,
.owner = THIS_MODULE,
.pf = PF_INET,
- .hooknum = NF_IP_PRE_ROUTING,
+ .hooknum = NF_INET_PRE_ROUTING,
.priority = NF_IP_PRI_NAT_DST,
},
/* After packet filtering, change source */
@@ -287,7 +287,7 @@ static struct nf_hook_ops nf_nat_ops[] = {
.hook = nf_nat_out,
.owner = THIS_MODULE,
.pf = PF_INET,
- .hooknum = NF_IP_POST_ROUTING,
+ .hooknum = NF_INET_POST_ROUTING,
.priority = NF_IP_PRI_NAT_SRC,
},
/* After conntrack, adjust sequence number */
@@ -295,7 +295,7 @@ static struct nf_hook_ops nf_nat_ops[] = {
.hook = nf_nat_adjust,
.owner = THIS_MODULE,
.pf = PF_INET,
- .hooknum = NF_IP_POST_ROUTING,
+ .hooknum = NF_INET_POST_ROUTING,
.priority = NF_IP_PRI_NAT_SEQ_ADJUST,
},
/* Before packet filtering, change destination */
@@ -303,7 +303,7 @@ static struct nf_hook_ops nf_nat_ops[] = {
.hook = nf_nat_local_fn,
.owner = THIS_MODULE,
.pf = PF_INET,
- .hooknum = NF_IP_LOCAL_OUT,
+ .hooknum = NF_INET_LOCAL_OUT,
.priority = NF_IP_PRI_NAT_DST,
},
/* After packet filtering, change source */
@@ -311,7 +311,7 @@ static struct nf_hook_ops nf_nat_ops[] = {
.hook = nf_nat_fn,
.owner = THIS_MODULE,
.pf = PF_INET,
- .hooknum = NF_IP_LOCAL_IN,
+ .hooknum = NF_INET_LOCAL_IN,
.priority = NF_IP_PRI_NAT_SRC,
},
/* After conntrack, adjust sequence number */
@@ -319,7 +319,7 @@ static struct nf_hook_ops nf_nat_ops[] = {
.hook = nf_nat_adjust,
.owner = THIS_MODULE,
.pf = PF_INET,
- .hooknum = NF_IP_LOCAL_IN,
+ .hooknum = NF_INET_LOCAL_IN,
.priority = NF_IP_PRI_NAT_SEQ_ADJUST,
},
};
@@ -332,7 +332,7 @@ static int __init nf_nat_standalone_init(void)
#ifdef CONFIG_XFRM
BUG_ON(ip_nat_decode_session != NULL);
- ip_nat_decode_session = nat_decode_session;
+ rcu_assign_pointer(ip_nat_decode_session, nat_decode_session);
#endif
ret = nf_nat_rule_init();
if (ret < 0) {
@@ -350,7 +350,7 @@ static int __init nf_nat_standalone_init(void)
nf_nat_rule_cleanup();
cleanup_decode_session:
#ifdef CONFIG_XFRM
- ip_nat_decode_session = NULL;
+ rcu_assign_pointer(ip_nat_decode_session, NULL);
synchronize_net();
#endif
return ret;
@@ -361,7 +361,7 @@ static void __exit nf_nat_standalone_fini(void)
nf_unregister_hooks(nf_nat_ops, ARRAY_SIZE(nf_nat_ops));
nf_nat_rule_cleanup();
#ifdef CONFIG_XFRM
- ip_nat_decode_session = NULL;
+ rcu_assign_pointer(ip_nat_decode_session, NULL);
synchronize_net();
#endif
/* Conntrack caches are unregistered in nf_conntrack_cleanup */
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
index ce34b281803f..d63474c6b400 100644
--- a/net/ipv4/proc.c
+++ b/net/ipv4/proc.c
@@ -53,14 +53,16 @@ static int sockstat_seq_show(struct seq_file *seq, void *v)
{
socket_seq_show(seq);
seq_printf(seq, "TCP: inuse %d orphan %d tw %d alloc %d mem %d\n",
- sock_prot_inuse(&tcp_prot), atomic_read(&tcp_orphan_count),
+ sock_prot_inuse_get(&tcp_prot),
+ atomic_read(&tcp_orphan_count),
tcp_death_row.tw_count, atomic_read(&tcp_sockets_allocated),
atomic_read(&tcp_memory_allocated));
- seq_printf(seq, "UDP: inuse %d\n", sock_prot_inuse(&udp_prot));
- seq_printf(seq, "UDPLITE: inuse %d\n", sock_prot_inuse(&udplite_prot));
- seq_printf(seq, "RAW: inuse %d\n", sock_prot_inuse(&raw_prot));
+ seq_printf(seq, "UDP: inuse %d mem %d\n", sock_prot_inuse_get(&udp_prot),
+ atomic_read(&udp_memory_allocated));
+ seq_printf(seq, "UDPLITE: inuse %d\n", sock_prot_inuse_get(&udplite_prot));
+ seq_printf(seq, "RAW: inuse %d\n", sock_prot_inuse_get(&raw_prot));
seq_printf(seq, "FRAG: inuse %d memory %d\n",
- ip_frag_nqueues(), ip_frag_mem());
+ ip_frag_nqueues(&init_net), ip_frag_mem(&init_net));
return 0;
}
@@ -309,7 +311,8 @@ static int snmp_seq_show(struct seq_file *seq, void *v)
seq_printf(seq, " %s", snmp4_ipstats_list[i].name);
seq_printf(seq, "\nIp: %d %d",
- IPV4_DEVCONF_ALL(FORWARDING) ? 1 : 2, sysctl_ip_default_ttl);
+ IPV4_DEVCONF_ALL(&init_net, FORWARDING) ? 1 : 2,
+ sysctl_ip_default_ttl);
for (i = 0; snmp4_ipstats_list[i].name != NULL; i++)
seq_printf(seq, " %lu",
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index e7050f8eabeb..85c08696abbe 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -80,38 +80,51 @@
#include <linux/netfilter.h>
#include <linux/netfilter_ipv4.h>
-struct hlist_head raw_v4_htable[RAWV4_HTABLE_SIZE];
-DEFINE_RWLOCK(raw_v4_lock);
+static struct raw_hashinfo raw_v4_hashinfo = {
+ .lock = __RW_LOCK_UNLOCKED(),
+};
-static void raw_v4_hash(struct sock *sk)
+void raw_hash_sk(struct sock *sk, struct raw_hashinfo *h)
{
- struct hlist_head *head = &raw_v4_htable[inet_sk(sk)->num &
- (RAWV4_HTABLE_SIZE - 1)];
+ struct hlist_head *head;
+
+ head = &h->ht[inet_sk(sk)->num & (RAW_HTABLE_SIZE - 1)];
- write_lock_bh(&raw_v4_lock);
+ write_lock_bh(&h->lock);
sk_add_node(sk, head);
- sock_prot_inc_use(sk->sk_prot);
- write_unlock_bh(&raw_v4_lock);
+ sock_prot_inuse_add(sk->sk_prot, 1);
+ write_unlock_bh(&h->lock);
}
+EXPORT_SYMBOL_GPL(raw_hash_sk);
-static void raw_v4_unhash(struct sock *sk)
+void raw_unhash_sk(struct sock *sk, struct raw_hashinfo *h)
{
- write_lock_bh(&raw_v4_lock);
+ write_lock_bh(&h->lock);
if (sk_del_node_init(sk))
- sock_prot_dec_use(sk->sk_prot);
- write_unlock_bh(&raw_v4_lock);
+ sock_prot_inuse_add(sk->sk_prot, -1);
+ write_unlock_bh(&h->lock);
+}
+EXPORT_SYMBOL_GPL(raw_unhash_sk);
+
+static void raw_v4_hash(struct sock *sk)
+{
+ raw_hash_sk(sk, &raw_v4_hashinfo);
}
-struct sock *__raw_v4_lookup(struct sock *sk, unsigned short num,
- __be32 raddr, __be32 laddr,
- int dif)
+static void raw_v4_unhash(struct sock *sk)
+{
+ raw_unhash_sk(sk, &raw_v4_hashinfo);
+}
+
+static struct sock *__raw_v4_lookup(struct net *net, struct sock *sk,
+ unsigned short num, __be32 raddr, __be32 laddr, int dif)
{
struct hlist_node *node;
sk_for_each_from(sk, node) {
struct inet_sock *inet = inet_sk(sk);
- if (inet->num == num &&
+ if (sk->sk_net == net && inet->num == num &&
!(inet->daddr && inet->daddr != raddr) &&
!(inet->rcv_saddr && inet->rcv_saddr != laddr) &&
!(sk->sk_bound_dev_if && sk->sk_bound_dev_if != dif))
@@ -150,17 +163,20 @@ static __inline__ int icmp_filter(struct sock *sk, struct sk_buff *skb)
* RFC 1122: SHOULD pass TOS value up to the transport layer.
* -> It does. And not only TOS, but all IP header.
*/
-int raw_v4_input(struct sk_buff *skb, struct iphdr *iph, int hash)
+static int raw_v4_input(struct sk_buff *skb, struct iphdr *iph, int hash)
{
struct sock *sk;
struct hlist_head *head;
int delivered = 0;
+ struct net *net;
- read_lock(&raw_v4_lock);
- head = &raw_v4_htable[hash];
+ read_lock(&raw_v4_hashinfo.lock);
+ head = &raw_v4_hashinfo.ht[hash];
if (hlist_empty(head))
goto out;
- sk = __raw_v4_lookup(__sk_head(head), iph->protocol,
+
+ net = skb->dev->nd_net;
+ sk = __raw_v4_lookup(net, __sk_head(head), iph->protocol,
iph->saddr, iph->daddr,
skb->dev->ifindex);
@@ -173,16 +189,34 @@ int raw_v4_input(struct sk_buff *skb, struct iphdr *iph, int hash)
if (clone)
raw_rcv(sk, clone);
}
- sk = __raw_v4_lookup(sk_next(sk), iph->protocol,
+ sk = __raw_v4_lookup(net, sk_next(sk), iph->protocol,
iph->saddr, iph->daddr,
skb->dev->ifindex);
}
out:
- read_unlock(&raw_v4_lock);
+ read_unlock(&raw_v4_hashinfo.lock);
return delivered;
}
-void raw_err (struct sock *sk, struct sk_buff *skb, u32 info)
+int raw_local_deliver(struct sk_buff *skb, int protocol)
+{
+ int hash;
+ struct sock *raw_sk;
+
+ hash = protocol & (RAW_HTABLE_SIZE - 1);
+ raw_sk = sk_head(&raw_v4_hashinfo.ht[hash]);
+
+ /* If there maybe a raw socket we must check - if not we
+ * don't care less
+ */
+ if (raw_sk && !raw_v4_input(skb, ip_hdr(skb), hash))
+ raw_sk = NULL;
+
+ return raw_sk != NULL;
+
+}
+
+static void raw_err(struct sock *sk, struct sk_buff *skb, u32 info)
{
struct inet_sock *inet = inet_sk(sk);
const int type = icmp_hdr(skb)->type;
@@ -236,12 +270,38 @@ void raw_err (struct sock *sk, struct sk_buff *skb, u32 info)
}
}
+void raw_icmp_error(struct sk_buff *skb, int protocol, u32 info)
+{
+ int hash;
+ struct sock *raw_sk;
+ struct iphdr *iph;
+ struct net *net;
+
+ hash = protocol & (RAW_HTABLE_SIZE - 1);
+
+ read_lock(&raw_v4_hashinfo.lock);
+ raw_sk = sk_head(&raw_v4_hashinfo.ht[hash]);
+ if (raw_sk != NULL) {
+ iph = (struct iphdr *)skb->data;
+ net = skb->dev->nd_net;
+
+ while ((raw_sk = __raw_v4_lookup(net, raw_sk, protocol,
+ iph->daddr, iph->saddr,
+ skb->dev->ifindex)) != NULL) {
+ raw_err(raw_sk, skb, info);
+ raw_sk = sk_next(raw_sk);
+ iph = (struct iphdr *)skb->data;
+ }
+ }
+ read_unlock(&raw_v4_hashinfo.lock);
+}
+
static int raw_rcv_skb(struct sock * sk, struct sk_buff * skb)
{
/* Charge it to the socket. */
if (sock_queue_rcv_skb(sk, skb) < 0) {
- /* FIXME: increment a raw drops counter here */
+ atomic_inc(&sk->sk_drops);
kfree_skb(skb);
return NET_RX_DROP;
}
@@ -252,6 +312,7 @@ static int raw_rcv_skb(struct sock * sk, struct sk_buff * skb)
int raw_rcv(struct sock *sk, struct sk_buff *skb)
{
if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) {
+ atomic_inc(&sk->sk_drops);
kfree_skb(skb);
return NET_RX_DROP;
}
@@ -320,7 +381,7 @@ static int raw_send_hdrinc(struct sock *sk, void *from, size_t length,
icmp_out_count(((struct icmphdr *)
skb_transport_header(skb))->type);
- err = NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, rt->u.dst.dev,
+ err = NF_HOOK(PF_INET, NF_INET_LOCAL_OUT, skb, NULL, rt->u.dst.dev,
dst_output);
if (err > 0)
err = inet->recverr ? net_xmit_errno(err) : 0;
@@ -474,7 +535,7 @@ static int raw_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
if (msg->msg_flags & MSG_DONTROUTE)
tos |= RTO_ONLINK;
- if (MULTICAST(daddr)) {
+ if (ipv4_is_multicast(daddr)) {
if (!ipc.oif)
ipc.oif = inet->mc_index;
if (!saddr)
@@ -497,7 +558,7 @@ static int raw_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
}
security_sk_classify_flow(sk, &fl);
- err = ip_route_output_flow(&rt, &fl, sk, 1);
+ err = ip_route_output_flow(&init_net, &rt, &fl, sk, 1);
}
if (err)
goto done;
@@ -564,7 +625,7 @@ static int raw_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len)
if (sk->sk_state != TCP_CLOSE || addr_len < sizeof(struct sockaddr_in))
goto out;
- chk_addr_ret = inet_addr_type(addr->sin_addr.s_addr);
+ chk_addr_ret = inet_addr_type(sk->sk_net, addr->sin_addr.s_addr);
ret = -EADDRNOTAVAIL;
if (addr->sin_addr.s_addr && chk_addr_ret != RTN_LOCAL &&
chk_addr_ret != RTN_MULTICAST && chk_addr_ret != RTN_BROADCAST)
@@ -789,22 +850,18 @@ struct proto raw_prot = {
};
#ifdef CONFIG_PROC_FS
-struct raw_iter_state {
- int bucket;
-};
-
-#define raw_seq_private(seq) ((struct raw_iter_state *)(seq)->private)
-
static struct sock *raw_get_first(struct seq_file *seq)
{
struct sock *sk;
struct raw_iter_state* state = raw_seq_private(seq);
- for (state->bucket = 0; state->bucket < RAWV4_HTABLE_SIZE; ++state->bucket) {
+ for (state->bucket = 0; state->bucket < RAW_HTABLE_SIZE;
+ ++state->bucket) {
struct hlist_node *node;
- sk_for_each(sk, node, &raw_v4_htable[state->bucket])
- if (sk->sk_family == PF_INET)
+ sk_for_each(sk, node, &state->h->ht[state->bucket])
+ if (sk->sk_net == state->p.net &&
+ sk->sk_family == state->family)
goto found;
}
sk = NULL;
@@ -820,10 +877,11 @@ static struct sock *raw_get_next(struct seq_file *seq, struct sock *sk)
sk = sk_next(sk);
try_again:
;
- } while (sk && sk->sk_family != PF_INET);
+ } while (sk && sk->sk_net != state->p.net &&
+ sk->sk_family != state->family);
- if (!sk && ++state->bucket < RAWV4_HTABLE_SIZE) {
- sk = sk_head(&raw_v4_htable[state->bucket]);
+ if (!sk && ++state->bucket < RAW_HTABLE_SIZE) {
+ sk = sk_head(&state->h->ht[state->bucket]);
goto try_again;
}
return sk;
@@ -839,13 +897,16 @@ static struct sock *raw_get_idx(struct seq_file *seq, loff_t pos)
return pos ? NULL : sk;
}
-static void *raw_seq_start(struct seq_file *seq, loff_t *pos)
+void *raw_seq_start(struct seq_file *seq, loff_t *pos)
{
- read_lock(&raw_v4_lock);
+ struct raw_iter_state *state = raw_seq_private(seq);
+
+ read_lock(&state->h->lock);
return *pos ? raw_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
}
+EXPORT_SYMBOL_GPL(raw_seq_start);
-static void *raw_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+void *raw_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
struct sock *sk;
@@ -856,11 +917,15 @@ static void *raw_seq_next(struct seq_file *seq, void *v, loff_t *pos)
++*pos;
return sk;
}
+EXPORT_SYMBOL_GPL(raw_seq_next);
-static void raw_seq_stop(struct seq_file *seq, void *v)
+void raw_seq_stop(struct seq_file *seq, void *v)
{
- read_unlock(&raw_v4_lock);
+ struct raw_iter_state *state = raw_seq_private(seq);
+
+ read_unlock(&state->h->lock);
}
+EXPORT_SYMBOL_GPL(raw_seq_stop);
static __inline__ char *get_raw_sock(struct sock *sp, char *tmpbuf, int i)
{
@@ -871,28 +936,30 @@ static __inline__ char *get_raw_sock(struct sock *sp, char *tmpbuf, int i)
srcp = inet->num;
sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
- " %02X %08X:%08X %02X:%08lX %08X %5d %8d %lu %d %p",
+ " %02X %08X:%08X %02X:%08lX %08X %5d %8d %lu %d %p %d",
i, src, srcp, dest, destp, sp->sk_state,
atomic_read(&sp->sk_wmem_alloc),
atomic_read(&sp->sk_rmem_alloc),
0, 0L, 0, sock_i_uid(sp), 0, sock_i_ino(sp),
- atomic_read(&sp->sk_refcnt), sp);
+ atomic_read(&sp->sk_refcnt), sp, atomic_read(&sp->sk_drops));
return tmpbuf;
}
+#define TMPSZ 128
+
static int raw_seq_show(struct seq_file *seq, void *v)
{
- char tmpbuf[129];
+ char tmpbuf[TMPSZ+1];
if (v == SEQ_START_TOKEN)
- seq_printf(seq, "%-127s\n",
+ seq_printf(seq, "%-*s\n", TMPSZ-1,
" sl local_address rem_address st tx_queue "
"rx_queue tr tm->when retrnsmt uid timeout "
- "inode");
+ "inode drops");
else {
struct raw_iter_state *state = raw_seq_private(seq);
- seq_printf(seq, "%-127s\n",
+ seq_printf(seq, "%-*s\n", TMPSZ-1,
get_raw_sock(v, tmpbuf, state->bucket));
}
return 0;
@@ -905,29 +972,62 @@ static const struct seq_operations raw_seq_ops = {
.show = raw_seq_show,
};
-static int raw_seq_open(struct inode *inode, struct file *file)
+int raw_seq_open(struct inode *ino, struct file *file, struct raw_hashinfo *h,
+ unsigned short family)
{
- return seq_open_private(file, &raw_seq_ops,
+ int err;
+ struct raw_iter_state *i;
+
+ err = seq_open_net(ino, file, &raw_seq_ops,
sizeof(struct raw_iter_state));
+ if (err < 0)
+ return err;
+
+ i = raw_seq_private((struct seq_file *)file->private_data);
+ i->h = h;
+ i->family = family;
+ return 0;
+}
+EXPORT_SYMBOL_GPL(raw_seq_open);
+
+static int raw_v4_seq_open(struct inode *inode, struct file *file)
+{
+ return raw_seq_open(inode, file, &raw_v4_hashinfo, PF_INET);
}
static const struct file_operations raw_seq_fops = {
.owner = THIS_MODULE,
- .open = raw_seq_open,
+ .open = raw_v4_seq_open,
.read = seq_read,
.llseek = seq_lseek,
- .release = seq_release_private,
+ .release = seq_release_net,
};
-int __init raw_proc_init(void)
+static __net_init int raw_init_net(struct net *net)
{
- if (!proc_net_fops_create(&init_net, "raw", S_IRUGO, &raw_seq_fops))
+ if (!proc_net_fops_create(net, "raw", S_IRUGO, &raw_seq_fops))
return -ENOMEM;
+
return 0;
}
+static __net_exit void raw_exit_net(struct net *net)
+{
+ proc_net_remove(net, "raw");
+}
+
+static __net_initdata struct pernet_operations raw_net_ops = {
+ .init = raw_init_net,
+ .exit = raw_exit_net,
+};
+
+int __init raw_proc_init(void)
+{
+ return register_pernet_subsys(&raw_net_ops);
+}
+
void __init raw_proc_exit(void)
{
- proc_net_remove(&init_net, "raw");
+ unregister_pernet_subsys(&raw_net_ops);
}
#endif /* CONFIG_PROC_FS */
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 28484f396b04..896c768e41a2 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -92,6 +92,7 @@
#include <linux/jhash.h>
#include <linux/rcupdate.h>
#include <linux/times.h>
+#include <net/dst.h>
#include <net/net_namespace.h>
#include <net/protocol.h>
#include <net/ip.h>
@@ -132,13 +133,14 @@ static int ip_rt_mtu_expires = 10 * 60 * HZ;
static int ip_rt_min_pmtu = 512 + 20 + 20;
static int ip_rt_min_advmss = 256;
static int ip_rt_secret_interval = 10 * 60 * HZ;
+static int ip_rt_flush_expected;
static unsigned long rt_deadline;
#define RTprint(a...) printk(KERN_DEBUG a)
static struct timer_list rt_flush_timer;
-static void rt_check_expire(struct work_struct *work);
-static DECLARE_DELAYED_WORK(expires_work, rt_check_expire);
+static void rt_worker_func(struct work_struct *work);
+static DECLARE_DELAYED_WORK(expires_work, rt_worker_func);
static struct timer_list rt_secret_timer;
/*
@@ -152,7 +154,7 @@ static void ipv4_dst_ifdown(struct dst_entry *dst,
static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
static void ipv4_link_failure(struct sk_buff *skb);
static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
-static int rt_garbage_collect(void);
+static int rt_garbage_collect(struct dst_ops *ops);
static struct dst_ops ipv4_dst_ops = {
@@ -165,6 +167,7 @@ static struct dst_ops ipv4_dst_ops = {
.negative_advice = ipv4_negative_advice,
.link_failure = ipv4_link_failure,
.update_pmtu = ip_rt_update_pmtu,
+ .local_out = ip_local_out,
.entry_size = sizeof(struct rtable),
};
@@ -232,16 +235,25 @@ struct rt_hash_bucket {
static spinlock_t *rt_hash_locks;
# define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
-# define rt_hash_lock_init() { \
- int i; \
- rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ, GFP_KERNEL); \
- if (!rt_hash_locks) panic("IP: failed to allocate rt_hash_locks\n"); \
- for (i = 0; i < RT_HASH_LOCK_SZ; i++) \
- spin_lock_init(&rt_hash_locks[i]); \
- }
+
+static __init void rt_hash_lock_init(void)
+{
+ int i;
+
+ rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ,
+ GFP_KERNEL);
+ if (!rt_hash_locks)
+ panic("IP: failed to allocate rt_hash_locks\n");
+
+ for (i = 0; i < RT_HASH_LOCK_SZ; i++)
+ spin_lock_init(&rt_hash_locks[i]);
+}
#else
# define rt_hash_lock_addr(slot) NULL
-# define rt_hash_lock_init()
+
+static inline void rt_hash_lock_init(void)
+{
+}
#endif
static struct rt_hash_bucket *rt_hash_table;
@@ -478,6 +490,83 @@ static const struct file_operations rt_cpu_seq_fops = {
.release = seq_release,
};
+#ifdef CONFIG_NET_CLS_ROUTE
+static int ip_rt_acct_read(char *buffer, char **start, off_t offset,
+ int length, int *eof, void *data)
+{
+ unsigned int i;
+
+ if ((offset & 3) || (length & 3))
+ return -EIO;
+
+ if (offset >= sizeof(struct ip_rt_acct) * 256) {
+ *eof = 1;
+ return 0;
+ }
+
+ if (offset + length >= sizeof(struct ip_rt_acct) * 256) {
+ length = sizeof(struct ip_rt_acct) * 256 - offset;
+ *eof = 1;
+ }
+
+ offset /= sizeof(u32);
+
+ if (length > 0) {
+ u32 *dst = (u32 *) buffer;
+
+ *start = buffer;
+ memset(dst, 0, length);
+
+ for_each_possible_cpu(i) {
+ unsigned int j;
+ u32 *src;
+
+ src = ((u32 *) per_cpu_ptr(ip_rt_acct, i)) + offset;
+ for (j = 0; j < length/4; j++)
+ dst[j] += src[j];
+ }
+ }
+ return length;
+}
+#endif
+
+static __init int ip_rt_proc_init(struct net *net)
+{
+ struct proc_dir_entry *pde;
+
+ pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
+ &rt_cache_seq_fops);
+ if (!pde)
+ goto err1;
+
+ pde = create_proc_entry("rt_cache", S_IRUGO, net->proc_net_stat);
+ if (!pde)
+ goto err2;
+
+ pde->proc_fops = &rt_cpu_seq_fops;
+
+#ifdef CONFIG_NET_CLS_ROUTE
+ pde = create_proc_read_entry("rt_acct", 0, net->proc_net,
+ ip_rt_acct_read, NULL);
+ if (!pde)
+ goto err3;
+#endif
+ return 0;
+
+#ifdef CONFIG_NET_CLS_ROUTE
+err3:
+ remove_proc_entry("rt_cache", net->proc_net_stat);
+#endif
+err2:
+ remove_proc_entry("rt_cache", net->proc_net);
+err1:
+ return -ENOMEM;
+}
+#else
+static inline int ip_rt_proc_init(struct net *net)
+{
+ return 0;
+}
#endif /* CONFIG_PROC_FS */
static __inline__ void rt_free(struct rtable *rt)
@@ -559,7 +648,41 @@ static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
(fl1->iif ^ fl2->iif)) == 0;
}
-static void rt_check_expire(struct work_struct *work)
+static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
+{
+ return rt1->u.dst.dev->nd_net == rt2->u.dst.dev->nd_net;
+}
+
+/*
+ * Perform a full scan of hash table and free all entries.
+ * Can be called by a softirq or a process.
+ * In the later case, we want to be reschedule if necessary
+ */
+static void rt_do_flush(int process_context)
+{
+ unsigned int i;
+ struct rtable *rth, *next;
+
+ for (i = 0; i <= rt_hash_mask; i++) {
+ if (process_context && need_resched())
+ cond_resched();
+ rth = rt_hash_table[i].chain;
+ if (!rth)
+ continue;
+
+ spin_lock_bh(rt_hash_lock_addr(i));
+ rth = rt_hash_table[i].chain;
+ rt_hash_table[i].chain = NULL;
+ spin_unlock_bh(rt_hash_lock_addr(i));
+
+ for (; rth; rth = next) {
+ next = rth->u.dst.rt_next;
+ rt_free(rth);
+ }
+ }
+}
+
+static void rt_check_expire(void)
{
static unsigned int rover;
unsigned int i = rover, goal;
@@ -605,33 +728,33 @@ static void rt_check_expire(struct work_struct *work)
spin_unlock_bh(rt_hash_lock_addr(i));
}
rover = i;
+}
+
+/*
+ * rt_worker_func() is run in process context.
+ * If a whole flush was scheduled, it is done.
+ * Else, we call rt_check_expire() to scan part of the hash table
+ */
+static void rt_worker_func(struct work_struct *work)
+{
+ if (ip_rt_flush_expected) {
+ ip_rt_flush_expected = 0;
+ rt_do_flush(1);
+ } else
+ rt_check_expire();
schedule_delayed_work(&expires_work, ip_rt_gc_interval);
}
/* This can run from both BH and non-BH contexts, the latter
* in the case of a forced flush event.
*/
-static void rt_run_flush(unsigned long dummy)
+static void rt_run_flush(unsigned long process_context)
{
- int i;
- struct rtable *rth, *next;
-
rt_deadline = 0;
get_random_bytes(&rt_hash_rnd, 4);
- for (i = rt_hash_mask; i >= 0; i--) {
- spin_lock_bh(rt_hash_lock_addr(i));
- rth = rt_hash_table[i].chain;
- if (rth)
- rt_hash_table[i].chain = NULL;
- spin_unlock_bh(rt_hash_lock_addr(i));
-
- for (; rth; rth = next) {
- next = rth->u.dst.rt_next;
- rt_free(rth);
- }
- }
+ rt_do_flush(process_context);
}
static DEFINE_SPINLOCK(rt_flush_lock);
@@ -665,7 +788,7 @@ void rt_cache_flush(int delay)
if (delay <= 0) {
spin_unlock_bh(&rt_flush_lock);
- rt_run_flush(0);
+ rt_run_flush(user_mode);
return;
}
@@ -676,12 +799,17 @@ void rt_cache_flush(int delay)
spin_unlock_bh(&rt_flush_lock);
}
+/*
+ * We change rt_hash_rnd and ask next rt_worker_func() invocation
+ * to perform a flush in process context
+ */
static void rt_secret_rebuild(unsigned long dummy)
{
- unsigned long now = jiffies;
-
- rt_cache_flush(0);
- mod_timer(&rt_secret_timer, now + ip_rt_secret_interval);
+ get_random_bytes(&rt_hash_rnd, 4);
+ ip_rt_flush_expected = 1;
+ cancel_delayed_work(&expires_work);
+ schedule_delayed_work(&expires_work, HZ/10);
+ mod_timer(&rt_secret_timer, jiffies + ip_rt_secret_interval);
}
/*
@@ -697,7 +825,7 @@ static void rt_secret_rebuild(unsigned long dummy)
and when load increases it reduces to limit cache size.
*/
-static int rt_garbage_collect(void)
+static int rt_garbage_collect(struct dst_ops *ops)
{
static unsigned long expire = RT_GC_TIMEOUT;
static unsigned long last_gc;
@@ -728,14 +856,14 @@ static int rt_garbage_collect(void)
equilibrium = ipv4_dst_ops.gc_thresh;
goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
if (goal > 0) {
- equilibrium += min_t(unsigned int, goal / 2, rt_hash_mask + 1);
+ equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
}
} else {
/* We are in dangerous area. Try to reduce cache really
* aggressively.
*/
- goal = max_t(unsigned int, goal / 2, rt_hash_mask + 1);
+ goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
equilibrium = atomic_read(&ipv4_dst_ops.entries) - goal;
}
@@ -838,7 +966,7 @@ restart:
spin_lock_bh(rt_hash_lock_addr(hash));
while ((rth = *rthp) != NULL) {
- if (compare_keys(&rth->fl, &rt->fl)) {
+ if (compare_keys(&rth->fl, &rt->fl) && compare_netns(rth, rt)) {
/* Put it first */
*rthp = rth->u.dst.rt_next;
/*
@@ -912,7 +1040,7 @@ restart:
int saved_int = ip_rt_gc_min_interval;
ip_rt_gc_elasticity = 1;
ip_rt_gc_min_interval = 0;
- rt_garbage_collect();
+ rt_garbage_collect(&ipv4_dst_ops);
ip_rt_gc_min_interval = saved_int;
ip_rt_gc_elasticity = saved_elasticity;
goto restart;
@@ -1031,7 +1159,8 @@ void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
return;
if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev)
- || MULTICAST(new_gw) || BADCLASS(new_gw) || ZERONET(new_gw))
+ || ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw)
+ || ipv4_is_zeronet(new_gw))
goto reject_redirect;
if (!IN_DEV_SHARED_MEDIA(in_dev)) {
@@ -1040,7 +1169,7 @@ void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
goto reject_redirect;
} else {
- if (inet_addr_type(new_gw) != RTN_UNICAST)
+ if (inet_addr_type(&init_net, new_gw) != RTN_UNICAST)
goto reject_redirect;
}
@@ -1291,7 +1420,8 @@ static __inline__ unsigned short guess_mtu(unsigned short old_mtu)
return 68;
}
-unsigned short ip_rt_frag_needed(struct iphdr *iph, unsigned short new_mtu)
+unsigned short ip_rt_frag_needed(struct net *net, struct iphdr *iph,
+ unsigned short new_mtu)
{
int i;
unsigned short old_mtu = ntohs(iph->tot_len);
@@ -1314,7 +1444,8 @@ unsigned short ip_rt_frag_needed(struct iphdr *iph, unsigned short new_mtu)
rth->rt_dst == daddr &&
rth->rt_src == iph->saddr &&
rth->fl.iif == 0 &&
- !(dst_metric_locked(&rth->u.dst, RTAX_MTU))) {
+ !(dst_metric_locked(&rth->u.dst, RTAX_MTU)) &&
+ rth->u.dst.dev->nd_net == net) {
unsigned short mtu = new_mtu;
if (new_mtu < 68 || new_mtu >= old_mtu) {
@@ -1389,8 +1520,9 @@ static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
{
struct rtable *rt = (struct rtable *) dst;
struct in_device *idev = rt->idev;
- if (dev != init_net.loopback_dev && idev && idev->dev == dev) {
- struct in_device *loopback_idev = in_dev_get(init_net.loopback_dev);
+ if (dev != dev->nd_net->loopback_dev && idev && idev->dev == dev) {
+ struct in_device *loopback_idev =
+ in_dev_get(dev->nd_net->loopback_dev);
if (loopback_idev) {
rt->idev = loopback_idev;
in_dev_put(idev);
@@ -1434,7 +1566,7 @@ void ip_rt_get_source(u8 *addr, struct rtable *rt)
if (rt->fl.iif == 0)
src = rt->rt_src;
- else if (fib_lookup(&rt->fl, &res) == 0) {
+ else if (fib_lookup(rt->u.dst.dev->nd_net, &rt->fl, &res) == 0) {
src = FIB_RES_PREFSRC(res);
fib_res_put(&res);
} else
@@ -1509,12 +1641,12 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
if (in_dev == NULL)
return -EINVAL;
- if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr) ||
- skb->protocol != htons(ETH_P_IP))
+ if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
+ ipv4_is_loopback(saddr) || skb->protocol != htons(ETH_P_IP))
goto e_inval;
- if (ZERONET(saddr)) {
- if (!LOCAL_MCAST(daddr))
+ if (ipv4_is_zeronet(saddr)) {
+ if (!ipv4_is_local_multicast(daddr))
goto e_inval;
spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
} else if (fib_validate_source(saddr, 0, tos, 0,
@@ -1556,7 +1688,7 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
}
#ifdef CONFIG_IP_MROUTE
- if (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
+ if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
rth->u.dst.input = ip_mr_input;
#endif
RT_CACHE_STAT_INC(in_slow_mc);
@@ -1643,7 +1775,7 @@ static inline int __mkroute_input(struct sk_buff *skb,
if (err)
flags |= RTCF_DIRECTSRC;
- if (out_dev == in_dev && err && !(flags & (RTCF_NAT | RTCF_MASQ)) &&
+ if (out_dev == in_dev && err && !(flags & RTCF_MASQ) &&
(IN_DEV_SHARED_MEDIA(out_dev) ||
inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
flags |= RTCF_DOREDIRECT;
@@ -1652,7 +1784,7 @@ static inline int __mkroute_input(struct sk_buff *skb,
/* Not IP (i.e. ARP). Do not create route, if it is
* invalid for proxy arp. DNAT routes are always valid.
*/
- if (out_dev == in_dev && !(flags & RTCF_DNAT)) {
+ if (out_dev == in_dev) {
err = -EINVAL;
goto cleanup;
}
@@ -1756,6 +1888,7 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
__be32 spec_dst;
int err = -EINVAL;
int free_res = 0;
+ struct net * net = dev->nd_net;
/* IP on this device is disabled. */
@@ -1766,7 +1899,8 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
by fib_lookup.
*/
- if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr))
+ if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
+ ipv4_is_loopback(saddr))
goto martian_source;
if (daddr == htonl(0xFFFFFFFF) || (saddr == 0 && daddr == 0))
@@ -1775,16 +1909,17 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
/* Accept zero addresses only to limited broadcast;
* I even do not know to fix it or not. Waiting for complains :-)
*/
- if (ZERONET(saddr))
+ if (ipv4_is_zeronet(saddr))
goto martian_source;
- if (BADCLASS(daddr) || ZERONET(daddr) || LOOPBACK(daddr))
+ if (ipv4_is_lbcast(daddr) || ipv4_is_zeronet(daddr) ||
+ ipv4_is_loopback(daddr))
goto martian_destination;
/*
* Now we are ready to route packet.
*/
- if ((err = fib_lookup(&fl, &res)) != 0) {
+ if ((err = fib_lookup(net, &fl, &res)) != 0) {
if (!IN_DEV_FORWARD(in_dev))
goto e_hostunreach;
goto no_route;
@@ -1799,7 +1934,7 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
if (res.type == RTN_LOCAL) {
int result;
result = fib_validate_source(saddr, daddr, tos,
- init_net.loopback_dev->ifindex,
+ net->loopback_dev->ifindex,
dev, &spec_dst, &itag);
if (result < 0)
goto martian_source;
@@ -1825,7 +1960,7 @@ brd_input:
if (skb->protocol != htons(ETH_P_IP))
goto e_inval;
- if (ZERONET(saddr))
+ if (ipv4_is_zeronet(saddr))
spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
else {
err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
@@ -1861,7 +1996,7 @@ local_input:
#endif
rth->rt_iif =
rth->fl.iif = dev->ifindex;
- rth->u.dst.dev = init_net.loopback_dev;
+ rth->u.dst.dev = net->loopback_dev;
dev_hold(rth->u.dst.dev);
rth->idev = in_dev_get(rth->u.dst.dev);
rth->rt_gateway = daddr;
@@ -1921,7 +2056,9 @@ int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr,
struct rtable * rth;
unsigned hash;
int iif = dev->ifindex;
+ struct net *net;
+ net = skb->dev->nd_net;
tos &= IPTOS_RT_MASK;
hash = rt_hash(daddr, saddr, iif);
@@ -1933,7 +2070,8 @@ int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr,
rth->fl.iif == iif &&
rth->fl.oif == 0 &&
rth->fl.mark == skb->mark &&
- rth->fl.fl4_tos == tos) {
+ rth->fl.fl4_tos == tos &&
+ rth->u.dst.dev->nd_net == net) {
dst_use(&rth->u.dst, jiffies);
RT_CACHE_STAT_INC(in_hit);
rcu_read_unlock();
@@ -1955,7 +2093,7 @@ int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr,
Note, that multicast routers are not affected, because
route cache entry is created eventually.
*/
- if (MULTICAST(daddr)) {
+ if (ipv4_is_multicast(daddr)) {
struct in_device *in_dev;
rcu_read_lock();
@@ -1964,7 +2102,8 @@ int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr,
ip_hdr(skb)->protocol);
if (our
#ifdef CONFIG_IP_MROUTE
- || (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
+ || (!ipv4_is_local_multicast(daddr) &&
+ IN_DEV_MFORWARD(in_dev))
#endif
) {
rcu_read_unlock();
@@ -1990,14 +2129,14 @@ static inline int __mkroute_output(struct rtable **result,
u32 tos = RT_FL_TOS(oldflp);
int err = 0;
- if (LOOPBACK(fl->fl4_src) && !(dev_out->flags&IFF_LOOPBACK))
+ if (ipv4_is_loopback(fl->fl4_src) && !(dev_out->flags&IFF_LOOPBACK))
return -EINVAL;
if (fl->fl4_dst == htonl(0xFFFFFFFF))
res->type = RTN_BROADCAST;
- else if (MULTICAST(fl->fl4_dst))
+ else if (ipv4_is_multicast(fl->fl4_dst))
res->type = RTN_MULTICAST;
- else if (BADCLASS(fl->fl4_dst) || ZERONET(fl->fl4_dst))
+ else if (ipv4_is_lbcast(fl->fl4_dst) || ipv4_is_zeronet(fl->fl4_dst))
return -EINVAL;
if (dev_out->flags & IFF_LOOPBACK)
@@ -2077,7 +2216,7 @@ static inline int __mkroute_output(struct rtable **result,
#ifdef CONFIG_IP_MROUTE
if (res->type == RTN_MULTICAST) {
if (IN_DEV_MFORWARD(in_dev) &&
- !LOCAL_MCAST(oldflp->fl4_dst)) {
+ !ipv4_is_local_multicast(oldflp->fl4_dst)) {
rth->u.dst.input = ip_mr_input;
rth->u.dst.output = ip_mc_output;
}
@@ -2119,7 +2258,8 @@ static inline int ip_mkroute_output(struct rtable **rp,
* Major route resolver routine.
*/
-static int ip_route_output_slow(struct rtable **rp, const struct flowi *oldflp)
+static int ip_route_output_slow(struct net *net, struct rtable **rp,
+ const struct flowi *oldflp)
{
u32 tos = RT_FL_TOS(oldflp);
struct flowi fl = { .nl_u = { .ip4_u =
@@ -2131,7 +2271,7 @@ static int ip_route_output_slow(struct rtable **rp, const struct flowi *oldflp)
RT_SCOPE_UNIVERSE),
} },
.mark = oldflp->mark,
- .iif = init_net.loopback_dev->ifindex,
+ .iif = net->loopback_dev->ifindex,
.oif = oldflp->oif };
struct fib_result res;
unsigned flags = 0;
@@ -2147,26 +2287,27 @@ static int ip_route_output_slow(struct rtable **rp, const struct flowi *oldflp)
if (oldflp->fl4_src) {
err = -EINVAL;
- if (MULTICAST(oldflp->fl4_src) ||
- BADCLASS(oldflp->fl4_src) ||
- ZERONET(oldflp->fl4_src))
+ if (ipv4_is_multicast(oldflp->fl4_src) ||
+ ipv4_is_lbcast(oldflp->fl4_src) ||
+ ipv4_is_zeronet(oldflp->fl4_src))
goto out;
/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
- dev_out = ip_dev_find(oldflp->fl4_src);
+ dev_out = ip_dev_find(net, oldflp->fl4_src);
if (dev_out == NULL)
goto out;
/* I removed check for oif == dev_out->oif here.
It was wrong for two reasons:
- 1. ip_dev_find(saddr) can return wrong iface, if saddr is
- assigned to multiple interfaces.
+ 1. ip_dev_find(net, saddr) can return wrong iface, if saddr
+ is assigned to multiple interfaces.
2. Moreover, we are allowed to send packets with saddr
of another iface. --ANK
*/
if (oldflp->oif == 0
- && (MULTICAST(oldflp->fl4_dst) || oldflp->fl4_dst == htonl(0xFFFFFFFF))) {
+ && (ipv4_is_multicast(oldflp->fl4_dst) ||
+ oldflp->fl4_dst == htonl(0xFFFFFFFF))) {
/* Special hack: user can direct multicasts
and limited broadcast via necessary interface
without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
@@ -2192,7 +2333,7 @@ static int ip_route_output_slow(struct rtable **rp, const struct flowi *oldflp)
if (oldflp->oif) {
- dev_out = dev_get_by_index(&init_net, oldflp->oif);
+ dev_out = dev_get_by_index(net, oldflp->oif);
err = -ENODEV;
if (dev_out == NULL)
goto out;
@@ -2203,14 +2344,15 @@ static int ip_route_output_slow(struct rtable **rp, const struct flowi *oldflp)
goto out; /* Wrong error code */
}
- if (LOCAL_MCAST(oldflp->fl4_dst) || oldflp->fl4_dst == htonl(0xFFFFFFFF)) {
+ if (ipv4_is_local_multicast(oldflp->fl4_dst) ||
+ oldflp->fl4_dst == htonl(0xFFFFFFFF)) {
if (!fl.fl4_src)
fl.fl4_src = inet_select_addr(dev_out, 0,
RT_SCOPE_LINK);
goto make_route;
}
if (!fl.fl4_src) {
- if (MULTICAST(oldflp->fl4_dst))
+ if (ipv4_is_multicast(oldflp->fl4_dst))
fl.fl4_src = inet_select_addr(dev_out, 0,
fl.fl4_scope);
else if (!oldflp->fl4_dst)
@@ -2225,15 +2367,15 @@ static int ip_route_output_slow(struct rtable **rp, const struct flowi *oldflp)
fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK);
if (dev_out)
dev_put(dev_out);
- dev_out = init_net.loopback_dev;
+ dev_out = net->loopback_dev;
dev_hold(dev_out);
- fl.oif = init_net.loopback_dev->ifindex;
+ fl.oif = net->loopback_dev->ifindex;
res.type = RTN_LOCAL;
flags |= RTCF_LOCAL;
goto make_route;
}
- if (fib_lookup(&fl, &res)) {
+ if (fib_lookup(net, &fl, &res)) {
res.fi = NULL;
if (oldflp->oif) {
/* Apparently, routing tables are wrong. Assume,
@@ -2272,7 +2414,7 @@ static int ip_route_output_slow(struct rtable **rp, const struct flowi *oldflp)
fl.fl4_src = fl.fl4_dst;
if (dev_out)
dev_put(dev_out);
- dev_out = init_net.loopback_dev;
+ dev_out = net->loopback_dev;
dev_hold(dev_out);
fl.oif = dev_out->ifindex;
if (res.fi)
@@ -2288,7 +2430,7 @@ static int ip_route_output_slow(struct rtable **rp, const struct flowi *oldflp)
else
#endif
if (!res.prefixlen && res.type == RTN_UNICAST && !fl.oif)
- fib_select_default(&fl, &res);
+ fib_select_default(net, &fl, &res);
if (!fl.fl4_src)
fl.fl4_src = FIB_RES_PREFSRC(res);
@@ -2311,7 +2453,8 @@ make_route:
out: return err;
}
-int __ip_route_output_key(struct rtable **rp, const struct flowi *flp)
+int __ip_route_output_key(struct net *net, struct rtable **rp,
+ const struct flowi *flp)
{
unsigned hash;
struct rtable *rth;
@@ -2327,7 +2470,8 @@ int __ip_route_output_key(struct rtable **rp, const struct flowi *flp)
rth->fl.oif == flp->oif &&
rth->fl.mark == flp->mark &&
!((rth->fl.fl4_tos ^ flp->fl4_tos) &
- (IPTOS_RT_MASK | RTO_ONLINK))) {
+ (IPTOS_RT_MASK | RTO_ONLINK)) &&
+ rth->u.dst.dev->nd_net == net) {
dst_use(&rth->u.dst, jiffies);
RT_CACHE_STAT_INC(out_hit);
rcu_read_unlock_bh();
@@ -2338,7 +2482,7 @@ int __ip_route_output_key(struct rtable **rp, const struct flowi *flp)
}
rcu_read_unlock_bh();
- return ip_route_output_slow(rp, flp);
+ return ip_route_output_slow(net, rp, flp);
}
EXPORT_SYMBOL_GPL(__ip_route_output_key);
@@ -2357,12 +2501,6 @@ static struct dst_ops ipv4_dst_blackhole_ops = {
};
-static int ipv4_blackhole_output(struct sk_buff *skb)
-{
- kfree_skb(skb);
- return 0;
-}
-
static int ipv4_dst_blackhole(struct rtable **rp, struct flowi *flp, struct sock *sk)
{
struct rtable *ort = *rp;
@@ -2374,8 +2512,8 @@ static int ipv4_dst_blackhole(struct rtable **rp, struct flowi *flp, struct sock
atomic_set(&new->__refcnt, 1);
new->__use = 1;
- new->input = ipv4_blackhole_output;
- new->output = ipv4_blackhole_output;
+ new->input = dst_discard;
+ new->output = dst_discard;
memcpy(new->metrics, ort->u.dst.metrics, RTAX_MAX*sizeof(u32));
new->dev = ort->u.dst.dev;
@@ -2406,11 +2544,12 @@ static int ipv4_dst_blackhole(struct rtable **rp, struct flowi *flp, struct sock
return (rt ? 0 : -ENOMEM);
}
-int ip_route_output_flow(struct rtable **rp, struct flowi *flp, struct sock *sk, int flags)
+int ip_route_output_flow(struct net *net, struct rtable **rp, struct flowi *flp,
+ struct sock *sk, int flags)
{
int err;
- if ((err = __ip_route_output_key(rp, flp)) != 0)
+ if ((err = __ip_route_output_key(net, rp, flp)) != 0)
return err;
if (flp->proto) {
@@ -2418,7 +2557,8 @@ int ip_route_output_flow(struct rtable **rp, struct flowi *flp, struct sock *sk,
flp->fl4_src = (*rp)->rt_src;
if (!flp->fl4_dst)
flp->fl4_dst = (*rp)->rt_dst;
- err = __xfrm_lookup((struct dst_entry **)rp, flp, sk, flags);
+ err = __xfrm_lookup((struct dst_entry **)rp, flp, sk,
+ flags ? XFRM_LOOKUP_WAIT : 0);
if (err == -EREMOTE)
err = ipv4_dst_blackhole(rp, flp, sk);
@@ -2430,9 +2570,9 @@ int ip_route_output_flow(struct rtable **rp, struct flowi *flp, struct sock *sk,
EXPORT_SYMBOL_GPL(ip_route_output_flow);
-int ip_route_output_key(struct rtable **rp, struct flowi *flp)
+int ip_route_output_key(struct net *net, struct rtable **rp, struct flowi *flp)
{
- return ip_route_output_flow(rp, flp, NULL, 0);
+ return ip_route_output_flow(net, rp, flp, NULL, 0);
}
static int rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
@@ -2499,8 +2639,8 @@ static int rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
#ifdef CONFIG_IP_MROUTE
__be32 dst = rt->rt_dst;
- if (MULTICAST(dst) && !LOCAL_MCAST(dst) &&
- IPV4_DEVCONF_ALL(MC_FORWARDING)) {
+ if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
+ IPV4_DEVCONF_ALL(&init_net, MC_FORWARDING)) {
int err = ipmr_get_route(skb, r, nowait);
if (err <= 0) {
if (!nowait) {
@@ -2531,6 +2671,7 @@ nla_put_failure:
static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
{
+ struct net *net = in_skb->sk->sk_net;
struct rtmsg *rtm;
struct nlattr *tb[RTA_MAX+1];
struct rtable *rt = NULL;
@@ -2540,6 +2681,9 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void
int err;
struct sk_buff *skb;
+ if (net != &init_net)
+ return -EINVAL;
+
err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
if (err < 0)
goto errout;
@@ -2595,7 +2739,7 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void
},
.oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
};
- err = ip_route_output_key(&rt, &fl);
+ err = ip_route_output_key(&init_net, &rt, &fl);
}
if (err)
@@ -2610,7 +2754,7 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void
if (err <= 0)
goto errout_free;
- err = rtnl_unicast(skb, NETLINK_CB(in_skb).pid);
+ err = rtnl_unicast(skb, &init_net, NETLINK_CB(in_skb).pid);
errout:
return err;
@@ -2862,51 +3006,7 @@ ctl_table ipv4_route_table[] = {
#endif
#ifdef CONFIG_NET_CLS_ROUTE
-struct ip_rt_acct *ip_rt_acct;
-
-/* This code sucks. But you should have seen it before! --RR */
-
-/* IP route accounting ptr for this logical cpu number. */
-#define IP_RT_ACCT_CPU(i) (ip_rt_acct + i * 256)
-
-#ifdef CONFIG_PROC_FS
-static int ip_rt_acct_read(char *buffer, char **start, off_t offset,
- int length, int *eof, void *data)
-{
- unsigned int i;
-
- if ((offset & 3) || (length & 3))
- return -EIO;
-
- if (offset >= sizeof(struct ip_rt_acct) * 256) {
- *eof = 1;
- return 0;
- }
-
- if (offset + length >= sizeof(struct ip_rt_acct) * 256) {
- length = sizeof(struct ip_rt_acct) * 256 - offset;
- *eof = 1;
- }
-
- offset /= sizeof(u32);
-
- if (length > 0) {
- u32 *dst = (u32 *) buffer;
-
- *start = buffer;
- memset(dst, 0, length);
-
- for_each_possible_cpu(i) {
- unsigned int j;
- u32 *src = ((u32 *) IP_RT_ACCT_CPU(i)) + offset;
-
- for (j = 0; j < length/4; j++)
- dst[j] += src[j];
- }
- }
- return length;
-}
-#endif /* CONFIG_PROC_FS */
+struct ip_rt_acct *ip_rt_acct __read_mostly;
#endif /* CONFIG_NET_CLS_ROUTE */
static __initdata unsigned long rhash_entries;
@@ -2927,16 +3027,9 @@ int __init ip_rt_init(void)
(jiffies ^ (jiffies >> 7)));
#ifdef CONFIG_NET_CLS_ROUTE
- {
- int order;
- for (order = 0;
- (PAGE_SIZE << order) < 256 * sizeof(struct ip_rt_acct) * NR_CPUS; order++)
- /* NOTHING */;
- ip_rt_acct = (struct ip_rt_acct *)__get_free_pages(GFP_KERNEL, order);
+ ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct));
if (!ip_rt_acct)
panic("IP: failed to allocate ip_rt_acct\n");
- memset(ip_rt_acct, 0, PAGE_SIZE << order);
- }
#endif
ipv4_dst_ops.kmem_cachep =
@@ -2964,10 +3057,8 @@ int __init ip_rt_init(void)
devinet_init();
ip_fib_init();
- init_timer(&rt_flush_timer);
- rt_flush_timer.function = rt_run_flush;
- init_timer(&rt_secret_timer);
- rt_secret_timer.function = rt_secret_rebuild;
+ setup_timer(&rt_flush_timer, rt_run_flush, 0);
+ setup_timer(&rt_secret_timer, rt_secret_rebuild, 0);
/* All the timers, started at system startup tend
to synchronize. Perturb it a bit.
@@ -2979,20 +3070,8 @@ int __init ip_rt_init(void)
ip_rt_secret_interval;
add_timer(&rt_secret_timer);
-#ifdef CONFIG_PROC_FS
- {
- struct proc_dir_entry *rtstat_pde = NULL; /* keep gcc happy */
- if (!proc_net_fops_create(&init_net, "rt_cache", S_IRUGO, &rt_cache_seq_fops) ||
- !(rtstat_pde = create_proc_entry("rt_cache", S_IRUGO,
- init_net.proc_net_stat))) {
- return -ENOMEM;
- }
- rtstat_pde->proc_fops = &rt_cpu_seq_fops;
- }
-#ifdef CONFIG_NET_CLS_ROUTE
- create_proc_read_entry("rt_acct", 0, init_net.proc_net, ip_rt_acct_read, NULL);
-#endif
-#endif
+ if (ip_rt_proc_init(&init_net))
+ printk(KERN_ERR "Unable to create route proc files\n");
#ifdef CONFIG_XFRM
xfrm_init();
xfrm4_init();
diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
index 2da1be0589a9..f470fe4511db 100644
--- a/net/ipv4/syncookies.c
+++ b/net/ipv4/syncookies.c
@@ -264,7 +264,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,
{ .sport = th->dest,
.dport = th->source } } };
security_req_classify_flow(req, &fl);
- if (ip_route_output_key(&rt, &fl)) {
+ if (ip_route_output_key(&init_net, &rt, &fl)) {
reqsk_free(req);
goto out;
}
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index bec6fe880657..82cdf23837e3 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -13,83 +13,20 @@
#include <linux/igmp.h>
#include <linux/inetdevice.h>
#include <linux/seqlock.h>
+#include <linux/init.h>
#include <net/snmp.h>
#include <net/icmp.h>
#include <net/ip.h>
#include <net/route.h>
#include <net/tcp.h>
+#include <net/udp.h>
#include <net/cipso_ipv4.h>
#include <net/inet_frag.h>
-/* From af_inet.c */
-extern int sysctl_ip_nonlocal_bind;
-
-#ifdef CONFIG_SYSCTL
static int zero;
static int tcp_retr1_max = 255;
static int ip_local_port_range_min[] = { 1, 1 };
static int ip_local_port_range_max[] = { 65535, 65535 };
-#endif
-
-struct ipv4_config ipv4_config;
-
-#ifdef CONFIG_SYSCTL
-
-static
-int ipv4_sysctl_forward(ctl_table *ctl, int write, struct file * filp,
- void __user *buffer, size_t *lenp, loff_t *ppos)
-{
- int val = IPV4_DEVCONF_ALL(FORWARDING);
- int ret;
-
- ret = proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
-
- if (write && IPV4_DEVCONF_ALL(FORWARDING) != val)
- inet_forward_change();
-
- return ret;
-}
-
-static int ipv4_sysctl_forward_strategy(ctl_table *table,
- int __user *name, int nlen,
- void __user *oldval, size_t __user *oldlenp,
- void __user *newval, size_t newlen)
-{
- int *valp = table->data;
- int new;
-
- if (!newval || !newlen)
- return 0;
-
- if (newlen != sizeof(int))
- return -EINVAL;
-
- if (get_user(new, (int __user *)newval))
- return -EFAULT;
-
- if (new == *valp)
- return 0;
-
- if (oldval && oldlenp) {
- size_t len;
-
- if (get_user(len, oldlenp))
- return -EFAULT;
-
- if (len) {
- if (len > table->maxlen)
- len = table->maxlen;
- if (copy_to_user(oldval, valp, len))
- return -EFAULT;
- if (put_user(len, oldlenp))
- return -EFAULT;
- }
- }
-
- *valp = new;
- inet_forward_change();
- return 1;
-}
extern seqlock_t sysctl_port_range_lock;
extern int sysctl_local_port_range[2];
@@ -256,7 +193,7 @@ static int strategy_allowed_congestion_control(ctl_table *table, int __user *nam
}
-ctl_table ipv4_table[] = {
+static struct ctl_table ipv4_table[] = {
{
.ctl_name = NET_IPV4_TCP_TIMESTAMPS,
.procname = "tcp_timestamps",
@@ -290,15 +227,6 @@ ctl_table ipv4_table[] = {
.proc_handler = &proc_dointvec
},
{
- .ctl_name = NET_IPV4_FORWARD,
- .procname = "ip_forward",
- .data = &IPV4_DEVCONF_ALL(FORWARDING),
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = &ipv4_sysctl_forward,
- .strategy = &ipv4_sysctl_forward_strategy
- },
- {
.ctl_name = NET_IPV4_DEFAULT_TTL,
.procname = "ip_default_ttl",
.data = &sysctl_ip_default_ttl,
@@ -356,22 +284,6 @@ ctl_table ipv4_table[] = {
.proc_handler = &proc_dointvec
},
{
- .ctl_name = NET_IPV4_IPFRAG_HIGH_THRESH,
- .procname = "ipfrag_high_thresh",
- .data = &ip4_frags_ctl.high_thresh,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = &proc_dointvec
- },
- {
- .ctl_name = NET_IPV4_IPFRAG_LOW_THRESH,
- .procname = "ipfrag_low_thresh",
- .data = &ip4_frags_ctl.low_thresh,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = &proc_dointvec
- },
- {
.ctl_name = NET_IPV4_DYNADDR,
.procname = "ip_dynaddr",
.data = &sysctl_ip_dynaddr,
@@ -380,15 +292,6 @@ ctl_table ipv4_table[] = {
.proc_handler = &proc_dointvec
},
{
- .ctl_name = NET_IPV4_IPFRAG_TIME,
- .procname = "ipfrag_time",
- .data = &ip4_frags_ctl.timeout,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = &proc_dointvec_jiffies,
- .strategy = &sysctl_jiffies
- },
- {
.ctl_name = NET_IPV4_TCP_KEEPALIVE_TIME,
.procname = "tcp_keepalive_time",
.data = &sysctl_tcp_keepalive_time,
@@ -731,23 +634,6 @@ ctl_table ipv4_table[] = {
.proc_handler = &proc_dointvec
},
{
- .ctl_name = NET_IPV4_IPFRAG_SECRET_INTERVAL,
- .procname = "ipfrag_secret_interval",
- .data = &ip4_frags_ctl.secret_interval,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = &proc_dointvec_jiffies,
- .strategy = &sysctl_jiffies
- },
- {
- .procname = "ipfrag_max_dist",
- .data = &sysctl_ipfrag_max_dist,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = &proc_dointvec_minmax,
- .extra1 = &zero
- },
- {
.ctl_name = NET_TCP_NO_METRICS_SAVE,
.procname = "tcp_no_metrics_save",
.data = &sysctl_tcp_nometrics_save,
@@ -885,9 +771,52 @@ ctl_table ipv4_table[] = {
.mode = 0644,
.proc_handler = &proc_dointvec,
},
+ {
+ .ctl_name = CTL_UNNUMBERED,
+ .procname = "udp_mem",
+ .data = &sysctl_udp_mem,
+ .maxlen = sizeof(sysctl_udp_mem),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec_minmax,
+ .strategy = &sysctl_intvec,
+ .extra1 = &zero
+ },
+ {
+ .ctl_name = CTL_UNNUMBERED,
+ .procname = "udp_rmem_min",
+ .data = &sysctl_udp_rmem_min,
+ .maxlen = sizeof(sysctl_udp_rmem_min),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec_minmax,
+ .strategy = &sysctl_intvec,
+ .extra1 = &zero
+ },
+ {
+ .ctl_name = CTL_UNNUMBERED,
+ .procname = "udp_wmem_min",
+ .data = &sysctl_udp_wmem_min,
+ .maxlen = sizeof(sysctl_udp_wmem_min),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec_minmax,
+ .strategy = &sysctl_intvec,
+ .extra1 = &zero
+ },
{ .ctl_name = 0 }
};
-#endif /* CONFIG_SYSCTL */
+struct ctl_path net_ipv4_ctl_path[] = {
+ { .procname = "net", .ctl_name = CTL_NET, },
+ { .procname = "ipv4", .ctl_name = NET_IPV4, },
+ { },
+};
+EXPORT_SYMBOL_GPL(net_ipv4_ctl_path);
+
+static __init int sysctl_ipv4_init(void)
+{
+ struct ctl_table_header *hdr;
+
+ hdr = register_sysctl_paths(net_ipv4_ctl_path, ipv4_table);
+ return hdr == NULL ? -ENOMEM : 0;
+}
-EXPORT_SYMBOL(ipv4_config);
+__initcall(sysctl_ipv4_init);
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 8e65182f7af1..a0d373bd9065 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -254,6 +254,10 @@
#include <linux/poll.h>
#include <linux/init.h>
#include <linux/fs.h>
+#include <linux/skbuff.h>
+#include <linux/splice.h>
+#include <linux/net.h>
+#include <linux/socket.h>
#include <linux/random.h>
#include <linux/bootmem.h>
#include <linux/cache.h>
@@ -265,6 +269,7 @@
#include <net/xfrm.h>
#include <net/ip.h>
#include <net/netdma.h>
+#include <net/sock.h>
#include <asm/uaccess.h>
#include <asm/ioctls.h>
@@ -292,9 +297,18 @@ EXPORT_SYMBOL(tcp_memory_allocated);
EXPORT_SYMBOL(tcp_sockets_allocated);
/*
+ * TCP splice context
+ */
+struct tcp_splice_state {
+ struct pipe_inode_info *pipe;
+ size_t len;
+ unsigned int flags;
+};
+
+/*
* Pressure flag: try to collapse.
* Technical note: it is used by multiple contexts non atomically.
- * All the sk_stream_mem_schedule() is of this nature: accounting
+ * All the __sk_mem_schedule() is of this nature: accounting
* is strict, actions are advisory and have some latency.
*/
int tcp_memory_pressure __read_mostly;
@@ -471,7 +485,8 @@ static inline void skb_entail(struct sock *sk, struct sk_buff *skb)
tcb->sacked = 0;
skb_header_release(skb);
tcp_add_write_queue_tail(sk, skb);
- sk_charge_skb(sk, skb);
+ sk->sk_wmem_queued += skb->truesize;
+ sk_mem_charge(sk, skb->truesize);
if (tp->nonagle & TCP_NAGLE_PUSH)
tp->nonagle &= ~TCP_NAGLE_PUSH;
}
@@ -482,7 +497,6 @@ static inline void tcp_mark_urg(struct tcp_sock *tp, int flags,
if (flags & MSG_OOB) {
tp->urg_mode = 1;
tp->snd_up = tp->write_seq;
- TCP_SKB_CB(skb)->sacked |= TCPCB_URG;
}
}
@@ -501,6 +515,145 @@ static inline void tcp_push(struct sock *sk, int flags, int mss_now,
}
}
+static int tcp_splice_data_recv(read_descriptor_t *rd_desc, struct sk_buff *skb,
+ unsigned int offset, size_t len)
+{
+ struct tcp_splice_state *tss = rd_desc->arg.data;
+
+ return skb_splice_bits(skb, offset, tss->pipe, tss->len, tss->flags);
+}
+
+static int __tcp_splice_read(struct sock *sk, struct tcp_splice_state *tss)
+{
+ /* Store TCP splice context information in read_descriptor_t. */
+ read_descriptor_t rd_desc = {
+ .arg.data = tss,
+ };
+
+ return tcp_read_sock(sk, &rd_desc, tcp_splice_data_recv);
+}
+
+/**
+ * tcp_splice_read - splice data from TCP socket to a pipe
+ * @sock: socket to splice from
+ * @ppos: position (not valid)
+ * @pipe: pipe to splice to
+ * @len: number of bytes to splice
+ * @flags: splice modifier flags
+ *
+ * Description:
+ * Will read pages from given socket and fill them into a pipe.
+ *
+ **/
+ssize_t tcp_splice_read(struct socket *sock, loff_t *ppos,
+ struct pipe_inode_info *pipe, size_t len,
+ unsigned int flags)
+{
+ struct sock *sk = sock->sk;
+ struct tcp_splice_state tss = {
+ .pipe = pipe,
+ .len = len,
+ .flags = flags,
+ };
+ long timeo;
+ ssize_t spliced;
+ int ret;
+
+ /*
+ * We can't seek on a socket input
+ */
+ if (unlikely(*ppos))
+ return -ESPIPE;
+
+ ret = spliced = 0;
+
+ lock_sock(sk);
+
+ timeo = sock_rcvtimeo(sk, flags & SPLICE_F_NONBLOCK);
+ while (tss.len) {
+ ret = __tcp_splice_read(sk, &tss);
+ if (ret < 0)
+ break;
+ else if (!ret) {
+ if (spliced)
+ break;
+ if (flags & SPLICE_F_NONBLOCK) {
+ ret = -EAGAIN;
+ break;
+ }
+ if (sock_flag(sk, SOCK_DONE))
+ break;
+ if (sk->sk_err) {
+ ret = sock_error(sk);
+ break;
+ }
+ if (sk->sk_shutdown & RCV_SHUTDOWN)
+ break;
+ if (sk->sk_state == TCP_CLOSE) {
+ /*
+ * This occurs when user tries to read
+ * from never connected socket.
+ */
+ if (!sock_flag(sk, SOCK_DONE))
+ ret = -ENOTCONN;
+ break;
+ }
+ if (!timeo) {
+ ret = -EAGAIN;
+ break;
+ }
+ sk_wait_data(sk, &timeo);
+ if (signal_pending(current)) {
+ ret = sock_intr_errno(timeo);
+ break;
+ }
+ continue;
+ }
+ tss.len -= ret;
+ spliced += ret;
+
+ release_sock(sk);
+ lock_sock(sk);
+
+ if (sk->sk_err || sk->sk_state == TCP_CLOSE ||
+ (sk->sk_shutdown & RCV_SHUTDOWN) || !timeo ||
+ signal_pending(current))
+ break;
+ }
+
+ release_sock(sk);
+
+ if (spliced)
+ return spliced;
+
+ return ret;
+}
+
+struct sk_buff *sk_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp)
+{
+ struct sk_buff *skb;
+
+ /* The TCP header must be at least 32-bit aligned. */
+ size = ALIGN(size, 4);
+
+ skb = alloc_skb_fclone(size + sk->sk_prot->max_header, gfp);
+ if (skb) {
+ if (sk_wmem_schedule(sk, skb->truesize)) {
+ /*
+ * Make sure that we have exactly size bytes
+ * available to the caller, no more, no less.
+ */
+ skb_reserve(skb, skb_tailroom(skb) - size);
+ return skb;
+ }
+ __kfree_skb(skb);
+ } else {
+ sk->sk_prot->enter_memory_pressure();
+ sk_stream_moderate_sndbuf(sk);
+ }
+ return NULL;
+}
+
static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffset,
size_t psize, int flags)
{
@@ -537,8 +690,7 @@ new_segment:
if (!sk_stream_memory_free(sk))
goto wait_for_sndbuf;
- skb = sk_stream_alloc_pskb(sk, 0, 0,
- sk->sk_allocation);
+ skb = sk_stream_alloc_skb(sk, 0, sk->sk_allocation);
if (!skb)
goto wait_for_memory;
@@ -555,7 +707,7 @@ new_segment:
tcp_mark_push(tp, skb);
goto new_segment;
}
- if (!sk_stream_wmem_schedule(sk, copy))
+ if (!sk_wmem_schedule(sk, copy))
goto wait_for_memory;
if (can_coalesce) {
@@ -569,7 +721,7 @@ new_segment:
skb->data_len += copy;
skb->truesize += copy;
sk->sk_wmem_queued += copy;
- sk->sk_forward_alloc -= copy;
+ sk_mem_charge(sk, copy);
skb->ip_summed = CHECKSUM_PARTIAL;
tp->write_seq += copy;
TCP_SKB_CB(skb)->end_seq += copy;
@@ -718,8 +870,8 @@ new_segment:
if (!sk_stream_memory_free(sk))
goto wait_for_sndbuf;
- skb = sk_stream_alloc_pskb(sk, select_size(sk),
- 0, sk->sk_allocation);
+ skb = sk_stream_alloc_skb(sk, select_size(sk),
+ sk->sk_allocation);
if (!skb)
goto wait_for_memory;
@@ -776,7 +928,7 @@ new_segment:
if (copy > PAGE_SIZE - off)
copy = PAGE_SIZE - off;
- if (!sk_stream_wmem_schedule(sk, copy))
+ if (!sk_wmem_schedule(sk, copy))
goto wait_for_memory;
if (!page) {
@@ -867,7 +1019,7 @@ do_fault:
* reset, where we can be unlinking the send_head.
*/
tcp_check_send_head(sk, skb);
- sk_stream_free_skb(sk, skb);
+ sk_wmem_free_skb(sk, skb);
}
do_error:
@@ -1500,6 +1652,41 @@ recv_urg:
goto out;
}
+void tcp_set_state(struct sock *sk, int state)
+{
+ int oldstate = sk->sk_state;
+
+ switch (state) {
+ case TCP_ESTABLISHED:
+ if (oldstate != TCP_ESTABLISHED)
+ TCP_INC_STATS(TCP_MIB_CURRESTAB);
+ break;
+
+ case TCP_CLOSE:
+ if (oldstate == TCP_CLOSE_WAIT || oldstate == TCP_ESTABLISHED)
+ TCP_INC_STATS(TCP_MIB_ESTABRESETS);
+
+ sk->sk_prot->unhash(sk);
+ if (inet_csk(sk)->icsk_bind_hash &&
+ !(sk->sk_userlocks & SOCK_BINDPORT_LOCK))
+ inet_put_port(&tcp_hashinfo, sk);
+ /* fall through */
+ default:
+ if (oldstate==TCP_ESTABLISHED)
+ TCP_DEC_STATS(TCP_MIB_CURRESTAB);
+ }
+
+ /* Change state AFTER socket is unhashed to avoid closed
+ * socket sitting in hash tables.
+ */
+ sk->sk_state = state;
+
+#ifdef STATE_TRACE
+ SOCK_DEBUG(sk, "TCP sk=%p, State %s -> %s\n",sk, statename[oldstate],statename[state]);
+#endif
+}
+EXPORT_SYMBOL_GPL(tcp_set_state);
+
/*
* State processing on a close. This implements the state shift for
* sending our FIN frame. Note that we only send a FIN for some
@@ -1586,7 +1773,7 @@ void tcp_close(struct sock *sk, long timeout)
__kfree_skb(skb);
}
- sk_stream_mem_reclaim(sk);
+ sk_mem_reclaim(sk);
/* As outlined in RFC 2525, section 2.17, we send a RST here because
* data was lost. To witness the awful effects of the old behavior of
@@ -1689,7 +1876,7 @@ adjudge_to_death:
}
}
if (sk->sk_state != TCP_CLOSE) {
- sk_stream_mem_reclaim(sk);
+ sk_mem_reclaim(sk);
if (tcp_too_many_orphans(sk,
atomic_read(sk->sk_prot->orphan_count))) {
if (net_ratelimit())
@@ -2411,7 +2598,6 @@ void tcp_done(struct sock *sk)
}
EXPORT_SYMBOL_GPL(tcp_done);
-extern void __skb_cb_too_small_for_tcp(int, int);
extern struct tcp_congestion_ops tcp_reno;
static __initdata unsigned long thash_entries;
@@ -2430,9 +2616,7 @@ void __init tcp_init(void)
unsigned long limit;
int order, i, max_share;
- if (sizeof(struct tcp_skb_cb) > sizeof(skb->cb))
- __skb_cb_too_small_for_tcp(sizeof(struct tcp_skb_cb),
- sizeof(skb->cb));
+ BUILD_BUG_ON(sizeof(struct tcp_skb_cb) > sizeof(skb->cb));
tcp_hashinfo.bind_bucket_cachep =
kmem_cache_create("tcp_bind_bucket",
@@ -2509,11 +2693,11 @@ void __init tcp_init(void)
limit = ((unsigned long)sysctl_tcp_mem[1]) << (PAGE_SHIFT - 7);
max_share = min(4UL*1024*1024, limit);
- sysctl_tcp_wmem[0] = SK_STREAM_MEM_QUANTUM;
+ sysctl_tcp_wmem[0] = SK_MEM_QUANTUM;
sysctl_tcp_wmem[1] = 16*1024;
sysctl_tcp_wmem[2] = max(64*1024, max_share);
- sysctl_tcp_rmem[0] = SK_STREAM_MEM_QUANTUM;
+ sysctl_tcp_rmem[0] = SK_MEM_QUANTUM;
sysctl_tcp_rmem[1] = 87380;
sysctl_tcp_rmem[2] = max(87380, max_share);
@@ -2532,6 +2716,7 @@ EXPORT_SYMBOL(tcp_poll);
EXPORT_SYMBOL(tcp_read_sock);
EXPORT_SYMBOL(tcp_recvmsg);
EXPORT_SYMBOL(tcp_sendmsg);
+EXPORT_SYMBOL(tcp_splice_read);
EXPORT_SYMBOL(tcp_sendpage);
EXPORT_SYMBOL(tcp_setsockopt);
EXPORT_SYMBOL(tcp_shutdown);
diff --git a/net/ipv4/tcp_bic.c b/net/ipv4/tcp_bic.c
index 5dba0fc8f579..5212ed9b0c98 100644
--- a/net/ipv4/tcp_bic.c
+++ b/net/ipv4/tcp_bic.c
@@ -136,8 +136,7 @@ static inline void bictcp_update(struct bictcp *ca, u32 cwnd)
ca->cnt = 1;
}
-static void bictcp_cong_avoid(struct sock *sk, u32 ack,
- u32 in_flight, int data_acked)
+static void bictcp_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
{
struct tcp_sock *tp = tcp_sk(sk);
struct bictcp *ca = inet_csk_ca(sk);
diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c
index 55fca1820c34..3a6be23d222f 100644
--- a/net/ipv4/tcp_cong.c
+++ b/net/ipv4/tcp_cong.c
@@ -274,6 +274,27 @@ int tcp_set_congestion_control(struct sock *sk, const char *name)
return err;
}
+/* RFC2861 Check whether we are limited by application or congestion window
+ * This is the inverse of cwnd check in tcp_tso_should_defer
+ */
+int tcp_is_cwnd_limited(const struct sock *sk, u32 in_flight)
+{
+ const struct tcp_sock *tp = tcp_sk(sk);
+ u32 left;
+
+ if (in_flight >= tp->snd_cwnd)
+ return 1;
+
+ if (!sk_can_gso(sk))
+ return 0;
+
+ left = tp->snd_cwnd - in_flight;
+ if (sysctl_tcp_tso_win_divisor)
+ return left * sysctl_tcp_tso_win_divisor < tp->snd_cwnd;
+ else
+ return left <= tcp_max_burst(tp);
+}
+EXPORT_SYMBOL_GPL(tcp_is_cwnd_limited);
/*
* Slow start is used when congestion window is less than slow start
@@ -324,7 +345,7 @@ EXPORT_SYMBOL_GPL(tcp_slow_start);
/* This is Jacobson's slow start and congestion avoidance.
* SIGCOMM '88, p. 328.
*/
-void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 in_flight, int flag)
+void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
{
struct tcp_sock *tp = tcp_sk(sk);
diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c
index 80bd084a9f91..3aa0b23c1ea0 100644
--- a/net/ipv4/tcp_cubic.c
+++ b/net/ipv4/tcp_cubic.c
@@ -246,8 +246,7 @@ static inline void bictcp_update(struct bictcp *ca, u32 cwnd)
ca->cnt = 1;
}
-static void bictcp_cong_avoid(struct sock *sk, u32 ack,
- u32 in_flight, int data_acked)
+static void bictcp_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
{
struct tcp_sock *tp = tcp_sk(sk);
struct bictcp *ca = inet_csk_ca(sk);
diff --git a/net/ipv4/tcp_highspeed.c b/net/ipv4/tcp_highspeed.c
index 14a073d8b60f..8b6caaf75bb9 100644
--- a/net/ipv4/tcp_highspeed.c
+++ b/net/ipv4/tcp_highspeed.c
@@ -109,8 +109,7 @@ static void hstcp_init(struct sock *sk)
tp->snd_cwnd_clamp = min_t(u32, tp->snd_cwnd_clamp, 0xffffffff/128);
}
-static void hstcp_cong_avoid(struct sock *sk, u32 adk,
- u32 in_flight, int data_acked)
+static void hstcp_cong_avoid(struct sock *sk, u32 adk, u32 in_flight)
{
struct tcp_sock *tp = tcp_sk(sk);
struct hstcp *ca = inet_csk_ca(sk);
diff --git a/net/ipv4/tcp_htcp.c b/net/ipv4/tcp_htcp.c
index 5215691f2760..af99776146ff 100644
--- a/net/ipv4/tcp_htcp.c
+++ b/net/ipv4/tcp_htcp.c
@@ -225,8 +225,7 @@ static u32 htcp_recalc_ssthresh(struct sock *sk)
return max((tp->snd_cwnd * ca->beta) >> 7, 2U);
}
-static void htcp_cong_avoid(struct sock *sk, u32 ack,
- u32 in_flight, int data_acked)
+static void htcp_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
{
struct tcp_sock *tp = tcp_sk(sk);
struct htcp *ca = inet_csk_ca(sk);
diff --git a/net/ipv4/tcp_hybla.c b/net/ipv4/tcp_hybla.c
index b3e55cf56171..44618b675916 100644
--- a/net/ipv4/tcp_hybla.c
+++ b/net/ipv4/tcp_hybla.c
@@ -85,8 +85,7 @@ static inline u32 hybla_fraction(u32 odds)
* o Give cwnd a new value based on the model proposed
* o remember increments <1
*/
-static void hybla_cong_avoid(struct sock *sk, u32 ack,
- u32 in_flight, int flag)
+static void hybla_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
{
struct tcp_sock *tp = tcp_sk(sk);
struct hybla *ca = inet_csk_ca(sk);
@@ -103,7 +102,7 @@ static void hybla_cong_avoid(struct sock *sk, u32 ack,
return;
if (!ca->hybla_en)
- return tcp_reno_cong_avoid(sk, ack, in_flight, flag);
+ return tcp_reno_cong_avoid(sk, ack, in_flight);
if (ca->rho == 0)
hybla_recalc_param(sk);
diff --git a/net/ipv4/tcp_illinois.c b/net/ipv4/tcp_illinois.c
index 5aa5f5496d6d..1eba160b72dc 100644
--- a/net/ipv4/tcp_illinois.c
+++ b/net/ipv4/tcp_illinois.c
@@ -256,8 +256,7 @@ static void tcp_illinois_state(struct sock *sk, u8 new_state)
/*
* Increase window in response to successful acknowledgment.
*/
-static void tcp_illinois_cong_avoid(struct sock *sk, u32 ack,
- u32 in_flight, int flag)
+static void tcp_illinois_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
{
struct tcp_sock *tp = tcp_sk(sk);
struct illinois *ca = inet_csk_ca(sk);
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index b39f0d86e44c..fa2c85ca5bc3 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -105,6 +105,7 @@ int sysctl_tcp_abc __read_mostly;
#define FLAG_SND_UNA_ADVANCED 0x400 /* Snd_una was changed (!= FLAG_DATA_ACKED) */
#define FLAG_DSACKING_ACK 0x800 /* SACK blocks contained D-SACK info */
#define FLAG_NONHEAD_RETRANS_ACKED 0x1000 /* Non-head rexmitted data was ACKed */
+#define FLAG_SACK_RENEGING 0x2000 /* snd_una advanced to a sacked seq */
#define FLAG_ACKED (FLAG_DATA_ACKED|FLAG_SYN_ACKED)
#define FLAG_NOT_DUP (FLAG_DATA|FLAG_WIN_UPDATE|FLAG_ACKED)
@@ -120,8 +121,7 @@ int sysctl_tcp_abc __read_mostly;
/* Adapt the MSS value used to make delayed ack decision to the
* real world.
*/
-static void tcp_measure_rcv_mss(struct sock *sk,
- const struct sk_buff *skb)
+static void tcp_measure_rcv_mss(struct sock *sk, const struct sk_buff *skb)
{
struct inet_connection_sock *icsk = inet_csk(sk);
const unsigned int lss = icsk->icsk_ack.last_seg_size;
@@ -132,7 +132,7 @@ static void tcp_measure_rcv_mss(struct sock *sk,
/* skb->len may jitter because of SACKs, even if peer
* sends good full-sized frames.
*/
- len = skb_shinfo(skb)->gso_size ?: skb->len;
+ len = skb_shinfo(skb)->gso_size ? : skb->len;
if (len >= icsk->icsk_ack.rcv_mss) {
icsk->icsk_ack.rcv_mss = len;
} else {
@@ -172,8 +172,8 @@ static void tcp_incr_quickack(struct sock *sk)
struct inet_connection_sock *icsk = inet_csk(sk);
unsigned quickacks = tcp_sk(sk)->rcv_wnd / (2 * icsk->icsk_ack.rcv_mss);
- if (quickacks==0)
- quickacks=2;
+ if (quickacks == 0)
+ quickacks = 2;
if (quickacks > icsk->icsk_ack.quick)
icsk->icsk_ack.quick = min(quickacks, TCP_MAX_QUICKACKS);
}
@@ -198,7 +198,7 @@ static inline int tcp_in_quickack_mode(const struct sock *sk)
static inline void TCP_ECN_queue_cwr(struct tcp_sock *tp)
{
- if (tp->ecn_flags&TCP_ECN_OK)
+ if (tp->ecn_flags & TCP_ECN_OK)
tp->ecn_flags |= TCP_ECN_QUEUE_CWR;
}
@@ -215,7 +215,7 @@ static inline void TCP_ECN_withdraw_cwr(struct tcp_sock *tp)
static inline void TCP_ECN_check_ce(struct tcp_sock *tp, struct sk_buff *skb)
{
- if (tp->ecn_flags&TCP_ECN_OK) {
+ if (tp->ecn_flags & TCP_ECN_OK) {
if (INET_ECN_is_ce(TCP_SKB_CB(skb)->flags))
tp->ecn_flags |= TCP_ECN_DEMAND_CWR;
/* Funny extension: if ECT is not set on a segment,
@@ -228,19 +228,19 @@ static inline void TCP_ECN_check_ce(struct tcp_sock *tp, struct sk_buff *skb)
static inline void TCP_ECN_rcv_synack(struct tcp_sock *tp, struct tcphdr *th)
{
- if ((tp->ecn_flags&TCP_ECN_OK) && (!th->ece || th->cwr))
+ if ((tp->ecn_flags & TCP_ECN_OK) && (!th->ece || th->cwr))
tp->ecn_flags &= ~TCP_ECN_OK;
}
static inline void TCP_ECN_rcv_syn(struct tcp_sock *tp, struct tcphdr *th)
{
- if ((tp->ecn_flags&TCP_ECN_OK) && (!th->ece || !th->cwr))
+ if ((tp->ecn_flags & TCP_ECN_OK) && (!th->ece || !th->cwr))
tp->ecn_flags &= ~TCP_ECN_OK;
}
static inline int TCP_ECN_rcv_ecn_echo(struct tcp_sock *tp, struct tcphdr *th)
{
- if (th->ece && !th->syn && (tp->ecn_flags&TCP_ECN_OK))
+ if (th->ece && !th->syn && (tp->ecn_flags & TCP_ECN_OK))
return 1;
return 0;
}
@@ -289,8 +289,8 @@ static int __tcp_grow_window(const struct sock *sk, const struct sk_buff *skb)
{
struct tcp_sock *tp = tcp_sk(sk);
/* Optimize this! */
- int truesize = tcp_win_from_space(skb->truesize)/2;
- int window = tcp_win_from_space(sysctl_tcp_rmem[2])/2;
+ int truesize = tcp_win_from_space(skb->truesize) >> 1;
+ int window = tcp_win_from_space(sysctl_tcp_rmem[2]) >> 1;
while (tp->rcv_ssthresh <= window) {
if (truesize <= skb->len)
@@ -302,8 +302,7 @@ static int __tcp_grow_window(const struct sock *sk, const struct sk_buff *skb)
return 0;
}
-static void tcp_grow_window(struct sock *sk,
- struct sk_buff *skb)
+static void tcp_grow_window(struct sock *sk, struct sk_buff *skb)
{
struct tcp_sock *tp = tcp_sk(sk);
@@ -317,12 +316,13 @@ static void tcp_grow_window(struct sock *sk,
* will fit to rcvbuf in future.
*/
if (tcp_win_from_space(skb->truesize) <= skb->len)
- incr = 2*tp->advmss;
+ incr = 2 * tp->advmss;
else
incr = __tcp_grow_window(sk, skb);
if (incr) {
- tp->rcv_ssthresh = min(tp->rcv_ssthresh + incr, tp->window_clamp);
+ tp->rcv_ssthresh = min(tp->rcv_ssthresh + incr,
+ tp->window_clamp);
inet_csk(sk)->icsk_ack.quick |= 1;
}
}
@@ -397,10 +397,9 @@ static void tcp_clamp_window(struct sock *sk)
sysctl_tcp_rmem[2]);
}
if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf)
- tp->rcv_ssthresh = min(tp->window_clamp, 2U*tp->advmss);
+ tp->rcv_ssthresh = min(tp->window_clamp, 2U * tp->advmss);
}
-
/* Initialize RCV_MSS value.
* RCV_MSS is an our guess about MSS used by the peer.
* We haven't any direct information about the MSS.
@@ -413,7 +412,7 @@ void tcp_initialize_rcv_mss(struct sock *sk)
struct tcp_sock *tp = tcp_sk(sk);
unsigned int hint = min_t(unsigned int, tp->advmss, tp->mss_cache);
- hint = min(hint, tp->rcv_wnd/2);
+ hint = min(hint, tp->rcv_wnd / 2);
hint = min(hint, TCP_MIN_RCVMSS);
hint = max(hint, TCP_MIN_MSS);
@@ -470,16 +469,15 @@ static inline void tcp_rcv_rtt_measure(struct tcp_sock *tp)
goto new_measure;
if (before(tp->rcv_nxt, tp->rcv_rtt_est.seq))
return;
- tcp_rcv_rtt_update(tp,
- jiffies - tp->rcv_rtt_est.time,
- 1);
+ tcp_rcv_rtt_update(tp, jiffies - tp->rcv_rtt_est.time, 1);
new_measure:
tp->rcv_rtt_est.seq = tp->rcv_nxt + tp->rcv_wnd;
tp->rcv_rtt_est.time = tcp_time_stamp;
}
-static inline void tcp_rcv_rtt_measure_ts(struct sock *sk, const struct sk_buff *skb)
+static inline void tcp_rcv_rtt_measure_ts(struct sock *sk,
+ const struct sk_buff *skb)
{
struct tcp_sock *tp = tcp_sk(sk);
if (tp->rx_opt.rcv_tsecr &&
@@ -502,8 +500,7 @@ void tcp_rcv_space_adjust(struct sock *sk)
goto new_measure;
time = tcp_time_stamp - tp->rcvq_space.time;
- if (time < (tp->rcv_rtt_est.rtt >> 3) ||
- tp->rcv_rtt_est.rtt == 0)
+ if (time < (tp->rcv_rtt_est.rtt >> 3) || tp->rcv_rtt_est.rtt == 0)
return;
space = 2 * (tp->copied_seq - tp->rcvq_space.seq);
@@ -579,7 +576,7 @@ static void tcp_event_data_recv(struct sock *sk, struct sk_buff *skb)
} else {
int m = now - icsk->icsk_ack.lrcvtime;
- if (m <= TCP_ATO_MIN/2) {
+ if (m <= TCP_ATO_MIN / 2) {
/* The fastest case is the first. */
icsk->icsk_ack.ato = (icsk->icsk_ack.ato >> 1) + TCP_ATO_MIN / 2;
} else if (m < icsk->icsk_ack.ato) {
@@ -591,7 +588,7 @@ static void tcp_event_data_recv(struct sock *sk, struct sk_buff *skb)
* restart window, so that we send ACKs quickly.
*/
tcp_incr_quickack(sk);
- sk_stream_mem_reclaim(sk);
+ sk_mem_reclaim(sk);
}
}
icsk->icsk_ack.lrcvtime = now;
@@ -608,7 +605,7 @@ static u32 tcp_rto_min(struct sock *sk)
u32 rto_min = TCP_RTO_MIN;
if (dst && dst_metric_locked(dst, RTAX_RTO_MIN))
- rto_min = dst->metrics[RTAX_RTO_MIN-1];
+ rto_min = dst->metrics[RTAX_RTO_MIN - 1];
return rto_min;
}
@@ -671,14 +668,14 @@ static void tcp_rtt_estimator(struct sock *sk, const __u32 mrtt)
}
if (after(tp->snd_una, tp->rtt_seq)) {
if (tp->mdev_max < tp->rttvar)
- tp->rttvar -= (tp->rttvar-tp->mdev_max)>>2;
+ tp->rttvar -= (tp->rttvar - tp->mdev_max) >> 2;
tp->rtt_seq = tp->snd_nxt;
tp->mdev_max = tcp_rto_min(sk);
}
} else {
/* no previous measure. */
- tp->srtt = m<<3; /* take the measured time to be rtt */
- tp->mdev = m<<1; /* make sure rto = 3*rtt */
+ tp->srtt = m << 3; /* take the measured time to be rtt */
+ tp->mdev = m << 1; /* make sure rto = 3*rtt */
tp->mdev_max = tp->rttvar = max(tp->mdev, tcp_rto_min(sk));
tp->rtt_seq = tp->snd_nxt;
}
@@ -732,7 +729,7 @@ void tcp_update_metrics(struct sock *sk)
dst_confirm(dst);
- if (dst && (dst->flags&DST_HOST)) {
+ if (dst && (dst->flags & DST_HOST)) {
const struct inet_connection_sock *icsk = inet_csk(sk);
int m;
@@ -742,7 +739,7 @@ void tcp_update_metrics(struct sock *sk)
* Reset our results.
*/
if (!(dst_metric_locked(dst, RTAX_RTT)))
- dst->metrics[RTAX_RTT-1] = 0;
+ dst->metrics[RTAX_RTT - 1] = 0;
return;
}
@@ -754,9 +751,9 @@ void tcp_update_metrics(struct sock *sk)
*/
if (!(dst_metric_locked(dst, RTAX_RTT))) {
if (m <= 0)
- dst->metrics[RTAX_RTT-1] = tp->srtt;
+ dst->metrics[RTAX_RTT - 1] = tp->srtt;
else
- dst->metrics[RTAX_RTT-1] -= (m>>3);
+ dst->metrics[RTAX_RTT - 1] -= (m >> 3);
}
if (!(dst_metric_locked(dst, RTAX_RTTVAR))) {
@@ -769,7 +766,7 @@ void tcp_update_metrics(struct sock *sk)
m = tp->mdev;
if (m >= dst_metric(dst, RTAX_RTTVAR))
- dst->metrics[RTAX_RTTVAR-1] = m;
+ dst->metrics[RTAX_RTTVAR - 1] = m;
else
dst->metrics[RTAX_RTTVAR-1] -=
(dst->metrics[RTAX_RTTVAR-1] - m)>>2;
@@ -783,7 +780,7 @@ void tcp_update_metrics(struct sock *sk)
dst->metrics[RTAX_SSTHRESH-1] = tp->snd_cwnd >> 1;
if (!dst_metric_locked(dst, RTAX_CWND) &&
tp->snd_cwnd > dst_metric(dst, RTAX_CWND))
- dst->metrics[RTAX_CWND-1] = tp->snd_cwnd;
+ dst->metrics[RTAX_CWND - 1] = tp->snd_cwnd;
} else if (tp->snd_cwnd > tp->snd_ssthresh &&
icsk->icsk_ca_state == TCP_CA_Open) {
/* Cong. avoidance phase, cwnd is reliable. */
@@ -863,6 +860,9 @@ void tcp_enter_cwr(struct sock *sk, const int set_ssthresh)
*/
static void tcp_disable_fack(struct tcp_sock *tp)
{
+ /* RFC3517 uses different metric in lost marker => reset on change */
+ if (tcp_is_fack(tp))
+ tp->lost_skb_hint = NULL;
tp->rx_opt.sack_ok &= ~2;
}
@@ -1112,16 +1112,22 @@ static int tcp_is_sackblock_valid(struct tcp_sock *tp, int is_dsack,
*
* Search retransmitted skbs from write_queue that were sent when snd_nxt was
* less than what is now known to be received by the other end (derived from
- * SACK blocks by the caller). Also calculate the lowest snd_nxt among the
- * remaining retransmitted skbs to avoid some costly processing per ACKs.
+ * highest SACK block). Also calculate the lowest snd_nxt among the remaining
+ * retransmitted skbs to avoid some costly processing per ACKs.
*/
-static int tcp_mark_lost_retrans(struct sock *sk, u32 received_upto)
+static void tcp_mark_lost_retrans(struct sock *sk)
{
+ const struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *skb;
- int flag = 0;
int cnt = 0;
u32 new_low_seq = tp->snd_nxt;
+ u32 received_upto = tcp_highest_sack_seq(tp);
+
+ if (!tcp_is_fack(tp) || !tp->retrans_out ||
+ !after(received_upto, tp->lost_retrans_low) ||
+ icsk->icsk_ca_state != TCP_CA_Recovery)
+ return;
tcp_for_write_queue(skb, sk) {
u32 ack_seq = TCP_SKB_CB(skb)->ack_seq;
@@ -1149,9 +1155,8 @@ static int tcp_mark_lost_retrans(struct sock *sk, u32 received_upto)
if (!(TCP_SKB_CB(skb)->sacked & (TCPCB_LOST|TCPCB_SACKED_ACKED))) {
tp->lost_out += tcp_skb_pcount(skb);
TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
- flag |= FLAG_DATA_SACKED;
- NET_INC_STATS_BH(LINUX_MIB_TCPLOSTRETRANSMIT);
}
+ NET_INC_STATS_BH(LINUX_MIB_TCPLOSTRETRANSMIT);
} else {
if (before(ack_seq, new_low_seq))
new_low_seq = ack_seq;
@@ -1161,8 +1166,6 @@ static int tcp_mark_lost_retrans(struct sock *sk, u32 received_upto)
if (tp->retrans_out)
tp->lost_retrans_low = new_low_seq;
-
- return flag;
}
static int tcp_check_dsack(struct tcp_sock *tp, struct sk_buff *ack_skb,
@@ -1230,34 +1233,205 @@ static int tcp_match_skb_to_sack(struct sock *sk, struct sk_buff *skb,
return in_sack;
}
+static int tcp_sacktag_one(struct sk_buff *skb, struct sock *sk,
+ int *reord, int dup_sack, int fack_count)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ u8 sacked = TCP_SKB_CB(skb)->sacked;
+ int flag = 0;
+
+ /* Account D-SACK for retransmitted packet. */
+ if (dup_sack && (sacked & TCPCB_RETRANS)) {
+ if (after(TCP_SKB_CB(skb)->end_seq, tp->undo_marker))
+ tp->undo_retrans--;
+ if (sacked & TCPCB_SACKED_ACKED)
+ *reord = min(fack_count, *reord);
+ }
+
+ /* Nothing to do; acked frame is about to be dropped (was ACKed). */
+ if (!after(TCP_SKB_CB(skb)->end_seq, tp->snd_una))
+ return flag;
+
+ if (!(sacked & TCPCB_SACKED_ACKED)) {
+ if (sacked & TCPCB_SACKED_RETRANS) {
+ /* If the segment is not tagged as lost,
+ * we do not clear RETRANS, believing
+ * that retransmission is still in flight.
+ */
+ if (sacked & TCPCB_LOST) {
+ TCP_SKB_CB(skb)->sacked &=
+ ~(TCPCB_LOST|TCPCB_SACKED_RETRANS);
+ tp->lost_out -= tcp_skb_pcount(skb);
+ tp->retrans_out -= tcp_skb_pcount(skb);
+
+ /* clear lost hint */
+ tp->retransmit_skb_hint = NULL;
+ }
+ } else {
+ if (!(sacked & TCPCB_RETRANS)) {
+ /* New sack for not retransmitted frame,
+ * which was in hole. It is reordering.
+ */
+ if (before(TCP_SKB_CB(skb)->seq,
+ tcp_highest_sack_seq(tp)))
+ *reord = min(fack_count, *reord);
+
+ /* SACK enhanced F-RTO (RFC4138; Appendix B) */
+ if (!after(TCP_SKB_CB(skb)->end_seq, tp->frto_highmark))
+ flag |= FLAG_ONLY_ORIG_SACKED;
+ }
+
+ if (sacked & TCPCB_LOST) {
+ TCP_SKB_CB(skb)->sacked &= ~TCPCB_LOST;
+ tp->lost_out -= tcp_skb_pcount(skb);
+
+ /* clear lost hint */
+ tp->retransmit_skb_hint = NULL;
+ }
+ }
+
+ TCP_SKB_CB(skb)->sacked |= TCPCB_SACKED_ACKED;
+ flag |= FLAG_DATA_SACKED;
+ tp->sacked_out += tcp_skb_pcount(skb);
+
+ fack_count += tcp_skb_pcount(skb);
+
+ /* Lost marker hint past SACKed? Tweak RFC3517 cnt */
+ if (!tcp_is_fack(tp) && (tp->lost_skb_hint != NULL) &&
+ before(TCP_SKB_CB(skb)->seq,
+ TCP_SKB_CB(tp->lost_skb_hint)->seq))
+ tp->lost_cnt_hint += tcp_skb_pcount(skb);
+
+ if (fack_count > tp->fackets_out)
+ tp->fackets_out = fack_count;
+
+ if (!before(TCP_SKB_CB(skb)->seq, tcp_highest_sack_seq(tp)))
+ tcp_advance_highest_sack(sk, skb);
+ }
+
+ /* D-SACK. We can detect redundant retransmission in S|R and plain R
+ * frames and clear it. undo_retrans is decreased above, L|R frames
+ * are accounted above as well.
+ */
+ if (dup_sack && (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS)) {
+ TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS;
+ tp->retrans_out -= tcp_skb_pcount(skb);
+ tp->retransmit_skb_hint = NULL;
+ }
+
+ return flag;
+}
+
+static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk,
+ struct tcp_sack_block *next_dup,
+ u32 start_seq, u32 end_seq,
+ int dup_sack_in, int *fack_count,
+ int *reord, int *flag)
+{
+ tcp_for_write_queue_from(skb, sk) {
+ int in_sack = 0;
+ int dup_sack = dup_sack_in;
+
+ if (skb == tcp_send_head(sk))
+ break;
+
+ /* queue is in-order => we can short-circuit the walk early */
+ if (!before(TCP_SKB_CB(skb)->seq, end_seq))
+ break;
+
+ if ((next_dup != NULL) &&
+ before(TCP_SKB_CB(skb)->seq, next_dup->end_seq)) {
+ in_sack = tcp_match_skb_to_sack(sk, skb,
+ next_dup->start_seq,
+ next_dup->end_seq);
+ if (in_sack > 0)
+ dup_sack = 1;
+ }
+
+ if (in_sack <= 0)
+ in_sack = tcp_match_skb_to_sack(sk, skb, start_seq,
+ end_seq);
+ if (unlikely(in_sack < 0))
+ break;
+
+ if (in_sack)
+ *flag |= tcp_sacktag_one(skb, sk, reord, dup_sack,
+ *fack_count);
+
+ *fack_count += tcp_skb_pcount(skb);
+ }
+ return skb;
+}
+
+/* Avoid all extra work that is being done by sacktag while walking in
+ * a normal way
+ */
+static struct sk_buff *tcp_sacktag_skip(struct sk_buff *skb, struct sock *sk,
+ u32 skip_to_seq)
+{
+ tcp_for_write_queue_from(skb, sk) {
+ if (skb == tcp_send_head(sk))
+ break;
+
+ if (!before(TCP_SKB_CB(skb)->end_seq, skip_to_seq))
+ break;
+ }
+ return skb;
+}
+
+static struct sk_buff *tcp_maybe_skipping_dsack(struct sk_buff *skb,
+ struct sock *sk,
+ struct tcp_sack_block *next_dup,
+ u32 skip_to_seq,
+ int *fack_count, int *reord,
+ int *flag)
+{
+ if (next_dup == NULL)
+ return skb;
+
+ if (before(next_dup->start_seq, skip_to_seq)) {
+ skb = tcp_sacktag_skip(skb, sk, next_dup->start_seq);
+ tcp_sacktag_walk(skb, sk, NULL,
+ next_dup->start_seq, next_dup->end_seq,
+ 1, fack_count, reord, flag);
+ }
+
+ return skb;
+}
+
+static int tcp_sack_cache_ok(struct tcp_sock *tp, struct tcp_sack_block *cache)
+{
+ return cache < tp->recv_sack_cache + ARRAY_SIZE(tp->recv_sack_cache);
+}
+
static int
-tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_una)
+tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb,
+ u32 prior_snd_una)
{
const struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
unsigned char *ptr = (skb_transport_header(ack_skb) +
TCP_SKB_CB(ack_skb)->sacked);
- struct tcp_sack_block_wire *sp = (struct tcp_sack_block_wire *)(ptr+2);
- struct sk_buff *cached_skb;
- int num_sacks = (ptr[1] - TCPOLEN_SACK_BASE)>>3;
+ struct tcp_sack_block_wire *sp_wire = (struct tcp_sack_block_wire *)(ptr+2);
+ struct tcp_sack_block sp[4];
+ struct tcp_sack_block *cache;
+ struct sk_buff *skb;
+ int num_sacks = (ptr[1] - TCPOLEN_SACK_BASE) >> 3;
+ int used_sacks;
int reord = tp->packets_out;
- int prior_fackets;
- u32 highest_sack_end_seq = tp->lost_retrans_low;
int flag = 0;
int found_dup_sack = 0;
- int cached_fack_count;
- int i;
+ int fack_count;
+ int i, j;
int first_sack_index;
- int force_one_sack;
if (!tp->sacked_out) {
if (WARN_ON(tp->fackets_out))
tp->fackets_out = 0;
- tp->highest_sack = tp->snd_una;
+ tcp_highest_sack_reset(sk);
}
- prior_fackets = tp->fackets_out;
- found_dup_sack = tcp_check_dsack(tp, ack_skb, sp,
+ found_dup_sack = tcp_check_dsack(tp, ack_skb, sp_wire,
num_sacks, prior_snd_una);
if (found_dup_sack)
flag |= FLAG_DSACKING_ACK;
@@ -1272,78 +1446,17 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_
if (!tp->packets_out)
goto out;
- /* SACK fastpath:
- * if the only SACK change is the increase of the end_seq of
- * the first block then only apply that SACK block
- * and use retrans queue hinting otherwise slowpath */
- force_one_sack = 1;
- for (i = 0; i < num_sacks; i++) {
- __be32 start_seq = sp[i].start_seq;
- __be32 end_seq = sp[i].end_seq;
-
- if (i == 0) {
- if (tp->recv_sack_cache[i].start_seq != start_seq)
- force_one_sack = 0;
- } else {
- if ((tp->recv_sack_cache[i].start_seq != start_seq) ||
- (tp->recv_sack_cache[i].end_seq != end_seq))
- force_one_sack = 0;
- }
- tp->recv_sack_cache[i].start_seq = start_seq;
- tp->recv_sack_cache[i].end_seq = end_seq;
- }
- /* Clear the rest of the cache sack blocks so they won't match mistakenly. */
- for (; i < ARRAY_SIZE(tp->recv_sack_cache); i++) {
- tp->recv_sack_cache[i].start_seq = 0;
- tp->recv_sack_cache[i].end_seq = 0;
- }
-
+ used_sacks = 0;
first_sack_index = 0;
- if (force_one_sack)
- num_sacks = 1;
- else {
- int j;
- tp->fastpath_skb_hint = NULL;
-
- /* order SACK blocks to allow in order walk of the retrans queue */
- for (i = num_sacks-1; i > 0; i--) {
- for (j = 0; j < i; j++){
- if (after(ntohl(sp[j].start_seq),
- ntohl(sp[j+1].start_seq))){
- struct tcp_sack_block_wire tmp;
-
- tmp = sp[j];
- sp[j] = sp[j+1];
- sp[j+1] = tmp;
-
- /* Track where the first SACK block goes to */
- if (j == first_sack_index)
- first_sack_index = j+1;
- }
-
- }
- }
- }
-
- /* Use SACK fastpath hint if valid */
- cached_skb = tp->fastpath_skb_hint;
- cached_fack_count = tp->fastpath_cnt_hint;
- if (!cached_skb) {
- cached_skb = tcp_write_queue_head(sk);
- cached_fack_count = 0;
- }
-
for (i = 0; i < num_sacks; i++) {
- struct sk_buff *skb;
- __u32 start_seq = ntohl(sp->start_seq);
- __u32 end_seq = ntohl(sp->end_seq);
- int fack_count;
- int dup_sack = (found_dup_sack && (i == first_sack_index));
- int next_dup = (found_dup_sack && (i+1 == first_sack_index));
+ int dup_sack = !i && found_dup_sack;
- sp++;
+ sp[used_sacks].start_seq = ntohl(get_unaligned(&sp_wire[i].start_seq));
+ sp[used_sacks].end_seq = ntohl(get_unaligned(&sp_wire[i].end_seq));
- if (!tcp_is_sackblock_valid(tp, dup_sack, start_seq, end_seq)) {
+ if (!tcp_is_sackblock_valid(tp, dup_sack,
+ sp[used_sacks].start_seq,
+ sp[used_sacks].end_seq)) {
if (dup_sack) {
if (!tp->undo_marker)
NET_INC_STATS_BH(LINUX_MIB_TCPDSACKIGNOREDNOUNDO);
@@ -1352,169 +1465,148 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_
} else {
/* Don't count olds caused by ACK reordering */
if ((TCP_SKB_CB(ack_skb)->ack_seq != tp->snd_una) &&
- !after(end_seq, tp->snd_una))
+ !after(sp[used_sacks].end_seq, tp->snd_una))
continue;
NET_INC_STATS_BH(LINUX_MIB_TCPSACKDISCARD);
}
+ if (i == 0)
+ first_sack_index = -1;
continue;
}
- skb = cached_skb;
- fack_count = cached_fack_count;
-
- /* Event "B" in the comment above. */
- if (after(end_seq, tp->high_seq))
- flag |= FLAG_DATA_LOST;
-
- tcp_for_write_queue_from(skb, sk) {
- int in_sack = 0;
- u8 sacked;
-
- if (skb == tcp_send_head(sk))
- break;
-
- cached_skb = skb;
- cached_fack_count = fack_count;
- if (i == first_sack_index) {
- tp->fastpath_skb_hint = skb;
- tp->fastpath_cnt_hint = fack_count;
- }
+ /* Ignore very old stuff early */
+ if (!after(sp[used_sacks].end_seq, prior_snd_una))
+ continue;
- /* The retransmission queue is always in order, so
- * we can short-circuit the walk early.
- */
- if (!before(TCP_SKB_CB(skb)->seq, end_seq))
- break;
+ used_sacks++;
+ }
- dup_sack = (found_dup_sack && (i == first_sack_index));
+ /* order SACK blocks to allow in order walk of the retrans queue */
+ for (i = used_sacks - 1; i > 0; i--) {
+ for (j = 0; j < i; j++) {
+ if (after(sp[j].start_seq, sp[j + 1].start_seq)) {
+ struct tcp_sack_block tmp;
- /* Due to sorting DSACK may reside within this SACK block! */
- if (next_dup) {
- u32 dup_start = ntohl(sp->start_seq);
- u32 dup_end = ntohl(sp->end_seq);
+ tmp = sp[j];
+ sp[j] = sp[j + 1];
+ sp[j + 1] = tmp;
- if (before(TCP_SKB_CB(skb)->seq, dup_end)) {
- in_sack = tcp_match_skb_to_sack(sk, skb, dup_start, dup_end);
- if (in_sack > 0)
- dup_sack = 1;
- }
+ /* Track where the first SACK block goes to */
+ if (j == first_sack_index)
+ first_sack_index = j + 1;
}
+ }
+ }
- /* DSACK info lost if out-of-mem, try SACK still */
- if (in_sack <= 0)
- in_sack = tcp_match_skb_to_sack(sk, skb, start_seq, end_seq);
- if (unlikely(in_sack < 0))
- break;
+ skb = tcp_write_queue_head(sk);
+ fack_count = 0;
+ i = 0;
- sacked = TCP_SKB_CB(skb)->sacked;
+ if (!tp->sacked_out) {
+ /* It's already past, so skip checking against it */
+ cache = tp->recv_sack_cache + ARRAY_SIZE(tp->recv_sack_cache);
+ } else {
+ cache = tp->recv_sack_cache;
+ /* Skip empty blocks in at head of the cache */
+ while (tcp_sack_cache_ok(tp, cache) && !cache->start_seq &&
+ !cache->end_seq)
+ cache++;
+ }
- /* Account D-SACK for retransmitted packet. */
- if ((dup_sack && in_sack) &&
- (sacked & TCPCB_RETRANS) &&
- after(TCP_SKB_CB(skb)->end_seq, tp->undo_marker))
- tp->undo_retrans--;
+ while (i < used_sacks) {
+ u32 start_seq = sp[i].start_seq;
+ u32 end_seq = sp[i].end_seq;
+ int dup_sack = (found_dup_sack && (i == first_sack_index));
+ struct tcp_sack_block *next_dup = NULL;
- /* The frame is ACKed. */
- if (!after(TCP_SKB_CB(skb)->end_seq, tp->snd_una)) {
- if (sacked&TCPCB_RETRANS) {
- if ((dup_sack && in_sack) &&
- (sacked&TCPCB_SACKED_ACKED))
- reord = min(fack_count, reord);
- }
+ if (found_dup_sack && ((i + 1) == first_sack_index))
+ next_dup = &sp[i + 1];
- /* Nothing to do; acked frame is about to be dropped. */
- fack_count += tcp_skb_pcount(skb);
- continue;
- }
+ /* Event "B" in the comment above. */
+ if (after(end_seq, tp->high_seq))
+ flag |= FLAG_DATA_LOST;
- if (!in_sack) {
- fack_count += tcp_skb_pcount(skb);
- continue;
+ /* Skip too early cached blocks */
+ while (tcp_sack_cache_ok(tp, cache) &&
+ !before(start_seq, cache->end_seq))
+ cache++;
+
+ /* Can skip some work by looking recv_sack_cache? */
+ if (tcp_sack_cache_ok(tp, cache) && !dup_sack &&
+ after(end_seq, cache->start_seq)) {
+
+ /* Head todo? */
+ if (before(start_seq, cache->start_seq)) {
+ skb = tcp_sacktag_skip(skb, sk, start_seq);
+ skb = tcp_sacktag_walk(skb, sk, next_dup,
+ start_seq,
+ cache->start_seq,
+ dup_sack, &fack_count,
+ &reord, &flag);
}
- if (!(sacked&TCPCB_SACKED_ACKED)) {
- if (sacked & TCPCB_SACKED_RETRANS) {
- /* If the segment is not tagged as lost,
- * we do not clear RETRANS, believing
- * that retransmission is still in flight.
- */
- if (sacked & TCPCB_LOST) {
- TCP_SKB_CB(skb)->sacked &= ~(TCPCB_LOST|TCPCB_SACKED_RETRANS);
- tp->lost_out -= tcp_skb_pcount(skb);
- tp->retrans_out -= tcp_skb_pcount(skb);
-
- /* clear lost hint */
- tp->retransmit_skb_hint = NULL;
- }
- } else {
- if (!(sacked & TCPCB_RETRANS)) {
- /* New sack for not retransmitted frame,
- * which was in hole. It is reordering.
- */
- if (fack_count < prior_fackets)
- reord = min(fack_count, reord);
-
- /* SACK enhanced F-RTO (RFC4138; Appendix B) */
- if (!after(TCP_SKB_CB(skb)->end_seq, tp->frto_highmark))
- flag |= FLAG_ONLY_ORIG_SACKED;
- }
-
- if (sacked & TCPCB_LOST) {
- TCP_SKB_CB(skb)->sacked &= ~TCPCB_LOST;
- tp->lost_out -= tcp_skb_pcount(skb);
+ /* Rest of the block already fully processed? */
+ if (!after(end_seq, cache->end_seq))
+ goto advance_sp;
- /* clear lost hint */
- tp->retransmit_skb_hint = NULL;
- }
- }
+ skb = tcp_maybe_skipping_dsack(skb, sk, next_dup,
+ cache->end_seq,
+ &fack_count, &reord,
+ &flag);
- TCP_SKB_CB(skb)->sacked |= TCPCB_SACKED_ACKED;
- flag |= FLAG_DATA_SACKED;
- tp->sacked_out += tcp_skb_pcount(skb);
-
- fack_count += tcp_skb_pcount(skb);
- if (fack_count > tp->fackets_out)
- tp->fackets_out = fack_count;
-
- if (after(TCP_SKB_CB(skb)->seq, tp->highest_sack)) {
- tp->highest_sack = TCP_SKB_CB(skb)->seq;
- highest_sack_end_seq = TCP_SKB_CB(skb)->end_seq;
- }
- } else {
- if (dup_sack && (sacked&TCPCB_RETRANS))
- reord = min(fack_count, reord);
-
- fack_count += tcp_skb_pcount(skb);
+ /* ...tail remains todo... */
+ if (tcp_highest_sack_seq(tp) == cache->end_seq) {
+ /* ...but better entrypoint exists! */
+ skb = tcp_highest_sack(sk);
+ if (skb == NULL)
+ break;
+ fack_count = tp->fackets_out;
+ cache++;
+ goto walk;
}
- /* D-SACK. We can detect redundant retransmission
- * in S|R and plain R frames and clear it.
- * undo_retrans is decreased above, L|R frames
- * are accounted above as well.
- */
- if (dup_sack &&
- (TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_RETRANS)) {
- TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS;
- tp->retrans_out -= tcp_skb_pcount(skb);
- tp->retransmit_skb_hint = NULL;
- }
+ skb = tcp_sacktag_skip(skb, sk, cache->end_seq);
+ /* Check overlap against next cached too (past this one already) */
+ cache++;
+ continue;
+ }
+
+ if (!before(start_seq, tcp_highest_sack_seq(tp))) {
+ skb = tcp_highest_sack(sk);
+ if (skb == NULL)
+ break;
+ fack_count = tp->fackets_out;
}
+ skb = tcp_sacktag_skip(skb, sk, start_seq);
+
+walk:
+ skb = tcp_sacktag_walk(skb, sk, next_dup, start_seq, end_seq,
+ dup_sack, &fack_count, &reord, &flag);
+advance_sp:
/* SACK enhanced FRTO (RFC4138, Appendix B): Clearing correct
* due to in-order walk
*/
if (after(end_seq, tp->frto_highmark))
flag &= ~FLAG_ONLY_ORIG_SACKED;
+
+ i++;
}
- if (tp->retrans_out &&
- after(highest_sack_end_seq, tp->lost_retrans_low) &&
- icsk->icsk_ca_state == TCP_CA_Recovery)
- flag |= tcp_mark_lost_retrans(sk, highest_sack_end_seq);
+ /* Clear the head of the cache sack blocks so we can skip it next time */
+ for (i = 0; i < ARRAY_SIZE(tp->recv_sack_cache) - used_sacks; i++) {
+ tp->recv_sack_cache[i].start_seq = 0;
+ tp->recv_sack_cache[i].end_seq = 0;
+ }
+ for (j = 0; j < used_sacks; j++)
+ tp->recv_sack_cache[i++] = sp[j];
+
+ tcp_mark_lost_retrans(sk);
tcp_verify_left_out(tp);
- if ((reord < tp->fackets_out) && icsk->icsk_ca_state != TCP_CA_Loss &&
+ if ((reord < tp->fackets_out) &&
+ ((icsk->icsk_ca_state != TCP_CA_Loss) || tp->undo_marker) &&
(!tp->frto_highmark || after(tp->snd_una, tp->frto_highmark)))
tcp_update_reordering(sk, tp->fackets_out - reord, 0);
@@ -1565,10 +1657,10 @@ static void tcp_remove_reno_sacks(struct sock *sk, int acked)
if (acked > 0) {
/* One ACK acked hole. The rest eat duplicate ACKs. */
- if (acked-1 >= tp->sacked_out)
+ if (acked - 1 >= tp->sacked_out)
tp->sacked_out = 0;
else
- tp->sacked_out -= acked-1;
+ tp->sacked_out -= acked - 1;
}
tcp_check_reno_reordering(sk, acked);
tcp_verify_left_out(tp);
@@ -1602,10 +1694,10 @@ int tcp_use_frto(struct sock *sk)
tcp_for_write_queue_from(skb, sk) {
if (skb == tcp_send_head(sk))
break;
- if (TCP_SKB_CB(skb)->sacked&TCPCB_RETRANS)
+ if (TCP_SKB_CB(skb)->sacked & TCPCB_RETRANS)
return 0;
/* Short-circuit when first non-SACKed skb has been checked */
- if (!(TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_ACKED))
+ if (!(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED))
break;
}
return 1;
@@ -1715,7 +1807,7 @@ static void tcp_enter_frto_loss(struct sock *sk, int allowed_segments, int flag)
* Count the retransmission made on RTO correctly (only when
* waiting for the first ACK and did not get it)...
*/
- if ((tp->frto_counter == 1) && !(flag&FLAG_DATA_ACKED)) {
+ if ((tp->frto_counter == 1) && !(flag & FLAG_DATA_ACKED)) {
/* For some reason this R-bit might get cleared? */
if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS)
tp->retrans_out += tcp_skb_pcount(skb);
@@ -1728,7 +1820,7 @@ static void tcp_enter_frto_loss(struct sock *sk, int allowed_segments, int flag)
}
/* Don't lost mark skbs that were fwd transmitted after RTO */
- if (!(TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_ACKED) &&
+ if (!(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED) &&
!after(TCP_SKB_CB(skb)->end_seq, tp->frto_highmark)) {
TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
tp->lost_out += tcp_skb_pcount(skb);
@@ -1743,7 +1835,7 @@ static void tcp_enter_frto_loss(struct sock *sk, int allowed_segments, int flag)
tp->bytes_acked = 0;
tp->reordering = min_t(unsigned int, tp->reordering,
- sysctl_tcp_reordering);
+ sysctl_tcp_reordering);
tcp_set_ca_state(sk, TCP_CA_Loss);
tp->high_seq = tp->frto_highmark;
TCP_ECN_queue_cwr(tp);
@@ -1810,7 +1902,7 @@ void tcp_enter_loss(struct sock *sk, int how)
if (skb == tcp_send_head(sk))
break;
- if (TCP_SKB_CB(skb)->sacked&TCPCB_RETRANS)
+ if (TCP_SKB_CB(skb)->sacked & TCPCB_RETRANS)
tp->undo_marker = 0;
TCP_SKB_CB(skb)->sacked &= (~TCPCB_TAGBITS)|TCPCB_SACKED_ACKED;
if (!(TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_ACKED) || how) {
@@ -1822,7 +1914,7 @@ void tcp_enter_loss(struct sock *sk, int how)
tcp_verify_left_out(tp);
tp->reordering = min_t(unsigned int, tp->reordering,
- sysctl_tcp_reordering);
+ sysctl_tcp_reordering);
tcp_set_ca_state(sk, TCP_CA_Loss);
tp->high_seq = tp->snd_nxt;
TCP_ECN_queue_cwr(tp);
@@ -1830,18 +1922,15 @@ void tcp_enter_loss(struct sock *sk, int how)
tp->frto_counter = 0;
}
-static int tcp_check_sack_reneging(struct sock *sk)
+/* If ACK arrived pointing to a remembered SACK, it means that our
+ * remembered SACKs do not reflect real state of receiver i.e.
+ * receiver _host_ is heavily congested (or buggy).
+ *
+ * Do processing similar to RTO timeout.
+ */
+static int tcp_check_sack_reneging(struct sock *sk, int flag)
{
- struct sk_buff *skb;
-
- /* If ACK arrived pointing to a remembered SACK,
- * it means that our remembered SACKs do not reflect
- * real state of receiver i.e.
- * receiver _host_ is heavily congested (or buggy).
- * Do processing similar to RTO timeout.
- */
- if ((skb = tcp_write_queue_head(sk)) != NULL &&
- (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) {
+ if (flag & FLAG_SACK_RENEGING) {
struct inet_connection_sock *icsk = inet_csk(sk);
NET_INC_STATS_BH(LINUX_MIB_TCPSACKRENEGING);
@@ -1857,7 +1946,27 @@ static int tcp_check_sack_reneging(struct sock *sk)
static inline int tcp_fackets_out(struct tcp_sock *tp)
{
- return tcp_is_reno(tp) ? tp->sacked_out+1 : tp->fackets_out;
+ return tcp_is_reno(tp) ? tp->sacked_out + 1 : tp->fackets_out;
+}
+
+/* Heurestics to calculate number of duplicate ACKs. There's no dupACKs
+ * counter when SACK is enabled (without SACK, sacked_out is used for
+ * that purpose).
+ *
+ * Instead, with FACK TCP uses fackets_out that includes both SACKed
+ * segments up to the highest received SACK block so far and holes in
+ * between them.
+ *
+ * With reordering, holes may still be in flight, so RFC3517 recovery
+ * uses pure sacked_out (total number of SACKed segments) even though
+ * it violates the RFC that uses duplicate ACKs, often these are equal
+ * but when e.g. out-of-window ACKs or packet duplication occurs,
+ * they differ. Since neither occurs due to loss, TCP should really
+ * ignore them.
+ */
+static inline int tcp_dupack_heurestics(struct tcp_sock *tp)
+{
+ return tcp_is_fack(tp) ? tp->fackets_out : tp->sacked_out + 1;
}
static inline int tcp_skb_timedout(struct sock *sk, struct sk_buff *skb)
@@ -1980,13 +2089,13 @@ static int tcp_time_to_recover(struct sock *sk)
return 1;
/* Not-A-Trick#2 : Classic rule... */
- if (tcp_fackets_out(tp) > tp->reordering)
+ if (tcp_dupack_heurestics(tp) > tp->reordering)
return 1;
/* Trick#3 : when we use RFC2988 timer restart, fast
* retransmit can be triggered by timeout of queue head.
*/
- if (tcp_head_timedout(sk))
+ if (tcp_is_fack(tp) && tcp_head_timedout(sk))
return 1;
/* Trick#4: It is still not OK... But will it be useful to delay
@@ -2010,17 +2119,18 @@ static int tcp_time_to_recover(struct sock *sk)
* retransmitted past LOST markings in the first place? I'm not fully sure
* about undo and end of connection cases, which can cause R without L?
*/
-static void tcp_verify_retransmit_hint(struct tcp_sock *tp,
- struct sk_buff *skb)
+static void tcp_verify_retransmit_hint(struct tcp_sock *tp, struct sk_buff *skb)
{
if ((tp->retransmit_skb_hint != NULL) &&
before(TCP_SKB_CB(skb)->seq,
- TCP_SKB_CB(tp->retransmit_skb_hint)->seq))
+ TCP_SKB_CB(tp->retransmit_skb_hint)->seq))
tp->retransmit_skb_hint = NULL;
}
-/* Mark head of queue up as lost. */
-static void tcp_mark_head_lost(struct sock *sk, int packets)
+/* Mark head of queue up as lost. With RFC3517 SACK, the packets is
+ * is against sacked "cnt", otherwise it's against facked "cnt"
+ */
+static void tcp_mark_head_lost(struct sock *sk, int packets, int fast_rexmit)
{
struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *skb;
@@ -2042,8 +2152,13 @@ static void tcp_mark_head_lost(struct sock *sk, int packets)
/* this is not the most efficient way to do this... */
tp->lost_skb_hint = skb;
tp->lost_cnt_hint = cnt;
- cnt += tcp_skb_pcount(skb);
- if (cnt > packets || after(TCP_SKB_CB(skb)->end_seq, tp->high_seq))
+
+ if (tcp_is_fack(tp) ||
+ (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED))
+ cnt += tcp_skb_pcount(skb);
+
+ if (((!fast_rexmit || (tp->lost_out > 0)) && (cnt > packets)) ||
+ after(TCP_SKB_CB(skb)->end_seq, tp->high_seq))
break;
if (!(TCP_SKB_CB(skb)->sacked & (TCPCB_SACKED_ACKED|TCPCB_LOST))) {
TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
@@ -2056,17 +2171,22 @@ static void tcp_mark_head_lost(struct sock *sk, int packets)
/* Account newly detected lost packet(s) */
-static void tcp_update_scoreboard(struct sock *sk)
+static void tcp_update_scoreboard(struct sock *sk, int fast_rexmit)
{
struct tcp_sock *tp = tcp_sk(sk);
- if (tcp_is_fack(tp)) {
+ if (tcp_is_reno(tp)) {
+ tcp_mark_head_lost(sk, 1, fast_rexmit);
+ } else if (tcp_is_fack(tp)) {
int lost = tp->fackets_out - tp->reordering;
if (lost <= 0)
lost = 1;
- tcp_mark_head_lost(sk, lost);
+ tcp_mark_head_lost(sk, lost, fast_rexmit);
} else {
- tcp_mark_head_lost(sk, 1);
+ int sacked_upto = tp->sacked_out - tp->reordering;
+ if (sacked_upto < 0)
+ sacked_upto = 0;
+ tcp_mark_head_lost(sk, sacked_upto, fast_rexmit);
}
/* New heuristics: it is possible only after we switched
@@ -2074,7 +2194,7 @@ static void tcp_update_scoreboard(struct sock *sk)
* Hence, we can detect timed out packets during fast
* retransmit without falling to slow start.
*/
- if (!tcp_is_reno(tp) && tcp_head_timedout(sk)) {
+ if (tcp_is_fack(tp) && tcp_head_timedout(sk)) {
struct sk_buff *skb;
skb = tp->scoreboard_skb_hint ? tp->scoreboard_skb_hint
@@ -2105,7 +2225,7 @@ static void tcp_update_scoreboard(struct sock *sk)
static inline void tcp_moderate_cwnd(struct tcp_sock *tp)
{
tp->snd_cwnd = min(tp->snd_cwnd,
- tcp_packets_in_flight(tp)+tcp_max_burst(tp));
+ tcp_packets_in_flight(tp) + tcp_max_burst(tp));
tp->snd_cwnd_stamp = tcp_time_stamp;
}
@@ -2125,15 +2245,15 @@ static void tcp_cwnd_down(struct sock *sk, int flag)
struct tcp_sock *tp = tcp_sk(sk);
int decr = tp->snd_cwnd_cnt + 1;
- if ((flag&(FLAG_ANY_PROGRESS|FLAG_DSACKING_ACK)) ||
- (tcp_is_reno(tp) && !(flag&FLAG_NOT_DUP))) {
- tp->snd_cwnd_cnt = decr&1;
+ if ((flag & (FLAG_ANY_PROGRESS | FLAG_DSACKING_ACK)) ||
+ (tcp_is_reno(tp) && !(flag & FLAG_NOT_DUP))) {
+ tp->snd_cwnd_cnt = decr & 1;
decr >>= 1;
if (decr && tp->snd_cwnd > tcp_cwnd_min(sk))
tp->snd_cwnd -= decr;
- tp->snd_cwnd = min(tp->snd_cwnd, tcp_packets_in_flight(tp)+1);
+ tp->snd_cwnd = min(tp->snd_cwnd, tcp_packets_in_flight(tp) + 1);
tp->snd_cwnd_stamp = tcp_time_stamp;
}
}
@@ -2177,7 +2297,7 @@ static void tcp_undo_cwr(struct sock *sk, const int undo)
if (icsk->icsk_ca_ops->undo_cwnd)
tp->snd_cwnd = icsk->icsk_ca_ops->undo_cwnd(sk);
else
- tp->snd_cwnd = max(tp->snd_cwnd, tp->snd_ssthresh<<1);
+ tp->snd_cwnd = max(tp->snd_cwnd, tp->snd_ssthresh << 1);
if (undo && tp->prior_ssthresh > tp->snd_ssthresh) {
tp->snd_ssthresh = tp->prior_ssthresh;
@@ -2196,8 +2316,7 @@ static void tcp_undo_cwr(struct sock *sk, const int undo)
static inline int tcp_may_undo(struct tcp_sock *tp)
{
- return tp->undo_marker &&
- (!tp->undo_retrans || tcp_packet_delayed(tp));
+ return tp->undo_marker && (!tp->undo_retrans || tcp_packet_delayed(tp));
}
/* People celebrate: "We love our President!" */
@@ -2247,7 +2366,7 @@ static int tcp_try_undo_partial(struct sock *sk, int acked)
{
struct tcp_sock *tp = tcp_sk(sk);
/* Partial ACK arrived. Force Hoe's retransmit. */
- int failed = tcp_is_reno(tp) || tp->fackets_out>tp->reordering;
+ int failed = tcp_is_reno(tp) || (tcp_fackets_out(tp) > tp->reordering);
if (tcp_may_undo(tp)) {
/* Plain luck! Hole if filled with delayed
@@ -2316,7 +2435,7 @@ static void tcp_try_to_open(struct sock *sk, int flag)
if (tp->retrans_out == 0)
tp->retrans_stamp = 0;
- if (flag&FLAG_ECE)
+ if (flag & FLAG_ECE)
tcp_enter_cwr(sk, 1);
if (inet_csk(sk)->icsk_ca_state != TCP_CA_CWR) {
@@ -2362,7 +2481,6 @@ static void tcp_mtup_probe_success(struct sock *sk, struct sk_buff *skb)
tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
}
-
/* Process an event, which can update packets-in-flight not trivially.
* Main goal of this function is to calculate new estimate for left_out,
* taking into account both packets sitting in receiver's buffer and
@@ -2374,38 +2492,35 @@ static void tcp_mtup_probe_success(struct sock *sk, struct sk_buff *skb)
* It does _not_ decide what to send, it is made in function
* tcp_xmit_retransmit_queue().
*/
-static void
-tcp_fastretrans_alert(struct sock *sk, int pkts_acked, int flag)
+static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked, int flag)
{
struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
- int is_dupack = !(flag&(FLAG_SND_UNA_ADVANCED|FLAG_NOT_DUP));
- int do_lost = is_dupack || ((flag&FLAG_DATA_SACKED) &&
- (tp->fackets_out > tp->reordering));
+ int is_dupack = !(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP));
+ int do_lost = is_dupack || ((flag & FLAG_DATA_SACKED) &&
+ (tcp_fackets_out(tp) > tp->reordering));
+ int fast_rexmit = 0;
- /* Some technical things:
- * 1. Reno does not count dupacks (sacked_out) automatically. */
- if (!tp->packets_out)
+ if (WARN_ON(!tp->packets_out && tp->sacked_out))
tp->sacked_out = 0;
-
if (WARN_ON(!tp->sacked_out && tp->fackets_out))
tp->fackets_out = 0;
/* Now state machine starts.
* A. ECE, hence prohibit cwnd undoing, the reduction is required. */
- if (flag&FLAG_ECE)
+ if (flag & FLAG_ECE)
tp->prior_ssthresh = 0;
/* B. In all the states check for reneging SACKs. */
- if (tp->sacked_out && tcp_check_sack_reneging(sk))
+ if (tcp_check_sack_reneging(sk, flag))
return;
/* C. Process data loss notification, provided it is valid. */
- if ((flag&FLAG_DATA_LOST) &&
+ if (tcp_is_fack(tp) && (flag & FLAG_DATA_LOST) &&
before(tp->snd_una, tp->high_seq) &&
icsk->icsk_ca_state != TCP_CA_Open &&
tp->fackets_out > tp->reordering) {
- tcp_mark_head_lost(sk, tp->fackets_out - tp->reordering);
+ tcp_mark_head_lost(sk, tp->fackets_out - tp->reordering, 0);
NET_INC_STATS_BH(LINUX_MIB_TCPLOSS);
}
@@ -2465,7 +2580,7 @@ tcp_fastretrans_alert(struct sock *sk, int pkts_acked, int flag)
do_lost = tcp_try_undo_partial(sk, pkts_acked);
break;
case TCP_CA_Loss:
- if (flag&FLAG_DATA_ACKED)
+ if (flag & FLAG_DATA_ACKED)
icsk->icsk_retransmits = 0;
if (!tcp_try_undo_loss(sk)) {
tcp_moderate_cwnd(tp);
@@ -2515,7 +2630,7 @@ tcp_fastretrans_alert(struct sock *sk, int pkts_acked, int flag)
tp->undo_retrans = tp->retrans_out;
if (icsk->icsk_ca_state < TCP_CA_CWR) {
- if (!(flag&FLAG_ECE))
+ if (!(flag & FLAG_ECE))
tp->prior_ssthresh = tcp_current_ssthresh(sk);
tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk);
TCP_ECN_queue_cwr(tp);
@@ -2524,10 +2639,11 @@ tcp_fastretrans_alert(struct sock *sk, int pkts_acked, int flag)
tp->bytes_acked = 0;
tp->snd_cwnd_cnt = 0;
tcp_set_ca_state(sk, TCP_CA_Recovery);
+ fast_rexmit = 1;
}
- if (do_lost || tcp_head_timedout(sk))
- tcp_update_scoreboard(sk);
+ if (do_lost || (tcp_is_fack(tp) && tcp_head_timedout(sk)))
+ tcp_update_scoreboard(sk, fast_rexmit);
tcp_cwnd_down(sk, flag);
tcp_xmit_retransmit_queue(sk);
}
@@ -2591,11 +2707,10 @@ static inline void tcp_ack_update_rtt(struct sock *sk, const int flag,
tcp_ack_no_tstamp(sk, seq_rtt, flag);
}
-static void tcp_cong_avoid(struct sock *sk, u32 ack,
- u32 in_flight, int good)
+static void tcp_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
{
const struct inet_connection_sock *icsk = inet_csk(sk);
- icsk->icsk_ca_ops->cong_avoid(sk, ack, in_flight, good);
+ icsk->icsk_ca_ops->cong_avoid(sk, ack, in_flight);
tcp_sk(sk)->snd_cwnd_stamp = tcp_time_stamp;
}
@@ -2609,7 +2724,8 @@ static void tcp_rearm_rto(struct sock *sk)
if (!tp->packets_out) {
inet_csk_clear_xmit_timer(sk, ICSK_TIME_RETRANS);
} else {
- inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, inet_csk(sk)->icsk_rto, TCP_RTO_MAX);
+ inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
+ inet_csk(sk)->icsk_rto, TCP_RTO_MAX);
}
}
@@ -2638,8 +2754,7 @@ static u32 tcp_tso_acked(struct sock *sk, struct sk_buff *skb)
* is before the ack sequence we can discard it as it's confirmed to have
* arrived at the other end.
*/
-static int tcp_clean_rtx_queue(struct sock *sk, s32 *seq_rtt_p,
- int prior_fackets)
+static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets)
{
struct tcp_sock *tp = tcp_sk(sk);
const struct inet_connection_sock *icsk = inet_csk(sk);
@@ -2647,8 +2762,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, s32 *seq_rtt_p,
u32 now = tcp_time_stamp;
int fully_acked = 1;
int flag = 0;
- int prior_packets = tp->packets_out;
- u32 cnt = 0;
+ u32 pkts_acked = 0;
u32 reord = tp->packets_out;
s32 seq_rtt = -1;
s32 ca_seq_rtt = -1;
@@ -2657,7 +2771,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, s32 *seq_rtt_p,
while ((skb = tcp_write_queue_head(sk)) && skb != tcp_send_head(sk)) {
struct tcp_skb_cb *scb = TCP_SKB_CB(skb);
u32 end_seq;
- u32 packets_acked;
+ u32 acked_pcount;
u8 sacked = scb->sacked;
/* Determine how many packets and what bytes were acked, tso and else */
@@ -2666,14 +2780,14 @@ static int tcp_clean_rtx_queue(struct sock *sk, s32 *seq_rtt_p,
!after(tp->snd_una, scb->seq))
break;
- packets_acked = tcp_tso_acked(sk, skb);
- if (!packets_acked)
+ acked_pcount = tcp_tso_acked(sk, skb);
+ if (!acked_pcount)
break;
fully_acked = 0;
end_seq = tp->snd_una;
} else {
- packets_acked = tcp_skb_pcount(skb);
+ acked_pcount = tcp_skb_pcount(skb);
end_seq = scb->end_seq;
}
@@ -2683,44 +2797,34 @@ static int tcp_clean_rtx_queue(struct sock *sk, s32 *seq_rtt_p,
tcp_mtup_probe_success(sk, skb);
}
- if (sacked) {
- if (sacked & TCPCB_RETRANS) {
- if (sacked & TCPCB_SACKED_RETRANS)
- tp->retrans_out -= packets_acked;
- flag |= FLAG_RETRANS_DATA_ACKED;
- ca_seq_rtt = -1;
- seq_rtt = -1;
- if ((flag & FLAG_DATA_ACKED) ||
- (packets_acked > 1))
- flag |= FLAG_NONHEAD_RETRANS_ACKED;
- } else {
- ca_seq_rtt = now - scb->when;
- last_ackt = skb->tstamp;
- if (seq_rtt < 0) {
- seq_rtt = ca_seq_rtt;
- }
- if (!(sacked & TCPCB_SACKED_ACKED))
- reord = min(cnt, reord);
- }
-
- if (sacked & TCPCB_SACKED_ACKED)
- tp->sacked_out -= packets_acked;
- if (sacked & TCPCB_LOST)
- tp->lost_out -= packets_acked;
-
- if ((sacked & TCPCB_URG) && tp->urg_mode &&
- !before(end_seq, tp->snd_up))
- tp->urg_mode = 0;
+ if (sacked & TCPCB_RETRANS) {
+ if (sacked & TCPCB_SACKED_RETRANS)
+ tp->retrans_out -= acked_pcount;
+ flag |= FLAG_RETRANS_DATA_ACKED;
+ ca_seq_rtt = -1;
+ seq_rtt = -1;
+ if ((flag & FLAG_DATA_ACKED) || (acked_pcount > 1))
+ flag |= FLAG_NONHEAD_RETRANS_ACKED;
} else {
ca_seq_rtt = now - scb->when;
last_ackt = skb->tstamp;
if (seq_rtt < 0) {
seq_rtt = ca_seq_rtt;
}
- reord = min(cnt, reord);
+ if (!(sacked & TCPCB_SACKED_ACKED))
+ reord = min(pkts_acked, reord);
}
- tp->packets_out -= packets_acked;
- cnt += packets_acked;
+
+ if (sacked & TCPCB_SACKED_ACKED)
+ tp->sacked_out -= acked_pcount;
+ if (sacked & TCPCB_LOST)
+ tp->lost_out -= acked_pcount;
+
+ if (unlikely(tp->urg_mode && !before(end_seq, tp->snd_up)))
+ tp->urg_mode = 0;
+
+ tp->packets_out -= acked_pcount;
+ pkts_acked += acked_pcount;
/* Initial outgoing SYN's get put onto the write_queue
* just like anything else we transmit. It is not
@@ -2740,12 +2844,14 @@ static int tcp_clean_rtx_queue(struct sock *sk, s32 *seq_rtt_p,
break;
tcp_unlink_write_queue(skb, sk);
- sk_stream_free_skb(sk, skb);
+ sk_wmem_free_skb(sk, skb);
tcp_clear_all_retrans_hints(tp);
}
+ if (skb && (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED))
+ flag |= FLAG_SACK_RENEGING;
+
if (flag & FLAG_ACKED) {
- u32 pkts_acked = prior_packets - tp->packets_out;
const struct tcp_congestion_ops *ca_ops
= inet_csk(sk)->icsk_ca_ops;
@@ -2761,9 +2867,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, s32 *seq_rtt_p,
}
tp->fackets_out -= min(pkts_acked, tp->fackets_out);
- /* hint's skb might be NULL but we don't need to care */
- tp->fastpath_cnt_hint -= min_t(u32, pkts_acked,
- tp->fastpath_cnt_hint);
+
if (ca_ops->pkts_acked) {
s32 rtt_us = -1;
@@ -2806,7 +2910,6 @@ static int tcp_clean_rtx_queue(struct sock *sk, s32 *seq_rtt_p,
}
}
#endif
- *seq_rtt_p = seq_rtt;
return flag;
}
@@ -2817,8 +2920,7 @@ static void tcp_ack_probe(struct sock *sk)
/* Was it a usable window open? */
- if (!after(TCP_SKB_CB(tcp_send_head(sk))->end_seq,
- tp->snd_una + tp->snd_wnd)) {
+ if (!after(TCP_SKB_CB(tcp_send_head(sk))->end_seq, tcp_wnd_end(tp))) {
icsk->icsk_backoff = 0;
inet_csk_clear_xmit_timer(sk, ICSK_TIME_PROBE0);
/* Socket must be waked up by subsequent tcp_data_snd_check().
@@ -2847,8 +2949,9 @@ static inline int tcp_may_raise_cwnd(const struct sock *sk, const int flag)
/* Check that window update is acceptable.
* The function assumes that snd_una<=ack<=snd_next.
*/
-static inline int tcp_may_update_window(const struct tcp_sock *tp, const u32 ack,
- const u32 ack_seq, const u32 nwin)
+static inline int tcp_may_update_window(const struct tcp_sock *tp,
+ const u32 ack, const u32 ack_seq,
+ const u32 nwin)
{
return (after(ack, tp->snd_una) ||
after(ack_seq, tp->snd_wl1) ||
@@ -2917,7 +3020,7 @@ static void tcp_ratehalving_spur_to_response(struct sock *sk)
static void tcp_undo_spur_to_response(struct sock *sk, int flag)
{
- if (flag&FLAG_ECE)
+ if (flag & FLAG_ECE)
tcp_ratehalving_spur_to_response(sk);
else
tcp_undo_cwr(sk, 1);
@@ -2960,7 +3063,7 @@ static int tcp_process_frto(struct sock *sk, int flag)
tcp_verify_left_out(tp);
/* Duplicate the behavior from Loss state (fastretrans_alert) */
- if (flag&FLAG_DATA_ACKED)
+ if (flag & FLAG_DATA_ACKED)
inet_csk(sk)->icsk_retransmits = 0;
if ((flag & FLAG_NONHEAD_RETRANS_ACKED) ||
@@ -2977,16 +3080,16 @@ static int tcp_process_frto(struct sock *sk, int flag)
* ACK isn't duplicate nor advances window, e.g., opposite dir
* data, winupdate
*/
- if (!(flag&FLAG_ANY_PROGRESS) && (flag&FLAG_NOT_DUP))
+ if (!(flag & FLAG_ANY_PROGRESS) && (flag & FLAG_NOT_DUP))
return 1;
- if (!(flag&FLAG_DATA_ACKED)) {
+ if (!(flag & FLAG_DATA_ACKED)) {
tcp_enter_frto_loss(sk, (tp->frto_counter == 1 ? 0 : 3),
flag);
return 1;
}
} else {
- if (!(flag&FLAG_DATA_ACKED) && (tp->frto_counter == 1)) {
+ if (!(flag & FLAG_DATA_ACKED) && (tp->frto_counter == 1)) {
/* Prevent sending of new data. */
tp->snd_cwnd = min(tp->snd_cwnd,
tcp_packets_in_flight(tp));
@@ -2994,10 +3097,12 @@ static int tcp_process_frto(struct sock *sk, int flag)
}
if ((tp->frto_counter >= 2) &&
- (!(flag&FLAG_FORWARD_PROGRESS) ||
- ((flag&FLAG_DATA_SACKED) && !(flag&FLAG_ONLY_ORIG_SACKED)))) {
+ (!(flag & FLAG_FORWARD_PROGRESS) ||
+ ((flag & FLAG_DATA_SACKED) &&
+ !(flag & FLAG_ONLY_ORIG_SACKED)))) {
/* RFC4138 shortcoming (see comment above) */
- if (!(flag&FLAG_FORWARD_PROGRESS) && (flag&FLAG_NOT_DUP))
+ if (!(flag & FLAG_FORWARD_PROGRESS) &&
+ (flag & FLAG_NOT_DUP))
return 1;
tcp_enter_frto_loss(sk, 3, flag);
@@ -3043,7 +3148,6 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag)
u32 ack = TCP_SKB_CB(skb)->ack_seq;
u32 prior_in_flight;
u32 prior_fackets;
- s32 seq_rtt;
int prior_packets;
int frto_cwnd = 0;
@@ -3064,13 +3168,14 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag)
tp->bytes_acked += ack - prior_snd_una;
else if (icsk->icsk_ca_state == TCP_CA_Loss)
/* we assume just one segment left network */
- tp->bytes_acked += min(ack - prior_snd_una, tp->mss_cache);
+ tp->bytes_acked += min(ack - prior_snd_una,
+ tp->mss_cache);
}
prior_fackets = tp->fackets_out;
prior_in_flight = tcp_packets_in_flight(tp);
- if (!(flag&FLAG_SLOWPATH) && after(ack, prior_snd_una)) {
+ if (!(flag & FLAG_SLOWPATH) && after(ack, prior_snd_una)) {
/* Window is constant, pure forward advance.
* No more checks are required.
* Note, we use the fact that SND.UNA>=SND.WL2.
@@ -3109,7 +3214,7 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag)
goto no_queue;
/* See if we can take anything off of the retransmit queue. */
- flag |= tcp_clean_rtx_queue(sk, &seq_rtt, prior_fackets);
+ flag |= tcp_clean_rtx_queue(sk, prior_fackets);
if (tp->frto_counter)
frto_cwnd = tcp_process_frto(sk, flag);
@@ -3121,14 +3226,15 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag)
/* Advance CWND, if state allows this. */
if ((flag & FLAG_DATA_ACKED) && !frto_cwnd &&
tcp_may_raise_cwnd(sk, flag))
- tcp_cong_avoid(sk, ack, prior_in_flight, 0);
- tcp_fastretrans_alert(sk, prior_packets - tp->packets_out, flag);
+ tcp_cong_avoid(sk, ack, prior_in_flight);
+ tcp_fastretrans_alert(sk, prior_packets - tp->packets_out,
+ flag);
} else {
if ((flag & FLAG_DATA_ACKED) && !frto_cwnd)
- tcp_cong_avoid(sk, ack, prior_in_flight, 1);
+ tcp_cong_avoid(sk, ack, prior_in_flight);
}
- if ((flag & FLAG_FORWARD_PROGRESS) || !(flag&FLAG_NOT_DUP))
+ if ((flag & FLAG_FORWARD_PROGRESS) || !(flag & FLAG_NOT_DUP))
dst_confirm(sk->sk_dst_cache);
return 1;
@@ -3153,100 +3259,99 @@ uninteresting_ack:
return 0;
}
-
/* Look for tcp options. Normally only called on SYN and SYNACK packets.
* But, this can also be called on packets in the established flow when
* the fast version below fails.
*/
-void tcp_parse_options(struct sk_buff *skb, struct tcp_options_received *opt_rx, int estab)
+void tcp_parse_options(struct sk_buff *skb, struct tcp_options_received *opt_rx,
+ int estab)
{
unsigned char *ptr;
struct tcphdr *th = tcp_hdr(skb);
- int length=(th->doff*4)-sizeof(struct tcphdr);
+ int length = (th->doff * 4) - sizeof(struct tcphdr);
ptr = (unsigned char *)(th + 1);
opt_rx->saw_tstamp = 0;
while (length > 0) {
- int opcode=*ptr++;
+ int opcode = *ptr++;
int opsize;
switch (opcode) {
- case TCPOPT_EOL:
+ case TCPOPT_EOL:
+ return;
+ case TCPOPT_NOP: /* Ref: RFC 793 section 3.1 */
+ length--;
+ continue;
+ default:
+ opsize = *ptr++;
+ if (opsize < 2) /* "silly options" */
return;
- case TCPOPT_NOP: /* Ref: RFC 793 section 3.1 */
- length--;
- continue;
- default:
- opsize=*ptr++;
- if (opsize < 2) /* "silly options" */
- return;
- if (opsize > length)
- return; /* don't parse partial options */
- switch (opcode) {
- case TCPOPT_MSS:
- if (opsize==TCPOLEN_MSS && th->syn && !estab) {
- u16 in_mss = ntohs(get_unaligned((__be16 *)ptr));
- if (in_mss) {
- if (opt_rx->user_mss && opt_rx->user_mss < in_mss)
- in_mss = opt_rx->user_mss;
- opt_rx->mss_clamp = in_mss;
- }
- }
- break;
- case TCPOPT_WINDOW:
- if (opsize==TCPOLEN_WINDOW && th->syn && !estab)
- if (sysctl_tcp_window_scaling) {
- __u8 snd_wscale = *(__u8 *) ptr;
- opt_rx->wscale_ok = 1;
- if (snd_wscale > 14) {
- if (net_ratelimit())
- printk(KERN_INFO "tcp_parse_options: Illegal window "
- "scaling value %d >14 received.\n",
- snd_wscale);
- snd_wscale = 14;
- }
- opt_rx->snd_wscale = snd_wscale;
- }
- break;
- case TCPOPT_TIMESTAMP:
- if (opsize==TCPOLEN_TIMESTAMP) {
- if ((estab && opt_rx->tstamp_ok) ||
- (!estab && sysctl_tcp_timestamps)) {
- opt_rx->saw_tstamp = 1;
- opt_rx->rcv_tsval = ntohl(get_unaligned((__be32 *)ptr));
- opt_rx->rcv_tsecr = ntohl(get_unaligned((__be32 *)(ptr+4)));
- }
+ if (opsize > length)
+ return; /* don't parse partial options */
+ switch (opcode) {
+ case TCPOPT_MSS:
+ if (opsize == TCPOLEN_MSS && th->syn && !estab) {
+ u16 in_mss = ntohs(get_unaligned((__be16 *)ptr));
+ if (in_mss) {
+ if (opt_rx->user_mss &&
+ opt_rx->user_mss < in_mss)
+ in_mss = opt_rx->user_mss;
+ opt_rx->mss_clamp = in_mss;
}
- break;
- case TCPOPT_SACK_PERM:
- if (opsize==TCPOLEN_SACK_PERM && th->syn && !estab) {
- if (sysctl_tcp_sack) {
- opt_rx->sack_ok = 1;
- tcp_sack_reset(opt_rx);
- }
+ }
+ break;
+ case TCPOPT_WINDOW:
+ if (opsize == TCPOLEN_WINDOW && th->syn &&
+ !estab && sysctl_tcp_window_scaling) {
+ __u8 snd_wscale = *(__u8 *)ptr;
+ opt_rx->wscale_ok = 1;
+ if (snd_wscale > 14) {
+ if (net_ratelimit())
+ printk(KERN_INFO "tcp_parse_options: Illegal window "
+ "scaling value %d >14 received.\n",
+ snd_wscale);
+ snd_wscale = 14;
}
- break;
+ opt_rx->snd_wscale = snd_wscale;
+ }
+ break;
+ case TCPOPT_TIMESTAMP:
+ if ((opsize == TCPOLEN_TIMESTAMP) &&
+ ((estab && opt_rx->tstamp_ok) ||
+ (!estab && sysctl_tcp_timestamps))) {
+ opt_rx->saw_tstamp = 1;
+ opt_rx->rcv_tsval = ntohl(get_unaligned((__be32 *)ptr));
+ opt_rx->rcv_tsecr = ntohl(get_unaligned((__be32 *)(ptr+4)));
+ }
+ break;
+ case TCPOPT_SACK_PERM:
+ if (opsize == TCPOLEN_SACK_PERM && th->syn &&
+ !estab && sysctl_tcp_sack) {
+ opt_rx->sack_ok = 1;
+ tcp_sack_reset(opt_rx);
+ }
+ break;
- case TCPOPT_SACK:
- if ((opsize >= (TCPOLEN_SACK_BASE + TCPOLEN_SACK_PERBLOCK)) &&
- !((opsize - TCPOLEN_SACK_BASE) % TCPOLEN_SACK_PERBLOCK) &&
- opt_rx->sack_ok) {
- TCP_SKB_CB(skb)->sacked = (ptr - 2) - (unsigned char *)th;
- }
- break;
+ case TCPOPT_SACK:
+ if ((opsize >= (TCPOLEN_SACK_BASE + TCPOLEN_SACK_PERBLOCK)) &&
+ !((opsize - TCPOLEN_SACK_BASE) % TCPOLEN_SACK_PERBLOCK) &&
+ opt_rx->sack_ok) {
+ TCP_SKB_CB(skb)->sacked = (ptr - 2) - (unsigned char *)th;
+ }
+ break;
#ifdef CONFIG_TCP_MD5SIG
- case TCPOPT_MD5SIG:
- /*
- * The MD5 Hash has already been
- * checked (see tcp_v{4,6}_do_rcv()).
- */
- break;
+ case TCPOPT_MD5SIG:
+ /*
+ * The MD5 Hash has already been
+ * checked (see tcp_v{4,6}_do_rcv()).
+ */
+ break;
#endif
- }
+ }
- ptr+=opsize-2;
- length-=opsize;
+ ptr += opsize-2;
+ length -= opsize;
}
}
}
@@ -3257,7 +3362,7 @@ void tcp_parse_options(struct sk_buff *skb, struct tcp_options_received *opt_rx,
static int tcp_fast_parse_options(struct sk_buff *skb, struct tcphdr *th,
struct tcp_sock *tp)
{
- if (th->doff == sizeof(struct tcphdr)>>2) {
+ if (th->doff == sizeof(struct tcphdr) >> 2) {
tp->rx_opt.saw_tstamp = 0;
return 0;
} else if (tp->rx_opt.tstamp_ok &&
@@ -3342,7 +3447,8 @@ static int tcp_disordered_ack(const struct sock *sk, const struct sk_buff *skb)
(s32)(tp->rx_opt.ts_recent - tp->rx_opt.rcv_tsval) <= (inet_csk(sk)->icsk_rto * 1024) / HZ);
}
-static inline int tcp_paws_discard(const struct sock *sk, const struct sk_buff *skb)
+static inline int tcp_paws_discard(const struct sock *sk,
+ const struct sk_buff *skb)
{
const struct tcp_sock *tp = tcp_sk(sk);
return ((s32)(tp->rx_opt.ts_recent - tp->rx_opt.rcv_tsval) > TCP_PAWS_WINDOW &&
@@ -3374,16 +3480,16 @@ static void tcp_reset(struct sock *sk)
{
/* We want the right error as BSD sees it (and indeed as we do). */
switch (sk->sk_state) {
- case TCP_SYN_SENT:
- sk->sk_err = ECONNREFUSED;
- break;
- case TCP_CLOSE_WAIT:
- sk->sk_err = EPIPE;
- break;
- case TCP_CLOSE:
- return;
- default:
- sk->sk_err = ECONNRESET;
+ case TCP_SYN_SENT:
+ sk->sk_err = ECONNREFUSED;
+ break;
+ case TCP_CLOSE_WAIT:
+ sk->sk_err = EPIPE;
+ break;
+ case TCP_CLOSE:
+ return;
+ default:
+ sk->sk_err = ECONNRESET;
}
if (!sock_flag(sk, SOCK_DEAD))
@@ -3416,43 +3522,43 @@ static void tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th)
sock_set_flag(sk, SOCK_DONE);
switch (sk->sk_state) {
- case TCP_SYN_RECV:
- case TCP_ESTABLISHED:
- /* Move to CLOSE_WAIT */
- tcp_set_state(sk, TCP_CLOSE_WAIT);
- inet_csk(sk)->icsk_ack.pingpong = 1;
- break;
+ case TCP_SYN_RECV:
+ case TCP_ESTABLISHED:
+ /* Move to CLOSE_WAIT */
+ tcp_set_state(sk, TCP_CLOSE_WAIT);
+ inet_csk(sk)->icsk_ack.pingpong = 1;
+ break;
- case TCP_CLOSE_WAIT:
- case TCP_CLOSING:
- /* Received a retransmission of the FIN, do
- * nothing.
- */
- break;
- case TCP_LAST_ACK:
- /* RFC793: Remain in the LAST-ACK state. */
- break;
+ case TCP_CLOSE_WAIT:
+ case TCP_CLOSING:
+ /* Received a retransmission of the FIN, do
+ * nothing.
+ */
+ break;
+ case TCP_LAST_ACK:
+ /* RFC793: Remain in the LAST-ACK state. */
+ break;
- case TCP_FIN_WAIT1:
- /* This case occurs when a simultaneous close
- * happens, we must ack the received FIN and
- * enter the CLOSING state.
- */
- tcp_send_ack(sk);
- tcp_set_state(sk, TCP_CLOSING);
- break;
- case TCP_FIN_WAIT2:
- /* Received a FIN -- send ACK and enter TIME_WAIT. */
- tcp_send_ack(sk);
- tcp_time_wait(sk, TCP_TIME_WAIT, 0);
- break;
- default:
- /* Only TCP_LISTEN and TCP_CLOSE are left, in these
- * cases we should never reach this piece of code.
- */
- printk(KERN_ERR "%s: Impossible, sk->sk_state=%d\n",
- __FUNCTION__, sk->sk_state);
- break;
+ case TCP_FIN_WAIT1:
+ /* This case occurs when a simultaneous close
+ * happens, we must ack the received FIN and
+ * enter the CLOSING state.
+ */
+ tcp_send_ack(sk);
+ tcp_set_state(sk, TCP_CLOSING);
+ break;
+ case TCP_FIN_WAIT2:
+ /* Received a FIN -- send ACK and enter TIME_WAIT. */
+ tcp_send_ack(sk);
+ tcp_time_wait(sk, TCP_TIME_WAIT, 0);
+ break;
+ default:
+ /* Only TCP_LISTEN and TCP_CLOSE are left, in these
+ * cases we should never reach this piece of code.
+ */
+ printk(KERN_ERR "%s: Impossible, sk->sk_state=%d\n",
+ __FUNCTION__, sk->sk_state);
+ break;
}
/* It _is_ possible, that we have something out-of-order _after_ FIN.
@@ -3461,7 +3567,7 @@ static void tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th)
__skb_queue_purge(&tp->out_of_order_queue);
if (tcp_is_sack(tp))
tcp_sack_reset(&tp->rx_opt);
- sk_stream_mem_reclaim(sk);
+ sk_mem_reclaim(sk);
if (!sock_flag(sk, SOCK_DEAD)) {
sk->sk_state_change(sk);
@@ -3469,13 +3575,14 @@ static void tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th)
/* Do not send POLL_HUP for half duplex close. */
if (sk->sk_shutdown == SHUTDOWN_MASK ||
sk->sk_state == TCP_CLOSE)
- sk_wake_async(sk, 1, POLL_HUP);
+ sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_HUP);
else
- sk_wake_async(sk, 1, POLL_IN);
+ sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
}
}
-static inline int tcp_sack_extend(struct tcp_sack_block *sp, u32 seq, u32 end_seq)
+static inline int tcp_sack_extend(struct tcp_sack_block *sp, u32 seq,
+ u32 end_seq)
{
if (!after(seq, sp->end_seq) && !after(sp->start_seq, end_seq)) {
if (before(seq, sp->start_seq))
@@ -3498,7 +3605,8 @@ static void tcp_dsack_set(struct tcp_sock *tp, u32 seq, u32 end_seq)
tp->rx_opt.dsack = 1;
tp->duplicate_sack[0].start_seq = seq;
tp->duplicate_sack[0].end_seq = end_seq;
- tp->rx_opt.eff_sacks = min(tp->rx_opt.num_sacks + 1, 4 - tp->rx_opt.tstamp_ok);
+ tp->rx_opt.eff_sacks = min(tp->rx_opt.num_sacks + 1,
+ 4 - tp->rx_opt.tstamp_ok);
}
}
@@ -3538,12 +3646,12 @@ static void tcp_sack_maybe_coalesce(struct tcp_sock *tp)
{
int this_sack;
struct tcp_sack_block *sp = &tp->selective_acks[0];
- struct tcp_sack_block *swalk = sp+1;
+ struct tcp_sack_block *swalk = sp + 1;
/* See if the recent change to the first SACK eats into
* or hits the sequence space of other SACK blocks, if so coalesce.
*/
- for (this_sack = 1; this_sack < tp->rx_opt.num_sacks; ) {
+ for (this_sack = 1; this_sack < tp->rx_opt.num_sacks;) {
if (tcp_sack_extend(sp, swalk->start_seq, swalk->end_seq)) {
int i;
@@ -3551,16 +3659,19 @@ static void tcp_sack_maybe_coalesce(struct tcp_sock *tp)
* Decrease num_sacks.
*/
tp->rx_opt.num_sacks--;
- tp->rx_opt.eff_sacks = min(tp->rx_opt.num_sacks + tp->rx_opt.dsack, 4 - tp->rx_opt.tstamp_ok);
- for (i=this_sack; i < tp->rx_opt.num_sacks; i++)
- sp[i] = sp[i+1];
+ tp->rx_opt.eff_sacks = min(tp->rx_opt.num_sacks +
+ tp->rx_opt.dsack,
+ 4 - tp->rx_opt.tstamp_ok);
+ for (i = this_sack; i < tp->rx_opt.num_sacks; i++)
+ sp[i] = sp[i + 1];
continue;
}
this_sack++, swalk++;
}
}
-static inline void tcp_sack_swap(struct tcp_sack_block *sack1, struct tcp_sack_block *sack2)
+static inline void tcp_sack_swap(struct tcp_sack_block *sack1,
+ struct tcp_sack_block *sack2)
{
__u32 tmp;
@@ -3583,11 +3694,11 @@ static void tcp_sack_new_ofo_skb(struct sock *sk, u32 seq, u32 end_seq)
if (!cur_sacks)
goto new_sack;
- for (this_sack=0; this_sack<cur_sacks; this_sack++, sp++) {
+ for (this_sack = 0; this_sack < cur_sacks; this_sack++, sp++) {
if (tcp_sack_extend(sp, seq, end_seq)) {
/* Rotate this_sack to the first one. */
- for (; this_sack>0; this_sack--, sp--)
- tcp_sack_swap(sp, sp-1);
+ for (; this_sack > 0; this_sack--, sp--)
+ tcp_sack_swap(sp, sp - 1);
if (cur_sacks > 1)
tcp_sack_maybe_coalesce(tp);
return;
@@ -3606,14 +3717,15 @@ static void tcp_sack_new_ofo_skb(struct sock *sk, u32 seq, u32 end_seq)
sp--;
}
for (; this_sack > 0; this_sack--, sp--)
- *sp = *(sp-1);
+ *sp = *(sp - 1);
new_sack:
/* Build the new head SACK, and we're done. */
sp->start_seq = seq;
sp->end_seq = end_seq;
tp->rx_opt.num_sacks++;
- tp->rx_opt.eff_sacks = min(tp->rx_opt.num_sacks + tp->rx_opt.dsack, 4 - tp->rx_opt.tstamp_ok);
+ tp->rx_opt.eff_sacks = min(tp->rx_opt.num_sacks + tp->rx_opt.dsack,
+ 4 - tp->rx_opt.tstamp_ok);
}
/* RCV.NXT advances, some SACKs should be eaten. */
@@ -3631,7 +3743,7 @@ static void tcp_sack_remove(struct tcp_sock *tp)
return;
}
- for (this_sack = 0; this_sack < num_sacks; ) {
+ for (this_sack = 0; this_sack < num_sacks;) {
/* Check if the start of the sack is covered by RCV.NXT. */
if (!before(tp->rcv_nxt, sp->start_seq)) {
int i;
@@ -3650,7 +3762,9 @@ static void tcp_sack_remove(struct tcp_sock *tp)
}
if (num_sacks != tp->rx_opt.num_sacks) {
tp->rx_opt.num_sacks = num_sacks;
- tp->rx_opt.eff_sacks = min(tp->rx_opt.num_sacks + tp->rx_opt.dsack, 4 - tp->rx_opt.tstamp_ok);
+ tp->rx_opt.eff_sacks = min(tp->rx_opt.num_sacks +
+ tp->rx_opt.dsack,
+ 4 - tp->rx_opt.tstamp_ok);
}
}
@@ -3703,14 +3817,14 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq)
goto drop;
- __skb_pull(skb, th->doff*4);
+ __skb_pull(skb, th->doff * 4);
TCP_ECN_accept_cwr(tp, skb);
if (tp->rx_opt.dsack) {
tp->rx_opt.dsack = 0;
tp->rx_opt.eff_sacks = min_t(unsigned int, tp->rx_opt.num_sacks,
- 4 - tp->rx_opt.tstamp_ok);
+ 4 - tp->rx_opt.tstamp_ok);
}
/* Queue data for delivery to the user.
@@ -3726,7 +3840,7 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
tp->copied_seq == tp->rcv_nxt && tp->ucopy.len &&
sock_owned_by_user(sk) && !tp->urg_data) {
int chunk = min_t(unsigned int, skb->len,
- tp->ucopy.len);
+ tp->ucopy.len);
__set_current_state(TASK_RUNNING);
@@ -3744,12 +3858,12 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
queue_and_out:
if (eaten < 0 &&
(atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf ||
- !sk_stream_rmem_schedule(sk, skb))) {
+ !sk_rmem_schedule(sk, skb->truesize))) {
if (tcp_prune_queue(sk) < 0 ||
- !sk_stream_rmem_schedule(sk, skb))
+ !sk_rmem_schedule(sk, skb->truesize))
goto drop;
}
- sk_stream_set_owner_r(skb, sk);
+ skb_set_owner_r(skb, sk);
__skb_queue_tail(&sk->sk_receive_queue, skb);
}
tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
@@ -3818,9 +3932,9 @@ drop:
TCP_ECN_check_ce(tp, skb);
if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf ||
- !sk_stream_rmem_schedule(sk, skb)) {
+ !sk_rmem_schedule(sk, skb->truesize)) {
if (tcp_prune_queue(sk) < 0 ||
- !sk_stream_rmem_schedule(sk, skb))
+ !sk_rmem_schedule(sk, skb->truesize))
goto drop;
}
@@ -3831,7 +3945,7 @@ drop:
SOCK_DEBUG(sk, "out of order segment: rcv_next %X seq %X - %X\n",
tp->rcv_nxt, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq);
- sk_stream_set_owner_r(skb, sk);
+ skb_set_owner_r(skb, sk);
if (!skb_peek(&tp->out_of_order_queue)) {
/* Initial out of order segment, build 1 SACK. */
@@ -3843,7 +3957,7 @@ drop:
tp->selective_acks[0].end_seq =
TCP_SKB_CB(skb)->end_seq;
}
- __skb_queue_head(&tp->out_of_order_queue,skb);
+ __skb_queue_head(&tp->out_of_order_queue, skb);
} else {
struct sk_buff *skb1 = tp->out_of_order_queue.prev;
u32 seq = TCP_SKB_CB(skb)->seq;
@@ -3866,10 +3980,10 @@ drop:
if (!after(TCP_SKB_CB(skb1)->seq, seq))
break;
} while ((skb1 = skb1->prev) !=
- (struct sk_buff*)&tp->out_of_order_queue);
+ (struct sk_buff *)&tp->out_of_order_queue);
/* Do skb overlap to previous one? */
- if (skb1 != (struct sk_buff*)&tp->out_of_order_queue &&
+ if (skb1 != (struct sk_buff *)&tp->out_of_order_queue &&
before(seq, TCP_SKB_CB(skb1)->end_seq)) {
if (!after(end_seq, TCP_SKB_CB(skb1)->end_seq)) {
/* All the bits are present. Drop. */
@@ -3879,7 +3993,8 @@ drop:
}
if (after(seq, TCP_SKB_CB(skb1)->seq)) {
/* Partial overlap. */
- tcp_dsack_set(tp, seq, TCP_SKB_CB(skb1)->end_seq);
+ tcp_dsack_set(tp, seq,
+ TCP_SKB_CB(skb1)->end_seq);
} else {
skb1 = skb1->prev;
}
@@ -3888,15 +4003,17 @@ drop:
/* And clean segments covered by new one as whole. */
while ((skb1 = skb->next) !=
- (struct sk_buff*)&tp->out_of_order_queue &&
+ (struct sk_buff *)&tp->out_of_order_queue &&
after(end_seq, TCP_SKB_CB(skb1)->seq)) {
- if (before(end_seq, TCP_SKB_CB(skb1)->end_seq)) {
- tcp_dsack_extend(tp, TCP_SKB_CB(skb1)->seq, end_seq);
- break;
- }
- __skb_unlink(skb1, &tp->out_of_order_queue);
- tcp_dsack_extend(tp, TCP_SKB_CB(skb1)->seq, TCP_SKB_CB(skb1)->end_seq);
- __kfree_skb(skb1);
+ if (before(end_seq, TCP_SKB_CB(skb1)->end_seq)) {
+ tcp_dsack_extend(tp, TCP_SKB_CB(skb1)->seq,
+ end_seq);
+ break;
+ }
+ __skb_unlink(skb1, &tp->out_of_order_queue);
+ tcp_dsack_extend(tp, TCP_SKB_CB(skb1)->seq,
+ TCP_SKB_CB(skb1)->end_seq);
+ __kfree_skb(skb1);
}
add_sack:
@@ -3919,7 +4036,7 @@ tcp_collapse(struct sock *sk, struct sk_buff_head *list,
/* First, check that queue is collapsible and find
* the point where collapsing can be useful. */
- for (skb = head; skb != tail; ) {
+ for (skb = head; skb != tail;) {
/* No new bits? It is possible on ofo queue. */
if (!before(start, TCP_SKB_CB(skb)->end_seq)) {
struct sk_buff *next = skb->next;
@@ -3957,9 +4074,9 @@ tcp_collapse(struct sock *sk, struct sk_buff_head *list,
/* Too big header? This can happen with IPv6. */
if (copy < 0)
return;
- if (end-start < copy)
- copy = end-start;
- nskb = alloc_skb(copy+header, GFP_ATOMIC);
+ if (end - start < copy)
+ copy = end - start;
+ nskb = alloc_skb(copy + header, GFP_ATOMIC);
if (!nskb)
return;
@@ -3973,7 +4090,7 @@ tcp_collapse(struct sock *sk, struct sk_buff_head *list,
memcpy(nskb->cb, skb->cb, sizeof(skb->cb));
TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(nskb)->end_seq = start;
__skb_insert(nskb, skb->prev, skb, list);
- sk_stream_set_owner_r(nskb, sk);
+ skb_set_owner_r(nskb, sk);
/* Copy data, releasing collapsed skbs. */
while (copy > 0) {
@@ -4069,9 +4186,9 @@ static int tcp_prune_queue(struct sock *sk)
tcp_collapse_ofo_queue(sk);
tcp_collapse(sk, &sk->sk_receive_queue,
sk->sk_receive_queue.next,
- (struct sk_buff*)&sk->sk_receive_queue,
+ (struct sk_buff *)&sk->sk_receive_queue,
tp->copied_seq, tp->rcv_nxt);
- sk_stream_mem_reclaim(sk);
+ sk_mem_reclaim(sk);
if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf)
return 0;
@@ -4091,7 +4208,7 @@ static int tcp_prune_queue(struct sock *sk)
*/
if (tcp_is_sack(tp))
tcp_sack_reset(&tp->rx_opt);
- sk_stream_mem_reclaim(sk);
+ sk_mem_reclaim(sk);
}
if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf)
@@ -4108,7 +4225,6 @@ static int tcp_prune_queue(struct sock *sk)
return -1;
}
-
/* RFC2861, slow part. Adjust cwnd, after it was not full during one rto.
* As additional protections, we do not touch cwnd in retransmission phases,
* and if application hit its sndbuf limit recently.
@@ -4170,8 +4286,8 @@ static void tcp_new_space(struct sock *sk)
int sndmem = max_t(u32, tp->rx_opt.mss_clamp, tp->mss_cache) +
MAX_TCP_HEADER + 16 + sizeof(struct sk_buff),
demanded = max_t(unsigned int, tp->snd_cwnd,
- tp->reordering + 1);
- sndmem *= 2*demanded;
+ tp->reordering + 1);
+ sndmem *= 2 * demanded;
if (sndmem > sk->sk_sndbuf)
sk->sk_sndbuf = min(sndmem, sysctl_tcp_wmem[2]);
tp->snd_cwnd_stamp = tcp_time_stamp;
@@ -4212,8 +4328,7 @@ static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible)
/* We ACK each frame or... */
tcp_in_quickack_mode(sk) ||
/* We have out of order data. */
- (ofo_possible &&
- skb_peek(&tp->out_of_order_queue))) {
+ (ofo_possible && skb_peek(&tp->out_of_order_queue))) {
/* Then ack it now */
tcp_send_ack(sk);
} else {
@@ -4241,7 +4356,7 @@ static inline void tcp_ack_snd_check(struct sock *sk)
* either form (or just set the sysctl tcp_stdurg).
*/
-static void tcp_check_urg(struct sock * sk, struct tcphdr * th)
+static void tcp_check_urg(struct sock *sk, struct tcphdr *th)
{
struct tcp_sock *tp = tcp_sk(sk);
u32 ptr = ntohs(th->urg_ptr);
@@ -4290,8 +4405,7 @@ static void tcp_check_urg(struct sock * sk, struct tcphdr * th)
* buggy users.
*/
if (tp->urg_seq == tp->copied_seq && tp->urg_data &&
- !sock_flag(sk, SOCK_URGINLINE) &&
- tp->copied_seq != tp->rcv_nxt) {
+ !sock_flag(sk, SOCK_URGINLINE) && tp->copied_seq != tp->rcv_nxt) {
struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);
tp->copied_seq++;
if (skb && !before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq)) {
@@ -4300,8 +4414,8 @@ static void tcp_check_urg(struct sock * sk, struct tcphdr * th)
}
}
- tp->urg_data = TCP_URG_NOTYET;
- tp->urg_seq = ptr;
+ tp->urg_data = TCP_URG_NOTYET;
+ tp->urg_seq = ptr;
/* Disable header prediction. */
tp->pred_flags = 0;
@@ -4314,7 +4428,7 @@ static void tcp_urg(struct sock *sk, struct sk_buff *skb, struct tcphdr *th)
/* Check if we get a new urgent pointer - normally not. */
if (th->urg)
- tcp_check_urg(sk,th);
+ tcp_check_urg(sk, th);
/* Do we wait for any urgent data? - normally not... */
if (tp->urg_data == TCP_URG_NOTYET) {
@@ -4356,7 +4470,8 @@ static int tcp_copy_to_iovec(struct sock *sk, struct sk_buff *skb, int hlen)
return err;
}
-static __sum16 __tcp_checksum_complete_user(struct sock *sk, struct sk_buff *skb)
+static __sum16 __tcp_checksum_complete_user(struct sock *sk,
+ struct sk_buff *skb)
{
__sum16 result;
@@ -4370,14 +4485,16 @@ static __sum16 __tcp_checksum_complete_user(struct sock *sk, struct sk_buff *skb
return result;
}
-static inline int tcp_checksum_complete_user(struct sock *sk, struct sk_buff *skb)
+static inline int tcp_checksum_complete_user(struct sock *sk,
+ struct sk_buff *skb)
{
return !skb_csum_unnecessary(skb) &&
- __tcp_checksum_complete_user(sk, skb);
+ __tcp_checksum_complete_user(sk, skb);
}
#ifdef CONFIG_NET_DMA
-static int tcp_dma_try_early_copy(struct sock *sk, struct sk_buff *skb, int hlen)
+static int tcp_dma_try_early_copy(struct sock *sk, struct sk_buff *skb,
+ int hlen)
{
struct tcp_sock *tp = tcp_sk(sk);
int chunk = skb->len - hlen;
@@ -4393,7 +4510,9 @@ static int tcp_dma_try_early_copy(struct sock *sk, struct sk_buff *skb, int hlen
if (tp->ucopy.dma_chan && skb_csum_unnecessary(skb)) {
dma_cookie = dma_skb_copy_datagram_iovec(tp->ucopy.dma_chan,
- skb, hlen, tp->ucopy.iov, chunk, tp->ucopy.pinned_list);
+ skb, hlen,
+ tp->ucopy.iov, chunk,
+ tp->ucopy.pinned_list);
if (dma_cookie < 0)
goto out;
@@ -4475,7 +4594,7 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
*/
if ((tcp_flag_word(th) & TCP_HP_BITS) == tp->pred_flags &&
- TCP_SKB_CB(skb)->seq == tp->rcv_nxt) {
+ TCP_SKB_CB(skb)->seq == tp->rcv_nxt) {
int tcp_header_len = tp->tcp_header_len;
/* Timestamp header prediction: tcp_header_len
@@ -4544,7 +4663,8 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
eaten = 1;
}
#endif
- if (tp->ucopy.task == current && sock_owned_by_user(sk) && !copied_early) {
+ if (tp->ucopy.task == current &&
+ sock_owned_by_user(sk) && !copied_early) {
__set_current_state(TASK_RUNNING);
if (!tcp_copy_to_iovec(sk, skb, tcp_header_len))
@@ -4591,9 +4711,9 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
NET_INC_STATS_BH(LINUX_MIB_TCPHPHITS);
/* Bulk data transfer: receiver */
- __skb_pull(skb,tcp_header_len);
+ __skb_pull(skb, tcp_header_len);
__skb_queue_tail(&sk->sk_receive_queue, skb);
- sk_stream_set_owner_r(skb, sk);
+ skb_set_owner_r(skb, sk);
tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
}
@@ -4623,7 +4743,7 @@ no_ack:
}
slow_path:
- if (len < (th->doff<<2) || tcp_checksum_complete_user(sk, skb))
+ if (len < (th->doff << 2) || tcp_checksum_complete_user(sk, skb))
goto csum_error;
/*
@@ -4830,7 +4950,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
if (!sock_flag(sk, SOCK_DEAD)) {
sk->sk_state_change(sk);
- sk_wake_async(sk, 0, POLL_OUT);
+ sk_wake_async(sk, SOCK_WAKE_IO, POLL_OUT);
}
if (sk->sk_write_pending ||
@@ -4873,7 +4993,8 @@ discard:
}
/* PAWS check. */
- if (tp->rx_opt.ts_recent_stamp && tp->rx_opt.saw_tstamp && tcp_paws_check(&tp->rx_opt, 0))
+ if (tp->rx_opt.ts_recent_stamp && tp->rx_opt.saw_tstamp &&
+ tcp_paws_check(&tp->rx_opt, 0))
goto discard_and_undo;
if (th->syn) {
@@ -4908,7 +5029,6 @@ discard:
tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
tcp_initialize_rcv_mss(sk);
-
tcp_send_synack(sk);
#if 0
/* Note, we could accept data and URG from this segment.
@@ -4940,7 +5060,6 @@ reset_and_undo:
return 1;
}
-
/*
* This function implements the receiving procedure of RFC 793 for
* all states except ESTABLISHED and TIME_WAIT.
@@ -5060,9 +5179,9 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
* are not waked up, because sk->sk_sleep ==
* NULL and sk->sk_socket == NULL.
*/
- if (sk->sk_socket) {
- sk_wake_async(sk,0,POLL_OUT);
- }
+ if (sk->sk_socket)
+ sk_wake_async(sk,
+ SOCK_WAKE_IO, POLL_OUT);
tp->snd_una = TCP_SKB_CB(skb)->ack_seq;
tp->snd_wnd = ntohs(th->window) <<
@@ -5074,8 +5193,8 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
* and does not calculate rtt.
* Fix it at least with timestamps.
*/
- if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr &&
- !tp->srtt)
+ if (tp->rx_opt.saw_tstamp &&
+ tp->rx_opt.rcv_tsecr && !tp->srtt)
tcp_ack_saw_tstamp(sk, 0);
if (tp->rx_opt.tstamp_ok)
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 652c32368ccc..9aea88b8d4fc 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -99,7 +99,7 @@ static struct tcp_md5sig_key *tcp_v4_md5_do_lookup(struct sock *sk,
static int tcp_v4_do_calc_md5_hash(char *md5_hash, struct tcp_md5sig_key *key,
__be32 saddr, __be32 daddr,
struct tcphdr *th, int protocol,
- int tcplen);
+ unsigned int tcplen);
#endif
struct inet_hashinfo __cacheline_aligned tcp_hashinfo = {
@@ -1020,7 +1020,7 @@ static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval,
static int tcp_v4_do_calc_md5_hash(char *md5_hash, struct tcp_md5sig_key *key,
__be32 saddr, __be32 daddr,
struct tcphdr *th, int protocol,
- int tcplen)
+ unsigned int tcplen)
{
struct scatterlist sg[4];
__u16 data_len;
@@ -1113,7 +1113,7 @@ int tcp_v4_calc_md5_hash(char *md5_hash, struct tcp_md5sig_key *key,
struct dst_entry *dst,
struct request_sock *req,
struct tcphdr *th, int protocol,
- int tcplen)
+ unsigned int tcplen)
{
__be32 saddr, daddr;
@@ -1478,7 +1478,7 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
}
#endif
- __inet_hash(&tcp_hashinfo, newsk, 0);
+ __inet_hash_nolisten(&tcp_hashinfo, newsk);
__inet_inherit_port(&tcp_hashinfo, sk, newsk);
return newsk;
diff --git a/net/ipv4/tcp_lp.c b/net/ipv4/tcp_lp.c
index e7f5ef92cbd8..ce3c41ff50b2 100644
--- a/net/ipv4/tcp_lp.c
+++ b/net/ipv4/tcp_lp.c
@@ -115,12 +115,12 @@ static void tcp_lp_init(struct sock *sk)
* Will only call newReno CA when away from inference.
* From TCP-LP's paper, this will be handled in additive increasement.
*/
-static void tcp_lp_cong_avoid(struct sock *sk, u32 ack, u32 in_flight, int flag)
+static void tcp_lp_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
{
struct lp *lp = inet_csk_ca(sk);
if (!(lp->flag & LP_WITHIN_INF))
- tcp_reno_cong_avoid(sk, ack, in_flight, flag);
+ tcp_reno_cong_avoid(sk, ack, in_flight);
}
/**
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index f4c1eef89af0..89f0188885c7 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -61,27 +61,24 @@ int sysctl_tcp_base_mss __read_mostly = 512;
/* By default, RFC2861 behavior. */
int sysctl_tcp_slow_start_after_idle __read_mostly = 1;
-static inline void tcp_packets_out_inc(struct sock *sk,
- const struct sk_buff *skb)
+static void tcp_event_new_data_sent(struct sock *sk, struct sk_buff *skb)
{
struct tcp_sock *tp = tcp_sk(sk);
- int orig = tp->packets_out;
+ unsigned int prior_packets = tp->packets_out;
+
+ tcp_advance_send_head(sk, skb);
+ tp->snd_nxt = TCP_SKB_CB(skb)->end_seq;
+
+ /* Don't override Nagle indefinately with F-RTO */
+ if (tp->frto_counter == 2)
+ tp->frto_counter = 3;
tp->packets_out += tcp_skb_pcount(skb);
- if (!orig)
+ if (!prior_packets)
inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
inet_csk(sk)->icsk_rto, TCP_RTO_MAX);
}
-static void update_send_head(struct sock *sk, struct sk_buff *skb)
-{
- struct tcp_sock *tp = tcp_sk(sk);
-
- tcp_advance_send_head(sk, skb);
- tp->snd_nxt = TCP_SKB_CB(skb)->end_seq;
- tcp_packets_out_inc(sk, skb);
-}
-
/* SND.NXT, if window was not shrunk.
* If window has been shrunk, what should we make? It is not clear at all.
* Using SND.UNA we will fail to open window, SND.NXT is out of window. :-(
@@ -92,10 +89,10 @@ static inline __u32 tcp_acceptable_seq(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
- if (!before(tp->snd_una+tp->snd_wnd, tp->snd_nxt))
+ if (!before(tcp_wnd_end(tp), tp->snd_nxt))
return tp->snd_nxt;
else
- return tp->snd_una+tp->snd_wnd;
+ return tcp_wnd_end(tp);
}
/* Calculate mss to advertise in SYN segment.
@@ -224,14 +221,14 @@ void tcp_select_initial_window(int __space, __u32 mss,
* following RFC2414. Senders, not following this RFC,
* will be satisfied with 2.
*/
- if (mss > (1<<*rcv_wscale)) {
+ if (mss > (1 << *rcv_wscale)) {
int init_cwnd = 4;
- if (mss > 1460*3)
+ if (mss > 1460 * 3)
init_cwnd = 2;
else if (mss > 1460)
init_cwnd = 3;
- if (*rcv_wnd > init_cwnd*mss)
- *rcv_wnd = init_cwnd*mss;
+ if (*rcv_wnd > init_cwnd * mss)
+ *rcv_wnd = init_cwnd * mss;
}
/* Set the clamp no higher than max representable value */
@@ -281,11 +278,10 @@ static u16 tcp_select_window(struct sock *sk)
return new_win;
}
-static inline void TCP_ECN_send_synack(struct tcp_sock *tp,
- struct sk_buff *skb)
+static inline void TCP_ECN_send_synack(struct tcp_sock *tp, struct sk_buff *skb)
{
TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_CWR;
- if (!(tp->ecn_flags&TCP_ECN_OK))
+ if (!(tp->ecn_flags & TCP_ECN_OK))
TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_ECE;
}
@@ -295,7 +291,7 @@ static inline void TCP_ECN_send_syn(struct sock *sk, struct sk_buff *skb)
tp->ecn_flags = 0;
if (sysctl_tcp_ecn) {
- TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_ECE|TCPCB_FLAG_CWR;
+ TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_ECE | TCPCB_FLAG_CWR;
tp->ecn_flags = TCP_ECN_OK;
}
}
@@ -317,7 +313,7 @@ static inline void TCP_ECN_send(struct sock *sk, struct sk_buff *skb,
if (skb->len != tcp_header_len &&
!before(TCP_SKB_CB(skb)->seq, tp->snd_nxt)) {
INET_ECN_xmit(sk);
- if (tp->ecn_flags&TCP_ECN_QUEUE_CWR) {
+ if (tp->ecn_flags & TCP_ECN_QUEUE_CWR) {
tp->ecn_flags &= ~TCP_ECN_QUEUE_CWR;
tcp_hdr(skb)->cwr = 1;
skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_ECN;
@@ -331,6 +327,26 @@ static inline void TCP_ECN_send(struct sock *sk, struct sk_buff *skb,
}
}
+/* Constructs common control bits of non-data skb. If SYN/FIN is present,
+ * auto increment end seqno.
+ */
+static void tcp_init_nondata_skb(struct sk_buff *skb, u32 seq, u8 flags)
+{
+ skb->csum = 0;
+
+ TCP_SKB_CB(skb)->flags = flags;
+ TCP_SKB_CB(skb)->sacked = 0;
+
+ skb_shinfo(skb)->gso_segs = 1;
+ skb_shinfo(skb)->gso_size = 0;
+ skb_shinfo(skb)->gso_type = 0;
+
+ TCP_SKB_CB(skb)->seq = seq;
+ if (flags & (TCPCB_FLAG_SYN | TCPCB_FLAG_FIN))
+ seq++;
+ TCP_SKB_CB(skb)->end_seq = seq;
+}
+
static void tcp_build_and_update_options(__be32 *ptr, struct tcp_sock *tp,
__u32 tstamp, __u8 **md5_hash)
{
@@ -434,7 +450,7 @@ static void tcp_syn_build_options(__be32 *ptr, int mss, int ts, int sack,
(TCPOPT_NOP << 16) |
(TCPOPT_MD5SIG << 8) |
TCPOLEN_MD5SIG);
- *md5_hash = (__u8 *) ptr;
+ *md5_hash = (__u8 *)ptr;
}
#endif
}
@@ -450,7 +466,8 @@ static void tcp_syn_build_options(__be32 *ptr, int mss, int ts, int sack,
* We are working here with either a clone of the original
* SKB, or a fresh unique copy made by the retransmit engine.
*/
-static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, gfp_t gfp_mask)
+static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
+ gfp_t gfp_mask)
{
const struct inet_connection_sock *icsk = inet_csk(sk);
struct inet_sock *inet;
@@ -554,8 +571,8 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
th->urg_ptr = 0;
if (unlikely(tp->urg_mode &&
- between(tp->snd_up, tcb->seq+1, tcb->seq+0xFFFF))) {
- th->urg_ptr = htons(tp->snd_up-tcb->seq);
+ between(tp->snd_up, tcb->seq + 1, tcb->seq + 0xFFFF))) {
+ th->urg_ptr = htons(tp->snd_up - tcb->seq);
th->urg = 1;
}
@@ -619,7 +636,6 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
#undef SYSCTL_FLAG_SACK
}
-
/* This routine just queue's the buffer
*
* NOTE: probe0 timer is not checked, do not forget tcp_push_pending_frames,
@@ -633,10 +649,12 @@ static void tcp_queue_skb(struct sock *sk, struct sk_buff *skb)
tp->write_seq = TCP_SKB_CB(skb)->end_seq;
skb_header_release(skb);
tcp_add_write_queue_tail(sk, skb);
- sk_charge_skb(sk, skb);
+ sk->sk_wmem_queued += skb->truesize;
+ sk_mem_charge(sk, skb->truesize);
}
-static void tcp_set_skb_tso_segs(struct sock *sk, struct sk_buff *skb, unsigned int mss_now)
+static void tcp_set_skb_tso_segs(struct sock *sk, struct sk_buff *skb,
+ unsigned int mss_now)
{
if (skb->len <= mss_now || !sk_can_gso(sk)) {
/* Avoid the costly divide in the normal
@@ -653,23 +671,18 @@ static void tcp_set_skb_tso_segs(struct sock *sk, struct sk_buff *skb, unsigned
}
/* When a modification to fackets out becomes necessary, we need to check
- * skb is counted to fackets_out or not. Another important thing is to
- * tweak SACK fastpath hint too as it would overwrite all changes unless
- * hint is also changed.
+ * skb is counted to fackets_out or not.
*/
-static void tcp_adjust_fackets_out(struct tcp_sock *tp, struct sk_buff *skb,
+static void tcp_adjust_fackets_out(struct sock *sk, struct sk_buff *skb,
int decr)
{
+ struct tcp_sock *tp = tcp_sk(sk);
+
if (!tp->sacked_out || tcp_is_reno(tp))
return;
- if (!before(tp->highest_sack, TCP_SKB_CB(skb)->seq))
+ if (after(tcp_highest_sack_seq(tp), TCP_SKB_CB(skb)->seq))
tp->fackets_out -= decr;
-
- /* cnt_hint is "off-by-one" compared with fackets_out (see sacktag) */
- if (tp->fastpath_skb_hint != NULL &&
- after(TCP_SKB_CB(tp->fastpath_skb_hint)->seq, TCP_SKB_CB(skb)->seq))
- tp->fastpath_cnt_hint -= decr;
}
/* Function to create two new TCP segments. Shrinks the given segment
@@ -677,7 +690,8 @@ static void tcp_adjust_fackets_out(struct tcp_sock *tp, struct sk_buff *skb,
* packet to the list. This won't be called frequently, I hope.
* Remember, these are still headerless SKBs at this point.
*/
-int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len, unsigned int mss_now)
+int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len,
+ unsigned int mss_now)
{
struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *buff;
@@ -702,7 +716,8 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len, unsigned int mss
if (buff == NULL)
return -ENOMEM; /* We'll just try again later. */
- sk_charge_skb(sk, buff);
+ sk->sk_wmem_queued += buff->truesize;
+ sk_mem_charge(sk, buff->truesize);
nlen = skb->len - len - nsize;
buff->truesize += nlen;
skb->truesize -= nlen;
@@ -712,20 +727,16 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len, unsigned int mss
TCP_SKB_CB(buff)->end_seq = TCP_SKB_CB(skb)->end_seq;
TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(buff)->seq;
- if (tcp_is_sack(tp) && tp->sacked_out &&
- (TCP_SKB_CB(skb)->seq == tp->highest_sack))
- tp->highest_sack = TCP_SKB_CB(buff)->seq;
-
/* PSH and FIN should only be set in the second packet. */
flags = TCP_SKB_CB(skb)->flags;
- TCP_SKB_CB(skb)->flags = flags & ~(TCPCB_FLAG_FIN|TCPCB_FLAG_PSH);
+ TCP_SKB_CB(skb)->flags = flags & ~(TCPCB_FLAG_FIN | TCPCB_FLAG_PSH);
TCP_SKB_CB(buff)->flags = flags;
TCP_SKB_CB(buff)->sacked = TCP_SKB_CB(skb)->sacked;
- TCP_SKB_CB(skb)->sacked &= ~TCPCB_AT_TAIL;
if (!skb_shinfo(skb)->nr_frags && skb->ip_summed != CHECKSUM_PARTIAL) {
/* Copy and checksum data tail into the new buffer. */
- buff->csum = csum_partial_copy_nocheck(skb->data + len, skb_put(buff, nsize),
+ buff->csum = csum_partial_copy_nocheck(skb->data + len,
+ skb_put(buff, nsize),
nsize, 0);
skb_trim(skb, len);
@@ -772,7 +783,7 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len, unsigned int mss
tcp_dec_pcount_approx_int(&tp->sacked_out, diff);
tcp_verify_left_out(tp);
}
- tcp_adjust_fackets_out(tp, skb, diff);
+ tcp_adjust_fackets_out(sk, skb, diff);
}
/* Link BUFF into the send queue. */
@@ -792,7 +803,7 @@ static void __pskb_trim_head(struct sk_buff *skb, int len)
eat = len;
k = 0;
- for (i=0; i<skb_shinfo(skb)->nr_frags; i++) {
+ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
if (skb_shinfo(skb)->frags[i].size <= eat) {
put_page(skb_shinfo(skb)->frags[i].page);
eat -= skb_shinfo(skb)->frags[i].size;
@@ -815,8 +826,7 @@ static void __pskb_trim_head(struct sk_buff *skb, int len)
int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len)
{
- if (skb_cloned(skb) &&
- pskb_expand_head(skb, 0, 0, GFP_ATOMIC))
+ if (skb_cloned(skb) && pskb_expand_head(skb, 0, 0, GFP_ATOMIC))
return -ENOMEM;
/* If len == headlen, we avoid __skb_pull to preserve alignment. */
@@ -830,7 +840,7 @@ int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len)
skb->truesize -= len;
sk->sk_wmem_queued -= len;
- sk->sk_forward_alloc += len;
+ sk_mem_uncharge(sk, len);
sock_set_flag(sk, SOCK_QUEUE_SHRUNK);
/* Any change of skb->len requires recalculation of tso
@@ -898,6 +908,15 @@ void tcp_mtup_init(struct sock *sk)
icsk->icsk_mtup.probe_size = 0;
}
+/* Bound MSS / TSO packet size with the half of the window */
+static int tcp_bound_to_half_wnd(struct tcp_sock *tp, int pktsize)
+{
+ if (tp->max_window && pktsize > (tp->max_window >> 1))
+ return max(tp->max_window >> 1, 68U - tp->tcp_header_len);
+ else
+ return pktsize;
+}
+
/* This function synchronize snd mss to current pmtu/exthdr set.
tp->rx_opt.user_mss is mss set by user by TCP_MAXSEG. It does NOT counts
@@ -920,7 +939,6 @@ void tcp_mtup_init(struct sock *sk)
NOTE2. inet_csk(sk)->icsk_pmtu_cookie and tp->mss_cache
are READ ONLY outside this function. --ANK (980731)
*/
-
unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu)
{
struct tcp_sock *tp = tcp_sk(sk);
@@ -931,10 +949,7 @@ unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu)
icsk->icsk_mtup.search_high = pmtu;
mss_now = tcp_mtu_to_mss(sk, pmtu);
-
- /* Bound mss with half of window */
- if (tp->max_window && mss_now > (tp->max_window>>1))
- mss_now = max((tp->max_window>>1), 68U - tp->tcp_header_len);
+ mss_now = tcp_bound_to_half_wnd(tp, mss_now);
/* And store cached results */
icsk->icsk_pmtu_cookie = pmtu;
@@ -988,11 +1003,7 @@ unsigned int tcp_current_mss(struct sock *sk, int large_allowed)
inet_csk(sk)->icsk_ext_hdr_len -
tp->tcp_header_len);
- if (tp->max_window &&
- (xmit_size_goal > (tp->max_window >> 1)))
- xmit_size_goal = max((tp->max_window >> 1),
- 68U - tp->tcp_header_len);
-
+ xmit_size_goal = tcp_bound_to_half_wnd(tp, xmit_size_goal);
xmit_size_goal -= (xmit_size_goal % mss_now);
}
tp->xmit_size_goal = xmit_size_goal;
@@ -1001,13 +1012,11 @@ unsigned int tcp_current_mss(struct sock *sk, int large_allowed)
}
/* Congestion window validation. (RFC2861) */
-
static void tcp_cwnd_validate(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
- __u32 packets_out = tp->packets_out;
- if (packets_out >= tp->snd_cwnd) {
+ if (tp->packets_out >= tp->snd_cwnd) {
/* Network is feed fully. */
tp->snd_cwnd_used = 0;
tp->snd_cwnd_stamp = tcp_time_stamp;
@@ -1022,19 +1031,35 @@ static void tcp_cwnd_validate(struct sock *sk)
}
}
-static unsigned int tcp_window_allows(struct tcp_sock *tp, struct sk_buff *skb, unsigned int mss_now, unsigned int cwnd)
+/* Returns the portion of skb which can be sent right away without
+ * introducing MSS oddities to segment boundaries. In rare cases where
+ * mss_now != mss_cache, we will request caller to create a small skb
+ * per input skb which could be mostly avoided here (if desired).
+ */
+static unsigned int tcp_mss_split_point(struct sock *sk, struct sk_buff *skb,
+ unsigned int mss_now, unsigned int cwnd)
{
- u32 window, cwnd_len;
+ struct tcp_sock *tp = tcp_sk(sk);
+ u32 needed, window, cwnd_len;
- window = (tp->snd_una + tp->snd_wnd - TCP_SKB_CB(skb)->seq);
+ window = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq;
cwnd_len = mss_now * cwnd;
- return min(window, cwnd_len);
+
+ if (likely(cwnd_len <= window && skb != tcp_write_queue_tail(sk)))
+ return cwnd_len;
+
+ if (skb == tcp_write_queue_tail(sk) && cwnd_len <= skb->len)
+ return cwnd_len;
+
+ needed = min(skb->len, window);
+ return needed - needed % mss_now;
}
/* Can at least one segment of SKB be sent right now, according to the
* congestion window rules? If so, return how many segments are allowed.
*/
-static inline unsigned int tcp_cwnd_test(struct tcp_sock *tp, struct sk_buff *skb)
+static inline unsigned int tcp_cwnd_test(struct tcp_sock *tp,
+ struct sk_buff *skb)
{
u32 in_flight, cwnd;
@@ -1054,13 +1079,12 @@ static inline unsigned int tcp_cwnd_test(struct tcp_sock *tp, struct sk_buff *sk
/* This must be invoked the first time we consider transmitting
* SKB onto the wire.
*/
-static int tcp_init_tso_segs(struct sock *sk, struct sk_buff *skb, unsigned int mss_now)
+static int tcp_init_tso_segs(struct sock *sk, struct sk_buff *skb,
+ unsigned int mss_now)
{
int tso_segs = tcp_skb_pcount(skb);
- if (!tso_segs ||
- (tso_segs > 1 &&
- tcp_skb_mss(skb) != mss_now)) {
+ if (!tso_segs || (tso_segs > 1 && tcp_skb_mss(skb) != mss_now)) {
tcp_set_skb_tso_segs(sk, skb, mss_now);
tso_segs = tcp_skb_pcount(skb);
}
@@ -1080,16 +1104,13 @@ static inline int tcp_minshall_check(const struct tcp_sock *tp)
* 4. Or TCP_CORK is not set, and all sent packets are ACKed.
* With Minshall's modification: all sent small packets are ACKed.
*/
-
static inline int tcp_nagle_check(const struct tcp_sock *tp,
const struct sk_buff *skb,
unsigned mss_now, int nonagle)
{
return (skb->len < mss_now &&
- ((nonagle&TCP_NAGLE_CORK) ||
- (!nonagle &&
- tp->packets_out &&
- tcp_minshall_check(tp))));
+ ((nonagle & TCP_NAGLE_CORK) ||
+ (!nonagle && tp->packets_out && tcp_minshall_check(tp))));
}
/* Return non-zero if the Nagle test allows this packet to be
@@ -1121,14 +1142,15 @@ static inline int tcp_nagle_test(struct tcp_sock *tp, struct sk_buff *skb,
}
/* Does at least the first segment of SKB fit into the send window? */
-static inline int tcp_snd_wnd_test(struct tcp_sock *tp, struct sk_buff *skb, unsigned int cur_mss)
+static inline int tcp_snd_wnd_test(struct tcp_sock *tp, struct sk_buff *skb,
+ unsigned int cur_mss)
{
u32 end_seq = TCP_SKB_CB(skb)->end_seq;
if (skb->len > cur_mss)
end_seq = TCP_SKB_CB(skb)->seq + cur_mss;
- return !after(end_seq, tp->snd_una + tp->snd_wnd);
+ return !after(end_seq, tcp_wnd_end(tp));
}
/* This checks if the data bearing packet SKB (usually tcp_send_head(sk))
@@ -1147,8 +1169,7 @@ static unsigned int tcp_snd_test(struct sock *sk, struct sk_buff *skb,
return 0;
cwnd_quota = tcp_cwnd_test(tp, skb);
- if (cwnd_quota &&
- !tcp_snd_wnd_test(tp, skb, cur_mss))
+ if (cwnd_quota && !tcp_snd_wnd_test(tp, skb, cur_mss))
cwnd_quota = 0;
return cwnd_quota;
@@ -1172,7 +1193,8 @@ int tcp_may_send_now(struct sock *sk)
* know that all the data is in scatter-gather pages, and that the
* packet has never been sent out before (and thus is not cloned).
*/
-static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len, unsigned int mss_now)
+static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len,
+ unsigned int mss_now)
{
struct sk_buff *buff;
int nlen = skb->len - len;
@@ -1182,11 +1204,12 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len,
if (skb->len != skb->data_len)
return tcp_fragment(sk, skb, len, mss_now);
- buff = sk_stream_alloc_pskb(sk, 0, 0, GFP_ATOMIC);
+ buff = sk_stream_alloc_skb(sk, 0, GFP_ATOMIC);
if (unlikely(buff == NULL))
return -ENOMEM;
- sk_charge_skb(sk, buff);
+ sk->sk_wmem_queued += buff->truesize;
+ sk_mem_charge(sk, buff->truesize);
buff->truesize += nlen;
skb->truesize -= nlen;
@@ -1197,7 +1220,7 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len,
/* PSH and FIN should only be set in the second packet. */
flags = TCP_SKB_CB(skb)->flags;
- TCP_SKB_CB(skb)->flags = flags & ~(TCPCB_FLAG_FIN|TCPCB_FLAG_PSH);
+ TCP_SKB_CB(skb)->flags = flags & ~(TCPCB_FLAG_FIN | TCPCB_FLAG_PSH);
TCP_SKB_CB(buff)->flags = flags;
/* This packet was never sent out yet, so no SACK bits. */
@@ -1235,15 +1258,15 @@ static int tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb)
goto send_now;
/* Defer for less than two clock ticks. */
- if (!tp->tso_deferred && ((jiffies<<1)>>1) - (tp->tso_deferred>>1) > 1)
+ if (tp->tso_deferred &&
+ ((jiffies << 1) >> 1) - (tp->tso_deferred >> 1) > 1)
goto send_now;
in_flight = tcp_packets_in_flight(tp);
- BUG_ON(tcp_skb_pcount(skb) <= 1 ||
- (tp->snd_cwnd <= in_flight));
+ BUG_ON(tcp_skb_pcount(skb) <= 1 || (tp->snd_cwnd <= in_flight));
- send_win = (tp->snd_una + tp->snd_wnd) - TCP_SKB_CB(skb)->seq;
+ send_win = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq;
/* From in_flight test above, we know that cwnd > in_flight. */
cong_win = (tp->snd_cwnd - in_flight) * tp->mss_cache;
@@ -1274,7 +1297,7 @@ static int tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb)
}
/* Ok, it looks like it is advisable to defer. */
- tp->tso_deferred = 1 | (jiffies<<1);
+ tp->tso_deferred = 1 | (jiffies << 1);
return 1;
@@ -1286,7 +1309,8 @@ send_now:
/* Create a new MTU probe if we are ready.
* Returns 0 if we should wait to probe (no cwnd available),
* 1 if a probe was sent,
- * -1 otherwise */
+ * -1 otherwise
+ */
static int tcp_mtu_probe(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
@@ -1295,7 +1319,6 @@ static int tcp_mtu_probe(struct sock *sk)
int len;
int probe_size;
int size_needed;
- unsigned int pif;
int copy;
int mss_now;
@@ -1312,7 +1335,7 @@ static int tcp_mtu_probe(struct sock *sk)
/* Very simple search strategy: just double the MSS. */
mss_now = tcp_current_mss(sk, 0);
- probe_size = 2*tp->mss_cache;
+ probe_size = 2 * tp->mss_cache;
size_needed = probe_size + (tp->reordering + 1) * tp->mss_cache;
if (probe_size > tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_high)) {
/* TODO: set timer for probe_converge_event */
@@ -1325,14 +1348,12 @@ static int tcp_mtu_probe(struct sock *sk)
if (tp->snd_wnd < size_needed)
return -1;
- if (after(tp->snd_nxt + size_needed, tp->snd_una + tp->snd_wnd))
+ if (after(tp->snd_nxt + size_needed, tcp_wnd_end(tp)))
return 0;
- /* Do we need to wait to drain cwnd? */
- pif = tcp_packets_in_flight(tp);
- if (pif + 2 > tp->snd_cwnd) {
- /* With no packets in flight, don't stall. */
- if (pif == 0)
+ /* Do we need to wait to drain cwnd? With none in flight, don't stall */
+ if (tcp_packets_in_flight(tp) + 2 > tp->snd_cwnd) {
+ if (!tcp_packets_in_flight(tp))
return -1;
else
return 0;
@@ -1341,10 +1362,10 @@ static int tcp_mtu_probe(struct sock *sk)
/* We're allowed to probe. Build it now. */
if ((nskb = sk_stream_alloc_skb(sk, probe_size, GFP_ATOMIC)) == NULL)
return -1;
- sk_charge_skb(sk, nskb);
+ sk->sk_wmem_queued += nskb->truesize;
+ sk_mem_charge(sk, nskb->truesize);
skb = tcp_send_head(sk);
- tcp_insert_write_queue_before(nskb, skb, sk);
TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(skb)->seq;
TCP_SKB_CB(nskb)->end_seq = TCP_SKB_CB(skb)->seq + probe_size;
@@ -1353,30 +1374,32 @@ static int tcp_mtu_probe(struct sock *sk)
nskb->csum = 0;
nskb->ip_summed = skb->ip_summed;
- len = 0;
- while (len < probe_size) {
- next = tcp_write_queue_next(sk, skb);
+ tcp_insert_write_queue_before(nskb, skb, sk);
+ len = 0;
+ tcp_for_write_queue_from_safe(skb, next, sk) {
copy = min_t(int, skb->len, probe_size - len);
if (nskb->ip_summed)
skb_copy_bits(skb, 0, skb_put(nskb, copy), copy);
else
nskb->csum = skb_copy_and_csum_bits(skb, 0,
- skb_put(nskb, copy), copy, nskb->csum);
+ skb_put(nskb, copy),
+ copy, nskb->csum);
if (skb->len <= copy) {
/* We've eaten all the data from this skb.
* Throw it away. */
TCP_SKB_CB(nskb)->flags |= TCP_SKB_CB(skb)->flags;
tcp_unlink_write_queue(skb, sk);
- sk_stream_free_skb(sk, skb);
+ sk_wmem_free_skb(sk, skb);
} else {
TCP_SKB_CB(nskb)->flags |= TCP_SKB_CB(skb)->flags &
~(TCPCB_FLAG_FIN|TCPCB_FLAG_PSH);
if (!skb_shinfo(skb)->nr_frags) {
skb_pull(skb, copy);
if (skb->ip_summed != CHECKSUM_PARTIAL)
- skb->csum = csum_partial(skb->data, skb->len, 0);
+ skb->csum = csum_partial(skb->data,
+ skb->len, 0);
} else {
__pskb_trim_head(skb, copy);
tcp_set_skb_tso_segs(sk, skb, mss_now);
@@ -1385,7 +1408,9 @@ static int tcp_mtu_probe(struct sock *sk)
}
len += copy;
- skb = next;
+
+ if (len >= probe_size)
+ break;
}
tcp_init_tso_segs(sk, nskb, nskb->len);
@@ -1394,9 +1419,9 @@ static int tcp_mtu_probe(struct sock *sk)
TCP_SKB_CB(nskb)->when = tcp_time_stamp;
if (!tcp_transmit_skb(sk, nskb, 1, GFP_ATOMIC)) {
/* Decrement cwnd here because we are sending
- * effectively two packets. */
+ * effectively two packets. */
tp->snd_cwnd--;
- update_send_head(sk, nskb);
+ tcp_event_new_data_sent(sk, nskb);
icsk->icsk_mtup.probe_size = tcp_mss_to_mtu(sk, nskb->len);
tp->mtu_probe.probe_seq_start = TCP_SKB_CB(nskb)->seq;
@@ -1408,7 +1433,6 @@ static int tcp_mtu_probe(struct sock *sk)
return -1;
}
-
/* This routine writes packets to the network. It advances the
* send_head. This happens as incoming acks open up the remote
* window for us.
@@ -1464,17 +1488,9 @@ static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle)
}
limit = mss_now;
- if (tso_segs > 1) {
- limit = tcp_window_allows(tp, skb,
- mss_now, cwnd_quota);
-
- if (skb->len < limit) {
- unsigned int trim = skb->len % mss_now;
-
- if (trim)
- limit = skb->len - trim;
- }
- }
+ if (tso_segs > 1)
+ limit = tcp_mss_split_point(sk, skb, mss_now,
+ cwnd_quota);
if (skb->len > limit &&
unlikely(tso_fragment(sk, skb, limit, mss_now)))
@@ -1488,7 +1504,7 @@ static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle)
/* Advance the send_head. This one is sent out.
* This call will increment packets_out.
*/
- update_send_head(sk, skb);
+ tcp_event_new_data_sent(sk, skb);
tcp_minshall_update(tp, mss_now, skb);
sent_pkts++;
@@ -1521,7 +1537,6 @@ void __tcp_push_pending_frames(struct sock *sk, unsigned int cur_mss,
*/
void tcp_push_one(struct sock *sk, unsigned int mss_now)
{
- struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *skb = tcp_send_head(sk);
unsigned int tso_segs, cwnd_quota;
@@ -1536,17 +1551,9 @@ void tcp_push_one(struct sock *sk, unsigned int mss_now)
BUG_ON(!tso_segs);
limit = mss_now;
- if (tso_segs > 1) {
- limit = tcp_window_allows(tp, skb,
- mss_now, cwnd_quota);
-
- if (skb->len < limit) {
- unsigned int trim = skb->len % mss_now;
-
- if (trim)
- limit = skb->len - trim;
- }
- }
+ if (tso_segs > 1)
+ limit = tcp_mss_split_point(sk, skb, mss_now,
+ cwnd_quota);
if (skb->len > limit &&
unlikely(tso_fragment(sk, skb, limit, mss_now)))
@@ -1556,7 +1563,7 @@ void tcp_push_one(struct sock *sk, unsigned int mss_now)
TCP_SKB_CB(skb)->when = tcp_time_stamp;
if (likely(!tcp_transmit_skb(sk, skb, 1, sk->sk_allocation))) {
- update_send_head(sk, skb);
+ tcp_event_new_data_sent(sk, skb);
tcp_cwnd_validate(sk);
return;
}
@@ -1633,11 +1640,12 @@ u32 __tcp_select_window(struct sock *sk)
if (mss > full_space)
mss = full_space;
- if (free_space < full_space/2) {
+ if (free_space < (full_space >> 1)) {
icsk->icsk_ack.quick = 0;
if (tcp_memory_pressure)
- tp->rcv_ssthresh = min(tp->rcv_ssthresh, 4U*tp->advmss);
+ tp->rcv_ssthresh = min(tp->rcv_ssthresh,
+ 4U * tp->advmss);
if (free_space < mss)
return 0;
@@ -1670,9 +1678,9 @@ u32 __tcp_select_window(struct sock *sk)
* is too small.
*/
if (window <= free_space - mss || window > free_space)
- window = (free_space/mss)*mss;
+ window = (free_space / mss) * mss;
else if (mss == full_space &&
- free_space > window + full_space/2)
+ free_space > window + (full_space >> 1))
window = free_space;
}
@@ -1680,86 +1688,82 @@ u32 __tcp_select_window(struct sock *sk)
}
/* Attempt to collapse two adjacent SKB's during retransmission. */
-static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *skb, int mss_now)
+static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *skb,
+ int mss_now)
{
struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *next_skb = tcp_write_queue_next(sk, skb);
+ int skb_size, next_skb_size;
+ u16 flags;
/* The first test we must make is that neither of these two
* SKB's are still referenced by someone else.
*/
- if (!skb_cloned(skb) && !skb_cloned(next_skb)) {
- int skb_size = skb->len, next_skb_size = next_skb->len;
- u16 flags = TCP_SKB_CB(skb)->flags;
+ if (skb_cloned(skb) || skb_cloned(next_skb))
+ return;
- /* Also punt if next skb has been SACK'd. */
- if (TCP_SKB_CB(next_skb)->sacked & TCPCB_SACKED_ACKED)
- return;
+ skb_size = skb->len;
+ next_skb_size = next_skb->len;
+ flags = TCP_SKB_CB(skb)->flags;
- /* Next skb is out of window. */
- if (after(TCP_SKB_CB(next_skb)->end_seq, tp->snd_una+tp->snd_wnd))
- return;
+ /* Also punt if next skb has been SACK'd. */
+ if (TCP_SKB_CB(next_skb)->sacked & TCPCB_SACKED_ACKED)
+ return;
- /* Punt if not enough space exists in the first SKB for
- * the data in the second, or the total combined payload
- * would exceed the MSS.
- */
- if ((next_skb_size > skb_tailroom(skb)) ||
- ((skb_size + next_skb_size) > mss_now))
- return;
+ /* Next skb is out of window. */
+ if (after(TCP_SKB_CB(next_skb)->end_seq, tcp_wnd_end(tp)))
+ return;
- BUG_ON(tcp_skb_pcount(skb) != 1 ||
- tcp_skb_pcount(next_skb) != 1);
+ /* Punt if not enough space exists in the first SKB for
+ * the data in the second, or the total combined payload
+ * would exceed the MSS.
+ */
+ if ((next_skb_size > skb_tailroom(skb)) ||
+ ((skb_size + next_skb_size) > mss_now))
+ return;
- if (WARN_ON(tcp_is_sack(tp) && tp->sacked_out &&
- (TCP_SKB_CB(next_skb)->seq == tp->highest_sack)))
- return;
+ BUG_ON(tcp_skb_pcount(skb) != 1 || tcp_skb_pcount(next_skb) != 1);
- /* Ok. We will be able to collapse the packet. */
- tcp_unlink_write_queue(next_skb, sk);
+ tcp_highest_sack_combine(sk, next_skb, skb);
- skb_copy_from_linear_data(next_skb,
- skb_put(skb, next_skb_size),
- next_skb_size);
+ /* Ok. We will be able to collapse the packet. */
+ tcp_unlink_write_queue(next_skb, sk);
- if (next_skb->ip_summed == CHECKSUM_PARTIAL)
- skb->ip_summed = CHECKSUM_PARTIAL;
+ skb_copy_from_linear_data(next_skb, skb_put(skb, next_skb_size),
+ next_skb_size);
- if (skb->ip_summed != CHECKSUM_PARTIAL)
- skb->csum = csum_block_add(skb->csum, next_skb->csum, skb_size);
+ if (next_skb->ip_summed == CHECKSUM_PARTIAL)
+ skb->ip_summed = CHECKSUM_PARTIAL;
- /* Update sequence range on original skb. */
- TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(next_skb)->end_seq;
+ if (skb->ip_summed != CHECKSUM_PARTIAL)
+ skb->csum = csum_block_add(skb->csum, next_skb->csum, skb_size);
- /* Merge over control information. */
- flags |= TCP_SKB_CB(next_skb)->flags; /* This moves PSH/FIN etc. over */
- TCP_SKB_CB(skb)->flags = flags;
+ /* Update sequence range on original skb. */
+ TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(next_skb)->end_seq;
- /* All done, get rid of second SKB and account for it so
- * packet counting does not break.
- */
- TCP_SKB_CB(skb)->sacked |= TCP_SKB_CB(next_skb)->sacked&(TCPCB_EVER_RETRANS|TCPCB_AT_TAIL);
- if (TCP_SKB_CB(next_skb)->sacked&TCPCB_SACKED_RETRANS)
- tp->retrans_out -= tcp_skb_pcount(next_skb);
- if (TCP_SKB_CB(next_skb)->sacked&TCPCB_LOST)
- tp->lost_out -= tcp_skb_pcount(next_skb);
- /* Reno case is special. Sigh... */
- if (tcp_is_reno(tp) && tp->sacked_out)
- tcp_dec_pcount_approx(&tp->sacked_out, next_skb);
-
- tcp_adjust_fackets_out(tp, next_skb, tcp_skb_pcount(next_skb));
- tp->packets_out -= tcp_skb_pcount(next_skb);
-
- /* changed transmit queue under us so clear hints */
- tcp_clear_retrans_hints_partial(tp);
- /* manually tune sacktag skb hint */
- if (tp->fastpath_skb_hint == next_skb) {
- tp->fastpath_skb_hint = skb;
- tp->fastpath_cnt_hint -= tcp_skb_pcount(skb);
- }
+ /* Merge over control information. */
+ flags |= TCP_SKB_CB(next_skb)->flags; /* This moves PSH/FIN etc. over */
+ TCP_SKB_CB(skb)->flags = flags;
- sk_stream_free_skb(sk, next_skb);
- }
+ /* All done, get rid of second SKB and account for it so
+ * packet counting does not break.
+ */
+ TCP_SKB_CB(skb)->sacked |= TCP_SKB_CB(next_skb)->sacked & TCPCB_EVER_RETRANS;
+ if (TCP_SKB_CB(next_skb)->sacked & TCPCB_SACKED_RETRANS)
+ tp->retrans_out -= tcp_skb_pcount(next_skb);
+ if (TCP_SKB_CB(next_skb)->sacked & TCPCB_LOST)
+ tp->lost_out -= tcp_skb_pcount(next_skb);
+ /* Reno case is special. Sigh... */
+ if (tcp_is_reno(tp) && tp->sacked_out)
+ tcp_dec_pcount_approx(&tp->sacked_out, next_skb);
+
+ tcp_adjust_fackets_out(sk, next_skb, tcp_skb_pcount(next_skb));
+ tp->packets_out -= tcp_skb_pcount(next_skb);
+
+ /* changed transmit queue under us so clear hints */
+ tcp_clear_retrans_hints_partial(tp);
+
+ sk_wmem_free_skb(sk, next_skb);
}
/* Do a simple retransmit without using the backoff mechanisms in
@@ -1778,12 +1782,12 @@ void tcp_simple_retransmit(struct sock *sk)
if (skb == tcp_send_head(sk))
break;
if (skb->len > mss &&
- !(TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_ACKED)) {
- if (TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_RETRANS) {
+ !(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) {
+ if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) {
TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS;
tp->retrans_out -= tcp_skb_pcount(skb);
}
- if (!(TCP_SKB_CB(skb)->sacked&TCPCB_LOST)) {
+ if (!(TCP_SKB_CB(skb)->sacked & TCPCB_LOST)) {
TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
tp->lost_out += tcp_skb_pcount(skb);
lost = 1;
@@ -1848,7 +1852,7 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
* case, when window is shrunk to zero. In this case
* our retransmit serves as a zero window probe.
*/
- if (!before(TCP_SKB_CB(skb)->seq, tp->snd_una+tp->snd_wnd)
+ if (!before(TCP_SKB_CB(skb)->seq, tcp_wnd_end(tp))
&& TCP_SKB_CB(skb)->seq != tp->snd_una)
return -EAGAIN;
@@ -1862,8 +1866,10 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
(skb->len < (cur_mss >> 1)) &&
(tcp_write_queue_next(sk, skb) != tcp_send_head(sk)) &&
(!tcp_skb_is_last(sk, skb)) &&
- (skb_shinfo(skb)->nr_frags == 0 && skb_shinfo(tcp_write_queue_next(sk, skb))->nr_frags == 0) &&
- (tcp_skb_pcount(skb) == 1 && tcp_skb_pcount(tcp_write_queue_next(sk, skb)) == 1) &&
+ (skb_shinfo(skb)->nr_frags == 0 &&
+ skb_shinfo(tcp_write_queue_next(sk, skb))->nr_frags == 0) &&
+ (tcp_skb_pcount(skb) == 1 &&
+ tcp_skb_pcount(tcp_write_queue_next(sk, skb)) == 1) &&
(sysctl_tcp_retrans_collapse != 0))
tcp_retrans_try_collapse(sk, skb, cur_mss);
@@ -1878,12 +1884,10 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
(TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN) &&
tp->snd_una == (TCP_SKB_CB(skb)->end_seq - 1)) {
if (!pskb_trim(skb, 0)) {
- TCP_SKB_CB(skb)->seq = TCP_SKB_CB(skb)->end_seq - 1;
- skb_shinfo(skb)->gso_segs = 1;
- skb_shinfo(skb)->gso_size = 0;
- skb_shinfo(skb)->gso_type = 0;
+ /* Reuse, even though it does some unnecessary work */
+ tcp_init_nondata_skb(skb, TCP_SKB_CB(skb)->end_seq - 1,
+ TCP_SKB_CB(skb)->flags);
skb->ip_summed = CHECKSUM_NONE;
- skb->csum = 0;
}
}
@@ -1901,7 +1905,7 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
tp->total_retrans++;
#if FASTRETRANS_DEBUG > 0
- if (TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_RETRANS) {
+ if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) {
if (net_ratelimit())
printk(KERN_DEBUG "retrans_out leaked.\n");
}
@@ -1943,7 +1947,7 @@ void tcp_xmit_retransmit_queue(struct sock *sk)
if (tp->retransmit_skb_hint) {
skb = tp->retransmit_skb_hint;
packet_cnt = tp->retransmit_cnt_hint;
- }else{
+ } else {
skb = tcp_write_queue_head(sk);
packet_cnt = 0;
}
@@ -1970,7 +1974,7 @@ void tcp_xmit_retransmit_queue(struct sock *sk)
return;
if (sacked & TCPCB_LOST) {
- if (!(sacked&(TCPCB_SACKED_ACKED|TCPCB_SACKED_RETRANS))) {
+ if (!(sacked & (TCPCB_SACKED_ACKED|TCPCB_SACKED_RETRANS))) {
if (tcp_retransmit_skb(sk, skb)) {
tp->retransmit_skb_hint = NULL;
return;
@@ -2028,7 +2032,7 @@ void tcp_xmit_retransmit_queue(struct sock *sk)
break;
tp->forward_skb_hint = skb;
- if (after(TCP_SKB_CB(skb)->seq, tp->highest_sack))
+ if (!before(TCP_SKB_CB(skb)->seq, tcp_highest_sack_seq(tp)))
break;
if (tcp_packets_in_flight(tp) >= tp->snd_cwnd)
@@ -2052,7 +2056,6 @@ void tcp_xmit_retransmit_queue(struct sock *sk)
}
}
-
/* Send a fin. The caller locks the socket for us. This cannot be
* allowed to fail queueing a FIN frame under any circumstances.
*/
@@ -2083,16 +2086,9 @@ void tcp_send_fin(struct sock *sk)
/* Reserve space for headers and prepare control bits. */
skb_reserve(skb, MAX_TCP_HEADER);
- skb->csum = 0;
- TCP_SKB_CB(skb)->flags = (TCPCB_FLAG_ACK | TCPCB_FLAG_FIN);
- TCP_SKB_CB(skb)->sacked = 0;
- skb_shinfo(skb)->gso_segs = 1;
- skb_shinfo(skb)->gso_size = 0;
- skb_shinfo(skb)->gso_type = 0;
-
/* FIN eats a sequence byte, write_seq advanced by tcp_queue_skb(). */
- TCP_SKB_CB(skb)->seq = tp->write_seq;
- TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq + 1;
+ tcp_init_nondata_skb(skb, tp->write_seq,
+ TCPCB_FLAG_ACK | TCPCB_FLAG_FIN);
tcp_queue_skb(sk, skb);
}
__tcp_push_pending_frames(sk, mss_now, TCP_NAGLE_OFF);
@@ -2116,16 +2112,9 @@ void tcp_send_active_reset(struct sock *sk, gfp_t priority)
/* Reserve space for headers and prepare control bits. */
skb_reserve(skb, MAX_TCP_HEADER);
- skb->csum = 0;
- TCP_SKB_CB(skb)->flags = (TCPCB_FLAG_ACK | TCPCB_FLAG_RST);
- TCP_SKB_CB(skb)->sacked = 0;
- skb_shinfo(skb)->gso_segs = 1;
- skb_shinfo(skb)->gso_size = 0;
- skb_shinfo(skb)->gso_type = 0;
-
+ tcp_init_nondata_skb(skb, tcp_acceptable_seq(sk),
+ TCPCB_FLAG_ACK | TCPCB_FLAG_RST);
/* Send it off. */
- TCP_SKB_CB(skb)->seq = tcp_acceptable_seq(sk);
- TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq;
TCP_SKB_CB(skb)->when = tcp_time_stamp;
if (tcp_transmit_skb(sk, skb, 0, priority))
NET_INC_STATS(LINUX_MIB_TCPABORTFAILED);
@@ -2138,14 +2127,14 @@ void tcp_send_active_reset(struct sock *sk, gfp_t priority)
*/
int tcp_send_synack(struct sock *sk)
{
- struct sk_buff* skb;
+ struct sk_buff *skb;
skb = tcp_write_queue_head(sk);
- if (skb == NULL || !(TCP_SKB_CB(skb)->flags&TCPCB_FLAG_SYN)) {
+ if (skb == NULL || !(TCP_SKB_CB(skb)->flags & TCPCB_FLAG_SYN)) {
printk(KERN_DEBUG "tcp_send_synack: wrong queue state\n");
return -EFAULT;
}
- if (!(TCP_SKB_CB(skb)->flags&TCPCB_FLAG_ACK)) {
+ if (!(TCP_SKB_CB(skb)->flags & TCPCB_FLAG_ACK)) {
if (skb_cloned(skb)) {
struct sk_buff *nskb = skb_copy(skb, GFP_ATOMIC);
if (nskb == NULL)
@@ -2153,8 +2142,9 @@ int tcp_send_synack(struct sock *sk)
tcp_unlink_write_queue(skb, sk);
skb_header_release(nskb);
__tcp_add_write_queue_head(sk, nskb);
- sk_stream_free_skb(sk, skb);
- sk_charge_skb(sk, nskb);
+ sk_wmem_free_skb(sk, skb);
+ sk->sk_wmem_queued += nskb->truesize;
+ sk_mem_charge(sk, nskb->truesize);
skb = nskb;
}
@@ -2168,8 +2158,8 @@ int tcp_send_synack(struct sock *sk)
/*
* Prepare a SYN-ACK.
*/
-struct sk_buff * tcp_make_synack(struct sock *sk, struct dst_entry *dst,
- struct request_sock *req)
+struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
+ struct request_sock *req)
{
struct inet_request_sock *ireq = inet_rsk(req);
struct tcp_sock *tp = tcp_sk(sk);
@@ -2212,12 +2202,11 @@ struct sk_buff * tcp_make_synack(struct sock *sk, struct dst_entry *dst,
TCP_ECN_make_synack(req, th);
th->source = inet_sk(sk)->sport;
th->dest = ireq->rmt_port;
- TCP_SKB_CB(skb)->seq = tcp_rsk(req)->snt_isn;
- TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq + 1;
- TCP_SKB_CB(skb)->sacked = 0;
- skb_shinfo(skb)->gso_segs = 1;
- skb_shinfo(skb)->gso_size = 0;
- skb_shinfo(skb)->gso_type = 0;
+ /* Setting of flags are superfluous here for callers (and ECE is
+ * not even correctly set)
+ */
+ tcp_init_nondata_skb(skb, tcp_rsk(req)->snt_isn,
+ TCPCB_FLAG_SYN | TCPCB_FLAG_ACK);
th->seq = htonl(TCP_SKB_CB(skb)->seq);
th->ack_seq = htonl(tcp_rsk(req)->rcv_isn + 1);
if (req->rcv_wnd == 0) { /* ignored for retransmitted syns */
@@ -2249,7 +2238,6 @@ struct sk_buff * tcp_make_synack(struct sock *sk, struct dst_entry *dst,
NULL)
);
- skb->csum = 0;
th->doff = (tcp_header_size >> 2);
TCP_INC_STATS(TCP_MIB_OUTSEGS);
@@ -2341,23 +2329,17 @@ int tcp_connect(struct sock *sk)
/* Reserve space for headers. */
skb_reserve(buff, MAX_TCP_HEADER);
- TCP_SKB_CB(buff)->flags = TCPCB_FLAG_SYN;
- TCP_ECN_send_syn(sk, buff);
- TCP_SKB_CB(buff)->sacked = 0;
- skb_shinfo(buff)->gso_segs = 1;
- skb_shinfo(buff)->gso_size = 0;
- skb_shinfo(buff)->gso_type = 0;
- buff->csum = 0;
tp->snd_nxt = tp->write_seq;
- TCP_SKB_CB(buff)->seq = tp->write_seq++;
- TCP_SKB_CB(buff)->end_seq = tp->write_seq;
+ tcp_init_nondata_skb(buff, tp->write_seq++, TCPCB_FLAG_SYN);
+ TCP_ECN_send_syn(sk, buff);
/* Send it off. */
TCP_SKB_CB(buff)->when = tcp_time_stamp;
tp->retrans_stamp = TCP_SKB_CB(buff)->when;
skb_header_release(buff);
__tcp_add_write_queue_tail(sk, buff);
- sk_charge_skb(sk, buff);
+ sk->sk_wmem_queued += buff->truesize;
+ sk_mem_charge(sk, buff->truesize);
tp->packets_out += tcp_skb_pcount(buff);
tcp_transmit_skb(sk, buff, 1, GFP_KERNEL);
@@ -2386,9 +2368,10 @@ void tcp_send_delayed_ack(struct sock *sk)
if (ato > TCP_DELACK_MIN) {
const struct tcp_sock *tp = tcp_sk(sk);
- int max_ato = HZ/2;
+ int max_ato = HZ / 2;
- if (icsk->icsk_ack.pingpong || (icsk->icsk_ack.pending & ICSK_ACK_PUSHED))
+ if (icsk->icsk_ack.pingpong ||
+ (icsk->icsk_ack.pending & ICSK_ACK_PUSHED))
max_ato = TCP_DELACK_MAX;
/* Slow path, intersegment interval is "high". */
@@ -2398,7 +2381,7 @@ void tcp_send_delayed_ack(struct sock *sk)
* directly.
*/
if (tp->srtt) {
- int rtt = max(tp->srtt>>3, TCP_DELACK_MIN);
+ int rtt = max(tp->srtt >> 3, TCP_DELACK_MIN);
if (rtt < max_ato)
max_ato = rtt;
@@ -2432,37 +2415,32 @@ void tcp_send_delayed_ack(struct sock *sk)
/* This routine sends an ack and also updates the window. */
void tcp_send_ack(struct sock *sk)
{
- /* If we have been reset, we may not send again. */
- if (sk->sk_state != TCP_CLOSE) {
- struct sk_buff *buff;
+ struct sk_buff *buff;
- /* We are not putting this on the write queue, so
- * tcp_transmit_skb() will set the ownership to this
- * sock.
- */
- buff = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC);
- if (buff == NULL) {
- inet_csk_schedule_ack(sk);
- inet_csk(sk)->icsk_ack.ato = TCP_ATO_MIN;
- inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
- TCP_DELACK_MAX, TCP_RTO_MAX);
- return;
- }
+ /* If we have been reset, we may not send again. */
+ if (sk->sk_state == TCP_CLOSE)
+ return;
- /* Reserve space for headers and prepare control bits. */
- skb_reserve(buff, MAX_TCP_HEADER);
- buff->csum = 0;
- TCP_SKB_CB(buff)->flags = TCPCB_FLAG_ACK;
- TCP_SKB_CB(buff)->sacked = 0;
- skb_shinfo(buff)->gso_segs = 1;
- skb_shinfo(buff)->gso_size = 0;
- skb_shinfo(buff)->gso_type = 0;
-
- /* Send it off, this clears delayed acks for us. */
- TCP_SKB_CB(buff)->seq = TCP_SKB_CB(buff)->end_seq = tcp_acceptable_seq(sk);
- TCP_SKB_CB(buff)->when = tcp_time_stamp;
- tcp_transmit_skb(sk, buff, 0, GFP_ATOMIC);
+ /* We are not putting this on the write queue, so
+ * tcp_transmit_skb() will set the ownership to this
+ * sock.
+ */
+ buff = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC);
+ if (buff == NULL) {
+ inet_csk_schedule_ack(sk);
+ inet_csk(sk)->icsk_ack.ato = TCP_ATO_MIN;
+ inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
+ TCP_DELACK_MAX, TCP_RTO_MAX);
+ return;
}
+
+ /* Reserve space for headers and prepare control bits. */
+ skb_reserve(buff, MAX_TCP_HEADER);
+ tcp_init_nondata_skb(buff, tcp_acceptable_seq(sk), TCPCB_FLAG_ACK);
+
+ /* Send it off, this clears delayed acks for us. */
+ TCP_SKB_CB(buff)->when = tcp_time_stamp;
+ tcp_transmit_skb(sk, buff, 0, GFP_ATOMIC);
}
/* This routine sends a packet with an out of date sequence
@@ -2488,66 +2466,57 @@ static int tcp_xmit_probe_skb(struct sock *sk, int urgent)
/* Reserve space for headers and set control bits. */
skb_reserve(skb, MAX_TCP_HEADER);
- skb->csum = 0;
- TCP_SKB_CB(skb)->flags = TCPCB_FLAG_ACK;
- TCP_SKB_CB(skb)->sacked = urgent;
- skb_shinfo(skb)->gso_segs = 1;
- skb_shinfo(skb)->gso_size = 0;
- skb_shinfo(skb)->gso_type = 0;
-
/* Use a previous sequence. This should cause the other
* end to send an ack. Don't queue or clone SKB, just
* send it.
*/
- TCP_SKB_CB(skb)->seq = urgent ? tp->snd_una : tp->snd_una - 1;
- TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq;
+ tcp_init_nondata_skb(skb, tp->snd_una - !urgent, TCPCB_FLAG_ACK);
TCP_SKB_CB(skb)->when = tcp_time_stamp;
return tcp_transmit_skb(sk, skb, 0, GFP_ATOMIC);
}
int tcp_write_wakeup(struct sock *sk)
{
- if (sk->sk_state != TCP_CLOSE) {
- struct tcp_sock *tp = tcp_sk(sk);
- struct sk_buff *skb;
-
- if ((skb = tcp_send_head(sk)) != NULL &&
- before(TCP_SKB_CB(skb)->seq, tp->snd_una+tp->snd_wnd)) {
- int err;
- unsigned int mss = tcp_current_mss(sk, 0);
- unsigned int seg_size = tp->snd_una+tp->snd_wnd-TCP_SKB_CB(skb)->seq;
-
- if (before(tp->pushed_seq, TCP_SKB_CB(skb)->end_seq))
- tp->pushed_seq = TCP_SKB_CB(skb)->end_seq;
-
- /* We are probing the opening of a window
- * but the window size is != 0
- * must have been a result SWS avoidance ( sender )
- */
- if (seg_size < TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq ||
- skb->len > mss) {
- seg_size = min(seg_size, mss);
- TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH;
- if (tcp_fragment(sk, skb, seg_size, mss))
- return -1;
- } else if (!tcp_skb_pcount(skb))
- tcp_set_skb_tso_segs(sk, skb, mss);
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct sk_buff *skb;
+ if (sk->sk_state == TCP_CLOSE)
+ return -1;
+
+ if ((skb = tcp_send_head(sk)) != NULL &&
+ before(TCP_SKB_CB(skb)->seq, tcp_wnd_end(tp))) {
+ int err;
+ unsigned int mss = tcp_current_mss(sk, 0);
+ unsigned int seg_size = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq;
+
+ if (before(tp->pushed_seq, TCP_SKB_CB(skb)->end_seq))
+ tp->pushed_seq = TCP_SKB_CB(skb)->end_seq;
+
+ /* We are probing the opening of a window
+ * but the window size is != 0
+ * must have been a result SWS avoidance ( sender )
+ */
+ if (seg_size < TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq ||
+ skb->len > mss) {
+ seg_size = min(seg_size, mss);
TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH;
- TCP_SKB_CB(skb)->when = tcp_time_stamp;
- err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC);
- if (!err) {
- update_send_head(sk, skb);
- }
- return err;
- } else {
- if (tp->urg_mode &&
- between(tp->snd_up, tp->snd_una+1, tp->snd_una+0xFFFF))
- tcp_xmit_probe_skb(sk, TCPCB_URG);
- return tcp_xmit_probe_skb(sk, 0);
- }
+ if (tcp_fragment(sk, skb, seg_size, mss))
+ return -1;
+ } else if (!tcp_skb_pcount(skb))
+ tcp_set_skb_tso_segs(sk, skb, mss);
+
+ TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH;
+ TCP_SKB_CB(skb)->when = tcp_time_stamp;
+ err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC);
+ if (!err)
+ tcp_event_new_data_sent(sk, skb);
+ return err;
+ } else {
+ if (tp->urg_mode &&
+ between(tp->snd_up, tp->snd_una + 1, tp->snd_una + 0xFFFF))
+ tcp_xmit_probe_skb(sk, 1);
+ return tcp_xmit_probe_skb(sk, 0);
}
- return -1;
}
/* A window probe timeout has occurred. If window is not closed send
diff --git a/net/ipv4/tcp_scalable.c b/net/ipv4/tcp_scalable.c
index be27a33a1c68..2747ec7bfb63 100644
--- a/net/ipv4/tcp_scalable.c
+++ b/net/ipv4/tcp_scalable.c
@@ -15,8 +15,7 @@
#define TCP_SCALABLE_AI_CNT 50U
#define TCP_SCALABLE_MD_SCALE 3
-static void tcp_scalable_cong_avoid(struct sock *sk, u32 ack,
- u32 in_flight, int flag)
+static void tcp_scalable_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
{
struct tcp_sock *tp = tcp_sk(sk);
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index d8970ecfcfc8..803d758a2b12 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -114,13 +114,31 @@ static int tcp_orphan_retries(struct sock *sk, int alive)
return retries;
}
+static void tcp_mtu_probing(struct inet_connection_sock *icsk, struct sock *sk)
+{
+ /* Black hole detection */
+ if (sysctl_tcp_mtu_probing) {
+ if (!icsk->icsk_mtup.enabled) {
+ icsk->icsk_mtup.enabled = 1;
+ tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
+ } else {
+ struct tcp_sock *tp = tcp_sk(sk);
+ int mss;
+
+ mss = tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_low) >> 1;
+ mss = min(sysctl_tcp_base_mss, mss);
+ mss = max(mss, 68 - tp->tcp_header_len);
+ icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, mss);
+ tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
+ }
+ }
+}
+
/* A write timeout has occurred. Process the after effects. */
static int tcp_write_timeout(struct sock *sk)
{
struct inet_connection_sock *icsk = inet_csk(sk);
- struct tcp_sock *tp = tcp_sk(sk);
int retry_until;
- int mss;
if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) {
if (icsk->icsk_retransmits)
@@ -129,18 +147,7 @@ static int tcp_write_timeout(struct sock *sk)
} else {
if (icsk->icsk_retransmits >= sysctl_tcp_retries1) {
/* Black hole detection */
- if (sysctl_tcp_mtu_probing) {
- if (!icsk->icsk_mtup.enabled) {
- icsk->icsk_mtup.enabled = 1;
- tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
- } else {
- mss = min(sysctl_tcp_base_mss,
- tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_low)/2);
- mss = max(mss, 68 - tp->tcp_header_len);
- icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, mss);
- tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
- }
- }
+ tcp_mtu_probing(icsk, sk);
dst_negative_advice(&sk->sk_dst_cache);
}
@@ -179,7 +186,7 @@ static void tcp_delack_timer(unsigned long data)
goto out_unlock;
}
- sk_stream_mem_reclaim(sk);
+ sk_mem_reclaim_partial(sk);
if (sk->sk_state == TCP_CLOSE || !(icsk->icsk_ack.pending & ICSK_ACK_TIMER))
goto out;
@@ -219,7 +226,7 @@ static void tcp_delack_timer(unsigned long data)
out:
if (tcp_memory_pressure)
- sk_stream_mem_reclaim(sk);
+ sk_mem_reclaim(sk);
out_unlock:
bh_unlock_sock(sk);
sock_put(sk);
@@ -413,7 +420,7 @@ static void tcp_write_timer(unsigned long data)
TCP_CHECK_TIMER(sk);
out:
- sk_stream_mem_reclaim(sk);
+ sk_mem_reclaim(sk);
out_unlock:
bh_unlock_sock(sk);
sock_put(sk);
@@ -507,7 +514,7 @@ static void tcp_keepalive_timer (unsigned long data)
}
TCP_CHECK_TIMER(sk);
- sk_stream_mem_reclaim(sk);
+ sk_mem_reclaim(sk);
resched:
inet_csk_reset_keepalive_timer (sk, elapsed);
diff --git a/net/ipv4/tcp_vegas.c b/net/ipv4/tcp_vegas.c
index 007304e99842..be24d6ee34bd 100644
--- a/net/ipv4/tcp_vegas.c
+++ b/net/ipv4/tcp_vegas.c
@@ -162,14 +162,13 @@ void tcp_vegas_cwnd_event(struct sock *sk, enum tcp_ca_event event)
}
EXPORT_SYMBOL_GPL(tcp_vegas_cwnd_event);
-static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack,
- u32 in_flight, int flag)
+static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
{
struct tcp_sock *tp = tcp_sk(sk);
struct vegas *vegas = inet_csk_ca(sk);
if (!vegas->doing_vegas_now)
- return tcp_reno_cong_avoid(sk, ack, in_flight, flag);
+ return tcp_reno_cong_avoid(sk, ack, in_flight);
/* The key players are v_beg_snd_una and v_beg_snd_nxt.
*
@@ -228,7 +227,7 @@ static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack,
/* We don't have enough RTT samples to do the Vegas
* calculation, so we'll behave like Reno.
*/
- tcp_reno_cong_avoid(sk, ack, in_flight, flag);
+ tcp_reno_cong_avoid(sk, ack, in_flight);
} else {
u32 rtt, target_cwnd, diff;
diff --git a/net/ipv4/tcp_veno.c b/net/ipv4/tcp_veno.c
index 8fb2aee0b1a4..d16689e98516 100644
--- a/net/ipv4/tcp_veno.c
+++ b/net/ipv4/tcp_veno.c
@@ -114,14 +114,13 @@ static void tcp_veno_cwnd_event(struct sock *sk, enum tcp_ca_event event)
tcp_veno_init(sk);
}
-static void tcp_veno_cong_avoid(struct sock *sk, u32 ack,
- u32 in_flight, int flag)
+static void tcp_veno_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
{
struct tcp_sock *tp = tcp_sk(sk);
struct veno *veno = inet_csk_ca(sk);
if (!veno->doing_veno_now)
- return tcp_reno_cong_avoid(sk, ack, in_flight, flag);
+ return tcp_reno_cong_avoid(sk, ack, in_flight);
/* limited by applications */
if (!tcp_is_cwnd_limited(sk, in_flight))
@@ -132,7 +131,7 @@ static void tcp_veno_cong_avoid(struct sock *sk, u32 ack,
/* We don't have enough rtt samples to do the Veno
* calculation, so we'll behave like Reno.
*/
- tcp_reno_cong_avoid(sk, ack, in_flight, flag);
+ tcp_reno_cong_avoid(sk, ack, in_flight);
} else {
u32 rtt, target_cwnd;
diff --git a/net/ipv4/tcp_yeah.c b/net/ipv4/tcp_yeah.c
index c107fba7430e..e03b10183a8b 100644
--- a/net/ipv4/tcp_yeah.c
+++ b/net/ipv4/tcp_yeah.c
@@ -69,8 +69,7 @@ static void tcp_yeah_pkts_acked(struct sock *sk, u32 pkts_acked, s32 rtt_us)
tcp_vegas_pkts_acked(sk, pkts_acked, rtt_us);
}
-static void tcp_yeah_cong_avoid(struct sock *sk, u32 ack,
- u32 in_flight, int flag)
+static void tcp_yeah_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
{
struct tcp_sock *tp = tcp_sk(sk);
struct yeah *yeah = inet_csk_ca(sk);
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 03c400ca14c5..2fb8d731026b 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -82,6 +82,7 @@
#include <asm/system.h>
#include <asm/uaccess.h>
#include <asm/ioctls.h>
+#include <linux/bootmem.h>
#include <linux/types.h>
#include <linux/fcntl.h>
#include <linux/module.h>
@@ -110,10 +111,25 @@
*/
DEFINE_SNMP_STAT(struct udp_mib, udp_statistics) __read_mostly;
+EXPORT_SYMBOL(udp_statistics);
+
+DEFINE_SNMP_STAT(struct udp_mib, udp_stats_in6) __read_mostly;
+EXPORT_SYMBOL(udp_stats_in6);
struct hlist_head udp_hash[UDP_HTABLE_SIZE];
DEFINE_RWLOCK(udp_hash_lock);
+int sysctl_udp_mem[3] __read_mostly;
+int sysctl_udp_rmem_min __read_mostly;
+int sysctl_udp_wmem_min __read_mostly;
+
+EXPORT_SYMBOL(sysctl_udp_mem);
+EXPORT_SYMBOL(sysctl_udp_rmem_min);
+EXPORT_SYMBOL(sysctl_udp_wmem_min);
+
+atomic_t udp_memory_allocated;
+EXPORT_SYMBOL(udp_memory_allocated);
+
static inline int __udp_lib_lport_inuse(__u16 num,
const struct hlist_head udptable[])
{
@@ -214,7 +230,7 @@ gotit:
if (sk_unhashed(sk)) {
head = &udptable[snum & (UDP_HTABLE_SIZE - 1)];
sk_add_node(sk, head);
- sock_prot_inc_use(sk->sk_prot);
+ sock_prot_inuse_add(sk->sk_prot, 1);
}
error = 0;
fail:
@@ -402,7 +418,7 @@ out:
void udp_err(struct sk_buff *skb, u32 info)
{
- return __udp4_lib_err(skb, info, udp_hash);
+ __udp4_lib_err(skb, info, udp_hash);
}
/*
@@ -471,6 +487,7 @@ static int udp_push_pending_frames(struct sock *sk)
struct sk_buff *skb;
struct udphdr *uh;
int err = 0;
+ int is_udplite = IS_UDPLITE(sk);
__wsum csum = 0;
/* Grab the skbuff where UDP header space exists. */
@@ -486,7 +503,7 @@ static int udp_push_pending_frames(struct sock *sk)
uh->len = htons(up->len);
uh->check = 0;
- if (up->pcflag) /* UDP-Lite */
+ if (is_udplite) /* UDP-Lite */
csum = udplite_csum_outgoing(sk, skb);
else if (sk->sk_no_check == UDP_CSUM_NOXMIT) { /* UDP csum disabled */
@@ -514,7 +531,7 @@ out:
up->len = 0;
up->pending = 0;
if (!err)
- UDP_INC_STATS_USER(UDP_MIB_OUTDATAGRAMS, up->pcflag);
+ UDP_INC_STATS_USER(UDP_MIB_OUTDATAGRAMS, is_udplite);
return err;
}
@@ -531,7 +548,7 @@ int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
__be32 daddr, faddr, saddr;
__be16 dport;
u8 tos;
- int err, is_udplite = up->pcflag;
+ int err, is_udplite = IS_UDPLITE(sk);
int corkreq = up->corkflag || msg->msg_flags&MSG_MORE;
int (*getfrag)(void *, char *, int, int, int, struct sk_buff *);
@@ -621,7 +638,7 @@ int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
connected = 0;
}
- if (MULTICAST(daddr)) {
+ if (ipv4_is_multicast(daddr)) {
if (!ipc.oif)
ipc.oif = inet->mc_index;
if (!saddr)
@@ -643,7 +660,7 @@ int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
{ .sport = inet->sport,
.dport = dport } } };
security_sk_classify_flow(sk, &fl);
- err = ip_route_output_flow(&rt, &fl, sk, 1);
+ err = ip_route_output_flow(&init_net, &rt, &fl, sk, 1);
if (err) {
if (err == -ENETUNREACH)
IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
@@ -825,6 +842,7 @@ int udp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
struct sockaddr_in *sin = (struct sockaddr_in *)msg->msg_name;
struct sk_buff *skb;
unsigned int ulen, copied;
+ int peeked;
int err;
int is_udplite = IS_UDPLITE(sk);
@@ -838,7 +856,8 @@ int udp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
return ip_recv_error(sk, msg, len);
try_again:
- skb = skb_recv_datagram(sk, flags, noblock, &err);
+ skb = __skb_recv_datagram(sk, flags | (noblock ? MSG_DONTWAIT : 0),
+ &peeked, &err);
if (!skb)
goto out;
@@ -873,6 +892,9 @@ try_again:
if (err)
goto out_free;
+ if (!peeked)
+ UDP_INC_STATS_USER(UDP_MIB_INDATAGRAMS, is_udplite);
+
sock_recv_timestamp(msg, sk, skb);
/* Copy the address. */
@@ -891,14 +913,17 @@ try_again:
err = ulen;
out_free:
+ lock_sock(sk);
skb_free_datagram(sk, skb);
+ release_sock(sk);
out:
return err;
csum_copy_err:
- UDP_INC_STATS_BH(UDP_MIB_INERRORS, is_udplite);
-
- skb_kill_datagram(sk, skb, flags);
+ lock_sock(sk);
+ if (!skb_kill_datagram(sk, skb, flags))
+ UDP_INC_STATS_USER(UDP_MIB_INERRORS, is_udplite);
+ release_sock(sk);
if (noblock)
return -EAGAIN;
@@ -940,6 +965,7 @@ int udp_queue_rcv_skb(struct sock * sk, struct sk_buff *skb)
{
struct udp_sock *up = udp_sk(sk);
int rc;
+ int is_udplite = IS_UDPLITE(sk);
/*
* Charge it to the socket, dropping if the queue is full.
@@ -967,7 +993,8 @@ int udp_queue_rcv_skb(struct sock * sk, struct sk_buff *skb)
ret = (*up->encap_rcv)(sk, skb);
if (ret <= 0) {
- UDP_INC_STATS_BH(UDP_MIB_INDATAGRAMS, up->pcflag);
+ UDP_INC_STATS_BH(UDP_MIB_INDATAGRAMS,
+ is_udplite);
return -ret;
}
}
@@ -978,7 +1005,7 @@ int udp_queue_rcv_skb(struct sock * sk, struct sk_buff *skb)
/*
* UDP-Lite specific tests, ignored on UDP sockets
*/
- if ((up->pcflag & UDPLITE_RECV_CC) && UDP_SKB_CB(skb)->partial_cov) {
+ if ((is_udplite & UDPLITE_RECV_CC) && UDP_SKB_CB(skb)->partial_cov) {
/*
* MIB statistics other than incrementing the error count are
@@ -1019,15 +1046,14 @@ int udp_queue_rcv_skb(struct sock * sk, struct sk_buff *skb)
if ((rc = sock_queue_rcv_skb(sk,skb)) < 0) {
/* Note that an ENOMEM error is charged twice */
if (rc == -ENOMEM)
- UDP_INC_STATS_BH(UDP_MIB_RCVBUFERRORS, up->pcflag);
+ UDP_INC_STATS_BH(UDP_MIB_RCVBUFERRORS, is_udplite);
goto drop;
}
- UDP_INC_STATS_BH(UDP_MIB_INDATAGRAMS, up->pcflag);
return 0;
drop:
- UDP_INC_STATS_BH(UDP_MIB_INERRORS, up->pcflag);
+ UDP_INC_STATS_BH(UDP_MIB_INERRORS, is_udplite);
kfree_skb(skb);
return -1;
}
@@ -1062,7 +1088,15 @@ static int __udp4_lib_mcast_deliver(struct sk_buff *skb,
skb1 = skb_clone(skb, GFP_ATOMIC);
if (skb1) {
- int ret = udp_queue_rcv_skb(sk, skb1);
+ int ret = 0;
+
+ bh_lock_sock_nested(sk);
+ if (!sock_owned_by_user(sk))
+ ret = udp_queue_rcv_skb(sk, skb1);
+ else
+ sk_add_backlog(sk, skb1);
+ bh_unlock_sock(sk);
+
if (ret > 0)
/* we should probably re-process instead
* of dropping packets here. */
@@ -1155,7 +1189,13 @@ int __udp4_lib_rcv(struct sk_buff *skb, struct hlist_head udptable[],
inet_iif(skb), udptable);
if (sk != NULL) {
- int ret = udp_queue_rcv_skb(sk, skb);
+ int ret = 0;
+ bh_lock_sock_nested(sk);
+ if (!sock_owned_by_user(sk))
+ ret = udp_queue_rcv_skb(sk, skb);
+ else
+ sk_add_backlog(sk, skb);
+ bh_unlock_sock(sk);
sock_put(sk);
/* a return value > 0 means to resubmit the input, but
@@ -1236,6 +1276,7 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname,
struct udp_sock *up = udp_sk(sk);
int val;
int err = 0;
+ int is_udplite = IS_UDPLITE(sk);
if (optlen<sizeof(int))
return -EINVAL;
@@ -1277,7 +1318,7 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname,
/* The sender sets actual checksum coverage length via this option.
* The case coverage > packet length is handled by send module. */
case UDPLITE_SEND_CSCOV:
- if (!up->pcflag) /* Disable the option on UDP sockets */
+ if (!is_udplite) /* Disable the option on UDP sockets */
return -ENOPROTOOPT;
if (val != 0 && val < 8) /* Illegal coverage: use default (8) */
val = 8;
@@ -1289,7 +1330,7 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname,
* sense, this should be set to at least 8 (as done below). If zero is
* used, this again means full checksum coverage. */
case UDPLITE_RECV_CSCOV:
- if (!up->pcflag) /* Disable the option on UDP sockets */
+ if (!is_udplite) /* Disable the option on UDP sockets */
return -ENOPROTOOPT;
if (val != 0 && val < 8) /* Avoid silly minimal values. */
val = 8;
@@ -1449,6 +1490,10 @@ struct proto udp_prot = {
.hash = udp_lib_hash,
.unhash = udp_lib_unhash,
.get_port = udp_v4_get_port,
+ .memory_allocated = &udp_memory_allocated,
+ .sysctl_mem = sysctl_udp_mem,
+ .sysctl_wmem = &sysctl_udp_wmem_min,
+ .sysctl_rmem = &sysctl_udp_rmem_min,
.obj_size = sizeof(struct udp_sock),
#ifdef CONFIG_COMPAT
.compat_setsockopt = compat_udp_setsockopt,
@@ -1505,6 +1550,7 @@ static struct sock *udp_get_idx(struct seq_file *seq, loff_t pos)
}
static void *udp_seq_start(struct seq_file *seq, loff_t *pos)
+ __acquires(udp_hash_lock)
{
read_lock(&udp_hash_lock);
return *pos ? udp_get_idx(seq, *pos-1) : (void *)1;
@@ -1524,6 +1570,7 @@ static void *udp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
}
static void udp_seq_stop(struct seq_file *seq, void *v)
+ __releases(udp_hash_lock)
{
read_unlock(&udp_hash_lock);
}
@@ -1644,6 +1691,25 @@ void udp4_proc_exit(void)
}
#endif /* CONFIG_PROC_FS */
+void __init udp_init(void)
+{
+ unsigned long limit;
+
+ /* Set the pressure threshold up by the same strategy of TCP. It is a
+ * fraction of global memory that is up to 1/2 at 256 MB, decreasing
+ * toward zero with the amount of memory, with a floor of 128 pages.
+ */
+ limit = min(nr_all_pages, 1UL<<(28-PAGE_SHIFT)) >> (20-PAGE_SHIFT);
+ limit = (limit * (nr_all_pages >> (20-PAGE_SHIFT))) >> (PAGE_SHIFT-11);
+ limit = max(limit, 128UL);
+ sysctl_udp_mem[0] = limit / 4 * 3;
+ sysctl_udp_mem[1] = limit;
+ sysctl_udp_mem[2] = sysctl_udp_mem[0] * 2;
+
+ sysctl_udp_rmem_min = SK_MEM_QUANTUM;
+ sysctl_udp_wmem_min = SK_MEM_QUANTUM;
+}
+
EXPORT_SYMBOL(udp_disconnect);
EXPORT_SYMBOL(udp_hash);
EXPORT_SYMBOL(udp_hash_lock);
diff --git a/net/ipv4/udplite.c b/net/ipv4/udplite.c
index f5baeb3e8b85..001b881ca36f 100644
--- a/net/ipv4/udplite.c
+++ b/net/ipv4/udplite.c
@@ -35,7 +35,7 @@ static int udplite_rcv(struct sk_buff *skb)
static void udplite_err(struct sk_buff *skb, u32 info)
{
- return __udp4_lib_err(skb, info, udplite_hash);
+ __udp4_lib_err(skb, info, udplite_hash);
}
static struct net_protocol udplite_protocol = {
diff --git a/net/ipv4/xfrm4_input.c b/net/ipv4/xfrm4_input.c
index 5e95c8a07efb..390dcb1354a5 100644
--- a/net/ipv4/xfrm4_input.c
+++ b/net/ipv4/xfrm4_input.c
@@ -16,7 +16,11 @@
#include <net/ip.h>
#include <net/xfrm.h>
-#ifdef CONFIG_NETFILTER
+int xfrm4_extract_input(struct xfrm_state *x, struct sk_buff *skb)
+{
+ return xfrm4_extract_header(skb);
+}
+
static inline int xfrm4_rcv_encap_finish(struct sk_buff *skb)
{
if (skb->dst == NULL) {
@@ -31,129 +35,35 @@ drop:
kfree_skb(skb);
return NET_RX_DROP;
}
-#endif
int xfrm4_rcv_encap(struct sk_buff *skb, int nexthdr, __be32 spi,
int encap_type)
{
- int err;
- __be32 seq;
- struct xfrm_state *xfrm_vec[XFRM_MAX_DEPTH];
- struct xfrm_state *x;
- int xfrm_nr = 0;
- int decaps = 0;
- unsigned int nhoff = offsetof(struct iphdr, protocol);
-
- seq = 0;
- if (!spi && (err = xfrm_parse_spi(skb, nexthdr, &spi, &seq)) != 0)
- goto drop;
-
- do {
- const struct iphdr *iph = ip_hdr(skb);
-
- if (xfrm_nr == XFRM_MAX_DEPTH)
- goto drop;
-
- x = xfrm_state_lookup((xfrm_address_t *)&iph->daddr, spi,
- nexthdr, AF_INET);
- if (x == NULL)
- goto drop;
-
- spin_lock(&x->lock);
- if (unlikely(x->km.state != XFRM_STATE_VALID))
- goto drop_unlock;
-
- if ((x->encap ? x->encap->encap_type : 0) != encap_type)
- goto drop_unlock;
-
- if (x->props.replay_window && xfrm_replay_check(x, seq))
- goto drop_unlock;
-
- if (xfrm_state_check_expire(x))
- goto drop_unlock;
-
- nexthdr = x->type->input(x, skb);
- if (nexthdr <= 0)
- goto drop_unlock;
-
- skb_network_header(skb)[nhoff] = nexthdr;
-
- /* only the first xfrm gets the encap type */
- encap_type = 0;
-
- if (x->props.replay_window)
- xfrm_replay_advance(x, seq);
-
- x->curlft.bytes += skb->len;
- x->curlft.packets++;
-
- spin_unlock(&x->lock);
-
- xfrm_vec[xfrm_nr++] = x;
-
- if (x->outer_mode->input(x, skb))
- goto drop;
-
- if (x->outer_mode->flags & XFRM_MODE_FLAG_TUNNEL) {
- decaps = 1;
- break;
- }
-
- err = xfrm_parse_spi(skb, nexthdr, &spi, &seq);
- if (err < 0)
- goto drop;
- } while (!err);
-
- /* Allocate new secpath or COW existing one. */
-
- if (!skb->sp || atomic_read(&skb->sp->refcnt) != 1) {
- struct sec_path *sp;
- sp = secpath_dup(skb->sp);
- if (!sp)
- goto drop;
- if (skb->sp)
- secpath_put(skb->sp);
- skb->sp = sp;
- }
- if (xfrm_nr + skb->sp->len > XFRM_MAX_DEPTH)
- goto drop;
-
- memcpy(skb->sp->xvec + skb->sp->len, xfrm_vec,
- xfrm_nr * sizeof(xfrm_vec[0]));
- skb->sp->len += xfrm_nr;
+ XFRM_SPI_SKB_CB(skb)->family = AF_INET;
+ XFRM_SPI_SKB_CB(skb)->daddroff = offsetof(struct iphdr, daddr);
+ return xfrm_input(skb, nexthdr, spi, encap_type);
+}
+EXPORT_SYMBOL(xfrm4_rcv_encap);
- nf_reset(skb);
+int xfrm4_transport_finish(struct sk_buff *skb, int async)
+{
+ struct iphdr *iph = ip_hdr(skb);
- if (decaps) {
- dst_release(skb->dst);
- skb->dst = NULL;
- netif_rx(skb);
- return 0;
- } else {
-#ifdef CONFIG_NETFILTER
- __skb_push(skb, skb->data - skb_network_header(skb));
- ip_hdr(skb)->tot_len = htons(skb->len);
- ip_send_check(ip_hdr(skb));
+ iph->protocol = XFRM_MODE_SKB_CB(skb)->protocol;
- NF_HOOK(PF_INET, NF_IP_PRE_ROUTING, skb, skb->dev, NULL,
- xfrm4_rcv_encap_finish);
- return 0;
-#else
- return -ip_hdr(skb)->protocol;
+#ifndef CONFIG_NETFILTER
+ if (!async)
+ return -iph->protocol;
#endif
- }
-drop_unlock:
- spin_unlock(&x->lock);
- xfrm_state_put(x);
-drop:
- while (--xfrm_nr >= 0)
- xfrm_state_put(xfrm_vec[xfrm_nr]);
+ __skb_push(skb, skb->data - skb_network_header(skb));
+ iph->tot_len = htons(skb->len);
+ ip_send_check(iph);
- kfree_skb(skb);
+ NF_HOOK(PF_INET, NF_INET_PRE_ROUTING, skb, skb->dev, NULL,
+ xfrm4_rcv_encap_finish);
return 0;
}
-EXPORT_SYMBOL(xfrm4_rcv_encap);
/* If it's a keepalive packet, then just eat it.
* If it's an encapsulated packet, then pass it to the
diff --git a/net/ipv4/xfrm4_mode_beet.c b/net/ipv4/xfrm4_mode_beet.c
index e42e122414be..e093a7b59e18 100644
--- a/net/ipv4/xfrm4_mode_beet.c
+++ b/net/ipv4/xfrm4_mode_beet.c
@@ -17,6 +17,21 @@
#include <net/ip.h>
#include <net/xfrm.h>
+static void xfrm4_beet_make_header(struct sk_buff *skb)
+{
+ struct iphdr *iph = ip_hdr(skb);
+
+ iph->ihl = 5;
+ iph->version = 4;
+
+ iph->protocol = XFRM_MODE_SKB_CB(skb)->protocol;
+ iph->tos = XFRM_MODE_SKB_CB(skb)->tos;
+
+ iph->id = XFRM_MODE_SKB_CB(skb)->id;
+ iph->frag_off = XFRM_MODE_SKB_CB(skb)->frag_off;
+ iph->ttl = XFRM_MODE_SKB_CB(skb)->ttl;
+}
+
/* Add encapsulation header.
*
* The top IP header will be constructed per draft-nikander-esp-beet-mode-06.txt.
@@ -40,10 +55,12 @@ static int xfrm4_beet_output(struct xfrm_state *x, struct sk_buff *skb)
offsetof(struct iphdr, protocol);
skb->transport_header = skb->network_header + sizeof(*iph);
+ xfrm4_beet_make_header(skb);
+
ph = (struct ip_beet_phdr *)__skb_pull(skb, sizeof(*iph) - hdrlen);
top_iph = ip_hdr(skb);
- memmove(top_iph, iph, sizeof(*iph));
+
if (unlikely(optlen)) {
BUG_ON(optlen < 0);
@@ -65,43 +82,46 @@ static int xfrm4_beet_output(struct xfrm_state *x, struct sk_buff *skb)
static int xfrm4_beet_input(struct xfrm_state *x, struct sk_buff *skb)
{
- struct iphdr *iph = ip_hdr(skb);
- int phlen = 0;
+ struct iphdr *iph;
int optlen = 0;
- u8 ph_nexthdr = 0;
int err = -EINVAL;
- if (unlikely(iph->protocol == IPPROTO_BEETPH)) {
+ if (unlikely(XFRM_MODE_SKB_CB(skb)->protocol == IPPROTO_BEETPH)) {
struct ip_beet_phdr *ph;
+ int phlen;
if (!pskb_may_pull(skb, sizeof(*ph)))
goto out;
- ph = (struct ip_beet_phdr *)(ipip_hdr(skb) + 1);
+
+ ph = (struct ip_beet_phdr *)skb->data;
phlen = sizeof(*ph) + ph->padlen;
optlen = ph->hdrlen * 8 + (IPV4_BEET_PHMAXLEN - phlen);
if (optlen < 0 || optlen & 3 || optlen > 250)
goto out;
- if (!pskb_may_pull(skb, phlen + optlen))
- goto out;
- skb->len -= phlen + optlen;
+ XFRM_MODE_SKB_CB(skb)->protocol = ph->nexthdr;
- ph_nexthdr = ph->nexthdr;
+ if (!pskb_may_pull(skb, phlen));
+ goto out;
+ __skb_pull(skb, phlen);
}
- skb_set_network_header(skb, phlen - sizeof(*iph));
- memmove(skb_network_header(skb), iph, sizeof(*iph));
- skb_set_transport_header(skb, phlen + optlen);
- skb->data = skb_transport_header(skb);
+ skb_push(skb, sizeof(*iph));
+ skb_reset_network_header(skb);
+
+ memmove(skb->data - skb->mac_len, skb_mac_header(skb),
+ skb->mac_len);
+ skb_set_mac_header(skb, -skb->mac_len);
+
+ xfrm4_beet_make_header(skb);
iph = ip_hdr(skb);
- iph->ihl = (sizeof(*iph) + optlen) / 4;
- iph->tot_len = htons(skb->len + iph->ihl * 4);
+
+ iph->ihl += optlen / 4;
+ iph->tot_len = htons(skb->len);
iph->daddr = x->sel.daddr.a4;
iph->saddr = x->sel.saddr.a4;
- if (ph_nexthdr)
- iph->protocol = ph_nexthdr;
iph->check = 0;
iph->check = ip_fast_csum(skb_network_header(skb), iph->ihl);
err = 0;
@@ -110,8 +130,10 @@ out:
}
static struct xfrm_mode xfrm4_beet_mode = {
- .input = xfrm4_beet_input,
- .output = xfrm4_beet_output,
+ .input2 = xfrm4_beet_input,
+ .input = xfrm_prepare_input,
+ .output2 = xfrm4_beet_output,
+ .output = xfrm4_prepare_output,
.owner = THIS_MODULE,
.encap = XFRM_MODE_BEET,
.flags = XFRM_MODE_FLAG_TUNNEL,
diff --git a/net/ipv4/xfrm4_mode_tunnel.c b/net/ipv4/xfrm4_mode_tunnel.c
index e4deecba6dd2..8dee617ee900 100644
--- a/net/ipv4/xfrm4_mode_tunnel.c
+++ b/net/ipv4/xfrm4_mode_tunnel.c
@@ -16,92 +16,60 @@
static inline void ipip_ecn_decapsulate(struct sk_buff *skb)
{
- struct iphdr *outer_iph = ip_hdr(skb);
struct iphdr *inner_iph = ipip_hdr(skb);
- if (INET_ECN_is_ce(outer_iph->tos))
+ if (INET_ECN_is_ce(XFRM_MODE_SKB_CB(skb)->tos))
IP_ECN_set_ce(inner_iph);
}
-static inline void ipip6_ecn_decapsulate(struct iphdr *iph, struct sk_buff *skb)
-{
- if (INET_ECN_is_ce(iph->tos))
- IP6_ECN_set_ce(ipv6_hdr(skb));
-}
-
/* Add encapsulation header.
*
* The top IP header will be constructed per RFC 2401.
*/
-static int xfrm4_tunnel_output(struct xfrm_state *x, struct sk_buff *skb)
+static int xfrm4_mode_tunnel_output(struct xfrm_state *x, struct sk_buff *skb)
{
struct dst_entry *dst = skb->dst;
- struct xfrm_dst *xdst = (struct xfrm_dst*)dst;
- struct iphdr *iph, *top_iph;
+ struct iphdr *top_iph;
int flags;
- iph = ip_hdr(skb);
-
skb_set_network_header(skb, -x->props.header_len);
skb->mac_header = skb->network_header +
offsetof(struct iphdr, protocol);
- skb->transport_header = skb->network_header + sizeof(*iph);
+ skb->transport_header = skb->network_header + sizeof(*top_iph);
top_iph = ip_hdr(skb);
top_iph->ihl = 5;
top_iph->version = 4;
- flags = x->props.flags;
+ top_iph->protocol = x->inner_mode->afinfo->proto;
/* DS disclosed */
- if (xdst->route->ops->family == AF_INET) {
- top_iph->protocol = IPPROTO_IPIP;
- top_iph->tos = INET_ECN_encapsulate(iph->tos, iph->tos);
- top_iph->frag_off = (flags & XFRM_STATE_NOPMTUDISC) ?
- 0 : (iph->frag_off & htons(IP_DF));
- }
-#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
- else {
- struct ipv6hdr *ipv6h = (struct ipv6hdr*)iph;
- top_iph->protocol = IPPROTO_IPV6;
- top_iph->tos = INET_ECN_encapsulate(iph->tos, ipv6_get_dsfield(ipv6h));
- top_iph->frag_off = 0;
- }
-#endif
+ top_iph->tos = INET_ECN_encapsulate(XFRM_MODE_SKB_CB(skb)->tos,
+ XFRM_MODE_SKB_CB(skb)->tos);
+ flags = x->props.flags;
if (flags & XFRM_STATE_NOECN)
IP_ECN_clear(top_iph);
- if (!top_iph->frag_off)
- __ip_select_ident(top_iph, dst->child, 0);
+ top_iph->frag_off = (flags & XFRM_STATE_NOPMTUDISC) ?
+ 0 : XFRM_MODE_SKB_CB(skb)->frag_off;
+ ip_select_ident(top_iph, dst->child, NULL);
top_iph->ttl = dst_metric(dst->child, RTAX_HOPLIMIT);
top_iph->saddr = x->props.saddr.a4;
top_iph->daddr = x->id.daddr.a4;
- skb->protocol = htons(ETH_P_IP);
-
- memset(&(IPCB(skb)->opt), 0, sizeof(struct ip_options));
return 0;
}
-static int xfrm4_tunnel_input(struct xfrm_state *x, struct sk_buff *skb)
+static int xfrm4_mode_tunnel_input(struct xfrm_state *x, struct sk_buff *skb)
{
- struct iphdr *iph = ip_hdr(skb);
const unsigned char *old_mac;
int err = -EINVAL;
- switch (iph->protocol){
- case IPPROTO_IPIP:
- break;
-#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
- case IPPROTO_IPV6:
- break;
-#endif
- default:
- goto out;
- }
+ if (XFRM_MODE_SKB_CB(skb)->protocol != IPPROTO_IPIP)
+ goto out;
if (!pskb_may_pull(skb, sizeof(struct iphdr)))
goto out;
@@ -110,20 +78,11 @@ static int xfrm4_tunnel_input(struct xfrm_state *x, struct sk_buff *skb)
(err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
goto out;
- iph = ip_hdr(skb);
- if (iph->protocol == IPPROTO_IPIP) {
- if (x->props.flags & XFRM_STATE_DECAP_DSCP)
- ipv4_copy_dscp(iph, ipip_hdr(skb));
- if (!(x->props.flags & XFRM_STATE_NOECN))
- ipip_ecn_decapsulate(skb);
- }
-#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
- else {
- if (!(x->props.flags & XFRM_STATE_NOECN))
- ipip6_ecn_decapsulate(iph, skb);
- skb->protocol = htons(ETH_P_IPV6);
- }
-#endif
+ if (x->props.flags & XFRM_STATE_DECAP_DSCP)
+ ipv4_copy_dscp(XFRM_MODE_SKB_CB(skb)->tos, ipip_hdr(skb));
+ if (!(x->props.flags & XFRM_STATE_NOECN))
+ ipip_ecn_decapsulate(skb);
+
old_mac = skb_mac_header(skb);
skb_set_mac_header(skb, -skb->mac_len);
memmove(skb_mac_header(skb), old_mac, skb->mac_len);
@@ -135,19 +94,21 @@ out:
}
static struct xfrm_mode xfrm4_tunnel_mode = {
- .input = xfrm4_tunnel_input,
- .output = xfrm4_tunnel_output,
+ .input2 = xfrm4_mode_tunnel_input,
+ .input = xfrm_prepare_input,
+ .output2 = xfrm4_mode_tunnel_output,
+ .output = xfrm4_prepare_output,
.owner = THIS_MODULE,
.encap = XFRM_MODE_TUNNEL,
.flags = XFRM_MODE_FLAG_TUNNEL,
};
-static int __init xfrm4_tunnel_init(void)
+static int __init xfrm4_mode_tunnel_init(void)
{
return xfrm_register_mode(&xfrm4_tunnel_mode, AF_INET);
}
-static void __exit xfrm4_tunnel_exit(void)
+static void __exit xfrm4_mode_tunnel_exit(void)
{
int err;
@@ -155,7 +116,7 @@ static void __exit xfrm4_tunnel_exit(void)
BUG_ON(err);
}
-module_init(xfrm4_tunnel_init);
-module_exit(xfrm4_tunnel_exit);
+module_init(xfrm4_mode_tunnel_init);
+module_exit(xfrm4_mode_tunnel_exit);
MODULE_LICENSE("GPL");
MODULE_ALIAS_XFRM_MODE(AF_INET, XFRM_MODE_TUNNEL);
diff --git a/net/ipv4/xfrm4_output.c b/net/ipv4/xfrm4_output.c
index c4a7156962bd..d5a58a818021 100644
--- a/net/ipv4/xfrm4_output.c
+++ b/net/ipv4/xfrm4_output.c
@@ -8,11 +8,12 @@
* 2 of the License, or (at your option) any later version.
*/
-#include <linux/compiler.h>
#include <linux/if_ether.h>
#include <linux/kernel.h>
+#include <linux/module.h>
#include <linux/skbuff.h>
#include <linux/netfilter_ipv4.h>
+#include <net/dst.h>
#include <net/ip.h>
#include <net/xfrm.h>
#include <net/icmp.h>
@@ -25,8 +26,6 @@ static int xfrm4_tunnel_check_size(struct sk_buff *skb)
if (IPCB(skb)->flags & IPSKB_XFRM_TUNNEL_SIZE)
goto out;
- IPCB(skb)->flags |= IPSKB_XFRM_TUNNEL_SIZE;
-
if (!(ip_hdr(skb)->frag_off & htons(IP_DF)) || skb->local_df)
goto out;
@@ -40,106 +39,54 @@ out:
return ret;
}
-static inline int xfrm4_output_one(struct sk_buff *skb)
+int xfrm4_extract_output(struct xfrm_state *x, struct sk_buff *skb)
{
- struct dst_entry *dst = skb->dst;
- struct xfrm_state *x = dst->xfrm;
- struct iphdr *iph;
int err;
- if (x->outer_mode->flags & XFRM_MODE_FLAG_TUNNEL) {
- err = xfrm4_tunnel_check_size(skb);
- if (err)
- goto error_nolock;
- }
-
- err = xfrm_output(skb);
+ err = xfrm4_tunnel_check_size(skb);
if (err)
- goto error_nolock;
+ return err;
- iph = ip_hdr(skb);
- iph->tot_len = htons(skb->len);
- ip_send_check(iph);
+ XFRM_MODE_SKB_CB(skb)->protocol = ip_hdr(skb)->protocol;
- IPCB(skb)->flags |= IPSKB_XFRM_TRANSFORMED;
- err = 0;
-
-out_exit:
- return err;
-error_nolock:
- kfree_skb(skb);
- goto out_exit;
+ return xfrm4_extract_header(skb);
}
-static int xfrm4_output_finish2(struct sk_buff *skb)
+int xfrm4_prepare_output(struct xfrm_state *x, struct sk_buff *skb)
{
int err;
- while (likely((err = xfrm4_output_one(skb)) == 0)) {
- nf_reset(skb);
-
- err = nf_hook(PF_INET, NF_IP_LOCAL_OUT, skb, NULL,
- skb->dst->dev, dst_output);
- if (unlikely(err != 1))
- break;
+ err = x->inner_mode->afinfo->extract_output(x, skb);
+ if (err)
+ return err;
- if (!skb->dst->xfrm)
- return dst_output(skb);
+ memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
+ IPCB(skb)->flags |= IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED;
- err = nf_hook(PF_INET, NF_IP_POST_ROUTING, skb, NULL,
- skb->dst->dev, xfrm4_output_finish2);
- if (unlikely(err != 1))
- break;
- }
+ skb->protocol = htons(ETH_P_IP);
- return err;
+ return x->outer_mode->output2(x, skb);
}
+EXPORT_SYMBOL(xfrm4_prepare_output);
static int xfrm4_output_finish(struct sk_buff *skb)
{
- struct sk_buff *segs;
-
#ifdef CONFIG_NETFILTER
if (!skb->dst->xfrm) {
IPCB(skb)->flags |= IPSKB_REROUTED;
return dst_output(skb);
}
-#endif
- if (!skb_is_gso(skb))
- return xfrm4_output_finish2(skb);
+ IPCB(skb)->flags |= IPSKB_XFRM_TRANSFORMED;
+#endif
skb->protocol = htons(ETH_P_IP);
- segs = skb_gso_segment(skb, 0);
- kfree_skb(skb);
- if (unlikely(IS_ERR(segs)))
- return PTR_ERR(segs);
-
- do {
- struct sk_buff *nskb = segs->next;
- int err;
-
- segs->next = NULL;
- err = xfrm4_output_finish2(segs);
-
- if (unlikely(err)) {
- while ((segs = nskb)) {
- nskb = segs->next;
- segs->next = NULL;
- kfree_skb(segs);
- }
- return err;
- }
-
- segs = nskb;
- } while (segs);
-
- return 0;
+ return xfrm_output(skb);
}
int xfrm4_output(struct sk_buff *skb)
{
- return NF_HOOK_COND(PF_INET, NF_IP_POST_ROUTING, skb, NULL, skb->dst->dev,
- xfrm4_output_finish,
+ return NF_HOOK_COND(PF_INET, NF_INET_POST_ROUTING, skb,
+ NULL, skb->dst->dev, xfrm4_output_finish,
!(IPCB(skb)->flags & IPSKB_REROUTED));
}
diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c
index cc86fb110dd8..3783e3ee56a4 100644
--- a/net/ipv4/xfrm4_policy.c
+++ b/net/ipv4/xfrm4_policy.c
@@ -8,36 +8,54 @@
*
*/
-#include <linux/compiler.h>
+#include <linux/err.h>
+#include <linux/kernel.h>
#include <linux/inetdevice.h>
+#include <net/dst.h>
#include <net/xfrm.h>
#include <net/ip.h>
static struct dst_ops xfrm4_dst_ops;
static struct xfrm_policy_afinfo xfrm4_policy_afinfo;
-static int xfrm4_dst_lookup(struct xfrm_dst **dst, struct flowi *fl)
+static struct dst_entry *xfrm4_dst_lookup(int tos, xfrm_address_t *saddr,
+ xfrm_address_t *daddr)
{
- return __ip_route_output_key((struct rtable**)dst, fl);
-}
-
-static int xfrm4_get_saddr(xfrm_address_t *saddr, xfrm_address_t *daddr)
-{
- struct rtable *rt;
- struct flowi fl_tunnel = {
+ struct flowi fl = {
.nl_u = {
.ip4_u = {
+ .tos = tos,
.daddr = daddr->a4,
},
},
};
+ struct dst_entry *dst;
+ struct rtable *rt;
+ int err;
- if (!xfrm4_dst_lookup((struct xfrm_dst **)&rt, &fl_tunnel)) {
- saddr->a4 = rt->rt_src;
- dst_release(&rt->u.dst);
- return 0;
- }
- return -EHOSTUNREACH;
+ if (saddr)
+ fl.fl4_src = saddr->a4;
+
+ err = __ip_route_output_key(&init_net, &rt, &fl);
+ dst = &rt->u.dst;
+ if (err)
+ dst = ERR_PTR(err);
+ return dst;
+}
+
+static int xfrm4_get_saddr(xfrm_address_t *saddr, xfrm_address_t *daddr)
+{
+ struct dst_entry *dst;
+ struct rtable *rt;
+
+ dst = xfrm4_dst_lookup(0, NULL, daddr);
+ if (IS_ERR(dst))
+ return -EHOSTUNREACH;
+
+ rt = (struct rtable *)dst;
+ saddr->a4 = rt->rt_src;
+ dst_release(dst);
+ return 0;
}
static struct dst_entry *
@@ -61,142 +79,49 @@ __xfrm4_find_bundle(struct flowi *fl, struct xfrm_policy *policy)
return dst;
}
-/* Allocate chain of dst_entry's, attach known xfrm's, calculate
- * all the metrics... Shortly, bundle a bundle.
- */
+static int xfrm4_get_tos(struct flowi *fl)
+{
+ return fl->fl4_tos;
+}
-static int
-__xfrm4_bundle_create(struct xfrm_policy *policy, struct xfrm_state **xfrm, int nx,
- struct flowi *fl, struct dst_entry **dst_p)
+static int xfrm4_init_path(struct xfrm_dst *path, struct dst_entry *dst,
+ int nfheader_len)
{
- struct dst_entry *dst, *dst_prev;
- struct rtable *rt0 = (struct rtable*)(*dst_p);
- struct rtable *rt = rt0;
- struct flowi fl_tunnel = {
- .nl_u = {
- .ip4_u = {
- .saddr = fl->fl4_src,
- .daddr = fl->fl4_dst,
- .tos = fl->fl4_tos
- }
- }
- };
- int i;
- int err;
- int header_len = 0;
- int trailer_len = 0;
+ return 0;
+}
- dst = dst_prev = NULL;
- dst_hold(&rt->u.dst);
+static int xfrm4_fill_dst(struct xfrm_dst *xdst, struct net_device *dev)
+{
+ struct rtable *rt = (struct rtable *)xdst->route;
- for (i = 0; i < nx; i++) {
- struct dst_entry *dst1 = dst_alloc(&xfrm4_dst_ops);
- struct xfrm_dst *xdst;
+ xdst->u.rt.fl = rt->fl;
- if (unlikely(dst1 == NULL)) {
- err = -ENOBUFS;
- dst_release(&rt->u.dst);
- goto error;
- }
+ xdst->u.dst.dev = dev;
+ dev_hold(dev);
- if (!dst)
- dst = dst1;
- else {
- dst_prev->child = dst1;
- dst1->flags |= DST_NOHASH;
- dst_clone(dst1);
- }
+ xdst->u.rt.idev = in_dev_get(dev);
+ if (!xdst->u.rt.idev)
+ return -ENODEV;
- xdst = (struct xfrm_dst *)dst1;
- xdst->route = &rt->u.dst;
- xdst->genid = xfrm[i]->genid;
-
- dst1->next = dst_prev;
- dst_prev = dst1;
-
- header_len += xfrm[i]->props.header_len;
- trailer_len += xfrm[i]->props.trailer_len;
-
- if (xfrm[i]->props.mode != XFRM_MODE_TRANSPORT) {
- unsigned short encap_family = xfrm[i]->props.family;
- switch (encap_family) {
- case AF_INET:
- fl_tunnel.fl4_dst = xfrm[i]->id.daddr.a4;
- fl_tunnel.fl4_src = xfrm[i]->props.saddr.a4;
- break;
-#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
- case AF_INET6:
- ipv6_addr_copy(&fl_tunnel.fl6_dst, (struct in6_addr*)&xfrm[i]->id.daddr.a6);
- ipv6_addr_copy(&fl_tunnel.fl6_src, (struct in6_addr*)&xfrm[i]->props.saddr.a6);
- break;
-#endif
- default:
- BUG_ON(1);
- }
- err = xfrm_dst_lookup((struct xfrm_dst **)&rt,
- &fl_tunnel, encap_family);
- if (err)
- goto error;
- } else
- dst_hold(&rt->u.dst);
- }
+ xdst->u.rt.peer = rt->peer;
+ if (rt->peer)
+ atomic_inc(&rt->peer->refcnt);
- dst_prev->child = &rt->u.dst;
- dst->path = &rt->u.dst;
-
- *dst_p = dst;
- dst = dst_prev;
-
- dst_prev = *dst_p;
- i = 0;
- for (; dst_prev != &rt->u.dst; dst_prev = dst_prev->child) {
- struct xfrm_dst *x = (struct xfrm_dst*)dst_prev;
- x->u.rt.fl = *fl;
-
- dst_prev->xfrm = xfrm[i++];
- dst_prev->dev = rt->u.dst.dev;
- if (rt->u.dst.dev)
- dev_hold(rt->u.dst.dev);
- dst_prev->obsolete = -1;
- dst_prev->flags |= DST_HOST;
- dst_prev->lastuse = jiffies;
- dst_prev->header_len = header_len;
- dst_prev->nfheader_len = 0;
- dst_prev->trailer_len = trailer_len;
- memcpy(&dst_prev->metrics, &x->route->metrics, sizeof(dst_prev->metrics));
-
- /* Copy neighbout for reachability confirmation */
- dst_prev->neighbour = neigh_clone(rt->u.dst.neighbour);
- dst_prev->input = rt->u.dst.input;
- dst_prev->output = dst_prev->xfrm->outer_mode->afinfo->output;
- if (rt0->peer)
- atomic_inc(&rt0->peer->refcnt);
- x->u.rt.peer = rt0->peer;
- /* Sheit... I remember I did this right. Apparently,
- * it was magically lost, so this code needs audit */
- x->u.rt.rt_flags = rt0->rt_flags&(RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL);
- x->u.rt.rt_type = rt0->rt_type;
- x->u.rt.rt_src = rt0->rt_src;
- x->u.rt.rt_dst = rt0->rt_dst;
- x->u.rt.rt_gateway = rt0->rt_gateway;
- x->u.rt.rt_spec_dst = rt0->rt_spec_dst;
- x->u.rt.idev = rt0->idev;
- in_dev_hold(rt0->idev);
- header_len -= x->u.dst.xfrm->props.header_len;
- trailer_len -= x->u.dst.xfrm->props.trailer_len;
- }
+ /* Sheit... I remember I did this right. Apparently,
+ * it was magically lost, so this code needs audit */
+ xdst->u.rt.rt_flags = rt->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST |
+ RTCF_LOCAL);
+ xdst->u.rt.rt_type = rt->rt_type;
+ xdst->u.rt.rt_src = rt->rt_src;
+ xdst->u.rt.rt_dst = rt->rt_dst;
+ xdst->u.rt.rt_gateway = rt->rt_gateway;
+ xdst->u.rt.rt_spec_dst = rt->rt_spec_dst;
- xfrm_init_pmtu(dst);
return 0;
-
-error:
- if (dst)
- dst_free(dst);
- return err;
}
static void
-_decode_session4(struct sk_buff *skb, struct flowi *fl)
+_decode_session4(struct sk_buff *skb, struct flowi *fl, int reverse)
{
struct iphdr *iph = ip_hdr(skb);
u8 *xprth = skb_network_header(skb) + iph->ihl * 4;
@@ -212,8 +137,8 @@ _decode_session4(struct sk_buff *skb, struct flowi *fl)
if (pskb_may_pull(skb, xprth + 4 - skb->data)) {
__be16 *ports = (__be16 *)xprth;
- fl->fl_ip_sport = ports[0];
- fl->fl_ip_dport = ports[1];
+ fl->fl_ip_sport = ports[!!reverse];
+ fl->fl_ip_dport = ports[!reverse];
}
break;
@@ -255,12 +180,12 @@ _decode_session4(struct sk_buff *skb, struct flowi *fl)
}
}
fl->proto = iph->protocol;
- fl->fl4_dst = iph->daddr;
- fl->fl4_src = iph->saddr;
+ fl->fl4_dst = reverse ? iph->saddr : iph->daddr;
+ fl->fl4_src = reverse ? iph->daddr : iph->saddr;
fl->fl4_tos = iph->tos;
}
-static inline int xfrm4_garbage_collect(void)
+static inline int xfrm4_garbage_collect(struct dst_ops *ops)
{
xfrm4_policy_afinfo.garbage_collect();
return (atomic_read(&xfrm4_dst_ops.entries) > xfrm4_dst_ops.gc_thresh*2);
@@ -295,7 +220,8 @@ static void xfrm4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
xdst = (struct xfrm_dst *)dst;
if (xdst->u.rt.idev->dev == dev) {
- struct in_device *loopback_idev = in_dev_get(init_net.loopback_dev);
+ struct in_device *loopback_idev =
+ in_dev_get(dev->nd_net->loopback_dev);
BUG_ON(!loopback_idev);
do {
@@ -318,6 +244,7 @@ static struct dst_ops xfrm4_dst_ops = {
.update_pmtu = xfrm4_update_pmtu,
.destroy = xfrm4_dst_destroy,
.ifdown = xfrm4_dst_ifdown,
+ .local_out = __ip_local_out,
.gc_thresh = 1024,
.entry_size = sizeof(struct xfrm_dst),
};
@@ -328,8 +255,10 @@ static struct xfrm_policy_afinfo xfrm4_policy_afinfo = {
.dst_lookup = xfrm4_dst_lookup,
.get_saddr = xfrm4_get_saddr,
.find_bundle = __xfrm4_find_bundle,
- .bundle_create = __xfrm4_bundle_create,
.decode_session = _decode_session4,
+ .get_tos = xfrm4_get_tos,
+ .init_path = xfrm4_init_path,
+ .fill_dst = xfrm4_fill_dst,
};
static void __init xfrm4_policy_init(void)
diff --git a/net/ipv4/xfrm4_state.c b/net/ipv4/xfrm4_state.c
index 13d54a1c3337..fdeebe68a379 100644
--- a/net/ipv4/xfrm4_state.c
+++ b/net/ipv4/xfrm4_state.c
@@ -11,6 +11,7 @@
#include <net/xfrm.h>
#include <linux/pfkeyv2.h>
#include <linux/ipsec.h>
+#include <linux/netfilter_ipv4.h>
static struct xfrm_state_afinfo xfrm4_state_afinfo;
@@ -47,12 +48,31 @@ __xfrm4_init_tempsel(struct xfrm_state *x, struct flowi *fl,
x->props.family = AF_INET;
}
+int xfrm4_extract_header(struct sk_buff *skb)
+{
+ struct iphdr *iph = ip_hdr(skb);
+
+ XFRM_MODE_SKB_CB(skb)->id = iph->id;
+ XFRM_MODE_SKB_CB(skb)->frag_off = iph->frag_off;
+ XFRM_MODE_SKB_CB(skb)->tos = iph->tos;
+ XFRM_MODE_SKB_CB(skb)->ttl = iph->ttl;
+ memset(XFRM_MODE_SKB_CB(skb)->flow_lbl, 0,
+ sizeof(XFRM_MODE_SKB_CB(skb)->flow_lbl));
+
+ return 0;
+}
+
static struct xfrm_state_afinfo xfrm4_state_afinfo = {
.family = AF_INET,
+ .proto = IPPROTO_IPIP,
+ .eth_proto = htons(ETH_P_IP),
.owner = THIS_MODULE,
.init_flags = xfrm4_init_flags,
.init_tempsel = __xfrm4_init_tempsel,
.output = xfrm4_output,
+ .extract_input = xfrm4_extract_input,
+ .extract_output = xfrm4_extract_output,
+ .transport_finish = xfrm4_transport_finish,
};
void __init xfrm4_state_init(void)