diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2015-11-04 20:41:05 +0300 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2015-11-04 20:41:05 +0300 |
commit | b0f85fa11aefc4f3e03306b4cd47f113bd57dcba (patch) | |
tree | 1333d36d99fde3f97210795941fc246f0ad08a75 /net/ipv4 | |
parent | ccc9d4a6d640cbde05d519edeb727881646cf71b (diff) | |
parent | f32bfb9a8ca083f8d148ea90ae5ba66f4831836e (diff) | |
download | linux-b0f85fa11aefc4f3e03306b4cd47f113bd57dcba.tar.xz |
Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next
Pull networking updates from David Miller:
Changes of note:
1) Allow to schedule ICMP packets in IPVS, from Alex Gartrell.
2) Provide FIB table ID in ipv4 route dumps just as ipv6 does, from
David Ahern.
3) Allow the user to ask for the statistics to be filtered out of
ipv4/ipv6 address netlink dumps. From Sowmini Varadhan.
4) More work to pass the network namespace context around deep into
various packet path APIs, starting with the netfilter hooks. From
Eric W Biederman.
5) Add layer 2 TX/RX checksum offloading to qeth driver, from Thomas
Richter.
6) Use usec resolution for SYN/ACK RTTs in TCP, from Yuchung Cheng.
7) Support Very High Throughput in wireless MESH code, from Bob
Copeland.
8) Allow setting the ageing_time in switchdev/rocker. From Scott
Feldman.
9) Properly autoload L2TP type modules, from Stephen Hemminger.
10) Fix and enable offload features by default in 8139cp driver, from
David Woodhouse.
11) Support both ipv4 and ipv6 sockets in a single vxlan device, from
Jiri Benc.
12) Fix CWND limiting of thin streams in TCP, from Bendik Rønning
Opstad.
13) Fix IPSEC flowcache overflows on large systems, from Steffen
Klassert.
14) Convert bridging to track VLANs using rhashtable entries rather than
a bitmap. From Nikolay Aleksandrov.
15) Make TCP listener handling completely lockless, this is a major
accomplishment. Incoming request sockets now live in the
established hash table just like any other socket too.
From Eric Dumazet.
15) Provide more bridging attributes to netlink, from Nikolay
Aleksandrov.
16) Use hash based algorithm for ipv4 multipath routing, this was very
long overdue. From Peter Nørlund.
17) Several y2038 cures, mostly avoiding timespec. From Arnd Bergmann.
18) Allow non-root execution of EBPF programs, from Alexei Starovoitov.
19) Support SO_INCOMING_CPU as setsockopt, from Eric Dumazet. This
influences the port binding selection logic used by SO_REUSEPORT.
20) Add ipv6 support to VRF, from David Ahern.
21) Add support for Mellanox Spectrum switch ASIC, from Jiri Pirko.
22) Add rtl8xxxu Realtek wireless driver, from Jes Sorensen.
23) Implement RACK loss recovery in TCP, from Yuchung Cheng.
24) Support multipath routes in MPLS, from Roopa Prabhu.
25) Fix POLLOUT notification for listening sockets in AF_UNIX, from Eric
Dumazet.
26) Add new QED Qlogic river, from Yuval Mintz, Manish Chopra, and
Sudarsana Kalluru.
27) Don't fetch timestamps on AF_UNIX sockets, from Hannes Frederic
Sowa.
28) Support ipv6 geneve tunnels, from John W Linville.
29) Add flood control support to switchdev layer, from Ido Schimmel.
30) Fix CHECKSUM_PARTIAL handling of potentially fragmented frames, from
Hannes Frederic Sowa.
31) Support persistent maps and progs in bpf, from Daniel Borkmann.
* git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next: (1790 commits)
sh_eth: use DMA barriers
switchdev: respect SKIP_EOPNOTSUPP flag in case there is no recursion
net: sched: kill dead code in sch_choke.c
irda: Delete an unnecessary check before the function call "irlmp_unregister_service"
net: dsa: mv88e6xxx: include DSA ports in VLANs
net: dsa: mv88e6xxx: disable SA learning for DSA and CPU ports
net/core: fix for_each_netdev_feature
vlan: Invoke driver vlan hooks only if device is present
arcnet/com20020: add LEDS_CLASS dependency
bpf, verifier: annotate verbose printer with __printf
dp83640: Only wait for timestamps for packets with timestamping enabled.
ptp: Change ptp_class to a proper bitmask
dp83640: Prune rx timestamp list before reading from it
dp83640: Delay scheduled work.
dp83640: Include hash in timestamp/packet matching
ipv6: fix tunnel error handling
net/mlx5e: Fix LSO vlan insertion
net/mlx5e: Re-eanble client vlan TX acceleration
net/mlx5e: Return error in case mlx5e_set_features() fails
net/mlx5e: Don't allow more than max supported channels
...
Diffstat (limited to 'net/ipv4')
66 files changed, 1398 insertions, 1236 deletions
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile index 89aacb630a53..c29809f765dc 100644 --- a/net/ipv4/Makefile +++ b/net/ipv4/Makefile @@ -8,6 +8,7 @@ obj-y := route.o inetpeer.o protocol.o \ inet_timewait_sock.o inet_connection_sock.o \ tcp.o tcp_input.o tcp_output.o tcp_timer.o tcp_ipv4.o \ tcp_minisocks.o tcp_cong.o tcp_metrics.o tcp_fastopen.o \ + tcp_recovery.o \ tcp_offload.o datagram.o raw.o udp.o udplite.o \ udp_offload.o arp.o icmp.o devinet.o af_inet.o igmp.o \ fib_frontend.o fib_semantics.o fib_trie.o \ diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 1d0c3adb6f34..11c4ca13ec3b 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -119,7 +119,7 @@ #ifdef CONFIG_IP_MROUTE #include <linux/mroute.h> #endif -#include <net/vrf.h> +#include <net/l3mdev.h> /* The inetsw table contains everything that inet_create needs to @@ -219,17 +219,13 @@ int inet_listen(struct socket *sock, int backlog) * shutdown() (rather than close()). */ if ((sysctl_tcp_fastopen & TFO_SERVER_ENABLE) != 0 && - !inet_csk(sk)->icsk_accept_queue.fastopenq) { + !inet_csk(sk)->icsk_accept_queue.fastopenq.max_qlen) { if ((sysctl_tcp_fastopen & TFO_SERVER_WO_SOCKOPT1) != 0) - err = fastopen_init_queue(sk, backlog); + fastopen_queue_tune(sk, backlog); else if ((sysctl_tcp_fastopen & TFO_SERVER_WO_SOCKOPT2) != 0) - err = fastopen_init_queue(sk, + fastopen_queue_tune(sk, ((uint)sysctl_tcp_fastopen) >> 16); - else - err = 0; - if (err) - goto out; tcp_fastopen_init_key_once(true); } @@ -450,7 +446,7 @@ int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) goto out; } - tb_id = vrf_dev_table_ifindex(net, sk->sk_bound_dev_if) ? : tb_id; + tb_id = l3mdev_fib_table_by_index(net, sk->sk_bound_dev_if) ? : tb_id; chk_addr_ret = inet_addr_type_table(net, addr->sin_addr.s_addr, tb_id); /* Not specified by any standard per-se, however it breaks too @@ -1043,22 +1039,16 @@ void inet_register_protosw(struct inet_protosw *p) goto out_illegal; /* If we are trying to override a permanent protocol, bail. */ - answer = NULL; last_perm = &inetsw[p->type]; list_for_each(lh, &inetsw[p->type]) { answer = list_entry(lh, struct inet_protosw, list); - /* Check only the non-wild match. */ - if (INET_PROTOSW_PERMANENT & answer->flags) { - if (protocol == answer->protocol) - break; - last_perm = lh; - } - - answer = NULL; + if ((INET_PROTOSW_PERMANENT & answer->flags) == 0) + break; + if (protocol == answer->protocol) + goto out_permanent; + last_perm = lh; } - if (answer) - goto out_permanent; /* Add the new entry after the last permanent entry if any, so that * the new entry does not override a permanent entry when matched with diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index 0c9c3482e419..59b3e0e8fd51 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -624,14 +624,20 @@ out: } EXPORT_SYMBOL(arp_create); +static int arp_xmit_finish(struct net *net, struct sock *sk, struct sk_buff *skb) +{ + return dev_queue_xmit(skb); +} + /* * Send an arp packet. */ void arp_xmit(struct sk_buff *skb) { /* Send it off, maybe filter it using firewalling first. */ - NF_HOOK(NFPROTO_ARP, NF_ARP_OUT, NULL, skb, - NULL, skb->dev, dev_queue_xmit_sk); + NF_HOOK(NFPROTO_ARP, NF_ARP_OUT, + dev_net(skb->dev), NULL, skb, NULL, skb->dev, + arp_xmit_finish); } EXPORT_SYMBOL(arp_xmit); @@ -639,7 +645,7 @@ EXPORT_SYMBOL(arp_xmit); * Process an arp request. */ -static int arp_process(struct sock *sk, struct sk_buff *skb) +static int arp_process(struct net *net, struct sock *sk, struct sk_buff *skb) { struct net_device *dev = skb->dev; struct in_device *in_dev = __in_dev_get_rcu(dev); @@ -651,7 +657,6 @@ static int arp_process(struct sock *sk, struct sk_buff *skb) u16 dev_type = dev->type; int addr_type; struct neighbour *n; - struct net *net = dev_net(dev); struct dst_entry *reply_dst = NULL; bool is_garp = false; @@ -872,7 +877,7 @@ out_free_dst: static void parp_redo(struct sk_buff *skb) { - arp_process(NULL, skb); + arp_process(dev_net(skb->dev), NULL, skb); } @@ -905,8 +910,9 @@ static int arp_rcv(struct sk_buff *skb, struct net_device *dev, memset(NEIGH_CB(skb), 0, sizeof(struct neighbour_cb)); - return NF_HOOK(NFPROTO_ARP, NF_ARP_IN, NULL, skb, - dev, NULL, arp_process); + return NF_HOOK(NFPROTO_ARP, NF_ARP_IN, + dev_net(dev), NULL, skb, dev, NULL, + arp_process); consumeskb: consume_skb(skb); diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index 2d9cb1748f81..cebd9d31e65a 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -1644,7 +1644,8 @@ errout: rtnl_set_sk_err(net, RTNLGRP_IPV4_IFADDR, err); } -static size_t inet_get_link_af_size(const struct net_device *dev) +static size_t inet_get_link_af_size(const struct net_device *dev, + u32 ext_filter_mask) { struct in_device *in_dev = rcu_dereference_rtnl(dev->ip_ptr); @@ -1654,7 +1655,8 @@ static size_t inet_get_link_af_size(const struct net_device *dev) return nla_total_size(IPV4_DEVCONF_MAX * 4); /* IFLA_INET_CONF */ } -static int inet_fill_link_af(struct sk_buff *skb, const struct net_device *dev) +static int inet_fill_link_af(struct sk_buff *skb, const struct net_device *dev, + u32 ext_filter_mask) { struct in_device *in_dev = rcu_dereference_rtnl(dev->ip_ptr); struct nlattr *nla; @@ -2397,4 +2399,3 @@ void __init devinet_init(void) rtnl_register(PF_INET, RTM_GETNETCONF, inet_netconf_get_devconf, inet_netconf_dump_devconf, NULL); } - diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 690bcbc59f26..cc8f3e506cde 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -45,7 +45,7 @@ #include <net/ip_fib.h> #include <net/rtnetlink.h> #include <net/xfrm.h> -#include <net/vrf.h> +#include <net/l3mdev.h> #include <trace/events/fib.h> #ifndef CONFIG_IP_MULTIPLE_TABLES @@ -255,7 +255,7 @@ EXPORT_SYMBOL(inet_addr_type); unsigned int inet_dev_addr_type(struct net *net, const struct net_device *dev, __be32 addr) { - u32 rt_table = vrf_dev_table(dev) ? : RT_TABLE_LOCAL; + u32 rt_table = l3mdev_fib_table(dev) ? : RT_TABLE_LOCAL; return __inet_dev_addr_type(net, dev, addr, rt_table); } @@ -268,7 +268,7 @@ unsigned int inet_addr_type_dev_table(struct net *net, const struct net_device *dev, __be32 addr) { - u32 rt_table = vrf_dev_table(dev) ? : RT_TABLE_LOCAL; + u32 rt_table = l3mdev_fib_table(dev) ? : RT_TABLE_LOCAL; return __inet_dev_addr_type(net, NULL, addr, rt_table); } @@ -332,7 +332,7 @@ static int __fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, bool dev_match; fl4.flowi4_oif = 0; - fl4.flowi4_iif = vrf_master_ifindex_rcu(dev); + fl4.flowi4_iif = l3mdev_master_ifindex_rcu(dev); if (!fl4.flowi4_iif) fl4.flowi4_iif = oif ? : LOOPBACK_IFINDEX; fl4.daddr = src; @@ -367,7 +367,7 @@ static int __fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, if (nh->nh_dev == dev) { dev_match = true; break; - } else if (vrf_master_ifindex_rcu(nh->nh_dev) == dev->ifindex) { + } else if (l3mdev_master_ifindex_rcu(nh->nh_dev) == dev->ifindex) { dev_match = true; break; } @@ -804,7 +804,7 @@ out: static void fib_magic(int cmd, int type, __be32 dst, int dst_len, struct in_ifaddr *ifa) { struct net *net = dev_net(ifa->ifa_dev->dev); - u32 tb_id = vrf_dev_table_rtnl(ifa->ifa_dev->dev); + u32 tb_id = l3mdev_fib_table(ifa->ifa_dev->dev); struct fib_table *tb; struct fib_config cfg = { .fc_protocol = RTPROT_KERNEL, @@ -867,9 +867,10 @@ void fib_add_ifaddr(struct in_ifaddr *ifa) if (!ipv4_is_zeronet(prefix) && !(ifa->ifa_flags & IFA_F_SECONDARY) && (prefix != addr || ifa->ifa_prefixlen < 32)) { - fib_magic(RTM_NEWROUTE, - dev->flags & IFF_LOOPBACK ? RTN_LOCAL : RTN_UNICAST, - prefix, ifa->ifa_prefixlen, prim); + if (!(ifa->ifa_flags & IFA_F_NOPREFIXROUTE)) + fib_magic(RTM_NEWROUTE, + dev->flags & IFF_LOOPBACK ? RTN_LOCAL : RTN_UNICAST, + prefix, ifa->ifa_prefixlen, prim); /* Add network specific broadcasts, when it takes a sense */ if (ifa->ifa_prefixlen < 31) { @@ -914,9 +915,10 @@ void fib_del_ifaddr(struct in_ifaddr *ifa, struct in_ifaddr *iprim) } } else if (!ipv4_is_zeronet(any) && (any != ifa->ifa_local || ifa->ifa_prefixlen < 32)) { - fib_magic(RTM_DELROUTE, - dev->flags & IFF_LOOPBACK ? RTN_LOCAL : RTN_UNICAST, - any, ifa->ifa_prefixlen, prim); + if (!(ifa->ifa_flags & IFA_F_NOPREFIXROUTE)) + fib_magic(RTM_DELROUTE, + dev->flags & IFF_LOOPBACK ? RTN_LOCAL : RTN_UNICAST, + any, ifa->ifa_prefixlen, prim); subnet = 1; } @@ -1110,9 +1112,10 @@ static void nl_fib_lookup_exit(struct net *net) net->ipv4.fibnl = NULL; } -static void fib_disable_ip(struct net_device *dev, unsigned long event) +static void fib_disable_ip(struct net_device *dev, unsigned long event, + bool force) { - if (fib_sync_down_dev(dev, event)) + if (fib_sync_down_dev(dev, event, force)) fib_flush(dev_net(dev)); rt_cache_flush(dev_net(dev)); arp_ifdown(dev); @@ -1140,7 +1143,7 @@ static int fib_inetaddr_event(struct notifier_block *this, unsigned long event, /* Last address was deleted from this interface. * Disable IP. */ - fib_disable_ip(dev, event); + fib_disable_ip(dev, event, true); } else { rt_cache_flush(dev_net(dev)); } @@ -1157,7 +1160,7 @@ static int fib_netdev_event(struct notifier_block *this, unsigned long event, vo unsigned int flags; if (event == NETDEV_UNREGISTER) { - fib_disable_ip(dev, event); + fib_disable_ip(dev, event, true); rt_flush_dev(dev); return NOTIFY_DONE; } @@ -1178,14 +1181,14 @@ static int fib_netdev_event(struct notifier_block *this, unsigned long event, vo rt_cache_flush(net); break; case NETDEV_DOWN: - fib_disable_ip(dev, event); + fib_disable_ip(dev, event, false); break; case NETDEV_CHANGE: flags = dev_get_flags(dev); if (flags & (IFF_RUNNING | IFF_LOWER_UP)) fib_sync_up(dev, RTNH_F_LINKDOWN); else - fib_sync_down_dev(dev, event); + fib_sync_down_dev(dev, event, false); /* fall through */ case NETDEV_CHANGEMTU: rt_cache_flush(net); diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index 064bd3caaa4f..3e87447e65c7 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -57,8 +57,7 @@ static unsigned int fib_info_cnt; static struct hlist_head fib_info_devhash[DEVINDEX_HASHSIZE]; #ifdef CONFIG_IP_ROUTE_MULTIPATH - -static DEFINE_SPINLOCK(fib_multipath_lock); +u32 fib_multipath_secret __read_mostly; #define for_nexthops(fi) { \ int nhsel; const struct fib_nh *nh; \ @@ -532,7 +531,67 @@ errout: return ret; } -#endif +static void fib_rebalance(struct fib_info *fi) +{ + int total; + int w; + struct in_device *in_dev; + + if (fi->fib_nhs < 2) + return; + + total = 0; + for_nexthops(fi) { + if (nh->nh_flags & RTNH_F_DEAD) + continue; + + in_dev = __in_dev_get_rtnl(nh->nh_dev); + + if (in_dev && + IN_DEV_IGNORE_ROUTES_WITH_LINKDOWN(in_dev) && + nh->nh_flags & RTNH_F_LINKDOWN) + continue; + + total += nh->nh_weight; + } endfor_nexthops(fi); + + w = 0; + change_nexthops(fi) { + int upper_bound; + + in_dev = __in_dev_get_rtnl(nexthop_nh->nh_dev); + + if (nexthop_nh->nh_flags & RTNH_F_DEAD) { + upper_bound = -1; + } else if (in_dev && + IN_DEV_IGNORE_ROUTES_WITH_LINKDOWN(in_dev) && + nexthop_nh->nh_flags & RTNH_F_LINKDOWN) { + upper_bound = -1; + } else { + w += nexthop_nh->nh_weight; + upper_bound = DIV_ROUND_CLOSEST_ULL((u64)w << 31, + total) - 1; + } + + atomic_set(&nexthop_nh->nh_upper_bound, upper_bound); + } endfor_nexthops(fi); + + net_get_random_once(&fib_multipath_secret, + sizeof(fib_multipath_secret)); +} + +static inline void fib_add_weight(struct fib_info *fi, + const struct fib_nh *nh) +{ + fi->fib_weight += nh->nh_weight; +} + +#else /* CONFIG_IP_ROUTE_MULTIPATH */ + +#define fib_rebalance(fi) do { } while (0) +#define fib_add_weight(fi, nh) do { } while (0) + +#endif /* CONFIG_IP_ROUTE_MULTIPATH */ static int fib_encap_match(struct net *net, u16 encap_type, struct nlattr *encap, @@ -1094,8 +1153,11 @@ struct fib_info *fib_create_info(struct fib_config *cfg) change_nexthops(fi) { fib_info_update_nh_saddr(net, nexthop_nh); + fib_add_weight(fi, nexthop_nh); } endfor_nexthops(fi) + fib_rebalance(fi); + link_it: ofi = fib_find_info(fi); if (ofi) { @@ -1281,7 +1343,13 @@ int fib_sync_down_addr(struct net *net, __be32 local) return ret; } -int fib_sync_down_dev(struct net_device *dev, unsigned long event) +/* Event force Flags Description + * NETDEV_CHANGE 0 LINKDOWN Carrier OFF, not for scope host + * NETDEV_DOWN 0 LINKDOWN|DEAD Link down, not for scope host + * NETDEV_DOWN 1 LINKDOWN|DEAD Last address removed + * NETDEV_UNREGISTER 1 LINKDOWN|DEAD Device removed + */ +int fib_sync_down_dev(struct net_device *dev, unsigned long event, bool force) { int ret = 0; int scope = RT_SCOPE_NOWHERE; @@ -1290,8 +1358,7 @@ int fib_sync_down_dev(struct net_device *dev, unsigned long event) struct hlist_head *head = &fib_info_devhash[hash]; struct fib_nh *nh; - if (event == NETDEV_UNREGISTER || - event == NETDEV_DOWN) + if (force) scope = -1; hlist_for_each_entry(nh, head, nh_hash) { @@ -1317,12 +1384,6 @@ int fib_sync_down_dev(struct net_device *dev, unsigned long event) nexthop_nh->nh_flags |= RTNH_F_LINKDOWN; break; } -#ifdef CONFIG_IP_ROUTE_MULTIPATH - spin_lock_bh(&fib_multipath_lock); - fi->fib_power -= nexthop_nh->nh_power; - nexthop_nh->nh_power = 0; - spin_unlock_bh(&fib_multipath_lock); -#endif dead++; } #ifdef CONFIG_IP_ROUTE_MULTIPATH @@ -1345,6 +1406,8 @@ int fib_sync_down_dev(struct net_device *dev, unsigned long event) } ret++; } + + fib_rebalance(fi); } return ret; @@ -1440,6 +1503,13 @@ int fib_sync_up(struct net_device *dev, unsigned int nh_flags) if (!(dev->flags & IFF_UP)) return 0; + if (nh_flags & RTNH_F_DEAD) { + unsigned int flags = dev_get_flags(dev); + + if (flags & (IFF_RUNNING | IFF_LOWER_UP)) + nh_flags |= RTNH_F_LINKDOWN; + } + prev_fi = NULL; hash = fib_devindex_hashfn(dev->ifindex); head = &fib_info_devhash[hash]; @@ -1467,20 +1537,15 @@ int fib_sync_up(struct net_device *dev, unsigned int nh_flags) !__in_dev_get_rtnl(dev)) continue; alive++; -#ifdef CONFIG_IP_ROUTE_MULTIPATH - spin_lock_bh(&fib_multipath_lock); - nexthop_nh->nh_power = 0; - nexthop_nh->nh_flags &= ~nh_flags; - spin_unlock_bh(&fib_multipath_lock); -#else nexthop_nh->nh_flags &= ~nh_flags; -#endif } endfor_nexthops(fi) if (alive > 0) { fi->fib_flags &= ~nh_flags; ret++; } + + fib_rebalance(fi); } return ret; @@ -1488,62 +1553,41 @@ int fib_sync_up(struct net_device *dev, unsigned int nh_flags) #ifdef CONFIG_IP_ROUTE_MULTIPATH -/* - * The algorithm is suboptimal, but it provides really - * fair weighted route distribution. - */ -void fib_select_multipath(struct fib_result *res) +void fib_select_multipath(struct fib_result *res, int hash) { struct fib_info *fi = res->fi; - struct in_device *in_dev; - int w; - - spin_lock_bh(&fib_multipath_lock); - if (fi->fib_power <= 0) { - int power = 0; - change_nexthops(fi) { - in_dev = __in_dev_get_rcu(nexthop_nh->nh_dev); - if (nexthop_nh->nh_flags & RTNH_F_DEAD) - continue; - if (in_dev && - IN_DEV_IGNORE_ROUTES_WITH_LINKDOWN(in_dev) && - nexthop_nh->nh_flags & RTNH_F_LINKDOWN) - continue; - power += nexthop_nh->nh_weight; - nexthop_nh->nh_power = nexthop_nh->nh_weight; - } endfor_nexthops(fi); - fi->fib_power = power; - if (power <= 0) { - spin_unlock_bh(&fib_multipath_lock); - /* Race condition: route has just become dead. */ - res->nh_sel = 0; - return; - } - } - - /* w should be random number [0..fi->fib_power-1], - * it is pretty bad approximation. - */ - - w = jiffies % fi->fib_power; + for_nexthops(fi) { + if (hash > atomic_read(&nh->nh_upper_bound)) + continue; - change_nexthops(fi) { - if (!(nexthop_nh->nh_flags & RTNH_F_DEAD) && - nexthop_nh->nh_power) { - w -= nexthop_nh->nh_power; - if (w <= 0) { - nexthop_nh->nh_power--; - fi->fib_power--; - res->nh_sel = nhsel; - spin_unlock_bh(&fib_multipath_lock); - return; - } - } + res->nh_sel = nhsel; + return; } endfor_nexthops(fi); /* Race condition: route has just become dead. */ res->nh_sel = 0; - spin_unlock_bh(&fib_multipath_lock); } #endif + +void fib_select_path(struct net *net, struct fib_result *res, + struct flowi4 *fl4, int mp_hash) +{ +#ifdef CONFIG_IP_ROUTE_MULTIPATH + if (res->fi->fib_nhs > 1 && fl4->flowi4_oif == 0) { + if (mp_hash < 0) + mp_hash = get_hash_from_flowi4(fl4) >> 1; + + fib_select_multipath(res, mp_hash); + } + else +#endif + if (!res->prefixlen && + res->table->tb_num_default > 1 && + res->type == RTN_UNICAST && !fl4->flowi4_oif) + fib_select_default(fl4, res); + + if (!fl4->saddr) + fl4->saddr = FIB_RES_PREFSRC(net, *res); +} +EXPORT_SYMBOL_GPL(fib_select_path); diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index e5eb8ac4089d..36e26977c908 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -96,7 +96,7 @@ #include <net/xfrm.h> #include <net/inet_common.h> #include <net/ip_fib.h> -#include <net/vrf.h> +#include <net/l3mdev.h> /* * Build xmit assembly blocks @@ -309,7 +309,7 @@ static bool icmpv4_xrlim_allow(struct net *net, struct rtable *rt, rc = false; if (icmp_global_allow()) { - int vif = vrf_master_ifindex(dst->dev); + int vif = l3mdev_master_ifindex(dst->dev); struct inet_peer *peer; peer = inet_getpeer_v4(net->ipv4.peers, fl4->daddr, vif, 1); @@ -427,7 +427,7 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb) fl4.flowi4_mark = mark; fl4.flowi4_tos = RT_TOS(ip_hdr(skb)->tos); fl4.flowi4_proto = IPPROTO_ICMP; - fl4.flowi4_oif = vrf_master_ifindex(skb->dev); + fl4.flowi4_oif = l3mdev_master_ifindex(skb->dev); security_skb_classify_flow(skb, flowi4_to_flowi(&fl4)); rt = ip_route_output_key(net, &fl4); if (IS_ERR(rt)) @@ -440,6 +440,22 @@ out_unlock: icmp_xmit_unlock(sk); } +#ifdef CONFIG_IP_ROUTE_MULTIPATH + +/* Source and destination is swapped. See ip_multipath_icmp_hash */ +static int icmp_multipath_hash_skb(const struct sk_buff *skb) +{ + const struct iphdr *iph = ip_hdr(skb); + + return fib_multipath_hash(iph->daddr, iph->saddr); +} + +#else + +#define icmp_multipath_hash_skb(skb) (-1) + +#endif + static struct rtable *icmp_route_lookup(struct net *net, struct flowi4 *fl4, struct sk_buff *skb_in, @@ -461,10 +477,11 @@ static struct rtable *icmp_route_lookup(struct net *net, fl4->flowi4_proto = IPPROTO_ICMP; fl4->fl4_icmp_type = type; fl4->fl4_icmp_code = code; - fl4->flowi4_oif = vrf_master_ifindex(skb_in->dev); + fl4->flowi4_oif = l3mdev_master_ifindex(skb_in->dev); security_skb_classify_flow(skb_in, flowi4_to_flowi(fl4)); - rt = __ip_route_output_key(net, fl4); + rt = __ip_route_output_key_hash(net, fl4, + icmp_multipath_hash_skb(skb_in)); if (IS_ERR(rt)) return rt; diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index d38b8b61eaee..64aaf3522a59 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -397,7 +397,7 @@ static int igmpv3_sendpack(struct sk_buff *skb) pig->csum = ip_compute_csum(igmp_hdr(skb), igmplen); - return ip_local_out(skb); + return ip_local_out(dev_net(skb_dst(skb)->dev), skb->sk, skb); } static int grec_size(struct ip_mc_list *pmc, int type, int gdel, int sdel) @@ -739,7 +739,7 @@ static int igmp_send_report(struct in_device *in_dev, struct ip_mc_list *pmc, ih->group = group; ih->csum = ip_compute_csum((void *)ih, sizeof(struct igmphdr)); - return ip_local_out(skb); + return ip_local_out(net, skb->sk, skb); } static void igmp_gq_timer_expire(unsigned long data) @@ -2569,7 +2569,7 @@ void ip_mc_drop_socket(struct sock *sk) } /* called with rcu_read_lock() */ -int ip_check_mc_rcu(struct in_device *in_dev, __be32 mc_addr, __be32 src_addr, u16 proto) +int ip_check_mc_rcu(struct in_device *in_dev, __be32 mc_addr, __be32 src_addr, u8 proto) { struct ip_mc_list *im; struct ip_mc_list __rcu **mc_hash; diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 61b45a17fc73..1feb15f23de8 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -330,14 +330,12 @@ struct sock *inet_csk_accept(struct sock *sk, int flags, int *err) if (error) goto out_err; } - req = reqsk_queue_remove(queue); + req = reqsk_queue_remove(queue, sk); newsk = req->sk; - sk_acceptq_removed(sk); if (sk->sk_protocol == IPPROTO_TCP && - tcp_rsk(req)->tfo_listener && - queue->fastopenq) { - spin_lock_bh(&queue->fastopenq->lock); + tcp_rsk(req)->tfo_listener) { + spin_lock_bh(&queue->fastopenq.lock); if (tcp_rsk(req)->tfo_listener) { /* We are still waiting for the final ACK from 3WHS * so can't free req now. Instead, we set req->sk to @@ -348,7 +346,7 @@ struct sock *inet_csk_accept(struct sock *sk, int flags, int *err) req->sk = NULL; req = NULL; } - spin_unlock_bh(&queue->fastopenq->lock); + spin_unlock_bh(&queue->fastopenq.lock); } out: release_sock(sk); @@ -408,7 +406,7 @@ void inet_csk_reset_keepalive_timer(struct sock *sk, unsigned long len) } EXPORT_SYMBOL(inet_csk_reset_keepalive_timer); -struct dst_entry *inet_csk_route_req(struct sock *sk, +struct dst_entry *inet_csk_route_req(const struct sock *sk, struct flowi4 *fl4, const struct request_sock *req) { @@ -439,7 +437,7 @@ no_route: } EXPORT_SYMBOL_GPL(inet_csk_route_req); -struct dst_entry *inet_csk_route_child_sock(struct sock *sk, +struct dst_entry *inet_csk_route_child_sock(const struct sock *sk, struct sock *newsk, const struct request_sock *req) { @@ -478,65 +476,12 @@ no_route: } EXPORT_SYMBOL_GPL(inet_csk_route_child_sock); -static inline u32 inet_synq_hash(const __be32 raddr, const __be16 rport, - const u32 rnd, const u32 synq_hsize) -{ - return jhash_2words((__force u32)raddr, (__force u32)rport, rnd) & (synq_hsize - 1); -} - #if IS_ENABLED(CONFIG_IPV6) #define AF_INET_FAMILY(fam) ((fam) == AF_INET) #else #define AF_INET_FAMILY(fam) true #endif -/* Note: this is temporary : - * req sock will no longer be in listener hash table -*/ -struct request_sock *inet_csk_search_req(struct sock *sk, - const __be16 rport, - const __be32 raddr, - const __be32 laddr) -{ - struct inet_connection_sock *icsk = inet_csk(sk); - struct listen_sock *lopt = icsk->icsk_accept_queue.listen_opt; - struct request_sock *req; - u32 hash = inet_synq_hash(raddr, rport, lopt->hash_rnd, - lopt->nr_table_entries); - - spin_lock(&icsk->icsk_accept_queue.syn_wait_lock); - for (req = lopt->syn_table[hash]; req != NULL; req = req->dl_next) { - const struct inet_request_sock *ireq = inet_rsk(req); - - if (ireq->ir_rmt_port == rport && - ireq->ir_rmt_addr == raddr && - ireq->ir_loc_addr == laddr && - AF_INET_FAMILY(req->rsk_ops->family)) { - atomic_inc(&req->rsk_refcnt); - WARN_ON(req->sk); - break; - } - } - spin_unlock(&icsk->icsk_accept_queue.syn_wait_lock); - - return req; -} -EXPORT_SYMBOL_GPL(inet_csk_search_req); - -void inet_csk_reqsk_queue_hash_add(struct sock *sk, struct request_sock *req, - unsigned long timeout) -{ - struct inet_connection_sock *icsk = inet_csk(sk); - struct listen_sock *lopt = icsk->icsk_accept_queue.listen_opt; - const u32 h = inet_synq_hash(inet_rsk(req)->ir_rmt_addr, - inet_rsk(req)->ir_rmt_port, - lopt->hash_rnd, lopt->nr_table_entries); - - reqsk_queue_hash_req(&icsk->icsk_accept_queue, h, req, timeout); - inet_csk_reqsk_queue_added(sk, timeout); -} -EXPORT_SYMBOL_GPL(inet_csk_reqsk_queue_hash_add); - /* Only thing we need from tcp.h */ extern int sysctl_tcp_synack_retries; @@ -563,7 +508,7 @@ static inline void syn_ack_recalc(struct request_sock *req, const int thresh, req->num_timeout >= rskq_defer_accept - 1; } -int inet_rtx_syn_ack(struct sock *parent, struct request_sock *req) +int inet_rtx_syn_ack(const struct sock *parent, struct request_sock *req) { int err = req->rsk_ops->rtx_syn_ack(parent, req); @@ -573,27 +518,20 @@ int inet_rtx_syn_ack(struct sock *parent, struct request_sock *req) } EXPORT_SYMBOL(inet_rtx_syn_ack); -/* return true if req was found in the syn_table[] */ +/* return true if req was found in the ehash table */ static bool reqsk_queue_unlink(struct request_sock_queue *queue, struct request_sock *req) { - struct request_sock **prev; - struct listen_sock *lopt; + struct inet_hashinfo *hashinfo = req_to_sk(req)->sk_prot->h.hashinfo; bool found = false; - spin_lock(&queue->syn_wait_lock); - lopt = queue->listen_opt; - if (lopt) { - for (prev = &lopt->syn_table[req->rsk_hash]; *prev != NULL; - prev = &(*prev)->dl_next) { - if (*prev == req) { - *prev = req->dl_next; - found = true; - break; - } - } + if (sk_hashed(req_to_sk(req))) { + spinlock_t *lock = inet_ehash_lockp(hashinfo, req->rsk_hash); + + spin_lock(lock); + found = __sk_nulls_del_node_init_rcu(req_to_sk(req)); + spin_unlock(lock); } - spin_unlock(&queue->syn_wait_lock); if (timer_pending(&req->rsk_timer) && del_timer_sync(&req->rsk_timer)) reqsk_put(req); return found; @@ -608,21 +546,25 @@ void inet_csk_reqsk_queue_drop(struct sock *sk, struct request_sock *req) } EXPORT_SYMBOL(inet_csk_reqsk_queue_drop); +void inet_csk_reqsk_queue_drop_and_put(struct sock *sk, struct request_sock *req) +{ + inet_csk_reqsk_queue_drop(sk, req); + reqsk_put(req); +} +EXPORT_SYMBOL(inet_csk_reqsk_queue_drop_and_put); + static void reqsk_timer_handler(unsigned long data) { struct request_sock *req = (struct request_sock *)data; struct sock *sk_listener = req->rsk_listener; struct inet_connection_sock *icsk = inet_csk(sk_listener); struct request_sock_queue *queue = &icsk->icsk_accept_queue; - struct listen_sock *lopt = queue->listen_opt; int qlen, expire = 0, resend = 0; int max_retries, thresh; u8 defer_accept; - if (sk_listener->sk_state != TCP_LISTEN || !lopt) { - reqsk_put(req); - return; - } + if (sk_listener->sk_state != TCP_LISTEN) + goto drop; max_retries = icsk->icsk_syn_retries ? : sysctl_tcp_synack_retries; thresh = max_retries; @@ -643,9 +585,9 @@ static void reqsk_timer_handler(unsigned long data) * embrions; and abort old ones without pity, if old * ones are about to clog our table. */ - qlen = listen_sock_qlen(lopt); - if (qlen >> (lopt->max_qlen_log - 1)) { - int young = listen_sock_young(lopt) << 1; + qlen = reqsk_queue_len(queue); + if ((qlen << 1) > max(8U, sk_listener->sk_max_ack_backlog)) { + int young = reqsk_queue_len_young(queue) << 1; while (thresh > 2) { if (qlen < young) @@ -667,41 +609,40 @@ static void reqsk_timer_handler(unsigned long data) unsigned long timeo; if (req->num_timeout++ == 0) - atomic_inc(&lopt->young_dec); + atomic_dec(&queue->young); timeo = min(TCP_TIMEOUT_INIT << req->num_timeout, TCP_RTO_MAX); mod_timer_pinned(&req->rsk_timer, jiffies + timeo); return; } - inet_csk_reqsk_queue_drop(sk_listener, req); - reqsk_put(req); +drop: + inet_csk_reqsk_queue_drop_and_put(sk_listener, req); } -void reqsk_queue_hash_req(struct request_sock_queue *queue, - u32 hash, struct request_sock *req, - unsigned long timeout) +static void reqsk_queue_hash_req(struct request_sock *req, + unsigned long timeout) { - struct listen_sock *lopt = queue->listen_opt; - req->num_retrans = 0; req->num_timeout = 0; req->sk = NULL; setup_timer(&req->rsk_timer, reqsk_timer_handler, (unsigned long)req); mod_timer_pinned(&req->rsk_timer, jiffies + timeout); - req->rsk_hash = hash; + inet_ehash_insert(req_to_sk(req), NULL); /* before letting lookups find us, make sure all req fields * are committed to memory and refcnt initialized. */ smp_wmb(); - atomic_set(&req->rsk_refcnt, 2); + atomic_set(&req->rsk_refcnt, 2 + 1); +} - spin_lock(&queue->syn_wait_lock); - req->dl_next = lopt->syn_table[hash]; - lopt->syn_table[hash] = req; - spin_unlock(&queue->syn_wait_lock); +void inet_csk_reqsk_queue_hash_add(struct sock *sk, struct request_sock *req, + unsigned long timeout) +{ + reqsk_queue_hash_req(req, timeout); + inet_csk_reqsk_queue_added(sk); } -EXPORT_SYMBOL(reqsk_queue_hash_req); +EXPORT_SYMBOL_GPL(inet_csk_reqsk_queue_hash_add); /** * inet_csk_clone_lock - clone an inet socket, and lock its clone @@ -792,16 +733,14 @@ void inet_csk_prepare_forced_close(struct sock *sk) } EXPORT_SYMBOL(inet_csk_prepare_forced_close); -int inet_csk_listen_start(struct sock *sk, const int nr_table_entries) +int inet_csk_listen_start(struct sock *sk, int backlog) { - struct inet_sock *inet = inet_sk(sk); struct inet_connection_sock *icsk = inet_csk(sk); - int rc = reqsk_queue_alloc(&icsk->icsk_accept_queue, nr_table_entries); + struct inet_sock *inet = inet_sk(sk); - if (rc != 0) - return rc; + reqsk_queue_alloc(&icsk->icsk_accept_queue); - sk->sk_max_ack_backlog = 0; + sk->sk_max_ack_backlog = backlog; sk->sk_ack_backlog = 0; inet_csk_delack_init(sk); @@ -821,11 +760,76 @@ int inet_csk_listen_start(struct sock *sk, const int nr_table_entries) } sk->sk_state = TCP_CLOSE; - __reqsk_queue_destroy(&icsk->icsk_accept_queue); return -EADDRINUSE; } EXPORT_SYMBOL_GPL(inet_csk_listen_start); +static void inet_child_forget(struct sock *sk, struct request_sock *req, + struct sock *child) +{ + sk->sk_prot->disconnect(child, O_NONBLOCK); + + sock_orphan(child); + + percpu_counter_inc(sk->sk_prot->orphan_count); + + if (sk->sk_protocol == IPPROTO_TCP && tcp_rsk(req)->tfo_listener) { + BUG_ON(tcp_sk(child)->fastopen_rsk != req); + BUG_ON(sk != req->rsk_listener); + + /* Paranoid, to prevent race condition if + * an inbound pkt destined for child is + * blocked by sock lock in tcp_v4_rcv(). + * Also to satisfy an assertion in + * tcp_v4_destroy_sock(). + */ + tcp_sk(child)->fastopen_rsk = NULL; + } + inet_csk_destroy_sock(child); + reqsk_put(req); +} + +void inet_csk_reqsk_queue_add(struct sock *sk, struct request_sock *req, + struct sock *child) +{ + struct request_sock_queue *queue = &inet_csk(sk)->icsk_accept_queue; + + spin_lock(&queue->rskq_lock); + if (unlikely(sk->sk_state != TCP_LISTEN)) { + inet_child_forget(sk, req, child); + } else { + req->sk = child; + req->dl_next = NULL; + if (queue->rskq_accept_head == NULL) + queue->rskq_accept_head = req; + else + queue->rskq_accept_tail->dl_next = req; + queue->rskq_accept_tail = req; + sk_acceptq_added(sk); + } + spin_unlock(&queue->rskq_lock); +} +EXPORT_SYMBOL(inet_csk_reqsk_queue_add); + +struct sock *inet_csk_complete_hashdance(struct sock *sk, struct sock *child, + struct request_sock *req, bool own_req) +{ + if (own_req) { + inet_csk_reqsk_queue_drop(sk, req); + reqsk_queue_removed(&inet_csk(sk)->icsk_accept_queue, req); + inet_csk_reqsk_queue_add(sk, req, child); + /* Warning: caller must not call reqsk_put(req); + * child stole last reference on it. + */ + return child; + } + /* Too bad, another child took ownership of the request, undo. */ + bh_unlock_sock(child); + sock_put(child); + return NULL; +} +EXPORT_SYMBOL(inet_csk_complete_hashdance); + /* * This routine closes sockets which have been at least partially * opened, but not yet accepted. @@ -834,11 +838,7 @@ void inet_csk_listen_stop(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); struct request_sock_queue *queue = &icsk->icsk_accept_queue; - struct request_sock *acc_req; - struct request_sock *req; - - /* make all the listen_opt local to us */ - acc_req = reqsk_queue_yank_acceptq(queue); + struct request_sock *next, *req; /* Following specs, it would be better either to send FIN * (and enter FIN-WAIT-1, it is normal close) @@ -848,57 +848,34 @@ void inet_csk_listen_stop(struct sock *sk) * To be honest, we are not able to make either * of the variants now. --ANK */ - reqsk_queue_destroy(queue); - - while ((req = acc_req) != NULL) { + while ((req = reqsk_queue_remove(queue, sk)) != NULL) { struct sock *child = req->sk; - acc_req = req->dl_next; - local_bh_disable(); bh_lock_sock(child); WARN_ON(sock_owned_by_user(child)); sock_hold(child); - sk->sk_prot->disconnect(child, O_NONBLOCK); - - sock_orphan(child); - - percpu_counter_inc(sk->sk_prot->orphan_count); - - if (sk->sk_protocol == IPPROTO_TCP && tcp_rsk(req)->tfo_listener) { - BUG_ON(tcp_sk(child)->fastopen_rsk != req); - BUG_ON(sk != req->rsk_listener); - - /* Paranoid, to prevent race condition if - * an inbound pkt destined for child is - * blocked by sock lock in tcp_v4_rcv(). - * Also to satisfy an assertion in - * tcp_v4_destroy_sock(). - */ - tcp_sk(child)->fastopen_rsk = NULL; - } - inet_csk_destroy_sock(child); - + inet_child_forget(sk, req, child); bh_unlock_sock(child); local_bh_enable(); sock_put(child); - sk_acceptq_removed(sk); - reqsk_put(req); + cond_resched(); } - if (queue->fastopenq) { + if (queue->fastopenq.rskq_rst_head) { /* Free all the reqs queued in rskq_rst_head. */ - spin_lock_bh(&queue->fastopenq->lock); - acc_req = queue->fastopenq->rskq_rst_head; - queue->fastopenq->rskq_rst_head = NULL; - spin_unlock_bh(&queue->fastopenq->lock); - while ((req = acc_req) != NULL) { - acc_req = req->dl_next; + spin_lock_bh(&queue->fastopenq.lock); + req = queue->fastopenq.rskq_rst_head; + queue->fastopenq.rskq_rst_head = NULL; + spin_unlock_bh(&queue->fastopenq.lock); + while (req != NULL) { + next = req->dl_next; reqsk_put(req); + req = next; } } - WARN_ON(sk->sk_ack_backlog); + WARN_ON_ONCE(sk->sk_ack_backlog); } EXPORT_SYMBOL_GPL(inet_csk_listen_stop); diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index c3b1f3a0f4cf..ab9f8a66615d 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -730,91 +730,21 @@ static void twsk_build_assert(void) #endif } -static int inet_diag_dump_reqs(struct sk_buff *skb, struct sock *sk, - struct netlink_callback *cb, - const struct inet_diag_req_v2 *r, - const struct nlattr *bc) -{ - struct inet_connection_sock *icsk = inet_csk(sk); - struct inet_sock *inet = inet_sk(sk); - struct inet_diag_entry entry; - int j, s_j, reqnum, s_reqnum; - struct listen_sock *lopt; - int err = 0; - - s_j = cb->args[3]; - s_reqnum = cb->args[4]; - - if (s_j > 0) - s_j--; - - entry.family = sk->sk_family; - - spin_lock(&icsk->icsk_accept_queue.syn_wait_lock); - - lopt = icsk->icsk_accept_queue.listen_opt; - if (!lopt || !listen_sock_qlen(lopt)) - goto out; - - if (bc) { - entry.sport = inet->inet_num; - entry.userlocks = sk->sk_userlocks; - } - - for (j = s_j; j < lopt->nr_table_entries; j++) { - struct request_sock *req, *head = lopt->syn_table[j]; - - reqnum = 0; - for (req = head; req; reqnum++, req = req->dl_next) { - struct inet_request_sock *ireq = inet_rsk(req); - - if (reqnum < s_reqnum) - continue; - if (r->id.idiag_dport != ireq->ir_rmt_port && - r->id.idiag_dport) - continue; - - if (bc) { - /* Note: entry.sport and entry.userlocks are already set */ - entry_fill_addrs(&entry, req_to_sk(req)); - entry.dport = ntohs(ireq->ir_rmt_port); - - if (!inet_diag_bc_run(bc, &entry)) - continue; - } - - err = inet_req_diag_fill(req_to_sk(req), skb, - NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, - NLM_F_MULTI, cb->nlh); - if (err < 0) { - cb->args[3] = j + 1; - cb->args[4] = reqnum; - goto out; - } - } - - s_reqnum = 0; - } - -out: - spin_unlock(&icsk->icsk_accept_queue.syn_wait_lock); - - return err; -} - void inet_diag_dump_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *skb, struct netlink_callback *cb, const struct inet_diag_req_v2 *r, struct nlattr *bc) { struct net *net = sock_net(skb->sk); int i, num, s_i, s_num; + u32 idiag_states = r->idiag_states; + if (idiag_states & TCPF_SYN_RECV) + idiag_states |= TCPF_NEW_SYN_RECV; s_i = cb->args[1]; s_num = num = cb->args[2]; if (cb->args[0] == 0) { - if (!(r->idiag_states & (TCPF_LISTEN | TCPF_SYN_RECV))) + if (!(idiag_states & TCPF_LISTEN)) goto skip_listen_ht; for (i = s_i; i < INET_LHTABLE_SIZE; i++) { @@ -844,21 +774,11 @@ void inet_diag_dump_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *skb, r->id.idiag_sport) goto next_listen; - if (!(r->idiag_states & TCPF_LISTEN) || - r->id.idiag_dport || + if (r->id.idiag_dport || cb->args[3] > 0) - goto syn_recv; - - if (inet_csk_diag_dump(sk, skb, cb, r, bc) < 0) { - spin_unlock_bh(&ilb->lock); - goto done; - } - -syn_recv: - if (!(r->idiag_states & TCPF_SYN_RECV)) goto next_listen; - if (inet_diag_dump_reqs(skb, sk, cb, r, bc) < 0) { + if (inet_csk_diag_dump(sk, skb, cb, r, bc) < 0) { spin_unlock_bh(&ilb->lock); goto done; } @@ -879,7 +799,7 @@ skip_listen_ht: s_i = num = s_num = 0; } - if (!(r->idiag_states & ~(TCPF_LISTEN | TCPF_SYN_RECV))) + if (!(idiag_states & ~TCPF_LISTEN)) goto out; for (i = s_i; i <= hashinfo->ehash_mask; i++) { @@ -906,7 +826,7 @@ skip_listen_ht: goto next_normal; state = (sk->sk_state == TCP_TIME_WAIT) ? inet_twsk(sk)->tw_substate : sk->sk_state; - if (!(r->idiag_states & (1 << state))) + if (!(idiag_states & (1 << state))) goto next_normal; if (r->sdiag_family != AF_UNSPEC && sk->sk_family != r->sdiag_family) diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c index d0a7c0319e3d..fe144dae7372 100644 --- a/net/ipv4/inet_fragment.c +++ b/net/ipv4/inet_fragment.c @@ -209,12 +209,6 @@ int inet_frags_init(struct inet_frags *f) } EXPORT_SYMBOL(inet_frags_init); -void inet_frags_init_net(struct netns_frags *nf) -{ - init_frag_mem_limit(nf); -} -EXPORT_SYMBOL(inet_frags_init_net); - void inet_frags_fini(struct inet_frags *f) { cancel_work_sync(&f->frags_work); diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index 89120196a949..ccc5980797fc 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -126,7 +126,7 @@ void inet_put_port(struct sock *sk) } EXPORT_SYMBOL(inet_put_port); -int __inet_inherit_port(struct sock *sk, struct sock *child) +int __inet_inherit_port(const struct sock *sk, struct sock *child) { struct inet_hashinfo *table = sk->sk_prot->h.hashinfo; unsigned short port = inet_sk(child)->inet_num; @@ -137,6 +137,10 @@ int __inet_inherit_port(struct sock *sk, struct sock *child) spin_lock(&head->lock); tb = inet_csk(sk)->icsk_bind_hash; + if (unlikely(!tb)) { + spin_unlock(&head->lock); + return -ENOENT; + } if (tb->port != port) { /* NOTE: using tproxy and redirecting skbs to a proxy * on a different listener port breaks the assumption @@ -185,6 +189,8 @@ static inline int compute_score(struct sock *sk, struct net *net, return -1; score += 4; } + if (sk->sk_incoming_cpu == raw_smp_processor_id()) + score++; } return score; } @@ -398,14 +404,18 @@ static u32 inet_sk_port_offset(const struct sock *sk) inet->inet_dport); } -void __inet_hash_nolisten(struct sock *sk, struct sock *osk) +/* insert a socket into ehash, and eventually remove another one + * (The another one can be a SYN_RECV or TIMEWAIT + */ +bool inet_ehash_insert(struct sock *sk, struct sock *osk) { struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo; struct hlist_nulls_head *list; struct inet_ehash_bucket *head; spinlock_t *lock; + bool ret = true; - WARN_ON(!sk_unhashed(sk)); + WARN_ON_ONCE(!sk_unhashed(sk)); sk->sk_hash = sk_ehashfn(sk); head = inet_ehash_bucket(hashinfo, sk->sk_hash); @@ -413,24 +423,41 @@ void __inet_hash_nolisten(struct sock *sk, struct sock *osk) lock = inet_ehash_lockp(hashinfo, sk->sk_hash); spin_lock(lock); - __sk_nulls_add_node_rcu(sk, list); if (osk) { - WARN_ON(sk->sk_hash != osk->sk_hash); - sk_nulls_del_node_init_rcu(osk); + WARN_ON_ONCE(sk->sk_hash != osk->sk_hash); + ret = sk_nulls_del_node_init_rcu(osk); } + if (ret) + __sk_nulls_add_node_rcu(sk, list); spin_unlock(lock); - sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); + return ret; +} + +bool inet_ehash_nolisten(struct sock *sk, struct sock *osk) +{ + bool ok = inet_ehash_insert(sk, osk); + + if (ok) { + sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); + } else { + percpu_counter_inc(sk->sk_prot->orphan_count); + sk->sk_state = TCP_CLOSE; + sock_set_flag(sk, SOCK_DEAD); + inet_csk_destroy_sock(sk); + } + return ok; } -EXPORT_SYMBOL_GPL(__inet_hash_nolisten); +EXPORT_SYMBOL_GPL(inet_ehash_nolisten); void __inet_hash(struct sock *sk, struct sock *osk) { struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo; struct inet_listen_hashbucket *ilb; - if (sk->sk_state != TCP_LISTEN) - return __inet_hash_nolisten(sk, osk); - + if (sk->sk_state != TCP_LISTEN) { + inet_ehash_nolisten(sk, osk); + return; + } WARN_ON(!sk_unhashed(sk)); ilb = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)]; @@ -551,7 +578,7 @@ ok: inet_bind_hash(sk, tb, port); if (sk_unhashed(sk)) { inet_sk(sk)->inet_sport = htons(port); - __inet_hash_nolisten(sk, (struct sock *)tw); + inet_ehash_nolisten(sk, (struct sock *)tw); } if (tw) inet_twsk_bind_unhash(tw, hinfo); @@ -568,7 +595,7 @@ ok: tb = inet_csk(sk)->icsk_bind_hash; spin_lock_bh(&head->lock); if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) { - __inet_hash_nolisten(sk, NULL); + inet_ehash_nolisten(sk, NULL); spin_unlock_bh(&head->lock); return 0; } else { diff --git a/net/ipv4/ip_forward.c b/net/ipv4/ip_forward.c index 2d3aa408fbdc..da0d7ce85844 100644 --- a/net/ipv4/ip_forward.c +++ b/net/ipv4/ip_forward.c @@ -61,18 +61,18 @@ static bool ip_exceeds_mtu(const struct sk_buff *skb, unsigned int mtu) } -static int ip_forward_finish(struct sock *sk, struct sk_buff *skb) +static int ip_forward_finish(struct net *net, struct sock *sk, struct sk_buff *skb) { struct ip_options *opt = &(IPCB(skb)->opt); - IP_INC_STATS_BH(dev_net(skb_dst(skb)->dev), IPSTATS_MIB_OUTFORWDATAGRAMS); - IP_ADD_STATS_BH(dev_net(skb_dst(skb)->dev), IPSTATS_MIB_OUTOCTETS, skb->len); + IP_INC_STATS_BH(net, IPSTATS_MIB_OUTFORWDATAGRAMS); + IP_ADD_STATS_BH(net, IPSTATS_MIB_OUTOCTETS, skb->len); if (unlikely(opt->optlen)) ip_forward_options(skb); skb_sender_cpu_clear(skb); - return dst_output_sk(sk, skb); + return dst_output(net, sk, skb); } int ip_forward(struct sk_buff *skb) @@ -81,6 +81,7 @@ int ip_forward(struct sk_buff *skb) struct iphdr *iph; /* Our header */ struct rtable *rt; /* Route we use */ struct ip_options *opt = &(IPCB(skb)->opt); + struct net *net; /* that should never happen */ if (skb->pkt_type != PACKET_HOST) @@ -99,6 +100,7 @@ int ip_forward(struct sk_buff *skb) return NET_RX_SUCCESS; skb_forward_csum(skb); + net = dev_net(skb->dev); /* * According to the RFC, we must first decrease the TTL field. If @@ -119,7 +121,7 @@ int ip_forward(struct sk_buff *skb) IPCB(skb)->flags |= IPSKB_FORWARDED; mtu = ip_dst_mtu_maybe_forward(&rt->dst, true); if (ip_exceeds_mtu(skb, mtu)) { - IP_INC_STATS(dev_net(rt->dst.dev), IPSTATS_MIB_FRAGFAILS); + IP_INC_STATS(net, IPSTATS_MIB_FRAGFAILS); icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu)); goto drop; @@ -143,8 +145,9 @@ int ip_forward(struct sk_buff *skb) skb->priority = rt_tos2priority(iph->tos); - return NF_HOOK(NFPROTO_IPV4, NF_INET_FORWARD, NULL, skb, - skb->dev, rt->dst.dev, ip_forward_finish); + return NF_HOOK(NFPROTO_IPV4, NF_INET_FORWARD, + net, NULL, skb, skb->dev, rt->dst.dev, + ip_forward_finish); sr_failed: /* @@ -155,7 +158,7 @@ sr_failed: too_many_hops: /* Tell the sender its packet died... */ - IP_INC_STATS_BH(dev_net(skb_dst(skb)->dev), IPSTATS_MIB_INHDRERRORS); + IP_INC_STATS_BH(net, IPSTATS_MIB_INHDRERRORS); icmp_send(skb, ICMP_TIME_EXCEEDED, ICMP_EXC_TTL, 0); drop: kfree_skb(skb); diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index fa7f15305f9a..1fe55ae81781 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -48,7 +48,7 @@ #include <linux/inet.h> #include <linux/netfilter_ipv4.h> #include <net/inet_ecn.h> -#include <net/vrf.h> +#include <net/l3mdev.h> /* NOTE. Logic of IP defragmentation is parallel to corresponding IPv6 * code now. If you change something here, _PLEASE_ update ipv6/reassembly.c @@ -78,7 +78,7 @@ struct ipq { u8 ecn; /* RFC3168 support */ u16 max_df_size; /* largest frag with DF set seen */ int iif; - int vif; /* VRF device index */ + int vif; /* L3 master device index */ unsigned int rid; struct inet_peer *peer; }; @@ -654,11 +654,10 @@ out_fail: } /* Process an incoming IP datagram fragment. */ -int ip_defrag(struct sk_buff *skb, u32 user) +int ip_defrag(struct net *net, struct sk_buff *skb, u32 user) { struct net_device *dev = skb->dev ? : skb_dst(skb)->dev; - int vif = vrf_master_ifindex_rcu(dev); - struct net *net = dev_net(dev); + int vif = l3mdev_master_ifindex_rcu(dev); struct ipq *qp; IP_INC_STATS_BH(net, IPSTATS_MIB_REASMREQDS); @@ -683,7 +682,7 @@ int ip_defrag(struct sk_buff *skb, u32 user) } EXPORT_SYMBOL(ip_defrag); -struct sk_buff *ip_check_defrag(struct sk_buff *skb, u32 user) +struct sk_buff *ip_check_defrag(struct net *net, struct sk_buff *skb, u32 user) { struct iphdr iph; int netoff; @@ -712,7 +711,7 @@ struct sk_buff *ip_check_defrag(struct sk_buff *skb, u32 user) if (pskb_trim_rcsum(skb, netoff + len)) return skb; memset(IPCB(skb), 0, sizeof(struct inet_skb_parm)); - if (ip_defrag(skb, user)) + if (ip_defrag(net, skb, user)) return NULL; skb_clear_hash(skb); } @@ -840,6 +839,8 @@ static void __init ip4_frags_ctl_register(void) static int __net_init ipv4_frags_init_net(struct net *net) { + int res; + /* Fragment cache limits. * * The fragment memory accounting code, (tries to) account for @@ -863,9 +864,13 @@ static int __net_init ipv4_frags_init_net(struct net *net) */ net->ipv4.frags.timeout = IP_FRAG_TIME; - inet_frags_init_net(&net->ipv4.frags); - - return ip4_frags_ns_ctl_register(net); + res = inet_frags_init_net(&net->ipv4.frags); + if (res) + return res; + res = ip4_frags_ns_ctl_register(net); + if (res) + inet_frags_uninit_net(&net->ipv4.frags); + return res; } static void __net_exit ipv4_frags_exit_net(struct net *net) diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index f4fc8a77aaa7..b1209b63381f 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -157,6 +157,7 @@ bool ip_call_ra_chain(struct sk_buff *skb) u8 protocol = ip_hdr(skb)->protocol; struct sock *last = NULL; struct net_device *dev = skb->dev; + struct net *net = dev_net(dev); for (ra = rcu_dereference(ip_ra_chain); ra; ra = rcu_dereference(ra->next)) { struct sock *sk = ra->sk; @@ -167,9 +168,9 @@ bool ip_call_ra_chain(struct sk_buff *skb) if (sk && inet_sk(sk)->inet_num == protocol && (!sk->sk_bound_dev_if || sk->sk_bound_dev_if == dev->ifindex) && - net_eq(sock_net(sk), dev_net(dev))) { + net_eq(sock_net(sk), net)) { if (ip_is_fragment(ip_hdr(skb))) { - if (ip_defrag(skb, IP_DEFRAG_CALL_RA_CHAIN)) + if (ip_defrag(net, skb, IP_DEFRAG_CALL_RA_CHAIN)) return true; } if (last) { @@ -188,10 +189,8 @@ bool ip_call_ra_chain(struct sk_buff *skb) return false; } -static int ip_local_deliver_finish(struct sock *sk, struct sk_buff *skb) +static int ip_local_deliver_finish(struct net *net, struct sock *sk, struct sk_buff *skb) { - struct net *net = dev_net(skb->dev); - __skb_pull(skb, skb_network_header_len(skb)); rcu_read_lock(); @@ -248,14 +247,15 @@ int ip_local_deliver(struct sk_buff *skb) /* * Reassemble IP fragments. */ + struct net *net = dev_net(skb->dev); if (ip_is_fragment(ip_hdr(skb))) { - if (ip_defrag(skb, IP_DEFRAG_LOCAL_DELIVER)) + if (ip_defrag(net, skb, IP_DEFRAG_LOCAL_DELIVER)) return 0; } - return NF_HOOK(NFPROTO_IPV4, NF_INET_LOCAL_IN, NULL, skb, - skb->dev, NULL, + return NF_HOOK(NFPROTO_IPV4, NF_INET_LOCAL_IN, + net, NULL, skb, skb->dev, NULL, ip_local_deliver_finish); } @@ -311,7 +311,7 @@ drop: int sysctl_ip_early_demux __read_mostly = 1; EXPORT_SYMBOL(sysctl_ip_early_demux); -static int ip_rcv_finish(struct sock *sk, struct sk_buff *skb) +static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb) { const struct iphdr *iph = ip_hdr(skb); struct rtable *rt; @@ -337,8 +337,7 @@ static int ip_rcv_finish(struct sock *sk, struct sk_buff *skb) iph->tos, skb->dev); if (unlikely(err)) { if (err == -EXDEV) - NET_INC_STATS_BH(dev_net(skb->dev), - LINUX_MIB_IPRPFILTER); + NET_INC_STATS_BH(net, LINUX_MIB_IPRPFILTER); goto drop; } } @@ -359,11 +358,9 @@ static int ip_rcv_finish(struct sock *sk, struct sk_buff *skb) rt = skb_rtable(skb); if (rt->rt_type == RTN_MULTICAST) { - IP_UPD_PO_STATS_BH(dev_net(rt->dst.dev), IPSTATS_MIB_INMCAST, - skb->len); + IP_UPD_PO_STATS_BH(net, IPSTATS_MIB_INMCAST, skb->len); } else if (rt->rt_type == RTN_BROADCAST) - IP_UPD_PO_STATS_BH(dev_net(rt->dst.dev), IPSTATS_MIB_INBCAST, - skb->len); + IP_UPD_PO_STATS_BH(net, IPSTATS_MIB_INBCAST, skb->len); return dst_input(skb); @@ -378,6 +375,7 @@ drop: int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) { const struct iphdr *iph; + struct net *net; u32 len; /* When the interface is in promisc. mode, drop all the crap @@ -387,11 +385,12 @@ int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, goto drop; - IP_UPD_PO_STATS_BH(dev_net(dev), IPSTATS_MIB_IN, skb->len); + net = dev_net(dev); + IP_UPD_PO_STATS_BH(net, IPSTATS_MIB_IN, skb->len); skb = skb_share_check(skb, GFP_ATOMIC); if (!skb) { - IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INDISCARDS); + IP_INC_STATS_BH(net, IPSTATS_MIB_INDISCARDS); goto out; } @@ -417,7 +416,7 @@ int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, BUILD_BUG_ON(IPSTATS_MIB_ECT1PKTS != IPSTATS_MIB_NOECTPKTS + INET_ECN_ECT_1); BUILD_BUG_ON(IPSTATS_MIB_ECT0PKTS != IPSTATS_MIB_NOECTPKTS + INET_ECN_ECT_0); BUILD_BUG_ON(IPSTATS_MIB_CEPKTS != IPSTATS_MIB_NOECTPKTS + INET_ECN_CE); - IP_ADD_STATS_BH(dev_net(dev), + IP_ADD_STATS_BH(net, IPSTATS_MIB_NOECTPKTS + (iph->tos & INET_ECN_MASK), max_t(unsigned short, 1, skb_shinfo(skb)->gso_segs)); @@ -431,7 +430,7 @@ int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, len = ntohs(iph->tot_len); if (skb->len < len) { - IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INTRUNCATEDPKTS); + IP_INC_STATS_BH(net, IPSTATS_MIB_INTRUNCATEDPKTS); goto drop; } else if (len < (iph->ihl*4)) goto inhdr_error; @@ -441,7 +440,7 @@ int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, * Note this now means skb->len holds ntohs(iph->tot_len). */ if (pskb_trim_rcsum(skb, len)) { - IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INDISCARDS); + IP_INC_STATS_BH(net, IPSTATS_MIB_INDISCARDS); goto drop; } @@ -453,14 +452,14 @@ int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, /* Must drop socket now because of tproxy. */ skb_orphan(skb); - return NF_HOOK(NFPROTO_IPV4, NF_INET_PRE_ROUTING, NULL, skb, - dev, NULL, + return NF_HOOK(NFPROTO_IPV4, NF_INET_PRE_ROUTING, + net, NULL, skb, dev, NULL, ip_rcv_finish); csum_error: - IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_CSUMERRORS); + IP_INC_STATS_BH(net, IPSTATS_MIB_CSUMERRORS); inhdr_error: - IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INHDRERRORS); + IP_INC_STATS_BH(net, IPSTATS_MIB_INHDRERRORS); drop: kfree_skb(skb); out: diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 0138fada0951..4233cbe47052 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -83,9 +83,10 @@ int sysctl_ip_default_ttl __read_mostly = IPDEFTTL; EXPORT_SYMBOL(sysctl_ip_default_ttl); -static int ip_fragment(struct sock *sk, struct sk_buff *skb, - unsigned int mtu, - int (*output)(struct sock *, struct sk_buff *)); +static int +ip_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, + unsigned int mtu, + int (*output)(struct net *, struct sock *, struct sk_buff *)); /* Generate a checksum for an outgoing IP datagram. */ void ip_send_check(struct iphdr *iph) @@ -95,32 +96,28 @@ void ip_send_check(struct iphdr *iph) } EXPORT_SYMBOL(ip_send_check); -static int __ip_local_out_sk(struct sock *sk, struct sk_buff *skb) +int __ip_local_out(struct net *net, struct sock *sk, struct sk_buff *skb) { struct iphdr *iph = ip_hdr(skb); iph->tot_len = htons(skb->len); ip_send_check(iph); - return nf_hook(NFPROTO_IPV4, NF_INET_LOCAL_OUT, sk, skb, NULL, - skb_dst(skb)->dev, dst_output_sk); + return nf_hook(NFPROTO_IPV4, NF_INET_LOCAL_OUT, + net, sk, skb, NULL, skb_dst(skb)->dev, + dst_output); } -int __ip_local_out(struct sk_buff *skb) -{ - return __ip_local_out_sk(skb->sk, skb); -} - -int ip_local_out_sk(struct sock *sk, struct sk_buff *skb) +int ip_local_out(struct net *net, struct sock *sk, struct sk_buff *skb) { int err; - err = __ip_local_out(skb); + err = __ip_local_out(net, sk, skb); if (likely(err == 1)) - err = dst_output_sk(sk, skb); + err = dst_output(net, sk, skb); return err; } -EXPORT_SYMBOL_GPL(ip_local_out_sk); +EXPORT_SYMBOL_GPL(ip_local_out); static inline int ip_select_ttl(struct inet_sock *inet, struct dst_entry *dst) { @@ -135,11 +132,12 @@ static inline int ip_select_ttl(struct inet_sock *inet, struct dst_entry *dst) * Add an ip header to a skbuff and send it out. * */ -int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk, +int ip_build_and_send_pkt(struct sk_buff *skb, const struct sock *sk, __be32 saddr, __be32 daddr, struct ip_options_rcu *opt) { struct inet_sock *inet = inet_sk(sk); struct rtable *rt = skb_rtable(skb); + struct net *net = sock_net(sk); struct iphdr *iph; /* Build the IP header. */ @@ -149,15 +147,17 @@ int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk, iph->version = 4; iph->ihl = 5; iph->tos = inet->tos; - if (ip_dont_fragment(sk, &rt->dst)) - iph->frag_off = htons(IP_DF); - else - iph->frag_off = 0; iph->ttl = ip_select_ttl(inet, &rt->dst); iph->daddr = (opt && opt->opt.srr ? opt->opt.faddr : daddr); iph->saddr = saddr; iph->protocol = sk->sk_protocol; - ip_select_ident(sock_net(sk), skb, sk); + if (ip_dont_fragment(sk, &rt->dst)) { + iph->frag_off = htons(IP_DF); + iph->id = 0; + } else { + iph->frag_off = 0; + __ip_select_ident(net, iph, 1); + } if (opt && opt->opt.optlen) { iph->ihl += opt->opt.optlen>>2; @@ -168,11 +168,11 @@ int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk, skb->mark = sk->sk_mark; /* Send it out. */ - return ip_local_out(skb); + return ip_local_out(net, skb->sk, skb); } EXPORT_SYMBOL_GPL(ip_build_and_send_pkt); -static int ip_finish_output2(struct sock *sk, struct sk_buff *skb) +static int ip_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb) { struct dst_entry *dst = skb_dst(skb); struct rtable *rt = (struct rtable *)dst; @@ -182,9 +182,9 @@ static int ip_finish_output2(struct sock *sk, struct sk_buff *skb) u32 nexthop; if (rt->rt_type == RTN_MULTICAST) { - IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUTMCAST, skb->len); + IP_UPD_PO_STATS(net, IPSTATS_MIB_OUTMCAST, skb->len); } else if (rt->rt_type == RTN_BROADCAST) - IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUTBCAST, skb->len); + IP_UPD_PO_STATS(net, IPSTATS_MIB_OUTBCAST, skb->len); /* Be paranoid, rather than too clever. */ if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) { @@ -220,8 +220,8 @@ static int ip_finish_output2(struct sock *sk, struct sk_buff *skb) return -EINVAL; } -static int ip_finish_output_gso(struct sock *sk, struct sk_buff *skb, - unsigned int mtu) +static int ip_finish_output_gso(struct net *net, struct sock *sk, + struct sk_buff *skb, unsigned int mtu) { netdev_features_t features; struct sk_buff *segs; @@ -230,7 +230,7 @@ static int ip_finish_output_gso(struct sock *sk, struct sk_buff *skb, /* common case: locally created skb or seglen is <= mtu */ if (((IPCB(skb)->flags & IPSKB_FORWARDED) == 0) || skb_gso_network_seglen(skb) <= mtu) - return ip_finish_output2(sk, skb); + return ip_finish_output2(net, sk, skb); /* Slowpath - GSO segment length is exceeding the dst MTU. * @@ -253,7 +253,7 @@ static int ip_finish_output_gso(struct sock *sk, struct sk_buff *skb, int err; segs->next = NULL; - err = ip_fragment(sk, segs, mtu, ip_finish_output2); + err = ip_fragment(net, sk, segs, mtu, ip_finish_output2); if (err && ret == 0) ret = err; @@ -263,7 +263,7 @@ static int ip_finish_output_gso(struct sock *sk, struct sk_buff *skb, return ret; } -static int ip_finish_output(struct sock *sk, struct sk_buff *skb) +static int ip_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb) { unsigned int mtu; @@ -271,20 +271,20 @@ static int ip_finish_output(struct sock *sk, struct sk_buff *skb) /* Policy lookup after SNAT yielded a new policy */ if (skb_dst(skb)->xfrm) { IPCB(skb)->flags |= IPSKB_REROUTED; - return dst_output_sk(sk, skb); + return dst_output(net, sk, skb); } #endif mtu = ip_skb_dst_mtu(skb); if (skb_is_gso(skb)) - return ip_finish_output_gso(sk, skb, mtu); + return ip_finish_output_gso(net, sk, skb, mtu); if (skb->len > mtu || (IPCB(skb)->flags & IPSKB_FRAG_PMTU)) - return ip_fragment(sk, skb, mtu, ip_finish_output2); + return ip_fragment(net, sk, skb, mtu, ip_finish_output2); - return ip_finish_output2(sk, skb); + return ip_finish_output2(net, sk, skb); } -int ip_mc_output(struct sock *sk, struct sk_buff *skb) +int ip_mc_output(struct net *net, struct sock *sk, struct sk_buff *skb) { struct rtable *rt = skb_rtable(skb); struct net_device *dev = rt->dst.dev; @@ -292,7 +292,7 @@ int ip_mc_output(struct sock *sk, struct sk_buff *skb) /* * If the indicated interface is up and running, send the packet. */ - IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUT, skb->len); + IP_UPD_PO_STATS(net, IPSTATS_MIB_OUT, skb->len); skb->dev = dev; skb->protocol = htons(ETH_P_IP); @@ -320,7 +320,7 @@ int ip_mc_output(struct sock *sk, struct sk_buff *skb) struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC); if (newskb) NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING, - sk, newskb, NULL, newskb->dev, + net, sk, newskb, NULL, newskb->dev, dev_loopback_xmit); } @@ -335,26 +335,28 @@ int ip_mc_output(struct sock *sk, struct sk_buff *skb) if (rt->rt_flags&RTCF_BROADCAST) { struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC); if (newskb) - NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING, sk, newskb, - NULL, newskb->dev, dev_loopback_xmit); + NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING, + net, sk, newskb, NULL, newskb->dev, + dev_loopback_xmit); } - return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, sk, skb, NULL, - skb->dev, ip_finish_output, + return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, + net, sk, skb, NULL, skb->dev, + ip_finish_output, !(IPCB(skb)->flags & IPSKB_REROUTED)); } -int ip_output(struct sock *sk, struct sk_buff *skb) +int ip_output(struct net *net, struct sock *sk, struct sk_buff *skb) { struct net_device *dev = skb_dst(skb)->dev; - IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUT, skb->len); + IP_UPD_PO_STATS(net, IPSTATS_MIB_OUT, skb->len); skb->dev = dev; skb->protocol = htons(ETH_P_IP); - return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, sk, skb, - NULL, dev, + return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, + net, sk, skb, NULL, dev, ip_finish_output, !(IPCB(skb)->flags & IPSKB_REROUTED)); } @@ -377,6 +379,7 @@ static void ip_copy_addrs(struct iphdr *iph, const struct flowi4 *fl4) int ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl) { struct inet_sock *inet = inet_sk(sk); + struct net *net = sock_net(sk); struct ip_options_rcu *inet_opt; struct flowi4 *fl4; struct rtable *rt; @@ -407,7 +410,7 @@ int ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl) * keep trying until route appears or the connection times * itself out. */ - rt = ip_route_output_ports(sock_net(sk), fl4, sk, + rt = ip_route_output_ports(net, fl4, sk, daddr, inet->inet_saddr, inet->inet_dport, inet->inet_sport, @@ -444,20 +447,20 @@ packet_routed: ip_options_build(skb, &inet_opt->opt, inet->inet_daddr, rt, 0); } - ip_select_ident_segs(sock_net(sk), skb, sk, + ip_select_ident_segs(net, skb, sk, skb_shinfo(skb)->gso_segs ?: 1); /* TODO : should we use skb->sk here instead of sk ? */ skb->priority = sk->sk_priority; skb->mark = sk->sk_mark; - res = ip_local_out(skb); + res = ip_local_out(net, sk, skb); rcu_read_unlock(); return res; no_route: rcu_read_unlock(); - IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES); + IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES); kfree_skb(skb); return -EHOSTUNREACH; } @@ -486,29 +489,26 @@ static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from) skb_copy_secmark(to, from); } -static int ip_fragment(struct sock *sk, struct sk_buff *skb, +static int ip_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, unsigned int mtu, - int (*output)(struct sock *, struct sk_buff *)) + int (*output)(struct net *, struct sock *, struct sk_buff *)) { struct iphdr *iph = ip_hdr(skb); if ((iph->frag_off & htons(IP_DF)) == 0) - return ip_do_fragment(sk, skb, output); + return ip_do_fragment(net, sk, skb, output); if (unlikely(!skb->ignore_df || (IPCB(skb)->frag_max_size && IPCB(skb)->frag_max_size > mtu))) { - struct rtable *rt = skb_rtable(skb); - struct net_device *dev = rt->dst.dev; - - IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS); + IP_INC_STATS(net, IPSTATS_MIB_FRAGFAILS); icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu)); kfree_skb(skb); return -EMSGSIZE; } - return ip_do_fragment(sk, skb, output); + return ip_do_fragment(net, sk, skb, output); } /* @@ -518,8 +518,8 @@ static int ip_fragment(struct sock *sk, struct sk_buff *skb, * single device frame, and queue such a frame for sending. */ -int ip_do_fragment(struct sock *sk, struct sk_buff *skb, - int (*output)(struct sock *, struct sk_buff *)) +int ip_do_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, + int (*output)(struct net *, struct sock *, struct sk_buff *)) { struct iphdr *iph; int ptr; @@ -533,6 +533,11 @@ int ip_do_fragment(struct sock *sk, struct sk_buff *skb, dev = rt->dst.dev; + /* for offloaded checksums cleanup checksum before fragmentation */ + if (skb->ip_summed == CHECKSUM_PARTIAL && + (err = skb_checksum_help(skb))) + goto fail; + /* * Point into the IP datagram header. */ @@ -621,10 +626,10 @@ int ip_do_fragment(struct sock *sk, struct sk_buff *skb, ip_send_check(iph); } - err = output(sk, skb); + err = output(net, sk, skb); if (!err) - IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGCREATES); + IP_INC_STATS(net, IPSTATS_MIB_FRAGCREATES); if (err || !frag) break; @@ -634,7 +639,7 @@ int ip_do_fragment(struct sock *sk, struct sk_buff *skb, } if (err == 0) { - IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGOKS); + IP_INC_STATS(net, IPSTATS_MIB_FRAGOKS); return 0; } @@ -643,7 +648,7 @@ int ip_do_fragment(struct sock *sk, struct sk_buff *skb, kfree_skb(frag); frag = skb; } - IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS); + IP_INC_STATS(net, IPSTATS_MIB_FRAGFAILS); return err; slow_path_clean: @@ -657,9 +662,6 @@ slow_path_clean: } slow_path: - /* for offloaded checksums cleanup checksum before fragmentation */ - if ((skb->ip_summed == CHECKSUM_PARTIAL) && skb_checksum_help(skb)) - goto fail; iph = ip_hdr(skb); left = skb->len - hlen; /* Space per frame */ @@ -761,19 +763,19 @@ slow_path: ip_send_check(iph); - err = output(sk, skb2); + err = output(net, sk, skb2); if (err) goto fail; - IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGCREATES); + IP_INC_STATS(net, IPSTATS_MIB_FRAGCREATES); } consume_skb(skb); - IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGOKS); + IP_INC_STATS(net, IPSTATS_MIB_FRAGOKS); return err; fail: kfree_skb(skb); - IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS); + IP_INC_STATS(net, IPSTATS_MIB_FRAGFAILS); return err; } EXPORT_SYMBOL(ip_do_fragment); @@ -911,6 +913,7 @@ static int __ip_append_data(struct sock *sk, if (transhdrlen && length + fragheaderlen <= mtu && rt->dst.dev->features & NETIF_F_V4_CSUM && + !(flags & MSG_MORE) && !exthdrlen) csummode = CHECKSUM_PARTIAL; @@ -1434,7 +1437,7 @@ int ip_send_skb(struct net *net, struct sk_buff *skb) { int err; - err = ip_local_out(skb); + err = ip_local_out(net, skb->sk, skb); if (err) { if (err > 0) err = net_xmit_errno(err); @@ -1561,7 +1564,7 @@ void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb, } oif = arg->bound_dev_if; - if (!oif && netif_index_is_vrf(net, skb->skb_iif)) + if (!oif && netif_index_is_l3_master(net, skb->skb_iif)) oif = skb->skb_iif; flowi4_init_output(&fl4, oif, @@ -1596,7 +1599,6 @@ void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb, arg->csumoffset) = csum_fold(csum_add(nskb->csum, arg->csum)); nskb->ip_summed = CHECKSUM_NONE; - skb_set_queue_mapping(nskb, skb_get_queue_mapping(skb)); ip_push_pending_frames(sk, &fl4); } out: diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c index 84dce6a92f93..6cb9009c3d96 100644 --- a/net/ipv4/ip_tunnel_core.c +++ b/net/ipv4/ip_tunnel_core.c @@ -53,6 +53,7 @@ int iptunnel_xmit(struct sock *sk, struct rtable *rt, struct sk_buff *skb, __u8 tos, __u8 ttl, __be16 df, bool xnet) { int pkt_len = skb->len - skb_inner_network_offset(skb); + struct net *net = dev_net(rt->dst.dev); struct iphdr *iph; int err; @@ -76,10 +77,9 @@ int iptunnel_xmit(struct sock *sk, struct rtable *rt, struct sk_buff *skb, iph->daddr = dst; iph->saddr = src; iph->ttl = ttl; - __ip_select_ident(dev_net(rt->dst.dev), iph, - skb_shinfo(skb)->gso_segs ?: 1); + __ip_select_ident(net, iph, skb_shinfo(skb)->gso_segs ?: 1); - err = ip_local_out_sk(sk, skb); + err = ip_local_out(net, sk, skb); if (unlikely(net_xmit_eval(err))) pkt_len = 0; return pkt_len; diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c index 0c152087ca15..4d8f0b698777 100644 --- a/net/ipv4/ip_vti.c +++ b/net/ipv4/ip_vti.c @@ -197,7 +197,7 @@ static netdev_tx_t vti_xmit(struct sk_buff *skb, struct net_device *dev, skb_dst_set(skb, dst); skb->dev = skb_dst(skb)->dev; - err = dst_output(skb); + err = dst_output(tunnel->net, skb->sk, skb); if (net_xmit_eval(err) == 0) err = skb->len; iptunnel_xmit_stats(err, &dev->stats, dev->tstats); diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c index ed4ef09c2136..0bc7412d9e14 100644 --- a/net/ipv4/ipconfig.c +++ b/net/ipv4/ipconfig.c @@ -146,6 +146,10 @@ u8 root_server_path[256] = { 0, }; /* Path to mount as root */ /* vendor class identifier */ static char vendor_class_identifier[253] __initdata; +#if defined(CONFIG_IP_PNP_DHCP) +static char dhcp_client_identifier[253] __initdata; +#endif + /* Persistent data: */ static int ic_proto_used; /* Protocol used, if any */ @@ -728,6 +732,16 @@ ic_dhcp_init_options(u8 *options) memcpy(e, vendor_class_identifier, len); e += len; } + len = strlen(dhcp_client_identifier + 1); + /* the minimum length of identifier is 2, include 1 byte type, + * and can not be larger than the length of options + */ + if (len >= 1 && len < 312 - (e - options) - 1) { + *e++ = 61; + *e++ = len + 1; + memcpy(e, dhcp_client_identifier, len + 1); + e += len + 1; + } } *e++ = 255; /* End of the list */ @@ -1557,8 +1571,24 @@ static int __init ic_proto_name(char *name) return 0; } #ifdef CONFIG_IP_PNP_DHCP - else if (!strcmp(name, "dhcp")) { + else if (!strncmp(name, "dhcp", 4)) { + char *client_id; + ic_proto_enabled &= ~IC_RARP; + client_id = strstr(name, "dhcp,"); + if (client_id) { + char *v; + + client_id = client_id + 5; + v = strchr(client_id, ','); + if (!v) + return 1; + *v = 0; + if (kstrtou8(client_id, 0, dhcp_client_identifier)) + DBG("DHCP: Invalid client identifier type\n"); + strncpy(dhcp_client_identifier + 1, v + 1, 251); + *v = ','; + } return 1; } #endif diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 866ee89f5254..92dd4b74d513 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -1678,17 +1678,18 @@ static void ip_encap(struct net *net, struct sk_buff *skb, nf_reset(skb); } -static inline int ipmr_forward_finish(struct sock *sk, struct sk_buff *skb) +static inline int ipmr_forward_finish(struct net *net, struct sock *sk, + struct sk_buff *skb) { struct ip_options *opt = &(IPCB(skb)->opt); - IP_INC_STATS_BH(dev_net(skb_dst(skb)->dev), IPSTATS_MIB_OUTFORWDATAGRAMS); - IP_ADD_STATS_BH(dev_net(skb_dst(skb)->dev), IPSTATS_MIB_OUTOCTETS, skb->len); + IP_INC_STATS(net, IPSTATS_MIB_OUTFORWDATAGRAMS); + IP_ADD_STATS(net, IPSTATS_MIB_OUTOCTETS, skb->len); if (unlikely(opt->optlen)) ip_forward_options(skb); - return dst_output_sk(sk, skb); + return dst_output(net, sk, skb); } /* @@ -1745,7 +1746,7 @@ static void ipmr_queue_xmit(struct net *net, struct mr_table *mrt, * to blackhole. */ - IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_FRAGFAILS); + IP_INC_STATS(net, IPSTATS_MIB_FRAGFAILS); ip_rt_put(rt); goto out_free; } @@ -1787,8 +1788,8 @@ static void ipmr_queue_xmit(struct net *net, struct mr_table *mrt, * not mrouter) cannot join to more than one interface - it will * result in receiving multiple packets. */ - NF_HOOK(NFPROTO_IPV4, NF_INET_FORWARD, NULL, skb, - skb->dev, dev, + NF_HOOK(NFPROTO_IPV4, NF_INET_FORWARD, + net, NULL, skb, skb->dev, dev, ipmr_forward_finish); return; diff --git a/net/ipv4/netfilter.c b/net/ipv4/netfilter.c index 61eafc9b4545..c3776ff6749f 100644 --- a/net/ipv4/netfilter.c +++ b/net/ipv4/netfilter.c @@ -17,9 +17,8 @@ #include <net/netfilter/nf_queue.h> /* route_me_harder function, used by iptable_nat, iptable_mangle + ip_queue */ -int ip_route_me_harder(struct sk_buff *skb, unsigned int addr_type) +int ip_route_me_harder(struct net *net, struct sk_buff *skb, unsigned int addr_type) { - struct net *net = dev_net(skb_dst(skb)->dev); const struct iphdr *iph = ip_hdr(skb); struct rtable *rt; struct flowi4 fl4 = {}; @@ -104,7 +103,7 @@ static void nf_ip_saveroute(const struct sk_buff *skb, } } -static int nf_ip_reroute(struct sk_buff *skb, +static int nf_ip_reroute(struct net *net, struct sk_buff *skb, const struct nf_queue_entry *entry) { const struct ip_rt_info *rt_info = nf_queue_entry_reroute(entry); @@ -116,7 +115,7 @@ static int nf_ip_reroute(struct sk_buff *skb, skb->mark == rt_info->mark && iph->daddr == rt_info->daddr && iph->saddr == rt_info->saddr)) - return ip_route_me_harder(skb, RTN_UNSPEC); + return ip_route_me_harder(net, skb, RTN_UNSPEC); } return 0; } diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c index 8f87fc38ccde..11dccba474b7 100644 --- a/net/ipv4/netfilter/arp_tables.c +++ b/net/ipv4/netfilter/arp_tables.c @@ -186,7 +186,7 @@ static inline int arp_packet_match(const struct arphdr *arphdr, if (FWINV(ret != 0, ARPT_INV_VIA_IN)) { dprintf("VIA in mismatch (%s vs %s).%s\n", indev, arpinfo->iniface, - arpinfo->invflags&ARPT_INV_VIA_IN ?" (INV)":""); + arpinfo->invflags & ARPT_INV_VIA_IN ? " (INV)" : ""); return 0; } @@ -195,7 +195,7 @@ static inline int arp_packet_match(const struct arphdr *arphdr, if (FWINV(ret != 0, ARPT_INV_VIA_OUT)) { dprintf("VIA out mismatch (%s vs %s).%s\n", outdev, arpinfo->outiface, - arpinfo->invflags&ARPT_INV_VIA_OUT ?" (INV)":""); + arpinfo->invflags & ARPT_INV_VIA_OUT ? " (INV)" : ""); return 0; } @@ -247,10 +247,10 @@ struct arpt_entry *arpt_next_entry(const struct arpt_entry *entry) } unsigned int arpt_do_table(struct sk_buff *skb, - unsigned int hook, const struct nf_hook_state *state, struct xt_table *table) { + unsigned int hook = state->hook; static const char nulldevname[IFNAMSIZ] __attribute__((aligned(sizeof(long)))); unsigned int verdict = NF_DROP; const struct arphdr *arp; @@ -285,6 +285,7 @@ unsigned int arpt_do_table(struct sk_buff *skb, */ e = get_entry(table_base, private->hook_entry[hook]); + acpar.net = state->net; acpar.in = state->in; acpar.out = state->out; acpar.hooknum = hook; @@ -467,7 +468,7 @@ static int mark_source_chains(const struct xt_table_info *newinfo, pos = newpos; } } - next: +next: duprintf("Finished chain %u\n", hook); } return 1; @@ -631,7 +632,7 @@ static inline void cleanup_entry(struct arpt_entry *e) * newinfo). */ static int translate_table(struct xt_table_info *newinfo, void *entry0, - const struct arpt_replace *repl) + const struct arpt_replace *repl) { struct arpt_entry *iter; unsigned int i; @@ -891,7 +892,7 @@ static int compat_table_info(const struct xt_table_info *info, #endif static int get_info(struct net *net, void __user *user, - const int *len, int compat) + const int *len, int compat) { char name[XT_TABLE_MAXNAMELEN]; struct xt_table *t; @@ -1068,7 +1069,7 @@ static int __do_replace(struct net *net, const char *name, } static int do_replace(struct net *net, const void __user *user, - unsigned int len) + unsigned int len) { int ret; struct arpt_replace tmp; diff --git a/net/ipv4/netfilter/arptable_filter.c b/net/ipv4/netfilter/arptable_filter.c index 93876d03120c..1897ee160920 100644 --- a/net/ipv4/netfilter/arptable_filter.c +++ b/net/ipv4/netfilter/arptable_filter.c @@ -27,13 +27,10 @@ static const struct xt_table packet_filter = { /* The work comes in here from netfilter.c */ static unsigned int -arptable_filter_hook(const struct nf_hook_ops *ops, struct sk_buff *skb, +arptable_filter_hook(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { - const struct net *net = dev_net(state->in ? state->in : state->out); - - return arpt_do_table(skb, ops->hooknum, state, - net->ipv4.arptable_filter); + return arpt_do_table(skb, state, state->net->ipv4.arptable_filter); } static struct nf_hook_ops *arpfilter_ops __read_mostly; diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index b0a86e73451c..b99affad6ba1 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -102,7 +102,7 @@ ip_packet_match(const struct iphdr *ip, if (FWINV(ret != 0, IPT_INV_VIA_IN)) { dprintf("VIA in mismatch (%s vs %s).%s\n", indev, ipinfo->iniface, - ipinfo->invflags&IPT_INV_VIA_IN ?" (INV)":""); + ipinfo->invflags & IPT_INV_VIA_IN ? " (INV)" : ""); return false; } @@ -111,7 +111,7 @@ ip_packet_match(const struct iphdr *ip, if (FWINV(ret != 0, IPT_INV_VIA_OUT)) { dprintf("VIA out mismatch (%s vs %s).%s\n", outdev, ipinfo->outiface, - ipinfo->invflags&IPT_INV_VIA_OUT ?" (INV)":""); + ipinfo->invflags & IPT_INV_VIA_OUT ? " (INV)" : ""); return false; } @@ -120,7 +120,7 @@ ip_packet_match(const struct iphdr *ip, FWINV(ip->protocol != ipinfo->proto, IPT_INV_PROTO)) { dprintf("Packet protocol %hi does not match %hi.%s\n", ip->protocol, ipinfo->proto, - ipinfo->invflags&IPT_INV_PROTO ? " (INV)":""); + ipinfo->invflags & IPT_INV_PROTO ? " (INV)" : ""); return false; } @@ -246,7 +246,8 @@ get_chainname_rulenum(const struct ipt_entry *s, const struct ipt_entry *e, return 0; } -static void trace_packet(const struct sk_buff *skb, +static void trace_packet(struct net *net, + const struct sk_buff *skb, unsigned int hook, const struct net_device *in, const struct net_device *out, @@ -258,7 +259,6 @@ static void trace_packet(const struct sk_buff *skb, const char *hookname, *chainname, *comment; const struct ipt_entry *iter; unsigned int rulenum = 0; - struct net *net = dev_net(in ? in : out); root = get_entry(private->entries, private->hook_entry[hook]); @@ -285,10 +285,10 @@ struct ipt_entry *ipt_next_entry(const struct ipt_entry *entry) /* Returns one of the generic firewall policies, like NF_ACCEPT. */ unsigned int ipt_do_table(struct sk_buff *skb, - unsigned int hook, const struct nf_hook_state *state, struct xt_table *table) { + unsigned int hook = state->hook; static const char nulldevname[IFNAMSIZ] __attribute__((aligned(sizeof(long)))); const struct iphdr *ip; /* Initializing verdict to NF_DROP keeps gcc happy. */ @@ -315,6 +315,7 @@ ipt_do_table(struct sk_buff *skb, acpar.fragoff = ntohs(ip->frag_off) & IP_OFFSET; acpar.thoff = ip_hdrlen(skb); acpar.hotdrop = false; + acpar.net = state->net; acpar.in = state->in; acpar.out = state->out; acpar.family = NFPROTO_IPV4; @@ -378,8 +379,8 @@ ipt_do_table(struct sk_buff *skb, #if IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TRACE) /* The packet is traced: log it */ if (unlikely(skb->nf_trace)) - trace_packet(skb, hook, state->in, state->out, - table->name, private, e); + trace_packet(state->net, skb, hook, state->in, + state->out, table->name, private, e); #endif /* Standard target? */ if (!t->u.kernel.target->target) { @@ -430,8 +431,8 @@ ipt_do_table(struct sk_buff *skb, } while (!acpar.hotdrop); pr_debug("Exiting %s; sp at %u\n", __func__, stackidx); - xt_write_recseq_end(addend); - local_bh_enable(); + xt_write_recseq_end(addend); + local_bh_enable(); #ifdef DEBUG_ALLOW_ALL return NF_ACCEPT; @@ -483,7 +484,7 @@ mark_source_chains(const struct xt_table_info *newinfo, unsigned int oldpos, size; if ((strcmp(t->target.u.user.name, - XT_STANDARD_TARGET) == 0) && + XT_STANDARD_TARGET) == 0) && t->verdict < -NF_MAX_VERDICT - 1) { duprintf("mark_source_chains: bad " "negative verdict (%i)\n", @@ -548,7 +549,7 @@ mark_source_chains(const struct xt_table_info *newinfo, pos = newpos; } } - next: +next: duprintf("Finished chain %u\n", hook); } return 1; @@ -803,7 +804,7 @@ cleanup_entry(struct ipt_entry *e, struct net *net) newinfo) */ static int translate_table(struct net *net, struct xt_table_info *newinfo, void *entry0, - const struct ipt_replace *repl) + const struct ipt_replace *repl) { struct ipt_entry *iter; unsigned int i; @@ -1077,7 +1078,7 @@ static int compat_table_info(const struct xt_table_info *info, #endif static int get_info(struct net *net, void __user *user, - const int *len, int compat) + const int *len, int compat) { char name[XT_TABLE_MAXNAMELEN]; struct xt_table *t; @@ -1303,7 +1304,7 @@ do_replace(struct net *net, const void __user *user, unsigned int len) static int do_add_counters(struct net *net, const void __user *user, - unsigned int len, int compat) + unsigned int len, int compat) { unsigned int i; struct xt_counters_info tmp; diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c index 45cb16a6a4a3..4a9e6db9df8d 100644 --- a/net/ipv4/netfilter/ipt_CLUSTERIP.c +++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c @@ -492,14 +492,14 @@ static void arp_print(struct arp_payload *payload) { #define HBUFFERLEN 30 char hbuffer[HBUFFERLEN]; - int j,k; + int j, k; - for (k=0, j=0; k < HBUFFERLEN-3 && j < ETH_ALEN; j++) { + for (k = 0, j = 0; k < HBUFFERLEN - 3 && j < ETH_ALEN; j++) { hbuffer[k++] = hex_asc_hi(payload->src_hw[j]); hbuffer[k++] = hex_asc_lo(payload->src_hw[j]); - hbuffer[k++]=':'; + hbuffer[k++] = ':'; } - hbuffer[--k]='\0'; + hbuffer[--k] = '\0'; pr_debug("src %pI4@%s, dst %pI4\n", &payload->src_ip, hbuffer, &payload->dst_ip); @@ -507,14 +507,14 @@ static void arp_print(struct arp_payload *payload) #endif static unsigned int -arp_mangle(const struct nf_hook_ops *ops, +arp_mangle(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { struct arphdr *arp = arp_hdr(skb); struct arp_payload *payload; struct clusterip_config *c; - struct net *net = dev_net(state->in ? state->in : state->out); + struct net *net = state->net; /* we don't care about non-ethernet and non-ipv4 ARP */ if (arp->ar_hrd != htons(ARPHRD_ETHER) || diff --git a/net/ipv4/netfilter/ipt_REJECT.c b/net/ipv4/netfilter/ipt_REJECT.c index 87907d4bd259..1d16c0f28df0 100644 --- a/net/ipv4/netfilter/ipt_REJECT.c +++ b/net/ipv4/netfilter/ipt_REJECT.c @@ -59,7 +59,7 @@ reject_tg(struct sk_buff *skb, const struct xt_action_param *par) nf_send_unreach(skb, ICMP_PKT_FILTERED, hook); break; case IPT_TCP_RESET: - nf_send_reset(skb, hook); + nf_send_reset(par->net, skb, hook); case IPT_ICMP_ECHOREPLY: /* Doesn't happen. */ break; diff --git a/net/ipv4/netfilter/ipt_SYNPROXY.c b/net/ipv4/netfilter/ipt_SYNPROXY.c index 95ea633e8356..5fdc556514ba 100644 --- a/net/ipv4/netfilter/ipt_SYNPROXY.c +++ b/net/ipv4/netfilter/ipt_SYNPROXY.c @@ -39,11 +39,14 @@ synproxy_build_ip(struct sk_buff *skb, __be32 saddr, __be32 daddr) } static void -synproxy_send_tcp(const struct sk_buff *skb, struct sk_buff *nskb, +synproxy_send_tcp(const struct synproxy_net *snet, + const struct sk_buff *skb, struct sk_buff *nskb, struct nf_conntrack *nfct, enum ip_conntrack_info ctinfo, struct iphdr *niph, struct tcphdr *nth, unsigned int tcp_hdr_size) { + struct net *net = nf_ct_net(snet->tmpl); + nth->check = ~tcp_v4_check(tcp_hdr_size, niph->saddr, niph->daddr, 0); nskb->ip_summed = CHECKSUM_PARTIAL; nskb->csum_start = (unsigned char *)nth - nskb->head; @@ -51,7 +54,7 @@ synproxy_send_tcp(const struct sk_buff *skb, struct sk_buff *nskb, skb_dst_set_noref(nskb, skb_dst(skb)); nskb->protocol = htons(ETH_P_IP); - if (ip_route_me_harder(nskb, RTN_UNSPEC)) + if (ip_route_me_harder(net, nskb, RTN_UNSPEC)) goto free_nskb; if (nfct) { @@ -60,7 +63,7 @@ synproxy_send_tcp(const struct sk_buff *skb, struct sk_buff *nskb, nf_conntrack_get(nfct); } - ip_local_out(nskb); + ip_local_out(net, nskb->sk, nskb); return; free_nskb: @@ -68,7 +71,8 @@ free_nskb: } static void -synproxy_send_client_synack(const struct sk_buff *skb, const struct tcphdr *th, +synproxy_send_client_synack(const struct synproxy_net *snet, + const struct sk_buff *skb, const struct tcphdr *th, const struct synproxy_options *opts) { struct sk_buff *nskb; @@ -104,7 +108,7 @@ synproxy_send_client_synack(const struct sk_buff *skb, const struct tcphdr *th, synproxy_build_options(nth, opts); - synproxy_send_tcp(skb, nskb, skb->nfct, IP_CT_ESTABLISHED_REPLY, + synproxy_send_tcp(snet, skb, nskb, skb->nfct, IP_CT_ESTABLISHED_REPLY, niph, nth, tcp_hdr_size); } @@ -148,7 +152,7 @@ synproxy_send_server_syn(const struct synproxy_net *snet, synproxy_build_options(nth, opts); - synproxy_send_tcp(skb, nskb, &snet->tmpl->ct_general, IP_CT_NEW, + synproxy_send_tcp(snet, skb, nskb, &snet->tmpl->ct_general, IP_CT_NEW, niph, nth, tcp_hdr_size); } @@ -188,7 +192,7 @@ synproxy_send_server_ack(const struct synproxy_net *snet, synproxy_build_options(nth, opts); - synproxy_send_tcp(skb, nskb, NULL, 0, niph, nth, tcp_hdr_size); + synproxy_send_tcp(snet, skb, nskb, NULL, 0, niph, nth, tcp_hdr_size); } static void @@ -226,8 +230,8 @@ synproxy_send_client_ack(const struct synproxy_net *snet, synproxy_build_options(nth, opts); - synproxy_send_tcp(skb, nskb, skb->nfct, IP_CT_ESTABLISHED_REPLY, - niph, nth, tcp_hdr_size); + synproxy_send_tcp(snet, skb, nskb, skb->nfct, IP_CT_ESTABLISHED_REPLY, + niph, nth, tcp_hdr_size); } static bool @@ -258,7 +262,7 @@ static unsigned int synproxy_tg4(struct sk_buff *skb, const struct xt_action_param *par) { const struct xt_synproxy_info *info = par->targinfo; - struct synproxy_net *snet = synproxy_pernet(dev_net(par->in)); + struct synproxy_net *snet = synproxy_pernet(par->net); struct synproxy_options opts = {}; struct tcphdr *th, _th; @@ -287,7 +291,7 @@ synproxy_tg4(struct sk_buff *skb, const struct xt_action_param *par) XT_SYNPROXY_OPT_SACK_PERM | XT_SYNPROXY_OPT_ECN); - synproxy_send_client_synack(skb, th, &opts); + synproxy_send_client_synack(snet, skb, th, &opts); return NF_DROP; } else if (th->ack && !(th->fin || th->rst || th->syn)) { @@ -299,11 +303,11 @@ synproxy_tg4(struct sk_buff *skb, const struct xt_action_param *par) return XT_CONTINUE; } -static unsigned int ipv4_synproxy_hook(const struct nf_hook_ops *ops, +static unsigned int ipv4_synproxy_hook(void *priv, struct sk_buff *skb, const struct nf_hook_state *nhs) { - struct synproxy_net *snet = synproxy_pernet(dev_net(nhs->in ? : nhs->out)); + struct synproxy_net *snet = synproxy_pernet(nhs->net); enum ip_conntrack_info ctinfo; struct nf_conn *ct; struct nf_conn_synproxy *synproxy; @@ -433,14 +437,12 @@ static struct xt_target synproxy_tg4_reg __read_mostly = { static struct nf_hook_ops ipv4_synproxy_ops[] __read_mostly = { { .hook = ipv4_synproxy_hook, - .owner = THIS_MODULE, .pf = NFPROTO_IPV4, .hooknum = NF_INET_LOCAL_IN, .priority = NF_IP_PRI_CONNTRACK_CONFIRM - 1, }, { .hook = ipv4_synproxy_hook, - .owner = THIS_MODULE, .pf = NFPROTO_IPV4, .hooknum = NF_INET_POST_ROUTING, .priority = NF_IP_PRI_CONNTRACK_CONFIRM - 1, diff --git a/net/ipv4/netfilter/ipt_ah.c b/net/ipv4/netfilter/ipt_ah.c index 14a2aa8b8a14..a787d07f6cb7 100644 --- a/net/ipv4/netfilter/ipt_ah.c +++ b/net/ipv4/netfilter/ipt_ah.c @@ -25,7 +25,7 @@ spi_match(u_int32_t min, u_int32_t max, u_int32_t spi, bool invert) bool r; pr_debug("spi_match:%c 0x%x <= 0x%x <= 0x%x\n", invert ? '!' : ' ', min, spi, max); - r=(spi >= min && spi <= max) ^ invert; + r = (spi >= min && spi <= max) ^ invert; pr_debug(" result %s\n", r ? "PASS" : "FAILED"); return r; } diff --git a/net/ipv4/netfilter/ipt_rpfilter.c b/net/ipv4/netfilter/ipt_rpfilter.c index c4ffc9de1654..78cc64eddfc1 100644 --- a/net/ipv4/netfilter/ipt_rpfilter.c +++ b/net/ipv4/netfilter/ipt_rpfilter.c @@ -32,12 +32,11 @@ static __be32 rpfilter_get_saddr(__be32 addr) return addr; } -static bool rpfilter_lookup_reverse(struct flowi4 *fl4, +static bool rpfilter_lookup_reverse(struct net *net, struct flowi4 *fl4, const struct net_device *dev, u8 flags) { struct fib_result res; bool dev_match; - struct net *net = dev_net(dev); int ret __maybe_unused; if (fib_lookup(net, fl4, &res, FIB_LOOKUP_IGNORE_LINKSTATE)) @@ -96,7 +95,7 @@ static bool rpfilter_mt(const struct sk_buff *skb, struct xt_action_param *par) flow.flowi4_tos = RT_TOS(iph->tos); flow.flowi4_scope = RT_SCOPE_UNIVERSE; - return rpfilter_lookup_reverse(&flow, par->in, info->flags) ^ invert; + return rpfilter_lookup_reverse(par->net, &flow, par->in, info->flags) ^ invert; } static int rpfilter_check(const struct xt_mtchk_param *par) diff --git a/net/ipv4/netfilter/iptable_filter.c b/net/ipv4/netfilter/iptable_filter.c index a0f3beca52d2..397ef2dd133e 100644 --- a/net/ipv4/netfilter/iptable_filter.c +++ b/net/ipv4/netfilter/iptable_filter.c @@ -33,19 +33,16 @@ static const struct xt_table packet_filter = { }; static unsigned int -iptable_filter_hook(const struct nf_hook_ops *ops, struct sk_buff *skb, +iptable_filter_hook(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { - const struct net *net; - - if (ops->hooknum == NF_INET_LOCAL_OUT && + if (state->hook == NF_INET_LOCAL_OUT && (skb->len < sizeof(struct iphdr) || ip_hdrlen(skb) < sizeof(struct iphdr))) /* root is playing with raw sockets. */ return NF_ACCEPT; - net = dev_net(state->in ? state->in : state->out); - return ipt_do_table(skb, ops->hooknum, state, net->ipv4.iptable_filter); + return ipt_do_table(skb, state, state->net->ipv4.iptable_filter); } static struct nf_hook_ops *filter_ops __read_mostly; diff --git a/net/ipv4/netfilter/iptable_mangle.c b/net/ipv4/netfilter/iptable_mangle.c index 62cbb8c5f4a8..ba5d392a13c4 100644 --- a/net/ipv4/netfilter/iptable_mangle.c +++ b/net/ipv4/netfilter/iptable_mangle.c @@ -39,7 +39,6 @@ static const struct xt_table packet_mangler = { static unsigned int ipt_mangle_out(struct sk_buff *skb, const struct nf_hook_state *state) { - struct net_device *out = state->out; unsigned int ret; const struct iphdr *iph; u_int8_t tos; @@ -59,8 +58,7 @@ ipt_mangle_out(struct sk_buff *skb, const struct nf_hook_state *state) daddr = iph->daddr; tos = iph->tos; - ret = ipt_do_table(skb, NF_INET_LOCAL_OUT, state, - dev_net(out)->ipv4.iptable_mangle); + ret = ipt_do_table(skb, state, state->net->ipv4.iptable_mangle); /* Reroute for ANY change. */ if (ret != NF_DROP && ret != NF_STOLEN) { iph = ip_hdr(skb); @@ -69,7 +67,7 @@ ipt_mangle_out(struct sk_buff *skb, const struct nf_hook_state *state) iph->daddr != daddr || skb->mark != mark || iph->tos != tos) { - err = ip_route_me_harder(skb, RTN_UNSPEC); + err = ip_route_me_harder(state->net, skb, RTN_UNSPEC); if (err < 0) ret = NF_DROP_ERR(err); } @@ -80,18 +78,17 @@ ipt_mangle_out(struct sk_buff *skb, const struct nf_hook_state *state) /* The work comes in here from netfilter.c. */ static unsigned int -iptable_mangle_hook(const struct nf_hook_ops *ops, +iptable_mangle_hook(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { - if (ops->hooknum == NF_INET_LOCAL_OUT) + if (state->hook == NF_INET_LOCAL_OUT) return ipt_mangle_out(skb, state); - if (ops->hooknum == NF_INET_POST_ROUTING) - return ipt_do_table(skb, ops->hooknum, state, - dev_net(state->out)->ipv4.iptable_mangle); + if (state->hook == NF_INET_POST_ROUTING) + return ipt_do_table(skb, state, + state->net->ipv4.iptable_mangle); /* PREROUTING/INPUT/FORWARD: */ - return ipt_do_table(skb, ops->hooknum, state, - dev_net(state->in)->ipv4.iptable_mangle); + return ipt_do_table(skb, state, state->net->ipv4.iptable_mangle); } static struct nf_hook_ops *mangle_ops __read_mostly; diff --git a/net/ipv4/netfilter/iptable_nat.c b/net/ipv4/netfilter/iptable_nat.c index 0d4d9cdf98a4..ae2cd2752046 100644 --- a/net/ipv4/netfilter/iptable_nat.c +++ b/net/ipv4/netfilter/iptable_nat.c @@ -28,49 +28,46 @@ static const struct xt_table nf_nat_ipv4_table = { .af = NFPROTO_IPV4, }; -static unsigned int iptable_nat_do_chain(const struct nf_hook_ops *ops, +static unsigned int iptable_nat_do_chain(void *priv, struct sk_buff *skb, const struct nf_hook_state *state, struct nf_conn *ct) { - struct net *net = nf_ct_net(ct); - - return ipt_do_table(skb, ops->hooknum, state, net->ipv4.nat_table); + return ipt_do_table(skb, state, state->net->ipv4.nat_table); } -static unsigned int iptable_nat_ipv4_fn(const struct nf_hook_ops *ops, +static unsigned int iptable_nat_ipv4_fn(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { - return nf_nat_ipv4_fn(ops, skb, state, iptable_nat_do_chain); + return nf_nat_ipv4_fn(priv, skb, state, iptable_nat_do_chain); } -static unsigned int iptable_nat_ipv4_in(const struct nf_hook_ops *ops, +static unsigned int iptable_nat_ipv4_in(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { - return nf_nat_ipv4_in(ops, skb, state, iptable_nat_do_chain); + return nf_nat_ipv4_in(priv, skb, state, iptable_nat_do_chain); } -static unsigned int iptable_nat_ipv4_out(const struct nf_hook_ops *ops, +static unsigned int iptable_nat_ipv4_out(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { - return nf_nat_ipv4_out(ops, skb, state, iptable_nat_do_chain); + return nf_nat_ipv4_out(priv, skb, state, iptable_nat_do_chain); } -static unsigned int iptable_nat_ipv4_local_fn(const struct nf_hook_ops *ops, +static unsigned int iptable_nat_ipv4_local_fn(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { - return nf_nat_ipv4_local_fn(ops, skb, state, iptable_nat_do_chain); + return nf_nat_ipv4_local_fn(priv, skb, state, iptable_nat_do_chain); } static struct nf_hook_ops nf_nat_ipv4_ops[] __read_mostly = { /* Before packet filtering, change destination */ { .hook = iptable_nat_ipv4_in, - .owner = THIS_MODULE, .pf = NFPROTO_IPV4, .hooknum = NF_INET_PRE_ROUTING, .priority = NF_IP_PRI_NAT_DST, @@ -78,7 +75,6 @@ static struct nf_hook_ops nf_nat_ipv4_ops[] __read_mostly = { /* After packet filtering, change source */ { .hook = iptable_nat_ipv4_out, - .owner = THIS_MODULE, .pf = NFPROTO_IPV4, .hooknum = NF_INET_POST_ROUTING, .priority = NF_IP_PRI_NAT_SRC, @@ -86,7 +82,6 @@ static struct nf_hook_ops nf_nat_ipv4_ops[] __read_mostly = { /* Before packet filtering, change destination */ { .hook = iptable_nat_ipv4_local_fn, - .owner = THIS_MODULE, .pf = NFPROTO_IPV4, .hooknum = NF_INET_LOCAL_OUT, .priority = NF_IP_PRI_NAT_DST, @@ -94,7 +89,6 @@ static struct nf_hook_ops nf_nat_ipv4_ops[] __read_mostly = { /* After packet filtering, change source */ { .hook = iptable_nat_ipv4_fn, - .owner = THIS_MODULE, .pf = NFPROTO_IPV4, .hooknum = NF_INET_LOCAL_IN, .priority = NF_IP_PRI_NAT_SRC, diff --git a/net/ipv4/netfilter/iptable_raw.c b/net/ipv4/netfilter/iptable_raw.c index 0356e6da4bb7..1ba02811acb0 100644 --- a/net/ipv4/netfilter/iptable_raw.c +++ b/net/ipv4/netfilter/iptable_raw.c @@ -20,19 +20,16 @@ static const struct xt_table packet_raw = { /* The work comes in here from netfilter.c. */ static unsigned int -iptable_raw_hook(const struct nf_hook_ops *ops, struct sk_buff *skb, +iptable_raw_hook(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { - const struct net *net; - - if (ops->hooknum == NF_INET_LOCAL_OUT && + if (state->hook == NF_INET_LOCAL_OUT && (skb->len < sizeof(struct iphdr) || ip_hdrlen(skb) < sizeof(struct iphdr))) /* root is playing with raw sockets. */ return NF_ACCEPT; - net = dev_net(state->in ? state->in : state->out); - return ipt_do_table(skb, ops->hooknum, state, net->ipv4.iptable_raw); + return ipt_do_table(skb, state, state->net->ipv4.iptable_raw); } static struct nf_hook_ops *rawtable_ops __read_mostly; diff --git a/net/ipv4/netfilter/iptable_security.c b/net/ipv4/netfilter/iptable_security.c index 4bce3980ccd9..c2e23d5e9cd4 100644 --- a/net/ipv4/netfilter/iptable_security.c +++ b/net/ipv4/netfilter/iptable_security.c @@ -37,20 +37,16 @@ static const struct xt_table security_table = { }; static unsigned int -iptable_security_hook(const struct nf_hook_ops *ops, struct sk_buff *skb, +iptable_security_hook(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { - const struct net *net; - - if (ops->hooknum == NF_INET_LOCAL_OUT && + if (state->hook == NF_INET_LOCAL_OUT && (skb->len < sizeof(struct iphdr) || ip_hdrlen(skb) < sizeof(struct iphdr))) /* Somebody is playing with raw sockets. */ return NF_ACCEPT; - net = dev_net(state->in ? state->in : state->out); - return ipt_do_table(skb, ops->hooknum, state, - net->ipv4.iptable_security); + return ipt_do_table(skb, state, state->net->ipv4.iptable_security); } static struct nf_hook_ops *sectbl_ops __read_mostly; @@ -83,7 +79,7 @@ static int __init iptable_security_init(void) int ret; ret = register_pernet_subsys(&iptable_security_net_ops); - if (ret < 0) + if (ret < 0) return ret; sectbl_ops = xt_hook_link(&security_table, iptable_security_hook); diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c index 8a2caaf3940b..461ca926fd39 100644 --- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c +++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c @@ -92,7 +92,7 @@ static int ipv4_get_l4proto(const struct sk_buff *skb, unsigned int nhoff, return NF_ACCEPT; } -static unsigned int ipv4_helper(const struct nf_hook_ops *ops, +static unsigned int ipv4_helper(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { @@ -119,7 +119,7 @@ static unsigned int ipv4_helper(const struct nf_hook_ops *ops, ct, ctinfo); } -static unsigned int ipv4_confirm(const struct nf_hook_ops *ops, +static unsigned int ipv4_confirm(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { @@ -143,14 +143,14 @@ out: return nf_conntrack_confirm(skb); } -static unsigned int ipv4_conntrack_in(const struct nf_hook_ops *ops, +static unsigned int ipv4_conntrack_in(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { - return nf_conntrack_in(dev_net(state->in), PF_INET, ops->hooknum, skb); + return nf_conntrack_in(state->net, PF_INET, state->hook, skb); } -static unsigned int ipv4_conntrack_local(const struct nf_hook_ops *ops, +static unsigned int ipv4_conntrack_local(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { @@ -158,7 +158,7 @@ static unsigned int ipv4_conntrack_local(const struct nf_hook_ops *ops, if (skb->len < sizeof(struct iphdr) || ip_hdrlen(skb) < sizeof(struct iphdr)) return NF_ACCEPT; - return nf_conntrack_in(dev_net(state->out), PF_INET, ops->hooknum, skb); + return nf_conntrack_in(state->net, PF_INET, state->hook, skb); } /* Connection tracking may drop packets, but never alters them, so @@ -166,42 +166,36 @@ static unsigned int ipv4_conntrack_local(const struct nf_hook_ops *ops, static struct nf_hook_ops ipv4_conntrack_ops[] __read_mostly = { { .hook = ipv4_conntrack_in, - .owner = THIS_MODULE, .pf = NFPROTO_IPV4, .hooknum = NF_INET_PRE_ROUTING, .priority = NF_IP_PRI_CONNTRACK, }, { .hook = ipv4_conntrack_local, - .owner = THIS_MODULE, .pf = NFPROTO_IPV4, .hooknum = NF_INET_LOCAL_OUT, .priority = NF_IP_PRI_CONNTRACK, }, { .hook = ipv4_helper, - .owner = THIS_MODULE, .pf = NFPROTO_IPV4, .hooknum = NF_INET_POST_ROUTING, .priority = NF_IP_PRI_CONNTRACK_HELPER, }, { .hook = ipv4_confirm, - .owner = THIS_MODULE, .pf = NFPROTO_IPV4, .hooknum = NF_INET_POST_ROUTING, .priority = NF_IP_PRI_CONNTRACK_CONFIRM, }, { .hook = ipv4_helper, - .owner = THIS_MODULE, .pf = NFPROTO_IPV4, .hooknum = NF_INET_LOCAL_IN, .priority = NF_IP_PRI_CONNTRACK_HELPER, }, { .hook = ipv4_confirm, - .owner = THIS_MODULE, .pf = NFPROTO_IPV4, .hooknum = NF_INET_LOCAL_IN, .priority = NF_IP_PRI_CONNTRACK_CONFIRM, diff --git a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c index cdde3ec496e9..c567e1b5d799 100644 --- a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c +++ b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c @@ -30,7 +30,7 @@ static inline struct nf_icmp_net *icmp_pernet(struct net *net) } static bool icmp_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff, - struct nf_conntrack_tuple *tuple) + struct net *net, struct nf_conntrack_tuple *tuple) { const struct icmphdr *hp; struct icmphdr _hdr; @@ -144,7 +144,7 @@ icmp_error_message(struct net *net, struct nf_conn *tmpl, struct sk_buff *skb, if (!nf_ct_get_tuplepr(skb, skb_network_offset(skb) + ip_hdrlen(skb) + sizeof(struct icmphdr), - PF_INET, &origtuple)) { + PF_INET, net, &origtuple)) { pr_debug("icmp_error_message: failed to get tuple\n"); return -NF_ACCEPT; } diff --git a/net/ipv4/netfilter/nf_defrag_ipv4.c b/net/ipv4/netfilter/nf_defrag_ipv4.c index 9306ec4fab41..0e5591c2ee9f 100644 --- a/net/ipv4/netfilter/nf_defrag_ipv4.c +++ b/net/ipv4/netfilter/nf_defrag_ipv4.c @@ -22,14 +22,15 @@ #endif #include <net/netfilter/nf_conntrack_zones.h> -static int nf_ct_ipv4_gather_frags(struct sk_buff *skb, u_int32_t user) +static int nf_ct_ipv4_gather_frags(struct net *net, struct sk_buff *skb, + u_int32_t user) { int err; skb_orphan(skb); local_bh_disable(); - err = ip_defrag(skb, user); + err = ip_defrag(net, skb, user); local_bh_enable(); if (!err) { @@ -61,7 +62,7 @@ static enum ip_defrag_users nf_ct_defrag_user(unsigned int hooknum, return IP_DEFRAG_CONNTRACK_OUT + zone_id; } -static unsigned int ipv4_conntrack_defrag(const struct nf_hook_ops *ops, +static unsigned int ipv4_conntrack_defrag(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { @@ -83,9 +84,9 @@ static unsigned int ipv4_conntrack_defrag(const struct nf_hook_ops *ops, /* Gather fragments. */ if (ip_is_fragment(ip_hdr(skb))) { enum ip_defrag_users user = - nf_ct_defrag_user(ops->hooknum, skb); + nf_ct_defrag_user(state->hook, skb); - if (nf_ct_ipv4_gather_frags(skb, user)) + if (nf_ct_ipv4_gather_frags(state->net, skb, user)) return NF_STOLEN; } return NF_ACCEPT; @@ -94,14 +95,12 @@ static unsigned int ipv4_conntrack_defrag(const struct nf_hook_ops *ops, static struct nf_hook_ops ipv4_defrag_ops[] = { { .hook = ipv4_conntrack_defrag, - .owner = THIS_MODULE, .pf = NFPROTO_IPV4, .hooknum = NF_INET_PRE_ROUTING, .priority = NF_IP_PRI_CONNTRACK_DEFRAG, }, { .hook = ipv4_conntrack_defrag, - .owner = THIS_MODULE, .pf = NFPROTO_IPV4, .hooknum = NF_INET_LOCAL_OUT, .priority = NF_IP_PRI_CONNTRACK_DEFRAG, diff --git a/net/ipv4/netfilter/nf_dup_ipv4.c b/net/ipv4/netfilter/nf_dup_ipv4.c index 2d79e6e8d934..ceb187308120 100644 --- a/net/ipv4/netfilter/nf_dup_ipv4.c +++ b/net/ipv4/netfilter/nf_dup_ipv4.c @@ -23,25 +23,10 @@ #include <net/netfilter/nf_conntrack.h> #endif -static struct net *pick_net(struct sk_buff *skb) -{ -#ifdef CONFIG_NET_NS - const struct dst_entry *dst; - - if (skb->dev != NULL) - return dev_net(skb->dev); - dst = skb_dst(skb); - if (dst != NULL && dst->dev != NULL) - return dev_net(dst->dev); -#endif - return &init_net; -} - -static bool nf_dup_ipv4_route(struct sk_buff *skb, const struct in_addr *gw, - int oif) +static bool nf_dup_ipv4_route(struct net *net, struct sk_buff *skb, + const struct in_addr *gw, int oif) { const struct iphdr *iph = ip_hdr(skb); - struct net *net = pick_net(skb); struct rtable *rt; struct flowi4 fl4; @@ -65,7 +50,7 @@ static bool nf_dup_ipv4_route(struct sk_buff *skb, const struct in_addr *gw, return true; } -void nf_dup_ipv4(struct sk_buff *skb, unsigned int hooknum, +void nf_dup_ipv4(struct net *net, struct sk_buff *skb, unsigned int hooknum, const struct in_addr *gw, int oif) { struct iphdr *iph; @@ -105,9 +90,9 @@ void nf_dup_ipv4(struct sk_buff *skb, unsigned int hooknum, --iph->ttl; ip_send_check(iph); - if (nf_dup_ipv4_route(skb, gw, oif)) { + if (nf_dup_ipv4_route(net, skb, gw, oif)) { __this_cpu_write(nf_skb_duplicated, true); - ip_local_out(skb); + ip_local_out(net, skb->sk, skb); __this_cpu_write(nf_skb_duplicated, false); } else { kfree_skb(skb); diff --git a/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c b/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c index 22f4579b0c2a..5075b7ecd26d 100644 --- a/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c +++ b/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c @@ -255,9 +255,9 @@ int nf_nat_icmp_reply_translation(struct sk_buff *skb, EXPORT_SYMBOL_GPL(nf_nat_icmp_reply_translation); unsigned int -nf_nat_ipv4_fn(const struct nf_hook_ops *ops, struct sk_buff *skb, +nf_nat_ipv4_fn(void *priv, struct sk_buff *skb, const struct nf_hook_state *state, - unsigned int (*do_chain)(const struct nf_hook_ops *ops, + unsigned int (*do_chain)(void *priv, struct sk_buff *skb, const struct nf_hook_state *state, struct nf_conn *ct)) @@ -266,7 +266,7 @@ nf_nat_ipv4_fn(const struct nf_hook_ops *ops, struct sk_buff *skb, enum ip_conntrack_info ctinfo; struct nf_conn_nat *nat; /* maniptype == SRC for postrouting. */ - enum nf_nat_manip_type maniptype = HOOK2MANIP(ops->hooknum); + enum nf_nat_manip_type maniptype = HOOK2MANIP(state->hook); /* We never see fragments: conntrack defrags on pre-routing * and local-out, and nf_nat_out protects post-routing. @@ -295,7 +295,7 @@ nf_nat_ipv4_fn(const struct nf_hook_ops *ops, struct sk_buff *skb, case IP_CT_RELATED_REPLY: if (ip_hdr(skb)->protocol == IPPROTO_ICMP) { if (!nf_nat_icmp_reply_translation(skb, ct, ctinfo, - ops->hooknum)) + state->hook)) return NF_DROP; else return NF_ACCEPT; @@ -308,21 +308,21 @@ nf_nat_ipv4_fn(const struct nf_hook_ops *ops, struct sk_buff *skb, if (!nf_nat_initialized(ct, maniptype)) { unsigned int ret; - ret = do_chain(ops, skb, state, ct); + ret = do_chain(priv, skb, state, ct); if (ret != NF_ACCEPT) return ret; - if (nf_nat_initialized(ct, HOOK2MANIP(ops->hooknum))) + if (nf_nat_initialized(ct, HOOK2MANIP(state->hook))) break; - ret = nf_nat_alloc_null_binding(ct, ops->hooknum); + ret = nf_nat_alloc_null_binding(ct, state->hook); if (ret != NF_ACCEPT) return ret; } else { pr_debug("Already setup manip %s for ct %p\n", maniptype == NF_NAT_MANIP_SRC ? "SRC" : "DST", ct); - if (nf_nat_oif_changed(ops->hooknum, ctinfo, nat, + if (nf_nat_oif_changed(state->hook, ctinfo, nat, state->out)) goto oif_changed; } @@ -332,11 +332,11 @@ nf_nat_ipv4_fn(const struct nf_hook_ops *ops, struct sk_buff *skb, /* ESTABLISHED */ NF_CT_ASSERT(ctinfo == IP_CT_ESTABLISHED || ctinfo == IP_CT_ESTABLISHED_REPLY); - if (nf_nat_oif_changed(ops->hooknum, ctinfo, nat, state->out)) + if (nf_nat_oif_changed(state->hook, ctinfo, nat, state->out)) goto oif_changed; } - return nf_nat_packet(ct, ctinfo, ops->hooknum, skb); + return nf_nat_packet(ct, ctinfo, state->hook, skb); oif_changed: nf_ct_kill_acct(ct, ctinfo, skb); @@ -345,9 +345,9 @@ oif_changed: EXPORT_SYMBOL_GPL(nf_nat_ipv4_fn); unsigned int -nf_nat_ipv4_in(const struct nf_hook_ops *ops, struct sk_buff *skb, +nf_nat_ipv4_in(void *priv, struct sk_buff *skb, const struct nf_hook_state *state, - unsigned int (*do_chain)(const struct nf_hook_ops *ops, + unsigned int (*do_chain)(void *priv, struct sk_buff *skb, const struct nf_hook_state *state, struct nf_conn *ct)) @@ -355,7 +355,7 @@ nf_nat_ipv4_in(const struct nf_hook_ops *ops, struct sk_buff *skb, unsigned int ret; __be32 daddr = ip_hdr(skb)->daddr; - ret = nf_nat_ipv4_fn(ops, skb, state, do_chain); + ret = nf_nat_ipv4_fn(priv, skb, state, do_chain); if (ret != NF_DROP && ret != NF_STOLEN && daddr != ip_hdr(skb)->daddr) skb_dst_drop(skb); @@ -365,9 +365,9 @@ nf_nat_ipv4_in(const struct nf_hook_ops *ops, struct sk_buff *skb, EXPORT_SYMBOL_GPL(nf_nat_ipv4_in); unsigned int -nf_nat_ipv4_out(const struct nf_hook_ops *ops, struct sk_buff *skb, +nf_nat_ipv4_out(void *priv, struct sk_buff *skb, const struct nf_hook_state *state, - unsigned int (*do_chain)(const struct nf_hook_ops *ops, + unsigned int (*do_chain)(void *priv, struct sk_buff *skb, const struct nf_hook_state *state, struct nf_conn *ct)) @@ -384,7 +384,7 @@ nf_nat_ipv4_out(const struct nf_hook_ops *ops, struct sk_buff *skb, ip_hdrlen(skb) < sizeof(struct iphdr)) return NF_ACCEPT; - ret = nf_nat_ipv4_fn(ops, skb, state, do_chain); + ret = nf_nat_ipv4_fn(priv, skb, state, do_chain); #ifdef CONFIG_XFRM if (ret != NF_DROP && ret != NF_STOLEN && !(IPCB(skb)->flags & IPSKB_XFRM_TRANSFORMED) && @@ -396,7 +396,7 @@ nf_nat_ipv4_out(const struct nf_hook_ops *ops, struct sk_buff *skb, (ct->tuplehash[dir].tuple.dst.protonum != IPPROTO_ICMP && ct->tuplehash[dir].tuple.src.u.all != ct->tuplehash[!dir].tuple.dst.u.all)) { - err = nf_xfrm_me_harder(skb, AF_INET); + err = nf_xfrm_me_harder(state->net, skb, AF_INET); if (err < 0) ret = NF_DROP_ERR(err); } @@ -407,9 +407,9 @@ nf_nat_ipv4_out(const struct nf_hook_ops *ops, struct sk_buff *skb, EXPORT_SYMBOL_GPL(nf_nat_ipv4_out); unsigned int -nf_nat_ipv4_local_fn(const struct nf_hook_ops *ops, struct sk_buff *skb, +nf_nat_ipv4_local_fn(void *priv, struct sk_buff *skb, const struct nf_hook_state *state, - unsigned int (*do_chain)(const struct nf_hook_ops *ops, + unsigned int (*do_chain)(void *priv, struct sk_buff *skb, const struct nf_hook_state *state, struct nf_conn *ct)) @@ -424,14 +424,14 @@ nf_nat_ipv4_local_fn(const struct nf_hook_ops *ops, struct sk_buff *skb, ip_hdrlen(skb) < sizeof(struct iphdr)) return NF_ACCEPT; - ret = nf_nat_ipv4_fn(ops, skb, state, do_chain); + ret = nf_nat_ipv4_fn(priv, skb, state, do_chain); if (ret != NF_DROP && ret != NF_STOLEN && (ct = nf_ct_get(skb, &ctinfo)) != NULL) { enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); if (ct->tuplehash[dir].tuple.dst.u3.ip != ct->tuplehash[!dir].tuple.src.u3.ip) { - err = ip_route_me_harder(skb, RTN_UNSPEC); + err = ip_route_me_harder(state->net, skb, RTN_UNSPEC); if (err < 0) ret = NF_DROP_ERR(err); } @@ -440,7 +440,7 @@ nf_nat_ipv4_local_fn(const struct nf_hook_ops *ops, struct sk_buff *skb, ct->tuplehash[dir].tuple.dst.protonum != IPPROTO_ICMP && ct->tuplehash[dir].tuple.dst.u.all != ct->tuplehash[!dir].tuple.src.u.all) { - err = nf_xfrm_me_harder(skb, AF_INET); + err = nf_xfrm_me_harder(state->net, skb, AF_INET); if (err < 0) ret = NF_DROP_ERR(err); } diff --git a/net/ipv4/netfilter/nf_nat_snmp_basic.c b/net/ipv4/netfilter/nf_nat_snmp_basic.c index 7c676671329d..ddb894ac1458 100644 --- a/net/ipv4/netfilter/nf_nat_snmp_basic.c +++ b/net/ipv4/netfilter/nf_nat_snmp_basic.c @@ -1156,7 +1156,7 @@ static int snmp_parse_mangle(unsigned char *msg, } if (obj->type == SNMP_IPADDR) - mangle_address(ctx.begin, ctx.pointer - 4 , map, check); + mangle_address(ctx.begin, ctx.pointer - 4, map, check); kfree(obj->id); kfree(obj); diff --git a/net/ipv4/netfilter/nf_reject_ipv4.c b/net/ipv4/netfilter/nf_reject_ipv4.c index 3262e41ff76f..c747b2d9eb77 100644 --- a/net/ipv4/netfilter/nf_reject_ipv4.c +++ b/net/ipv4/netfilter/nf_reject_ipv4.c @@ -99,7 +99,7 @@ void nf_reject_ip_tcphdr_put(struct sk_buff *nskb, const struct sk_buff *oldskb, EXPORT_SYMBOL_GPL(nf_reject_ip_tcphdr_put); /* Send RST reply */ -void nf_send_reset(struct sk_buff *oldskb, int hook) +void nf_send_reset(struct net *net, struct sk_buff *oldskb, int hook) { struct sk_buff *nskb; const struct iphdr *oiph; @@ -129,7 +129,7 @@ void nf_send_reset(struct sk_buff *oldskb, int hook) ip4_dst_hoplimit(skb_dst(nskb))); nf_reject_ip_tcphdr_put(nskb, oldskb, oth); - if (ip_route_me_harder(nskb, RTN_UNSPEC)) + if (ip_route_me_harder(net, nskb, RTN_UNSPEC)) goto free_nskb; /* "Never happens" */ @@ -157,7 +157,7 @@ void nf_send_reset(struct sk_buff *oldskb, int hook) dev_queue_xmit(nskb); } else #endif - ip_local_out(nskb); + ip_local_out(net, nskb->sk, nskb); return; diff --git a/net/ipv4/netfilter/nf_tables_arp.c b/net/ipv4/netfilter/nf_tables_arp.c index 8412268bbad1..9d09d4f59545 100644 --- a/net/ipv4/netfilter/nf_tables_arp.c +++ b/net/ipv4/netfilter/nf_tables_arp.c @@ -15,15 +15,15 @@ #include <net/netfilter/nf_tables.h> static unsigned int -nft_do_chain_arp(const struct nf_hook_ops *ops, +nft_do_chain_arp(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { struct nft_pktinfo pkt; - nft_set_pktinfo(&pkt, ops, skb, state); + nft_set_pktinfo(&pkt, skb, state); - return nft_do_chain(&pkt, ops); + return nft_do_chain(&pkt, priv); } static struct nft_af_info nft_af_arp __read_mostly = { diff --git a/net/ipv4/netfilter/nf_tables_ipv4.c b/net/ipv4/netfilter/nf_tables_ipv4.c index aa180d3a69a5..ca9dc3c46c4f 100644 --- a/net/ipv4/netfilter/nf_tables_ipv4.c +++ b/net/ipv4/netfilter/nf_tables_ipv4.c @@ -18,18 +18,18 @@ #include <net/ip.h> #include <net/netfilter/nf_tables_ipv4.h> -static unsigned int nft_do_chain_ipv4(const struct nf_hook_ops *ops, +static unsigned int nft_do_chain_ipv4(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { struct nft_pktinfo pkt; - nft_set_pktinfo_ipv4(&pkt, ops, skb, state); + nft_set_pktinfo_ipv4(&pkt, skb, state); - return nft_do_chain(&pkt, ops); + return nft_do_chain(&pkt, priv); } -static unsigned int nft_ipv4_output(const struct nf_hook_ops *ops, +static unsigned int nft_ipv4_output(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { @@ -41,7 +41,7 @@ static unsigned int nft_ipv4_output(const struct nf_hook_ops *ops, return NF_ACCEPT; } - return nft_do_chain_ipv4(ops, skb, state); + return nft_do_chain_ipv4(priv, skb, state); } struct nft_af_info nft_af_ipv4 __read_mostly = { diff --git a/net/ipv4/netfilter/nft_chain_nat_ipv4.c b/net/ipv4/netfilter/nft_chain_nat_ipv4.c index bf5c30ae14e4..f5c66a7a4bf2 100644 --- a/net/ipv4/netfilter/nft_chain_nat_ipv4.c +++ b/net/ipv4/netfilter/nft_chain_nat_ipv4.c @@ -26,44 +26,44 @@ #include <net/netfilter/nf_nat_l3proto.h> #include <net/ip.h> -static unsigned int nft_nat_do_chain(const struct nf_hook_ops *ops, +static unsigned int nft_nat_do_chain(void *priv, struct sk_buff *skb, const struct nf_hook_state *state, struct nf_conn *ct) { struct nft_pktinfo pkt; - nft_set_pktinfo_ipv4(&pkt, ops, skb, state); + nft_set_pktinfo_ipv4(&pkt, skb, state); - return nft_do_chain(&pkt, ops); + return nft_do_chain(&pkt, priv); } -static unsigned int nft_nat_ipv4_fn(const struct nf_hook_ops *ops, +static unsigned int nft_nat_ipv4_fn(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { - return nf_nat_ipv4_fn(ops, skb, state, nft_nat_do_chain); + return nf_nat_ipv4_fn(priv, skb, state, nft_nat_do_chain); } -static unsigned int nft_nat_ipv4_in(const struct nf_hook_ops *ops, +static unsigned int nft_nat_ipv4_in(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { - return nf_nat_ipv4_in(ops, skb, state, nft_nat_do_chain); + return nf_nat_ipv4_in(priv, skb, state, nft_nat_do_chain); } -static unsigned int nft_nat_ipv4_out(const struct nf_hook_ops *ops, +static unsigned int nft_nat_ipv4_out(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { - return nf_nat_ipv4_out(ops, skb, state, nft_nat_do_chain); + return nf_nat_ipv4_out(priv, skb, state, nft_nat_do_chain); } -static unsigned int nft_nat_ipv4_local_fn(const struct nf_hook_ops *ops, +static unsigned int nft_nat_ipv4_local_fn(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { - return nf_nat_ipv4_local_fn(ops, skb, state, nft_nat_do_chain); + return nf_nat_ipv4_local_fn(priv, skb, state, nft_nat_do_chain); } static const struct nf_chain_type nft_chain_nat_ipv4 = { diff --git a/net/ipv4/netfilter/nft_chain_route_ipv4.c b/net/ipv4/netfilter/nft_chain_route_ipv4.c index e335b0afdaf3..2375b0a8be46 100644 --- a/net/ipv4/netfilter/nft_chain_route_ipv4.c +++ b/net/ipv4/netfilter/nft_chain_route_ipv4.c @@ -21,7 +21,7 @@ #include <net/route.h> #include <net/ip.h> -static unsigned int nf_route_table_hook(const struct nf_hook_ops *ops, +static unsigned int nf_route_table_hook(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { @@ -37,7 +37,7 @@ static unsigned int nf_route_table_hook(const struct nf_hook_ops *ops, ip_hdrlen(skb) < sizeof(struct iphdr)) return NF_ACCEPT; - nft_set_pktinfo_ipv4(&pkt, ops, skb, state); + nft_set_pktinfo_ipv4(&pkt, skb, state); mark = skb->mark; iph = ip_hdr(skb); @@ -45,7 +45,7 @@ static unsigned int nf_route_table_hook(const struct nf_hook_ops *ops, daddr = iph->daddr; tos = iph->tos; - ret = nft_do_chain(&pkt, ops); + ret = nft_do_chain(&pkt, priv); if (ret != NF_DROP && ret != NF_QUEUE) { iph = ip_hdr(skb); @@ -53,7 +53,7 @@ static unsigned int nf_route_table_hook(const struct nf_hook_ops *ops, iph->daddr != daddr || skb->mark != mark || iph->tos != tos) - if (ip_route_me_harder(skb, RTN_UNSPEC)) + if (ip_route_me_harder(state->net, skb, RTN_UNSPEC)) ret = NF_DROP; } return ret; diff --git a/net/ipv4/netfilter/nft_dup_ipv4.c b/net/ipv4/netfilter/nft_dup_ipv4.c index b45932d43b69..bf855e64fc45 100644 --- a/net/ipv4/netfilter/nft_dup_ipv4.c +++ b/net/ipv4/netfilter/nft_dup_ipv4.c @@ -30,7 +30,7 @@ static void nft_dup_ipv4_eval(const struct nft_expr *expr, }; int oif = regs->data[priv->sreg_dev]; - nf_dup_ipv4(pkt->skb, pkt->ops->hooknum, &gw, oif); + nf_dup_ipv4(pkt->net, pkt->skb, pkt->hook, &gw, oif); } static int nft_dup_ipv4_init(const struct nft_ctx *ctx, diff --git a/net/ipv4/netfilter/nft_masq_ipv4.c b/net/ipv4/netfilter/nft_masq_ipv4.c index 40e414c4ca56..b72ffc58e255 100644 --- a/net/ipv4/netfilter/nft_masq_ipv4.c +++ b/net/ipv4/netfilter/nft_masq_ipv4.c @@ -26,7 +26,7 @@ static void nft_masq_ipv4_eval(const struct nft_expr *expr, memset(&range, 0, sizeof(range)); range.flags = priv->flags; - regs->verdict.code = nf_nat_masquerade_ipv4(pkt->skb, pkt->ops->hooknum, + regs->verdict.code = nf_nat_masquerade_ipv4(pkt->skb, pkt->hook, &range, pkt->out); } diff --git a/net/ipv4/netfilter/nft_redir_ipv4.c b/net/ipv4/netfilter/nft_redir_ipv4.c index d8d795df9c13..c09d4381427e 100644 --- a/net/ipv4/netfilter/nft_redir_ipv4.c +++ b/net/ipv4/netfilter/nft_redir_ipv4.c @@ -36,7 +36,7 @@ static void nft_redir_ipv4_eval(const struct nft_expr *expr, mr.range[0].flags |= priv->flags; regs->verdict.code = nf_nat_redirect_ipv4(pkt->skb, &mr, - pkt->ops->hooknum); + pkt->hook); } static struct nft_expr_type nft_redir_ipv4_type; diff --git a/net/ipv4/netfilter/nft_reject_ipv4.c b/net/ipv4/netfilter/nft_reject_ipv4.c index b07e58b51158..c24f41c816b3 100644 --- a/net/ipv4/netfilter/nft_reject_ipv4.c +++ b/net/ipv4/netfilter/nft_reject_ipv4.c @@ -27,11 +27,10 @@ static void nft_reject_ipv4_eval(const struct nft_expr *expr, switch (priv->type) { case NFT_REJECT_ICMP_UNREACH: - nf_send_unreach(pkt->skb, priv->icmp_code, - pkt->ops->hooknum); + nf_send_unreach(pkt->skb, priv->icmp_code, pkt->hook); break; case NFT_REJECT_TCP_RST: - nf_send_reset(pkt->skb, pkt->ops->hooknum); + nf_send_reset(pkt->net, pkt->skb, pkt->hook); break; default: break; diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index 561cd4b8fc6e..8c0d0bdc2a7c 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -411,8 +411,9 @@ static int raw_send_hdrinc(struct sock *sk, struct flowi4 *fl4, icmp_out_count(net, ((struct icmphdr *) skb_transport_header(skb))->type); - err = NF_HOOK(NFPROTO_IPV4, NF_INET_LOCAL_OUT, sk, skb, - NULL, rt->dst.dev, dst_output_sk); + err = NF_HOOK(NFPROTO_IPV4, NF_INET_LOCAL_OUT, + net, sk, skb, NULL, rt->dst.dev, + dst_output); if (err > 0) err = net_xmit_errno(err); if (err) @@ -483,6 +484,7 @@ static int raw_getfrag(void *from, char *to, int offset, int len, int odd, static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) { struct inet_sock *inet = inet_sk(sk); + struct net *net = sock_net(sk); struct ipcm_cookie ipc; struct rtable *rt = NULL; struct flowi4 fl4; @@ -542,7 +544,7 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) ipc.oif = sk->sk_bound_dev_if; if (msg->msg_controllen) { - err = ip_cmsg_send(sock_net(sk), msg, &ipc, false); + err = ip_cmsg_send(net, msg, &ipc, false); if (err) goto out; if (ipc.opt) @@ -597,6 +599,9 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) (inet->hdrincl ? FLOWI_FLAG_KNOWN_NH : 0), daddr, saddr, 0, 0); + if (!saddr && ipc.oif) + l3mdev_get_saddr(net, ipc.oif, &fl4); + if (!inet->hdrincl) { rfv.msg = msg; rfv.hlen = 0; @@ -607,7 +612,7 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) } security_sk_classify_flow(sk, flowi4_to_flowi(&fl4)); - rt = ip_route_output_flow(sock_net(sk), &fl4, sk); + rt = ip_route_output_flow(net, &fl4, sk); if (IS_ERR(rt)) { err = PTR_ERR(rt); rt = NULL; diff --git a/net/ipv4/route.c b/net/ipv4/route.c index c81deb85acb4..85f184e429c6 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -112,7 +112,7 @@ #endif #include <net/secure_seq.h> #include <net/ip_tunnels.h> -#include <net/vrf.h> +#include <net/l3mdev.h> #define RT_FL_TOS(oldflp4) \ ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK)) @@ -847,7 +847,7 @@ void ip_rt_send_redirect(struct sk_buff *skb) return; } log_martians = IN_DEV_LOG_MARTIANS(in_dev); - vif = vrf_master_ifindex_rcu(rt->dst.dev); + vif = l3mdev_master_ifindex_rcu(rt->dst.dev); rcu_read_unlock(); net = dev_net(rt->dst.dev); @@ -941,7 +941,7 @@ static int ip_error(struct sk_buff *skb) } peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, - vrf_master_ifindex(skb->dev), 1); + l3mdev_master_ifindex(skb->dev), 1); send = true; if (peer) { @@ -1152,7 +1152,7 @@ static void ipv4_link_failure(struct sk_buff *skb) dst_set_expires(&rt->dst, 0); } -static int ip_rt_bug(struct sock *sk, struct sk_buff *skb) +static int ip_rt_bug(struct net *net, struct sock *sk, struct sk_buff *skb) { pr_debug("%s: %pI4 -> %pI4, %s\n", __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr, @@ -1438,12 +1438,34 @@ static void rt_set_nexthop(struct rtable *rt, __be32 daddr, } static struct rtable *rt_dst_alloc(struct net_device *dev, + unsigned int flags, u16 type, bool nopolicy, bool noxfrm, bool will_cache) { - return dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK, - (will_cache ? 0 : (DST_HOST | DST_NOCACHE)) | - (nopolicy ? DST_NOPOLICY : 0) | - (noxfrm ? DST_NOXFRM : 0)); + struct rtable *rt; + + rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK, + (will_cache ? 0 : (DST_HOST | DST_NOCACHE)) | + (nopolicy ? DST_NOPOLICY : 0) | + (noxfrm ? DST_NOXFRM : 0)); + + if (rt) { + rt->rt_genid = rt_genid_ipv4(dev_net(dev)); + rt->rt_flags = flags; + rt->rt_type = type; + rt->rt_is_input = 0; + rt->rt_iif = 0; + rt->rt_pmtu = 0; + rt->rt_gateway = 0; + rt->rt_uses_gateway = 0; + rt->rt_table_id = 0; + INIT_LIST_HEAD(&rt->rt_uncached); + + rt->dst.output = ip_output; + if (flags & RTCF_LOCAL) + rt->dst.input = ip_local_deliver; + } + + return rt; } /* called in rcu_read_lock() section */ @@ -1452,6 +1474,7 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr, { struct rtable *rth; struct in_device *in_dev = __in_dev_get_rcu(dev); + unsigned int flags = RTCF_MULTICAST; u32 itag = 0; int err; @@ -1464,9 +1487,8 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr, skb->protocol != htons(ETH_P_IP)) goto e_inval; - if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev))) - if (ipv4_is_loopback(saddr)) - goto e_inval; + if (ipv4_is_loopback(saddr) && !IN_DEV_ROUTE_LOCALNET(in_dev)) + goto e_inval; if (ipv4_is_zeronet(saddr)) { if (!ipv4_is_local_multicast(daddr)) @@ -1477,7 +1499,10 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr, if (err < 0) goto e_err; } - rth = rt_dst_alloc(dev_net(dev)->loopback_dev, + if (our) + flags |= RTCF_LOCAL; + + rth = rt_dst_alloc(dev_net(dev)->loopback_dev, flags, RTN_MULTICAST, IN_DEV_CONF_GET(in_dev, NOPOLICY), false, false); if (!rth) goto e_nobufs; @@ -1486,20 +1511,7 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr, rth->dst.tclassid = itag; #endif rth->dst.output = ip_rt_bug; - - rth->rt_genid = rt_genid_ipv4(dev_net(dev)); - rth->rt_flags = RTCF_MULTICAST; - rth->rt_type = RTN_MULTICAST; rth->rt_is_input= 1; - rth->rt_iif = 0; - rth->rt_pmtu = 0; - rth->rt_gateway = 0; - rth->rt_uses_gateway = 0; - INIT_LIST_HEAD(&rth->rt_uncached); - if (our) { - rth->dst.input= ip_local_deliver; - rth->rt_flags |= RTCF_LOCAL; - } #ifdef CONFIG_IP_MROUTE if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev)) @@ -1608,7 +1620,7 @@ static int __mkroute_input(struct sk_buff *skb, } } - rth = rt_dst_alloc(out_dev->dev, + rth = rt_dst_alloc(out_dev->dev, 0, res->type, IN_DEV_CONF_GET(in_dev, NOPOLICY), IN_DEV_CONF_GET(out_dev, NOXFRM), do_cache); if (!rth) { @@ -1616,19 +1628,12 @@ static int __mkroute_input(struct sk_buff *skb, goto cleanup; } - rth->rt_genid = rt_genid_ipv4(dev_net(rth->dst.dev)); - rth->rt_flags = 0; - rth->rt_type = res->type; rth->rt_is_input = 1; - rth->rt_iif = 0; - rth->rt_pmtu = 0; - rth->rt_gateway = 0; - rth->rt_uses_gateway = 0; - INIT_LIST_HEAD(&rth->rt_uncached); + if (res->table) + rth->rt_table_id = res->table->tb_id; RT_CACHE_STAT_INC(in_slow_tot); rth->dst.input = ip_forward; - rth->dst.output = ip_output; rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag); if (lwtunnel_output_redirect(rth->dst.lwtstate)) { @@ -1646,6 +1651,48 @@ out: return err; } +#ifdef CONFIG_IP_ROUTE_MULTIPATH + +/* To make ICMP packets follow the right flow, the multipath hash is + * calculated from the inner IP addresses in reverse order. + */ +static int ip_multipath_icmp_hash(struct sk_buff *skb) +{ + const struct iphdr *outer_iph = ip_hdr(skb); + struct icmphdr _icmph; + const struct icmphdr *icmph; + struct iphdr _inner_iph; + const struct iphdr *inner_iph; + + if (unlikely((outer_iph->frag_off & htons(IP_OFFSET)) != 0)) + goto standard_hash; + + icmph = skb_header_pointer(skb, outer_iph->ihl * 4, sizeof(_icmph), + &_icmph); + if (!icmph) + goto standard_hash; + + if (icmph->type != ICMP_DEST_UNREACH && + icmph->type != ICMP_REDIRECT && + icmph->type != ICMP_TIME_EXCEEDED && + icmph->type != ICMP_PARAMETERPROB) { + goto standard_hash; + } + + inner_iph = skb_header_pointer(skb, + outer_iph->ihl * 4 + sizeof(_icmph), + sizeof(_inner_iph), &_inner_iph); + if (!inner_iph) + goto standard_hash; + + return fib_multipath_hash(inner_iph->daddr, inner_iph->saddr); + +standard_hash: + return fib_multipath_hash(outer_iph->saddr, outer_iph->daddr); +} + +#endif /* CONFIG_IP_ROUTE_MULTIPATH */ + static int ip_mkroute_input(struct sk_buff *skb, struct fib_result *res, const struct flowi4 *fl4, @@ -1653,8 +1700,15 @@ static int ip_mkroute_input(struct sk_buff *skb, __be32 daddr, __be32 saddr, u32 tos) { #ifdef CONFIG_IP_ROUTE_MULTIPATH - if (res->fi && res->fi->fib_nhs > 1) - fib_select_multipath(res); + if (res->fi && res->fi->fib_nhs > 1) { + int h; + + if (unlikely(ip_hdr(skb)->protocol == IPPROTO_ICMP)) + h = ip_multipath_icmp_hash(skb); + else + h = fib_multipath_hash(saddr, daddr); + fib_select_multipath(res, h); + } #endif /* create a routing cache entry */ @@ -1706,6 +1760,7 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, goto martian_source; res.fi = NULL; + res.table = NULL; if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0)) goto brd_input; @@ -1733,7 +1788,7 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, * Now we are ready to route packet. */ fl4.flowi4_oif = 0; - fl4.flowi4_iif = vrf_master_ifindex_rcu(dev) ? : dev->ifindex; + fl4.flowi4_iif = l3mdev_fib_oif_rcu(dev); fl4.flowi4_mark = skb->mark; fl4.flowi4_tos = tos; fl4.flowi4_scope = RT_SCOPE_UNIVERSE; @@ -1754,7 +1809,7 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, err = fib_validate_source(skb, saddr, daddr, tos, 0, dev, in_dev, &itag); if (err < 0) - goto martian_source_keep_err; + goto martian_source; goto local_input; } @@ -1776,7 +1831,7 @@ brd_input: err = fib_validate_source(skb, saddr, 0, tos, 0, dev, in_dev, &itag); if (err < 0) - goto martian_source_keep_err; + goto martian_source; } flags |= RTCF_BROADCAST; res.type = RTN_BROADCAST; @@ -1796,26 +1851,18 @@ local_input: } } - rth = rt_dst_alloc(net->loopback_dev, + rth = rt_dst_alloc(net->loopback_dev, flags | RTCF_LOCAL, res.type, IN_DEV_CONF_GET(in_dev, NOPOLICY), false, do_cache); if (!rth) goto e_nobufs; - rth->dst.input= ip_local_deliver; rth->dst.output= ip_rt_bug; #ifdef CONFIG_IP_ROUTE_CLASSID rth->dst.tclassid = itag; #endif - - rth->rt_genid = rt_genid_ipv4(net); - rth->rt_flags = flags|RTCF_LOCAL; - rth->rt_type = res.type; rth->rt_is_input = 1; - rth->rt_iif = 0; - rth->rt_pmtu = 0; - rth->rt_gateway = 0; - rth->rt_uses_gateway = 0; - INIT_LIST_HEAD(&rth->rt_uncached); + if (res.table) + rth->rt_table_id = res.table->tb_id; RT_CACHE_STAT_INC(in_slow_tot); if (res.type == RTN_UNREACHABLE) { @@ -1837,6 +1884,7 @@ no_route: RT_CACHE_STAT_INC(in_no_route); res.type = RTN_UNREACHABLE; res.fi = NULL; + res.table = NULL; goto local_input; /* @@ -1859,8 +1907,6 @@ e_nobufs: goto out; martian_source: - err = -EINVAL; -martian_source_keep_err: ip_handle_martian_source(dev, in_dev, skb, daddr, saddr); goto out; } @@ -1988,28 +2034,19 @@ static struct rtable *__mkroute_output(const struct fib_result *res, } add: - rth = rt_dst_alloc(dev_out, + rth = rt_dst_alloc(dev_out, flags, type, IN_DEV_CONF_GET(in_dev, NOPOLICY), IN_DEV_CONF_GET(in_dev, NOXFRM), do_cache); if (!rth) return ERR_PTR(-ENOBUFS); - rth->dst.output = ip_output; - - rth->rt_genid = rt_genid_ipv4(dev_net(dev_out)); - rth->rt_flags = flags; - rth->rt_type = type; - rth->rt_is_input = 0; rth->rt_iif = orig_oif ? : 0; - rth->rt_pmtu = 0; - rth->rt_gateway = 0; - rth->rt_uses_gateway = 0; - INIT_LIST_HEAD(&rth->rt_uncached); + if (res->table) + rth->rt_table_id = res->table->tb_id; + RT_CACHE_STAT_INC(out_slow_tot); - if (flags & RTCF_LOCAL) - rth->dst.input = ip_local_deliver; if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) { if (flags & RTCF_LOCAL && !(dev_out->flags & IFF_LOOPBACK)) { @@ -2038,7 +2075,8 @@ add: * Major route resolver routine. */ -struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4) +struct rtable *__ip_route_output_key_hash(struct net *net, struct flowi4 *fl4, + int mp_hash) { struct net_device *dev_out = NULL; __u8 tos = RT_FL_TOS(fl4); @@ -2137,11 +2175,10 @@ struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4) fl4->saddr = inet_select_addr(dev_out, 0, RT_SCOPE_HOST); } - if (netif_is_vrf(dev_out) && - !(fl4->flowi4_flags & FLOWI_FLAG_VRFSRC)) { - rth = vrf_dev_get_rth(dev_out); + + rth = l3mdev_get_rtable(dev_out, fl4); + if (rth) goto out; - } } if (!fl4->daddr) { @@ -2159,7 +2196,8 @@ struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4) if (err) { res.fi = NULL; res.table = NULL; - if (fl4->flowi4_oif) { + if (fl4->flowi4_oif && + !netif_index_is_l3_master(net, fl4->flowi4_oif)) { /* Apparently, routing tables are wrong. Assume, that the destination is on link. @@ -2201,18 +2239,7 @@ struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4) goto make_route; } -#ifdef CONFIG_IP_ROUTE_MULTIPATH - if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0) - fib_select_multipath(&res); - else -#endif - if (!res.prefixlen && - res.table->tb_num_default > 1 && - res.type == RTN_UNICAST && !fl4->flowi4_oif) - fib_select_default(fl4, &res); - - if (!fl4->saddr) - fl4->saddr = FIB_RES_PREFSRC(net, res); + fib_select_path(net, &res, fl4, mp_hash); dev_out = FIB_RES_DEV(res); fl4->flowi4_oif = dev_out->ifindex; @@ -2225,7 +2252,7 @@ out: rcu_read_unlock(); return rth; } -EXPORT_SYMBOL_GPL(__ip_route_output_key); +EXPORT_SYMBOL_GPL(__ip_route_output_key_hash); static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie) { @@ -2277,7 +2304,7 @@ struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_or new->__use = 1; new->input = dst_discard; - new->output = dst_discard_sk; + new->output = dst_discard_out; new->dev = ort->dst.dev; if (new->dev) @@ -2303,7 +2330,7 @@ struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_or } struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4, - struct sock *sk) + const struct sock *sk) { struct rtable *rt = __ip_route_output_key(net, flp4); @@ -2319,7 +2346,7 @@ struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4, } EXPORT_SYMBOL_GPL(ip_route_output_flow); -static int rt_fill_info(struct net *net, __be32 dst, __be32 src, +static int rt_fill_info(struct net *net, __be32 dst, __be32 src, u32 table_id, struct flowi4 *fl4, struct sk_buff *skb, u32 portid, u32 seq, int event, int nowait, unsigned int flags) { @@ -2339,8 +2366,8 @@ static int rt_fill_info(struct net *net, __be32 dst, __be32 src, r->rtm_dst_len = 32; r->rtm_src_len = 0; r->rtm_tos = fl4->flowi4_tos; - r->rtm_table = RT_TABLE_MAIN; - if (nla_put_u32(skb, RTA_TABLE, RT_TABLE_MAIN)) + r->rtm_table = table_id; + if (nla_put_u32(skb, RTA_TABLE, table_id)) goto nla_put_failure; r->rtm_type = rt->rt_type; r->rtm_scope = RT_SCOPE_UNIVERSE; @@ -2445,6 +2472,7 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh) int err; int mark; struct sk_buff *skb; + u32 table_id = RT_TABLE_MAIN; err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy); if (err < 0) @@ -2480,6 +2508,9 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh) fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0; fl4.flowi4_mark = mark; + if (netif_index_is_l3_master(net, fl4.flowi4_oif)) + fl4.flowi4_flags = FLOWI_FLAG_L3MDEV_SRC | FLOWI_FLAG_SKIP_NH_OIF; + if (iif) { struct net_device *dev; @@ -2514,7 +2545,10 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh) if (rtm->rtm_flags & RTM_F_NOTIFY) rt->rt_flags |= RTCF_NOTIFY; - err = rt_fill_info(net, dst, src, &fl4, skb, + if (rtm->rtm_flags & RTM_F_LOOKUP_TABLE) + table_id = rt->rt_table_id; + + err = rt_fill_info(net, dst, src, table_id, &fl4, skb, NETLINK_CB(in_skb).portid, nlh->nlmsg_seq, RTM_NEWROUTE, 0, 0); if (err < 0) diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index d70b1f603692..4cbe9f0a4281 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c @@ -192,15 +192,11 @@ u32 __cookie_v4_init_sequence(const struct iphdr *iph, const struct tcphdr *th, } EXPORT_SYMBOL_GPL(__cookie_v4_init_sequence); -__u32 cookie_v4_init_sequence(struct sock *sk, const struct sk_buff *skb, - __u16 *mssp) +__u32 cookie_v4_init_sequence(const struct sk_buff *skb, __u16 *mssp) { const struct iphdr *iph = ip_hdr(skb); const struct tcphdr *th = tcp_hdr(skb); - tcp_synq_overflow(sk); - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SYNCOOKIESSENT); - return __cookie_v4_init_sequence(iph, th, mssp); } @@ -225,10 +221,13 @@ struct sock *tcp_get_cookie_sock(struct sock *sk, struct sk_buff *skb, { struct inet_connection_sock *icsk = inet_csk(sk); struct sock *child; + bool own_req; - child = icsk->icsk_af_ops->syn_recv_sock(sk, skb, req, dst); + child = icsk->icsk_af_ops->syn_recv_sock(sk, skb, req, dst, + NULL, &own_req); if (child) { atomic_set(&req->rsk_refcnt, 1); + sock_rps_save_rxhash(child, skb); inet_csk_reqsk_queue_add(sk, req, child); } else { reqsk_free(req); @@ -288,6 +287,10 @@ bool cookie_ecn_ok(const struct tcp_options_received *tcp_opt, } EXPORT_SYMBOL(cookie_ecn_ok); +/* On input, sk is a listener. + * Output is listener if incoming packet would not create a child + * NULL if memory could not be allocated. + */ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb) { struct ip_options *opt = &TCP_SKB_CB(skb)->header.h4.opt; @@ -326,7 +329,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb) goto out; ret = NULL; - req = inet_reqsk_alloc(&tcp_request_sock_ops, sk); /* for safety */ + req = inet_reqsk_alloc(&tcp_request_sock_ops, sk, false); /* for safety */ if (!req) goto out; @@ -345,7 +348,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb) ireq->wscale_ok = tcp_opt.wscale_ok; ireq->tstamp_ok = tcp_opt.saw_tstamp; req->ts_recent = tcp_opt.saw_tstamp ? tcp_opt.rcv_tsval : 0; - treq->snt_synack = tcp_opt.saw_tstamp ? tcp_opt.rcv_tsecr : 0; + treq->snt_synack.v64 = 0; treq->tfo_listener = false; ireq->ir_iif = sk->sk_bound_dev_if; @@ -381,10 +384,10 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb) } /* Try to redo what tcp_v4_send_synack did. */ - req->window_clamp = tp->window_clamp ? :dst_metric(&rt->dst, RTAX_WINDOW); + req->rsk_window_clamp = tp->window_clamp ? :dst_metric(&rt->dst, RTAX_WINDOW); tcp_select_initial_window(tcp_full_space(sk), req->mss, - &req->rcv_wnd, &req->window_clamp, + &req->rsk_rcv_wnd, &req->rsk_window_clamp, ireq->wscale_ok, &rcv_wscale, dst_metric(&rt->dst, RTAX_INITRWND)); diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 894da3a70aff..25300c5e283b 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -496,6 +496,13 @@ static struct ctl_table ipv4_table[] = { .proc_handler = proc_dointvec }, { + .procname = "tcp_recovery", + .data = &sysctl_tcp_recovery, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { .procname = "tcp_reordering", .data = &sysctl_tcp_reordering, .maxlen = sizeof(int), @@ -577,6 +584,13 @@ static struct ctl_table ipv4_table[] = { .proc_handler = proc_dointvec }, { + .procname = "tcp_min_rtt_wlen", + .data = &sysctl_tcp_min_rtt_wlen, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, + { .procname = "tcp_low_latency", .data = &sysctl_tcp_low_latency, .maxlen = sizeof(int), diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index b8b8fa184f75..0cfa7c0c1e80 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -388,6 +388,7 @@ void tcp_init_sock(struct sock *sk) icsk->icsk_rto = TCP_TIMEOUT_INIT; tp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT); + tp->rtt_min[0].rtt = ~0U; /* So many TCP implementations out there (incorrectly) count the * initial SYN frame in their delayed-ACK and congestion control @@ -900,7 +901,8 @@ static ssize_t do_tcp_sendpages(struct sock *sk, struct page *page, int offset, */ if (((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) && !tcp_passive_fastopen(sk)) { - if ((err = sk_stream_wait_connect(sk, &timeo)) != 0) + err = sk_stream_wait_connect(sk, &timeo); + if (err != 0) goto out_err; } @@ -967,7 +969,8 @@ new_segment: copied += copy; offset += copy; - if (!(size -= copy)) { + size -= copy; + if (!size) { tcp_tx_timestamp(sk, skb); goto out; } @@ -988,7 +991,8 @@ wait_for_memory: tcp_push(sk, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH, size_goal); - if ((err = sk_stream_wait_memory(sk, &timeo)) != 0) + err = sk_stream_wait_memory(sk, &timeo); + if (err != 0) goto do_error; mss_now = tcp_send_mss(sk, &size_goal, flags); @@ -1111,7 +1115,8 @@ int tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) */ if (((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) && !tcp_passive_fastopen(sk)) { - if ((err = sk_stream_wait_connect(sk, &timeo)) != 0) + err = sk_stream_wait_connect(sk, &timeo); + if (err != 0) goto do_error; } @@ -1267,7 +1272,8 @@ wait_for_memory: tcp_push(sk, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH, size_goal); - if ((err = sk_stream_wait_memory(sk, &timeo)) != 0) + err = sk_stream_wait_memory(sk, &timeo); + if (err != 0) goto do_error; mss_now = tcp_send_mss(sk, &size_goal, flags); @@ -1767,7 +1773,8 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock, /* __ Restore normal policy in scheduler __ */ - if ((chunk = len - tp->ucopy.len) != 0) { + chunk = len - tp->ucopy.len; + if (chunk != 0) { NET_ADD_STATS_USER(sock_net(sk), LINUX_MIB_TCPDIRECTCOPYFROMBACKLOG, chunk); len -= chunk; copied += chunk; @@ -1778,7 +1785,8 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock, do_prequeue: tcp_prequeue_process(sk); - if ((chunk = len - tp->ucopy.len) != 0) { + chunk = len - tp->ucopy.len; + if (chunk != 0) { NET_ADD_STATS_USER(sock_net(sk), LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk); len -= chunk; copied += chunk; @@ -2230,7 +2238,8 @@ int tcp_disconnect(struct sock *sk, int flags) sk->sk_shutdown = 0; sock_reset_flag(sk, SOCK_DONE); tp->srtt_us = 0; - if ((tp->write_seq += tp->max_window + 2) == 0) + tp->write_seq += tp->max_window + 2; + if (tp->write_seq == 0) tp->write_seq = 1; icsk->icsk_backoff = 0; tp->snd_cwnd = 2; @@ -2253,13 +2262,6 @@ int tcp_disconnect(struct sock *sk, int flags) } EXPORT_SYMBOL(tcp_disconnect); -void tcp_sock_destruct(struct sock *sk) -{ - inet_sock_destruct(sk); - - kfree(inet_csk(sk)->icsk_accept_queue.fastopenq); -} - static inline bool tcp_can_repair_sock(const struct sock *sk) { return ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN) && @@ -2581,7 +2583,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level, TCPF_LISTEN))) { tcp_fastopen_init_key_once(true); - err = fastopen_init_queue(sk, val); + fastopen_queue_tune(sk, val); } else { err = -EINVAL; } @@ -2849,10 +2851,7 @@ static int do_tcp_getsockopt(struct sock *sk, int level, break; case TCP_FASTOPEN: - if (icsk->icsk_accept_queue.fastopenq) - val = icsk->icsk_accept_queue.fastopenq->max_qlen; - else - val = 0; + val = icsk->icsk_accept_queue.fastopenq.max_qlen; break; case TCP_TIMESTAMP: diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c index 93c4dc3ab23f..882caa4e72bc 100644 --- a/net/ipv4/tcp_cong.c +++ b/net/ipv4/tcp_cong.c @@ -173,6 +173,10 @@ out: */ if (ca->get_info) memset(icsk->icsk_ca_priv, 0, sizeof(icsk->icsk_ca_priv)); + if (ca->flags & TCP_CONG_NEEDS_ECN) + INET_ECN_xmit(sk); + else + INET_ECN_dontxmit(sk); } void tcp_init_congestion_control(struct sock *sk) @@ -181,6 +185,10 @@ void tcp_init_congestion_control(struct sock *sk) if (icsk->icsk_ca_ops->init) icsk->icsk_ca_ops->init(sk); + if (tcp_ca_needs_ecn(sk)) + INET_ECN_xmit(sk); + else + INET_ECN_dontxmit(sk); } static void tcp_reinit_congestion_control(struct sock *sk, @@ -192,8 +200,8 @@ static void tcp_reinit_congestion_control(struct sock *sk, icsk->icsk_ca_ops = ca; icsk->icsk_ca_setsockopt = 1; - if (sk->sk_state != TCP_CLOSE && icsk->icsk_ca_ops->init) - icsk->icsk_ca_ops->init(sk); + if (sk->sk_state != TCP_CLOSE) + tcp_init_congestion_control(sk); } /* Manage refcounts on socket close. */ diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c index f9c0fb84e435..55be6ac70cff 100644 --- a/net/ipv4/tcp_fastopen.c +++ b/net/ipv4/tcp_fastopen.c @@ -124,27 +124,29 @@ static bool tcp_fastopen_cookie_gen(struct request_sock *req, return false; } -static bool tcp_fastopen_create_child(struct sock *sk, - struct sk_buff *skb, - struct dst_entry *dst, - struct request_sock *req) +static struct sock *tcp_fastopen_create_child(struct sock *sk, + struct sk_buff *skb, + struct dst_entry *dst, + struct request_sock *req) { struct tcp_sock *tp; struct request_sock_queue *queue = &inet_csk(sk)->icsk_accept_queue; struct sock *child; u32 end_seq; + bool own_req; req->num_retrans = 0; req->num_timeout = 0; req->sk = NULL; - child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL); + child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL, + NULL, &own_req); if (!child) - return false; + return NULL; - spin_lock(&queue->fastopenq->lock); - queue->fastopenq->qlen++; - spin_unlock(&queue->fastopenq->lock); + spin_lock(&queue->fastopenq.lock); + queue->fastopenq.qlen++; + spin_unlock(&queue->fastopenq.lock); /* Initialize the child socket. Have to fix some values to take * into account the child is a Fast Open socket and is created @@ -161,15 +163,13 @@ static bool tcp_fastopen_create_child(struct sock *sk, tp->snd_wnd = ntohs(tcp_hdr(skb)->window); /* Activate the retrans timer so that SYNACK can be retransmitted. - * The request socket is not added to the SYN table of the parent + * The request socket is not added to the ehash * because it's been added to the accept queue directly. */ inet_csk_reset_xmit_timer(child, ICSK_TIME_RETRANS, TCP_TIMEOUT_INIT, TCP_RTO_MAX); - atomic_set(&req->rsk_refcnt, 1); - /* Add the child socket directly into the accept queue */ - inet_csk_reqsk_queue_add(sk, req, child); + atomic_set(&req->rsk_refcnt, 2); /* Now finish processing the fastopen child socket. */ inet_csk(child)->icsk_af_ops->rebuild_header(child); @@ -178,12 +178,10 @@ static bool tcp_fastopen_create_child(struct sock *sk, tcp_init_metrics(child); tcp_init_buffer_space(child); - /* Queue the data carried in the SYN packet. We need to first - * bump skb's refcnt because the caller will attempt to free it. - * Note that IPv6 might also have used skb_get() trick - * in tcp_v6_conn_request() to keep this SYN around (treq->pktopts) - * So we need to eventually get a clone of the packet, - * before inserting it in sk_receive_queue. + /* Queue the data carried in the SYN packet. + * We used to play tricky games with skb_get(). + * With lockless listener, it is a dead end. + * Do not think about it. * * XXX (TFO) - we honor a zero-payload TFO request for now, * (any reason not to?) but no need to queue the skb since @@ -191,12 +189,7 @@ static bool tcp_fastopen_create_child(struct sock *sk, */ end_seq = TCP_SKB_CB(skb)->end_seq; if (end_seq != TCP_SKB_CB(skb)->seq + 1) { - struct sk_buff *skb2; - - if (unlikely(skb_shared(skb))) - skb2 = skb_clone(skb, GFP_ATOMIC); - else - skb2 = skb_get(skb); + struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC); if (likely(skb2)) { skb_dst_drop(skb2); @@ -214,11 +207,10 @@ static bool tcp_fastopen_create_child(struct sock *sk, } } tcp_rsk(req)->rcv_nxt = tp->rcv_nxt = end_seq; - sk->sk_data_ready(sk); - bh_unlock_sock(child); - sock_put(child); - WARN_ON(!req->sk); - return true; + /* tcp_conn_request() is sending the SYNACK, + * and queues the child into listener accept queue. + */ + return child; } static bool tcp_fastopen_queue_check(struct sock *sk) @@ -235,8 +227,8 @@ static bool tcp_fastopen_queue_check(struct sock *sk) * between qlen overflow causing Fast Open to be disabled * temporarily vs a server not supporting Fast Open at all. */ - fastopenq = inet_csk(sk)->icsk_accept_queue.fastopenq; - if (!fastopenq || fastopenq->max_qlen == 0) + fastopenq = &inet_csk(sk)->icsk_accept_queue.fastopenq; + if (fastopenq->max_qlen == 0) return false; if (fastopenq->qlen >= fastopenq->max_qlen) { @@ -261,13 +253,14 @@ static bool tcp_fastopen_queue_check(struct sock *sk) * may be updated and return the client in the SYN-ACK later. E.g., Fast Open * cookie request (foc->len == 0). */ -bool tcp_try_fastopen(struct sock *sk, struct sk_buff *skb, - struct request_sock *req, - struct tcp_fastopen_cookie *foc, - struct dst_entry *dst) +struct sock *tcp_try_fastopen(struct sock *sk, struct sk_buff *skb, + struct request_sock *req, + struct tcp_fastopen_cookie *foc, + struct dst_entry *dst) { struct tcp_fastopen_cookie valid_foc = { .len = -1 }; bool syn_data = TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq + 1; + struct sock *child; if (foc->len == 0) /* Client requests a cookie */ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPFASTOPENCOOKIEREQD); @@ -276,7 +269,7 @@ bool tcp_try_fastopen(struct sock *sk, struct sk_buff *skb, (syn_data || foc->len >= 0) && tcp_fastopen_queue_check(sk))) { foc->len = -1; - return false; + return NULL; } if (syn_data && (sysctl_tcp_fastopen & TFO_SERVER_COOKIE_NOT_REQD)) @@ -296,11 +289,12 @@ bool tcp_try_fastopen(struct sock *sk, struct sk_buff *skb, * data in SYN_RECV state. */ fastopen: - if (tcp_fastopen_create_child(sk, skb, dst, req)) { + child = tcp_fastopen_create_child(sk, skb, dst, req); + if (child) { foc->len = -1; NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPFASTOPENPASSIVE); - return true; + return child; } NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPFASTOPENPASSIVEFAIL); } else if (foc->len > 0) /* Client presents an invalid cookie */ @@ -308,6 +302,5 @@ fastopen: valid_foc.exp = foc->exp; *foc = valid_foc; - return false; + return NULL; } -EXPORT_SYMBOL(tcp_try_fastopen); diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index a8f515bb19c4..fdd88c3803a6 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -95,6 +95,7 @@ int sysctl_tcp_stdurg __read_mostly; int sysctl_tcp_rfc1337 __read_mostly; int sysctl_tcp_max_orphans __read_mostly = NR_FILE; int sysctl_tcp_frto __read_mostly = 2; +int sysctl_tcp_min_rtt_wlen __read_mostly = 300; int sysctl_tcp_thin_dupack __read_mostly; @@ -880,6 +881,7 @@ static void tcp_update_reordering(struct sock *sk, const int metric, if (metric > 0) tcp_disable_early_retrans(tp); + tp->rack.reord = 1; } /* This must be called before lost_out is incremented */ @@ -905,8 +907,7 @@ static void tcp_skb_mark_lost(struct tcp_sock *tp, struct sk_buff *skb) } } -static void tcp_skb_mark_lost_uncond_verify(struct tcp_sock *tp, - struct sk_buff *skb) +void tcp_skb_mark_lost_uncond_verify(struct tcp_sock *tp, struct sk_buff *skb) { tcp_verify_retransmit_hint(tp, skb); @@ -1047,70 +1048,6 @@ static bool tcp_is_sackblock_valid(struct tcp_sock *tp, bool is_dsack, return !before(start_seq, end_seq - tp->max_window); } -/* Check for lost retransmit. This superb idea is borrowed from "ratehalving". - * Event "B". Later note: FACK people cheated me again 8), we have to account - * for reordering! Ugly, but should help. - * - * Search retransmitted skbs from write_queue that were sent when snd_nxt was - * less than what is now known to be received by the other end (derived from - * highest SACK block). Also calculate the lowest snd_nxt among the remaining - * retransmitted skbs to avoid some costly processing per ACKs. - */ -static void tcp_mark_lost_retrans(struct sock *sk, int *flag) -{ - const struct inet_connection_sock *icsk = inet_csk(sk); - struct tcp_sock *tp = tcp_sk(sk); - struct sk_buff *skb; - int cnt = 0; - u32 new_low_seq = tp->snd_nxt; - u32 received_upto = tcp_highest_sack_seq(tp); - - if (!tcp_is_fack(tp) || !tp->retrans_out || - !after(received_upto, tp->lost_retrans_low) || - icsk->icsk_ca_state != TCP_CA_Recovery) - return; - - tcp_for_write_queue(skb, sk) { - u32 ack_seq = TCP_SKB_CB(skb)->ack_seq; - - if (skb == tcp_send_head(sk)) - break; - if (cnt == tp->retrans_out) - break; - if (!after(TCP_SKB_CB(skb)->end_seq, tp->snd_una)) - continue; - - if (!(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS)) - continue; - - /* TODO: We would like to get rid of tcp_is_fack(tp) only - * constraint here (see above) but figuring out that at - * least tp->reordering SACK blocks reside between ack_seq - * and received_upto is not easy task to do cheaply with - * the available datastructures. - * - * Whether FACK should check here for tp->reordering segs - * in-between one could argue for either way (it would be - * rather simple to implement as we could count fack_count - * during the walk and do tp->fackets_out - fack_count). - */ - if (after(received_upto, ack_seq)) { - TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS; - tp->retrans_out -= tcp_skb_pcount(skb); - *flag |= FLAG_LOST_RETRANS; - tcp_skb_mark_lost_uncond_verify(tp, skb); - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPLOSTRETRANSMIT); - } else { - if (before(ack_seq, new_low_seq)) - new_low_seq = ack_seq; - cnt += tcp_skb_pcount(skb); - } - } - - if (tp->retrans_out) - tp->lost_retrans_low = new_low_seq; -} - static bool tcp_check_dsack(struct sock *sk, const struct sk_buff *ack_skb, struct tcp_sack_block_wire *sp, int num_sacks, u32 prior_snd_una) @@ -1236,6 +1173,8 @@ static u8 tcp_sacktag_one(struct sock *sk, return sacked; if (!(sacked & TCPCB_SACKED_ACKED)) { + tcp_rack_advance(tp, xmit_time, sacked); + if (sacked & TCPCB_SACKED_RETRANS) { /* If the segment is not tagged as lost, * we do not clear RETRANS, believing @@ -1837,7 +1776,6 @@ advance_sp: ((inet_csk(sk)->icsk_ca_state != TCP_CA_Loss) || tp->undo_marker)) tcp_update_reordering(sk, tp->fackets_out - state->reord, 0); - tcp_mark_lost_retrans(sk, &state->flag); tcp_verify_left_out(tp); out: @@ -2314,14 +2252,29 @@ static inline void tcp_moderate_cwnd(struct tcp_sock *tp) tp->snd_cwnd_stamp = tcp_time_stamp; } +static bool tcp_tsopt_ecr_before(const struct tcp_sock *tp, u32 when) +{ + return tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr && + before(tp->rx_opt.rcv_tsecr, when); +} + +/* skb is spurious retransmitted if the returned timestamp echo + * reply is prior to the skb transmission time + */ +static bool tcp_skb_spurious_retrans(const struct tcp_sock *tp, + const struct sk_buff *skb) +{ + return (TCP_SKB_CB(skb)->sacked & TCPCB_RETRANS) && + tcp_tsopt_ecr_before(tp, tcp_skb_timestamp(skb)); +} + /* Nothing was retransmitted or returned timestamp is less * than timestamp of the first retransmission. */ static inline bool tcp_packet_delayed(const struct tcp_sock *tp) { return !tp->retrans_stamp || - (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr && - before(tp->rx_opt.rcv_tsecr, tp->retrans_stamp)); + tcp_tsopt_ecr_before(tp, tp->retrans_stamp); } /* Undo procedures. */ @@ -2853,6 +2806,11 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked, } } + /* Use RACK to detect loss */ + if (sysctl_tcp_recovery & TCP_RACK_LOST_RETRANS && + tcp_rack_mark_lost(sk)) + flag |= FLAG_LOST_RETRANS; + /* E. Process state. */ switch (icsk->icsk_ca_state) { case TCP_CA_Recovery: @@ -2915,8 +2873,69 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked, tcp_xmit_retransmit_queue(sk); } +/* Kathleen Nichols' algorithm for tracking the minimum value of + * a data stream over some fixed time interval. (E.g., the minimum + * RTT over the past five minutes.) It uses constant space and constant + * time per update yet almost always delivers the same minimum as an + * implementation that has to keep all the data in the window. + * + * The algorithm keeps track of the best, 2nd best & 3rd best min + * values, maintaining an invariant that the measurement time of the + * n'th best >= n-1'th best. It also makes sure that the three values + * are widely separated in the time window since that bounds the worse + * case error when that data is monotonically increasing over the window. + * + * Upon getting a new min, we can forget everything earlier because it + * has no value - the new min is <= everything else in the window by + * definition and it's the most recent. So we restart fresh on every new min + * and overwrites 2nd & 3rd choices. The same property holds for 2nd & 3rd + * best. + */ +static void tcp_update_rtt_min(struct sock *sk, u32 rtt_us) +{ + const u32 now = tcp_time_stamp, wlen = sysctl_tcp_min_rtt_wlen * HZ; + struct rtt_meas *m = tcp_sk(sk)->rtt_min; + struct rtt_meas rttm = { .rtt = (rtt_us ? : 1), .ts = now }; + u32 elapsed; + + /* Check if the new measurement updates the 1st, 2nd, or 3rd choices */ + if (unlikely(rttm.rtt <= m[0].rtt)) + m[0] = m[1] = m[2] = rttm; + else if (rttm.rtt <= m[1].rtt) + m[1] = m[2] = rttm; + else if (rttm.rtt <= m[2].rtt) + m[2] = rttm; + + elapsed = now - m[0].ts; + if (unlikely(elapsed > wlen)) { + /* Passed entire window without a new min so make 2nd choice + * the new min & 3rd choice the new 2nd. So forth and so on. + */ + m[0] = m[1]; + m[1] = m[2]; + m[2] = rttm; + if (now - m[0].ts > wlen) { + m[0] = m[1]; + m[1] = rttm; + if (now - m[0].ts > wlen) + m[0] = rttm; + } + } else if (m[1].ts == m[0].ts && elapsed > wlen / 4) { + /* Passed a quarter of the window without a new min so + * take 2nd choice from the 2nd quarter of the window. + */ + m[2] = m[1] = rttm; + } else if (m[2].ts == m[1].ts && elapsed > wlen / 2) { + /* Passed half the window without a new min so take the 3rd + * choice from the last half of the window. + */ + m[2] = rttm; + } +} + static inline bool tcp_ack_update_rtt(struct sock *sk, const int flag, - long seq_rtt_us, long sack_rtt_us) + long seq_rtt_us, long sack_rtt_us, + long ca_rtt_us) { const struct tcp_sock *tp = tcp_sk(sk); @@ -2925,9 +2944,6 @@ static inline bool tcp_ack_update_rtt(struct sock *sk, const int flag, * Karn's algorithm forbids taking RTT if some retransmitted data * is acked (RFC6298). */ - if (flag & FLAG_RETRANS_DATA_ACKED) - seq_rtt_us = -1L; - if (seq_rtt_us < 0) seq_rtt_us = sack_rtt_us; @@ -2939,11 +2955,16 @@ static inline bool tcp_ack_update_rtt(struct sock *sk, const int flag, */ if (seq_rtt_us < 0 && tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr && flag & FLAG_ACKED) - seq_rtt_us = jiffies_to_usecs(tcp_time_stamp - tp->rx_opt.rcv_tsecr); - + seq_rtt_us = ca_rtt_us = jiffies_to_usecs(tcp_time_stamp - + tp->rx_opt.rcv_tsecr); if (seq_rtt_us < 0) return false; + /* ca_rtt_us >= 0 is counting on the invariant that ca_rtt_us is + * always taken together with ACK, SACK, or TS-opts. Any negative + * values will be skipped with the seq_rtt_us < 0 check above. + */ + tcp_update_rtt_min(sk, ca_rtt_us); tcp_rtt_estimator(sk, seq_rtt_us); tcp_set_rto(sk); @@ -2953,21 +2974,21 @@ static inline bool tcp_ack_update_rtt(struct sock *sk, const int flag, } /* Compute time elapsed between (last) SYNACK and the ACK completing 3WHS. */ -static void tcp_synack_rtt_meas(struct sock *sk, const u32 synack_stamp) +void tcp_synack_rtt_meas(struct sock *sk, struct request_sock *req) { - struct tcp_sock *tp = tcp_sk(sk); - long seq_rtt_us = -1L; + long rtt_us = -1L; - if (synack_stamp && !tp->total_retrans) - seq_rtt_us = jiffies_to_usecs(tcp_time_stamp - synack_stamp); + if (req && !req->num_retrans && tcp_rsk(req)->snt_synack.v64) { + struct skb_mstamp now; - /* If the ACK acks both the SYNACK and the (Fast Open'd) data packets - * sent in SYN_RECV, SYNACK RTT is the smooth RTT computed in tcp_ack() - */ - if (!tp->srtt_us) - tcp_ack_update_rtt(sk, FLAG_SYN_ACKED, seq_rtt_us, -1L); + skb_mstamp_get(&now); + rtt_us = skb_mstamp_us_delta(&now, &tcp_rsk(req)->snt_synack); + } + + tcp_ack_update_rtt(sk, FLAG_SYN_ACKED, rtt_us, -1L, rtt_us); } + static void tcp_cong_avoid(struct sock *sk, u32 ack, u32 acked) { const struct inet_connection_sock *icsk = inet_csk(sk); @@ -3131,6 +3152,8 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, if (sacked & TCPCB_SACKED_ACKED) tp->sacked_out -= acked_pcount; + else if (tcp_is_sack(tp) && !tcp_skb_spurious_retrans(tp, skb)) + tcp_rack_advance(tp, &skb->skb_mstamp, sacked); if (sacked & TCPCB_LOST) tp->lost_out -= acked_pcount; @@ -3169,7 +3192,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, flag |= FLAG_SACK_RENEGING; skb_mstamp_get(&now); - if (likely(first_ackt.v64)) { + if (likely(first_ackt.v64) && !(flag & FLAG_RETRANS_DATA_ACKED)) { seq_rtt_us = skb_mstamp_us_delta(&now, &first_ackt); ca_rtt_us = skb_mstamp_us_delta(&now, &last_ackt); } @@ -3178,7 +3201,8 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, ca_rtt_us = skb_mstamp_us_delta(&now, &sack->last_sackt); } - rtt_update = tcp_ack_update_rtt(sk, flag, seq_rtt_us, sack_rtt_us); + rtt_update = tcp_ack_update_rtt(sk, flag, seq_rtt_us, sack_rtt_us, + ca_rtt_us); if (flag & FLAG_ACKED) { tcp_rearm_rto(sk); @@ -5472,7 +5496,7 @@ static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack, } static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, - const struct tcphdr *th, unsigned int len) + const struct tcphdr *th) { struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); @@ -5698,15 +5722,14 @@ reset_and_undo: * address independent. */ -int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, - const struct tcphdr *th, unsigned int len) +int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); struct inet_connection_sock *icsk = inet_csk(sk); + const struct tcphdr *th = tcp_hdr(skb); struct request_sock *req; int queued = 0; bool acceptable; - u32 synack_stamp; tp->rx_opt.saw_tstamp = 0; @@ -5750,7 +5773,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, goto discard; case TCP_SYN_SENT: - queued = tcp_rcv_synsent_state_process(sk, skb, th, len); + queued = tcp_rcv_synsent_state_process(sk, skb, th); if (queued >= 0) return queued; @@ -5785,15 +5808,16 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, if (!acceptable) return 1; + if (!tp->srtt_us) + tcp_synack_rtt_meas(sk, req); + /* Once we leave TCP_SYN_RECV, we no longer need req * so release it. */ if (req) { - synack_stamp = tcp_rsk(req)->snt_synack; tp->total_retrans = req->num_retrans; reqsk_fastopen_remove(sk, req, false); } else { - synack_stamp = tp->lsndtime; /* Make sure socket is routed, for correct metrics. */ icsk->icsk_af_ops->rebuild_header(sk); tcp_init_congestion_control(sk); @@ -5816,7 +5840,6 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, tp->snd_una = TCP_SKB_CB(skb)->ack_seq; tp->snd_wnd = ntohs(th->window) << tp->rx_opt.snd_wscale; tcp_init_wl(tp, TCP_SKB_CB(skb)->seq); - tcp_synack_rtt_meas(sk, synack_stamp); if (tp->rx_opt.tstamp_ok) tp->advmss -= TCPOLEN_TSTAMP_ALIGNED; @@ -6023,11 +6046,11 @@ static void tcp_openreq_init(struct request_sock *req, { struct inet_request_sock *ireq = inet_rsk(req); - req->rcv_wnd = 0; /* So that tcp_send_synack() knows! */ + req->rsk_rcv_wnd = 0; /* So that tcp_send_synack() knows! */ req->cookie_ts = 0; tcp_rsk(req)->rcv_isn = TCP_SKB_CB(skb)->seq; tcp_rsk(req)->rcv_nxt = TCP_SKB_CB(skb)->seq + 1; - tcp_rsk(req)->snt_synack = tcp_time_stamp; + skb_mstamp_get(&tcp_rsk(req)->snt_synack); tcp_rsk(req)->last_oow_ack_time = 0; req->mss = rx_opt->mss_clamp; req->ts_recent = rx_opt->saw_tstamp ? rx_opt->rcv_tsval : 0; @@ -6043,9 +6066,11 @@ static void tcp_openreq_init(struct request_sock *req, } struct request_sock *inet_reqsk_alloc(const struct request_sock_ops *ops, - struct sock *sk_listener) + struct sock *sk_listener, + bool attach_listener) { - struct request_sock *req = reqsk_alloc(ops, sk_listener); + struct request_sock *req = reqsk_alloc(ops, sk_listener, + attach_listener); if (req) { struct inet_request_sock *ireq = inet_rsk(req); @@ -6065,13 +6090,13 @@ EXPORT_SYMBOL(inet_reqsk_alloc); /* * Return true if a syncookie should be sent */ -static bool tcp_syn_flood_action(struct sock *sk, +static bool tcp_syn_flood_action(const struct sock *sk, const struct sk_buff *skb, const char *proto) { + struct request_sock_queue *queue = &inet_csk(sk)->icsk_accept_queue; const char *msg = "Dropping request"; bool want_cookie = false; - struct listen_sock *lopt; #ifdef CONFIG_SYN_COOKIES if (sysctl_tcp_syncookies) { @@ -6082,12 +6107,12 @@ static bool tcp_syn_flood_action(struct sock *sk, #endif NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPREQQFULLDROP); - lopt = inet_csk(sk)->icsk_accept_queue.listen_opt; - if (!lopt->synflood_warned && sysctl_tcp_syncookies != 2) { - lopt->synflood_warned = 1; + if (!queue->synflood_warned && + sysctl_tcp_syncookies != 2 && + xchg(&queue->synflood_warned, 1) == 0) pr_info("%s: Possible SYN flooding on port %d. %s. Check SNMP counters.\n", proto, ntohs(tcp_hdr(skb)->dest), msg); - } + return want_cookie; } @@ -6112,16 +6137,15 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, const struct tcp_request_sock_ops *af_ops, struct sock *sk, struct sk_buff *skb) { + struct tcp_fastopen_cookie foc = { .len = -1 }; + __u32 isn = TCP_SKB_CB(skb)->tcp_tw_isn; struct tcp_options_received tmp_opt; - struct request_sock *req; struct tcp_sock *tp = tcp_sk(sk); + struct sock *fastopen_sk = NULL; struct dst_entry *dst = NULL; - __u32 isn = TCP_SKB_CB(skb)->tcp_tw_isn; - bool want_cookie = false, fastopen; + struct request_sock *req; + bool want_cookie = false; struct flowi fl; - struct tcp_fastopen_cookie foc = { .len = -1 }; - int err; - /* TW buckets are converted to open requests without * limitations, they conserve resources and peer is @@ -6145,7 +6169,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, goto drop; } - req = inet_reqsk_alloc(rsk_ops, sk); + req = inet_reqsk_alloc(rsk_ops, sk, !want_cookie); if (!req) goto drop; @@ -6228,20 +6252,30 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, } tcp_rsk(req)->snt_isn = isn; + tcp_rsk(req)->txhash = net_tx_rndhash(); tcp_openreq_init_rwin(req, sk, dst); - fastopen = !want_cookie && - tcp_try_fastopen(sk, skb, req, &foc, dst); - err = af_ops->send_synack(sk, dst, &fl, req, - skb_get_queue_mapping(skb), &foc); - if (!fastopen) { - if (err || want_cookie) - goto drop_and_free; - + if (!want_cookie) { + tcp_reqsk_record_syn(sk, req, skb); + fastopen_sk = tcp_try_fastopen(sk, skb, req, &foc, dst); + } + if (fastopen_sk) { + af_ops->send_synack(fastopen_sk, dst, &fl, req, + &foc, false); + /* Add the child socket directly into the accept queue */ + inet_csk_reqsk_queue_add(sk, req, fastopen_sk); + sk->sk_data_ready(sk); + bh_unlock_sock(fastopen_sk); + sock_put(fastopen_sk); + } else { tcp_rsk(req)->tfo_listener = false; - af_ops->queue_hash_add(sk, req, TCP_TIMEOUT_INIT); + if (!want_cookie) + inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT); + af_ops->send_synack(sk, dst, &fl, req, + &foc, !want_cookie); + if (want_cookie) + goto drop_and_free; } - tcp_reqsk_record_syn(sk, req, skb); - + reqsk_put(req); return 0; drop_and_release: diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 93898e093d4e..1c2648bbac4b 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -324,7 +324,6 @@ void tcp_req_err(struct sock *sk, u32 seq) if (seq != tcp_rsk(req)->snt_isn) { NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS); - reqsk_put(req); } else { /* * Still in SYN_RECV, just remove it silently. @@ -332,9 +331,10 @@ void tcp_req_err(struct sock *sk, u32 seq) * created socket, and POSIX does not want network * errors returned from accept(). */ - NET_INC_STATS_BH(net, LINUX_MIB_LISTENDROPS); inet_csk_reqsk_queue_drop(req->rsk_listener, req); + NET_INC_STATS_BH(net, LINUX_MIB_LISTENDROPS); } + reqsk_put(req); } EXPORT_SYMBOL(tcp_req_err); @@ -576,7 +576,7 @@ EXPORT_SYMBOL(tcp_v4_send_check); * Exception: precedence violation. We do not implement it in any case. */ -static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb) +static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb) { const struct tcphdr *th = tcp_hdr(skb); struct { @@ -795,7 +795,7 @@ static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb) inet_twsk_put(tw); } -static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb, +static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb, struct request_sock *req) { /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV @@ -803,7 +803,7 @@ static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb, */ tcp_v4_send_ack(skb, (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 : tcp_sk(sk)->snd_nxt, - tcp_rsk(req)->rcv_nxt, req->rcv_wnd, + tcp_rsk(req)->rcv_nxt, req->rsk_rcv_wnd, tcp_time_stamp, req->ts_recent, 0, @@ -818,11 +818,11 @@ static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb, * This still operates on a request_sock only, not on a big * socket. */ -static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst, +static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst, struct flowi *fl, struct request_sock *req, - u16 queue_mapping, - struct tcp_fastopen_cookie *foc) + struct tcp_fastopen_cookie *foc, + bool attach_req) { const struct inet_request_sock *ireq = inet_rsk(req); struct flowi4 fl4; @@ -833,12 +833,11 @@ static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst, if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL) return -1; - skb = tcp_make_synack(sk, dst, req, foc); + skb = tcp_make_synack(sk, dst, req, foc, attach_req); if (skb) { __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr); - skb_set_queue_mapping(skb, queue_mapping); err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr, ireq->ir_rmt_addr, ireq->opt); @@ -865,7 +864,7 @@ static void tcp_v4_reqsk_destructor(struct request_sock *req) */ /* Find the Key structure for an address. */ -struct tcp_md5sig_key *tcp_md5_do_lookup(struct sock *sk, +struct tcp_md5sig_key *tcp_md5_do_lookup(const struct sock *sk, const union tcp_md5_addr *addr, int family) { @@ -877,7 +876,7 @@ struct tcp_md5sig_key *tcp_md5_do_lookup(struct sock *sk, /* caller either holds rcu_read_lock() or socket lock */ md5sig = rcu_dereference_check(tp->md5sig_info, sock_owned_by_user(sk) || - lockdep_is_held(&sk->sk_lock.slock)); + lockdep_is_held((spinlock_t *)&sk->sk_lock.slock)); if (!md5sig) return NULL; #if IS_ENABLED(CONFIG_IPV6) @@ -894,7 +893,7 @@ struct tcp_md5sig_key *tcp_md5_do_lookup(struct sock *sk, } EXPORT_SYMBOL(tcp_md5_do_lookup); -struct tcp_md5sig_key *tcp_v4_md5_lookup(struct sock *sk, +struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk, const struct sock *addr_sk) { const union tcp_md5_addr *addr; @@ -1112,10 +1111,13 @@ clear_hash_noput: } EXPORT_SYMBOL(tcp_v4_md5_hash_skb); +#endif + /* Called with rcu_read_lock() */ -static bool tcp_v4_inbound_md5_hash(struct sock *sk, +static bool tcp_v4_inbound_md5_hash(const struct sock *sk, const struct sk_buff *skb) { +#ifdef CONFIG_TCP_MD5SIG /* * This gets called for each TCP segment that arrives * so we want to be efficient. @@ -1165,10 +1167,12 @@ static bool tcp_v4_inbound_md5_hash(struct sock *sk, return true; } return false; -} #endif + return false; +} -static void tcp_v4_init_req(struct request_sock *req, struct sock *sk_listener, +static void tcp_v4_init_req(struct request_sock *req, + const struct sock *sk_listener, struct sk_buff *skb) { struct inet_request_sock *ireq = inet_rsk(req); @@ -1179,7 +1183,8 @@ static void tcp_v4_init_req(struct request_sock *req, struct sock *sk_listener, ireq->opt = tcp_v4_save_options(skb); } -static struct dst_entry *tcp_v4_route_req(struct sock *sk, struct flowi *fl, +static struct dst_entry *tcp_v4_route_req(const struct sock *sk, + struct flowi *fl, const struct request_sock *req, bool *strict) { @@ -1218,7 +1223,6 @@ static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = { .route_req = tcp_v4_route_req, .init_seq = tcp_v4_init_sequence, .send_synack = tcp_v4_send_synack, - .queue_hash_add = inet_csk_reqsk_queue_hash_add, }; int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) @@ -1241,9 +1245,11 @@ EXPORT_SYMBOL(tcp_v4_conn_request); * The three way handshake has completed - we got a valid synack - * now create the new socket. */ -struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb, +struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb, struct request_sock *req, - struct dst_entry *dst) + struct dst_entry *dst, + struct request_sock *req_unhash, + bool *own_req) { struct inet_request_sock *ireq; struct inet_sock *newinet; @@ -1277,7 +1283,6 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb, newinet->mc_ttl = ip_hdr(skb)->ttl; newinet->rcv_tos = ip_hdr(skb)->tos; inet_csk(newsk)->icsk_ext_hdr_len = 0; - sk_set_txhash(newsk); if (inet_opt) inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen; newinet->inet_id = newtp->write_seq ^ jiffies; @@ -1320,7 +1325,7 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb, if (__inet_inherit_port(sk, newsk) < 0) goto put_and_exit; - __inet_hash_nolisten(newsk, NULL); + *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash)); return newsk; @@ -1338,34 +1343,11 @@ put_and_exit: } EXPORT_SYMBOL(tcp_v4_syn_recv_sock); -static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb) +static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb) { +#ifdef CONFIG_SYN_COOKIES const struct tcphdr *th = tcp_hdr(skb); - const struct iphdr *iph = ip_hdr(skb); - struct request_sock *req; - struct sock *nsk; - - req = inet_csk_search_req(sk, th->source, iph->saddr, iph->daddr); - if (req) { - nsk = tcp_check_req(sk, skb, req, false); - if (!nsk || nsk == sk) - reqsk_put(req); - return nsk; - } - - nsk = inet_lookup_established(sock_net(sk), &tcp_hashinfo, iph->saddr, - th->source, iph->daddr, th->dest, inet_iif(skb)); - - if (nsk) { - if (nsk->sk_state != TCP_TIME_WAIT) { - bh_lock_sock(nsk); - return nsk; - } - inet_twsk_put(inet_twsk(nsk)); - return NULL; - } -#ifdef CONFIG_SYN_COOKIES if (!th->syn) sk = cookie_v4_check(sk, skb); #endif @@ -1373,7 +1355,7 @@ static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb) } /* The socket must have it's spinlock held when we get - * here. + * here, unless it is a TCP_LISTEN socket. * * We have a potential double-lock case here, so even when * doing backlog processing we use the BH locking scheme. @@ -1404,13 +1386,13 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb) goto csum_err; if (sk->sk_state == TCP_LISTEN) { - struct sock *nsk = tcp_v4_hnd_req(sk, skb); + struct sock *nsk = tcp_v4_cookie_check(sk, skb); + if (!nsk) goto discard; - if (nsk != sk) { sock_rps_save_rxhash(nsk, skb); - sk_mark_napi_id(sk, skb); + sk_mark_napi_id(nsk, skb); if (tcp_child_process(sk, nsk, skb)) { rsk = nsk; goto reset; @@ -1420,7 +1402,7 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb) } else sock_rps_save_rxhash(sk, skb); - if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb->len)) { + if (tcp_rcv_state_process(sk, skb)) { rsk = sk; goto reset; } @@ -1590,6 +1572,7 @@ int tcp_v4_rcv(struct sk_buff *skb) TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph); TCP_SKB_CB(skb)->sacked = 0; +lookup: sk = __inet_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest); if (!sk) goto no_tcp_socket; @@ -1598,6 +1581,33 @@ process: if (sk->sk_state == TCP_TIME_WAIT) goto do_time_wait; + if (sk->sk_state == TCP_NEW_SYN_RECV) { + struct request_sock *req = inet_reqsk(sk); + struct sock *nsk = NULL; + + sk = req->rsk_listener; + if (tcp_v4_inbound_md5_hash(sk, skb)) + goto discard_and_relse; + if (likely(sk->sk_state == TCP_LISTEN)) { + nsk = tcp_check_req(sk, skb, req, false); + } else { + inet_csk_reqsk_queue_drop_and_put(sk, req); + goto lookup; + } + if (!nsk) { + reqsk_put(req); + goto discard_it; + } + if (nsk == sk) { + sock_hold(sk); + reqsk_put(req); + } else if (tcp_child_process(sk, nsk, skb)) { + tcp_v4_send_reset(nsk, skb); + goto discard_it; + } else { + return 0; + } + } if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) { NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP); goto discard_and_relse; @@ -1606,25 +1616,23 @@ process: if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) goto discard_and_relse; -#ifdef CONFIG_TCP_MD5SIG - /* - * We really want to reject the packet as early as possible - * if: - * o We're expecting an MD5'd packet and this is no MD5 tcp option - * o There is an MD5 option and we're not expecting one - */ if (tcp_v4_inbound_md5_hash(sk, skb)) goto discard_and_relse; -#endif nf_reset(skb); if (sk_filter(sk, skb)) goto discard_and_relse; - sk_incoming_cpu_update(sk); skb->dev = NULL; + if (sk->sk_state == TCP_LISTEN) { + ret = tcp_v4_do_rcv(sk, skb); + goto put_and_return; + } + + sk_incoming_cpu_update(sk); + bh_lock_sock_nested(sk); tcp_sk(sk)->segs_in += max_t(u16, 1, skb_shinfo(skb)->gso_segs); ret = 0; @@ -1639,6 +1647,7 @@ process: } bh_unlock_sock(sk); +put_and_return: sock_put(sk); return ret; @@ -1833,35 +1842,7 @@ static void *listening_get_next(struct seq_file *seq, void *cur) ++st->num; ++st->offset; - if (st->state == TCP_SEQ_STATE_OPENREQ) { - struct request_sock *req = cur; - - icsk = inet_csk(st->syn_wait_sk); - req = req->dl_next; - while (1) { - while (req) { - if (req->rsk_ops->family == st->family) { - cur = req; - goto out; - } - req = req->dl_next; - } - if (++st->sbucket >= icsk->icsk_accept_queue.listen_opt->nr_table_entries) - break; -get_req: - req = icsk->icsk_accept_queue.listen_opt->syn_table[st->sbucket]; - } - sk = sk_nulls_next(st->syn_wait_sk); - st->state = TCP_SEQ_STATE_LISTENING; - spin_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock); - } else { - icsk = inet_csk(sk); - spin_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock); - if (reqsk_queue_len(&icsk->icsk_accept_queue)) - goto start_req; - spin_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock); - sk = sk_nulls_next(sk); - } + sk = sk_nulls_next(sk); get_sk: sk_nulls_for_each_from(sk, node) { if (!net_eq(sock_net(sk), net)) @@ -1871,16 +1852,6 @@ get_sk: goto out; } icsk = inet_csk(sk); - spin_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock); - if (reqsk_queue_len(&icsk->icsk_accept_queue)) { -start_req: - st->uid = sock_i_uid(sk); - st->syn_wait_sk = sk; - st->state = TCP_SEQ_STATE_OPENREQ; - st->sbucket = 0; - goto get_req; - } - spin_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock); } spin_unlock_bh(&ilb->lock); st->offset = 0; @@ -2012,7 +1983,6 @@ static void *tcp_seek_last_pos(struct seq_file *seq) void *rc = NULL; switch (st->state) { - case TCP_SEQ_STATE_OPENREQ: case TCP_SEQ_STATE_LISTENING: if (st->bucket >= INET_LHTABLE_SIZE) break; @@ -2071,7 +2041,6 @@ static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos) } switch (st->state) { - case TCP_SEQ_STATE_OPENREQ: case TCP_SEQ_STATE_LISTENING: rc = listening_get_next(seq, v); if (!rc) { @@ -2096,11 +2065,6 @@ static void tcp_seq_stop(struct seq_file *seq, void *v) struct tcp_iter_state *st = seq->private; switch (st->state) { - case TCP_SEQ_STATE_OPENREQ: - if (v) { - struct inet_connection_sock *icsk = inet_csk(st->syn_wait_sk); - spin_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock); - } case TCP_SEQ_STATE_LISTENING: if (v != SEQ_START_TOKEN) spin_unlock_bh(&tcp_hashinfo.listening_hash[st->bucket].lock); @@ -2154,7 +2118,7 @@ void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo) EXPORT_SYMBOL(tcp_proc_unregister); static void get_openreq4(const struct request_sock *req, - struct seq_file *f, int i, kuid_t uid) + struct seq_file *f, int i) { const struct inet_request_sock *ireq = inet_rsk(req); long delta = req->rsk_timer.expires - jiffies; @@ -2171,7 +2135,8 @@ static void get_openreq4(const struct request_sock *req, 1, /* timers active (only the expire timer) */ jiffies_delta_to_clock_t(delta), req->num_timeout, - from_kuid_munged(seq_user_ns(f), uid), + from_kuid_munged(seq_user_ns(f), + sock_i_uid(req->rsk_listener)), 0, /* non standard timer */ 0, /* open_requests have no inode */ 0, @@ -2185,7 +2150,7 @@ static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i) const struct tcp_sock *tp = tcp_sk(sk); const struct inet_connection_sock *icsk = inet_csk(sk); const struct inet_sock *inet = inet_sk(sk); - struct fastopen_queue *fastopenq = icsk->icsk_accept_queue.fastopenq; + const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq; __be32 dest = inet->inet_daddr; __be32 src = inet->inet_rcv_saddr; __u16 destp = ntohs(inet->inet_dport); @@ -2272,18 +2237,12 @@ static int tcp4_seq_show(struct seq_file *seq, void *v) } st = seq->private; - switch (st->state) { - case TCP_SEQ_STATE_LISTENING: - case TCP_SEQ_STATE_ESTABLISHED: - if (sk->sk_state == TCP_TIME_WAIT) - get_timewait4_sock(v, seq, st->num); - else - get_tcp4_sock(v, seq, st->num); - break; - case TCP_SEQ_STATE_OPENREQ: - get_openreq4(v, seq, st->num, st->uid); - break; - } + if (sk->sk_state == TCP_TIME_WAIT) + get_timewait4_sock(v, seq, st->num); + else if (sk->sk_state == TCP_NEW_SYN_RECV) + get_openreq4(v, seq, st->num); + else + get_tcp4_sock(v, seq, st->num); out: seq_pad(seq, '\n'); return 0; diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index def765911ff8..3575dd1e5b67 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -361,30 +361,38 @@ void tcp_twsk_destructor(struct sock *sk) } EXPORT_SYMBOL_GPL(tcp_twsk_destructor); +/* Warning : This function is called without sk_listener being locked. + * Be sure to read socket fields once, as their value could change under us. + */ void tcp_openreq_init_rwin(struct request_sock *req, - struct sock *sk, struct dst_entry *dst) + const struct sock *sk_listener, + const struct dst_entry *dst) { struct inet_request_sock *ireq = inet_rsk(req); - struct tcp_sock *tp = tcp_sk(sk); - __u8 rcv_wscale; + const struct tcp_sock *tp = tcp_sk(sk_listener); + u16 user_mss = READ_ONCE(tp->rx_opt.user_mss); + int full_space = tcp_full_space(sk_listener); int mss = dst_metric_advmss(dst); + u32 window_clamp; + __u8 rcv_wscale; - if (tp->rx_opt.user_mss && tp->rx_opt.user_mss < mss) - mss = tp->rx_opt.user_mss; + if (user_mss && user_mss < mss) + mss = user_mss; + window_clamp = READ_ONCE(tp->window_clamp); /* Set this up on the first call only */ - req->window_clamp = tp->window_clamp ? : dst_metric(dst, RTAX_WINDOW); + req->rsk_window_clamp = window_clamp ? : dst_metric(dst, RTAX_WINDOW); /* limit the window selection if the user enforce a smaller rx buffer */ - if (sk->sk_userlocks & SOCK_RCVBUF_LOCK && - (req->window_clamp > tcp_full_space(sk) || req->window_clamp == 0)) - req->window_clamp = tcp_full_space(sk); + if (sk_listener->sk_userlocks & SOCK_RCVBUF_LOCK && + (req->rsk_window_clamp > full_space || req->rsk_window_clamp == 0)) + req->rsk_window_clamp = full_space; /* tcp_full_space because it is guaranteed to be the first packet */ - tcp_select_initial_window(tcp_full_space(sk), + tcp_select_initial_window(full_space, mss - (ireq->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0), - &req->rcv_wnd, - &req->window_clamp, + &req->rsk_rcv_wnd, + &req->rsk_window_clamp, ireq->wscale_ok, &rcv_wscale, dst_metric(dst, RTAX_INITRWND)); @@ -433,7 +441,9 @@ EXPORT_SYMBOL_GPL(tcp_ca_openreq_child); * Actually, we could lots of memory writes here. tp of listening * socket contains all necessary default parameters. */ -struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, struct sk_buff *skb) +struct sock *tcp_create_openreq_child(const struct sock *sk, + struct request_sock *req, + struct sk_buff *skb) { struct sock *newsk = inet_csk_clone_lock(sk, req, GFP_ATOMIC); @@ -460,6 +470,7 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, newtp->srtt_us = 0; newtp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT); + newtp->rtt_min[0].rtt = ~0U; newicsk->icsk_rto = TCP_TIMEOUT_INIT; newtp->packets_out = 0; @@ -469,7 +480,8 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, newtp->snd_ssthresh = TCP_INFINITE_SSTHRESH; tcp_enable_early_retrans(newtp); newtp->tlp_high_seq = 0; - newtp->lsndtime = treq->snt_synack; + newtp->lsndtime = treq->snt_synack.stamp_jiffies; + newsk->sk_txhash = treq->txhash; newtp->last_oow_ack_time = 0; newtp->total_retrans = req->num_retrans; @@ -501,9 +513,9 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, if (sysctl_tcp_fack) tcp_enable_fack(newtp); } - newtp->window_clamp = req->window_clamp; - newtp->rcv_ssthresh = req->rcv_wnd; - newtp->rcv_wnd = req->rcv_wnd; + newtp->window_clamp = req->rsk_window_clamp; + newtp->rcv_ssthresh = req->rsk_rcv_wnd; + newtp->rcv_wnd = req->rsk_rcv_wnd; newtp->rx_opt.wscale_ok = ireq->wscale_ok; if (newtp->rx_opt.wscale_ok) { newtp->rx_opt.snd_wscale = ireq->snd_wscale; @@ -536,6 +548,8 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, tcp_ecn_openreq_child(newtp, req); newtp->fastopen_rsk = NULL; newtp->syn_data_acked = 0; + newtp->rack.mstamp.v64 = 0; + newtp->rack.advanced = 0; newtp->saved_syn = req->saved_syn; req->saved_syn = NULL; @@ -566,8 +580,7 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, const struct tcphdr *th = tcp_hdr(skb); __be32 flg = tcp_flag_word(th) & (TCP_FLAG_RST|TCP_FLAG_SYN|TCP_FLAG_ACK); bool paws_reject = false; - - BUG_ON(fastopen == (sk->sk_state == TCP_LISTEN)); + bool own_req; tmp_opt.saw_tstamp = 0; if (th->doff > (sizeof(struct tcphdr)>>2)) { @@ -698,7 +711,7 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, /* RFC793: "first check sequence number". */ if (paws_reject || !tcp_in_window(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq, - tcp_rsk(req)->rcv_nxt, tcp_rsk(req)->rcv_nxt + req->rcv_wnd)) { + tcp_rsk(req)->rcv_nxt, tcp_rsk(req)->rcv_nxt + req->rsk_rcv_wnd)) { /* Out of window: send ACK and drop. */ if (!(flg & TCP_FLAG_RST)) req->rsk_ops->send_ack(sk, skb, req); @@ -755,16 +768,14 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, * ESTABLISHED STATE. If it will be dropped after * socket is created, wait for troubles. */ - child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL); + child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL, + req, &own_req); if (!child) goto listen_overflow; - inet_csk_reqsk_queue_drop(sk, req); - inet_csk_reqsk_queue_add(sk, req, child); - /* Warning: caller must not call reqsk_put(req); - * child stole last reference on it. - */ - return child; + sock_rps_save_rxhash(child, skb); + tcp_synack_rtt_meas(child, req); + return inet_csk_complete_hashdance(sk, child, req, own_req); listen_overflow: if (!sysctl_tcp_abort_on_overflow) { @@ -811,8 +822,7 @@ int tcp_child_process(struct sock *parent, struct sock *child, int state = child->sk_state; if (!sock_owned_by_user(child)) { - ret = tcp_rcv_state_process(child, skb, tcp_hdr(skb), - skb->len); + ret = tcp_rcv_state_process(child, skb); /* Wakeup parent, send SIGIO */ if (state == TCP_SYN_RECV && child->sk_state != state) parent->sk_data_ready(parent); diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 3dbee0d83b15..cb7ca569052c 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -357,14 +357,10 @@ static void tcp_ecn_clear_syn(struct sock *sk, struct sk_buff *skb) } static void -tcp_ecn_make_synack(const struct request_sock *req, struct tcphdr *th, - struct sock *sk) +tcp_ecn_make_synack(const struct request_sock *req, struct tcphdr *th) { - if (inet_rsk(req)->ecn_ok) { + if (inet_rsk(req)->ecn_ok) th->ece = 1; - if (tcp_ca_needs_ecn(sk)) - INET_ECN_xmit(sk); - } } /* Set up ECN state for a packet on a ESTABLISHED socket that is about to @@ -612,12 +608,11 @@ static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb, } /* Set up TCP options for SYN-ACKs. */ -static unsigned int tcp_synack_options(struct sock *sk, - struct request_sock *req, - unsigned int mss, struct sk_buff *skb, - struct tcp_out_options *opts, - const struct tcp_md5sig_key *md5, - struct tcp_fastopen_cookie *foc) +static unsigned int tcp_synack_options(struct request_sock *req, + unsigned int mss, struct sk_buff *skb, + struct tcp_out_options *opts, + const struct tcp_md5sig_key *md5, + struct tcp_fastopen_cookie *foc) { struct inet_request_sock *ireq = inet_rsk(req); unsigned int remaining = MAX_TCP_OPTION_SPACE; @@ -1827,7 +1822,7 @@ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb, /* Ok, it looks like it is advisable to defer. */ - if (cong_win < send_win && cong_win < skb->len) + if (cong_win < send_win && cong_win <= skb->len) *is_cwnd_limited = true; return true; @@ -2060,7 +2055,6 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, cwnd_quota = tcp_cwnd_test(tp, skb); if (!cwnd_quota) { - is_cwnd_limited = true; if (push_one == 2) /* Force out a loss probe pkt. */ cwnd_quota = 1; @@ -2142,6 +2136,7 @@ repair: /* Send one loss probe per tail loss episode. */ if (push_one != 2) tcp_schedule_loss_probe(sk); + is_cwnd_limited |= (tcp_packets_in_flight(tp) >= tp->snd_cwnd); tcp_cwnd_validate(sk, is_cwnd_limited); return false; } @@ -2165,7 +2160,7 @@ bool tcp_schedule_loss_probe(struct sock *sk) /* Don't do any loss probe on a Fast Open connection before 3WHS * finishes. */ - if (sk->sk_state == TCP_SYN_RECV) + if (tp->fastopen_rsk) return false; /* TLP is only scheduled when next timer event is RTO. */ @@ -2175,7 +2170,7 @@ bool tcp_schedule_loss_probe(struct sock *sk) /* Schedule a loss probe in 2*RTT for SACK capable connections * in Open state, that are either limited by cwnd or application. */ - if (sysctl_tcp_early_retrans < 3 || !tp->srtt_us || !tp->packets_out || + if (sysctl_tcp_early_retrans < 3 || !tp->packets_out || !tcp_is_sack(tp) || inet_csk(sk)->icsk_ca_state != TCP_CA_Open) return false; @@ -2184,9 +2179,10 @@ bool tcp_schedule_loss_probe(struct sock *sk) return false; /* Probe timeout is at least 1.5*rtt + TCP_DELACK_MAX to account - * for delayed ack when there's one outstanding packet. + * for delayed ack when there's one outstanding packet. If no RTT + * sample is available then probe after TCP_TIMEOUT_INIT. */ - timeout = rtt << 1; + timeout = rtt << 1 ? : TCP_TIMEOUT_INIT; if (tp->packets_out == 1) timeout = max_t(u32, timeout, (rtt + (rtt >> 1) + TCP_DELACK_MAX)); @@ -2659,8 +2655,6 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) net_dbg_ratelimited("retrans_out leaked\n"); } #endif - if (!tp->retrans_out) - tp->lost_retrans_low = tp->snd_nxt; TCP_SKB_CB(skb)->sacked |= TCPCB_RETRANS; tp->retrans_out += tcp_skb_pcount(skb); @@ -2668,10 +2662,6 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) if (!tp->retrans_stamp) tp->retrans_stamp = tcp_skb_timestamp(skb); - /* snd_nxt is stored to detect loss of retransmitted segment, - * see tcp_input.c tcp_sacktag_write_queue(). - */ - TCP_SKB_CB(skb)->ack_seq = tp->snd_nxt; } else if (err != -EBUSY) { NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPRETRANSFAIL); } @@ -2949,20 +2939,22 @@ int tcp_send_synack(struct sock *sk) * Allocate one skb and build a SYNACK packet. * @dst is consumed : Caller should not use it again. */ -struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst, +struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst, struct request_sock *req, - struct tcp_fastopen_cookie *foc) + struct tcp_fastopen_cookie *foc, + bool attach_req) { - struct tcp_out_options opts; struct inet_request_sock *ireq = inet_rsk(req); - struct tcp_sock *tp = tcp_sk(sk); - struct tcphdr *th; - struct sk_buff *skb; + const struct tcp_sock *tp = tcp_sk(sk); struct tcp_md5sig_key *md5 = NULL; + struct tcp_out_options opts; + struct sk_buff *skb; int tcp_header_size; + struct tcphdr *th; + u16 user_mss; int mss; - skb = sock_wmalloc(sk, MAX_TCP_HEADER, 1, GFP_ATOMIC); + skb = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC); if (unlikely(!skb)) { dst_release(dst); return NULL; @@ -2970,11 +2962,21 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst, /* Reserve space for headers. */ skb_reserve(skb, MAX_TCP_HEADER); + if (attach_req) { + skb_set_owner_w(skb, req_to_sk(req)); + } else { + /* sk is a const pointer, because we want to express multiple + * cpu might call us concurrently. + * sk->sk_wmem_alloc in an atomic, we can promote to rw. + */ + skb_set_owner_w(skb, (struct sock *)sk); + } skb_dst_set(skb, dst); mss = dst_metric_advmss(dst); - if (tp->rx_opt.user_mss && tp->rx_opt.user_mss < mss) - mss = tp->rx_opt.user_mss; + user_mss = READ_ONCE(tp->rx_opt.user_mss); + if (user_mss && user_mss < mss) + mss = user_mss; memset(&opts, 0, sizeof(opts)); #ifdef CONFIG_SYN_COOKIES @@ -2988,8 +2990,9 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst, rcu_read_lock(); md5 = tcp_rsk(req)->af_specific->req_md5_lookup(sk, req_to_sk(req)); #endif - tcp_header_size = tcp_synack_options(sk, req, mss, skb, &opts, md5, - foc) + sizeof(*th); + skb_set_hash(skb, tcp_rsk(req)->txhash, PKT_HASH_TYPE_L4); + tcp_header_size = tcp_synack_options(req, mss, skb, &opts, md5, foc) + + sizeof(*th); skb_push(skb, tcp_header_size); skb_reset_transport_header(skb); @@ -2998,7 +3001,7 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst, memset(th, 0, sizeof(struct tcphdr)); th->syn = 1; th->ack = 1; - tcp_ecn_make_synack(req, th, sk); + tcp_ecn_make_synack(req, th); th->source = htons(ireq->ir_num); th->dest = ireq->ir_rmt_port; /* Setting of flags are superfluous here for callers (and ECE is @@ -3012,8 +3015,8 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst, th->ack_seq = htonl(tcp_rsk(req)->rcv_nxt); /* RFC1323: The window in SYN & SYN/ACK segments is never scaled. */ - th->window = htons(min(req->rcv_wnd, 65535U)); - tcp_options_write((__be32 *)(th + 1), tp, &opts); + th->window = htons(min(req->rsk_rcv_wnd, 65535U)); + tcp_options_write((__be32 *)(th + 1), NULL, &opts); th->doff = (tcp_header_size >> 2); TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_OUTSEGS); @@ -3500,13 +3503,14 @@ void tcp_send_probe0(struct sock *sk) TCP_RTO_MAX); } -int tcp_rtx_synack(struct sock *sk, struct request_sock *req) +int tcp_rtx_synack(const struct sock *sk, struct request_sock *req) { const struct tcp_request_sock_ops *af_ops = tcp_rsk(req)->af_specific; struct flowi fl; int res; - res = af_ops->send_synack(sk, NULL, &fl, req, 0, NULL); + tcp_rsk(req)->txhash = net_tx_rndhash(); + res = af_ops->send_synack(sk, NULL, &fl, req, NULL, true); if (!res) { TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS); NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSYNRETRANS); diff --git a/net/ipv4/tcp_recovery.c b/net/ipv4/tcp_recovery.c new file mode 100644 index 000000000000..5353085fd0b2 --- /dev/null +++ b/net/ipv4/tcp_recovery.c @@ -0,0 +1,109 @@ +#include <linux/tcp.h> +#include <net/tcp.h> + +int sysctl_tcp_recovery __read_mostly = TCP_RACK_LOST_RETRANS; + +/* Marks a packet lost, if some packet sent later has been (s)acked. + * The underlying idea is similar to the traditional dupthresh and FACK + * but they look at different metrics: + * + * dupthresh: 3 OOO packets delivered (packet count) + * FACK: sequence delta to highest sacked sequence (sequence space) + * RACK: sent time delta to the latest delivered packet (time domain) + * + * The advantage of RACK is it applies to both original and retransmitted + * packet and therefore is robust against tail losses. Another advantage + * is being more resilient to reordering by simply allowing some + * "settling delay", instead of tweaking the dupthresh. + * + * The current version is only used after recovery starts but can be + * easily extended to detect the first loss. + */ +int tcp_rack_mark_lost(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct sk_buff *skb; + u32 reo_wnd, prior_retrans = tp->retrans_out; + + if (inet_csk(sk)->icsk_ca_state < TCP_CA_Recovery || !tp->rack.advanced) + return 0; + + /* Reset the advanced flag to avoid unnecessary queue scanning */ + tp->rack.advanced = 0; + + /* To be more reordering resilient, allow min_rtt/4 settling delay + * (lower-bounded to 1000uS). We use min_rtt instead of the smoothed + * RTT because reordering is often a path property and less related + * to queuing or delayed ACKs. + * + * TODO: measure and adapt to the observed reordering delay, and + * use a timer to retransmit like the delayed early retransmit. + */ + reo_wnd = 1000; + if (tp->rack.reord && tcp_min_rtt(tp) != ~0U) + reo_wnd = max(tcp_min_rtt(tp) >> 2, reo_wnd); + + tcp_for_write_queue(skb, sk) { + struct tcp_skb_cb *scb = TCP_SKB_CB(skb); + + if (skb == tcp_send_head(sk)) + break; + + /* Skip ones already (s)acked */ + if (!after(scb->end_seq, tp->snd_una) || + scb->sacked & TCPCB_SACKED_ACKED) + continue; + + if (skb_mstamp_after(&tp->rack.mstamp, &skb->skb_mstamp)) { + + if (skb_mstamp_us_delta(&tp->rack.mstamp, + &skb->skb_mstamp) <= reo_wnd) + continue; + + /* skb is lost if packet sent later is sacked */ + tcp_skb_mark_lost_uncond_verify(tp, skb); + if (scb->sacked & TCPCB_SACKED_RETRANS) { + scb->sacked &= ~TCPCB_SACKED_RETRANS; + tp->retrans_out -= tcp_skb_pcount(skb); + NET_INC_STATS_BH(sock_net(sk), + LINUX_MIB_TCPLOSTRETRANSMIT); + } + } else if (!(scb->sacked & TCPCB_RETRANS)) { + /* Original data are sent sequentially so stop early + * b/c the rest are all sent after rack_sent + */ + break; + } + } + return prior_retrans - tp->retrans_out; +} + +/* Record the most recently (re)sent time among the (s)acked packets */ +void tcp_rack_advance(struct tcp_sock *tp, + const struct skb_mstamp *xmit_time, u8 sacked) +{ + if (tp->rack.mstamp.v64 && + !skb_mstamp_after(xmit_time, &tp->rack.mstamp)) + return; + + if (sacked & TCPCB_RETRANS) { + struct skb_mstamp now; + + /* If the sacked packet was retransmitted, it's ambiguous + * whether the retransmission or the original (or the prior + * retransmission) was sacked. + * + * If the original is lost, there is no ambiguity. Otherwise + * we assume the original can be delayed up to aRTT + min_rtt. + * the aRTT term is bounded by the fast recovery or timeout, + * so it's at least one RTT (i.e., retransmission is at least + * an RTT later). + */ + skb_mstamp_get(&now); + if (skb_mstamp_us_delta(&now, xmit_time) < tcp_min_rtt(tp)) + return; + } + + tp->rack.mstamp = *xmit_time; + tp->rack.advanced = 1; +} diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 7149ebc820c7..c9c716a483e4 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -83,7 +83,7 @@ static int tcp_out_of_resources(struct sock *sk, bool do_reset) } /* Calculate maximal number or retries on an orphaned socket. */ -static int tcp_orphan_retries(struct sock *sk, int alive) +static int tcp_orphan_retries(struct sock *sk, bool alive) { int retries = sysctl_tcp_orphan_retries; /* May be zero. */ @@ -184,7 +184,7 @@ static int tcp_write_timeout(struct sock *sk) retry_until = sysctl_tcp_retries2; if (sock_flag(sk, SOCK_DEAD)) { - const int alive = icsk->icsk_rto < TCP_RTO_MAX; + const bool alive = icsk->icsk_rto < TCP_RTO_MAX; retry_until = tcp_orphan_retries(sk, alive); do_reset = alive || @@ -298,7 +298,7 @@ static void tcp_probe_timer(struct sock *sk) max_probes = sysctl_tcp_retries2; if (sock_flag(sk, SOCK_DEAD)) { - const int alive = inet_csk_rto_backoff(icsk, TCP_RTO_MAX) < TCP_RTO_MAX; + const bool alive = inet_csk_rto_backoff(icsk, TCP_RTO_MAX) < TCP_RTO_MAX; max_probes = tcp_orphan_retries(sk, alive); if (!alive && icsk->icsk_backoff >= max_probes) diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index f7d1d5e19e95..24ec14f9825c 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -375,7 +375,8 @@ static inline int compute_score(struct sock *sk, struct net *net, return -1; score += 4; } - + if (sk->sk_incoming_cpu == raw_smp_processor_id()) + score++; return score; } @@ -419,6 +420,9 @@ static inline int compute_score2(struct sock *sk, struct net *net, score += 4; } + if (sk->sk_incoming_cpu == raw_smp_processor_id()) + score++; + return score; } @@ -1017,30 +1021,14 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) fl4 = &fl4_stack; - /* unconnected socket. If output device is enslaved to a VRF - * device lookup source address from VRF table. This mimics - * behavior of ip_route_connect{_init}. - */ - if (netif_index_is_vrf(net, ipc.oif)) { - flowi4_init_output(fl4, ipc.oif, sk->sk_mark, tos, - RT_SCOPE_UNIVERSE, sk->sk_protocol, - (flow_flags | FLOWI_FLAG_VRFSRC | - FLOWI_FLAG_SKIP_NH_OIF), - faddr, saddr, dport, - inet->inet_sport); - - rt = ip_route_output_flow(net, fl4, sk); - if (!IS_ERR(rt)) { - saddr = fl4->saddr; - ip_rt_put(rt); - } - } - flowi4_init_output(fl4, ipc.oif, sk->sk_mark, tos, RT_SCOPE_UNIVERSE, sk->sk_protocol, flow_flags, faddr, saddr, dport, inet->inet_sport); + if (!saddr && ipc.oif) + l3mdev_get_saddr(net, ipc.oif, fl4); + security_sk_classify_flow(sk, flowi4_to_flowi(fl4)); rt = ip_route_output_flow(net, fl4, sk); if (IS_ERR(rt)) { diff --git a/net/ipv4/xfrm4_input.c b/net/ipv4/xfrm4_input.c index 60b032f58ccc..62e1e72db461 100644 --- a/net/ipv4/xfrm4_input.c +++ b/net/ipv4/xfrm4_input.c @@ -22,7 +22,8 @@ int xfrm4_extract_input(struct xfrm_state *x, struct sk_buff *skb) return xfrm4_extract_header(skb); } -static inline int xfrm4_rcv_encap_finish(struct sock *sk, struct sk_buff *skb) +static inline int xfrm4_rcv_encap_finish(struct net *net, struct sock *sk, + struct sk_buff *skb) { if (!skb_dst(skb)) { const struct iphdr *iph = ip_hdr(skb); @@ -52,8 +53,8 @@ int xfrm4_transport_finish(struct sk_buff *skb, int async) iph->tot_len = htons(skb->len); ip_send_check(iph); - NF_HOOK(NFPROTO_IPV4, NF_INET_PRE_ROUTING, NULL, skb, - skb->dev, NULL, + NF_HOOK(NFPROTO_IPV4, NF_INET_PRE_ROUTING, + dev_net(skb->dev), NULL, skb, skb->dev, NULL, xfrm4_rcv_encap_finish); return 0; } diff --git a/net/ipv4/xfrm4_output.c b/net/ipv4/xfrm4_output.c index 41a261355662..7ee6518afa86 100644 --- a/net/ipv4/xfrm4_output.c +++ b/net/ipv4/xfrm4_output.c @@ -82,24 +82,25 @@ int xfrm4_output_finish(struct sock *sk, struct sk_buff *skb) return xfrm_output(sk, skb); } -static int __xfrm4_output(struct sock *sk, struct sk_buff *skb) +static int __xfrm4_output(struct net *net, struct sock *sk, struct sk_buff *skb) { struct xfrm_state *x = skb_dst(skb)->xfrm; #ifdef CONFIG_NETFILTER if (!x) { IPCB(skb)->flags |= IPSKB_REROUTED; - return dst_output_sk(sk, skb); + return dst_output(net, sk, skb); } #endif return x->outer_mode->afinfo->output_finish(sk, skb); } -int xfrm4_output(struct sock *sk, struct sk_buff *skb) +int xfrm4_output(struct net *net, struct sock *sk, struct sk_buff *skb) { - return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, sk, skb, - NULL, skb_dst(skb)->dev, __xfrm4_output, + return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, + net, sk, skb, NULL, skb_dst(skb)->dev, + __xfrm4_output, !(IPCB(skb)->flags & IPSKB_REROUTED)); } diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c index c10a9ee68433..1e0c3c835a63 100644 --- a/net/ipv4/xfrm4_policy.c +++ b/net/ipv4/xfrm4_policy.c @@ -15,7 +15,7 @@ #include <net/dst.h> #include <net/xfrm.h> #include <net/ip.h> -#include <net/vrf.h> +#include <net/l3mdev.h> static struct xfrm_policy_afinfo xfrm4_policy_afinfo; @@ -97,6 +97,7 @@ static int xfrm4_fill_dst(struct xfrm_dst *xdst, struct net_device *dev, xdst->u.rt.rt_gateway = rt->rt_gateway; xdst->u.rt.rt_uses_gateway = rt->rt_uses_gateway; xdst->u.rt.rt_pmtu = rt->rt_pmtu; + xdst->u.rt.rt_table_id = rt->rt_table_id; INIT_LIST_HEAD(&xdst->u.rt.rt_uncached); return 0; @@ -110,10 +111,8 @@ _decode_session4(struct sk_buff *skb, struct flowi *fl, int reverse) struct flowi4 *fl4 = &fl->u.ip4; int oif = 0; - if (skb_dst(skb)) { - oif = vrf_master_ifindex(skb_dst(skb)->dev) ? - : skb_dst(skb)->dev->ifindex; - } + if (skb_dst(skb)) + oif = l3mdev_fib_oif(skb_dst(skb)->dev); memset(fl4, 0, sizeof(struct flowi4)); fl4->flowi4_mark = skb->mark; @@ -128,7 +127,10 @@ _decode_session4(struct sk_buff *skb, struct flowi *fl, int reverse) case IPPROTO_DCCP: if (xprth + 4 < skb->data || pskb_may_pull(skb, xprth + 4 - skb->data)) { - __be16 *ports = (__be16 *)xprth; + __be16 *ports; + + xprth = skb_network_header(skb) + iph->ihl * 4; + ports = (__be16 *)xprth; fl4->fl4_sport = ports[!!reverse]; fl4->fl4_dport = ports[!reverse]; @@ -136,8 +138,12 @@ _decode_session4(struct sk_buff *skb, struct flowi *fl, int reverse) break; case IPPROTO_ICMP: - if (pskb_may_pull(skb, xprth + 2 - skb->data)) { - u8 *icmp = xprth; + if (xprth + 2 < skb->data || + pskb_may_pull(skb, xprth + 2 - skb->data)) { + u8 *icmp; + + xprth = skb_network_header(skb) + iph->ihl * 4; + icmp = xprth; fl4->fl4_icmp_type = icmp[0]; fl4->fl4_icmp_code = icmp[1]; @@ -145,33 +151,50 @@ _decode_session4(struct sk_buff *skb, struct flowi *fl, int reverse) break; case IPPROTO_ESP: - if (pskb_may_pull(skb, xprth + 4 - skb->data)) { - __be32 *ehdr = (__be32 *)xprth; + if (xprth + 4 < skb->data || + pskb_may_pull(skb, xprth + 4 - skb->data)) { + __be32 *ehdr; + + xprth = skb_network_header(skb) + iph->ihl * 4; + ehdr = (__be32 *)xprth; fl4->fl4_ipsec_spi = ehdr[0]; } break; case IPPROTO_AH: - if (pskb_may_pull(skb, xprth + 8 - skb->data)) { - __be32 *ah_hdr = (__be32 *)xprth; + if (xprth + 8 < skb->data || + pskb_may_pull(skb, xprth + 8 - skb->data)) { + __be32 *ah_hdr; + + xprth = skb_network_header(skb) + iph->ihl * 4; + ah_hdr = (__be32 *)xprth; fl4->fl4_ipsec_spi = ah_hdr[1]; } break; case IPPROTO_COMP: - if (pskb_may_pull(skb, xprth + 4 - skb->data)) { - __be16 *ipcomp_hdr = (__be16 *)xprth; + if (xprth + 4 < skb->data || + pskb_may_pull(skb, xprth + 4 - skb->data)) { + __be16 *ipcomp_hdr; + + xprth = skb_network_header(skb) + iph->ihl * 4; + ipcomp_hdr = (__be16 *)xprth; fl4->fl4_ipsec_spi = htonl(ntohs(ipcomp_hdr[1])); } break; case IPPROTO_GRE: - if (pskb_may_pull(skb, xprth + 12 - skb->data)) { - __be16 *greflags = (__be16 *)xprth; - __be32 *gre_hdr = (__be32 *)xprth; + if (xprth + 12 < skb->data || + pskb_may_pull(skb, xprth + 12 - skb->data)) { + __be16 *greflags; + __be32 *gre_hdr; + + xprth = skb_network_header(skb) + iph->ihl * 4; + greflags = (__be16 *)xprth; + gre_hdr = (__be32 *)xprth; if (greflags[0] & GRE_KEY) { if (greflags[0] & GRE_CSUM) @@ -245,7 +268,7 @@ static struct dst_ops xfrm4_dst_ops = { .destroy = xfrm4_dst_destroy, .ifdown = xfrm4_dst_ifdown, .local_out = __ip_local_out, - .gc_thresh = 32768, + .gc_thresh = INT_MAX, }; static struct xfrm_policy_afinfo xfrm4_policy_afinfo = { |