From ba6f5e33bdbb9ed2014b778fbbaecf20060ca989 Mon Sep 17 00:00:00 2001 From: Marcelo Ricardo Leitner Date: Wed, 6 Apr 2016 15:15:19 -0300 Subject: sctp: avoid refreshing heartbeat timer too often Currently on high rate SCTP streams the heartbeat timer refresh can consume quite a lot of resources as timer updates are costly and it contains a random factor, which a) is also costly and b) invalidates mod_timer() optimization for not editing a timer to the same value. It may even cause the timer to be slightly advanced, for no good reason. As suggested by David Laight this patch now removes this timer update from hot path by leaving the timer on and re-evaluating upon its expiration if the heartbeat is still needed or not, similarly to what is done for TCP. If it's not needed anymore the timer is re-scheduled to the new timeout, considering the time already elapsed. For this, we now record the last tx timestamp per transport, updated in the same spots as hb timer was restarted on tx. Also split up sctp_transport_reset_timers into sctp_transport_reset_t3_rtx and sctp_transport_reset_hb_timer, so we can re-arm T3 without re-arming the heartbeat one. On loopback with MTU of 65535 and data chunks with 1636, so that we have a considerable amount of chunks without stressing system calls, netperf -t SCTP_STREAM -l 30, perf looked like this before: Samples: 103K of event 'cpu-clock', Event count (approx.): 25833000000 Overhead Command Shared Object Symbol + 6,15% netperf [kernel.vmlinux] [k] copy_user_enhanced_fast_string - 5,43% netperf [kernel.vmlinux] [k] _raw_write_unlock_irqrestore - _raw_write_unlock_irqrestore - 96,54% _raw_spin_unlock_irqrestore - 36,14% mod_timer + 97,24% sctp_transport_reset_timers + 2,76% sctp_do_sm + 33,65% __wake_up_sync_key + 28,77% sctp_ulpq_tail_event + 1,40% del_timer - 1,84% mod_timer + 99,03% sctp_transport_reset_timers + 0,97% sctp_do_sm + 1,50% sctp_ulpq_tail_event And after this patch, now with netperf -l 60: Samples: 230K of event 'cpu-clock', Event count (approx.): 57707250000 Overhead Command Shared Object Symbol + 5,65% netperf [kernel.vmlinux] [k] memcpy_erms + 5,59% netperf [kernel.vmlinux] [k] copy_user_enhanced_fast_string - 5,05% netperf [kernel.vmlinux] [k] _raw_spin_unlock_irqrestore - _raw_spin_unlock_irqrestore + 49,89% __wake_up_sync_key + 45,68% sctp_ulpq_tail_event - 2,85% mod_timer + 76,51% sctp_transport_reset_t3_rtx + 23,49% sctp_do_sm + 1,55% del_timer + 2,50% netperf [sctp] [k] sctp_datamsg_from_user + 2,26% netperf [sctp] [k] sctp_sendmsg Throughput-wise, from 6800mbps without the patch to 7050mbps with it, ~3.7%. Signed-off-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- include/net/sctp/structs.h | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h index 6df1ce7a411c..5a404c354f4c 100644 --- a/include/net/sctp/structs.h +++ b/include/net/sctp/structs.h @@ -847,6 +847,11 @@ struct sctp_transport { */ ktime_t last_time_heard; + /* When was the last time that we sent a chunk using this + * transport? We use this to check for idle transports + */ + unsigned long last_time_sent; + /* Last time(in jiffies) when cwnd is reduced due to the congestion * indication based on ECNE chunk. */ @@ -952,7 +957,8 @@ void sctp_transport_route(struct sctp_transport *, union sctp_addr *, struct sctp_sock *); void sctp_transport_pmtu(struct sctp_transport *, struct sock *sk); void sctp_transport_free(struct sctp_transport *); -void sctp_transport_reset_timers(struct sctp_transport *); +void sctp_transport_reset_t3_rtx(struct sctp_transport *); +void sctp_transport_reset_hb_timer(struct sctp_transport *); int sctp_transport_hold(struct sctp_transport *); void sctp_transport_put(struct sctp_transport *); void sctp_transport_update_rto(struct sctp_transport *, __u32); -- cgit v1.2.3 From 9ab179d83b4e31ea277a123492e419067c2f129a Mon Sep 17 00:00:00 2001 From: David Ahern Date: Thu, 7 Apr 2016 11:10:06 -0700 Subject: net: vrf: Fix dst reference counting Vivek reported a kernel exception deleting a VRF with an active connection through it. The root cause is that the socket has a cached reference to a dst that is destroyed. Converting the dst_destroy to dst_release and letting proper reference counting kick in does not work as the dst has a reference to the device which needs to be released as well. I talked to Hannes about this at netdev and he pointed out the ipv4 and ipv6 dst handling has dst_ifdown for just this scenario. Rather than continuing with the reinvented dst wheel in VRF just remove it and leverage the ipv4 and ipv6 versions. Fixes: 193125dbd8eb2 ("net: Introduce VRF device driver") Fixes: 35402e3136634 ("net: Add IPv6 support to VRF device") Signed-off-by: David Ahern Signed-off-by: David S. Miller --- drivers/net/vrf.c | 177 +++++------------------------------------------- include/net/ip6_route.h | 3 + include/net/route.h | 3 + net/ipv4/route.c | 7 +- net/ipv6/route.c | 7 +- 5 files changed, 30 insertions(+), 167 deletions(-) (limited to 'include') diff --git a/drivers/net/vrf.c b/drivers/net/vrf.c index 9a9fabb900c1..8a8f1e58b415 100644 --- a/drivers/net/vrf.c +++ b/drivers/net/vrf.c @@ -60,41 +60,6 @@ struct pcpu_dstats { struct u64_stats_sync syncp; }; -static struct dst_entry *vrf_ip_check(struct dst_entry *dst, u32 cookie) -{ - return dst; -} - -static int vrf_ip_local_out(struct net *net, struct sock *sk, struct sk_buff *skb) -{ - return ip_local_out(net, sk, skb); -} - -static unsigned int vrf_v4_mtu(const struct dst_entry *dst) -{ - /* TO-DO: return max ethernet size? */ - return dst->dev->mtu; -} - -static void vrf_dst_destroy(struct dst_entry *dst) -{ - /* our dst lives forever - or until the device is closed */ -} - -static unsigned int vrf_default_advmss(const struct dst_entry *dst) -{ - return 65535 - 40; -} - -static struct dst_ops vrf_dst_ops = { - .family = AF_INET, - .local_out = vrf_ip_local_out, - .check = vrf_ip_check, - .mtu = vrf_v4_mtu, - .destroy = vrf_dst_destroy, - .default_advmss = vrf_default_advmss, -}; - /* neighbor handling is done with actual device; do not want * to flip skb->dev for those ndisc packets. This really fails * for multiple next protocols (e.g., NEXTHDR_HOP). But it is @@ -349,46 +314,6 @@ static netdev_tx_t vrf_xmit(struct sk_buff *skb, struct net_device *dev) } #if IS_ENABLED(CONFIG_IPV6) -static struct dst_entry *vrf_ip6_check(struct dst_entry *dst, u32 cookie) -{ - return dst; -} - -static struct dst_ops vrf_dst_ops6 = { - .family = AF_INET6, - .local_out = ip6_local_out, - .check = vrf_ip6_check, - .mtu = vrf_v4_mtu, - .destroy = vrf_dst_destroy, - .default_advmss = vrf_default_advmss, -}; - -static int init_dst_ops6_kmem_cachep(void) -{ - vrf_dst_ops6.kmem_cachep = kmem_cache_create("vrf_ip6_dst_cache", - sizeof(struct rt6_info), - 0, - SLAB_HWCACHE_ALIGN, - NULL); - - if (!vrf_dst_ops6.kmem_cachep) - return -ENOMEM; - - return 0; -} - -static void free_dst_ops6_kmem_cachep(void) -{ - kmem_cache_destroy(vrf_dst_ops6.kmem_cachep); -} - -static int vrf_input6(struct sk_buff *skb) -{ - skb->dev->stats.rx_errors++; - kfree_skb(skb); - return 0; -} - /* modelled after ip6_finish_output2 */ static int vrf_finish_output6(struct net *net, struct sock *sk, struct sk_buff *skb) @@ -429,67 +354,34 @@ static int vrf_output6(struct net *net, struct sock *sk, struct sk_buff *skb) !(IP6CB(skb)->flags & IP6SKB_REROUTED)); } -static void vrf_rt6_destroy(struct net_vrf *vrf) +static void vrf_rt6_release(struct net_vrf *vrf) { - dst_destroy(&vrf->rt6->dst); - free_percpu(vrf->rt6->rt6i_pcpu); + dst_release(&vrf->rt6->dst); vrf->rt6 = NULL; } static int vrf_rt6_create(struct net_device *dev) { struct net_vrf *vrf = netdev_priv(dev); - struct dst_entry *dst; + struct net *net = dev_net(dev); struct rt6_info *rt6; - int cpu; int rc = -ENOMEM; - rt6 = dst_alloc(&vrf_dst_ops6, dev, 0, - DST_OBSOLETE_NONE, - (DST_HOST | DST_NOPOLICY | DST_NOXFRM)); + rt6 = ip6_dst_alloc(net, dev, + DST_HOST | DST_NOPOLICY | DST_NOXFRM | DST_NOCACHE); if (!rt6) goto out; - dst = &rt6->dst; - - rt6->rt6i_pcpu = alloc_percpu_gfp(struct rt6_info *, GFP_KERNEL); - if (!rt6->rt6i_pcpu) { - dst_destroy(dst); - goto out; - } - for_each_possible_cpu(cpu) { - struct rt6_info **p = per_cpu_ptr(rt6->rt6i_pcpu, cpu); - *p = NULL; - } - - memset(dst + 1, 0, sizeof(*rt6) - sizeof(*dst)); - - INIT_LIST_HEAD(&rt6->rt6i_siblings); - INIT_LIST_HEAD(&rt6->rt6i_uncached); - - rt6->dst.input = vrf_input6; rt6->dst.output = vrf_output6; - - rt6->rt6i_table = fib6_get_table(dev_net(dev), vrf->tb_id); - - atomic_set(&rt6->dst.__refcnt, 2); - + rt6->rt6i_table = fib6_get_table(net, vrf->tb_id); + dst_hold(&rt6->dst); vrf->rt6 = rt6; rc = 0; out: return rc; } #else -static int init_dst_ops6_kmem_cachep(void) -{ - return 0; -} - -static void free_dst_ops6_kmem_cachep(void) -{ -} - -static void vrf_rt6_destroy(struct net_vrf *vrf) +static void vrf_rt6_release(struct net_vrf *vrf) { } @@ -557,11 +449,11 @@ static int vrf_output(struct net *net, struct sock *sk, struct sk_buff *skb) !(IPCB(skb)->flags & IPSKB_REROUTED)); } -static void vrf_rtable_destroy(struct net_vrf *vrf) +static void vrf_rtable_release(struct net_vrf *vrf) { struct dst_entry *dst = (struct dst_entry *)vrf->rth; - dst_destroy(dst); + dst_release(dst); vrf->rth = NULL; } @@ -570,22 +462,10 @@ static struct rtable *vrf_rtable_create(struct net_device *dev) struct net_vrf *vrf = netdev_priv(dev); struct rtable *rth; - rth = dst_alloc(&vrf_dst_ops, dev, 2, - DST_OBSOLETE_NONE, - (DST_HOST | DST_NOPOLICY | DST_NOXFRM)); + rth = rt_dst_alloc(dev, 0, RTN_UNICAST, 1, 1, 0); if (rth) { rth->dst.output = vrf_output; - rth->rt_genid = rt_genid_ipv4(dev_net(dev)); - rth->rt_flags = 0; - rth->rt_type = RTN_UNICAST; - rth->rt_is_input = 0; - rth->rt_iif = 0; - rth->rt_pmtu = 0; - rth->rt_gateway = 0; - rth->rt_uses_gateway = 0; rth->rt_table_id = vrf->tb_id; - INIT_LIST_HEAD(&rth->rt_uncached); - rth->rt_uncached_list = NULL; } return rth; @@ -673,8 +553,8 @@ static void vrf_dev_uninit(struct net_device *dev) struct net_device *port_dev; struct list_head *iter; - vrf_rtable_destroy(vrf); - vrf_rt6_destroy(vrf); + vrf_rtable_release(vrf); + vrf_rt6_release(vrf); netdev_for_each_lower_dev(dev, port_dev, iter) vrf_del_slave(dev, port_dev); @@ -704,7 +584,7 @@ static int vrf_dev_init(struct net_device *dev) return 0; out_rth: - vrf_rtable_destroy(vrf); + vrf_rtable_release(vrf); out_stats: free_percpu(dev->dstats); dev->dstats = NULL; @@ -737,7 +617,7 @@ static struct rtable *vrf_get_rtable(const struct net_device *dev, struct net_vrf *vrf = netdev_priv(dev); rth = vrf->rth; - atomic_inc(&rth->dst.__refcnt); + dst_hold(&rth->dst); } return rth; @@ -788,7 +668,7 @@ static struct dst_entry *vrf_get_rt6_dst(const struct net_device *dev, struct net_vrf *vrf = netdev_priv(dev); rt = vrf->rt6; - atomic_inc(&rt->dst.__refcnt); + dst_hold(&rt->dst); } return (struct dst_entry *)rt; @@ -946,19 +826,6 @@ static int __init vrf_init_module(void) { int rc; - vrf_dst_ops.kmem_cachep = - kmem_cache_create("vrf_ip_dst_cache", - sizeof(struct rtable), 0, - SLAB_HWCACHE_ALIGN, - NULL); - - if (!vrf_dst_ops.kmem_cachep) - return -ENOMEM; - - rc = init_dst_ops6_kmem_cachep(); - if (rc != 0) - goto error2; - register_netdevice_notifier(&vrf_notifier_block); rc = rtnl_link_register(&vrf_link_ops); @@ -969,22 +836,10 @@ static int __init vrf_init_module(void) error: unregister_netdevice_notifier(&vrf_notifier_block); - free_dst_ops6_kmem_cachep(); -error2: - kmem_cache_destroy(vrf_dst_ops.kmem_cachep); return rc; } -static void __exit vrf_cleanup_module(void) -{ - rtnl_link_unregister(&vrf_link_ops); - unregister_netdevice_notifier(&vrf_notifier_block); - kmem_cache_destroy(vrf_dst_ops.kmem_cachep); - free_dst_ops6_kmem_cachep(); -} - module_init(vrf_init_module); -module_exit(vrf_cleanup_module); MODULE_AUTHOR("Shrijeet Mukherjee, David Ahern"); MODULE_DESCRIPTION("Device driver to instantiate VRF domains"); MODULE_LICENSE("GPL"); diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h index 295d291269e2..54c779416eec 100644 --- a/include/net/ip6_route.h +++ b/include/net/ip6_route.h @@ -101,6 +101,9 @@ void fib6_force_start_gc(struct net *net); struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev, const struct in6_addr *addr, bool anycast); +struct rt6_info *ip6_dst_alloc(struct net *net, struct net_device *dev, + int flags); + /* * support functions for ND * diff --git a/include/net/route.h b/include/net/route.h index 9b0a523bb428..6de665bf1750 100644 --- a/include/net/route.h +++ b/include/net/route.h @@ -209,6 +209,9 @@ unsigned int inet_addr_type_dev_table(struct net *net, void ip_rt_multicast_event(struct in_device *); int ip_rt_ioctl(struct net *, unsigned int cmd, void __user *arg); void ip_rt_get_source(u8 *src, struct sk_buff *skb, struct rtable *rt); +struct rtable *rt_dst_alloc(struct net_device *dev, + unsigned int flags, u16 type, + bool nopolicy, bool noxfrm, bool will_cache); struct in_ifaddr; void fib_add_ifaddr(struct in_ifaddr *); diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 02c62299d717..2852bdf73540 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1438,9 +1438,9 @@ static void rt_set_nexthop(struct rtable *rt, __be32 daddr, #endif } -static struct rtable *rt_dst_alloc(struct net_device *dev, - unsigned int flags, u16 type, - bool nopolicy, bool noxfrm, bool will_cache) +struct rtable *rt_dst_alloc(struct net_device *dev, + unsigned int flags, u16 type, + bool nopolicy, bool noxfrm, bool will_cache) { struct rtable *rt; @@ -1468,6 +1468,7 @@ static struct rtable *rt_dst_alloc(struct net_device *dev, return rt; } +EXPORT_SYMBOL(rt_dst_alloc); /* called in rcu_read_lock() section */ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr, diff --git a/net/ipv6/route.c b/net/ipv6/route.c index ed446639219c..1d8871a5ed20 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -338,9 +338,9 @@ static struct rt6_info *__ip6_dst_alloc(struct net *net, return rt; } -static struct rt6_info *ip6_dst_alloc(struct net *net, - struct net_device *dev, - int flags) +struct rt6_info *ip6_dst_alloc(struct net *net, + struct net_device *dev, + int flags) { struct rt6_info *rt = __ip6_dst_alloc(net, dev, flags); @@ -364,6 +364,7 @@ static struct rt6_info *ip6_dst_alloc(struct net *net, return rt; } +EXPORT_SYMBOL(ip6_dst_alloc); static void ip6_dst_destroy(struct dst_entry *dst) { -- cgit v1.2.3 From 1ecf689013b598918cd9a261a078e64571b60f90 Mon Sep 17 00:00:00 2001 From: stephen hemminger Date: Mon, 11 Apr 2016 13:39:08 -0700 Subject: devlink: add missing install of header The new devlink.h in uapi was not being installed by make headers_install Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- include/uapi/linux/Kbuild | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/uapi/linux/Kbuild b/include/uapi/linux/Kbuild index b71fd0b5cbad..813ffb2e22c9 100644 --- a/include/uapi/linux/Kbuild +++ b/include/uapi/linux/Kbuild @@ -96,6 +96,7 @@ header-y += cyclades.h header-y += cycx_cfm.h header-y += dcbnl.h header-y += dccp.h +header-y += devlink.h header-y += dlmconstants.h header-y += dlm_device.h header-y += dlm.h -- cgit v1.2.3 From 33c162a980fe03498fcecb917f618ad7e7c55e61 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Mon, 11 Apr 2016 15:29:36 -0700 Subject: ipv6: datagram: Update dst cache of a connected datagram sk during pmtu update There is a case in connected UDP socket such that getsockopt(IPV6_MTU) will return a stale MTU value. The reproducible sequence could be the following: 1. Create a connected UDP socket 2. Send some datagrams out 3. Receive a ICMPV6_PKT_TOOBIG 4. No new outgoing datagrams to trigger the sk_dst_check() logic to update the sk->sk_dst_cache. 5. getsockopt(IPV6_MTU) returns the mtu from the invalid sk->sk_dst_cache instead of the newly created RTF_CACHE clone. This patch updates the sk->sk_dst_cache for a connected datagram sk during pmtu-update code path. Note that the sk->sk_v6_daddr is used to do the route lookup instead of skb->data (i.e. iph). It is because a UDP socket can become connected after sending out some datagrams in un-connected state. or It can be connected multiple times to different destinations. Hence, iph may not be related to where sk is currently connected to. It is done under '!sock_owned_by_user(sk)' condition because the user may make another ip6_datagram_connect() (i.e changing the sk->sk_v6_daddr) while dst lookup is happening in the pmtu-update code path. For the sock_owned_by_user(sk) == true case, the next patch will introduce a release_cb() which will update the sk->sk_dst_cache. Test: Server (Connected UDP Socket): ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Route Details: [root@arch-fb-vm1 ~]# ip -6 r show | egrep '2fac' 2fac::/64 dev eth0 proto kernel metric 256 pref medium 2fac:face::/64 via 2fac::face dev eth0 metric 1024 pref medium A simple python code to create a connected UDP socket: import socket import errno HOST = '2fac::1' PORT = 8080 s = socket.socket(socket.AF_INET6, socket.SOCK_DGRAM) s.bind((HOST, PORT)) s.connect(('2fac:face::face', 53)) print("connected") while True: try: data = s.recv(1024) except socket.error as se: if se.errno == errno.EMSGSIZE: pmtu = s.getsockopt(41, 24) print("PMTU:%d" % pmtu) break s.close() Python program output after getting a ICMPV6_PKT_TOOBIG: [root@arch-fb-vm1 ~]# python2 ~/devshare/kernel/tasks/fib6/udp-connect-53-8080.py connected PMTU:1300 Cache routes after recieving TOOBIG: [root@arch-fb-vm1 ~]# ip -6 r show table cache 2fac:face::face via 2fac::face dev eth0 metric 0 cache expires 463sec mtu 1300 pref medium Client (Send the ICMPV6_PKT_TOOBIG): ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ scapy is used to generate the TOOBIG message. Here is the scapy script I have used: >>> p=Ether(src='da:75:4d:36:ac:32', dst='52:54:00:12:34:66', type=0x86dd)/IPv6(src='2fac::face', dst='2fac::1')/ICMPv6PacketTooBig(mtu=1300)/IPv6(src='2fac:: 1',dst='2fac:face::face', nh='UDP')/UDP(sport=8080,dport=53) >>> sendp(p, iface='qemubr0') Fixes: 45e4fd26683c ("ipv6: Only create RTF_CACHE routes after encountering pmtu exception") Signed-off-by: Martin KaFai Lau Reported-by: Wei Wang Cc: Cong Wang Cc: Eric Dumazet Cc: Wei Wang Signed-off-by: David S. Miller --- include/net/ipv6.h | 1 + net/ipv6/datagram.c | 20 +++++++++++--------- net/ipv6/route.c | 12 ++++++++++++ 3 files changed, 24 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/include/net/ipv6.h b/include/net/ipv6.h index d0aeb97aec5d..fd02e90b1289 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -959,6 +959,7 @@ int compat_ipv6_getsockopt(struct sock *sk, int level, int optname, int ip6_datagram_connect(struct sock *sk, struct sockaddr *addr, int addr_len); int ip6_datagram_connect_v6_only(struct sock *sk, struct sockaddr *addr, int addr_len); +int ip6_datagram_dst_update(struct sock *sk, bool fix_sk_saddr); int ipv6_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len); diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c index 669585e11294..59e01f27db9f 100644 --- a/net/ipv6/datagram.c +++ b/net/ipv6/datagram.c @@ -64,7 +64,7 @@ static void ip6_datagram_flow_key_init(struct flowi6 *fl6, struct sock *sk) security_sk_classify_flow(sk, flowi6_to_flowi(fl6)); } -static int ip6_datagram_dst_update(struct sock *sk) +int ip6_datagram_dst_update(struct sock *sk, bool fix_sk_saddr) { struct ip6_flowlabel *flowlabel = NULL; struct in6_addr *final_p, final; @@ -93,14 +93,16 @@ static int ip6_datagram_dst_update(struct sock *sk) goto out; } - if (ipv6_addr_any(&np->saddr)) - np->saddr = fl6.saddr; + if (fix_sk_saddr) { + if (ipv6_addr_any(&np->saddr)) + np->saddr = fl6.saddr; - if (ipv6_addr_any(&sk->sk_v6_rcv_saddr)) { - sk->sk_v6_rcv_saddr = fl6.saddr; - inet->inet_rcv_saddr = LOOPBACK4_IPV6; - if (sk->sk_prot->rehash) - sk->sk_prot->rehash(sk); + if (ipv6_addr_any(&sk->sk_v6_rcv_saddr)) { + sk->sk_v6_rcv_saddr = fl6.saddr; + inet->inet_rcv_saddr = LOOPBACK4_IPV6; + if (sk->sk_prot->rehash) + sk->sk_prot->rehash(sk); + } } ip6_dst_store(sk, dst, @@ -221,7 +223,7 @@ ipv4_connected: * destination cache for it. */ - err = ip6_datagram_dst_update(sk); + err = ip6_datagram_dst_update(sk, true); if (err) goto out; diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 1d8871a5ed20..d916d6ab9ad2 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -1418,8 +1418,20 @@ EXPORT_SYMBOL_GPL(ip6_update_pmtu); void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu) { + struct dst_entry *dst; + ip6_update_pmtu(skb, sock_net(sk), mtu, sk->sk_bound_dev_if, sk->sk_mark); + + dst = __sk_dst_get(sk); + if (!dst || !dst->obsolete || + dst->ops->check(dst, inet6_sk(sk)->dst_cookie)) + return; + + bh_lock_sock(sk); + if (!sock_owned_by_user(sk) && !ipv6_addr_v4mapped(&sk->sk_v6_daddr)) + ip6_datagram_dst_update(sk, false); + bh_unlock_sock(sk); } EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu); -- cgit v1.2.3 From e646b657f6983017783914a951039e323120dc55 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Mon, 11 Apr 2016 15:29:37 -0700 Subject: ipv6: udp: Do a route lookup and update during release_cb This patch adds a release_cb for UDPv6. It does a route lookup and updates sk->sk_dst_cache if it is needed. It picks up the left-over job from ip6_sk_update_pmtu() if the sk was owned by user during the pmtu update. It takes a rcu_read_lock to protect the __sk_dst_get() operations because another thread may do ip6_dst_store() without taking the sk lock (e.g. sendmsg). Fixes: 45e4fd26683c ("ipv6: Only create RTF_CACHE routes after encountering pmtu exception") Signed-off-by: Martin KaFai Lau Reported-by: Wei Wang Cc: Cong Wang Cc: Eric Dumazet Cc: Wei Wang Signed-off-by: David S. Miller --- include/net/ipv6.h | 1 + net/ipv6/datagram.c | 20 ++++++++++++++++++++ net/ipv6/udp.c | 1 + 3 files changed, 22 insertions(+) (limited to 'include') diff --git a/include/net/ipv6.h b/include/net/ipv6.h index fd02e90b1289..1be050ada8c5 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -960,6 +960,7 @@ int ip6_datagram_connect(struct sock *sk, struct sockaddr *addr, int addr_len); int ip6_datagram_connect_v6_only(struct sock *sk, struct sockaddr *addr, int addr_len); int ip6_datagram_dst_update(struct sock *sk, bool fix_sk_saddr); +void ip6_datagram_release_cb(struct sock *sk); int ipv6_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len); diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c index 59e01f27db9f..9dd3882fe6bf 100644 --- a/net/ipv6/datagram.c +++ b/net/ipv6/datagram.c @@ -119,6 +119,26 @@ out: return err; } +void ip6_datagram_release_cb(struct sock *sk) +{ + struct dst_entry *dst; + + if (ipv6_addr_v4mapped(&sk->sk_v6_daddr)) + return; + + rcu_read_lock(); + dst = __sk_dst_get(sk); + if (!dst || !dst->obsolete || + dst->ops->check(dst, inet6_sk(sk)->dst_cookie)) { + rcu_read_unlock(); + return; + } + rcu_read_unlock(); + + ip6_datagram_dst_update(sk, false); +} +EXPORT_SYMBOL_GPL(ip6_datagram_release_cb); + static int __ip6_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) { struct sockaddr_in6 *usin = (struct sockaddr_in6 *) uaddr; diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 8125931106be..6bc5c664fa46 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -1539,6 +1539,7 @@ struct proto udpv6_prot = { .sendmsg = udpv6_sendmsg, .recvmsg = udpv6_recvmsg, .backlog_rcv = __udpv6_queue_rcv_skb, + .release_cb = ip6_datagram_release_cb, .hash = udp_lib_hash, .unhash = udp_lib_unhash, .rehash = udp_v6_rehash, -- cgit v1.2.3 From d894ba18d4e449b3a7f6eb491f16c9e02933736e Mon Sep 17 00:00:00 2001 From: Craig Gallek Date: Tue, 12 Apr 2016 13:11:25 -0400 Subject: soreuseport: fix ordering for mixed v4/v6 sockets MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit With the SO_REUSEPORT socket option, it is possible to create sockets in the AF_INET and AF_INET6 domains which are bound to the same IPv4 address. This is only possible with SO_REUSEPORT and when not using IPV6_V6ONLY on the AF_INET6 sockets. Prior to the commits referenced below, an incoming IPv4 packet would always be routed to a socket of type AF_INET when this mixed-mode was used. After those changes, the same packet would be routed to the most recently bound socket (if this happened to be an AF_INET6 socket, it would have an IPv4 mapped IPv6 address). The change in behavior occurred because the recent SO_REUSEPORT optimizations short-circuit the socket scoring logic as soon as they find a match. They did not take into account the scoring logic that favors AF_INET sockets over AF_INET6 sockets in the event of a tie. To fix this problem, this patch changes the insertion order of AF_INET and AF_INET6 addresses in the TCP and UDP socket lists when the sockets have SO_REUSEPORT set. AF_INET sockets will be inserted at the head of the list and AF_INET6 sockets with SO_REUSEPORT set will always be inserted at the tail of the list. This will force AF_INET sockets to always be considered first. Fixes: e32ea7e74727 ("soreuseport: fast reuseport UDP socket selection") Fixes: 125e80b88687 ("soreuseport: fast reuseport TCP socket selection") Reported-by: Maciej Żenczykowski Signed-off-by: Craig Gallek Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/rculist_nulls.h | 39 +++++++++++++++++++++++++++++++++++++++ include/net/sock.h | 6 +++++- net/ipv4/udp.c | 9 +++++++-- 3 files changed, 51 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/rculist_nulls.h b/include/linux/rculist_nulls.h index 1c33dd7da4a7..4ae95f7e8597 100644 --- a/include/linux/rculist_nulls.h +++ b/include/linux/rculist_nulls.h @@ -98,6 +98,45 @@ static inline void hlist_nulls_add_head_rcu(struct hlist_nulls_node *n, if (!is_a_nulls(first)) first->pprev = &n->next; } + +/** + * hlist_nulls_add_tail_rcu + * @n: the element to add to the hash list. + * @h: the list to add to. + * + * Description: + * Adds the specified element to the end of the specified hlist_nulls, + * while permitting racing traversals. NOTE: tail insertion requires + * list traversal. + * + * The caller must take whatever precautions are necessary + * (such as holding appropriate locks) to avoid racing + * with another list-mutation primitive, such as hlist_nulls_add_head_rcu() + * or hlist_nulls_del_rcu(), running on this same list. + * However, it is perfectly legal to run concurrently with + * the _rcu list-traversal primitives, such as + * hlist_nulls_for_each_entry_rcu(), used to prevent memory-consistency + * problems on Alpha CPUs. Regardless of the type of CPU, the + * list-traversal primitive must be guarded by rcu_read_lock(). + */ +static inline void hlist_nulls_add_tail_rcu(struct hlist_nulls_node *n, + struct hlist_nulls_head *h) +{ + struct hlist_nulls_node *i, *last = NULL; + + for (i = hlist_nulls_first_rcu(h); !is_a_nulls(i); + i = hlist_nulls_next_rcu(i)) + last = i; + + if (last) { + n->next = last->next; + n->pprev = &last->next; + rcu_assign_pointer(hlist_nulls_next_rcu(last), n); + } else { + hlist_nulls_add_head_rcu(n, h); + } +} + /** * hlist_nulls_for_each_entry_rcu - iterate over rcu list of given type * @tpos: the type * to use as a loop cursor. diff --git a/include/net/sock.h b/include/net/sock.h index 255d3e03727b..121ffc115c4f 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -630,7 +630,11 @@ static inline void sk_add_node_rcu(struct sock *sk, struct hlist_head *list) static inline void __sk_nulls_add_node_rcu(struct sock *sk, struct hlist_nulls_head *list) { - hlist_nulls_add_head_rcu(&sk->sk_nulls_node, list); + if (IS_ENABLED(CONFIG_IPV6) && sk->sk_reuseport && + sk->sk_family == AF_INET6) + hlist_nulls_add_tail_rcu(&sk->sk_nulls_node, list); + else + hlist_nulls_add_head_rcu(&sk->sk_nulls_node, list); } static inline void sk_nulls_add_node_rcu(struct sock *sk, struct hlist_nulls_head *list) diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 08eed5e16df0..a2e7f55a1f61 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -339,8 +339,13 @@ found: hslot2 = udp_hashslot2(udptable, udp_sk(sk)->udp_portaddr_hash); spin_lock(&hslot2->lock); - hlist_nulls_add_head_rcu(&udp_sk(sk)->udp_portaddr_node, - &hslot2->head); + if (IS_ENABLED(CONFIG_IPV6) && sk->sk_reuseport && + sk->sk_family == AF_INET6) + hlist_nulls_add_tail_rcu(&udp_sk(sk)->udp_portaddr_node, + &hslot2->head); + else + hlist_nulls_add_head_rcu(&udp_sk(sk)->udp_portaddr_node, + &hslot2->head); hslot2->count++; spin_unlock(&hslot2->lock); } -- cgit v1.2.3 From 2309236c13feae619572324efd3e910e66ef6bd0 Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Mon, 18 Apr 2016 14:37:10 +0300 Subject: cls_cgroup: get sk_classid only from full sockets skb->sk could point to timewait or request socket which has no sk_classid. Detected as "BUG: KASAN: slab-out-of-bounds in cls_cgroup_classify". Signed-off-by: Konstantin Khlebnikov Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/cls_cgroup.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/cls_cgroup.h b/include/net/cls_cgroup.h index c0a92e2c286d..74c9693d4941 100644 --- a/include/net/cls_cgroup.h +++ b/include/net/cls_cgroup.h @@ -17,6 +17,7 @@ #include #include #include +#include #ifdef CONFIG_CGROUP_NET_CLASSID struct cgroup_cls_state { @@ -63,11 +64,13 @@ static inline u32 task_get_classid(const struct sk_buff *skb) * softirqs always disables bh. */ if (in_serving_softirq()) { + struct sock *sk = skb_to_full_sk(skb); + /* If there is an sock_cgroup_classid we'll use that. */ - if (!skb->sk) + if (!sk || !sk_fullsock(sk)) return 0; - classid = sock_cgroup_classid(&skb->sk->sk_cgrp_data); + classid = sock_cgroup_classid(&sk->sk_cgrp_data); } return classid; -- cgit v1.2.3 From cfea5a688eb37bcd1081255df9f9f777f4e61999 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Tue, 19 Apr 2016 22:39:29 -0700 Subject: tcp: Merge tx_flags and tskey in tcp_shifted_skb After receiving sacks, tcp_shifted_skb() will collapse skbs if possible. tx_flags and tskey also have to be merged. This patch reuses the tcp_skb_collapse_tstamp() to handle them. BPF Output Before: ~~~~~ BPF Output After: ~~~~~ <...>-2024 [007] d.s. 88.644374: : ee_data:14599 Packetdrill Script: ~~~~~ +0 `sysctl -q -w net.ipv4.tcp_min_tso_segs=10` +0 `sysctl -q -w net.ipv4.tcp_no_metrics_save=1` +0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0 +0 bind(3, ..., ...) = 0 +0 listen(3, 1) = 0 0.100 < S 0:0(0) win 32792 0.100 > S. 0:0(0) ack 1 0.200 < . 1:1(0) ack 1 win 257 0.200 accept(3, ..., ...) = 4 +0 setsockopt(4, SOL_TCP, TCP_NODELAY, [1], 4) = 0 0.200 write(4, ..., 1460) = 1460 +0 setsockopt(4, SOL_SOCKET, 37, [2688], 4) = 0 0.200 write(4, ..., 13140) = 13140 0.200 > P. 1:1461(1460) ack 1 0.200 > . 1461:8761(7300) ack 1 0.200 > P. 8761:14601(5840) ack 1 0.300 < . 1:1(0) ack 1 win 257 0.300 > P. 1:1461(1460) ack 1 0.400 < . 1:1(0) ack 14601 win 257 0.400 close(4) = 0 0.400 > F. 14601:14601(0) ack 1 0.500 < F. 1:1(0) ack 14602 win 257 0.500 > . 14602:14602(0) ack 2 Signed-off-by: Martin KaFai Lau Cc: Eric Dumazet Cc: Neal Cardwell Cc: Soheil Hassas Yeganeh Cc: Willem de Bruijn Cc: Yuchung Cheng Acked-by: Soheil Hassas Yeganeh Tested-by: Soheil Hassas Yeganeh Signed-off-by: David S. Miller --- include/net/tcp.h | 2 ++ net/ipv4/tcp_input.c | 1 + net/ipv4/tcp_output.c | 4 ++-- 3 files changed, 5 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/tcp.h b/include/net/tcp.h index b91370f61be6..6db10228113f 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -552,6 +552,8 @@ void tcp_send_ack(struct sock *sk); void tcp_send_delayed_ack(struct sock *sk); void tcp_send_loss_probe(struct sock *sk); bool tcp_schedule_loss_probe(struct sock *sk); +void tcp_skb_collapse_tstamp(struct sk_buff *skb, + const struct sk_buff *next_skb); /* tcp_input.c */ void tcp_resume_early_retransmit(struct sock *sk); diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 0edb07185c25..c124c3c12f7c 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -1309,6 +1309,7 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *skb, if (skb == tcp_highest_sack(sk)) tcp_advance_highest_sack(sk, skb); + tcp_skb_collapse_tstamp(prev, skb); tcp_unlink_write_queue(skb, sk); sk_wmem_free_skb(sk, skb); diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 5bc3c3062672..441ae9da3a23 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -2441,8 +2441,8 @@ u32 __tcp_select_window(struct sock *sk) return window; } -static void tcp_skb_collapse_tstamp(struct sk_buff *skb, - const struct sk_buff *next_skb) +void tcp_skb_collapse_tstamp(struct sk_buff *skb, + const struct sk_buff *next_skb) { const struct skb_shared_info *next_shinfo = skb_shinfo(next_skb); u8 tsflags = next_shinfo->tx_flags & SKBTX_ANY_TSTAMP; -- cgit v1.2.3 From 4bfd2e6e53435a214888fd35e230157a38ffc6a0 Mon Sep 17 00:00:00 2001 From: Daniel Jurgens Date: Wed, 20 Apr 2016 16:01:16 +0300 Subject: net/mlx4_core: Avoid repeated calls to pci enable/disable Maintain the PCI status and provide wrappers for enabling and disabling the PCI device. Performing the actions more than once without doing its opposite results in warning logs. This occurred when EEH hotplugged the device causing a warning for disabling an already disabled device. Fixes: 2ba5fbd62b25 ('net/mlx4_core: Handle AER flow properly') Signed-off-by: Daniel Jurgens Signed-off-by: Yishai Hadas Signed-off-by: Or Gerlitz Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlx4/main.c | 39 +++++++++++++++++++++++++++---- include/linux/mlx4/device.h | 7 ++++++ 2 files changed, 41 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/mellanox/mlx4/main.c b/drivers/net/ethernet/mellanox/mlx4/main.c index 5d45aa34f6a5..12c77a70abdb 100644 --- a/drivers/net/ethernet/mellanox/mlx4/main.c +++ b/drivers/net/ethernet/mellanox/mlx4/main.c @@ -3172,6 +3172,34 @@ static int mlx4_check_dev_cap(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap return 0; } +static int mlx4_pci_enable_device(struct mlx4_dev *dev) +{ + struct pci_dev *pdev = dev->persist->pdev; + int err = 0; + + mutex_lock(&dev->persist->pci_status_mutex); + if (dev->persist->pci_status == MLX4_PCI_STATUS_DISABLED) { + err = pci_enable_device(pdev); + if (!err) + dev->persist->pci_status = MLX4_PCI_STATUS_ENABLED; + } + mutex_unlock(&dev->persist->pci_status_mutex); + + return err; +} + +static void mlx4_pci_disable_device(struct mlx4_dev *dev) +{ + struct pci_dev *pdev = dev->persist->pdev; + + mutex_lock(&dev->persist->pci_status_mutex); + if (dev->persist->pci_status == MLX4_PCI_STATUS_ENABLED) { + pci_disable_device(pdev); + dev->persist->pci_status = MLX4_PCI_STATUS_DISABLED; + } + mutex_unlock(&dev->persist->pci_status_mutex); +} + static int mlx4_load_one(struct pci_dev *pdev, int pci_dev_data, int total_vfs, int *nvfs, struct mlx4_priv *priv, int reset_flow) @@ -3582,7 +3610,7 @@ static int __mlx4_init_one(struct pci_dev *pdev, int pci_dev_data, pr_info(DRV_NAME ": Initializing %s\n", pci_name(pdev)); - err = pci_enable_device(pdev); + err = mlx4_pci_enable_device(&priv->dev); if (err) { dev_err(&pdev->dev, "Cannot enable PCI device, aborting\n"); return err; @@ -3715,7 +3743,7 @@ err_release_regions: pci_release_regions(pdev); err_disable_pdev: - pci_disable_device(pdev); + mlx4_pci_disable_device(&priv->dev); pci_set_drvdata(pdev, NULL); return err; } @@ -3775,6 +3803,7 @@ static int mlx4_init_one(struct pci_dev *pdev, const struct pci_device_id *id) priv->pci_dev_data = id->driver_data; mutex_init(&dev->persist->device_state_mutex); mutex_init(&dev->persist->interface_state_mutex); + mutex_init(&dev->persist->pci_status_mutex); ret = devlink_register(devlink, &pdev->dev); if (ret) @@ -3923,7 +3952,7 @@ static void mlx4_remove_one(struct pci_dev *pdev) } pci_release_regions(pdev); - pci_disable_device(pdev); + mlx4_pci_disable_device(dev); devlink_unregister(devlink); kfree(dev->persist); devlink_free(devlink); @@ -4042,7 +4071,7 @@ static pci_ers_result_t mlx4_pci_err_detected(struct pci_dev *pdev, if (state == pci_channel_io_perm_failure) return PCI_ERS_RESULT_DISCONNECT; - pci_disable_device(pdev); + mlx4_pci_disable_device(persist->dev); return PCI_ERS_RESULT_NEED_RESET; } @@ -4053,7 +4082,7 @@ static pci_ers_result_t mlx4_pci_slot_reset(struct pci_dev *pdev) int err; mlx4_err(dev, "mlx4_pci_slot_reset was called\n"); - err = pci_enable_device(pdev); + err = mlx4_pci_enable_device(dev); if (err) { mlx4_err(dev, "Can not re-enable device, err=%d\n", err); return PCI_ERS_RESULT_DISCONNECT; diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h index 8541a913f6a3..d1f904c8b2cb 100644 --- a/include/linux/mlx4/device.h +++ b/include/linux/mlx4/device.h @@ -828,6 +828,11 @@ struct mlx4_vf_dev { u8 n_ports; }; +enum mlx4_pci_status { + MLX4_PCI_STATUS_DISABLED, + MLX4_PCI_STATUS_ENABLED, +}; + struct mlx4_dev_persistent { struct pci_dev *pdev; struct mlx4_dev *dev; @@ -841,6 +846,8 @@ struct mlx4_dev_persistent { u8 state; struct mutex interface_state_mutex; /* protect SW state */ u8 interface_state; + struct mutex pci_status_mutex; /* sync pci state */ + enum mlx4_pci_status pci_status; }; struct mlx4_dev { -- cgit v1.2.3