diff options
Diffstat (limited to 'net/ipv4')
48 files changed, 1064 insertions, 1545 deletions
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig index 80dad301361d..32cae39cdff6 100644 --- a/net/ipv4/Kconfig +++ b/net/ipv4/Kconfig @@ -430,7 +430,7 @@ config INET_DIAG Support for INET (TCP, DCCP, etc) socket monitoring interface used by native Linux tools such as ss. ss is included in iproute2, currently downloadable at: - + http://www.linuxfoundation.org/collaborate/workgroups/networking/iproute2 If unsure, say Y. @@ -600,7 +600,7 @@ config TCP_CONG_VENO distinguishing to circumvent the difficult judgment of the packet loss type. TCP Veno cuts down less congestion window in response to random loss packets. - See <http://ieeexplore.ieee.org/xpl/freeabs_all.jsp?arnumber=1177186> + See <http://ieeexplore.ieee.org/xpl/freeabs_all.jsp?arnumber=1177186> config TCP_CONG_YEAH tristate "YeAH TCP" diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile index eec9569ffa5c..7446b98661d8 100644 --- a/net/ipv4/Makefile +++ b/net/ipv4/Makefile @@ -43,7 +43,7 @@ obj-$(CONFIG_INET_XFRM_MODE_TRANSPORT) += xfrm4_mode_transport.o obj-$(CONFIG_INET_XFRM_MODE_TUNNEL) += xfrm4_mode_tunnel.o obj-$(CONFIG_IP_PNP) += ipconfig.o obj-$(CONFIG_NETFILTER) += netfilter.o netfilter/ -obj-$(CONFIG_INET_DIAG) += inet_diag.o +obj-$(CONFIG_INET_DIAG) += inet_diag.o obj-$(CONFIG_INET_TCP_DIAG) += tcp_diag.o obj-$(CONFIG_INET_UDP_DIAG) += udp_diag.o obj-$(CONFIG_INET_RAW_DIAG) += raw_diag.o diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 15e125558c76..20fda8fb8ffd 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -229,6 +229,7 @@ int inet_listen(struct socket *sock, int backlog) err = inet_csk_listen_start(sk, backlog); if (err) goto out; + tcp_call_bpf(sk, BPF_SOCK_OPS_TCP_LISTEN_CB, 0, NULL); } sk->sk_max_ack_backlog = backlog; err = 0; @@ -485,8 +486,7 @@ int __inet_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len, * is temporarily down) */ err = -EADDRNOTAVAIL; - if (!net->ipv4.sysctl_ip_nonlocal_bind && - !(inet->freebind || inet->transparent) && + if (!inet_can_nonlocal_bind(net, inet) && addr->sin_addr.s_addr != htonl(INADDR_ANY) && chk_addr_ret != RTN_LOCAL && chk_addr_ret != RTN_MULTICAST && @@ -986,7 +986,7 @@ const struct proto_ops inet_stream_ops = { .socketpair = sock_no_socketpair, .accept = inet_accept, .getname = inet_getname, - .poll_mask = tcp_poll_mask, + .poll = tcp_poll, .ioctl = inet_ioctl, .listen = inet_listen, .shutdown = inet_shutdown, @@ -1021,7 +1021,7 @@ const struct proto_ops inet_dgram_ops = { .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = inet_getname, - .poll_mask = udp_poll_mask, + .poll = udp_poll, .ioctl = inet_ioctl, .listen = sock_no_listen, .shutdown = inet_shutdown, @@ -1042,7 +1042,7 @@ EXPORT_SYMBOL(inet_dgram_ops); /* * For SOCK_RAW sockets; should be the same as inet_dgram_ops but without - * udp_poll_mask + * udp_poll */ static const struct proto_ops inet_sockraw_ops = { .family = PF_INET, @@ -1053,7 +1053,7 @@ static const struct proto_ops inet_sockraw_ops = { .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = inet_getname, - .poll_mask = datagram_poll_mask, + .poll = datagram_poll, .ioctl = inet_ioctl, .listen = sock_no_listen, .shutdown = inet_shutdown, @@ -1384,12 +1384,12 @@ out: } EXPORT_SYMBOL(inet_gso_segment); -struct sk_buff **inet_gro_receive(struct sk_buff **head, struct sk_buff *skb) +struct sk_buff *inet_gro_receive(struct list_head *head, struct sk_buff *skb) { const struct net_offload *ops; - struct sk_buff **pp = NULL; - struct sk_buff *p; + struct sk_buff *pp = NULL; const struct iphdr *iph; + struct sk_buff *p; unsigned int hlen; unsigned int off; unsigned int id; @@ -1425,7 +1425,7 @@ struct sk_buff **inet_gro_receive(struct sk_buff **head, struct sk_buff *skb) flush = (u16)((ntohl(*(__be32 *)iph) ^ skb_gro_len(skb)) | (id & ~IP_DF)); id >>= 16; - for (p = *head; p; p = p->next) { + list_for_each_entry(p, head, list) { struct iphdr *iph2; u16 flush_id; @@ -1505,8 +1505,8 @@ out: } EXPORT_SYMBOL(inet_gro_receive); -static struct sk_buff **ipip_gro_receive(struct sk_buff **head, - struct sk_buff *skb) +static struct sk_buff *ipip_gro_receive(struct list_head *head, + struct sk_buff *skb) { if (NAPI_GRO_CB(skb)->encap_mark) { NAPI_GRO_CB(skb)->flush = 1; @@ -1801,6 +1801,7 @@ static __net_init int inet_init_net(struct net *net) * We set them here, in case sysctl is not compiled. */ net->ipv4.sysctl_ip_default_ttl = IPDEFTTL; + net->ipv4.sysctl_ip_fwd_update_priority = 1; net->ipv4.sysctl_ip_dynaddr = 0; net->ipv4.sysctl_ip_early_demux = 1; net->ipv4.sysctl_udp_early_demux = 1; @@ -1882,6 +1883,7 @@ fs_initcall(ipv4_offload_init); static struct packet_type ip_packet_type __read_mostly = { .type = cpu_to_be16(ETH_P_IP), .func = ip_rcv, + .list_func = ip_list_rcv, }; static int __init inet_init(void) diff --git a/net/ipv4/bpfilter/Makefile b/net/ipv4/bpfilter/Makefile index ce262d76cc48..e9e42f99725e 100644 --- a/net/ipv4/bpfilter/Makefile +++ b/net/ipv4/bpfilter/Makefile @@ -1,2 +1 @@ obj-$(CONFIG_BPFILTER) += sockopt.o - diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index d7585ab1a77a..ea4bd8a52422 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -1827,6 +1827,8 @@ static int inet_netconf_msgsize_devconf(int type) size += nla_total_size(4); if (all || type == NETCONFA_MC_FORWARDING) size += nla_total_size(4); + if (all || type == NETCONFA_BC_FORWARDING) + size += nla_total_size(4); if (all || type == NETCONFA_PROXY_NEIGH) size += nla_total_size(4); if (all || type == NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN) @@ -1873,6 +1875,10 @@ static int inet_netconf_fill_devconf(struct sk_buff *skb, int ifindex, nla_put_s32(skb, NETCONFA_MC_FORWARDING, IPV4_DEVCONF(*devconf, MC_FORWARDING)) < 0) goto nla_put_failure; + if ((all || type == NETCONFA_BC_FORWARDING) && + nla_put_s32(skb, NETCONFA_BC_FORWARDING, + IPV4_DEVCONF(*devconf, BC_FORWARDING)) < 0) + goto nla_put_failure; if ((all || type == NETCONFA_PROXY_NEIGH) && nla_put_s32(skb, NETCONFA_PROXY_NEIGH, IPV4_DEVCONF(*devconf, PROXY_ARP)) < 0) @@ -2143,6 +2149,10 @@ static int devinet_conf_proc(struct ctl_table *ctl, int write, if ((new_value == 0) && (old_value != 0)) rt_cache_flush(net); + if (i == IPV4_DEVCONF_BC_FORWARDING - 1 && + new_value != old_value) + rt_cache_flush(net); + if (i == IPV4_DEVCONF_RP_FILTER - 1 && new_value != old_value) { ifindex = devinet_conf_ifindex(net, cnf); @@ -2259,6 +2269,7 @@ static struct devinet_sysctl_table { DEVINET_SYSCTL_COMPLEX_ENTRY(FORWARDING, "forwarding", devinet_sysctl_forward), DEVINET_SYSCTL_RO_ENTRY(MC_FORWARDING, "mc_forwarding"), + DEVINET_SYSCTL_RW_ENTRY(BC_FORWARDING, "bc_forwarding"), DEVINET_SYSCTL_RW_ENTRY(ACCEPT_REDIRECTS, "accept_redirects"), DEVINET_SYSCTL_RW_ENTRY(SECURE_REDIRECTS, "secure_redirects"), diff --git a/net/ipv4/esp4_offload.c b/net/ipv4/esp4_offload.c index 7cf755ef9efb..58834a10c0be 100644 --- a/net/ipv4/esp4_offload.c +++ b/net/ipv4/esp4_offload.c @@ -28,8 +28,8 @@ #include <linux/spinlock.h> #include <net/udp.h> -static struct sk_buff **esp4_gro_receive(struct sk_buff **head, - struct sk_buff *skb) +static struct sk_buff *esp4_gro_receive(struct list_head *head, + struct sk_buff *skb) { int offset = skb_gro_offset(skb); struct xfrm_offload *xo; @@ -135,8 +135,7 @@ static struct sk_buff *esp4_gso_segment(struct sk_buff *skb, skb->encap_hdr_csum = 1; - if (!(features & NETIF_F_HW_ESP) || !x->xso.offload_handle || - (x->xso.dev != skb->dev)) + if (!(features & NETIF_F_HW_ESP) || x->xso.dev != skb->dev) esp_features = features & ~(NETIF_F_SG | NETIF_F_CSUM_MASK); else if (!(features & NETIF_F_HW_ESP_TX_CSUM)) esp_features = features & ~NETIF_F_CSUM_MASK; @@ -179,8 +178,7 @@ static int esp_xmit(struct xfrm_state *x, struct sk_buff *skb, netdev_features_ if (!xo) return -EINVAL; - if (!(features & NETIF_F_HW_ESP) || !x->xso.offload_handle || - (x->xso.dev != skb->dev)) { + if (!(features & NETIF_F_HW_ESP) || x->xso.dev != skb->dev) { xo->flags |= CRYPTO_FALLBACK; hw_offload = false; } diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index b21833651394..2998b0e47d4b 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -292,18 +292,19 @@ __be32 fib_compute_spec_dst(struct sk_buff *skb) return ip_hdr(skb)->daddr; in_dev = __in_dev_get_rcu(dev); - BUG_ON(!in_dev); net = dev_net(dev); scope = RT_SCOPE_UNIVERSE; if (!ipv4_is_zeronet(ip_hdr(skb)->saddr)) { + bool vmark = in_dev && IN_DEV_SRC_VMARK(in_dev); struct flowi4 fl4 = { .flowi4_iif = LOOPBACK_IFINDEX, + .flowi4_oif = l3mdev_master_ifindex_rcu(dev), .daddr = ip_hdr(skb)->saddr, .flowi4_tos = RT_TOS(ip_hdr(skb)->tos), .flowi4_scope = scope, - .flowi4_mark = IN_DEV_SRC_VMARK(in_dev) ? skb->mark : 0, + .flowi4_mark = vmark ? skb->mark : 0, }; if (!fib_lookup(net, &fl4, &res, 0)) return FIB_RES_PREFSRC(net, res); diff --git a/net/ipv4/fou.c b/net/ipv4/fou.c index 1540db65241a..500a59906b87 100644 --- a/net/ipv4/fou.c +++ b/net/ipv4/fou.c @@ -224,14 +224,14 @@ drop: return 0; } -static struct sk_buff **fou_gro_receive(struct sock *sk, - struct sk_buff **head, - struct sk_buff *skb) +static struct sk_buff *fou_gro_receive(struct sock *sk, + struct list_head *head, + struct sk_buff *skb) { - const struct net_offload *ops; - struct sk_buff **pp = NULL; u8 proto = fou_from_sock(sk)->protocol; const struct net_offload **offloads; + const struct net_offload *ops; + struct sk_buff *pp = NULL; /* We can clear the encap_mark for FOU as we are essentially doing * one of two possible things. We are either adding an L4 tunnel @@ -305,13 +305,13 @@ static struct guehdr *gue_gro_remcsum(struct sk_buff *skb, unsigned int off, return guehdr; } -static struct sk_buff **gue_gro_receive(struct sock *sk, - struct sk_buff **head, - struct sk_buff *skb) +static struct sk_buff *gue_gro_receive(struct sock *sk, + struct list_head *head, + struct sk_buff *skb) { const struct net_offload **offloads; const struct net_offload *ops; - struct sk_buff **pp = NULL; + struct sk_buff *pp = NULL; struct sk_buff *p; struct guehdr *guehdr; size_t len, optlen, hdrlen, off; @@ -397,7 +397,7 @@ static struct sk_buff **gue_gro_receive(struct sock *sk, skb_gro_pull(skb, hdrlen); - for (p = *head; p; p = p->next) { + list_for_each_entry(p, head, list) { const struct guehdr *guehdr2; if (!NAPI_GRO_CB(p)->same_flow) @@ -448,9 +448,7 @@ next_proto: out_unlock: rcu_read_unlock(); out: - NAPI_GRO_CB(skb)->flush |= flush; - skb_gro_remcsum_cleanup(skb, &grc); - skb->remcsum_offload = 0; + skb_gro_flush_final_remcsum(skb, pp, flush, &grc); return pp; } diff --git a/net/ipv4/gre_offload.c b/net/ipv4/gre_offload.c index 1859c473b21a..6c63524f598a 100644 --- a/net/ipv4/gre_offload.c +++ b/net/ipv4/gre_offload.c @@ -108,10 +108,10 @@ out: return segs; } -static struct sk_buff **gre_gro_receive(struct sk_buff **head, - struct sk_buff *skb) +static struct sk_buff *gre_gro_receive(struct list_head *head, + struct sk_buff *skb) { - struct sk_buff **pp = NULL; + struct sk_buff *pp = NULL; struct sk_buff *p; const struct gre_base_hdr *greh; unsigned int hlen, grehlen; @@ -182,7 +182,7 @@ static struct sk_buff **gre_gro_receive(struct sk_buff **head, null_compute_pseudo); } - for (p = *head; p; p = p->next) { + list_for_each_entry(p, head, list) { const struct gre_base_hdr *greh2; if (!NAPI_GRO_CB(p)->same_flow) @@ -223,7 +223,7 @@ static struct sk_buff **gre_gro_receive(struct sk_buff **head, out_unlock: rcu_read_unlock(); out: - NAPI_GRO_CB(skb)->flush |= flush; + skb_gro_flush_final(skb, pp, flush); return pp; } diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 1617604c9284..695979b7ef6d 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -429,14 +429,11 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb) icmp_param->data.icmph.checksum = 0; + ipcm_init(&ipc); inet->tos = ip_hdr(skb)->tos; sk->sk_mark = mark; daddr = ipc.addr = ip_hdr(skb)->saddr; saddr = fib_compute_spec_dst(skb); - ipc.opt = NULL; - ipc.tx_flags = 0; - ipc.ttl = 0; - ipc.tos = -1; if (icmp_param->replyopts.opt.opt.optlen) { ipc.opt = &icmp_param->replyopts.opt; @@ -710,11 +707,9 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info) icmp_param.offset = skb_network_offset(skb_in); inet_sk(sk)->tos = tos; sk->sk_mark = mark; + ipcm_init(&ipc); ipc.addr = iph->saddr; ipc.opt = &icmp_param.replyopts.opt; - ipc.tx_flags = 0; - ipc.ttl = 0; - ipc.tos = -1; rt = icmp_route_lookup(net, &fl4, skb_in, iph, saddr, tos, mark, type, code, &icmp_param); diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index 85b617b655bc..cf75f8944b05 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -1200,13 +1200,13 @@ static void igmpv3_del_delrec(struct in_device *in_dev, struct ip_mc_list *im) spin_lock_bh(&im->lock); if (pmc) { im->interface = pmc->interface; - im->crcount = in_dev->mr_qrv ?: net->ipv4.sysctl_igmp_qrv; - im->sfmode = pmc->sfmode; - if (pmc->sfmode == MCAST_INCLUDE) { + if (im->sfmode == MCAST_INCLUDE) { im->tomb = pmc->tomb; im->sources = pmc->sources; for (psf = im->sources; psf; psf = psf->sf_next) - psf->sf_crcount = im->crcount; + psf->sf_crcount = in_dev->mr_qrv ?: net->ipv4.sysctl_igmp_qrv; + } else { + im->crcount = in_dev->mr_qrv ?: net->ipv4.sysctl_igmp_qrv; } in_dev_put(pmc->interface); kfree(pmc); @@ -1316,7 +1316,13 @@ static void igmp_group_added(struct ip_mc_list *im) } /* else, v3 */ - im->crcount = in_dev->mr_qrv ?: net->ipv4.sysctl_igmp_qrv; + /* Based on RFC3376 5.1, for newly added INCLUDE SSM, we should + * not send filter-mode change record as the mode should be from + * IN() to IN(A). + */ + if (im->sfmode == MCAST_EXCLUDE) + im->crcount = in_dev->mr_qrv ?: net->ipv4.sysctl_igmp_qrv; + igmp_ifc_event(in_dev); #endif } @@ -1381,8 +1387,8 @@ static void ip_mc_hash_remove(struct in_device *in_dev, /* * A socket has joined a multicast group on device dev. */ - -void ip_mc_inc_group(struct in_device *in_dev, __be32 addr) +static void __ip_mc_inc_group(struct in_device *in_dev, __be32 addr, + unsigned int mode) { struct ip_mc_list *im; #ifdef CONFIG_IP_MULTICAST @@ -1394,7 +1400,7 @@ void ip_mc_inc_group(struct in_device *in_dev, __be32 addr) for_each_pmc_rtnl(in_dev, im) { if (im->multiaddr == addr) { im->users++; - ip_mc_add_src(in_dev, &addr, MCAST_EXCLUDE, 0, NULL, 0); + ip_mc_add_src(in_dev, &addr, mode, 0, NULL, 0); goto out; } } @@ -1408,8 +1414,8 @@ void ip_mc_inc_group(struct in_device *in_dev, __be32 addr) in_dev_hold(in_dev); im->multiaddr = addr; /* initial mode is (EX, empty) */ - im->sfmode = MCAST_EXCLUDE; - im->sfcount[MCAST_EXCLUDE] = 1; + im->sfmode = mode; + im->sfcount[mode] = 1; refcount_set(&im->refcnt, 1); spin_lock_init(&im->lock); #ifdef CONFIG_IP_MULTICAST @@ -1432,6 +1438,11 @@ void ip_mc_inc_group(struct in_device *in_dev, __be32 addr) out: return; } + +void ip_mc_inc_group(struct in_device *in_dev, __be32 addr) +{ + __ip_mc_inc_group(in_dev, addr, MCAST_EXCLUDE); +} EXPORT_SYMBOL(ip_mc_inc_group); static int ip_mc_check_iphdr(struct sk_buff *skb) @@ -2130,8 +2141,8 @@ static void ip_mc_clear_src(struct ip_mc_list *pmc) /* Join a multicast group */ - -int ip_mc_join_group(struct sock *sk, struct ip_mreqn *imr) +static int __ip_mc_join_group(struct sock *sk, struct ip_mreqn *imr, + unsigned int mode) { __be32 addr = imr->imr_multiaddr.s_addr; struct ip_mc_socklist *iml, *i; @@ -2172,15 +2183,30 @@ int ip_mc_join_group(struct sock *sk, struct ip_mreqn *imr) memcpy(&iml->multi, imr, sizeof(*imr)); iml->next_rcu = inet->mc_list; iml->sflist = NULL; - iml->sfmode = MCAST_EXCLUDE; + iml->sfmode = mode; rcu_assign_pointer(inet->mc_list, iml); - ip_mc_inc_group(in_dev, addr); + __ip_mc_inc_group(in_dev, addr, mode); err = 0; done: return err; } + +/* Join ASM (Any-Source Multicast) group + */ +int ip_mc_join_group(struct sock *sk, struct ip_mreqn *imr) +{ + return __ip_mc_join_group(sk, imr, MCAST_EXCLUDE); +} EXPORT_SYMBOL(ip_mc_join_group); +/* Join SSM (Source-Specific Multicast) group + */ +int ip_mc_join_group_ssm(struct sock *sk, struct ip_mreqn *imr, + unsigned int mode) +{ + return __ip_mc_join_group(sk, imr, mode); +} + static int ip_mc_leave_src(struct sock *sk, struct ip_mc_socklist *iml, struct in_device *in_dev) { diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 33a88e045efd..dfd5009f96ef 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -107,6 +107,15 @@ bool inet_rcv_saddr_equal(const struct sock *sk, const struct sock *sk2, } EXPORT_SYMBOL(inet_rcv_saddr_equal); +bool inet_rcv_saddr_any(const struct sock *sk) +{ +#if IS_ENABLED(CONFIG_IPV6) + if (sk->sk_family == AF_INET6) + return ipv6_addr_any(&sk->sk_v6_rcv_saddr); +#endif + return !sk->sk_rcv_saddr; +} + void inet_get_local_port_range(struct net *net, int *low, int *high) { unsigned int seq; diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c index c9e35b81d093..bcb11f3a27c0 100644 --- a/net/ipv4/inet_fragment.c +++ b/net/ipv4/inet_fragment.c @@ -20,6 +20,7 @@ #include <linux/skbuff.h> #include <linux/rtnetlink.h> #include <linux/slab.h> +#include <linux/rhashtable.h> #include <net/sock.h> #include <net/inet_frag.h> @@ -90,7 +91,7 @@ static void inet_frags_free_cb(void *ptr, void *arg) void inet_frags_exit_net(struct netns_frags *nf) { - nf->low_thresh = 0; /* prevent creation of new frags */ + nf->high_thresh = 0; /* prevent creation of new frags */ rhashtable_free_and_destroy(&nf->rhashtable, inet_frags_free_cb, NULL); } @@ -136,12 +137,16 @@ void inet_frag_destroy(struct inet_frag_queue *q) fp = q->fragments; nf = q->net; f = nf->f; - while (fp) { - struct sk_buff *xp = fp->next; - - sum_truesize += fp->truesize; - kfree_skb(fp); - fp = xp; + if (fp) { + do { + struct sk_buff *xp = fp->next; + + sum_truesize += fp->truesize; + kfree_skb(fp); + fp = xp; + } while (fp); + } else { + sum_truesize = inet_frag_rbtree_purge(&q->rb_fragments); } sum = sum_truesize + f->qsize; @@ -157,9 +162,6 @@ static struct inet_frag_queue *inet_frag_alloc(struct netns_frags *nf, { struct inet_frag_queue *q; - if (!nf->high_thresh || frag_mem_limit(nf) > nf->high_thresh) - return NULL; - q = kmem_cache_zalloc(f->frags_cachep, GFP_ATOMIC); if (!q) return NULL; @@ -204,6 +206,9 @@ struct inet_frag_queue *inet_frag_find(struct netns_frags *nf, void *key) { struct inet_frag_queue *fq; + if (!nf->high_thresh || frag_mem_limit(nf) > nf->high_thresh) + return NULL; + rcu_read_lock(); fq = rhashtable_lookup(&nf->rhashtable, key, nf->f->rhash_params); diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index 31ff46daae97..f5c9ef2586de 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -243,9 +243,9 @@ static inline int compute_score(struct sock *sk, struct net *net, bool dev_match = (sk->sk_bound_dev_if == dif || sk->sk_bound_dev_if == sdif); - if (exact_dif && !dev_match) + if (!dev_match) return -1; - if (sk->sk_bound_dev_if && dev_match) + if (sk->sk_bound_dev_if) score += 4; } if (sk->sk_incoming_cpu == raw_smp_processor_id()) @@ -328,7 +328,7 @@ struct sock *__inet_lookup_listener(struct net *net, saddr, sport, daddr, hnum, dif, sdif); if (result) - return result; + goto done; /* Lookup lhash2 with INADDR_ANY */ @@ -337,9 +337,10 @@ struct sock *__inet_lookup_listener(struct net *net, if (ilb2->count > ilb->count) goto port_lookup; - return inet_lhash2_lookup(net, ilb2, skb, doff, - saddr, sport, daddr, hnum, - dif, sdif); + result = inet_lhash2_lookup(net, ilb2, skb, doff, + saddr, sport, daddr, hnum, + dif, sdif); + goto done; port_lookup: sk_for_each_rcu(sk, &ilb->head) { @@ -352,12 +353,15 @@ port_lookup: result = reuseport_select_sock(sk, phash, skb, doff); if (result) - return result; + goto done; } result = sk; hiscore = score; } } +done: + if (unlikely(IS_ERR(result))) + return NULL; return result; } EXPORT_SYMBOL_GPL(__inet_lookup_listener); @@ -567,10 +571,11 @@ static int inet_reuseport_add_sock(struct sock *sk, inet_csk(sk2)->icsk_bind_hash == tb && sk2->sk_reuseport && uid_eq(uid, sock_i_uid(sk2)) && inet_rcv_saddr_equal(sk, sk2, false)) - return reuseport_add_sock(sk, sk2); + return reuseport_add_sock(sk, sk2, + inet_rcv_saddr_any(sk)); } - return reuseport_alloc(sk); + return reuseport_alloc(sk, inet_rcv_saddr_any(sk)); } int __inet_hash(struct sock *sk, struct sock *osk) diff --git a/net/ipv4/ip_forward.c b/net/ipv4/ip_forward.c index b54b948b0596..32662e9e5d21 100644 --- a/net/ipv4/ip_forward.c +++ b/net/ipv4/ip_forward.c @@ -143,7 +143,8 @@ int ip_forward(struct sk_buff *skb) !skb_sec_path(skb)) ip_rt_send_redirect(skb); - skb->priority = rt_tos2priority(iph->tos); + if (net->ipv4.sysctl_ip_fwd_update_priority) + skb->priority = rt_tos2priority(iph->tos); return NF_HOOK(NFPROTO_IPV4, NF_INET_FORWARD, net, NULL, skb, skb->dev, rt->dst.dev, diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index 8e9528ebaa8e..88281fbce88c 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -57,6 +57,57 @@ */ static const char ip_frag_cache_name[] = "ip4-frags"; +/* Use skb->cb to track consecutive/adjacent fragments coming at + * the end of the queue. Nodes in the rb-tree queue will + * contain "runs" of one or more adjacent fragments. + * + * Invariants: + * - next_frag is NULL at the tail of a "run"; + * - the head of a "run" has the sum of all fragment lengths in frag_run_len. + */ +struct ipfrag_skb_cb { + struct inet_skb_parm h; + struct sk_buff *next_frag; + int frag_run_len; +}; + +#define FRAG_CB(skb) ((struct ipfrag_skb_cb *)((skb)->cb)) + +static void ip4_frag_init_run(struct sk_buff *skb) +{ + BUILD_BUG_ON(sizeof(struct ipfrag_skb_cb) > sizeof(skb->cb)); + + FRAG_CB(skb)->next_frag = NULL; + FRAG_CB(skb)->frag_run_len = skb->len; +} + +/* Append skb to the last "run". */ +static void ip4_frag_append_to_last_run(struct inet_frag_queue *q, + struct sk_buff *skb) +{ + RB_CLEAR_NODE(&skb->rbnode); + FRAG_CB(skb)->next_frag = NULL; + + FRAG_CB(q->last_run_head)->frag_run_len += skb->len; + FRAG_CB(q->fragments_tail)->next_frag = skb; + q->fragments_tail = skb; +} + +/* Create a new "run" with the skb. */ +static void ip4_frag_create_run(struct inet_frag_queue *q, struct sk_buff *skb) +{ + if (q->last_run_head) + rb_link_node(&skb->rbnode, &q->last_run_head->rbnode, + &q->last_run_head->rbnode.rb_right); + else + rb_link_node(&skb->rbnode, NULL, &q->rb_fragments.rb_node); + rb_insert_color(&skb->rbnode, &q->rb_fragments); + + ip4_frag_init_run(skb); + q->fragments_tail = skb; + q->last_run_head = skb; +} + /* Describe an entry in the "incomplete datagrams" queue. */ struct ipq { struct inet_frag_queue q; @@ -75,8 +126,8 @@ static u8 ip4_frag_ecn(u8 tos) static struct inet_frags ip4_frags; -static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev, - struct net_device *dev); +static int ip_frag_reasm(struct ipq *qp, struct sk_buff *skb, + struct sk_buff *prev_tail, struct net_device *dev); static void ip4_frag_init(struct inet_frag_queue *q, const void *a) @@ -136,7 +187,7 @@ static void ip_expire(struct timer_list *t) { struct inet_frag_queue *frag = from_timer(frag, t, timer); const struct iphdr *iph; - struct sk_buff *head; + struct sk_buff *head = NULL; struct net *net; struct ipq *qp; int err; @@ -152,14 +203,36 @@ static void ip_expire(struct timer_list *t) ipq_kill(qp); __IP_INC_STATS(net, IPSTATS_MIB_REASMFAILS); - - head = qp->q.fragments; - __IP_INC_STATS(net, IPSTATS_MIB_REASMTIMEOUT); - if (!(qp->q.flags & INET_FRAG_FIRST_IN) || !head) + if (!(qp->q.flags & INET_FRAG_FIRST_IN)) goto out; + /* sk_buff::dev and sk_buff::rbnode are unionized. So we + * pull the head out of the tree in order to be able to + * deal with head->dev. + */ + if (qp->q.fragments) { + head = qp->q.fragments; + qp->q.fragments = head->next; + } else { + head = skb_rb_first(&qp->q.rb_fragments); + if (!head) + goto out; + if (FRAG_CB(head)->next_frag) + rb_replace_node(&head->rbnode, + &FRAG_CB(head)->next_frag->rbnode, + &qp->q.rb_fragments); + else + rb_erase(&head->rbnode, &qp->q.rb_fragments); + memset(&head->rbnode, 0, sizeof(head->rbnode)); + barrier(); + } + if (head == qp->q.fragments_tail) + qp->q.fragments_tail = NULL; + + sub_frag_mem_limit(qp->q.net, head->truesize); + head->dev = dev_get_by_index_rcu(net, qp->iif); if (!head->dev) goto out; @@ -179,16 +252,16 @@ static void ip_expire(struct timer_list *t) (skb_rtable(head)->rt_type != RTN_LOCAL)) goto out; - skb_get(head); spin_unlock(&qp->q.lock); icmp_send(head, ICMP_TIME_EXCEEDED, ICMP_EXC_FRAGTIME, 0); - kfree_skb(head); goto out_rcu_unlock; out: spin_unlock(&qp->q.lock); out_rcu_unlock: rcu_read_unlock(); + if (head) + kfree_skb(head); ipq_put(qp); } @@ -231,7 +304,7 @@ static int ip_frag_too_far(struct ipq *qp) end = atomic_inc_return(&peer->rid); qp->rid = end; - rc = qp->q.fragments && (end - start) > max; + rc = qp->q.fragments_tail && (end - start) > max; if (rc) { struct net *net; @@ -245,7 +318,6 @@ static int ip_frag_too_far(struct ipq *qp) static int ip_frag_reinit(struct ipq *qp) { - struct sk_buff *fp; unsigned int sum_truesize = 0; if (!mod_timer(&qp->q.timer, jiffies + qp->q.net->timeout)) { @@ -253,21 +325,16 @@ static int ip_frag_reinit(struct ipq *qp) return -ETIMEDOUT; } - fp = qp->q.fragments; - do { - struct sk_buff *xp = fp->next; - - sum_truesize += fp->truesize; - kfree_skb(fp); - fp = xp; - } while (fp); + sum_truesize = inet_frag_rbtree_purge(&qp->q.rb_fragments); sub_frag_mem_limit(qp->q.net, sum_truesize); qp->q.flags = 0; qp->q.len = 0; qp->q.meat = 0; qp->q.fragments = NULL; + qp->q.rb_fragments = RB_ROOT; qp->q.fragments_tail = NULL; + qp->q.last_run_head = NULL; qp->iif = 0; qp->ecn = 0; @@ -277,7 +344,9 @@ static int ip_frag_reinit(struct ipq *qp) /* Add new segment to existing queue. */ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb) { - struct sk_buff *prev, *next; + struct net *net = container_of(qp->q.net, struct net, ipv4.frags); + struct rb_node **rbn, *parent; + struct sk_buff *skb1, *prev_tail; struct net_device *dev; unsigned int fragsize; int flags, offset; @@ -340,95 +409,61 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb) if (err) goto err; - /* Find out which fragments are in front and at the back of us - * in the chain of fragments so far. We must know where to put - * this fragment, right? - */ - prev = qp->q.fragments_tail; - if (!prev || prev->ip_defrag_offset < offset) { - next = NULL; - goto found; - } - prev = NULL; - for (next = qp->q.fragments; next != NULL; next = next->next) { - if (next->ip_defrag_offset >= offset) - break; /* bingo! */ - prev = next; - } + /* Note : skb->rbnode and skb->dev share the same location. */ + dev = skb->dev; + /* Makes sure compiler wont do silly aliasing games */ + barrier(); -found: - /* We found where to put this one. Check for overlap with - * preceding fragment, and, if needed, align things so that - * any overlaps are eliminated. + /* RFC5722, Section 4, amended by Errata ID : 3089 + * When reassembling an IPv6 datagram, if + * one or more its constituent fragments is determined to be an + * overlapping fragment, the entire datagram (and any constituent + * fragments) MUST be silently discarded. + * + * We do the same here for IPv4 (and increment an snmp counter). */ - if (prev) { - int i = (prev->ip_defrag_offset + prev->len) - offset; - - if (i > 0) { - offset += i; - err = -EINVAL; - if (end <= offset) - goto err; - err = -ENOMEM; - if (!pskb_pull(skb, i)) - goto err; - if (skb->ip_summed != CHECKSUM_UNNECESSARY) - skb->ip_summed = CHECKSUM_NONE; - } - } - err = -ENOMEM; - - while (next && next->ip_defrag_offset < end) { - int i = end - next->ip_defrag_offset; /* overlap is 'i' bytes */ - - if (i < next->len) { - /* Eat head of the next overlapped fragment - * and leave the loop. The next ones cannot overlap. - */ - if (!pskb_pull(next, i)) - goto err; - next->ip_defrag_offset += i; - qp->q.meat -= i; - if (next->ip_summed != CHECKSUM_UNNECESSARY) - next->ip_summed = CHECKSUM_NONE; - break; - } else { - struct sk_buff *free_it = next; - - /* Old fragment is completely overridden with - * new one drop it. - */ - next = next->next; - - if (prev) - prev->next = next; - else - qp->q.fragments = next; - - qp->q.meat -= free_it->len; - sub_frag_mem_limit(qp->q.net, free_it->truesize); - kfree_skb(free_it); - } + /* Find out where to put this fragment. */ + prev_tail = qp->q.fragments_tail; + if (!prev_tail) + ip4_frag_create_run(&qp->q, skb); /* First fragment. */ + else if (prev_tail->ip_defrag_offset + prev_tail->len < end) { + /* This is the common case: skb goes to the end. */ + /* Detect and discard overlaps. */ + if (offset < prev_tail->ip_defrag_offset + prev_tail->len) + goto discard_qp; + if (offset == prev_tail->ip_defrag_offset + prev_tail->len) + ip4_frag_append_to_last_run(&qp->q, skb); + else + ip4_frag_create_run(&qp->q, skb); + } else { + /* Binary search. Note that skb can become the first fragment, + * but not the last (covered above). + */ + rbn = &qp->q.rb_fragments.rb_node; + do { + parent = *rbn; + skb1 = rb_to_skb(parent); + if (end <= skb1->ip_defrag_offset) + rbn = &parent->rb_left; + else if (offset >= skb1->ip_defrag_offset + + FRAG_CB(skb1)->frag_run_len) + rbn = &parent->rb_right; + else /* Found an overlap with skb1. */ + goto discard_qp; + } while (*rbn); + /* Here we have parent properly set, and rbn pointing to + * one of its NULL left/right children. Insert skb. + */ + ip4_frag_init_run(skb); + rb_link_node(&skb->rbnode, parent, rbn); + rb_insert_color(&skb->rbnode, &qp->q.rb_fragments); } - /* Note : skb->ip_defrag_offset and skb->dev share the same location */ - dev = skb->dev; if (dev) qp->iif = dev->ifindex; - /* Makes sure compiler wont do silly aliasing games */ - barrier(); skb->ip_defrag_offset = offset; - /* Insert this fragment in the chain of fragments. */ - skb->next = next; - if (!next) - qp->q.fragments_tail = skb; - if (prev) - prev->next = skb; - else - qp->q.fragments = skb; - qp->q.stamp = skb->tstamp; qp->q.meat += skb->len; qp->ecn |= ecn; @@ -450,7 +485,7 @@ found: unsigned long orefdst = skb->_skb_refdst; skb->_skb_refdst = 0UL; - err = ip_frag_reasm(qp, prev, dev); + err = ip_frag_reasm(qp, skb, prev_tail, dev); skb->_skb_refdst = orefdst; return err; } @@ -458,20 +493,24 @@ found: skb_dst_drop(skb); return -EINPROGRESS; +discard_qp: + inet_frag_kill(&qp->q); + err = -EINVAL; + __IP_INC_STATS(net, IPSTATS_MIB_REASM_OVERLAPS); err: kfree_skb(skb); return err; } - /* Build a new IP datagram from all its fragments. */ - -static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev, - struct net_device *dev) +static int ip_frag_reasm(struct ipq *qp, struct sk_buff *skb, + struct sk_buff *prev_tail, struct net_device *dev) { struct net *net = container_of(qp->q.net, struct net, ipv4.frags); struct iphdr *iph; - struct sk_buff *fp, *head = qp->q.fragments; + struct sk_buff *fp, *head = skb_rb_first(&qp->q.rb_fragments); + struct sk_buff **nextp; /* To build frag_list. */ + struct rb_node *rbn; int len; int ihlen; int err; @@ -485,25 +524,26 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev, goto out_fail; } /* Make the one we just received the head. */ - if (prev) { - head = prev->next; - fp = skb_clone(head, GFP_ATOMIC); + if (head != skb) { + fp = skb_clone(skb, GFP_ATOMIC); if (!fp) goto out_nomem; - - fp->next = head->next; - if (!fp->next) + FRAG_CB(fp)->next_frag = FRAG_CB(skb)->next_frag; + if (RB_EMPTY_NODE(&skb->rbnode)) + FRAG_CB(prev_tail)->next_frag = fp; + else + rb_replace_node(&skb->rbnode, &fp->rbnode, + &qp->q.rb_fragments); + if (qp->q.fragments_tail == skb) qp->q.fragments_tail = fp; - prev->next = fp; - - skb_morph(head, qp->q.fragments); - head->next = qp->q.fragments->next; - - consume_skb(qp->q.fragments); - qp->q.fragments = head; + skb_morph(skb, head); + FRAG_CB(skb)->next_frag = FRAG_CB(head)->next_frag; + rb_replace_node(&head->rbnode, &skb->rbnode, + &qp->q.rb_fragments); + consume_skb(head); + head = skb; } - WARN_ON(!head); WARN_ON(head->ip_defrag_offset != 0); /* Allocate a new buffer for the datagram. */ @@ -528,35 +568,60 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev, clone = alloc_skb(0, GFP_ATOMIC); if (!clone) goto out_nomem; - clone->next = head->next; - head->next = clone; skb_shinfo(clone)->frag_list = skb_shinfo(head)->frag_list; skb_frag_list_init(head); for (i = 0; i < skb_shinfo(head)->nr_frags; i++) plen += skb_frag_size(&skb_shinfo(head)->frags[i]); clone->len = clone->data_len = head->data_len - plen; - head->data_len -= clone->len; - head->len -= clone->len; + head->truesize += clone->truesize; clone->csum = 0; clone->ip_summed = head->ip_summed; add_frag_mem_limit(qp->q.net, clone->truesize); + skb_shinfo(head)->frag_list = clone; + nextp = &clone->next; + } else { + nextp = &skb_shinfo(head)->frag_list; } - skb_shinfo(head)->frag_list = head->next; skb_push(head, head->data - skb_network_header(head)); - for (fp=head->next; fp; fp = fp->next) { - head->data_len += fp->len; - head->len += fp->len; - if (head->ip_summed != fp->ip_summed) - head->ip_summed = CHECKSUM_NONE; - else if (head->ip_summed == CHECKSUM_COMPLETE) - head->csum = csum_add(head->csum, fp->csum); - head->truesize += fp->truesize; + /* Traverse the tree in order, to build frag_list. */ + fp = FRAG_CB(head)->next_frag; + rbn = rb_next(&head->rbnode); + rb_erase(&head->rbnode, &qp->q.rb_fragments); + while (rbn || fp) { + /* fp points to the next sk_buff in the current run; + * rbn points to the next run. + */ + /* Go through the current run. */ + while (fp) { + *nextp = fp; + nextp = &fp->next; + fp->prev = NULL; + memset(&fp->rbnode, 0, sizeof(fp->rbnode)); + head->data_len += fp->len; + head->len += fp->len; + if (head->ip_summed != fp->ip_summed) + head->ip_summed = CHECKSUM_NONE; + else if (head->ip_summed == CHECKSUM_COMPLETE) + head->csum = csum_add(head->csum, fp->csum); + head->truesize += fp->truesize; + fp = FRAG_CB(fp)->next_frag; + } + /* Move to the next run. */ + if (rbn) { + struct rb_node *rbnext = rb_next(rbn); + + fp = rb_to_skb(rbn); + rb_erase(rbn, &qp->q.rb_fragments); + rbn = rbnext; + } } sub_frag_mem_limit(qp->q.net, head->truesize); + *nextp = NULL; head->next = NULL; + head->prev = NULL; head->dev = dev; head->tstamp = qp->q.stamp; IPCB(head)->frag_max_size = max(qp->max_df_size, qp->q.max_size); @@ -584,7 +649,9 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev, __IP_INC_STATS(net, IPSTATS_MIB_REASMOKS); qp->q.fragments = NULL; + qp->q.rb_fragments = RB_ROOT; qp->q.fragments_tail = NULL; + qp->q.last_run_head = NULL; return 0; out_nomem: @@ -666,6 +733,28 @@ struct sk_buff *ip_check_defrag(struct net *net, struct sk_buff *skb, u32 user) } EXPORT_SYMBOL(ip_check_defrag); +unsigned int inet_frag_rbtree_purge(struct rb_root *root) +{ + struct rb_node *p = rb_first(root); + unsigned int sum = 0; + + while (p) { + struct sk_buff *skb = rb_entry(p, struct sk_buff, rbnode); + + p = rb_next(p); + rb_erase(&skb->rbnode, root); + while (skb) { + struct sk_buff *next = FRAG_CB(skb)->next_frag; + + sum += skb->truesize; + kfree_skb(skb); + skb = next; + } + } + return sum; +} +EXPORT_SYMBOL(inet_frag_rbtree_purge); + #ifdef CONFIG_SYSCTL static int dist_min; diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 2d8efeecf619..51a5d06085ac 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -587,6 +587,8 @@ static void erspan_fb_xmit(struct sk_buff *skb, struct net_device *dev, goto err_free_skb; key = &tun_info->key; + if (!(tun_info->key.tun_flags & TUNNEL_ERSPAN_OPT)) + goto err_free_rt; md = ip_tunnel_info_opts(tun_info); if (!md) goto err_free_rt; @@ -983,7 +985,6 @@ static void ipgre_tunnel_setup(struct net_device *dev) static void __gre_tunnel_init(struct net_device *dev) { struct ip_tunnel *tunnel; - int t_hlen; tunnel = netdev_priv(dev); tunnel->tun_hlen = gre_calc_hlen(tunnel->parms.o_flags); @@ -991,8 +992,6 @@ static void __gre_tunnel_init(struct net_device *dev) tunnel->hlen = tunnel->tun_hlen + tunnel->encap_hlen; - t_hlen = tunnel->hlen + sizeof(struct iphdr); - dev->features |= GRE_FEATURES; dev->hw_features |= GRE_FEATURES; @@ -1302,13 +1301,11 @@ static const struct net_device_ops gre_tap_netdev_ops = { static int erspan_tunnel_init(struct net_device *dev) { struct ip_tunnel *tunnel = netdev_priv(dev); - int t_hlen; tunnel->tun_hlen = 8; tunnel->parms.iph.protocol = IPPROTO_GRE; tunnel->hlen = tunnel->tun_hlen + tunnel->encap_hlen + erspan_hdr_len(tunnel->erspan_ver); - t_hlen = tunnel->hlen + sizeof(struct iphdr); dev->features |= GRE_FEATURES; dev->hw_features |= GRE_FEATURES; diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index 7582713dd18f..3196cf58f418 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -307,7 +307,8 @@ drop: return true; } -static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb) +static int ip_rcv_finish_core(struct net *net, struct sock *sk, + struct sk_buff *skb) { const struct iphdr *iph = ip_hdr(skb); int (*edemux)(struct sk_buff *skb); @@ -315,13 +316,6 @@ static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb) struct rtable *rt; int err; - /* if ingress device is enslaved to an L3 master device pass the - * skb to its handler for processing - */ - skb = l3mdev_ip_rcv(skb); - if (!skb) - return NET_RX_SUCCESS; - if (net->ipv4.sysctl_ip_early_demux && !skb_dst(skb) && !skb->sk && @@ -393,7 +387,7 @@ static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb) goto drop; } - return dst_input(skb); + return NET_RX_SUCCESS; drop: kfree_skb(skb); @@ -405,13 +399,29 @@ drop_error: goto drop; } +static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb) +{ + int ret; + + /* if ingress device is enslaved to an L3 master device pass the + * skb to its handler for processing + */ + skb = l3mdev_ip_rcv(skb); + if (!skb) + return NET_RX_SUCCESS; + + ret = ip_rcv_finish_core(net, sk, skb); + if (ret != NET_RX_DROP) + ret = dst_input(skb); + return ret; +} + /* * Main IP Receive routine. */ -int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) +static struct sk_buff *ip_rcv_core(struct sk_buff *skb, struct net *net) { const struct iphdr *iph; - struct net *net; u32 len; /* When the interface is in promisc. mode, drop all the crap @@ -421,7 +431,6 @@ int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, goto drop; - net = dev_net(dev); __IP_UPD_PO_STATS(net, IPSTATS_MIB_IN, skb->len); skb = skb_share_check(skb, GFP_ATOMIC); @@ -489,9 +498,7 @@ int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, /* Must drop socket now because of tproxy. */ skb_orphan(skb); - return NF_HOOK(NFPROTO_IPV4, NF_INET_PRE_ROUTING, - net, NULL, skb, dev, NULL, - ip_rcv_finish); + return skb; csum_error: __IP_INC_STATS(net, IPSTATS_MIB_CSUMERRORS); @@ -500,5 +507,113 @@ inhdr_error: drop: kfree_skb(skb); out: - return NET_RX_DROP; + return NULL; +} + +/* + * IP receive entry point + */ +int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, + struct net_device *orig_dev) +{ + struct net *net = dev_net(dev); + + skb = ip_rcv_core(skb, net); + if (skb == NULL) + return NET_RX_DROP; + return NF_HOOK(NFPROTO_IPV4, NF_INET_PRE_ROUTING, + net, NULL, skb, dev, NULL, + ip_rcv_finish); +} + +static void ip_sublist_rcv_finish(struct list_head *head) +{ + struct sk_buff *skb, *next; + + list_for_each_entry_safe(skb, next, head, list) { + list_del(&skb->list); + /* Handle ip{6}_forward case, as sch_direct_xmit have + * another kind of SKB-list usage (see validate_xmit_skb_list) + */ + skb->next = NULL; + dst_input(skb); + } +} + +static void ip_list_rcv_finish(struct net *net, struct sock *sk, + struct list_head *head) +{ + struct dst_entry *curr_dst = NULL; + struct sk_buff *skb, *next; + struct list_head sublist; + + INIT_LIST_HEAD(&sublist); + list_for_each_entry_safe(skb, next, head, list) { + struct dst_entry *dst; + + list_del(&skb->list); + /* if ingress device is enslaved to an L3 master device pass the + * skb to its handler for processing + */ + skb = l3mdev_ip_rcv(skb); + if (!skb) + continue; + if (ip_rcv_finish_core(net, sk, skb) == NET_RX_DROP) + continue; + + dst = skb_dst(skb); + if (curr_dst != dst) { + /* dispatch old sublist */ + if (!list_empty(&sublist)) + ip_sublist_rcv_finish(&sublist); + /* start new sublist */ + INIT_LIST_HEAD(&sublist); + curr_dst = dst; + } + list_add_tail(&skb->list, &sublist); + } + /* dispatch final sublist */ + ip_sublist_rcv_finish(&sublist); +} + +static void ip_sublist_rcv(struct list_head *head, struct net_device *dev, + struct net *net) +{ + NF_HOOK_LIST(NFPROTO_IPV4, NF_INET_PRE_ROUTING, net, NULL, + head, dev, NULL, ip_rcv_finish); + ip_list_rcv_finish(net, NULL, head); +} + +/* Receive a list of IP packets */ +void ip_list_rcv(struct list_head *head, struct packet_type *pt, + struct net_device *orig_dev) +{ + struct net_device *curr_dev = NULL; + struct net *curr_net = NULL; + struct sk_buff *skb, *next; + struct list_head sublist; + + INIT_LIST_HEAD(&sublist); + list_for_each_entry_safe(skb, next, head, list) { + struct net_device *dev = skb->dev; + struct net *net = dev_net(dev); + + list_del(&skb->list); + skb = ip_rcv_core(skb, net); + if (skb == NULL) + continue; + + if (curr_dev != dev || curr_net != net) { + /* dispatch old sublist */ + if (!list_empty(&sublist)) + ip_sublist_rcv(&sublist, curr_dev, curr_net); + /* start new sublist */ + INIT_LIST_HEAD(&sublist); + curr_dev = dev; + curr_net = net; + } + list_add_tail(&skb->list, &sublist); + } + /* dispatch final sublist */ + ip_sublist_rcv(&sublist, curr_dev, curr_net); } diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index af5a830ff6ad..9c4e72e9c60a 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -423,7 +423,8 @@ static void ip_copy_addrs(struct iphdr *iph, const struct flowi4 *fl4) } /* Note: skb->sk can be different from sk, in case of tunnels */ -int ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl) +int __ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl, + __u8 tos) { struct inet_sock *inet = inet_sk(sk); struct net *net = sock_net(sk); @@ -462,7 +463,7 @@ int ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl) inet->inet_dport, inet->inet_sport, sk->sk_protocol, - RT_CONN_FLAGS(sk), + RT_CONN_FLAGS_TOS(sk, tos), sk->sk_bound_dev_if); if (IS_ERR(rt)) goto no_route; @@ -478,7 +479,7 @@ packet_routed: skb_push(skb, sizeof(struct iphdr) + (inet_opt ? inet_opt->opt.optlen : 0)); skb_reset_network_header(skb); iph = ip_hdr(skb); - *((__be16 *)iph) = htons((4 << 12) | (5 << 8) | (inet->tos & 0xff)); + *((__be16 *)iph) = htons((4 << 12) | (5 << 8) | (tos & 0xff)); if (ip_dont_fragment(sk, &rt->dst) && !skb->ignore_df) iph->frag_off = htons(IP_DF); else @@ -511,7 +512,7 @@ no_route: kfree_skb(skb); return -EHOSTUNREACH; } -EXPORT_SYMBOL(ip_queue_xmit); +EXPORT_SYMBOL(__ip_queue_xmit); static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from) { @@ -523,6 +524,8 @@ static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from) to->dev = from->dev; to->mark = from->mark; + skb_copy_hash(to, from); + /* Copy the flags to each fragment. */ IPCB(to)->flags = IPCB(from)->flags; @@ -1145,13 +1148,15 @@ static int ip_setup_cork(struct sock *sk, struct inet_cork *cork, cork->fragsize = ip_sk_use_pmtu(sk) ? dst_mtu(&rt->dst) : rt->dst.dev->mtu; - cork->gso_size = sk->sk_type == SOCK_DGRAM ? ipc->gso_size : 0; + cork->gso_size = ipc->gso_size; cork->dst = &rt->dst; cork->length = 0; cork->ttl = ipc->ttl; cork->tos = ipc->tos; cork->priority = ipc->priority; - cork->tx_flags = ipc->tx_flags; + cork->transmit_time = ipc->sockc.transmit_time; + cork->tx_flags = 0; + sock_tx_timestamp(sk, ipc->sockc.tsflags, &cork->tx_flags); return 0; } @@ -1412,6 +1417,7 @@ struct sk_buff *__ip_make_skb(struct sock *sk, skb->priority = (cork->tos != -1) ? cork->priority: sk->sk_priority; skb->mark = sk->sk_mark; + skb->tstamp = cork->transmit_time; /* * Steal rt from cork.dst to avoid a pair of atomic_inc/atomic_dec * on dst refcount @@ -1544,11 +1550,8 @@ void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb, if (__ip_options_echo(net, &replyopts.opt.opt, skb, sopt)) return; + ipcm_init(&ipc); ipc.addr = daddr; - ipc.opt = NULL; - ipc.tx_flags = 0; - ipc.ttl = 0; - ipc.tos = -1; if (replyopts.opt.opt.optlen) { ipc.opt = &replyopts.opt; diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index fc32fdbeefa6..c0fe5ad996f2 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -150,15 +150,18 @@ static void ip_cmsg_recv_dstaddr(struct msghdr *msg, struct sk_buff *skb) { struct sockaddr_in sin; const struct iphdr *iph = ip_hdr(skb); - __be16 *ports = (__be16 *)skb_transport_header(skb); + __be16 *ports; + int end; - if (skb_transport_offset(skb) + 4 > (int)skb->len) + end = skb_transport_offset(skb) + 4; + if (end > 0 && !pskb_may_pull(skb, end)) return; /* All current transport protocols have the port numbers in the * first four bytes of the transport header and this function is * written with this assumption in mind. */ + ports = (__be16 *)skb_transport_header(skb); sin.sin_family = AF_INET; sin.sin_addr.s_addr = iph->daddr; @@ -984,7 +987,7 @@ static int do_ip_setsockopt(struct sock *sk, int level, mreq.imr_multiaddr.s_addr = mreqs.imr_multiaddr; mreq.imr_address.s_addr = mreqs.imr_interface; mreq.imr_ifindex = 0; - err = ip_mc_join_group(sk, &mreq); + err = ip_mc_join_group_ssm(sk, &mreq, MCAST_INCLUDE); if (err && err != -EADDRINUSE) break; omode = MCAST_INCLUDE; @@ -1061,7 +1064,7 @@ static int do_ip_setsockopt(struct sock *sk, int level, mreq.imr_multiaddr = psin->sin_addr; mreq.imr_address.s_addr = 0; mreq.imr_ifindex = greqs.gsr_interface; - err = ip_mc_join_group(sk, &mreq); + err = ip_mc_join_group_ssm(sk, &mreq, MCAST_INCLUDE); if (err && err != -EADDRINUSE) break; greqs.gsr_interface = mreq.imr_ifindex; diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 9f79b9803a16..5660adcf7a04 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -60,6 +60,7 @@ #include <linux/netfilter_ipv4.h> #include <linux/compat.h> #include <linux/export.h> +#include <linux/rhashtable.h> #include <net/ip_tunnels.h> #include <net/checksum.h> #include <net/netlink.h> @@ -1051,7 +1052,7 @@ static int ipmr_cache_report(struct mr_table *mrt, struct sk_buff *skb; int ret; - if (assert == IGMPMSG_WHOLEPKT) + if (assert == IGMPMSG_WHOLEPKT || assert == IGMPMSG_WRVIFWHOLE) skb = skb_realloc_headroom(pkt, sizeof(struct iphdr)); else skb = alloc_skb(128, GFP_ATOMIC); @@ -1059,7 +1060,7 @@ static int ipmr_cache_report(struct mr_table *mrt, if (!skb) return -ENOBUFS; - if (assert == IGMPMSG_WHOLEPKT) { + if (assert == IGMPMSG_WHOLEPKT || assert == IGMPMSG_WRVIFWHOLE) { /* Ugly, but we have no choice with this interface. * Duplicate old header, fix ihl, length etc. * And all this only to mangle msg->im_msgtype and @@ -1070,9 +1071,12 @@ static int ipmr_cache_report(struct mr_table *mrt, skb_reset_transport_header(skb); msg = (struct igmpmsg *)skb_network_header(skb); memcpy(msg, skb_network_header(pkt), sizeof(struct iphdr)); - msg->im_msgtype = IGMPMSG_WHOLEPKT; + msg->im_msgtype = assert; msg->im_mbz = 0; - msg->im_vif = mrt->mroute_reg_vif_num; + if (assert == IGMPMSG_WRVIFWHOLE) + msg->im_vif = vifi; + else + msg->im_vif = mrt->mroute_reg_vif_num; ip_hdr(skb)->ihl = sizeof(struct iphdr) >> 2; ip_hdr(skb)->tot_len = htons(ntohs(ip_hdr(pkt)->tot_len) + sizeof(struct iphdr)); @@ -1371,6 +1375,7 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, struct mr_table *mrt; struct vifctl vif; struct mfcctl mfc; + bool do_wrvifwhole; u32 uval; /* There's one exception to the lock - MRT_DONE which needs to unlock */ @@ -1501,10 +1506,12 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, break; } + do_wrvifwhole = (val == IGMPMSG_WRVIFWHOLE); val = !!val; if (val != mrt->mroute_do_pim) { mrt->mroute_do_pim = val; mrt->mroute_do_assert = val; + mrt->mroute_do_wrvifwhole = do_wrvifwhole; } break; case MRT_TABLE: @@ -1982,6 +1989,9 @@ static void ip_mr_forward(struct net *net, struct mr_table *mrt, MFC_ASSERT_THRESH)) { c->_c.mfc_un.res.last_assert = jiffies; ipmr_cache_report(mrt, skb, true_vifi, IGMPMSG_WRONGVIF); + if (mrt->mroute_do_wrvifwhole) + ipmr_cache_report(mrt, skb, true_vifi, + IGMPMSG_WRVIFWHOLE); } goto dont_forward; } @@ -2658,7 +2668,9 @@ static bool ipmr_fill_table(struct mr_table *mrt, struct sk_buff *skb) mrt->mroute_reg_vif_num) || nla_put_u8(skb, IPMRA_TABLE_MROUTE_DO_ASSERT, mrt->mroute_do_assert) || - nla_put_u8(skb, IPMRA_TABLE_MROUTE_DO_PIM, mrt->mroute_do_pim)) + nla_put_u8(skb, IPMRA_TABLE_MROUTE_DO_PIM, mrt->mroute_do_pim) || + nla_put_u8(skb, IPMRA_TABLE_MROUTE_DO_WRVIFWHOLE, + mrt->mroute_do_wrvifwhole)) return false; return true; diff --git a/net/ipv4/ipmr_base.c b/net/ipv4/ipmr_base.c index cafb0506c8c9..1ad9aa62a97b 100644 --- a/net/ipv4/ipmr_base.c +++ b/net/ipv4/ipmr_base.c @@ -2,6 +2,7 @@ * Common logic shared by IPv4 [ipmr] and IPv6 [ip6mr] implementation */ +#include <linux/rhashtable.h> #include <linux/mroute_base.h> /* Sets everything common except 'dev', since that is done under locking */ diff --git a/net/ipv4/netfilter.c b/net/ipv4/netfilter.c index e6774ccb7731..8d2e5dc9a827 100644 --- a/net/ipv4/netfilter.c +++ b/net/ipv4/netfilter.c @@ -98,59 +98,6 @@ int nf_ip_reroute(struct sk_buff *skb, const struct nf_queue_entry *entry) } EXPORT_SYMBOL_GPL(nf_ip_reroute); -__sum16 nf_ip_checksum(struct sk_buff *skb, unsigned int hook, - unsigned int dataoff, u_int8_t protocol) -{ - const struct iphdr *iph = ip_hdr(skb); - __sum16 csum = 0; - - switch (skb->ip_summed) { - case CHECKSUM_COMPLETE: - if (hook != NF_INET_PRE_ROUTING && hook != NF_INET_LOCAL_IN) - break; - if ((protocol == 0 && !csum_fold(skb->csum)) || - !csum_tcpudp_magic(iph->saddr, iph->daddr, - skb->len - dataoff, protocol, - skb->csum)) { - skb->ip_summed = CHECKSUM_UNNECESSARY; - break; - } - /* fall through */ - case CHECKSUM_NONE: - if (protocol == 0) - skb->csum = 0; - else - skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr, - skb->len - dataoff, - protocol, 0); - csum = __skb_checksum_complete(skb); - } - return csum; -} -EXPORT_SYMBOL(nf_ip_checksum); - -__sum16 nf_ip_checksum_partial(struct sk_buff *skb, unsigned int hook, - unsigned int dataoff, unsigned int len, - u_int8_t protocol) -{ - const struct iphdr *iph = ip_hdr(skb); - __sum16 csum = 0; - - switch (skb->ip_summed) { - case CHECKSUM_COMPLETE: - if (len == skb->len - dataoff) - return nf_ip_checksum(skb, hook, dataoff, protocol); - /* fall through */ - case CHECKSUM_NONE: - skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr, protocol, - skb->len - dataoff, 0); - skb->ip_summed = CHECKSUM_NONE; - return __skb_checksum_complete_head(skb, dataoff + len); - } - return csum; -} -EXPORT_SYMBOL_GPL(nf_ip_checksum_partial); - int nf_ip_route(struct net *net, struct dst_entry **dst, struct flowi *fl, bool strict __always_unused) { diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig index bbfc356cb1b5..d9504adc47b3 100644 --- a/net/ipv4/netfilter/Kconfig +++ b/net/ipv4/netfilter/Kconfig @@ -9,22 +9,6 @@ config NF_DEFRAG_IPV4 tristate default n -config NF_CONNTRACK_IPV4 - tristate "IPv4 connection tracking support (required for NAT)" - depends on NF_CONNTRACK - default m if NETFILTER_ADVANCED=n - select NF_DEFRAG_IPV4 - ---help--- - Connection tracking keeps a record of what packets have passed - through your machine, in order to figure out how they are related - into connections. - - This is IPv4 support on Layer 3 independent connection tracking. - Layer 3 independent connection tracking is experimental scheme - which generalize ip_conntrack to support other layer 3 protocols. - - To compile it as a module, choose M here. If unsure, say N. - config NF_SOCKET_IPV4 tristate "IPv4 socket lookup support" help @@ -112,7 +96,7 @@ config NF_REJECT_IPV4 config NF_NAT_IPV4 tristate "IPv4 NAT" - depends on NF_CONNTRACK_IPV4 + depends on NF_CONNTRACK default m if NETFILTER_ADVANCED=n select NF_NAT help @@ -279,7 +263,7 @@ config IP_NF_TARGET_SYNPROXY # NAT + specific targets: nf_conntrack config IP_NF_NAT tristate "iptables NAT support" - depends on NF_CONNTRACK_IPV4 + depends on NF_CONNTRACK default m if NETFILTER_ADVANCED=n select NF_NAT select NF_NAT_IPV4 @@ -340,7 +324,7 @@ config IP_NF_MANGLE config IP_NF_TARGET_CLUSTERIP tristate "CLUSTERIP target support" depends on IP_NF_MANGLE - depends on NF_CONNTRACK_IPV4 + depends on NF_CONNTRACK depends on NETFILTER_ADVANCED select NF_CONNTRACK_MARK select NETFILTER_FAMILY_ARP diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile index 8394c17c269f..367993adf4d3 100644 --- a/net/ipv4/netfilter/Makefile +++ b/net/ipv4/netfilter/Makefile @@ -3,12 +3,6 @@ # Makefile for the netfilter modules on top of IPv4. # -# objects for l3 independent conntrack -nf_conntrack_ipv4-y := nf_conntrack_l3proto_ipv4.o nf_conntrack_proto_icmp.o - -# connection tracking -obj-$(CONFIG_NF_CONNTRACK_IPV4) += nf_conntrack_ipv4.o - nf_nat_ipv4-y := nf_nat_l3proto_ipv4.o nf_nat_proto_icmp.o nf_nat_ipv4-$(CONFIG_NF_NAT_MASQUERADE_IPV4) += nf_nat_masquerade_ipv4.o obj-$(CONFIG_NF_NAT_IPV4) += nf_nat_ipv4.o diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index ca0dad90803a..e77872c93c20 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -1898,6 +1898,7 @@ static struct xt_match ipt_builtin_mt[] __read_mostly = { .checkentry = icmp_checkentry, .proto = IPPROTO_ICMP, .family = NFPROTO_IPV4, + .me = THIS_MODULE, }, }; diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c deleted file mode 100644 index 9db988f9a4d7..000000000000 --- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c +++ /dev/null @@ -1,472 +0,0 @@ - -/* (C) 1999-2001 Paul `Rusty' Russell - * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org> - * (C) 2006-2012 Patrick McHardy <kaber@trash.net> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - */ - -#include <linux/types.h> -#include <linux/ip.h> -#include <linux/netfilter.h> -#include <linux/module.h> -#include <linux/skbuff.h> -#include <linux/icmp.h> -#include <linux/sysctl.h> -#include <net/route.h> -#include <net/ip.h> - -#include <linux/netfilter_ipv4.h> -#include <net/netfilter/nf_conntrack.h> -#include <net/netfilter/nf_conntrack_helper.h> -#include <net/netfilter/nf_conntrack_l4proto.h> -#include <net/netfilter/nf_conntrack_l3proto.h> -#include <net/netfilter/nf_conntrack_zones.h> -#include <net/netfilter/nf_conntrack_core.h> -#include <net/netfilter/nf_conntrack_seqadj.h> -#include <net/netfilter/ipv4/nf_conntrack_ipv4.h> -#include <net/netfilter/nf_nat_helper.h> -#include <net/netfilter/ipv4/nf_defrag_ipv4.h> -#include <net/netfilter/nf_log.h> - -static int conntrack4_net_id __read_mostly; -static DEFINE_MUTEX(register_ipv4_hooks); - -struct conntrack4_net { - unsigned int users; -}; - -static bool ipv4_pkt_to_tuple(const struct sk_buff *skb, unsigned int nhoff, - struct nf_conntrack_tuple *tuple) -{ - const __be32 *ap; - __be32 _addrs[2]; - ap = skb_header_pointer(skb, nhoff + offsetof(struct iphdr, saddr), - sizeof(u_int32_t) * 2, _addrs); - if (ap == NULL) - return false; - - tuple->src.u3.ip = ap[0]; - tuple->dst.u3.ip = ap[1]; - - return true; -} - -static bool ipv4_invert_tuple(struct nf_conntrack_tuple *tuple, - const struct nf_conntrack_tuple *orig) -{ - tuple->src.u3.ip = orig->dst.u3.ip; - tuple->dst.u3.ip = orig->src.u3.ip; - - return true; -} - -static int ipv4_get_l4proto(const struct sk_buff *skb, unsigned int nhoff, - unsigned int *dataoff, u_int8_t *protonum) -{ - const struct iphdr *iph; - struct iphdr _iph; - - iph = skb_header_pointer(skb, nhoff, sizeof(_iph), &_iph); - if (iph == NULL) - return -NF_ACCEPT; - - /* Conntrack defragments packets, we might still see fragments - * inside ICMP packets though. */ - if (iph->frag_off & htons(IP_OFFSET)) - return -NF_ACCEPT; - - *dataoff = nhoff + (iph->ihl << 2); - *protonum = iph->protocol; - - /* Check bogus IP headers */ - if (*dataoff > skb->len) { - pr_debug("nf_conntrack_ipv4: bogus IPv4 packet: " - "nhoff %u, ihl %u, skblen %u\n", - nhoff, iph->ihl << 2, skb->len); - return -NF_ACCEPT; - } - - return NF_ACCEPT; -} - -static unsigned int ipv4_helper(void *priv, - struct sk_buff *skb, - const struct nf_hook_state *state) -{ - struct nf_conn *ct; - enum ip_conntrack_info ctinfo; - const struct nf_conn_help *help; - const struct nf_conntrack_helper *helper; - - /* This is where we call the helper: as the packet goes out. */ - ct = nf_ct_get(skb, &ctinfo); - if (!ct || ctinfo == IP_CT_RELATED_REPLY) - return NF_ACCEPT; - - help = nfct_help(ct); - if (!help) - return NF_ACCEPT; - - /* rcu_read_lock()ed by nf_hook_thresh */ - helper = rcu_dereference(help->helper); - if (!helper) - return NF_ACCEPT; - - return helper->help(skb, skb_network_offset(skb) + ip_hdrlen(skb), - ct, ctinfo); -} - -static unsigned int ipv4_confirm(void *priv, - struct sk_buff *skb, - const struct nf_hook_state *state) -{ - struct nf_conn *ct; - enum ip_conntrack_info ctinfo; - - ct = nf_ct_get(skb, &ctinfo); - if (!ct || ctinfo == IP_CT_RELATED_REPLY) - goto out; - - /* adjust seqs for loopback traffic only in outgoing direction */ - if (test_bit(IPS_SEQ_ADJUST_BIT, &ct->status) && - !nf_is_loopback_packet(skb)) { - if (!nf_ct_seq_adjust(skb, ct, ctinfo, ip_hdrlen(skb))) { - NF_CT_STAT_INC_ATOMIC(nf_ct_net(ct), drop); - return NF_DROP; - } - } -out: - /* We've seen it coming out the other side: confirm it */ - return nf_conntrack_confirm(skb); -} - -static unsigned int ipv4_conntrack_in(void *priv, - struct sk_buff *skb, - const struct nf_hook_state *state) -{ - return nf_conntrack_in(state->net, PF_INET, state->hook, skb); -} - -static unsigned int ipv4_conntrack_local(void *priv, - struct sk_buff *skb, - const struct nf_hook_state *state) -{ - if (ip_is_fragment(ip_hdr(skb))) { /* IP_NODEFRAG setsockopt set */ - enum ip_conntrack_info ctinfo; - struct nf_conn *tmpl; - - tmpl = nf_ct_get(skb, &ctinfo); - if (tmpl && nf_ct_is_template(tmpl)) { - /* when skipping ct, clear templates to avoid fooling - * later targets/matches - */ - skb->_nfct = 0; - nf_ct_put(tmpl); - } - return NF_ACCEPT; - } - - return nf_conntrack_in(state->net, PF_INET, state->hook, skb); -} - -/* Connection tracking may drop packets, but never alters them, so - make it the first hook. */ -static const struct nf_hook_ops ipv4_conntrack_ops[] = { - { - .hook = ipv4_conntrack_in, - .pf = NFPROTO_IPV4, - .hooknum = NF_INET_PRE_ROUTING, - .priority = NF_IP_PRI_CONNTRACK, - }, - { - .hook = ipv4_conntrack_local, - .pf = NFPROTO_IPV4, - .hooknum = NF_INET_LOCAL_OUT, - .priority = NF_IP_PRI_CONNTRACK, - }, - { - .hook = ipv4_helper, - .pf = NFPROTO_IPV4, - .hooknum = NF_INET_POST_ROUTING, - .priority = NF_IP_PRI_CONNTRACK_HELPER, - }, - { - .hook = ipv4_confirm, - .pf = NFPROTO_IPV4, - .hooknum = NF_INET_POST_ROUTING, - .priority = NF_IP_PRI_CONNTRACK_CONFIRM, - }, - { - .hook = ipv4_helper, - .pf = NFPROTO_IPV4, - .hooknum = NF_INET_LOCAL_IN, - .priority = NF_IP_PRI_CONNTRACK_HELPER, - }, - { - .hook = ipv4_confirm, - .pf = NFPROTO_IPV4, - .hooknum = NF_INET_LOCAL_IN, - .priority = NF_IP_PRI_CONNTRACK_CONFIRM, - }, -}; - -/* Fast function for those who don't want to parse /proc (and I don't - blame them). */ -/* Reversing the socket's dst/src point of view gives us the reply - mapping. */ -static int -getorigdst(struct sock *sk, int optval, void __user *user, int *len) -{ - const struct inet_sock *inet = inet_sk(sk); - const struct nf_conntrack_tuple_hash *h; - struct nf_conntrack_tuple tuple; - - memset(&tuple, 0, sizeof(tuple)); - - lock_sock(sk); - tuple.src.u3.ip = inet->inet_rcv_saddr; - tuple.src.u.tcp.port = inet->inet_sport; - tuple.dst.u3.ip = inet->inet_daddr; - tuple.dst.u.tcp.port = inet->inet_dport; - tuple.src.l3num = PF_INET; - tuple.dst.protonum = sk->sk_protocol; - release_sock(sk); - - /* We only do TCP and SCTP at the moment: is there a better way? */ - if (tuple.dst.protonum != IPPROTO_TCP && - tuple.dst.protonum != IPPROTO_SCTP) { - pr_debug("SO_ORIGINAL_DST: Not a TCP/SCTP socket\n"); - return -ENOPROTOOPT; - } - - if ((unsigned int) *len < sizeof(struct sockaddr_in)) { - pr_debug("SO_ORIGINAL_DST: len %d not %zu\n", - *len, sizeof(struct sockaddr_in)); - return -EINVAL; - } - - h = nf_conntrack_find_get(sock_net(sk), &nf_ct_zone_dflt, &tuple); - if (h) { - struct sockaddr_in sin; - struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h); - - sin.sin_family = AF_INET; - sin.sin_port = ct->tuplehash[IP_CT_DIR_ORIGINAL] - .tuple.dst.u.tcp.port; - sin.sin_addr.s_addr = ct->tuplehash[IP_CT_DIR_ORIGINAL] - .tuple.dst.u3.ip; - memset(sin.sin_zero, 0, sizeof(sin.sin_zero)); - - pr_debug("SO_ORIGINAL_DST: %pI4 %u\n", - &sin.sin_addr.s_addr, ntohs(sin.sin_port)); - nf_ct_put(ct); - if (copy_to_user(user, &sin, sizeof(sin)) != 0) - return -EFAULT; - else - return 0; - } - pr_debug("SO_ORIGINAL_DST: Can't find %pI4/%u-%pI4/%u.\n", - &tuple.src.u3.ip, ntohs(tuple.src.u.tcp.port), - &tuple.dst.u3.ip, ntohs(tuple.dst.u.tcp.port)); - return -ENOENT; -} - -#if IS_ENABLED(CONFIG_NF_CT_NETLINK) - -#include <linux/netfilter/nfnetlink.h> -#include <linux/netfilter/nfnetlink_conntrack.h> - -static int ipv4_tuple_to_nlattr(struct sk_buff *skb, - const struct nf_conntrack_tuple *tuple) -{ - if (nla_put_in_addr(skb, CTA_IP_V4_SRC, tuple->src.u3.ip) || - nla_put_in_addr(skb, CTA_IP_V4_DST, tuple->dst.u3.ip)) - goto nla_put_failure; - return 0; - -nla_put_failure: - return -1; -} - -static const struct nla_policy ipv4_nla_policy[CTA_IP_MAX+1] = { - [CTA_IP_V4_SRC] = { .type = NLA_U32 }, - [CTA_IP_V4_DST] = { .type = NLA_U32 }, -}; - -static int ipv4_nlattr_to_tuple(struct nlattr *tb[], - struct nf_conntrack_tuple *t) -{ - if (!tb[CTA_IP_V4_SRC] || !tb[CTA_IP_V4_DST]) - return -EINVAL; - - t->src.u3.ip = nla_get_in_addr(tb[CTA_IP_V4_SRC]); - t->dst.u3.ip = nla_get_in_addr(tb[CTA_IP_V4_DST]); - - return 0; -} -#endif - -static struct nf_sockopt_ops so_getorigdst = { - .pf = PF_INET, - .get_optmin = SO_ORIGINAL_DST, - .get_optmax = SO_ORIGINAL_DST+1, - .get = getorigdst, - .owner = THIS_MODULE, -}; - -static int ipv4_hooks_register(struct net *net) -{ - struct conntrack4_net *cnet = net_generic(net, conntrack4_net_id); - int err = 0; - - mutex_lock(®ister_ipv4_hooks); - - cnet->users++; - if (cnet->users > 1) - goto out_unlock; - - err = nf_defrag_ipv4_enable(net); - if (err) { - cnet->users = 0; - goto out_unlock; - } - - err = nf_register_net_hooks(net, ipv4_conntrack_ops, - ARRAY_SIZE(ipv4_conntrack_ops)); - - if (err) - cnet->users = 0; - out_unlock: - mutex_unlock(®ister_ipv4_hooks); - return err; -} - -static void ipv4_hooks_unregister(struct net *net) -{ - struct conntrack4_net *cnet = net_generic(net, conntrack4_net_id); - - mutex_lock(®ister_ipv4_hooks); - if (cnet->users && (--cnet->users == 0)) - nf_unregister_net_hooks(net, ipv4_conntrack_ops, - ARRAY_SIZE(ipv4_conntrack_ops)); - mutex_unlock(®ister_ipv4_hooks); -} - -const struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv4 = { - .l3proto = PF_INET, - .pkt_to_tuple = ipv4_pkt_to_tuple, - .invert_tuple = ipv4_invert_tuple, - .get_l4proto = ipv4_get_l4proto, -#if IS_ENABLED(CONFIG_NF_CT_NETLINK) - .tuple_to_nlattr = ipv4_tuple_to_nlattr, - .nlattr_to_tuple = ipv4_nlattr_to_tuple, - .nla_policy = ipv4_nla_policy, - .nla_size = NLA_ALIGN(NLA_HDRLEN + sizeof(u32)) + /* CTA_IP_V4_SRC */ - NLA_ALIGN(NLA_HDRLEN + sizeof(u32)), /* CTA_IP_V4_DST */ -#endif - .net_ns_get = ipv4_hooks_register, - .net_ns_put = ipv4_hooks_unregister, - .me = THIS_MODULE, -}; - -module_param_call(hashsize, nf_conntrack_set_hashsize, param_get_uint, - &nf_conntrack_htable_size, 0600); - -MODULE_ALIAS("nf_conntrack-" __stringify(AF_INET)); -MODULE_ALIAS("ip_conntrack"); -MODULE_LICENSE("GPL"); - -static const struct nf_conntrack_l4proto * const builtin_l4proto4[] = { - &nf_conntrack_l4proto_tcp4, - &nf_conntrack_l4proto_udp4, - &nf_conntrack_l4proto_icmp, -#ifdef CONFIG_NF_CT_PROTO_DCCP - &nf_conntrack_l4proto_dccp4, -#endif -#ifdef CONFIG_NF_CT_PROTO_SCTP - &nf_conntrack_l4proto_sctp4, -#endif -#ifdef CONFIG_NF_CT_PROTO_UDPLITE - &nf_conntrack_l4proto_udplite4, -#endif -}; - -static int ipv4_net_init(struct net *net) -{ - return nf_ct_l4proto_pernet_register(net, builtin_l4proto4, - ARRAY_SIZE(builtin_l4proto4)); -} - -static void ipv4_net_exit(struct net *net) -{ - nf_ct_l4proto_pernet_unregister(net, builtin_l4proto4, - ARRAY_SIZE(builtin_l4proto4)); -} - -static struct pernet_operations ipv4_net_ops = { - .init = ipv4_net_init, - .exit = ipv4_net_exit, - .id = &conntrack4_net_id, - .size = sizeof(struct conntrack4_net), -}; - -static int __init nf_conntrack_l3proto_ipv4_init(void) -{ - int ret = 0; - - need_conntrack(); - -#if IS_ENABLED(CONFIG_NF_CT_NETLINK) - if (WARN_ON(nla_policy_len(ipv4_nla_policy, CTA_IP_MAX + 1) != - nf_conntrack_l3proto_ipv4.nla_size)) - return -EINVAL; -#endif - ret = nf_register_sockopt(&so_getorigdst); - if (ret < 0) { - pr_err("Unable to register netfilter socket option\n"); - return ret; - } - - ret = register_pernet_subsys(&ipv4_net_ops); - if (ret < 0) { - pr_err("nf_conntrack_ipv4: can't register pernet ops\n"); - goto cleanup_sockopt; - } - - ret = nf_ct_l4proto_register(builtin_l4proto4, - ARRAY_SIZE(builtin_l4proto4)); - if (ret < 0) - goto cleanup_pernet; - - ret = nf_ct_l3proto_register(&nf_conntrack_l3proto_ipv4); - if (ret < 0) { - pr_err("nf_conntrack_ipv4: can't register ipv4 proto.\n"); - goto cleanup_l4proto; - } - - return ret; -cleanup_l4proto: - nf_ct_l4proto_unregister(builtin_l4proto4, - ARRAY_SIZE(builtin_l4proto4)); - cleanup_pernet: - unregister_pernet_subsys(&ipv4_net_ops); - cleanup_sockopt: - nf_unregister_sockopt(&so_getorigdst); - return ret; -} - -static void __exit nf_conntrack_l3proto_ipv4_fini(void) -{ - synchronize_net(); - nf_ct_l3proto_unregister(&nf_conntrack_l3proto_ipv4); - nf_ct_l4proto_unregister(builtin_l4proto4, - ARRAY_SIZE(builtin_l4proto4)); - unregister_pernet_subsys(&ipv4_net_ops); - nf_unregister_sockopt(&so_getorigdst); -} - -module_init(nf_conntrack_l3proto_ipv4_init); -module_exit(nf_conntrack_l3proto_ipv4_fini); diff --git a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c deleted file mode 100644 index 5c15beafa711..000000000000 --- a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c +++ /dev/null @@ -1,383 +0,0 @@ -/* (C) 1999-2001 Paul `Rusty' Russell - * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org> - * (C) 2006-2010 Patrick McHardy <kaber@trash.net> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - */ - -#include <linux/types.h> -#include <linux/timer.h> -#include <linux/netfilter.h> -#include <linux/in.h> -#include <linux/icmp.h> -#include <linux/seq_file.h> -#include <net/ip.h> -#include <net/checksum.h> -#include <linux/netfilter_ipv4.h> -#include <net/netfilter/nf_conntrack_tuple.h> -#include <net/netfilter/nf_conntrack_l4proto.h> -#include <net/netfilter/nf_conntrack_core.h> -#include <net/netfilter/nf_conntrack_zones.h> -#include <net/netfilter/nf_log.h> - -static const unsigned int nf_ct_icmp_timeout = 30*HZ; - -static inline struct nf_icmp_net *icmp_pernet(struct net *net) -{ - return &net->ct.nf_ct_proto.icmp; -} - -static bool icmp_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff, - struct net *net, struct nf_conntrack_tuple *tuple) -{ - const struct icmphdr *hp; - struct icmphdr _hdr; - - hp = skb_header_pointer(skb, dataoff, sizeof(_hdr), &_hdr); - if (hp == NULL) - return false; - - tuple->dst.u.icmp.type = hp->type; - tuple->src.u.icmp.id = hp->un.echo.id; - tuple->dst.u.icmp.code = hp->code; - - return true; -} - -/* Add 1; spaces filled with 0. */ -static const u_int8_t invmap[] = { - [ICMP_ECHO] = ICMP_ECHOREPLY + 1, - [ICMP_ECHOREPLY] = ICMP_ECHO + 1, - [ICMP_TIMESTAMP] = ICMP_TIMESTAMPREPLY + 1, - [ICMP_TIMESTAMPREPLY] = ICMP_TIMESTAMP + 1, - [ICMP_INFO_REQUEST] = ICMP_INFO_REPLY + 1, - [ICMP_INFO_REPLY] = ICMP_INFO_REQUEST + 1, - [ICMP_ADDRESS] = ICMP_ADDRESSREPLY + 1, - [ICMP_ADDRESSREPLY] = ICMP_ADDRESS + 1 -}; - -static bool icmp_invert_tuple(struct nf_conntrack_tuple *tuple, - const struct nf_conntrack_tuple *orig) -{ - if (orig->dst.u.icmp.type >= sizeof(invmap) || - !invmap[orig->dst.u.icmp.type]) - return false; - - tuple->src.u.icmp.id = orig->src.u.icmp.id; - tuple->dst.u.icmp.type = invmap[orig->dst.u.icmp.type] - 1; - tuple->dst.u.icmp.code = orig->dst.u.icmp.code; - return true; -} - -static unsigned int *icmp_get_timeouts(struct net *net) -{ - return &icmp_pernet(net)->timeout; -} - -/* Returns verdict for packet, or -1 for invalid. */ -static int icmp_packet(struct nf_conn *ct, - const struct sk_buff *skb, - unsigned int dataoff, - enum ip_conntrack_info ctinfo, - unsigned int *timeout) -{ - /* Do not immediately delete the connection after the first - successful reply to avoid excessive conntrackd traffic - and also to handle correctly ICMP echo reply duplicates. */ - nf_ct_refresh_acct(ct, ctinfo, skb, *timeout); - - return NF_ACCEPT; -} - -/* Called when a new connection for this protocol found. */ -static bool icmp_new(struct nf_conn *ct, const struct sk_buff *skb, - unsigned int dataoff, unsigned int *timeouts) -{ - static const u_int8_t valid_new[] = { - [ICMP_ECHO] = 1, - [ICMP_TIMESTAMP] = 1, - [ICMP_INFO_REQUEST] = 1, - [ICMP_ADDRESS] = 1 - }; - - if (ct->tuplehash[0].tuple.dst.u.icmp.type >= sizeof(valid_new) || - !valid_new[ct->tuplehash[0].tuple.dst.u.icmp.type]) { - /* Can't create a new ICMP `conn' with this. */ - pr_debug("icmp: can't create new conn with type %u\n", - ct->tuplehash[0].tuple.dst.u.icmp.type); - nf_ct_dump_tuple_ip(&ct->tuplehash[0].tuple); - return false; - } - return true; -} - -/* Returns conntrack if it dealt with ICMP, and filled in skb fields */ -static int -icmp_error_message(struct net *net, struct nf_conn *tmpl, struct sk_buff *skb, - unsigned int hooknum) -{ - struct nf_conntrack_tuple innertuple, origtuple; - const struct nf_conntrack_l4proto *innerproto; - const struct nf_conntrack_tuple_hash *h; - const struct nf_conntrack_zone *zone; - enum ip_conntrack_info ctinfo; - struct nf_conntrack_zone tmp; - - WARN_ON(skb_nfct(skb)); - zone = nf_ct_zone_tmpl(tmpl, skb, &tmp); - - /* Are they talking about one of our connections? */ - if (!nf_ct_get_tuplepr(skb, - skb_network_offset(skb) + ip_hdrlen(skb) - + sizeof(struct icmphdr), - PF_INET, net, &origtuple)) { - pr_debug("icmp_error_message: failed to get tuple\n"); - return -NF_ACCEPT; - } - - /* rcu_read_lock()ed by nf_hook_thresh */ - innerproto = __nf_ct_l4proto_find(PF_INET, origtuple.dst.protonum); - - /* Ordinarily, we'd expect the inverted tupleproto, but it's - been preserved inside the ICMP. */ - if (!nf_ct_invert_tuple(&innertuple, &origtuple, - &nf_conntrack_l3proto_ipv4, innerproto)) { - pr_debug("icmp_error_message: no match\n"); - return -NF_ACCEPT; - } - - ctinfo = IP_CT_RELATED; - - h = nf_conntrack_find_get(net, zone, &innertuple); - if (!h) { - pr_debug("icmp_error_message: no match\n"); - return -NF_ACCEPT; - } - - if (NF_CT_DIRECTION(h) == IP_CT_DIR_REPLY) - ctinfo += IP_CT_IS_REPLY; - - /* Update skb to refer to this connection */ - nf_ct_set(skb, nf_ct_tuplehash_to_ctrack(h), ctinfo); - return NF_ACCEPT; -} - -static void icmp_error_log(const struct sk_buff *skb, struct net *net, - u8 pf, const char *msg) -{ - nf_l4proto_log_invalid(skb, net, pf, IPPROTO_ICMP, "%s", msg); -} - -/* Small and modified version of icmp_rcv */ -static int -icmp_error(struct net *net, struct nf_conn *tmpl, - struct sk_buff *skb, unsigned int dataoff, - u8 pf, unsigned int hooknum) -{ - const struct icmphdr *icmph; - struct icmphdr _ih; - - /* Not enough header? */ - icmph = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(_ih), &_ih); - if (icmph == NULL) { - icmp_error_log(skb, net, pf, "short packet"); - return -NF_ACCEPT; - } - - /* See ip_conntrack_proto_tcp.c */ - if (net->ct.sysctl_checksum && hooknum == NF_INET_PRE_ROUTING && - nf_ip_checksum(skb, hooknum, dataoff, 0)) { - icmp_error_log(skb, net, pf, "bad hw icmp checksum"); - return -NF_ACCEPT; - } - - /* - * 18 is the highest 'known' ICMP type. Anything else is a mystery - * - * RFC 1122: 3.2.2 Unknown ICMP messages types MUST be silently - * discarded. - */ - if (icmph->type > NR_ICMP_TYPES) { - icmp_error_log(skb, net, pf, "invalid icmp type"); - return -NF_ACCEPT; - } - - /* Need to track icmp error message? */ - if (icmph->type != ICMP_DEST_UNREACH && - icmph->type != ICMP_SOURCE_QUENCH && - icmph->type != ICMP_TIME_EXCEEDED && - icmph->type != ICMP_PARAMETERPROB && - icmph->type != ICMP_REDIRECT) - return NF_ACCEPT; - - return icmp_error_message(net, tmpl, skb, hooknum); -} - -#if IS_ENABLED(CONFIG_NF_CT_NETLINK) - -#include <linux/netfilter/nfnetlink.h> -#include <linux/netfilter/nfnetlink_conntrack.h> - -static int icmp_tuple_to_nlattr(struct sk_buff *skb, - const struct nf_conntrack_tuple *t) -{ - if (nla_put_be16(skb, CTA_PROTO_ICMP_ID, t->src.u.icmp.id) || - nla_put_u8(skb, CTA_PROTO_ICMP_TYPE, t->dst.u.icmp.type) || - nla_put_u8(skb, CTA_PROTO_ICMP_CODE, t->dst.u.icmp.code)) - goto nla_put_failure; - return 0; - -nla_put_failure: - return -1; -} - -static const struct nla_policy icmp_nla_policy[CTA_PROTO_MAX+1] = { - [CTA_PROTO_ICMP_TYPE] = { .type = NLA_U8 }, - [CTA_PROTO_ICMP_CODE] = { .type = NLA_U8 }, - [CTA_PROTO_ICMP_ID] = { .type = NLA_U16 }, -}; - -static int icmp_nlattr_to_tuple(struct nlattr *tb[], - struct nf_conntrack_tuple *tuple) -{ - if (!tb[CTA_PROTO_ICMP_TYPE] || - !tb[CTA_PROTO_ICMP_CODE] || - !tb[CTA_PROTO_ICMP_ID]) - return -EINVAL; - - tuple->dst.u.icmp.type = nla_get_u8(tb[CTA_PROTO_ICMP_TYPE]); - tuple->dst.u.icmp.code = nla_get_u8(tb[CTA_PROTO_ICMP_CODE]); - tuple->src.u.icmp.id = nla_get_be16(tb[CTA_PROTO_ICMP_ID]); - - if (tuple->dst.u.icmp.type >= sizeof(invmap) || - !invmap[tuple->dst.u.icmp.type]) - return -EINVAL; - - return 0; -} - -static unsigned int icmp_nlattr_tuple_size(void) -{ - static unsigned int size __read_mostly; - - if (!size) - size = nla_policy_len(icmp_nla_policy, CTA_PROTO_MAX + 1); - - return size; -} -#endif - -#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT) - -#include <linux/netfilter/nfnetlink.h> -#include <linux/netfilter/nfnetlink_cttimeout.h> - -static int icmp_timeout_nlattr_to_obj(struct nlattr *tb[], - struct net *net, void *data) -{ - unsigned int *timeout = data; - struct nf_icmp_net *in = icmp_pernet(net); - - if (tb[CTA_TIMEOUT_ICMP_TIMEOUT]) { - *timeout = - ntohl(nla_get_be32(tb[CTA_TIMEOUT_ICMP_TIMEOUT])) * HZ; - } else { - /* Set default ICMP timeout. */ - *timeout = in->timeout; - } - return 0; -} - -static int -icmp_timeout_obj_to_nlattr(struct sk_buff *skb, const void *data) -{ - const unsigned int *timeout = data; - - if (nla_put_be32(skb, CTA_TIMEOUT_ICMP_TIMEOUT, htonl(*timeout / HZ))) - goto nla_put_failure; - return 0; - -nla_put_failure: - return -ENOSPC; -} - -static const struct nla_policy -icmp_timeout_nla_policy[CTA_TIMEOUT_ICMP_MAX+1] = { - [CTA_TIMEOUT_ICMP_TIMEOUT] = { .type = NLA_U32 }, -}; -#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */ - -#ifdef CONFIG_SYSCTL -static struct ctl_table icmp_sysctl_table[] = { - { - .procname = "nf_conntrack_icmp_timeout", - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec_jiffies, - }, - { } -}; -#endif /* CONFIG_SYSCTL */ - -static int icmp_kmemdup_sysctl_table(struct nf_proto_net *pn, - struct nf_icmp_net *in) -{ -#ifdef CONFIG_SYSCTL - pn->ctl_table = kmemdup(icmp_sysctl_table, - sizeof(icmp_sysctl_table), - GFP_KERNEL); - if (!pn->ctl_table) - return -ENOMEM; - - pn->ctl_table[0].data = &in->timeout; -#endif - return 0; -} - -static int icmp_init_net(struct net *net, u_int16_t proto) -{ - struct nf_icmp_net *in = icmp_pernet(net); - struct nf_proto_net *pn = &in->pn; - - in->timeout = nf_ct_icmp_timeout; - - return icmp_kmemdup_sysctl_table(pn, in); -} - -static struct nf_proto_net *icmp_get_net_proto(struct net *net) -{ - return &net->ct.nf_ct_proto.icmp.pn; -} - -const struct nf_conntrack_l4proto nf_conntrack_l4proto_icmp = -{ - .l3proto = PF_INET, - .l4proto = IPPROTO_ICMP, - .pkt_to_tuple = icmp_pkt_to_tuple, - .invert_tuple = icmp_invert_tuple, - .packet = icmp_packet, - .get_timeouts = icmp_get_timeouts, - .new = icmp_new, - .error = icmp_error, - .destroy = NULL, - .me = NULL, -#if IS_ENABLED(CONFIG_NF_CT_NETLINK) - .tuple_to_nlattr = icmp_tuple_to_nlattr, - .nlattr_tuple_size = icmp_nlattr_tuple_size, - .nlattr_to_tuple = icmp_nlattr_to_tuple, - .nla_policy = icmp_nla_policy, -#endif -#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT) - .ctnl_timeout = { - .nlattr_to_obj = icmp_timeout_nlattr_to_obj, - .obj_to_nlattr = icmp_timeout_obj_to_nlattr, - .nlattr_max = CTA_TIMEOUT_ICMP_MAX, - .obj_size = sizeof(unsigned int), - .nla_policy = icmp_timeout_nla_policy, - }, -#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */ - .init_net = icmp_init_net, - .get_net_proto = icmp_get_net_proto, -}; diff --git a/net/ipv4/netfilter/nf_log_ipv4.c b/net/ipv4/netfilter/nf_log_ipv4.c index 4388de0e5380..1e6f28c97d3a 100644 --- a/net/ipv4/netfilter/nf_log_ipv4.c +++ b/net/ipv4/netfilter/nf_log_ipv4.c @@ -35,7 +35,7 @@ static const struct nf_loginfo default_loginfo = { }; /* One level of recursion won't kill us */ -static void dump_ipv4_packet(struct nf_log_buf *m, +static void dump_ipv4_packet(struct net *net, struct nf_log_buf *m, const struct nf_loginfo *info, const struct sk_buff *skb, unsigned int iphoff) { @@ -183,7 +183,7 @@ static void dump_ipv4_packet(struct nf_log_buf *m, /* Max length: 3+maxlen */ if (!iphoff) { /* Only recurse once. */ nf_log_buf_add(m, "["); - dump_ipv4_packet(m, info, skb, + dump_ipv4_packet(net, m, info, skb, iphoff + ih->ihl*4+sizeof(_icmph)); nf_log_buf_add(m, "] "); } @@ -251,7 +251,7 @@ static void dump_ipv4_packet(struct nf_log_buf *m, /* Max length: 15 "UID=4294967295 " */ if ((logflags & NF_LOG_UID) && !iphoff) - nf_log_dump_sk_uid_gid(m, skb->sk); + nf_log_dump_sk_uid_gid(net, m, skb->sk); /* Max length: 16 "MARK=0xFFFFFFFF " */ if (!iphoff && skb->mark) @@ -333,7 +333,7 @@ static void nf_log_ip_packet(struct net *net, u_int8_t pf, if (in != NULL) dump_ipv4_mac_header(m, loginfo, skb); - dump_ipv4_packet(m, loginfo, skb, 0); + dump_ipv4_packet(net, m, loginfo, skb, 0); nf_log_buf_close(m); } diff --git a/net/ipv4/netfilter/nf_tproxy_ipv4.c b/net/ipv4/netfilter/nf_tproxy_ipv4.c index 805e83ec3ad9..164714104965 100644 --- a/net/ipv4/netfilter/nf_tproxy_ipv4.c +++ b/net/ipv4/netfilter/nf_tproxy_ipv4.c @@ -37,7 +37,7 @@ nf_tproxy_handle_time_wait4(struct net *net, struct sk_buff *skb, * to a listener socket if there's one */ struct sock *sk2; - sk2 = nf_tproxy_get_sock_v4(net, skb, hp, iph->protocol, + sk2 = nf_tproxy_get_sock_v4(net, skb, iph->protocol, iph->saddr, laddr ? laddr : iph->daddr, hp->source, lport ? lport : hp->dest, skb->dev, NF_TPROXY_LOOKUP_LISTENER); @@ -71,7 +71,7 @@ __be32 nf_tproxy_laddr4(struct sk_buff *skb, __be32 user_laddr, __be32 daddr) EXPORT_SYMBOL_GPL(nf_tproxy_laddr4); struct sock * -nf_tproxy_get_sock_v4(struct net *net, struct sk_buff *skb, void *hp, +nf_tproxy_get_sock_v4(struct net *net, struct sk_buff *skb, const u8 protocol, const __be32 saddr, const __be32 daddr, const __be16 sport, const __be16 dport, @@ -79,16 +79,21 @@ nf_tproxy_get_sock_v4(struct net *net, struct sk_buff *skb, void *hp, const enum nf_tproxy_lookup_t lookup_type) { struct sock *sk; - struct tcphdr *tcph; switch (protocol) { - case IPPROTO_TCP: + case IPPROTO_TCP: { + struct tcphdr _hdr, *hp; + + hp = skb_header_pointer(skb, ip_hdrlen(skb), + sizeof(struct tcphdr), &_hdr); + if (hp == NULL) + return NULL; + switch (lookup_type) { case NF_TPROXY_LOOKUP_LISTENER: - tcph = hp; sk = inet_lookup_listener(net, &tcp_hashinfo, skb, ip_hdrlen(skb) + - __tcp_hdrlen(tcph), + __tcp_hdrlen(hp), saddr, sport, daddr, dport, in->ifindex, 0); @@ -110,6 +115,7 @@ nf_tproxy_get_sock_v4(struct net *net, struct sk_buff *skb, void *hp, BUG(); } break; + } case IPPROTO_UDP: sk = udp4_lib_lookup(net, saddr, sport, daddr, dport, in->ifindex); diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c index 2ed64bca54e3..8d7aaf118a30 100644 --- a/net/ipv4/ping.c +++ b/net/ipv4/ping.c @@ -320,8 +320,7 @@ static int ping_check_bind_addr(struct sock *sk, struct inet_sock *isk, if (addr->sin_addr.s_addr == htonl(INADDR_ANY)) chk_addr_ret = RTN_LOCAL; - if ((net->ipv4.sysctl_ip_nonlocal_bind == 0 && - isk->freebind == 0 && isk->transparent == 0 && + if ((!inet_can_nonlocal_bind(net, isk) && chk_addr_ret != RTN_LOCAL) || chk_addr_ret == RTN_MULTICAST || chk_addr_ret == RTN_BROADCAST) @@ -361,8 +360,7 @@ static int ping_check_bind_addr(struct sock *sk, struct inet_sock *isk, scoped); rcu_read_unlock(); - if (!(net->ipv6.sysctl.ip_nonlocal_bind || - isk->freebind || isk->transparent || has_addr || + if (!(ipv6_can_nonlocal_bind(net, isk) || has_addr || addr_type == IPV6_ADDR_ANY)) return -EADDRNOTAVAIL; @@ -739,13 +737,7 @@ static int ping_v4_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) /* no remote port */ } - ipc.sockc.tsflags = sk->sk_tsflags; - ipc.addr = inet->inet_saddr; - ipc.opt = NULL; - ipc.oif = sk->sk_bound_dev_if; - ipc.tx_flags = 0; - ipc.ttl = 0; - ipc.tos = -1; + ipcm_init_sk(&ipc, inet); if (msg->msg_controllen) { err = ip_cmsg_send(sk, msg, &ipc, false); @@ -769,8 +761,6 @@ static int ping_v4_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) rcu_read_unlock(); } - sock_tx_timestamp(sk, ipc.sockc.tsflags, &ipc.tx_flags); - saddr = ipc.addr; ipc.addr = faddr = daddr; diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index 77350c1256ce..70289682a670 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -119,6 +119,7 @@ static const struct snmp_mib snmp4_ipextstats_list[] = { SNMP_MIB_ITEM("InECT1Pkts", IPSTATS_MIB_ECT1PKTS), SNMP_MIB_ITEM("InECT0Pkts", IPSTATS_MIB_ECT0PKTS), SNMP_MIB_ITEM("InCEPkts", IPSTATS_MIB_CEPKTS), + SNMP_MIB_ITEM("ReasmOverlaps", IPSTATS_MIB_REASM_OVERLAPS), SNMP_MIB_SENTINEL }; @@ -287,6 +288,8 @@ static const struct snmp_mib snmp4_net_list[] = { SNMP_MIB_ITEM("TCPDelivered", LINUX_MIB_TCPDELIVERED), SNMP_MIB_ITEM("TCPDeliveredCE", LINUX_MIB_TCPDELIVEREDCE), SNMP_MIB_ITEM("TCPAckCompressed", LINUX_MIB_TCPACKCOMPRESSED), + SNMP_MIB_ITEM("TCPZeroWindowDrop", LINUX_MIB_TCPZEROWINDOWDROP), + SNMP_MIB_ITEM("TCPRcvQDrop", LINUX_MIB_TCPRCVQDROP), SNMP_MIB_SENTINEL }; diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index abb3c9490c55..33df4d76db2d 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -381,6 +381,7 @@ static int raw_send_hdrinc(struct sock *sk, struct flowi4 *fl4, skb->priority = sk->sk_priority; skb->mark = sk->sk_mark; + skb->tstamp = sockc->transmit_time; skb_dst_set(skb, &rt->dst); *rtp = NULL; @@ -561,13 +562,7 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) daddr = inet->inet_daddr; } - ipc.sockc.tsflags = sk->sk_tsflags; - ipc.addr = inet->inet_saddr; - ipc.opt = NULL; - ipc.tx_flags = 0; - ipc.ttl = 0; - ipc.tos = -1; - ipc.oif = sk->sk_bound_dev_if; + ipcm_init_sk(&ipc, inet); if (msg->msg_controllen) { err = ip_cmsg_send(sk, msg, &ipc, false); @@ -670,8 +665,6 @@ back_from_confirm: &rt, msg->msg_flags, &ipc.sockc); else { - sock_tx_timestamp(sk, ipc.sockc.tsflags, &ipc.tx_flags); - if (!ipc.addr) ipc.addr = fl4.daddr; lock_sock(sk); diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 1df6e97106d7..b678466da451 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1996,8 +1996,11 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, goto no_route; } - if (res->type == RTN_BROADCAST) + if (res->type == RTN_BROADCAST) { + if (IN_DEV_BFORWARD(in_dev)) + goto make_route; goto brd_input; + } if (res->type == RTN_LOCAL) { err = fib_validate_source(skb, saddr, daddr, tos, @@ -2014,6 +2017,7 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, if (res->type != RTN_UNICAST) goto martian_destination; +make_route: err = ip_mkroute_input(skb, res, in_dev, daddr, saddr, tos, flkeys); out: return err; diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index d06247ba08b2..b92f422f2fa8 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -189,8 +189,9 @@ static int ipv4_ping_group_range(struct ctl_table *table, int write, if (write && ret == 0) { low = make_kgid(user_ns, urange[0]); high = make_kgid(user_ns, urange[1]); - if (!gid_valid(low) || !gid_valid(high) || - (urange[1] < urange[0]) || gid_lt(high, low)) { + if (!gid_valid(low) || !gid_valid(high)) + return -EINVAL; + if (urange[1] < urange[0] || gid_lt(high, low)) { low = make_kgid(&init_user_ns, 1); high = make_kgid(&init_user_ns, 0); } @@ -200,6 +201,23 @@ static int ipv4_ping_group_range(struct ctl_table *table, int write, return ret; } +static int ipv4_fwd_update_priority(struct ctl_table *table, int write, + void __user *buffer, + size_t *lenp, loff_t *ppos) +{ + struct net *net; + int ret; + + net = container_of(table->data, struct net, + ipv4.sysctl_ip_fwd_update_priority); + ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); + if (write && ret == 0) + call_netevent_notifiers(NETEVENT_IPV4_FWD_UPDATE_PRIORITY_UPDATE, + net); + + return ret; +} + static int proc_tcp_congestion_control(struct ctl_table *ctl, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { @@ -265,8 +283,9 @@ static int proc_tcp_fastopen_key(struct ctl_table *table, int write, ipv4.sysctl_tcp_fastopen); struct ctl_table tbl = { .maxlen = (TCP_FASTOPEN_KEY_LENGTH * 2 + 10) }; struct tcp_fastopen_context *ctxt; - int ret; u32 user_key[4]; /* 16 bytes, matching TCP_FASTOPEN_KEY_LENGTH */ + __le32 key[4]; + int ret, i; tbl.data = kmalloc(tbl.maxlen, GFP_KERNEL); if (!tbl.data) @@ -275,11 +294,14 @@ static int proc_tcp_fastopen_key(struct ctl_table *table, int write, rcu_read_lock(); ctxt = rcu_dereference(net->ipv4.tcp_fastopen_ctx); if (ctxt) - memcpy(user_key, ctxt->key, TCP_FASTOPEN_KEY_LENGTH); + memcpy(key, ctxt->key, TCP_FASTOPEN_KEY_LENGTH); else - memset(user_key, 0, sizeof(user_key)); + memset(key, 0, sizeof(key)); rcu_read_unlock(); + for (i = 0; i < ARRAY_SIZE(key); i++) + user_key[i] = le32_to_cpu(key[i]); + snprintf(tbl.data, tbl.maxlen, "%08x-%08x-%08x-%08x", user_key[0], user_key[1], user_key[2], user_key[3]); ret = proc_dostring(&tbl, write, buffer, lenp, ppos); @@ -290,13 +312,17 @@ static int proc_tcp_fastopen_key(struct ctl_table *table, int write, ret = -EINVAL; goto bad_key; } - tcp_fastopen_reset_cipher(net, NULL, user_key, + + for (i = 0; i < ARRAY_SIZE(user_key); i++) + key[i] = cpu_to_le32(user_key[i]); + + tcp_fastopen_reset_cipher(net, NULL, key, TCP_FASTOPEN_KEY_LENGTH); } bad_key: pr_debug("proc FO key set 0x%x-%x-%x-%x <- 0x%s: %u\n", - user_key[0], user_key[1], user_key[2], user_key[3], + user_key[0], user_key[1], user_key[2], user_key[3], (char *)tbl.data, ret); kfree(tbl.data); return ret; @@ -655,6 +681,15 @@ static struct ctl_table ipv4_net_table[] = { .proc_handler = proc_dointvec, }, { + .procname = "ip_forward_update_priority", + .data = &init_net.ipv4.sysctl_ip_fwd_update_priority, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = ipv4_fwd_update_priority, + .extra1 = &zero, + .extra2 = &one, + }, + { .procname = "ip_nonlocal_bind", .data = &init_net.ipv4.sysctl_ip_nonlocal_bind, .maxlen = sizeof(int), diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 141acd92e58a..b8af2fec5ad5 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -494,21 +494,32 @@ static inline bool tcp_stream_is_readable(const struct tcp_sock *tp, } /* - * Socket is not locked. We are protected from async events by poll logic and - * correct handling of state changes made by other threads is impossible in - * any case. + * Wait for a TCP event. + * + * Note that we don't need to lock the socket, as the upper poll layers + * take care of normal races (between the test and the event) and we don't + * go look at any of the socket buffers directly. */ -__poll_t tcp_poll_mask(struct socket *sock, __poll_t events) +__poll_t tcp_poll(struct file *file, struct socket *sock, poll_table *wait) { + __poll_t mask; struct sock *sk = sock->sk; const struct tcp_sock *tp = tcp_sk(sk); - __poll_t mask = 0; int state; + sock_poll_wait(file, wait); + state = inet_sk_state_load(sk); if (state == TCP_LISTEN) return inet_csk_listen_poll(sk); + /* Socket is not locked. We are protected from async events + * by poll logic and correct handling of state changes + * made by other threads is impossible in any case. + */ + + mask = 0; + /* * EPOLLHUP is certainly not done right. But poll() doesn't * have a notion of HUP in just one direction, and for a @@ -589,7 +600,7 @@ __poll_t tcp_poll_mask(struct socket *sock, __poll_t events) return mask; } -EXPORT_SYMBOL(tcp_poll_mask); +EXPORT_SYMBOL(tcp_poll); int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg) { @@ -806,8 +817,7 @@ ssize_t tcp_splice_read(struct socket *sock, loff_t *ppos, * This occurs when user tries to read * from never connected socket. */ - if (!sock_flag(sk, SOCK_DONE)) - ret = -ENOTCONN; + ret = -ENOTCONN; break; } if (!timeo) { @@ -1230,7 +1240,7 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size) /* 'common' sending to sendq */ } - sockc.tsflags = sk->sk_tsflags; + sockcm_init(&sockc, sk); if (msg->msg_controllen) { err = sock_cmsg_send(sk, msg, &sockc); if (unlikely(err)) { @@ -1264,9 +1274,6 @@ restart: int linear; new_segment: - /* Allocate new segment. If the interface is SG, - * allocate skb fitting to single page. - */ if (!sk_stream_memory_free(sk)) goto wait_for_sndbuf; @@ -1987,7 +1994,7 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock, * shouldn't happen. */ if (WARN(before(*seq, TCP_SKB_CB(skb)->seq), - "recvmsg bug: copied %X seq %X rcvnxt %X fl %X\n", + "TCP recvmsg seq # bug: copied %X, seq %X, rcvnxt %X, fl %X\n", *seq, TCP_SKB_CB(skb)->seq, tp->rcv_nxt, flags)) break; @@ -2002,7 +2009,7 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock, if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) goto found_fin_ok; WARN(!(flags & MSG_PEEK), - "recvmsg bug 2: copied %X seq %X rcvnxt %X fl %X\n", + "TCP recvmsg seq # bug 2: copied %X, seq %X, rcvnxt %X, fl %X\n", *seq, TCP_SKB_CB(skb)->seq, tp->rcv_nxt, flags); } @@ -2031,13 +2038,10 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock, break; if (sk->sk_state == TCP_CLOSE) { - if (!sock_flag(sk, SOCK_DONE)) { - /* This occurs when user tries to read - * from never connected socket. - */ - copied = -ENOTCONN; - break; - } + /* This occurs when user tries to read + * from never connected socket. + */ + copied = -ENOTCONN; break; } @@ -2527,7 +2531,6 @@ int tcp_disconnect(struct sock *sk, int flags) struct inet_sock *inet = inet_sk(sk); struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); - int err = 0; int old_state = sk->sk_state; if (old_state != TCP_CLOSE) @@ -2551,6 +2554,8 @@ int tcp_disconnect(struct sock *sk, int flags) tcp_clear_xmit_timers(sk); __skb_queue_purge(&sk->sk_receive_queue); + tp->copied_seq = tp->rcv_nxt; + tp->urg_data = 0; tcp_write_queue_purge(sk); tcp_fastopen_active_disable_ofo_check(sk); skb_rbtree_purge(&tp->out_of_order_queue); @@ -2563,6 +2568,7 @@ int tcp_disconnect(struct sock *sk, int flags) sk->sk_shutdown = 0; sock_reset_flag(sk, SOCK_DONE); tp->srtt_us = 0; + tp->rcv_rtt_last_tsecr = 0; tp->write_seq += tp->max_window + 2; if (tp->write_seq == 0) tp->write_seq = 1; @@ -2587,6 +2593,10 @@ int tcp_disconnect(struct sock *sk, int flags) sk->sk_rx_dst = NULL; tcp_saved_syn_free(tp); tp->compressed_ack = 0; + tp->bytes_sent = 0; + tp->bytes_retrans = 0; + tp->dsack_dups = 0; + tp->reord_seen = 0; /* Clean up fastopen related fields */ tcp_free_fastopen_req(tp); @@ -2601,7 +2611,7 @@ int tcp_disconnect(struct sock *sk, int flags) } sk->sk_error_report(sk); - return err; + return 0; } EXPORT_SYMBOL(tcp_disconnect); @@ -2810,14 +2820,17 @@ static int do_tcp_setsockopt(struct sock *sk, int level, case TCP_REPAIR: if (!tcp_can_repair_sock(sk)) err = -EPERM; - else if (val == 1) { + else if (val == TCP_REPAIR_ON) { tp->repair = 1; sk->sk_reuse = SK_FORCE_REUSE; tp->repair_queue = TCP_NO_QUEUE; - } else if (val == 0) { + } else if (val == TCP_REPAIR_OFF) { tp->repair = 0; sk->sk_reuse = SK_NO_REUSE; tcp_send_window_probe(sk); + } else if (val == TCP_REPAIR_OFF_NO_WP) { + tp->repair = 0; + sk->sk_reuse = SK_NO_REUSE; } else err = -EINVAL; @@ -2979,7 +2992,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level, if (val < 0) err = -EINVAL; else - icsk->icsk_user_timeout = msecs_to_jiffies(val); + icsk->icsk_user_timeout = val; break; case TCP_FASTOPEN: @@ -3191,10 +3204,41 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) info->tcpi_delivery_rate = rate64; info->tcpi_delivered = tp->delivered; info->tcpi_delivered_ce = tp->delivered_ce; + info->tcpi_bytes_sent = tp->bytes_sent; + info->tcpi_bytes_retrans = tp->bytes_retrans; + info->tcpi_dsack_dups = tp->dsack_dups; + info->tcpi_reord_seen = tp->reord_seen; unlock_sock_fast(sk, slow); } EXPORT_SYMBOL_GPL(tcp_get_info); +static size_t tcp_opt_stats_get_size(void) +{ + return + nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_BUSY */ + nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_RWND_LIMITED */ + nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_SNDBUF_LIMITED */ + nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_DATA_SEGS_OUT */ + nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_TOTAL_RETRANS */ + nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_PACING_RATE */ + nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_DELIVERY_RATE */ + nla_total_size(sizeof(u32)) + /* TCP_NLA_SND_CWND */ + nla_total_size(sizeof(u32)) + /* TCP_NLA_REORDERING */ + nla_total_size(sizeof(u32)) + /* TCP_NLA_MIN_RTT */ + nla_total_size(sizeof(u8)) + /* TCP_NLA_RECUR_RETRANS */ + nla_total_size(sizeof(u8)) + /* TCP_NLA_DELIVERY_RATE_APP_LMT */ + nla_total_size(sizeof(u32)) + /* TCP_NLA_SNDQ_SIZE */ + nla_total_size(sizeof(u8)) + /* TCP_NLA_CA_STATE */ + nla_total_size(sizeof(u32)) + /* TCP_NLA_SND_SSTHRESH */ + nla_total_size(sizeof(u32)) + /* TCP_NLA_DELIVERED */ + nla_total_size(sizeof(u32)) + /* TCP_NLA_DELIVERED_CE */ + nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_BYTES_SENT */ + nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_BYTES_RETRANS */ + nla_total_size(sizeof(u32)) + /* TCP_NLA_DSACK_DUPS */ + nla_total_size(sizeof(u32)) + /* TCP_NLA_REORD_SEEN */ + 0; +} + struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk) { const struct tcp_sock *tp = tcp_sk(sk); @@ -3203,9 +3247,7 @@ struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk) u64 rate64; u32 rate; - stats = alloc_skb(7 * nla_total_size_64bit(sizeof(u64)) + - 7 * nla_total_size(sizeof(u32)) + - 3 * nla_total_size(sizeof(u8)), GFP_ATOMIC); + stats = alloc_skb(tcp_opt_stats_get_size(), GFP_ATOMIC); if (!stats) return NULL; @@ -3241,6 +3283,13 @@ struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk) nla_put_u32(stats, TCP_NLA_SNDQ_SIZE, tp->write_seq - tp->snd_una); nla_put_u8(stats, TCP_NLA_CA_STATE, inet_csk(sk)->icsk_ca_state); + nla_put_u64_64bit(stats, TCP_NLA_BYTES_SENT, tp->bytes_sent, + TCP_NLA_PAD); + nla_put_u64_64bit(stats, TCP_NLA_BYTES_RETRANS, tp->bytes_retrans, + TCP_NLA_PAD); + nla_put_u32(stats, TCP_NLA_DSACK_DUPS, tp->dsack_dups); + nla_put_u32(stats, TCP_NLA_REORD_SEEN, tp->reord_seen); + return stats; } @@ -3435,7 +3484,7 @@ static int do_tcp_getsockopt(struct sock *sk, int level, break; case TCP_USER_TIMEOUT: - val = jiffies_to_msecs(icsk->icsk_user_timeout); + val = icsk->icsk_user_timeout; break; case TCP_FASTOPEN: @@ -3709,8 +3758,7 @@ int tcp_abort(struct sock *sk, int err) struct request_sock *req = inet_reqsk(sk); local_bh_disable(); - inet_csk_reqsk_queue_drop_and_put(req->rsk_listener, - req); + inet_csk_reqsk_queue_drop(req->rsk_listener, req); local_bh_enable(); return 0; } diff --git a/net/ipv4/tcp_bbr.c b/net/ipv4/tcp_bbr.c index 58e2f479ffb4..13d34427ca3d 100644 --- a/net/ipv4/tcp_bbr.c +++ b/net/ipv4/tcp_bbr.c @@ -205,7 +205,11 @@ static u32 bbr_bw(const struct sock *sk) */ static u64 bbr_rate_bytes_per_sec(struct sock *sk, u64 rate, int gain) { - rate *= tcp_mss_to_mtu(sk, tcp_sk(sk)->mss_cache); + unsigned int mss = tcp_sk(sk)->mss_cache; + + if (!tcp_needs_internal_pacing(sk)) + mss = tcp_mss_to_mtu(sk, mss); + rate *= mss; rate *= gain; rate >>= BBR_SCALE; rate *= USEC_PER_SEC; @@ -354,6 +358,10 @@ static u32 bbr_target_cwnd(struct sock *sk, u32 bw, int gain) /* Reduce delayed ACKs by rounding up cwnd to the next even number. */ cwnd = (cwnd + 1) & ~1U; + /* Ensure gain cycling gets inflight above BDP even for small BDPs. */ + if (bbr->mode == BBR_PROBE_BW && gain > BBR_UNIT) + cwnd += 2; + return cwnd; } diff --git a/net/ipv4/tcp_dctcp.c b/net/ipv4/tcp_dctcp.c index 5f5e5936760e..ca61e2a659e7 100644 --- a/net/ipv4/tcp_dctcp.c +++ b/net/ipv4/tcp_dctcp.c @@ -55,7 +55,6 @@ struct dctcp { u32 dctcp_alpha; u32 next_seq; u32 ce_state; - u32 delayed_ack_reserved; u32 loss_cwnd; }; @@ -96,7 +95,6 @@ static void dctcp_init(struct sock *sk) ca->dctcp_alpha = min(dctcp_alpha_on_init, DCTCP_MAX_ALPHA); - ca->delayed_ack_reserved = 0; ca->loss_cwnd = 0; ca->ce_state = 0; @@ -131,23 +129,14 @@ static void dctcp_ce_state_0_to_1(struct sock *sk) struct dctcp *ca = inet_csk_ca(sk); struct tcp_sock *tp = tcp_sk(sk); - /* State has changed from CE=0 to CE=1 and delayed - * ACK has not sent yet. - */ - if (!ca->ce_state && ca->delayed_ack_reserved) { - u32 tmp_rcv_nxt; - - /* Save current rcv_nxt. */ - tmp_rcv_nxt = tp->rcv_nxt; - - /* Generate previous ack with CE=0. */ - tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR; - tp->rcv_nxt = ca->prior_rcv_nxt; - - tcp_send_ack(sk); - - /* Recover current rcv_nxt. */ - tp->rcv_nxt = tmp_rcv_nxt; + if (!ca->ce_state) { + /* State has changed from CE=0 to CE=1, force an immediate + * ACK to reflect the new CE state. If an ACK was delayed, + * send that first to reflect the prior CE state. + */ + if (inet_csk(sk)->icsk_ack.pending & ICSK_ACK_TIMER) + __tcp_send_ack(sk, ca->prior_rcv_nxt); + inet_csk(sk)->icsk_ack.pending |= ICSK_ACK_NOW; } ca->prior_rcv_nxt = tp->rcv_nxt; @@ -161,23 +150,14 @@ static void dctcp_ce_state_1_to_0(struct sock *sk) struct dctcp *ca = inet_csk_ca(sk); struct tcp_sock *tp = tcp_sk(sk); - /* State has changed from CE=1 to CE=0 and delayed - * ACK has not sent yet. - */ - if (ca->ce_state && ca->delayed_ack_reserved) { - u32 tmp_rcv_nxt; - - /* Save current rcv_nxt. */ - tmp_rcv_nxt = tp->rcv_nxt; - - /* Generate previous ack with CE=1. */ - tp->ecn_flags |= TCP_ECN_DEMAND_CWR; - tp->rcv_nxt = ca->prior_rcv_nxt; - - tcp_send_ack(sk); - - /* Recover current rcv_nxt. */ - tp->rcv_nxt = tmp_rcv_nxt; + if (ca->ce_state) { + /* State has changed from CE=1 to CE=0, force an immediate + * ACK to reflect the new CE state. If an ACK was delayed, + * send that first to reflect the prior CE state. + */ + if (inet_csk(sk)->icsk_ack.pending & ICSK_ACK_TIMER) + __tcp_send_ack(sk, ca->prior_rcv_nxt); + inet_csk(sk)->icsk_ack.pending |= ICSK_ACK_NOW; } ca->prior_rcv_nxt = tp->rcv_nxt; @@ -248,25 +228,6 @@ static void dctcp_state(struct sock *sk, u8 new_state) } } -static void dctcp_update_ack_reserved(struct sock *sk, enum tcp_ca_event ev) -{ - struct dctcp *ca = inet_csk_ca(sk); - - switch (ev) { - case CA_EVENT_DELAYED_ACK: - if (!ca->delayed_ack_reserved) - ca->delayed_ack_reserved = 1; - break; - case CA_EVENT_NON_DELAYED_ACK: - if (ca->delayed_ack_reserved) - ca->delayed_ack_reserved = 0; - break; - default: - /* Don't care for the rest. */ - break; - } -} - static void dctcp_cwnd_event(struct sock *sk, enum tcp_ca_event ev) { switch (ev) { @@ -276,10 +237,6 @@ static void dctcp_cwnd_event(struct sock *sk, enum tcp_ca_event ev) case CA_EVENT_ECN_NO_CE: dctcp_ce_state_1_to_0(sk); break; - case CA_EVENT_DELAYED_ACK: - case CA_EVENT_NON_DELAYED_ACK: - dctcp_update_ack_reserved(sk, ev); - break; default: /* Don't care for the rest. */ break; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 355d3dffd021..4c2dd9f863f7 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -78,6 +78,7 @@ #include <linux/errqueue.h> #include <trace/events/tcp.h> #include <linux/static_key.h> +#include <net/busy_poll.h> int sysctl_tcp_max_orphans __read_mostly = NR_FILE; @@ -215,7 +216,7 @@ static void tcp_incr_quickack(struct sock *sk, unsigned int max_quickacks) icsk->icsk_ack.quick = quickacks; } -static void tcp_enter_quickack_mode(struct sock *sk, unsigned int max_quickacks) +void tcp_enter_quickack_mode(struct sock *sk, unsigned int max_quickacks) { struct inet_connection_sock *icsk = inet_csk(sk); @@ -223,6 +224,7 @@ static void tcp_enter_quickack_mode(struct sock *sk, unsigned int max_quickacks) icsk->icsk_ack.pingpong = 0; icsk->icsk_ack.ato = TCP_ATO_MIN; } +EXPORT_SYMBOL(tcp_enter_quickack_mode); /* Send ACKs quickly, if "quick" count is not exhausted * and the session is not interactive. @@ -243,10 +245,17 @@ static void tcp_ecn_queue_cwr(struct tcp_sock *tp) tp->ecn_flags |= TCP_ECN_QUEUE_CWR; } -static void tcp_ecn_accept_cwr(struct tcp_sock *tp, const struct sk_buff *skb) +static void tcp_ecn_accept_cwr(struct sock *sk, const struct sk_buff *skb) { - if (tcp_hdr(skb)->cwr) - tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR; + if (tcp_hdr(skb)->cwr) { + tcp_sk(sk)->ecn_flags &= ~TCP_ECN_DEMAND_CWR; + + /* If the sender is telling us it has entered CWR, then its + * cwnd may be very low (even just 1 packet), so we should ACK + * immediately. + */ + inet_csk(sk)->icsk_ack.pending |= ICSK_ACK_NOW; + } } static void tcp_ecn_withdraw_cwr(struct tcp_sock *tp) @@ -265,7 +274,7 @@ static void __tcp_ecn_check_ce(struct sock *sk, const struct sk_buff *skb) * it is probably a retransmit. */ if (tp->ecn_flags & TCP_ECN_SEEN) - tcp_enter_quickack_mode(sk, 1); + tcp_enter_quickack_mode(sk, 2); break; case INET_ECN_CE: if (tcp_ca_needs_ecn(sk)) @@ -273,7 +282,7 @@ static void __tcp_ecn_check_ce(struct sock *sk, const struct sk_buff *skb) if (!(tp->ecn_flags & TCP_ECN_DEMAND_CWR)) { /* Better not delay acks, sender can have a very low cwnd */ - tcp_enter_quickack_mode(sk, 1); + tcp_enter_quickack_mode(sk, 2); tp->ecn_flags |= TCP_ECN_DEMAND_CWR; } tp->ecn_flags |= TCP_ECN_SEEN; @@ -582,9 +591,12 @@ static inline void tcp_rcv_rtt_measure_ts(struct sock *sk, { struct tcp_sock *tp = tcp_sk(sk); - if (tp->rx_opt.rcv_tsecr && - (TCP_SKB_CB(skb)->end_seq - - TCP_SKB_CB(skb)->seq >= inet_csk(sk)->icsk_ack.rcv_mss)) { + if (tp->rx_opt.rcv_tsecr == tp->rcv_rtt_last_tsecr) + return; + tp->rcv_rtt_last_tsecr = tp->rx_opt.rcv_tsecr; + + if (TCP_SKB_CB(skb)->end_seq - + TCP_SKB_CB(skb)->seq >= inet_csk(sk)->icsk_ack.rcv_mss) { u32 delta = tcp_time_stamp(tp) - tp->rx_opt.rcv_tsecr; u32 delta_us; @@ -869,6 +881,7 @@ static void tcp_dsack_seen(struct tcp_sock *tp) { tp->rx_opt.sack_ok |= TCP_DSACK_SEEN; tp->rack.dsack_seen = 1; + tp->dsack_dups++; } /* It's reordering when higher sequence was delivered (i.e. sacked) before @@ -900,8 +913,8 @@ static void tcp_check_sack_reordering(struct sock *sk, const u32 low_seq, sock_net(sk)->ipv4.sysctl_tcp_max_reordering); } - tp->rack.reord = 1; /* This exciting event is worth to be remembered. 8) */ + tp->reord_seen++; NET_INC_STATS(sock_net(sk), ts ? LINUX_MIB_TCPTSREORDER : LINUX_MIB_TCPSACKREORDER); } @@ -1865,6 +1878,7 @@ static void tcp_check_reno_reordering(struct sock *sk, const int addend) tp->reordering = min_t(u32, tp->packets_out + addend, sock_net(sk)->ipv4.sysctl_tcp_max_reordering); + tp->reord_seen++; NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRENOREORDER); } @@ -3181,6 +3195,15 @@ static int tcp_clean_rtx_queue(struct sock *sk, u32 prior_fack, if (tcp_is_reno(tp)) { tcp_remove_reno_sacks(sk, pkts_acked); + + /* If any of the cumulatively ACKed segments was + * retransmitted, non-SACK case cannot confirm that + * progress was due to original transmission due to + * lack of TCPCB_SACKED_ACKED bits even if some of + * the packets may have been never retransmitted. + */ + if (flag & FLAG_RETRANS_DATA_ACKED) + flag &= ~FLAG_ORIG_SACK_ACKED; } else { int delta; @@ -3449,7 +3472,7 @@ static void tcp_send_challenge_ack(struct sock *sk, const struct sk_buff *skb) static void tcp_store_ts_recent(struct tcp_sock *tp) { tp->rx_opt.ts_recent = tp->rx_opt.rcv_tsval; - tp->rx_opt.ts_recent_stamp = get_seconds(); + tp->rx_opt.ts_recent_stamp = ktime_get_seconds(); } static void tcp_replace_ts_recent(struct tcp_sock *tp, u32 seq) @@ -4330,6 +4353,11 @@ static bool tcp_try_coalesce(struct sock *sk, if (TCP_SKB_CB(from)->seq != TCP_SKB_CB(to)->end_seq) return false; +#ifdef CONFIG_TLS_DEVICE + if (from->decrypted != to->decrypted) + return false; +#endif + if (!skb_try_coalesce(to, from, fragstolen, &delta)) return false; @@ -4348,6 +4376,23 @@ static bool tcp_try_coalesce(struct sock *sk, return true; } +static bool tcp_ooo_try_coalesce(struct sock *sk, + struct sk_buff *to, + struct sk_buff *from, + bool *fragstolen) +{ + bool res = tcp_try_coalesce(sk, to, from, fragstolen); + + /* In case tcp_drop() is called later, update to->gso_segs */ + if (res) { + u32 gso_segs = max_t(u16, 1, skb_shinfo(to)->gso_segs) + + max_t(u16, 1, skb_shinfo(from)->gso_segs); + + skb_shinfo(to)->gso_segs = min_t(u32, gso_segs, 0xFFFF); + } + return res; +} + static void tcp_drop(struct sock *sk, struct sk_buff *skb) { sk_drops_add(sk, skb); @@ -4471,8 +4516,8 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb) /* In the typical case, we are adding an skb to the end of the list. * Use of ooo_last_skb avoids the O(Log(N)) rbtree lookup. */ - if (tcp_try_coalesce(sk, tp->ooo_last_skb, - skb, &fragstolen)) { + if (tcp_ooo_try_coalesce(sk, tp->ooo_last_skb, + skb, &fragstolen)) { coalesce_done: tcp_grow_window(sk, skb); kfree_skb_partial(skb, fragstolen); @@ -4500,7 +4545,7 @@ coalesce_done: /* All the bits are present. Drop. */ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFOMERGE); - __kfree_skb(skb); + tcp_drop(sk, skb); skb = NULL; tcp_dsack_set(sk, seq, end_seq); goto add_sack; @@ -4519,11 +4564,11 @@ coalesce_done: TCP_SKB_CB(skb1)->end_seq); NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFOMERGE); - __kfree_skb(skb1); + tcp_drop(sk, skb1); goto merge_right; } - } else if (tcp_try_coalesce(sk, skb1, - skb, &fragstolen)) { + } else if (tcp_ooo_try_coalesce(sk, skb1, + skb, &fragstolen)) { goto coalesce_done; } p = &parent->rb_right; @@ -4608,8 +4653,10 @@ int tcp_send_rcvq(struct sock *sk, struct msghdr *msg, size_t size) skb->data_len = data_len; skb->len = size; - if (tcp_try_rmem_schedule(sk, skb, skb->truesize)) + if (tcp_try_rmem_schedule(sk, skb, skb->truesize)) { + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRCVQDROP); goto err_free; + } err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, size); if (err) @@ -4656,7 +4703,7 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) skb_dst_drop(skb); __skb_pull(skb, tcp_hdr(skb)->doff * 4); - tcp_ecn_accept_cwr(tp, skb); + tcp_ecn_accept_cwr(sk, skb); tp->rx_opt.dsack = 0; @@ -4665,18 +4712,21 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) * Out of sequence packets to the out_of_order_queue. */ if (TCP_SKB_CB(skb)->seq == tp->rcv_nxt) { - if (tcp_receive_window(tp) == 0) + if (tcp_receive_window(tp) == 0) { + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPZEROWINDOWDROP); goto out_of_window; + } /* Ok. In sequence. In window. */ queue_and_out: if (skb_queue_len(&sk->sk_receive_queue) == 0) sk_forced_mem_schedule(sk, skb->truesize); - else if (tcp_try_rmem_schedule(sk, skb, skb->truesize)) + else if (tcp_try_rmem_schedule(sk, skb, skb->truesize)) { + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRCVQDROP); goto drop; + } eaten = tcp_queue_rcv(sk, skb, 0, &fragstolen); - tcp_rcv_nxt_update(tp, TCP_SKB_CB(skb)->end_seq); if (skb->len) tcp_event_data_recv(sk, skb); if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) @@ -4685,11 +4735,11 @@ queue_and_out: if (!RB_EMPTY_ROOT(&tp->out_of_order_queue)) { tcp_ofo_queue(sk); - /* RFC2581. 4.2. SHOULD send immediate ACK, when + /* RFC5681. 4.2. SHOULD send immediate ACK, when * gap in queue is filled. */ if (RB_EMPTY_ROOT(&tp->out_of_order_queue)) - inet_csk(sk)->icsk_ack.pingpong = 0; + inet_csk(sk)->icsk_ack.pending |= ICSK_ACK_NOW; } if (tp->rx_opt.num_sacks) @@ -4732,8 +4782,10 @@ drop: /* If window is closed, drop tail of packet. But after * remembering D-SACK for its head made in previous line. */ - if (!tcp_receive_window(tp)) + if (!tcp_receive_window(tp)) { + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPZEROWINDOWDROP); goto out_of_window; + } goto queue_and_out; } @@ -4851,6 +4903,9 @@ restart: break; memcpy(nskb->cb, skb->cb, sizeof(skb->cb)); +#ifdef CONFIG_TLS_DEVICE + nskb->decrypted = skb->decrypted; +#endif TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(nskb)->end_seq = start; if (list) __skb_queue_before(list, skb, nskb); @@ -4878,6 +4933,10 @@ restart: skb == tail || (TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN | TCPHDR_FIN))) goto end; +#ifdef CONFIG_TLS_DEVICE + if (skb->decrypted != nskb->decrypted) + goto end; +#endif } } } @@ -4892,6 +4951,7 @@ end: static void tcp_collapse_ofo_queue(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); + u32 range_truesize, sum_tiny = 0; struct sk_buff *skb, *head; u32 start, end; @@ -4903,6 +4963,7 @@ new_range: } start = TCP_SKB_CB(skb)->seq; end = TCP_SKB_CB(skb)->end_seq; + range_truesize = skb->truesize; for (head = skb;;) { skb = skb_rb_next(skb); @@ -4913,11 +4974,20 @@ new_range: if (!skb || after(TCP_SKB_CB(skb)->seq, end) || before(TCP_SKB_CB(skb)->end_seq, start)) { - tcp_collapse(sk, NULL, &tp->out_of_order_queue, - head, skb, start, end); + /* Do not attempt collapsing tiny skbs */ + if (range_truesize != head->truesize || + end - start >= SKB_WITH_OVERHEAD(SK_MEM_QUANTUM)) { + tcp_collapse(sk, NULL, &tp->out_of_order_queue, + head, skb, start, end); + } else { + sum_tiny += range_truesize; + if (sum_tiny > sk->sk_rcvbuf >> 3) + return; + } goto new_range; } + range_truesize += skb->truesize; if (unlikely(before(TCP_SKB_CB(skb)->seq, start))) start = TCP_SKB_CB(skb)->seq; if (after(TCP_SKB_CB(skb)->end_seq, end)) @@ -4932,6 +5002,7 @@ new_range: * 2) not add too big latencies if thousands of packets sit there. * (But if application shrinks SO_RCVBUF, we could still end up * freeing whole queue here) + * 3) Drop at least 12.5 % of sk_rcvbuf to avoid malicious attacks. * * Return true if queue has shrunk. */ @@ -4939,20 +5010,26 @@ static bool tcp_prune_ofo_queue(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); struct rb_node *node, *prev; + int goal; if (RB_EMPTY_ROOT(&tp->out_of_order_queue)) return false; NET_INC_STATS(sock_net(sk), LINUX_MIB_OFOPRUNED); + goal = sk->sk_rcvbuf >> 3; node = &tp->ooo_last_skb->rbnode; do { prev = rb_prev(node); rb_erase(node, &tp->out_of_order_queue); + goal -= rb_to_skb(node)->truesize; tcp_drop(sk, rb_to_skb(node)); - sk_mem_reclaim(sk); - if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf && - !tcp_under_memory_pressure(sk)) - break; + if (!prev || goal <= 0) { + sk_mem_reclaim(sk); + if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf && + !tcp_under_memory_pressure(sk)) + break; + goal = sk->sk_rcvbuf >> 3; + } node = prev; } while (node); tp->ooo_last_skb = rb_to_skb(prev); @@ -4987,6 +5064,9 @@ static int tcp_prune_queue(struct sock *sk) else if (tcp_under_memory_pressure(sk)) tp->rcv_ssthresh = min(tp->rcv_ssthresh, 4U * tp->advmss); + if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf) + return 0; + tcp_collapse_ofo_queue(sk); if (!skb_queue_empty(&sk->sk_receive_queue)) tcp_collapse(sk, &sk->sk_receive_queue, NULL, @@ -5099,7 +5179,9 @@ static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible) (tp->rcv_nxt - tp->copied_seq < sk->sk_rcvlowat || __tcp_select_window(sk) >= tp->rcv_wnd)) || /* We ACK each frame or... */ - tcp_in_quickack_mode(sk)) { + tcp_in_quickack_mode(sk) || + /* Protocol state mandates a one-time immediate ACK */ + inet_csk(sk)->icsk_ack.pending & ICSK_ACK_NOW) { send_now: tcp_send_ack(sk); return; @@ -5475,6 +5557,11 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb) tcp_ack(sk, skb, 0); __kfree_skb(skb); tcp_data_snd_check(sk); + /* When receiving pure ack in fast path, update + * last ts ecr directly instead of calling + * tcp_rcv_rtt_measure_ts() + */ + tp->rcv_rtt_last_tsecr = tp->rx_opt.rcv_tsecr; return; } else { /* Header too small */ TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS); @@ -5576,6 +5663,7 @@ void tcp_finish_connect(struct sock *sk, struct sk_buff *skb) if (skb) { icsk->icsk_af_ops->sk_rx_dst_set(sk, skb); security_inet_conn_established(sk, skb); + sk_mark_napi_id(sk, skb); } tcp_init_transfer(sk, BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB); @@ -6404,6 +6492,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, tcp_rsk(req)->snt_isn = isn; tcp_rsk(req)->txhash = net_tx_rndhash(); tcp_openreq_init_rwin(req, sk, dst); + sk_rx_queue_set(req_to_sk(req), skb); if (!want_cookie) { tcp_reqsk_record_syn(sk, req, skb); fastopen_sk = tcp_try_fastopen(sk, skb, req, &foc, dst); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index bea17f1e8302..9e041fa5c545 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -155,12 +155,26 @@ int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp) and use initial timestamp retrieved from peer table. */ if (tcptw->tw_ts_recent_stamp && - (!twp || (reuse && get_seconds() - tcptw->tw_ts_recent_stamp > 1))) { - tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2; - if (tp->write_seq == 0) - tp->write_seq = 1; - tp->rx_opt.ts_recent = tcptw->tw_ts_recent; - tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp; + (!twp || (reuse && time_after32(ktime_get_seconds(), + tcptw->tw_ts_recent_stamp)))) { + /* In case of repair and re-using TIME-WAIT sockets we still + * want to be sure that it is safe as above but honor the + * sequence numbers and time stamps set as part of the repair + * process. + * + * Without this check re-using a TIME-WAIT socket with TCP + * repair would accumulate a -1 on the repair assigned + * sequence number. The first time it is reused the sequence + * is -1, the second time -2, etc. This fixes that issue + * without appearing to create any others. + */ + if (likely(!tp->repair)) { + tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2; + if (tp->write_seq == 0) + tp->write_seq = 1; + tp->rx_opt.ts_recent = tcptw->tw_ts_recent; + tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp; + } sock_hold(sktw); return 1; } diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 1dda1341a223..75ef332a7caf 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -144,7 +144,7 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb, tw->tw_substate = TCP_TIME_WAIT; tcptw->tw_rcv_nxt = TCP_SKB_CB(skb)->end_seq; if (tmp_opt.saw_tstamp) { - tcptw->tw_ts_recent_stamp = get_seconds(); + tcptw->tw_ts_recent_stamp = ktime_get_seconds(); tcptw->tw_ts_recent = tmp_opt.rcv_tsval; } @@ -189,7 +189,7 @@ kill: if (tmp_opt.saw_tstamp) { tcptw->tw_ts_recent = tmp_opt.rcv_tsval; - tcptw->tw_ts_recent_stamp = get_seconds(); + tcptw->tw_ts_recent_stamp = ktime_get_seconds(); } inet_twsk_put(tw); @@ -449,119 +449,122 @@ struct sock *tcp_create_openreq_child(const struct sock *sk, struct sk_buff *skb) { struct sock *newsk = inet_csk_clone_lock(sk, req, GFP_ATOMIC); + const struct inet_request_sock *ireq = inet_rsk(req); + struct tcp_request_sock *treq = tcp_rsk(req); + struct inet_connection_sock *newicsk; + struct tcp_sock *oldtp, *newtp; - if (newsk) { - const struct inet_request_sock *ireq = inet_rsk(req); - struct tcp_request_sock *treq = tcp_rsk(req); - struct inet_connection_sock *newicsk = inet_csk(newsk); - struct tcp_sock *newtp = tcp_sk(newsk); - struct tcp_sock *oldtp = tcp_sk(sk); - - smc_check_reset_syn_req(oldtp, req, newtp); - - /* Now setup tcp_sock */ - newtp->pred_flags = 0; - - newtp->rcv_wup = newtp->copied_seq = - newtp->rcv_nxt = treq->rcv_isn + 1; - newtp->segs_in = 1; - - newtp->snd_sml = newtp->snd_una = - newtp->snd_nxt = newtp->snd_up = treq->snt_isn + 1; - - INIT_LIST_HEAD(&newtp->tsq_node); - INIT_LIST_HEAD(&newtp->tsorted_sent_queue); - - tcp_init_wl(newtp, treq->rcv_isn); - - newtp->srtt_us = 0; - newtp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT); - minmax_reset(&newtp->rtt_min, tcp_jiffies32, ~0U); - newicsk->icsk_rto = TCP_TIMEOUT_INIT; - newicsk->icsk_ack.lrcvtime = tcp_jiffies32; - - newtp->packets_out = 0; - newtp->retrans_out = 0; - newtp->sacked_out = 0; - newtp->snd_ssthresh = TCP_INFINITE_SSTHRESH; - newtp->tlp_high_seq = 0; - newtp->lsndtime = tcp_jiffies32; - newsk->sk_txhash = treq->txhash; - newtp->last_oow_ack_time = 0; - newtp->total_retrans = req->num_retrans; - - /* So many TCP implementations out there (incorrectly) count the - * initial SYN frame in their delayed-ACK and congestion control - * algorithms that we must have the following bandaid to talk - * efficiently to them. -DaveM - */ - newtp->snd_cwnd = TCP_INIT_CWND; - newtp->snd_cwnd_cnt = 0; - - /* There's a bubble in the pipe until at least the first ACK. */ - newtp->app_limited = ~0U; - - tcp_init_xmit_timers(newsk); - newtp->write_seq = newtp->pushed_seq = treq->snt_isn + 1; - - newtp->rx_opt.saw_tstamp = 0; - - newtp->rx_opt.dsack = 0; - newtp->rx_opt.num_sacks = 0; - - newtp->urg_data = 0; - - if (sock_flag(newsk, SOCK_KEEPOPEN)) - inet_csk_reset_keepalive_timer(newsk, - keepalive_time_when(newtp)); - - newtp->rx_opt.tstamp_ok = ireq->tstamp_ok; - newtp->rx_opt.sack_ok = ireq->sack_ok; - newtp->window_clamp = req->rsk_window_clamp; - newtp->rcv_ssthresh = req->rsk_rcv_wnd; - newtp->rcv_wnd = req->rsk_rcv_wnd; - newtp->rx_opt.wscale_ok = ireq->wscale_ok; - if (newtp->rx_opt.wscale_ok) { - newtp->rx_opt.snd_wscale = ireq->snd_wscale; - newtp->rx_opt.rcv_wscale = ireq->rcv_wscale; - } else { - newtp->rx_opt.snd_wscale = newtp->rx_opt.rcv_wscale = 0; - newtp->window_clamp = min(newtp->window_clamp, 65535U); - } - newtp->snd_wnd = (ntohs(tcp_hdr(skb)->window) << - newtp->rx_opt.snd_wscale); - newtp->max_window = newtp->snd_wnd; - - if (newtp->rx_opt.tstamp_ok) { - newtp->rx_opt.ts_recent = req->ts_recent; - newtp->rx_opt.ts_recent_stamp = get_seconds(); - newtp->tcp_header_len = sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED; - } else { - newtp->rx_opt.ts_recent_stamp = 0; - newtp->tcp_header_len = sizeof(struct tcphdr); - } - newtp->tsoffset = treq->ts_off; + if (!newsk) + return NULL; + + newicsk = inet_csk(newsk); + newtp = tcp_sk(newsk); + oldtp = tcp_sk(sk); + + smc_check_reset_syn_req(oldtp, req, newtp); + + /* Now setup tcp_sock */ + newtp->pred_flags = 0; + + newtp->rcv_wup = newtp->copied_seq = + newtp->rcv_nxt = treq->rcv_isn + 1; + newtp->segs_in = 1; + + newtp->snd_sml = newtp->snd_una = + newtp->snd_nxt = newtp->snd_up = treq->snt_isn + 1; + + INIT_LIST_HEAD(&newtp->tsq_node); + INIT_LIST_HEAD(&newtp->tsorted_sent_queue); + + tcp_init_wl(newtp, treq->rcv_isn); + + newtp->srtt_us = 0; + newtp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT); + minmax_reset(&newtp->rtt_min, tcp_jiffies32, ~0U); + newicsk->icsk_rto = TCP_TIMEOUT_INIT; + newicsk->icsk_ack.lrcvtime = tcp_jiffies32; + + newtp->packets_out = 0; + newtp->retrans_out = 0; + newtp->sacked_out = 0; + newtp->snd_ssthresh = TCP_INFINITE_SSTHRESH; + newtp->tlp_high_seq = 0; + newtp->lsndtime = tcp_jiffies32; + newsk->sk_txhash = treq->txhash; + newtp->last_oow_ack_time = 0; + newtp->total_retrans = req->num_retrans; + + /* So many TCP implementations out there (incorrectly) count the + * initial SYN frame in their delayed-ACK and congestion control + * algorithms that we must have the following bandaid to talk + * efficiently to them. -DaveM + */ + newtp->snd_cwnd = TCP_INIT_CWND; + newtp->snd_cwnd_cnt = 0; + + /* There's a bubble in the pipe until at least the first ACK. */ + newtp->app_limited = ~0U; + + tcp_init_xmit_timers(newsk); + newtp->write_seq = newtp->pushed_seq = treq->snt_isn + 1; + + newtp->rx_opt.saw_tstamp = 0; + + newtp->rx_opt.dsack = 0; + newtp->rx_opt.num_sacks = 0; + + newtp->urg_data = 0; + + if (sock_flag(newsk, SOCK_KEEPOPEN)) + inet_csk_reset_keepalive_timer(newsk, + keepalive_time_when(newtp)); + + newtp->rx_opt.tstamp_ok = ireq->tstamp_ok; + newtp->rx_opt.sack_ok = ireq->sack_ok; + newtp->window_clamp = req->rsk_window_clamp; + newtp->rcv_ssthresh = req->rsk_rcv_wnd; + newtp->rcv_wnd = req->rsk_rcv_wnd; + newtp->rx_opt.wscale_ok = ireq->wscale_ok; + if (newtp->rx_opt.wscale_ok) { + newtp->rx_opt.snd_wscale = ireq->snd_wscale; + newtp->rx_opt.rcv_wscale = ireq->rcv_wscale; + } else { + newtp->rx_opt.snd_wscale = newtp->rx_opt.rcv_wscale = 0; + newtp->window_clamp = min(newtp->window_clamp, 65535U); + } + newtp->snd_wnd = ntohs(tcp_hdr(skb)->window) << newtp->rx_opt.snd_wscale; + newtp->max_window = newtp->snd_wnd; + + if (newtp->rx_opt.tstamp_ok) { + newtp->rx_opt.ts_recent = req->ts_recent; + newtp->rx_opt.ts_recent_stamp = ktime_get_seconds(); + newtp->tcp_header_len = sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED; + } else { + newtp->rx_opt.ts_recent_stamp = 0; + newtp->tcp_header_len = sizeof(struct tcphdr); + } + newtp->tsoffset = treq->ts_off; #ifdef CONFIG_TCP_MD5SIG - newtp->md5sig_info = NULL; /*XXX*/ - if (newtp->af_specific->md5_lookup(sk, newsk)) - newtp->tcp_header_len += TCPOLEN_MD5SIG_ALIGNED; + newtp->md5sig_info = NULL; /*XXX*/ + if (newtp->af_specific->md5_lookup(sk, newsk)) + newtp->tcp_header_len += TCPOLEN_MD5SIG_ALIGNED; #endif - if (skb->len >= TCP_MSS_DEFAULT + newtp->tcp_header_len) - newicsk->icsk_ack.last_seg_size = skb->len - newtp->tcp_header_len; - newtp->rx_opt.mss_clamp = req->mss; - tcp_ecn_openreq_child(newtp, req); - newtp->fastopen_req = NULL; - newtp->fastopen_rsk = NULL; - newtp->syn_data_acked = 0; - newtp->rack.mstamp = 0; - newtp->rack.advanced = 0; - newtp->rack.reo_wnd_steps = 1; - newtp->rack.last_delivered = 0; - newtp->rack.reo_wnd_persist = 0; - newtp->rack.dsack_seen = 0; - - __TCP_INC_STATS(sock_net(sk), TCP_MIB_PASSIVEOPENS); - } + if (skb->len >= TCP_MSS_DEFAULT + newtp->tcp_header_len) + newicsk->icsk_ack.last_seg_size = skb->len - newtp->tcp_header_len; + newtp->rx_opt.mss_clamp = req->mss; + tcp_ecn_openreq_child(newtp, req); + newtp->fastopen_req = NULL; + newtp->fastopen_rsk = NULL; + newtp->syn_data_acked = 0; + newtp->rack.mstamp = 0; + newtp->rack.advanced = 0; + newtp->rack.reo_wnd_steps = 1; + newtp->rack.last_delivered = 0; + newtp->rack.reo_wnd_persist = 0; + newtp->rack.dsack_seen = 0; + + __TCP_INC_STATS(sock_net(sk), TCP_MIB_PASSIVEOPENS); + return newsk; } EXPORT_SYMBOL(tcp_create_openreq_child); @@ -600,7 +603,7 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, * it can be estimated (approximately) * from another data. */ - tmp_opt.ts_recent_stamp = get_seconds() - ((TCP_TIMEOUT_INIT/HZ)<<req->num_timeout); + tmp_opt.ts_recent_stamp = ktime_get_seconds() - ((TCP_TIMEOUT_INIT/HZ)<<req->num_timeout); paws_reject = tcp_paws_reject(&tmp_opt, th->rst); } } diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c index 8cc7c3487330..870b0a335061 100644 --- a/net/ipv4/tcp_offload.c +++ b/net/ipv4/tcp_offload.c @@ -180,9 +180,9 @@ out: return segs; } -struct sk_buff **tcp_gro_receive(struct sk_buff **head, struct sk_buff *skb) +struct sk_buff *tcp_gro_receive(struct list_head *head, struct sk_buff *skb) { - struct sk_buff **pp = NULL; + struct sk_buff *pp = NULL; struct sk_buff *p; struct tcphdr *th; struct tcphdr *th2; @@ -220,7 +220,7 @@ struct sk_buff **tcp_gro_receive(struct sk_buff **head, struct sk_buff *skb) len = skb_gro_len(skb); flags = tcp_flag_word(th); - for (; (p = *head); head = &p->next) { + list_for_each_entry(p, head, list) { if (!NAPI_GRO_CB(p)->same_flow) continue; @@ -233,7 +233,7 @@ struct sk_buff **tcp_gro_receive(struct sk_buff **head, struct sk_buff *skb) goto found; } - + p = NULL; goto out_check_final; found: @@ -262,8 +262,11 @@ found: flush |= (len - 1) >= mss; flush |= (ntohl(th2->seq) + skb_gro_len(p)) ^ ntohl(th->seq); +#ifdef CONFIG_TLS_DEVICE + flush |= p->decrypted ^ skb->decrypted; +#endif - if (flush || skb_gro_receive(head, skb)) { + if (flush || skb_gro_receive(p, skb)) { mss = 1; goto out_check_final; } @@ -277,7 +280,7 @@ out_check_final: TCP_FLAG_FIN)); if (p && (!NAPI_GRO_CB(skb)->same_flow || flush)) - pp = head; + pp = p; out: NAPI_GRO_CB(skb)->flush |= (flush != 0); @@ -302,7 +305,7 @@ int tcp_gro_complete(struct sk_buff *skb) } EXPORT_SYMBOL(tcp_gro_complete); -static struct sk_buff **tcp4_gro_receive(struct sk_buff **head, struct sk_buff *skb) +static struct sk_buff *tcp4_gro_receive(struct list_head *head, struct sk_buff *skb) { /* Don't bother verifying checksum if we're going to flush anyway. */ if (!NAPI_GRO_CB(skb)->flush && diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 8e08b409c71e..597dbd749f05 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -160,7 +160,8 @@ static void tcp_event_data_sent(struct tcp_sock *tp, } /* Account for an ACK we sent. */ -static inline void tcp_event_ack_sent(struct sock *sk, unsigned int pkts) +static inline void tcp_event_ack_sent(struct sock *sk, unsigned int pkts, + u32 rcv_nxt) { struct tcp_sock *tp = tcp_sk(sk); @@ -171,6 +172,9 @@ static inline void tcp_event_ack_sent(struct sock *sk, unsigned int pkts) if (hrtimer_try_to_cancel(&tp->compressed_ack_timer) == 1) __sock_put(sk); } + + if (unlikely(rcv_nxt != tp->rcv_nxt)) + return; /* Special ACK sent by DCTCP to reflect ECN */ tcp_dec_quickack_mode(sk, pkts); inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK); } @@ -973,17 +977,6 @@ enum hrtimer_restart tcp_pace_kick(struct hrtimer *timer) return HRTIMER_NORESTART; } -/* BBR congestion control needs pacing. - * Same remark for SO_MAX_PACING_RATE. - * sch_fq packet scheduler is efficiently handling pacing, - * but is not always installed/used. - * Return true if TCP stack should pace packets itself. - */ -static bool tcp_needs_internal_pacing(const struct sock *sk) -{ - return smp_load_acquire(&sk->sk_pacing_status) == SK_PACING_NEEDED; -} - static void tcp_internal_pacing(struct sock *sk, const struct sk_buff *skb) { u64 len_ns; @@ -995,9 +988,6 @@ static void tcp_internal_pacing(struct sock *sk, const struct sk_buff *skb) if (!rate || rate == ~0U) return; - /* Should account for header sizes as sch_fq does, - * but lets make things simple. - */ len_ns = (u64)skb->len * NSEC_PER_SEC; do_div(len_ns, rate); hrtimer_start(&tcp_sk(sk)->pacing_timer, @@ -1023,8 +1013,8 @@ static void tcp_update_skb_after_send(struct tcp_sock *tp, struct sk_buff *skb) * We are working here with either a clone of the original * SKB, or a fresh unique copy made by the retransmit engine. */ -static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, - gfp_t gfp_mask) +static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, + int clone_it, gfp_t gfp_mask, u32 rcv_nxt) { const struct inet_connection_sock *icsk = inet_csk(sk); struct inet_sock *inet; @@ -1100,7 +1090,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, th->source = inet->inet_sport; th->dest = inet->inet_dport; th->seq = htonl(tcb->seq); - th->ack_seq = htonl(tp->rcv_nxt); + th->ack_seq = htonl(rcv_nxt); *(((__be16 *)th) + 6) = htons(((tcp_header_size >> 2) << 12) | tcb->tcp_flags); @@ -1141,11 +1131,12 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, icsk->icsk_af_ops->send_check(sk, skb); if (likely(tcb->tcp_flags & TCPHDR_ACK)) - tcp_event_ack_sent(sk, tcp_skb_pcount(skb)); + tcp_event_ack_sent(sk, tcp_skb_pcount(skb), rcv_nxt); if (skb->len != tcp_header_size) { tcp_event_data_sent(tp, sk); tp->data_segs_out += tcp_skb_pcount(skb); + tp->bytes_sent += skb->len - tcp_header_size; tcp_internal_pacing(sk, skb); } @@ -1178,6 +1169,13 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, return err; } +static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, + gfp_t gfp_mask) +{ + return __tcp_transmit_skb(sk, skb, clone_it, gfp_mask, + tcp_sk(sk)->rcv_nxt); +} + /* This routine just queues the buffer for sending. * * NOTE: probe0 timer is not checked, do not forget tcp_push_pending_frames, @@ -2700,9 +2698,8 @@ static bool tcp_collapse_retrans(struct sock *sk, struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *next_skb = skb_rb_next(skb); - int skb_size, next_skb_size; + int next_skb_size; - skb_size = skb->len; next_skb_size = next_skb->len; BUG_ON(tcp_skb_pcount(skb) != 1 || tcp_skb_pcount(next_skb) != 1); @@ -2873,6 +2870,7 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs) if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN) __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSYNRETRANS); tp->total_retrans += segs; + tp->bytes_retrans += skb->len; /* make sure skb->data is aligned on arches that require it * and check if ack-trimming & collapsing extended the headroom @@ -3523,8 +3521,6 @@ void tcp_send_delayed_ack(struct sock *sk) int ato = icsk->icsk_ack.ato; unsigned long timeout; - tcp_ca_event(sk, CA_EVENT_DELAYED_ACK); - if (ato > TCP_DELACK_MIN) { const struct tcp_sock *tp = tcp_sk(sk); int max_ato = HZ / 2; @@ -3573,7 +3569,7 @@ void tcp_send_delayed_ack(struct sock *sk) } /* This routine sends an ack and also updates the window. */ -void tcp_send_ack(struct sock *sk) +void __tcp_send_ack(struct sock *sk, u32 rcv_nxt) { struct sk_buff *buff; @@ -3581,8 +3577,6 @@ void tcp_send_ack(struct sock *sk) if (sk->sk_state == TCP_CLOSE) return; - tcp_ca_event(sk, CA_EVENT_NON_DELAYED_ACK); - /* We are not putting this on the write queue, so * tcp_transmit_skb() will set the ownership to this * sock. @@ -3608,9 +3602,14 @@ void tcp_send_ack(struct sock *sk) skb_set_tcp_pure_ack(buff); /* Send it off, this clears delayed acks for us. */ - tcp_transmit_skb(sk, buff, 0, (__force gfp_t)0); + __tcp_transmit_skb(sk, buff, 0, (__force gfp_t)0, rcv_nxt); +} +EXPORT_SYMBOL_GPL(__tcp_send_ack); + +void tcp_send_ack(struct sock *sk) +{ + __tcp_send_ack(sk, tcp_sk(sk)->rcv_nxt); } -EXPORT_SYMBOL_GPL(tcp_send_ack); /* This routine sends a packet with an out of date sequence * number. It assumes the other end will try to ack it. diff --git a/net/ipv4/tcp_rate.c b/net/ipv4/tcp_rate.c index c61240e43923..4dff40dad4dc 100644 --- a/net/ipv4/tcp_rate.c +++ b/net/ipv4/tcp_rate.c @@ -146,6 +146,10 @@ void tcp_rate_gen(struct sock *sk, u32 delivered, u32 lost, rs->prior_mstamp); /* ack phase */ rs->interval_us = max(snd_us, ack_us); + /* Record both segment send and ack receive intervals */ + rs->snd_interval_us = snd_us; + rs->rcv_interval_us = ack_us; + /* Normally we expect interval_us >= min-rtt. * Note that rate may still be over-estimated when a spuriously * retransmistted skb was first (s)acked because "interval_us" diff --git a/net/ipv4/tcp_recovery.c b/net/ipv4/tcp_recovery.c index 71593e4400ab..c81aadff769b 100644 --- a/net/ipv4/tcp_recovery.c +++ b/net/ipv4/tcp_recovery.c @@ -25,7 +25,7 @@ static u32 tcp_rack_reo_wnd(const struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); - if (!tp->rack.reord) { + if (!tp->reord_seen) { /* If reordering has not been observed, be aggressive during * the recovery or starting the recovery by DUPACK threshold. */ diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 3b3611729928..7fdf222a0bdf 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -22,6 +22,35 @@ #include <linux/gfp.h> #include <net/tcp.h> +static u32 tcp_retransmit_stamp(const struct sock *sk) +{ + u32 start_ts = tcp_sk(sk)->retrans_stamp; + + if (unlikely(!start_ts)) { + struct sk_buff *head = tcp_rtx_queue_head(sk); + + if (!head) + return 0; + start_ts = tcp_skb_timestamp(head); + } + return start_ts; +} + +static u32 tcp_clamp_rto_to_user_timeout(const struct sock *sk) +{ + struct inet_connection_sock *icsk = inet_csk(sk); + u32 elapsed, start_ts; + + start_ts = tcp_retransmit_stamp(sk); + if (!icsk->icsk_user_timeout || !start_ts) + return icsk->icsk_rto; + elapsed = tcp_time_stamp(tcp_sk(sk)) - start_ts; + if (elapsed >= icsk->icsk_user_timeout) + return 1; /* user timeout has passed; fire ASAP */ + else + return min_t(u32, icsk->icsk_rto, msecs_to_jiffies(icsk->icsk_user_timeout - elapsed)); +} + /** * tcp_write_err() - close socket and save error info * @sk: The socket the error has appeared on. @@ -166,14 +195,9 @@ static bool retransmits_timed_out(struct sock *sk, if (!inet_csk(sk)->icsk_retransmits) return false; - start_ts = tcp_sk(sk)->retrans_stamp; - if (unlikely(!start_ts)) { - struct sk_buff *head = tcp_rtx_queue_head(sk); - - if (!head) - return false; - start_ts = tcp_skb_timestamp(head); - } + start_ts = tcp_retransmit_stamp(sk); + if (!start_ts) + return false; if (likely(timeout == 0)) { linear_backoff_thresh = ilog2(TCP_RTO_MAX/rto_base); @@ -183,8 +207,9 @@ static bool retransmits_timed_out(struct sock *sk, else timeout = ((2 << linear_backoff_thresh) - 1) * rto_base + (boundary - linear_backoff_thresh) * TCP_RTO_MAX; + timeout = jiffies_to_msecs(timeout); } - return (tcp_time_stamp(tcp_sk(sk)) - start_ts) >= jiffies_to_msecs(timeout); + return (tcp_time_stamp(tcp_sk(sk)) - start_ts) >= timeout; } /* A write timeout has occurred. Process the after effects. */ @@ -337,8 +362,7 @@ static void tcp_probe_timer(struct sock *sk) if (!start_ts) skb->skb_mstamp = tp->tcp_mstamp; else if (icsk->icsk_user_timeout && - (s32)(tcp_time_stamp(tp) - start_ts) > - jiffies_to_msecs(icsk->icsk_user_timeout)) + (s32)(tcp_time_stamp(tp) - start_ts) > icsk->icsk_user_timeout) goto abort; max_probes = sock_net(sk)->ipv4.sysctl_tcp_retries2; @@ -535,7 +559,8 @@ out_reset_timer: /* Use normal (exponential) backoff */ icsk->icsk_rto = min(icsk->icsk_rto << 1, TCP_RTO_MAX); } - inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, icsk->icsk_rto, TCP_RTO_MAX); + inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, + tcp_clamp_rto_to_user_timeout(sk), TCP_RTO_MAX); if (retransmits_timed_out(sk, net->ipv4.sysctl_tcp_retries1 + 1, 0)) __sk_dst_reset(sk); @@ -672,7 +697,7 @@ static void tcp_keepalive_timer (struct timer_list *t) * to determine when to timeout instead. */ if ((icsk->icsk_user_timeout != 0 && - elapsed >= icsk->icsk_user_timeout && + elapsed >= msecs_to_jiffies(icsk->icsk_user_timeout) && icsk->icsk_probes_out > 0) || (icsk->icsk_user_timeout == 0 && icsk->icsk_probes_out >= keepalive_probes(tp))) { diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 9bb27df4dac5..f4e35b2ff8b8 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -221,11 +221,12 @@ static int udp_reuseport_add_sock(struct sock *sk, struct udp_hslot *hslot) (sk2->sk_bound_dev_if == sk->sk_bound_dev_if) && sk2->sk_reuseport && uid_eq(uid, sock_i_uid(sk2)) && inet_rcv_saddr_equal(sk, sk2, false)) { - return reuseport_add_sock(sk, sk2); + return reuseport_add_sock(sk, sk2, + inet_rcv_saddr_any(sk)); } } - return reuseport_alloc(sk); + return reuseport_alloc(sk, inet_rcv_saddr_any(sk)); } /** @@ -498,6 +499,8 @@ struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr, daddr, hnum, dif, sdif, exact_dif, hslot2, skb); } + if (unlikely(IS_ERR(result))) + return NULL; return result; } begin: @@ -512,6 +515,8 @@ begin: saddr, sport); result = reuseport_select_sock(sk, hash, skb, sizeof(struct udphdr)); + if (unlikely(IS_ERR(result))) + return NULL; if (result) return result; } @@ -926,11 +931,6 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) if (msg->msg_flags & MSG_OOB) /* Mirror BSD error message compatibility */ return -EOPNOTSUPP; - ipc.opt = NULL; - ipc.tx_flags = 0; - ipc.ttl = 0; - ipc.tos = -1; - getfrag = is_udplite ? udplite_getfrag : ip_generic_getfrag; fl4 = &inet->cork.fl.u.ip4; @@ -977,9 +977,7 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) connected = 1; } - ipc.sockc.tsflags = sk->sk_tsflags; - ipc.addr = inet->inet_saddr; - ipc.oif = sk->sk_bound_dev_if; + ipcm_init_sk(&ipc, inet); ipc.gso_size = up->gso_size; if (msg->msg_controllen) { @@ -1027,8 +1025,6 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) saddr = ipc.addr; ipc.addr = faddr = daddr; - sock_tx_timestamp(sk, ipc.sockc.tsflags, &ipc.tx_flags); - if (ipc.opt && ipc.opt->opt.srr) { if (!daddr) { err = -EINVAL; @@ -2591,7 +2587,7 @@ int compat_udp_getsockopt(struct sock *sk, int level, int optname, * udp_poll - wait for a UDP event. * @file - file struct * @sock - socket - * @events - events to wait for + * @wait - poll table * * This is same as datagram poll, except for the special case of * blocking sockets. If application is using a blocking fd @@ -2600,23 +2596,23 @@ int compat_udp_getsockopt(struct sock *sk, int level, int optname, * but then block when reading it. Add special case code * to work around these arguably broken applications. */ -__poll_t udp_poll_mask(struct socket *sock, __poll_t events) +__poll_t udp_poll(struct file *file, struct socket *sock, poll_table *wait) { - __poll_t mask = datagram_poll_mask(sock, events); + __poll_t mask = datagram_poll(file, sock, wait); struct sock *sk = sock->sk; if (!skb_queue_empty(&udp_sk(sk)->reader_queue)) mask |= EPOLLIN | EPOLLRDNORM; /* Check for false positives due to checksum errors */ - if ((mask & EPOLLRDNORM) && !(sock->file->f_flags & O_NONBLOCK) && + if ((mask & EPOLLRDNORM) && !(file->f_flags & O_NONBLOCK) && !(sk->sk_shutdown & RCV_SHUTDOWN) && first_packet_length(sk) == -1) mask &= ~(EPOLLIN | EPOLLRDNORM); return mask; } -EXPORT_SYMBOL(udp_poll_mask); +EXPORT_SYMBOL(udp_poll); int udp_abort(struct sock *sk, int err) { diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c index 92dc9e5a7ff3..0c0522b79b43 100644 --- a/net/ipv4/udp_offload.c +++ b/net/ipv4/udp_offload.c @@ -343,10 +343,11 @@ out: return segs; } -struct sk_buff **udp_gro_receive(struct sk_buff **head, struct sk_buff *skb, - struct udphdr *uh, udp_lookup_t lookup) +struct sk_buff *udp_gro_receive(struct list_head *head, struct sk_buff *skb, + struct udphdr *uh, udp_lookup_t lookup) { - struct sk_buff *p, **pp = NULL; + struct sk_buff *pp = NULL; + struct sk_buff *p; struct udphdr *uh2; unsigned int off = skb_gro_offset(skb); int flush = 1; @@ -371,7 +372,7 @@ struct sk_buff **udp_gro_receive(struct sk_buff **head, struct sk_buff *skb, unflush: flush = 0; - for (p = *head; p; p = p->next) { + list_for_each_entry(p, head, list) { if (!NAPI_GRO_CB(p)->same_flow) continue; @@ -394,13 +395,13 @@ unflush: out_unlock: rcu_read_unlock(); out: - NAPI_GRO_CB(skb)->flush |= flush; + skb_gro_flush_final(skb, pp, flush); return pp; } EXPORT_SYMBOL(udp_gro_receive); -static struct sk_buff **udp4_gro_receive(struct sk_buff **head, - struct sk_buff *skb) +static struct sk_buff *udp4_gro_receive(struct list_head *head, + struct sk_buff *skb) { struct udphdr *uh = udp_gro_udphdr(skb); |