diff options
Diffstat (limited to 'net/ipv4')
-rw-r--r-- | net/ipv4/fib_frontend.c | 5 | ||||
-rw-r--r-- | net/ipv4/fou.c | 4 | ||||
-rw-r--r-- | net/ipv4/gre_offload.c | 2 | ||||
-rw-r--r-- | net/ipv4/igmp.c | 62 | ||||
-rw-r--r-- | net/ipv4/inet_fragment.c | 8 | ||||
-rw-r--r-- | net/ipv4/ip_fragment.c | 5 | ||||
-rw-r--r-- | net/ipv4/ip_output.c | 2 | ||||
-rw-r--r-- | net/ipv4/ip_sockglue.c | 11 | ||||
-rw-r--r-- | net/ipv4/netfilter/ip_tables.c | 1 | ||||
-rw-r--r-- | net/ipv4/netfilter/nf_tproxy_ipv4.c | 18 | ||||
-rw-r--r-- | net/ipv4/sysctl_net_ipv4.c | 23 | ||||
-rw-r--r-- | net/ipv4/tcp.c | 16 | ||||
-rw-r--r-- | net/ipv4/tcp_bbr.c | 4 | ||||
-rw-r--r-- | net/ipv4/tcp_dctcp.c | 75 | ||||
-rw-r--r-- | net/ipv4/tcp_input.c | 87 | ||||
-rw-r--r-- | net/ipv4/tcp_ipv4.c | 23 | ||||
-rw-r--r-- | net/ipv4/tcp_output.c | 36 | ||||
-rw-r--r-- | net/ipv4/udp_offload.c | 2 |
18 files changed, 240 insertions, 144 deletions
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index b21833651394..2998b0e47d4b 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -292,18 +292,19 @@ __be32 fib_compute_spec_dst(struct sk_buff *skb) return ip_hdr(skb)->daddr; in_dev = __in_dev_get_rcu(dev); - BUG_ON(!in_dev); net = dev_net(dev); scope = RT_SCOPE_UNIVERSE; if (!ipv4_is_zeronet(ip_hdr(skb)->saddr)) { + bool vmark = in_dev && IN_DEV_SRC_VMARK(in_dev); struct flowi4 fl4 = { .flowi4_iif = LOOPBACK_IFINDEX, + .flowi4_oif = l3mdev_master_ifindex_rcu(dev), .daddr = ip_hdr(skb)->saddr, .flowi4_tos = RT_TOS(ip_hdr(skb)->tos), .flowi4_scope = scope, - .flowi4_mark = IN_DEV_SRC_VMARK(in_dev) ? skb->mark : 0, + .flowi4_mark = vmark ? skb->mark : 0, }; if (!fib_lookup(net, &fl4, &res, 0)) return FIB_RES_PREFSRC(net, res); diff --git a/net/ipv4/fou.c b/net/ipv4/fou.c index 1540db65241a..c9ec1603666b 100644 --- a/net/ipv4/fou.c +++ b/net/ipv4/fou.c @@ -448,9 +448,7 @@ next_proto: out_unlock: rcu_read_unlock(); out: - NAPI_GRO_CB(skb)->flush |= flush; - skb_gro_remcsum_cleanup(skb, &grc); - skb->remcsum_offload = 0; + skb_gro_flush_final_remcsum(skb, pp, flush, &grc); return pp; } diff --git a/net/ipv4/gre_offload.c b/net/ipv4/gre_offload.c index 1859c473b21a..6a7d980105f6 100644 --- a/net/ipv4/gre_offload.c +++ b/net/ipv4/gre_offload.c @@ -223,7 +223,7 @@ static struct sk_buff **gre_gro_receive(struct sk_buff **head, out_unlock: rcu_read_unlock(); out: - NAPI_GRO_CB(skb)->flush |= flush; + skb_gro_flush_final(skb, pp, flush); return pp; } diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index 85b617b655bc..75151be21413 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -1200,13 +1200,13 @@ static void igmpv3_del_delrec(struct in_device *in_dev, struct ip_mc_list *im) spin_lock_bh(&im->lock); if (pmc) { im->interface = pmc->interface; - im->crcount = in_dev->mr_qrv ?: net->ipv4.sysctl_igmp_qrv; - im->sfmode = pmc->sfmode; - if (pmc->sfmode == MCAST_INCLUDE) { + if (im->sfmode == MCAST_INCLUDE) { im->tomb = pmc->tomb; im->sources = pmc->sources; for (psf = im->sources; psf; psf = psf->sf_next) - psf->sf_crcount = im->crcount; + psf->sf_crcount = in_dev->mr_qrv ?: net->ipv4.sysctl_igmp_qrv; + } else { + im->crcount = in_dev->mr_qrv ?: net->ipv4.sysctl_igmp_qrv; } in_dev_put(pmc->interface); kfree(pmc); @@ -1288,7 +1288,7 @@ static void igmp_group_dropped(struct ip_mc_list *im) #endif } -static void igmp_group_added(struct ip_mc_list *im) +static void igmp_group_added(struct ip_mc_list *im, unsigned int mode) { struct in_device *in_dev = im->interface; #ifdef CONFIG_IP_MULTICAST @@ -1316,7 +1316,13 @@ static void igmp_group_added(struct ip_mc_list *im) } /* else, v3 */ - im->crcount = in_dev->mr_qrv ?: net->ipv4.sysctl_igmp_qrv; + /* Based on RFC3376 5.1, for newly added INCLUDE SSM, we should + * not send filter-mode change record as the mode should be from + * IN() to IN(A). + */ + if (mode == MCAST_EXCLUDE) + im->crcount = in_dev->mr_qrv ?: net->ipv4.sysctl_igmp_qrv; + igmp_ifc_event(in_dev); #endif } @@ -1381,8 +1387,8 @@ static void ip_mc_hash_remove(struct in_device *in_dev, /* * A socket has joined a multicast group on device dev. */ - -void ip_mc_inc_group(struct in_device *in_dev, __be32 addr) +static void __ip_mc_inc_group(struct in_device *in_dev, __be32 addr, + unsigned int mode) { struct ip_mc_list *im; #ifdef CONFIG_IP_MULTICAST @@ -1394,7 +1400,7 @@ void ip_mc_inc_group(struct in_device *in_dev, __be32 addr) for_each_pmc_rtnl(in_dev, im) { if (im->multiaddr == addr) { im->users++; - ip_mc_add_src(in_dev, &addr, MCAST_EXCLUDE, 0, NULL, 0); + ip_mc_add_src(in_dev, &addr, mode, 0, NULL, 0); goto out; } } @@ -1408,8 +1414,8 @@ void ip_mc_inc_group(struct in_device *in_dev, __be32 addr) in_dev_hold(in_dev); im->multiaddr = addr; /* initial mode is (EX, empty) */ - im->sfmode = MCAST_EXCLUDE; - im->sfcount[MCAST_EXCLUDE] = 1; + im->sfmode = mode; + im->sfcount[mode] = 1; refcount_set(&im->refcnt, 1); spin_lock_init(&im->lock); #ifdef CONFIG_IP_MULTICAST @@ -1426,12 +1432,17 @@ void ip_mc_inc_group(struct in_device *in_dev, __be32 addr) #ifdef CONFIG_IP_MULTICAST igmpv3_del_delrec(in_dev, im); #endif - igmp_group_added(im); + igmp_group_added(im, mode); if (!in_dev->dead) ip_rt_multicast_event(in_dev); out: return; } + +void ip_mc_inc_group(struct in_device *in_dev, __be32 addr) +{ + __ip_mc_inc_group(in_dev, addr, MCAST_EXCLUDE); +} EXPORT_SYMBOL(ip_mc_inc_group); static int ip_mc_check_iphdr(struct sk_buff *skb) @@ -1688,7 +1699,7 @@ void ip_mc_remap(struct in_device *in_dev) #ifdef CONFIG_IP_MULTICAST igmpv3_del_delrec(in_dev, pmc); #endif - igmp_group_added(pmc); + igmp_group_added(pmc, pmc->sfmode); } } @@ -1751,7 +1762,7 @@ void ip_mc_up(struct in_device *in_dev) #ifdef CONFIG_IP_MULTICAST igmpv3_del_delrec(in_dev, pmc); #endif - igmp_group_added(pmc); + igmp_group_added(pmc, pmc->sfmode); } } @@ -2130,8 +2141,8 @@ static void ip_mc_clear_src(struct ip_mc_list *pmc) /* Join a multicast group */ - -int ip_mc_join_group(struct sock *sk, struct ip_mreqn *imr) +static int __ip_mc_join_group(struct sock *sk, struct ip_mreqn *imr, + unsigned int mode) { __be32 addr = imr->imr_multiaddr.s_addr; struct ip_mc_socklist *iml, *i; @@ -2172,15 +2183,30 @@ int ip_mc_join_group(struct sock *sk, struct ip_mreqn *imr) memcpy(&iml->multi, imr, sizeof(*imr)); iml->next_rcu = inet->mc_list; iml->sflist = NULL; - iml->sfmode = MCAST_EXCLUDE; + iml->sfmode = mode; rcu_assign_pointer(inet->mc_list, iml); - ip_mc_inc_group(in_dev, addr); + __ip_mc_inc_group(in_dev, addr, mode); err = 0; done: return err; } + +/* Join ASM (Any-Source Multicast) group + */ +int ip_mc_join_group(struct sock *sk, struct ip_mreqn *imr) +{ + return __ip_mc_join_group(sk, imr, MCAST_EXCLUDE); +} EXPORT_SYMBOL(ip_mc_join_group); +/* Join SSM (Source-Specific Multicast) group + */ +int ip_mc_join_group_ssm(struct sock *sk, struct ip_mreqn *imr, + unsigned int mode) +{ + return __ip_mc_join_group(sk, imr, mode); +} + static int ip_mc_leave_src(struct sock *sk, struct ip_mc_socklist *iml, struct in_device *in_dev) { diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c index c9e35b81d093..0d70608cc2e1 100644 --- a/net/ipv4/inet_fragment.c +++ b/net/ipv4/inet_fragment.c @@ -90,7 +90,7 @@ static void inet_frags_free_cb(void *ptr, void *arg) void inet_frags_exit_net(struct netns_frags *nf) { - nf->low_thresh = 0; /* prevent creation of new frags */ + nf->high_thresh = 0; /* prevent creation of new frags */ rhashtable_free_and_destroy(&nf->rhashtable, inet_frags_free_cb, NULL); } @@ -157,9 +157,6 @@ static struct inet_frag_queue *inet_frag_alloc(struct netns_frags *nf, { struct inet_frag_queue *q; - if (!nf->high_thresh || frag_mem_limit(nf) > nf->high_thresh) - return NULL; - q = kmem_cache_zalloc(f->frags_cachep, GFP_ATOMIC); if (!q) return NULL; @@ -204,6 +201,9 @@ struct inet_frag_queue *inet_frag_find(struct netns_frags *nf, void *key) { struct inet_frag_queue *fq; + if (!nf->high_thresh || frag_mem_limit(nf) > nf->high_thresh) + return NULL; + rcu_read_lock(); fq = rhashtable_lookup(&nf->rhashtable, key, nf->f->rhash_params); diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index 8e9528ebaa8e..d14d741fb05e 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -383,11 +383,16 @@ found: int i = end - next->ip_defrag_offset; /* overlap is 'i' bytes */ if (i < next->len) { + int delta = -next->truesize; + /* Eat head of the next overlapped fragment * and leave the loop. The next ones cannot overlap. */ if (!pskb_pull(next, i)) goto err; + delta += next->truesize; + if (delta) + add_frag_mem_limit(qp->q.net, delta); next->ip_defrag_offset += i; qp->q.meat -= i; if (next->ip_summed != CHECKSUM_UNNECESSARY) diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index b3308e9d9762..0e3edd25f881 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -523,6 +523,8 @@ static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from) to->dev = from->dev; to->mark = from->mark; + skb_copy_hash(to, from); + /* Copy the flags to each fragment. */ IPCB(to)->flags = IPCB(from)->flags; diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index fc32fdbeefa6..c0fe5ad996f2 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -150,15 +150,18 @@ static void ip_cmsg_recv_dstaddr(struct msghdr *msg, struct sk_buff *skb) { struct sockaddr_in sin; const struct iphdr *iph = ip_hdr(skb); - __be16 *ports = (__be16 *)skb_transport_header(skb); + __be16 *ports; + int end; - if (skb_transport_offset(skb) + 4 > (int)skb->len) + end = skb_transport_offset(skb) + 4; + if (end > 0 && !pskb_may_pull(skb, end)) return; /* All current transport protocols have the port numbers in the * first four bytes of the transport header and this function is * written with this assumption in mind. */ + ports = (__be16 *)skb_transport_header(skb); sin.sin_family = AF_INET; sin.sin_addr.s_addr = iph->daddr; @@ -984,7 +987,7 @@ static int do_ip_setsockopt(struct sock *sk, int level, mreq.imr_multiaddr.s_addr = mreqs.imr_multiaddr; mreq.imr_address.s_addr = mreqs.imr_interface; mreq.imr_ifindex = 0; - err = ip_mc_join_group(sk, &mreq); + err = ip_mc_join_group_ssm(sk, &mreq, MCAST_INCLUDE); if (err && err != -EADDRINUSE) break; omode = MCAST_INCLUDE; @@ -1061,7 +1064,7 @@ static int do_ip_setsockopt(struct sock *sk, int level, mreq.imr_multiaddr = psin->sin_addr; mreq.imr_address.s_addr = 0; mreq.imr_ifindex = greqs.gsr_interface; - err = ip_mc_join_group(sk, &mreq); + err = ip_mc_join_group_ssm(sk, &mreq, MCAST_INCLUDE); if (err && err != -EADDRINUSE) break; greqs.gsr_interface = mreq.imr_ifindex; diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index ca0dad90803a..e77872c93c20 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -1898,6 +1898,7 @@ static struct xt_match ipt_builtin_mt[] __read_mostly = { .checkentry = icmp_checkentry, .proto = IPPROTO_ICMP, .family = NFPROTO_IPV4, + .me = THIS_MODULE, }, }; diff --git a/net/ipv4/netfilter/nf_tproxy_ipv4.c b/net/ipv4/netfilter/nf_tproxy_ipv4.c index 805e83ec3ad9..164714104965 100644 --- a/net/ipv4/netfilter/nf_tproxy_ipv4.c +++ b/net/ipv4/netfilter/nf_tproxy_ipv4.c @@ -37,7 +37,7 @@ nf_tproxy_handle_time_wait4(struct net *net, struct sk_buff *skb, * to a listener socket if there's one */ struct sock *sk2; - sk2 = nf_tproxy_get_sock_v4(net, skb, hp, iph->protocol, + sk2 = nf_tproxy_get_sock_v4(net, skb, iph->protocol, iph->saddr, laddr ? laddr : iph->daddr, hp->source, lport ? lport : hp->dest, skb->dev, NF_TPROXY_LOOKUP_LISTENER); @@ -71,7 +71,7 @@ __be32 nf_tproxy_laddr4(struct sk_buff *skb, __be32 user_laddr, __be32 daddr) EXPORT_SYMBOL_GPL(nf_tproxy_laddr4); struct sock * -nf_tproxy_get_sock_v4(struct net *net, struct sk_buff *skb, void *hp, +nf_tproxy_get_sock_v4(struct net *net, struct sk_buff *skb, const u8 protocol, const __be32 saddr, const __be32 daddr, const __be16 sport, const __be16 dport, @@ -79,16 +79,21 @@ nf_tproxy_get_sock_v4(struct net *net, struct sk_buff *skb, void *hp, const enum nf_tproxy_lookup_t lookup_type) { struct sock *sk; - struct tcphdr *tcph; switch (protocol) { - case IPPROTO_TCP: + case IPPROTO_TCP: { + struct tcphdr _hdr, *hp; + + hp = skb_header_pointer(skb, ip_hdrlen(skb), + sizeof(struct tcphdr), &_hdr); + if (hp == NULL) + return NULL; + switch (lookup_type) { case NF_TPROXY_LOOKUP_LISTENER: - tcph = hp; sk = inet_lookup_listener(net, &tcp_hashinfo, skb, ip_hdrlen(skb) + - __tcp_hdrlen(tcph), + __tcp_hdrlen(hp), saddr, sport, daddr, dport, in->ifindex, 0); @@ -110,6 +115,7 @@ nf_tproxy_get_sock_v4(struct net *net, struct sk_buff *skb, void *hp, BUG(); } break; + } case IPPROTO_UDP: sk = udp4_lib_lookup(net, saddr, sport, daddr, dport, in->ifindex); diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index d06247ba08b2..5fa335fd3852 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -189,8 +189,9 @@ static int ipv4_ping_group_range(struct ctl_table *table, int write, if (write && ret == 0) { low = make_kgid(user_ns, urange[0]); high = make_kgid(user_ns, urange[1]); - if (!gid_valid(low) || !gid_valid(high) || - (urange[1] < urange[0]) || gid_lt(high, low)) { + if (!gid_valid(low) || !gid_valid(high)) + return -EINVAL; + if (urange[1] < urange[0] || gid_lt(high, low)) { low = make_kgid(&init_user_ns, 1); high = make_kgid(&init_user_ns, 0); } @@ -265,8 +266,9 @@ static int proc_tcp_fastopen_key(struct ctl_table *table, int write, ipv4.sysctl_tcp_fastopen); struct ctl_table tbl = { .maxlen = (TCP_FASTOPEN_KEY_LENGTH * 2 + 10) }; struct tcp_fastopen_context *ctxt; - int ret; u32 user_key[4]; /* 16 bytes, matching TCP_FASTOPEN_KEY_LENGTH */ + __le32 key[4]; + int ret, i; tbl.data = kmalloc(tbl.maxlen, GFP_KERNEL); if (!tbl.data) @@ -275,11 +277,14 @@ static int proc_tcp_fastopen_key(struct ctl_table *table, int write, rcu_read_lock(); ctxt = rcu_dereference(net->ipv4.tcp_fastopen_ctx); if (ctxt) - memcpy(user_key, ctxt->key, TCP_FASTOPEN_KEY_LENGTH); + memcpy(key, ctxt->key, TCP_FASTOPEN_KEY_LENGTH); else - memset(user_key, 0, sizeof(user_key)); + memset(key, 0, sizeof(key)); rcu_read_unlock(); + for (i = 0; i < ARRAY_SIZE(key); i++) + user_key[i] = le32_to_cpu(key[i]); + snprintf(tbl.data, tbl.maxlen, "%08x-%08x-%08x-%08x", user_key[0], user_key[1], user_key[2], user_key[3]); ret = proc_dostring(&tbl, write, buffer, lenp, ppos); @@ -290,13 +295,17 @@ static int proc_tcp_fastopen_key(struct ctl_table *table, int write, ret = -EINVAL; goto bad_key; } - tcp_fastopen_reset_cipher(net, NULL, user_key, + + for (i = 0; i < ARRAY_SIZE(user_key); i++) + key[i] = cpu_to_le32(user_key[i]); + + tcp_fastopen_reset_cipher(net, NULL, key, TCP_FASTOPEN_KEY_LENGTH); } bad_key: pr_debug("proc FO key set 0x%x-%x-%x-%x <- 0x%s: %u\n", - user_key[0], user_key[1], user_key[2], user_key[3], + user_key[0], user_key[1], user_key[2], user_key[3], (char *)tbl.data, ret); kfree(tbl.data); return ret; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index e7b53d2a971f..4491faf83f4f 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1998,7 +1998,7 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock, * shouldn't happen. */ if (WARN(before(*seq, TCP_SKB_CB(skb)->seq), - "recvmsg bug: copied %X seq %X rcvnxt %X fl %X\n", + "TCP recvmsg seq # bug: copied %X, seq %X, rcvnxt %X, fl %X\n", *seq, TCP_SKB_CB(skb)->seq, tp->rcv_nxt, flags)) break; @@ -2013,7 +2013,7 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock, if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) goto found_fin_ok; WARN(!(flags & MSG_PEEK), - "recvmsg bug 2: copied %X seq %X rcvnxt %X fl %X\n", + "TCP recvmsg seq # bug 2: copied %X, seq %X, rcvnxt %X, fl %X\n", *seq, TCP_SKB_CB(skb)->seq, tp->rcv_nxt, flags); } @@ -2562,6 +2562,8 @@ int tcp_disconnect(struct sock *sk, int flags) tcp_clear_xmit_timers(sk); __skb_queue_purge(&sk->sk_receive_queue); + tp->copied_seq = tp->rcv_nxt; + tp->urg_data = 0; tcp_write_queue_purge(sk); tcp_fastopen_active_disable_ofo_check(sk); skb_rbtree_purge(&tp->out_of_order_queue); @@ -2821,14 +2823,17 @@ static int do_tcp_setsockopt(struct sock *sk, int level, case TCP_REPAIR: if (!tcp_can_repair_sock(sk)) err = -EPERM; - else if (val == 1) { + else if (val == TCP_REPAIR_ON) { tp->repair = 1; sk->sk_reuse = SK_FORCE_REUSE; tp->repair_queue = TCP_NO_QUEUE; - } else if (val == 0) { + } else if (val == TCP_REPAIR_OFF) { tp->repair = 0; sk->sk_reuse = SK_NO_REUSE; tcp_send_window_probe(sk); + } else if (val == TCP_REPAIR_OFF_NO_WP) { + tp->repair = 0; + sk->sk_reuse = SK_NO_REUSE; } else err = -EINVAL; @@ -3720,8 +3725,7 @@ int tcp_abort(struct sock *sk, int err) struct request_sock *req = inet_reqsk(sk); local_bh_disable(); - inet_csk_reqsk_queue_drop_and_put(req->rsk_listener, - req); + inet_csk_reqsk_queue_drop(req->rsk_listener, req); local_bh_enable(); return 0; } diff --git a/net/ipv4/tcp_bbr.c b/net/ipv4/tcp_bbr.c index 58e2f479ffb4..4bfff3c87e8e 100644 --- a/net/ipv4/tcp_bbr.c +++ b/net/ipv4/tcp_bbr.c @@ -354,6 +354,10 @@ static u32 bbr_target_cwnd(struct sock *sk, u32 bw, int gain) /* Reduce delayed ACKs by rounding up cwnd to the next even number. */ cwnd = (cwnd + 1) & ~1U; + /* Ensure gain cycling gets inflight above BDP even for small BDPs. */ + if (bbr->mode == BBR_PROBE_BW && gain > BBR_UNIT) + cwnd += 2; + return cwnd; } diff --git a/net/ipv4/tcp_dctcp.c b/net/ipv4/tcp_dctcp.c index 5f5e5936760e..8b637f9f23a2 100644 --- a/net/ipv4/tcp_dctcp.c +++ b/net/ipv4/tcp_dctcp.c @@ -55,7 +55,6 @@ struct dctcp { u32 dctcp_alpha; u32 next_seq; u32 ce_state; - u32 delayed_ack_reserved; u32 loss_cwnd; }; @@ -96,7 +95,6 @@ static void dctcp_init(struct sock *sk) ca->dctcp_alpha = min(dctcp_alpha_on_init, DCTCP_MAX_ALPHA); - ca->delayed_ack_reserved = 0; ca->loss_cwnd = 0; ca->ce_state = 0; @@ -131,23 +129,14 @@ static void dctcp_ce_state_0_to_1(struct sock *sk) struct dctcp *ca = inet_csk_ca(sk); struct tcp_sock *tp = tcp_sk(sk); - /* State has changed from CE=0 to CE=1 and delayed - * ACK has not sent yet. - */ - if (!ca->ce_state && ca->delayed_ack_reserved) { - u32 tmp_rcv_nxt; - - /* Save current rcv_nxt. */ - tmp_rcv_nxt = tp->rcv_nxt; - - /* Generate previous ack with CE=0. */ - tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR; - tp->rcv_nxt = ca->prior_rcv_nxt; - - tcp_send_ack(sk); - - /* Recover current rcv_nxt. */ - tp->rcv_nxt = tmp_rcv_nxt; + if (!ca->ce_state) { + /* State has changed from CE=0 to CE=1, force an immediate + * ACK to reflect the new CE state. If an ACK was delayed, + * send that first to reflect the prior CE state. + */ + if (inet_csk(sk)->icsk_ack.pending & ICSK_ACK_TIMER) + __tcp_send_ack(sk, ca->prior_rcv_nxt); + tcp_enter_quickack_mode(sk, 1); } ca->prior_rcv_nxt = tp->rcv_nxt; @@ -161,23 +150,14 @@ static void dctcp_ce_state_1_to_0(struct sock *sk) struct dctcp *ca = inet_csk_ca(sk); struct tcp_sock *tp = tcp_sk(sk); - /* State has changed from CE=1 to CE=0 and delayed - * ACK has not sent yet. - */ - if (ca->ce_state && ca->delayed_ack_reserved) { - u32 tmp_rcv_nxt; - - /* Save current rcv_nxt. */ - tmp_rcv_nxt = tp->rcv_nxt; - - /* Generate previous ack with CE=1. */ - tp->ecn_flags |= TCP_ECN_DEMAND_CWR; - tp->rcv_nxt = ca->prior_rcv_nxt; - - tcp_send_ack(sk); - - /* Recover current rcv_nxt. */ - tp->rcv_nxt = tmp_rcv_nxt; + if (ca->ce_state) { + /* State has changed from CE=1 to CE=0, force an immediate + * ACK to reflect the new CE state. If an ACK was delayed, + * send that first to reflect the prior CE state. + */ + if (inet_csk(sk)->icsk_ack.pending & ICSK_ACK_TIMER) + __tcp_send_ack(sk, ca->prior_rcv_nxt); + tcp_enter_quickack_mode(sk, 1); } ca->prior_rcv_nxt = tp->rcv_nxt; @@ -248,25 +228,6 @@ static void dctcp_state(struct sock *sk, u8 new_state) } } -static void dctcp_update_ack_reserved(struct sock *sk, enum tcp_ca_event ev) -{ - struct dctcp *ca = inet_csk_ca(sk); - - switch (ev) { - case CA_EVENT_DELAYED_ACK: - if (!ca->delayed_ack_reserved) - ca->delayed_ack_reserved = 1; - break; - case CA_EVENT_NON_DELAYED_ACK: - if (ca->delayed_ack_reserved) - ca->delayed_ack_reserved = 0; - break; - default: - /* Don't care for the rest. */ - break; - } -} - static void dctcp_cwnd_event(struct sock *sk, enum tcp_ca_event ev) { switch (ev) { @@ -276,10 +237,6 @@ static void dctcp_cwnd_event(struct sock *sk, enum tcp_ca_event ev) case CA_EVENT_ECN_NO_CE: dctcp_ce_state_1_to_0(sk); break; - case CA_EVENT_DELAYED_ACK: - case CA_EVENT_NON_DELAYED_ACK: - dctcp_update_ack_reserved(sk, ev); - break; default: /* Don't care for the rest. */ break; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 355d3dffd021..f9dcb29be12d 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -215,7 +215,7 @@ static void tcp_incr_quickack(struct sock *sk, unsigned int max_quickacks) icsk->icsk_ack.quick = quickacks; } -static void tcp_enter_quickack_mode(struct sock *sk, unsigned int max_quickacks) +void tcp_enter_quickack_mode(struct sock *sk, unsigned int max_quickacks) { struct inet_connection_sock *icsk = inet_csk(sk); @@ -223,6 +223,7 @@ static void tcp_enter_quickack_mode(struct sock *sk, unsigned int max_quickacks) icsk->icsk_ack.pingpong = 0; icsk->icsk_ack.ato = TCP_ATO_MIN; } +EXPORT_SYMBOL(tcp_enter_quickack_mode); /* Send ACKs quickly, if "quick" count is not exhausted * and the session is not interactive. @@ -245,8 +246,15 @@ static void tcp_ecn_queue_cwr(struct tcp_sock *tp) static void tcp_ecn_accept_cwr(struct tcp_sock *tp, const struct sk_buff *skb) { - if (tcp_hdr(skb)->cwr) + if (tcp_hdr(skb)->cwr) { tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR; + + /* If the sender is telling us it has entered CWR, then its + * cwnd may be very low (even just 1 packet), so we should ACK + * immediately. + */ + tcp_enter_quickack_mode((struct sock *)tp, 2); + } } static void tcp_ecn_withdraw_cwr(struct tcp_sock *tp) @@ -265,7 +273,7 @@ static void __tcp_ecn_check_ce(struct sock *sk, const struct sk_buff *skb) * it is probably a retransmit. */ if (tp->ecn_flags & TCP_ECN_SEEN) - tcp_enter_quickack_mode(sk, 1); + tcp_enter_quickack_mode(sk, 2); break; case INET_ECN_CE: if (tcp_ca_needs_ecn(sk)) @@ -273,7 +281,7 @@ static void __tcp_ecn_check_ce(struct sock *sk, const struct sk_buff *skb) if (!(tp->ecn_flags & TCP_ECN_DEMAND_CWR)) { /* Better not delay acks, sender can have a very low cwnd */ - tcp_enter_quickack_mode(sk, 1); + tcp_enter_quickack_mode(sk, 2); tp->ecn_flags |= TCP_ECN_DEMAND_CWR; } tp->ecn_flags |= TCP_ECN_SEEN; @@ -3181,6 +3189,15 @@ static int tcp_clean_rtx_queue(struct sock *sk, u32 prior_fack, if (tcp_is_reno(tp)) { tcp_remove_reno_sacks(sk, pkts_acked); + + /* If any of the cumulatively ACKed segments was + * retransmitted, non-SACK case cannot confirm that + * progress was due to original transmission due to + * lack of TCPCB_SACKED_ACKED bits even if some of + * the packets may have been never retransmitted. + */ + if (flag & FLAG_RETRANS_DATA_ACKED) + flag &= ~FLAG_ORIG_SACK_ACKED; } else { int delta; @@ -4348,6 +4365,23 @@ static bool tcp_try_coalesce(struct sock *sk, return true; } +static bool tcp_ooo_try_coalesce(struct sock *sk, + struct sk_buff *to, + struct sk_buff *from, + bool *fragstolen) +{ + bool res = tcp_try_coalesce(sk, to, from, fragstolen); + + /* In case tcp_drop() is called later, update to->gso_segs */ + if (res) { + u32 gso_segs = max_t(u16, 1, skb_shinfo(to)->gso_segs) + + max_t(u16, 1, skb_shinfo(from)->gso_segs); + + skb_shinfo(to)->gso_segs = min_t(u32, gso_segs, 0xFFFF); + } + return res; +} + static void tcp_drop(struct sock *sk, struct sk_buff *skb) { sk_drops_add(sk, skb); @@ -4471,8 +4505,8 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb) /* In the typical case, we are adding an skb to the end of the list. * Use of ooo_last_skb avoids the O(Log(N)) rbtree lookup. */ - if (tcp_try_coalesce(sk, tp->ooo_last_skb, - skb, &fragstolen)) { + if (tcp_ooo_try_coalesce(sk, tp->ooo_last_skb, + skb, &fragstolen)) { coalesce_done: tcp_grow_window(sk, skb); kfree_skb_partial(skb, fragstolen); @@ -4500,7 +4534,7 @@ coalesce_done: /* All the bits are present. Drop. */ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFOMERGE); - __kfree_skb(skb); + tcp_drop(sk, skb); skb = NULL; tcp_dsack_set(sk, seq, end_seq); goto add_sack; @@ -4519,11 +4553,11 @@ coalesce_done: TCP_SKB_CB(skb1)->end_seq); NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFOMERGE); - __kfree_skb(skb1); + tcp_drop(sk, skb1); goto merge_right; } - } else if (tcp_try_coalesce(sk, skb1, - skb, &fragstolen)) { + } else if (tcp_ooo_try_coalesce(sk, skb1, + skb, &fragstolen)) { goto coalesce_done; } p = &parent->rb_right; @@ -4892,6 +4926,7 @@ end: static void tcp_collapse_ofo_queue(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); + u32 range_truesize, sum_tiny = 0; struct sk_buff *skb, *head; u32 start, end; @@ -4903,6 +4938,7 @@ new_range: } start = TCP_SKB_CB(skb)->seq; end = TCP_SKB_CB(skb)->end_seq; + range_truesize = skb->truesize; for (head = skb;;) { skb = skb_rb_next(skb); @@ -4913,11 +4949,20 @@ new_range: if (!skb || after(TCP_SKB_CB(skb)->seq, end) || before(TCP_SKB_CB(skb)->end_seq, start)) { - tcp_collapse(sk, NULL, &tp->out_of_order_queue, - head, skb, start, end); + /* Do not attempt collapsing tiny skbs */ + if (range_truesize != head->truesize || + end - start >= SKB_WITH_OVERHEAD(SK_MEM_QUANTUM)) { + tcp_collapse(sk, NULL, &tp->out_of_order_queue, + head, skb, start, end); + } else { + sum_tiny += range_truesize; + if (sum_tiny > sk->sk_rcvbuf >> 3) + return; + } goto new_range; } + range_truesize += skb->truesize; if (unlikely(before(TCP_SKB_CB(skb)->seq, start))) start = TCP_SKB_CB(skb)->seq; if (after(TCP_SKB_CB(skb)->end_seq, end)) @@ -4932,6 +4977,7 @@ new_range: * 2) not add too big latencies if thousands of packets sit there. * (But if application shrinks SO_RCVBUF, we could still end up * freeing whole queue here) + * 3) Drop at least 12.5 % of sk_rcvbuf to avoid malicious attacks. * * Return true if queue has shrunk. */ @@ -4939,20 +4985,26 @@ static bool tcp_prune_ofo_queue(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); struct rb_node *node, *prev; + int goal; if (RB_EMPTY_ROOT(&tp->out_of_order_queue)) return false; NET_INC_STATS(sock_net(sk), LINUX_MIB_OFOPRUNED); + goal = sk->sk_rcvbuf >> 3; node = &tp->ooo_last_skb->rbnode; do { prev = rb_prev(node); rb_erase(node, &tp->out_of_order_queue); + goal -= rb_to_skb(node)->truesize; tcp_drop(sk, rb_to_skb(node)); - sk_mem_reclaim(sk); - if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf && - !tcp_under_memory_pressure(sk)) - break; + if (!prev || goal <= 0) { + sk_mem_reclaim(sk); + if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf && + !tcp_under_memory_pressure(sk)) + break; + goal = sk->sk_rcvbuf >> 3; + } node = prev; } while (node); tp->ooo_last_skb = rb_to_skb(prev); @@ -4987,6 +5039,9 @@ static int tcp_prune_queue(struct sock *sk) else if (tcp_under_memory_pressure(sk)) tp->rcv_ssthresh = min(tp->rcv_ssthresh, 4U * tp->advmss); + if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf) + return 0; + tcp_collapse_ofo_queue(sk); if (!skb_queue_empty(&sk->sk_receive_queue)) tcp_collapse(sk, &sk->sk_receive_queue, NULL, diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index bea17f1e8302..3b2711e33e4c 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -156,11 +156,24 @@ int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp) */ if (tcptw->tw_ts_recent_stamp && (!twp || (reuse && get_seconds() - tcptw->tw_ts_recent_stamp > 1))) { - tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2; - if (tp->write_seq == 0) - tp->write_seq = 1; - tp->rx_opt.ts_recent = tcptw->tw_ts_recent; - tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp; + /* In case of repair and re-using TIME-WAIT sockets we still + * want to be sure that it is safe as above but honor the + * sequence numbers and time stamps set as part of the repair + * process. + * + * Without this check re-using a TIME-WAIT socket with TCP + * repair would accumulate a -1 on the repair assigned + * sequence number. The first time it is reused the sequence + * is -1, the second time -2, etc. This fixes that issue + * without appearing to create any others. + */ + if (likely(!tp->repair)) { + tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2; + if (tp->write_seq == 0) + tp->write_seq = 1; + tp->rx_opt.ts_recent = tcptw->tw_ts_recent; + tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp; + } sock_hold(sktw); return 1; } diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 8e08b409c71e..c4172c1fb198 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -160,7 +160,8 @@ static void tcp_event_data_sent(struct tcp_sock *tp, } /* Account for an ACK we sent. */ -static inline void tcp_event_ack_sent(struct sock *sk, unsigned int pkts) +static inline void tcp_event_ack_sent(struct sock *sk, unsigned int pkts, + u32 rcv_nxt) { struct tcp_sock *tp = tcp_sk(sk); @@ -171,6 +172,9 @@ static inline void tcp_event_ack_sent(struct sock *sk, unsigned int pkts) if (hrtimer_try_to_cancel(&tp->compressed_ack_timer) == 1) __sock_put(sk); } + + if (unlikely(rcv_nxt != tp->rcv_nxt)) + return; /* Special ACK sent by DCTCP to reflect ECN */ tcp_dec_quickack_mode(sk, pkts); inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK); } @@ -1023,8 +1027,8 @@ static void tcp_update_skb_after_send(struct tcp_sock *tp, struct sk_buff *skb) * We are working here with either a clone of the original * SKB, or a fresh unique copy made by the retransmit engine. */ -static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, - gfp_t gfp_mask) +static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, + int clone_it, gfp_t gfp_mask, u32 rcv_nxt) { const struct inet_connection_sock *icsk = inet_csk(sk); struct inet_sock *inet; @@ -1100,7 +1104,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, th->source = inet->inet_sport; th->dest = inet->inet_dport; th->seq = htonl(tcb->seq); - th->ack_seq = htonl(tp->rcv_nxt); + th->ack_seq = htonl(rcv_nxt); *(((__be16 *)th) + 6) = htons(((tcp_header_size >> 2) << 12) | tcb->tcp_flags); @@ -1141,7 +1145,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, icsk->icsk_af_ops->send_check(sk, skb); if (likely(tcb->tcp_flags & TCPHDR_ACK)) - tcp_event_ack_sent(sk, tcp_skb_pcount(skb)); + tcp_event_ack_sent(sk, tcp_skb_pcount(skb), rcv_nxt); if (skb->len != tcp_header_size) { tcp_event_data_sent(tp, sk); @@ -1178,6 +1182,13 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, return err; } +static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, + gfp_t gfp_mask) +{ + return __tcp_transmit_skb(sk, skb, clone_it, gfp_mask, + tcp_sk(sk)->rcv_nxt); +} + /* This routine just queues the buffer for sending. * * NOTE: probe0 timer is not checked, do not forget tcp_push_pending_frames, @@ -3523,8 +3534,6 @@ void tcp_send_delayed_ack(struct sock *sk) int ato = icsk->icsk_ack.ato; unsigned long timeout; - tcp_ca_event(sk, CA_EVENT_DELAYED_ACK); - if (ato > TCP_DELACK_MIN) { const struct tcp_sock *tp = tcp_sk(sk); int max_ato = HZ / 2; @@ -3573,7 +3582,7 @@ void tcp_send_delayed_ack(struct sock *sk) } /* This routine sends an ack and also updates the window. */ -void tcp_send_ack(struct sock *sk) +void __tcp_send_ack(struct sock *sk, u32 rcv_nxt) { struct sk_buff *buff; @@ -3581,8 +3590,6 @@ void tcp_send_ack(struct sock *sk) if (sk->sk_state == TCP_CLOSE) return; - tcp_ca_event(sk, CA_EVENT_NON_DELAYED_ACK); - /* We are not putting this on the write queue, so * tcp_transmit_skb() will set the ownership to this * sock. @@ -3608,9 +3615,14 @@ void tcp_send_ack(struct sock *sk) skb_set_tcp_pure_ack(buff); /* Send it off, this clears delayed acks for us. */ - tcp_transmit_skb(sk, buff, 0, (__force gfp_t)0); + __tcp_transmit_skb(sk, buff, 0, (__force gfp_t)0, rcv_nxt); +} +EXPORT_SYMBOL_GPL(__tcp_send_ack); + +void tcp_send_ack(struct sock *sk) +{ + __tcp_send_ack(sk, tcp_sk(sk)->rcv_nxt); } -EXPORT_SYMBOL_GPL(tcp_send_ack); /* This routine sends a packet with an out of date sequence * number. It assumes the other end will try to ack it. diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c index 92dc9e5a7ff3..69c54540d5b4 100644 --- a/net/ipv4/udp_offload.c +++ b/net/ipv4/udp_offload.c @@ -394,7 +394,7 @@ unflush: out_unlock: rcu_read_unlock(); out: - NAPI_GRO_CB(skb)->flush |= flush; + skb_gro_flush_final(skb, pp, flush); return pp; } EXPORT_SYMBOL(udp_gro_receive); |