diff options
Diffstat (limited to 'net/ipv4')
64 files changed, 2025 insertions, 1336 deletions
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig index 91a2557942fa..f48fe6fc7e8c 100644 --- a/net/ipv4/Kconfig +++ b/net/ipv4/Kconfig @@ -70,11 +70,9 @@ config IP_MULTIPLE_TABLES address into account. Furthermore, the TOS (Type-Of-Service) field of the packet can be used for routing decisions as well. - If you are interested in this, please see the preliminary - documentation at <http://www.compendium.com.ar/policy-routing.txt> - and <ftp://post.tepkom.ru/pub/vol2/Linux/docs/advanced-routing.tex>. - You will need supporting software from - <ftp://ftp.tux.org/pub/net/ip-routing/>. + If you need more information, see the Linux Advanced + Routing and Traffic Control documentation at + <http://lartc.org/howto/lartc.rpdb.html> If unsure, say N. diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile index afcb435adfbe..c6c8ad1d4b6d 100644 --- a/net/ipv4/Makefile +++ b/net/ipv4/Makefile @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0 # # Makefile for the Linux TCP/IP (INET) layer. # diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index e31108e5ef79..ce4aa827be05 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -195,7 +195,7 @@ int inet_listen(struct socket *sock, int backlog) { struct sock *sk = sock->sk; unsigned char old_state; - int err; + int err, tcp_fastopen; lock_sock(sk); @@ -217,11 +217,12 @@ int inet_listen(struct socket *sock, int backlog) * because the socket was in TCP_LISTEN state previously but * was shutdown() rather than close(). 
*/ - if ((sysctl_tcp_fastopen & TFO_SERVER_WO_SOCKOPT1) && - (sysctl_tcp_fastopen & TFO_SERVER_ENABLE) && + tcp_fastopen = sock_net(sk)->ipv4.sysctl_tcp_fastopen; + if ((tcp_fastopen & TFO_SERVER_WO_SOCKOPT1) && + (tcp_fastopen & TFO_SERVER_ENABLE) && !inet_csk(sk)->icsk_accept_queue.fastopenq.max_qlen) { fastopen_queue_tune(sk, backlog); - tcp_fastopen_init_key_once(true); + tcp_fastopen_init_key_once(sock_net(sk)); } err = inet_csk_listen_start(sk, backlog); @@ -826,6 +827,7 @@ int inet_shutdown(struct socket *sock, int how) err = -ENOTCONN; /* Hack to wake up other listeners, who can poll for POLLHUP, even on eg. unconnected UDP sockets -- RR */ + /* fall through */ default: sk->sk_shutdown |= how; if (sk->sk_prot->shutdown) @@ -839,7 +841,7 @@ int inet_shutdown(struct socket *sock, int how) case TCP_LISTEN: if (!(how & RCV_SHUTDOWN)) break; - /* Fall through */ + /* fall through */ case TCP_SYN_SENT: err = sk->sk_prot->disconnect(sk, O_NONBLOCK); sock->state = err ? SS_DISCONNECTING : SS_UNCONNECTED; diff --git a/net/ipv4/ah4.c b/net/ipv4/ah4.c index 37db44f60718..4dd95cdd8070 100644 --- a/net/ipv4/ah4.c +++ b/net/ipv4/ah4.c @@ -240,7 +240,7 @@ static int ah_output(struct xfrm_state *x, struct sk_buff *skb) if (err == -EINPROGRESS) goto out; - if (err == -EBUSY) + if (err == -ENOSPC) err = NET_XMIT_DROP; goto out_free; } diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index 7c45b8896709..a8d7c5a9fb05 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -1180,6 +1180,7 @@ int arp_ioctl(struct net *net, unsigned int cmd, void __user *arg) case SIOCSARP: if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) return -EPERM; + /* fall through */ case SIOCGARP: err = copy_from_user(&r, arg, sizeof(struct arpreq)); if (err) diff --git a/net/ipv4/cipso_ipv4.c b/net/ipv4/cipso_ipv4.c index 2ae8f54cb321..82178cc69c96 100644 --- a/net/ipv4/cipso_ipv4.c +++ b/net/ipv4/cipso_ipv4.c @@ -1951,7 +1951,7 @@ int cipso_v4_req_setattr(struct request_sock *req, buf = NULL; req_inet = 
inet_rsk(req); - opt = xchg(&req_inet->opt, opt); + opt = xchg((__force struct ip_options_rcu **)&req_inet->ireq_opt, opt); if (opt) kfree_rcu(opt, rcu); @@ -1973,11 +1973,13 @@ req_setattr_failure: * values on failure. * */ -static int cipso_v4_delopt(struct ip_options_rcu **opt_ptr) +static int cipso_v4_delopt(struct ip_options_rcu __rcu **opt_ptr) { + struct ip_options_rcu *opt = rcu_dereference_protected(*opt_ptr, 1); int hdr_delta = 0; - struct ip_options_rcu *opt = *opt_ptr; + if (!opt || opt->opt.cipso == 0) + return 0; if (opt->opt.srr || opt->opt.rr || opt->opt.ts || opt->opt.router_alert) { u8 cipso_len; u8 cipso_off; @@ -2039,14 +2041,10 @@ static int cipso_v4_delopt(struct ip_options_rcu **opt_ptr) */ void cipso_v4_sock_delattr(struct sock *sk) { - int hdr_delta; - struct ip_options_rcu *opt; struct inet_sock *sk_inet; + int hdr_delta; sk_inet = inet_sk(sk); - opt = rcu_dereference_protected(sk_inet->inet_opt, 1); - if (!opt || opt->opt.cipso == 0) - return; hdr_delta = cipso_v4_delopt(&sk_inet->inet_opt); if (sk_inet->is_icsk && hdr_delta > 0) { @@ -2066,15 +2064,7 @@ void cipso_v4_sock_delattr(struct sock *sk) */ void cipso_v4_req_delattr(struct request_sock *req) { - struct ip_options_rcu *opt; - struct inet_request_sock *req_inet; - - req_inet = inet_rsk(req); - opt = req_inet->opt; - if (!opt || opt->opt.cipso == 0) - return; - - cipso_v4_delopt(&req_inet->opt); + cipso_v4_delopt(&inet_rsk(req)->ireq_opt); } /** diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index d7adc0616599..a4573bccd6da 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -137,22 +137,12 @@ static void inet_hash_remove(struct in_ifaddr *ifa) */ struct net_device *__ip_dev_find(struct net *net, __be32 addr, bool devref) { - u32 hash = inet_addr_hash(net, addr); struct net_device *result = NULL; struct in_ifaddr *ifa; rcu_read_lock(); - hlist_for_each_entry_rcu(ifa, &inet_addr_lst[hash], hash) { - if (ifa->ifa_local == addr) { - struct net_device *dev = 
ifa->ifa_dev->dev; - - if (!net_eq(dev_net(dev), net)) - continue; - result = dev; - break; - } - } - if (!result) { + ifa = inet_lookup_ifaddr_rcu(net, addr); + if (!ifa) { struct flowi4 fl4 = { .daddr = addr }; struct fib_result res = { 0 }; struct fib_table *local; @@ -165,6 +155,8 @@ struct net_device *__ip_dev_find(struct net *net, __be32 addr, bool devref) !fib_table_lookup(local, &fl4, &res, FIB_LOOKUP_NOREF) && res.type == RTN_LOCAL) result = FIB_RES_DEV(res); + } else { + result = ifa->ifa_dev->dev; } if (result && devref) dev_hold(result); @@ -173,6 +165,20 @@ struct net_device *__ip_dev_find(struct net *net, __be32 addr, bool devref) } EXPORT_SYMBOL(__ip_dev_find); +/* called under RCU lock */ +struct in_ifaddr *inet_lookup_ifaddr_rcu(struct net *net, __be32 addr) +{ + u32 hash = inet_addr_hash(net, addr); + struct in_ifaddr *ifa; + + hlist_for_each_entry_rcu(ifa, &inet_addr_lst[hash], hash) + if (ifa->ifa_local == addr && + net_eq(dev_net(ifa->ifa_dev->dev), net)) + return ifa; + + return NULL; +} + static void rtmsg_ifa(int event, struct in_ifaddr *, struct nlmsghdr *, u32); static BLOCKING_NOTIFIER_HEAD(inetaddr_chain); @@ -438,7 +444,7 @@ static void check_lifetime(struct work_struct *work); static DECLARE_DELAYED_WORK(check_lifetime_work, check_lifetime); static int __inet_insert_ifa(struct in_ifaddr *ifa, struct nlmsghdr *nlh, - u32 portid) + u32 portid, struct netlink_ext_ack *extack) { struct in_device *in_dev = ifa->ifa_dev; struct in_ifaddr *ifa1, **ifap, **last_primary; @@ -483,6 +489,7 @@ static int __inet_insert_ifa(struct in_ifaddr *ifa, struct nlmsghdr *nlh, */ ivi.ivi_addr = ifa->ifa_address; ivi.ivi_dev = ifa->ifa_dev; + ivi.extack = extack; ret = blocking_notifier_call_chain(&inetaddr_validator_chain, NETDEV_UP, &ivi); ret = notifier_to_errno(ret); @@ -515,7 +522,7 @@ static int __inet_insert_ifa(struct in_ifaddr *ifa, struct nlmsghdr *nlh, static int inet_insert_ifa(struct in_ifaddr *ifa) { - return __inet_insert_ifa(ifa, NULL, 0); + 
return __inet_insert_ifa(ifa, NULL, 0, NULL); } static int inet_set_ifa(struct net_device *dev, struct in_ifaddr *ifa) @@ -896,7 +903,8 @@ static int inet_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh, return ret; } } - return __inet_insert_ifa(ifa, nlh, NETLINK_CB(skb).portid); + return __inet_insert_ifa(ifa, nlh, NETLINK_CB(skb).portid, + extack); } else { inet_free_ifa(ifa); @@ -1516,6 +1524,7 @@ static int inetdev_event(struct notifier_block *this, unsigned long event, if (inetdev_valid_mtu(dev->mtu)) break; /* disable IP when MTU is not enough */ + /* fall through */ case NETDEV_UNREGISTER: inetdev_destroy(in_dev); break; @@ -1751,7 +1760,7 @@ static int inet_validate_link_af(const struct net_device *dev, struct nlattr *a, *tb[IFLA_INET_MAX+1]; int err, rem; - if (dev && !__in_dev_get_rtnl(dev)) + if (dev && !__in_dev_get_rcu(dev)) return -EAFNOSUPPORT; err = nla_parse_nested(tb, IFLA_INET_MAX, nla, inet_af_policy, NULL); @@ -1775,7 +1784,7 @@ static int inet_validate_link_af(const struct net_device *dev, static int inet_set_link_af(struct net_device *dev, const struct nlattr *nla) { - struct in_device *in_dev = __in_dev_get_rtnl(dev); + struct in_device *in_dev = __in_dev_get_rcu(dev); struct nlattr *a, *tb[IFLA_INET_MAX+1]; int rem; diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c index b00e4a43b4dc..d57aa64fa7c7 100644 --- a/net/ipv4/esp4.c +++ b/net/ipv4/esp4.c @@ -432,7 +432,7 @@ int esp_output_tail(struct xfrm_state *x, struct sk_buff *skb, struct esp_info * case -EINPROGRESS: goto error; - case -EBUSY: + case -ENOSPC: err = NET_XMIT_DROP; break; diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 37819ab4cc74..f52d27a422c3 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -73,6 +73,11 @@ fail: fib_free_table(main_table); return -ENOMEM; } + +static bool fib4_has_custom_rules(struct net *net) +{ + return false; +} #else struct fib_table *fib_new_table(struct net *net, u32 id) @@ -128,6 +133,11 @@ struct 
fib_table *fib_get_table(struct net *net, u32 id) } return NULL; } + +static bool fib4_has_custom_rules(struct net *net) +{ + return net->ipv4.fib_has_custom_rules; +} #endif /* CONFIG_IP_MULTIPLE_TABLES */ static void fib_replace_table(struct net *net, struct fib_table *old, @@ -345,9 +355,6 @@ static int __fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, if (res.type != RTN_UNICAST && (res.type != RTN_LOCAL || !IN_DEV_ACCEPT_LOCAL(idev))) goto e_inval; - if (!rpf && !fib_num_tclassid_users(net) && - (dev->ifindex != oif || !IN_DEV_TX_REDIRECTS(idev))) - goto last_resort; fib_combine_itag(itag, &res); dev_match = false; @@ -402,13 +409,28 @@ int fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, struct in_device *idev, u32 *itag) { int r = secpath_exists(skb) ? 0 : IN_DEV_RPFILTER(idev); + struct net *net = dev_net(dev); - if (!r && !fib_num_tclassid_users(dev_net(dev)) && - IN_DEV_ACCEPT_LOCAL(idev) && + if (!r && !fib_num_tclassid_users(net) && (dev->ifindex != oif || !IN_DEV_TX_REDIRECTS(idev))) { + if (IN_DEV_ACCEPT_LOCAL(idev)) + goto ok; + /* with custom local routes in place, checking local addresses + * only will be too optimistic, with custom rules, checking + * local addresses only can be too strict, e.g. 
due to vrf + */ + if (net->ipv4.fib_has_custom_local_routes || + fib4_has_custom_rules(net)) + goto full_check; + if (inet_lookup_ifaddr_rcu(net, src)) + return -EINVAL; + +ok: *itag = 0; return 0; } + +full_check: return __fib_validate_source(skb, src, dst, tos, oif, dev, r, idev, itag); } @@ -759,6 +781,8 @@ static int inet_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh, } err = fib_table_insert(net, tb, &cfg, extack); + if (!err && cfg.fc_type == RTN_LOCAL) + net->ipv4.fib_has_custom_local_routes = true; errout: return err; } diff --git a/net/ipv4/fib_lookup.h b/net/ipv4/fib_lookup.h index 5b2af19cfb5b..e6ff282bb7f4 100644 --- a/net/ipv4/fib_lookup.h +++ b/net/ipv4/fib_lookup.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _FIB_LOOKUP_H #define _FIB_LOOKUP_H diff --git a/net/ipv4/fib_notifier.c b/net/ipv4/fib_notifier.c index cfd420b0572c..b804ccbdb241 100644 --- a/net/ipv4/fib_notifier.c +++ b/net/ipv4/fib_notifier.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #include <linux/rtnetlink.h> #include <linux/notifier.h> #include <linux/socket.h> diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index 57a5d48acee8..f04d944f8abe 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -601,17 +601,9 @@ static void fib_rebalance(struct fib_info *fi) atomic_set(&nexthop_nh->nh_upper_bound, upper_bound); } endfor_nexthops(fi); } - -static inline void fib_add_weight(struct fib_info *fi, - const struct fib_nh *nh) -{ - fi->fib_weight += nh->nh_weight; -} - #else /* CONFIG_IP_ROUTE_MULTIPATH */ #define fib_rebalance(fi) do { } while (0) -#define fib_add_weight(fi, nh) do { } while (0) #endif /* CONFIG_IP_ROUTE_MULTIPATH */ @@ -718,7 +710,7 @@ bool fib_metrics_match(struct fib_config *cfg, struct fib_info *fi) bool ecn_ca = false; nla_strlcpy(tmp, nla, sizeof(tmp)); - val = tcp_ca_get_key_by_name(tmp, &ecn_ca); + val = tcp_ca_get_key_by_name(fi->fib_net, tmp, &ecn_ca); } else { val = nla_get_u32(nla); } @@ 
-774,8 +766,8 @@ bool fib_metrics_match(struct fib_config *cfg, struct fib_info *fi) * | * |-> {local prefix} (terminal node) */ -static int fib_check_nh(struct fib_config *cfg, struct fib_info *fi, - struct fib_nh *nh, struct netlink_ext_ack *extack) +static int fib_check_nh(struct fib_config *cfg, struct fib_nh *nh, + struct netlink_ext_ack *extack) { int err = 0; struct net *net; @@ -1038,7 +1030,7 @@ fib_convert_metrics(struct fib_info *fi, const struct fib_config *cfg) char tmp[TCP_CA_NAME_MAX]; nla_strlcpy(tmp, nla, sizeof(tmp)); - val = tcp_ca_get_key_by_name(tmp, &ecn_ca); + val = tcp_ca_get_key_by_name(fi->fib_net, tmp, &ecn_ca); if (val == TCP_CA_UNSPEC) return -EINVAL; } else { @@ -1258,7 +1250,7 @@ struct fib_info *fib_create_info(struct fib_config *cfg, int linkdown = 0; change_nexthops(fi) { - err = fib_check_nh(cfg, fi, nexthop_nh, extack); + err = fib_check_nh(cfg, nexthop_nh, extack); if (err != 0) goto failure; if (nexthop_nh->nh_flags & RTNH_F_LINKDOWN) @@ -1275,7 +1267,6 @@ struct fib_info *fib_create_info(struct fib_config *cfg, change_nexthops(fi) { fib_info_update_nh_saddr(net, nexthop_nh); - fib_add_weight(fi, nexthop_nh); } endfor_nexthops(fi) fib_rebalance(fi); @@ -1365,8 +1356,6 @@ int fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event, nla_put_in_addr(skb, RTA_PREFSRC, fi->fib_prefsrc)) goto nla_put_failure; if (fi->fib_nhs == 1) { - struct in_device *in_dev; - if (fi->fib_nh->nh_gw && nla_put_in_addr(skb, RTA_GATEWAY, fi->fib_nh->nh_gw)) goto nla_put_failure; @@ -1374,10 +1363,14 @@ int fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event, nla_put_u32(skb, RTA_OIF, fi->fib_nh->nh_oif)) goto nla_put_failure; if (fi->fib_nh->nh_flags & RTNH_F_LINKDOWN) { - in_dev = __in_dev_get_rtnl(fi->fib_nh->nh_dev); + struct in_device *in_dev; + + rcu_read_lock(); + in_dev = __in_dev_get_rcu(fi->fib_nh->nh_dev); if (in_dev && IN_DEV_IGNORE_ROUTES_WITH_LINKDOWN(in_dev)) rtm->rtm_flags |= RTNH_F_DEAD; + rcu_read_unlock(); } 
if (fi->fib_nh->nh_flags & RTNH_F_OFFLOAD) rtm->rtm_flags |= RTNH_F_OFFLOAD; @@ -1400,18 +1393,20 @@ int fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event, goto nla_put_failure; for_nexthops(fi) { - struct in_device *in_dev; - rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh)); if (!rtnh) goto nla_put_failure; rtnh->rtnh_flags = nh->nh_flags & 0xFF; if (nh->nh_flags & RTNH_F_LINKDOWN) { - in_dev = __in_dev_get_rtnl(nh->nh_dev); + struct in_device *in_dev; + + rcu_read_lock(); + in_dev = __in_dev_get_rcu(nh->nh_dev); if (in_dev && IN_DEV_IGNORE_ROUTES_WITH_LINKDOWN(in_dev)) rtnh->rtnh_flags |= RTNH_F_DEAD; + rcu_read_unlock(); } rtnh->rtnh_hops = nh->nh_weight - 1; rtnh->rtnh_ifindex = nh->nh_oif; diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index c636650a6a70..5ddc4aefff12 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -87,32 +87,32 @@ static int call_fib_entry_notifier(struct notifier_block *nb, struct net *net, enum fib_event_type event_type, u32 dst, - int dst_len, struct fib_info *fi, - u8 tos, u8 type, u32 tb_id) + int dst_len, struct fib_alias *fa) { struct fib_entry_notifier_info info = { .dst = dst, .dst_len = dst_len, - .fi = fi, - .tos = tos, - .type = type, - .tb_id = tb_id, + .fi = fa->fa_info, + .tos = fa->fa_tos, + .type = fa->fa_type, + .tb_id = fa->tb_id, }; return call_fib4_notifier(nb, net, event_type, &info.info); } static int call_fib_entry_notifiers(struct net *net, enum fib_event_type event_type, u32 dst, - int dst_len, struct fib_info *fi, - u8 tos, u8 type, u32 tb_id) + int dst_len, struct fib_alias *fa, + struct netlink_ext_ack *extack) { struct fib_entry_notifier_info info = { + .info.extack = extack, .dst = dst, .dst_len = dst_len, - .fi = fi, - .tos = tos, - .type = type, - .tb_id = tb_id, + .fi = fa->fa_info, + .tos = fa->fa_tos, + .type = fa->fa_type, + .tb_id = fa->tb_id, }; return call_fib4_notifiers(net, event_type, &info.info); } @@ -1216,9 +1216,7 @@ int fib_table_insert(struct net *net, struct 
fib_table *tb, new_fa->fa_default = -1; call_fib_entry_notifiers(net, FIB_EVENT_ENTRY_REPLACE, - key, plen, fi, - new_fa->fa_tos, cfg->fc_type, - tb->tb_id); + key, plen, new_fa, extack); rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, tb->tb_id, &cfg->fc_nlinfo, nlflags); @@ -1273,8 +1271,7 @@ int fib_table_insert(struct net *net, struct fib_table *tb, tb->tb_num_default++; rt_cache_flush(cfg->fc_nlinfo.nl_net); - call_fib_entry_notifiers(net, event, key, plen, fi, tos, cfg->fc_type, - tb->tb_id); + call_fib_entry_notifiers(net, event, key, plen, new_fa, extack); rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, new_fa->tb_id, &cfg->fc_nlinfo, nlflags); succeeded: @@ -1574,8 +1571,7 @@ int fib_table_delete(struct net *net, struct fib_table *tb, return -ESRCH; call_fib_entry_notifiers(net, FIB_EVENT_ENTRY_DEL, key, plen, - fa_to_delete->fa_info, tos, - fa_to_delete->fa_type, tb->tb_id); + fa_to_delete, extack); rtmsg_fib(RTM_DELROUTE, htonl(key), fa_to_delete, plen, tb->tb_id, &cfg->fc_nlinfo, 0); @@ -1892,9 +1888,8 @@ int fib_table_flush(struct net *net, struct fib_table *tb) call_fib_entry_notifiers(net, FIB_EVENT_ENTRY_DEL, n->key, - KEYLENGTH - fa->fa_slen, - fi, fa->fa_tos, fa->fa_type, - tb->tb_id); + KEYLENGTH - fa->fa_slen, fa, + NULL); hlist_del_rcu(&fa->fa_list); fib_release_info(fa->fa_info); alias_free_mem_rcu(fa); @@ -1932,8 +1927,7 @@ static void fib_leaf_notify(struct net *net, struct key_vector *l, continue; call_fib_entry_notifier(nb, net, FIB_EVENT_ENTRY_ADD, l->key, - KEYLENGTH - fa->fa_slen, fi, fa->fa_tos, - fa->fa_type, fa->tb_id); + KEYLENGTH - fa->fa_slen, fa); } } diff --git a/net/ipv4/gre_offload.c b/net/ipv4/gre_offload.c index 416bb304a281..1859c473b21a 100644 --- a/net/ipv4/gre_offload.c +++ b/net/ipv4/gre_offload.c @@ -86,7 +86,7 @@ static struct sk_buff *gre_gso_segment(struct sk_buff *skb, greh = (struct gre_base_hdr *)skb_transport_header(skb); pcsum = (__sum16 *)(greh + 1); - if (gso_partial) { + if (gso_partial && 
skb_is_gso(skb)) { unsigned int partial_adj; /* Adjust checksum to account for the fact that diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 681e33998e03..1617604c9284 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -782,7 +782,7 @@ static bool icmp_tag_validation(int proto) } /* - * Handle ICMP_DEST_UNREACH, ICMP_TIME_EXCEED, ICMP_QUENCH, and + * Handle ICMP_DEST_UNREACH, ICMP_TIME_EXCEEDED, ICMP_QUENCH, and * ICMP_PARAMETERPROB. */ @@ -810,7 +810,8 @@ static bool icmp_unreach(struct sk_buff *skb) if (iph->ihl < 5) /* Mangled header, drop. */ goto out_err; - if (icmph->type == ICMP_DEST_UNREACH) { + switch (icmph->type) { + case ICMP_DEST_UNREACH: switch (icmph->code & 15) { case ICMP_NET_UNREACH: case ICMP_HOST_UNREACH: @@ -846,8 +847,16 @@ static bool icmp_unreach(struct sk_buff *skb) } if (icmph->code > NR_ICMP_UNREACH) goto out; - } else if (icmph->type == ICMP_PARAMETERPROB) + break; + case ICMP_PARAMETERPROB: info = ntohl(icmph->un.gateway) >> 24; + break; + case ICMP_TIME_EXCEEDED: + __ICMP_INC_STATS(net, ICMP_MIB_INTIMEEXCDS); + if (icmph->code == ICMP_EXC_FRAGTIME) + goto out; + break; + } /* * Throw it at our lower layers @@ -959,8 +968,9 @@ static bool icmp_timestamp(struct sk_buff *skb) */ icmp_param.data.times[1] = inet_current_timestamp(); icmp_param.data.times[2] = icmp_param.data.times[1]; - if (skb_copy_bits(skb, 0, &icmp_param.data.times[0], 4)) - BUG(); + + BUG_ON(skb_copy_bits(skb, 0, &icmp_param.data.times[0], 4)); + icmp_param.data.icmph = *icmp_hdr(skb); icmp_param.data.icmph.type = ICMP_TIMESTAMPREPLY; icmp_param.data.icmph.code = 0; diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index b9c64b40a83a..4ca46dc08e63 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -39,11 +39,11 @@ EXPORT_SYMBOL(inet_csk_timer_bug_msg); * IPV6_ADDR_ANY only equals to IPV6_ADDR_ANY, * and 0.0.0.0 equals to 0.0.0.0 only */ -static int ipv6_rcv_saddr_equal(const struct in6_addr 
*sk1_rcv_saddr6, - const struct in6_addr *sk2_rcv_saddr6, - __be32 sk1_rcv_saddr, __be32 sk2_rcv_saddr, - bool sk1_ipv6only, bool sk2_ipv6only, - bool match_wildcard) +static bool ipv6_rcv_saddr_equal(const struct in6_addr *sk1_rcv_saddr6, + const struct in6_addr *sk2_rcv_saddr6, + __be32 sk1_rcv_saddr, __be32 sk2_rcv_saddr, + bool sk1_ipv6only, bool sk2_ipv6only, + bool match_wildcard) { int addr_type = ipv6_addr_type(sk1_rcv_saddr6); int addr_type2 = sk2_rcv_saddr6 ? ipv6_addr_type(sk2_rcv_saddr6) : IPV6_ADDR_MAPPED; @@ -52,29 +52,29 @@ static int ipv6_rcv_saddr_equal(const struct in6_addr *sk1_rcv_saddr6, if (addr_type == IPV6_ADDR_MAPPED && addr_type2 == IPV6_ADDR_MAPPED) { if (!sk2_ipv6only) { if (sk1_rcv_saddr == sk2_rcv_saddr) - return 1; + return true; if (!sk1_rcv_saddr || !sk2_rcv_saddr) return match_wildcard; } - return 0; + return false; } if (addr_type == IPV6_ADDR_ANY && addr_type2 == IPV6_ADDR_ANY) - return 1; + return true; if (addr_type2 == IPV6_ADDR_ANY && match_wildcard && !(sk2_ipv6only && addr_type == IPV6_ADDR_MAPPED)) - return 1; + return true; if (addr_type == IPV6_ADDR_ANY && match_wildcard && !(sk1_ipv6only && addr_type2 == IPV6_ADDR_MAPPED)) - return 1; + return true; if (sk2_rcv_saddr6 && ipv6_addr_equal(sk1_rcv_saddr6, sk2_rcv_saddr6)) - return 1; + return true; - return 0; + return false; } #endif @@ -82,20 +82,20 @@ static int ipv6_rcv_saddr_equal(const struct in6_addr *sk1_rcv_saddr6, * match_wildcard == false: addresses must be exactly the same, i.e. 
* 0.0.0.0 only equals to 0.0.0.0 */ -static int ipv4_rcv_saddr_equal(__be32 sk1_rcv_saddr, __be32 sk2_rcv_saddr, - bool sk2_ipv6only, bool match_wildcard) +static bool ipv4_rcv_saddr_equal(__be32 sk1_rcv_saddr, __be32 sk2_rcv_saddr, + bool sk2_ipv6only, bool match_wildcard) { if (!sk2_ipv6only) { if (sk1_rcv_saddr == sk2_rcv_saddr) - return 1; + return true; if (!sk1_rcv_saddr || !sk2_rcv_saddr) return match_wildcard; } - return 0; + return false; } -int inet_rcv_saddr_equal(const struct sock *sk, const struct sock *sk2, - bool match_wildcard) +bool inet_rcv_saddr_equal(const struct sock *sk, const struct sock *sk2, + bool match_wildcard) { #if IS_ENABLED(CONFIG_IPV6) if (sk->sk_family == AF_INET6) @@ -266,7 +266,7 @@ static inline int sk_reuseport_match(struct inet_bind_bucket *tb, #if IS_ENABLED(CONFIG_IPV6) if (tb->fast_sk_family == AF_INET6) return ipv6_rcv_saddr_equal(&tb->fast_v6_rcv_saddr, - &sk->sk_v6_rcv_saddr, + inet6_rcv_saddr(sk), tb->fast_rcv_saddr, sk->sk_rcv_saddr, tb->fast_ipv6_only, @@ -321,13 +321,14 @@ tb_found: goto fail_unlock; } success: - if (!hlist_empty(&tb->owners)) { + if (hlist_empty(&tb->owners)) { tb->fastreuse = reuse; if (sk->sk_reuseport) { tb->fastreuseport = FASTREUSEPORT_ANY; tb->fastuid = uid; tb->fast_rcv_saddr = sk->sk_rcv_saddr; tb->fast_ipv6_only = ipv6_only_sock(sk); + tb->fast_sk_family = sk->sk_family; #if IS_ENABLED(CONFIG_IPV6) tb->fast_v6_rcv_saddr = sk->sk_v6_rcv_saddr; #endif @@ -354,6 +355,7 @@ success: tb->fastuid = uid; tb->fast_rcv_saddr = sk->sk_rcv_saddr; tb->fast_ipv6_only = ipv6_only_sock(sk); + tb->fast_sk_family = sk->sk_family; #if IS_ENABLED(CONFIG_IPV6) tb->fast_v6_rcv_saddr = sk->sk_v6_rcv_saddr; #endif @@ -473,6 +475,7 @@ struct sock *inet_csk_accept(struct sock *sk, int flags, int *err, bool kern) } spin_unlock_bh(&queue->fastopenq.lock); } + mem_cgroup_sk_alloc(newsk); out: release_sock(sk); if (req) @@ -492,17 +495,15 @@ EXPORT_SYMBOL(inet_csk_accept); * to optimize. 
*/ void inet_csk_init_xmit_timers(struct sock *sk, - void (*retransmit_handler)(unsigned long), - void (*delack_handler)(unsigned long), - void (*keepalive_handler)(unsigned long)) + void (*retransmit_handler)(struct timer_list *t), + void (*delack_handler)(struct timer_list *t), + void (*keepalive_handler)(struct timer_list *t)) { struct inet_connection_sock *icsk = inet_csk(sk); - setup_timer(&icsk->icsk_retransmit_timer, retransmit_handler, - (unsigned long)sk); - setup_timer(&icsk->icsk_delack_timer, delack_handler, - (unsigned long)sk); - setup_timer(&sk->sk_timer, keepalive_handler, (unsigned long)sk); + timer_setup(&icsk->icsk_retransmit_timer, retransmit_handler, 0); + timer_setup(&icsk->icsk_delack_timer, delack_handler, 0); + timer_setup(&sk->sk_timer, keepalive_handler, 0); icsk->icsk_pending = icsk->icsk_ack.pending = 0; } EXPORT_SYMBOL(inet_csk_init_xmit_timers); @@ -537,9 +538,11 @@ struct dst_entry *inet_csk_route_req(const struct sock *sk, { const struct inet_request_sock *ireq = inet_rsk(req); struct net *net = read_pnet(&ireq->ireq_net); - struct ip_options_rcu *opt = ireq->opt; + struct ip_options_rcu *opt; struct rtable *rt; + opt = ireq_opt_deref(ireq); + flowi4_init_output(fl4, ireq->ir_iif, ireq->ir_mark, RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE, sk->sk_protocol, inet_sk_flowi_flags(sk), @@ -573,10 +576,9 @@ struct dst_entry *inet_csk_route_child_sock(const struct sock *sk, struct flowi4 *fl4; struct rtable *rt; + opt = rcu_dereference(ireq->ireq_opt); fl4 = &newinet->cork.fl.u.ip4; - rcu_read_lock(); - opt = rcu_dereference(newinet->inet_opt); flowi4_init_output(fl4, ireq->ir_iif, ireq->ir_mark, RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE, sk->sk_protocol, inet_sk_flowi_flags(sk), @@ -589,13 +591,11 @@ struct dst_entry *inet_csk_route_child_sock(const struct sock *sk, goto no_route; if (opt && opt->opt.is_strictroute && rt->rt_uses_gateway) goto route_err; - rcu_read_unlock(); return &rt->dst; route_err: ip_rt_put(rt); no_route: - rcu_read_unlock(); 
__IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES); return NULL; } @@ -674,9 +674,9 @@ void inet_csk_reqsk_queue_drop_and_put(struct sock *sk, struct request_sock *req } EXPORT_SYMBOL(inet_csk_reqsk_queue_drop_and_put); -static void reqsk_timer_handler(unsigned long data) +static void reqsk_timer_handler(struct timer_list *t) { - struct request_sock *req = (struct request_sock *)data; + struct request_sock *req = from_timer(req, t, rsk_timer); struct sock *sk_listener = req->rsk_listener; struct net *net = sock_net(sk_listener); struct inet_connection_sock *icsk = inet_csk(sk_listener); @@ -747,8 +747,7 @@ static void reqsk_queue_hash_req(struct request_sock *req, req->num_timeout = 0; req->sk = NULL; - setup_pinned_timer(&req->rsk_timer, reqsk_timer_handler, - (unsigned long)req); + timer_setup(&req->rsk_timer, reqsk_timer_handler, TIMER_PINNED); mod_timer(&req->rsk_timer, jiffies + timeout); inet_ehash_insert(req_to_sk(req), NULL); diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c index af74d0433453..26a3d0315728 100644 --- a/net/ipv4/inet_fragment.c +++ b/net/ipv4/inet_fragment.c @@ -147,7 +147,7 @@ inet_evict_bucket(struct inet_frags *f, struct inet_frag_bucket *hb) spin_unlock(&hb->chain_lock); hlist_for_each_entry_safe(fq, n, &expired, list_evictor) - f->frag_expire((unsigned long) fq); + f->frag_expire(&fq->timer); return evicted; } @@ -164,7 +164,7 @@ static void inet_frag_worker(struct work_struct *work) local_bh_disable(); - for (i = ACCESS_ONCE(f->next_bucket); budget; --budget) { + for (i = READ_ONCE(f->next_bucket); budget; --budget) { evicted += inet_evict_bucket(f, &f->hash[i]); i = (i + 1) & (INETFRAGS_HASHSZ - 1); if (evicted > INETFRAGS_EVICT_MAX) @@ -366,7 +366,7 @@ static struct inet_frag_queue *inet_frag_alloc(struct netns_frags *nf, f->constructor(q, arg); add_frag_mem_limit(nf, f->qsize); - setup_timer(&q->timer, f->frag_expire, (unsigned long)q); + timer_setup(&q->timer, f->frag_expire, 0); spin_lock_init(&q->lock); 
refcount_set(&q->refcnt, 1); diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index 597bb4cfe805..e7d15fb0d94d 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -456,10 +456,7 @@ static int inet_reuseport_add_sock(struct sock *sk, return reuseport_add_sock(sk, sk2); } - /* Initial allocation may have already happened via setsockopt */ - if (!rcu_access_pointer(sk->sk_reuseport_cb)) - return reuseport_alloc(sk); - return 0; + return reuseport_alloc(sk); } int __inet_hash(struct sock *sk, struct sock *osk) diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c index 5b039159e67a..c690cd0d9b3f 100644 --- a/net/ipv4/inet_timewait_sock.c +++ b/net/ipv4/inet_timewait_sock.c @@ -9,7 +9,6 @@ */ #include <linux/kernel.h> -#include <linux/kmemcheck.h> #include <linux/slab.h> #include <linux/module.h> #include <net/inet_hashtables.h> @@ -142,9 +141,9 @@ void __inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk, } EXPORT_SYMBOL_GPL(__inet_twsk_hashdance); -static void tw_timer_handler(unsigned long data) +static void tw_timer_handler(struct timer_list *t) { - struct inet_timewait_sock *tw = (struct inet_timewait_sock *)data; + struct inet_timewait_sock *tw = from_timer(tw, t, tw_timer); if (tw->tw_kill) __NET_INC_STATS(twsk_net(tw), LINUX_MIB_TIMEWAITKILLED); @@ -167,8 +166,6 @@ struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk, if (tw) { const struct inet_sock *inet = inet_sk(sk); - kmemcheck_annotate_bitfield(tw, flags); - tw->tw_dr = dr; /* Give us an identity. 
*/ tw->tw_daddr = inet->inet_daddr; @@ -188,8 +185,7 @@ struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk, tw->tw_prot = sk->sk_prot_creator; atomic64_set(&tw->tw_cookie, atomic64_read(&sk->sk_cookie)); twsk_net_set(tw, sock_net(sk)); - setup_pinned_timer(&tw->tw_timer, tw_timer_handler, - (unsigned long)tw); + timer_setup(&tw->tw_timer, tw_timer_handler, TIMER_PINNED); /* * Because we use RCU lookups, we should not set tw_refcnt * to a non null value before everything is setup for this diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c index e7eb590c86ce..914d56928578 100644 --- a/net/ipv4/inetpeer.c +++ b/net/ipv4/inetpeer.c @@ -128,9 +128,9 @@ static struct inet_peer *lookup(const struct inetpeer_addr *daddr, break; } if (cmp == -1) - pp = &(*pp)->rb_left; + pp = &next->rb_left; else - pp = &(*pp)->rb_right; + pp = &next->rb_right; } *parent_p = parent; *pp_p = pp; @@ -284,14 +284,17 @@ EXPORT_SYMBOL(inet_peer_xrlim_allow); void inetpeer_invalidate_tree(struct inet_peer_base *base) { - struct inet_peer *p, *n; + struct rb_node *p = rb_first(&base->rb_root); - rbtree_postorder_for_each_entry_safe(p, n, &base->rb_root, rb_node) { - inet_putpeer(p); + while (p) { + struct inet_peer *peer = rb_entry(p, struct inet_peer, rb_node); + + p = rb_next(p); + rb_erase(&peer->rb_node, &base->rb_root); + inet_putpeer(peer); cond_resched(); } - base->rb_root = RB_ROOT; base->total = 0; } EXPORT_SYMBOL(inetpeer_invalidate_tree); diff --git a/net/ipv4/ip_forward.c b/net/ipv4/ip_forward.c index 9f0a7b96646f..2dd21c3281a1 100644 --- a/net/ipv4/ip_forward.c +++ b/net/ipv4/ip_forward.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. 
INET is implemented using the BSD Socket diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index 46408c220d9d..bbf1b94942c0 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket @@ -190,12 +191,13 @@ static bool frag_expire_skip_icmp(u32 user) /* * Oops, a fragment queue timed out. Kill it and send an ICMP reply. */ -static void ip_expire(unsigned long arg) +static void ip_expire(struct timer_list *t) { + struct inet_frag_queue *frag = from_timer(frag, t, timer); struct ipq *qp; struct net *net; - qp = container_of((struct inet_frag_queue *) arg, struct ipq, q); + qp = container_of(frag, struct ipq, q); net = container_of(qp->q.net, struct net, ipv4.frags); rcu_read_lock(); diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 0162fb955b33..bb6239169b1a 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -259,7 +259,6 @@ static int erspan_rcv(struct sk_buff *skb, struct tnl_ptk_info *tpi, struct ip_tunnel *tunnel; struct erspanhdr *ershdr; const struct iphdr *iph; - __be32 session_id; __be32 index; int len; @@ -275,8 +274,7 @@ static int erspan_rcv(struct sk_buff *skb, struct tnl_ptk_info *tpi, /* The original GRE header does not have key field, * Use ERSPAN 10-bit session ID as key. 
*/ - session_id = cpu_to_be32(ntohs(ershdr->session_id)); - tpi->key = session_id; + tpi->key = cpu_to_be32(ntohs(ershdr->session_id) & ID_MASK); index = ershdr->md.index; tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, tpi->flags | TUNNEL_KEY, @@ -581,8 +579,8 @@ static void erspan_fb_xmit(struct sk_buff *skb, struct net_device *dev, if (gre_handle_offloads(skb, false)) goto err_free_rt; - if (skb->len > dev->mtu) { - pskb_trim(skb, dev->mtu); + if (skb->len > dev->mtu + dev->hard_header_len) { + pskb_trim(skb, dev->mtu + dev->hard_header_len); truncate = true; } @@ -733,8 +731,8 @@ static netdev_tx_t erspan_xmit(struct sk_buff *skb, if (skb_cow_head(skb, dev->needed_headroom)) goto free_skb; - if (skb->len > dev->mtu) { - pskb_trim(skb, dev->mtu); + if (skb->len > dev->mtu + dev->hard_header_len) { + pskb_trim(skb, dev->mtu + dev->hard_header_len); truncate = true; } @@ -775,20 +773,46 @@ free_skb: return NETDEV_TX_OK; } +static void ipgre_link_update(struct net_device *dev, bool set_mtu) +{ + struct ip_tunnel *tunnel = netdev_priv(dev); + int len; + + len = tunnel->tun_hlen; + tunnel->tun_hlen = gre_calc_hlen(tunnel->parms.o_flags); + len = tunnel->tun_hlen - len; + tunnel->hlen = tunnel->hlen + len; + + dev->needed_headroom = dev->needed_headroom + len; + if (set_mtu) + dev->mtu = max_t(int, dev->mtu - len, 68); + + if (!(tunnel->parms.o_flags & TUNNEL_SEQ)) { + if (!(tunnel->parms.o_flags & TUNNEL_CSUM) || + tunnel->encap.type == TUNNEL_ENCAP_NONE) { + dev->features |= NETIF_F_GSO_SOFTWARE; + dev->hw_features |= NETIF_F_GSO_SOFTWARE; + } + dev->features |= NETIF_F_LLTX; + } +} + static int ipgre_tunnel_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) { - int err; struct ip_tunnel_parm p; + int err; if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) return -EFAULT; + if (cmd == SIOCADDTUNNEL || cmd == SIOCCHGTUNNEL) { if (p.iph.version != 4 || p.iph.protocol != IPPROTO_GRE || - p.iph.ihl != 5 || (p.iph.frag_off&htons(~IP_DF)) || - 
((p.i_flags|p.o_flags)&(GRE_VERSION|GRE_ROUTING))) + p.iph.ihl != 5 || (p.iph.frag_off & htons(~IP_DF)) || + ((p.i_flags | p.o_flags) & (GRE_VERSION | GRE_ROUTING))) return -EINVAL; } + p.i_flags = gre_flags_to_tnl_flags(p.i_flags); p.o_flags = gre_flags_to_tnl_flags(p.o_flags); @@ -796,11 +820,22 @@ static int ipgre_tunnel_ioctl(struct net_device *dev, if (err) return err; + if (cmd == SIOCCHGTUNNEL) { + struct ip_tunnel *t = netdev_priv(dev); + + t->parms.i_flags = p.i_flags; + t->parms.o_flags = p.o_flags; + + if (strcmp(dev->rtnl_link_ops->kind, "erspan")) + ipgre_link_update(dev, true); + } + p.i_flags = gre_tnl_flags_to_gre_flags(p.i_flags); p.o_flags = gre_tnl_flags_to_gre_flags(p.o_flags); if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p))) return -EFAULT; + return 0; } @@ -1013,15 +1048,14 @@ static int __net_init ipgre_init_net(struct net *net) return ip_tunnel_init_net(net, ipgre_net_id, &ipgre_link_ops, NULL); } -static void __net_exit ipgre_exit_net(struct net *net) +static void __net_exit ipgre_exit_batch_net(struct list_head *list_net) { - struct ip_tunnel_net *itn = net_generic(net, ipgre_net_id); - ip_tunnel_delete_net(itn, &ipgre_link_ops); + ip_tunnel_delete_nets(list_net, ipgre_net_id, &ipgre_link_ops); } static struct pernet_operations ipgre_net_ops = { .init = ipgre_init_net, - .exit = ipgre_exit_net, + .exit_batch = ipgre_exit_batch_net, .id = &ipgre_net_id, .size = sizeof(struct ip_tunnel_net), }; @@ -1223,6 +1257,7 @@ static int gre_tap_init(struct net_device *dev) { __gre_tunnel_init(dev); dev->priv_flags |= IFF_LIVE_ADDR_CHANGE; + netif_keep_dst(dev); return ip_tunnel_init(dev); } @@ -1246,13 +1281,16 @@ static int erspan_tunnel_init(struct net_device *dev) tunnel->tun_hlen = 8; tunnel->parms.iph.protocol = IPPROTO_GRE; - t_hlen = tunnel->hlen + sizeof(struct iphdr) + sizeof(struct erspanhdr); + tunnel->hlen = tunnel->tun_hlen + tunnel->encap_hlen + + sizeof(struct erspanhdr); + t_hlen = tunnel->hlen + sizeof(struct iphdr); 
dev->needed_headroom = LL_MAX_HEADER + t_hlen + 4; dev->mtu = ETH_DATA_LEN - t_hlen - 4; dev->features |= GRE_FEATURES; dev->hw_features |= GRE_FEATURES; dev->priv_flags |= IFF_LIVE_ADDR_CHANGE; + netif_keep_dst(dev); return ip_tunnel_init(dev); } @@ -1306,9 +1344,9 @@ static int ipgre_changelink(struct net_device *dev, struct nlattr *tb[], struct netlink_ext_ack *extack) { struct ip_tunnel *t = netdev_priv(dev); - struct ip_tunnel_parm p; struct ip_tunnel_encap ipencap; __u32 fwmark = t->fwmark; + struct ip_tunnel_parm p; int err; if (ipgre_netlink_encap_parms(data, &ipencap)) { @@ -1321,7 +1359,18 @@ static int ipgre_changelink(struct net_device *dev, struct nlattr *tb[], err = ipgre_netlink_parms(dev, data, tb, &p, &fwmark); if (err < 0) return err; - return ip_tunnel_changelink(dev, tb, &p, fwmark); + + err = ip_tunnel_changelink(dev, tb, &p, fwmark); + if (err < 0) + return err; + + t->parms.i_flags = p.i_flags; + t->parms.o_flags = p.o_flags; + + if (strcmp(dev->rtnl_link_ops->kind, "erspan")) + ipgre_link_update(dev, !tb[IFLA_MTU]); + + return 0; } static size_t ipgre_get_size(const struct net_device *dev) @@ -1540,15 +1589,14 @@ static int __net_init ipgre_tap_init_net(struct net *net) return ip_tunnel_init_net(net, gre_tap_net_id, &ipgre_tap_ops, "gretap0"); } -static void __net_exit ipgre_tap_exit_net(struct net *net) +static void __net_exit ipgre_tap_exit_batch_net(struct list_head *list_net) { - struct ip_tunnel_net *itn = net_generic(net, gre_tap_net_id); - ip_tunnel_delete_net(itn, &ipgre_tap_ops); + ip_tunnel_delete_nets(list_net, gre_tap_net_id, &ipgre_tap_ops); } static struct pernet_operations ipgre_tap_net_ops = { .init = ipgre_tap_init_net, - .exit = ipgre_tap_exit_net, + .exit_batch = ipgre_tap_exit_batch_net, .id = &gre_tap_net_id, .size = sizeof(struct ip_tunnel_net), }; @@ -1559,16 +1607,14 @@ static int __net_init erspan_init_net(struct net *net) &erspan_link_ops, "erspan0"); } -static void __net_exit erspan_exit_net(struct net *net) 
+static void __net_exit erspan_exit_batch_net(struct list_head *net_list) { - struct ip_tunnel_net *itn = net_generic(net, erspan_net_id); - - ip_tunnel_delete_net(itn, &erspan_link_ops); + ip_tunnel_delete_nets(net_list, erspan_net_id, &erspan_link_ops); } static struct pernet_operations erspan_net_ops = { .init = erspan_init_net, - .exit = erspan_exit_net, + .exit_batch = erspan_exit_batch_net, .id = &erspan_net_id, .size = sizeof(struct ip_tunnel_net), }; diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index fa2dc8f692c6..57fc13c6ab2b 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -311,9 +311,10 @@ drop: static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb) { const struct iphdr *iph = ip_hdr(skb); - struct rtable *rt; + int (*edemux)(struct sk_buff *skb); struct net_device *dev = skb->dev; - void (*edemux)(struct sk_buff *skb); + struct rtable *rt; + int err; /* if ingress device is enslaved to an L3 master device pass the * skb to its handler for processing @@ -331,7 +332,9 @@ static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb) ipprot = rcu_dereference(inet_protos[protocol]); if (ipprot && (edemux = READ_ONCE(ipprot->early_demux))) { - edemux(skb); + err = edemux(skb); + if (unlikely(err)) + goto drop_error; /* must reload iph, skb->head might have changed */ iph = ip_hdr(skb); } @@ -342,13 +345,10 @@ static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb) * how the packet travels inside Linux networking. 
*/ if (!skb_valid_dst(skb)) { - int err = ip_route_input_noref(skb, iph->daddr, iph->saddr, - iph->tos, dev); - if (unlikely(err)) { - if (err == -EXDEV) - __NET_INC_STATS(net, LINUX_MIB_IPRPFILTER); - goto drop; - } + err = ip_route_input_noref(skb, iph->daddr, iph->saddr, + iph->tos, dev); + if (unlikely(err)) + goto drop_error; } #ifdef CONFIG_IP_ROUTE_CLASSID @@ -399,6 +399,11 @@ static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb) drop: kfree_skb(skb); return NET_RX_DROP; + +drop_error: + if (err == -EXDEV) + __NET_INC_STATS(net, LINUX_MIB_IPRPFILTER); + goto drop; } /* diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c index 525ae88d1e58..ed194d46c00e 100644 --- a/net/ipv4/ip_options.c +++ b/net/ipv4/ip_options.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index a599aa83fdad..60fb1eb7d7d8 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. 
INET is implemented using the BSD Socket diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c index e9805ad664ac..fe6fee728ce4 100644 --- a/net/ipv4/ip_tunnel.c +++ b/net/ipv4/ip_tunnel.c @@ -1061,16 +1061,22 @@ static void ip_tunnel_destroy(struct ip_tunnel_net *itn, struct list_head *head, } } -void ip_tunnel_delete_net(struct ip_tunnel_net *itn, struct rtnl_link_ops *ops) +void ip_tunnel_delete_nets(struct list_head *net_list, unsigned int id, + struct rtnl_link_ops *ops) { + struct ip_tunnel_net *itn; + struct net *net; LIST_HEAD(list); rtnl_lock(); - ip_tunnel_destroy(itn, &list, ops); + list_for_each_entry(net, net_list, exit_list) { + itn = net_generic(net, id); + ip_tunnel_destroy(itn, &list, ops); + } unregister_netdevice_many(&list); rtnl_unlock(); } -EXPORT_SYMBOL_GPL(ip_tunnel_delete_net); +EXPORT_SYMBOL_GPL(ip_tunnel_delete_nets); int ip_tunnel_newlink(struct net_device *dev, struct nlattr *tb[], struct ip_tunnel_parm *p, __u32 fwmark) diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c index 5ed63d250950..949f432a5f04 100644 --- a/net/ipv4/ip_vti.c +++ b/net/ipv4/ip_vti.c @@ -168,6 +168,7 @@ static netdev_tx_t vti_xmit(struct sk_buff *skb, struct net_device *dev, struct ip_tunnel_parm *parms = &tunnel->parms; struct dst_entry *dst = skb_dst(skb); struct net_device *tdev; /* Device to other host */ + int pkt_len = skb->len; int err; int mtu; @@ -197,15 +198,6 @@ static netdev_tx_t vti_xmit(struct sk_buff *skb, struct net_device *dev, goto tx_error; } - if (tunnel->err_count > 0) { - if (time_before(jiffies, - tunnel->err_time + IPTUNNEL_ERR_TIMEO)) { - tunnel->err_count--; - dst_link_failure(skb); - } else - tunnel->err_count = 0; - } - mtu = dst_mtu(dst); if (skb->len > mtu) { skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu); @@ -229,7 +221,7 @@ static netdev_tx_t vti_xmit(struct sk_buff *skb, struct net_device *dev, err = dst_output(tunnel->net, skb->sk, skb); if (net_xmit_eval(err) == 0) - err = skb->len; + err = pkt_len; 
iptunnel_xmit_stats(dev, err); return NETDEV_TX_OK; @@ -452,15 +444,14 @@ static int __net_init vti_init_net(struct net *net) return 0; } -static void __net_exit vti_exit_net(struct net *net) +static void __net_exit vti_exit_batch_net(struct list_head *list_net) { - struct ip_tunnel_net *itn = net_generic(net, vti_net_id); - ip_tunnel_delete_net(itn, &vti_link_ops); + ip_tunnel_delete_nets(list_net, vti_net_id, &vti_link_ops); } static struct pernet_operations vti_net_ops = { .init = vti_init_net, - .exit = vti_exit_net, + .exit_batch = vti_exit_batch_net, .id = &vti_net_id, .size = sizeof(struct ip_tunnel_net), }; diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c index 4c5dfe6bd34d..abdebca848c9 100644 --- a/net/ipv4/ipconfig.c +++ b/net/ipv4/ipconfig.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Automatic Configuration of IP -- use DHCP, BOOTP, RARP, or * user-supplied information to configure own IP address and routes. diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c index fb1ad22b5e29..c891235b4966 100644 --- a/net/ipv4/ipip.c +++ b/net/ipv4/ipip.c @@ -128,43 +128,68 @@ static struct rtnl_link_ops ipip_link_ops __read_mostly; static int ipip_err(struct sk_buff *skb, u32 info) { - -/* All the routers (except for Linux) return only - 8 bytes of packet payload. It means, that precise relaying of - ICMP in the real Internet is absolutely infeasible. - */ + /* All the routers (except for Linux) return only + * 8 bytes of packet payload. It means, that precise relaying of + * ICMP in the real Internet is absolutely infeasible. + */ struct net *net = dev_net(skb->dev); struct ip_tunnel_net *itn = net_generic(net, ipip_net_id); const struct iphdr *iph = (const struct iphdr *)skb->data; - struct ip_tunnel *t; - int err; const int type = icmp_hdr(skb)->type; const int code = icmp_hdr(skb)->code; + struct ip_tunnel *t; + int err = 0; + + switch (type) { + case ICMP_DEST_UNREACH: + switch (code) { + case ICMP_SR_FAILED: + /* Impossible event. 
*/ + goto out; + default: + /* All others are translated to HOST_UNREACH. + * rfc2003 contains "deep thoughts" about NET_UNREACH, + * I believe they are just ether pollution. --ANK + */ + break; + } + break; + + case ICMP_TIME_EXCEEDED: + if (code != ICMP_EXC_TTL) + goto out; + break; + + case ICMP_REDIRECT: + break; + + default: + goto out; + } - err = -ENOENT; t = ip_tunnel_lookup(itn, skb->dev->ifindex, TUNNEL_NO_KEY, iph->daddr, iph->saddr, 0); - if (!t) + if (!t) { + err = -ENOENT; goto out; + } if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) { - ipv4_update_pmtu(skb, dev_net(skb->dev), info, - t->parms.link, 0, iph->protocol, 0); - err = 0; + ipv4_update_pmtu(skb, net, info, t->parms.link, 0, + iph->protocol, 0); goto out; } if (type == ICMP_REDIRECT) { - ipv4_redirect(skb, dev_net(skb->dev), t->parms.link, 0, - iph->protocol, 0); - err = 0; + ipv4_redirect(skb, net, t->parms.link, 0, iph->protocol, 0); goto out; } - if (t->parms.iph.daddr == 0) + if (t->parms.iph.daddr == 0) { + err = -ENOENT; goto out; + } - err = 0; if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED) goto out; @@ -634,15 +659,14 @@ static int __net_init ipip_init_net(struct net *net) return ip_tunnel_init_net(net, ipip_net_id, &ipip_link_ops, "tunl0"); } -static void __net_exit ipip_exit_net(struct net *net) +static void __net_exit ipip_exit_batch_net(struct list_head *list_net) { - struct ip_tunnel_net *itn = net_generic(net, ipip_net_id); - ip_tunnel_delete_net(itn, &ipip_link_ops); + ip_tunnel_delete_nets(list_net, ipip_net_id, &ipip_link_ops); } static struct pernet_operations ipip_net_ops = { .init = ipip_init_net, - .exit = ipip_exit_net, + .exit_batch = ipip_exit_batch_net, .id = &ipip_net_id, .size = sizeof(struct ip_tunnel_net), }; diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index c9b3e6e069ae..40a43ad294cb 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -67,6 +67,7 @@ #include <net/fib_rules.h> #include <linux/netconf.h> #include <net/nexthop.h> +#include 
<net/switchdev.h> struct ipmr_rule { struct fib_rule common; @@ -264,6 +265,22 @@ static void __net_exit ipmr_rules_exit(struct net *net) fib_rules_unregister(net->ipv4.mr_rules_ops); rtnl_unlock(); } + +static int ipmr_rules_dump(struct net *net, struct notifier_block *nb) +{ + return fib_rules_dump(net, nb, RTNL_FAMILY_IPMR); +} + +static unsigned int ipmr_rules_seq_read(struct net *net) +{ + return fib_rules_seq_read(net, RTNL_FAMILY_IPMR); +} + +bool ipmr_rule_default(const struct fib_rule *rule) +{ + return fib_rule_matchall(rule) && rule->table == RT_TABLE_DEFAULT; +} +EXPORT_SYMBOL(ipmr_rule_default); #else #define ipmr_for_each_table(mrt, net) \ for (mrt = net->ipv4.mrt; mrt; mrt = NULL) @@ -298,6 +315,22 @@ static void __net_exit ipmr_rules_exit(struct net *net) net->ipv4.mrt = NULL; rtnl_unlock(); } + +static int ipmr_rules_dump(struct net *net, struct notifier_block *nb) +{ + return 0; +} + +static unsigned int ipmr_rules_seq_read(struct net *net) +{ + return 0; +} + +bool ipmr_rule_default(const struct fib_rule *rule) +{ + return true; +} +EXPORT_SYMBOL(ipmr_rule_default); #endif static inline int ipmr_hash_cmp(struct rhashtable_compare_arg *arg, @@ -587,6 +620,82 @@ static struct net_device *ipmr_reg_vif(struct net *net, struct mr_table *mrt) } #endif +static int call_ipmr_vif_entry_notifier(struct notifier_block *nb, + struct net *net, + enum fib_event_type event_type, + struct vif_device *vif, + vifi_t vif_index, u32 tb_id) +{ + struct vif_entry_notifier_info info = { + .info = { + .family = RTNL_FAMILY_IPMR, + .net = net, + }, + .dev = vif->dev, + .vif_index = vif_index, + .vif_flags = vif->flags, + .tb_id = tb_id, + }; + + return call_fib_notifier(nb, net, event_type, &info.info); +} + +static int call_ipmr_vif_entry_notifiers(struct net *net, + enum fib_event_type event_type, + struct vif_device *vif, + vifi_t vif_index, u32 tb_id) +{ + struct vif_entry_notifier_info info = { + .info = { + .family = RTNL_FAMILY_IPMR, + .net = net, + }, + .dev = 
vif->dev, + .vif_index = vif_index, + .vif_flags = vif->flags, + .tb_id = tb_id, + }; + + ASSERT_RTNL(); + net->ipv4.ipmr_seq++; + return call_fib_notifiers(net, event_type, &info.info); +} + +static int call_ipmr_mfc_entry_notifier(struct notifier_block *nb, + struct net *net, + enum fib_event_type event_type, + struct mfc_cache *mfc, u32 tb_id) +{ + struct mfc_entry_notifier_info info = { + .info = { + .family = RTNL_FAMILY_IPMR, + .net = net, + }, + .mfc = mfc, + .tb_id = tb_id + }; + + return call_fib_notifier(nb, net, event_type, &info.info); +} + +static int call_ipmr_mfc_entry_notifiers(struct net *net, + enum fib_event_type event_type, + struct mfc_cache *mfc, u32 tb_id) +{ + struct mfc_entry_notifier_info info = { + .info = { + .family = RTNL_FAMILY_IPMR, + .net = net, + }, + .mfc = mfc, + .tb_id = tb_id + }; + + ASSERT_RTNL(); + net->ipv4.ipmr_seq++; + return call_fib_notifiers(net, event_type, &info.info); +} + /** * vif_delete - Delete a VIF entry * @notify: Set to 1, if the caller is a notifier_call @@ -594,6 +703,7 @@ static struct net_device *ipmr_reg_vif(struct net *net, struct mr_table *mrt) static int vif_delete(struct mr_table *mrt, int vifi, int notify, struct list_head *head) { + struct net *net = read_pnet(&mrt->net); struct vif_device *v; struct net_device *dev; struct in_device *in_dev; @@ -603,6 +713,10 @@ static int vif_delete(struct mr_table *mrt, int vifi, int notify, v = &mrt->vif_table[vifi]; + if (VIF_EXISTS(mrt, vifi)) + call_ipmr_vif_entry_notifiers(net, FIB_EVENT_VIF_DEL, v, vifi, + mrt->id); + write_lock_bh(&mrt_lock); dev = v->dev; v->dev = NULL; @@ -652,10 +766,11 @@ static void ipmr_cache_free_rcu(struct rcu_head *head) kmem_cache_free(mrt_cachep, c); } -static inline void ipmr_cache_free(struct mfc_cache *c) +void ipmr_cache_free(struct mfc_cache *c) { call_rcu(&c->rcu, ipmr_cache_free_rcu); } +EXPORT_SYMBOL(ipmr_cache_free); /* Destroy an unresolved cache entry, killing queued skbs * and reporting error to netlink readers. 
@@ -754,6 +869,9 @@ static int vif_add(struct net *net, struct mr_table *mrt, struct vifctl *vifc, int mrtsock) { int vifi = vifc->vifc_vifi; + struct switchdev_attr attr = { + .id = SWITCHDEV_ATTR_ID_PORT_PARENT_ID, + }; struct vif_device *v = &mrt->vif_table[vifi]; struct net_device *dev; struct in_device *in_dev; @@ -828,6 +946,13 @@ static int vif_add(struct net *net, struct mr_table *mrt, /* Fill in the VIF structures */ + attr.orig_dev = dev; + if (!switchdev_port_attr_get(dev, &attr)) { + memcpy(v->dev_parent_id.id, attr.u.ppid.id, attr.u.ppid.id_len); + v->dev_parent_id.id_len = attr.u.ppid.id_len; + } else { + v->dev_parent_id.id_len = 0; + } v->rate_limit = vifc->vifc_rate_limit; v->local = vifc->vifc_lcl_addr.s_addr; v->remote = vifc->vifc_rmt_addr.s_addr; @@ -851,6 +976,7 @@ static int vif_add(struct net *net, struct mr_table *mrt, if (vifi+1 > mrt->maxvif) mrt->maxvif = vifi+1; write_unlock_bh(&mrt_lock); + call_ipmr_vif_entry_notifiers(net, FIB_EVENT_VIF_ADD, v, vifi, mrt->id); return 0; } @@ -949,6 +1075,7 @@ static struct mfc_cache *ipmr_cache_alloc(void) if (c) { c->mfc_un.res.last_assert = jiffies - MFC_ASSERT_THRESH - 1; c->mfc_un.res.minvif = MAXVIFS; + refcount_set(&c->mfc_un.res.refcount, 1); } return c; } @@ -1150,6 +1277,7 @@ static int ipmr_cache_unresolved(struct mr_table *mrt, vifi_t vifi, static int ipmr_mfc_delete(struct mr_table *mrt, struct mfcctl *mfc, int parent) { + struct net *net = read_pnet(&mrt->net); struct mfc_cache *c; /* The entries are added/deleted only under RTNL */ @@ -1161,8 +1289,9 @@ static int ipmr_mfc_delete(struct mr_table *mrt, struct mfcctl *mfc, int parent) return -ENOENT; rhltable_remove(&mrt->mfc_hash, &c->mnode, ipmr_rht_params); list_del_rcu(&c->list); + call_ipmr_mfc_entry_notifiers(net, FIB_EVENT_ENTRY_DEL, c, mrt->id); mroute_netlink_event(mrt, c, RTM_DELROUTE); - ipmr_cache_free(c); + ipmr_cache_put(c); return 0; } @@ -1189,6 +1318,8 @@ static int ipmr_mfc_add(struct net *net, struct mr_table *mrt, if 
(!mrtsock) c->mfc_flags |= MFC_STATIC; write_unlock_bh(&mrt_lock); + call_ipmr_mfc_entry_notifiers(net, FIB_EVENT_ENTRY_REPLACE, c, + mrt->id); mroute_netlink_event(mrt, c, RTM_NEWROUTE); return 0; } @@ -1238,6 +1369,7 @@ static int ipmr_mfc_add(struct net *net, struct mr_table *mrt, ipmr_cache_resolve(net, mrt, uc, c); ipmr_cache_free(uc); } + call_ipmr_mfc_entry_notifiers(net, FIB_EVENT_ENTRY_ADD, c, mrt->id); mroute_netlink_event(mrt, c, RTM_NEWROUTE); return 0; } @@ -1245,6 +1377,7 @@ static int ipmr_mfc_add(struct net *net, struct mr_table *mrt, /* Close the multicast socket, and clear the vif tables etc */ static void mroute_clean_tables(struct mr_table *mrt, bool all) { + struct net *net = read_pnet(&mrt->net); struct mfc_cache *c, *tmp; LIST_HEAD(list); int i; @@ -1263,8 +1396,10 @@ static void mroute_clean_tables(struct mr_table *mrt, bool all) continue; rhltable_remove(&mrt->mfc_hash, &c->mnode, ipmr_rht_params); list_del_rcu(&c->list); + call_ipmr_mfc_entry_notifiers(net, FIB_EVENT_ENTRY_DEL, c, + mrt->id); mroute_netlink_event(mrt, c, RTM_DELROUTE); - ipmr_cache_free(c); + ipmr_cache_put(c); } if (atomic_read(&mrt->cache_resolve_queue_len) != 0) { @@ -1393,6 +1528,7 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, case MRT_ADD_MFC: case MRT_DEL_MFC: parent = -1; + /* fall through */ case MRT_ADD_MFC_PROXY: case MRT_DEL_MFC_PROXY: if (optlen != sizeof(mfc)) { @@ -1724,10 +1860,33 @@ static inline int ipmr_forward_finish(struct net *net, struct sock *sk, return dst_output(net, sk, skb); } +#ifdef CONFIG_NET_SWITCHDEV +static bool ipmr_forward_offloaded(struct sk_buff *skb, struct mr_table *mrt, + int in_vifi, int out_vifi) +{ + struct vif_device *out_vif = &mrt->vif_table[out_vifi]; + struct vif_device *in_vif = &mrt->vif_table[in_vifi]; + + if (!skb->offload_mr_fwd_mark) + return false; + if (!out_vif->dev_parent_id.id_len || !in_vif->dev_parent_id.id_len) + return false; + return 
netdev_phys_item_id_same(&out_vif->dev_parent_id, + &in_vif->dev_parent_id); +} +#else +static bool ipmr_forward_offloaded(struct sk_buff *skb, struct mr_table *mrt, + int in_vifi, int out_vifi) +{ + return false; +} +#endif + /* Processing handlers for ipmr_forward */ static void ipmr_queue_xmit(struct net *net, struct mr_table *mrt, - struct sk_buff *skb, struct mfc_cache *c, int vifi) + int in_vifi, struct sk_buff *skb, + struct mfc_cache *c, int vifi) { const struct iphdr *iph = ip_hdr(skb); struct vif_device *vif = &mrt->vif_table[vifi]; @@ -1748,6 +1907,9 @@ static void ipmr_queue_xmit(struct net *net, struct mr_table *mrt, goto out_free; } + if (ipmr_forward_offloaded(skb, mrt, in_vifi, vifi)) + goto out_free; + if (vif->flags & VIFF_TUNNEL) { rt = ip_route_output_ports(net, &fl4, NULL, vif->remote, vif->local, @@ -1925,8 +2087,8 @@ forward: struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC); if (skb2) - ipmr_queue_xmit(net, mrt, skb2, cache, - psend); + ipmr_queue_xmit(net, mrt, true_vifi, + skb2, cache, psend); } psend = ct; } @@ -1937,9 +2099,10 @@ last_forward: struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC); if (skb2) - ipmr_queue_xmit(net, mrt, skb2, cache, psend); + ipmr_queue_xmit(net, mrt, true_vifi, skb2, + cache, psend); } else { - ipmr_queue_xmit(net, mrt, skb, cache, psend); + ipmr_queue_xmit(net, mrt, true_vifi, skb, cache, psend); return; } } @@ -2156,6 +2319,9 @@ static int __ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb, nla_put_u32(skb, RTA_IIF, mrt->vif_table[c->mfc_parent].dev->ifindex) < 0) return -EMSGSIZE; + if (c->mfc_flags & MFC_OFFLOAD) + rtm->rtm_flags |= RTNH_F_OFFLOAD; + if (!(mp_attr = nla_nest_start(skb, RTA_MULTIPATH))) return -EMSGSIZE; @@ -3048,14 +3214,87 @@ static const struct net_protocol pim_protocol = { }; #endif +static unsigned int ipmr_seq_read(struct net *net) +{ + ASSERT_RTNL(); + + return net->ipv4.ipmr_seq + ipmr_rules_seq_read(net); +} + +static int ipmr_dump(struct net *net, struct notifier_block 
*nb) +{ + struct mr_table *mrt; + int err; + + err = ipmr_rules_dump(net, nb); + if (err) + return err; + + ipmr_for_each_table(mrt, net) { + struct vif_device *v = &mrt->vif_table[0]; + struct mfc_cache *mfc; + int vifi; + + /* Notifiy on table VIF entries */ + read_lock(&mrt_lock); + for (vifi = 0; vifi < mrt->maxvif; vifi++, v++) { + if (!v->dev) + continue; + + call_ipmr_vif_entry_notifier(nb, net, FIB_EVENT_VIF_ADD, + v, vifi, mrt->id); + } + read_unlock(&mrt_lock); + + /* Notify on table MFC entries */ + list_for_each_entry_rcu(mfc, &mrt->mfc_cache_list, list) + call_ipmr_mfc_entry_notifier(nb, net, + FIB_EVENT_ENTRY_ADD, mfc, + mrt->id); + } + + return 0; +} + +static const struct fib_notifier_ops ipmr_notifier_ops_template = { + .family = RTNL_FAMILY_IPMR, + .fib_seq_read = ipmr_seq_read, + .fib_dump = ipmr_dump, + .owner = THIS_MODULE, +}; + +static int __net_init ipmr_notifier_init(struct net *net) +{ + struct fib_notifier_ops *ops; + + net->ipv4.ipmr_seq = 0; + + ops = fib_notifier_ops_register(&ipmr_notifier_ops_template, net); + if (IS_ERR(ops)) + return PTR_ERR(ops); + net->ipv4.ipmr_notifier_ops = ops; + + return 0; +} + +static void __net_exit ipmr_notifier_exit(struct net *net) +{ + fib_notifier_ops_unregister(net->ipv4.ipmr_notifier_ops); + net->ipv4.ipmr_notifier_ops = NULL; +} + /* Setup for IP multicast routing */ static int __net_init ipmr_net_init(struct net *net) { int err; + err = ipmr_notifier_init(net); + if (err) + goto ipmr_notifier_fail; + err = ipmr_rules_init(net); if (err < 0) - goto fail; + goto ipmr_rules_fail; #ifdef CONFIG_PROC_FS err = -ENOMEM; @@ -3072,7 +3311,9 @@ proc_cache_fail: proc_vif_fail: ipmr_rules_exit(net); #endif -fail: +ipmr_rules_fail: + ipmr_notifier_exit(net); +ipmr_notifier_fail: return err; } @@ -3082,6 +3323,7 @@ static void __net_exit ipmr_net_exit(struct net *net) remove_proc_entry("ip_mr_cache", net->proc_net); remove_proc_entry("ip_mr_vif", net->proc_net); #endif + ipmr_notifier_exit(net); 
ipmr_rules_exit(net); } diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile index f462fee66ac8..adcdae358365 100644 --- a/net/ipv4/netfilter/Makefile +++ b/net/ipv4/netfilter/Makefile @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0 # # Makefile for the netfilter modules on top of IPv4. # diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c index 9e2770fd00be..f88221aebc9d 100644 --- a/net/ipv4/netfilter/arp_tables.c +++ b/net/ipv4/netfilter/arp_tables.c @@ -634,6 +634,25 @@ static void get_counters(const struct xt_table_info *t, } } +static void get_old_counters(const struct xt_table_info *t, + struct xt_counters counters[]) +{ + struct arpt_entry *iter; + unsigned int cpu, i; + + for_each_possible_cpu(cpu) { + i = 0; + xt_entry_foreach(iter, t->entries, t->size) { + struct xt_counters *tmp; + + tmp = xt_get_per_cpu_counter(&iter->counters, cpu); + ADD_COUNTER(counters[i], tmp->bcnt, tmp->pcnt); + ++i; + } + cond_resched(); + } +} + static struct xt_counters *alloc_counters(const struct xt_table *table) { unsigned int countersize; @@ -910,8 +929,7 @@ static int __do_replace(struct net *net, const char *name, (newinfo->number <= oldinfo->initial_entries)) module_put(t->me); - /* Get the old counters, and synchronize with replace */ - get_counters(oldinfo, counters); + get_old_counters(oldinfo, counters); /* Decrease module usage counts and free resource */ loc_cpu_old_entry = oldinfo->entries; diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index 39286e543ee6..4cbe5e80f3bf 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -781,6 +781,26 @@ get_counters(const struct xt_table_info *t, } } +static void get_old_counters(const struct xt_table_info *t, + struct xt_counters counters[]) +{ + struct ipt_entry *iter; + unsigned int cpu, i; + + for_each_possible_cpu(cpu) { + i = 0; + xt_entry_foreach(iter, t->entries, t->size) { + const struct xt_counters *tmp; + + 
tmp = xt_get_per_cpu_counter(&iter->counters, cpu); + ADD_COUNTER(counters[i], tmp->bcnt, tmp->pcnt); + ++i; /* macro does multi eval of i */ + } + + cond_resched(); + } +} + static struct xt_counters *alloc_counters(const struct xt_table *table) { unsigned int countersize; @@ -1070,8 +1090,7 @@ __do_replace(struct net *net, const char *name, unsigned int valid_hooks, (newinfo->number <= oldinfo->initial_entries)) module_put(t->me); - /* Get the old counters, and synchronize with replace */ - get_counters(oldinfo, counters); + get_old_counters(oldinfo, counters); /* Decrease module usage counts and free resource */ xt_entry_foreach(iter, oldinfo->entries, oldinfo->size) diff --git a/net/ipv4/netfilter/ipt_SYNPROXY.c b/net/ipv4/netfilter/ipt_SYNPROXY.c index 811689e523c3..f75fc6b53115 100644 --- a/net/ipv4/netfilter/ipt_SYNPROXY.c +++ b/net/ipv4/netfilter/ipt_SYNPROXY.c @@ -330,7 +330,8 @@ static unsigned int ipv4_synproxy_hook(void *priv, if (synproxy == NULL) return NF_ACCEPT; - if (nf_is_loopback_packet(skb)) + if (nf_is_loopback_packet(skb) || + ip_hdr(skb)->protocol != IPPROTO_TCP) return NF_ACCEPT; thoff = ip_hdrlen(skb); diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c index fe374da4bc13..89af9d88ca21 100644 --- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c +++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c @@ -344,7 +344,7 @@ static void ipv4_hooks_unregister(struct net *net) mutex_unlock(&register_ipv4_hooks); } -struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv4 __read_mostly = { +const struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv4 = { .l3proto = PF_INET, .pkt_to_tuple = ipv4_pkt_to_tuple, .invert_tuple = ipv4_invert_tuple, diff --git a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c index a046c298413a..1849fedd9b81 100644 --- a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c +++ b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c @@ -81,7 +81,6 @@ 
static int icmp_packet(struct nf_conn *ct, const struct sk_buff *skb, unsigned int dataoff, enum ip_conntrack_info ctinfo, - u_int8_t pf, unsigned int *timeout) { /* Do not immediately delete the connection after the first @@ -165,6 +164,12 @@ icmp_error_message(struct net *net, struct nf_conn *tmpl, struct sk_buff *skb, return NF_ACCEPT; } +static void icmp_error_log(const struct sk_buff *skb, struct net *net, + u8 pf, const char *msg) +{ + nf_l4proto_log_invalid(skb, net, pf, IPPROTO_ICMP, "%s", msg); +} + /* Small and modified version of icmp_rcv */ static int icmp_error(struct net *net, struct nf_conn *tmpl, @@ -177,18 +182,14 @@ icmp_error(struct net *net, struct nf_conn *tmpl, /* Not enough header? */ icmph = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(_ih), &_ih); if (icmph == NULL) { - if (LOG_INVALID(net, IPPROTO_ICMP)) - nf_log_packet(net, PF_INET, 0, skb, NULL, NULL, - NULL, "nf_ct_icmp: short packet "); + icmp_error_log(skb, net, pf, "short packet"); return -NF_ACCEPT; } /* See ip_conntrack_proto_tcp.c */ if (net->ct.sysctl_checksum && hooknum == NF_INET_PRE_ROUTING && nf_ip_checksum(skb, hooknum, dataoff, 0)) { - if (LOG_INVALID(net, IPPROTO_ICMP)) - nf_log_packet(net, PF_INET, 0, skb, NULL, NULL, NULL, - "nf_ct_icmp: bad HW ICMP checksum "); + icmp_error_log(skb, net, pf, "bad hw icmp checksum"); return -NF_ACCEPT; } @@ -199,9 +200,7 @@ icmp_error(struct net *net, struct nf_conn *tmpl, * discarded. 
*/ if (icmph->type > NR_ICMP_TYPES) { - if (LOG_INVALID(net, IPPROTO_ICMP)) - nf_log_packet(net, PF_INET, 0, skb, NULL, NULL, NULL, - "nf_ct_icmp: invalid ICMP type "); + icmp_error_log(skb, net, pf, "invalid icmp type"); return -NF_ACCEPT; } @@ -259,9 +258,14 @@ static int icmp_nlattr_to_tuple(struct nlattr *tb[], return 0; } -static int icmp_nlattr_tuple_size(void) +static unsigned int icmp_nlattr_tuple_size(void) { - return nla_policy_len(icmp_nla_policy, CTA_PROTO_MAX + 1); + static unsigned int size __read_mostly; + + if (!size) + size = nla_policy_len(icmp_nla_policy, CTA_PROTO_MAX + 1); + + return size; } #endif diff --git a/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c b/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c index a0f37b208268..0443ca4120b0 100644 --- a/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c +++ b/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c @@ -276,7 +276,8 @@ nf_nat_ipv4_fn(void *priv, struct sk_buff *skb, else return NF_ACCEPT; } - /* Fall thru... (Only ICMPs can be IP_CT_IS_REPLY) */ + /* Only ICMPs can be IP_CT_IS_REPLY: */ + /* fall through */ case IP_CT_NEW: /* Seen it before? This can happen for loopback, retrans, * or local packets. 
diff --git a/net/ipv4/netfilter/nf_reject_ipv4.c b/net/ipv4/netfilter/nf_reject_ipv4.c index eeacbdaf7cdf..5cd06ba3535d 100644 --- a/net/ipv4/netfilter/nf_reject_ipv4.c +++ b/net/ipv4/netfilter/nf_reject_ipv4.c @@ -132,6 +132,8 @@ void nf_send_reset(struct net *net, struct sk_buff *oldskb, int hook) if (ip_route_me_harder(net, nskb, RTN_UNSPEC)) goto free_nskb; + niph = ip_hdr(nskb); + /* "Never happens" */ if (nskb->len > dst_mtu(skb_dst(nskb))) goto free_nskb; diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index 127153f1ed8a..9f37c4727861 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -212,7 +212,6 @@ static const struct snmp_mib snmp4_net_list[] = { SNMP_MIB_ITEM("TCPRenoRecovery", LINUX_MIB_TCPRENORECOVERY), SNMP_MIB_ITEM("TCPSackRecovery", LINUX_MIB_TCPSACKRECOVERY), SNMP_MIB_ITEM("TCPSACKReneging", LINUX_MIB_TCPSACKRENEGING), - SNMP_MIB_ITEM("TCPFACKReorder", LINUX_MIB_TCPFACKREORDER), SNMP_MIB_ITEM("TCPSACKReorder", LINUX_MIB_TCPSACKREORDER), SNMP_MIB_ITEM("TCPRenoReorder", LINUX_MIB_TCPRENOREORDER), SNMP_MIB_ITEM("TCPTSReorder", LINUX_MIB_TCPTSREORDER), diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 94d4cd2d5ea4..3b427757b1f8 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -495,7 +495,7 @@ u32 ip_idents_reserve(u32 hash, int segs) { u32 *p_tstamp = ip_tstamps + hash % IP_IDENTS_SZ; atomic_t *p_id = ip_idents + hash % IP_IDENTS_SZ; - u32 old = ACCESS_ONCE(*p_tstamp); + u32 old = READ_ONCE(*p_tstamp); u32 now = (u32)jiffies; u32 new, delta = 0; @@ -1250,7 +1250,7 @@ static void set_class_tag(struct rtable *rt, u32 tag) static unsigned int ipv4_default_advmss(const struct dst_entry *dst) { unsigned int header_size = sizeof(struct tcphdr) + sizeof(struct iphdr); - unsigned int advmss = max_t(unsigned int, dst->dev->mtu - header_size, + unsigned int advmss = max_t(unsigned int, ipv4_mtu(dst) - header_size, ip_rt_min_advmss); return min(advmss, IPV4_MAX_PMTU - header_size); @@ -1520,43 +1520,56 @@ struct rtable *rt_dst_alloc(struct 
net_device *dev, EXPORT_SYMBOL(rt_dst_alloc); /* called in rcu_read_lock() section */ -static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr, - u8 tos, struct net_device *dev, int our) +int ip_mc_validate_source(struct sk_buff *skb, __be32 daddr, __be32 saddr, + u8 tos, struct net_device *dev, + struct in_device *in_dev, u32 *itag) { - struct rtable *rth; - struct in_device *in_dev = __in_dev_get_rcu(dev); - unsigned int flags = RTCF_MULTICAST; - u32 itag = 0; int err; /* Primary sanity checks. */ - if (!in_dev) return -EINVAL; if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) || skb->protocol != htons(ETH_P_IP)) - goto e_inval; + return -EINVAL; if (ipv4_is_loopback(saddr) && !IN_DEV_ROUTE_LOCALNET(in_dev)) - goto e_inval; + return -EINVAL; if (ipv4_is_zeronet(saddr)) { if (!ipv4_is_local_multicast(daddr)) - goto e_inval; + return -EINVAL; } else { err = fib_validate_source(skb, saddr, 0, tos, 0, dev, - in_dev, &itag); + in_dev, itag); if (err < 0) - goto e_err; + return err; } + return 0; +} + +/* called in rcu_read_lock() section */ +static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr, + u8 tos, struct net_device *dev, int our) +{ + struct in_device *in_dev = __in_dev_get_rcu(dev); + unsigned int flags = RTCF_MULTICAST; + struct rtable *rth; + u32 itag = 0; + int err; + + err = ip_mc_validate_source(skb, daddr, saddr, tos, dev, in_dev, &itag); + if (err) + return err; + if (our) flags |= RTCF_LOCAL; rth = rt_dst_alloc(dev_net(dev)->loopback_dev, flags, RTN_MULTICAST, IN_DEV_CONF_GET(in_dev, NOPOLICY), false, false); if (!rth) - goto e_nobufs; + return -ENOBUFS; #ifdef CONFIG_IP_ROUTE_CLASSID rth->dst.tclassid = itag; @@ -1572,13 +1585,6 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr, skb_dst_set(skb, &rth->dst); return 0; - -e_nobufs: - return -ENOBUFS; -e_inval: - return -EINVAL; -e_err: - return err; } @@ -2507,7 +2513,7 @@ struct dst_entry *ipv4_blackhole_route(struct net 
*net, struct dst_entry *dst_or struct rtable *ort = (struct rtable *) dst_orig; struct rtable *rt; - rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_NONE, 0); + rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_DEAD, 0); if (rt) { struct dst_entry *new = &rt->dst; @@ -3032,7 +3038,6 @@ struct ip_rt_acct __percpu *ip_rt_acct __read_mostly; int __init ip_rt_init(void) { - int rc = 0; int cpu; ip_idents = kmalloc(IP_IDENTS_SZ * sizeof(*ip_idents), GFP_KERNEL); @@ -3089,7 +3094,7 @@ int __init ip_rt_init(void) #endif register_pernet_subsys(&rt_genid_ops); register_pernet_subsys(&ipv4_inetpeer_ops); - return rc; + return 0; } #ifdef CONFIG_SYSCTL diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index b1bb1b3a1082..fda37f2862c9 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c @@ -355,7 +355,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb) /* We throwed the options of the initial SYN away, so we hope * the ACK carries the same options again (see RFC1122 4.2.3.8) */ - ireq->opt = tcp_v4_save_options(sock_net(sk), skb); + RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(sock_net(sk), skb)); if (security_inet_conn_request(sk, skb, req)) { reqsk_free(req); @@ -385,7 +385,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb) /* Try to redo what tcp_v4_send_synack did. */ req->rsk_window_clamp = tp->window_clamp ? :dst_metric(&rt->dst, RTAX_WINDOW); - tcp_select_initial_window(tcp_full_space(sk), req->mss, + tcp_select_initial_window(sk, tcp_full_space(sk), req->mss, &req->rsk_rcv_wnd, &req->rsk_window_clamp, ireq->wscale_ok, &rcv_wscale, dst_metric(&rt->dst, RTAX_INITRWND)); diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 0d3c038d7b04..93e172118a94 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * sysctl_net_ipv4.c: sysctl interface to net IPV4 subsystem. 
* @@ -25,6 +26,7 @@ #include <net/inet_frag.h> #include <net/ping.h> #include <net/protocol.h> +#include <net/netevent.h> static int zero; static int one = 1; @@ -199,6 +201,8 @@ static int ipv4_ping_group_range(struct ctl_table *table, int write, static int proc_tcp_congestion_control(struct ctl_table *ctl, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { + struct net *net = container_of(ctl->data, struct net, + ipv4.tcp_congestion_control); char val[TCP_CA_NAME_MAX]; struct ctl_table tbl = { .data = val, @@ -206,11 +210,11 @@ static int proc_tcp_congestion_control(struct ctl_table *ctl, int write, }; int ret; - tcp_get_default_congestion_control(val); + tcp_get_default_congestion_control(net, val); ret = proc_dostring(&tbl, write, buffer, lenp, ppos); if (write && ret == 0) - ret = tcp_set_default_congestion_control(val); + ret = tcp_set_default_congestion_control(net, val); return ret; } @@ -251,10 +255,12 @@ static int proc_allowed_congestion_control(struct ctl_table *ctl, return ret; } -static int proc_tcp_fastopen_key(struct ctl_table *ctl, int write, +static int proc_tcp_fastopen_key(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { + struct net *net = container_of(table->data, struct net, + ipv4.sysctl_tcp_fastopen); struct ctl_table tbl = { .maxlen = (TCP_FASTOPEN_KEY_LENGTH * 2 + 10) }; struct tcp_fastopen_context *ctxt; int ret; @@ -265,7 +271,7 @@ static int proc_tcp_fastopen_key(struct ctl_table *ctl, int write, return -ENOMEM; rcu_read_lock(); - ctxt = rcu_dereference(tcp_fastopen_ctx); + ctxt = rcu_dereference(net->ipv4.tcp_fastopen_ctx); if (ctxt) memcpy(user_key, ctxt->key, TCP_FASTOPEN_KEY_LENGTH); else @@ -282,12 +288,8 @@ static int proc_tcp_fastopen_key(struct ctl_table *ctl, int write, ret = -EINVAL; goto bad_key; } - /* Generate a dummy secret but don't publish it. 
This - * is needed so we don't regenerate a new key on the - * first invocation of tcp_fastopen_cookie_gen - */ - tcp_fastopen_init_key_once(false); - tcp_fastopen_reset_cipher(user_key, TCP_FASTOPEN_KEY_LENGTH); + tcp_fastopen_reset_cipher(net, NULL, user_key, + TCP_FASTOPEN_KEY_LENGTH); } bad_key: @@ -358,11 +360,13 @@ static int proc_tfo_blackhole_detect_timeout(struct ctl_table *table, void __user *buffer, size_t *lenp, loff_t *ppos) { + struct net *net = container_of(table->data, struct net, + ipv4.sysctl_tcp_fastopen_blackhole_timeout); int ret; ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); if (write && ret == 0) - tcp_fastopen_active_timeout_reset(); + atomic_set(&net->ipv4.tfo_active_disable_times, 0); return ret; } @@ -385,15 +389,25 @@ static int proc_tcp_available_ulp(struct ctl_table *ctl, return ret; } +#ifdef CONFIG_IP_ROUTE_MULTIPATH +static int proc_fib_multipath_hash_policy(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + struct net *net = container_of(table->data, struct net, + ipv4.sysctl_fib_multipath_hash_policy); + int ret; + + ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); + if (write && ret == 0) + call_netevent_notifiers(NETEVENT_MULTIPATH_HASH_UPDATE, net); + + return ret; +} +#endif + static struct ctl_table ipv4_table[] = { { - .procname = "tcp_retrans_collapse", - .data = &sysctl_tcp_retrans_collapse, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec - }, - { .procname = "tcp_max_orphans", .data = &sysctl_tcp_max_orphans, .maxlen = sizeof(int), @@ -401,48 +415,6 @@ static struct ctl_table ipv4_table[] = { .proc_handler = proc_dointvec }, { - .procname = "tcp_fastopen", - .data = &sysctl_tcp_fastopen, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - { - .procname = "tcp_fastopen_key", - .mode = 0600, - .maxlen = ((TCP_FASTOPEN_KEY_LENGTH * 2) + 10), - .proc_handler = proc_tcp_fastopen_key, - }, - { - 
.procname = "tcp_fastopen_blackhole_timeout_sec", - .data = &sysctl_tcp_fastopen_blackhole_timeout, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_tfo_blackhole_detect_timeout, - .extra1 = &zero, - }, - { - .procname = "tcp_abort_on_overflow", - .data = &sysctl_tcp_abort_on_overflow, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec - }, - { - .procname = "tcp_stdurg", - .data = &sysctl_tcp_stdurg, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec - }, - { - .procname = "tcp_rfc1337", - .data = &sysctl_tcp_rfc1337, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec - }, - { .procname = "inet_peer_threshold", .data = &inet_peer_threshold, .maxlen = sizeof(int), @@ -464,34 +436,6 @@ static struct ctl_table ipv4_table[] = { .proc_handler = proc_dointvec_jiffies, }, { - .procname = "tcp_fack", - .data = &sysctl_tcp_fack, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec - }, - { - .procname = "tcp_recovery", - .data = &sysctl_tcp_recovery, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - { - .procname = "tcp_max_reordering", - .data = &sysctl_tcp_max_reordering, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec - }, - { - .procname = "tcp_dsack", - .data = &sysctl_tcp_dsack, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec - }, - { .procname = "tcp_mem", .maxlen = sizeof(sysctl_tcp_mem), .data = &sysctl_tcp_mem, @@ -499,113 +443,12 @@ static struct ctl_table ipv4_table[] = { .proc_handler = proc_doulongvec_minmax, }, { - .procname = "tcp_wmem", - .data = &sysctl_tcp_wmem, - .maxlen = sizeof(sysctl_tcp_wmem), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &one, - }, - { - .procname = "tcp_rmem", - .data = &sysctl_tcp_rmem, - .maxlen = sizeof(sysctl_tcp_rmem), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &one, - }, - { - .procname = 
"tcp_app_win", - .data = &sysctl_tcp_app_win, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec - }, - { - .procname = "tcp_adv_win_scale", - .data = &sysctl_tcp_adv_win_scale, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &tcp_adv_win_scale_min, - .extra2 = &tcp_adv_win_scale_max, - }, - { - .procname = "tcp_frto", - .data = &sysctl_tcp_frto, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec - }, - { - .procname = "tcp_min_rtt_wlen", - .data = &sysctl_tcp_min_rtt_wlen, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec - }, - { .procname = "tcp_low_latency", .data = &sysctl_tcp_low_latency, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec }, - { - .procname = "tcp_no_metrics_save", - .data = &sysctl_tcp_nometrics_save, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - { - .procname = "tcp_moderate_rcvbuf", - .data = &sysctl_tcp_moderate_rcvbuf, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - { - .procname = "tcp_tso_win_divisor", - .data = &sysctl_tcp_tso_win_divisor, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - { - .procname = "tcp_congestion_control", - .mode = 0644, - .maxlen = TCP_CA_NAME_MAX, - .proc_handler = proc_tcp_congestion_control, - }, - { - .procname = "tcp_workaround_signed_windows", - .data = &sysctl_tcp_workaround_signed_windows, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec - }, - { - .procname = "tcp_limit_output_bytes", - .data = &sysctl_tcp_limit_output_bytes, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec - }, - { - .procname = "tcp_challenge_ack_limit", - .data = &sysctl_tcp_challenge_ack_limit, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec - }, - { - .procname = "tcp_slow_start_after_idle", - .data = 
&sysctl_tcp_slow_start_after_idle, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec - }, #ifdef CONFIG_NETLABEL { .procname = "cipso_cache_enable", @@ -649,65 +492,6 @@ static struct ctl_table ipv4_table[] = { .proc_handler = proc_allowed_congestion_control, }, { - .procname = "tcp_thin_linear_timeouts", - .data = &sysctl_tcp_thin_linear_timeouts, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec - }, - { - .procname = "tcp_early_retrans", - .data = &sysctl_tcp_early_retrans, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, - .extra2 = &four, - }, - { - .procname = "tcp_min_tso_segs", - .data = &sysctl_tcp_min_tso_segs, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &one, - .extra2 = &gso_max_segs, - }, - { - .procname = "tcp_pacing_ss_ratio", - .data = &sysctl_tcp_pacing_ss_ratio, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, - .extra2 = &thousand, - }, - { - .procname = "tcp_pacing_ca_ratio", - .data = &sysctl_tcp_pacing_ca_ratio, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, - .extra2 = &thousand, - }, - { - .procname = "tcp_autocorking", - .data = &sysctl_tcp_autocorking, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, - .extra2 = &one, - }, - { - .procname = "tcp_invalid_ratelimit", - .data = &sysctl_tcp_invalid_ratelimit, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_ms_jiffies, - }, - { .procname = "tcp_available_ulp", .maxlen = TCP_ULP_BUF_MAX, .mode = 0444, @@ -976,6 +760,13 @@ static struct ctl_table ipv4_net_table[] = { }, #endif { + .procname = "tcp_congestion_control", + .data = &init_net.ipv4.tcp_congestion_control, + .mode = 0644, + .maxlen = TCP_CA_NAME_MAX, + .proc_handler = proc_tcp_congestion_control, + }, + { .procname = 
"tcp_keepalive_time", .data = &init_net.ipv4.sysctl_tcp_keepalive_time, .maxlen = sizeof(int), @@ -1085,6 +876,28 @@ static struct ctl_table ipv4_net_table[] = { .mode = 0644, .proc_handler = proc_dointvec }, + { + .procname = "tcp_fastopen", + .data = &init_net.ipv4.sysctl_tcp_fastopen, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "tcp_fastopen_key", + .mode = 0600, + .data = &init_net.ipv4.sysctl_tcp_fastopen, + .maxlen = ((TCP_FASTOPEN_KEY_LENGTH * 2) + 10), + .proc_handler = proc_tcp_fastopen_key, + }, + { + .procname = "tcp_fastopen_blackhole_timeout_sec", + .data = &init_net.ipv4.sysctl_tcp_fastopen_blackhole_timeout, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_tfo_blackhole_detect_timeout, + .extra1 = &zero, + }, #ifdef CONFIG_IP_ROUTE_MULTIPATH { .procname = "fib_multipath_use_neigh", @@ -1100,7 +913,7 @@ static struct ctl_table ipv4_net_table[] = { .data = &init_net.ipv4.sysctl_fib_multipath_hash_policy, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = proc_dointvec_minmax, + .proc_handler = proc_fib_multipath_hash_policy, .extra1 = &zero, .extra2 = &one, }, @@ -1144,6 +957,216 @@ static struct ctl_table ipv4_net_table[] = { .mode = 0644, .proc_handler = proc_dointvec }, + { + .procname = "tcp_early_retrans", + .data = &init_net.ipv4.sysctl_tcp_early_retrans, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &four, + }, + { + .procname = "tcp_recovery", + .data = &init_net.ipv4.sysctl_tcp_recovery, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "tcp_thin_linear_timeouts", + .data = &init_net.ipv4.sysctl_tcp_thin_linear_timeouts, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, + { + .procname = "tcp_slow_start_after_idle", + .data = &init_net.ipv4.sysctl_tcp_slow_start_after_idle, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = 
proc_dointvec + }, + { + .procname = "tcp_retrans_collapse", + .data = &init_net.ipv4.sysctl_tcp_retrans_collapse, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, + { + .procname = "tcp_stdurg", + .data = &init_net.ipv4.sysctl_tcp_stdurg, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, + { + .procname = "tcp_rfc1337", + .data = &init_net.ipv4.sysctl_tcp_rfc1337, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, + { + .procname = "tcp_abort_on_overflow", + .data = &init_net.ipv4.sysctl_tcp_abort_on_overflow, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, + { + .procname = "tcp_fack", + .data = &init_net.ipv4.sysctl_tcp_fack, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, + { + .procname = "tcp_max_reordering", + .data = &init_net.ipv4.sysctl_tcp_max_reordering, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, + { + .procname = "tcp_dsack", + .data = &init_net.ipv4.sysctl_tcp_dsack, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, + { + .procname = "tcp_app_win", + .data = &init_net.ipv4.sysctl_tcp_app_win, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, + { + .procname = "tcp_adv_win_scale", + .data = &init_net.ipv4.sysctl_tcp_adv_win_scale, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &tcp_adv_win_scale_min, + .extra2 = &tcp_adv_win_scale_max, + }, + { + .procname = "tcp_frto", + .data = &init_net.ipv4.sysctl_tcp_frto, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, + { + .procname = "tcp_no_metrics_save", + .data = &init_net.ipv4.sysctl_tcp_nometrics_save, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "tcp_moderate_rcvbuf", + .data = &init_net.ipv4.sysctl_tcp_moderate_rcvbuf, + .maxlen = sizeof(int), + 
.mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "tcp_tso_win_divisor", + .data = &init_net.ipv4.sysctl_tcp_tso_win_divisor, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "tcp_workaround_signed_windows", + .data = &init_net.ipv4.sysctl_tcp_workaround_signed_windows, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, + { + .procname = "tcp_limit_output_bytes", + .data = &init_net.ipv4.sysctl_tcp_limit_output_bytes, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, + { + .procname = "tcp_challenge_ack_limit", + .data = &init_net.ipv4.sysctl_tcp_challenge_ack_limit, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, + { + .procname = "tcp_min_tso_segs", + .data = &init_net.ipv4.sysctl_tcp_min_tso_segs, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &one, + .extra2 = &gso_max_segs, + }, + { + .procname = "tcp_min_rtt_wlen", + .data = &init_net.ipv4.sysctl_tcp_min_rtt_wlen, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, + { + .procname = "tcp_autocorking", + .data = &init_net.ipv4.sysctl_tcp_autocorking, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &one, + }, + { + .procname = "tcp_invalid_ratelimit", + .data = &init_net.ipv4.sysctl_tcp_invalid_ratelimit, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_ms_jiffies, + }, + { + .procname = "tcp_pacing_ss_ratio", + .data = &init_net.ipv4.sysctl_tcp_pacing_ss_ratio, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &thousand, + }, + { + .procname = "tcp_pacing_ca_ratio", + .data = &init_net.ipv4.sysctl_tcp_pacing_ca_ratio, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &thousand, + 
}, + { + .procname = "tcp_wmem", + .data = &init_net.ipv4.sysctl_tcp_wmem, + .maxlen = sizeof(init_net.ipv4.sysctl_tcp_wmem), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &one, + }, + { + .procname = "tcp_rmem", + .data = &init_net.ipv4.sysctl_tcp_rmem, + .maxlen = sizeof(init_net.ipv4.sysctl_tcp_rmem), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &one, + }, { } }; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 5091402720ab..bf97317e6c97 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -270,6 +270,7 @@ #include <linux/time.h> #include <linux/slab.h> #include <linux/errqueue.h> +#include <linux/static_key.h> #include <net/icmp.h> #include <net/inet_common.h> @@ -282,24 +283,22 @@ #include <asm/ioctls.h> #include <net/busy_poll.h> -int sysctl_tcp_min_tso_segs __read_mostly = 2; - -int sysctl_tcp_autocorking __read_mostly = 1; +#include <trace/events/tcp.h> struct percpu_counter tcp_orphan_count; EXPORT_SYMBOL_GPL(tcp_orphan_count); long sysctl_tcp_mem[3] __read_mostly; -int sysctl_tcp_wmem[3] __read_mostly; -int sysctl_tcp_rmem[3] __read_mostly; - EXPORT_SYMBOL(sysctl_tcp_mem); -EXPORT_SYMBOL(sysctl_tcp_rmem); -EXPORT_SYMBOL(sysctl_tcp_wmem); atomic_long_t tcp_memory_allocated; /* Current allocated memory. */ EXPORT_SYMBOL(tcp_memory_allocated); +#if IS_ENABLED(CONFIG_SMC) +DEFINE_STATIC_KEY_FALSE(tcp_have_smc); +EXPORT_SYMBOL(tcp_have_smc); +#endif + /* * Current number of TCP sockets. 
*/ @@ -413,8 +412,10 @@ void tcp_init_sock(struct sock *sk) struct tcp_sock *tp = tcp_sk(sk); tp->out_of_order_queue = RB_ROOT; + sk->tcp_rtx_queue = RB_ROOT; tcp_init_xmit_timers(sk); INIT_LIST_HEAD(&tp->tsq_node); + INIT_LIST_HEAD(&tp->tsorted_sent_queue); icsk->icsk_rto = TCP_TIMEOUT_INIT; tp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT); @@ -441,6 +442,7 @@ void tcp_init_sock(struct sock *sk) tcp_assign_congestion_control(sk); tp->tsoffset = 0; + tp->rack.reo_wnd_steps = 1; sk->sk_state = TCP_CLOSE; @@ -449,15 +451,29 @@ void tcp_init_sock(struct sock *sk) icsk->icsk_sync_mss = tcp_sync_mss; - sk->sk_sndbuf = sysctl_tcp_wmem[1]; - sk->sk_rcvbuf = sysctl_tcp_rmem[1]; + sk->sk_sndbuf = sock_net(sk)->ipv4.sysctl_tcp_wmem[1]; + sk->sk_rcvbuf = sock_net(sk)->ipv4.sysctl_tcp_rmem[1]; sk_sockets_allocated_inc(sk); } EXPORT_SYMBOL(tcp_init_sock); -static void tcp_tx_timestamp(struct sock *sk, u16 tsflags, struct sk_buff *skb) +void tcp_init_transfer(struct sock *sk, int bpf_op) { + struct inet_connection_sock *icsk = inet_csk(sk); + + tcp_mtup_init(sk); + icsk->icsk_af_ops->rebuild_header(sk); + tcp_init_metrics(sk); + tcp_call_bpf(sk, bpf_op); + tcp_init_congestion_control(sk); + tcp_init_buffer_space(sk); +} + +static void tcp_tx_timestamp(struct sock *sk, u16 tsflags) +{ + struct sk_buff *skb = tcp_write_queue_tail(sk); + if (tsflags && skb) { struct skb_shared_info *shinfo = skb_shinfo(skb); struct tcp_skb_cb *tcb = TCP_SKB_CB(skb); @@ -675,7 +691,7 @@ static bool tcp_should_autocork(struct sock *sk, struct sk_buff *skb, int size_goal) { return skb->len < size_goal && - sysctl_tcp_autocorking && + sock_net(sk)->ipv4.sysctl_tcp_autocorking && skb != tcp_write_queue_head(sk) && refcount_read(&sk->sk_wmem_alloc) > skb->truesize; } @@ -686,10 +702,9 @@ static void tcp_push(struct sock *sk, int flags, int mss_now, struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; - if (!tcp_send_head(sk)) - return; - skb = tcp_write_queue_tail(sk); + if (!skb) + return; if (!(flags 
& MSG_MORE) || forced_push(tp)) tcp_mark_push(tp, skb); @@ -869,6 +884,7 @@ struct sk_buff *sk_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp, * available to the caller, no more, no less. */ skb->reserved_tailroom = skb->end - skb->tail - size; + INIT_LIST_HEAD(&skb->tcp_tsorted_anchor); return skb; } __kfree_skb(skb); @@ -948,14 +964,14 @@ ssize_t do_tcp_sendpages(struct sock *sk, struct page *page, int offset, int copy, i; bool can_coalesce; - if (!tcp_send_head(sk) || (copy = size_goal - skb->len) <= 0 || + if (!skb || (copy = size_goal - skb->len) <= 0 || !tcp_skb_can_collapse_to(skb)) { new_segment: if (!sk_stream_memory_free(sk)) goto wait_for_sndbuf; skb = sk_stream_alloc_skb(sk, 0, sk->sk_allocation, - skb_queue_empty(&sk->sk_write_queue)); + tcp_rtx_and_write_queues_empty(sk)); if (!skb) goto wait_for_memory; @@ -1027,7 +1043,7 @@ wait_for_memory: out: if (copied) { - tcp_tx_timestamp(sk, sk->sk_tsflags, tcp_write_queue_tail(sk)); + tcp_tx_timestamp(sk, sk->sk_tsflags); if (!(flags & MSG_SENDPAGE_NOTLAST)) tcp_push(sk, flags, mss_now, tp->nonagle, size_goal); } @@ -1126,7 +1142,7 @@ static int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg, struct sockaddr *uaddr = msg->msg_name; int err, flags; - if (!(sysctl_tcp_fastopen & TFO_CLIENT_ENABLE) || + if (!(sock_net(sk)->ipv4.sysctl_tcp_fastopen & TFO_CLIENT_ENABLE) || (uaddr && msg->msg_namelen >= sizeof(uaddr->sa_family) && uaddr->sa_family == AF_UNSPEC)) return -EOPNOTSUPP; @@ -1183,7 +1199,7 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size) goto out_err; } - skb = tcp_send_head(sk) ? 
tcp_write_queue_tail(sk) : NULL; + skb = tcp_write_queue_tail(sk); uarg = sock_zerocopy_realloc(sk, size, skb_zcopy(skb)); if (!uarg) { err = -ENOBUFS; @@ -1259,7 +1275,7 @@ restart: int max = size_goal; skb = tcp_write_queue_tail(sk); - if (tcp_send_head(sk)) { + if (skb) { if (skb->ip_summed == CHECKSUM_NONE) max = mss_now; copy = max - skb->len; @@ -1279,7 +1295,7 @@ new_segment: process_backlog = false; goto restart; } - first_skb = skb_queue_empty(&sk->sk_write_queue); + first_skb = tcp_rtx_and_write_queues_empty(sk); skb = sk_stream_alloc_skb(sk, select_size(sk, sg, first_skb), sk->sk_allocation, @@ -1404,7 +1420,7 @@ wait_for_memory: out: if (copied) { - tcp_tx_timestamp(sk, sockc.tsflags, tcp_write_queue_tail(sk)); + tcp_tx_timestamp(sk, sockc.tsflags); tcp_push(sk, flags, mss_now, tp->nonagle, size_goal); } out_nopush: @@ -1505,6 +1521,13 @@ static int tcp_peek_sndq(struct sock *sk, struct msghdr *msg, int len) /* XXX -- need to support SO_PEEK_OFF */ + skb_rbtree_walk(skb, &sk->tcp_rtx_queue) { + err = skb_copy_datagram_msg(skb, 0, msg, skb->len); + if (err) + return err; + copied += skb->len; + } + skb_queue_walk(&sk->sk_write_queue, skb) { err = skb_copy_datagram_msg(skb, 0, msg, skb->len); if (err) @@ -2017,6 +2040,8 @@ void tcp_set_state(struct sock *sk, int state) { int oldstate = sk->sk_state; + trace_tcp_set_state(sk, oldstate, state); + switch (state) { case TCP_ESTABLISHED: if (oldstate != TCP_ESTABLISHED) @@ -2304,6 +2329,37 @@ static inline bool tcp_need_reset(int state) TCPF_FIN_WAIT2 | TCPF_SYN_RECV); } +static void tcp_rtx_queue_purge(struct sock *sk) +{ + struct rb_node *p = rb_first(&sk->tcp_rtx_queue); + + while (p) { + struct sk_buff *skb = rb_to_skb(p); + + p = rb_next(p); + /* Since we are deleting whole queue, no need to + * list_del(&skb->tcp_tsorted_anchor) + */ + tcp_rtx_queue_unlink(skb, sk); + sk_wmem_free_skb(sk, skb); + } +} + +void tcp_write_queue_purge(struct sock *sk) +{ + struct sk_buff *skb; + + tcp_chrono_stop(sk, 
TCP_CHRONO_BUSY); + while ((skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) { + tcp_skb_tsorted_anchor_cleanup(skb); + sk_wmem_free_skb(sk, skb); + } + tcp_rtx_queue_purge(sk); + INIT_LIST_HEAD(&tcp_sk(sk)->tsorted_sent_queue); + sk_mem_reclaim(sk); + tcp_clear_all_retrans_hints(tcp_sk(sk)); +} + int tcp_disconnect(struct sock *sk, int flags) { struct inet_sock *inet = inet_sk(sk); @@ -2362,7 +2418,6 @@ int tcp_disconnect(struct sock *sk, int flags) * issue in __tcp_select_window() */ icsk->icsk_ack.rcv_mss = TCP_MIN_MSS; - tcp_init_send_head(sk); memset(&tp->rx_opt, 0, sizeof(tp->rx_opt)); __sk_dst_reset(sk); dst_release(sk->sk_rx_dst); @@ -2454,8 +2509,6 @@ static int tcp_repair_options_est(struct sock *sk, return -EINVAL; tp->rx_opt.sack_ok |= TCP_SACK_SEEN; - if (sysctl_tcp_fack) - tcp_enable_fack(tp); break; case TCPOPT_TIMESTAMP: if (opt.opt_val != 0) @@ -2518,6 +2571,17 @@ static int do_tcp_setsockopt(struct sock *sk, int level, release_sock(sk); return err; } + case TCP_FASTOPEN_KEY: { + __u8 key[TCP_FASTOPEN_KEY_LENGTH]; + + if (optlen != sizeof(key)) + return -EINVAL; + + if (copy_from_user(key, optval, optlen)) + return -EFAULT; + + return tcp_fastopen_reset_cipher(net, sk, key, sizeof(key)); + } default: /* fallthru */ break; @@ -2749,7 +2813,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level, case TCP_FASTOPEN: if (val >= 0 && ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))) { - tcp_fastopen_init_key_once(true); + tcp_fastopen_init_key_once(net); fastopen_queue_tune(sk, val); } else { @@ -2759,7 +2823,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level, case TCP_FASTOPEN_CONNECT: if (val > 1 || val < 0) { err = -EINVAL; - } else if (sysctl_tcp_fastopen & TFO_CLIENT_ENABLE) { + } else if (net->ipv4.sysctl_tcp_fastopen & TFO_CLIENT_ENABLE) { if (sk->sk_state == TCP_CLOSE) tp->fastopen_connect = val; else @@ -2768,6 +2832,14 @@ static int do_tcp_setsockopt(struct sock *sk, int level, err = -EOPNOTSUPP; } break; + case 
TCP_FASTOPEN_NO_COOKIE: + if (val > 1 || val < 0) + err = -EINVAL; + else if (!((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))) + err = -EINVAL; + else + tp->fastopen_no_cookie = val; + break; case TCP_TIMESTAMP: if (!tp->repair) err = -EPERM; @@ -2905,7 +2977,6 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) info->tcpi_lost = tp->lost_out; info->tcpi_retrans = tp->retrans_out; - info->tcpi_fackets = tp->fackets_out; now = tcp_jiffies32; info->tcpi_last_data_sent = jiffies_to_msecs(now - tp->lsndtime); @@ -3104,6 +3175,28 @@ static int do_tcp_getsockopt(struct sock *sk, int level, return -EFAULT; return 0; + case TCP_FASTOPEN_KEY: { + __u8 key[TCP_FASTOPEN_KEY_LENGTH]; + struct tcp_fastopen_context *ctx; + + if (get_user(len, optlen)) + return -EFAULT; + + rcu_read_lock(); + ctx = rcu_dereference(icsk->icsk_accept_queue.fastopenq.ctx); + if (ctx) + memcpy(key, ctx->key, sizeof(key)); + else + len = 0; + rcu_read_unlock(); + + len = min_t(unsigned int, len, sizeof(key)); + if (put_user(len, optlen)) + return -EFAULT; + if (copy_to_user(optval, key, len)) + return -EFAULT; + return 0; + } case TCP_THIN_LINEAR_TIMEOUTS: val = tp->thin_lto; break; @@ -3166,6 +3259,10 @@ static int do_tcp_getsockopt(struct sock *sk, int level, val = tp->fastopen_connect; break; + case TCP_FASTOPEN_NO_COOKIE: + val = tp->fastopen_no_cookie; + break; + case TCP_TIMESTAMP: val = tcp_time_stamp_raw() + tp->tsoffset; break; @@ -3531,13 +3628,13 @@ void __init tcp_init(void) max_wshare = min(4UL*1024*1024, limit); max_rshare = min(6UL*1024*1024, limit); - sysctl_tcp_wmem[0] = SK_MEM_QUANTUM; - sysctl_tcp_wmem[1] = 16*1024; - sysctl_tcp_wmem[2] = max(64*1024, max_wshare); + init_net.ipv4.sysctl_tcp_wmem[0] = SK_MEM_QUANTUM; + init_net.ipv4.sysctl_tcp_wmem[1] = 16*1024; + init_net.ipv4.sysctl_tcp_wmem[2] = max(64*1024, max_wshare); - sysctl_tcp_rmem[0] = SK_MEM_QUANTUM; - sysctl_tcp_rmem[1] = 87380; - sysctl_tcp_rmem[2] = max(87380, max_rshare); + 
init_net.ipv4.sysctl_tcp_rmem[0] = SK_MEM_QUANTUM; + init_net.ipv4.sysctl_tcp_rmem[1] = 87380; + init_net.ipv4.sysctl_tcp_rmem[2] = max(87380, max_rshare); pr_info("Hash tables configured (established %u bind %u)\n", tcp_hashinfo.ehash_mask + 1, tcp_hashinfo.bhash_size); diff --git a/net/ipv4/tcp_cdg.c b/net/ipv4/tcp_cdg.c index 66ac69f7bd19..06fbe102a425 100644 --- a/net/ipv4/tcp_cdg.c +++ b/net/ipv4/tcp_cdg.c @@ -389,7 +389,7 @@ static void tcp_cdg_release(struct sock *sk) kfree(ca->gradients); } -struct tcp_congestion_ops tcp_cdg __read_mostly = { +static struct tcp_congestion_ops tcp_cdg __read_mostly = { .cong_avoid = tcp_cdg_cong_avoid, .cwnd_event = tcp_cdg_cwnd_event, .pkts_acked = tcp_cdg_acked, diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c index 2f26124fd160..bc6c02f16243 100644 --- a/net/ipv4/tcp_cong.c +++ b/net/ipv4/tcp_cong.c @@ -33,9 +33,11 @@ static struct tcp_congestion_ops *tcp_ca_find(const char *name) } /* Must be called with rcu lock held */ -static const struct tcp_congestion_ops *__tcp_ca_find_autoload(const char *name) +static struct tcp_congestion_ops *tcp_ca_find_autoload(struct net *net, + const char *name) { - const struct tcp_congestion_ops *ca = tcp_ca_find(name); + struct tcp_congestion_ops *ca = tcp_ca_find(name); + #ifdef CONFIG_MODULES if (!ca && capable(CAP_NET_ADMIN)) { rcu_read_unlock(); @@ -115,7 +117,7 @@ void tcp_unregister_congestion_control(struct tcp_congestion_ops *ca) } EXPORT_SYMBOL_GPL(tcp_unregister_congestion_control); -u32 tcp_ca_get_key_by_name(const char *name, bool *ecn_ca) +u32 tcp_ca_get_key_by_name(struct net *net, const char *name, bool *ecn_ca) { const struct tcp_congestion_ops *ca; u32 key = TCP_CA_UNSPEC; @@ -123,7 +125,7 @@ u32 tcp_ca_get_key_by_name(const char *name, bool *ecn_ca) might_sleep(); rcu_read_lock(); - ca = __tcp_ca_find_autoload(name); + ca = tcp_ca_find_autoload(net, name); if (ca) { key = ca->key; *ecn_ca = ca->flags & TCP_CONG_NEEDS_ECN; @@ -153,23 +155,18 @@ 
EXPORT_SYMBOL_GPL(tcp_ca_get_name_by_key); /* Assign choice of congestion control. */ void tcp_assign_congestion_control(struct sock *sk) { + struct net *net = sock_net(sk); struct inet_connection_sock *icsk = inet_csk(sk); - struct tcp_congestion_ops *ca; + const struct tcp_congestion_ops *ca; rcu_read_lock(); - list_for_each_entry_rcu(ca, &tcp_cong_list, list) { - if (likely(try_module_get(ca->owner))) { - icsk->icsk_ca_ops = ca; - goto out; - } - /* Fallback to next available. The last really - * guaranteed fallback is Reno from this list. - */ - } -out: + ca = rcu_dereference(net->ipv4.tcp_congestion_control); + if (unlikely(!try_module_get(ca->owner))) + ca = &tcp_reno; + icsk->icsk_ca_ops = ca; rcu_read_unlock(); - memset(icsk->icsk_ca_priv, 0, sizeof(icsk->icsk_ca_priv)); + memset(icsk->icsk_ca_priv, 0, sizeof(icsk->icsk_ca_priv)); if (ca->flags & TCP_CONG_NEEDS_ECN) INET_ECN_xmit(sk); else @@ -214,29 +211,27 @@ void tcp_cleanup_congestion_control(struct sock *sk) } /* Used by sysctl to change default congestion control */ -int tcp_set_default_congestion_control(const char *name) +int tcp_set_default_congestion_control(struct net *net, const char *name) { struct tcp_congestion_ops *ca; - int ret = -ENOENT; - - spin_lock(&tcp_cong_list_lock); - ca = tcp_ca_find(name); -#ifdef CONFIG_MODULES - if (!ca && capable(CAP_NET_ADMIN)) { - spin_unlock(&tcp_cong_list_lock); + const struct tcp_congestion_ops *prev; + int ret; - request_module("tcp_%s", name); - spin_lock(&tcp_cong_list_lock); - ca = tcp_ca_find(name); - } -#endif + rcu_read_lock(); + ca = tcp_ca_find_autoload(net, name); + if (!ca) { + ret = -ENOENT; + } else if (!try_module_get(ca->owner)) { + ret = -EBUSY; + } else { + prev = xchg(&net->ipv4.tcp_congestion_control, ca); + if (prev) + module_put(prev->owner); - if (ca) { - ca->flags |= TCP_CONG_NON_RESTRICTED; /* default is always allowed */ - list_move(&ca->list, &tcp_cong_list); + ca->flags |= TCP_CONG_NON_RESTRICTED; ret = 0; } - 
spin_unlock(&tcp_cong_list_lock); + rcu_read_unlock(); return ret; } @@ -244,7 +239,8 @@ int tcp_set_default_congestion_control(const char *name) /* Set default value from kernel configuration at bootup */ static int __init tcp_congestion_default(void) { - return tcp_set_default_congestion_control(CONFIG_DEFAULT_TCP_CONG); + return tcp_set_default_congestion_control(&init_net, + CONFIG_DEFAULT_TCP_CONG); } late_initcall(tcp_congestion_default); @@ -264,14 +260,12 @@ void tcp_get_available_congestion_control(char *buf, size_t maxlen) } /* Get current default congestion control */ -void tcp_get_default_congestion_control(char *name) +void tcp_get_default_congestion_control(struct net *net, char *name) { - struct tcp_congestion_ops *ca; - /* We will always have reno... */ - BUG_ON(list_empty(&tcp_cong_list)); + const struct tcp_congestion_ops *ca; rcu_read_lock(); - ca = list_entry(tcp_cong_list.next, struct tcp_congestion_ops, list); + ca = rcu_dereference(net->ipv4.tcp_congestion_control); strncpy(name, ca->name, TCP_CA_NAME_MAX); rcu_read_unlock(); } @@ -351,12 +345,14 @@ int tcp_set_congestion_control(struct sock *sk, const char *name, bool load, boo if (!load) ca = tcp_ca_find(name); else - ca = __tcp_ca_find_autoload(name); + ca = tcp_ca_find_autoload(sock_net(sk), name); + /* No change asking for existing value */ if (ca == icsk->icsk_ca_ops) { icsk->icsk_ca_setsockopt = 1; goto out; } + if (!ca) { err = -ENOENT; } else if (!load) { diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c index e3c33220c418..78c192ee03a4 100644 --- a/net/ipv4/tcp_fastopen.c +++ b/net/ipv4/tcp_fastopen.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #include <linux/crypto.h> #include <linux/err.h> #include <linux/init.h> @@ -9,15 +10,18 @@ #include <net/inetpeer.h> #include <net/tcp.h> -int sysctl_tcp_fastopen __read_mostly = TFO_CLIENT_ENABLE; - -struct tcp_fastopen_context __rcu *tcp_fastopen_ctx; - -static DEFINE_SPINLOCK(tcp_fastopen_ctx_lock); - -void 
tcp_fastopen_init_key_once(bool publish) +void tcp_fastopen_init_key_once(struct net *net) { - static u8 key[TCP_FASTOPEN_KEY_LENGTH]; + u8 key[TCP_FASTOPEN_KEY_LENGTH]; + struct tcp_fastopen_context *ctxt; + + rcu_read_lock(); + ctxt = rcu_dereference(net->ipv4.tcp_fastopen_ctx); + if (ctxt) { + rcu_read_unlock(); + return; + } + rcu_read_unlock(); /* tcp_fastopen_reset_cipher publishes the new context * atomically, so we allow this race happening here. @@ -25,8 +29,8 @@ void tcp_fastopen_init_key_once(bool publish) * All call sites of tcp_fastopen_cookie_gen also check * for a valid cookie, so this is an acceptable risk. */ - if (net_get_random_once(key, sizeof(key)) && publish) - tcp_fastopen_reset_cipher(key, sizeof(key)); + get_random_bytes(key, sizeof(key)); + tcp_fastopen_reset_cipher(net, NULL, key, sizeof(key)); } static void tcp_fastopen_ctx_free(struct rcu_head *head) @@ -37,10 +41,37 @@ static void tcp_fastopen_ctx_free(struct rcu_head *head) kfree(ctx); } -int tcp_fastopen_reset_cipher(void *key, unsigned int len) +void tcp_fastopen_destroy_cipher(struct sock *sk) +{ + struct tcp_fastopen_context *ctx; + + ctx = rcu_dereference_protected( + inet_csk(sk)->icsk_accept_queue.fastopenq.ctx, 1); + if (ctx) + call_rcu(&ctx->rcu, tcp_fastopen_ctx_free); +} + +void tcp_fastopen_ctx_destroy(struct net *net) +{ + struct tcp_fastopen_context *ctxt; + + spin_lock(&net->ipv4.tcp_fastopen_ctx_lock); + + ctxt = rcu_dereference_protected(net->ipv4.tcp_fastopen_ctx, + lockdep_is_held(&net->ipv4.tcp_fastopen_ctx_lock)); + rcu_assign_pointer(net->ipv4.tcp_fastopen_ctx, NULL); + spin_unlock(&net->ipv4.tcp_fastopen_ctx_lock); + + if (ctxt) + call_rcu(&ctxt->rcu, tcp_fastopen_ctx_free); +} + +int tcp_fastopen_reset_cipher(struct net *net, struct sock *sk, + void *key, unsigned int len) { - int err; struct tcp_fastopen_context *ctx, *octx; + struct fastopen_queue *q; + int err; ctx = kmalloc(sizeof(*ctx), GFP_KERNEL); if (!ctx) @@ -61,26 +92,37 @@ error: kfree(ctx); } 
memcpy(ctx->key, key, len); - spin_lock(&tcp_fastopen_ctx_lock); - octx = rcu_dereference_protected(tcp_fastopen_ctx, - lockdep_is_held(&tcp_fastopen_ctx_lock)); - rcu_assign_pointer(tcp_fastopen_ctx, ctx); - spin_unlock(&tcp_fastopen_ctx_lock); + spin_lock(&net->ipv4.tcp_fastopen_ctx_lock); + if (sk) { + q = &inet_csk(sk)->icsk_accept_queue.fastopenq; + octx = rcu_dereference_protected(q->ctx, + lockdep_is_held(&net->ipv4.tcp_fastopen_ctx_lock)); + rcu_assign_pointer(q->ctx, ctx); + } else { + octx = rcu_dereference_protected(net->ipv4.tcp_fastopen_ctx, + lockdep_is_held(&net->ipv4.tcp_fastopen_ctx_lock)); + rcu_assign_pointer(net->ipv4.tcp_fastopen_ctx, ctx); + } + spin_unlock(&net->ipv4.tcp_fastopen_ctx_lock); if (octx) call_rcu(&octx->rcu, tcp_fastopen_ctx_free); return err; } -static bool __tcp_fastopen_cookie_gen(const void *path, +static bool __tcp_fastopen_cookie_gen(struct sock *sk, const void *path, struct tcp_fastopen_cookie *foc) { struct tcp_fastopen_context *ctx; bool ok = false; rcu_read_lock(); - ctx = rcu_dereference(tcp_fastopen_ctx); + + ctx = rcu_dereference(inet_csk(sk)->icsk_accept_queue.fastopenq.ctx); + if (!ctx) + ctx = rcu_dereference(sock_net(sk)->ipv4.tcp_fastopen_ctx); + if (ctx) { crypto_cipher_encrypt_one(ctx->tfm, foc->val, path); foc->len = TCP_FASTOPEN_COOKIE_SIZE; @@ -96,7 +138,8 @@ static bool __tcp_fastopen_cookie_gen(const void *path, * * XXX (TFO) - refactor when TCP_FASTOPEN_COOKIE_SIZE != AES_BLOCK_SIZE. 
*/ -static bool tcp_fastopen_cookie_gen(struct request_sock *req, +static bool tcp_fastopen_cookie_gen(struct sock *sk, + struct request_sock *req, struct sk_buff *syn, struct tcp_fastopen_cookie *foc) { @@ -104,7 +147,7 @@ static bool tcp_fastopen_cookie_gen(struct request_sock *req, const struct iphdr *iph = ip_hdr(syn); __be32 path[4] = { iph->saddr, iph->daddr, 0, 0 }; - return __tcp_fastopen_cookie_gen(path, foc); + return __tcp_fastopen_cookie_gen(sk, path, foc); } #if IS_ENABLED(CONFIG_IPV6) @@ -112,13 +155,13 @@ static bool tcp_fastopen_cookie_gen(struct request_sock *req, const struct ipv6hdr *ip6h = ipv6_hdr(syn); struct tcp_fastopen_cookie tmp; - if (__tcp_fastopen_cookie_gen(&ip6h->saddr, &tmp)) { + if (__tcp_fastopen_cookie_gen(sk, &ip6h->saddr, &tmp)) { struct in6_addr *buf = &tmp.addr; int i; for (i = 0; i < 4; i++) buf->s6_addr32[i] ^= ip6h->daddr.s6_addr32[i]; - return __tcp_fastopen_cookie_gen(buf, foc); + return __tcp_fastopen_cookie_gen(sk, buf, foc); } } #endif @@ -216,12 +259,7 @@ static struct sock *tcp_fastopen_create_child(struct sock *sk, refcount_set(&req->rsk_refcnt, 2); /* Now finish processing the fastopen child socket. */ - inet_csk(child)->icsk_af_ops->rebuild_header(child); - tcp_init_congestion_control(child); - tcp_mtup_init(child); - tcp_init_metrics(child); - tcp_call_bpf(child, BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB); - tcp_init_buffer_space(child); + tcp_init_transfer(child, BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB); tp->rcv_nxt = TCP_SKB_CB(skb)->seq + 1; @@ -271,33 +309,45 @@ static bool tcp_fastopen_queue_check(struct sock *sk) return true; } +static bool tcp_fastopen_no_cookie(const struct sock *sk, + const struct dst_entry *dst, + int flag) +{ + return (sock_net(sk)->ipv4.sysctl_tcp_fastopen & flag) || + tcp_sk(sk)->fastopen_no_cookie || + (dst && dst_metric(dst, RTAX_FASTOPEN_NO_COOKIE)); +} + /* Returns true if we should perform Fast Open on the SYN. The cookie (foc) * may be updated and return the client in the SYN-ACK later. 
E.g., Fast Open * cookie request (foc->len == 0). */ struct sock *tcp_try_fastopen(struct sock *sk, struct sk_buff *skb, struct request_sock *req, - struct tcp_fastopen_cookie *foc) + struct tcp_fastopen_cookie *foc, + const struct dst_entry *dst) { - struct tcp_fastopen_cookie valid_foc = { .len = -1 }; bool syn_data = TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq + 1; + int tcp_fastopen = sock_net(sk)->ipv4.sysctl_tcp_fastopen; + struct tcp_fastopen_cookie valid_foc = { .len = -1 }; struct sock *child; if (foc->len == 0) /* Client requests a cookie */ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFASTOPENCOOKIEREQD); - if (!((sysctl_tcp_fastopen & TFO_SERVER_ENABLE) && + if (!((tcp_fastopen & TFO_SERVER_ENABLE) && (syn_data || foc->len >= 0) && tcp_fastopen_queue_check(sk))) { foc->len = -1; return NULL; } - if (syn_data && (sysctl_tcp_fastopen & TFO_SERVER_COOKIE_NOT_REQD)) + if (syn_data && + tcp_fastopen_no_cookie(sk, dst, TFO_SERVER_COOKIE_NOT_REQD)) goto fastopen; if (foc->len >= 0 && /* Client presents or requests a cookie */ - tcp_fastopen_cookie_gen(req, skb, &valid_foc) && + tcp_fastopen_cookie_gen(sk, req, skb, &valid_foc) && foc->len == TCP_FASTOPEN_COOKIE_SIZE && foc->len == valid_foc.len && !memcmp(foc->val, valid_foc.val, foc->len)) { @@ -330,6 +380,7 @@ bool tcp_fastopen_cookie_check(struct sock *sk, u16 *mss, struct tcp_fastopen_cookie *cookie) { unsigned long last_syn_loss = 0; + const struct dst_entry *dst; int syn_loss = 0; tcp_fastopen_cache_get(sk, mss, cookie, &syn_loss, &last_syn_loss); @@ -347,7 +398,9 @@ bool tcp_fastopen_cookie_check(struct sock *sk, u16 *mss, return false; } - if (sysctl_tcp_fastopen & TFO_CLIENT_NO_COOKIE) { + dst = __sk_dst_get(sk); + + if (tcp_fastopen_no_cookie(sk, dst, TFO_CLIENT_NO_COOKIE)) { cookie->len = -1; return true; } @@ -401,25 +454,16 @@ EXPORT_SYMBOL(tcp_fastopen_defer_connect); * TFO connection with data exchanges. 
*/ -/* Default to 1hr */ -unsigned int sysctl_tcp_fastopen_blackhole_timeout __read_mostly = 60 * 60; -static atomic_t tfo_active_disable_times __read_mostly = ATOMIC_INIT(0); -static unsigned long tfo_active_disable_stamp __read_mostly; - /* Disable active TFO and record current jiffies and * tfo_active_disable_times */ void tcp_fastopen_active_disable(struct sock *sk) { - atomic_inc(&tfo_active_disable_times); - tfo_active_disable_stamp = jiffies; - NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFASTOPENBLACKHOLE); -} + struct net *net = sock_net(sk); -/* Reset tfo_active_disable_times to 0 */ -void tcp_fastopen_active_timeout_reset(void) -{ - atomic_set(&tfo_active_disable_times, 0); + atomic_inc(&net->ipv4.tfo_active_disable_times); + net->ipv4.tfo_active_disable_stamp = jiffies; + NET_INC_STATS(net, LINUX_MIB_TCPFASTOPENBLACKHOLE); } /* Calculate timeout for tfo active disable @@ -428,17 +472,18 @@ void tcp_fastopen_active_timeout_reset(void) */ bool tcp_fastopen_active_should_disable(struct sock *sk) { - int tfo_da_times = atomic_read(&tfo_active_disable_times); - int multiplier; + unsigned int tfo_bh_timeout = sock_net(sk)->ipv4.sysctl_tcp_fastopen_blackhole_timeout; + int tfo_da_times = atomic_read(&sock_net(sk)->ipv4.tfo_active_disable_times); unsigned long timeout; + int multiplier; if (!tfo_da_times) return false; /* Limit timout to max: 2^6 * initial timeout */ multiplier = 1 << min(tfo_da_times - 1, 6); - timeout = multiplier * sysctl_tcp_fastopen_blackhole_timeout * HZ; - if (time_before(jiffies, tfo_active_disable_stamp + timeout)) + timeout = multiplier * tfo_bh_timeout * HZ; + if (time_before(jiffies, sock_net(sk)->ipv4.tfo_active_disable_stamp + timeout)) return true; /* Mark check bit so we can check for successful active TFO @@ -457,27 +502,25 @@ bool tcp_fastopen_active_should_disable(struct sock *sk) void tcp_fastopen_active_disable_ofo_check(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); - struct rb_node *p; - struct sk_buff *skb; struct 
dst_entry *dst; + struct sk_buff *skb; if (!tp->syn_fastopen) return; if (!tp->data_segs_in) { - p = rb_first(&tp->out_of_order_queue); - if (p && !rb_next(p)) { - skb = rb_entry(p, struct sk_buff, rbnode); + skb = skb_rb_first(&tp->out_of_order_queue); + if (skb && !skb_rb_next(skb)) { if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) { tcp_fastopen_active_disable(sk); return; } } } else if (tp->syn_fastopen_ch && - atomic_read(&tfo_active_disable_times)) { + atomic_read(&sock_net(sk)->ipv4.tfo_active_disable_times)) { dst = sk_dst_get(sk); if (!(dst && dst->dev && (dst->dev->flags & IFF_LOOPBACK))) - tcp_fastopen_active_timeout_reset(); + atomic_set(&sock_net(sk)->ipv4.tfo_active_disable_times, 0); dst_release(dst); } } diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index c5d7656beeee..f844c06c0676 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. 
INET is implemented using the BSD Socket @@ -75,25 +76,10 @@ #include <linux/ipsec.h> #include <asm/unaligned.h> #include <linux/errqueue.h> +#include <trace/events/tcp.h> +#include <linux/static_key.h> -int sysctl_tcp_fack __read_mostly; -int sysctl_tcp_max_reordering __read_mostly = 300; -int sysctl_tcp_dsack __read_mostly = 1; -int sysctl_tcp_app_win __read_mostly = 31; -int sysctl_tcp_adv_win_scale __read_mostly = 1; -EXPORT_SYMBOL(sysctl_tcp_adv_win_scale); - -/* rfc5961 challenge ack rate limiting */ -int sysctl_tcp_challenge_ack_limit = 1000; - -int sysctl_tcp_stdurg __read_mostly; -int sysctl_tcp_rfc1337 __read_mostly; int sysctl_tcp_max_orphans __read_mostly = NR_FILE; -int sysctl_tcp_frto __read_mostly = 2; -int sysctl_tcp_min_rtt_wlen __read_mostly = 300; -int sysctl_tcp_moderate_rcvbuf __read_mostly = 1; -int sysctl_tcp_early_retrans __read_mostly = 3; -int sysctl_tcp_invalid_ratelimit __read_mostly = HZ/2; #define FLAG_DATA 0x01 /* Incoming frame contained data. */ #define FLAG_WIN_UPDATE 0x02 /* Incoming ACK was a window update. */ @@ -114,7 +100,7 @@ int sysctl_tcp_invalid_ratelimit __read_mostly = HZ/2; #define FLAG_ACKED (FLAG_DATA_ACKED|FLAG_SYN_ACKED) #define FLAG_NOT_DUP (FLAG_DATA|FLAG_WIN_UPDATE|FLAG_ACKED) -#define FLAG_CA_ALERT (FLAG_DATA_SACKED|FLAG_ECE) +#define FLAG_CA_ALERT (FLAG_DATA_SACKED|FLAG_ECE|FLAG_DSACKING_ACK) #define FLAG_FORWARD_PROGRESS (FLAG_ACKED|FLAG_DATA_SACKED) #define TCP_REMNANT (TCP_FLAG_FIN|TCP_FLAG_URG|TCP_FLAG_SYN|TCP_FLAG_PSH) @@ -334,7 +320,7 @@ static void tcp_sndbuf_expand(struct sock *sk) sndmem *= nr_segs * per_mss; if (sk->sk_sndbuf < sndmem) - sk->sk_sndbuf = min(sndmem, sysctl_tcp_wmem[2]); + sk->sk_sndbuf = min(sndmem, sock_net(sk)->ipv4.sysctl_tcp_wmem[2]); } /* 2. Tuning advertised window (window_clamp, rcv_ssthresh) @@ -367,8 +353,8 @@ static int __tcp_grow_window(const struct sock *sk, const struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); /* Optimize this! 
*/ - int truesize = tcp_win_from_space(skb->truesize) >> 1; - int window = tcp_win_from_space(sysctl_tcp_rmem[2]) >> 1; + int truesize = tcp_win_from_space(sk, skb->truesize) >> 1; + int window = tcp_win_from_space(sk, sock_net(sk)->ipv4.sysctl_tcp_rmem[2]) >> 1; while (tp->rcv_ssthresh <= window) { if (truesize <= skb->len) @@ -393,7 +379,7 @@ static void tcp_grow_window(struct sock *sk, const struct sk_buff *skb) /* Check #2. Increase window, if skb with such overhead * will fit to rcvbuf in future. */ - if (tcp_win_from_space(skb->truesize) <= skb->len) + if (tcp_win_from_space(sk, skb->truesize) <= skb->len) incr = 2 * tp->advmss; else incr = __tcp_grow_window(sk, skb); @@ -419,11 +405,11 @@ static void tcp_fixup_rcvbuf(struct sock *sk) /* Dynamic Right Sizing (DRS) has 2 to 3 RTT latency * Allow enough cushion so that sender is not limited by our window */ - if (sysctl_tcp_moderate_rcvbuf) + if (sock_net(sk)->ipv4.sysctl_tcp_moderate_rcvbuf) rcvmem <<= 2; if (sk->sk_rcvbuf < rcvmem) - sk->sk_rcvbuf = min(rcvmem, sysctl_tcp_rmem[2]); + sk->sk_rcvbuf = min(rcvmem, sock_net(sk)->ipv4.sysctl_tcp_rmem[2]); } /* 4. Try to fixup all. It is made immediately after connection enters @@ -431,6 +417,7 @@ static void tcp_fixup_rcvbuf(struct sock *sk) */ void tcp_init_buffer_space(struct sock *sk) { + int tcp_app_win = sock_net(sk)->ipv4.sysctl_tcp_app_win; struct tcp_sock *tp = tcp_sk(sk); int maxwin; @@ -449,14 +436,14 @@ void tcp_init_buffer_space(struct sock *sk) if (tp->window_clamp >= maxwin) { tp->window_clamp = maxwin; - if (sysctl_tcp_app_win && maxwin > 4 * tp->advmss) + if (tcp_app_win && maxwin > 4 * tp->advmss) tp->window_clamp = max(maxwin - - (maxwin >> sysctl_tcp_app_win), + (maxwin >> tcp_app_win), 4 * tp->advmss); } /* Force reservation of one segment. 
*/ - if (sysctl_tcp_app_win && + if (tcp_app_win && tp->window_clamp > 2 * tp->advmss && tp->window_clamp + tp->advmss > maxwin) tp->window_clamp = max(2 * tp->advmss, maxwin - tp->advmss); @@ -470,15 +457,16 @@ static void tcp_clamp_window(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); struct inet_connection_sock *icsk = inet_csk(sk); + struct net *net = sock_net(sk); icsk->icsk_ack.quick = 0; - if (sk->sk_rcvbuf < sysctl_tcp_rmem[2] && + if (sk->sk_rcvbuf < net->ipv4.sysctl_tcp_rmem[2] && !(sk->sk_userlocks & SOCK_RCVBUF_LOCK) && !tcp_under_memory_pressure(sk) && sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)) { sk->sk_rcvbuf = min(atomic_read(&sk->sk_rmem_alloc), - sysctl_tcp_rmem[2]); + net->ipv4.sysctl_tcp_rmem[2]); } if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf) tp->rcv_ssthresh = min(tp->window_clamp, 2U * tp->advmss); @@ -609,7 +597,7 @@ void tcp_rcv_space_adjust(struct sock *sk) * <prev RTT . ><current RTT .. ><next RTT .... > */ - if (sysctl_tcp_moderate_rcvbuf && + if (sock_net(sk)->ipv4.sysctl_tcp_moderate_rcvbuf && !(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) { int rcvwin, rcvmem, rcvbuf; @@ -633,10 +621,11 @@ void tcp_rcv_space_adjust(struct sock *sk) } rcvmem = SKB_TRUESIZE(tp->advmss + MAX_TCP_HEADER); - while (tcp_win_from_space(rcvmem) < tp->advmss) + while (tcp_win_from_space(sk, rcvmem) < tp->advmss) rcvmem += 128; - rcvbuf = min(rcvwin / tp->advmss * rcvmem, sysctl_tcp_rmem[2]); + rcvbuf = min(rcvwin / tp->advmss * rcvmem, + sock_net(sk)->ipv4.sysctl_tcp_rmem[2]); if (rcvbuf > sk->sk_rcvbuf) { sk->sk_rcvbuf = rcvbuf; @@ -780,15 +769,6 @@ static void tcp_rtt_estimator(struct sock *sk, long mrtt_us) tp->srtt_us = max(1U, srtt); } -/* Set the sk_pacing_rate to allow proper sizing of TSO packets. - * Note: TCP stack does not yet implement pacing. 
- * FQ packet scheduler can be used to implement cheap but effective - * TCP pacing, to smooth the burst on large writes when packets - * in flight is significantly lower than cwnd (or rwin) - */ -int sysctl_tcp_pacing_ss_ratio __read_mostly = 200; -int sysctl_tcp_pacing_ca_ratio __read_mostly = 120; - static void tcp_update_pacing_rate(struct sock *sk) { const struct tcp_sock *tp = tcp_sk(sk); @@ -806,21 +786,21 @@ static void tcp_update_pacing_rate(struct sock *sk) * end of slow start and should slow down. */ if (tp->snd_cwnd < tp->snd_ssthresh / 2) - rate *= sysctl_tcp_pacing_ss_ratio; + rate *= sock_net(sk)->ipv4.sysctl_tcp_pacing_ss_ratio; else - rate *= sysctl_tcp_pacing_ca_ratio; + rate *= sock_net(sk)->ipv4.sysctl_tcp_pacing_ca_ratio; rate *= max(tp->snd_cwnd, tp->packets_out); if (likely(tp->srtt_us)) do_div(rate, tp->srtt_us); - /* ACCESS_ONCE() is needed because sch_fq fetches sk_pacing_rate + /* WRITE_ONCE() is needed because sch_fq fetches sk_pacing_rate * without any lock. We want to make sure compiler wont store * intermediate values in this location. */ - ACCESS_ONCE(sk->sk_pacing_rate) = min_t(u64, rate, - sk->sk_max_pacing_rate); + WRITE_ONCE(sk->sk_pacing_rate, min_t(u64, rate, + sk->sk_max_pacing_rate)); } /* Calculate rto without backoff. 
This is the second half of Van Jacobson's @@ -862,60 +842,46 @@ __u32 tcp_init_cwnd(const struct tcp_sock *tp, const struct dst_entry *dst) return min_t(__u32, cwnd, tp->snd_cwnd_clamp); } -/* - * Packet counting of FACK is based on in-order assumptions, therefore TCP - * disables it when reordering is detected - */ -void tcp_disable_fack(struct tcp_sock *tp) -{ - /* RFC3517 uses different metric in lost marker => reset on change */ - if (tcp_is_fack(tp)) - tp->lost_skb_hint = NULL; - tp->rx_opt.sack_ok &= ~TCP_FACK_ENABLED; -} - /* Take a notice that peer is sending D-SACKs */ static void tcp_dsack_seen(struct tcp_sock *tp) { tp->rx_opt.sack_ok |= TCP_DSACK_SEEN; + tp->rack.dsack_seen = 1; } -static void tcp_update_reordering(struct sock *sk, const int metric, - const int ts) +/* It's reordering when higher sequence was delivered (i.e. sacked) before + * some lower never-retransmitted sequence ("low_seq"). The maximum reordering + * distance is approximated in full-mss packet distance ("reordering"). + */ +static void tcp_check_sack_reordering(struct sock *sk, const u32 low_seq, + const int ts) { struct tcp_sock *tp = tcp_sk(sk); - int mib_idx; + const u32 mss = tp->mss_cache; + u32 fack, metric; - if (WARN_ON_ONCE(metric < 0)) + fack = tcp_highest_sack_seq(tp); + if (!before(low_seq, fack)) return; - if (metric > tp->reordering) { - tp->reordering = min(sysctl_tcp_max_reordering, metric); - + metric = fack - low_seq; + if ((metric > tp->reordering * mss) && mss) { #if FASTRETRANS_DEBUG > 1 pr_debug("Disorder%d %d %u f%u s%u rr%d\n", tp->rx_opt.sack_ok, inet_csk(sk)->icsk_ca_state, tp->reordering, - tp->fackets_out, + 0, tp->sacked_out, tp->undo_marker ? tp->undo_retrans : 0); #endif - tcp_disable_fack(tp); + tp->reordering = min_t(u32, (metric + mss - 1) / mss, + sock_net(sk)->ipv4.sysctl_tcp_max_reordering); } tp->rack.reord = 1; - /* This exciting event is worth to be remembered. 
8) */ - if (ts) - mib_idx = LINUX_MIB_TCPTSREORDER; - else if (tcp_is_reno(tp)) - mib_idx = LINUX_MIB_TCPRENOREORDER; - else if (tcp_is_fack(tp)) - mib_idx = LINUX_MIB_TCPFACKREORDER; - else - mib_idx = LINUX_MIB_TCPSACKREORDER; - - NET_INC_STATS(sock_net(sk), mib_idx); + NET_INC_STATS(sock_net(sk), + ts ? LINUX_MIB_TCPTSREORDER : LINUX_MIB_TCPSACKREORDER); } /* This must be called before lost_out is incremented */ @@ -989,7 +955,6 @@ void tcp_skb_mark_lost_uncond_verify(struct tcp_sock *tp, struct sk_buff *skb) * 3. Loss detection event of two flavors: * A. Scoreboard estimator decided the packet is lost. * A'. Reno "three dupacks" marks head of queue lost. - * A''. Its FACK modification, head until snd.fack is lost. * B. SACK arrives sacking SND.NXT at the moment, when the * segment was retransmitted. * 4. D-SACK added new rule: D-SACK changes any tag to S. @@ -1132,8 +1097,7 @@ static bool tcp_check_dsack(struct sock *sk, const struct sk_buff *ack_skb, } struct tcp_sacktag_state { - int reord; - int fack_count; + u32 reord; /* Timestamps for earliest and latest never-retransmitted segment * that was SACKed. RTO needs the earliest RTT to stay conservative, * but congestion control should still get an accurate delay signal. @@ -1142,6 +1106,7 @@ struct tcp_sacktag_state { u64 last_sackt; struct rate_sample *rate; int flag; + unsigned int mss_now; }; /* Check if skb is fully within the SACK block. In presence of GSO skbs, @@ -1191,7 +1156,8 @@ static int tcp_match_skb_to_sack(struct sock *sk, struct sk_buff *skb, if (pkt_len >= skb->len && !in_sack) return 0; - err = tcp_fragment(sk, skb, pkt_len, mss, GFP_ATOMIC); + err = tcp_fragment(sk, TCP_FRAG_IN_RTX_QUEUE, skb, + pkt_len, mss, GFP_ATOMIC); if (err < 0) return err; } @@ -1207,15 +1173,15 @@ static u8 tcp_sacktag_one(struct sock *sk, u64 xmit_time) { struct tcp_sock *tp = tcp_sk(sk); - int fack_count = state->fack_count; /* Account D-SACK for retransmitted packet. 
*/ if (dup_sack && (sacked & TCPCB_RETRANS)) { if (tp->undo_marker && tp->undo_retrans > 0 && after(end_seq, tp->undo_marker)) tp->undo_retrans--; - if (sacked & TCPCB_SACKED_ACKED) - state->reord = min(fack_count, state->reord); + if ((sacked & TCPCB_SACKED_ACKED) && + before(start_seq, state->reord)) + state->reord = start_seq; } /* Nothing to do; acked frame is about to be dropped (was ACKed). */ @@ -1241,9 +1207,10 @@ static u8 tcp_sacktag_one(struct sock *sk, * which was in hole. It is reordering. */ if (before(start_seq, - tcp_highest_sack_seq(tp))) - state->reord = min(fack_count, - state->reord); + tcp_highest_sack_seq(tp)) && + before(start_seq, state->reord)) + state->reord = start_seq; + if (!after(end_seq, tp->high_seq)) state->flag |= FLAG_ORIG_SACK_ACKED; if (state->first_sackt == 0) @@ -1262,15 +1229,10 @@ static u8 tcp_sacktag_one(struct sock *sk, tp->sacked_out += pcount; tp->delivered += pcount; /* Out-of-order packets delivered */ - fack_count += pcount; - /* Lost marker hint past SACKed? Tweak RFC3517 cnt */ - if (!tcp_is_fack(tp) && tp->lost_skb_hint && + if (tp->lost_skb_hint && before(start_seq, TCP_SKB_CB(tp->lost_skb_hint)->seq)) tp->lost_cnt_hint += pcount; - - if (fack_count > tp->fackets_out) - tp->fackets_out = fack_count; } /* D-SACK. We can detect redundant retransmission in S|R and plain R @@ -1288,13 +1250,13 @@ static u8 tcp_sacktag_one(struct sock *sk, /* Shift newly-SACKed bytes from this skb to the immediately previous * already-SACKed sk_buff. Mark the newly-SACKed bytes as such. 
*/ -static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *skb, +static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *prev, + struct sk_buff *skb, struct tcp_sacktag_state *state, unsigned int pcount, int shifted, int mss, bool dup_sack) { struct tcp_sock *tp = tcp_sk(sk); - struct sk_buff *prev = tcp_write_queue_prev(sk, skb); u32 start_seq = TCP_SKB_CB(skb)->seq; /* start of newly-SACKed */ u32 end_seq = start_seq + shifted; /* end of newly-SACKed */ @@ -1363,8 +1325,7 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *skb, if (unlikely(TCP_SKB_CB(prev)->tx.delivered_mstamp)) TCP_SKB_CB(prev)->tx.delivered_mstamp = 0; - tcp_unlink_write_queue(skb, sk); - sk_wmem_free_skb(sk, skb); + tcp_rtx_queue_unlink_and_free(skb, sk); NET_INC_STATS(sock_net(sk), LINUX_MIB_SACKMERGED); @@ -1414,9 +1375,9 @@ static struct sk_buff *tcp_shift_skb_data(struct sock *sk, struct sk_buff *skb, goto fallback; /* Can only happen with delayed DSACK + discard craziness */ - if (unlikely(skb == tcp_write_queue_head(sk))) + prev = skb_rb_prev(skb); + if (!prev) goto fallback; - prev = tcp_write_queue_prev(sk, skb); if ((TCP_SKB_CB(prev)->sacked & TCPCB_TAGBITS) != TCPCB_SACKED_ACKED) goto fallback; @@ -1495,18 +1456,17 @@ static struct sk_buff *tcp_shift_skb_data(struct sock *sk, struct sk_buff *skb, if (!skb_shift(prev, skb, len)) goto fallback; - if (!tcp_shifted_skb(sk, skb, state, pcount, len, mss, dup_sack)) + if (!tcp_shifted_skb(sk, prev, skb, state, pcount, len, mss, dup_sack)) goto out; /* Hole filled allows collapsing with the next as well, this is very * useful when hole on every nth skb pattern happens */ - if (prev == tcp_write_queue_tail(sk)) + skb = skb_rb_next(prev); + if (!skb) goto out; - skb = tcp_write_queue_next(sk, prev); if (!skb_can_shift(skb) || - (skb == tcp_send_head(sk)) || ((TCP_SKB_CB(skb)->sacked & TCPCB_TAGBITS) != TCPCB_SACKED_ACKED) || (mss != tcp_skb_seglen(skb))) goto out; @@ -1514,11 +1474,11 @@ static struct sk_buff 
*tcp_shift_skb_data(struct sock *sk, struct sk_buff *skb, len = skb->len; if (skb_shift(prev, skb, len)) { pcount += tcp_skb_pcount(skb); - tcp_shifted_skb(sk, skb, state, tcp_skb_pcount(skb), len, mss, 0); + tcp_shifted_skb(sk, prev, skb, state, tcp_skb_pcount(skb), + len, mss, 0); } out: - state->fack_count += pcount; return prev; noop: @@ -1538,13 +1498,10 @@ static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk, struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *tmp; - tcp_for_write_queue_from(skb, sk) { + skb_rbtree_walk_from(skb) { int in_sack = 0; bool dup_sack = dup_sack_in; - if (skb == tcp_send_head(sk)) - break; - /* queue is in-order => we can short-circuit the walk early */ if (!before(TCP_SKB_CB(skb)->seq, end_seq)) break; @@ -1593,34 +1550,48 @@ static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk, tcp_skb_pcount(skb), skb->skb_mstamp); tcp_rate_skb_delivered(sk, skb, state->rate); + if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED) + list_del_init(&skb->tcp_tsorted_anchor); if (!before(TCP_SKB_CB(skb)->seq, tcp_highest_sack_seq(tp))) tcp_advance_highest_sack(sk, skb); } - - state->fack_count += tcp_skb_pcount(skb); } return skb; } -/* Avoid all extra work that is being done by sacktag while walking in - * a normal way - */ +static struct sk_buff *tcp_sacktag_bsearch(struct sock *sk, + struct tcp_sacktag_state *state, + u32 seq) +{ + struct rb_node *parent, **p = &sk->tcp_rtx_queue.rb_node; + struct sk_buff *skb; + + while (*p) { + parent = *p; + skb = rb_to_skb(parent); + if (before(seq, TCP_SKB_CB(skb)->seq)) { + p = &parent->rb_left; + continue; + } + if (!before(seq, TCP_SKB_CB(skb)->end_seq)) { + p = &parent->rb_right; + continue; + } + return skb; + } + return NULL; +} + static struct sk_buff *tcp_sacktag_skip(struct sk_buff *skb, struct sock *sk, struct tcp_sacktag_state *state, u32 skip_to_seq) { - tcp_for_write_queue_from(skb, sk) { - if (skb == tcp_send_head(sk)) - break; - - if 
(after(TCP_SKB_CB(skb)->end_seq, skip_to_seq)) - break; + if (skb && after(TCP_SKB_CB(skb)->seq, skip_to_seq)) + return skb; - state->fack_count += tcp_skb_pcount(skb); - } - return skb; + return tcp_sacktag_bsearch(sk, state, skip_to_seq); } static struct sk_buff *tcp_maybe_skipping_dsack(struct sk_buff *skb, @@ -1665,13 +1636,10 @@ tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb, int first_sack_index; state->flag = 0; - state->reord = tp->packets_out; + state->reord = tp->snd_nxt; - if (!tp->sacked_out) { - if (WARN_ON(tp->fackets_out)) - tp->fackets_out = 0; + if (!tp->sacked_out) tcp_highest_sack_reset(sk); - } found_dup_sack = tcp_check_dsack(sk, ack_skb, sp_wire, num_sacks, prior_snd_una); @@ -1742,8 +1710,8 @@ tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb, } } - skb = tcp_write_queue_head(sk); - state->fack_count = 0; + state->mss_now = tcp_current_mss(sk); + skb = NULL; i = 0; if (!tp->sacked_out) { @@ -1800,7 +1768,6 @@ tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb, skb = tcp_highest_sack(sk); if (!skb) break; - state->fack_count = tp->fackets_out; cache++; goto walk; } @@ -1815,7 +1782,6 @@ tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb, skb = tcp_highest_sack(sk); if (!skb) break; - state->fack_count = tp->fackets_out; } skb = tcp_sacktag_skip(skb, sk, state, start_seq); @@ -1835,9 +1801,8 @@ advance_sp: for (j = 0; j < used_sacks; j++) tp->recv_sack_cache[i++] = sp[j]; - if ((state->reord < tp->fackets_out) && - ((inet_csk(sk)->icsk_ca_state != TCP_CA_Loss) || tp->undo_marker)) - tcp_update_reordering(sk, tp->fackets_out - state->reord, 0); + if (inet_csk(sk)->icsk_ca_state != TCP_CA_Loss || tp->undo_marker) + tcp_check_sack_reordering(sk, state->reord, 0); tcp_verify_left_out(tp); out: @@ -1875,8 +1840,13 @@ static bool tcp_limit_reno_sacked(struct tcp_sock *tp) static void tcp_check_reno_reordering(struct sock *sk, const int addend) { struct tcp_sock *tp = 
tcp_sk(sk); - if (tcp_limit_reno_sacked(tp)) - tcp_update_reordering(sk, tp->packets_out + addend, 0); + + if (!tcp_limit_reno_sacked(tp)) + return; + + tp->reordering = min_t(u32, tp->packets_out + addend, + sock_net(sk)->ipv4.sysctl_tcp_max_reordering); + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRENOREORDER); } /* Emulate SACKs for SACKless connection: account for a new dupack. */ @@ -1922,7 +1892,6 @@ void tcp_clear_retrans(struct tcp_sock *tp) tp->lost_out = 0; tp->undo_marker = 0; tp->undo_retrans = -1; - tp->fackets_out = 0; tp->sacked_out = 0; } @@ -1967,19 +1936,15 @@ void tcp_enter_loss(struct sock *sk) if (tcp_is_reno(tp)) tcp_reset_reno_sack(tp); - skb = tcp_write_queue_head(sk); + skb = tcp_rtx_queue_head(sk); is_reneg = skb && (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED); if (is_reneg) { NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSACKRENEGING); tp->sacked_out = 0; - tp->fackets_out = 0; } tcp_clear_all_retrans_hints(tp); - tcp_for_write_queue(skb, sk) { - if (skb == tcp_send_head(sk)) - break; - + skb_rbtree_walk_from(skb) { mark_lost = (!(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED) || is_reneg); if (mark_lost) @@ -2013,7 +1978,7 @@ void tcp_enter_loss(struct sock *sk) * falsely raise the receive window, which results in repeated * timeouts and stop-and-go behavior. */ - tp->frto = sysctl_tcp_frto && + tp->frto = net->ipv4.sysctl_tcp_frto && (new_recovery || icsk->icsk_retransmits) && !inet_csk(sk)->icsk_mtup.probe_size; } @@ -2042,19 +2007,10 @@ static bool tcp_check_sack_reneging(struct sock *sk, int flag) return false; } -static inline int tcp_fackets_out(const struct tcp_sock *tp) -{ - return tcp_is_reno(tp) ? tp->sacked_out + 1 : tp->fackets_out; -} - /* Heurestics to calculate number of duplicate ACKs. There's no dupACKs * counter when SACK is enabled (without SACK, sacked_out is used for * that purpose). 
* - * Instead, with FACK TCP uses fackets_out that includes both SACKed - * segments up to the highest received SACK block so far and holes in - * between them. - * * With reordering, holes may still be in flight, so RFC3517 recovery * uses pure sacked_out (total number of SACKed segments) even though * it violates the RFC that uses duplicate ACKs, often these are equal @@ -2064,10 +2020,10 @@ static inline int tcp_fackets_out(const struct tcp_sock *tp) */ static inline int tcp_dupack_heuristics(const struct tcp_sock *tp) { - return tcp_is_fack(tp) ? tp->fackets_out : tp->sacked_out + 1; + return tp->sacked_out + 1; } -/* Linux NewReno/SACK/FACK/ECN state machine. +/* Linux NewReno/SACK/ECN state machine. * -------------------------------------- * * "Open" Normal state, no dubious events, fast path. @@ -2132,16 +2088,6 @@ static inline int tcp_dupack_heuristics(const struct tcp_sock *tp) * dynamically measured and adjusted. This is implemented in * tcp_rack_mark_lost. * - * FACK (Disabled by default. Subsumbed by RACK): - * It is the simplest heuristics. As soon as we decided - * that something is lost, we decide that _all_ not SACKed - * packets until the most forward SACK are lost. I.e. - * lost_out = fackets_out - sacked_out and left_out = fackets_out. - * It is absolutely correct estimate, if network does not reorder - * packets. And it loses any connection to reality when reordering - * takes place. We use FACK by default until reordering - * is suspected on the path to this destination. - * * If the receiver does not support SACK: * * NewReno (RFC6582): in Recovery we assume that one segment @@ -2190,7 +2136,7 @@ static bool tcp_time_to_recover(struct sock *sk, int flag) } /* Detect loss in event "A" above by marking head of queue up as lost. - * For FACK or non-SACK(Reno) senders, the first "packets" number of segments + * For non-SACK(Reno) senders, the first "packets" number of segments * are considered lost. 
For RFC3517 SACK, a segment is considered lost if it * has at least tp->reordering SACKed seqments above it; "packets" refers to * the maximum SACKed segments to pass before reaching this limit. @@ -2205,20 +2151,18 @@ static void tcp_mark_head_lost(struct sock *sk, int packets, int mark_head) const u32 loss_high = tcp_is_sack(tp) ? tp->snd_nxt : tp->high_seq; WARN_ON(packets > tp->packets_out); - if (tp->lost_skb_hint) { - skb = tp->lost_skb_hint; - cnt = tp->lost_cnt_hint; + skb = tp->lost_skb_hint; + if (skb) { /* Head already handled? */ - if (mark_head && skb != tcp_write_queue_head(sk)) + if (mark_head && after(TCP_SKB_CB(skb)->seq, tp->snd_una)) return; + cnt = tp->lost_cnt_hint; } else { - skb = tcp_write_queue_head(sk); + skb = tcp_rtx_queue_head(sk); cnt = 0; } - tcp_for_write_queue_from(skb, sk) { - if (skb == tcp_send_head(sk)) - break; + skb_rbtree_walk_from(skb) { /* TODO: do this better */ /* this is not the most efficient way to do this... */ tp->lost_skb_hint = skb; @@ -2228,12 +2172,12 @@ static void tcp_mark_head_lost(struct sock *sk, int packets, int mark_head) break; oldcnt = cnt; - if (tcp_is_fack(tp) || tcp_is_reno(tp) || + if (tcp_is_reno(tp) || (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) cnt += tcp_skb_pcount(skb); if (cnt > packets) { - if ((tcp_is_sack(tp) && !tcp_is_fack(tp)) || + if (tcp_is_sack(tp) || (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED) || (oldcnt >= packets)) break; @@ -2242,7 +2186,8 @@ static void tcp_mark_head_lost(struct sock *sk, int packets, int mark_head) /* If needed, chop off the prefix to mark as lost. 
*/ lost = (packets - oldcnt) * mss; if (lost < skb->len && - tcp_fragment(sk, skb, lost, mss, GFP_ATOMIC) < 0) + tcp_fragment(sk, TCP_FRAG_IN_RTX_QUEUE, skb, + lost, mss, GFP_ATOMIC) < 0) break; cnt = packets; } @@ -2263,11 +2208,6 @@ static void tcp_update_scoreboard(struct sock *sk, int fast_rexmit) if (tcp_is_reno(tp)) { tcp_mark_head_lost(sk, 1, 1); - } else if (tcp_is_fack(tp)) { - int lost = tp->fackets_out - tp->reordering; - if (lost <= 0) - lost = 1; - tcp_mark_head_lost(sk, lost, 0); } else { int sacked_upto = tp->sacked_out - tp->reordering; if (sacked_upto >= 0) @@ -2326,16 +2266,16 @@ static bool tcp_any_retrans_done(const struct sock *sk) if (tp->retrans_out) return true; - skb = tcp_write_queue_head(sk); + skb = tcp_rtx_queue_head(sk); if (unlikely(skb && TCP_SKB_CB(skb)->sacked & TCPCB_EVER_RETRANS)) return true; return false; } -#if FASTRETRANS_DEBUG > 1 static void DBGUNDO(struct sock *sk, const char *msg) { +#if FASTRETRANS_DEBUG > 1 struct tcp_sock *tp = tcp_sk(sk); struct inet_sock *inet = inet_sk(sk); @@ -2357,10 +2297,8 @@ static void DBGUNDO(struct sock *sk, const char *msg) tp->packets_out); } #endif -} -#else -#define DBGUNDO(x...) 
do { } while (0) #endif +} static void tcp_undo_cwnd_reduction(struct sock *sk, bool unmark_loss) { @@ -2369,9 +2307,7 @@ static void tcp_undo_cwnd_reduction(struct sock *sk, bool unmark_loss) if (unmark_loss) { struct sk_buff *skb; - tcp_for_write_queue(skb, sk) { - if (skb == tcp_send_head(sk)) - break; + skb_rbtree_walk(skb, &sk->tcp_rtx_queue) { TCP_SKB_CB(skb)->sacked &= ~TCPCB_LOST; } tp->lost_out = 0; @@ -2416,6 +2352,8 @@ static bool tcp_try_undo_recovery(struct sock *sk) mib_idx = LINUX_MIB_TCPFULLUNDO; NET_INC_STATS(sock_net(sk), mib_idx); + } else if (tp->rack.reo_wnd_persist) { + tp->rack.reo_wnd_persist--; } if (tp->snd_una == tp->high_seq && tcp_is_reno(tp)) { /* Hold old state until something *above* high_seq @@ -2435,6 +2373,8 @@ static bool tcp_try_undo_dsack(struct sock *sk) struct tcp_sock *tp = tcp_sk(sk); if (tp->undo_marker && !tp->undo_retrans) { + tp->rack.reo_wnd_persist = min(TCP_RACK_RECOVERY_THRESH, + tp->rack.reo_wnd_persist + 1); DBGUNDO(sk, "D-SACK"); tcp_undo_cwnd_reduction(sk, false); NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPDSACKUNDO); @@ -2614,11 +2554,8 @@ void tcp_simple_retransmit(struct sock *sk) struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; unsigned int mss = tcp_current_mss(sk); - u32 prior_lost = tp->lost_out; - tcp_for_write_queue(skb, sk) { - if (skb == tcp_send_head(sk)) - break; + skb_rbtree_walk(skb, &sk->tcp_rtx_queue) { if (tcp_skb_seglen(skb) > mss && !(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) { if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) { @@ -2631,7 +2568,7 @@ void tcp_simple_retransmit(struct sock *sk) tcp_clear_retrans_hints_partial(tp); - if (prior_lost == tp->lost_out) + if (!tp->lost_out) return; if (tcp_is_reno(tp)) @@ -2712,7 +2649,7 @@ static void tcp_process_loss(struct sock *sk, int flag, bool is_dupack, * is updated in tcp_ack()). Otherwise fall back to * the conventional recovery. 
*/ - if (tcp_send_head(sk) && + if (!tcp_write_queue_empty(sk) && after(tcp_wnd_end(tp), tp->snd_nxt)) { *rexmit = REXMIT_NEW; return; @@ -2739,15 +2676,15 @@ static void tcp_process_loss(struct sock *sk, int flag, bool is_dupack, } /* Undo during fast recovery after partial ACK. */ -static bool tcp_try_undo_partial(struct sock *sk, const int acked) +static bool tcp_try_undo_partial(struct sock *sk, u32 prior_snd_una) { struct tcp_sock *tp = tcp_sk(sk); if (tp->undo_marker && tcp_packet_delayed(tp)) { /* Plain luck! Hole if filled with delayed - * packet, rather than with a retransmit. + * packet, rather than with a retransmit. Check reordering. */ - tcp_update_reordering(sk, tcp_fackets_out(tp) + acked, 1); + tcp_check_sack_reordering(sk, prior_snd_una, 1); /* We are getting evidence that the reordering degree is higher * than we realized. If there are no retransmits out then we @@ -2774,7 +2711,7 @@ static void tcp_rack_identify_loss(struct sock *sk, int *ack_flag) struct tcp_sock *tp = tcp_sk(sk); /* Use RACK to detect loss */ - if (sysctl_tcp_recovery & TCP_RACK_LOSS_DETECTION) { + if (sock_net(sk)->ipv4.sysctl_tcp_recovery & TCP_RACK_LOSS_DETECTION) { u32 prior_retrans = tp->retrans_out; tcp_rack_mark_lost(sk); @@ -2783,6 +2720,14 @@ static void tcp_rack_identify_loss(struct sock *sk, int *ack_flag) } } +static bool tcp_force_fast_retransmit(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + + return after(tcp_highest_sack_seq(tp), + tp->snd_una + tp->reordering * tp->mss_cache); +} + /* Process an event, which can update packets-in-flight not trivially. * Main goal of this function is to calculate new estimate for left_out, * taking into account both packets sitting in receiver's buffer and @@ -2795,19 +2740,17 @@ static void tcp_rack_identify_loss(struct sock *sk, int *ack_flag) * It does _not_ decide what to send, it is made in function * tcp_xmit_retransmit_queue(). 
*/ -static void tcp_fastretrans_alert(struct sock *sk, const int acked, +static void tcp_fastretrans_alert(struct sock *sk, const u32 prior_snd_una, bool is_dupack, int *ack_flag, int *rexmit) { struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); int fast_rexmit = 0, flag = *ack_flag; bool do_lost = is_dupack || ((flag & FLAG_DATA_SACKED) && - (tcp_fackets_out(tp) > tp->reordering)); + tcp_force_fast_retransmit(sk)); - if (WARN_ON(!tp->packets_out && tp->sacked_out)) + if (!tp->packets_out && tp->sacked_out) tp->sacked_out = 0; - if (WARN_ON(!tp->sacked_out && tp->fackets_out)) - tp->fackets_out = 0; /* Now state machine starts. * A. ECE, hence prohibit cwnd undoing, the reduction is required. */ @@ -2854,11 +2797,11 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked, if (tcp_is_reno(tp) && is_dupack) tcp_add_reno_sack(sk); } else { - if (tcp_try_undo_partial(sk, acked)) + if (tcp_try_undo_partial(sk, prior_snd_una)) return; /* Partial ACK arrived. Force fast retransmit. */ do_lost = tcp_is_reno(tp) || - tcp_fackets_out(tp) > tp->reordering; + tcp_force_fast_retransmit(sk); } if (tcp_try_undo_dsack(sk)) { tcp_try_keep_open(sk); @@ -2873,6 +2816,7 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked, (*ack_flag & FLAG_LOST_RETRANS))) return; /* Change state if cwnd is undone or retransmits are lost */ + /* fall through */ default: if (tcp_is_reno(tp)) { if (flag & FLAG_SND_UNA_ADVANCED) @@ -2913,8 +2857,8 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked, static void tcp_update_rtt_min(struct sock *sk, u32 rtt_us) { + u32 wlen = sock_net(sk)->ipv4.sysctl_tcp_min_rtt_wlen * HZ; struct tcp_sock *tp = tcp_sk(sk); - u32 wlen = sysctl_tcp_min_rtt_wlen * HZ; minmax_running_min(&tp->rtt_min, wlen, tcp_jiffies32, rtt_us ? 
: jiffies_to_usecs(1)); @@ -3056,28 +3000,31 @@ static void tcp_ack_tstamp(struct sock *sk, struct sk_buff *skb, shinfo = skb_shinfo(skb); if (!before(shinfo->tskey, prior_snd_una) && - before(shinfo->tskey, tcp_sk(sk)->snd_una)) - __skb_tstamp_tx(skb, NULL, sk, SCM_TSTAMP_ACK); + before(shinfo->tskey, tcp_sk(sk)->snd_una)) { + tcp_skb_tsorted_save(skb) { + __skb_tstamp_tx(skb, NULL, sk, SCM_TSTAMP_ACK); + } tcp_skb_tsorted_restore(skb); + } } /* Remove acknowledged frames from the retransmission queue. If our packet * is before the ack sequence we can discard it as it's confirmed to have * arrived at the other end. */ -static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, - u32 prior_snd_una, int *acked, +static int tcp_clean_rtx_queue(struct sock *sk, u32 prior_fack, + u32 prior_snd_una, struct tcp_sacktag_state *sack) { const struct inet_connection_sock *icsk = inet_csk(sk); u64 first_ackt, last_ackt; struct tcp_sock *tp = tcp_sk(sk); u32 prior_sacked = tp->sacked_out; - u32 reord = tp->packets_out; + u32 reord = tp->snd_nxt; /* lowest acked un-retx un-sacked seq */ + struct sk_buff *skb, *next; bool fully_acked = true; long sack_rtt_us = -1L; long seq_rtt_us = -1L; long ca_rtt_us = -1L; - struct sk_buff *skb; u32 pkts_acked = 0; u32 last_in_flight = 0; bool rtt_update; @@ -3085,8 +3032,9 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, first_ackt = 0; - while ((skb = tcp_write_queue_head(sk)) && skb != tcp_send_head(sk)) { + for (skb = skb_rb_first(&sk->tcp_rtx_queue); skb; skb = next) { struct tcp_skb_cb *scb = TCP_SKB_CB(skb); + const u32 start_seq = scb->seq; u8 sacked = scb->sacked; u32 acked_pcount; @@ -3103,8 +3051,6 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, break; fully_acked = false; } else { - /* Speedup tcp_unlink_write_queue() and next loop */ - prefetchw(skb->next); acked_pcount = tcp_skb_pcount(skb); } @@ -3119,7 +3065,8 @@ static int tcp_clean_rtx_queue(struct sock *sk, int 
prior_fackets, first_ackt = last_ackt; last_in_flight = TCP_SKB_CB(skb)->tx.in_flight; - reord = min(pkts_acked, reord); + if (before(start_seq, reord)) + reord = start_seq; if (!after(scb->end_seq, tp->high_seq)) flag |= FLAG_ORIG_SACK_ACKED; } @@ -3156,12 +3103,12 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, if (!fully_acked) break; - tcp_unlink_write_queue(skb, sk); - sk_wmem_free_skb(sk, skb); + next = skb_rb_next(skb); if (unlikely(skb == tp->retransmit_skb_hint)) tp->retransmit_skb_hint = NULL; if (unlikely(skb == tp->lost_skb_hint)) tp->lost_skb_hint = NULL; + tcp_rtx_queue_unlink_and_free(skb, sk); } if (!skb) @@ -3197,16 +3144,12 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, int delta; /* Non-retransmitted hole got filled? That's reordering */ - if (reord < prior_fackets && reord <= tp->fackets_out) - tcp_update_reordering(sk, tp->fackets_out - reord, 0); + if (before(reord, prior_fack)) + tcp_check_sack_reordering(sk, reord, 0); - delta = tcp_is_fack(tp) ? pkts_acked : - prior_sacked - tp->sacked_out; + delta = prior_sacked - tp->sacked_out; tp->lost_cnt_hint -= min(tp->lost_cnt_hint, delta); } - - tp->fackets_out -= min(pkts_acked, tp->fackets_out); - } else if (skb && rtt_update && sack_rtt_us >= 0 && sack_rtt_us > tcp_stamp_us_delta(tp->tcp_mstamp, skb->skb_mstamp)) { /* Do not re-arm RTO if the sack RTT is measured from data sent @@ -3247,18 +3190,19 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, } } #endif - *acked = pkts_acked; return flag; } static void tcp_ack_probe(struct sock *sk) { - const struct tcp_sock *tp = tcp_sk(sk); struct inet_connection_sock *icsk = inet_csk(sk); + struct sk_buff *head = tcp_send_head(sk); + const struct tcp_sock *tp = tcp_sk(sk); /* Was it a usable window open? 
*/ - - if (!after(TCP_SKB_CB(tcp_send_head(sk))->end_seq, tcp_wnd_end(tp))) { + if (!head) + return; + if (!after(TCP_SKB_CB(head)->end_seq, tcp_wnd_end(tp))) { icsk->icsk_backoff = 0; inet_csk_clear_xmit_timer(sk, ICSK_TIME_PROBE0); /* Socket must be waked up by subsequent tcp_data_snd_check(). @@ -3378,7 +3322,7 @@ static int tcp_ack_update_window(struct sock *sk, const struct sk_buff *skb, u32 tp->pred_flags = 0; tcp_fast_path_check(sk); - if (tcp_send_head(sk)) + if (!tcp_write_queue_empty(sk)) tcp_slow_start_after_idle_check(sk); if (nwin > tp->max_window) { @@ -3399,7 +3343,7 @@ static bool __tcp_oow_rate_limited(struct net *net, int mib_idx, if (*last_oow_ack_time) { s32 elapsed = (s32)(tcp_jiffies32 - *last_oow_ack_time); - if (0 <= elapsed && elapsed < sysctl_tcp_invalid_ratelimit) { + if (0 <= elapsed && elapsed < net->ipv4.sysctl_tcp_invalid_ratelimit) { NET_INC_STATS(net, mib_idx); return true; /* rate-limited: don't send yet! */ } @@ -3435,10 +3379,11 @@ static void tcp_send_challenge_ack(struct sock *sk, const struct sk_buff *skb) static u32 challenge_timestamp; static unsigned int challenge_count; struct tcp_sock *tp = tcp_sk(sk); + struct net *net = sock_net(sk); u32 count, now; /* First check our per-socket dupack rate limit. */ - if (__tcp_oow_rate_limited(sock_net(sk), + if (__tcp_oow_rate_limited(net, LINUX_MIB_TCPACKSKIPPEDCHALLENGE, &tp->last_oow_ack_time)) return; @@ -3446,16 +3391,16 @@ static void tcp_send_challenge_ack(struct sock *sk, const struct sk_buff *skb) /* Then check host-wide RFC 5961 rate limit. 
*/ now = jiffies / HZ; if (now != challenge_timestamp) { - u32 half = (sysctl_tcp_challenge_ack_limit + 1) >> 1; + u32 ack_limit = net->ipv4.sysctl_tcp_challenge_ack_limit; + u32 half = (ack_limit + 1) >> 1; challenge_timestamp = now; - WRITE_ONCE(challenge_count, half + - prandom_u32_max(sysctl_tcp_challenge_ack_limit)); + WRITE_ONCE(challenge_count, half + prandom_u32_max(ack_limit)); } count = READ_ONCE(challenge_count); if (count > 0) { WRITE_ONCE(challenge_count, count - 1); - NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPCHALLENGEACK); + NET_INC_STATS(net, LINUX_MIB_TCPCHALLENGEACK); tcp_send_ack(sk); } } @@ -3553,18 +3498,17 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) u32 ack_seq = TCP_SKB_CB(skb)->seq; u32 ack = TCP_SKB_CB(skb)->ack_seq; bool is_dupack = false; - u32 prior_fackets; int prior_packets = tp->packets_out; u32 delivered = tp->delivered; u32 lost = tp->lost; - int acked = 0; /* Number of packets newly acked */ int rexmit = REXMIT_NONE; /* Flag to (re)transmit to recover losses */ + u32 prior_fack; sack_state.first_sackt = 0; sack_state.rate = &rs; - /* We very likely will need to access write queue head. */ - prefetchw(sk->sk_write_queue.next); + /* We very likely will need to access rtx queue. */ + prefetch(sk->tcp_rtx_queue.rb_node); /* If the ack is older than previous acks * then we can probably ignore it. @@ -3590,7 +3534,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) icsk->icsk_retransmits = 0; } - prior_fackets = tp->fackets_out; + prior_fack = tcp_is_sack(tp) ? tcp_highest_sack_seq(tp) : tp->snd_una; rs.prior_in_flight = tcp_packets_in_flight(tp); /* ts_recent update must be made after we are sure that the packet @@ -3646,8 +3590,9 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) goto no_queue; /* See if we can take anything off of the retransmit queue. 
*/ - flag |= tcp_clean_rtx_queue(sk, prior_fackets, prior_snd_una, &acked, - &sack_state); + flag |= tcp_clean_rtx_queue(sk, prior_fack, prior_snd_una, &sack_state); + + tcp_rack_update_reo_wnd(sk, &rs); if (tp->tlp_high_seq) tcp_process_tlp_ack(sk, ack, flag); @@ -3657,7 +3602,8 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) if (tcp_ack_is_dubious(sk, flag)) { is_dupack = !(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP)); - tcp_fastretrans_alert(sk, acked, is_dupack, &flag, &rexmit); + tcp_fastretrans_alert(sk, prior_snd_una, is_dupack, &flag, + &rexmit); } if ((flag & FLAG_FORWARD_PROGRESS) || !(flag & FLAG_NOT_DUP)) @@ -3673,13 +3619,13 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) no_queue: /* If data was DSACKed, see if we can undo a cwnd reduction. */ if (flag & FLAG_DSACKING_ACK) - tcp_fastretrans_alert(sk, acked, is_dupack, &flag, &rexmit); + tcp_fastretrans_alert(sk, prior_snd_una, is_dupack, &flag, + &rexmit); /* If this ack opens up a zero window, clear backoff. It was * being used to time the probes, and is probably far higher than * it needs to be for normal retransmission. 
*/ - if (tcp_send_head(sk)) - tcp_ack_probe(sk); + tcp_ack_probe(sk); if (tp->tlp_high_seq) tcp_process_tlp_ack(sk, ack, flag); @@ -3696,7 +3642,8 @@ old_ack: if (TCP_SKB_CB(skb)->sacked) { flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una, &sack_state); - tcp_fastretrans_alert(sk, acked, is_dupack, &flag, &rexmit); + tcp_fastretrans_alert(sk, prior_snd_una, is_dupack, &flag, + &rexmit); tcp_xmit_recovery(sk, rexmit); } @@ -3721,6 +3668,21 @@ static void tcp_parse_fastopen_option(int len, const unsigned char *cookie, foc->exp = exp_opt; } +static void smc_parse_options(const struct tcphdr *th, + struct tcp_options_received *opt_rx, + const unsigned char *ptr, + int opsize) +{ +#if IS_ENABLED(CONFIG_SMC) + if (static_branch_unlikely(&tcp_have_smc)) { + if (th->syn && !(opsize & 1) && + opsize >= TCPOLEN_EXP_SMC_BASE && + get_unaligned_be32(ptr) == TCPOPT_SMC_MAGIC) + opt_rx->smc_ok = 1; + } +#endif +} + /* Look for tcp options. Normally only called on SYN and SYNACK packets. * But, this can also be called on packets in the established flow when * the fast version below fails. @@ -3828,6 +3790,9 @@ void tcp_parse_options(const struct net *net, tcp_parse_fastopen_option(opsize - TCPOLEN_EXP_FASTOPEN_BASE, ptr + 2, th->syn, foc, true); + else + smc_parse_options(th, opt_rx, ptr, + opsize); break; } @@ -3995,6 +3960,8 @@ static inline bool tcp_sequence(const struct tcp_sock *tp, u32 seq, u32 end_seq) /* When we get a reset we do this. */ void tcp_reset(struct sock *sk) { + trace_tcp_receive_reset(sk); + /* We want the right error as BSD sees it (and indeed as we do). 
*/ switch (sk->sk_state) { case TCP_SYN_SENT: @@ -4117,7 +4084,7 @@ static void tcp_dsack_set(struct sock *sk, u32 seq, u32 end_seq) { struct tcp_sock *tp = tcp_sk(sk); - if (tcp_is_sack(tp) && sysctl_tcp_dsack) { + if (tcp_is_sack(tp) && sock_net(sk)->ipv4.sysctl_tcp_dsack) { int mib_idx; if (before(seq, tp->rcv_nxt)) @@ -4152,7 +4119,7 @@ static void tcp_send_dupack(struct sock *sk, const struct sk_buff *skb) NET_INC_STATS(sock_net(sk), LINUX_MIB_DELAYEDACKLOST); tcp_enter_quickack_mode(sk); - if (tcp_is_sack(tp) && sysctl_tcp_dsack) { + if (tcp_is_sack(tp) && sock_net(sk)->ipv4.sysctl_tcp_dsack) { u32 end_seq = TCP_SKB_CB(skb)->end_seq; if (after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) @@ -4268,11 +4235,6 @@ static void tcp_sack_remove(struct tcp_sock *tp) tp->rx_opt.num_sacks = num_sacks; } -enum tcp_queue { - OOO_QUEUE, - RCV_QUEUE, -}; - /** * tcp_try_coalesce - try to merge skb to prior one * @sk: socket @@ -4288,7 +4250,6 @@ enum tcp_queue { * Returns true if caller should free @from instead of queueing it */ static bool tcp_try_coalesce(struct sock *sk, - enum tcp_queue dest, struct sk_buff *to, struct sk_buff *from, bool *fragstolen) @@ -4313,10 +4274,7 @@ static bool tcp_try_coalesce(struct sock *sk, if (TCP_SKB_CB(from)->has_rxtstamp) { TCP_SKB_CB(to)->has_rxtstamp = true; - if (dest == OOO_QUEUE) - TCP_SKB_CB(to)->swtstamp = TCP_SKB_CB(from)->swtstamp; - else - to->tstamp = from->tstamp; + to->tstamp = from->tstamp; } return true; @@ -4341,7 +4299,7 @@ static void tcp_ofo_queue(struct sock *sk) p = rb_first(&tp->out_of_order_queue); while (p) { - skb = rb_entry(p, struct sk_buff, rbnode); + skb = rb_to_skb(p); if (after(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) break; @@ -4353,9 +4311,6 @@ static void tcp_ofo_queue(struct sock *sk) } p = rb_next(p); rb_erase(&skb->rbnode, &tp->out_of_order_queue); - /* Replace tstamp which was stomped by rbnode */ - if (TCP_SKB_CB(skb)->has_rxtstamp) - skb->tstamp = TCP_SKB_CB(skb)->swtstamp; if 
(unlikely(!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt))) { SOCK_DEBUG(sk, "ofo packet was already received\n"); @@ -4367,8 +4322,7 @@ static void tcp_ofo_queue(struct sock *sk) TCP_SKB_CB(skb)->end_seq); tail = skb_peek_tail(&sk->sk_receive_queue); - eaten = tail && tcp_try_coalesce(sk, RCV_QUEUE, - tail, skb, &fragstolen); + eaten = tail && tcp_try_coalesce(sk, tail, skb, &fragstolen); tcp_rcv_nxt_update(tp, TCP_SKB_CB(skb)->end_seq); fin = TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN; if (!eaten) @@ -4409,7 +4363,7 @@ static int tcp_try_rmem_schedule(struct sock *sk, struct sk_buff *skb, static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); - struct rb_node **p, *q, *parent; + struct rb_node **p, *parent; struct sk_buff *skb1; u32 seq, end_seq; bool fragstolen; @@ -4422,10 +4376,6 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb) return; } - /* Stash tstamp to avoid being stomped on by rbnode */ - if (TCP_SKB_CB(skb)->has_rxtstamp) - TCP_SKB_CB(skb)->swtstamp = skb->tstamp; - /* Disable header prediction. */ tp->pred_flags = 0; inet_csk_schedule_ack(sk); @@ -4453,7 +4403,7 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb) /* In the typical case, we are adding an skb to the end of the list. * Use of ooo_last_skb avoids the O(Log(N)) rbtree lookup. 
*/ - if (tcp_try_coalesce(sk, OOO_QUEUE, tp->ooo_last_skb, + if (tcp_try_coalesce(sk, tp->ooo_last_skb, skb, &fragstolen)) { coalesce_done: tcp_grow_window(sk, skb); @@ -4472,7 +4422,7 @@ coalesce_done: parent = NULL; while (*p) { parent = *p; - skb1 = rb_entry(parent, struct sk_buff, rbnode); + skb1 = rb_to_skb(parent); if (before(seq, TCP_SKB_CB(skb1)->seq)) { p = &parent->rb_left; continue; @@ -4504,7 +4454,7 @@ coalesce_done: __kfree_skb(skb1); goto merge_right; } - } else if (tcp_try_coalesce(sk, OOO_QUEUE, skb1, + } else if (tcp_try_coalesce(sk, skb1, skb, &fragstolen)) { goto coalesce_done; } @@ -4517,9 +4467,7 @@ insert: merge_right: /* Remove other segments covered by skb. */ - while ((q = rb_next(&skb->rbnode)) != NULL) { - skb1 = rb_entry(q, struct sk_buff, rbnode); - + while ((skb1 = skb_rb_next(skb)) != NULL) { if (!after(end_seq, TCP_SKB_CB(skb1)->seq)) break; if (before(end_seq, TCP_SKB_CB(skb1)->end_seq)) { @@ -4534,7 +4482,7 @@ merge_right: tcp_drop(sk, skb1); } /* If there is no skb after us, we are the last_skb ! */ - if (!q) + if (!skb1) tp->ooo_last_skb = skb; add_sack: @@ -4556,7 +4504,7 @@ static int __must_check tcp_queue_rcv(struct sock *sk, struct sk_buff *skb, int __skb_pull(skb, hdrlen); eaten = (tail && - tcp_try_coalesce(sk, RCV_QUEUE, tail, + tcp_try_coalesce(sk, tail, skb, fragstolen)) ? 1 : 0; tcp_rcv_nxt_update(tcp_sk(sk), TCP_SKB_CB(skb)->end_seq); if (!eaten) { @@ -4720,7 +4668,7 @@ static struct sk_buff *tcp_skb_next(struct sk_buff *skb, struct sk_buff_head *li if (list) return !skb_queue_is_last(list, skb) ? 
skb->next : NULL; - return rb_entry_safe(rb_next(&skb->rbnode), struct sk_buff, rbnode); + return skb_rb_next(skb); } static struct sk_buff *tcp_collapse_one(struct sock *sk, struct sk_buff *skb, @@ -4741,7 +4689,7 @@ static struct sk_buff *tcp_collapse_one(struct sock *sk, struct sk_buff *skb, } /* Insert skb into rb tree, ordered by TCP_SKB_CB(skb)->seq */ -static void tcp_rbtree_insert(struct rb_root *root, struct sk_buff *skb) +void tcp_rbtree_insert(struct rb_root *root, struct sk_buff *skb) { struct rb_node **p = &root->rb_node; struct rb_node *parent = NULL; @@ -4749,7 +4697,7 @@ static void tcp_rbtree_insert(struct rb_root *root, struct sk_buff *skb) while (*p) { parent = *p; - skb1 = rb_entry(parent, struct sk_buff, rbnode); + skb1 = rb_to_skb(parent); if (before(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb1)->seq)) p = &parent->rb_left; else @@ -4796,7 +4744,7 @@ restart: * overlaps to the next one. */ if (!(TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN | TCPHDR_FIN)) && - (tcp_win_from_space(skb->truesize) > skb->len || + (tcp_win_from_space(sk, skb->truesize) > skb->len || before(TCP_SKB_CB(skb)->seq, start))) { end_of_skbs = false; break; @@ -4868,26 +4816,19 @@ static void tcp_collapse_ofo_queue(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb, *head; - struct rb_node *p; u32 start, end; - p = rb_first(&tp->out_of_order_queue); - skb = rb_entry_safe(p, struct sk_buff, rbnode); + skb = skb_rb_first(&tp->out_of_order_queue); new_range: if (!skb) { - p = rb_last(&tp->out_of_order_queue); - /* Note: This is possible p is NULL here. We do not - * use rb_entry_safe(), as ooo_last_skb is valid only - * if rbtree is not empty. 
- */ - tp->ooo_last_skb = rb_entry(p, struct sk_buff, rbnode); + tp->ooo_last_skb = skb_rb_last(&tp->out_of_order_queue); return; } start = TCP_SKB_CB(skb)->seq; end = TCP_SKB_CB(skb)->end_seq; for (head = skb;;) { - skb = tcp_skb_next(skb, NULL); + skb = skb_rb_next(skb); /* Range is terminated when we see a gap or when * we are at the queue end. @@ -4930,14 +4871,14 @@ static bool tcp_prune_ofo_queue(struct sock *sk) do { prev = rb_prev(node); rb_erase(node, &tp->out_of_order_queue); - tcp_drop(sk, rb_entry(node, struct sk_buff, rbnode)); + tcp_drop(sk, rb_to_skb(node)); sk_mem_reclaim(sk); if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf && !tcp_under_memory_pressure(sk)) break; node = prev; } while (node); - tp->ooo_last_skb = rb_entry(prev, struct sk_buff, rbnode); + tp->ooo_last_skb = rb_to_skb(prev); /* Reset SACK state. A conforming SACK implementation will * do the same at a timeout based retransmit. When a connection @@ -5112,7 +5053,7 @@ static void tcp_check_urg(struct sock *sk, const struct tcphdr *th) struct tcp_sock *tp = tcp_sk(sk); u32 ptr = ntohs(th->urg_ptr); - if (ptr && !sysctl_tcp_stdurg) + if (ptr && !sock_net(sk)->ipv4.sysctl_tcp_stdurg) ptr--; ptr += ntohl(th->seq); @@ -5532,20 +5473,13 @@ void tcp_finish_connect(struct sock *sk, struct sk_buff *skb) security_inet_conn_established(sk, skb); } - /* Make sure socket is routed, for correct metrics. */ - icsk->icsk_af_ops->rebuild_header(sk); - - tcp_init_metrics(sk); - tcp_call_bpf(sk, BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB); - tcp_init_congestion_control(sk); + tcp_init_transfer(sk, BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB); /* Prevent spurious tcp_cwnd_restart() on first data * packet. 
*/ tp->lsndtime = tcp_jiffies32; - tcp_init_buffer_space(sk); - if (sock_flag(sk, SOCK_KEEPOPEN)) inet_csk_reset_keepalive_timer(sk, keepalive_time_when(tp)); @@ -5559,7 +5493,7 @@ static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack, struct tcp_fastopen_cookie *cookie) { struct tcp_sock *tp = tcp_sk(sk); - struct sk_buff *data = tp->syn_data ? tcp_write_queue_head(sk) : NULL; + struct sk_buff *data = tp->syn_data ? tcp_rtx_queue_head(sk) : NULL; u16 mss = tp->rx_opt.mss_clamp, try_exp = 0; bool syn_drop = false; @@ -5594,9 +5528,8 @@ static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack, tcp_fastopen_cache_set(sk, mss, cookie, syn_drop, try_exp); if (data) { /* Retransmit unacked data in SYN */ - tcp_for_write_queue_from(data, sk) { - if (data == tcp_send_head(sk) || - __tcp_retransmit_skb(sk, data, 1)) + skb_rbtree_walk_from(data) { + if (__tcp_retransmit_skb(sk, data, 1)) break; } tcp_rearm_rto(sk); @@ -5614,6 +5547,16 @@ static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack, return false; } +static void smc_check_reset_syn(struct tcp_sock *tp) +{ +#if IS_ENABLED(CONFIG_SMC) + if (static_branch_unlikely(&tcp_have_smc)) { + if (tp->syn_smc && !tp->rx_opt.smc_ok) + tp->syn_smc = 0; + } +#endif +} + static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, const struct tcphdr *th) { @@ -5709,10 +5652,6 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, tp->tcp_header_len = sizeof(struct tcphdr); } - if (tcp_is_sack(tp) && sysctl_tcp_fack) - tcp_enable_fack(tp); - - tcp_mtup_init(sk); tcp_sync_mss(sk, icsk->icsk_pmtu_cookie); tcp_initialize_rcv_mss(sk); @@ -5721,6 +5660,8 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, * is initialized. 
*/ tp->copied_seq = tp->rcv_nxt; + smc_check_reset_syn(tp); + smp_mb(); tcp_finish_connect(sk, skb); @@ -5938,15 +5879,18 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) if (req) { inet_csk(sk)->icsk_retransmits = 0; reqsk_fastopen_remove(sk, req, false); + /* Re-arm the timer because data may have been sent out. + * This is similar to the regular data transmission case + * when new data has just been ack'ed. + * + * (TFO) - we could try to be more aggressive and + * retransmitting any data sooner based on when they + * are sent out. + */ + tcp_rearm_rto(sk); } else { - /* Make sure socket is routed, for correct metrics. */ - icsk->icsk_af_ops->rebuild_header(sk); - tcp_call_bpf(sk, BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB); - tcp_init_congestion_control(sk); - - tcp_mtup_init(sk); + tcp_init_transfer(sk, BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB); tp->copied_seq = tp->rcv_nxt; - tcp_init_buffer_space(sk); } smp_mb(); tcp_set_state(sk, TCP_ESTABLISHED); @@ -5966,19 +5910,6 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) if (tp->rx_opt.tstamp_ok) tp->advmss -= TCPOLEN_TSTAMP_ALIGNED; - if (req) { - /* Re-arm the timer because data may have been sent out. - * This is similar to the regular data transmission case - * when new data has just been ack'ed. - * - * (TFO) - we could try to be more aggressive and - * retransmitting any data sooner based on when they - * are sent out. 
- */ - tcp_rearm_rto(sk); - } else - tcp_init_metrics(sk); - if (!inet_csk(sk)->icsk_ca_ops->cong_control) tcp_update_pacing_rate(sk); @@ -6075,6 +6006,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) case TCP_LAST_ACK: if (!before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) break; + /* fall through */ case TCP_FIN_WAIT1: case TCP_FIN_WAIT2: /* RFC 793 says to queue data in these states, @@ -6183,6 +6115,9 @@ static void tcp_openreq_init(struct request_sock *req, ireq->ir_rmt_port = tcp_hdr(skb)->source; ireq->ir_num = ntohs(tcp_hdr(skb)->dest); ireq->ir_mark = inet_request_mark(sk, skb); +#if IS_ENABLED(CONFIG_SMC) + ireq->smc_ok = rx_opt->smc_ok; +#endif } struct request_sock *inet_reqsk_alloc(const struct request_sock_ops *ops, @@ -6195,8 +6130,7 @@ struct request_sock *inet_reqsk_alloc(const struct request_sock_ops *ops, if (req) { struct inet_request_sock *ireq = inet_rsk(req); - kmemcheck_annotate_bitfield(ireq, flags); - ireq->opt = NULL; + ireq->ireq_opt = NULL; #if IS_ENABLED(CONFIG_IPV6) ireq->pktopts = NULL; #endif @@ -6358,7 +6292,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, tcp_openreq_init_rwin(req, sk, dst); if (!want_cookie) { tcp_reqsk_record_syn(sk, req, skb); - fastopen_sk = tcp_try_fastopen(sk, skb, req, &foc); + fastopen_sk = tcp_try_fastopen(sk, skb, req, &foc, dst); } if (fastopen_sk) { af_ops->send_synack(fastopen_sk, dst, &fl, req, diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index d9416b5162bc..c6bc0c4d19c6 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -85,6 +85,8 @@ #include <crypto/hash.h> #include <linux/scatterlist.h> +#include <trace/events/tcp.h> + #ifdef CONFIG_TCP_MD5SIG static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key, __be32 daddr, __be32 saddr, const struct tcphdr *th); @@ -480,7 +482,7 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info) TCP_TIMEOUT_INIT; icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX); - skb = 
tcp_write_queue_head(sk); + skb = tcp_rtx_queue_head(sk); BUG_ON(!skb); tcp_mstamp_refresh(tp); @@ -701,8 +703,10 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb) * routing might fail in this case. No choice here, if we choose to force * input interface, we will misroute in case of asymmetric route. */ - if (sk) + if (sk) { arg.bound_dev_if = sk->sk_bound_dev_if; + trace_tcp_send_reset(sk, skb); + } BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) != offsetof(struct inet_timewait_sock, tw_bound_dev_if)); @@ -877,7 +881,7 @@ static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst, err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr, ireq->ir_rmt_addr, - ireq->opt); + ireq_opt_deref(ireq)); err = net_xmit_eval(err); } @@ -889,7 +893,7 @@ static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst, */ static void tcp_v4_reqsk_destructor(struct request_sock *req) { - kfree(inet_rsk(req)->opt); + kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1)); } #ifdef CONFIG_TCP_MD5SIG @@ -1265,10 +1269,11 @@ static void tcp_v4_init_req(struct request_sock *req, struct sk_buff *skb) { struct inet_request_sock *ireq = inet_rsk(req); + struct net *net = sock_net(sk_listener); sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr); sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr); - ireq->opt = tcp_v4_save_options(sock_net(sk_listener), skb); + RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb)); } static struct dst_entry *tcp_v4_route_req(const struct sock *sk, @@ -1355,10 +1360,9 @@ struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb, sk_daddr_set(newsk, ireq->ir_rmt_addr); sk_rcv_saddr_set(newsk, ireq->ir_loc_addr); newsk->sk_bound_dev_if = ireq->ir_iif; - newinet->inet_saddr = ireq->ir_loc_addr; - inet_opt = ireq->opt; - rcu_assign_pointer(newinet->inet_opt, inet_opt); - ireq->opt = NULL; + newinet->inet_saddr = ireq->ir_loc_addr; + inet_opt = 
rcu_dereference(ireq->ireq_opt); + RCU_INIT_POINTER(newinet->inet_opt, inet_opt); newinet->mc_index = inet_iif(skb); newinet->mc_ttl = ip_hdr(skb)->ttl; newinet->rcv_tos = ip_hdr(skb)->tos; @@ -1403,9 +1407,12 @@ struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb, if (__inet_inherit_port(sk, newsk) < 0) goto put_and_exit; *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash)); - if (*own_req) + if (likely(*own_req)) { tcp_move_syn(newtp, req); - + ireq->ireq_opt = NULL; + } else { + newinet->inet_opt = NULL; + } return newsk; exit_overflow: @@ -1416,6 +1423,7 @@ exit: tcp_listendrop(sk); return NULL; put_and_exit: + newinet->inet_opt = NULL; inet_csk_prepare_forced_close(newsk); tcp_done(newsk); goto exit; @@ -1503,23 +1511,23 @@ csum_err: } EXPORT_SYMBOL(tcp_v4_do_rcv); -void tcp_v4_early_demux(struct sk_buff *skb) +int tcp_v4_early_demux(struct sk_buff *skb) { const struct iphdr *iph; const struct tcphdr *th; struct sock *sk; if (skb->pkt_type != PACKET_HOST) - return; + return 0; if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr))) - return; + return 0; iph = ip_hdr(skb); th = tcp_hdr(skb); if (th->doff < sizeof(struct tcphdr) / 4) - return; + return 0; sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo, iph->saddr, th->source, @@ -1538,6 +1546,7 @@ void tcp_v4_early_demux(struct sk_buff *skb) skb_dst_set_noref(skb, dst); } } + return 0; } bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb) @@ -1778,8 +1787,9 @@ do_time_wait: refcounted = false; goto process; } - /* Fall through to ACK */ } + /* to ACK */ + /* fall through */ case TCP_TW_ACK: tcp_v4_timewait_ack(sk, skb); break; @@ -1859,6 +1869,8 @@ void tcp_v4_destroy_sock(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); + trace_tcp_destroy_sock(sk); + tcp_clear_xmit_timers(sk); tcp_cleanup_congestion_control(sk); @@ -1891,6 +1903,7 @@ void tcp_v4_destroy_sock(struct sock *sk) /* If socket is aborted during connect operation */ 
tcp_free_fastopen_req(tp); + tcp_fastopen_destroy_cipher(sk); tcp_saved_syn_free(tp); sk_sockets_allocated_dec(sk); @@ -2396,8 +2409,8 @@ struct proto tcp_prot = { .memory_allocated = &tcp_memory_allocated, .memory_pressure = &tcp_memory_pressure, .sysctl_mem = sysctl_tcp_mem, - .sysctl_wmem = sysctl_tcp_wmem, - .sysctl_rmem = sysctl_tcp_rmem, + .sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_tcp_wmem), + .sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_tcp_rmem), .max_header = MAX_TCP_HEADER, .obj_size = sizeof(struct tcp_sock), .slab_flags = SLAB_TYPESAFE_BY_RCU, @@ -2417,6 +2430,8 @@ static void __net_exit tcp_sk_exit(struct net *net) { int cpu; + module_put(net->ipv4.tcp_congestion_control->owner); + for_each_possible_cpu(cpu) inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu)); free_percpu(net->ipv4.tcp_sk); @@ -2471,6 +2486,50 @@ static int __net_init tcp_sk_init(struct net *net) net->ipv4.sysctl_tcp_sack = 1; net->ipv4.sysctl_tcp_window_scaling = 1; net->ipv4.sysctl_tcp_timestamps = 1; + net->ipv4.sysctl_tcp_early_retrans = 3; + net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION; + net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior. */ + net->ipv4.sysctl_tcp_retrans_collapse = 1; + net->ipv4.sysctl_tcp_max_reordering = 300; + net->ipv4.sysctl_tcp_dsack = 1; + net->ipv4.sysctl_tcp_app_win = 31; + net->ipv4.sysctl_tcp_adv_win_scale = 1; + net->ipv4.sysctl_tcp_frto = 2; + net->ipv4.sysctl_tcp_moderate_rcvbuf = 1; + /* This limits the percentage of the congestion window which we + * will allow a single TSO frame to consume. Building TSO frames + * which are too large can cause TCP streams to be bursty. 
+ */ + net->ipv4.sysctl_tcp_tso_win_divisor = 3; + /* Default TSQ limit of four TSO segments */ + net->ipv4.sysctl_tcp_limit_output_bytes = 262144; + /* rfc5961 challenge ack rate limiting */ + net->ipv4.sysctl_tcp_challenge_ack_limit = 1000; + net->ipv4.sysctl_tcp_min_tso_segs = 2; + net->ipv4.sysctl_tcp_min_rtt_wlen = 300; + net->ipv4.sysctl_tcp_autocorking = 1; + net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2; + net->ipv4.sysctl_tcp_pacing_ss_ratio = 200; + net->ipv4.sysctl_tcp_pacing_ca_ratio = 120; + if (net != &init_net) { + memcpy(net->ipv4.sysctl_tcp_rmem, + init_net.ipv4.sysctl_tcp_rmem, + sizeof(init_net.ipv4.sysctl_tcp_rmem)); + memcpy(net->ipv4.sysctl_tcp_wmem, + init_net.ipv4.sysctl_tcp_wmem, + sizeof(init_net.ipv4.sysctl_tcp_wmem)); + } + net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE; + spin_lock_init(&net->ipv4.tcp_fastopen_ctx_lock); + net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 60 * 60; + atomic_set(&net->ipv4.tfo_active_disable_times, 0); + + /* Reno is always built in */ + if (!net_eq(net, &init_net) && + try_module_get(init_net.ipv4.tcp_congestion_control->owner)) + net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control; + else + net->ipv4.tcp_congestion_control = &tcp_reno; return 0; fail: @@ -2481,7 +2540,12 @@ fail: static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list) { + struct net *net; + inet_twsk_purge(&tcp_hashinfo, AF_INET); + + list_for_each_entry(net, net_exit_list, exit_list) + tcp_fastopen_ctx_destroy(net); } static struct pernet_operations __net_initdata tcp_sk_ops = { diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c index 102b2c90bb80..7097f92d16e5 100644 --- a/net/ipv4/tcp_metrics.c +++ b/net/ipv4/tcp_metrics.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #include <linux/rcupdate.h> #include <linux/spinlock.h> #include <linux/jiffies.h> @@ -20,8 +21,6 @@ #include <net/tcp.h> #include <net/genetlink.h> -int sysctl_tcp_nometrics_save __read_mostly; - static struct 
tcp_metrics_block *__tcp_get_metrics(const struct inetpeer_addr *saddr, const struct inetpeer_addr *daddr, struct net *net, unsigned int hash); @@ -330,7 +329,7 @@ void tcp_update_metrics(struct sock *sk) int m; sk_dst_confirm(sk); - if (sysctl_tcp_nometrics_save || !dst) + if (net->ipv4.sysctl_tcp_nometrics_save || !dst) return; rcu_read_lock(); @@ -471,10 +470,8 @@ void tcp_init_metrics(struct sock *sk) tp->snd_ssthresh = TCP_INFINITE_SSTHRESH; } val = tcp_metric_get(tm, TCP_METRIC_REORDERING); - if (val && tp->reordering != val) { - tcp_disable_fack(tp); + if (val && tp->reordering != val) tp->reordering = val; - } crtt = tcp_metric_get(tm, TCP_METRIC_RTT); rcu_read_unlock(); @@ -892,10 +889,14 @@ static void tcp_metrics_flush_all(struct net *net) for (row = 0; row < max_rows; row++, hb++) { struct tcp_metrics_block __rcu **pp; + bool match; + spin_lock_bh(&tcp_metrics_lock); pp = &hb->chain; for (tm = deref_locked(*pp); tm; tm = deref_locked(*pp)) { - if (net_eq(tm_net(tm), net)) { + match = net ? 
net_eq(tm_net(tm), net) : + !atomic_read(&tm_net(tm)->count); + if (match) { *pp = tm->tcpm_next; kfree_rcu(tm, rcu_head); } else { @@ -1018,14 +1019,14 @@ static int __net_init tcp_net_metrics_init(struct net *net) return 0; } -static void __net_exit tcp_net_metrics_exit(struct net *net) +static void __net_exit tcp_net_metrics_exit_batch(struct list_head *net_exit_list) { - tcp_metrics_flush_all(net); + tcp_metrics_flush_all(NULL); } static __net_initdata struct pernet_operations tcp_net_metrics_ops = { - .init = tcp_net_metrics_init, - .exit = tcp_net_metrics_exit, + .init = tcp_net_metrics_init, + .exit_batch = tcp_net_metrics_exit_batch, }; void __init tcp_metrics_init(void) diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 188a6f31356d..e36eff0403f4 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -23,13 +23,12 @@ #include <linux/slab.h> #include <linux/sysctl.h> #include <linux/workqueue.h> +#include <linux/static_key.h> #include <net/tcp.h> #include <net/inet_common.h> #include <net/xfrm.h> #include <net/busy_poll.h> -int sysctl_tcp_abort_on_overflow __read_mostly; - static bool tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win) { if (seq == s_win) @@ -180,7 +179,7 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb, * Oh well... nobody has a sufficient solution to this * protocol bug yet. 
*/ - if (sysctl_tcp_rfc1337 == 0) { + if (twsk_net(tw)->ipv4.sysctl_tcp_rfc1337 == 0) { kill: inet_twsk_deschedule_put(tw); return TCP_TW_SUCCESS; @@ -298,8 +297,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo) key = tp->af_specific->md5_lookup(sk, sk); if (key) { tcptw->tw_md5_key = kmemdup(key, sizeof(*key), GFP_ATOMIC); - if (tcptw->tw_md5_key && !tcp_alloc_md5sig_pool()) - BUG(); + BUG_ON(tcptw->tw_md5_key && !tcp_alloc_md5sig_pool()); } } while (0); #endif @@ -371,7 +369,7 @@ void tcp_openreq_init_rwin(struct request_sock *req, full_space = rcv_wnd * mss; /* tcp_full_space because it is guaranteed to be the first packet */ - tcp_select_initial_window(full_space, + tcp_select_initial_window(sk_listener, full_space, mss - (ireq->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0), &req->rsk_rcv_wnd, &req->rsk_window_clamp, @@ -417,6 +415,21 @@ void tcp_ca_openreq_child(struct sock *sk, const struct dst_entry *dst) } EXPORT_SYMBOL_GPL(tcp_ca_openreq_child); +static void smc_check_reset_syn_req(struct tcp_sock *oldtp, + struct request_sock *req, + struct tcp_sock *newtp) +{ +#if IS_ENABLED(CONFIG_SMC) + struct inet_request_sock *ireq; + + if (static_branch_unlikely(&tcp_have_smc)) { + ireq = inet_rsk(req); + if (oldtp->syn_smc && !ireq->smc_ok) + newtp->syn_smc = 0; + } +#endif +} + /* This is not only more efficient than what we used to do, it eliminates * a lot of code duplication between IPv4/IPv6 SYN recv processing. 
-DaveM * @@ -434,6 +447,9 @@ struct sock *tcp_create_openreq_child(const struct sock *sk, struct tcp_request_sock *treq = tcp_rsk(req); struct inet_connection_sock *newicsk = inet_csk(newsk); struct tcp_sock *newtp = tcp_sk(newsk); + struct tcp_sock *oldtp = tcp_sk(sk); + + smc_check_reset_syn_req(oldtp, req, newtp); /* Now setup tcp_sock */ newtp->pred_flags = 0; @@ -446,6 +462,7 @@ struct sock *tcp_create_openreq_child(const struct sock *sk, newtp->snd_nxt = newtp->snd_up = treq->snt_isn + 1; INIT_LIST_HEAD(&newtp->tsq_node); + INIT_LIST_HEAD(&newtp->tsorted_sent_queue); tcp_init_wl(newtp, treq->rcv_isn); @@ -458,7 +475,6 @@ struct sock *tcp_create_openreq_child(const struct sock *sk, newtp->packets_out = 0; newtp->retrans_out = 0; newtp->sacked_out = 0; - newtp->fackets_out = 0; newtp->snd_ssthresh = TCP_INFINITE_SSTHRESH; newtp->tlp_high_seq = 0; newtp->lsndtime = tcp_jiffies32; @@ -492,10 +508,7 @@ struct sock *tcp_create_openreq_child(const struct sock *sk, keepalive_time_when(newtp)); newtp->rx_opt.tstamp_ok = ireq->tstamp_ok; - if ((newtp->rx_opt.sack_ok = ireq->sack_ok) != 0) { - if (sysctl_tcp_fack) - tcp_enable_fack(newtp); - } + newtp->rx_opt.sack_ok = ireq->sack_ok; newtp->window_clamp = req->rsk_window_clamp; newtp->rcv_ssthresh = req->rsk_rcv_wnd; newtp->rcv_wnd = req->rsk_rcv_wnd; @@ -534,6 +547,10 @@ struct sock *tcp_create_openreq_child(const struct sock *sk, newtp->syn_data_acked = 0; newtp->rack.mstamp = 0; newtp->rack.advanced = 0; + newtp->rack.reo_wnd_steps = 1; + newtp->rack.last_delivered = 0; + newtp->rack.reo_wnd_persist = 0; + newtp->rack.dsack_seen = 0; __TCP_INC_STATS(sock_net(sk), TCP_MIB_PASSIVEOPENS); } @@ -764,7 +781,7 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, return inet_csk_complete_hashdance(sk, child, req, own_req); listen_overflow: - if (!sysctl_tcp_abort_on_overflow) { + if (!sock_net(sk)->ipv4.sysctl_tcp_abort_on_overflow) { inet_rsk(req)->acked = 1; return NULL; } diff --git a/net/ipv4/tcp_nv.c 
b/net/ipv4/tcp_nv.c index 1ff73982e28c..0b5a05bd82e3 100644 --- a/net/ipv4/tcp_nv.c +++ b/net/ipv4/tcp_nv.c @@ -39,7 +39,7 @@ * nv_cong_dec_mult Decrease cwnd by X% (30%) of congestion when detected * nv_ssthresh_factor On congestion set ssthresh to this * <desired cwnd> / 8 * nv_rtt_factor RTT averaging factor - * nv_loss_dec_factor Decrease cwnd by this (50%) when losses occur + * nv_loss_dec_factor Decrease cwnd to this (80%) when losses occur * nv_dec_eval_min_calls Wait this many RTT measurements before dec cwnd * nv_inc_eval_min_calls Wait this many RTT measurements before inc cwnd * nv_ssthresh_eval_min_calls Wait this many RTT measurements before stopping @@ -61,7 +61,7 @@ static int nv_min_cwnd __read_mostly = 2; static int nv_cong_dec_mult __read_mostly = 30 * 128 / 100; /* = 30% */ static int nv_ssthresh_factor __read_mostly = 8; /* = 1 */ static int nv_rtt_factor __read_mostly = 128; /* = 1/2*old + 1/2*new */ -static int nv_loss_dec_factor __read_mostly = 512; /* => 50% */ +static int nv_loss_dec_factor __read_mostly = 819; /* => 80% */ static int nv_cwnd_growth_rate_neg __read_mostly = 8; static int nv_cwnd_growth_rate_pos __read_mostly; /* 0 => fixed like Reno */ static int nv_dec_eval_min_calls __read_mostly = 60; @@ -101,6 +101,11 @@ struct tcpnv { u32 nv_last_rtt; /* last rtt */ u32 nv_min_rtt; /* active min rtt. Used to determine slope */ u32 nv_min_rtt_new; /* min rtt for future use */ + u32 nv_base_rtt; /* If non-zero it represents the threshold for + * congestion */ + u32 nv_lower_bound_rtt; /* Used in conjunction with nv_base_rtt. It is + * set to 80% of nv_base_rtt. 
It helps reduce + * unfairness between flows */ u32 nv_rtt_max_rate; /* max rate seen during current RTT */ u32 nv_rtt_start_seq; /* current RTT ends when packet arrives * acking beyond nv_rtt_start_seq */ @@ -132,9 +137,24 @@ static inline void tcpnv_reset(struct tcpnv *ca, struct sock *sk) static void tcpnv_init(struct sock *sk) { struct tcpnv *ca = inet_csk_ca(sk); + int base_rtt; tcpnv_reset(ca, sk); + /* See if base_rtt is available from socket_ops bpf program. + * It is meant to be used in environments, such as communication + * within a datacenter, where we have reasonable estimates of + * RTTs + */ + base_rtt = tcp_call_bpf(sk, BPF_SOCK_OPS_BASE_RTT); + if (base_rtt > 0) { + ca->nv_base_rtt = base_rtt; + ca->nv_lower_bound_rtt = (base_rtt * 205) >> 8; /* 80% */ + } else { + ca->nv_base_rtt = 0; + ca->nv_lower_bound_rtt = 0; + } + ca->nv_allow_cwnd_growth = 1; ca->nv_min_rtt_reset_jiffies = jiffies + 2 * HZ; ca->nv_min_rtt = NV_INIT_RTT; @@ -144,6 +164,19 @@ static void tcpnv_init(struct sock *sk) ca->cwnd_growth_factor = 0; } +/* If provided, apply upper (base_rtt) and lower (lower_bound_rtt) + * bounds to RTT. 
+ */ +inline u32 nv_get_bounded_rtt(struct tcpnv *ca, u32 val) +{ + if (ca->nv_lower_bound_rtt > 0 && val < ca->nv_lower_bound_rtt) + return ca->nv_lower_bound_rtt; + else if (ca->nv_base_rtt > 0 && val > ca->nv_base_rtt) + return ca->nv_base_rtt; + else + return val; +} + static void tcpnv_cong_avoid(struct sock *sk, u32 ack, u32 acked) { struct tcp_sock *tp = tcp_sk(sk); @@ -209,7 +242,7 @@ static void tcpnv_acked(struct sock *sk, const struct ack_sample *sample) struct tcp_sock *tp = tcp_sk(sk); struct tcpnv *ca = inet_csk_ca(sk); unsigned long now = jiffies; - s64 rate64 = 0; + u64 rate64; u32 rate, max_win, cwnd_by_slope; u32 avg_rtt; u32 bytes_acked = 0; @@ -251,8 +284,9 @@ static void tcpnv_acked(struct sock *sk, const struct ack_sample *sample) } /* rate in 100's bits per second */ - rate64 = ((u64)sample->in_flight) * 8000000; - rate = (u32)div64_u64(rate64, (u64)(avg_rtt * 100)); + rate64 = ((u64)sample->in_flight) * 80000; + do_div(rate64, avg_rtt ?: 1); + rate = (u32)rate64; /* Remember the maximum rate seen during this RTT * Note: It may be more than one RTT. This function should be @@ -265,6 +299,9 @@ static void tcpnv_acked(struct sock *sk, const struct ack_sample *sample) if (ca->nv_eval_call_cnt < 255) ca->nv_eval_call_cnt++; + /* Apply bounds to rtt. 
Only used to update min_rtt */ + avg_rtt = nv_get_bounded_rtt(ca, avg_rtt); + /* update min rtt if necessary */ if (avg_rtt < ca->nv_min_rtt) ca->nv_min_rtt = avg_rtt; diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c index 11f69bbf9307..b6a2aa1dcf56 100644 --- a/net/ipv4/tcp_offload.c +++ b/net/ipv4/tcp_offload.c @@ -149,11 +149,19 @@ struct sk_buff *tcp_gso_segment(struct sk_buff *skb, * is freed by GSO engine */ if (copy_destructor) { + int delta; + swap(gso_skb->sk, skb->sk); swap(gso_skb->destructor, skb->destructor); sum_truesize += skb->truesize; - refcount_add(sum_truesize - gso_skb->truesize, - &skb->sk->sk_wmem_alloc); + delta = sum_truesize - gso_skb->truesize; + /* In some pathological cases, delta can be negative. + * We need to either use refcount_add() or refcount_sub_and_test() + */ + if (likely(delta >= 0)) + refcount_add(delta, &skb->sk->sk_wmem_alloc); + else + WARN_ON_ONCE(refcount_sub_and_test(-delta, &skb->sk->sk_wmem_alloc)); } delta = htonl(oldlen + (skb_tail_pointer(skb) - diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 1c839c99114c..540b7d92cc70 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -41,40 +41,25 @@ #include <linux/compiler.h> #include <linux/gfp.h> #include <linux/module.h> +#include <linux/static_key.h> -/* People can turn this off for buggy TCP's found in printers etc. */ -int sysctl_tcp_retrans_collapse __read_mostly = 1; - -/* People can turn this on to work with those rare, broken TCPs that - * interpret the window field as a signed quantity. - */ -int sysctl_tcp_workaround_signed_windows __read_mostly = 0; - -/* Default TSQ limit of four TSO segments */ -int sysctl_tcp_limit_output_bytes __read_mostly = 262144; - -/* This limits the percentage of the congestion window which we - * will allow a single TSO frame to consume. Building TSO frames - * which are too large can cause TCP streams to be bursty. 
- */ -int sysctl_tcp_tso_win_divisor __read_mostly = 3; - -/* By default, RFC2861 behavior. */ -int sysctl_tcp_slow_start_after_idle __read_mostly = 1; +#include <trace/events/tcp.h> static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, int push_one, gfp_t gfp); /* Account for new data that has been sent to the network. */ -static void tcp_event_new_data_sent(struct sock *sk, const struct sk_buff *skb) +static void tcp_event_new_data_sent(struct sock *sk, struct sk_buff *skb) { struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); unsigned int prior_packets = tp->packets_out; - tcp_advance_send_head(sk, skb); tp->snd_nxt = TCP_SKB_CB(skb)->end_seq; + __skb_unlink(skb, &sk->sk_write_queue); + tcp_rbtree_insert(&sk->tcp_rtx_queue, skb); + tp->packets_out += tcp_skb_pcount(skb); if (!prior_packets || icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) tcp_rearm_rto(sk); @@ -203,7 +188,7 @@ u32 tcp_default_init_rwnd(u32 mss) * be a multiple of mss if possible. We assume here that mss >= 1. * This MUST be enforced by all callers. */ -void tcp_select_initial_window(int __space, __u32 mss, +void tcp_select_initial_window(const struct sock *sk, int __space, __u32 mss, __u32 *rcv_wnd, __u32 *window_clamp, int wscale_ok, __u8 *rcv_wscale, __u32 init_rcv_wnd) @@ -227,7 +212,7 @@ void tcp_select_initial_window(int __space, __u32 mss, * which we interpret as a sign the remote TCP is not * misinterpreting the window field as a signed quantity. 
*/ - if (sysctl_tcp_workaround_signed_windows) + if (sock_net(sk)->ipv4.sysctl_tcp_workaround_signed_windows) (*rcv_wnd) = min(space, MAX_TCP_WINDOW); else (*rcv_wnd) = space; @@ -235,7 +220,7 @@ void tcp_select_initial_window(int __space, __u32 mss, (*rcv_wscale) = 0; if (wscale_ok) { /* Set window scaling on max possible window */ - space = max_t(u32, space, sysctl_tcp_rmem[2]); + space = max_t(u32, space, sock_net(sk)->ipv4.sysctl_tcp_rmem[2]); space = max_t(u32, space, sysctl_rmem_max); space = min_t(u32, space, *window_clamp); while (space > U16_MAX && (*rcv_wscale) < TCP_MAX_WSCALE) { @@ -287,7 +272,8 @@ static u16 tcp_select_window(struct sock *sk) /* Make sure we do not exceed the maximum possible * scaled window. */ - if (!tp->rx_opt.rcv_wscale && sysctl_tcp_workaround_signed_windows) + if (!tp->rx_opt.rcv_wscale && + sock_net(sk)->ipv4.sysctl_tcp_workaround_signed_windows) new_win = min(new_win, MAX_TCP_WINDOW); else new_win = min(new_win, (65535U << tp->rx_opt.rcv_wscale)); @@ -395,7 +381,6 @@ static void tcp_ecn_send(struct sock *sk, struct sk_buff *skb, static void tcp_init_nondata_skb(struct sk_buff *skb, u32 seq, u8 flags) { skb->ip_summed = CHECKSUM_PARTIAL; - skb->csum = 0; TCP_SKB_CB(skb)->tcp_flags = flags; TCP_SKB_CB(skb)->sacked = 0; @@ -418,6 +403,22 @@ static inline bool tcp_urg_mode(const struct tcp_sock *tp) #define OPTION_MD5 (1 << 2) #define OPTION_WSCALE (1 << 3) #define OPTION_FAST_OPEN_COOKIE (1 << 8) +#define OPTION_SMC (1 << 9) + +static void smc_options_write(__be32 *ptr, u16 *options) +{ +#if IS_ENABLED(CONFIG_SMC) + if (static_branch_unlikely(&tcp_have_smc)) { + if (unlikely(OPTION_SMC & *options)) { + *ptr++ = htonl((TCPOPT_NOP << 24) | + (TCPOPT_NOP << 16) | + (TCPOPT_EXP << 8) | + (TCPOLEN_EXP_SMC_BASE)); + *ptr++ = htonl(TCPOPT_SMC_MAGIC); + } + } +#endif +} struct tcp_out_options { u16 options; /* bit field of OPTION_* */ @@ -536,6 +537,41 @@ static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp, } ptr += (len + 3) 
>> 2; } + + smc_options_write(ptr, &options); +} + +static void smc_set_option(const struct tcp_sock *tp, + struct tcp_out_options *opts, + unsigned int *remaining) +{ +#if IS_ENABLED(CONFIG_SMC) + if (static_branch_unlikely(&tcp_have_smc)) { + if (tp->syn_smc) { + if (*remaining >= TCPOLEN_EXP_SMC_BASE_ALIGNED) { + opts->options |= OPTION_SMC; + *remaining -= TCPOLEN_EXP_SMC_BASE_ALIGNED; + } + } + } +#endif +} + +static void smc_set_option_cond(const struct tcp_sock *tp, + const struct inet_request_sock *ireq, + struct tcp_out_options *opts, + unsigned int *remaining) +{ +#if IS_ENABLED(CONFIG_SMC) + if (static_branch_unlikely(&tcp_have_smc)) { + if (tp->syn_smc && ireq->smc_ok) { + if (*remaining >= TCPOLEN_EXP_SMC_BASE_ALIGNED) { + opts->options |= OPTION_SMC; + *remaining -= TCPOLEN_EXP_SMC_BASE_ALIGNED; + } + } + } +#endif } /* Compute TCP options for SYN packets. This is not the final @@ -603,11 +639,14 @@ static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb, } } + smc_set_option(tp, opts, &remaining); + return MAX_TCP_OPTION_SPACE - remaining; } /* Set up TCP options for SYN-ACKs. 
*/ -static unsigned int tcp_synack_options(struct request_sock *req, +static unsigned int tcp_synack_options(const struct sock *sk, + struct request_sock *req, unsigned int mss, struct sk_buff *skb, struct tcp_out_options *opts, const struct tcp_md5sig_key *md5, @@ -663,6 +702,8 @@ static unsigned int tcp_synack_options(struct request_sock *req, } } + smc_set_option_cond(tcp_sk(sk), ireq, opts, &remaining); + return MAX_TCP_OPTION_SPACE - remaining; } @@ -739,8 +780,10 @@ static void tcp_tsq_handler(struct sock *sk) struct tcp_sock *tp = tcp_sk(sk); if (tp->lost_out > tp->retrans_out && - tp->snd_cwnd > tcp_packets_in_flight(tp)) + tp->snd_cwnd > tcp_packets_in_flight(tp)) { + tcp_mstamp_refresh(tp); tcp_xmit_retransmit_queue(sk); + } tcp_write_xmit(sk, tcp_current_mss(sk), tp->nonagle, 0, GFP_ATOMIC); @@ -971,6 +1014,12 @@ static void tcp_internal_pacing(struct sock *sk, const struct sk_buff *skb) HRTIMER_MODE_ABS_PINNED); } +static void tcp_update_skb_after_send(struct tcp_sock *tp, struct sk_buff *skb) +{ + skb->skb_mstamp = tp->tcp_mstamp; + list_move_tail(&skb->tcp_tsorted_anchor, &tp->tsorted_sent_queue); +} + /* This routine actually transmits TCP packets queued in by * tcp_do_sendmsg(). This is used by both the initial * transmission and possible later retransmissions. 
@@ -1003,10 +1052,14 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, TCP_SKB_CB(skb)->tx.in_flight = TCP_SKB_CB(skb)->end_seq - tp->snd_una; oskb = skb; - if (unlikely(skb_cloned(skb))) - skb = pskb_copy(skb, gfp_mask); - else - skb = skb_clone(skb, gfp_mask); + + tcp_skb_tsorted_save(oskb) { + if (unlikely(skb_cloned(oskb))) + skb = pskb_copy(oskb, gfp_mask); + else + skb = skb_clone(oskb, gfp_mask); + } tcp_skb_tsorted_restore(oskb); + if (unlikely(!skb)) return -ENOBUFS; } @@ -1127,7 +1180,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, err = net_xmit_eval(err); } if (!err && oskb) { - oskb->skb_mstamp = tp->tcp_mstamp; + tcp_update_skb_after_send(tp, oskb); tcp_rate_skb_sent(sk, oskb); } return err; @@ -1165,21 +1218,6 @@ static void tcp_set_skb_tso_segs(struct sk_buff *skb, unsigned int mss_now) } } -/* When a modification to fackets out becomes necessary, we need to check - * skb is counted to fackets_out or not. 
- */ -static void tcp_adjust_fackets_out(struct sock *sk, const struct sk_buff *skb, - int decr) -{ - struct tcp_sock *tp = tcp_sk(sk); - - if (!tp->sacked_out || tcp_is_reno(tp)) - return; - - if (after(tcp_highest_sack_seq(tp), TCP_SKB_CB(skb)->seq)) - tp->fackets_out -= decr; -} - /* Pcount in the middle of the write queue got changed, we need to do various * tweaks to fix counters */ @@ -1200,11 +1238,9 @@ static void tcp_adjust_pcount(struct sock *sk, const struct sk_buff *skb, int de if (tcp_is_reno(tp) && decr > 0) tp->sacked_out -= min_t(u32, tp->sacked_out, decr); - tcp_adjust_fackets_out(sk, skb, decr); - if (tp->lost_skb_hint && before(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(tp->lost_skb_hint)->seq) && - (tcp_is_fack(tp) || (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED))) + (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) tp->lost_cnt_hint -= decr; tcp_verify_left_out(tp); @@ -1239,12 +1275,25 @@ static void tcp_skb_fragment_eor(struct sk_buff *skb, struct sk_buff *skb2) TCP_SKB_CB(skb)->eor = 0; } +/* Insert buff after skb on the write or rtx queue of sk. */ +static void tcp_insert_write_queue_after(struct sk_buff *skb, + struct sk_buff *buff, + struct sock *sk, + enum tcp_queue tcp_queue) +{ + if (tcp_queue == TCP_FRAG_IN_WRITE_QUEUE) + __skb_queue_after(&sk->sk_write_queue, skb, buff); + else + tcp_rbtree_insert(&sk->tcp_rtx_queue, buff); +} + /* Function to create two new TCP segments. Shrinks the given segment * to the specified size and appends a new segment with the rest of the * packet to the list. This won't be called frequently, I hope. * Remember, these are still headerless SKBs at this point. */ -int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len, +int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue, + struct sk_buff *skb, u32 len, unsigned int mss_now, gfp_t gfp) { struct tcp_sock *tp = tcp_sk(sk); @@ -1327,7 +1376,9 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len, /* Link BUFF into the send queue. 
*/ __skb_header_release(buff); - tcp_insert_write_queue_after(skb, buff, sk); + tcp_insert_write_queue_after(skb, buff, sk, tcp_queue); + if (tcp_queue == TCP_FRAG_IN_RTX_QUEUE) + list_add(&buff->tcp_tsorted_anchor, &skb->tcp_tsorted_anchor); return 0; } @@ -1605,7 +1656,7 @@ static void tcp_cwnd_validate(struct sock *sk, bool is_cwnd_limited) if (tp->packets_out > tp->snd_cwnd_used) tp->snd_cwnd_used = tp->packets_out; - if (sysctl_tcp_slow_start_after_idle && + if (sock_net(sk)->ipv4.sysctl_tcp_slow_start_after_idle && (s32)(tcp_jiffies32 - tp->snd_cwnd_stamp) >= inet_csk(sk)->icsk_rto && !ca_ops->cong_control) tcp_cwnd_application_limited(sk); @@ -1614,10 +1665,10 @@ static void tcp_cwnd_validate(struct sock *sk, bool is_cwnd_limited) * is caused by insufficient sender buffer: * 1) just sent some data (see tcp_write_xmit) * 2) not cwnd limited (this else condition) - * 3) no more data to send (null tcp_send_head ) + * 3) no more data to send (tcp_write_queue_empty()) * 4) application is hitting buffer limit (SOCK_NOSPACE) */ - if (!tcp_send_head(sk) && sk->sk_socket && + if (tcp_write_queue_empty(sk) && sk->sk_socket && test_bit(SOCK_NOSPACE, &sk->sk_socket->flags) && (1 << sk->sk_state) & (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) tcp_chrono_start(sk, TCP_CHRONO_SNDBUF_LIMITED); @@ -1669,7 +1720,7 @@ u32 tcp_tso_autosize(const struct sock *sk, unsigned int mss_now, { u32 bytes, segs; - bytes = min(sk->sk_pacing_rate >> 10, + bytes = min(sk->sk_pacing_rate >> sk->sk_pacing_shift, sk->sk_gso_max_size - 1 - MAX_TCP_HEADER); /* Goal is to send at least one packet per ms, @@ -1692,7 +1743,8 @@ static u32 tcp_tso_segs(struct sock *sk, unsigned int mss_now) u32 tso_segs = ca_ops->tso_segs_goal ? ca_ops->tso_segs_goal(sk) : 0; return tso_segs ? 
: - tcp_tso_autosize(sk, mss_now, sysctl_tcp_min_tso_segs); + tcp_tso_autosize(sk, mss_now, + sock_net(sk)->ipv4.sysctl_tcp_min_tso_segs); } /* Returns the portion of skb which can be sent right away */ @@ -1806,40 +1858,6 @@ static bool tcp_snd_wnd_test(const struct tcp_sock *tp, return !after(end_seq, tcp_wnd_end(tp)); } -/* This checks if the data bearing packet SKB (usually tcp_send_head(sk)) - * should be put on the wire right now. If so, it returns the number of - * packets allowed by the congestion window. - */ -static unsigned int tcp_snd_test(const struct sock *sk, struct sk_buff *skb, - unsigned int cur_mss, int nonagle) -{ - const struct tcp_sock *tp = tcp_sk(sk); - unsigned int cwnd_quota; - - tcp_init_tso_segs(skb, cur_mss); - - if (!tcp_nagle_test(tp, skb, cur_mss, nonagle)) - return 0; - - cwnd_quota = tcp_cwnd_test(tp, skb); - if (cwnd_quota && !tcp_snd_wnd_test(tp, skb, cur_mss)) - cwnd_quota = 0; - - return cwnd_quota; -} - -/* Test if sending is allowed right now. */ -bool tcp_may_send_now(struct sock *sk) -{ - const struct tcp_sock *tp = tcp_sk(sk); - struct sk_buff *skb = tcp_send_head(sk); - - return skb && - tcp_snd_test(sk, skb, tcp_current_mss(sk), - (tcp_skb_is_last(sk, skb) ? - tp->nonagle : TCP_NAGLE_PUSH)); -} - /* Trim TSO SKB to LEN bytes, put the remaining data into a new packet * which is put after SKB on the list. It is very much like * tcp_fragment() except that it may make several kinds of assumptions @@ -1847,7 +1865,8 @@ bool tcp_may_send_now(struct sock *sk) * know that all the data is in scatter-gather pages, and that the * packet has never been sent out before (and thus is not cloned). 
*/ -static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len, +static int tso_fragment(struct sock *sk, enum tcp_queue tcp_queue, + struct sk_buff *skb, unsigned int len, unsigned int mss_now, gfp_t gfp) { struct sk_buff *buff; @@ -1856,7 +1875,7 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len, /* All of a TSO frame must be composed of paged data. */ if (skb->len != skb->data_len) - return tcp_fragment(sk, skb, len, mss_now, gfp); + return tcp_fragment(sk, tcp_queue, skb, len, mss_now, gfp); buff = sk_stream_alloc_skb(sk, 0, gfp, true); if (unlikely(!buff)) @@ -1892,7 +1911,7 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len, /* Link BUFF into the send queue. */ __skb_header_release(buff); - tcp_insert_write_queue_after(skb, buff, sk); + tcp_insert_write_queue_after(skb, buff, sk, tcp_queue); return 0; } @@ -1942,7 +1961,7 @@ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb, if ((skb != tcp_write_queue_tail(sk)) && (limit >= skb->len)) goto send_now; - win_divisor = ACCESS_ONCE(sysctl_tcp_tso_win_divisor); + win_divisor = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tso_win_divisor); if (win_divisor) { u32 chunk = min(tp->snd_wnd, tp->snd_cwnd * tp->mss_cache); @@ -1962,8 +1981,10 @@ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb, goto send_now; } - head = tcp_write_queue_head(sk); - + /* TODO : use tsorted_sent_queue ? 
*/ + head = tcp_rtx_queue_head(sk); + if (!head) + goto send_now; age = tcp_stamp_us_delta(tp->tcp_mstamp, head->skb_mstamp); /* If next ACK is likely to come too late (half srtt), do not defer */ if (age < (tp->srtt_us >> 4)) @@ -2094,6 +2115,7 @@ static int tcp_mtu_probe(struct sock *sk) nskb->ip_summed = skb->ip_summed; tcp_insert_write_queue_before(nskb, skb, sk); + tcp_highest_sack_replace(sk, skb, nskb); len = 0; tcp_for_write_queue_from_safe(skb, next, sk) { @@ -2176,18 +2198,18 @@ static bool tcp_small_queue_check(struct sock *sk, const struct sk_buff *skb, { unsigned int limit; - limit = max(2 * skb->truesize, sk->sk_pacing_rate >> 10); - limit = min_t(u32, limit, sysctl_tcp_limit_output_bytes); + limit = max(2 * skb->truesize, sk->sk_pacing_rate >> sk->sk_pacing_shift); + limit = min_t(u32, limit, + sock_net(sk)->ipv4.sysctl_tcp_limit_output_bytes); limit <<= factor; if (refcount_read(&sk->sk_wmem_alloc) > limit) { - /* Always send the 1st or 2nd skb in write queue. + /* Always send skb if rtx queue is empty. * No need to wait for TX completion to call us back, * after softirq/tasklet schedule. * This helps when TX completions are delayed too much. */ - if (skb == sk->sk_write_queue.next || - skb->prev == sk->sk_write_queue.next) + if (tcp_rtx_queue_empty(sk)) return false; set_bit(TSQ_THROTTLED, &sk->sk_tsq_flags); @@ -2238,7 +2260,7 @@ void tcp_chrono_stop(struct sock *sk, const enum tcp_chrono type) * it's the "most interesting" or current chrono we are * tracking and starts busy chrono if we have pending data. */ - if (tcp_write_queue_empty(sk)) + if (tcp_rtx_and_write_queues_empty(sk)) tcp_chrono_set(tp, TCP_CHRONO_UNSPEC); else if (type == tp->chrono_type) tcp_chrono_set(tp, TCP_CHRONO_BUSY); @@ -2271,6 +2293,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, sent_pkts = 0; + tcp_mstamp_refresh(tp); if (!push_one) { /* Do MTU probing. 
*/ result = tcp_mtu_probe(sk); @@ -2282,7 +2305,6 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, } max_segs = tcp_tso_segs(sk, mss_now); - tcp_mstamp_refresh(tp); while ((skb = tcp_send_head(sk))) { unsigned int limit; @@ -2294,7 +2316,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, if (unlikely(tp->repair) && tp->repair_queue == TCP_SEND_QUEUE) { /* "skb_mstamp" is used as a start point for the retransmit timer */ - skb->skb_mstamp = tp->tcp_mstamp; + tcp_update_skb_after_send(tp, skb); goto repair; /* Skip network transmission */ } @@ -2333,7 +2355,8 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, nonagle); if (skb->len > limit && - unlikely(tso_fragment(sk, skb, limit, mss_now, gfp))) + unlikely(tso_fragment(sk, TCP_FRAG_IN_WRITE_QUEUE, + skb, limit, mss_now, gfp))) break; if (test_bit(TCP_TSQ_DEFERRED, &sk->sk_tsq_flags)) @@ -2373,7 +2396,7 @@ repair: tcp_cwnd_validate(sk, is_cwnd_limited); return false; } - return !tp->packets_out && tcp_send_head(sk); + return !tp->packets_out && !tcp_write_queue_empty(sk); } bool tcp_schedule_loss_probe(struct sock *sk) @@ -2381,6 +2404,7 @@ bool tcp_schedule_loss_probe(struct sock *sk) struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); u32 timeout, rto_delta_us; + int early_retrans; /* Don't do any loss probe on a Fast Open connection before 3WHS * finishes. @@ -2388,16 +2412,17 @@ bool tcp_schedule_loss_probe(struct sock *sk) if (tp->fastopen_rsk) return false; + early_retrans = sock_net(sk)->ipv4.sysctl_tcp_early_retrans; /* Schedule a loss probe in 2*RTT for SACK capable connections * in Open state, that are either limited by cwnd or application. 
*/ - if ((sysctl_tcp_early_retrans != 3 && sysctl_tcp_early_retrans != 4) || + if ((early_retrans != 3 && early_retrans != 4) || !tp->packets_out || !tcp_is_sack(tp) || icsk->icsk_ca_state != TCP_CA_Open) return false; if ((tp->snd_cwnd > tcp_packets_in_flight(tp)) && - tcp_send_head(sk)) + !tcp_write_queue_empty(sk)) return false; /* Probe timeout is 2*rtt. Add minimum RTO to account @@ -2450,18 +2475,14 @@ void tcp_send_loss_probe(struct sock *sk) int mss = tcp_current_mss(sk); skb = tcp_send_head(sk); - if (skb) { - if (tcp_snd_wnd_test(tp, skb, mss)) { - pcount = tp->packets_out; - tcp_write_xmit(sk, mss, TCP_NAGLE_OFF, 2, GFP_ATOMIC); - if (tp->packets_out > pcount) - goto probe_sent; - goto rearm_timer; - } - skb = tcp_write_queue_prev(sk, skb); - } else { - skb = tcp_write_queue_tail(sk); + if (skb && tcp_snd_wnd_test(tp, skb, mss)) { + pcount = tp->packets_out; + tcp_write_xmit(sk, mss, TCP_NAGLE_OFF, 2, GFP_ATOMIC); + if (tp->packets_out > pcount) + goto probe_sent; + goto rearm_timer; } + skb = skb_rb_last(&sk->tcp_rtx_queue); /* At most one outstanding TLP retransmission. 
*/ if (tp->tlp_high_seq) @@ -2479,10 +2500,11 @@ void tcp_send_loss_probe(struct sock *sk) goto rearm_timer; if ((pcount > 1) && (skb->len > (pcount - 1) * mss)) { - if (unlikely(tcp_fragment(sk, skb, (pcount - 1) * mss, mss, + if (unlikely(tcp_fragment(sk, TCP_FRAG_IN_RTX_QUEUE, skb, + (pcount - 1) * mss, mss, GFP_ATOMIC))) goto rearm_timer; - skb = tcp_write_queue_next(sk, skb); + skb = skb_rb_next(skb); } if (WARN_ON(!skb || !tcp_skb_pcount(skb))) @@ -2682,7 +2704,7 @@ void tcp_skb_collapse_tstamp(struct sk_buff *skb, static bool tcp_collapse_retrans(struct sock *sk, struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); - struct sk_buff *next_skb = tcp_write_queue_next(sk, skb); + struct sk_buff *next_skb = skb_rb_next(skb); int skb_size, next_skb_size; skb_size = skb->len; @@ -2697,9 +2719,7 @@ static bool tcp_collapse_retrans(struct sock *sk, struct sk_buff *skb) else if (!skb_shift(skb, next_skb, next_skb_size)) return false; } - tcp_highest_sack_combine(sk, next_skb, skb); - - tcp_unlink_write_queue(next_skb, sk); + tcp_highest_sack_replace(sk, next_skb, skb); if (next_skb->ip_summed == CHECKSUM_PARTIAL) skb->ip_summed = CHECKSUM_PARTIAL; @@ -2728,7 +2748,7 @@ static bool tcp_collapse_retrans(struct sock *sk, struct sk_buff *skb) tcp_skb_collapse_tstamp(skb, next_skb); - sk_wmem_free_skb(sk, next_skb); + tcp_rtx_queue_unlink_and_free(next_skb, sk); return true; } @@ -2739,8 +2759,6 @@ static bool tcp_can_collapse(const struct sock *sk, const struct sk_buff *skb) return false; if (skb_cloned(skb)) return false; - if (skb == tcp_send_head(sk)) - return false; /* Some heuristics for collapsing over SACK'd could be invented */ if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED) return false; @@ -2758,12 +2776,12 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *to, struct sk_buff *skb = to, *tmp; bool first = true; - if (!sysctl_tcp_retrans_collapse) + if (!sock_net(sk)->ipv4.sysctl_tcp_retrans_collapse) return; if 
(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN) return; - tcp_for_write_queue_from_safe(skb, tmp, sk) { + skb_rbtree_walk_from_safe(skb, tmp) { if (!tcp_can_collapse(sk, skb)) break; @@ -2838,7 +2856,8 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs) len = cur_mss * segs; if (skb->len > len) { - if (tcp_fragment(sk, skb, len, cur_mss, GFP_ATOMIC)) + if (tcp_fragment(sk, TCP_FRAG_IN_RTX_QUEUE, skb, len, + cur_mss, GFP_ATOMIC)) return -ENOMEM; /* We'll try again later. */ } else { if (skb_unclone(skb, GFP_ATOMIC)) @@ -2872,17 +2891,23 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs) skb_headroom(skb) >= 0xFFFF)) { struct sk_buff *nskb; - nskb = __pskb_copy(skb, MAX_TCP_HEADER, GFP_ATOMIC); - err = nskb ? tcp_transmit_skb(sk, nskb, 0, GFP_ATOMIC) : - -ENOBUFS; - if (!err) - skb->skb_mstamp = tp->tcp_mstamp; + tcp_skb_tsorted_save(skb) { + nskb = __pskb_copy(skb, MAX_TCP_HEADER, GFP_ATOMIC); + err = nskb ? tcp_transmit_skb(sk, nskb, 0, GFP_ATOMIC) : + -ENOBUFS; + } tcp_skb_tsorted_restore(skb); + + if (!err) { + tcp_update_skb_after_send(tp, skb); + tcp_rate_skb_sent(sk, skb); + } } else { err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC); } if (likely(!err)) { TCP_SKB_CB(skb)->sacked |= TCPCB_EVER_RETRANS; + trace_tcp_retransmit_skb(sk, skb); } else if (err != -EBUSY) { NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRETRANSFAIL); } @@ -2919,36 +2944,25 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs) * retransmitted data is acknowledged. It tries to continue * resending the rest of the retransmit queue, until either * we've sent it all or the congestion window limit is reached. - * If doing SACK, the first ACK which comes back for a timeout - * based retransmit packet might feed us FACK information again. - * If so, we use it to avoid unnecessarily retransmissions. 
*/ void tcp_xmit_retransmit_queue(struct sock *sk) { const struct inet_connection_sock *icsk = inet_csk(sk); + struct sk_buff *skb, *rtx_head, *hole = NULL; struct tcp_sock *tp = tcp_sk(sk); - struct sk_buff *skb; - struct sk_buff *hole = NULL; u32 max_segs; int mib_idx; if (!tp->packets_out) return; - if (tp->retransmit_skb_hint) { - skb = tp->retransmit_skb_hint; - } else { - skb = tcp_write_queue_head(sk); - } - + rtx_head = tcp_rtx_queue_head(sk); + skb = tp->retransmit_skb_hint ?: rtx_head; max_segs = tcp_tso_segs(sk, tcp_current_mss(sk)); - tcp_for_write_queue_from(skb, sk) { + skb_rbtree_walk_from(skb) { __u8 sacked; int segs; - if (skb == tcp_send_head(sk)) - break; - if (tcp_pacing_check(sk)) break; @@ -2993,7 +3007,7 @@ void tcp_xmit_retransmit_queue(struct sock *sk) if (tcp_in_cwnd_reduction(sk)) tp->prr_out += tcp_skb_pcount(skb); - if (skb == tcp_write_queue_head(sk) && + if (skb == rtx_head && icsk->icsk_pending != ICSK_TIME_REO_TIMEOUT) inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, inet_csk(sk)->icsk_rto, @@ -3035,12 +3049,15 @@ void tcp_send_fin(struct sock *sk) * Note: in the latter case, FIN packet will be sent after a timeout, * as TCP stack thinks it has already been transmitted. */ - if (tskb && (tcp_send_head(sk) || tcp_under_memory_pressure(sk))) { + if (!tskb && tcp_under_memory_pressure(sk)) + tskb = skb_rb_last(&sk->tcp_rtx_queue); + + if (tskb) { coalesce: TCP_SKB_CB(tskb)->tcp_flags |= TCPHDR_FIN; TCP_SKB_CB(tskb)->end_seq++; tp->write_seq++; - if (!tcp_send_head(sk)) { + if (tcp_write_queue_empty(sk)) { /* This means tskb was already sent. * Pretend we included the FIN on previous transmit. * We need to set tp->snd_nxt to the value it would have @@ -3057,6 +3074,7 @@ coalesce: goto coalesce; return; } + INIT_LIST_HEAD(&skb->tcp_tsorted_anchor); skb_reserve(skb, MAX_TCP_HEADER); sk_forced_mem_schedule(sk, skb->truesize); /* FIN eats a sequence byte, write_seq advanced by tcp_queue_skb(). 
*/ @@ -3093,6 +3111,11 @@ void tcp_send_active_reset(struct sock *sk, gfp_t priority) /* Send it off. */ if (tcp_transmit_skb(sk, skb, 0, priority)) NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTFAILED); + + /* skb of trace_tcp_send_reset() keeps the skb that caused RST, + * skb here is different to the troublesome skb, so use NULL + */ + trace_tcp_send_reset(sk, NULL); } /* Send a crossed SYN-ACK during socket establishment. @@ -3105,20 +3128,24 @@ int tcp_send_synack(struct sock *sk) { struct sk_buff *skb; - skb = tcp_write_queue_head(sk); + skb = tcp_rtx_queue_head(sk); if (!skb || !(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)) { - pr_debug("%s: wrong queue state\n", __func__); + pr_err("%s: wrong queue state\n", __func__); return -EFAULT; } if (!(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_ACK)) { if (skb_cloned(skb)) { - struct sk_buff *nskb = skb_copy(skb, GFP_ATOMIC); + struct sk_buff *nskb; + + tcp_skb_tsorted_save(skb) { + nskb = skb_copy(skb, GFP_ATOMIC); + } tcp_skb_tsorted_restore(skb); if (!nskb) return -ENOMEM; - tcp_unlink_write_queue(skb, sk); + INIT_LIST_HEAD(&nskb->tcp_tsorted_anchor); + tcp_rtx_queue_unlink_and_free(skb, sk); __skb_header_release(nskb); - __tcp_add_write_queue_head(sk, nskb); - sk_wmem_free_skb(sk, skb); + tcp_rbtree_insert(&sk->tcp_rtx_queue, nskb); sk->sk_wmem_queued += nskb->truesize; sk_mem_charge(sk, nskb->truesize); skb = nskb; @@ -3195,8 +3222,8 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst, md5 = tcp_rsk(req)->af_specific->req_md5_lookup(sk, req_to_sk(req)); #endif skb_set_hash(skb, tcp_rsk(req)->txhash, PKT_HASH_TYPE_L4); - tcp_header_size = tcp_synack_options(req, mss, skb, &opts, md5, foc) + - sizeof(*th); + tcp_header_size = tcp_synack_options(sk, req, mss, skb, &opts, md5, + foc) + sizeof(*th); skb_push(skb, tcp_header_size); skb_reset_transport_header(skb); @@ -3209,13 +3236,8 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst, th->source = htons(ireq->ir_num); 
th->dest = ireq->ir_rmt_port; skb->mark = ireq->ir_mark; - /* Setting of flags are superfluous here for callers (and ECE is - * not even correctly set) - */ - tcp_init_nondata_skb(skb, tcp_rsk(req)->snt_isn, - TCPHDR_SYN | TCPHDR_ACK); - - th->seq = htonl(TCP_SKB_CB(skb)->seq); + skb->ip_summed = CHECKSUM_PARTIAL; + th->seq = htonl(tcp_rsk(req)->snt_isn); /* XXX data is queued and acked as is. No buffer/window check */ th->ack_seq = htonl(tcp_rsk(req)->rcv_nxt); @@ -3302,7 +3324,7 @@ static void tcp_connect_init(struct sock *sk) if (rcv_wnd == 0) rcv_wnd = dst_metric(dst, RTAX_INITRWND); - tcp_select_initial_window(tcp_full_space(sk), + tcp_select_initial_window(sk, tcp_full_space(sk), tp->advmss - (tp->rx_opt.ts_recent_stamp ? tp->tcp_header_len - sizeof(struct tcphdr) : 0), &tp->rcv_wnd, &tp->window_clamp, @@ -3341,7 +3363,6 @@ static void tcp_connect_queue_skb(struct sock *sk, struct sk_buff *skb) tcb->end_seq += skb->len; __skb_header_release(skb); - __tcp_add_write_queue_tail(sk, skb); sk->sk_wmem_queued += skb->truesize; sk_mem_charge(sk, skb->truesize); tp->write_seq = tcb->end_seq; @@ -3389,6 +3410,7 @@ static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn) int copied = copy_from_iter(skb_put(syn_data, space), space, &fo->data->msg_iter); if (unlikely(!copied)) { + tcp_skb_tsorted_anchor_cleanup(syn_data); kfree_skb(syn_data); goto fallback; } @@ -3419,10 +3441,15 @@ static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn) TCP_SKB_CB(syn_data)->tcp_flags = TCPHDR_ACK | TCPHDR_PSH; if (!err) { tp->syn_data = (fo->copied > 0); + tcp_rbtree_insert(&sk->tcp_rtx_queue, syn_data); NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPORIGDATASENT); goto done; } + /* data was not sent, put it in write_queue */ + __skb_queue_tail(&sk->sk_write_queue, syn_data); + tp->packets_out -= tcp_skb_pcount(syn_data); + fallback: /* Send a regular SYN with Fast Open cookie request option */ if (fo->cookie.len > 0) @@ -3463,6 +3490,7 @@ int tcp_connect(struct sock 
*sk) tp->retrans_stamp = tcp_time_stamp(tp); tcp_connect_queue_skb(sk, buff); tcp_ecn_send_syn(sk, buff); + tcp_rbtree_insert(&sk->tcp_rtx_queue, buff); /* Send off SYN; include data in Fast Open. */ err = tp->fastopen_req ? tcp_send_syn_data(sk, buff) : @@ -3475,6 +3503,11 @@ int tcp_connect(struct sock *sk) */ tp->snd_nxt = tp->write_seq; tp->pushed_seq = tp->write_seq; + buff = tcp_send_head(sk); + if (unlikely(buff)) { + tp->snd_nxt = TCP_SKB_CB(buff)->seq; + tp->pushed_seq = TCP_SKB_CB(buff)->seq; + } TCP_INC_STATS(sock_net(sk), TCP_MIB_ACTIVEOPENS); /* Timer for repeating the SYN until an answer. */ @@ -3652,7 +3685,8 @@ int tcp_write_wakeup(struct sock *sk, int mib) skb->len > mss) { seg_size = min(seg_size, mss); TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_PSH; - if (tcp_fragment(sk, skb, seg_size, mss, GFP_ATOMIC)) + if (tcp_fragment(sk, TCP_FRAG_IN_WRITE_QUEUE, + skb, seg_size, mss, GFP_ATOMIC)) return -1; } else if (!tcp_skb_pcount(skb)) tcp_set_skb_tso_segs(skb, mss); @@ -3682,7 +3716,7 @@ void tcp_send_probe0(struct sock *sk) err = tcp_write_wakeup(sk, LINUX_MIB_TCPWINPROBE); - if (tp->packets_out || !tcp_send_head(sk)) { + if (tp->packets_out || tcp_write_queue_empty(sk)) { /* Cancel probe timer, if it is not required. 
*/ icsk->icsk_probes_out = 0; icsk->icsk_backoff = 0; @@ -3723,6 +3757,7 @@ int tcp_rtx_synack(const struct sock *sk, struct request_sock *req) __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSYNRETRANS); if (unlikely(tcp_passive_fastopen(sk))) tcp_sk(sk)->total_retrans++; + trace_tcp_retransmit_synack(sk, req); } return res; } diff --git a/net/ipv4/tcp_recovery.c b/net/ipv4/tcp_recovery.c index 449cd914d58e..d3ea89020c69 100644 --- a/net/ipv4/tcp_recovery.c +++ b/net/ipv4/tcp_recovery.c @@ -1,8 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0 #include <linux/tcp.h> #include <net/tcp.h> -int sysctl_tcp_recovery __read_mostly = TCP_RACK_LOSS_DETECTION; - static void tcp_rack_mark_skb_lost(struct sock *sk, struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); @@ -45,7 +44,8 @@ static bool tcp_rack_sent_after(u64 t1, u64 t2, u32 seq1, u32 seq2) static void tcp_rack_detect_loss(struct sock *sk, u32 *reo_timeout) { struct tcp_sock *tp = tcp_sk(sk); - struct sk_buff *skb; + u32 min_rtt = tcp_min_rtt(tp); + struct sk_buff *skb, *n; u32 reo_wnd; *reo_timeout = 0; @@ -55,48 +55,36 @@ static void tcp_rack_detect_loss(struct sock *sk, u32 *reo_timeout) * to queuing or delayed ACKs. 
*/ reo_wnd = 1000; - if ((tp->rack.reord || !tp->lost_out) && tcp_min_rtt(tp) != ~0U) - reo_wnd = max(tcp_min_rtt(tp) >> 2, reo_wnd); + if ((tp->rack.reord || !tp->lost_out) && min_rtt != ~0U) { + reo_wnd = max((min_rtt >> 2) * tp->rack.reo_wnd_steps, reo_wnd); + reo_wnd = min(reo_wnd, tp->srtt_us >> 3); + } - tcp_for_write_queue(skb, sk) { + list_for_each_entry_safe(skb, n, &tp->tsorted_sent_queue, + tcp_tsorted_anchor) { struct tcp_skb_cb *scb = TCP_SKB_CB(skb); + s32 remaining; - if (skb == tcp_send_head(sk)) - break; - - /* Skip ones already (s)acked */ - if (!after(scb->end_seq, tp->snd_una) || - scb->sacked & TCPCB_SACKED_ACKED) + /* Skip ones marked lost but not yet retransmitted */ + if ((scb->sacked & TCPCB_LOST) && + !(scb->sacked & TCPCB_SACKED_RETRANS)) continue; - if (tcp_rack_sent_after(tp->rack.mstamp, skb->skb_mstamp, - tp->rack.end_seq, scb->end_seq)) { - /* Step 3 in draft-cheng-tcpm-rack-00.txt: - * A packet is lost if its elapsed time is beyond - * the recent RTT plus the reordering window. - */ - u32 elapsed = tcp_stamp_us_delta(tp->tcp_mstamp, - skb->skb_mstamp); - s32 remaining = tp->rack.rtt_us + reo_wnd - elapsed; - - if (remaining < 0) { - tcp_rack_mark_skb_lost(sk, skb); - continue; - } - - /* Skip ones marked lost but not yet retransmitted */ - if ((scb->sacked & TCPCB_LOST) && - !(scb->sacked & TCPCB_SACKED_RETRANS)) - continue; + if (!tcp_rack_sent_after(tp->rack.mstamp, skb->skb_mstamp, + tp->rack.end_seq, scb->end_seq)) + break; + /* A packet is lost if it has not been s/acked beyond + * the recent RTT plus the reordering window. 
+ */ + remaining = tp->rack.rtt_us + reo_wnd - + tcp_stamp_us_delta(tp->tcp_mstamp, skb->skb_mstamp); + if (remaining < 0) { + tcp_rack_mark_skb_lost(sk, skb); + list_del_init(&skb->tcp_tsorted_anchor); + } else { /* Record maximum wait time (+1 to avoid 0) */ *reo_timeout = max_t(u32, *reo_timeout, 1 + remaining); - - } else if (!(scb->sacked & TCPCB_RETRANS)) { - /* Original data are sent sequentially so stop early - * b/c the rest are all sent after rack_sent - */ - break; } } } @@ -175,3 +163,44 @@ void tcp_rack_reo_timeout(struct sock *sk) if (inet_csk(sk)->icsk_pending != ICSK_TIME_RETRANS) tcp_rearm_rto(sk); } + +/* Updates the RACK's reo_wnd based on DSACK and no. of recoveries. + * + * If DSACK is received, increment reo_wnd by min_rtt/4 (upper bounded + * by srtt), since there is possibility that spurious retransmission was + * due to reordering delay longer than reo_wnd. + * + * Persist the current reo_wnd value for TCP_RACK_RECOVERY_THRESH (16) + * no. of successful recoveries (accounts for full DSACK-based loss + * recovery undo). After that, reset it to default (min_rtt/4). + * + * At max, reo_wnd is incremented only once per rtt. So that the new + * DSACK on which we are reacting, is due to the spurious retx (approx) + * after the reo_wnd has been updated last time. + * + * reo_wnd is tracked in terms of steps (of min_rtt/4), rather than + * absolute value to account for change in rtt. 
+ */ +void tcp_rack_update_reo_wnd(struct sock *sk, struct rate_sample *rs) +{ + struct tcp_sock *tp = tcp_sk(sk); + + if (sock_net(sk)->ipv4.sysctl_tcp_recovery & TCP_RACK_STATIC_REO_WND || + !rs->prior_delivered) + return; + + /* Disregard DSACK if a rtt has not passed since we adjusted reo_wnd */ + if (before(rs->prior_delivered, tp->rack.last_delivered)) + tp->rack.dsack_seen = 0; + + /* Adjust the reo_wnd if update is pending */ + if (tp->rack.dsack_seen) { + tp->rack.reo_wnd_steps = min_t(u32, 0xFF, + tp->rack.reo_wnd_steps + 1); + tp->rack.dsack_seen = 0; + tp->rack.last_delivered = tp->delivered; + tp->rack.reo_wnd_persist = TCP_RACK_RECOVERY_THRESH; + } else if (!tp->rack.reo_wnd_persist) { + tp->rack.reo_wnd_steps = 1; + } +} diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 655dd8d7f064..16df6dd44b98 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -22,8 +22,6 @@ #include <linux/gfp.h> #include <net/tcp.h> -int sysctl_tcp_thin_linear_timeouts __read_mostly; - /** * tcp_write_err() - close socket and save error info * @sk: The socket the error has appeared on. 
@@ -109,26 +107,23 @@ static int tcp_orphan_retries(struct sock *sk, bool alive) static void tcp_mtu_probing(struct inet_connection_sock *icsk, struct sock *sk) { - struct net *net = sock_net(sk); + const struct net *net = sock_net(sk); + int mss; /* Black hole detection */ - if (net->ipv4.sysctl_tcp_mtu_probing) { - if (!icsk->icsk_mtup.enabled) { - icsk->icsk_mtup.enabled = 1; - icsk->icsk_mtup.probe_timestamp = tcp_jiffies32; - tcp_sync_mss(sk, icsk->icsk_pmtu_cookie); - } else { - struct net *net = sock_net(sk); - struct tcp_sock *tp = tcp_sk(sk); - int mss; - - mss = tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_low) >> 1; - mss = min(net->ipv4.sysctl_tcp_base_mss, mss); - mss = max(mss, 68 - tp->tcp_header_len); - icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, mss); - tcp_sync_mss(sk, icsk->icsk_pmtu_cookie); - } + if (!net->ipv4.sysctl_tcp_mtu_probing) + return; + + if (!icsk->icsk_mtup.enabled) { + icsk->icsk_mtup.enabled = 1; + icsk->icsk_mtup.probe_timestamp = tcp_jiffies32; + } else { + mss = tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_low) >> 1; + mss = min(net->ipv4.sysctl_tcp_base_mss, mss); + mss = max(mss, 68 - tcp_sk(sk)->tcp_header_len); + icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, mss); } + tcp_sync_mss(sk, icsk->icsk_pmtu_cookie); } @@ -156,8 +151,13 @@ static bool retransmits_timed_out(struct sock *sk, return false; start_ts = tcp_sk(sk)->retrans_stamp; - if (unlikely(!start_ts)) - start_ts = tcp_skb_timestamp(tcp_write_queue_head(sk)); + if (unlikely(!start_ts)) { + struct sk_buff *head = tcp_rtx_queue_head(sk); + + if (!head) + return false; + start_ts = tcp_skb_timestamp(head); + } if (likely(timeout == 0)) { linear_backoff_thresh = ilog2(TCP_RTO_MAX/rto_base); @@ -283,15 +283,17 @@ out: * * Returns: Nothing (void) */ -static void tcp_delack_timer(unsigned long data) +static void tcp_delack_timer(struct timer_list *t) { - struct sock *sk = (struct sock *)data; + struct inet_connection_sock *icsk = + from_timer(icsk, t, icsk_delack_timer); + 
struct sock *sk = &icsk->icsk_inet.sk; bh_lock_sock(sk); if (!sock_owned_by_user(sk)) { tcp_delack_timer_handler(sk); } else { - inet_csk(sk)->icsk_ack.blocked = 1; + icsk->icsk_ack.blocked = 1; __NET_INC_STATS(sock_net(sk), LINUX_MIB_DELAYEDACKLOCKED); /* deleguate our work to tcp_release_cb() */ if (!test_and_set_bit(TCP_DELACK_TIMER_DEFERRED, &sk->sk_tsq_flags)) @@ -304,11 +306,12 @@ static void tcp_delack_timer(unsigned long data) static void tcp_probe_timer(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); + struct sk_buff *skb = tcp_send_head(sk); struct tcp_sock *tp = tcp_sk(sk); int max_probes; u32 start_ts; - if (tp->packets_out || !tcp_send_head(sk)) { + if (tp->packets_out || !skb) { icsk->icsk_probes_out = 0; return; } @@ -321,9 +324,9 @@ static void tcp_probe_timer(struct sock *sk) * corresponding system limit. We also implement similar policy when * we use RTO to probe window in tcp_retransmit_timer(). */ - start_ts = tcp_skb_timestamp(tcp_send_head(sk)); + start_ts = tcp_skb_timestamp(skb); if (!start_ts) - tcp_send_head(sk)->skb_mstamp = tp->tcp_mstamp; + skb->skb_mstamp = tp->tcp_mstamp; else if (icsk->icsk_user_timeout && (s32)(tcp_time_stamp(tp) - start_ts) > jiffies_to_msecs(icsk->icsk_user_timeout)) @@ -408,7 +411,7 @@ void tcp_retransmit_timer(struct sock *sk) if (!tp->packets_out) goto out; - WARN_ON(tcp_write_queue_empty(sk)); + WARN_ON(tcp_rtx_queue_empty(sk)); tp->tlp_high_seq = 0; @@ -441,7 +444,7 @@ void tcp_retransmit_timer(struct sock *sk) goto out; } tcp_enter_loss(sk); - tcp_retransmit_skb(sk, tcp_write_queue_head(sk), 1); + tcp_retransmit_skb(sk, tcp_rtx_queue_head(sk), 1); __sk_dst_reset(sk); goto out_reset_timer; } @@ -473,7 +476,7 @@ void tcp_retransmit_timer(struct sock *sk) tcp_enter_loss(sk); - if (tcp_retransmit_skb(sk, tcp_write_queue_head(sk), 1) > 0) { + if (tcp_retransmit_skb(sk, tcp_rtx_queue_head(sk), 1) > 0) { /* Retransmission failed because of local congestion, * do not backoff. 
*/ @@ -514,7 +517,7 @@ out_reset_timer: * linear-timeout retransmissions into a black hole */ if (sk->sk_state == TCP_ESTABLISHED && - (tp->thin_lto || sysctl_tcp_thin_linear_timeouts) && + (tp->thin_lto || net->ipv4.sysctl_tcp_thin_linear_timeouts) && tcp_stream_is_thin(tp) && icsk->icsk_retransmits <= TCP_THIN_LINEAR_RETRIES) { icsk->icsk_backoff = 0; @@ -570,9 +573,11 @@ out: sk_mem_reclaim(sk); } -static void tcp_write_timer(unsigned long data) +static void tcp_write_timer(struct timer_list *t) { - struct sock *sk = (struct sock *)data; + struct inet_connection_sock *icsk = + from_timer(icsk, t, icsk_retransmit_timer); + struct sock *sk = &icsk->icsk_inet.sk; bh_lock_sock(sk); if (!sock_owned_by_user(sk)) { @@ -607,9 +612,9 @@ void tcp_set_keepalive(struct sock *sk, int val) EXPORT_SYMBOL_GPL(tcp_set_keepalive); -static void tcp_keepalive_timer (unsigned long data) +static void tcp_keepalive_timer (struct timer_list *t) { - struct sock *sk = (struct sock *) data; + struct sock *sk = from_timer(sk, t, sk_timer); struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); u32 elapsed; @@ -647,7 +652,7 @@ static void tcp_keepalive_timer (unsigned long data) elapsed = keepalive_time_when(tp); /* It is alive without keepalive 8) */ - if (tp->packets_out || tcp_send_head(sk)) + if (tp->packets_out || !tcp_write_queue_empty(sk)) goto resched; elapsed = keepalive_time_elapsed(tp); diff --git a/net/ipv4/tcp_vegas.c b/net/ipv4/tcp_vegas.c index 218cfcc77650..ee113ff15fd0 100644 --- a/net/ipv4/tcp_vegas.c +++ b/net/ipv4/tcp_vegas.c @@ -158,7 +158,7 @@ EXPORT_SYMBOL_GPL(tcp_vegas_cwnd_event); static inline u32 tcp_vegas_ssthresh(struct tcp_sock *tp) { - return min(tp->snd_ssthresh, tp->snd_cwnd-1); + return min(tp->snd_ssthresh, tp->snd_cwnd); } static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, u32 acked) diff --git a/net/ipv4/tcp_vegas.h b/net/ipv4/tcp_vegas.h index 248cfc0ff9ae..4f24d0e37d9c 100644 --- a/net/ipv4/tcp_vegas.h +++ 
b/net/ipv4/tcp_vegas.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * TCP Vegas congestion control interface */ diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index ef29df8648e4..e4ff25c947c5 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -231,10 +231,7 @@ static int udp_reuseport_add_sock(struct sock *sk, struct udp_hslot *hslot) } } - /* Initial allocation may have already happened via setsockopt */ - if (!rcu_access_pointer(sk->sk_reuseport_cb)) - return reuseport_alloc(sk); - return 0; + return reuseport_alloc(sk); } /** @@ -1061,7 +1058,7 @@ back_from_confirm: /* ... which is an evident application bug. --ANK */ release_sock(sk); - net_dbg_ratelimited("cork app bug 2\n"); + net_dbg_ratelimited("socket already corked\n"); err = -EINVAL; goto out; } @@ -1144,7 +1141,7 @@ int udp_sendpage(struct sock *sk, struct page *page, int offset, if (unlikely(!up->pending)) { release_sock(sk); - net_dbg_ratelimited("udp cork app bug 3\n"); + net_dbg_ratelimited("cork failed\n"); return -EINVAL; } @@ -1212,8 +1209,7 @@ static void udp_rmem_release(struct sock *sk, int size, int partial, if (likely(partial)) { up->forward_deficit += size; size = up->forward_deficit; - if (size < (sk->sk_rcvbuf >> 2) && - !skb_queue_empty(&up->reader_queue)) + if (size < (sk->sk_rcvbuf >> 2)) return; } else { size += up->forward_deficit; @@ -1856,7 +1852,7 @@ static int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) */ /* if we're overly short, let UDP handle it */ - encap_rcv = ACCESS_ONCE(up->encap_rcv); + encap_rcv = READ_ONCE(up->encap_rcv); if (encap_rcv) { int ret; @@ -2221,9 +2217,10 @@ static struct sock *__udp4_lib_demux_lookup(struct net *net, return NULL; } -void udp_v4_early_demux(struct sk_buff *skb) +int udp_v4_early_demux(struct sk_buff *skb) { struct net *net = dev_net(skb->dev); + struct in_device *in_dev = NULL; const struct iphdr *iph; const struct udphdr *uh; struct sock *sk = NULL; @@ -2234,25 +2231,21 @@ void udp_v4_early_demux(struct sk_buff 
*skb) /* validate the packet */ if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct udphdr))) - return; + return 0; iph = ip_hdr(skb); uh = udp_hdr(skb); - if (skb->pkt_type == PACKET_BROADCAST || - skb->pkt_type == PACKET_MULTICAST) { - struct in_device *in_dev = __in_dev_get_rcu(skb->dev); + if (skb->pkt_type == PACKET_MULTICAST) { + in_dev = __in_dev_get_rcu(skb->dev); if (!in_dev) - return; + return 0; - /* we are supposed to accept bcast packets */ - if (skb->pkt_type == PACKET_MULTICAST) { - ours = ip_check_mc_rcu(in_dev, iph->daddr, iph->saddr, - iph->protocol); - if (!ours) - return; - } + ours = ip_check_mc_rcu(in_dev, iph->daddr, iph->saddr, + iph->protocol); + if (!ours) + return 0; sk = __udp4_lib_mcast_demux_lookup(net, uh->dest, iph->daddr, uh->source, iph->saddr, @@ -2263,7 +2256,7 @@ void udp_v4_early_demux(struct sk_buff *skb) } if (!sk || !refcount_inc_not_zero(&sk->sk_refcnt)) - return; + return 0; skb->sk = sk; skb->destructor = sock_efree; @@ -2272,12 +2265,23 @@ void udp_v4_early_demux(struct sk_buff *skb) if (dst) dst = dst_check(dst, 0); if (dst) { + u32 itag = 0; + /* set noref for now. 
* any place which wants to hold dst has to call * dst_hold_safe() */ skb_dst_set_noref(skb, dst); + + /* for unconnected multicast sockets we need to validate + * the source on each packet + */ + if (!inet_sk(sk)->inet_daddr && in_dev) + return ip_mc_validate_source(skb, iph->daddr, + iph->saddr, iph->tos, + skb->dev, in_dev, &itag); } + return 0; } int udp_rcv(struct sk_buff *skb) @@ -2293,7 +2297,7 @@ void udp_destroy_sock(struct sock *sk) unlock_sock_fast(sk, slow); if (static_key_false(&udp_encap_needed) && up->encap_type) { void (*encap_destroy)(struct sock *sk); - encap_destroy = ACCESS_ONCE(up->encap_destroy); + encap_destroy = READ_ONCE(up->encap_destroy); if (encap_destroy) encap_destroy(sk); } diff --git a/net/ipv4/udp_impl.h b/net/ipv4/udp_impl.h index a8cf8c6fb60c..e7d18b140287 100644 --- a/net/ipv4/udp_impl.h +++ b/net/ipv4/udp_impl.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _UDP4_IMPL_H #define _UDP4_IMPL_H #include <net/udp.h> diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c index 97658bfc1b58..e360d55be555 100644 --- a/net/ipv4/udp_offload.c +++ b/net/ipv4/udp_offload.c @@ -120,7 +120,7 @@ static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb, * will be using a length value equal to only one MSS sized * segment instead of the entire frame. 
*/ - if (gso_partial) { + if (gso_partial && skb_is_gso(skb)) { uh->len = htons(skb_shinfo(skb)->gso_size + SKB_GSO_CB(skb)->data_offset + skb->head - (unsigned char *)uh); diff --git a/net/ipv4/xfrm4_input.c b/net/ipv4/xfrm4_input.c index 1fc684111ce6..e50b7fea57ee 100644 --- a/net/ipv4/xfrm4_input.c +++ b/net/ipv4/xfrm4_input.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * xfrm4_input.c * diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c index d7bf0b041885..05017e2c849c 100644 --- a/net/ipv4/xfrm4_policy.c +++ b/net/ipv4/xfrm4_policy.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * xfrm4_policy.c * diff --git a/net/ipv4/xfrm4_state.c b/net/ipv4/xfrm4_state.c index d6660a8c0ea5..80c40b4981bb 100644 --- a/net/ipv4/xfrm4_state.c +++ b/net/ipv4/xfrm4_state.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * xfrm4_state.c * |