Diffstat (limited to 'net/ipv4')
51 files changed, 1393 insertions, 1794 deletions
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 0dfb72c46671..eab3ebde981e 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -1385,6 +1385,15 @@ out: } EXPORT_SYMBOL(inet_gso_segment); +static struct sk_buff *ipip_gso_segment(struct sk_buff *skb, + netdev_features_t features) +{ + if (!(skb_shinfo(skb)->gso_type & SKB_GSO_IPXIP4)) + return ERR_PTR(-EINVAL); + + return inet_gso_segment(skb, features); +} + INDIRECT_CALLABLE_DECLARE(struct sk_buff *tcp4_gro_receive(struct list_head *, struct sk_buff *)); INDIRECT_CALLABLE_DECLARE(struct sk_buff *udp4_gro_receive(struct list_head *, @@ -1861,7 +1870,7 @@ static struct packet_offload ip_packet_offload __read_mostly = { static const struct net_offload ipip_offload = { .callbacks = { - .gso_segment = inet_gso_segment, + .gso_segment = ipip_gso_segment, .gro_receive = ipip_gro_receive, .gro_complete = ipip_gro_complete, }, diff --git a/net/ipv4/bpfilter/sockopt.c b/net/ipv4/bpfilter/sockopt.c index 5e04ed25bc0e..1e976bb93d99 100644 --- a/net/ipv4/bpfilter/sockopt.c +++ b/net/ipv4/bpfilter/sockopt.c @@ -1,28 +1,54 @@ // SPDX-License-Identifier: GPL-2.0 +#include <linux/init.h> +#include <linux/module.h> #include <linux/uaccess.h> #include <linux/bpfilter.h> #include <uapi/linux/bpf.h> #include <linux/wait.h> #include <linux/kmod.h> +#include <linux/fs.h> +#include <linux/file.h> -int (*bpfilter_process_sockopt)(struct sock *sk, int optname, - char __user *optval, - unsigned int optlen, bool is_set); -EXPORT_SYMBOL_GPL(bpfilter_process_sockopt); +struct bpfilter_umh_ops bpfilter_ops; +EXPORT_SYMBOL_GPL(bpfilter_ops); + +static void bpfilter_umh_cleanup(struct umh_info *info) +{ + mutex_lock(&bpfilter_ops.lock); + bpfilter_ops.stop = true; + fput(info->pipe_to_umh); + fput(info->pipe_from_umh); + info->pid = 0; + mutex_unlock(&bpfilter_ops.lock); +} static int bpfilter_mbox_request(struct sock *sk, int optname, char __user *optval, unsigned int optlen, bool is_set) { - if (!bpfilter_process_sockopt) { - int err = request_module("bpfilter"); + int err; + mutex_lock(&bpfilter_ops.lock); + if (!bpfilter_ops.sockopt) { + mutex_unlock(&bpfilter_ops.lock); + err = request_module("bpfilter"); + mutex_lock(&bpfilter_ops.lock); if (err) - return err; - if (!bpfilter_process_sockopt) - return -ECHILD; + goto out; + if (!bpfilter_ops.sockopt) { + err = -ECHILD; + goto out; + } + } + if (bpfilter_ops.stop) { + err = bpfilter_ops.start(); + if (err) + goto out; } - return bpfilter_process_sockopt(sk, optname, optval, optlen, is_set); + err = bpfilter_ops.sockopt(sk, optname, optval, optlen, is_set); +out: + mutex_unlock(&bpfilter_ops.lock); + return err; } int bpfilter_ip_set_sockopt(struct sock *sk, int optname, char __user *optval, @@ -41,3 +67,15 @@ int bpfilter_ip_get_sockopt(struct sock *sk, int optname, char __user *optval, return bpfilter_mbox_request(sk, optname, optval, len, false); } + +static int __init bpfilter_sockopt_init(void) +{ + mutex_init(&bpfilter_ops.lock); + bpfilter_ops.stop = true; + bpfilter_ops.info.cmdline = "bpfilter_umh"; + bpfilter_ops.info.cleanup = &bpfilter_umh_cleanup; + + return 0; +} + +module_init(bpfilter_sockopt_init); diff --git a/net/ipv4/cipso_ipv4.c b/net/ipv4/cipso_ipv4.c index 777fa3b7fb13..f0165c5f376b 100644 --- a/net/ipv4/cipso_ipv4.c +++ b/net/ipv4/cipso_ipv4.c @@ -667,7 +667,8 @@ static int cipso_v4_map_lvl_valid(const struct cipso_v4_doi *doi_def, u8 level) case CIPSO_V4_MAP_PASS: return 0; case CIPSO_V4_MAP_TRANS: - if (doi_def->map.std->lvl.cipso[level] < CIPSO_V4_INV_LVL) + if ((level < 
doi_def->map.std->lvl.cipso_size) && + (doi_def->map.std->lvl.cipso[level] < CIPSO_V4_INV_LVL)) return 0; break; } @@ -1735,13 +1736,26 @@ validate_return: */ void cipso_v4_error(struct sk_buff *skb, int error, u32 gateway) { + unsigned char optbuf[sizeof(struct ip_options) + 40]; + struct ip_options *opt = (struct ip_options *)optbuf; + if (ip_hdr(skb)->protocol == IPPROTO_ICMP || error != -EACCES) return; + /* + * We might be called above the IP layer, + * so we can not use icmp_send and IPCB here. + */ + + memset(opt, 0, sizeof(struct ip_options)); + opt->optlen = ip_hdr(skb)->ihl*4 - sizeof(struct iphdr); + if (__ip_options_compile(dev_net(skb->dev), opt, skb, NULL)) + return; + if (gateway) - icmp_send(skb, ICMP_DEST_UNREACH, ICMP_NET_ANO, 0); + __icmp_send(skb, ICMP_DEST_UNREACH, ICMP_NET_ANO, 0, opt); else - icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_ANO, 0); + __icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_ANO, 0, opt); } /** diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index 04ba321ae5ce..eb514f312e6f 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -1826,7 +1826,7 @@ put_tgt_net: if (fillargs.netnsid >= 0) put_net(tgt_net); - return err < 0 ? err : skb->len; + return skb->len ? : err; } static void rtmsg_ifa(int event, struct in_ifaddr *ifa, struct nlmsghdr *nlh, @@ -2063,13 +2063,49 @@ static const struct nla_policy devconf_ipv4_policy[NETCONFA_MAX+1] = { [NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN] = { .len = sizeof(int) }, }; +static int inet_netconf_valid_get_req(struct sk_buff *skb, + const struct nlmsghdr *nlh, + struct nlattr **tb, + struct netlink_ext_ack *extack) +{ + int i, err; + + if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(struct netconfmsg))) { + NL_SET_ERR_MSG(extack, "ipv4: Invalid header for netconf get request"); + return -EINVAL; + } + + if (!netlink_strict_get_check(skb)) + return nlmsg_parse(nlh, sizeof(struct netconfmsg), tb, + NETCONFA_MAX, devconf_ipv4_policy, extack); + + err = nlmsg_parse_strict(nlh, sizeof(struct netconfmsg), tb, + NETCONFA_MAX, devconf_ipv4_policy, extack); + if (err) + return err; + + for (i = 0; i <= NETCONFA_MAX; i++) { + if (!tb[i]) + continue; + + switch (i) { + case NETCONFA_IFINDEX: + break; + default: + NL_SET_ERR_MSG(extack, "ipv4: Unsupported attribute in netconf get request"); + return -EINVAL; + } + } + + return 0; +} + static int inet_netconf_get_devconf(struct sk_buff *in_skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net *net = sock_net(in_skb->sk); struct nlattr *tb[NETCONFA_MAX+1]; - struct netconfmsg *ncm; struct sk_buff *skb; struct ipv4_devconf *devconf; struct in_device *in_dev; @@ -2077,9 +2113,8 @@ static int inet_netconf_get_devconf(struct sk_buff *in_skb, int ifindex; int err; - err = nlmsg_parse(nlh, sizeof(*ncm), tb, NETCONFA_MAX, - devconf_ipv4_policy, extack); - if (err < 0) + err = inet_netconf_valid_get_req(in_skb, nlh, tb, extack); + if (err) goto errout; err = -EINVAL; @@ -2556,32 +2591,34 @@ static __net_init int devinet_init_net(struct net *net) int err; struct ipv4_devconf *all, *dflt; #ifdef CONFIG_SYSCTL - struct ctl_table *tbl = ctl_forward_entry; + struct ctl_table *tbl; struct ctl_table_header *forw_hdr; #endif err = -ENOMEM; - all = &ipv4_devconf; - dflt = &ipv4_devconf_dflt; + all = kmemdup(&ipv4_devconf, sizeof(ipv4_devconf), GFP_KERNEL); + if (!all) + goto err_alloc_all; - if (!net_eq(net, &init_net)) { - all = kmemdup(all, sizeof(ipv4_devconf), GFP_KERNEL); - if (!all) - goto err_alloc_all; - - dflt = kmemdup(dflt, sizeof(ipv4_devconf_dflt), GFP_KERNEL); - 
if (!dflt) - goto err_alloc_dflt; + dflt = kmemdup(&ipv4_devconf_dflt, sizeof(ipv4_devconf_dflt), GFP_KERNEL); + if (!dflt) + goto err_alloc_dflt; #ifdef CONFIG_SYSCTL - tbl = kmemdup(tbl, sizeof(ctl_forward_entry), GFP_KERNEL); - if (!tbl) - goto err_alloc_ctl; + tbl = kmemdup(ctl_forward_entry, sizeof(ctl_forward_entry), GFP_KERNEL); + if (!tbl) + goto err_alloc_ctl; - tbl[0].data = &all->data[IPV4_DEVCONF_FORWARDING - 1]; - tbl[0].extra1 = all; - tbl[0].extra2 = net; + tbl[0].data = &all->data[IPV4_DEVCONF_FORWARDING - 1]; + tbl[0].extra1 = all; + tbl[0].extra2 = net; #endif + + if ((!IS_ENABLED(CONFIG_SYSCTL) || + sysctl_devconf_inherit_init_net != 2) && + !net_eq(net, &init_net)) { + memcpy(all, init_net.ipv4.devconf_all, sizeof(ipv4_devconf)); + memcpy(dflt, init_net.ipv4.devconf_dflt, sizeof(ipv4_devconf_dflt)); } #ifdef CONFIG_SYSCTL @@ -2611,15 +2648,12 @@ err_reg_ctl: err_reg_dflt: __devinet_sysctl_unregister(net, all, NETCONFA_IFINDEX_ALL); err_reg_all: - if (tbl != ctl_forward_entry) - kfree(tbl); + kfree(tbl); err_alloc_ctl: #endif - if (dflt != &ipv4_devconf_dflt) - kfree(dflt); + kfree(dflt); err_alloc_dflt: - if (all != &ipv4_devconf) - kfree(all); + kfree(all); err_alloc_all: return err; } diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c index 5459f41fc26f..10e809b296ec 100644 --- a/net/ipv4/esp4.c +++ b/net/ipv4/esp4.c @@ -328,7 +328,7 @@ int esp_output_head(struct xfrm_state *x, struct sk_buff *skb, struct esp_info * skb->len += tailen; skb->data_len += tailen; skb->truesize += tailen; - if (sk) + if (sk && sk_fullsock(sk)) refcount_add(tailen, &sk->sk_wmem_alloc); goto out; diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 6df95be96311..ed14ec245584 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -203,7 +203,7 @@ static void fib_flush(struct net *net) struct fib_table *tb; hlist_for_each_entry_safe(tb, tmp, head, tb_hlist) - flushed += fib_table_flush(net, tb); + flushed += fib_table_flush(net, tb, false); } if (flushed) @@ -710,6 +710,10 @@ static int rtm_to_fib_config(struct net *net, struct sk_buff *skb, case RTA_GATEWAY: cfg->fc_gw = nla_get_be32(attr); break; + case RTA_VIA: + NL_SET_ERR_MSG(extack, "IPv4 does not support RTA_VIA attribute"); + err = -EINVAL; + goto errout; case RTA_PRIORITY: cfg->fc_priority = nla_get_u32(attr); break; @@ -1463,7 +1467,7 @@ static void ip_fib_net_exit(struct net *net) hlist_for_each_entry_safe(tb, tmp, head, tb_hlist) { hlist_del(&tb->tb_hlist); - fib_table_flush(net, tb); + fib_table_flush(net, tb, true); fib_free_table(tb); } } diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index 5022bc63863a..8e185b5a2bf6 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -1072,7 +1072,7 @@ struct fib_info *fib_create_info(struct fib_config *cfg, goto failure; } - fi = kzalloc(sizeof(*fi)+nhs*sizeof(struct fib_nh), GFP_KERNEL); + fi = kzalloc(struct_size(fi, fib_nh, nhs), GFP_KERNEL); if (!fi) goto failure; fi->fib_metrics = ip_fib_metrics_init(fi->fib_net, cfg->fc_mx, diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index 237c9f72b265..a573e37e0615 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -1856,7 +1856,7 @@ void fib_table_flush_external(struct fib_table *tb) } /* Caller must hold RTNL. 
*/ -int fib_table_flush(struct net *net, struct fib_table *tb) +int fib_table_flush(struct net *net, struct fib_table *tb, bool flush_all) { struct trie *t = (struct trie *)tb->tb_data; struct key_vector *pn = t->kv; @@ -1904,8 +1904,17 @@ int fib_table_flush(struct net *net, struct fib_table *tb) hlist_for_each_entry_safe(fa, tmp, &n->leaf, fa_list) { struct fib_info *fi = fa->fa_info; - if (!fi || !(fi->fib_flags & RTNH_F_DEAD) || - tb->tb_id != fa->tb_id) { + if (!fi || tb->tb_id != fa->tb_id || + (!(fi->fib_flags & RTNH_F_DEAD) && + !fib_props[fa->fa_type].error)) { + slen = fa->fa_slen; + continue; + } + + /* Do not flush error routes if network namespace is + * not being dismantled + */ + if (!flush_all && fib_props[fa->fa_type].error) { slen = fa->fa_slen; continue; } diff --git a/net/ipv4/fou.c b/net/ipv4/fou.c index 0c9f171fb085..437070d1ffb1 100644 --- a/net/ipv4/fou.c +++ b/net/ipv4/fou.c @@ -1020,10 +1020,11 @@ static int gue_err(struct sk_buff *skb, u32 info) { int transport_offset = skb_transport_offset(skb); struct guehdr *guehdr; - size_t optlen; + size_t len, optlen; int ret; - if (skb->len < sizeof(struct udphdr) + sizeof(struct guehdr)) + len = sizeof(struct udphdr) + sizeof(struct guehdr); + if (!pskb_may_pull(skb, len)) return -EINVAL; guehdr = (struct guehdr *)&udp_hdr(skb)[1]; @@ -1058,6 +1059,10 @@ static int gue_err(struct sk_buff *skb, u32 info) optlen = guehdr->hlen << 2; + if (!pskb_may_pull(skb, len + optlen)) + return -EINVAL; + + guehdr = (struct guehdr *)&udp_hdr(skb)[1]; if (validate_gue_flags(guehdr, optlen)) return -EINVAL; @@ -1065,7 +1070,8 @@ static int gue_err(struct sk_buff *skb, u32 info) * recursion. Besides, this kind of encapsulation can't even be * configured currently. Discard this. */ - if (guehdr->proto_ctype == IPPROTO_UDP) + if (guehdr->proto_ctype == IPPROTO_UDP || + guehdr->proto_ctype == IPPROTO_UDPLITE) return -EOPNOTSUPP; skb_set_transport_header(skb, -(int)sizeof(struct icmphdr)); diff --git a/net/ipv4/gre_demux.c b/net/ipv4/gre_demux.c index a4bf22ee3aed..7c4a41dc04bb 100644 --- a/net/ipv4/gre_demux.c +++ b/net/ipv4/gre_demux.c @@ -25,6 +25,7 @@ #include <linux/spinlock.h> #include <net/protocol.h> #include <net/gre.h> +#include <net/erspan.h> #include <net/icmp.h> #include <net/route.h> @@ -119,6 +120,22 @@ int gre_parse_header(struct sk_buff *skb, struct tnl_ptk_info *tpi, hdr_len += 4; } tpi->hdr_len = hdr_len; + + /* ERSPAN ver 1 and 2 protocol sets GRE key field + * to 0 and sets the configured key in the + * inner erspan header field + */ + if (greh->protocol == htons(ETH_P_ERSPAN) || + greh->protocol == htons(ETH_P_ERSPAN2)) { + struct erspan_base_hdr *ershdr; + + if (!pskb_may_pull(skb, nhs + hdr_len + sizeof(*ershdr))) + return -EINVAL; + + ershdr = (struct erspan_base_hdr *)options; + tpi->key = cpu_to_be32(get_session_id(ershdr)); + } + return hdr_len; } EXPORT_SYMBOL(gre_parse_header); diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 065997f414e6..f3a5893b1e86 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -570,7 +570,8 @@ relookup_failed: * MUST reply to only the first fragment. 
*/ -void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info) +void __icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info, + const struct ip_options *opt) { struct iphdr *iph; int room; @@ -691,7 +692,7 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info) iph->tos; mark = IP4_REPLY_MARK(net, skb_in->mark); - if (ip_options_echo(net, &icmp_param.replyopts.opt.opt, skb_in)) + if (__ip_options_echo(net, &icmp_param.replyopts.opt.opt, skb_in, opt)) goto out_unlock; @@ -742,7 +743,7 @@ out_bh_enable: local_bh_enable(); out:; } -EXPORT_SYMBOL(icmp_send); +EXPORT_SYMBOL(__icmp_send); static void icmp_socket_deliver(struct sk_buff *skb, u32 info) @@ -1245,9 +1246,7 @@ static int __net_init icmp_sk_init(struct net *net) return 0; fail: - for_each_possible_cpu(i) - inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.icmp_sk, i)); - free_percpu(net->ipv4.icmp_sk); + icmp_sk_exit(net); return err; } diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index 765b2b32c4a4..6c2febc39dca 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -159,7 +159,8 @@ static int unsolicited_report_interval(struct in_device *in_dev) return interval_jiffies; } -static void igmpv3_add_delrec(struct in_device *in_dev, struct ip_mc_list *im); +static void igmpv3_add_delrec(struct in_device *in_dev, struct ip_mc_list *im, + gfp_t gfp); static void igmpv3_del_delrec(struct in_device *in_dev, struct ip_mc_list *im); static void igmpv3_clear_delrec(struct in_device *in_dev); static int sf_setstate(struct ip_mc_list *pmc); @@ -1145,7 +1146,8 @@ static void ip_mc_filter_del(struct in_device *in_dev, __be32 addr) /* * deleted ip_mc_list manipulation */ -static void igmpv3_add_delrec(struct in_device *in_dev, struct ip_mc_list *im) +static void igmpv3_add_delrec(struct in_device *in_dev, struct ip_mc_list *im, + gfp_t gfp) { struct ip_mc_list *pmc; struct net *net = dev_net(in_dev->dev); @@ -1156,7 +1158,7 @@ static void igmpv3_add_delrec(struct in_device *in_dev, struct ip_mc_list *im) * for deleted items allows change reports to use common code with * non-deleted or query-response MCA's. */ - pmc = kzalloc(sizeof(*pmc), GFP_KERNEL); + pmc = kzalloc(sizeof(*pmc), gfp); if (!pmc) return; spin_lock_init(&pmc->lock); @@ -1261,7 +1263,7 @@ static void igmpv3_clear_delrec(struct in_device *in_dev) } #endif -static void igmp_group_dropped(struct ip_mc_list *im) +static void __igmp_group_dropped(struct ip_mc_list *im, gfp_t gfp) { struct in_device *in_dev = im->interface; #ifdef CONFIG_IP_MULTICAST @@ -1292,13 +1294,18 @@ static void igmp_group_dropped(struct ip_mc_list *im) return; } /* IGMPv3 */ - igmpv3_add_delrec(in_dev, im); + igmpv3_add_delrec(in_dev, im, gfp); igmp_ifc_event(in_dev); } #endif } +static void igmp_group_dropped(struct ip_mc_list *im) +{ + __igmp_group_dropped(im, GFP_KERNEL); +} + static void igmp_group_added(struct ip_mc_list *im) { struct in_device *in_dev = im->interface; @@ -1400,8 +1407,8 @@ static void ip_mc_hash_remove(struct in_device *in_dev, /* * A socket has joined a multicast group on device dev. 
*/ -static void __ip_mc_inc_group(struct in_device *in_dev, __be32 addr, - unsigned int mode) +static void ____ip_mc_inc_group(struct in_device *in_dev, __be32 addr, + unsigned int mode, gfp_t gfp) { struct ip_mc_list *im; @@ -1415,7 +1422,7 @@ static void __ip_mc_inc_group(struct in_device *in_dev, __be32 addr, } } - im = kzalloc(sizeof(*im), GFP_KERNEL); + im = kzalloc(sizeof(*im), gfp); if (!im) goto out; @@ -1448,6 +1455,12 @@ out: return; } +void __ip_mc_inc_group(struct in_device *in_dev, __be32 addr, gfp_t gfp) +{ + ____ip_mc_inc_group(in_dev, addr, MCAST_EXCLUDE, gfp); +} +EXPORT_SYMBOL(__ip_mc_inc_group); + void ip_mc_inc_group(struct in_device *in_dev, __be32 addr) { __ip_mc_inc_group(in_dev, addr, MCAST_EXCLUDE); @@ -1493,22 +1506,22 @@ static int ip_mc_check_igmp_reportv3(struct sk_buff *skb) len += sizeof(struct igmpv3_report); - return pskb_may_pull(skb, len) ? 0 : -EINVAL; + return ip_mc_may_pull(skb, len) ? 0 : -EINVAL; } static int ip_mc_check_igmp_query(struct sk_buff *skb) { - unsigned int len = skb_transport_offset(skb); - - len += sizeof(struct igmphdr); - if (skb->len < len) - return -EINVAL; + unsigned int transport_len = ip_transport_len(skb); + unsigned int len; /* IGMPv{1,2}? */ - if (skb->len != len) { + if (transport_len != sizeof(struct igmphdr)) { /* or IGMPv3? */ - len += sizeof(struct igmpv3_query) - sizeof(struct igmphdr); - if (skb->len < len || !pskb_may_pull(skb, len)) + if (transport_len < sizeof(struct igmpv3_query)) + return -EINVAL; + + len = skb_transport_offset(skb) + sizeof(struct igmpv3_query); + if (!ip_mc_may_pull(skb, len)) return -EINVAL; } @@ -1528,7 +1541,6 @@ static int ip_mc_check_igmp_msg(struct sk_buff *skb) case IGMP_HOST_LEAVE_MESSAGE: case IGMP_HOST_MEMBERSHIP_REPORT: case IGMPV2_HOST_MEMBERSHIP_REPORT: - /* fall through */ return 0; case IGMPV3_HOST_MEMBERSHIP_REPORT: return ip_mc_check_igmp_reportv3(skb); @@ -1544,47 +1556,29 @@ static inline __sum16 ip_mc_validate_checksum(struct sk_buff *skb) return skb_checksum_simple_validate(skb); } -static int __ip_mc_check_igmp(struct sk_buff *skb, struct sk_buff **skb_trimmed) - +static int ip_mc_check_igmp_csum(struct sk_buff *skb) { - struct sk_buff *skb_chk; - unsigned int transport_len; unsigned int len = skb_transport_offset(skb) + sizeof(struct igmphdr); - int ret = -EINVAL; + unsigned int transport_len = ip_transport_len(skb); + struct sk_buff *skb_chk; - transport_len = ntohs(ip_hdr(skb)->tot_len) - ip_hdrlen(skb); + if (!ip_mc_may_pull(skb, len)) + return -EINVAL; skb_chk = skb_checksum_trimmed(skb, transport_len, ip_mc_validate_checksum); if (!skb_chk) - goto err; - - if (!pskb_may_pull(skb_chk, len)) - goto err; - - ret = ip_mc_check_igmp_msg(skb_chk); - if (ret) - goto err; - - if (skb_trimmed) - *skb_trimmed = skb_chk; - /* free now unneeded clone */ - else if (skb_chk != skb) - kfree_skb(skb_chk); - - ret = 0; + return -EINVAL; -err: - if (ret && skb_chk && skb_chk != skb) + if (skb_chk != skb) kfree_skb(skb_chk); - return ret; + return 0; } /** * ip_mc_check_igmp - checks whether this is a sane IGMP packet * @skb: the skb to validate - * @skb_trimmed: to store an skb pointer trimmed to IPv4 packet tail (optional) * * Checks whether an IPv4 packet is a valid IGMP packet. If so sets * skb transport header accordingly and returns zero. @@ -1594,18 +1588,10 @@ err: * -ENOMSG: IP header validation succeeded but it is not an IGMP packet. * -ENOMEM: A memory allocation failure happened. 
* - * Optionally, an skb pointer might be provided via skb_trimmed (or set it - * to NULL): After parsing an IGMP packet successfully it will point to - * an skb which has its tail aligned to the IP packet end. This might - * either be the originally provided skb or a trimmed, cloned version if - * the skb frame had data beyond the IP packet. A cloned skb allows us - * to leave the original skb and its full frame unchanged (which might be - * desirable for layer 2 frame jugglers). - * * Caller needs to set the skb network header and free any returned skb if it * differs from the provided skb. */ -int ip_mc_check_igmp(struct sk_buff *skb, struct sk_buff **skb_trimmed) +int ip_mc_check_igmp(struct sk_buff *skb) { int ret = ip_mc_check_iphdr(skb); @@ -1615,7 +1601,11 @@ int ip_mc_check_igmp(struct sk_buff *skb, struct sk_buff **skb_trimmed) if (ip_hdr(skb)->protocol != IPPROTO_IGMP) return -ENOMSG; - return __ip_mc_check_igmp(skb, skb_trimmed); + ret = ip_mc_check_igmp_csum(skb); + if (ret < 0) + return ret; + + return ip_mc_check_igmp_msg(skb); } EXPORT_SYMBOL(ip_mc_check_igmp); @@ -1656,7 +1646,7 @@ static void ip_mc_rejoin_groups(struct in_device *in_dev) * A socket has left a multicast group on device dev */ -void ip_mc_dec_group(struct in_device *in_dev, __be32 addr) +void __ip_mc_dec_group(struct in_device *in_dev, __be32 addr, gfp_t gfp) { struct ip_mc_list *i; struct ip_mc_list __rcu **ip; @@ -1671,7 +1661,7 @@ void ip_mc_dec_group(struct in_device *in_dev, __be32 addr) ip_mc_hash_remove(in_dev, i); *ip = i->next_rcu; in_dev->mc_count--; - igmp_group_dropped(i); + __igmp_group_dropped(i, gfp); ip_mc_clear_src(i); if (!in_dev->dead) @@ -1684,7 +1674,7 @@ void ip_mc_dec_group(struct in_device *in_dev, __be32 addr) } } } -EXPORT_SYMBOL(ip_mc_dec_group); +EXPORT_SYMBOL(__ip_mc_dec_group); /* Device changing type */ diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index 1a4e9ff02762..5731670c560b 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -108,6 +108,7 @@ static size_t inet_sk_attr_size(struct sock *sk, + nla_total_size(1) /* INET_DIAG_TOS */ + nla_total_size(1) /* INET_DIAG_TCLASS */ + nla_total_size(4) /* INET_DIAG_MARK */ + + nla_total_size(4) /* INET_DIAG_CLASS_ID */ + nla_total_size(sizeof(struct inet_diag_meminfo)) + nla_total_size(sizeof(struct inet_diag_msg)) + nla_total_size(SK_MEMINFO_VARS * sizeof(u32)) @@ -287,12 +288,19 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk, goto errout; } - if (ext & (1 << (INET_DIAG_CLASS_ID - 1))) { + if (ext & (1 << (INET_DIAG_CLASS_ID - 1)) || + ext & (1 << (INET_DIAG_TCLASS - 1))) { u32 classid = 0; #ifdef CONFIG_SOCK_CGROUP_DATA classid = sock_cgroup_classid(&sk->sk_cgrp_data); #endif + /* Fallback to socket priority if class id isn't set. + * Classful qdiscs use it as direct reference to class. + * For cgroup2 classid is always zero. + */ + if (!classid) + classid = sk->sk_priority; if (nla_put_u32(skb, INET_DIAG_CLASS_ID, classid)) goto errout; diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c index 760a9e52e02b..737808e27f8b 100644 --- a/net/ipv4/inet_fragment.c +++ b/net/ipv4/inet_fragment.c @@ -25,6 +25,62 @@ #include <net/sock.h> #include <net/inet_frag.h> #include <net/inet_ecn.h> +#include <net/ip.h> +#include <net/ipv6.h> + +/* Use skb->cb to track consecutive/adjacent fragments coming at + * the end of the queue. Nodes in the rb-tree queue will + * contain "runs" of one or more adjacent fragments. 
+ * + * Invariants: + * - next_frag is NULL at the tail of a "run"; + * - the head of a "run" has the sum of all fragment lengths in frag_run_len. + */ +struct ipfrag_skb_cb { + union { + struct inet_skb_parm h4; + struct inet6_skb_parm h6; + }; + struct sk_buff *next_frag; + int frag_run_len; +}; + +#define FRAG_CB(skb) ((struct ipfrag_skb_cb *)((skb)->cb)) + +static void fragcb_clear(struct sk_buff *skb) +{ + RB_CLEAR_NODE(&skb->rbnode); + FRAG_CB(skb)->next_frag = NULL; + FRAG_CB(skb)->frag_run_len = skb->len; +} + +/* Append skb to the last "run". */ +static void fragrun_append_to_last(struct inet_frag_queue *q, + struct sk_buff *skb) +{ + fragcb_clear(skb); + + FRAG_CB(q->last_run_head)->frag_run_len += skb->len; + FRAG_CB(q->fragments_tail)->next_frag = skb; + q->fragments_tail = skb; +} + +/* Create a new "run" with the skb. */ +static void fragrun_create(struct inet_frag_queue *q, struct sk_buff *skb) +{ + BUILD_BUG_ON(sizeof(struct ipfrag_skb_cb) > sizeof(skb->cb)); + fragcb_clear(skb); + + if (q->last_run_head) + rb_link_node(&skb->rbnode, &q->last_run_head->rbnode, + &q->last_run_head->rbnode.rb_right); + else + rb_link_node(&skb->rbnode, NULL, &q->rb_fragments.rb_node); + rb_insert_color(&skb->rbnode, &q->rb_fragments); + + q->fragments_tail = skb; + q->last_run_head = skb; +} /* Given the OR values of all fragments, apply RFC 3168 5.3 requirements * Value : 0xff if frame should be dropped. @@ -123,9 +179,30 @@ static void inet_frag_destroy_rcu(struct rcu_head *head) kmem_cache_free(f->frags_cachep, q); } +unsigned int inet_frag_rbtree_purge(struct rb_root *root) +{ + struct rb_node *p = rb_first(root); + unsigned int sum = 0; + + while (p) { + struct sk_buff *skb = rb_entry(p, struct sk_buff, rbnode); + + p = rb_next(p); + rb_erase(&skb->rbnode, root); + while (skb) { + struct sk_buff *next = FRAG_CB(skb)->next_frag; + + sum += skb->truesize; + kfree_skb(skb); + skb = next; + } + } + return sum; +} +EXPORT_SYMBOL(inet_frag_rbtree_purge); + void inet_frag_destroy(struct inet_frag_queue *q) { - struct sk_buff *fp; struct netns_frags *nf; unsigned int sum, sum_truesize = 0; struct inet_frags *f; @@ -134,20 +211,9 @@ void inet_frag_destroy(struct inet_frag_queue *q) WARN_ON(del_timer(&q->timer) != 0); /* Release all fragment data. */ - fp = q->fragments; nf = q->net; f = nf->f; - if (fp) { - do { - struct sk_buff *xp = fp->next; - - sum_truesize += fp->truesize; - kfree_skb(fp); - fp = xp; - } while (fp); - } else { - sum_truesize = inet_frag_rbtree_purge(&q->rb_fragments); - } + sum_truesize = inet_frag_rbtree_purge(&q->rb_fragments); sum = sum_truesize + f->qsize; call_rcu(&q->rcu, inet_frag_destroy_rcu); @@ -224,3 +290,212 @@ struct inet_frag_queue *inet_frag_find(struct netns_frags *nf, void *key) return fq; } EXPORT_SYMBOL(inet_frag_find); + +int inet_frag_queue_insert(struct inet_frag_queue *q, struct sk_buff *skb, + int offset, int end) +{ + struct sk_buff *last = q->fragments_tail; + + /* RFC5722, Section 4, amended by Errata ID : 3089 + * When reassembling an IPv6 datagram, if + * one or more its constituent fragments is determined to be an + * overlapping fragment, the entire datagram (and any constituent + * fragments) MUST be silently discarded. + * + * Duplicates, however, should be ignored (i.e. skb dropped, but the + * queue/fragments kept for later reassembly). + */ + if (!last) + fragrun_create(q, skb); /* First fragment. */ + else if (last->ip_defrag_offset + last->len < end) { + /* This is the common case: skb goes to the end. 
*/ + /* Detect and discard overlaps. */ + if (offset < last->ip_defrag_offset + last->len) + return IPFRAG_OVERLAP; + if (offset == last->ip_defrag_offset + last->len) + fragrun_append_to_last(q, skb); + else + fragrun_create(q, skb); + } else { + /* Binary search. Note that skb can become the first fragment, + * but not the last (covered above). + */ + struct rb_node **rbn, *parent; + + rbn = &q->rb_fragments.rb_node; + do { + struct sk_buff *curr; + int curr_run_end; + + parent = *rbn; + curr = rb_to_skb(parent); + curr_run_end = curr->ip_defrag_offset + + FRAG_CB(curr)->frag_run_len; + if (end <= curr->ip_defrag_offset) + rbn = &parent->rb_left; + else if (offset >= curr_run_end) + rbn = &parent->rb_right; + else if (offset >= curr->ip_defrag_offset && + end <= curr_run_end) + return IPFRAG_DUP; + else + return IPFRAG_OVERLAP; + } while (*rbn); + /* Here we have parent properly set, and rbn pointing to + * one of its NULL left/right children. Insert skb. + */ + fragcb_clear(skb); + rb_link_node(&skb->rbnode, parent, rbn); + rb_insert_color(&skb->rbnode, &q->rb_fragments); + } + + skb->ip_defrag_offset = offset; + + return IPFRAG_OK; +} +EXPORT_SYMBOL(inet_frag_queue_insert); + +void *inet_frag_reasm_prepare(struct inet_frag_queue *q, struct sk_buff *skb, + struct sk_buff *parent) +{ + struct sk_buff *fp, *head = skb_rb_first(&q->rb_fragments); + struct sk_buff **nextp; + int delta; + + if (head != skb) { + fp = skb_clone(skb, GFP_ATOMIC); + if (!fp) + return NULL; + FRAG_CB(fp)->next_frag = FRAG_CB(skb)->next_frag; + if (RB_EMPTY_NODE(&skb->rbnode)) + FRAG_CB(parent)->next_frag = fp; + else + rb_replace_node(&skb->rbnode, &fp->rbnode, + &q->rb_fragments); + if (q->fragments_tail == skb) + q->fragments_tail = fp; + skb_morph(skb, head); + FRAG_CB(skb)->next_frag = FRAG_CB(head)->next_frag; + rb_replace_node(&head->rbnode, &skb->rbnode, + &q->rb_fragments); + consume_skb(head); + head = skb; + } + WARN_ON(head->ip_defrag_offset != 0); + + delta = -head->truesize; + + /* Head of list must not be cloned. */ + if (skb_unclone(head, GFP_ATOMIC)) + return NULL; + + delta += head->truesize; + if (delta) + add_frag_mem_limit(q->net, delta); + + /* If the first fragment is fragmented itself, we split + * it to two chunks: the first with data and paged part + * and the second, holding only fragments. + */ + if (skb_has_frag_list(head)) { + struct sk_buff *clone; + int i, plen = 0; + + clone = alloc_skb(0, GFP_ATOMIC); + if (!clone) + return NULL; + skb_shinfo(clone)->frag_list = skb_shinfo(head)->frag_list; + skb_frag_list_init(head); + for (i = 0; i < skb_shinfo(head)->nr_frags; i++) + plen += skb_frag_size(&skb_shinfo(head)->frags[i]); + clone->data_len = head->data_len - plen; + clone->len = clone->data_len; + head->truesize += clone->truesize; + clone->csum = 0; + clone->ip_summed = head->ip_summed; + add_frag_mem_limit(q->net, clone->truesize); + skb_shinfo(head)->frag_list = clone; + nextp = &clone->next; + } else { + nextp = &skb_shinfo(head)->frag_list; + } + + return nextp; +} +EXPORT_SYMBOL(inet_frag_reasm_prepare); + +void inet_frag_reasm_finish(struct inet_frag_queue *q, struct sk_buff *head, + void *reasm_data) +{ + struct sk_buff **nextp = (struct sk_buff **)reasm_data; + struct rb_node *rbn; + struct sk_buff *fp; + + skb_push(head, head->data - skb_network_header(head)); + + /* Traverse the tree in order, to build frag_list. 
*/ + fp = FRAG_CB(head)->next_frag; + rbn = rb_next(&head->rbnode); + rb_erase(&head->rbnode, &q->rb_fragments); + while (rbn || fp) { + /* fp points to the next sk_buff in the current run; + * rbn points to the next run. + */ + /* Go through the current run. */ + while (fp) { + *nextp = fp; + nextp = &fp->next; + fp->prev = NULL; + memset(&fp->rbnode, 0, sizeof(fp->rbnode)); + fp->sk = NULL; + head->data_len += fp->len; + head->len += fp->len; + if (head->ip_summed != fp->ip_summed) + head->ip_summed = CHECKSUM_NONE; + else if (head->ip_summed == CHECKSUM_COMPLETE) + head->csum = csum_add(head->csum, fp->csum); + head->truesize += fp->truesize; + fp = FRAG_CB(fp)->next_frag; + } + /* Move to the next run. */ + if (rbn) { + struct rb_node *rbnext = rb_next(rbn); + + fp = rb_to_skb(rbn); + rb_erase(rbn, &q->rb_fragments); + rbn = rbnext; + } + } + sub_frag_mem_limit(q->net, head->truesize); + + *nextp = NULL; + skb_mark_not_on_list(head); + head->prev = NULL; + head->tstamp = q->stamp; +} +EXPORT_SYMBOL(inet_frag_reasm_finish); + +struct sk_buff *inet_frag_pull_head(struct inet_frag_queue *q) +{ + struct sk_buff *head, *skb; + + head = skb_rb_first(&q->rb_fragments); + if (!head) + return NULL; + skb = FRAG_CB(head)->next_frag; + if (skb) + rb_replace_node(&head->rbnode, &skb->rbnode, + &q->rb_fragments); + else + rb_erase(&head->rbnode, &q->rb_fragments); + memset(&head->rbnode, 0, sizeof(head->rbnode)); + barrier(); + + if (head == q->fragments_tail) + q->fragments_tail = NULL; + + sub_frag_mem_limit(q->net, head->truesize); + + return head; +} +EXPORT_SYMBOL(inet_frag_pull_head); diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c index d757b9642d0d..be778599bfed 100644 --- a/net/ipv4/inetpeer.c +++ b/net/ipv4/inetpeer.c @@ -216,6 +216,7 @@ struct inet_peer *inet_getpeer(struct inet_peer_base *base, atomic_set(&p->rid, 0); p->metrics[RTAX_LOCK-1] = INETPEER_METRICS_NEW; p->rate_tokens = 0; + p->n_redirects = 0; /* 60*HZ is arbitrary, but chosen enough high so that the first * calculation of tokens is at its maximum. */ diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index 867be8f7f1fa..cf2b0a6a3337 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -57,57 +57,6 @@ */ static const char ip_frag_cache_name[] = "ip4-frags"; -/* Use skb->cb to track consecutive/adjacent fragments coming at - * the end of the queue. Nodes in the rb-tree queue will - * contain "runs" of one or more adjacent fragments. - * - * Invariants: - * - next_frag is NULL at the tail of a "run"; - * - the head of a "run" has the sum of all fragment lengths in frag_run_len. - */ -struct ipfrag_skb_cb { - struct inet_skb_parm h; - struct sk_buff *next_frag; - int frag_run_len; -}; - -#define FRAG_CB(skb) ((struct ipfrag_skb_cb *)((skb)->cb)) - -static void ip4_frag_init_run(struct sk_buff *skb) -{ - BUILD_BUG_ON(sizeof(struct ipfrag_skb_cb) > sizeof(skb->cb)); - - FRAG_CB(skb)->next_frag = NULL; - FRAG_CB(skb)->frag_run_len = skb->len; -} - -/* Append skb to the last "run". */ -static void ip4_frag_append_to_last_run(struct inet_frag_queue *q, - struct sk_buff *skb) -{ - RB_CLEAR_NODE(&skb->rbnode); - FRAG_CB(skb)->next_frag = NULL; - - FRAG_CB(q->last_run_head)->frag_run_len += skb->len; - FRAG_CB(q->fragments_tail)->next_frag = skb; - q->fragments_tail = skb; -} - -/* Create a new "run" with the skb. 
*/ -static void ip4_frag_create_run(struct inet_frag_queue *q, struct sk_buff *skb) -{ - if (q->last_run_head) - rb_link_node(&skb->rbnode, &q->last_run_head->rbnode, - &q->last_run_head->rbnode.rb_right); - else - rb_link_node(&skb->rbnode, NULL, &q->rb_fragments.rb_node); - rb_insert_color(&skb->rbnode, &q->rb_fragments); - - ip4_frag_init_run(skb); - q->fragments_tail = skb; - q->last_run_head = skb; -} - /* Describe an entry in the "incomplete datagrams" queue. */ struct ipq { struct inet_frag_queue q; @@ -212,27 +161,9 @@ static void ip_expire(struct timer_list *t) * pull the head out of the tree in order to be able to * deal with head->dev. */ - if (qp->q.fragments) { - head = qp->q.fragments; - qp->q.fragments = head->next; - } else { - head = skb_rb_first(&qp->q.rb_fragments); - if (!head) - goto out; - if (FRAG_CB(head)->next_frag) - rb_replace_node(&head->rbnode, - &FRAG_CB(head)->next_frag->rbnode, - &qp->q.rb_fragments); - else - rb_erase(&head->rbnode, &qp->q.rb_fragments); - memset(&head->rbnode, 0, sizeof(head->rbnode)); - barrier(); - } - if (head == qp->q.fragments_tail) - qp->q.fragments_tail = NULL; - - sub_frag_mem_limit(qp->q.net, head->truesize); - + head = inet_frag_pull_head(&qp->q); + if (!head) + goto out; head->dev = dev_get_by_index_rcu(net, qp->iif); if (!head->dev) goto out; @@ -330,7 +261,6 @@ static int ip_frag_reinit(struct ipq *qp) qp->q.flags = 0; qp->q.len = 0; qp->q.meat = 0; - qp->q.fragments = NULL; qp->q.rb_fragments = RB_ROOT; qp->q.fragments_tail = NULL; qp->q.last_run_head = NULL; @@ -344,12 +274,10 @@ static int ip_frag_reinit(struct ipq *qp) static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb) { struct net *net = container_of(qp->q.net, struct net, ipv4.frags); - struct rb_node **rbn, *parent; - struct sk_buff *skb1, *prev_tail; - int ihl, end, skb1_run_end; + int ihl, end, flags, offset; + struct sk_buff *prev_tail; struct net_device *dev; unsigned int fragsize; - int flags, offset; int err = -ENOENT; u8 ecn; @@ -413,62 +341,13 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb) /* Makes sure compiler wont do silly aliasing games */ barrier(); - /* RFC5722, Section 4, amended by Errata ID : 3089 - * When reassembling an IPv6 datagram, if - * one or more its constituent fragments is determined to be an - * overlapping fragment, the entire datagram (and any constituent - * fragments) MUST be silently discarded. - * - * We do the same here for IPv4 (and increment an snmp counter) but - * we do not want to drop the whole queue in response to a duplicate - * fragment. - */ - - err = -EINVAL; - /* Find out where to put this fragment. */ prev_tail = qp->q.fragments_tail; - if (!prev_tail) - ip4_frag_create_run(&qp->q, skb); /* First fragment. */ - else if (prev_tail->ip_defrag_offset + prev_tail->len < end) { - /* This is the common case: skb goes to the end. */ - /* Detect and discard overlaps. */ - if (offset < prev_tail->ip_defrag_offset + prev_tail->len) - goto overlap; - if (offset == prev_tail->ip_defrag_offset + prev_tail->len) - ip4_frag_append_to_last_run(&qp->q, skb); - else - ip4_frag_create_run(&qp->q, skb); - } else { - /* Binary search. Note that skb can become the first fragment, - * but not the last (covered above). 
- */ - rbn = &qp->q.rb_fragments.rb_node; - do { - parent = *rbn; - skb1 = rb_to_skb(parent); - skb1_run_end = skb1->ip_defrag_offset + - FRAG_CB(skb1)->frag_run_len; - if (end <= skb1->ip_defrag_offset) - rbn = &parent->rb_left; - else if (offset >= skb1_run_end) - rbn = &parent->rb_right; - else if (offset >= skb1->ip_defrag_offset && - end <= skb1_run_end) - goto err; /* No new data, potential duplicate */ - else - goto overlap; /* Found an overlap */ - } while (*rbn); - /* Here we have parent properly set, and rbn pointing to - * one of its NULL left/right children. Insert skb. - */ - ip4_frag_init_run(skb); - rb_link_node(&skb->rbnode, parent, rbn); - rb_insert_color(&skb->rbnode, &qp->q.rb_fragments); - } + err = inet_frag_queue_insert(&qp->q, skb, offset, end); + if (err) + goto insert_error; if (dev) qp->iif = dev->ifindex; - skb->ip_defrag_offset = offset; qp->q.stamp = skb->tstamp; qp->q.meat += skb->len; @@ -501,10 +380,16 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb) skb_dst_drop(skb); return -EINPROGRESS; -overlap: +insert_error: + if (err == IPFRAG_DUP) { + kfree_skb(skb); + return -EINVAL; + } + err = -EINVAL; __IP_INC_STATS(net, IPSTATS_MIB_REASM_OVERLAPS); discard_qp: inet_frag_kill(&qp->q); + __IP_INC_STATS(net, IPSTATS_MIB_REASMFAILS); err: kfree_skb(skb); return err; @@ -516,13 +401,8 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *skb, { struct net *net = container_of(qp->q.net, struct net, ipv4.frags); struct iphdr *iph; - struct sk_buff *fp, *head = skb_rb_first(&qp->q.rb_fragments); - struct sk_buff **nextp; /* To build frag_list. */ - struct rb_node *rbn; - int len; - int ihlen; - int delta; - int err; + void *reasm_data; + int len, err; u8 ecn; ipq_kill(qp); @@ -532,117 +412,23 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *skb, err = -EINVAL; goto out_fail; } - /* Make the one we just received the head. */ - if (head != skb) { - fp = skb_clone(skb, GFP_ATOMIC); - if (!fp) - goto out_nomem; - FRAG_CB(fp)->next_frag = FRAG_CB(skb)->next_frag; - if (RB_EMPTY_NODE(&skb->rbnode)) - FRAG_CB(prev_tail)->next_frag = fp; - else - rb_replace_node(&skb->rbnode, &fp->rbnode, - &qp->q.rb_fragments); - if (qp->q.fragments_tail == skb) - qp->q.fragments_tail = fp; - skb_morph(skb, head); - FRAG_CB(skb)->next_frag = FRAG_CB(head)->next_frag; - rb_replace_node(&head->rbnode, &skb->rbnode, - &qp->q.rb_fragments); - consume_skb(head); - head = skb; - } - WARN_ON(head->ip_defrag_offset != 0); - - /* Allocate a new buffer for the datagram. */ - ihlen = ip_hdrlen(head); - len = ihlen + qp->q.len; + /* Make the one we just received the head. */ + reasm_data = inet_frag_reasm_prepare(&qp->q, skb, prev_tail); + if (!reasm_data) + goto out_nomem; + len = ip_hdrlen(skb) + qp->q.len; err = -E2BIG; if (len > 65535) goto out_oversize; - delta = - head->truesize; - - /* Head of list must not be cloned. */ - if (skb_unclone(head, GFP_ATOMIC)) - goto out_nomem; - - delta += head->truesize; - if (delta) - add_frag_mem_limit(qp->q.net, delta); - - /* If the first fragment is fragmented itself, we split - * it to two chunks: the first with data and paged part - * and the second, holding only fragments. 
*/ - if (skb_has_frag_list(head)) { - struct sk_buff *clone; - int i, plen = 0; - - clone = alloc_skb(0, GFP_ATOMIC); - if (!clone) - goto out_nomem; - skb_shinfo(clone)->frag_list = skb_shinfo(head)->frag_list; - skb_frag_list_init(head); - for (i = 0; i < skb_shinfo(head)->nr_frags; i++) - plen += skb_frag_size(&skb_shinfo(head)->frags[i]); - clone->len = clone->data_len = head->data_len - plen; - head->truesize += clone->truesize; - clone->csum = 0; - clone->ip_summed = head->ip_summed; - add_frag_mem_limit(qp->q.net, clone->truesize); - skb_shinfo(head)->frag_list = clone; - nextp = &clone->next; - } else { - nextp = &skb_shinfo(head)->frag_list; - } - - skb_push(head, head->data - skb_network_header(head)); + inet_frag_reasm_finish(&qp->q, skb, reasm_data); - /* Traverse the tree in order, to build frag_list. */ - fp = FRAG_CB(head)->next_frag; - rbn = rb_next(&head->rbnode); - rb_erase(&head->rbnode, &qp->q.rb_fragments); - while (rbn || fp) { - /* fp points to the next sk_buff in the current run; - * rbn points to the next run. - */ - /* Go through the current run. */ - while (fp) { - *nextp = fp; - nextp = &fp->next; - fp->prev = NULL; - memset(&fp->rbnode, 0, sizeof(fp->rbnode)); - fp->sk = NULL; - head->data_len += fp->len; - head->len += fp->len; - if (head->ip_summed != fp->ip_summed) - head->ip_summed = CHECKSUM_NONE; - else if (head->ip_summed == CHECKSUM_COMPLETE) - head->csum = csum_add(head->csum, fp->csum); - head->truesize += fp->truesize; - fp = FRAG_CB(fp)->next_frag; - } - /* Move to the next run. */ - if (rbn) { - struct rb_node *rbnext = rb_next(rbn); - - fp = rb_to_skb(rbn); - rb_erase(rbn, &qp->q.rb_fragments); - rbn = rbnext; - } - } - sub_frag_mem_limit(qp->q.net, head->truesize); - - *nextp = NULL; - skb_mark_not_on_list(head); - head->prev = NULL; - head->dev = dev; - head->tstamp = qp->q.stamp; - IPCB(head)->frag_max_size = max(qp->max_df_size, qp->q.max_size); + skb->dev = dev; + IPCB(skb)->frag_max_size = max(qp->max_df_size, qp->q.max_size); - iph = ip_hdr(head); + iph = ip_hdr(skb); iph->tot_len = htons(len); iph->tos |= ecn; @@ -655,7 +441,7 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *skb, * from one very small df-fragment and one large non-df frag. 
*/ if (qp->max_df_size == qp->q.max_size) { - IPCB(head)->flags |= IPSKB_FRAG_PMTU; + IPCB(skb)->flags |= IPSKB_FRAG_PMTU; iph->frag_off = htons(IP_DF); } else { iph->frag_off = 0; @@ -664,7 +450,6 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *skb, ip_send_check(iph); __IP_INC_STATS(net, IPSTATS_MIB_REASMOKS); - qp->q.fragments = NULL; qp->q.rb_fragments = RB_ROOT; qp->q.fragments_tail = NULL; qp->q.last_run_head = NULL; @@ -753,28 +538,6 @@ struct sk_buff *ip_check_defrag(struct net *net, struct sk_buff *skb, u32 user) } EXPORT_SYMBOL(ip_check_defrag); -unsigned int inet_frag_rbtree_purge(struct rb_root *root) -{ - struct rb_node *p = rb_first(root); - unsigned int sum = 0; - - while (p) { - struct sk_buff *skb = rb_entry(p, struct sk_buff, rbnode); - - p = rb_next(p); - rb_erase(&skb->rbnode, root); - while (skb) { - struct sk_buff *next = FRAG_CB(skb)->next_frag; - - sum += skb->truesize; - kfree_skb(skb); - skb = next; - } - } - return sum; -} -EXPORT_SYMBOL(inet_frag_rbtree_purge); - #ifdef CONFIG_SYSCTL static int dist_min; diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index d1d09f3e5f9e..fd219f7bd3ea 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -268,20 +268,11 @@ static int erspan_rcv(struct sk_buff *skb, struct tnl_ptk_info *tpi, int len; itn = net_generic(net, erspan_net_id); - len = gre_hdr_len + sizeof(*ershdr); - - /* Check based hdr len */ - if (unlikely(!pskb_may_pull(skb, len))) - return PACKET_REJECT; iph = ip_hdr(skb); ershdr = (struct erspan_base_hdr *)(skb->data + gre_hdr_len); ver = ershdr->ver; - /* The original GRE header does not have key field, - * Use ERSPAN 10-bit session ID as key. - */ - tpi->key = cpu_to_be32(get_session_id(ershdr)); tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, tpi->flags | TUNNEL_KEY, iph->saddr, iph->daddr, tpi->key); @@ -458,81 +449,14 @@ static int gre_handle_offloads(struct sk_buff *skb, bool csum) return iptunnel_handle_offloads(skb, csum ? 
SKB_GSO_GRE_CSUM : SKB_GSO_GRE); } -static struct rtable *gre_get_rt(struct sk_buff *skb, - struct net_device *dev, - struct flowi4 *fl, - const struct ip_tunnel_key *key) -{ - struct net *net = dev_net(dev); - - memset(fl, 0, sizeof(*fl)); - fl->daddr = key->u.ipv4.dst; - fl->saddr = key->u.ipv4.src; - fl->flowi4_tos = RT_TOS(key->tos); - fl->flowi4_mark = skb->mark; - fl->flowi4_proto = IPPROTO_GRE; - - return ip_route_output_key(net, fl); -} - -static struct rtable *prepare_fb_xmit(struct sk_buff *skb, - struct net_device *dev, - struct flowi4 *fl, - int tunnel_hlen) -{ - struct ip_tunnel_info *tun_info; - const struct ip_tunnel_key *key; - struct rtable *rt = NULL; - int min_headroom; - bool use_cache; - int err; - - tun_info = skb_tunnel_info(skb); - key = &tun_info->key; - use_cache = ip_tunnel_dst_cache_usable(skb, tun_info); - - if (use_cache) - rt = dst_cache_get_ip4(&tun_info->dst_cache, &fl->saddr); - if (!rt) { - rt = gre_get_rt(skb, dev, fl, key); - if (IS_ERR(rt)) - goto err_free_skb; - if (use_cache) - dst_cache_set_ip4(&tun_info->dst_cache, &rt->dst, - fl->saddr); - } - - min_headroom = LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len - + tunnel_hlen + sizeof(struct iphdr); - if (skb_headroom(skb) < min_headroom || skb_header_cloned(skb)) { - int head_delta = SKB_DATA_ALIGN(min_headroom - - skb_headroom(skb) + - 16); - err = pskb_expand_head(skb, max_t(int, head_delta, 0), - 0, GFP_ATOMIC); - if (unlikely(err)) - goto err_free_rt; - } - return rt; - -err_free_rt: - ip_rt_put(rt); -err_free_skb: - kfree_skb(skb); - dev->stats.tx_dropped++; - return NULL; -} - static void gre_fb_xmit(struct sk_buff *skb, struct net_device *dev, __be16 proto) { struct ip_tunnel *tunnel = netdev_priv(dev); struct ip_tunnel_info *tun_info; const struct ip_tunnel_key *key; - struct rtable *rt = NULL; - struct flowi4 fl; int tunnel_hlen; - __be16 df, flags; + __be16 flags; tun_info = skb_tunnel_info(skb); if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX) || @@ -542,13 +466,12 @@ static void gre_fb_xmit(struct sk_buff *skb, struct net_device *dev, key = &tun_info->key; tunnel_hlen = gre_calc_hlen(key->tun_flags); - rt = prepare_fb_xmit(skb, dev, &fl, tunnel_hlen); - if (!rt) - return; + if (skb_cow_head(skb, dev->needed_headroom)) + goto err_free_skb; /* Push Tunnel header. */ if (gre_handle_offloads(skb, !!(tun_info->key.tun_flags & TUNNEL_CSUM))) - goto err_free_rt; + goto err_free_skb; flags = tun_info->key.tun_flags & (TUNNEL_CSUM | TUNNEL_KEY | TUNNEL_SEQ); @@ -556,32 +479,25 @@ static void gre_fb_xmit(struct sk_buff *skb, struct net_device *dev, tunnel_id_to_key32(tun_info->key.tun_id), (flags & TUNNEL_SEQ) ? htonl(tunnel->o_seqno++) : 0); - df = key->tun_flags & TUNNEL_DONT_FRAGMENT ? 
htons(IP_DF) : 0; + ip_md_tunnel_xmit(skb, dev, IPPROTO_GRE, tunnel_hlen); - iptunnel_xmit(skb->sk, rt, skb, fl.saddr, key->u.ipv4.dst, IPPROTO_GRE, - key->tos, key->ttl, df, false); return; -err_free_rt: - ip_rt_put(rt); err_free_skb: kfree_skb(skb); dev->stats.tx_dropped++; } -static void erspan_fb_xmit(struct sk_buff *skb, struct net_device *dev, - __be16 proto) +static void erspan_fb_xmit(struct sk_buff *skb, struct net_device *dev) { struct ip_tunnel *tunnel = netdev_priv(dev); struct ip_tunnel_info *tun_info; const struct ip_tunnel_key *key; struct erspan_metadata *md; - struct rtable *rt = NULL; bool truncate = false; - struct flowi4 fl; + __be16 proto; int tunnel_hlen; int version; - __be16 df; int nhoff; int thoff; @@ -592,21 +508,20 @@ static void erspan_fb_xmit(struct sk_buff *skb, struct net_device *dev, key = &tun_info->key; if (!(tun_info->key.tun_flags & TUNNEL_ERSPAN_OPT)) - goto err_free_rt; + goto err_free_skb; md = ip_tunnel_info_opts(tun_info); if (!md) - goto err_free_rt; + goto err_free_skb; /* ERSPAN has fixed 8 byte GRE header */ version = md->version; tunnel_hlen = 8 + erspan_hdr_len(version); - rt = prepare_fb_xmit(skb, dev, &fl, tunnel_hlen); - if (!rt) - return; + if (skb_cow_head(skb, dev->needed_headroom)) + goto err_free_skb; if (gre_handle_offloads(skb, false)) - goto err_free_rt; + goto err_free_skb; if (skb->len > dev->mtu + dev->hard_header_len) { pskb_trim(skb, dev->mtu + dev->hard_header_len); @@ -626,27 +541,25 @@ static void erspan_fb_xmit(struct sk_buff *skb, struct net_device *dev, if (version == 1) { erspan_build_header(skb, ntohl(tunnel_id_to_key32(key->tun_id)), ntohl(md->u.index), truncate, true); + proto = htons(ETH_P_ERSPAN); } else if (version == 2) { erspan_build_header_v2(skb, ntohl(tunnel_id_to_key32(key->tun_id)), md->u.md2.dir, get_hwid(&md->u.md2), truncate, true); + proto = htons(ETH_P_ERSPAN2); } else { - goto err_free_rt; + goto err_free_skb; } gre_build_header(skb, 8, TUNNEL_SEQ, - htons(ETH_P_ERSPAN), 0, htonl(tunnel->o_seqno++)); + proto, 0, htonl(tunnel->o_seqno++)); - df = key->tun_flags & TUNNEL_DONT_FRAGMENT ? 
htons(IP_DF) : 0; + ip_md_tunnel_xmit(skb, dev, IPPROTO_GRE, tunnel_hlen); - iptunnel_xmit(skb->sk, rt, skb, fl.saddr, key->u.ipv4.dst, IPPROTO_GRE, - key->tos, key->ttl, df, false); return; -err_free_rt: - ip_rt_put(rt); err_free_skb: kfree_skb(skb); dev->stats.tx_dropped++; @@ -655,13 +568,18 @@ err_free_skb: static int gre_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb) { struct ip_tunnel_info *info = skb_tunnel_info(skb); + const struct ip_tunnel_key *key; struct rtable *rt; struct flowi4 fl4; if (ip_tunnel_info_af(info) != AF_INET) return -EINVAL; - rt = gre_get_rt(skb, dev, &fl4, &info->key); + key = &info->key; + ip_tunnel_init_flow(&fl4, IPPROTO_GRE, key->u.ipv4.dst, key->u.ipv4.src, + tunnel_id_to_key32(key->tun_id), key->tos, 0, + skb->mark, skb_get_hash(skb)); + rt = ip_route_output_key(dev_net(dev), &fl4); if (IS_ERR(rt)) return PTR_ERR(rt); @@ -721,12 +639,13 @@ static netdev_tx_t erspan_xmit(struct sk_buff *skb, { struct ip_tunnel *tunnel = netdev_priv(dev); bool truncate = false; + __be16 proto; if (!pskb_inet_may_pull(skb)) goto free_skb; if (tunnel->collect_md) { - erspan_fb_xmit(skb, dev, skb->protocol); + erspan_fb_xmit(skb, dev); return NETDEV_TX_OK; } @@ -742,19 +661,22 @@ static netdev_tx_t erspan_xmit(struct sk_buff *skb, } /* Push ERSPAN header */ - if (tunnel->erspan_ver == 1) + if (tunnel->erspan_ver == 1) { erspan_build_header(skb, ntohl(tunnel->parms.o_key), tunnel->index, truncate, true); - else if (tunnel->erspan_ver == 2) + proto = htons(ETH_P_ERSPAN); + } else if (tunnel->erspan_ver == 2) { erspan_build_header_v2(skb, ntohl(tunnel->parms.o_key), tunnel->dir, tunnel->hwid, truncate, true); - else + proto = htons(ETH_P_ERSPAN2); + } else { goto free_skb; + } tunnel->parms.o_flags &= ~TUNNEL_KEY; - __gre_xmit(skb, dev, &tunnel->parms.iph, htons(ETH_P_ERSPAN)); + __gre_xmit(skb, dev, &tunnel->parms.iph, proto); return NETDEV_TX_OK; free_skb: @@ -1459,12 +1381,31 @@ static int ipgre_fill_info(struct sk_buff *skb, const struct net_device *dev) { struct ip_tunnel *t = netdev_priv(dev); struct ip_tunnel_parm *p = &t->parms; + __be16 o_flags = p->o_flags; + + if (t->erspan_ver == 1 || t->erspan_ver == 2) { + if (!t->collect_md) + o_flags |= TUNNEL_KEY; + + if (nla_put_u8(skb, IFLA_GRE_ERSPAN_VER, t->erspan_ver)) + goto nla_put_failure; + + if (t->erspan_ver == 1) { + if (nla_put_u32(skb, IFLA_GRE_ERSPAN_INDEX, t->index)) + goto nla_put_failure; + } else { + if (nla_put_u8(skb, IFLA_GRE_ERSPAN_DIR, t->dir)) + goto nla_put_failure; + if (nla_put_u16(skb, IFLA_GRE_ERSPAN_HWID, t->hwid)) + goto nla_put_failure; + } + } if (nla_put_u32(skb, IFLA_GRE_LINK, p->link) || nla_put_be16(skb, IFLA_GRE_IFLAGS, gre_tnl_flags_to_gre_flags(p->i_flags)) || nla_put_be16(skb, IFLA_GRE_OFLAGS, - gre_tnl_flags_to_gre_flags(p->o_flags)) || + gre_tnl_flags_to_gre_flags(o_flags)) || nla_put_be32(skb, IFLA_GRE_IKEY, p->i_key) || nla_put_be32(skb, IFLA_GRE_OKEY, p->o_key) || nla_put_in_addr(skb, IFLA_GRE_LOCAL, p->iph.saddr) || @@ -1494,19 +1435,6 @@ static int ipgre_fill_info(struct sk_buff *skb, const struct net_device *dev) goto nla_put_failure; } - if (nla_put_u8(skb, IFLA_GRE_ERSPAN_VER, t->erspan_ver)) - goto nla_put_failure; - - if (t->erspan_ver == 1) { - if (nla_put_u32(skb, IFLA_GRE_ERSPAN_INDEX, t->index)) - goto nla_put_failure; - } else if (t->erspan_ver == 2) { - if (nla_put_u8(skb, IFLA_GRE_ERSPAN_DIR, t->dir)) - goto nla_put_failure; - if (nla_put_u16(skb, IFLA_GRE_ERSPAN_HWID, t->hwid)) - goto nla_put_failure; - } - return 0; nla_put_failure: diff --git 
a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index 26921f6b3b92..ecce2dc78f17 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -307,11 +307,10 @@ drop: } static int ip_rcv_finish_core(struct net *net, struct sock *sk, - struct sk_buff *skb) + struct sk_buff *skb, struct net_device *dev) { const struct iphdr *iph = ip_hdr(skb); int (*edemux)(struct sk_buff *skb); - struct net_device *dev = skb->dev; struct rtable *rt; int err; @@ -400,6 +399,7 @@ drop_error: static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb) { + struct net_device *dev = skb->dev; int ret; /* if ingress device is enslaved to an L3 master device pass the @@ -409,7 +409,7 @@ static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb) if (!skb) return NET_RX_SUCCESS; - ret = ip_rcv_finish_core(net, sk, skb); + ret = ip_rcv_finish_core(net, sk, skb, dev); if (ret != NET_RX_DROP) ret = dst_input(skb); return ret; @@ -429,7 +429,6 @@ static struct sk_buff *ip_rcv_core(struct sk_buff *skb, struct net *net) if (skb->pkt_type == PACKET_OTHERHOST) goto drop; - __IP_UPD_PO_STATS(net, IPSTATS_MIB_IN, skb->len); skb = skb_share_check(skb, GFP_ATOMIC); @@ -488,6 +487,7 @@ static struct sk_buff *ip_rcv_core(struct sk_buff *skb, struct net *net) goto drop; } + iph = ip_hdr(skb); skb->transport_header = skb->network_header + iph->ihl*4; /* Remove any debris in the socket control block */ @@ -520,6 +520,7 @@ int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, skb = ip_rcv_core(skb, net); if (skb == NULL) return NET_RX_DROP; + return NF_HOOK(NFPROTO_IPV4, NF_INET_PRE_ROUTING, net, NULL, skb, dev, NULL, ip_rcv_finish); @@ -544,6 +545,7 @@ static void ip_list_rcv_finish(struct net *net, struct sock *sk, INIT_LIST_HEAD(&sublist); list_for_each_entry_safe(skb, next, head, list) { + struct net_device *dev = skb->dev; struct dst_entry *dst; skb_list_del_init(skb); @@ -553,7 +555,7 @@ static void ip_list_rcv_finish(struct net *net, struct sock *sk, skb = l3mdev_ip_rcv(skb); if (!skb) continue; - if (ip_rcv_finish_core(net, sk, skb) == NET_RX_DROP) + if (ip_rcv_finish_core(net, sk, skb, dev) == NET_RX_DROP) continue; dst = skb_dst(skb); diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c index ed194d46c00e..32a35043c9f5 100644 --- a/net/ipv4/ip_options.c +++ b/net/ipv4/ip_options.c @@ -251,8 +251,9 @@ static void spec_dst_fill(__be32 *spec_dst, struct sk_buff *skb) * If opt == NULL, then skb->data should point to IP header. 
*/ -int ip_options_compile(struct net *net, - struct ip_options *opt, struct sk_buff *skb) +int __ip_options_compile(struct net *net, + struct ip_options *opt, struct sk_buff *skb, + __be32 *info) { __be32 spec_dst = htonl(INADDR_ANY); unsigned char *pp_ptr = NULL; @@ -468,11 +469,22 @@ eol: return 0; error: - if (skb) { - icmp_send(skb, ICMP_PARAMETERPROB, 0, htonl((pp_ptr-iph)<<24)); - } + if (info) + *info = htonl((pp_ptr-iph)<<24); return -EINVAL; } + +int ip_options_compile(struct net *net, + struct ip_options *opt, struct sk_buff *skb) +{ + int ret; + __be32 info; + + ret = __ip_options_compile(net, opt, skb, &info); + if (ret != 0 && skb) + icmp_send(skb, ICMP_PARAMETERPROB, 0, info); + return ret; +} EXPORT_SYMBOL(ip_options_compile); /* diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index fffcc130900e..82f341e84fae 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -148,19 +148,17 @@ static void ip_cmsg_recv_security(struct msghdr *msg, struct sk_buff *skb) static void ip_cmsg_recv_dstaddr(struct msghdr *msg, struct sk_buff *skb) { + __be16 _ports[2], *ports; struct sockaddr_in sin; - __be16 *ports; - int end; - - end = skb_transport_offset(skb) + 4; - if (end > 0 && !pskb_may_pull(skb, end)) - return; /* All current transport protocols have the port numbers in the * first four bytes of the transport header and this function is * written with this assumption in mind. */ - ports = (__be16 *)skb_transport_header(skb); + ports = skb_header_pointer(skb, skb_transport_offset(skb), + sizeof(_ports), &_ports); + if (!ports) + return; sin.sin_family = AF_INET; sin.sin_addr.s_addr = ip_hdr(skb)->daddr; diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c index c4f5602308ed..2756fb725bf0 100644 --- a/net/ipv4/ip_tunnel.c +++ b/net/ipv4/ip_tunnel.c @@ -310,7 +310,7 @@ static int ip_tunnel_bind_dev(struct net_device *dev) ip_tunnel_init_flow(&fl4, iph->protocol, iph->daddr, iph->saddr, tunnel->parms.o_key, RT_TOS(iph->tos), tunnel->parms.link, - tunnel->fwmark); + tunnel->fwmark, 0); rt = ip_route_output_key(tunnel->net, &fl4); if (!IS_ERR(rt)) { @@ -501,15 +501,19 @@ EXPORT_SYMBOL_GPL(ip_tunnel_encap_setup); static int tnl_update_pmtu(struct net_device *dev, struct sk_buff *skb, struct rtable *rt, __be16 df, - const struct iphdr *inner_iph) + const struct iphdr *inner_iph, + int tunnel_hlen, __be32 dst, bool md) { struct ip_tunnel *tunnel = netdev_priv(dev); - int pkt_size = skb->len - tunnel->hlen - dev->hard_header_len; + int pkt_size; int mtu; + tunnel_hlen = md ? tunnel_hlen : tunnel->hlen; + pkt_size = skb->len - tunnel_hlen - dev->hard_header_len; + if (df) mtu = dst_mtu(&rt->dst) - dev->hard_header_len - - sizeof(struct iphdr) - tunnel->hlen; + - sizeof(struct iphdr) - tunnel_hlen; else mtu = skb_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu; @@ -527,11 +531,13 @@ static int tnl_update_pmtu(struct net_device *dev, struct sk_buff *skb, #if IS_ENABLED(CONFIG_IPV6) else if (skb->protocol == htons(ETH_P_IPV6)) { struct rt6_info *rt6 = (struct rt6_info *)skb_dst(skb); + __be32 daddr; + + daddr = md ? 
dst : tunnel->parms.iph.daddr; if (rt6 && mtu < dst_mtu(skb_dst(skb)) && mtu >= IPV6_MIN_MTU) { - if ((tunnel->parms.iph.daddr && - !ipv4_is_multicast(tunnel->parms.iph.daddr)) || + if ((daddr && !ipv4_is_multicast(daddr)) || rt6->rt6i_dst.plen == 128) { rt6->rt6i_flags |= RTF_MODIFIED; dst_metric_set(skb_dst(skb), RTAX_MTU, mtu); @@ -548,17 +554,19 @@ static int tnl_update_pmtu(struct net_device *dev, struct sk_buff *skb, return 0; } -void ip_md_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, u8 proto) +void ip_md_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, + u8 proto, int tunnel_hlen) { struct ip_tunnel *tunnel = netdev_priv(dev); u32 headroom = sizeof(struct iphdr); struct ip_tunnel_info *tun_info; const struct ip_tunnel_key *key; const struct iphdr *inner_iph; - struct rtable *rt; + struct rtable *rt = NULL; struct flowi4 fl4; __be16 df = 0; u8 tos, ttl; + bool use_cache; tun_info = skb_tunnel_info(skb); if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX) || @@ -574,20 +582,39 @@ void ip_md_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, u8 proto) else if (skb->protocol == htons(ETH_P_IPV6)) tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph); } - ip_tunnel_init_flow(&fl4, proto, key->u.ipv4.dst, key->u.ipv4.src, 0, - RT_TOS(tos), tunnel->parms.link, tunnel->fwmark); + ip_tunnel_init_flow(&fl4, proto, key->u.ipv4.dst, key->u.ipv4.src, + tunnel_id_to_key32(key->tun_id), RT_TOS(tos), + 0, skb->mark, skb_get_hash(skb)); if (tunnel->encap.type != TUNNEL_ENCAP_NONE) goto tx_error; - rt = ip_route_output_key(tunnel->net, &fl4); - if (IS_ERR(rt)) { - dev->stats.tx_carrier_errors++; - goto tx_error; + + use_cache = ip_tunnel_dst_cache_usable(skb, tun_info); + if (use_cache) + rt = dst_cache_get_ip4(&tun_info->dst_cache, &fl4.saddr); + if (!rt) { + rt = ip_route_output_key(tunnel->net, &fl4); + if (IS_ERR(rt)) { + dev->stats.tx_carrier_errors++; + goto tx_error; + } + if (use_cache) + dst_cache_set_ip4(&tun_info->dst_cache, &rt->dst, + fl4.saddr); } if (rt->dst.dev == dev) { ip_rt_put(rt); dev->stats.collisions++; goto tx_error; } + + if (key->tun_flags & TUNNEL_DONT_FRAGMENT) + df = htons(IP_DF); + if (tnl_update_pmtu(dev, skb, rt, df, inner_iph, tunnel_hlen, + key->u.ipv4.dst, true)) { + ip_rt_put(rt); + goto tx_error; + } + tos = ip_tunnel_ecn_encap(tos, inner_iph, skb); ttl = key->ttl; if (ttl == 0) { @@ -598,10 +625,10 @@ void ip_md_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, u8 proto) else ttl = ip4_dst_hoplimit(&rt->dst); } - if (key->tun_flags & TUNNEL_DONT_FRAGMENT) - df = htons(IP_DF); - else if (skb->protocol == htons(ETH_P_IP)) + + if (!df && skb->protocol == htons(ETH_P_IP)) df = inner_iph->frag_off & htons(IP_DF); + headroom += LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len; if (headroom > dev->needed_headroom) dev->needed_headroom = headroom; @@ -627,14 +654,17 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, const struct iphdr *tnl_params, u8 protocol) { struct ip_tunnel *tunnel = netdev_priv(dev); + struct ip_tunnel_info *tun_info = NULL; const struct iphdr *inner_iph; - struct flowi4 fl4; - u8 tos, ttl; - __be16 df; - struct rtable *rt; /* Route to the other host */ unsigned int max_headroom; /* The extra header space needed */ - __be32 dst; + struct rtable *rt = NULL; /* Route to the other host */ + bool use_cache = false; + struct flowi4 fl4; + bool md = false; bool connected; + u8 tos, ttl; + __be32 dst; + __be16 df; inner_iph = (const struct iphdr *)skb_inner_network_header(skb); 
connected = (tunnel->parms.iph.daddr != 0); @@ -650,7 +680,15 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, goto tx_error; } - if (skb->protocol == htons(ETH_P_IP)) { + tun_info = skb_tunnel_info(skb); + if (tun_info && (tun_info->mode & IP_TUNNEL_INFO_TX) && + ip_tunnel_info_af(tun_info) == AF_INET && + tun_info->key.u.ipv4.dst) { + dst = tun_info->key.u.ipv4.dst; + md = true; + connected = true; + } + else if (skb->protocol == htons(ETH_P_IP)) { rt = skb_rtable(skb); dst = rt_nexthop(rt, inner_iph->daddr); } @@ -688,7 +726,8 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, else goto tx_error; - connected = false; + if (!md) + connected = false; } tos = tnl_params->tos; @@ -705,13 +744,20 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, ip_tunnel_init_flow(&fl4, protocol, dst, tnl_params->saddr, tunnel->parms.o_key, RT_TOS(tos), tunnel->parms.link, - tunnel->fwmark); + tunnel->fwmark, skb_get_hash(skb)); if (ip_tunnel_encap(skb, tunnel, &protocol, &fl4) < 0) goto tx_error; - rt = connected ? dst_cache_get_ip4(&tunnel->dst_cache, &fl4.saddr) : - NULL; + if (connected && md) { + use_cache = ip_tunnel_dst_cache_usable(skb, tun_info); + if (use_cache) + rt = dst_cache_get_ip4(&tun_info->dst_cache, + &fl4.saddr); + } else { + rt = connected ? dst_cache_get_ip4(&tunnel->dst_cache, + &fl4.saddr) : NULL; + } if (!rt) { rt = ip_route_output_key(tunnel->net, &fl4); @@ -720,7 +766,10 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, dev->stats.tx_carrier_errors++; goto tx_error; } - if (connected) + if (use_cache) + dst_cache_set_ip4(&tun_info->dst_cache, &rt->dst, + fl4.saddr); + else if (!md && connected) dst_cache_set_ip4(&tunnel->dst_cache, &rt->dst, fl4.saddr); } @@ -731,7 +780,8 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, goto tx_error; } - if (tnl_update_pmtu(dev, skb, rt, tnl_params->frag_off, inner_iph)) { + if (tnl_update_pmtu(dev, skb, rt, tnl_params->frag_off, inner_iph, + 0, 0, false)) { ip_rt_put(rt); goto tx_error; } diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c index 9a0e67b52a4e..c3f3d28d1087 100644 --- a/net/ipv4/ip_tunnel_core.c +++ b/net/ipv4/ip_tunnel_core.c @@ -252,6 +252,14 @@ static int ip_tun_build_state(struct nlattr *attr, tun_info = lwt_tun_info(new_state); +#ifdef CONFIG_DST_CACHE + err = dst_cache_init(&tun_info->dst_cache, GFP_KERNEL); + if (err) { + lwtstate_free(new_state); + return err; + } +#endif + if (tb[LWTUNNEL_IP_ID]) tun_info->key.tun_id = nla_get_be64(tb[LWTUNNEL_IP_ID]); @@ -278,6 +286,15 @@ static int ip_tun_build_state(struct nlattr *attr, return 0; } +static void ip_tun_destroy_state(struct lwtunnel_state *lwtstate) +{ +#ifdef CONFIG_DST_CACHE + struct ip_tunnel_info *tun_info = lwt_tun_info(lwtstate); + + dst_cache_destroy(&tun_info->dst_cache); +#endif +} + static int ip_tun_fill_encap_info(struct sk_buff *skb, struct lwtunnel_state *lwtstate) { @@ -313,6 +330,7 @@ static int ip_tun_cmp_encap(struct lwtunnel_state *a, struct lwtunnel_state *b) static const struct lwtunnel_encap_ops ip_tun_lwt_ops = { .build_state = ip_tun_build_state, + .destroy_state = ip_tun_destroy_state, .fill_encap = ip_tun_fill_encap_info, .get_encap_size = ip_tun_encap_nlsize, .cmp_encap = ip_tun_cmp_encap, diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c index d7b43e700023..68a21bf75dd0 100644 --- a/net/ipv4/ip_vti.c +++ b/net/ipv4/ip_vti.c @@ -74,6 +74,33 @@ drop: return 0; } +static int vti_input_ipip(struct sk_buff *skb, int nexthdr, __be32 spi, + int 
encap_type) +{ + struct ip_tunnel *tunnel; + const struct iphdr *iph = ip_hdr(skb); + struct net *net = dev_net(skb->dev); + struct ip_tunnel_net *itn = net_generic(net, vti_net_id); + + tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, TUNNEL_NO_KEY, + iph->saddr, iph->daddr, 0); + if (tunnel) { + if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) + goto drop; + + XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4 = tunnel; + + skb->dev = tunnel->dev; + + return xfrm_input(skb, nexthdr, spi, encap_type); + } + + return -EINVAL; +drop: + kfree_skb(skb); + return 0; +} + static int vti_rcv(struct sk_buff *skb) { XFRM_SPI_SKB_CB(skb)->family = AF_INET; @@ -82,6 +109,14 @@ static int vti_rcv(struct sk_buff *skb) return vti_input(skb, ip_hdr(skb)->protocol, 0, 0); } +static int vti_rcv_ipip(struct sk_buff *skb) +{ + XFRM_SPI_SKB_CB(skb)->family = AF_INET; + XFRM_SPI_SKB_CB(skb)->daddroff = offsetof(struct iphdr, daddr); + + return vti_input_ipip(skb, ip_hdr(skb)->protocol, ip_hdr(skb)->saddr, 0); +} + static int vti_rcv_cb(struct sk_buff *skb, int err) { unsigned short family; @@ -435,6 +470,12 @@ static struct xfrm4_protocol vti_ipcomp4_protocol __read_mostly = { .priority = 100, }; +static struct xfrm_tunnel ipip_handler __read_mostly = { + .handler = vti_rcv_ipip, + .err_handler = vti4_err, + .priority = 0, +}; + static int __net_init vti_init_net(struct net *net) { int err; @@ -603,6 +644,13 @@ static int __init vti_init(void) if (err < 0) goto xfrm_proto_comp_failed; + msg = "ipip tunnel"; + err = xfrm4_tunnel_register(&ipip_handler, AF_INET); + if (err < 0) { + pr_info("%s: can't register tunnel\n", __func__); + goto xfrm_tunnel_failed; + } + msg = "netlink interface"; err = rtnl_link_register(&vti_link_ops); if (err < 0) @@ -612,6 +660,8 @@ static int __init vti_init(void) rtnl_link_failed: xfrm4_protocol_deregister(&vti_ipcomp4_protocol, IPPROTO_COMP); +xfrm_tunnel_failed: + xfrm4_tunnel_deregister(&ipip_handler, AF_INET); xfrm_proto_comp_failed: xfrm4_protocol_deregister(&vti_ah4_protocol, IPPROTO_AH); xfrm_proto_ah_failed: diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c index b9a9873c25c6..9bcca08efec9 100644 --- a/net/ipv4/ipconfig.c +++ b/net/ipv4/ipconfig.c @@ -85,7 +85,6 @@ /* Define the friendly delay before and after opening net devices */ #define CONF_POST_OPEN 10 /* After opening: 10 msecs */ -#define CONF_CARRIER_TIMEOUT 120000 /* Wait for carrier timeout */ /* Define the timeout for waiting for a DHCP/BOOTP/RARP reply */ #define CONF_OPEN_RETRIES 2 /* (Re)open devices twice */ @@ -101,6 +100,9 @@ #define NONE cpu_to_be32(INADDR_NONE) #define ANY cpu_to_be32(INADDR_ANY) +/* Wait for carrier timeout default in seconds */ +static unsigned int carrier_timeout = 120; + /* * Public IP configuration */ @@ -268,9 +270,9 @@ static int __init ic_open_devs(void) /* wait for a carrier on at least one device */ start = jiffies; - next_msg = start + msecs_to_jiffies(CONF_CARRIER_TIMEOUT/12); + next_msg = start + msecs_to_jiffies(20000); while (time_before(jiffies, start + - msecs_to_jiffies(CONF_CARRIER_TIMEOUT))) { + msecs_to_jiffies(carrier_timeout * 1000))) { int wait, elapsed; for_each_netdev(&init_net, dev) @@ -283,9 +285,9 @@ static int __init ic_open_devs(void) continue; elapsed = jiffies_to_msecs(jiffies - start); - wait = (CONF_CARRIER_TIMEOUT - elapsed + 500)/1000; + wait = (carrier_timeout * 1000 - elapsed + 500) / 1000; pr_info("Waiting up to %d more seconds for network.\n", wait); - next_msg = jiffies + msecs_to_jiffies(CONF_CARRIER_TIMEOUT/12); + next_msg = jiffies +
msecs_to_jiffies(20000); } have_carrier: rtnl_unlock(); @@ -1780,3 +1782,18 @@ static int __init vendor_class_identifier_setup(char *addrs) return 1; } __setup("dhcpclass=", vendor_class_identifier_setup); + +static int __init set_carrier_timeout(char *str) +{ + ssize_t ret; + + if (!str) + return 0; + + ret = kstrtouint(str, 0, &carrier_timeout); + if (ret) + return 0; + + return 1; +} +__setup("carrier_timeout=", set_carrier_timeout); diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c index 57c5dd283a2c..fe10b9a2efc8 100644 --- a/net/ipv4/ipip.c +++ b/net/ipv4/ipip.c @@ -302,7 +302,7 @@ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, skb_set_inner_ipproto(skb, ipproto); if (tunnel->collect_md) - ip_md_tunnel_xmit(skb, dev, ipproto); + ip_md_tunnel_xmit(skb, dev, ipproto, 0); else ip_tunnel_xmit(skb, dev, tiph, ipproto); return NETDEV_TX_OK; diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index ddbf8c9a1abb..2c931120c494 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -67,7 +67,6 @@ #include <net/fib_rules.h> #include <linux/netconf.h> #include <net/nexthop.h> -#include <net/switchdev.h> #include <linux/nospec.h> @@ -111,7 +110,7 @@ static int ipmr_cache_report(struct mr_table *mrt, static void mroute_netlink_event(struct mr_table *mrt, struct mfc_cache *mfc, int cmd); static void igmpmsg_netlink_event(struct mr_table *mrt, struct sk_buff *pkt); -static void mroute_clean_tables(struct mr_table *mrt, bool all); +static void mroute_clean_tables(struct mr_table *mrt, int flags); static void ipmr_expire_process(struct timer_list *t); #ifdef CONFIG_IP_MROUTE_MULTIPLE_TABLES @@ -416,7 +415,8 @@ static struct mr_table *ipmr_new_table(struct net *net, u32 id) static void ipmr_free_table(struct mr_table *mrt) { del_timer_sync(&mrt->ipmr_expire_timer); - mroute_clean_tables(mrt, true); + mroute_clean_tables(mrt, MRT_FLUSH_VIFS | MRT_FLUSH_VIFS_STATIC | + MRT_FLUSH_MFC | MRT_FLUSH_MFC_STATIC); rhltable_destroy(&mrt->mfc_hash); kfree(mrt); } @@ -837,10 +837,8 @@ static void ipmr_update_thresholds(struct mr_table *mrt, struct mr_mfc *cache, static int vif_add(struct net *net, struct mr_table *mrt, struct vifctl *vifc, int mrtsock) { + struct netdev_phys_item_id ppid = { }; int vifi = vifc->vifc_vifi; - struct switchdev_attr attr = { - .id = SWITCHDEV_ATTR_ID_PORT_PARENT_ID, - }; struct vif_device *v = &mrt->vif_table[vifi]; struct net_device *dev; struct in_device *in_dev; @@ -919,10 +917,10 @@ static int vif_add(struct net *net, struct mr_table *mrt, vifc->vifc_flags | (!mrtsock ? 
VIFF_STATIC : 0), (VIFF_TUNNEL | VIFF_REGISTER)); - attr.orig_dev = dev; - if (!switchdev_port_attr_get(dev, &attr)) { - memcpy(v->dev_parent_id.id, attr.u.ppid.id, attr.u.ppid.id_len); - v->dev_parent_id.id_len = attr.u.ppid.id_len; + err = dev_get_port_parent_id(dev, &ppid, true); + if (err == 0) { + memcpy(v->dev_parent_id.id, ppid.id, ppid.id_len); + v->dev_parent_id.id_len = ppid.id_len; } else { v->dev_parent_id.id_len = 0; } @@ -1299,7 +1297,7 @@ static int ipmr_mfc_add(struct net *net, struct mr_table *mrt, } /* Close the multicast socket, and clear the vif tables etc */ -static void mroute_clean_tables(struct mr_table *mrt, bool all) +static void mroute_clean_tables(struct mr_table *mrt, int flags) { struct net *net = read_pnet(&mrt->net); struct mr_mfc *c, *tmp; @@ -1308,35 +1306,44 @@ static void mroute_clean_tables(struct mr_table *mrt, bool all) int i; /* Shut down all active vif entries */ - for (i = 0; i < mrt->maxvif; i++) { - if (!all && (mrt->vif_table[i].flags & VIFF_STATIC)) - continue; - vif_delete(mrt, i, 0, &list); + if (flags & (MRT_FLUSH_VIFS | MRT_FLUSH_VIFS_STATIC)) { + for (i = 0; i < mrt->maxvif; i++) { + if (((mrt->vif_table[i].flags & VIFF_STATIC) && + !(flags & MRT_FLUSH_VIFS_STATIC)) || + (!(mrt->vif_table[i].flags & VIFF_STATIC) && !(flags & MRT_FLUSH_VIFS))) + continue; + vif_delete(mrt, i, 0, &list); + } + unregister_netdevice_many(&list); } - unregister_netdevice_many(&list); /* Wipe the cache */ - list_for_each_entry_safe(c, tmp, &mrt->mfc_cache_list, list) { - if (!all && (c->mfc_flags & MFC_STATIC)) - continue; - rhltable_remove(&mrt->mfc_hash, &c->mnode, ipmr_rht_params); - list_del_rcu(&c->list); - cache = (struct mfc_cache *)c; - call_ipmr_mfc_entry_notifiers(net, FIB_EVENT_ENTRY_DEL, cache, - mrt->id); - mroute_netlink_event(mrt, cache, RTM_DELROUTE); - mr_cache_put(c); - } - - if (atomic_read(&mrt->cache_resolve_queue_len) != 0) { - spin_lock_bh(&mfc_unres_lock); - list_for_each_entry_safe(c, tmp, &mrt->mfc_unres_queue, list) { - list_del(&c->list); + if (flags & (MRT_FLUSH_MFC | MRT_FLUSH_MFC_STATIC)) { + list_for_each_entry_safe(c, tmp, &mrt->mfc_cache_list, list) { + if (((c->mfc_flags & MFC_STATIC) && !(flags & MRT_FLUSH_MFC_STATIC)) || + (!(c->mfc_flags & MFC_STATIC) && !(flags & MRT_FLUSH_MFC))) + continue; + rhltable_remove(&mrt->mfc_hash, &c->mnode, ipmr_rht_params); + list_del_rcu(&c->list); cache = (struct mfc_cache *)c; + call_ipmr_mfc_entry_notifiers(net, FIB_EVENT_ENTRY_DEL, cache, + mrt->id); mroute_netlink_event(mrt, cache, RTM_DELROUTE); - ipmr_destroy_unres(mrt, cache); + mr_cache_put(c); + } + } + + if (flags & MRT_FLUSH_MFC) { + if (atomic_read(&mrt->cache_resolve_queue_len) != 0) { + spin_lock_bh(&mfc_unres_lock); + list_for_each_entry_safe(c, tmp, &mrt->mfc_unres_queue, list) { + list_del(&c->list); + cache = (struct mfc_cache *)c; + mroute_netlink_event(mrt, cache, RTM_DELROUTE); + ipmr_destroy_unres(mrt, cache); + } + spin_unlock_bh(&mfc_unres_lock); } - spin_unlock_bh(&mfc_unres_lock); } } @@ -1357,7 +1364,7 @@ static void mrtsock_destruct(struct sock *sk) NETCONFA_IFINDEX_ALL, net->ipv4.devconf_all); RCU_INIT_POINTER(mrt->mroute_sk, NULL); - mroute_clean_tables(mrt, false); + mroute_clean_tables(mrt, MRT_FLUSH_VIFS | MRT_FLUSH_MFC); } } rtnl_unlock(); @@ -1482,6 +1489,17 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, sk == rtnl_dereference(mrt->mroute_sk), parent); break; + case MRT_FLUSH: + if (optlen != sizeof(val)) { + ret = -EINVAL; + break; + } + if (get_user(val, (int __user 
*)optval)) { + ret = -EFAULT; + break; + } + mroute_clean_tables(mrt, val); + break; /* Control PIM assert. */ case MRT_ASSERT: if (optlen != sizeof(val)) { @@ -2467,6 +2485,61 @@ errout: rtnl_set_sk_err(net, RTNLGRP_IPV4_MROUTE_R, -ENOBUFS); } +static int ipmr_rtm_valid_getroute_req(struct sk_buff *skb, + const struct nlmsghdr *nlh, + struct nlattr **tb, + struct netlink_ext_ack *extack) +{ + struct rtmsg *rtm; + int i, err; + + if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*rtm))) { + NL_SET_ERR_MSG(extack, "ipv4: Invalid header for multicast route get request"); + return -EINVAL; + } + + if (!netlink_strict_get_check(skb)) + return nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, + rtm_ipv4_policy, extack); + + rtm = nlmsg_data(nlh); + if ((rtm->rtm_src_len && rtm->rtm_src_len != 32) || + (rtm->rtm_dst_len && rtm->rtm_dst_len != 32) || + rtm->rtm_tos || rtm->rtm_table || rtm->rtm_protocol || + rtm->rtm_scope || rtm->rtm_type || rtm->rtm_flags) { + NL_SET_ERR_MSG(extack, "ipv4: Invalid values in header for multicast route get request"); + return -EINVAL; + } + + err = nlmsg_parse_strict(nlh, sizeof(*rtm), tb, RTA_MAX, + rtm_ipv4_policy, extack); + if (err) + return err; + + if ((tb[RTA_SRC] && !rtm->rtm_src_len) || + (tb[RTA_DST] && !rtm->rtm_dst_len)) { + NL_SET_ERR_MSG(extack, "ipv4: rtm_src_len and rtm_dst_len must be 32 for IPv4"); + return -EINVAL; + } + + for (i = 0; i <= RTA_MAX; i++) { + if (!tb[i]) + continue; + + switch (i) { + case RTA_SRC: + case RTA_DST: + case RTA_TABLE: + break; + default: + NL_SET_ERR_MSG(extack, "ipv4: Unsupported attribute in multicast route get request"); + return -EINVAL; + } + } + + return 0; +} + static int ipmr_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { @@ -2475,18 +2548,14 @@ static int ipmr_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, struct sk_buff *skb = NULL; struct mfc_cache *cache; struct mr_table *mrt; - struct rtmsg *rtm; __be32 src, grp; u32 tableid; int err; - err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, - rtm_ipv4_policy, extack); + err = ipmr_rtm_valid_getroute_req(in_skb, nlh, tb, extack); if (err < 0) goto errout; - rtm = nlmsg_data(nlh); - src = tb[RTA_SRC] ? nla_get_in_addr(tb[RTA_SRC]) : 0; grp = tb[RTA_DST] ? nla_get_in_addr(tb[RTA_DST]) : 0; tableid = tb[RTA_TABLE] ? 
nla_get_u32(tb[RTA_TABLE]) : 0; diff --git a/net/ipv4/netfilter.c b/net/ipv4/netfilter.c index 8d2e5dc9a827..a058213b77a7 100644 --- a/net/ipv4/netfilter.c +++ b/net/ipv4/netfilter.c @@ -80,24 +80,6 @@ int ip_route_me_harder(struct net *net, struct sk_buff *skb, unsigned int addr_t } EXPORT_SYMBOL(ip_route_me_harder); -int nf_ip_reroute(struct sk_buff *skb, const struct nf_queue_entry *entry) -{ - const struct ip_rt_info *rt_info = nf_queue_entry_reroute(entry); - - if (entry->state.hook == NF_INET_LOCAL_OUT) { - const struct iphdr *iph = ip_hdr(skb); - - if (!(iph->tos == rt_info->tos && - skb->mark == rt_info->mark && - iph->daddr == rt_info->daddr && - iph->saddr == rt_info->saddr)) - return ip_route_me_harder(entry->state.net, skb, - RTN_UNSPEC); - } - return 0; -} -EXPORT_SYMBOL_GPL(nf_ip_reroute); - int nf_ip_route(struct net *net, struct dst_entry **dst, struct flowi *fl, bool strict __always_unused) { diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig index 80f72cc5ca8d..c98391d49200 100644 --- a/net/ipv4/netfilter/Kconfig +++ b/net/ipv4/netfilter/Kconfig @@ -94,50 +94,7 @@ config NF_REJECT_IPV4 tristate "IPv4 packet rejection" default m if NETFILTER_ADVANCED=n -config NF_NAT_IPV4 - tristate "IPv4 NAT" - depends on NF_CONNTRACK - default m if NETFILTER_ADVANCED=n - select NF_NAT - help - The IPv4 NAT option allows masquerading, port forwarding and other - forms of full Network Address Port Translation. This can be - controlled by iptables or nft. - -if NF_NAT_IPV4 - -config NF_NAT_MASQUERADE_IPV4 - bool - -if NF_TABLES -config NFT_CHAIN_NAT_IPV4 - depends on NF_TABLES_IPV4 - tristate "IPv4 nf_tables nat chain support" - help - This option enables the "nat" chain for IPv4 in nf_tables. This - chain type is used to perform Network Address Translation (NAT) - packet transformations such as the source, destination address and - source and destination ports. - -config NFT_MASQ_IPV4 - tristate "IPv4 masquerading support for nf_tables" - depends on NF_TABLES_IPV4 - depends on NFT_MASQ - select NF_NAT_MASQUERADE_IPV4 - help - This is the expression that provides IPv4 masquerading support for - nf_tables. - -config NFT_REDIR_IPV4 - tristate "IPv4 redirect support for nf_tables" - depends on NF_TABLES_IPV4 - depends on NFT_REDIR - select NF_NAT_REDIRECT - help - This is the expression that provides IPv4 redirect support for - nf_tables. -endif # NF_TABLES - +if NF_NAT config NF_NAT_SNMP_BASIC tristate "Basic SNMP-ALG support" depends on NF_CONNTRACK_SNMP @@ -166,7 +123,7 @@ config NF_NAT_H323 depends on NF_CONNTRACK default NF_CONNTRACK_H323 -endif # NF_NAT_IPV4 +endif # NF_NAT config IP_NF_IPTABLES tristate "IP tables support (required for filtering/masq/NAT)" @@ -263,7 +220,6 @@ config IP_NF_NAT depends on NF_CONNTRACK default m if NETFILTER_ADVANCED=n select NF_NAT - select NF_NAT_IPV4 select NETFILTER_XT_NAT help This enables the `nat' table in iptables. This allows masquerading, @@ -276,7 +232,7 @@ if IP_NF_NAT config IP_NF_TARGET_MASQUERADE tristate "MASQUERADE target support" - select NF_NAT_MASQUERADE_IPV4 + select NF_NAT_MASQUERADE default m if NETFILTER_ADVANCED=n help Masquerading is a special case of NAT: all outgoing connections are diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile index fd7122e0e2c9..e241f5188ebe 100644 --- a/net/ipv4/netfilter/Makefile +++ b/net/ipv4/netfilter/Makefile @@ -3,10 +3,6 @@ # Makefile for the netfilter modules on top of IPv4. 
# -nf_nat_ipv4-y := nf_nat_l3proto_ipv4.o -nf_nat_ipv4-$(CONFIG_NF_NAT_MASQUERADE_IPV4) += nf_nat_masquerade_ipv4.o -obj-$(CONFIG_NF_NAT_IPV4) += nf_nat_ipv4.o - # defrag obj-$(CONFIG_NF_DEFRAG_IPV4) += nf_defrag_ipv4.o @@ -29,11 +25,8 @@ $(obj)/nf_nat_snmp_basic_main.o: $(obj)/nf_nat_snmp_basic.asn1.h obj-$(CONFIG_NF_NAT_SNMP_BASIC) += nf_nat_snmp_basic.o obj-$(CONFIG_NFT_CHAIN_ROUTE_IPV4) += nft_chain_route_ipv4.o -obj-$(CONFIG_NFT_CHAIN_NAT_IPV4) += nft_chain_nat_ipv4.o obj-$(CONFIG_NFT_REJECT_IPV4) += nft_reject_ipv4.o obj-$(CONFIG_NFT_FIB_IPV4) += nft_fib_ipv4.o -obj-$(CONFIG_NFT_MASQ_IPV4) += nft_masq_ipv4.o -obj-$(CONFIG_NFT_REDIR_IPV4) += nft_redir_ipv4.o obj-$(CONFIG_NFT_DUP_IPV4) += nft_dup_ipv4.o # flow table support diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c index b61977db9b7f..835d50b279f5 100644 --- a/net/ipv4/netfilter/ipt_CLUSTERIP.c +++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c @@ -846,9 +846,9 @@ static int clusterip_net_init(struct net *net) static void clusterip_net_exit(struct net *net) { +#ifdef CONFIG_PROC_FS struct clusterip_net *cn = clusterip_pernet(net); -#ifdef CONFIG_PROC_FS mutex_lock(&cn->mutex); proc_remove(cn->procdir); cn->procdir = NULL; @@ -864,7 +864,7 @@ static struct pernet_operations clusterip_net_ops = { .size = sizeof(struct clusterip_net), }; -struct notifier_block cip_netdev_notifier = { +static struct notifier_block cip_netdev_notifier = { .notifier_call = clusterip_netdev_event }; diff --git a/net/ipv4/netfilter/iptable_nat.c b/net/ipv4/netfilter/iptable_nat.c index a317445448bf..007da0882412 100644 --- a/net/ipv4/netfilter/iptable_nat.c +++ b/net/ipv4/netfilter/iptable_nat.c @@ -15,8 +15,6 @@ #include <net/ip.h> #include <net/netfilter/nf_nat.h> -#include <net/netfilter/nf_nat_core.h> -#include <net/netfilter/nf_nat_l3proto.h> static int __net_init iptable_nat_table_init(struct net *net); @@ -70,10 +68,10 @@ static int ipt_nat_register_lookups(struct net *net) int i, ret; for (i = 0; i < ARRAY_SIZE(nf_nat_ipv4_ops); i++) { - ret = nf_nat_l3proto_ipv4_register_fn(net, &nf_nat_ipv4_ops[i]); + ret = nf_nat_ipv4_register_fn(net, &nf_nat_ipv4_ops[i]); if (ret) { while (i) - nf_nat_l3proto_ipv4_unregister_fn(net, &nf_nat_ipv4_ops[--i]); + nf_nat_ipv4_unregister_fn(net, &nf_nat_ipv4_ops[--i]); return ret; } @@ -87,7 +85,7 @@ static void ipt_nat_unregister_lookups(struct net *net) int i; for (i = 0; i < ARRAY_SIZE(nf_nat_ipv4_ops); i++) - nf_nat_l3proto_ipv4_unregister_fn(net, &nf_nat_ipv4_ops[i]); + nf_nat_ipv4_unregister_fn(net, &nf_nat_ipv4_ops[i]); } static int __net_init iptable_nat_table_init(struct net *net) diff --git a/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c b/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c deleted file mode 100644 index 2687db015b6f..000000000000 --- a/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c +++ /dev/null @@ -1,387 +0,0 @@ -/* - * (C) 1999-2001 Paul `Rusty' Russell - * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org> - * (C) 2011 Patrick McHardy <kaber@trash.net> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. 
- */ - -#include <linux/types.h> -#include <linux/module.h> -#include <linux/skbuff.h> -#include <linux/ip.h> -#include <linux/icmp.h> -#include <linux/netfilter.h> -#include <linux/netfilter_ipv4.h> -#include <net/secure_seq.h> -#include <net/checksum.h> -#include <net/route.h> -#include <net/ip.h> - -#include <net/netfilter/nf_conntrack_core.h> -#include <net/netfilter/nf_conntrack.h> -#include <net/netfilter/nf_nat_core.h> -#include <net/netfilter/nf_nat_l3proto.h> -#include <net/netfilter/nf_nat_l4proto.h> - -static const struct nf_nat_l3proto nf_nat_l3proto_ipv4; - -#ifdef CONFIG_XFRM -static void nf_nat_ipv4_decode_session(struct sk_buff *skb, - const struct nf_conn *ct, - enum ip_conntrack_dir dir, - unsigned long statusbit, - struct flowi *fl) -{ - const struct nf_conntrack_tuple *t = &ct->tuplehash[dir].tuple; - struct flowi4 *fl4 = &fl->u.ip4; - - if (ct->status & statusbit) { - fl4->daddr = t->dst.u3.ip; - if (t->dst.protonum == IPPROTO_TCP || - t->dst.protonum == IPPROTO_UDP || - t->dst.protonum == IPPROTO_UDPLITE || - t->dst.protonum == IPPROTO_DCCP || - t->dst.protonum == IPPROTO_SCTP) - fl4->fl4_dport = t->dst.u.all; - } - - statusbit ^= IPS_NAT_MASK; - - if (ct->status & statusbit) { - fl4->saddr = t->src.u3.ip; - if (t->dst.protonum == IPPROTO_TCP || - t->dst.protonum == IPPROTO_UDP || - t->dst.protonum == IPPROTO_UDPLITE || - t->dst.protonum == IPPROTO_DCCP || - t->dst.protonum == IPPROTO_SCTP) - fl4->fl4_sport = t->src.u.all; - } -} -#endif /* CONFIG_XFRM */ - -static bool nf_nat_ipv4_manip_pkt(struct sk_buff *skb, - unsigned int iphdroff, - const struct nf_conntrack_tuple *target, - enum nf_nat_manip_type maniptype) -{ - struct iphdr *iph; - unsigned int hdroff; - - if (!skb_make_writable(skb, iphdroff + sizeof(*iph))) - return false; - - iph = (void *)skb->data + iphdroff; - hdroff = iphdroff + iph->ihl * 4; - - if (!nf_nat_l4proto_manip_pkt(skb, &nf_nat_l3proto_ipv4, iphdroff, - hdroff, target, maniptype)) - return false; - iph = (void *)skb->data + iphdroff; - - if (maniptype == NF_NAT_MANIP_SRC) { - csum_replace4(&iph->check, iph->saddr, target->src.u3.ip); - iph->saddr = target->src.u3.ip; - } else { - csum_replace4(&iph->check, iph->daddr, target->dst.u3.ip); - iph->daddr = target->dst.u3.ip; - } - return true; -} - -static void nf_nat_ipv4_csum_update(struct sk_buff *skb, - unsigned int iphdroff, __sum16 *check, - const struct nf_conntrack_tuple *t, - enum nf_nat_manip_type maniptype) -{ - struct iphdr *iph = (struct iphdr *)(skb->data + iphdroff); - __be32 oldip, newip; - - if (maniptype == NF_NAT_MANIP_SRC) { - oldip = iph->saddr; - newip = t->src.u3.ip; - } else { - oldip = iph->daddr; - newip = t->dst.u3.ip; - } - inet_proto_csum_replace4(check, skb, oldip, newip, true); -} - -static void nf_nat_ipv4_csum_recalc(struct sk_buff *skb, - u8 proto, void *data, __sum16 *check, - int datalen, int oldlen) -{ - if (skb->ip_summed != CHECKSUM_PARTIAL) { - const struct iphdr *iph = ip_hdr(skb); - - skb->ip_summed = CHECKSUM_PARTIAL; - skb->csum_start = skb_headroom(skb) + skb_network_offset(skb) + - ip_hdrlen(skb); - skb->csum_offset = (void *)check - data; - *check = ~csum_tcpudp_magic(iph->saddr, iph->daddr, datalen, - proto, 0); - } else - inet_proto_csum_replace2(check, skb, - htons(oldlen), htons(datalen), true); -} - -#if IS_ENABLED(CONFIG_NF_CT_NETLINK) -static int nf_nat_ipv4_nlattr_to_range(struct nlattr *tb[], - struct nf_nat_range2 *range) -{ - if (tb[CTA_NAT_V4_MINIP]) { - range->min_addr.ip = nla_get_be32(tb[CTA_NAT_V4_MINIP]); - range->flags |= 
NF_NAT_RANGE_MAP_IPS; - } - - if (tb[CTA_NAT_V4_MAXIP]) - range->max_addr.ip = nla_get_be32(tb[CTA_NAT_V4_MAXIP]); - else - range->max_addr.ip = range->min_addr.ip; - - return 0; -} -#endif - -static const struct nf_nat_l3proto nf_nat_l3proto_ipv4 = { - .l3proto = NFPROTO_IPV4, - .manip_pkt = nf_nat_ipv4_manip_pkt, - .csum_update = nf_nat_ipv4_csum_update, - .csum_recalc = nf_nat_ipv4_csum_recalc, -#if IS_ENABLED(CONFIG_NF_CT_NETLINK) - .nlattr_to_range = nf_nat_ipv4_nlattr_to_range, -#endif -#ifdef CONFIG_XFRM - .decode_session = nf_nat_ipv4_decode_session, -#endif -}; - -int nf_nat_icmp_reply_translation(struct sk_buff *skb, - struct nf_conn *ct, - enum ip_conntrack_info ctinfo, - unsigned int hooknum) -{ - struct { - struct icmphdr icmp; - struct iphdr ip; - } *inside; - enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); - enum nf_nat_manip_type manip = HOOK2MANIP(hooknum); - unsigned int hdrlen = ip_hdrlen(skb); - struct nf_conntrack_tuple target; - unsigned long statusbit; - - WARN_ON(ctinfo != IP_CT_RELATED && ctinfo != IP_CT_RELATED_REPLY); - - if (!skb_make_writable(skb, hdrlen + sizeof(*inside))) - return 0; - if (nf_ip_checksum(skb, hooknum, hdrlen, 0)) - return 0; - - inside = (void *)skb->data + hdrlen; - if (inside->icmp.type == ICMP_REDIRECT) { - if ((ct->status & IPS_NAT_DONE_MASK) != IPS_NAT_DONE_MASK) - return 0; - if (ct->status & IPS_NAT_MASK) - return 0; - } - - if (manip == NF_NAT_MANIP_SRC) - statusbit = IPS_SRC_NAT; - else - statusbit = IPS_DST_NAT; - - /* Invert if this is reply direction */ - if (dir == IP_CT_DIR_REPLY) - statusbit ^= IPS_NAT_MASK; - - if (!(ct->status & statusbit)) - return 1; - - if (!nf_nat_ipv4_manip_pkt(skb, hdrlen + sizeof(inside->icmp), - &ct->tuplehash[!dir].tuple, !manip)) - return 0; - - if (skb->ip_summed != CHECKSUM_PARTIAL) { - /* Reloading "inside" here since manip_pkt may reallocate */ - inside = (void *)skb->data + hdrlen; - inside->icmp.checksum = 0; - inside->icmp.checksum = - csum_fold(skb_checksum(skb, hdrlen, - skb->len - hdrlen, 0)); - } - - /* Change outer to look like the reply to an incoming packet */ - nf_ct_invert_tuplepr(&target, &ct->tuplehash[!dir].tuple); - if (!nf_nat_ipv4_manip_pkt(skb, 0, &target, manip)) - return 0; - - return 1; -} -EXPORT_SYMBOL_GPL(nf_nat_icmp_reply_translation); - -static unsigned int -nf_nat_ipv4_fn(void *priv, struct sk_buff *skb, - const struct nf_hook_state *state) -{ - struct nf_conn *ct; - enum ip_conntrack_info ctinfo; - - ct = nf_ct_get(skb, &ctinfo); - if (!ct) - return NF_ACCEPT; - - if (ctinfo == IP_CT_RELATED || ctinfo == IP_CT_RELATED_REPLY) { - if (ip_hdr(skb)->protocol == IPPROTO_ICMP) { - if (!nf_nat_icmp_reply_translation(skb, ct, ctinfo, - state->hook)) - return NF_DROP; - else - return NF_ACCEPT; - } - } - - return nf_nat_inet_fn(priv, skb, state); -} - -static unsigned int -nf_nat_ipv4_in(void *priv, struct sk_buff *skb, - const struct nf_hook_state *state) -{ - unsigned int ret; - __be32 daddr = ip_hdr(skb)->daddr; - - ret = nf_nat_ipv4_fn(priv, skb, state); - if (ret != NF_DROP && ret != NF_STOLEN && - daddr != ip_hdr(skb)->daddr) - skb_dst_drop(skb); - - return ret; -} - -static unsigned int -nf_nat_ipv4_out(void *priv, struct sk_buff *skb, - const struct nf_hook_state *state) -{ -#ifdef CONFIG_XFRM - const struct nf_conn *ct; - enum ip_conntrack_info ctinfo; - int err; -#endif - unsigned int ret; - - ret = nf_nat_ipv4_fn(priv, skb, state); -#ifdef CONFIG_XFRM - if (ret != NF_DROP && ret != NF_STOLEN && - !(IPCB(skb)->flags & IPSKB_XFRM_TRANSFORMED) && - (ct = 
nf_ct_get(skb, &ctinfo)) != NULL) { - enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); - - if ((ct->tuplehash[dir].tuple.src.u3.ip != - ct->tuplehash[!dir].tuple.dst.u3.ip) || - (ct->tuplehash[dir].tuple.dst.protonum != IPPROTO_ICMP && - ct->tuplehash[dir].tuple.src.u.all != - ct->tuplehash[!dir].tuple.dst.u.all)) { - err = nf_xfrm_me_harder(state->net, skb, AF_INET); - if (err < 0) - ret = NF_DROP_ERR(err); - } - } -#endif - return ret; -} - -static unsigned int -nf_nat_ipv4_local_fn(void *priv, struct sk_buff *skb, - const struct nf_hook_state *state) -{ - const struct nf_conn *ct; - enum ip_conntrack_info ctinfo; - unsigned int ret; - int err; - - ret = nf_nat_ipv4_fn(priv, skb, state); - if (ret != NF_DROP && ret != NF_STOLEN && - (ct = nf_ct_get(skb, &ctinfo)) != NULL) { - enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); - - if (ct->tuplehash[dir].tuple.dst.u3.ip != - ct->tuplehash[!dir].tuple.src.u3.ip) { - err = ip_route_me_harder(state->net, skb, RTN_UNSPEC); - if (err < 0) - ret = NF_DROP_ERR(err); - } -#ifdef CONFIG_XFRM - else if (!(IPCB(skb)->flags & IPSKB_XFRM_TRANSFORMED) && - ct->tuplehash[dir].tuple.dst.protonum != IPPROTO_ICMP && - ct->tuplehash[dir].tuple.dst.u.all != - ct->tuplehash[!dir].tuple.src.u.all) { - err = nf_xfrm_me_harder(state->net, skb, AF_INET); - if (err < 0) - ret = NF_DROP_ERR(err); - } -#endif - } - return ret; -} - -static const struct nf_hook_ops nf_nat_ipv4_ops[] = { - /* Before packet filtering, change destination */ - { - .hook = nf_nat_ipv4_in, - .pf = NFPROTO_IPV4, - .hooknum = NF_INET_PRE_ROUTING, - .priority = NF_IP_PRI_NAT_DST, - }, - /* After packet filtering, change source */ - { - .hook = nf_nat_ipv4_out, - .pf = NFPROTO_IPV4, - .hooknum = NF_INET_POST_ROUTING, - .priority = NF_IP_PRI_NAT_SRC, - }, - /* Before packet filtering, change destination */ - { - .hook = nf_nat_ipv4_local_fn, - .pf = NFPROTO_IPV4, - .hooknum = NF_INET_LOCAL_OUT, - .priority = NF_IP_PRI_NAT_DST, - }, - /* After packet filtering, change source */ - { - .hook = nf_nat_ipv4_fn, - .pf = NFPROTO_IPV4, - .hooknum = NF_INET_LOCAL_IN, - .priority = NF_IP_PRI_NAT_SRC, - }, -}; - -int nf_nat_l3proto_ipv4_register_fn(struct net *net, const struct nf_hook_ops *ops) -{ - return nf_nat_register_fn(net, ops, nf_nat_ipv4_ops, ARRAY_SIZE(nf_nat_ipv4_ops)); -} -EXPORT_SYMBOL_GPL(nf_nat_l3proto_ipv4_register_fn); - -void nf_nat_l3proto_ipv4_unregister_fn(struct net *net, const struct nf_hook_ops *ops) -{ - nf_nat_unregister_fn(net, ops, ARRAY_SIZE(nf_nat_ipv4_ops)); -} -EXPORT_SYMBOL_GPL(nf_nat_l3proto_ipv4_unregister_fn); - -static int __init nf_nat_l3proto_ipv4_init(void) -{ - return nf_nat_l3proto_register(&nf_nat_l3proto_ipv4); -} - -static void __exit nf_nat_l3proto_ipv4_exit(void) -{ - nf_nat_l3proto_unregister(&nf_nat_l3proto_ipv4); -} - -MODULE_LICENSE("GPL"); -MODULE_ALIAS("nf-nat-" __stringify(AF_INET)); - -module_init(nf_nat_l3proto_ipv4_init); -module_exit(nf_nat_l3proto_ipv4_exit); diff --git a/net/ipv4/netfilter/nf_nat_masquerade_ipv4.c b/net/ipv4/netfilter/nf_nat_masquerade_ipv4.c deleted file mode 100644 index 41327bb99093..000000000000 --- a/net/ipv4/netfilter/nf_nat_masquerade_ipv4.c +++ /dev/null @@ -1,196 +0,0 @@ -/* (C) 1999-2001 Paul `Rusty' Russell - * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. 
- */ - -#include <linux/types.h> -#include <linux/atomic.h> -#include <linux/inetdevice.h> -#include <linux/ip.h> -#include <linux/timer.h> -#include <linux/netfilter.h> -#include <net/protocol.h> -#include <net/ip.h> -#include <net/checksum.h> -#include <net/route.h> -#include <linux/netfilter_ipv4.h> -#include <linux/netfilter/x_tables.h> -#include <net/netfilter/nf_nat.h> -#include <net/netfilter/ipv4/nf_nat_masquerade.h> - -unsigned int -nf_nat_masquerade_ipv4(struct sk_buff *skb, unsigned int hooknum, - const struct nf_nat_range2 *range, - const struct net_device *out) -{ - struct nf_conn *ct; - struct nf_conn_nat *nat; - enum ip_conntrack_info ctinfo; - struct nf_nat_range2 newrange; - const struct rtable *rt; - __be32 newsrc, nh; - - WARN_ON(hooknum != NF_INET_POST_ROUTING); - - ct = nf_ct_get(skb, &ctinfo); - - WARN_ON(!(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED || - ctinfo == IP_CT_RELATED_REPLY))); - - /* Source address is 0.0.0.0 - locally generated packet that is - * probably not supposed to be masqueraded. - */ - if (ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u3.ip == 0) - return NF_ACCEPT; - - rt = skb_rtable(skb); - nh = rt_nexthop(rt, ip_hdr(skb)->daddr); - newsrc = inet_select_addr(out, nh, RT_SCOPE_UNIVERSE); - if (!newsrc) { - pr_info("%s ate my IP address\n", out->name); - return NF_DROP; - } - - nat = nf_ct_nat_ext_add(ct); - if (nat) - nat->masq_index = out->ifindex; - - /* Transfer from original range. */ - memset(&newrange.min_addr, 0, sizeof(newrange.min_addr)); - memset(&newrange.max_addr, 0, sizeof(newrange.max_addr)); - newrange.flags = range->flags | NF_NAT_RANGE_MAP_IPS; - newrange.min_addr.ip = newsrc; - newrange.max_addr.ip = newsrc; - newrange.min_proto = range->min_proto; - newrange.max_proto = range->max_proto; - - /* Hand modified range to generic setup. */ - return nf_nat_setup_info(ct, &newrange, NF_NAT_MANIP_SRC); -} -EXPORT_SYMBOL_GPL(nf_nat_masquerade_ipv4); - -static int device_cmp(struct nf_conn *i, void *ifindex) -{ - const struct nf_conn_nat *nat = nfct_nat(i); - - if (!nat) - return 0; - if (nf_ct_l3num(i) != NFPROTO_IPV4) - return 0; - return nat->masq_index == (int)(long)ifindex; -} - -static int masq_device_event(struct notifier_block *this, - unsigned long event, - void *ptr) -{ - const struct net_device *dev = netdev_notifier_info_to_dev(ptr); - struct net *net = dev_net(dev); - - if (event == NETDEV_DOWN) { - /* Device was downed. Search entire table for - * conntracks which were associated with that device, - * and forget them. - */ - WARN_ON(dev->ifindex == 0); - - nf_ct_iterate_cleanup_net(net, device_cmp, - (void *)(long)dev->ifindex, 0, 0); - } - - return NOTIFY_DONE; -} - -static int inet_cmp(struct nf_conn *ct, void *ptr) -{ - struct in_ifaddr *ifa = (struct in_ifaddr *)ptr; - struct net_device *dev = ifa->ifa_dev->dev; - struct nf_conntrack_tuple *tuple; - - if (!device_cmp(ct, (void *)(long)dev->ifindex)) - return 0; - - tuple = &ct->tuplehash[IP_CT_DIR_REPLY].tuple; - - return ifa->ifa_address == tuple->dst.u3.ip; -} - -static int masq_inet_event(struct notifier_block *this, - unsigned long event, - void *ptr) -{ - struct in_device *idev = ((struct in_ifaddr *)ptr)->ifa_dev; - struct net *net = dev_net(idev->dev); - - /* The masq_dev_notifier will catch the case of the device going - * down. So if the inetdev is dead and being destroyed we have - * no work to do. Otherwise this is an individual address removal - * and we have to perform the flush. 
- */ - if (idev->dead) - return NOTIFY_DONE; - - if (event == NETDEV_DOWN) - nf_ct_iterate_cleanup_net(net, inet_cmp, ptr, 0, 0); - - return NOTIFY_DONE; -} - -static struct notifier_block masq_dev_notifier = { - .notifier_call = masq_device_event, -}; - -static struct notifier_block masq_inet_notifier = { - .notifier_call = masq_inet_event, -}; - -static int masq_refcnt; -static DEFINE_MUTEX(masq_mutex); - -int nf_nat_masquerade_ipv4_register_notifier(void) -{ - int ret = 0; - - mutex_lock(&masq_mutex); - /* check if the notifier was already set */ - if (++masq_refcnt > 1) - goto out_unlock; - - /* Register for device down reports */ - ret = register_netdevice_notifier(&masq_dev_notifier); - if (ret) - goto err_dec; - /* Register IP address change reports */ - ret = register_inetaddr_notifier(&masq_inet_notifier); - if (ret) - goto err_unregister; - - mutex_unlock(&masq_mutex); - return ret; - -err_unregister: - unregister_netdevice_notifier(&masq_dev_notifier); -err_dec: - masq_refcnt--; -out_unlock: - mutex_unlock(&masq_mutex); - return ret; -} -EXPORT_SYMBOL_GPL(nf_nat_masquerade_ipv4_register_notifier); - -void nf_nat_masquerade_ipv4_unregister_notifier(void) -{ - mutex_lock(&masq_mutex); - /* check if the notifier still has clients */ - if (--masq_refcnt > 0) - goto out_unlock; - - unregister_netdevice_notifier(&masq_dev_notifier); - unregister_inetaddr_notifier(&masq_inet_notifier); -out_unlock: - mutex_unlock(&masq_mutex); -} -EXPORT_SYMBOL_GPL(nf_nat_masquerade_ipv4_unregister_notifier); diff --git a/net/ipv4/netfilter/nf_nat_snmp_basic_main.c b/net/ipv4/netfilter/nf_nat_snmp_basic_main.c index a0aa13bcabda..0a8a60c1bf9a 100644 --- a/net/ipv4/netfilter/nf_nat_snmp_basic_main.c +++ b/net/ipv4/netfilter/nf_nat_snmp_basic_main.c @@ -105,6 +105,8 @@ static void fast_csum(struct snmp_ctx *ctx, unsigned char offset) int snmp_version(void *context, size_t hdrlen, unsigned char tag, const void *data, size_t datalen) { + if (datalen != 1) + return -EINVAL; if (*(unsigned char *)data > 1) return -ENOTSUPP; return 1; @@ -114,8 +116,11 @@ int snmp_helper(void *context, size_t hdrlen, unsigned char tag, const void *data, size_t datalen) { struct snmp_ctx *ctx = (struct snmp_ctx *)context; - __be32 *pdata = (__be32 *)data; + __be32 *pdata; + if (datalen != 4) + return -EINVAL; + pdata = (__be32 *)data; if (*pdata == ctx->from) { pr_debug("%s: %pI4 to %pI4\n", __func__, (void *)&ctx->from, (void *)&ctx->to); diff --git a/net/ipv4/netfilter/nf_reject_ipv4.c b/net/ipv4/netfilter/nf_reject_ipv4.c index aa8304c618b8..7dc3c324b911 100644 --- a/net/ipv4/netfilter/nf_reject_ipv4.c +++ b/net/ipv4/netfilter/nf_reject_ipv4.c @@ -173,21 +173,16 @@ EXPORT_SYMBOL_GPL(nf_send_reset); void nf_send_unreach(struct sk_buff *skb_in, int code, int hook) { struct iphdr *iph = ip_hdr(skb_in); - u8 proto; + u8 proto = iph->protocol; if (iph->frag_off & htons(IP_OFFSET)) return; - if (skb_csum_unnecessary(skb_in)) { + if (skb_csum_unnecessary(skb_in) || !nf_reject_verify_csum(proto)) { icmp_send(skb_in, ICMP_DEST_UNREACH, code, 0); return; } - if (iph->protocol == IPPROTO_TCP || iph->protocol == IPPROTO_UDP) - proto = iph->protocol; - else - proto = 0; - if (nf_ip_checksum(skb_in, hook, ip_hdrlen(skb_in), proto) == 0) icmp_send(skb_in, ICMP_DEST_UNREACH, code, 0); } diff --git a/net/ipv4/netfilter/nft_chain_nat_ipv4.c b/net/ipv4/netfilter/nft_chain_nat_ipv4.c deleted file mode 100644 index a3c4ea303e3e..000000000000 --- a/net/ipv4/netfilter/nft_chain_nat_ipv4.c +++ /dev/null @@ -1,87 +0,0 @@ -/* - * Copyright (c) 
2008-2009 Patrick McHardy <kaber@trash.net> - * Copyright (c) 2012 Pablo Neira Ayuso <pablo@netfilter.org> - * Copyright (c) 2012 Intel Corporation - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * Development of this code funded by Astaro AG (http://www.astaro.com/) - */ - -#include <linux/module.h> -#include <linux/init.h> -#include <linux/list.h> -#include <linux/skbuff.h> -#include <linux/ip.h> -#include <linux/netfilter.h> -#include <linux/netfilter_ipv4.h> -#include <linux/netfilter/nf_tables.h> -#include <net/netfilter/nf_conntrack.h> -#include <net/netfilter/nf_nat.h> -#include <net/netfilter/nf_nat_core.h> -#include <net/netfilter/nf_tables.h> -#include <net/netfilter/nf_tables_ipv4.h> -#include <net/netfilter/nf_nat_l3proto.h> -#include <net/ip.h> - -static unsigned int nft_nat_do_chain(void *priv, - struct sk_buff *skb, - const struct nf_hook_state *state) -{ - struct nft_pktinfo pkt; - - nft_set_pktinfo(&pkt, skb, state); - nft_set_pktinfo_ipv4(&pkt, skb); - - return nft_do_chain(&pkt, priv); -} - -static int nft_nat_ipv4_reg(struct net *net, const struct nf_hook_ops *ops) -{ - return nf_nat_l3proto_ipv4_register_fn(net, ops); -} - -static void nft_nat_ipv4_unreg(struct net *net, const struct nf_hook_ops *ops) -{ - nf_nat_l3proto_ipv4_unregister_fn(net, ops); -} - -static const struct nft_chain_type nft_chain_nat_ipv4 = { - .name = "nat", - .type = NFT_CHAIN_T_NAT, - .family = NFPROTO_IPV4, - .owner = THIS_MODULE, - .hook_mask = (1 << NF_INET_PRE_ROUTING) | - (1 << NF_INET_POST_ROUTING) | - (1 << NF_INET_LOCAL_OUT) | - (1 << NF_INET_LOCAL_IN), - .hooks = { - [NF_INET_PRE_ROUTING] = nft_nat_do_chain, - [NF_INET_POST_ROUTING] = nft_nat_do_chain, - [NF_INET_LOCAL_OUT] = nft_nat_do_chain, - [NF_INET_LOCAL_IN] = nft_nat_do_chain, - }, - .ops_register = nft_nat_ipv4_reg, - .ops_unregister = nft_nat_ipv4_unreg, -}; - -static int __init nft_chain_nat_init(void) -{ - nft_register_chain_type(&nft_chain_nat_ipv4); - - return 0; -} - -static void __exit nft_chain_nat_exit(void) -{ - nft_unregister_chain_type(&nft_chain_nat_ipv4); -} - -module_init(nft_chain_nat_init); -module_exit(nft_chain_nat_exit); - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); -MODULE_ALIAS_NFT_CHAIN(AF_INET, "nat"); diff --git a/net/ipv4/netfilter/nft_masq_ipv4.c b/net/ipv4/netfilter/nft_masq_ipv4.c deleted file mode 100644 index 6847de1d1db8..000000000000 --- a/net/ipv4/netfilter/nft_masq_ipv4.c +++ /dev/null @@ -1,90 +0,0 @@ -/* - * Copyright (c) 2014 Arturo Borrero Gonzalez <arturo@debian.org> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. 
- */ - -#include <linux/kernel.h> -#include <linux/init.h> -#include <linux/module.h> -#include <linux/netlink.h> -#include <linux/netfilter.h> -#include <linux/netfilter/nf_tables.h> -#include <net/netfilter/nf_tables.h> -#include <net/netfilter/nft_masq.h> -#include <net/netfilter/ipv4/nf_nat_masquerade.h> - -static void nft_masq_ipv4_eval(const struct nft_expr *expr, - struct nft_regs *regs, - const struct nft_pktinfo *pkt) -{ - struct nft_masq *priv = nft_expr_priv(expr); - struct nf_nat_range2 range; - - memset(&range, 0, sizeof(range)); - range.flags = priv->flags; - if (priv->sreg_proto_min) { - range.min_proto.all = (__force __be16)nft_reg_load16( - ®s->data[priv->sreg_proto_min]); - range.max_proto.all = (__force __be16)nft_reg_load16( - ®s->data[priv->sreg_proto_max]); - } - regs->verdict.code = nf_nat_masquerade_ipv4(pkt->skb, nft_hook(pkt), - &range, nft_out(pkt)); -} - -static void -nft_masq_ipv4_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr) -{ - nf_ct_netns_put(ctx->net, NFPROTO_IPV4); -} - -static struct nft_expr_type nft_masq_ipv4_type; -static const struct nft_expr_ops nft_masq_ipv4_ops = { - .type = &nft_masq_ipv4_type, - .size = NFT_EXPR_SIZE(sizeof(struct nft_masq)), - .eval = nft_masq_ipv4_eval, - .init = nft_masq_init, - .destroy = nft_masq_ipv4_destroy, - .dump = nft_masq_dump, - .validate = nft_masq_validate, -}; - -static struct nft_expr_type nft_masq_ipv4_type __read_mostly = { - .family = NFPROTO_IPV4, - .name = "masq", - .ops = &nft_masq_ipv4_ops, - .policy = nft_masq_policy, - .maxattr = NFTA_MASQ_MAX, - .owner = THIS_MODULE, -}; - -static int __init nft_masq_ipv4_module_init(void) -{ - int ret; - - ret = nft_register_expr(&nft_masq_ipv4_type); - if (ret < 0) - return ret; - - ret = nf_nat_masquerade_ipv4_register_notifier(); - if (ret) - nft_unregister_expr(&nft_masq_ipv4_type); - - return ret; -} - -static void __exit nft_masq_ipv4_module_exit(void) -{ - nft_unregister_expr(&nft_masq_ipv4_type); - nf_nat_masquerade_ipv4_unregister_notifier(); -} - -module_init(nft_masq_ipv4_module_init); -module_exit(nft_masq_ipv4_module_exit); - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Arturo Borrero Gonzalez <arturo@debian.org"); -MODULE_ALIAS_NFT_AF_EXPR(AF_INET, "masq"); diff --git a/net/ipv4/netfilter/nft_redir_ipv4.c b/net/ipv4/netfilter/nft_redir_ipv4.c deleted file mode 100644 index 5120be1d3118..000000000000 --- a/net/ipv4/netfilter/nft_redir_ipv4.c +++ /dev/null @@ -1,82 +0,0 @@ -/* - * Copyright (c) 2014 Arturo Borrero Gonzalez <arturo@debian.org> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. 
- */ - -#include <linux/kernel.h> -#include <linux/init.h> -#include <linux/module.h> -#include <linux/netlink.h> -#include <linux/netfilter.h> -#include <linux/netfilter/nf_tables.h> -#include <net/netfilter/nf_tables.h> -#include <net/netfilter/nf_nat.h> -#include <net/netfilter/nf_nat_redirect.h> -#include <net/netfilter/nft_redir.h> - -static void nft_redir_ipv4_eval(const struct nft_expr *expr, - struct nft_regs *regs, - const struct nft_pktinfo *pkt) -{ - struct nft_redir *priv = nft_expr_priv(expr); - struct nf_nat_ipv4_multi_range_compat mr; - - memset(&mr, 0, sizeof(mr)); - if (priv->sreg_proto_min) { - mr.range[0].min.all = (__force __be16)nft_reg_load16( - ®s->data[priv->sreg_proto_min]); - mr.range[0].max.all = (__force __be16)nft_reg_load16( - ®s->data[priv->sreg_proto_max]); - mr.range[0].flags |= NF_NAT_RANGE_PROTO_SPECIFIED; - } - - mr.range[0].flags |= priv->flags; - - regs->verdict.code = nf_nat_redirect_ipv4(pkt->skb, &mr, nft_hook(pkt)); -} - -static void -nft_redir_ipv4_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr) -{ - nf_ct_netns_put(ctx->net, NFPROTO_IPV4); -} - -static struct nft_expr_type nft_redir_ipv4_type; -static const struct nft_expr_ops nft_redir_ipv4_ops = { - .type = &nft_redir_ipv4_type, - .size = NFT_EXPR_SIZE(sizeof(struct nft_redir)), - .eval = nft_redir_ipv4_eval, - .init = nft_redir_init, - .destroy = nft_redir_ipv4_destroy, - .dump = nft_redir_dump, - .validate = nft_redir_validate, -}; - -static struct nft_expr_type nft_redir_ipv4_type __read_mostly = { - .family = NFPROTO_IPV4, - .name = "redir", - .ops = &nft_redir_ipv4_ops, - .policy = nft_redir_policy, - .maxattr = NFTA_REDIR_MAX, - .owner = THIS_MODULE, -}; - -static int __init nft_redir_ipv4_module_init(void) -{ - return nft_register_expr(&nft_redir_ipv4_type); -} - -static void __exit nft_redir_ipv4_module_exit(void) -{ - nft_unregister_expr(&nft_redir_ipv4_type); -} - -module_init(nft_redir_ipv4_module_init); -module_exit(nft_redir_ipv4_module_exit); - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Arturo Borrero Gonzalez <arturo@debian.org>"); -MODULE_ALIAS_NFT_AF_EXPR(AF_INET, "redir"); diff --git a/net/ipv4/netlink.c b/net/ipv4/netlink.c index f86bb4f06609..d8e3a1fb8e82 100644 --- a/net/ipv4/netlink.c +++ b/net/ipv4/netlink.c @@ -3,9 +3,10 @@ #include <linux/types.h> #include <net/net_namespace.h> #include <net/netlink.h> +#include <linux/in6.h> #include <net/ip.h> -int rtm_getroute_parse_ip_proto(struct nlattr *attr, u8 *ip_proto, +int rtm_getroute_parse_ip_proto(struct nlattr *attr, u8 *ip_proto, u8 family, struct netlink_ext_ack *extack) { *ip_proto = nla_get_u8(attr); @@ -13,11 +14,19 @@ int rtm_getroute_parse_ip_proto(struct nlattr *attr, u8 *ip_proto, switch (*ip_proto) { case IPPROTO_TCP: case IPPROTO_UDP: + return 0; case IPPROTO_ICMP: + if (family != AF_INET) + break; + return 0; +#if IS_ENABLED(CONFIG_IPV6) + case IPPROTO_ICMPV6: + if (family != AF_INET6) + break; return 0; - default: - NL_SET_ERR_MSG(extack, "Unsupported ip proto"); - return -EOPNOTSUPP; +#endif } + NL_SET_ERR_MSG(extack, "Unsupported ip proto"); + return -EOPNOTSUPP; } EXPORT_SYMBOL_GPL(rtm_getroute_parse_ip_proto); diff --git a/net/ipv4/route.c b/net/ipv4/route.c index ce92f73cf104..738ff0a1a048 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -887,13 +887,15 @@ void ip_rt_send_redirect(struct sk_buff *skb) /* No redirected packets during ip_rt_redirect_silence; * reset the algorithm. 
*/ - if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence)) + if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence)) { peer->rate_tokens = 0; + peer->n_redirects = 0; + } /* Too many ignored redirects; do not send anything * set dst.rate_last to the last seen redirected packet. */ - if (peer->rate_tokens >= ip_rt_redirect_number) { + if (peer->n_redirects >= ip_rt_redirect_number) { peer->rate_last = jiffies; goto out_put_peer; } @@ -910,6 +912,7 @@ void ip_rt_send_redirect(struct sk_buff *skb) icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw); peer->rate_last = jiffies; ++peer->rate_tokens; + ++peer->n_redirects; #ifdef CONFIG_IP_ROUTE_VERBOSE if (log_martians && peer->rate_tokens == ip_rt_redirect_number) @@ -1608,7 +1611,8 @@ int ip_mc_validate_source(struct sk_buff *skb, __be32 daddr, __be32 saddr, return -EINVAL; if (ipv4_is_zeronet(saddr)) { - if (!ipv4_is_local_multicast(daddr)) + if (!ipv4_is_local_multicast(daddr) && + ip_hdr(skb)->protocol != IPPROTO_IGMP) return -EINVAL; } else { err = fib_validate_source(skb, saddr, 0, tos, 0, dev, @@ -1816,6 +1820,7 @@ out: int fib_multipath_hash(const struct net *net, const struct flowi4 *fl4, const struct sk_buff *skb, struct flow_keys *flkeys) { + u32 multipath_hash = fl4 ? fl4->flowi4_multipath_hash : 0; struct flow_keys hash_keys; u32 mhash; @@ -1866,6 +1871,9 @@ int fib_multipath_hash(const struct net *net, const struct flowi4 *fl4, } mhash = flow_hash_from_keys(&hash_keys); + if (multipath_hash) + mhash = jhash_2words(mhash, multipath_hash, 0); + return mhash >> 1; } #endif /* CONFIG_IP_ROUTE_MULTIPATH */ @@ -2763,6 +2771,75 @@ static struct sk_buff *inet_rtm_getroute_build_skb(__be32 src, __be32 dst, return skb; } +static int inet_rtm_valid_getroute_req(struct sk_buff *skb, + const struct nlmsghdr *nlh, + struct nlattr **tb, + struct netlink_ext_ack *extack) +{ + struct rtmsg *rtm; + int i, err; + + if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*rtm))) { + NL_SET_ERR_MSG(extack, + "ipv4: Invalid header for route get request"); + return -EINVAL; + } + + if (!netlink_strict_get_check(skb)) + return nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, + rtm_ipv4_policy, extack); + + rtm = nlmsg_data(nlh); + if ((rtm->rtm_src_len && rtm->rtm_src_len != 32) || + (rtm->rtm_dst_len && rtm->rtm_dst_len != 32) || + rtm->rtm_table || rtm->rtm_protocol || + rtm->rtm_scope || rtm->rtm_type) { + NL_SET_ERR_MSG(extack, "ipv4: Invalid values in header for route get request"); + return -EINVAL; + } + + if (rtm->rtm_flags & ~(RTM_F_NOTIFY | + RTM_F_LOOKUP_TABLE | + RTM_F_FIB_MATCH)) { + NL_SET_ERR_MSG(extack, "ipv4: Unsupported rtm_flags for route get request"); + return -EINVAL; + } + + err = nlmsg_parse_strict(nlh, sizeof(*rtm), tb, RTA_MAX, + rtm_ipv4_policy, extack); + if (err) + return err; + + if ((tb[RTA_SRC] && !rtm->rtm_src_len) || + (tb[RTA_DST] && !rtm->rtm_dst_len)) { + NL_SET_ERR_MSG(extack, "ipv4: rtm_src_len and rtm_dst_len must be 32 for IPv4"); + return -EINVAL; + } + + for (i = 0; i <= RTA_MAX; i++) { + if (!tb[i]) + continue; + + switch (i) { + case RTA_IIF: + case RTA_OIF: + case RTA_SRC: + case RTA_DST: + case RTA_IP_PROTO: + case RTA_SPORT: + case RTA_DPORT: + case RTA_MARK: + case RTA_UID: + break; + default: + NL_SET_ERR_MSG(extack, "ipv4: Unsupported attribute in route get request"); + return -EINVAL; + } + } + + return 0; +} + static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { @@ -2783,8 +2860,7 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct 
nlmsghdr *nlh, int err; int mark; - err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy, - extack); + err = inet_rtm_valid_getroute_req(in_skb, nlh, tb, extack); if (err < 0) return err; @@ -2800,7 +2876,7 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, if (tb[RTA_IP_PROTO]) { err = rtm_getroute_parse_ip_proto(tb[RTA_IP_PROTO], - &ip_proto, extack); + &ip_proto, AF_INET, extack); if (err) return err; } diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 27e2f6837062..ad07dd71063d 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1127,7 +1127,8 @@ void tcp_free_fastopen_req(struct tcp_sock *tp) } static int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg, - int *copied, size_t size) + int *copied, size_t size, + struct ubuf_info *uarg) { struct tcp_sock *tp = tcp_sk(sk); struct inet_sock *inet = inet_sk(sk); @@ -1147,6 +1148,7 @@ static int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg, return -ENOBUFS; tp->fastopen_req->data = msg; tp->fastopen_req->size = size; + tp->fastopen_req->uarg = uarg; if (inet->defer_connect) { err = tcp_connect(sk); @@ -1186,11 +1188,6 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size) flags = msg->msg_flags; if (flags & MSG_ZEROCOPY && size && sock_flag(sk, SOCK_ZEROCOPY)) { - if (sk->sk_state != TCP_ESTABLISHED) { - err = -EINVAL; - goto out_err; - } - skb = tcp_write_queue_tail(sk); uarg = sock_zerocopy_realloc(sk, size, skb_zcopy(skb)); if (!uarg) { @@ -1205,7 +1202,7 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size) if (unlikely(flags & MSG_FASTOPEN || inet_sk(sk)->defer_connect) && !tp->repair) { - err = tcp_sendmsg_fastopen(sk, msg, &copied_syn, size); + err = tcp_sendmsg_fastopen(sk, msg, &copied_syn, size, uarg); if (err == -EINPROGRESS && copied_syn > 0) goto out; else if (err) @@ -1415,7 +1412,8 @@ do_fault: /* It is the one place in all of TCP, except connection * reset, where we can be unlinking the send_head. 
*/ - tcp_check_send_head(sk, skb); + if (tcp_write_queue_empty(sk)) + tcp_chrono_stop(sk, TCP_CHRONO_BUSY); sk_wmem_free_skb(sk, skb); } @@ -1554,7 +1552,7 @@ static void tcp_cleanup_rbuf(struct sock *sk, int copied) (copied > 0 && ((icsk->icsk_ack.pending & ICSK_ACK_PUSHED2) || ((icsk->icsk_ack.pending & ICSK_ACK_PUSHED) && - !icsk->icsk_ack.pingpong)) && + !inet_csk_in_pingpong_mode(sk))) && !atomic_read(&sk->sk_rmem_alloc))) time_to_ack = true; } @@ -1847,57 +1845,78 @@ out: #endif static void tcp_update_recv_tstamps(struct sk_buff *skb, - struct scm_timestamping *tss) + struct scm_timestamping_internal *tss) { if (skb->tstamp) - tss->ts[0] = ktime_to_timespec(skb->tstamp); + tss->ts[0] = ktime_to_timespec64(skb->tstamp); else - tss->ts[0] = (struct timespec) {0}; + tss->ts[0] = (struct timespec64) {0}; if (skb_hwtstamps(skb)->hwtstamp) - tss->ts[2] = ktime_to_timespec(skb_hwtstamps(skb)->hwtstamp); + tss->ts[2] = ktime_to_timespec64(skb_hwtstamps(skb)->hwtstamp); else - tss->ts[2] = (struct timespec) {0}; + tss->ts[2] = (struct timespec64) {0}; } /* Similar to __sock_recv_timestamp, but does not require an skb */ static void tcp_recv_timestamp(struct msghdr *msg, const struct sock *sk, - struct scm_timestamping *tss) + struct scm_timestamping_internal *tss) { - struct timeval tv; + int new_tstamp = sock_flag(sk, SOCK_TSTAMP_NEW); bool has_timestamping = false; if (tss->ts[0].tv_sec || tss->ts[0].tv_nsec) { if (sock_flag(sk, SOCK_RCVTSTAMP)) { if (sock_flag(sk, SOCK_RCVTSTAMPNS)) { - put_cmsg(msg, SOL_SOCKET, SCM_TIMESTAMPNS, - sizeof(tss->ts[0]), &tss->ts[0]); - } else { - tv.tv_sec = tss->ts[0].tv_sec; - tv.tv_usec = tss->ts[0].tv_nsec / 1000; + if (new_tstamp) { + struct __kernel_timespec kts = {tss->ts[0].tv_sec, tss->ts[0].tv_nsec}; - put_cmsg(msg, SOL_SOCKET, SCM_TIMESTAMP, - sizeof(tv), &tv); + put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMPNS_NEW, + sizeof(kts), &kts); + } else { + struct timespec ts_old = timespec64_to_timespec(tss->ts[0]); + + put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMPNS_OLD, + sizeof(ts_old), &ts_old); + } + } else { + if (new_tstamp) { + struct __kernel_sock_timeval stv; + + stv.tv_sec = tss->ts[0].tv_sec; + stv.tv_usec = tss->ts[0].tv_nsec / 1000; + put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMP_NEW, + sizeof(stv), &stv); + } else { + struct __kernel_old_timeval tv; + + tv.tv_sec = tss->ts[0].tv_sec; + tv.tv_usec = tss->ts[0].tv_nsec / 1000; + put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMP_OLD, + sizeof(tv), &tv); + } } } if (sk->sk_tsflags & SOF_TIMESTAMPING_SOFTWARE) has_timestamping = true; else - tss->ts[0] = (struct timespec) {0}; + tss->ts[0] = (struct timespec64) {0}; } if (tss->ts[2].tv_sec || tss->ts[2].tv_nsec) { if (sk->sk_tsflags & SOF_TIMESTAMPING_RAW_HARDWARE) has_timestamping = true; else - tss->ts[2] = (struct timespec) {0}; + tss->ts[2] = (struct timespec64) {0}; } if (has_timestamping) { - tss->ts[1] = (struct timespec) {0}; - put_cmsg(msg, SOL_SOCKET, SCM_TIMESTAMPING, - sizeof(*tss), tss); + tss->ts[1] = (struct timespec64) {0}; + if (sock_flag(sk, SOCK_TSTAMP_NEW)) + put_cmsg_scm_timestamping64(msg, tss); + else + put_cmsg_scm_timestamping(msg, tss); } } @@ -1938,7 +1957,7 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock, long timeo; struct sk_buff *skb, *last; u32 urg_hole = 0; - struct scm_timestamping tss; + struct scm_timestamping_internal tss; bool has_tss = false; bool has_cmsg; @@ -2528,6 +2547,7 @@ void tcp_write_queue_purge(struct sock *sk) sk_mem_reclaim(sk); tcp_clear_all_retrans_hints(tcp_sk(sk)); tcp_sk(sk)->packets_out 
= 0; + inet_csk(sk)->icsk_backoff = 0; } int tcp_disconnect(struct sock *sk, int flags) @@ -2572,6 +2592,7 @@ int tcp_disconnect(struct sock *sk, int flags) sk->sk_shutdown = 0; sock_reset_flag(sk, SOCK_DONE); tp->srtt_us = 0; + tp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT); tp->rcv_rtt_last_tsecr = 0; tp->write_seq += tp->max_window + 2; if (tp->write_seq == 0) @@ -2579,7 +2600,9 @@ int tcp_disconnect(struct sock *sk, int flags) icsk->icsk_backoff = 0; tp->snd_cwnd = 2; icsk->icsk_probes_out = 0; + icsk->icsk_rto = TCP_TIMEOUT_INIT; tp->snd_ssthresh = TCP_INFINITE_SSTHRESH; + tp->snd_cwnd = TCP_INIT_CWND; tp->snd_cwnd_cnt = 0; tp->window_clamp = 0; tp->delivered_ce = 0; @@ -2603,6 +2626,23 @@ int tcp_disconnect(struct sock *sk, int flags) tp->duplicate_sack[0].end_seq = 0; tp->dsack_dups = 0; tp->reord_seen = 0; + tp->retrans_out = 0; + tp->sacked_out = 0; + tp->tlp_high_seq = 0; + tp->last_oow_ack_time = 0; + /* There's a bubble in the pipe until at least the first ACK. */ + tp->app_limited = ~0U; + tp->rack.mstamp = 0; + tp->rack.advanced = 0; + tp->rack.reo_wnd_steps = 1; + tp->rack.last_delivered = 0; + tp->rack.reo_wnd_persist = 0; + tp->rack.dsack_seen = 0; + tp->syn_data_acked = 0; + tp->rx_opt.saw_tstamp = 0; + tp->rx_opt.dsack = 0; + tp->rx_opt.num_sacks = 0; + /* Clean up fastopen related fields */ tcp_free_fastopen_req(tp); @@ -2968,16 +3008,16 @@ static int do_tcp_setsockopt(struct sock *sk, int level, case TCP_QUICKACK: if (!val) { - icsk->icsk_ack.pingpong = 1; + inet_csk_enter_pingpong_mode(sk); } else { - icsk->icsk_ack.pingpong = 0; + inet_csk_exit_pingpong_mode(sk); if ((1 << sk->sk_state) & (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT) && inet_csk_ack_scheduled(sk)) { icsk->icsk_ack.pending |= ICSK_ACK_PUSHED; tcp_cleanup_rbuf(sk, 1); if (!(val & 1)) - icsk->icsk_ack.pingpong = 1; + inet_csk_enter_pingpong_mode(sk); } } break; @@ -3391,7 +3431,7 @@ static int do_tcp_getsockopt(struct sock *sk, int level, return 0; } case TCP_QUICKACK: - val = !icsk->icsk_ack.pingpong; + val = !inet_csk_in_pingpong_mode(sk); break; case TCP_CONGESTION: @@ -3659,7 +3699,7 @@ bool tcp_alloc_md5sig_pool(void) if (!tcp_md5sig_pool_populated) { __tcp_alloc_md5sig_pool(); if (tcp_md5sig_pool_populated) - static_key_slow_inc(&tcp_md5_needed); + static_branch_inc(&tcp_md5_needed); } mutex_unlock(&tcp_md5sig_mutex); diff --git a/net/ipv4/tcp_bbr.c b/net/ipv4/tcp_bbr.c index 0f497fc49c3f..56be7d27f208 100644 --- a/net/ipv4/tcp_bbr.c +++ b/net/ipv4/tcp_bbr.c @@ -115,6 +115,14 @@ struct bbr { unused_b:5; u32 prior_cwnd; /* prior cwnd upon entering loss recovery */ u32 full_bw; /* recent bw, to estimate if pipe is full */ + + /* For tracking ACK aggregation: */ + u64 ack_epoch_mstamp; /* start of ACK sampling epoch */ + u16 extra_acked[2]; /* max excess data ACKed in epoch */ + u32 ack_epoch_acked:20, /* packets (S)ACKed in sampling epoch */ + extra_acked_win_rtts:5, /* age of extra_acked, in round trips */ + extra_acked_win_idx:1, /* current index in extra_acked array */ + unused_c:6; }; #define CYCLE_LEN 8 /* number of phases in a pacing gain cycle */ @@ -182,6 +190,15 @@ static const u32 bbr_lt_bw_diff = 4000 / 8; /* If we estimate we're policed, use lt_bw for this many round trips: */ static const u32 bbr_lt_bw_max_rtts = 48; +/* Gain factor for adding extra_acked to target cwnd: */ +static const int bbr_extra_acked_gain = BBR_UNIT; +/* Window length of extra_acked window. 
*/ +static const u32 bbr_extra_acked_win_rtts = 5; +/* Max allowed val for ack_epoch_acked, after which sampling epoch is reset */ +static const u32 bbr_ack_epoch_acked_reset_thresh = 1U << 20; +/* Time period for clamping cwnd increment due to ack aggregation */ +static const u32 bbr_extra_acked_max_us = 100 * 1000; + static void bbr_check_probe_rtt_done(struct sock *sk); /* Do we estimate that STARTUP filled the pipe? */ @@ -208,6 +225,16 @@ static u32 bbr_bw(const struct sock *sk) return bbr->lt_use_bw ? bbr->lt_bw : bbr_max_bw(sk); } +/* Return maximum extra acked in past k-2k round trips, + * where k = bbr_extra_acked_win_rtts. + */ +static u16 bbr_extra_acked(const struct sock *sk) +{ + struct bbr *bbr = inet_csk_ca(sk); + + return max(bbr->extra_acked[0], bbr->extra_acked[1]); +} + /* Return rate in bytes per second, optionally with a gain. * The order here is chosen carefully to avoid overflow of u64. This should * work for input rates of up to 2.9Tbit/sec and gain of 2.89x. @@ -305,6 +332,8 @@ static void bbr_cwnd_event(struct sock *sk, enum tcp_ca_event event) if (event == CA_EVENT_TX_START && tp->app_limited) { bbr->idle_restart = 1; + bbr->ack_epoch_mstamp = tp->tcp_mstamp; + bbr->ack_epoch_acked = 0; /* Avoid pointless buffer overflows: pace at est. bw if we don't * need more speed (we're restarting from idle and app-limited). */ @@ -315,30 +344,19 @@ static void bbr_cwnd_event(struct sock *sk, enum tcp_ca_event event) } } -/* Find target cwnd. Right-size the cwnd based on min RTT and the - * estimated bottleneck bandwidth: +/* Calculate bdp based on min RTT and the estimated bottleneck bandwidth: * - * cwnd = bw * min_rtt * gain = BDP * gain + * bdp = bw * min_rtt * gain * * The key factor, gain, controls the amount of queue. While a small gain * builds a smaller queue, it becomes more vulnerable to noise in RTT * measurements (e.g., delayed ACKs or other ACK compression effects). This * noise may cause BBR to under-estimate the rate. - * - * To achieve full performance in high-speed paths, we budget enough cwnd to - * fit full-sized skbs in-flight on both end hosts to fully utilize the path: - * - one skb in sending host Qdisc, - * - one skb in sending host TSO/GSO engine - * - one skb being received by receiver host LRO/GRO/delayed-ACK engine - * Don't worry, at low rates (bbr_min_tso_rate) this won't bloat cwnd because - * in such cases tso_segs_goal is 1. The minimum cwnd is 4 packets, - * which allows 2 outstanding 2-packet sequences, to try to keep pipe - * full even with ACK-every-other-packet delayed ACKs. */ -static u32 bbr_target_cwnd(struct sock *sk, u32 bw, int gain) +static u32 bbr_bdp(struct sock *sk, u32 bw, int gain) { struct bbr *bbr = inet_csk_ca(sk); - u32 cwnd; + u32 bdp; u64 w; /* If we've never had a valid RTT sample, cap cwnd at the initial @@ -353,7 +371,24 @@ static u32 bbr_target_cwnd(struct sock *sk, u32 bw, int gain) w = (u64)bw * bbr->min_rtt_us; /* Apply a gain to the given value, then remove the BW_SCALE shift. 
*/ - cwnd = (((w * gain) >> BBR_SCALE) + BW_UNIT - 1) / BW_UNIT; + bdp = (((w * gain) >> BBR_SCALE) + BW_UNIT - 1) / BW_UNIT; + + return bdp; +} + +/* To achieve full performance in high-speed paths, we budget enough cwnd to + * fit full-sized skbs in-flight on both end hosts to fully utilize the path: + * - one skb in sending host Qdisc, + * - one skb in sending host TSO/GSO engine + * - one skb being received by receiver host LRO/GRO/delayed-ACK engine + * Don't worry, at low rates (bbr_min_tso_rate) this won't bloat cwnd because + * in such cases tso_segs_goal is 1. The minimum cwnd is 4 packets, + * which allows 2 outstanding 2-packet sequences, to try to keep pipe + * full even with ACK-every-other-packet delayed ACKs. + */ +static u32 bbr_quantization_budget(struct sock *sk, u32 cwnd, int gain) +{ + struct bbr *bbr = inet_csk_ca(sk); /* Allow enough full-sized skbs in flight to utilize end systems. */ cwnd += 3 * bbr_tso_segs_goal(sk); @@ -368,6 +403,17 @@ static u32 bbr_target_cwnd(struct sock *sk, u32 bw, int gain) return cwnd; } +/* Find inflight based on min RTT and the estimated bottleneck bandwidth. */ +static u32 bbr_inflight(struct sock *sk, u32 bw, int gain) +{ + u32 inflight; + + inflight = bbr_bdp(sk, bw, gain); + inflight = bbr_quantization_budget(sk, inflight, gain); + + return inflight; +} + /* With pacing at lower layers, there's often less data "in the network" than * "in flight". With TSQ and departure time pacing at lower layers (e.g. fq), * we often have several skbs queued in the pacing layer with a pre-scheduled @@ -401,6 +447,22 @@ static u32 bbr_packets_in_net_at_edt(struct sock *sk, u32 inflight_now) return inflight_at_edt - interval_delivered; } +/* Find the cwnd increment based on estimate of ack aggregation */ +static u32 bbr_ack_aggregation_cwnd(struct sock *sk) +{ + u32 max_aggr_cwnd, aggr_cwnd = 0; + + if (bbr_extra_acked_gain && bbr_full_bw_reached(sk)) { + max_aggr_cwnd = ((u64)bbr_bw(sk) * bbr_extra_acked_max_us) + / BW_UNIT; + aggr_cwnd = (bbr_extra_acked_gain * bbr_extra_acked(sk)) + >> BBR_SCALE; + aggr_cwnd = min(aggr_cwnd, max_aggr_cwnd); + } + + return aggr_cwnd; +} + /* An optimization in BBR to reduce losses: On the first round of recovery, we * follow the packet conservation principle: send P packets per P packets acked. * After that, we slow-start and send at most 2*P packets per P packets acked. @@ -461,8 +523,15 @@ static void bbr_set_cwnd(struct sock *sk, const struct rate_sample *rs, if (bbr_set_cwnd_to_recover_or_restore(sk, rs, acked, &cwnd)) goto done; + target_cwnd = bbr_bdp(sk, bw, gain); + + /* Increment the cwnd to account for excess ACKed data that seems + * due to aggregation (of data and/or ACKs) visible in the ACK stream. + */ + target_cwnd += bbr_ack_aggregation_cwnd(sk); + target_cwnd = bbr_quantization_budget(sk, target_cwnd, gain); + /* If we're below target cwnd, slow start cwnd toward target cwnd. */ - target_cwnd = bbr_target_cwnd(sk, bw, gain); if (bbr_full_bw_reached(sk)) /* only cut cwnd if we filled the pipe */ cwnd = min(cwnd + acked, target_cwnd); else if (cwnd < target_cwnd || tp->delivered < TCP_INIT_CWND) @@ -503,14 +572,14 @@ static bool bbr_is_next_cycle_phase(struct sock *sk, if (bbr->pacing_gain > BBR_UNIT) return is_full_length && (rs->losses || /* perhaps pacing_gain*BDP won't fit */ - inflight >= bbr_target_cwnd(sk, bw, bbr->pacing_gain)); + inflight >= bbr_inflight(sk, bw, bbr->pacing_gain)); /* A pacing_gain < 1.0 tries to drain extra queue we added if bw * probing didn't find more bw. 
If inflight falls to match BDP then we * estimate queue is drained; persisting would underutilize the pipe. */ return is_full_length || - inflight <= bbr_target_cwnd(sk, bw, BBR_UNIT); + inflight <= bbr_inflight(sk, bw, BBR_UNIT); } static void bbr_advance_cycle_phase(struct sock *sk) @@ -727,6 +796,67 @@ static void bbr_update_bw(struct sock *sk, const struct rate_sample *rs) } } +/* Estimates the windowed max degree of ack aggregation. + * This is used to provision extra in-flight data to keep sending during + * inter-ACK silences. + * + * Degree of ack aggregation is estimated as extra data acked beyond expected. + * + * max_extra_acked = "maximum recent excess data ACKed beyond max_bw * interval" + * cwnd += max_extra_acked + * + * Max extra_acked is clamped by cwnd and bw * bbr_extra_acked_max_us (100 ms). + * Max filter is an approximate sliding window of 5-10 (packet timed) round + * trips. + */ +static void bbr_update_ack_aggregation(struct sock *sk, + const struct rate_sample *rs) +{ + u32 epoch_us, expected_acked, extra_acked; + struct bbr *bbr = inet_csk_ca(sk); + struct tcp_sock *tp = tcp_sk(sk); + + if (!bbr_extra_acked_gain || rs->acked_sacked <= 0 || + rs->delivered < 0 || rs->interval_us <= 0) + return; + + if (bbr->round_start) { + bbr->extra_acked_win_rtts = min(0x1F, + bbr->extra_acked_win_rtts + 1); + if (bbr->extra_acked_win_rtts >= bbr_extra_acked_win_rtts) { + bbr->extra_acked_win_rtts = 0; + bbr->extra_acked_win_idx = bbr->extra_acked_win_idx ? + 0 : 1; + bbr->extra_acked[bbr->extra_acked_win_idx] = 0; + } + } + + /* Compute how many packets we expected to be delivered over epoch. */ + epoch_us = tcp_stamp_us_delta(tp->delivered_mstamp, + bbr->ack_epoch_mstamp); + expected_acked = ((u64)bbr_bw(sk) * epoch_us) / BW_UNIT; + + /* Reset the aggregation epoch if ACK rate is below expected rate or + * significantly large no. of ack received since epoch (potentially + * quite old epoch). + */ + if (bbr->ack_epoch_acked <= expected_acked || + (bbr->ack_epoch_acked + rs->acked_sacked >= + bbr_ack_epoch_acked_reset_thresh)) { + bbr->ack_epoch_acked = 0; + bbr->ack_epoch_mstamp = tp->delivered_mstamp; + expected_acked = 0; + } + + /* Compute excess data delivered, beyond what was expected. 
*/ + bbr->ack_epoch_acked = min_t(u32, 0xFFFFF, + bbr->ack_epoch_acked + rs->acked_sacked); + extra_acked = bbr->ack_epoch_acked - expected_acked; + extra_acked = min(extra_acked, tp->snd_cwnd); + if (extra_acked > bbr->extra_acked[bbr->extra_acked_win_idx]) + bbr->extra_acked[bbr->extra_acked_win_idx] = extra_acked; +} + /* Estimate when the pipe is full, using the change in delivery rate: BBR * estimates that STARTUP filled the pipe if the estimated bw hasn't changed by * at least bbr_full_bw_thresh (25%) after bbr_full_bw_cnt (3) non-app-limited @@ -762,11 +892,11 @@ static void bbr_check_drain(struct sock *sk, const struct rate_sample *rs) if (bbr->mode == BBR_STARTUP && bbr_full_bw_reached(sk)) { bbr->mode = BBR_DRAIN; /* drain queue we created */ tcp_sk(sk)->snd_ssthresh = - bbr_target_cwnd(sk, bbr_max_bw(sk), BBR_UNIT); + bbr_inflight(sk, bbr_max_bw(sk), BBR_UNIT); } /* fall through to check if in-flight is already small: */ if (bbr->mode == BBR_DRAIN && bbr_packets_in_net_at_edt(sk, tcp_packets_in_flight(tcp_sk(sk))) <= - bbr_target_cwnd(sk, bbr_max_bw(sk), BBR_UNIT)) + bbr_inflight(sk, bbr_max_bw(sk), BBR_UNIT)) bbr_reset_probe_bw_mode(sk); /* we estimate queue is drained */ } @@ -881,6 +1011,7 @@ static void bbr_update_gains(struct sock *sk) static void bbr_update_model(struct sock *sk, const struct rate_sample *rs) { bbr_update_bw(sk, rs); + bbr_update_ack_aggregation(sk, rs); bbr_update_cycle_phase(sk, rs); bbr_check_full_bw_reached(sk, rs); bbr_check_drain(sk, rs); @@ -932,6 +1063,13 @@ static void bbr_init(struct sock *sk) bbr_reset_lt_bw_sampling(sk); bbr_reset_startup_mode(sk); + bbr->ack_epoch_mstamp = tp->tcp_mstamp; + bbr->ack_epoch_acked = 0; + bbr->extra_acked_win_rtts = 0; + bbr->extra_acked_win_idx = 0; + bbr->extra_acked[0] = 0; + bbr->extra_acked[1] = 0; + cmpxchg(&sk->sk_pacing_status, SK_PACING_NONE, SK_PACING_NEEDED); } diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 76858b14ebe9..4eb0c8ca3c60 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -221,7 +221,7 @@ void tcp_enter_quickack_mode(struct sock *sk, unsigned int max_quickacks) struct inet_connection_sock *icsk = inet_csk(sk); tcp_incr_quickack(sk, max_quickacks); - icsk->icsk_ack.pingpong = 0; + inet_csk_exit_pingpong_mode(sk); icsk->icsk_ack.ato = TCP_ATO_MIN; } EXPORT_SYMBOL(tcp_enter_quickack_mode); @@ -236,7 +236,7 @@ static bool tcp_in_quickack_mode(struct sock *sk) const struct dst_entry *dst = __sk_dst_get(sk); return (dst && dst_metric(dst, RTAX_QUICKACK)) || - (icsk->icsk_ack.quick && !icsk->icsk_ack.pingpong); + (icsk->icsk_ack.quick && !inet_csk_in_pingpong_mode(sk)); } static void tcp_ecn_queue_cwr(struct tcp_sock *tp) @@ -1574,9 +1574,7 @@ static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk, return skb; } -static struct sk_buff *tcp_sacktag_bsearch(struct sock *sk, - struct tcp_sacktag_state *state, - u32 seq) +static struct sk_buff *tcp_sacktag_bsearch(struct sock *sk, u32 seq) { struct rb_node *parent, **p = &sk->tcp_rtx_queue.rb_node; struct sk_buff *skb; @@ -1598,13 +1596,12 @@ static struct sk_buff *tcp_sacktag_bsearch(struct sock *sk, } static struct sk_buff *tcp_sacktag_skip(struct sk_buff *skb, struct sock *sk, - struct tcp_sacktag_state *state, u32 skip_to_seq) { if (skb && after(TCP_SKB_CB(skb)->seq, skip_to_seq)) return skb; - return tcp_sacktag_bsearch(sk, state, skip_to_seq); + return tcp_sacktag_bsearch(sk, skip_to_seq); } static struct sk_buff *tcp_maybe_skipping_dsack(struct sk_buff *skb, @@ -1617,7 +1614,7 @@ static struct 
sk_buff *tcp_maybe_skipping_dsack(struct sk_buff *skb, return skb; if (before(next_dup->start_seq, skip_to_seq)) { - skb = tcp_sacktag_skip(skb, sk, state, next_dup->start_seq); + skb = tcp_sacktag_skip(skb, sk, next_dup->start_seq); skb = tcp_sacktag_walk(skb, sk, NULL, state, next_dup->start_seq, next_dup->end_seq, 1); @@ -1758,8 +1755,7 @@ tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb, /* Head todo? */ if (before(start_seq, cache->start_seq)) { - skb = tcp_sacktag_skip(skb, sk, state, - start_seq); + skb = tcp_sacktag_skip(skb, sk, start_seq); skb = tcp_sacktag_walk(skb, sk, next_dup, state, start_seq, @@ -1785,7 +1781,7 @@ tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb, goto walk; } - skb = tcp_sacktag_skip(skb, sk, state, cache->end_seq); + skb = tcp_sacktag_skip(skb, sk, cache->end_seq); /* Check overlap against next cached too (past this one already) */ cache++; continue; @@ -1796,7 +1792,7 @@ tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb, if (!skb) break; } - skb = tcp_sacktag_skip(skb, sk, state, start_seq); + skb = tcp_sacktag_skip(skb, sk, start_seq); walk: skb = tcp_sacktag_walk(skb, sk, next_dup, state, @@ -3595,7 +3591,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) * this segment (RFC793 Section 3.9). */ if (after(ack, tp->snd_nxt)) - goto invalid_ack; + return -1; if (after(ack, prior_snd_una)) { flag |= FLAG_SND_UNA_ADVANCED; @@ -3714,10 +3710,6 @@ no_queue: tcp_process_tlp_ack(sk, ack, flag); return 1; -invalid_ack: - SOCK_DEBUG(sk, "Ack %u after %u:%u\n", ack, tp->snd_una, tp->snd_nxt); - return -1; - old_ack: /* If data was SACKed, tag it and see if we should send more data. * If data was DSACKed, see if we can undo a cwnd reduction. @@ -3731,7 +3723,6 @@ old_ack: tcp_xmit_recovery(sk, rexmit); } - SOCK_DEBUG(sk, "Ack %u before %u:%u\n", ack, tp->snd_una, tp->snd_nxt); return 0; } @@ -4094,7 +4085,7 @@ void tcp_fin(struct sock *sk) case TCP_ESTABLISHED: /* Move to CLOSE_WAIT */ tcp_set_state(sk, TCP_CLOSE_WAIT); - inet_csk(sk)->icsk_ack.pingpong = 1; + inet_csk_enter_pingpong_mode(sk); break; case TCP_CLOSE_WAIT: @@ -4432,13 +4423,9 @@ static void tcp_ofo_queue(struct sock *sk) rb_erase(&skb->rbnode, &tp->out_of_order_queue); if (unlikely(!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt))) { - SOCK_DEBUG(sk, "ofo packet was already received\n"); tcp_drop(sk, skb); continue; } - SOCK_DEBUG(sk, "ofo requeuing : rcv_next %X seq %X - %X\n", - tp->rcv_nxt, TCP_SKB_CB(skb)->seq, - TCP_SKB_CB(skb)->end_seq); tail = skb_peek_tail(&sk->sk_receive_queue); eaten = tail && tcp_try_coalesce(sk, tail, skb, &fragstolen); @@ -4502,8 +4489,6 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb) NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFOQUEUE); seq = TCP_SKB_CB(skb)->seq; end_seq = TCP_SKB_CB(skb)->end_seq; - SOCK_DEBUG(sk, "out of order segment: rcv_next %X seq %X - %X\n", - tp->rcv_nxt, seq, end_seq); p = &tp->out_of_order_queue.rb_node; if (RB_EMPTY_ROOT(&tp->out_of_order_queue)) { @@ -4779,10 +4764,6 @@ drop: if (before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) { /* Partial packet, seq < rcv_next < end_seq */ - SOCK_DEBUG(sk, "partial packet: rcv_next %X seq %X - %X\n", - tp->rcv_nxt, TCP_SKB_CB(skb)->seq, - TCP_SKB_CB(skb)->end_seq); - tcp_dsack_set(sk, TCP_SKB_CB(skb)->seq, tp->rcv_nxt); /* If window is closed, drop tail of packet. 
But after @@ -5061,8 +5042,6 @@ static int tcp_prune_queue(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); - SOCK_DEBUG(sk, "prune_queue: c=%x\n", tp->copied_seq); - NET_INC_STATS(sock_net(sk), LINUX_MIB_PRUNECALLED); if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) @@ -5889,7 +5868,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, return -1; if (sk->sk_write_pending || icsk->icsk_accept_queue.rskq_defer_accept || - icsk->icsk_ack.pingpong) { + inet_csk_in_pingpong_mode(sk)) { /* Save one ACK. Data will be ready after * several ticks, if write_pending is set. * diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index efc6fef692ff..831d844a27ca 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -536,12 +536,15 @@ int tcp_v4_err(struct sk_buff *icmp_skb, u32 info) if (sock_owned_by_user(sk)) break; + skb = tcp_rtx_queue_head(sk); + if (WARN_ON_ONCE(!skb)) + break; + icsk->icsk_backoff--; icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) : TCP_TIMEOUT_INIT; icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX); - skb = tcp_rtx_queue_head(sk); tcp_mstamp_refresh(tp); delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb)); @@ -970,7 +973,7 @@ static void tcp_v4_reqsk_destructor(struct request_sock *req) * We need to maintain these in the sk structure. */ -struct static_key tcp_md5_needed __read_mostly; +DEFINE_STATIC_KEY_FALSE(tcp_md5_needed); EXPORT_SYMBOL(tcp_md5_needed); /* Find the Key structure for an address. */ @@ -2437,7 +2440,7 @@ static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i) refcount_read(&sk->sk_refcnt), sk, jiffies_to_clock_t(icsk->icsk_rto), jiffies_to_clock_t(icsk->icsk_ack.ato), - (icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong, + (icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sk), tp->snd_cwnd, state == TCP_LISTEN ? fastopenq->max_qlen : diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 12affb7864d9..79900f783e0d 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -294,12 +294,15 @@ void tcp_time_wait(struct sock *sk, int state, int timeo) * so the timewait ack generating code has the key. 
*/ do { - struct tcp_md5sig_key *key; tcptw->tw_md5_key = NULL; - key = tp->af_specific->md5_lookup(sk, sk); - if (key) { - tcptw->tw_md5_key = kmemdup(key, sizeof(*key), GFP_ATOMIC); - BUG_ON(tcptw->tw_md5_key && !tcp_alloc_md5sig_pool()); + if (static_branch_unlikely(&tcp_md5_needed)) { + struct tcp_md5sig_key *key; + + key = tp->af_specific->md5_lookup(sk, sk); + if (key) { + tcptw->tw_md5_key = kmemdup(key, sizeof(*key), GFP_ATOMIC); + BUG_ON(tcptw->tw_md5_key && !tcp_alloc_md5sig_pool()); + } } } while (0); #endif @@ -338,10 +341,12 @@ EXPORT_SYMBOL(tcp_time_wait); void tcp_twsk_destructor(struct sock *sk) { #ifdef CONFIG_TCP_MD5SIG - struct tcp_timewait_sock *twsk = tcp_twsk(sk); + if (static_branch_unlikely(&tcp_md5_needed)) { + struct tcp_timewait_sock *twsk = tcp_twsk(sk); - if (twsk->tw_md5_key) - kfree_rcu(twsk->tw_md5_key, rcu); + if (twsk->tw_md5_key) + kfree_rcu(twsk->tw_md5_key, rcu); + } #endif } EXPORT_SYMBOL_GPL(tcp_twsk_destructor); @@ -479,43 +484,16 @@ struct sock *tcp_create_openreq_child(const struct sock *sk, tcp_init_wl(newtp, treq->rcv_isn); - newtp->srtt_us = 0; - newtp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT); minmax_reset(&newtp->rtt_min, tcp_jiffies32, ~0U); - newicsk->icsk_rto = TCP_TIMEOUT_INIT; newicsk->icsk_ack.lrcvtime = tcp_jiffies32; - newtp->packets_out = 0; - newtp->retrans_out = 0; - newtp->sacked_out = 0; - newtp->snd_ssthresh = TCP_INFINITE_SSTHRESH; - newtp->tlp_high_seq = 0; newtp->lsndtime = tcp_jiffies32; newsk->sk_txhash = treq->txhash; - newtp->last_oow_ack_time = 0; newtp->total_retrans = req->num_retrans; - /* So many TCP implementations out there (incorrectly) count the - * initial SYN frame in their delayed-ACK and congestion control - * algorithms that we must have the following bandaid to talk - * efficiently to them. -DaveM - */ - newtp->snd_cwnd = TCP_INIT_CWND; - newtp->snd_cwnd_cnt = 0; - - /* There's a bubble in the pipe until at least the first ACK. */ - newtp->app_limited = ~0U; - tcp_init_xmit_timers(newsk); newtp->write_seq = newtp->pushed_seq = treq->snt_isn + 1; - newtp->rx_opt.saw_tstamp = 0; - - newtp->rx_opt.dsack = 0; - newtp->rx_opt.num_sacks = 0; - - newtp->urg_data = 0; - if (sock_flag(newsk, SOCK_KEEPOPEN)) inet_csk_reset_keepalive_timer(newsk, keepalive_time_when(newtp)); @@ -556,13 +534,6 @@ struct sock *tcp_create_openreq_child(const struct sock *sk, tcp_ecn_openreq_child(newtp, req); newtp->fastopen_req = NULL; newtp->fastopen_rsk = NULL; - newtp->syn_data_acked = 0; - newtp->rack.mstamp = 0; - newtp->rack.advanced = 0; - newtp->rack.reo_wnd_steps = 1; - newtp->rack.last_delivered = 0; - newtp->rack.reo_wnd_persist = 0; - newtp->rack.dsack_seen = 0; __TCP_INC_STATS(sock_net(sk), TCP_MIB_PASSIVEOPENS); diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 730bc44dbad9..4522579aaca2 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -165,13 +165,16 @@ static void tcp_event_data_sent(struct tcp_sock *tp, if (tcp_packets_in_flight(tp) == 0) tcp_ca_event(sk, CA_EVENT_TX_START); - tp->lsndtime = now; - - /* If it is a reply for ato after last received - * packet, enter pingpong mode. + /* If this is the first data packet sent in response to the + * previous received data, + * and it is a reply for ato after last received packet, + * increase pingpong count. 
*/ - if ((u32)(now - icsk->icsk_ack.lrcvtime) < icsk->icsk_ack.ato) - icsk->icsk_ack.pingpong = 1; + if (before(tp->lsndtime, icsk->icsk_ack.lrcvtime) && + (u32)(now - icsk->icsk_ack.lrcvtime) < icsk->icsk_ack.ato) + inet_csk_inc_pingpong_cnt(sk); + + tp->lsndtime = now; } /* Account for an ACK we sent. */ @@ -594,7 +597,7 @@ static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb, *md5 = NULL; #ifdef CONFIG_TCP_MD5SIG - if (static_key_false(&tcp_md5_needed) && + if (static_branch_unlikely(&tcp_md5_needed) && rcu_access_pointer(tp->md5sig_info)) { *md5 = tp->af_specific->md5_lookup(sk, sk); if (*md5) { @@ -731,7 +734,7 @@ static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb *md5 = NULL; #ifdef CONFIG_TCP_MD5SIG - if (static_key_false(&tcp_md5_needed) && + if (static_branch_unlikely(&tcp_md5_needed) && rcu_access_pointer(tp->md5sig_info)) { *md5 = tp->af_specific->md5_lookup(sk, sk); if (*md5) { @@ -980,7 +983,6 @@ static void tcp_update_skb_after_send(struct sock *sk, struct sk_buff *skb, { struct tcp_sock *tp = tcp_sk(sk); - skb->skb_mstamp_ns = tp->tcp_wstamp_ns; if (sk->sk_pacing_status != SK_PACING_NONE) { unsigned long rate = sk->sk_pacing_rate; @@ -1028,7 +1030,9 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, BUG_ON(!skb || !tcp_skb_pcount(skb)); tp = tcp_sk(sk); - + prior_wstamp = tp->tcp_wstamp_ns; + tp->tcp_wstamp_ns = max(tp->tcp_wstamp_ns, tp->tcp_clock_cache); + skb->skb_mstamp_ns = tp->tcp_wstamp_ns; if (clone_it) { TCP_SKB_CB(skb)->tx.in_flight = TCP_SKB_CB(skb)->end_seq - tp->snd_una; @@ -1045,11 +1049,6 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, return -ENOBUFS; } - prior_wstamp = tp->tcp_wstamp_ns; - tp->tcp_wstamp_ns = max(tp->tcp_wstamp_ns, tp->tcp_clock_cache); - - skb->skb_mstamp_ns = tp->tcp_wstamp_ns; - inet = inet_sk(sk); tcb = TCP_SKB_CB(skb); memset(&opts, 0, sizeof(opts)); @@ -1847,17 +1846,17 @@ static bool tcp_snd_wnd_test(const struct tcp_sock *tp, * know that all the data is in scatter-gather pages, and that the * packet has never been sent out before (and thus is not cloned). */ -static int tso_fragment(struct sock *sk, enum tcp_queue tcp_queue, - struct sk_buff *skb, unsigned int len, +static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len, unsigned int mss_now, gfp_t gfp) { - struct sk_buff *buff; int nlen = skb->len - len; + struct sk_buff *buff; u8 flags; /* All of a TSO frame must be composed of paged data. */ if (skb->len != skb->data_len) - return tcp_fragment(sk, tcp_queue, skb, len, mss_now, gfp); + return tcp_fragment(sk, TCP_FRAG_IN_WRITE_QUEUE, + skb, len, mss_now, gfp); buff = sk_stream_alloc_skb(sk, 0, gfp, true); if (unlikely(!buff)) @@ -1893,7 +1892,7 @@ static int tso_fragment(struct sock *sk, enum tcp_queue tcp_queue, /* Link BUFF into the send queue. 
*/ __skb_header_release(buff); - tcp_insert_write_queue_after(skb, buff, sk, tcp_queue); + tcp_insert_write_queue_after(skb, buff, sk, TCP_FRAG_IN_WRITE_QUEUE); return 0; } @@ -2347,6 +2346,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, /* "skb_mstamp_ns" is used as a start point for the retransmit timer */ skb->skb_mstamp_ns = tp->tcp_wstamp_ns = tp->tcp_clock_cache; list_move_tail(&skb->tcp_tsorted_anchor, &tp->tsorted_sent_queue); + tcp_init_tso_segs(skb, mss_now); goto repair; /* Skip network transmission */ } @@ -2391,8 +2391,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, nonagle); if (skb->len > limit && - unlikely(tso_fragment(sk, TCP_FRAG_IN_WRITE_QUEUE, - skb, limit, mss_now, gfp))) + unlikely(tso_fragment(sk, skb, limit, mss_now, gfp))) break; if (tcp_small_queue_check(sk, skb, 0)) @@ -2937,12 +2936,16 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs) err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC); } + /* To avoid taking spuriously low RTT samples based on a timestamp + * for a transmit that never happened, always mark EVER_RETRANS + */ + TCP_SKB_CB(skb)->sacked |= TCPCB_EVER_RETRANS; + if (BPF_SOCK_OPS_TEST_FLAG(tp, BPF_SOCK_OPS_RETRANS_CB_FLAG)) tcp_call_bpf_3arg(sk, BPF_SOCK_OPS_RETRANS_CB, TCP_SKB_CB(skb)->seq, segs, err); if (likely(!err)) { - TCP_SKB_CB(skb)->sacked |= TCPCB_EVER_RETRANS; trace_tcp_retransmit_skb(sk, skb); } else if (err != -EBUSY) { NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPRETRANSFAIL, segs); @@ -2963,13 +2966,12 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs) #endif TCP_SKB_CB(skb)->sacked |= TCPCB_RETRANS; tp->retrans_out += tcp_skb_pcount(skb); - - /* Save stamp of the first retransmit. */ - if (!tp->retrans_stamp) - tp->retrans_stamp = tcp_skb_timestamp(skb); - } + /* Save stamp of the first (attempted) retransmit. */ + if (!tp->retrans_stamp) + tp->retrans_stamp = tcp_skb_timestamp(skb); + if (tp->undo_retrans < 0) tp->undo_retrans = 0; tp->undo_retrans += tcp_skb_pcount(skb); @@ -3456,6 +3458,7 @@ static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn) skb_trim(syn_data, copied); space = copied; } + skb_zcopy_set(syn_data, fo->uarg, NULL); } /* No more data pending in inet_wait_for_connect() */ if (space == fo->size) @@ -3569,7 +3572,7 @@ void tcp_send_delayed_ack(struct sock *sk) const struct tcp_sock *tp = tcp_sk(sk); int max_ato = HZ / 2; - if (icsk->icsk_ack.pingpong || + if (inet_csk_in_pingpong_mode(sk) || (icsk->icsk_ack.pending & ICSK_ACK_PUSHED)) max_ato = TCP_DELACK_MAX; @@ -3750,7 +3753,7 @@ void tcp_send_probe0(struct sock *sk) struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); struct net *net = sock_net(sk); - unsigned long probe_max; + unsigned long timeout; int err; err = tcp_write_wakeup(sk, LINUX_MIB_TCPWINPROBE); @@ -3762,26 +3765,18 @@ void tcp_send_probe0(struct sock *sk) return; } + icsk->icsk_probes_out++; if (err <= 0) { if (icsk->icsk_backoff < net->ipv4.sysctl_tcp_retries2) icsk->icsk_backoff++; - icsk->icsk_probes_out++; - probe_max = TCP_RTO_MAX; + timeout = tcp_probe0_when(sk, TCP_RTO_MAX); } else { /* If packet was not sent due to local congestion, - * do not backoff and do not remember icsk_probes_out. - * Let local senders to fight for local resources. - * - * Use accumulated backoff yet. + * Let senders fight for local resources conservatively. 
*/ - if (!icsk->icsk_probes_out) - icsk->icsk_probes_out = 1; - probe_max = TCP_RESOURCE_PROBE_INTERVAL; - } - tcp_reset_xmit_timer(sk, ICSK_TIME_PROBE0, - tcp_probe0_when(sk, probe_max), - TCP_RTO_MAX, - NULL); + timeout = TCP_RESOURCE_PROBE_INTERVAL; + } + tcp_reset_xmit_timer(sk, ICSK_TIME_PROBE0, timeout, TCP_RTO_MAX, NULL); } int tcp_rtx_synack(const struct sock *sk, struct request_sock *req) diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index f87dbc78b6bc..f0c86398e6a7 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -22,28 +22,14 @@ #include <linux/gfp.h> #include <net/tcp.h> -static u32 tcp_retransmit_stamp(const struct sock *sk) -{ - u32 start_ts = tcp_sk(sk)->retrans_stamp; - - if (unlikely(!start_ts)) { - struct sk_buff *head = tcp_rtx_queue_head(sk); - - if (!head) - return 0; - start_ts = tcp_skb_timestamp(head); - } - return start_ts; -} - static u32 tcp_clamp_rto_to_user_timeout(const struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); u32 elapsed, start_ts; s32 remaining; - start_ts = tcp_retransmit_stamp(sk); - if (!icsk->icsk_user_timeout || !start_ts) + start_ts = tcp_sk(sk)->retrans_stamp; + if (!icsk->icsk_user_timeout) return icsk->icsk_rto; elapsed = tcp_time_stamp(tcp_sk(sk)) - start_ts; remaining = icsk->icsk_user_timeout - elapsed; @@ -173,7 +159,20 @@ static void tcp_mtu_probing(struct inet_connection_sock *icsk, struct sock *sk) tcp_sync_mss(sk, icsk->icsk_pmtu_cookie); } - +static unsigned int tcp_model_timeout(struct sock *sk, + unsigned int boundary, + unsigned int rto_base) +{ + unsigned int linear_backoff_thresh, timeout; + + linear_backoff_thresh = ilog2(TCP_RTO_MAX / rto_base); + if (boundary <= linear_backoff_thresh) + timeout = ((2 << boundary) - 1) * rto_base; + else + timeout = ((2 << linear_backoff_thresh) - 1) * rto_base + + (boundary - linear_backoff_thresh) * TCP_RTO_MAX; + return jiffies_to_msecs(timeout); +} /** * retransmits_timed_out() - returns true if this connection has timed out * @sk: The current socket @@ -191,26 +190,15 @@ static bool retransmits_timed_out(struct sock *sk, unsigned int boundary, unsigned int timeout) { - const unsigned int rto_base = TCP_RTO_MIN; - unsigned int linear_backoff_thresh, start_ts; + unsigned int start_ts; if (!inet_csk(sk)->icsk_retransmits) return false; - start_ts = tcp_retransmit_stamp(sk); - if (!start_ts) - return false; - - if (likely(timeout == 0)) { - linear_backoff_thresh = ilog2(TCP_RTO_MAX/rto_base); + start_ts = tcp_sk(sk)->retrans_stamp; + if (likely(timeout == 0)) + timeout = tcp_model_timeout(sk, boundary, TCP_RTO_MIN); - if (boundary <= linear_backoff_thresh) - timeout = ((2 << boundary) - 1) * rto_base; - else - timeout = ((2 << linear_backoff_thresh) - 1) * rto_base + - (boundary - linear_backoff_thresh) * TCP_RTO_MAX; - timeout = jiffies_to_msecs(timeout); - } return (s32)(tcp_time_stamp(tcp_sk(sk)) - start_ts - timeout) >= 0; } @@ -226,7 +214,7 @@ static int tcp_write_timeout(struct sock *sk) if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) { if (icsk->icsk_retransmits) { dst_negative_advice(sk); - } else if (!tp->syn_data && !tp->syn_fastopen) { + } else { sk_rethink_txhash(sk); } retry_until = icsk->icsk_syn_retries ? : net->ipv4.sysctl_tcp_syn_retries; @@ -289,14 +277,14 @@ void tcp_delack_timer_handler(struct sock *sk) icsk->icsk_ack.pending &= ~ICSK_ACK_TIMER; if (inet_csk_ack_scheduled(sk)) { - if (!icsk->icsk_ack.pingpong) { + if (!inet_csk_in_pingpong_mode(sk)) { /* Delayed ACK missed: inflate ATO. 
*/ icsk->icsk_ack.ato = min(icsk->icsk_ack.ato << 1, icsk->icsk_rto); } else { /* Delayed ACK missed: leave pingpong mode and * deflate ATO. */ - icsk->icsk_ack.pingpong = 0; + inet_csk_exit_pingpong_mode(sk); icsk->icsk_ack.ato = TCP_ATO_MIN; } tcp_mstamp_refresh(tcp_sk(sk)); @@ -345,7 +333,6 @@ static void tcp_probe_timer(struct sock *sk) struct sk_buff *skb = tcp_send_head(sk); struct tcp_sock *tp = tcp_sk(sk); int max_probes; - u32 start_ts; if (tp->packets_out || !skb) { icsk->icsk_probes_out = 0; @@ -360,12 +347,13 @@ static void tcp_probe_timer(struct sock *sk) * corresponding system limit. We also implement similar policy when * we use RTO to probe window in tcp_retransmit_timer(). */ - start_ts = tcp_skb_timestamp(skb); - if (!start_ts) - skb->skb_mstamp_ns = tp->tcp_clock_cache; - else if (icsk->icsk_user_timeout && - (s32)(tcp_time_stamp(tp) - start_ts) > icsk->icsk_user_timeout) - goto abort; + if (icsk->icsk_user_timeout) { + u32 elapsed = tcp_model_timeout(sk, icsk->icsk_probes_out, + tcp_probe0_base(sk)); + + if (elapsed >= icsk->icsk_user_timeout) + goto abort; + } max_probes = sock_net(sk)->ipv4.sysctl_tcp_retries2; if (sock_flag(sk, SOCK_DEAD)) { @@ -395,6 +383,7 @@ static void tcp_fastopen_synack_timer(struct sock *sk) struct inet_connection_sock *icsk = inet_csk(sk); int max_retries = icsk->icsk_syn_retries ? : sock_net(sk)->ipv4.sysctl_tcp_synack_retries + 1; /* add one more retry for fastopen */ + struct tcp_sock *tp = tcp_sk(sk); struct request_sock *req; req = tcp_sk(sk)->fastopen_rsk; @@ -412,6 +401,8 @@ static void tcp_fastopen_synack_timer(struct sock *sk) inet_rtx_syn_ack(sk, req); req->num_timeout++; icsk->icsk_retransmits++; + if (!tp->retrans_stamp) + tp->retrans_stamp = tcp_time_stamp(tp); inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, TCP_TIMEOUT_INIT << req->num_timeout, TCP_RTO_MAX); } @@ -443,10 +434,8 @@ void tcp_retransmit_timer(struct sock *sk) */ return; } - if (!tp->packets_out) - goto out; - - WARN_ON(tcp_rtx_queue_empty(sk)); + if (!tp->packets_out || WARN_ON_ONCE(tcp_rtx_queue_empty(sk))) + return; tp->tlp_high_seq = 0; @@ -511,14 +500,13 @@ void tcp_retransmit_timer(struct sock *sk) tcp_enter_loss(sk); + icsk->icsk_retransmits++; if (tcp_retransmit_skb(sk, tcp_rtx_queue_head(sk), 1) > 0) { /* Retransmission failed because of local congestion, - * do not backoff. + * Let senders fight for local resources conservatively. */ - if (!icsk->icsk_retransmits) - icsk->icsk_retransmits = 1; inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, - min(icsk->icsk_rto, TCP_RESOURCE_PROBE_INTERVAL), + TCP_RESOURCE_PROBE_INTERVAL, TCP_RTO_MAX); goto out; } @@ -539,7 +527,6 @@ void tcp_retransmit_timer(struct sock *sk) * the 120 second clamps though! */ icsk->icsk_backoff++; - icsk->icsk_retransmits++; out_reset_timer: /* If stream is thin, use linear timeouts. 
Since 'icsk_backoff' is diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 3fb0ed5e4789..372fdc5381a9 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -562,10 +562,12 @@ static int __udp4_lib_err_encap_no_sk(struct sk_buff *skb, u32 info) for (i = 0; i < MAX_IPTUN_ENCAP_OPS; i++) { int (*handler)(struct sk_buff *skb, u32 info); + const struct ip_tunnel_encap_ops *encap; - if (!iptun_encaps[i]) + encap = rcu_dereference(iptun_encaps[i]); + if (!encap) continue; - handler = rcu_dereference(iptun_encaps[i]->err_handler); + handler = encap->err_handler; if (handler && !handler(skb, info)) return 0; } @@ -847,15 +849,23 @@ static int udp_send_skb(struct sk_buff *skb, struct flowi4 *fl4, const int hlen = skb_network_header_len(skb) + sizeof(struct udphdr); - if (hlen + cork->gso_size > cork->fragsize) + if (hlen + cork->gso_size > cork->fragsize) { + kfree_skb(skb); return -EINVAL; - if (skb->len > cork->gso_size * UDP_MAX_SEGMENTS) + } + if (skb->len > cork->gso_size * UDP_MAX_SEGMENTS) { + kfree_skb(skb); return -EINVAL; - if (sk->sk_no_check_tx) + } + if (sk->sk_no_check_tx) { + kfree_skb(skb); return -EINVAL; + } if (skb->ip_summed != CHECKSUM_PARTIAL || is_udplite || - dst_xfrm(skb_dst(skb))) + dst_xfrm(skb_dst(skb))) { + kfree_skb(skb); return -EIO; + } skb_shinfo(skb)->gso_size = cork->gso_size; skb_shinfo(skb)->gso_type = SKB_GSO_UDP_L4; @@ -1918,7 +1928,7 @@ void udp_lib_rehash(struct sock *sk, u16 newhash) } EXPORT_SYMBOL(udp_lib_rehash); -static void udp_v4_rehash(struct sock *sk) +void udp_v4_rehash(struct sock *sk) { u16 new_hash = ipv4_portaddr_hash(sock_net(sk), inet_sk(sk)->inet_rcv_saddr, diff --git a/net/ipv4/udp_impl.h b/net/ipv4/udp_impl.h index 322672655419..6b2fa77eeb1c 100644 --- a/net/ipv4/udp_impl.h +++ b/net/ipv4/udp_impl.h @@ -10,6 +10,7 @@ int __udp4_lib_rcv(struct sk_buff *, struct udp_table *, int); int __udp4_lib_err(struct sk_buff *, u32, struct udp_table *); int udp_v4_get_port(struct sock *sk, unsigned short snum); +void udp_v4_rehash(struct sock *sk); int udp_setsockopt(struct sock *sk, int level, int optname, char __user *optval, unsigned int optlen); diff --git a/net/ipv4/udp_tunnel.c b/net/ipv4/udp_tunnel.c index be8b5b2157d8..e93cc0379201 100644 --- a/net/ipv4/udp_tunnel.c +++ b/net/ipv4/udp_tunnel.c @@ -21,18 +21,9 @@ int udp_sock_create4(struct net *net, struct udp_port_cfg *cfg, goto error; if (cfg->bind_ifindex) { - struct net_device *dev; - - dev = dev_get_by_index(net, cfg->bind_ifindex); - if (!dev) { - err = -ENODEV; - goto error; - } - - err = kernel_setsockopt(sock, SOL_SOCKET, SO_BINDTODEVICE, - dev->name, strlen(dev->name) + 1); - dev_put(dev); - + err = kernel_setsockopt(sock, SOL_SOCKET, SO_BINDTOIFINDEX, + (void *)&cfg->bind_ifindex, + sizeof(cfg->bind_ifindex)); if (err < 0) goto error; } diff --git a/net/ipv4/udplite.c b/net/ipv4/udplite.c index 39c7f17d916f..3c94b8f0ff27 100644 --- a/net/ipv4/udplite.c +++ b/net/ipv4/udplite.c @@ -53,6 +53,7 @@ struct proto udplite_prot = { .sendpage = udp_sendpage, .hash = udp_lib_hash, .unhash = udp_lib_unhash, + .rehash = udp_v4_rehash, .get_port = udp_v4_get_port, .memory_allocated = &udp_memory_allocated, .sysctl_mem = sysctl_udp_mem, |
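For reference, the udp_tunnel.c hunk above replaces the SO_BINDTODEVICE-by-name dance with a single kernel_setsockopt() call using the SO_BINDTOIFINDEX option, which takes the interface index directly. A minimal userspace sketch of the same binding (the helper name bind_udp_to_ifindex and the interface name are illustrative assumptions, not part of this patch) could look like:

/* Illustrative sketch, not part of this patch: bind a UDP socket to a
 * device by interface index using SO_BINDTOIFINDEX, the option that
 * udp_sock_create4() now uses instead of SO_BINDTODEVICE.
 */
#include <net/if.h>
#include <sys/socket.h>
#include <unistd.h>

#ifndef SO_BINDTOIFINDEX
#define SO_BINDTOIFINDEX 62	/* value from include/uapi/asm-generic/socket.h */
#endif

int bind_udp_to_ifindex(const char *ifname)
{
	int fd, ifindex;

	fd = socket(AF_INET, SOCK_DGRAM, 0);
	if (fd < 0)
		return -1;

	ifindex = if_nametoindex(ifname);	/* e.g. "eth0" (assumed name) */
	if (!ifindex ||
	    setsockopt(fd, SOL_SOCKET, SO_BINDTOIFINDEX,
		       &ifindex, sizeof(ifindex)) < 0) {
		close(fd);
		return -1;
	}
	return fd;
}

Compared with the removed code path, binding by index avoids the dev_get_by_index()/dev_put() round trip and the device-name copy that SO_BINDTODEVICE required, since udp_port_cfg already carries bind_ifindex.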