Diffstat (limited to 'net/ipv4')
41 files changed, 608 insertions, 272 deletions
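A change that recurs throughout the hunks below is the replacement of the legacy RT_TOS()/IPTOS_RT_MASK masking with INET_DSCP_MASK when filling flowi4_tos, so that route lookups key on the full 6-bit DSCP field rather than the old 3-bit TOS subset. A minimal sketch of the difference, assuming the usual header values (IPTOS_TOS_MASK is 0x1e in the UAPI headers, INET_DSCP_MASK is 0xfc in <net/inet_dscp.h>); the body of the fib_dscp_masked_match() helper is inferred from its call sites in this diff, not taken from it:

	/* Illustrative sketch only, not part of the patch. */
	#define IPTOS_TOS_MASK	0x1e	/* legacy mask applied by RT_TOS() */
	#define INET_DSCP_MASK	0xfc	/* full DSCP field, ECN bits cleared */

	/* old style: keep only the legacy TOS bits as the lookup key */
	fl4.flowi4_tos = iph->tos & IPTOS_TOS_MASK;	/* RT_TOS(iph->tos) */

	/* new style: keep the whole DSCP field, drop the ECN bits */
	fl4.flowi4_tos = iph->tos & INET_DSCP_MASK;

	/* Assumed shape of the helper the FIB hunks call: a stored dscp_t
	 * matches when it equals the flow's TOS with the upper three DSCP
	 * bits masked off, preserving the legacy TOS-selector behavior.
	 */
	static bool fib_dscp_masked_match(dscp_t dscp, const struct flowi4 *fl4)
	{
		return dscp == inet_dsfield_to_dscp(fl4->flowi4_tos & IPTOS_TOS_MASK);
	}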
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig index 8e94ed7c56a0..6d2c97f8e9ef 100644 --- a/net/ipv4/Kconfig +++ b/net/ipv4/Kconfig @@ -661,7 +661,8 @@ config TCP_CONG_CDG For further details see: D.A. Hayes and G. Armitage. "Revisiting TCP congestion control using - delay gradients." In Networking 2011. Preprint: http://goo.gl/No3vdg + delay gradients." In Networking 2011. Preprint: + http://caia.swin.edu.au/cv/dahayes/content/networking2011-cdg-preprint.pdf config TCP_CONG_BBR tristate "BBR TCP" diff --git a/net/ipv4/bpf_tcp_ca.c b/net/ipv4/bpf_tcp_ca.c index 3f88d0961e5b..554804774628 100644 --- a/net/ipv4/bpf_tcp_ca.c +++ b/net/ipv4/bpf_tcp_ca.c @@ -14,10 +14,6 @@ /* "extern" is to avoid sparse warning. It is only used in bpf_struct_ops.c. */ static struct bpf_struct_ops bpf_tcp_congestion_ops; -static u32 unsupported_ops[] = { - offsetof(struct tcp_congestion_ops, get_info), -}; - static const struct btf_type *tcp_sock_type; static u32 tcp_sock_id, sock_id; static const struct btf_type *tcp_congestion_ops_type; @@ -45,18 +41,6 @@ static int bpf_tcp_ca_init(struct btf *btf) return 0; } -static bool is_unsupported(u32 member_offset) -{ - unsigned int i; - - for (i = 0; i < ARRAY_SIZE(unsupported_ops); i++) { - if (member_offset == unsupported_ops[i]) - return true; - } - - return false; -} - static bool bpf_tcp_ca_is_valid_access(int off, int size, enum bpf_access_type type, const struct bpf_prog *prog, @@ -251,15 +235,6 @@ static int bpf_tcp_ca_init_member(const struct btf_type *t, return 0; } -static int bpf_tcp_ca_check_member(const struct btf_type *t, - const struct btf_member *member, - const struct bpf_prog *prog) -{ - if (is_unsupported(__btf_member_bit_offset(t, member) / 8)) - return -ENOTSUPP; - return 0; -} - static int bpf_tcp_ca_reg(void *kdata, struct bpf_link *link) { return tcp_register_congestion_control(kdata); @@ -354,7 +329,6 @@ static struct bpf_struct_ops bpf_tcp_congestion_ops = { .reg = bpf_tcp_ca_reg, .unreg = bpf_tcp_ca_unreg, .update = bpf_tcp_ca_update, - .check_member = bpf_tcp_ca_check_member, .init_member = bpf_tcp_ca_init_member, .init = bpf_tcp_ca_init, .validate = bpf_tcp_ca_validate, diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index d96f3e452fef..ab76744383cf 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -216,17 +216,27 @@ static void devinet_sysctl_unregister(struct in_device *idev) /* Locks all the inet devices. 
*/ -static struct in_ifaddr *inet_alloc_ifa(void) +static struct in_ifaddr *inet_alloc_ifa(struct in_device *in_dev) { - return kzalloc(sizeof(struct in_ifaddr), GFP_KERNEL_ACCOUNT); + struct in_ifaddr *ifa; + + ifa = kzalloc(sizeof(*ifa), GFP_KERNEL_ACCOUNT); + if (!ifa) + return NULL; + + in_dev_hold(in_dev); + ifa->ifa_dev = in_dev; + + INIT_HLIST_NODE(&ifa->hash); + + return ifa; } static void inet_rcu_free_ifa(struct rcu_head *head) { struct in_ifaddr *ifa = container_of(head, struct in_ifaddr, rcu_head); - if (ifa->ifa_dev) - in_dev_put(ifa->ifa_dev); + in_dev_put(ifa->ifa_dev); kfree(ifa); } @@ -574,17 +584,9 @@ static int inet_set_ifa(struct net_device *dev, struct in_ifaddr *ifa) ASSERT_RTNL(); - if (!in_dev) { - inet_free_ifa(ifa); - return -ENOBUFS; - } ipv4_devconf_setall(in_dev); neigh_parms_data_state_setall(in_dev->arp_parms); - if (ifa->ifa_dev != in_dev) { - WARN_ON(ifa->ifa_dev); - in_dev_hold(in_dev); - ifa->ifa_dev = in_dev; - } + if (ipv4_is_loopback(ifa->ifa_local)) ifa->ifa_scope = RT_SCOPE_HOST; return inet_insert_ifa(ifa); @@ -701,8 +703,6 @@ errout: return err; } -#define INFINITY_LIFE_TIME 0xFFFFFFFF - static void check_lifetime(struct work_struct *work) { unsigned long now, next, next_sec, next_sched; @@ -875,7 +875,7 @@ static struct in_ifaddr *rtm_to_ifaddr(struct net *net, struct nlmsghdr *nlh, if (!in_dev) goto errout; - ifa = inet_alloc_ifa(); + ifa = inet_alloc_ifa(in_dev); if (!ifa) /* * A potential indev allocation can be left alive, it stays @@ -885,19 +885,15 @@ static struct in_ifaddr *rtm_to_ifaddr(struct net *net, struct nlmsghdr *nlh, ipv4_devconf_setall(in_dev); neigh_parms_data_state_setall(in_dev->arp_parms); - in_dev_hold(in_dev); if (!tb[IFA_ADDRESS]) tb[IFA_ADDRESS] = tb[IFA_LOCAL]; - INIT_HLIST_NODE(&ifa->hash); ifa->ifa_prefixlen = ifm->ifa_prefixlen; ifa->ifa_mask = inet_make_mask(ifm->ifa_prefixlen); ifa->ifa_flags = tb[IFA_FLAGS] ? 
nla_get_u32(tb[IFA_FLAGS]) : ifm->ifa_flags; ifa->ifa_scope = ifm->ifa_scope; - ifa->ifa_dev = in_dev; - ifa->ifa_local = nla_get_in_addr(tb[IFA_LOCAL]); ifa->ifa_address = nla_get_in_addr(tb[IFA_ADDRESS]); @@ -1184,10 +1180,12 @@ int devinet_ioctl(struct net *net, unsigned int cmd, struct ifreq *ifr) if (!ifa) { ret = -ENOBUFS; - ifa = inet_alloc_ifa(); + if (!in_dev) + break; + ifa = inet_alloc_ifa(in_dev); if (!ifa) break; - INIT_HLIST_NODE(&ifa->hash); + if (colon) memcpy(ifa->ifa_label, ifr->ifr_name, IFNAMSIZ); else @@ -1586,16 +1584,13 @@ static int inetdev_event(struct notifier_block *this, unsigned long event, if (!inetdev_valid_mtu(dev->mtu)) break; if (dev->flags & IFF_LOOPBACK) { - struct in_ifaddr *ifa = inet_alloc_ifa(); + struct in_ifaddr *ifa = inet_alloc_ifa(in_dev); if (ifa) { - INIT_HLIST_NODE(&ifa->hash); ifa->ifa_local = ifa->ifa_address = htonl(INADDR_LOOPBACK); ifa->ifa_prefixlen = 8; ifa->ifa_mask = inet_make_mask(8); - in_dev_hold(in_dev); - ifa->ifa_dev = in_dev; ifa->ifa_scope = RT_SCOPE_HOST; memcpy(ifa->ifa_label, dev->name, IFNAMSIZ); set_ifa_lifetime(ifa, INFINITY_LIFE_TIME, @@ -1948,8 +1943,7 @@ static void rtmsg_ifa(int event, struct in_ifaddr *ifa, struct nlmsghdr *nlh, rtnl_notify(skb, net, portid, RTNLGRP_IPV4_IFADDR, nlh, GFP_KERNEL); return; errout: - if (err < 0) - rtnl_set_sk_err(net, RTNLGRP_IPV4_IFADDR, err); + rtnl_set_sk_err(net, RTNLGRP_IPV4_IFADDR, err); } static size_t inet_get_link_af_size(const struct net_device *dev, @@ -2145,8 +2139,7 @@ void inet_netconf_notify_devconf(struct net *net, int event, int type, rtnl_notify(skb, net, 0, RTNLGRP_IPV4_NETCONF, NULL, GFP_KERNEL); return; errout: - if (err < 0) - rtnl_set_sk_err(net, RTNLGRP_IPV4_NETCONF, err); + rtnl_set_sk_err(net, RTNLGRP_IPV4_NETCONF, err); } static const struct nla_policy devconf_ipv4_policy[NETCONFA_MAX+1] = { diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c index 47378ca41904..f3281312eb5e 100644 --- a/net/ipv4/esp4.c +++ b/net/ipv4/esp4.c @@ -115,7 +115,8 @@ static void esp_ssg_unref(struct xfrm_state *x, void *tmp, struct sk_buff *skb) */ if (req->src != req->dst) for (sg = sg_next(req->src); sg; sg = sg_next(sg)) - skb_page_unref(sg_page(sg), skb->pp_recycle); + skb_page_unref(page_to_netmem(sg_page(sg)), + skb->pp_recycle); } #ifdef CONFIG_INET_ESPINTCP diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 7ad2cafb9276..793e6781399a 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -293,7 +293,7 @@ __be32 fib_compute_spec_dst(struct sk_buff *skb) .flowi4_iif = LOOPBACK_IFINDEX, .flowi4_l3mdev = l3mdev_master_ifindex_rcu(dev), .daddr = ip_hdr(skb)->saddr, - .flowi4_tos = ip_hdr(skb)->tos & IPTOS_RT_MASK, + .flowi4_tos = ip_hdr(skb)->tos & INET_DSCP_MASK, .flowi4_scope = scope, .flowi4_mark = vmark ? 
skb->mark : 0, }; @@ -1343,7 +1343,7 @@ static void nl_fib_lookup(struct net *net, struct fib_result_nl *frn) struct flowi4 fl4 = { .flowi4_mark = frn->fl_mark, .daddr = frn->fl_addr, - .flowi4_tos = frn->fl_tos, + .flowi4_tos = frn->fl_tos & INET_DSCP_MASK, .flowi4_scope = frn->fl_scope, }; struct fib_table *tb; diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c index 5bdd1c016009..b07292d50ee7 100644 --- a/net/ipv4/fib_rules.c +++ b/net/ipv4/fib_rules.c @@ -37,6 +37,7 @@ struct fib4_rule { u8 dst_len; u8 src_len; dscp_t dscp; + u8 dscp_full:1; /* DSCP or TOS selector */ __be32 src; __be32 srcmask; __be32 dst; @@ -186,7 +187,15 @@ INDIRECT_CALLABLE_SCOPE int fib4_rule_match(struct fib_rule *rule, ((daddr ^ r->dst) & r->dstmask)) return 0; - if (r->dscp && r->dscp != inet_dsfield_to_dscp(fl4->flowi4_tos)) + /* When DSCP selector is used we need to match on the entire DSCP field + * in the flow information structure. When TOS selector is used we need + * to mask the upper three DSCP bits prior to matching to maintain + * legacy behavior. + */ + if (r->dscp_full && r->dscp != inet_dsfield_to_dscp(fl4->flowi4_tos)) + return 0; + else if (!r->dscp_full && r->dscp && + !fib_dscp_masked_match(r->dscp, fl4)) return 0; if (rule->ip_proto && (rule->ip_proto != fl4->flowi4_proto)) @@ -217,6 +226,20 @@ static struct fib_table *fib_empty_table(struct net *net) return NULL; } +static int fib4_nl2rule_dscp(const struct nlattr *nla, struct fib4_rule *rule4, + struct netlink_ext_ack *extack) +{ + if (rule4->dscp) { + NL_SET_ERR_MSG(extack, "Cannot specify both TOS and DSCP"); + return -EINVAL; + } + + rule4->dscp = inet_dsfield_to_dscp(nla_get_u8(nla) << 2); + rule4->dscp_full = true; + + return 0; +} + static int fib4_rule_configure(struct fib_rule *rule, struct sk_buff *skb, struct fib_rule_hdr *frh, struct nlattr **tb, @@ -238,6 +261,10 @@ static int fib4_rule_configure(struct fib_rule *rule, struct sk_buff *skb, } rule4->dscp = inet_dsfield_to_dscp(frh->tos); + if (tb[FRA_DSCP] && + fib4_nl2rule_dscp(tb[FRA_DSCP], rule4, extack) < 0) + goto errout; + /* split local/main if they are not already split */ err = fib_unmerge(net); if (err) @@ -320,9 +347,19 @@ static int fib4_rule_compare(struct fib_rule *rule, struct fib_rule_hdr *frh, if (frh->dst_len && (rule4->dst_len != frh->dst_len)) return 0; - if (frh->tos && inet_dscp_to_dsfield(rule4->dscp) != frh->tos) + if (frh->tos && + (rule4->dscp_full || + inet_dscp_to_dsfield(rule4->dscp) != frh->tos)) return 0; + if (tb[FRA_DSCP]) { + dscp_t dscp; + + dscp = inet_dsfield_to_dscp(nla_get_u8(tb[FRA_DSCP]) << 2); + if (!rule4->dscp_full || rule4->dscp != dscp) + return 0; + } + #ifdef CONFIG_IP_ROUTE_CLASSID if (tb[FRA_FLOW] && (rule4->tclassid != nla_get_u32(tb[FRA_FLOW]))) return 0; @@ -344,7 +381,15 @@ static int fib4_rule_fill(struct fib_rule *rule, struct sk_buff *skb, frh->dst_len = rule4->dst_len; frh->src_len = rule4->src_len; - frh->tos = inet_dscp_to_dsfield(rule4->dscp); + + if (rule4->dscp_full) { + frh->tos = 0; + if (nla_put_u8(skb, FRA_DSCP, + inet_dscp_to_dsfield(rule4->dscp) >> 2)) + goto nla_put_failure; + } else { + frh->tos = inet_dscp_to_dsfield(rule4->dscp); + } if ((rule4->dst_len && nla_put_in_addr(skb, FRA_DST, rule4->dst)) || @@ -366,7 +411,8 @@ static size_t fib4_rule_nlmsg_payload(struct fib_rule *rule) { return nla_total_size(4) /* dst */ + nla_total_size(4) /* src */ - + nla_total_size(4); /* flow */ + + nla_total_size(4) /* flow */ + + nla_total_size(1); /* dscp */ } static void fib4_rule_flush_cache(struct fib_rules_ops 
*ops) diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index 2b57cd2b96e2..ba2df3d2ac15 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -543,8 +543,7 @@ void rtmsg_fib(int event, __be32 key, struct fib_alias *fa, info->nlh, GFP_KERNEL); return; errout: - if (err < 0) - rtnl_set_sk_err(info->nl_net, RTNLGRP_IPV4_ROUTE, err); + rtnl_set_sk_err(info->nl_net, RTNLGRP_IPV4_ROUTE, err); } static int fib_detect_death(struct fib_info *fi, int order, @@ -2066,8 +2065,7 @@ static void fib_select_default(const struct flowi4 *flp, struct fib_result *res) if (fa->fa_slen != slen) continue; - if (fa->fa_dscp && - fa->fa_dscp != inet_dsfield_to_dscp(flp->flowi4_tos)) + if (fa->fa_dscp && !fib_dscp_masked_match(fa->fa_dscp, flp)) continue; if (fa->tb_id != tb->tb_id) continue; diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index 8f30e3f00b7f..09e31757e96c 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -1580,8 +1580,7 @@ found: if (index >= (1ul << fa->fa_slen)) continue; } - if (fa->fa_dscp && - inet_dscp_to_dsfield(fa->fa_dscp) != flp->flowi4_tos) + if (fa->fa_dscp && !fib_dscp_masked_match(fa->fa_dscp, flp)) continue; /* Paired with WRITE_ONCE() in fib_release_info() */ if (READ_ONCE(fi->fib_dead)) diff --git a/net/ipv4/fou_core.c b/net/ipv4/fou_core.c index 78b869b31492..3e30745e2c09 100644 --- a/net/ipv4/fou_core.c +++ b/net/ipv4/fou_core.c @@ -336,11 +336,11 @@ static struct sk_buff *gue_gro_receive(struct sock *sk, struct gro_remcsum grc; u8 proto; + skb_gro_remcsum_init(&grc); + if (!fou) goto out; - skb_gro_remcsum_init(&grc); - off = skb_gro_offset(skb); len = off + sizeof(*guehdr); diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index ab6d0d98dbc3..e1384e7331d8 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -93,6 +93,7 @@ #include <net/ip_fib.h> #include <net/l3mdev.h> #include <net/addrconf.h> +#include <net/inet_dscp.h> #define CREATE_TRACE_POINTS #include <trace/events/icmp.h> @@ -220,61 +221,56 @@ static inline void icmp_xmit_unlock(struct sock *sk) spin_unlock(&sk->sk_lock.slock); } -int sysctl_icmp_msgs_per_sec __read_mostly = 1000; -int sysctl_icmp_msgs_burst __read_mostly = 50; - -static struct { - spinlock_t lock; - u32 credit; - u32 stamp; -} icmp_global = { - .lock = __SPIN_LOCK_UNLOCKED(icmp_global.lock), -}; - /** * icmp_global_allow - Are we allowed to send one more ICMP message ? + * @net: network namespace * * Uses a token bucket to limit our ICMP messages to ~sysctl_icmp_msgs_per_sec. * Returns false if we reached the limit and can not send another packet. - * Note: called with BH disabled + * Works in tandem with icmp_global_consume(). */ -bool icmp_global_allow(void) +bool icmp_global_allow(struct net *net) { - u32 credit, delta, incr = 0, now = (u32)jiffies; - bool rc = false; + u32 delta, now, oldstamp; + int incr, new, old; - /* Check if token bucket is empty and cannot be refilled - * without taking the spinlock. The READ_ONCE() are paired - * with the following WRITE_ONCE() in this same function. + /* Note: many cpus could find this condition true. + * Then later icmp_global_consume() could consume more credits, + * this is an acceptable race. 
*/ - if (!READ_ONCE(icmp_global.credit)) { - delta = min_t(u32, now - READ_ONCE(icmp_global.stamp), HZ); - if (delta < HZ / 50) - return false; - } + if (atomic_read(&net->ipv4.icmp_global_credit) > 0) + return true; - spin_lock(&icmp_global.lock); - delta = min_t(u32, now - icmp_global.stamp, HZ); - if (delta >= HZ / 50) { - incr = READ_ONCE(sysctl_icmp_msgs_per_sec) * delta / HZ; - if (incr) - WRITE_ONCE(icmp_global.stamp, now); - } - credit = min_t(u32, icmp_global.credit + incr, - READ_ONCE(sysctl_icmp_msgs_burst)); - if (credit) { - /* We want to use a credit of one in average, but need to randomize - * it for security reasons. - */ - credit = max_t(int, credit - get_random_u32_below(3), 0); - rc = true; + now = jiffies; + oldstamp = READ_ONCE(net->ipv4.icmp_global_stamp); + delta = min_t(u32, now - oldstamp, HZ); + if (delta < HZ / 50) + return false; + + incr = READ_ONCE(net->ipv4.sysctl_icmp_msgs_per_sec) * delta / HZ; + if (!incr) + return false; + + if (cmpxchg(&net->ipv4.icmp_global_stamp, oldstamp, now) == oldstamp) { + old = atomic_read(&net->ipv4.icmp_global_credit); + do { + new = min(old + incr, READ_ONCE(net->ipv4.sysctl_icmp_msgs_burst)); + } while (!atomic_try_cmpxchg(&net->ipv4.icmp_global_credit, &old, new)); } - WRITE_ONCE(icmp_global.credit, credit); - spin_unlock(&icmp_global.lock); - return rc; + return true; } EXPORT_SYMBOL(icmp_global_allow); +void icmp_global_consume(struct net *net) +{ + int credits = get_random_u32_below(3); + + /* Note: this might make icmp_global.credit negative. */ + if (credits) + atomic_sub(credits, &net->ipv4.icmp_global_credit); +} +EXPORT_SYMBOL(icmp_global_consume); + static bool icmpv4_mask_allow(struct net *net, int type, int code) { if (type > NR_ICMP_TYPES) @@ -291,14 +287,16 @@ static bool icmpv4_mask_allow(struct net *net, int type, int code) return false; } -static bool icmpv4_global_allow(struct net *net, int type, int code) +static bool icmpv4_global_allow(struct net *net, int type, int code, + bool *apply_ratelimit) { if (icmpv4_mask_allow(net, type, code)) return true; - if (icmp_global_allow()) + if (icmp_global_allow(net)) { + *apply_ratelimit = true; return true; - + } __ICMP_INC_STATS(net, ICMP_MIB_RATELIMITGLOBAL); return false; } @@ -308,15 +306,16 @@ static bool icmpv4_global_allow(struct net *net, int type, int code) */ static bool icmpv4_xrlim_allow(struct net *net, struct rtable *rt, - struct flowi4 *fl4, int type, int code) + struct flowi4 *fl4, int type, int code, + bool apply_ratelimit) { struct dst_entry *dst = &rt->dst; struct inet_peer *peer; bool rc = true; int vif; - if (icmpv4_mask_allow(net, type, code)) - goto out; + if (!apply_ratelimit) + return true; /* No rate limit on loopback */ if (dst->dev && (dst->dev->flags&IFF_LOOPBACK)) @@ -331,6 +330,8 @@ static bool icmpv4_xrlim_allow(struct net *net, struct rtable *rt, out: if (!rc) __ICMP_INC_STATS(net, ICMP_MIB_RATELIMITHOST); + else + icmp_global_consume(net); return rc; } @@ -402,6 +403,7 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb) struct ipcm_cookie ipc; struct rtable *rt = skb_rtable(skb); struct net *net = dev_net(rt->dst.dev); + bool apply_ratelimit = false; struct flowi4 fl4; struct sock *sk; struct inet_sock *inet; @@ -413,11 +415,11 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb) if (ip_options_echo(net, &icmp_param->replyopts.opt.opt, skb)) return; - /* Needed by both icmp_global_allow and icmp_xmit_lock */ + /* Needed by both icmpv4_global_allow and icmp_xmit_lock */ 
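/* Illustrative sketch, not part of the patch: the rewritten limiter above
 * replaces the spinlock-protected global token bucket with a lockless,
 * per-netns one. Condensed, with hypothetical parameters @rate and @burst
 * standing in for the two sysctls:
 *
 *	static bool bucket_allow(atomic_t *credit, u32 *stamp, u32 rate, u32 burst)
 *	{
 *		u32 now = (u32)jiffies, old = READ_ONCE(*stamp);
 *		u32 delta = min_t(u32, now - old, HZ);
 *		int c, n, incr;
 *
 *		if (atomic_read(credit) > 0)
 *			return true;		// fast path: tokens left
 *		if (delta < HZ / 50)
 *			return false;		// refilled too recently
 *		incr = rate * delta / HZ;
 *		if (!incr)
 *			return false;
 *		// Only the cmpxchg winner refills; losers are let through,
 *		// which is the "acceptable race" noted above.
 *		if (cmpxchg(stamp, old, now) == old) {
 *			c = atomic_read(credit);
 *			do {
 *				n = min_t(int, c + incr, (int)burst);
 *			} while (!atomic_try_cmpxchg(credit, &c, n));
 *		}
 *		return true;
 *	}
 *
 * icmp_global_consume() then subtracts a random 0-2 credits per packet
 * actually sent, which may drive the counter negative; that is harmless
 * because the fast path only tests for a positive value.
 */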
local_bh_disable(); - /* global icmp_msgs_per_sec */ - if (!icmpv4_global_allow(net, type, code)) + /* is global icmp_msgs_per_sec exhausted ? */ + if (!icmpv4_global_allow(net, type, code, &apply_ratelimit)) goto out_bh_enable; sk = icmp_xmit_lock(net); @@ -443,14 +445,14 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb) fl4.saddr = saddr; fl4.flowi4_mark = mark; fl4.flowi4_uid = sock_net_uid(net, NULL); - fl4.flowi4_tos = RT_TOS(ip_hdr(skb)->tos); + fl4.flowi4_tos = ip_hdr(skb)->tos & INET_DSCP_MASK; fl4.flowi4_proto = IPPROTO_ICMP; fl4.flowi4_oif = l3mdev_master_ifindex(skb->dev); security_skb_classify_flow(skb, flowi4_to_flowi_common(&fl4)); rt = ip_route_output_key(net, &fl4); if (IS_ERR(rt)) goto out_unlock; - if (icmpv4_xrlim_allow(net, rt, &fl4, type, code)) + if (icmpv4_xrlim_allow(net, rt, &fl4, type, code, apply_ratelimit)) icmp_push_reply(sk, icmp_param, &fl4, &ipc, &rt); ip_rt_put(rt); out_unlock: @@ -496,7 +498,7 @@ static struct rtable *icmp_route_lookup(struct net *net, fl4->saddr = saddr; fl4->flowi4_mark = mark; fl4->flowi4_uid = sock_net_uid(net, NULL); - fl4->flowi4_tos = RT_TOS(tos); + fl4->flowi4_tos = tos & INET_DSCP_MASK; fl4->flowi4_proto = IPPROTO_ICMP; fl4->fl4_icmp_type = type; fl4->fl4_icmp_code = code; @@ -545,7 +547,7 @@ static struct rtable *icmp_route_lookup(struct net *net, orefdst = skb_in->_skb_refdst; /* save old refdst */ skb_dst_set(skb_in, NULL); err = ip_route_input(skb_in, fl4_dec.daddr, fl4_dec.saddr, - RT_TOS(tos), rt2->dst.dev); + tos, rt2->dst.dev); dst_release(&rt2->dst); rt2 = skb_rtable(skb_in); @@ -596,6 +598,7 @@ void __icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info, int room; struct icmp_bxm icmp_param; struct rtable *rt = skb_rtable(skb_in); + bool apply_ratelimit = false; struct ipcm_cookie ipc; struct flowi4 fl4; __be32 saddr; @@ -677,7 +680,7 @@ void __icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info, } } - /* Needed by both icmp_global_allow and icmp_xmit_lock */ + /* Needed by both icmpv4_global_allow and icmp_xmit_lock */ local_bh_disable(); /* Check global sysctl_icmp_msgs_per_sec ratelimit, unless @@ -685,7 +688,7 @@ void __icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info, * loopback, then peer ratelimit still work (in icmpv4_xrlim_allow) */ if (!(skb_in->dev && (skb_in->dev->flags&IFF_LOOPBACK)) && - !icmpv4_global_allow(net, type, code)) + !icmpv4_global_allow(net, type, code, &apply_ratelimit)) goto out_bh_enable; sk = icmp_xmit_lock(net); @@ -744,7 +747,7 @@ void __icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info, goto out_unlock; /* peer icmp_ratelimit */ - if (!icmpv4_xrlim_allow(net, rt, &fl4, type, code)) + if (!icmpv4_xrlim_allow(net, rt, &fl4, type, code, apply_ratelimit)) goto ende; /* RFC says return as much as we can without exceeding 576 bytes. 
*/ @@ -1487,6 +1490,8 @@ static int __net_init icmp_sk_init(struct net *net) net->ipv4.sysctl_icmp_ratelimit = 1 * HZ; net->ipv4.sysctl_icmp_ratemask = 0x1818; net->ipv4.sysctl_icmp_errors_use_inbound_ifaddr = 0; + net->ipv4.sysctl_icmp_msgs_per_sec = 1000; + net->ipv4.sysctl_icmp_msgs_burst = 50; return 0; } diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 64d07b842e73..2c5632d4fddb 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -236,7 +236,7 @@ static bool inet_bhash2_conflict(const struct sock *sk, #define sk_for_each_bound_bhash(__sk, __tb2, __tb) \ hlist_for_each_entry(__tb2, &(__tb)->bhash2, bhash_node) \ - sk_for_each_bound(sk2, &(__tb2)->owners) + sk_for_each_bound((__sk), &(__tb2)->owners) /* This should be called only when the tb and tb2 hashbuckets' locks are held */ static int inet_csk_bind_conflict(const struct sock *sk, @@ -714,6 +714,7 @@ struct sock *inet_csk_accept(struct sock *sk, struct proto_accept_arg *arg) out: release_sock(sk); if (newsk && mem_cgroup_sockets_enabled) { + gfp_t gfp = GFP_KERNEL | __GFP_NOFAIL; int amt = 0; /* atomically get the memory usage, set and charge the @@ -731,8 +732,8 @@ out: } if (amt) - mem_cgroup_charge_skmem(newsk->sk_memcg, amt, - GFP_KERNEL | __GFP_NOFAIL); + mem_cgroup_charge_skmem(newsk->sk_memcg, amt, gfp); + kmem_cache_charge(newsk, gfp); release_sock(newsk); } diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index 9712cdb8087c..67639309163d 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -442,7 +442,7 @@ static int inet_twsk_diag_fill(struct sock *sk, inet_diag_msg_common_fill(r, sk); r->idiag_retrans = 0; - r->idiag_state = tw->tw_substate; + r->idiag_state = READ_ONCE(tw->tw_substate); r->idiag_timer = 3; tmo = tw->tw_timer.expires - jiffies; r->idiag_expires = jiffies_delta_to_msecs(tmo); @@ -1209,7 +1209,7 @@ next_chunk: if (num < s_num) goto next_normal; state = (sk->sk_state == TCP_TIME_WAIT) ? - inet_twsk(sk)->tw_substate : sk->sk_state; + READ_ONCE(inet_twsk(sk)->tw_substate) : sk->sk_state; if (!(idiag_states & (1 << state))) goto next_normal; if (r->sdiag_family != AF_UNSPEC && diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index 48d0d494185b..9bfcfd016e18 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -310,7 +310,7 @@ inet_lhash2_bucket_sk(struct inet_hashinfo *h, struct sock *sk) return inet_lhash2_bucket(h, hash); } -static inline int compute_score(struct sock *sk, struct net *net, +static inline int compute_score(struct sock *sk, const struct net *net, const unsigned short hnum, const __be32 daddr, const int dif, const int sdif) { @@ -348,7 +348,7 @@ static inline int compute_score(struct sock *sk, struct net *net, * Return: NULL if sk doesn't have SO_REUSEPORT set, otherwise a pointer to * the selected sock or an error. 
*/ -struct sock *inet_lookup_reuseport(struct net *net, struct sock *sk, +struct sock *inet_lookup_reuseport(const struct net *net, struct sock *sk, struct sk_buff *skb, int doff, __be32 saddr, __be16 sport, __be32 daddr, unsigned short hnum, @@ -374,7 +374,7 @@ EXPORT_SYMBOL_GPL(inet_lookup_reuseport); */ /* called with rcu_read_lock() : No refcount taken on the socket */ -static struct sock *inet_lhash2_lookup(struct net *net, +static struct sock *inet_lhash2_lookup(const struct net *net, struct inet_listen_hashbucket *ilb2, struct sk_buff *skb, int doff, const __be32 saddr, __be16 sport, @@ -401,7 +401,7 @@ static struct sock *inet_lhash2_lookup(struct net *net, return result; } -struct sock *inet_lookup_run_sk_lookup(struct net *net, +struct sock *inet_lookup_run_sk_lookup(const struct net *net, int protocol, struct sk_buff *skb, int doff, __be32 saddr, __be16 sport, @@ -423,7 +423,7 @@ struct sock *inet_lookup_run_sk_lookup(struct net *net, return sk; } -struct sock *__inet_lookup_listener(struct net *net, +struct sock *__inet_lookup_listener(const struct net *net, struct inet_hashinfo *hashinfo, struct sk_buff *skb, int doff, const __be32 saddr, __be16 sport, @@ -488,7 +488,7 @@ void sock_edemux(struct sk_buff *skb) } EXPORT_SYMBOL(sock_edemux); -struct sock *__inet_lookup_established(struct net *net, +struct sock *__inet_lookup_established(const struct net *net, struct inet_hashinfo *hashinfo, const __be32 saddr, const __be16 sport, const __be32 daddr, const u16 hnum, diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index ba205473522e..5f6fd382af38 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -44,6 +44,7 @@ #include <net/gre.h> #include <net/dst_metadata.h> #include <net/erspan.h> +#include <net/inet_dscp.h> /* Problems & solutions @@ -930,7 +931,7 @@ static int ipgre_open(struct net_device *dev) t->parms.iph.daddr, t->parms.iph.saddr, t->parms.o_key, - RT_TOS(t->parms.iph.tos), + t->parms.iph.tos & INET_DSCP_MASK, t->parms.link); if (IS_ERR(rt)) return -EADDRNOTAVAIL; @@ -996,7 +997,7 @@ static void __gre_tunnel_init(struct net_device *dev) tunnel->hlen = tunnel->tun_hlen + tunnel->encap_hlen; dev->needed_headroom = tunnel->hlen + sizeof(tunnel->parms.iph); - dev->features |= GRE_FEATURES | NETIF_F_LLTX; + dev->features |= GRE_FEATURES; dev->hw_features |= GRE_FEATURES; /* TCP offload with GRE SEQ is not supported, nor can we support 2 @@ -1010,6 +1011,8 @@ static void __gre_tunnel_init(struct net_device *dev) dev->features |= NETIF_F_GSO_SOFTWARE; dev->hw_features |= NETIF_F_GSO_SOFTWARE; + + dev->lltx = true; } static int ipgre_tunnel_init(struct net_device *dev) diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index d6fbcbd2358a..b6e7d4921309 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -596,9 +596,8 @@ static void ip_list_rcv_finish(struct net *net, struct sock *sk, { struct sk_buff *skb, *next, *hint = NULL; struct dst_entry *curr_dst = NULL; - struct list_head sublist; + LIST_HEAD(sublist); - INIT_LIST_HEAD(&sublist); list_for_each_entry_safe(skb, next, head, list) { struct net_device *dev = skb->dev; struct dst_entry *dst; @@ -646,9 +645,8 @@ void ip_list_rcv(struct list_head *head, struct packet_type *pt, struct net_device *curr_dev = NULL; struct net *curr_net = NULL; struct sk_buff *skb, *next; - struct list_head sublist; + LIST_HEAD(sublist); - INIT_LIST_HEAD(&sublist); list_for_each_entry_safe(skb, next, head, list) { struct net_device *dev = skb->dev; struct net *net = dev_net(dev); diff --git a/net/ipv4/ip_output.c 
b/net/ipv4/ip_output.c index b90d0f78ac80..49811c9281d4 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -77,6 +77,7 @@ #include <net/inetpeer.h> #include <net/inet_ecn.h> #include <net/lwtunnel.h> +#include <net/inet_dscp.h> #include <linux/bpf-cgroup.h> #include <linux/igmp.h> #include <linux/netfilter_ipv4.h> @@ -493,7 +494,7 @@ int __ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl, inet->inet_dport, inet->inet_sport, sk->sk_protocol, - RT_TOS(tos), + tos & INET_DSCP_MASK, sk->sk_bound_dev_if); if (IS_ERR(rt)) goto no_route; @@ -1621,7 +1622,7 @@ void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb, flowi4_init_output(&fl4, oif, IP4_REPLY_MARK(net, skb->mark) ?: sk->sk_mark, - RT_TOS(arg->tos), + arg->tos & INET_DSCP_MASK, RT_SCOPE_UNIVERSE, ip_hdr(skb)->protocol, ip_reply_arg_flowi_flags(arg), daddr, saddr, diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c index 5cffad42fe8c..d591c73e2c0e 100644 --- a/net/ipv4/ip_tunnel.c +++ b/net/ipv4/ip_tunnel.c @@ -43,6 +43,7 @@ #include <net/rtnetlink.h> #include <net/udp.h> #include <net/dst_metadata.h> +#include <net/inet_dscp.h> #if IS_ENABLED(CONFIG_IPV6) #include <net/ipv6.h> @@ -293,7 +294,7 @@ static int ip_tunnel_bind_dev(struct net_device *dev) ip_tunnel_init_flow(&fl4, iph->protocol, iph->daddr, iph->saddr, tunnel->parms.o_key, - RT_TOS(iph->tos), dev_net(dev), + iph->tos & INET_DSCP_MASK, dev_net(dev), tunnel->parms.link, tunnel->fwmark, 0, 0); rt = ip_route_output_key(tunnel->net, &fl4); @@ -609,9 +610,9 @@ void ip_md_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph); } ip_tunnel_init_flow(&fl4, proto, key->u.ipv4.dst, key->u.ipv4.src, - tunnel_id_to_key32(key->tun_id), RT_TOS(tos), - dev_net(dev), 0, skb->mark, skb_get_hash(skb), - key->flow_flags); + tunnel_id_to_key32(key->tun_id), + tos & INET_DSCP_MASK, dev_net(dev), 0, skb->mark, + skb_get_hash(skb), key->flow_flags); if (!tunnel_hlen) tunnel_hlen = ip_encap_hlen(&tun_info->encap); @@ -772,7 +773,7 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, } ip_tunnel_init_flow(&fl4, protocol, dst, tnl_params->saddr, - tunnel->parms.o_key, RT_TOS(tos), + tunnel->parms.o_key, tos & INET_DSCP_MASK, dev_net(dev), READ_ONCE(tunnel->parms.link), tunnel->fwmark, skb_get_hash(skb), 0); @@ -1161,7 +1162,7 @@ int ip_tunnel_init_net(struct net *net, unsigned int ip_tnl_net_id, * Allowing to move it to another netns is clearly unsafe. 
*/ if (!IS_ERR(itn->fb_tunnel_dev)) { - itn->fb_tunnel_dev->features |= NETIF_F_NETNS_LOCAL; + itn->fb_tunnel_dev->netns_local = true; itn->fb_tunnel_dev->mtu = ip_tunnel_bind_dev(itn->fb_tunnel_dev); ip_tunnel_add(itn, netdev_priv(itn->fb_tunnel_dev)); itn->type = itn->fb_tunnel_dev->type; @@ -1326,7 +1327,7 @@ int ip_tunnel_init(struct net_device *dev) tunnel->dev = dev; tunnel->net = dev_net(dev); - strcpy(tunnel->parms.name, dev->name); + strscpy(tunnel->parms.name, dev->name); iph->version = 4; iph->ihl = 5; diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c index 14536da9f5dc..f0b4419cef34 100644 --- a/net/ipv4/ip_vti.c +++ b/net/ipv4/ip_vti.c @@ -443,7 +443,7 @@ static int vti_tunnel_init(struct net_device *dev) dev->flags = IFF_NOARP; dev->addr_len = 4; - dev->features |= NETIF_F_LLTX; + dev->lltx = true; netif_keep_dst(dev); return ip_tunnel_init(dev); diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c index 923a2ef68c2f..dc0db5895e0e 100644 --- a/net/ipv4/ipip.c +++ b/net/ipv4/ipip.c @@ -378,7 +378,7 @@ static void ipip_tunnel_setup(struct net_device *dev) dev->type = ARPHRD_TUNNEL; dev->flags = IFF_NOARP; dev->addr_len = 4; - dev->features |= NETIF_F_LLTX; + dev->lltx = true; netif_keep_dst(dev); dev->features |= IPIP_FEATURES; diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 6c750bd13dd8..089864c6a35e 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -62,6 +62,7 @@ #include <net/fib_rules.h> #include <linux/netconf.h> #include <net/rtnh.h> +#include <net/inet_dscp.h> #include <linux/nospec.h> @@ -536,7 +537,7 @@ static void reg_vif_setup(struct net_device *dev) dev->flags = IFF_NOARP; dev->netdev_ops = &reg_vif_netdev_ops; dev->needs_free_netdev = true; - dev->features |= NETIF_F_NETNS_LOCAL; + dev->netns_local = true; } static struct net_device *ipmr_reg_vif(struct net *net, struct mr_table *mrt) @@ -1868,7 +1869,7 @@ static void ipmr_queue_xmit(struct net *net, struct mr_table *mrt, vif->remote, vif->local, 0, 0, IPPROTO_IPIP, - RT_TOS(iph->tos), vif->link); + iph->tos & INET_DSCP_MASK, vif->link); if (IS_ERR(rt)) goto out_free; encap = sizeof(struct iphdr); @@ -1876,7 +1877,7 @@ static void ipmr_queue_xmit(struct net *net, struct mr_table *mrt, rt = ip_route_output_ports(net, &fl4, NULL, iph->daddr, 0, 0, 0, IPPROTO_IPIP, - RT_TOS(iph->tos), vif->link); + iph->tos & INET_DSCP_MASK, vif->link); if (IS_ERR(rt)) goto out_free; } @@ -2080,7 +2081,7 @@ static struct mr_table *ipmr_rt_fib_lookup(struct net *net, struct sk_buff *skb) struct flowi4 fl4 = { .daddr = iph->daddr, .saddr = iph->saddr, - .flowi4_tos = RT_TOS(iph->tos), + .flowi4_tos = iph->tos & INET_DSCP_MASK, .flowi4_oif = (rt_is_output_route(rt) ? skb->dev->ifindex : 0), .flowi4_iif = (rt_is_output_route(rt) ?
@@ -2406,8 +2407,7 @@ static void mroute_netlink_event(struct mr_table *mrt, struct mfc_cache *mfc, errout: kfree_skb(skb); - if (err < 0) - rtnl_set_sk_err(net, RTNLGRP_IPV4_MROUTE, err); + rtnl_set_sk_err(net, RTNLGRP_IPV4_MROUTE, err); } static size_t igmpmsg_netlink_msgsize(size_t payloadlen) diff --git a/net/ipv4/netfilter.c b/net/ipv4/netfilter.c index 591a2737808e..e0aab66cd925 100644 --- a/net/ipv4/netfilter.c +++ b/net/ipv4/netfilter.c @@ -14,6 +14,7 @@ #include <net/route.h> #include <net/xfrm.h> #include <net/ip.h> +#include <net/inet_dscp.h> #include <net/netfilter/nf_queue.h> /* route_me_harder function, used by iptable_nat, iptable_mangle + ip_queue */ @@ -43,7 +44,7 @@ int ip_route_me_harder(struct net *net, struct sock *sk, struct sk_buff *skb, un */ fl4.daddr = iph->daddr; fl4.saddr = saddr; - fl4.flowi4_tos = RT_TOS(iph->tos); + fl4.flowi4_tos = iph->tos & INET_DSCP_MASK; fl4.flowi4_oif = sk ? sk->sk_bound_dev_if : 0; fl4.flowi4_l3mdev = l3mdev_master_ifindex(dev); fl4.flowi4_mark = skb->mark; diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c index 14365b20f1c5..1cdd9c28ab2d 100644 --- a/net/ipv4/netfilter/arp_tables.c +++ b/net/ipv4/netfilter/arp_tables.c @@ -826,7 +826,7 @@ static int get_info(struct net *net, void __user *user, const int *len) sizeof(info.underflow)); info.num_entries = private->number; info.size = private->size; - strcpy(info.name, name); + strscpy(info.name, name); if (copy_to_user(user, &info, *len) != 0) ret = -EFAULT; @@ -1547,7 +1547,7 @@ int arpt_register_table(struct net *net, goto out_free; } - ops = kmemdup(template_ops, sizeof(*ops) * num_ops, GFP_KERNEL); + ops = kmemdup_array(template_ops, num_ops, sizeof(*ops), GFP_KERNEL); if (!ops) { ret = -ENOMEM; goto out_free; diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index fe89a056eb06..3d101613f27f 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -981,7 +981,7 @@ static int get_info(struct net *net, void __user *user, const int *len) sizeof(info.underflow)); info.num_entries = private->number; info.size = private->size; - strcpy(info.name, name); + strscpy(info.name, name); if (copy_to_user(user, &info, *len) != 0) ret = -EFAULT; @@ -1767,7 +1767,7 @@ int ipt_register_table(struct net *net, const struct xt_table *table, goto out_free; } - ops = kmemdup(template_ops, sizeof(*ops) * num_ops, GFP_KERNEL); + ops = kmemdup_array(template_ops, num_ops, sizeof(*ops), GFP_KERNEL); if (!ops) { ret = -ENOMEM; goto out_free; diff --git a/net/ipv4/netfilter/ipt_rpfilter.c b/net/ipv4/netfilter/ipt_rpfilter.c index ded5bef02f77..1ce7a1655b97 100644 --- a/net/ipv4/netfilter/ipt_rpfilter.c +++ b/net/ipv4/netfilter/ipt_rpfilter.c @@ -8,6 +8,7 @@ #include <linux/module.h> #include <linux/skbuff.h> #include <linux/netdevice.h> +#include <net/inet_dscp.h> #include <linux/ip.h> #include <net/ip.h> #include <net/ip_fib.h> @@ -75,7 +76,7 @@ static bool rpfilter_mt(const struct sk_buff *skb, struct xt_action_param *par) flow.daddr = iph->saddr; flow.saddr = rpfilter_get_saddr(iph->daddr); flow.flowi4_mark = info->flags & XT_RPFILTER_VALID_MARK ? 
skb->mark : 0; - flow.flowi4_tos = iph->tos & IPTOS_RT_MASK; + flow.flowi4_tos = iph->tos & INET_DSCP_MASK; flow.flowi4_scope = RT_SCOPE_UNIVERSE; flow.flowi4_l3mdev = l3mdev_master_ifindex_rcu(xt_in(par)); flow.flowi4_uid = sock_net_uid(xt_net(par), NULL); diff --git a/net/ipv4/netfilter/nf_dup_ipv4.c b/net/ipv4/netfilter/nf_dup_ipv4.c index 6cc5743c553a..f4aed0789d69 100644 --- a/net/ipv4/netfilter/nf_dup_ipv4.c +++ b/net/ipv4/netfilter/nf_dup_ipv4.c @@ -15,6 +15,7 @@ #include <net/icmp.h> #include <net/ip.h> #include <net/route.h> +#include <net/inet_dscp.h> #include <net/netfilter/ipv4/nf_dup_ipv4.h> #if IS_ENABLED(CONFIG_NF_CONNTRACK) #include <net/netfilter/nf_conntrack.h> @@ -32,7 +33,7 @@ static bool nf_dup_ipv4_route(struct net *net, struct sk_buff *skb, fl4.flowi4_oif = oif; fl4.daddr = gw->s_addr; - fl4.flowi4_tos = RT_TOS(iph->tos); + fl4.flowi4_tos = iph->tos & INET_DSCP_MASK; fl4.flowi4_scope = RT_SCOPE_UNIVERSE; fl4.flowi4_flags = FLOWI_FLAG_KNOWN_NH; rt = ip_route_output_key(net, &fl4); diff --git a/net/ipv4/netfilter/nft_dup_ipv4.c b/net/ipv4/netfilter/nft_dup_ipv4.c index a522c3a3be52..ef5dd88107dd 100644 --- a/net/ipv4/netfilter/nft_dup_ipv4.c +++ b/net/ipv4/netfilter/nft_dup_ipv4.c @@ -40,13 +40,13 @@ static int nft_dup_ipv4_init(const struct nft_ctx *ctx, if (tb[NFTA_DUP_SREG_ADDR] == NULL) return -EINVAL; - err = nft_parse_register_load(tb[NFTA_DUP_SREG_ADDR], &priv->sreg_addr, + err = nft_parse_register_load(ctx, tb[NFTA_DUP_SREG_ADDR], &priv->sreg_addr, sizeof(struct in_addr)); if (err < 0) return err; if (tb[NFTA_DUP_SREG_DEV]) - err = nft_parse_register_load(tb[NFTA_DUP_SREG_DEV], + err = nft_parse_register_load(ctx, tb[NFTA_DUP_SREG_DEV], &priv->sreg_dev, sizeof(int)); return err; diff --git a/net/ipv4/netfilter/nft_fib_ipv4.c b/net/ipv4/netfilter/nft_fib_ipv4.c index 9eee535c64dd..00da1332bbf1 100644 --- a/net/ipv4/netfilter/nft_fib_ipv4.c +++ b/net/ipv4/netfilter/nft_fib_ipv4.c @@ -10,6 +10,7 @@ #include <net/netfilter/nf_tables.h> #include <net/netfilter/nft_fib.h> +#include <net/inet_dscp.h> #include <net/ip_fib.h> #include <net/route.h> @@ -22,8 +23,6 @@ static __be32 get_saddr(__be32 addr) return addr; } -#define DSCP_BITS 0xfc - void nft_fib4_eval_type(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) { @@ -110,7 +109,7 @@ void nft_fib4_eval(const struct nft_expr *expr, struct nft_regs *regs, if (priv->flags & NFTA_FIB_F_MARK) fl4.flowi4_mark = pkt->skb->mark; - fl4.flowi4_tos = iph->tos & DSCP_BITS; + fl4.flowi4_tos = iph->tos & INET_DSCP_MASK; if (priv->flags & NFTA_FIB_F_DADDR) { fl4.daddr = iph->daddr; diff --git a/net/ipv4/nexthop.c b/net/ipv4/nexthop.c index 6b9787ee8601..93aaea0006ba 100644 --- a/net/ipv4/nexthop.c +++ b/net/ipv4/nexthop.c @@ -865,15 +865,18 @@ out: } static int nla_put_nh_group(struct sk_buff *skb, struct nexthop *nh, - u32 op_flags) + u32 op_flags, u32 *resp_op_flags) { struct nh_group *nhg = rtnl_dereference(nh->nh_grp); struct nexthop_grp *p; size_t len = nhg->num_nh * sizeof(*p); struct nlattr *nla; u16 group_type = 0; + u16 weight; int i; + *resp_op_flags |= NHA_OP_FLAG_RESP_GRP_RESVD_0; + if (nhg->hash_threshold) group_type = NEXTHOP_GRP_TYPE_MPATH; else if (nhg->resilient) @@ -888,9 +891,12 @@ static int nla_put_nh_group(struct sk_buff *skb, struct nexthop *nh, p = nla_data(nla); for (i = 0; i < nhg->num_nh; ++i) { + weight = nhg->nh_entries[i].weight - 1; + *p++ = (struct nexthop_grp) { .id = nhg->nh_entries[i].nh->id, - .weight = nhg->nh_entries[i].weight - 1, + .weight = weight, + 
.weight_high = weight >> 8, }; } @@ -934,10 +940,12 @@ static int nh_fill_node(struct sk_buff *skb, struct nexthop *nh, if (nh->is_group) { struct nh_group *nhg = rtnl_dereference(nh->nh_grp); + u32 resp_op_flags = 0; if (nhg->fdb_nh && nla_put_flag(skb, NHA_FDB)) goto nla_put_failure; - if (nla_put_nh_group(skb, nh, op_flags)) + if (nla_put_nh_group(skb, nh, op_flags, &resp_op_flags) || + nla_put_u32(skb, NHA_OP_FLAGS, resp_op_flags)) goto nla_put_failure; goto out; } @@ -1050,7 +1058,9 @@ static size_t nh_nlmsg_size(struct nexthop *nh) sz += nla_total_size(4); /* NHA_ID */ if (nh->is_group) - sz += nh_nlmsg_size_grp(nh); + sz += nh_nlmsg_size_grp(nh) + + nla_total_size(4) + /* NHA_OP_FLAGS */ + 0; else sz += nh_nlmsg_size_single(nh); @@ -1080,8 +1090,7 @@ static void nexthop_notify(int event, struct nexthop *nh, struct nl_info *info) info->nlh, gfp_any()); return; errout: - if (err < 0) - rtnl_set_sk_err(info->nl_net, RTNLGRP_NEXTHOP, err); + rtnl_set_sk_err(info->nl_net, RTNLGRP_NEXTHOP, err); } static unsigned long nh_res_bucket_used_time(const struct nh_res_bucket *bucket) @@ -1201,8 +1210,7 @@ static void nexthop_bucket_notify(struct nh_res_table *res_table, rtnl_notify(skb, nh->net, 0, RTNLGRP_NEXTHOP, NULL, GFP_KERNEL); return; errout: - if (err < 0) - rtnl_set_sk_err(nh->net, RTNLGRP_NEXTHOP, err); + rtnl_set_sk_err(nh->net, RTNLGRP_NEXTHOP, err); } static bool valid_group_nh(struct nexthop *nh, unsigned int npaths, @@ -1280,11 +1288,14 @@ static int nh_check_attr_group(struct net *net, nhg = nla_data(tb[NHA_GROUP]); for (i = 0; i < len; ++i) { - if (nhg[i].resvd1 || nhg[i].resvd2) { - NL_SET_ERR_MSG(extack, "Reserved fields in nexthop_grp must be 0"); + if (nhg[i].resvd2) { + NL_SET_ERR_MSG(extack, "Reserved field in nexthop_grp must be 0"); return -EINVAL; } - if (nhg[i].weight > 254) { + if (nexthop_grp_weight(&nhg[i]) == 0) { + /* 0xffff got passed in, representing weight of 0x10000, + * which is too heavy. 
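+ * (nexthop_grp_weight() is assumed here to return the u16 value
+ * ((weight_high << 8) | weight) + 1, so a combined 0xffff wraps
+ * around to 0 and is rejected.)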
+ */ NL_SET_ERR_MSG(extack, "Invalid value for weight"); return -EINVAL; } @@ -1880,9 +1891,9 @@ static void nh_res_table_cancel_upkeep(struct nh_res_table *res_table) static void nh_res_group_rebalance(struct nh_group *nhg, struct nh_res_table *res_table) { - int prev_upper_bound = 0; - int total = 0; - int w = 0; + u16 prev_upper_bound = 0; + u32 total = 0; + u32 w = 0; int i; INIT_LIST_HEAD(&res_table->uw_nh_entries); @@ -1892,11 +1903,12 @@ static void nh_res_group_rebalance(struct nh_group *nhg, for (i = 0; i < nhg->num_nh; ++i) { struct nh_grp_entry *nhge = &nhg->nh_entries[i]; - int upper_bound; + u16 upper_bound; + u64 btw; w += nhge->weight; - upper_bound = DIV_ROUND_CLOSEST(res_table->num_nh_buckets * w, - total); + btw = ((u64)res_table->num_nh_buckets) * w; + upper_bound = DIV_ROUND_CLOSEST_ULL(btw, total); nhge->res.wants_buckets = upper_bound - prev_upper_bound; prev_upper_bound = upper_bound; @@ -1962,8 +1974,8 @@ static void replace_nexthop_grp_res(struct nh_group *oldg, static void nh_hthr_group_rebalance(struct nh_group *nhg) { - int total = 0; - int w = 0; + u32 total = 0; + u32 w = 0; int i; for (i = 0; i < nhg->num_nh; ++i) @@ -1971,7 +1983,7 @@ static void nh_hthr_group_rebalance(struct nh_group *nhg) for (i = 0; i < nhg->num_nh; ++i) { struct nh_grp_entry *nhge = &nhg->nh_entries[i]; - int upper_bound; + u32 upper_bound; w += nhge->weight; upper_bound = DIV_ROUND_CLOSEST_ULL((u64)w << 31, total) - 1; @@ -2713,7 +2725,8 @@ static struct nexthop *nexthop_create_group(struct net *net, goto out_no_nh; } nhg->nh_entries[i].nh = nhe; - nhg->nh_entries[i].weight = entry[i].weight + 1; + nhg->nh_entries[i].weight = nexthop_grp_weight(&entry[i]); + list_add(&nhg->nh_entries[i].nh_list, &nhe->grp_list); nhg->nh_entries[i].nh_parent = nh; } diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 13c0f1d455f3..723ac9181558 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -512,7 +512,7 @@ static void __build_flow_key(const struct net *net, struct flowi4 *fl4, sk->sk_protocol; } - flowi4_init_output(fl4, oif, mark, tos & IPTOS_RT_MASK, scope, + flowi4_init_output(fl4, oif, mark, tos & INET_DSCP_MASK, scope, prot, flow_flags, iph->daddr, iph->saddr, 0, 0, sock_net_uid(net, sk)); } @@ -541,7 +541,7 @@ static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk) if (inet_opt && inet_opt->opt.srr) daddr = inet_opt->opt.faddr; flowi4_init_output(fl4, sk->sk_bound_dev_if, READ_ONCE(sk->sk_mark), - ip_sock_rt_tos(sk) & IPTOS_RT_MASK, + ip_sock_rt_tos(sk), ip_sock_rt_scope(sk), inet_test_bit(HDRINCL, sk) ? 
IPPROTO_RAW : sk->sk_protocol, @@ -1263,7 +1263,7 @@ void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt) struct flowi4 fl4 = { .daddr = iph->daddr, .saddr = iph->saddr, - .flowi4_tos = iph->tos & IPTOS_RT_MASK, + .flowi4_tos = iph->tos & INET_DSCP_MASK, .flowi4_oif = rt->dst.dev->ifindex, .flowi4_iif = skb->dev->ifindex, .flowi4_mark = skb->mark, @@ -2160,7 +2160,7 @@ int ip_route_use_hint(struct sk_buff *skb, __be32 daddr, __be32 saddr, if (rt->rt_type != RTN_LOCAL) goto skip_validate_source; - tos &= IPTOS_RT_MASK; + tos &= INET_DSCP_MASK; err = fib_validate_source(skb, saddr, daddr, tos, 0, dev, in_dev, &tag); if (err < 0) goto martian_source; @@ -2470,7 +2470,7 @@ int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr, struct fib_result res; int err; - tos &= IPTOS_RT_MASK; + tos &= INET_DSCP_MASK; rcu_read_lock(); err = ip_route_input_rcu(skb, daddr, saddr, tos, dev, &res); rcu_read_unlock(); @@ -2618,7 +2618,7 @@ struct rtable *ip_route_output_key_hash(struct net *net, struct flowi4 *fl4, struct rtable *rth; fl4->flowi4_iif = LOOPBACK_IFINDEX; - fl4->flowi4_tos &= IPTOS_RT_MASK; + fl4->flowi4_tos &= INET_DSCP_MASK; rcu_read_lock(); rth = ip_route_output_key_hash_rcu(net, fl4, &res, skb); @@ -3261,7 +3261,7 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, fl4.daddr = dst; fl4.saddr = src; - fl4.flowi4_tos = rtm->rtm_tos & IPTOS_RT_MASK; + fl4.flowi4_tos = rtm->rtm_tos & INET_DSCP_MASK; fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0; fl4.flowi4_mark = mark; fl4.flowi4_uid = uid; @@ -3286,7 +3286,7 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, skb->dev = dev; skb->mark = mark; err = ip_route_input_rcu(skb, dst, src, - rtm->rtm_tos & IPTOS_RT_MASK, dev, + rtm->rtm_tos & INET_DSCP_MASK, dev, &res); rt = skb_rtable(skb); diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 4af0c234d8d7..a79b2a52ce01 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -601,22 +601,6 @@ static struct ctl_table ipv4_table[] = { .proc_handler = proc_tcp_available_ulp, }, { - .procname = "icmp_msgs_per_sec", - .data = &sysctl_icmp_msgs_per_sec, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - }, - { - .procname = "icmp_msgs_burst", - .data = &sysctl_icmp_msgs_burst, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - }, - { .procname = "udp_mem", .data = &sysctl_udp_mem, .maxlen = sizeof(sysctl_udp_mem), @@ -702,6 +686,22 @@ static struct ctl_table ipv4_net_table[] = { .proc_handler = proc_dointvec }, { + .procname = "icmp_msgs_per_sec", + .data = &init_net.ipv4.sysctl_icmp_msgs_per_sec, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + }, + { + .procname = "icmp_msgs_burst", + .data = &init_net.ipv4.sysctl_icmp_msgs_burst, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + }, + { .procname = "ping_group_range", .data = &init_net.ipv4.ping_group_range.range, .maxlen = sizeof(gid_t)*2, diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 831a18dc7aa6..4f77bd862e95 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -285,6 +285,8 @@ #include <trace/events/tcp.h> #include <net/rps.h> +#include "../core/devmem.h" + /* Track pending CMSGs. 
*/ enum { TCP_CMSG_INQ = 1, @@ -471,6 +473,7 @@ void tcp_init_sock(struct sock *sk) set_bit(SOCK_SUPPORT_ZC, &sk->sk_socket->flags); sk_sockets_allocated_inc(sk); + xa_init_flags(&sk->sk_user_frags, XA_FLAGS_ALLOC1); } EXPORT_SYMBOL(tcp_init_sock); @@ -2160,6 +2163,9 @@ static int tcp_zerocopy_receive(struct sock *sk, skb = tcp_recv_skb(sk, seq, &offset); } + if (!skb_frags_readable(skb)) + break; + if (TCP_SKB_CB(skb)->has_rxtstamp) { tcp_update_recv_tstamps(skb, tss); zc->msg_flags |= TCP_CMSG_TS; @@ -2177,6 +2183,9 @@ static int tcp_zerocopy_receive(struct sock *sk, break; } page = skb_frag_page(frags); + if (WARN_ON_ONCE(!page)) + break; + prefetchw(page); pages[pages_to_map++] = page; length += PAGE_SIZE; @@ -2235,6 +2244,7 @@ void tcp_recv_timestamp(struct msghdr *msg, const struct sock *sk, struct scm_timestamping_internal *tss) { int new_tstamp = sock_flag(sk, SOCK_TSTAMP_NEW); + u32 tsflags = READ_ONCE(sk->sk_tsflags); bool has_timestamping = false; if (tss->ts[0].tv_sec || tss->ts[0].tv_nsec) { @@ -2274,14 +2284,18 @@ void tcp_recv_timestamp(struct msghdr *msg, const struct sock *sk, } } - if (READ_ONCE(sk->sk_tsflags) & SOF_TIMESTAMPING_SOFTWARE) + if (tsflags & SOF_TIMESTAMPING_SOFTWARE && + (tsflags & SOF_TIMESTAMPING_RX_SOFTWARE || + !(tsflags & SOF_TIMESTAMPING_OPT_RX_FILTER))) has_timestamping = true; else tss->ts[0] = (struct timespec64) {0}; } if (tss->ts[2].tv_sec || tss->ts[2].tv_nsec) { - if (READ_ONCE(sk->sk_tsflags) & SOF_TIMESTAMPING_RAW_HARDWARE) + if (tsflags & SOF_TIMESTAMPING_RAW_HARDWARE && + (tsflags & SOF_TIMESTAMPING_RX_HARDWARE || + !(tsflags & SOF_TIMESTAMPING_OPT_RX_FILTER))) has_timestamping = true; else tss->ts[2] = (struct timespec64) {0}; @@ -2317,6 +2331,220 @@ static int tcp_inq_hint(struct sock *sk) return inq; } +/* batch __xa_alloc() calls and reduce xa_lock()/xa_unlock() overhead. */ +struct tcp_xa_pool { + u8 max; /* max <= MAX_SKB_FRAGS */ + u8 idx; /* idx <= max */ + __u32 tokens[MAX_SKB_FRAGS]; + netmem_ref netmems[MAX_SKB_FRAGS]; +}; + +static void tcp_xa_pool_commit_locked(struct sock *sk, struct tcp_xa_pool *p) +{ + int i; + + /* Commit part that has been copied to user space. */ + for (i = 0; i < p->idx; i++) + __xa_cmpxchg(&sk->sk_user_frags, p->tokens[i], XA_ZERO_ENTRY, + (__force void *)p->netmems[i], GFP_KERNEL); + /* Rollback what has been pre-allocated and is no longer needed. */ + for (; i < p->max; i++) + __xa_erase(&sk->sk_user_frags, p->tokens[i]); + + p->max = 0; + p->idx = 0; +} + +static void tcp_xa_pool_commit(struct sock *sk, struct tcp_xa_pool *p) +{ + if (!p->max) + return; + + xa_lock_bh(&sk->sk_user_frags); + + tcp_xa_pool_commit_locked(sk, p); + + xa_unlock_bh(&sk->sk_user_frags); +} + +static int tcp_xa_pool_refill(struct sock *sk, struct tcp_xa_pool *p, + unsigned int max_frags) +{ + int err, k; + + if (p->idx < p->max) + return 0; + + xa_lock_bh(&sk->sk_user_frags); + + tcp_xa_pool_commit_locked(sk, p); + + for (k = 0; k < max_frags; k++) { + err = __xa_alloc(&sk->sk_user_frags, &p->tokens[k], + XA_ZERO_ENTRY, xa_limit_31b, GFP_KERNEL); + if (err) + break; + } + + xa_unlock_bh(&sk->sk_user_frags); + + p->max = k; + p->idx = 0; + return k ? 0 : err; +} + +/* On error, returns the -errno. On success, returns number of bytes sent to the + * user. May not consume all of @remaining_len. 
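+ * Each dmabuf frag handed to user space is published as a token in the
+ * sk->sk_user_frags xarray (batched through the tcp_xa_pool above) and
+ * carried in an SO_DEVMEM_DMABUF cmsg; the extra pp_ref_count reference
+ * keeps the net_iov pinned until the token is returned or the socket is
+ * destroyed (see tcp_release_user_frags() in the tcp_ipv4.c hunk below).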
+ */ +static int tcp_recvmsg_dmabuf(struct sock *sk, const struct sk_buff *skb, + unsigned int offset, struct msghdr *msg, + int remaining_len) +{ + struct dmabuf_cmsg dmabuf_cmsg = { 0 }; + struct tcp_xa_pool tcp_xa_pool; + unsigned int start; + int i, copy, n; + int sent = 0; + int err = 0; + + tcp_xa_pool.max = 0; + tcp_xa_pool.idx = 0; + do { + start = skb_headlen(skb); + + if (skb_frags_readable(skb)) { + err = -ENODEV; + goto out; + } + + /* Copy header. */ + copy = start - offset; + if (copy > 0) { + copy = min(copy, remaining_len); + + n = copy_to_iter(skb->data + offset, copy, + &msg->msg_iter); + if (n != copy) { + err = -EFAULT; + goto out; + } + + offset += copy; + remaining_len -= copy; + + /* First a dmabuf_cmsg for # bytes copied to user + * buffer. + */ + memset(&dmabuf_cmsg, 0, sizeof(dmabuf_cmsg)); + dmabuf_cmsg.frag_size = copy; + err = put_cmsg(msg, SOL_SOCKET, SO_DEVMEM_LINEAR, + sizeof(dmabuf_cmsg), &dmabuf_cmsg); + if (err || msg->msg_flags & MSG_CTRUNC) { + msg->msg_flags &= ~MSG_CTRUNC; + if (!err) + err = -ETOOSMALL; + goto out; + } + + sent += copy; + + if (remaining_len == 0) + goto out; + } + + /* after that, send information of dmabuf pages through a + * sequence of cmsg + */ + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { + skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; + struct net_iov *niov; + u64 frag_offset; + int end; + + /* !skb_frags_readable() should indicate that ALL the + * frags in this skb are dmabuf net_iovs. We're checking + * for that flag above, but also check individual frags + * here. If the tcp stack is not setting + * skb_frags_readable() correctly, we still don't want + * to crash here. + */ + if (!skb_frag_net_iov(frag)) { + net_err_ratelimited("Found non-dmabuf skb with net_iov"); + err = -ENODEV; + goto out; + } + + niov = skb_frag_net_iov(frag); + end = start + skb_frag_size(frag); + copy = end - offset; + + if (copy > 0) { + copy = min(copy, remaining_len); + + frag_offset = net_iov_virtual_addr(niov) + + skb_frag_off(frag) + offset - + start; + dmabuf_cmsg.frag_offset = frag_offset; + dmabuf_cmsg.frag_size = copy; + err = tcp_xa_pool_refill(sk, &tcp_xa_pool, + skb_shinfo(skb)->nr_frags - i); + if (err) + goto out; + + /* Will perform the exchange later */ + dmabuf_cmsg.frag_token = tcp_xa_pool.tokens[tcp_xa_pool.idx]; + dmabuf_cmsg.dmabuf_id = net_iov_binding_id(niov); + + offset += copy; + remaining_len -= copy; + + err = put_cmsg(msg, SOL_SOCKET, + SO_DEVMEM_DMABUF, + sizeof(dmabuf_cmsg), + &dmabuf_cmsg); + if (err || msg->msg_flags & MSG_CTRUNC) { + msg->msg_flags &= ~MSG_CTRUNC; + if (!err) + err = -ETOOSMALL; + goto out; + } + + atomic_long_inc(&niov->pp_ref_count); + tcp_xa_pool.netmems[tcp_xa_pool.idx++] = skb_frag_netmem(frag); + + sent += copy; + + if (remaining_len == 0) + goto out; + } + start = end; + } + + tcp_xa_pool_commit(sk, &tcp_xa_pool); + if (!remaining_len) + goto out; + + /* if remaining_len is not satisfied yet, we need to go to the + * next frag in the frag_list to satisfy remaining_len. + */ + skb = skb_shinfo(skb)->frag_list ?: skb->next; + + offset = offset - start; + } while (skb); + + if (remaining_len) { + err = -EFAULT; + goto out; + } + +out: + tcp_xa_pool_commit(sk, &tcp_xa_pool); + if (!sent) + sent = err; + + return sent; +} + /* * This routine copies from a sock struct into the user buffer. 
* @@ -2330,6 +2558,7 @@ static int tcp_recvmsg_locked(struct sock *sk, struct msghdr *msg, size_t len, int *cmsg_flags) { struct tcp_sock *tp = tcp_sk(sk); + int last_copied_dmabuf = -1; /* uninitialized */ int copied = 0; u32 peek_seq; u32 *seq; @@ -2509,15 +2738,44 @@ found_ok_skb: } if (!(flags & MSG_TRUNC)) { - err = skb_copy_datagram_msg(skb, offset, msg, used); - if (err) { - /* Exception. Bailout! */ - if (!copied) - copied = -EFAULT; + if (last_copied_dmabuf != -1 && + last_copied_dmabuf != !skb_frags_readable(skb)) break; + + if (skb_frags_readable(skb)) { + err = skb_copy_datagram_msg(skb, offset, msg, + used); + if (err) { + /* Exception. Bailout! */ + if (!copied) + copied = -EFAULT; + break; + } + } else { + if (!(flags & MSG_SOCK_DEVMEM)) { + /* dmabuf skbs can only be received + * with the MSG_SOCK_DEVMEM flag. + */ + if (!copied) + copied = -EFAULT; + + break; + } + + err = tcp_recvmsg_dmabuf(sk, skb, offset, msg, + used); + if (err <= 0) { + if (!copied) + copied = -EFAULT; + + break; + } + used = err; } } + last_copied_dmabuf = !skb_frags_readable(skb); + WRITE_ONCE(*seq, *seq + used); copied += used; len -= used; @@ -2833,7 +3091,7 @@ void __tcp_close(struct sock *sk, long timeout) NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONCLOSE); tcp_set_state(sk, TCP_CLOSE); tcp_send_active_reset(sk, sk->sk_allocation, - SK_RST_REASON_NOT_SPECIFIED); + SK_RST_REASON_TCP_ABORT_ON_CLOSE); } else if (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime) { /* Check zero linger _after_ checking for unread data. */ sk->sk_prot->disconnect(sk, 0); @@ -2908,7 +3166,7 @@ adjudge_to_death: if (READ_ONCE(tp->linger2) < 0) { tcp_set_state(sk, TCP_CLOSE); tcp_send_active_reset(sk, GFP_ATOMIC, - SK_RST_REASON_NOT_SPECIFIED); + SK_RST_REASON_TCP_ABORT_ON_LINGER); __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONLINGER); } else { @@ -2927,7 +3185,7 @@ adjudge_to_death: if (tcp_check_oom(sk, 0)) { tcp_set_state(sk, TCP_CLOSE); tcp_send_active_reset(sk, GFP_ATOMIC, - SK_RST_REASON_NOT_SPECIFIED); + SK_RST_REASON_TCP_ABORT_ON_MEMORY); __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONMEMORY); } else if (!check_net(sock_net(sk))) { @@ -3025,13 +3283,16 @@ int tcp_disconnect(struct sock *sk, int flags) inet_csk_listen_stop(sk); } else if (unlikely(tp->repair)) { WRITE_ONCE(sk->sk_err, ECONNABORTED); - } else if (tcp_need_reset(old_state) || - (tp->snd_nxt != tp->write_seq && - (1 << old_state) & (TCPF_CLOSING | TCPF_LAST_ACK))) { + } else if (tcp_need_reset(old_state)) { + tcp_send_active_reset(sk, gfp_any(), SK_RST_REASON_TCP_STATE); + WRITE_ONCE(sk->sk_err, ECONNRESET); + } else if (tp->snd_nxt != tp->write_seq && + (1 << old_state) & (TCPF_CLOSING | TCPF_LAST_ACK)) { /* The last check adjusts for discrepancy of Linux wrt. 
RFC * states */ - tcp_send_active_reset(sk, gfp_any(), SK_RST_REASON_NOT_SPECIFIED); + tcp_send_active_reset(sk, gfp_any(), + SK_RST_REASON_TCP_DISCONNECT_WITH_DATA); WRITE_ONCE(sk->sk_err, ECONNRESET); } else if (old_state == TCP_SYN_SENT) WRITE_ONCE(sk->sk_err, ECONNRESET); @@ -4655,7 +4916,7 @@ int tcp_abort(struct sock *sk, int err) if (tcp_need_reset(sk->sk_state)) tcp_send_active_reset(sk, GFP_ATOMIC, - SK_RST_REASON_NOT_SPECIFIED); + SK_RST_REASON_TCP_STATE); tcp_done_with_error(sk, err); bh_unlock_sock(sk); diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c index fe6178715ba0..e7658c5d6b79 100644 --- a/net/ipv4/tcp_bpf.c +++ b/net/ipv4/tcp_bpf.c @@ -30,7 +30,7 @@ void tcp_eat_skb(struct sock *sk, struct sk_buff *skb) } static int bpf_tcp_ingress(struct sock *sk, struct sk_psock *psock, - struct sk_msg *msg, u32 apply_bytes, int flags) + struct sk_msg *msg, u32 apply_bytes) { bool apply = apply_bytes; struct scatterlist *sge; @@ -167,7 +167,7 @@ int tcp_bpf_sendmsg_redir(struct sock *sk, bool ingress, if (unlikely(!psock)) return -EPIPE; - ret = ingress ? bpf_tcp_ingress(sk, psock, msg, bytes, flags) : + ret = ingress ? bpf_tcp_ingress(sk, psock, msg, bytes) : tcp_bpf_push_locked(sk, msg, bytes, flags, false); sk_psock_put(sk, psock); return ret; diff --git a/net/ipv4/tcp_htcp.c b/net/ipv4/tcp_htcp.c index 52b1f2665dfa..81b96331b2bb 100644 --- a/net/ipv4/tcp_htcp.c +++ b/net/ipv4/tcp_htcp.c @@ -185,7 +185,7 @@ static inline void htcp_alpha_update(struct htcp *ca) u32 scale = (HZ << 3) / (10 * minRTT); /* clamping ratio to interval [0.5,10]<<3 */ - scale = min(max(scale, 1U << 2), 10U << 3); + scale = clamp(scale, 1U << 2, 10U << 3); factor = (factor << 3) / scale; if (!factor) factor = 1; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index e37488d3453f..9f314dfa1490 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -5391,6 +5391,9 @@ restart: for (end_of_skbs = true; skb != NULL && skb != tail; skb = n) { n = tcp_skb_next(skb, list); + if (!skb_frags_readable(skb)) + goto skip_this; + /* No new bits? It is possible on ofo queue. */ if (!before(start, TCP_SKB_CB(skb)->end_seq)) { skb = tcp_collapse_one(sk, skb, list, root); @@ -5411,17 +5414,20 @@ restart: break; } - if (n && n != tail && tcp_skb_can_collapse_rx(skb, n) && + if (n && n != tail && skb_frags_readable(n) && + tcp_skb_can_collapse_rx(skb, n) && TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(n)->seq) { end_of_skbs = false; break; } +skip_this: /* Decided to skip this, advance start seq. 
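 * (Skbs whose frags are not kernel-readable, i.e. devmem skbs, are
 *  skipped here: collapsing would require copying payload the kernel
 *  cannot read.)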
*/ start = TCP_SKB_CB(skb)->end_seq; } if (end_of_skbs || - (TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN | TCPHDR_FIN))) + (TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN | TCPHDR_FIN)) || + !skb_frags_readable(skb)) return; __skb_queue_head_init(&tmp); @@ -5463,7 +5469,8 @@ restart: if (!skb || skb == tail || !tcp_skb_can_collapse_rx(nskb, skb) || - (TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN | TCPHDR_FIN))) + (TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN | TCPHDR_FIN)) || + !skb_frags_readable(skb)) goto end; } } diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index a4e510846905..5afe5e57c89b 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -79,6 +79,7 @@ #include <linux/seq_file.h> #include <linux/inetdevice.h> #include <linux/btf_ids.h> +#include <linux/skbuff_ref.h> #include <crypto/hash.h> #include <linux/scatterlist.h> @@ -120,6 +121,9 @@ int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp) struct tcp_sock *tp = tcp_sk(sk); int ts_recent_stamp; + if (READ_ONCE(tw->tw_substate) == TCP_FIN_WAIT2) + reuse = 0; + if (reuse == 2) { /* Still does not detect *everything* that goes through * lo, since we require a loopback src or dst address @@ -1070,7 +1074,7 @@ static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb) } tcp_v4_send_ack(sk, skb, - tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt, + tcptw->tw_snd_nxt, READ_ONCE(tcptw->tw_rcv_nxt), tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale, tcp_tw_tsval(tcptw), READ_ONCE(tcptw->tw_ts_recent), @@ -2509,10 +2513,25 @@ static void tcp_md5sig_info_free_rcu(struct rcu_head *head) } #endif +static void tcp_release_user_frags(struct sock *sk) +{ +#ifdef CONFIG_PAGE_POOL + unsigned long index; + void *netmem; + + xa_for_each(&sk->sk_user_frags, index, netmem) + WARN_ON_ONCE(!napi_pp_put_page((__force netmem_ref)netmem)); +#endif +} + void tcp_v4_destroy_sock(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); + tcp_release_user_frags(sk); + + xa_destroy(&sk->sk_user_frags); + trace_tcp_destroy_sock(sk); tcp_clear_xmit_timers(sk); @@ -2945,7 +2964,7 @@ static void get_timewait4_sock(const struct inet_timewait_sock *tw, seq_printf(f, "%4d: %08X:%04X %08X:%04X" " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK", - i, src, srcp, dest, destp, tw->tw_substate, 0, 0, + i, src, srcp, dest, destp, READ_ONCE(tw->tw_substate), 0, 0, 3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0, refcount_read(&tw->tw_refcnt), tw); } diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c index b01eb6d94413..95669935494e 100644 --- a/net/ipv4/tcp_metrics.c +++ b/net/ipv4/tcp_metrics.c @@ -617,9 +617,13 @@ static struct genl_family tcp_metrics_nl_family; static const struct nla_policy tcp_metrics_nl_policy[TCP_METRICS_ATTR_MAX + 1] = { [TCP_METRICS_ATTR_ADDR_IPV4] = { .type = NLA_U32, }, - [TCP_METRICS_ATTR_ADDR_IPV6] = { .type = NLA_BINARY, - .len = sizeof(struct in6_addr), }, + [TCP_METRICS_ATTR_ADDR_IPV6] = + NLA_POLICY_EXACT_LEN(sizeof(struct in6_addr)), + [TCP_METRICS_ATTR_SADDR_IPV4] = { .type = NLA_U32, }, + [TCP_METRICS_ATTR_SADDR_IPV6] = + NLA_POLICY_EXACT_LEN(sizeof(struct in6_addr)), + /* Following attributes are not received for GET/DEL, * we keep them for reference */ @@ -811,8 +815,6 @@ static int __parse_nl_addr(struct genl_info *info, struct inetpeer_addr *addr, if (a) { struct in6_addr in6; - if (nla_len(a) != sizeof(struct in6_addr)) - return -EINVAL; in6 = nla_get_in6_addr(a); inetpeer_set_addr_v6(addr, &in6); if (hash) diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index a19a9dbd3409..bb1fe1ba867a 100644 --- 
+++ b/net/ipv4/tcp_minisocks.c
@@ -52,16 +52,17 @@ tcp_timewait_check_oow_rate_limit(struct inet_timewait_sock *tw,
     return TCP_TW_SUCCESS;
 }

-static void twsk_rcv_nxt_update(struct tcp_timewait_sock *tcptw, u32 seq)
+static void twsk_rcv_nxt_update(struct tcp_timewait_sock *tcptw, u32 seq,
+                u32 rcv_nxt)
 {
 #ifdef CONFIG_TCP_AO
     struct tcp_ao_info *ao;

     ao = rcu_dereference(tcptw->ao_info);
-    if (unlikely(ao && seq < tcptw->tw_rcv_nxt))
+    if (unlikely(ao && seq < rcv_nxt))
         WRITE_ONCE(ao->rcv_sne, ao->rcv_sne + 1);
 #endif
-    tcptw->tw_rcv_nxt = seq;
+    WRITE_ONCE(tcptw->tw_rcv_nxt, seq);
 }

 /*
@@ -98,8 +99,9 @@ enum tcp_tw_status
 tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb,
                const struct tcphdr *th, u32 *tw_isn)
 {
-    struct tcp_options_received tmp_opt;
     struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
+    u32 rcv_nxt = READ_ONCE(tcptw->tw_rcv_nxt);
+    struct tcp_options_received tmp_opt;
     bool paws_reject = false;
     int ts_recent_stamp;

@@ -117,26 +119,26 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb,
         }
     }

-    if (tw->tw_substate == TCP_FIN_WAIT2) {
+    if (READ_ONCE(tw->tw_substate) == TCP_FIN_WAIT2) {
         /* Just repeat all the checks of tcp_rcv_state_process() */

         /* Out of window, send ACK */
         if (paws_reject ||
             !tcp_in_window(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq,
-                   tcptw->tw_rcv_nxt,
-                   tcptw->tw_rcv_nxt + tcptw->tw_rcv_wnd))
+                   rcv_nxt,
+                   rcv_nxt + tcptw->tw_rcv_wnd))
             return tcp_timewait_check_oow_rate_limit(
                 tw, skb, LINUX_MIB_TCPACKSKIPPEDFINWAIT2);

         if (th->rst)
             goto kill;

-        if (th->syn && !before(TCP_SKB_CB(skb)->seq, tcptw->tw_rcv_nxt))
+        if (th->syn && !before(TCP_SKB_CB(skb)->seq, rcv_nxt))
             return TCP_TW_RST;

         /* Dup ACK? */
         if (!th->ack ||
-            !after(TCP_SKB_CB(skb)->end_seq, tcptw->tw_rcv_nxt) ||
+            !after(TCP_SKB_CB(skb)->end_seq, rcv_nxt) ||
             TCP_SKB_CB(skb)->end_seq == TCP_SKB_CB(skb)->seq) {
             inet_twsk_put(tw);
             return TCP_TW_SUCCESS;
@@ -146,12 +148,13 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb,
          * reset.
          */
         if (!th->fin ||
-            TCP_SKB_CB(skb)->end_seq != tcptw->tw_rcv_nxt + 1)
+            TCP_SKB_CB(skb)->end_seq != rcv_nxt + 1)
             return TCP_TW_RST;

         /* FIN arrived, enter true time-wait state. */
-        tw->tw_substate = TCP_TIME_WAIT;
-        twsk_rcv_nxt_update(tcptw, TCP_SKB_CB(skb)->end_seq);
+        WRITE_ONCE(tw->tw_substate, TCP_TIME_WAIT);
+        twsk_rcv_nxt_update(tcptw, TCP_SKB_CB(skb)->end_seq,
+                    rcv_nxt);

         if (tmp_opt.saw_tstamp) {
             WRITE_ONCE(tcptw->tw_ts_recent_stamp,
@@ -182,7 +185,7 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb,
      */

     if (!paws_reject &&
-        (TCP_SKB_CB(skb)->seq == tcptw->tw_rcv_nxt &&
+        (TCP_SKB_CB(skb)->seq == rcv_nxt &&
          (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq || th->rst))) {
         /* In window segment, it may be only reset or bare ack. */
@@ -229,7 +232,7 @@ kill:
      */

     if (th->syn && !th->rst && !th->ack && !paws_reject &&
-        (after(TCP_SKB_CB(skb)->seq, tcptw->tw_rcv_nxt) ||
+        (after(TCP_SKB_CB(skb)->seq, rcv_nxt) ||
          (tmp_opt.saw_tstamp &&
           (s32)(READ_ONCE(tcptw->tw_ts_recent) - tmp_opt.rcv_tsval) < 0))) {
         u32 isn = tcptw->tw_snd_nxt + 65535 + 2;
@@ -625,6 +628,8 @@ struct sock *tcp_create_openreq_child(const struct sock *sk,

     __TCP_INC_STATS(sock_net(sk), TCP_MIB_PASSIVEOPENS);

+    xa_init_flags(&newsk->sk_user_frags, XA_FLAGS_ALLOC1);
+
     return newsk;
 }
 EXPORT_SYMBOL(tcp_create_openreq_child);
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 16c48df8df4c..4fd746bd4d54 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -2344,7 +2344,8 @@ static bool tcp_can_coalesce_send_queue_head(struct sock *sk, int len)

         if (unlikely(TCP_SKB_CB(skb)->eor) ||
             tcp_has_tx_tstamp(skb) ||
-            !skb_pure_zcopy_same(skb, next))
+            !skb_pure_zcopy_same(skb, next) ||
+            skb_frags_readable(skb) != skb_frags_readable(next))
             return false;

         len -= skb->len;
@@ -3264,6 +3265,8 @@ static bool tcp_can_collapse(const struct sock *sk, const struct sk_buff *skb)
         return false;
     if (skb_cloned(skb))
         return false;
+    if (!skb_frags_readable(skb))
+        return false;
     /* Some heuristics for collapsing over SACK'd could be invented */
     if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)
         return false;
@@ -3649,7 +3652,7 @@ void tcp_send_active_reset(struct sock *sk, gfp_t priority,
     /* skb of trace_tcp_send_reset() keeps the skb that caused RST,
      * skb here is different to the troublesome skb, so use NULL
      */
-    trace_tcp_send_reset(sk, NULL, SK_RST_REASON_NOT_SPECIFIED);
+    trace_tcp_send_reset(sk, NULL, reason);
 }

 /* Send a crossed SYN-ACK during socket establishment.
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index 4d40615dc8fc..79064580c8c0 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -125,7 +125,7 @@ static int tcp_out_of_resources(struct sock *sk, bool do_reset)
             do_reset = true;
         if (do_reset)
             tcp_send_active_reset(sk, GFP_ATOMIC,
-                          SK_RST_REASON_NOT_SPECIFIED);
+                          SK_RST_REASON_TCP_ABORT_ON_MEMORY);
         tcp_done(sk);
         __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONMEMORY);
         return 1;
@@ -282,6 +282,7 @@ static int tcp_write_timeout(struct sock *sk)
     expired = retransmits_timed_out(sk, retry_until,
                     READ_ONCE(icsk->icsk_user_timeout));
     tcp_fastopen_active_detect_blackhole(sk, expired);
+    mptcp_active_detect_blackhole(sk, expired);

     if (BPF_SOCK_OPS_TEST_FLAG(tp, BPF_SOCK_OPS_RTO_CB_FLAG))
         tcp_call_bpf_3arg(sk, BPF_SOCK_OPS_RTO_CB,
@@ -779,7 +780,7 @@ static void tcp_keepalive_timer (struct timer_list *t)
                 goto out;
             }
         }
-        tcp_send_active_reset(sk, GFP_ATOMIC, SK_RST_REASON_NOT_SPECIFIED);
+        tcp_send_active_reset(sk, GFP_ATOMIC, SK_RST_REASON_TCP_STATE);
         goto death;
     }

@@ -807,7 +808,7 @@ static void tcp_keepalive_timer (struct timer_list *t)
             (user_timeout == 0 &&
              icsk->icsk_probes_out >= keepalive_probes(tp))) {
             tcp_send_active_reset(sk, GFP_ATOMIC,
-                          SK_RST_REASON_NOT_SPECIFIED);
+                          SK_RST_REASON_TCP_KEEPALIVE_TIMEOUT);
             tcp_write_err(sk);
             goto out;
         }
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 49c622e743e8..8accbf4cb295 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -115,6 +115,7 @@
 #include <net/addrconf.h>
 #include <net/udp_tunnel.h>
 #include <net/gro.h>
+#include <net/inet_dscp.h>
 #if IS_ENABLED(CONFIG_IPV6)
 #include <net/ipv6_stubs.h>
 #endif
@@ -365,7 +366,7 @@ int udp_v4_get_port(struct sock *sk, unsigned short snum)
     return udp_lib_get_port(sk, snum, hash2_nulladdr);
 }

-static int compute_score(struct sock *sk, struct net *net,
+static int compute_score(struct sock *sk, const struct net *net,
              __be32 saddr, __be16 sport,
              __be32 daddr, unsigned short hnum,
              int dif, int sdif)
@@ -420,7 +421,7 @@ u32 udp_ehashfn(const struct net *net, const __be32 laddr, const __u16 lport,
 }

 /* called with rcu_read_lock() */
-static struct sock *udp4_lib_lookup2(struct net *net,
+static struct sock *udp4_lib_lookup2(const struct net *net,
                      __be32 saddr, __be16 sport,
                      __be32 daddr, unsigned int hnum,
                      int dif, int sdif,
@@ -480,7 +481,7 @@ rescore:
 /* UDP is nearly always wildcards out the wazoo, it makes no sense to try
  * harder than this. -DaveM
  */
-struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr,
+struct sock *__udp4_lib_lookup(const struct net *net, __be32 saddr,
         __be16 sport, __be32 daddr, __be16 dport, int dif,
         int sdif, struct udp_table *udptable, struct sk_buff *skb)
 {
@@ -561,7 +562,7 @@ struct sock *udp4_lib_lookup_skb(const struct sk_buff *skb,
  * Does increment socket refcount.
  */
 #if IS_ENABLED(CONFIG_NF_TPROXY_IPV4) || IS_ENABLED(CONFIG_NF_SOCKET_IPV4)
-struct sock *udp4_lib_lookup(struct net *net, __be32 saddr, __be16 sport,
+struct sock *udp4_lib_lookup(const struct net *net, __be32 saddr, __be16 sport,
                  __be32 daddr, __be16 dport, int dif)
 {
     struct sock *sk;
@@ -2618,7 +2619,7 @@ int udp_v4_early_demux(struct sk_buff *skb)
         if (!inet_sk(sk)->inet_daddr && in_dev)
             return ip_mc_validate_source(skb, iph->daddr,
                              iph->saddr,
-                             iph->tos & IPTOS_RT_MASK,
+                             iph->tos & INET_DSCP_MASK,
                              skb->dev, in_dev, &itag);
     }
     return 0;
diff --git a/net/ipv4/udp_tunnel_core.c b/net/ipv4/udp_tunnel_core.c
index e4e0fa869fa4..619a53eb672d 100644
--- a/net/ipv4/udp_tunnel_core.c
+++ b/net/ipv4/udp_tunnel_core.c
@@ -6,6 +6,7 @@
 #include <net/dst_metadata.h>
 #include <net/udp.h>
 #include <net/udp_tunnel.h>
+#include <net/inet_dscp.h>

 int udp_sock_create4(struct net *net, struct udp_port_cfg *cfg,
              struct socket **sockp)
@@ -232,7 +233,7 @@ struct rtable *udp_tunnel_dst_lookup(struct sk_buff *skb,
     fl4.saddr = key->u.ipv4.src;
     fl4.fl4_dport = dport;
     fl4.fl4_sport = sport;
-    fl4.flowi4_tos = RT_TOS(tos);
+    fl4.flowi4_tos = tos & INET_DSCP_MASK;
     fl4.flowi4_flags = key->flow_flags;

     rt = ip_route_output_key(net, &fl4);
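
Notes on selected hunks follow. These are editorial sketches for readers of the patch, not part of it; any identifier not visible in the diff above is an assumption and is flagged as such.

The tcp_recvmsg_locked() hunk is the receive side of devmem TCP: skbs whose frags sit in unreadable device memory are only handed to userspace when recvmsg() is called with MSG_SOCK_DEVMEM (otherwise the call fails with EFAULT once nothing has been copied), and last_copied_dmabuf makes the loop stop at a boundary between readable and devmem skbs, so one call never mixes the two kinds. A minimal userspace sketch: MSG_SOCK_DEVMEM appears in this diff, while the SCM_DEVMEM_DMABUF cmsg, its numeric value, and the dmabuf_cmsg layout are assumed from the same patch series and must be checked against your uapi headers.

#include <stdio.h>
#include <stdint.h>
#include <sys/socket.h>
#include <sys/uio.h>

#ifndef MSG_SOCK_DEVMEM
#define MSG_SOCK_DEVMEM 0x2000000          /* assumed value; check linux/socket.h */
#endif
#ifndef SCM_DEVMEM_DMABUF
#define SCM_DEVMEM_DMABUF 79               /* assumed: == SO_DEVMEM_DMABUF */
#endif

struct dmabuf_cmsg {                       /* assumed uapi layout (linux/uio.h) */
    uint64_t frag_offset;
    uint32_t frag_size;
    uint32_t frag_token;
    uint32_t dmabuf_id;
    uint32_t flags;
};

static ssize_t recv_devmem(int fd, char *linear, size_t len)
{
    char ctrl[CMSG_SPACE(sizeof(struct dmabuf_cmsg)) * 16];
    struct iovec iov = { .iov_base = linear, .iov_len = len };
    struct msghdr msg = {
        .msg_iov = &iov, .msg_iovlen = 1,
        .msg_control = ctrl, .msg_controllen = sizeof(ctrl),
    };
    /* Without MSG_SOCK_DEVMEM, a devmem skb at the head of the queue
     * makes recvmsg() fail with EFAULT (see the tcp.c hunk above).
     */
    ssize_t n = recvmsg(fd, &msg, MSG_SOCK_DEVMEM);

    if (n < 0)
        return n;

    for (struct cmsghdr *cm = CMSG_FIRSTHDR(&msg); cm;
         cm = CMSG_NXTHDR(&msg, cm)) {
        if (cm->cmsg_level != SOL_SOCKET ||
            cm->cmsg_type != SCM_DEVMEM_DMABUF)
            continue;

        struct dmabuf_cmsg *dc = (struct dmabuf_cmsg *)CMSG_DATA(cm);

        /* Payload lives in the dma-buf at dc->frag_offset; dc->frag_token
         * must later be handed back via setsockopt(SO_DEVMEM_DONTNEED).
         */
        printf("frag off=%llu len=%u token=%u\n",
               (unsigned long long)dc->frag_offset,
               dc->frag_size, dc->frag_token);
    }
    return n;
}

The frag tokens received this way are what the tcp_ipv4.c hunk reclaims on close: tcp_release_user_frags() walks sk_user_frags (initialized per-socket by the tcp_create_openreq_child() hunk) and returns any frag the application never released.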
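
The tcp_minisocks.c and tcp_ipv4.c hunks all follow one pattern: tw_substate and tw_rcv_nxt are now read without the timewait lock (tcp_twsk_unique(), timewait ACKs, the procfs dump), so writes gain WRITE_ONCE() and each reader takes a single READ_ONCE() snapshot (rcv_nxt) that every subsequent check uses. Below is a compilable userspace model of the idea, with READ_ONCE()/WRITE_ONCE() approximated by relaxed C11 atomics; it is an illustration, not kernel code.

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

static _Atomic uint32_t tw_rcv_nxt;        /* stands in for tcptw->tw_rcv_nxt */

static void rcv_nxt_update(uint32_t seq)   /* writer, cf. twsk_rcv_nxt_update() */
{
    atomic_store_explicit(&tw_rcv_nxt, seq, memory_order_relaxed);
}

static int seg_acceptable(uint32_t seq, uint32_t wnd)
{
    /* Reader: one snapshot, used for every check, so a concurrent
     * update cannot make two checks in the same pass disagree.
     */
    uint32_t rcv_nxt = atomic_load_explicit(&tw_rcv_nxt,
                                            memory_order_relaxed);

    return seq - rcv_nxt < wnd;            /* wrap-safe, cf. before()/after() */
}

int main(void)
{
    rcv_nxt_update(1000);
    printf("%d\n", seg_acceptable(1003, 100));  /* prints 1 */
    return 0;
}

Snapshotting matters because tcp_timewait_state_process() compares the same field several times; re-reading it across a concurrent FIN could make the window check and the duplicate-ACK check disagree within one pass.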
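
The tcp_htcp.c change is behavior-preserving: clamp(val, lo, hi) is the kernel's spelling of min(max(val, lo), hi). Worked numbers for the scale computation, assuming HZ == 1000 so one jiffy of minRTT equals one millisecond; the clamp macro below is a local stand-in for the one in include/linux/minmax.h.

#include <stdio.h>

/* Local stand-in for the kernel's clamp(). */
#define clamp(v, lo, hi) ((v) < (lo) ? (lo) : ((v) > (hi) ? (hi) : (v)))

int main(void)
{
    unsigned int hz = 1000;                  /* assumed HZ */
    unsigned int minrtt[] = { 1, 25, 500 };  /* jiffies == ms at HZ=1000 */

    for (int i = 0; i < 3; i++) {
        unsigned int scale = (hz << 3) / (10 * minrtt[i]);

        /* the interval [0.5,10] in <<3 fixed point is [4, 80] */
        printf("minRTT=%u -> scale=%u -> clamped=%u\n",
               minrtt[i], scale, clamp(scale, 1u << 2, 10u << 3));
    }
    return 0;   /* 800 clamps to 80, 32 passes through, 1 clamps to 4 */
}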
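
In tcp_metrics.c the length check moves out of __parse_nl_addr() and into the netlink policy table, which is why the open-coded nla_len() test can be deleted: the genetlink core now rejects an ADDR_IPV6/SADDR_IPV6 attribute whose payload is not exactly sizeof(struct in6_addr) before the handler ever runs. As recalled from include/net/netlink.h (verify against your tree), the macro is simply a range policy with equal bounds:

#define NLA_POLICY_EXACT_LEN(_len) NLA_POLICY_RANGE(NLA_BINARY, _len, _len)

Expressing the constraint in the policy also gets the caller a proper extack message for free, instead of a bare -EINVAL from the handler.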
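
The udp.c and udp_tunnel_core.c hunks belong to the flowi4_tos/DSCP cleanup: the legacy masks RT_TOS() (IPTOS_TOS_MASK, 0x1e) and IPTOS_RT_MASK (0x1c) keep only the old 4-bit TOS field and therefore truncate the upper DSCP bits, while INET_DSCP_MASK (0xfc) keeps all six DSCP bits and strips only the two ECN bits. A standalone illustration; the mask values are copied from the kernel headers.

#include <stdio.h>
#include <stdint.h>

#define IPTOS_TOS_MASK  0x1e                   /* legacy 4-bit TOS field */
#define RT_TOS(tos)     ((tos) & IPTOS_TOS_MASK)
#define IPTOS_RT_MASK   (IPTOS_TOS_MASK & ~3)  /* 0x1c */
#define INET_DSCP_MASK  0xfc                   /* six DSCP bits, no ECN */

int main(void)
{
    uint8_t tos = 0xb9;     /* DSCP EF (0xb8) with ECN ECT(1) set */

    printf("RT_TOS:         0x%02x\n", RT_TOS(tos));          /* 0x18 */
    printf("IPTOS_RT_MASK:  0x%02x\n", tos & IPTOS_RT_MASK);  /* 0x18 */
    printf("INET_DSCP_MASK: 0x%02x\n", tos & INET_DSCP_MASK); /* 0xb8 */
    return 0;
}

For DSCP EF the difference is visible: the legacy masks reduce 0xb8 to 0x18, so a route lookup keyed on the masked value would have lost the high DSCP bits that INET_DSCP_MASK preserves.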