diff options
Diffstat (limited to 'net/ipv4')
-rw-r--r-- | net/ipv4/arp.c | 6 | ||||
-rw-r--r-- | net/ipv4/devinet.c | 3 | ||||
-rw-r--r-- | net/ipv4/esp4.c | 53 | ||||
-rw-r--r-- | net/ipv4/esp4_offload.c | 6 | ||||
-rw-r--r-- | net/ipv4/fib_frontend.c | 18 | ||||
-rw-r--r-- | net/ipv4/fib_rules.c | 4 | ||||
-rw-r--r-- | net/ipv4/fib_trie.c | 22 | ||||
-rw-r--r-- | net/ipv4/icmp.c | 59 | ||||
-rw-r--r-- | net/ipv4/inet_hashtables.c | 37 | ||||
-rw-r--r-- | net/ipv4/inetpeer.c | 31 | ||||
-rw-r--r-- | net/ipv4/ip_fragment.c | 15 | ||||
-rw-r--r-- | net/ipv4/ip_gre.c | 16 | ||||
-rw-r--r-- | net/ipv4/ip_options.c | 3 | ||||
-rw-r--r-- | net/ipv4/ip_tunnel_core.c | 4 | ||||
-rw-r--r-- | net/ipv4/ipmr.c | 28 | ||||
-rw-r--r-- | net/ipv4/ipmr_base.c | 9 | ||||
-rw-r--r-- | net/ipv4/netfilter/nft_fib_ipv4.c | 20 | ||||
-rw-r--r-- | net/ipv4/route.c | 65 | ||||
-rw-r--r-- | net/ipv4/tcp.c | 57 | ||||
-rw-r--r-- | net/ipv4/tcp_bpf.c | 36 | ||||
-rw-r--r-- | net/ipv4/tcp_cubic.c | 8 | ||||
-rw-r--r-- | net/ipv4/tcp_fastopen.c | 7 | ||||
-rw-r--r-- | net/ipv4/tcp_input.c | 159 | ||||
-rw-r--r-- | net/ipv4/tcp_ipv4.c | 2 | ||||
-rw-r--r-- | net/ipv4/tcp_minisocks.c | 10 | ||||
-rw-r--r-- | net/ipv4/tcp_offload.c | 14 | ||||
-rw-r--r-- | net/ipv4/tcp_output.c | 9 | ||||
-rw-r--r-- | net/ipv4/udp.c | 102 | ||||
-rw-r--r-- | net/ipv4/udp_offload.c | 77 | ||||
-rw-r--r-- | net/ipv4/xfrm4_input.c | 21 |
30 files changed, 560 insertions, 341 deletions
diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index 11c1519b3699..8fb48f42581c 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -659,10 +659,12 @@ static int arp_xmit_finish(struct net *net, struct sock *sk, struct sk_buff *skb */ void arp_xmit(struct sk_buff *skb) { + rcu_read_lock(); /* Send it off, maybe filter it using firewalling first. */ NF_HOOK(NFPROTO_ARP, NF_ARP_OUT, - dev_net(skb->dev), NULL, skb, NULL, skb->dev, + dev_net_rcu(skb->dev), NULL, skb, NULL, skb->dev, arp_xmit_finish); + rcu_read_unlock(); } EXPORT_SYMBOL(arp_xmit); @@ -1075,7 +1077,7 @@ static int arp_req_set_public(struct net *net, struct arpreq *r, __be32 mask = ((struct sockaddr_in *)&r->arp_netmask)->sin_addr.s_addr; if (!dev && (r->arp_flags & ATF_COM)) { - dev = dev_getbyhwaddr_rcu(net, r->arp_ha.sa_family, + dev = dev_getbyhwaddr(net, r->arp_ha.sa_family, r->arp_ha.sa_data); if (!dev) return -ENODEV; diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index 7cf5f7d0d0de..a55e95046984 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -1351,10 +1351,11 @@ __be32 inet_select_addr(const struct net_device *dev, __be32 dst, int scope) __be32 addr = 0; unsigned char localnet_scope = RT_SCOPE_HOST; struct in_device *in_dev; - struct net *net = dev_net(dev); + struct net *net; int master_idx; rcu_read_lock(); + net = dev_net_rcu(dev); in_dev = __in_dev_get_rcu(dev); if (!in_dev) goto no_in_dev; diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c index f3281312eb5e..cbe4c6fc8b8e 100644 --- a/net/ipv4/esp4.c +++ b/net/ipv4/esp4.c @@ -120,47 +120,16 @@ static void esp_ssg_unref(struct xfrm_state *x, void *tmp, struct sk_buff *skb) } #ifdef CONFIG_INET_ESPINTCP -struct esp_tcp_sk { - struct sock *sk; - struct rcu_head rcu; -}; - -static void esp_free_tcp_sk(struct rcu_head *head) -{ - struct esp_tcp_sk *esk = container_of(head, struct esp_tcp_sk, rcu); - - sock_put(esk->sk); - kfree(esk); -} - static struct sock *esp_find_tcp_sk(struct xfrm_state *x) { struct xfrm_encap_tmpl *encap = x->encap; struct net *net = xs_net(x); - struct esp_tcp_sk *esk; __be16 sport, dport; - struct sock *nsk; struct sock *sk; - sk = rcu_dereference(x->encap_sk); - if (sk && sk->sk_state == TCP_ESTABLISHED) - return sk; - spin_lock_bh(&x->lock); sport = encap->encap_sport; dport = encap->encap_dport; - nsk = rcu_dereference_protected(x->encap_sk, - lockdep_is_held(&x->lock)); - if (sk && sk == nsk) { - esk = kmalloc(sizeof(*esk), GFP_ATOMIC); - if (!esk) { - spin_unlock_bh(&x->lock); - return ERR_PTR(-ENOMEM); - } - RCU_INIT_POINTER(x->encap_sk, NULL); - esk->sk = sk; - call_rcu(&esk->rcu, esp_free_tcp_sk); - } spin_unlock_bh(&x->lock); sk = inet_lookup_established(net, net->ipv4.tcp_death_row.hashinfo, x->id.daddr.a4, @@ -173,20 +142,6 @@ static struct sock *esp_find_tcp_sk(struct xfrm_state *x) return ERR_PTR(-EINVAL); } - spin_lock_bh(&x->lock); - nsk = rcu_dereference_protected(x->encap_sk, - lockdep_is_held(&x->lock)); - if (encap->encap_sport != sport || - encap->encap_dport != dport) { - sock_put(sk); - sk = nsk ?: ERR_PTR(-EREMCHG); - } else if (sk == nsk) { - sock_put(sk); - } else { - rcu_assign_pointer(x->encap_sk, sk); - } - spin_unlock_bh(&x->lock); - return sk; } @@ -199,8 +154,10 @@ static int esp_output_tcp_finish(struct xfrm_state *x, struct sk_buff *skb) sk = esp_find_tcp_sk(x); err = PTR_ERR_OR_ZERO(sk); - if (err) + if (err) { + kfree_skb(skb); goto out; + } bh_lock_sock(sk); if (sock_owned_by_user(sk)) @@ -209,6 +166,8 @@ static int esp_output_tcp_finish(struct xfrm_state *x, struct sk_buff *skb) err = espintcp_push_skb(sk, skb); bh_unlock_sock(sk); + sock_put(sk); + out: rcu_read_unlock(); return err; @@ -392,6 +351,8 @@ static struct ip_esp_hdr *esp_output_tcp_encap(struct xfrm_state *x, if (IS_ERR(sk)) return ERR_CAST(sk); + sock_put(sk); + *lenp = htons(len); esph = (struct ip_esp_hdr *)(lenp + 1); diff --git a/net/ipv4/esp4_offload.c b/net/ipv4/esp4_offload.c index 80c4ea0e12f4..e0d94270da28 100644 --- a/net/ipv4/esp4_offload.c +++ b/net/ipv4/esp4_offload.c @@ -53,9 +53,9 @@ static struct sk_buff *esp4_gro_receive(struct list_head *head, if (sp->len == XFRM_MAX_DEPTH) goto out_reset; - x = xfrm_state_lookup(dev_net(skb->dev), skb->mark, - (xfrm_address_t *)&ip_hdr(skb)->daddr, - spi, IPPROTO_ESP, AF_INET); + x = xfrm_input_state_lookup(dev_net(skb->dev), skb->mark, + (xfrm_address_t *)&ip_hdr(skb)->daddr, + spi, IPPROTO_ESP, AF_INET); if (unlikely(x && x->dir && x->dir != XFRM_SA_DIR_IN)) { /* non-offload path will record the error and audit log */ diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 793e6781399a..5b7c41333d6f 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -829,19 +829,33 @@ static int rtm_to_fib_config(struct net *net, struct sk_buff *skb, } } + if (cfg->fc_dst_len > 32) { + NL_SET_ERR_MSG(extack, "Invalid prefix length"); + err = -EINVAL; + goto errout; + } + + if (cfg->fc_dst_len < 32 && (ntohl(cfg->fc_dst) << cfg->fc_dst_len)) { + NL_SET_ERR_MSG(extack, "Invalid prefix for given prefix length"); + err = -EINVAL; + goto errout; + } + if (cfg->fc_nh_id) { if (cfg->fc_oif || cfg->fc_gw_family || cfg->fc_encap || cfg->fc_mp) { NL_SET_ERR_MSG(extack, "Nexthop specification and nexthop id are mutually exclusive"); - return -EINVAL; + err = -EINVAL; + goto errout; } } if (has_gw && has_via) { NL_SET_ERR_MSG(extack, "Nexthop configuration can not contain both GATEWAY and VIA"); - return -EINVAL; + err = -EINVAL; + goto errout; } if (!cfg->fc_table) diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c index b07292d50ee7..4563e5303c1a 100644 --- a/net/ipv4/fib_rules.c +++ b/net/ipv4/fib_rules.c @@ -245,9 +245,9 @@ static int fib4_rule_configure(struct fib_rule *rule, struct sk_buff *skb, struct nlattr **tb, struct netlink_ext_ack *extack) { - struct net *net = sock_net(skb->sk); + struct fib4_rule *rule4 = (struct fib4_rule *)rule; + struct net *net = rule->fr_net; int err = -EINVAL; - struct fib4_rule *rule4 = (struct fib4_rule *) rule; if (!inet_validate_dscp(frh->tos)) { NL_SET_ERR_MSG(extack, diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index 09e31757e96c..cc86031d2050 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -1193,22 +1193,6 @@ static int fib_insert_alias(struct trie *t, struct key_vector *tp, return 0; } -static bool fib_valid_key_len(u32 key, u8 plen, struct netlink_ext_ack *extack) -{ - if (plen > KEYLENGTH) { - NL_SET_ERR_MSG(extack, "Invalid prefix length"); - return false; - } - - if ((plen < KEYLENGTH) && (key << plen)) { - NL_SET_ERR_MSG(extack, - "Invalid prefix for given prefix length"); - return false; - } - - return true; -} - static void fib_remove_alias(struct trie *t, struct key_vector *tp, struct key_vector *l, struct fib_alias *old); @@ -1229,9 +1213,6 @@ int fib_table_insert(struct net *net, struct fib_table *tb, key = ntohl(cfg->fc_dst); - if (!fib_valid_key_len(key, plen, extack)) - return -EINVAL; - pr_debug("Insert table=%u %08x/%d\n", tb->tb_id, key, plen); fi = fib_create_info(cfg, extack); @@ -1723,9 +1704,6 @@ int fib_table_delete(struct net *net, struct fib_table *tb, key = ntohl(cfg->fc_dst); - if (!fib_valid_key_len(key, plen, extack)) - return -EINVAL; - l = fib_find_node(t, &tp, key); if (!l) return -ESRCH; diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index c3ad41573b33..b8111ec651b5 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -312,7 +312,6 @@ static bool icmpv4_xrlim_allow(struct net *net, struct rtable *rt, struct dst_entry *dst = &rt->dst; struct inet_peer *peer; bool rc = true; - int vif; if (!apply_ratelimit) return true; @@ -321,12 +320,12 @@ static bool icmpv4_xrlim_allow(struct net *net, struct rtable *rt, if (dst->dev && (dst->dev->flags&IFF_LOOPBACK)) goto out; - vif = l3mdev_master_ifindex(dst->dev); - peer = inet_getpeer_v4(net->ipv4.peers, fl4->daddr, vif, 1); + rcu_read_lock(); + peer = inet_getpeer_v4(net->ipv4.peers, fl4->daddr, + l3mdev_master_ifindex_rcu(dst->dev)); rc = inet_peer_xrlim_allow(peer, READ_ONCE(net->ipv4.sysctl_icmp_ratelimit)); - if (peer) - inet_putpeer(peer); + rcu_read_unlock(); out: if (!rc) __ICMP_INC_STATS(net, ICMP_MIB_RATELIMITHOST); @@ -400,10 +399,10 @@ static void icmp_push_reply(struct sock *sk, static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb) { - struct ipcm_cookie ipc; struct rtable *rt = skb_rtable(skb); - struct net *net = dev_net(rt->dst.dev); + struct net *net = dev_net_rcu(rt->dst.dev); bool apply_ratelimit = false; + struct ipcm_cookie ipc; struct flowi4 fl4; struct sock *sk; struct inet_sock *inet; @@ -478,13 +477,11 @@ static struct net_device *icmp_get_route_lookup_dev(struct sk_buff *skb) return route_lookup_dev; } -static struct rtable *icmp_route_lookup(struct net *net, - struct flowi4 *fl4, +static struct rtable *icmp_route_lookup(struct net *net, struct flowi4 *fl4, struct sk_buff *skb_in, - const struct iphdr *iph, - __be32 saddr, u8 tos, u32 mark, - int type, int code, - struct icmp_bxm *param) + const struct iphdr *iph, __be32 saddr, + dscp_t dscp, u32 mark, int type, + int code, struct icmp_bxm *param) { struct net_device *route_lookup_dev; struct dst_entry *dst, *dst2; @@ -498,7 +495,7 @@ static struct rtable *icmp_route_lookup(struct net *net, fl4->saddr = saddr; fl4->flowi4_mark = mark; fl4->flowi4_uid = sock_net_uid(net, NULL); - fl4->flowi4_tos = tos & INET_DSCP_MASK; + fl4->flowi4_tos = inet_dscp_to_dsfield(dscp); fl4->flowi4_proto = IPPROTO_ICMP; fl4->fl4_icmp_type = type; fl4->fl4_icmp_code = code; @@ -550,7 +547,7 @@ static struct rtable *icmp_route_lookup(struct net *net, orefdst = skb_in->_skb_refdst; /* save old refdst */ skb_dst_set(skb_in, NULL); err = ip_route_input(skb_in, fl4_dec.daddr, fl4_dec.saddr, - tos, rt2->dst.dev); + dscp, rt2->dst.dev); dst_release(&rt2->dst); rt2 = skb_rtable(skb_in); @@ -611,12 +608,14 @@ void __icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info, struct sock *sk; if (!rt) - goto out; + return; + + rcu_read_lock(); if (rt->dst.dev) - net = dev_net(rt->dst.dev); + net = dev_net_rcu(rt->dst.dev); else if (skb_in->dev) - net = dev_net(skb_in->dev); + net = dev_net_rcu(skb_in->dev); else goto out; @@ -744,8 +743,9 @@ void __icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info, ipc.opt = &icmp_param.replyopts.opt; ipc.sockc.mark = mark; - rt = icmp_route_lookup(net, &fl4, skb_in, iph, saddr, tos, mark, - type, code, &icmp_param); + rt = icmp_route_lookup(net, &fl4, skb_in, iph, saddr, + inet_dsfield_to_dscp(tos), mark, type, code, + &icmp_param); if (IS_ERR(rt)) goto out_unlock; @@ -787,7 +787,8 @@ out_unlock: icmp_xmit_unlock(sk); out_bh_enable: local_bh_enable(); -out:; +out: + rcu_read_unlock(); } EXPORT_SYMBOL(__icmp_send); @@ -836,7 +837,7 @@ static void icmp_socket_deliver(struct sk_buff *skb, u32 info) * avoid additional coding at protocol handlers. */ if (!pskb_may_pull(skb, iph->ihl * 4 + 8)) { - __ICMP_INC_STATS(dev_net(skb->dev), ICMP_MIB_INERRORS); + __ICMP_INC_STATS(dev_net_rcu(skb->dev), ICMP_MIB_INERRORS); return; } @@ -870,7 +871,7 @@ static enum skb_drop_reason icmp_unreach(struct sk_buff *skb) struct net *net; u32 info = 0; - net = dev_net(skb_dst(skb)->dev); + net = dev_net_rcu(skb_dst(skb)->dev); /* * Incomplete header ? @@ -981,7 +982,7 @@ out_err: static enum skb_drop_reason icmp_redirect(struct sk_buff *skb) { if (skb->len < sizeof(struct iphdr)) { - __ICMP_INC_STATS(dev_net(skb->dev), ICMP_MIB_INERRORS); + __ICMP_INC_STATS(dev_net_rcu(skb->dev), ICMP_MIB_INERRORS); return SKB_DROP_REASON_PKT_TOO_SMALL; } @@ -1013,7 +1014,7 @@ static enum skb_drop_reason icmp_echo(struct sk_buff *skb) struct icmp_bxm icmp_param; struct net *net; - net = dev_net(skb_dst(skb)->dev); + net = dev_net_rcu(skb_dst(skb)->dev); /* should there be an ICMP stat for ignored echos? */ if (READ_ONCE(net->ipv4.sysctl_icmp_echo_ignore_all)) return SKB_NOT_DROPPED_YET; @@ -1042,9 +1043,9 @@ static enum skb_drop_reason icmp_echo(struct sk_buff *skb) bool icmp_build_probe(struct sk_buff *skb, struct icmphdr *icmphdr) { + struct net *net = dev_net_rcu(skb->dev); struct icmp_ext_hdr *ext_hdr, _ext_hdr; struct icmp_ext_echo_iio *iio, _iio; - struct net *net = dev_net(skb->dev); struct inet6_dev *in6_dev; struct in_device *in_dev; struct net_device *dev; @@ -1183,7 +1184,7 @@ static enum skb_drop_reason icmp_timestamp(struct sk_buff *skb) return SKB_NOT_DROPPED_YET; out_err: - __ICMP_INC_STATS(dev_net(skb_dst(skb)->dev), ICMP_MIB_INERRORS); + __ICMP_INC_STATS(dev_net_rcu(skb_dst(skb)->dev), ICMP_MIB_INERRORS); return SKB_DROP_REASON_PKT_TOO_SMALL; } @@ -1200,7 +1201,7 @@ int icmp_rcv(struct sk_buff *skb) { enum skb_drop_reason reason = SKB_DROP_REASON_NOT_SPECIFIED; struct rtable *rt = skb_rtable(skb); - struct net *net = dev_net(rt->dst.dev); + struct net *net = dev_net_rcu(rt->dst.dev); struct icmphdr *icmph; if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) { @@ -1373,9 +1374,9 @@ int icmp_err(struct sk_buff *skb, u32 info) struct iphdr *iph = (struct iphdr *)skb->data; int offset = iph->ihl<<2; struct icmphdr *icmph = (struct icmphdr *)(skb->data + offset); + struct net *net = dev_net_rcu(skb->dev); int type = icmp_hdr(skb)->type; int code = icmp_hdr(skb)->code; - struct net *net = dev_net(skb->dev); /* * Use ping_err to handle all icmp errors except those diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index 9bfcfd016e18..2b4a58824763 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -1230,22 +1230,37 @@ int inet_ehash_locks_alloc(struct inet_hashinfo *hashinfo) { unsigned int locksz = sizeof(spinlock_t); unsigned int i, nblocks = 1; + spinlock_t *ptr = NULL; - if (locksz != 0) { - /* allocate 2 cache lines or at least one spinlock per cpu */ - nblocks = max(2U * L1_CACHE_BYTES / locksz, 1U); - nblocks = roundup_pow_of_two(nblocks * num_possible_cpus()); + if (locksz == 0) + goto set_mask; - /* no more locks than number of hash buckets */ - nblocks = min(nblocks, hashinfo->ehash_mask + 1); + /* Allocate 2 cache lines or at least one spinlock per cpu. */ + nblocks = max(2U * L1_CACHE_BYTES / locksz, 1U) * num_possible_cpus(); - hashinfo->ehash_locks = kvmalloc_array(nblocks, locksz, GFP_KERNEL); - if (!hashinfo->ehash_locks) - return -ENOMEM; + /* At least one page per NUMA node. */ + nblocks = max(nblocks, num_online_nodes() * PAGE_SIZE / locksz); + + nblocks = roundup_pow_of_two(nblocks); + + /* No more locks than number of hash buckets. */ + nblocks = min(nblocks, hashinfo->ehash_mask + 1); - for (i = 0; i < nblocks; i++) - spin_lock_init(&hashinfo->ehash_locks[i]); + if (num_online_nodes() > 1) { + /* Use vmalloc() to allow NUMA policy to spread pages + * on all available nodes if desired. + */ + ptr = vmalloc_array(nblocks, locksz); + } + if (!ptr) { + ptr = kvmalloc_array(nblocks, locksz, GFP_KERNEL); + if (!ptr) + return -ENOMEM; } + for (i = 0; i < nblocks; i++) + spin_lock_init(&ptr[i]); + hashinfo->ehash_locks = ptr; +set_mask: hashinfo->ehash_locks_mask = nblocks - 1; return 0; } diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c index 5bd759963451..9c5ffe3b5f77 100644 --- a/net/ipv4/inetpeer.c +++ b/net/ipv4/inetpeer.c @@ -95,6 +95,7 @@ static struct inet_peer *lookup(const struct inetpeer_addr *daddr, { struct rb_node **pp, *parent, *next; struct inet_peer *p; + u32 now; pp = &base->rb_root.rb_node; parent = NULL; @@ -108,8 +109,9 @@ static struct inet_peer *lookup(const struct inetpeer_addr *daddr, p = rb_entry(parent, struct inet_peer, rb_node); cmp = inetpeer_addr_cmp(daddr, &p->daddr); if (cmp == 0) { - if (!refcount_inc_not_zero(&p->refcnt)) - break; + now = jiffies; + if (READ_ONCE(p->dtime) != now) + WRITE_ONCE(p->dtime, now); return p; } if (gc_stack) { @@ -155,9 +157,6 @@ static void inet_peer_gc(struct inet_peer_base *base, for (i = 0; i < gc_cnt; i++) { p = gc_stack[i]; - /* The READ_ONCE() pairs with the WRITE_ONCE() - * in inet_putpeer() - */ delta = (__u32)jiffies - READ_ONCE(p->dtime); if (delta < ttl || !refcount_dec_if_one(&p->refcnt)) @@ -173,31 +172,23 @@ static void inet_peer_gc(struct inet_peer_base *base, } } +/* Must be called under RCU : No refcount change is done here. */ struct inet_peer *inet_getpeer(struct inet_peer_base *base, - const struct inetpeer_addr *daddr, - int create) + const struct inetpeer_addr *daddr) { struct inet_peer *p, *gc_stack[PEER_MAX_GC]; struct rb_node **pp, *parent; unsigned int gc_cnt, seq; - int invalidated; /* Attempt a lockless lookup first. * Because of a concurrent writer, we might not find an existing entry. */ - rcu_read_lock(); seq = read_seqbegin(&base->lock); p = lookup(daddr, base, seq, NULL, &gc_cnt, &parent, &pp); - invalidated = read_seqretry(&base->lock, seq); - rcu_read_unlock(); if (p) return p; - /* If no writer did a change during our lookup, we can return early. */ - if (!create && !invalidated) - return NULL; - /* retry an exact lookup, taking the lock before. * At least, nodes should be hot in our cache. */ @@ -206,12 +197,12 @@ struct inet_peer *inet_getpeer(struct inet_peer_base *base, gc_cnt = 0; p = lookup(daddr, base, seq, gc_stack, &gc_cnt, &parent, &pp); - if (!p && create) { + if (!p) { p = kmem_cache_alloc(peer_cachep, GFP_ATOMIC); if (p) { p->daddr = *daddr; p->dtime = (__u32)jiffies; - refcount_set(&p->refcnt, 2); + refcount_set(&p->refcnt, 1); atomic_set(&p->rid, 0); p->metrics[RTAX_LOCK-1] = INETPEER_METRICS_NEW; p->rate_tokens = 0; @@ -236,15 +227,9 @@ EXPORT_SYMBOL_GPL(inet_getpeer); void inet_putpeer(struct inet_peer *p) { - /* The WRITE_ONCE() pairs with itself (we run lockless) - * and the READ_ONCE() in inet_peer_gc() - */ - WRITE_ONCE(p->dtime, (__u32)jiffies); - if (refcount_dec_and_test(&p->refcnt)) call_rcu(&p->rcu, inetpeer_free_rcu); } -EXPORT_SYMBOL_GPL(inet_putpeer); /* * Check transmit rate limitation for given message. diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index a92664a5ef2e..9ca0a183a55f 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -82,15 +82,20 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *skb, static void ip4_frag_init(struct inet_frag_queue *q, const void *a) { struct ipq *qp = container_of(q, struct ipq, q); - struct net *net = q->fqdir->net; - const struct frag_v4_compare_key *key = a; + struct net *net = q->fqdir->net; + struct inet_peer *p = NULL; q->key.v4 = *key; qp->ecn = 0; - qp->peer = q->fqdir->max_dist ? - inet_getpeer_v4(net->ipv4.peers, key->saddr, key->vif, 1) : - NULL; + if (q->fqdir->max_dist) { + rcu_read_lock(); + p = inet_getpeer_v4(net->ipv4.peers, key->saddr, key->vif); + if (p && !refcount_inc_not_zero(&p->refcnt)) + p = NULL; + rcu_read_unlock(); + } + qp->peer = p; } static void ip4_frag_free(struct inet_frag_queue *q) diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index f1f31ebfc793..9667f2774025 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -141,7 +141,6 @@ static int ipgre_err(struct sk_buff *skb, u32 info, const struct iphdr *iph; const int type = icmp_hdr(skb)->type; const int code = icmp_hdr(skb)->code; - unsigned int data_len = 0; struct ip_tunnel *t; if (tpi->proto == htons(ETH_P_TEB)) @@ -182,7 +181,6 @@ static int ipgre_err(struct sk_buff *skb, u32 info, case ICMP_TIME_EXCEEDED: if (code != ICMP_EXC_TTL) return 0; - data_len = icmp_hdr(skb)->un.reserved[1] * 4; /* RFC 4884 4.1 */ break; case ICMP_REDIRECT: @@ -190,10 +188,16 @@ static int ipgre_err(struct sk_buff *skb, u32 info, } #if IS_ENABLED(CONFIG_IPV6) - if (tpi->proto == htons(ETH_P_IPV6) && - !ip6_err_gen_icmpv6_unreach(skb, iph->ihl * 4 + tpi->hdr_len, - type, data_len)) - return 0; + if (tpi->proto == htons(ETH_P_IPV6)) { + unsigned int data_len = 0; + + if (type == ICMP_TIME_EXCEEDED) + data_len = icmp_hdr(skb)->un.reserved[1] * 4; /* RFC 4884 4.1 */ + + if (!ip6_err_gen_icmpv6_unreach(skb, iph->ihl * 4 + tpi->hdr_len, + type, data_len)) + return 0; + } #endif if (t->parms.iph.daddr == 0 || diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c index 68aedb8877b9..81e86e5defee 100644 --- a/net/ipv4/ip_options.c +++ b/net/ipv4/ip_options.c @@ -617,7 +617,8 @@ int ip_options_rcv_srr(struct sk_buff *skb, struct net_device *dev) orefdst = skb->_skb_refdst; skb_dst_set(skb, NULL); - err = ip_route_input(skb, nexthop, iph->saddr, iph->tos, dev); + err = ip_route_input(skb, nexthop, iph->saddr, ip4h_dscp(iph), + dev); rt2 = skb_rtable(skb); if (err || (rt2->rt_type != RTN_UNICAST && rt2->rt_type != RTN_LOCAL)) { skb_dst_drop(skb); diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c index a3676155be78..f65d2f727381 100644 --- a/net/ipv4/ip_tunnel_core.c +++ b/net/ipv4/ip_tunnel_core.c @@ -416,7 +416,7 @@ int skb_tunnel_check_pmtu(struct sk_buff *skb, struct dst_entry *encap_dst, skb_dst_update_pmtu_no_confirm(skb, mtu); - if (!reply || skb->pkt_type == PACKET_HOST) + if (!reply) return 0; if (skb->protocol == htons(ETH_P_IP)) @@ -451,7 +451,7 @@ static const struct nla_policy geneve_opt_policy[LWTUNNEL_IP_OPT_GENEVE_MAX + 1] = { [LWTUNNEL_IP_OPT_GENEVE_CLASS] = { .type = NLA_U16 }, [LWTUNNEL_IP_OPT_GENEVE_TYPE] = { .type = NLA_U8 }, - [LWTUNNEL_IP_OPT_GENEVE_DATA] = { .type = NLA_BINARY, .len = 128 }, + [LWTUNNEL_IP_OPT_GENEVE_DATA] = { .type = NLA_BINARY, .len = 127 }, }; static const struct nla_policy diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 449a2ac40bdc..de0d9cc7806a 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -817,7 +817,7 @@ static void ipmr_update_thresholds(struct mr_table *mrt, struct mr_mfc *cache, cache->mfc_un.res.maxvif = vifi + 1; } } - cache->mfc_un.res.lastuse = jiffies; + WRITE_ONCE(cache->mfc_un.res.lastuse, jiffies); } static int vif_add(struct net *net, struct mr_table *mrt, @@ -1667,9 +1667,9 @@ int ipmr_ioctl(struct sock *sk, int cmd, void *arg) rcu_read_lock(); c = ipmr_cache_find(mrt, sr->src.s_addr, sr->grp.s_addr); if (c) { - sr->pktcnt = c->_c.mfc_un.res.pkt; - sr->bytecnt = c->_c.mfc_un.res.bytes; - sr->wrong_if = c->_c.mfc_un.res.wrong_if; + sr->pktcnt = atomic_long_read(&c->_c.mfc_un.res.pkt); + sr->bytecnt = atomic_long_read(&c->_c.mfc_un.res.bytes); + sr->wrong_if = atomic_long_read(&c->_c.mfc_un.res.wrong_if); rcu_read_unlock(); return 0; } @@ -1739,9 +1739,9 @@ int ipmr_compat_ioctl(struct sock *sk, unsigned int cmd, void __user *arg) rcu_read_lock(); c = ipmr_cache_find(mrt, sr.src.s_addr, sr.grp.s_addr); if (c) { - sr.pktcnt = c->_c.mfc_un.res.pkt; - sr.bytecnt = c->_c.mfc_un.res.bytes; - sr.wrong_if = c->_c.mfc_un.res.wrong_if; + sr.pktcnt = atomic_long_read(&c->_c.mfc_un.res.pkt); + sr.bytecnt = atomic_long_read(&c->_c.mfc_un.res.bytes); + sr.wrong_if = atomic_long_read(&c->_c.mfc_un.res.wrong_if); rcu_read_unlock(); if (copy_to_user(arg, &sr, sizeof(sr))) @@ -1974,9 +1974,9 @@ static void ip_mr_forward(struct net *net, struct mr_table *mrt, int vif, ct; vif = c->_c.mfc_parent; - c->_c.mfc_un.res.pkt++; - c->_c.mfc_un.res.bytes += skb->len; - c->_c.mfc_un.res.lastuse = jiffies; + atomic_long_inc(&c->_c.mfc_un.res.pkt); + atomic_long_add(skb->len, &c->_c.mfc_un.res.bytes); + WRITE_ONCE(c->_c.mfc_un.res.lastuse, jiffies); if (c->mfc_origin == htonl(INADDR_ANY) && true_vifi >= 0) { struct mfc_cache *cache_proxy; @@ -2007,7 +2007,7 @@ static void ip_mr_forward(struct net *net, struct mr_table *mrt, goto dont_forward; } - c->_c.mfc_un.res.wrong_if++; + atomic_long_inc(&c->_c.mfc_un.res.wrong_if); if (true_vifi >= 0 && mrt->mroute_do_assert && /* pimsm uses asserts, when switching from RPT to SPT, @@ -3015,9 +3015,9 @@ static int ipmr_mfc_seq_show(struct seq_file *seq, void *v) if (it->cache != &mrt->mfc_unres_queue) { seq_printf(seq, " %8lu %8lu %8lu", - mfc->_c.mfc_un.res.pkt, - mfc->_c.mfc_un.res.bytes, - mfc->_c.mfc_un.res.wrong_if); + atomic_long_read(&mfc->_c.mfc_un.res.pkt), + atomic_long_read(&mfc->_c.mfc_un.res.bytes), + atomic_long_read(&mfc->_c.mfc_un.res.wrong_if)); for (n = mfc->_c.mfc_un.res.minvif; n < mfc->_c.mfc_un.res.maxvif; n++) { if (VIF_EXISTS(mrt, n) && diff --git a/net/ipv4/ipmr_base.c b/net/ipv4/ipmr_base.c index f0af12a2f70b..28d77d454d44 100644 --- a/net/ipv4/ipmr_base.c +++ b/net/ipv4/ipmr_base.c @@ -263,9 +263,9 @@ int mr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb, lastuse = READ_ONCE(c->mfc_un.res.lastuse); lastuse = time_after_eq(jiffies, lastuse) ? jiffies - lastuse : 0; - mfcs.mfcs_packets = c->mfc_un.res.pkt; - mfcs.mfcs_bytes = c->mfc_un.res.bytes; - mfcs.mfcs_wrong_if = c->mfc_un.res.wrong_if; + mfcs.mfcs_packets = atomic_long_read(&c->mfc_un.res.pkt); + mfcs.mfcs_bytes = atomic_long_read(&c->mfc_un.res.bytes); + mfcs.mfcs_wrong_if = atomic_long_read(&c->mfc_un.res.wrong_if); if (nla_put_64bit(skb, RTA_MFC_STATS, sizeof(mfcs), &mfcs, RTA_PAD) || nla_put_u64_64bit(skb, RTA_EXPIRES, jiffies_to_clock_t(lastuse), RTA_PAD)) @@ -330,9 +330,6 @@ next_entry: list_for_each_entry(mfc, &mrt->mfc_unres_queue, list) { if (e < s_e) goto next_entry2; - if (filter->dev && - !mr_mfc_uses_dev(mrt, mfc, filter->dev)) - goto next_entry2; err = fill(mrt, skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, mfc, RTM_NEWROUTE, flags); diff --git a/net/ipv4/netfilter/nft_fib_ipv4.c b/net/ipv4/netfilter/nft_fib_ipv4.c index 09fff5d424ef..f514eb52b8d4 100644 --- a/net/ipv4/netfilter/nft_fib_ipv4.c +++ b/net/ipv4/netfilter/nft_fib_ipv4.c @@ -49,7 +49,12 @@ void nft_fib4_eval_type(const struct nft_expr *expr, struct nft_regs *regs, else addr = iph->saddr; - *dst = inet_dev_addr_type(nft_net(pkt), dev, addr); + if (priv->flags & (NFTA_FIB_F_IIF | NFTA_FIB_F_OIF)) { + *dst = inet_dev_addr_type(nft_net(pkt), dev, addr); + return; + } + + *dst = inet_addr_type_dev_table(nft_net(pkt), pkt->skb->dev, addr); } EXPORT_SYMBOL_GPL(nft_fib4_eval_type); @@ -64,12 +69,17 @@ void nft_fib4_eval(const struct nft_expr *expr, struct nft_regs *regs, struct flowi4 fl4 = { .flowi4_scope = RT_SCOPE_UNIVERSE, .flowi4_iif = LOOPBACK_IFINDEX, + .flowi4_proto = pkt->tprot, .flowi4_uid = sock_net_uid(nft_net(pkt), NULL), - .flowi4_l3mdev = l3mdev_master_ifindex_rcu(nft_in(pkt)), }; const struct net_device *oif; const struct net_device *found; + if (nft_fib_can_skip(pkt)) { + nft_fib_store_result(dest, priv, nft_in(pkt)); + return; + } + /* * Do not set flowi4_oif, it restricts results (for example, asking * for oif 3 will get RTN_UNICAST result even if the daddr exits @@ -84,11 +94,7 @@ void nft_fib4_eval(const struct nft_expr *expr, struct nft_regs *regs, else oif = NULL; - if (nft_hook(pkt) == NF_INET_PRE_ROUTING && - nft_fib_is_loopback(pkt->skb, nft_in(pkt))) { - nft_fib_store_result(dest, priv, nft_in(pkt)); - return; - } + fl4.flowi4_l3mdev = nft_fib_l3mdev_master_ifindex_rcu(pkt, oif); iph = skb_header_pointer(pkt->skb, noff, sizeof(_iph), &_iph); if (!iph) { diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 723ac9181558..9a5c9497b393 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -189,7 +189,11 @@ const __u8 ip_tos2prio[16] = { EXPORT_SYMBOL(ip_tos2prio); static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat); +#ifndef CONFIG_PREEMPT_RT #define RT_CACHE_STAT_INC(field) raw_cpu_inc(rt_cache_stat.field) +#else +#define RT_CACHE_STAT_INC(field) this_cpu_inc(rt_cache_stat.field) +#endif #ifdef CONFIG_PROC_FS static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos) @@ -390,7 +394,13 @@ static inline int ip_rt_proc_init(void) static inline bool rt_is_expired(const struct rtable *rth) { - return rth->rt_genid != rt_genid_ipv4(dev_net(rth->dst.dev)); + bool res; + + rcu_read_lock(); + res = rth->rt_genid != rt_genid_ipv4(dev_net_rcu(rth->dst.dev)); + rcu_read_unlock(); + + return res; } void rt_cache_flush(struct net *net) @@ -870,11 +880,11 @@ void ip_rt_send_redirect(struct sk_buff *skb) } log_martians = IN_DEV_LOG_MARTIANS(in_dev); vif = l3mdev_master_ifindex_rcu(rt->dst.dev); - rcu_read_unlock(); net = dev_net(rt->dst.dev); - peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, vif, 1); + peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, vif); if (!peer) { + rcu_read_unlock(); icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt_nexthop(rt, ip_hdr(skb)->daddr)); return; @@ -893,7 +903,7 @@ void ip_rt_send_redirect(struct sk_buff *skb) */ if (peer->n_redirects >= ip_rt_redirect_number) { peer->rate_last = jiffies; - goto out_put_peer; + goto out_unlock; } /* Check for load limit; set rate_last to the latest sent @@ -914,8 +924,8 @@ void ip_rt_send_redirect(struct sk_buff *skb) &ip_hdr(skb)->saddr, inet_iif(skb), &ip_hdr(skb)->daddr, &gw); } -out_put_peer: - inet_putpeer(peer); +out_unlock: + rcu_read_unlock(); } static int ip_error(struct sk_buff *skb) @@ -975,9 +985,9 @@ static int ip_error(struct sk_buff *skb) break; } + rcu_read_lock(); peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, - l3mdev_master_ifindex(skb->dev), 1); - + l3mdev_master_ifindex_rcu(skb->dev)); send = true; if (peer) { now = jiffies; @@ -989,8 +999,9 @@ static int ip_error(struct sk_buff *skb) peer->rate_tokens -= ip_rt_error_cost; else send = false; - inet_putpeer(peer); } + rcu_read_unlock(); + if (send) icmp_send(skb, ICMP_DEST_UNREACH, code, 0); @@ -1001,9 +1012,9 @@ out: kfree_skb_reason(skb, reason); static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu) { struct dst_entry *dst = &rt->dst; - struct net *net = dev_net(dst->dev); struct fib_result res; bool lock = false; + struct net *net; u32 old_mtu; if (ip_mtu_locked(dst)) @@ -1013,6 +1024,8 @@ static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu) if (old_mtu < mtu) return; + rcu_read_lock(); + net = dev_net_rcu(dst->dev); if (mtu < net->ipv4.ip_rt_min_pmtu) { lock = true; mtu = min(old_mtu, net->ipv4.ip_rt_min_pmtu); @@ -1020,17 +1033,29 @@ static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu) if (rt->rt_pmtu == mtu && !lock && time_before(jiffies, dst->expires - net->ipv4.ip_rt_mtu_expires / 2)) - return; + goto out; - rcu_read_lock(); if (fib_lookup(net, fl4, &res, 0) == 0) { struct fib_nh_common *nhc; fib_select_path(net, &res, fl4, NULL); +#ifdef CONFIG_IP_ROUTE_MULTIPATH + if (fib_info_num_path(res.fi) > 1) { + int nhsel; + + for (nhsel = 0; nhsel < fib_info_num_path(res.fi); nhsel++) { + nhc = fib_info_nhc(res.fi, nhsel); + update_or_create_fnhe(nhc, fl4->daddr, 0, mtu, lock, + jiffies + net->ipv4.ip_rt_mtu_expires); + } + goto out; + } +#endif /* CONFIG_IP_ROUTE_MULTIPATH */ nhc = FIB_RES_NHC(res); update_or_create_fnhe(nhc, fl4->daddr, 0, mtu, lock, jiffies + net->ipv4.ip_rt_mtu_expires); } +out: rcu_read_unlock(); } @@ -1293,10 +1318,15 @@ static void set_class_tag(struct rtable *rt, u32 tag) static unsigned int ipv4_default_advmss(const struct dst_entry *dst) { - struct net *net = dev_net(dst->dev); unsigned int header_size = sizeof(struct tcphdr) + sizeof(struct iphdr); - unsigned int advmss = max_t(unsigned int, ipv4_mtu(dst) - header_size, - net->ipv4.ip_rt_min_advmss); + unsigned int advmss; + struct net *net; + + rcu_read_lock(); + net = dev_net_rcu(dst->dev); + advmss = max_t(unsigned int, ipv4_mtu(dst) - header_size, + net->ipv4.ip_rt_min_advmss); + rcu_read_unlock(); return min(advmss, IPV4_MAX_PMTU - header_size); } @@ -1654,8 +1684,8 @@ struct rtable *rt_dst_clone(struct net_device *dev, struct rtable *rt) else if (rt->rt_gw_family == AF_INET6) new_rt->rt_gw6 = rt->rt_gw6; - new_rt->dst.input = rt->dst.input; - new_rt->dst.output = rt->dst.output; + new_rt->dst.input = READ_ONCE(rt->dst.input); + new_rt->dst.output = READ_ONCE(rt->dst.output); new_rt->dst.error = rt->dst.error; new_rt->dst.lastuse = jiffies; new_rt->dst.lwtstate = lwtstate_get(rt->dst.lwtstate); @@ -2515,7 +2545,6 @@ static struct rtable *__mkroute_output(const struct fib_result *res, do_cache = true; if (type == RTN_BROADCAST) { flags |= RTCF_BROADCAST | RTCF_LOCAL; - fi = NULL; } else if (type == RTN_MULTICAST) { flags |= RTCF_MULTICAST | RTCF_LOCAL; if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr, diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 4f77bd862e95..156da81bce06 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1145,7 +1145,7 @@ restart: goto do_error; while (msg_data_left(msg)) { - ssize_t copy = 0; + int copy = 0; skb = tcp_write_queue_tail(sk); if (skb) @@ -1564,12 +1564,13 @@ EXPORT_SYMBOL(tcp_recv_skb); * or for 'peeking' the socket using this routine * (although both would be easy to implement). */ -int tcp_read_sock(struct sock *sk, read_descriptor_t *desc, - sk_read_actor_t recv_actor) +static int __tcp_read_sock(struct sock *sk, read_descriptor_t *desc, + sk_read_actor_t recv_actor, bool noack, + u32 *copied_seq) { struct sk_buff *skb; struct tcp_sock *tp = tcp_sk(sk); - u32 seq = tp->copied_seq; + u32 seq = *copied_seq; u32 offset; int copied = 0; @@ -1623,9 +1624,12 @@ int tcp_read_sock(struct sock *sk, read_descriptor_t *desc, tcp_eat_recv_skb(sk, skb); if (!desc->count) break; - WRITE_ONCE(tp->copied_seq, seq); + WRITE_ONCE(*copied_seq, seq); } - WRITE_ONCE(tp->copied_seq, seq); + WRITE_ONCE(*copied_seq, seq); + + if (noack) + goto out; tcp_rcv_space_adjust(sk); @@ -1634,10 +1638,25 @@ int tcp_read_sock(struct sock *sk, read_descriptor_t *desc, tcp_recv_skb(sk, seq, &offset); tcp_cleanup_rbuf(sk, copied); } +out: return copied; } + +int tcp_read_sock(struct sock *sk, read_descriptor_t *desc, + sk_read_actor_t recv_actor) +{ + return __tcp_read_sock(sk, desc, recv_actor, false, + &tcp_sk(sk)->copied_seq); +} EXPORT_SYMBOL(tcp_read_sock); +int tcp_read_sock_noack(struct sock *sk, read_descriptor_t *desc, + sk_read_actor_t recv_actor, bool noack, + u32 *copied_seq) +{ + return __tcp_read_sock(sk, desc, recv_actor, noack, copied_seq); +} + int tcp_read_skb(struct sock *sk, skb_read_actor_t recv_actor) { struct sk_buff *skb; @@ -2437,14 +2456,12 @@ static int tcp_recvmsg_dmabuf(struct sock *sk, const struct sk_buff *skb, */ memset(&dmabuf_cmsg, 0, sizeof(dmabuf_cmsg)); dmabuf_cmsg.frag_size = copy; - err = put_cmsg(msg, SOL_SOCKET, SO_DEVMEM_LINEAR, - sizeof(dmabuf_cmsg), &dmabuf_cmsg); - if (err || msg->msg_flags & MSG_CTRUNC) { - msg->msg_flags &= ~MSG_CTRUNC; - if (!err) - err = -ETOOSMALL; + err = put_cmsg_notrunc(msg, SOL_SOCKET, + SO_DEVMEM_LINEAR, + sizeof(dmabuf_cmsg), + &dmabuf_cmsg); + if (err) goto out; - } sent += copy; @@ -2498,16 +2515,12 @@ static int tcp_recvmsg_dmabuf(struct sock *sk, const struct sk_buff *skb, offset += copy; remaining_len -= copy; - err = put_cmsg(msg, SOL_SOCKET, - SO_DEVMEM_DMABUF, - sizeof(dmabuf_cmsg), - &dmabuf_cmsg); - if (err || msg->msg_flags & MSG_CTRUNC) { - msg->msg_flags &= ~MSG_CTRUNC; - if (!err) - err = -ETOOSMALL; + err = put_cmsg_notrunc(msg, SOL_SOCKET, + SO_DEVMEM_DMABUF, + sizeof(dmabuf_cmsg), + &dmabuf_cmsg); + if (err) goto out; - } atomic_long_inc(&niov->pp_ref_count); tcp_xa_pool.netmems[tcp_xa_pool.idx++] = skb_frag_netmem(frag); diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c index 392678ae80f4..22e8a2af5dd8 100644 --- a/net/ipv4/tcp_bpf.c +++ b/net/ipv4/tcp_bpf.c @@ -646,6 +646,42 @@ static int tcp_bpf_assert_proto_ops(struct proto *ops) ops->sendmsg == tcp_sendmsg ? 0 : -ENOTSUPP; } +#if IS_ENABLED(CONFIG_BPF_STREAM_PARSER) +int tcp_bpf_strp_read_sock(struct strparser *strp, read_descriptor_t *desc, + sk_read_actor_t recv_actor) +{ + struct sock *sk = strp->sk; + struct sk_psock *psock; + struct tcp_sock *tp; + int copied = 0; + + tp = tcp_sk(sk); + rcu_read_lock(); + psock = sk_psock(sk); + if (WARN_ON_ONCE(!psock)) { + desc->error = -EINVAL; + goto out; + } + + psock->ingress_bytes = 0; + copied = tcp_read_sock_noack(sk, desc, recv_actor, true, + &psock->copied_seq); + if (copied < 0) + goto out; + /* recv_actor may redirect skb to another socket (SK_REDIRECT) or + * just put skb into ingress queue of current socket (SK_PASS). + * For SK_REDIRECT, we need to ack the frame immediately but for + * SK_PASS, we want to delay the ack until tcp_bpf_recvmsg_parser(). + */ + tp->copied_seq = psock->copied_seq - psock->ingress_bytes; + tcp_rcv_space_adjust(sk); + __tcp_cleanup_rbuf(sk, copied - psock->ingress_bytes); +out: + rcu_read_unlock(); + return copied; +} +#endif /* CONFIG_BPF_STREAM_PARSER */ + int tcp_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool restore) { int family = sk->sk_family == AF_INET6 ? TCP_BPF_IPV6 : TCP_BPF_IPV4; diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c index 5dbed91c6178..76c23675ae50 100644 --- a/net/ipv4/tcp_cubic.c +++ b/net/ipv4/tcp_cubic.c @@ -392,6 +392,10 @@ static void hystart_update(struct sock *sk, u32 delay) if (after(tp->snd_una, ca->end_seq)) bictcp_hystart_reset(sk); + /* hystart triggers when cwnd is larger than some threshold */ + if (tcp_snd_cwnd(tp) < hystart_low_window) + return; + if (hystart_detect & HYSTART_ACK_TRAIN) { u32 now = bictcp_clock_us(sk); @@ -467,9 +471,7 @@ __bpf_kfunc static void cubictcp_acked(struct sock *sk, const struct ack_sample if (ca->delay_min == 0 || ca->delay_min > delay) ca->delay_min = delay; - /* hystart triggers when cwnd is larger than some threshold */ - if (!ca->found && tcp_in_slow_start(tp) && hystart && - tcp_snd_cwnd(tp) >= hystart_low_window) + if (!ca->found && tcp_in_slow_start(tp) && hystart) hystart_update(sk, delay); } diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c index 0f523cbfe329..408985eb74ee 100644 --- a/net/ipv4/tcp_fastopen.c +++ b/net/ipv4/tcp_fastopen.c @@ -3,6 +3,7 @@ #include <linux/tcp.h> #include <linux/rcupdate.h> #include <net/tcp.h> +#include <net/busy_poll.h> void tcp_fastopen_init_key_once(struct net *net) { @@ -178,7 +179,7 @@ void tcp_fastopen_add_skb(struct sock *sk, struct sk_buff *skb) if (!skb) return; - skb_dst_drop(skb); + tcp_cleanup_skb(skb); /* segs_in has been initialized to 1 in tcp_create_openreq_child(). * Hence, reset segs_in to 0 before calling tcp_segs_in() * to avoid double counting. Also, tcp_segs_in() expects @@ -195,7 +196,7 @@ void tcp_fastopen_add_skb(struct sock *sk, struct sk_buff *skb) TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_SYN; tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq; - __skb_queue_tail(&sk->sk_receive_queue, skb); + tcp_add_receive_queue(sk, skb); tp->syn_data_acked = 1; /* u64_stats_update_begin(&tp->syncp) not needed here, @@ -279,6 +280,8 @@ static struct sock *tcp_fastopen_create_child(struct sock *sk, refcount_set(&req->rsk_refcnt, 2); + sk_mark_napi_id_set(child, skb); + /* Now finish processing the fastopen child socket. */ tcp_init_transfer(child, BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB, skb); diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 2d43b29da15e..30f4375f8431 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -243,9 +243,15 @@ static void tcp_measure_rcv_mss(struct sock *sk, const struct sk_buff *skb) do_div(val, skb->truesize); tcp_sk(sk)->scaling_ratio = val ? val : 1; - if (old_ratio != tcp_sk(sk)->scaling_ratio) - WRITE_ONCE(tcp_sk(sk)->window_clamp, - tcp_win_from_space(sk, sk->sk_rcvbuf)); + if (old_ratio != tcp_sk(sk)->scaling_ratio) { + struct tcp_sock *tp = tcp_sk(sk); + + val = tcp_win_from_space(sk, sk->sk_rcvbuf); + tcp_set_window_clamp(sk, val); + + if (tp->window_clamp < tp->rcvq_space.space) + tp->rcvq_space.space = tp->window_clamp; + } } icsk->icsk_ack.rcv_mss = min_t(unsigned int, len, tcp_sk(sk)->advmss); @@ -413,6 +419,20 @@ static bool tcp_ecn_rcv_ecn_echo(const struct tcp_sock *tp, const struct tcphdr return false; } +static void tcp_count_delivered_ce(struct tcp_sock *tp, u32 ecn_count) +{ + tp->delivered_ce += ecn_count; +} + +/* Updates the delivered and delivered_ce counts */ +static void tcp_count_delivered(struct tcp_sock *tp, u32 delivered, + bool ece_ack) +{ + tp->delivered += delivered; + if (ece_ack) + tcp_count_delivered_ce(tp, delivered); +} + /* Buffer size and advertised window tuning. * * 1. Tuning sk->sk_sndbuf, when connection enters established state. @@ -645,10 +665,12 @@ EXPORT_SYMBOL(tcp_initialize_rcv_mss); */ static void tcp_rcv_rtt_update(struct tcp_sock *tp, u32 sample, int win_dep) { - u32 new_sample = tp->rcv_rtt_est.rtt_us; - long m = sample; + u32 new_sample, old_sample = tp->rcv_rtt_est.rtt_us; + long m = sample << 3; - if (new_sample != 0) { + if (old_sample == 0 || m < old_sample) { + new_sample = m; + } else { /* If we sample in larger samples in the non-timestamp * case, we could grossly overestimate the RTT especially * with chatty applications or bulk transfer apps which @@ -659,17 +681,9 @@ static void tcp_rcv_rtt_update(struct tcp_sock *tp, u32 sample, int win_dep) * else with timestamps disabled convergence takes too * long. */ - if (!win_dep) { - m -= (new_sample >> 3); - new_sample += m; - } else { - m <<= 3; - if (m < new_sample) - new_sample = m; - } - } else { - /* No previous measure. */ - new_sample = m << 3; + if (win_dep) + return; + new_sample = old_sample - (old_sample >> 3) + sample; } tp->rcv_rtt_est.rtt_us = new_sample; @@ -693,7 +707,7 @@ new_measure: tp->rcv_rtt_est.time = tp->tcp_mstamp; } -static s32 tcp_rtt_tsopt_us(const struct tcp_sock *tp) +static s32 tcp_rtt_tsopt_us(const struct tcp_sock *tp, u32 min_delta) { u32 delta, delta_us; @@ -703,7 +717,7 @@ static s32 tcp_rtt_tsopt_us(const struct tcp_sock *tp) if (likely(delta < INT_MAX / (USEC_PER_SEC / TCP_TS_HZ))) { if (!delta) - delta = 1; + delta = min_delta; delta_us = delta * (USEC_PER_SEC / TCP_TS_HZ); return delta_us; } @@ -721,9 +735,9 @@ static inline void tcp_rcv_rtt_measure_ts(struct sock *sk, if (TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq >= inet_csk(sk)->icsk_ack.rcv_mss) { - s32 delta = tcp_rtt_tsopt_us(tp); + s32 delta = tcp_rtt_tsopt_us(tp, 0); - if (delta >= 0) + if (delta > 0) tcp_rcv_rtt_update(tp, delta, 0); } } @@ -735,8 +749,7 @@ static inline void tcp_rcv_rtt_measure_ts(struct sock *sk, void tcp_rcv_space_adjust(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); - u32 copied; - int time; + int time, inq, copied; trace_tcp_rcv_space_adjust(sk); @@ -747,6 +760,9 @@ void tcp_rcv_space_adjust(struct sock *sk) /* Number of bytes copied to user in last RTT */ copied = tp->copied_seq - tp->rcvq_space.seq; + /* Number of bytes in receive queue. */ + inq = tp->rcv_nxt - tp->copied_seq; + copied -= inq; if (copied <= tp->rcvq_space.space) goto new_measure; @@ -1148,15 +1164,6 @@ void tcp_mark_skb_lost(struct sock *sk, struct sk_buff *skb) } } -/* Updates the delivered and delivered_ce counts */ -static void tcp_count_delivered(struct tcp_sock *tp, u32 delivered, - bool ece_ack) -{ - tp->delivered += delivered; - if (ece_ack) - tp->delivered_ce += delivered; -} - /* This procedure tags the retransmission queue when SACKs arrive. * * We have three tag bits: SACKED(S), RETRANS(R) and LOST(L). @@ -2475,20 +2482,33 @@ static inline bool tcp_packet_delayed(const struct tcp_sock *tp) { const struct sock *sk = (const struct sock *)tp; - if (tp->retrans_stamp && - tcp_tsopt_ecr_before(tp, tp->retrans_stamp)) - return true; /* got echoed TS before first retransmission */ + /* Received an echoed timestamp before the first retransmission? */ + if (tp->retrans_stamp) + return tcp_tsopt_ecr_before(tp, tp->retrans_stamp); + + /* We set tp->retrans_stamp upon the first retransmission of a loss + * recovery episode, so normally if tp->retrans_stamp is 0 then no + * retransmission has happened yet (likely due to TSQ, which can cause + * fast retransmits to be delayed). So if snd_una advanced while + * (tp->retrans_stamp is 0 then apparently a packet was merely delayed, + * not lost. But there are exceptions where we retransmit but then + * clear tp->retrans_stamp, so we check for those exceptions. + */ - /* Check if nothing was retransmitted (retrans_stamp==0), which may - * happen in fast recovery due to TSQ. But we ignore zero retrans_stamp - * in TCP_SYN_SENT, since when we set FLAG_SYN_ACKED we also clear - * retrans_stamp even if we had retransmitted the SYN. + /* (1) For non-SACK connections, tcp_is_non_sack_preventing_reopen() + * clears tp->retrans_stamp when snd_una == high_seq. */ - if (!tp->retrans_stamp && /* no record of a retransmit/SYN? */ - sk->sk_state != TCP_SYN_SENT) /* not the FLAG_SYN_ACKED case? */ - return true; /* nothing was retransmitted */ + if (!tcp_is_sack(tp) && !before(tp->snd_una, tp->high_seq)) + return false; - return false; + /* (2) In TCP_SYN_SENT tcp_clean_rtx_queue() clears tp->retrans_stamp + * when setting FLAG_SYN_ACKED is set, even if the SYN was + * retransmitted. + */ + if (sk->sk_state == TCP_SYN_SENT) + return false; + + return true; /* tp->retrans_stamp is zero; no retransmit yet */ } /* Undo procedures. */ @@ -3215,7 +3235,7 @@ static bool tcp_ack_update_rtt(struct sock *sk, const int flag, */ if (seq_rtt_us < 0 && tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr && flag & FLAG_ACKED) - seq_rtt_us = ca_rtt_us = tcp_rtt_tsopt_us(tp); + seq_rtt_us = ca_rtt_us = tcp_rtt_tsopt_us(tp, 1); rs->rtt_us = ca_rtt_us; /* RTT of last (S)ACKed packet (or -1) */ if (seq_rtt_us < 0) @@ -3856,12 +3876,23 @@ static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag) } } -static inline void tcp_in_ack_event(struct sock *sk, u32 flags) +static void tcp_in_ack_event(struct sock *sk, int flag) { const struct inet_connection_sock *icsk = inet_csk(sk); - if (icsk->icsk_ca_ops->in_ack_event) - icsk->icsk_ca_ops->in_ack_event(sk, flags); + if (icsk->icsk_ca_ops->in_ack_event) { + u32 ack_ev_flags = 0; + + if (flag & FLAG_WIN_UPDATE) + ack_ev_flags |= CA_ACK_WIN_UPDATE; + if (flag & FLAG_SLOWPATH) { + ack_ev_flags |= CA_ACK_SLOWPATH; + if (flag & FLAG_ECE) + ack_ev_flags |= CA_ACK_ECE; + } + + icsk->icsk_ca_ops->in_ack_event(sk, ack_ev_flags); + } } /* Congestion control has updated the cwnd already. So if we're in @@ -3978,12 +4009,8 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) tcp_snd_una_update(tp, ack); flag |= FLAG_WIN_UPDATE; - tcp_in_ack_event(sk, CA_ACK_WIN_UPDATE); - NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPHPACKS); } else { - u32 ack_ev_flags = CA_ACK_SLOWPATH; - if (ack_seq != TCP_SKB_CB(skb)->end_seq) flag |= FLAG_DATA; else @@ -3995,19 +4022,12 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una, &sack_state); - if (tcp_ecn_rcv_ecn_echo(tp, tcp_hdr(skb))) { + if (tcp_ecn_rcv_ecn_echo(tp, tcp_hdr(skb))) flag |= FLAG_ECE; - ack_ev_flags |= CA_ACK_ECE; - } if (sack_state.sack_delivered) tcp_count_delivered(tp, sack_state.sack_delivered, flag & FLAG_ECE); - - if (flag & FLAG_WIN_UPDATE) - ack_ev_flags |= CA_ACK_WIN_UPDATE; - - tcp_in_ack_event(sk, ack_ev_flags); } /* This is a deviation from RFC3168 since it states that: @@ -4034,6 +4054,8 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) tcp_rack_update_reo_wnd(sk, &rs); + tcp_in_ack_event(sk, flag); + if (tp->tlp_high_seq) tcp_process_tlp_ack(sk, ack, flag); @@ -4065,6 +4087,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) return 1; no_queue: + tcp_in_ack_event(sk, flag); /* If data was DSACKed, see if we can undo a cwnd reduction. */ if (flag & FLAG_DSACKING_ACK) { tcp_fastretrans_alert(sk, prior_snd_una, num_dupack, &flag, @@ -4947,8 +4970,9 @@ static void tcp_ofo_queue(struct sock *sk) if (before(TCP_SKB_CB(skb)->seq, dsack_high)) { __u32 dsack = dsack_high; + if (before(TCP_SKB_CB(skb)->end_seq, dsack_high)) - dsack_high = TCP_SKB_CB(skb)->end_seq; + dsack = TCP_SKB_CB(skb)->end_seq; tcp_dsack_extend(sk, TCP_SKB_CB(skb)->seq, dsack); } p = rb_next(p); @@ -4964,7 +4988,7 @@ static void tcp_ofo_queue(struct sock *sk) tcp_rcv_nxt_update(tp, TCP_SKB_CB(skb)->end_seq); fin = TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN; if (!eaten) - __skb_queue_tail(&sk->sk_receive_queue, skb); + tcp_add_receive_queue(sk, skb); else kfree_skb_partial(skb, fragstolen); @@ -5016,6 +5040,7 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb) return; } + tcp_measure_rcv_mss(sk, skb); /* Disable header prediction. */ tp->pred_flags = 0; inet_csk_schedule_ack(sk); @@ -5156,7 +5181,7 @@ static int __must_check tcp_queue_rcv(struct sock *sk, struct sk_buff *skb, skb, fragstolen)) ? 1 : 0; tcp_rcv_nxt_update(tcp_sk(sk), TCP_SKB_CB(skb)->end_seq); if (!eaten) { - __skb_queue_tail(&sk->sk_receive_queue, skb); + tcp_add_receive_queue(sk, skb); skb_set_owner_r(skb, sk); } return eaten; @@ -5239,7 +5264,7 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) __kfree_skb(skb); return; } - skb_dst_drop(skb); + tcp_cleanup_skb(skb); __skb_pull(skb, tcp_hdr(skb)->doff * 4); reason = SKB_DROP_REASON_NOT_SPECIFIED; @@ -6208,7 +6233,7 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb) NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPHPHITS); /* Bulk data transfer: receiver */ - skb_dst_drop(skb); + tcp_cleanup_skb(skb); __skb_pull(skb, tcp_header_len); eaten = tcp_queue_rcv(sk, skb, &fragstolen); @@ -6827,6 +6852,9 @@ tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) if (!tp->srtt_us) tcp_synack_rtt_meas(sk, req); + if (tp->rx_opt.tstamp_ok) + tp->advmss -= TCPOLEN_TSTAMP_ALIGNED; + if (req) { tcp_rcv_synrecv_state_fastopen(sk); } else { @@ -6852,9 +6880,6 @@ tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) tp->snd_wnd = ntohs(th->window) << tp->rx_opt.snd_wscale; tcp_init_wl(tp, TCP_SKB_CB(skb)->seq); - if (tp->rx_opt.tstamp_ok) - tp->advmss -= TCPOLEN_TSTAMP_ALIGNED; - if (!inet_csk(sk)->icsk_ca_ops->cong_control) tcp_update_pacing_rate(sk); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index bcc2f1e090c7..824048679e1b 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2025,7 +2025,7 @@ bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb, */ skb_condense(skb); - skb_dst_drop(skb); + tcp_cleanup_skb(skb); if (unlikely(tcp_checksum_complete(skb))) { bh_unlock_sock(sk); diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index bb1fe1ba867a..f3e4fc957219 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -806,12 +806,6 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, /* In sequence, PAWS is OK. */ - /* TODO: We probably should defer ts_recent change once - * we take ownership of @req. - */ - if (tmp_opt.saw_tstamp && !after(TCP_SKB_CB(skb)->seq, tcp_rsk(req)->rcv_nxt)) - WRITE_ONCE(req->ts_recent, tmp_opt.rcv_tsval); - if (TCP_SKB_CB(skb)->seq == tcp_rsk(req)->rcv_isn) { /* Truncate SYN, it is out of window starting at tcp_rsk(req)->rcv_isn + 1. */ @@ -860,6 +854,10 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, if (!child) goto listen_overflow; + if (own_req && tmp_opt.saw_tstamp && + !after(TCP_SKB_CB(skb)->seq, tcp_rsk(req)->rcv_nxt)) + tcp_sk(child)->rx_opt.ts_recent = tmp_opt.rcv_tsval; + if (own_req && rsk_drop_req(req)) { reqsk_queue_removed(&inet_csk(req->rsk_listener)->icsk_accept_queue, req); inet_csk_reqsk_queue_drop_and_put(req->rsk_listener, req); diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c index 2308665b51c5..3a9c5c14c310 100644 --- a/net/ipv4/tcp_offload.c +++ b/net/ipv4/tcp_offload.c @@ -13,12 +13,15 @@ #include <net/tcp.h> #include <net/protocol.h> -static void tcp_gso_tstamp(struct sk_buff *skb, unsigned int ts_seq, +static void tcp_gso_tstamp(struct sk_buff *skb, struct sk_buff *gso_skb, unsigned int seq, unsigned int mss) { + u32 flags = skb_shinfo(gso_skb)->tx_flags & SKBTX_ANY_TSTAMP; + u32 ts_seq = skb_shinfo(gso_skb)->tskey; + while (skb) { if (before(ts_seq, seq + mss)) { - skb_shinfo(skb)->tx_flags |= SKBTX_SW_TSTAMP; + skb_shinfo(skb)->tx_flags |= flags; skb_shinfo(skb)->tskey = ts_seq; return; } @@ -193,8 +196,8 @@ struct sk_buff *tcp_gso_segment(struct sk_buff *skb, th = tcp_hdr(skb); seq = ntohl(th->seq); - if (unlikely(skb_shinfo(gso_skb)->tx_flags & SKBTX_SW_TSTAMP)) - tcp_gso_tstamp(segs, skb_shinfo(gso_skb)->tskey, seq, mss); + if (unlikely(skb_shinfo(gso_skb)->tx_flags & SKBTX_ANY_TSTAMP)) + tcp_gso_tstamp(segs, gso_skb, seq, mss); newcheck = ~csum_fold(csum_add(csum_unfold(th->check), delta)); @@ -352,6 +355,7 @@ struct sk_buff *tcp_gro_receive(struct list_head *head, struct sk_buff *skb, flush |= skb->ip_summed != p->ip_summed; flush |= skb->csum_level != p->csum_level; flush |= NAPI_GRO_CB(p)->count >= 64; + skb_set_network_header(skb, skb_gro_receive_network_offset(skb)); if (flush || skb_gro_receive_list(p, skb)) mss = 1; @@ -432,7 +436,7 @@ static void tcp4_check_fraglist_gro(struct list_head *head, struct sk_buff *skb, iif, sdif); NAPI_GRO_CB(skb)->is_flist = !sk; if (sk) - sock_put(sk); + sock_gen_put(sk); } INDIRECT_CALLABLE_SCOPE diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 8efc58716ce9..6d5387811c32 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -265,11 +265,14 @@ static u16 tcp_select_window(struct sock *sk) u32 cur_win, new_win; /* Make the window 0 if we failed to queue the data because we - * are out of memory. The window is temporary, so we don't store - * it on the socket. + * are out of memory. */ - if (unlikely(inet_csk(sk)->icsk_ack.pending & ICSK_ACK_NOMEM)) + if (unlikely(inet_csk(sk)->icsk_ack.pending & ICSK_ACK_NOMEM)) { + tp->pred_flags = 0; + tp->rcv_wnd = 0; + tp->rcv_wup = tp->rcv_nxt; return 0; + } cur_win = tcp_receive_window(tp); new_win = __tcp_select_window(sk); diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index ff85242720a0..f4e24fc878fa 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -420,6 +420,49 @@ u32 udp_ehashfn(const struct net *net, const __be32 laddr, const __u16 lport, udp_ehash_secret + net_hash_mix(net)); } +/** + * udp4_lib_lookup1() - Simplified lookup using primary hash (destination port) + * @net: Network namespace + * @saddr: Source address, network order + * @sport: Source port, network order + * @daddr: Destination address, network order + * @hnum: Destination port, host order + * @dif: Destination interface index + * @sdif: Destination bridge port index, if relevant + * @udptable: Set of UDP hash tables + * + * Simplified lookup to be used as fallback if no sockets are found due to a + * potential race between (receive) address change, and lookup happening before + * the rehash operation. This function ignores SO_REUSEPORT groups while scoring + * result sockets, because if we have one, we don't need the fallback at all. + * + * Called under rcu_read_lock(). + * + * Return: socket with highest matching score if any, NULL if none + */ +static struct sock *udp4_lib_lookup1(const struct net *net, + __be32 saddr, __be16 sport, + __be32 daddr, unsigned int hnum, + int dif, int sdif, + const struct udp_table *udptable) +{ + unsigned int slot = udp_hashfn(net, hnum, udptable->mask); + struct udp_hslot *hslot = &udptable->hash[slot]; + struct sock *sk, *result = NULL; + int score, badness = 0; + + sk_for_each_rcu(sk, &hslot->head) { + score = compute_score(sk, net, + saddr, sport, daddr, hnum, dif, sdif); + if (score > badness) { + result = sk; + badness = score; + } + } + + return result; +} + /* called with rcu_read_lock() */ static struct sock *udp4_lib_lookup2(const struct net *net, __be32 saddr, __be16 sport, @@ -525,6 +568,19 @@ struct sock *__udp4_lib_lookup(const struct net *net, __be32 saddr, result = udp4_lib_lookup2(net, saddr, sport, htonl(INADDR_ANY), hnum, dif, sdif, hslot2, skb); + if (!IS_ERR_OR_NULL(result)) + goto done; + + /* Primary hash (destination port) lookup as fallback for this race: + * 1. __ip4_datagram_connect() sets sk_rcv_saddr + * 2. lookup (this function): new sk_rcv_saddr, hashes not updated yet + * 3. rehash operation updating _secondary and four-tuple_ hashes + * The primary hash doesn't need an update after 1., so, thanks to this + * further step, 1. and 3. don't need to be atomic against the lookup. + */ + result = udp4_lib_lookup1(net, saddr, sport, daddr, hnum, dif, sdif, + udptable); + done: if (IS_ERR(result)) return NULL; @@ -929,9 +985,9 @@ static int udp_send_skb(struct sk_buff *skb, struct flowi4 *fl4, const int hlen = skb_network_header_len(skb) + sizeof(struct udphdr); - if (hlen + cork->gso_size > cork->fragsize) { + if (hlen + min(datalen, cork->gso_size) > cork->fragsize) { kfree_skb(skb); - return -EINVAL; + return -EMSGSIZE; } if (datalen > cork->gso_size * UDP_MAX_SEGMENTS) { kfree_skb(skb); @@ -1414,12 +1470,12 @@ static bool udp_skb_has_head_state(struct sk_buff *skb) } /* fully reclaim rmem/fwd memory allocated for skb */ -static void udp_rmem_release(struct sock *sk, int size, int partial, - bool rx_queue_lock_held) +static void udp_rmem_release(struct sock *sk, unsigned int size, + int partial, bool rx_queue_lock_held) { struct udp_sock *up = udp_sk(sk); struct sk_buff_head *sk_queue; - int amt; + unsigned int amt; if (likely(partial)) { up->forward_deficit += size; @@ -1439,10 +1495,8 @@ static void udp_rmem_release(struct sock *sk, int size, int partial, if (!rx_queue_lock_held) spin_lock(&sk_queue->lock); - - sk_forward_alloc_add(sk, size); - amt = (sk->sk_forward_alloc - partial) & ~(PAGE_SIZE - 1); - sk_forward_alloc_add(sk, -amt); + amt = (size + sk->sk_forward_alloc - partial) & ~(PAGE_SIZE - 1); + sk_forward_alloc_add(sk, size - amt); if (amt) __sk_mem_reduce_allocated(sk, amt >> PAGE_SHIFT); @@ -1514,17 +1568,25 @@ static int udp_rmem_schedule(struct sock *sk, int size) int __udp_enqueue_schedule_skb(struct sock *sk, struct sk_buff *skb) { struct sk_buff_head *list = &sk->sk_receive_queue; - int rmem, err = -ENOMEM; + unsigned int rmem, rcvbuf; spinlock_t *busy = NULL; - int size, rcvbuf; + int size, err = -ENOMEM; - /* Immediately drop when the receive queue is full. - * Always allow at least one packet. - */ rmem = atomic_read(&sk->sk_rmem_alloc); rcvbuf = READ_ONCE(sk->sk_rcvbuf); - if (rmem > rcvbuf) - goto drop; + size = skb->truesize; + + /* Immediately drop when the receive queue is full. + * Cast to unsigned int performs the boundary check for INT_MAX. + */ + if (rmem + size > rcvbuf) { + if (rcvbuf > INT_MAX >> 1) + goto drop; + + /* Always allow at least one packet for small buffer. */ + if (rmem > rcvbuf) + goto drop; + } /* Under mem pressure, it might be helpful to help udp_recvmsg() * having linear skbs : @@ -1534,10 +1596,10 @@ int __udp_enqueue_schedule_skb(struct sock *sk, struct sk_buff *skb) */ if (rmem > (rcvbuf >> 1)) { skb_condense(skb); - + size = skb->truesize; busy = busylock_acquire(sk); } - size = skb->truesize; + udp_set_dev_scratch(skb); atomic_add(size, &sk->sk_rmem_alloc); @@ -1624,7 +1686,7 @@ EXPORT_SYMBOL_GPL(skb_consume_udp); static struct sk_buff *__first_packet_length(struct sock *sk, struct sk_buff_head *rcvq, - int *total) + unsigned int *total) { struct sk_buff *skb; @@ -1657,8 +1719,8 @@ static int first_packet_length(struct sock *sk) { struct sk_buff_head *rcvq = &udp_sk(sk)->reader_queue; struct sk_buff_head *sk_queue = &sk->sk_receive_queue; + unsigned int total = 0; struct sk_buff *skb; - int total = 0; int res; spin_lock_bh(&rcvq->lock); diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c index a5be6e4ed326..12ba1a8db93a 100644 --- a/net/ipv4/udp_offload.c +++ b/net/ipv4/udp_offload.c @@ -61,7 +61,7 @@ static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb, remcsum = !!(skb_shinfo(skb)->gso_type & SKB_GSO_TUNNEL_REMCSUM); skb->remcsum_offload = remcsum; - need_ipsec = skb_dst(skb) && dst_xfrm(skb_dst(skb)); + need_ipsec = (skb_dst(skb) && dst_xfrm(skb_dst(skb))) || skb_sec_path(skb); /* Try to offload checksum if possible */ offload_csum = !!(need_csum && !need_ipsec && @@ -247,6 +247,62 @@ static struct sk_buff *__udpv4_gso_segment_list_csum(struct sk_buff *segs) return segs; } +static void __udpv6_gso_segment_csum(struct sk_buff *seg, + struct in6_addr *oldip, + const struct in6_addr *newip, + __be16 *oldport, __be16 newport) +{ + struct udphdr *uh = udp_hdr(seg); + + if (ipv6_addr_equal(oldip, newip) && *oldport == newport) + return; + + if (uh->check) { + inet_proto_csum_replace16(&uh->check, seg, oldip->s6_addr32, + newip->s6_addr32, true); + + inet_proto_csum_replace2(&uh->check, seg, *oldport, newport, + false); + if (!uh->check) + uh->check = CSUM_MANGLED_0; + } + + *oldip = *newip; + *oldport = newport; +} + +static struct sk_buff *__udpv6_gso_segment_list_csum(struct sk_buff *segs) +{ + const struct ipv6hdr *iph; + const struct udphdr *uh; + struct ipv6hdr *iph2; + struct sk_buff *seg; + struct udphdr *uh2; + + seg = segs; + uh = udp_hdr(seg); + iph = ipv6_hdr(seg); + uh2 = udp_hdr(seg->next); + iph2 = ipv6_hdr(seg->next); + + if (!(*(const u32 *)&uh->source ^ *(const u32 *)&uh2->source) && + ipv6_addr_equal(&iph->saddr, &iph2->saddr) && + ipv6_addr_equal(&iph->daddr, &iph2->daddr)) + return segs; + + while ((seg = seg->next)) { + uh2 = udp_hdr(seg); + iph2 = ipv6_hdr(seg); + + __udpv6_gso_segment_csum(seg, &iph2->saddr, &iph->saddr, + &uh2->source, uh->source); + __udpv6_gso_segment_csum(seg, &iph2->daddr, &iph->daddr, + &uh2->dest, uh->dest); + } + + return segs; +} + static struct sk_buff *__udp_gso_segment_list(struct sk_buff *skb, netdev_features_t features, bool is_ipv6) @@ -259,7 +315,10 @@ static struct sk_buff *__udp_gso_segment_list(struct sk_buff *skb, udp_hdr(skb)->len = htons(sizeof(struct udphdr) + mss); - return is_ipv6 ? skb : __udpv4_gso_segment_list_csum(skb); + if (is_ipv6) + return __udpv6_gso_segment_list_csum(skb); + else + return __udpv4_gso_segment_list_csum(skb); } struct sk_buff *__udp_gso_segment(struct sk_buff *gso_skb, @@ -273,6 +332,7 @@ struct sk_buff *__udp_gso_segment(struct sk_buff *gso_skb, bool copy_dtor; __sum16 check; __be16 newlen; + int ret = 0; mss = skb_shinfo(gso_skb)->gso_size; if (gso_skb->len <= sizeof(*uh) + mss) @@ -301,6 +361,10 @@ struct sk_buff *__udp_gso_segment(struct sk_buff *gso_skb, if (skb_pagelen(gso_skb) - sizeof(*uh) == skb_shinfo(gso_skb)->gso_size) return __udp_gso_segment_list(gso_skb, features, is_ipv6); + ret = __skb_linearize(gso_skb); + if (ret) + return ERR_PTR(ret); + /* Setup csum, as fraglist skips this in udp4_gro_receive. */ gso_skb->csum_start = skb_transport_header(gso_skb) - gso_skb->head; gso_skb->csum_offset = offsetof(struct udphdr, check); @@ -321,13 +385,17 @@ struct sk_buff *__udp_gso_segment(struct sk_buff *gso_skb, /* clear destructor to avoid skb_segment assigning it to tail */ copy_dtor = gso_skb->destructor == sock_wfree; - if (copy_dtor) + if (copy_dtor) { gso_skb->destructor = NULL; + gso_skb->sk = NULL; + } segs = skb_segment(gso_skb, features); if (IS_ERR_OR_NULL(segs)) { - if (copy_dtor) + if (copy_dtor) { gso_skb->destructor = sock_wfree; + gso_skb->sk = sk; + } return segs; } @@ -536,6 +604,7 @@ static struct sk_buff *udp_gro_receive_segment(struct list_head *head, NAPI_GRO_CB(skb)->flush = 1; return NULL; } + skb_set_network_header(skb, skb_gro_receive_network_offset(skb)); ret = skb_gro_receive_list(p, skb); } else { skb_gro_postpull_rcsum(skb, uh, diff --git a/net/ipv4/xfrm4_input.c b/net/ipv4/xfrm4_input.c index a620618cc568..12a1a0f42195 100644 --- a/net/ipv4/xfrm4_input.c +++ b/net/ipv4/xfrm4_input.c @@ -182,11 +182,15 @@ struct sk_buff *xfrm4_gro_udp_encap_rcv(struct sock *sk, struct list_head *head, int offset = skb_gro_offset(skb); const struct net_offload *ops; struct sk_buff *pp = NULL; - int ret; - - offset = offset - sizeof(struct udphdr); + int len, dlen; + __u8 *udpdata; + __be32 *udpdata32; - if (!pskb_pull(skb, offset)) + len = skb->len - offset; + dlen = offset + min(len, 8); + udpdata = skb_gro_header(skb, dlen, offset); + udpdata32 = (__be32 *)udpdata; + if (unlikely(!udpdata)) return NULL; rcu_read_lock(); @@ -194,11 +198,13 @@ struct sk_buff *xfrm4_gro_udp_encap_rcv(struct sock *sk, struct list_head *head, if (!ops || !ops->callbacks.gro_receive) goto out; - ret = __xfrm4_udp_encap_rcv(sk, skb, false); - if (ret) + /* check if it is a keepalive or IKE packet */ + if (len <= sizeof(struct ip_esp_hdr) || udpdata32[0] == 0) goto out; - skb_push(skb, offset); + /* set the transport header to ESP */ + skb_set_transport_header(skb, offset); + NAPI_GRO_CB(skb)->proto = IPPROTO_UDP; pp = call_gro_receive(ops->callbacks.gro_receive, head, skb); @@ -208,7 +214,6 @@ struct sk_buff *xfrm4_gro_udp_encap_rcv(struct sock *sk, struct list_head *head, out: rcu_read_unlock(); - skb_push(skb, offset); NAPI_GRO_CB(skb)->same_flow = 0; NAPI_GRO_CB(skb)->flush = 1; |