Diffstat (limited to 'net/ipv4'): 40 files changed, 1175 insertions, 561 deletions
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig index e682b48e0709..bd2901604842 100644 --- a/net/ipv4/Kconfig +++ b/net/ipv4/Kconfig @@ -322,6 +322,15 @@ config NET_FOU network mechanisms and optimizations for UDP (such as ECMP and RSS) can be leveraged to provide better service. +config NET_FOU_IP_TUNNELS + bool "IP: FOU encapsulation of IP tunnels" + depends on NET_IPIP || NET_IPGRE || IPV6_SIT + select NET_FOU + ---help--- + Allow configuration of FOU or GUE encapsulation for IP tunnels. + When this option is enabled IP tunnels can be configured to use + FOU or GUE encapsulation. + config GENEVE tristate "Generic Network Virtualization Encapsulation (Geneve)" depends on INET diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index e67da4e6c324..a44773c8346c 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -1222,7 +1222,7 @@ static struct sk_buff *inet_gso_segment(struct sk_buff *skb, SKB_GSO_TCPV6 | SKB_GSO_UDP_TUNNEL | SKB_GSO_UDP_TUNNEL_CSUM | - SKB_GSO_MPLS | + SKB_GSO_TUNNEL_REMCSUM | 0))) goto out; diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index 16acb59d665e..205e1472aa78 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -1292,7 +1292,7 @@ static int arp_proc_init(void); void __init arp_init(void) { - neigh_table_init(&arp_tbl); + neigh_table_init(NEIGH_ARP_TABLE, &arp_tbl); dev_add_pack(&arp_packet_type); arp_proc_init(); diff --git a/net/ipv4/cipso_ipv4.c b/net/ipv4/cipso_ipv4.c index 4715f25dfe03..5160c710f2eb 100644 --- a/net/ipv4/cipso_ipv4.c +++ b/net/ipv4/cipso_ipv4.c @@ -50,7 +50,7 @@ #include <net/netlabel.h> #include <net/cipso_ipv4.h> #include <linux/atomic.h> -#include <asm/bug.h> +#include <linux/bug.h> #include <asm/unaligned.h> /* List of available DOI definitions */ @@ -72,6 +72,7 @@ struct cipso_v4_map_cache_bkt { u32 size; struct list_head list; }; + struct cipso_v4_map_cache_entry { u32 hash; unsigned char *key; @@ -82,7 +83,8 @@ struct cipso_v4_map_cache_entry { u32 activity; struct list_head list; }; -static struct cipso_v4_map_cache_bkt *cipso_v4_cache = NULL; + +static struct cipso_v4_map_cache_bkt *cipso_v4_cache; /* Restricted bitmap (tag #1) flags */ int cipso_v4_rbm_optfmt = 0; @@ -539,7 +541,7 @@ doi_add_return: /** * cipso_v4_doi_free - Frees a DOI definition - * @entry: the entry's RCU field + * @doi_def: the DOI definition * * Description: * This function frees all of the memory associated with a DOI definition. 
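For reference when reading the fou.c hunks below: they manipulate the fields of struct guehdr (hlen, control, version, proto_ctype, flags, word) and the GUE_FLAG_PRIV / GUE_PFLAG_REMCSUM option constants, all of which are defined in include/net/gue.h and therefore do not appear in this net/ipv4 diffstat. The following is only an illustrative sketch of that layout as it stands with this series; the exact bitfield order and flag values should be taken from include/net/gue.h itself.

	/*
	 * Sketch of the GUE base header and the option constants used by
	 * the fou.c receive, GRO and build paths in this diff.
	 */
	struct guehdr {
		union {
			struct {
	#if defined(__LITTLE_ENDIAN_BITFIELD)
				__u8	hlen:5,		/* length of optional fields, in 32-bit words */
					control:1,	/* control message rather than data */
					version:2;
	#else	/* __BIG_ENDIAN_BITFIELD */
				__u8	version:2,
					control:1,
					hlen:5;
	#endif
				__u8	proto_ctype;	/* IP protocol (data) or control message type */
				__be16	flags;		/* standard GUE flags */
			};
			__be32	word;			/* whole base header as one 32-bit word */
		};
	};

	/* Standard flag: a private flags word (and its options) follows the base header */
	#define GUE_FLAG_PRIV		htons(1 << 0)
	#define GUE_LEN_PRIV		4

	/* Private flag: remote checksum offload option, two 16-bit offsets */
	#define GUE_PFLAG_REMCSUM	htonl(1 << 31)
	#define GUE_PLEN_REMCSUM	4

With this layout, guehdr->hlen << 2 gives the length of the optional fields in bytes (hence hdrlen = sizeof(struct guehdr) + optlen in gue_udp_recv() and gue_gro_receive() below), and comparing guehdr->word in the GRO same-flow check covers hlen, version, proto_ctype and flags in a single 32-bit compare.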
diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c index 360b565918c4..60173d4d3a0e 100644 --- a/net/ipv4/esp4.c +++ b/net/ipv4/esp4.c @@ -392,8 +392,10 @@ static int esp_input(struct xfrm_state *x, struct sk_buff *skb) if (elen <= 0) goto out; - if ((err = skb_cow_data(skb, 0, &trailer)) < 0) + err = skb_cow_data(skb, 0, &trailer); + if (err < 0) goto out; + nfrags = err; assoclen = sizeof(*esph); @@ -601,12 +603,12 @@ static int esp_init_authenc(struct xfrm_state *x) BUG_ON(!aalg_desc); err = -EINVAL; - if (aalg_desc->uinfo.auth.icv_fullbits/8 != + if (aalg_desc->uinfo.auth.icv_fullbits / 8 != crypto_aead_authsize(aead)) { - NETDEBUG(KERN_INFO "ESP: %s digestsize %u != %hu\n", - x->aalg->alg_name, - crypto_aead_authsize(aead), - aalg_desc->uinfo.auth.icv_fullbits/8); + pr_info("ESP: %s digestsize %u != %hu\n", + x->aalg->alg_name, + crypto_aead_authsize(aead), + aalg_desc->uinfo.auth.icv_fullbits / 8); goto free_key; } diff --git a/net/ipv4/fou.c b/net/ipv4/fou.c index 606c520ffd5a..b986298a7ba3 100644 --- a/net/ipv4/fou.c +++ b/net/ipv4/fou.c @@ -38,21 +38,17 @@ static inline struct fou *fou_from_sock(struct sock *sk) return sk->sk_user_data; } -static int fou_udp_encap_recv_deliver(struct sk_buff *skb, - u8 protocol, size_t len) +static void fou_recv_pull(struct sk_buff *skb, size_t len) { struct iphdr *iph = ip_hdr(skb); /* Remove 'len' bytes from the packet (UDP header and - * FOU header if present), modify the protocol to the one - * we found, and then call rcv_encap. + * FOU header if present). */ iph->tot_len = htons(ntohs(iph->tot_len) - len); __skb_pull(skb, len); skb_postpull_rcsum(skb, udp_hdr(skb), len); skb_reset_transport_header(skb); - - return -protocol; } static int fou_udp_recv(struct sock *sk, struct sk_buff *skb) @@ -62,16 +58,56 @@ static int fou_udp_recv(struct sock *sk, struct sk_buff *skb) if (!fou) return 1; - return fou_udp_encap_recv_deliver(skb, fou->protocol, - sizeof(struct udphdr)); + fou_recv_pull(skb, sizeof(struct udphdr)); + + return -fou->protocol; +} + +static struct guehdr *gue_remcsum(struct sk_buff *skb, struct guehdr *guehdr, + void *data, size_t hdrlen, u8 ipproto) +{ + __be16 *pd = data; + size_t start = ntohs(pd[0]); + size_t offset = ntohs(pd[1]); + size_t plen = hdrlen + max_t(size_t, offset + sizeof(u16), start); + __wsum delta; + + if (skb->remcsum_offload) { + /* Already processed in GRO path */ + skb->remcsum_offload = 0; + return guehdr; + } + + if (!pskb_may_pull(skb, plen)) + return NULL; + guehdr = (struct guehdr *)&udp_hdr(skb)[1]; + + if (unlikely(skb->ip_summed != CHECKSUM_COMPLETE)) + __skb_checksum_complete(skb); + + delta = remcsum_adjust((void *)guehdr + hdrlen, + skb->csum, start, offset); + + /* Adjust skb->csum since we changed the packet */ + skb->csum = csum_add(skb->csum, delta); + + return guehdr; +} + +static int gue_control_message(struct sk_buff *skb, struct guehdr *guehdr) +{ + /* No support yet */ + kfree_skb(skb); + return 0; } static int gue_udp_recv(struct sock *sk, struct sk_buff *skb) { struct fou *fou = fou_from_sock(sk); - size_t len; + size_t len, optlen, hdrlen; struct guehdr *guehdr; - struct udphdr *uh; + void *data; + u16 doffset = 0; if (!fou) return 1; @@ -80,25 +116,58 @@ static int gue_udp_recv(struct sock *sk, struct sk_buff *skb) if (!pskb_may_pull(skb, len)) goto drop; - uh = udp_hdr(skb); - guehdr = (struct guehdr *)&uh[1]; + guehdr = (struct guehdr *)&udp_hdr(skb)[1]; + + optlen = guehdr->hlen << 2; + len += optlen; - len += guehdr->hlen << 2; if (!pskb_may_pull(skb, len)) goto drop; - uh = 
udp_hdr(skb); - guehdr = (struct guehdr *)&uh[1]; + /* guehdr may change after pull */ + guehdr = (struct guehdr *)&udp_hdr(skb)[1]; - if (guehdr->version != 0) - goto drop; + hdrlen = sizeof(struct guehdr) + optlen; - if (guehdr->flags) { - /* No support yet */ + if (guehdr->version != 0 || validate_gue_flags(guehdr, optlen)) goto drop; + + hdrlen = sizeof(struct guehdr) + optlen; + + ip_hdr(skb)->tot_len = htons(ntohs(ip_hdr(skb)->tot_len) - len); + + /* Pull csum through the guehdr now . This can be used if + * there is a remote checksum offload. + */ + skb_postpull_rcsum(skb, udp_hdr(skb), len); + + data = &guehdr[1]; + + if (guehdr->flags & GUE_FLAG_PRIV) { + __be32 flags = *(__be32 *)(data + doffset); + + doffset += GUE_LEN_PRIV; + + if (flags & GUE_PFLAG_REMCSUM) { + guehdr = gue_remcsum(skb, guehdr, data + doffset, + hdrlen, guehdr->proto_ctype); + if (!guehdr) + goto drop; + + data = &guehdr[1]; + + doffset += GUE_PLEN_REMCSUM; + } } - return fou_udp_encap_recv_deliver(skb, guehdr->next_hdr, len); + if (unlikely(guehdr->control)) + return gue_control_message(skb, guehdr); + + __skb_pull(skb, sizeof(struct udphdr) + hdrlen); + skb_reset_transport_header(skb); + + return -guehdr->proto_ctype; + drop: kfree_skb(skb); return 0; @@ -149,6 +218,41 @@ out_unlock: return err; } +static struct guehdr *gue_gro_remcsum(struct sk_buff *skb, unsigned int off, + struct guehdr *guehdr, void *data, + size_t hdrlen, u8 ipproto) +{ + __be16 *pd = data; + size_t start = ntohs(pd[0]); + size_t offset = ntohs(pd[1]); + size_t plen = hdrlen + max_t(size_t, offset + sizeof(u16), start); + __wsum delta; + + if (skb->remcsum_offload) + return guehdr; + + if (!NAPI_GRO_CB(skb)->csum_valid) + return NULL; + + /* Pull checksum that will be written */ + if (skb_gro_header_hard(skb, off + plen)) { + guehdr = skb_gro_header_slow(skb, off + plen, off); + if (!guehdr) + return NULL; + } + + delta = remcsum_adjust((void *)guehdr + hdrlen, + NAPI_GRO_CB(skb)->csum, start, offset); + + /* Adjust skb->csum since we changed the packet */ + skb->csum = csum_add(skb->csum, delta); + NAPI_GRO_CB(skb)->csum = csum_add(NAPI_GRO_CB(skb)->csum, delta); + + skb->remcsum_offload = 1; + + return guehdr; +} + static struct sk_buff **gue_gro_receive(struct sk_buff **head, struct sk_buff *skb) { @@ -156,38 +260,64 @@ static struct sk_buff **gue_gro_receive(struct sk_buff **head, const struct net_offload *ops; struct sk_buff **pp = NULL; struct sk_buff *p; - u8 proto; struct guehdr *guehdr; - unsigned int hlen, guehlen; - unsigned int off; + size_t len, optlen, hdrlen, off; + void *data; + u16 doffset = 0; int flush = 1; off = skb_gro_offset(skb); - hlen = off + sizeof(*guehdr); + len = off + sizeof(*guehdr); + guehdr = skb_gro_header_fast(skb, off); - if (skb_gro_header_hard(skb, hlen)) { - guehdr = skb_gro_header_slow(skb, hlen, off); + if (skb_gro_header_hard(skb, len)) { + guehdr = skb_gro_header_slow(skb, len, off); if (unlikely(!guehdr)) goto out; } - proto = guehdr->next_hdr; + optlen = guehdr->hlen << 2; + len += optlen; - rcu_read_lock(); - offloads = NAPI_GRO_CB(skb)->is_ipv6 ? 
inet6_offloads : inet_offloads; - ops = rcu_dereference(offloads[proto]); - if (WARN_ON(!ops || !ops->callbacks.gro_receive)) - goto out_unlock; + if (skb_gro_header_hard(skb, len)) { + guehdr = skb_gro_header_slow(skb, len, off); + if (unlikely(!guehdr)) + goto out; + } - guehlen = sizeof(*guehdr) + (guehdr->hlen << 2); + if (unlikely(guehdr->control) || guehdr->version != 0 || + validate_gue_flags(guehdr, optlen)) + goto out; - hlen = off + guehlen; - if (skb_gro_header_hard(skb, hlen)) { - guehdr = skb_gro_header_slow(skb, hlen, off); - if (unlikely(!guehdr)) - goto out_unlock; + hdrlen = sizeof(*guehdr) + optlen; + + /* Adjust NAPI_GRO_CB(skb)->csum to account for guehdr, + * this is needed if there is a remote checkcsum offload. + */ + skb_gro_postpull_rcsum(skb, guehdr, hdrlen); + + data = &guehdr[1]; + + if (guehdr->flags & GUE_FLAG_PRIV) { + __be32 flags = *(__be32 *)(data + doffset); + + doffset += GUE_LEN_PRIV; + + if (flags & GUE_PFLAG_REMCSUM) { + guehdr = gue_gro_remcsum(skb, off, guehdr, + data + doffset, hdrlen, + guehdr->proto_ctype); + if (!guehdr) + goto out; + + data = &guehdr[1]; + + doffset += GUE_PLEN_REMCSUM; + } } + skb_gro_pull(skb, hdrlen); + flush = 0; for (p = *head; p; p = p->next) { @@ -199,7 +329,7 @@ static struct sk_buff **gue_gro_receive(struct sk_buff **head, guehdr2 = (struct guehdr *)(p->data + off); /* Compare base GUE header to be equal (covers - * hlen, version, next_hdr, and flags. + * hlen, version, proto_ctype, and flags. */ if (guehdr->word != guehdr2->word) { NAPI_GRO_CB(p)->same_flow = 0; @@ -214,10 +344,11 @@ static struct sk_buff **gue_gro_receive(struct sk_buff **head, } } - skb_gro_pull(skb, guehlen); - - /* Adjusted NAPI_GRO_CB(skb)->csum after skb_gro_pull()*/ - skb_gro_postpull_rcsum(skb, guehdr, guehlen); + rcu_read_lock(); + offloads = NAPI_GRO_CB(skb)->is_ipv6 ? inet6_offloads : inet_offloads; + ops = rcu_dereference(offloads[guehdr->proto_ctype]); + if (WARN_ON(!ops || !ops->callbacks.gro_receive)) + goto out_unlock; pp = ops->callbacks.gro_receive(head, skb); @@ -238,7 +369,7 @@ static int gue_gro_complete(struct sk_buff *skb, int nhoff) u8 proto; int err = -ENOENT; - proto = guehdr->next_hdr; + proto = guehdr->proto_ctype; guehlen = sizeof(*guehdr) + (guehdr->hlen << 2); @@ -489,6 +620,200 @@ static const struct genl_ops fou_nl_ops[] = { }, }; +size_t fou_encap_hlen(struct ip_tunnel_encap *e) +{ + return sizeof(struct udphdr); +} +EXPORT_SYMBOL(fou_encap_hlen); + +size_t gue_encap_hlen(struct ip_tunnel_encap *e) +{ + size_t len; + bool need_priv = false; + + len = sizeof(struct udphdr) + sizeof(struct guehdr); + + if (e->flags & TUNNEL_ENCAP_FLAG_REMCSUM) { + len += GUE_PLEN_REMCSUM; + need_priv = true; + } + + len += need_priv ? GUE_LEN_PRIV : 0; + + return len; +} +EXPORT_SYMBOL(gue_encap_hlen); + +static void fou_build_udp(struct sk_buff *skb, struct ip_tunnel_encap *e, + struct flowi4 *fl4, u8 *protocol, __be16 sport) +{ + struct udphdr *uh; + + skb_push(skb, sizeof(struct udphdr)); + skb_reset_transport_header(skb); + + uh = udp_hdr(skb); + + uh->dest = e->dport; + uh->source = sport; + uh->len = htons(skb->len); + uh->check = 0; + udp_set_csum(!(e->flags & TUNNEL_ENCAP_FLAG_CSUM), skb, + fl4->saddr, fl4->daddr, skb->len); + + *protocol = IPPROTO_UDP; +} + +int fou_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e, + u8 *protocol, struct flowi4 *fl4) +{ + bool csum = !!(e->flags & TUNNEL_ENCAP_FLAG_CSUM); + int type = csum ? 
SKB_GSO_UDP_TUNNEL_CSUM : SKB_GSO_UDP_TUNNEL; + __be16 sport; + + skb = iptunnel_handle_offloads(skb, csum, type); + + if (IS_ERR(skb)) + return PTR_ERR(skb); + + sport = e->sport ? : udp_flow_src_port(dev_net(skb->dev), + skb, 0, 0, false); + fou_build_udp(skb, e, fl4, protocol, sport); + + return 0; +} +EXPORT_SYMBOL(fou_build_header); + +int gue_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e, + u8 *protocol, struct flowi4 *fl4) +{ + bool csum = !!(e->flags & TUNNEL_ENCAP_FLAG_CSUM); + int type = csum ? SKB_GSO_UDP_TUNNEL_CSUM : SKB_GSO_UDP_TUNNEL; + struct guehdr *guehdr; + size_t hdrlen, optlen = 0; + __be16 sport; + void *data; + bool need_priv = false; + + if ((e->flags & TUNNEL_ENCAP_FLAG_REMCSUM) && + skb->ip_summed == CHECKSUM_PARTIAL) { + csum = false; + optlen += GUE_PLEN_REMCSUM; + type |= SKB_GSO_TUNNEL_REMCSUM; + need_priv = true; + } + + optlen += need_priv ? GUE_LEN_PRIV : 0; + + skb = iptunnel_handle_offloads(skb, csum, type); + + if (IS_ERR(skb)) + return PTR_ERR(skb); + + /* Get source port (based on flow hash) before skb_push */ + sport = e->sport ? : udp_flow_src_port(dev_net(skb->dev), + skb, 0, 0, false); + + hdrlen = sizeof(struct guehdr) + optlen; + + skb_push(skb, hdrlen); + + guehdr = (struct guehdr *)skb->data; + + guehdr->control = 0; + guehdr->version = 0; + guehdr->hlen = optlen >> 2; + guehdr->flags = 0; + guehdr->proto_ctype = *protocol; + + data = &guehdr[1]; + + if (need_priv) { + __be32 *flags = data; + + guehdr->flags |= GUE_FLAG_PRIV; + *flags = 0; + data += GUE_LEN_PRIV; + + if (type & SKB_GSO_TUNNEL_REMCSUM) { + u16 csum_start = skb_checksum_start_offset(skb); + __be16 *pd = data; + + if (csum_start < hdrlen) + return -EINVAL; + + csum_start -= hdrlen; + pd[0] = htons(csum_start); + pd[1] = htons(csum_start + skb->csum_offset); + + if (!skb_is_gso(skb)) { + skb->ip_summed = CHECKSUM_NONE; + skb->encapsulation = 0; + } + + *flags |= GUE_PFLAG_REMCSUM; + data += GUE_PLEN_REMCSUM; + } + + } + + fou_build_udp(skb, e, fl4, protocol, sport); + + return 0; +} +EXPORT_SYMBOL(gue_build_header); + +#ifdef CONFIG_NET_FOU_IP_TUNNELS + +static const struct ip_tunnel_encap_ops __read_mostly fou_iptun_ops = { + .encap_hlen = fou_encap_hlen, + .build_header = fou_build_header, +}; + +static const struct ip_tunnel_encap_ops __read_mostly gue_iptun_ops = { + .encap_hlen = gue_encap_hlen, + .build_header = gue_build_header, +}; + +static int ip_tunnel_encap_add_fou_ops(void) +{ + int ret; + + ret = ip_tunnel_encap_add_ops(&fou_iptun_ops, TUNNEL_ENCAP_FOU); + if (ret < 0) { + pr_err("can't add fou ops\n"); + return ret; + } + + ret = ip_tunnel_encap_add_ops(&gue_iptun_ops, TUNNEL_ENCAP_GUE); + if (ret < 0) { + pr_err("can't add gue ops\n"); + ip_tunnel_encap_del_ops(&fou_iptun_ops, TUNNEL_ENCAP_FOU); + return ret; + } + + return 0; +} + +static void ip_tunnel_encap_del_fou_ops(void) +{ + ip_tunnel_encap_del_ops(&fou_iptun_ops, TUNNEL_ENCAP_FOU); + ip_tunnel_encap_del_ops(&gue_iptun_ops, TUNNEL_ENCAP_GUE); +} + +#else + +static int ip_tunnel_encap_add_fou_ops(void) +{ + return 0; +} + +static void ip_tunnel_encap_del_fou_ops(void) +{ +} + +#endif + static int __init fou_init(void) { int ret; @@ -496,6 +821,14 @@ static int __init fou_init(void) ret = genl_register_family_with_ops(&fou_nl_family, fou_nl_ops); + if (ret < 0) + goto exit; + + ret = ip_tunnel_encap_add_fou_ops(); + if (ret < 0) + genl_unregister_family(&fou_nl_family); + +exit: return ret; } @@ -503,6 +836,8 @@ static void __exit fou_fini(void) { struct fou *fou, *next; + 
ip_tunnel_encap_del_fou_ops(); + genl_unregister_family(&fou_nl_family); /* Close all the FOU sockets */ diff --git a/net/ipv4/geneve.c b/net/ipv4/geneve.c index dedb21e99914..a457232f0131 100644 --- a/net/ipv4/geneve.c +++ b/net/ipv4/geneve.c @@ -104,7 +104,7 @@ static void geneve_build_header(struct genevehdr *geneveh, memcpy(geneveh->options, options, options_len); } -/* Transmit a fully formated Geneve frame. +/* Transmit a fully formatted Geneve frame. * * When calling this function. The skb->data should point * to the geneve header which is fully formed. @@ -131,15 +131,9 @@ int geneve_xmit_skb(struct geneve_sock *gs, struct rtable *rt, if (unlikely(err)) return err; - if (vlan_tx_tag_present(skb)) { - if (unlikely(!__vlan_put_tag(skb, - skb->vlan_proto, - vlan_tx_tag_get(skb)))) { - err = -ENOMEM; - return err; - } - skb->vlan_tci = 0; - } + skb = vlan_hwaccel_push_inside(skb); + if (unlikely(!skb)) + return -ENOMEM; gnvh = (struct genevehdr *)__skb_push(skb, sizeof(*gnvh) + opt_len); geneve_build_header(gnvh, tun_flags, vni, opt_len, opt); diff --git a/net/ipv4/gre_offload.c b/net/ipv4/gre_offload.c index bb5947b0ce2d..51973ddc05a6 100644 --- a/net/ipv4/gre_offload.c +++ b/net/ipv4/gre_offload.c @@ -247,6 +247,9 @@ static int gre_gro_complete(struct sk_buff *skb, int nhoff) err = ptype->callbacks.gro_complete(skb, nhoff + grehlen); rcu_read_unlock(); + + skb_set_inner_mac_header(skb, nhoff + grehlen); + return err; } diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 5882f584910e..36f5584d93c5 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -190,7 +190,7 @@ EXPORT_SYMBOL(icmp_err_convert); */ struct icmp_control { - void (*handler)(struct sk_buff *skb); + bool (*handler)(struct sk_buff *skb); short error; /* This ICMP is classed as an error message */ }; @@ -746,7 +746,7 @@ static bool icmp_tag_validation(int proto) * ICMP_PARAMETERPROB. */ -static void icmp_unreach(struct sk_buff *skb) +static bool icmp_unreach(struct sk_buff *skb) { const struct iphdr *iph; struct icmphdr *icmph; @@ -784,8 +784,8 @@ static void icmp_unreach(struct sk_buff *skb) */ switch (net->ipv4.sysctl_ip_no_pmtu_disc) { default: - LIMIT_NETDEBUG(KERN_INFO pr_fmt("%pI4: fragmentation needed and DF set\n"), - &iph->daddr); + net_dbg_ratelimited("%pI4: fragmentation needed and DF set\n", + &iph->daddr); break; case 2: goto out; @@ -798,8 +798,8 @@ static void icmp_unreach(struct sk_buff *skb) } break; case ICMP_SR_FAILED: - LIMIT_NETDEBUG(KERN_INFO pr_fmt("%pI4: Source Route Failed\n"), - &iph->daddr); + net_dbg_ratelimited("%pI4: Source Route Failed\n", + &iph->daddr); break; default: break; @@ -839,10 +839,10 @@ static void icmp_unreach(struct sk_buff *skb) icmp_socket_deliver(skb, info); out: - return; + return true; out_err: ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS); - goto out; + return false; } @@ -850,17 +850,20 @@ out_err: * Handle ICMP_REDIRECT. */ -static void icmp_redirect(struct sk_buff *skb) +static bool icmp_redirect(struct sk_buff *skb) { if (skb->len < sizeof(struct iphdr)) { ICMP_INC_STATS_BH(dev_net(skb->dev), ICMP_MIB_INERRORS); - return; + return false; } - if (!pskb_may_pull(skb, sizeof(struct iphdr))) - return; + if (!pskb_may_pull(skb, sizeof(struct iphdr))) { + /* there aught to be a stat */ + return false; + } icmp_socket_deliver(skb, icmp_hdr(skb)->un.gateway); + return true; } /* @@ -875,7 +878,7 @@ static void icmp_redirect(struct sk_buff *skb) * See also WRT handling of options once they are done and working. 
*/ -static void icmp_echo(struct sk_buff *skb) +static bool icmp_echo(struct sk_buff *skb) { struct net *net; @@ -891,6 +894,8 @@ static void icmp_echo(struct sk_buff *skb) icmp_param.head_len = sizeof(struct icmphdr); icmp_reply(&icmp_param, skb); } + /* should there be an ICMP stat for ignored echos? */ + return true; } /* @@ -900,7 +905,7 @@ static void icmp_echo(struct sk_buff *skb) * MUST be accurate to a few minutes. * MUST be updated at least at 15Hz. */ -static void icmp_timestamp(struct sk_buff *skb) +static bool icmp_timestamp(struct sk_buff *skb) { struct timespec tv; struct icmp_bxm icmp_param; @@ -927,15 +932,17 @@ static void icmp_timestamp(struct sk_buff *skb) icmp_param.data_len = 0; icmp_param.head_len = sizeof(struct icmphdr) + 12; icmp_reply(&icmp_param, skb); -out: - return; + return true; + out_err: ICMP_INC_STATS_BH(dev_net(skb_dst(skb)->dev), ICMP_MIB_INERRORS); - goto out; + return false; } -static void icmp_discard(struct sk_buff *skb) +static bool icmp_discard(struct sk_buff *skb) { + /* pretend it was a success */ + return true; } /* @@ -946,6 +953,7 @@ int icmp_rcv(struct sk_buff *skb) struct icmphdr *icmph; struct rtable *rt = skb_rtable(skb); struct net *net = dev_net(rt->dst.dev); + bool success; if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) { struct sec_path *sp = skb_sec_path(skb); @@ -1012,7 +1020,12 @@ int icmp_rcv(struct sk_buff *skb) } } - icmp_pointers[icmph->type].handler(skb); + success = icmp_pointers[icmph->type].handler(skb); + + if (success) { + consume_skb(skb); + return 0; + } drop: kfree_skb(skb); diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index bb15d0e03d4f..666cf364df86 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -112,17 +112,17 @@ #ifdef CONFIG_IP_MULTICAST /* Parameter names and values are taken from igmp-v2-06 draft */ -#define IGMP_V1_Router_Present_Timeout (400*HZ) -#define IGMP_V2_Router_Present_Timeout (400*HZ) -#define IGMP_V2_Unsolicited_Report_Interval (10*HZ) -#define IGMP_V3_Unsolicited_Report_Interval (1*HZ) -#define IGMP_Query_Response_Interval (10*HZ) -#define IGMP_Query_Robustness_Variable 2 +#define IGMP_V1_ROUTER_PRESENT_TIMEOUT (400*HZ) +#define IGMP_V2_ROUTER_PRESENT_TIMEOUT (400*HZ) +#define IGMP_V2_UNSOLICITED_REPORT_INTERVAL (10*HZ) +#define IGMP_V3_UNSOLICITED_REPORT_INTERVAL (1*HZ) +#define IGMP_QUERY_RESPONSE_INTERVAL (10*HZ) +#define IGMP_QUERY_ROBUSTNESS_VARIABLE 2 -#define IGMP_Initial_Report_Delay (1) +#define IGMP_INITIAL_REPORT_DELAY (1) -/* IGMP_Initial_Report_Delay is not from IGMP specs! +/* IGMP_INITIAL_REPORT_DELAY is not from IGMP specs! * IGMP specs require to report membership immediately after * joining a group, but we delay the first report by a * small interval. It seems more natural and still does not @@ -878,15 +878,15 @@ static bool igmp_heard_query(struct in_device *in_dev, struct sk_buff *skb, if (ih->code == 0) { /* Alas, old v1 router presents here. 
*/ - max_delay = IGMP_Query_Response_Interval; + max_delay = IGMP_QUERY_RESPONSE_INTERVAL; in_dev->mr_v1_seen = jiffies + - IGMP_V1_Router_Present_Timeout; + IGMP_V1_ROUTER_PRESENT_TIMEOUT; group = 0; } else { /* v2 router present */ max_delay = ih->code*(HZ/IGMP_TIMER_SCALE); in_dev->mr_v2_seen = jiffies + - IGMP_V2_Router_Present_Timeout; + IGMP_V2_ROUTER_PRESENT_TIMEOUT; } /* cancel the interface change timer */ in_dev->mr_ifc_count = 0; @@ -898,7 +898,7 @@ static bool igmp_heard_query(struct in_device *in_dev, struct sk_buff *skb, return true; /* ignore bogus packet; freed by caller */ } else if (IGMP_V1_SEEN(in_dev)) { /* This is a v3 query with v1 queriers present */ - max_delay = IGMP_Query_Response_Interval; + max_delay = IGMP_QUERY_RESPONSE_INTERVAL; group = 0; } else if (IGMP_V2_SEEN(in_dev)) { /* this is a v3 query with v2 queriers present; @@ -1217,7 +1217,7 @@ static void igmp_group_added(struct ip_mc_list *im) return; if (IGMP_V1_SEEN(in_dev) || IGMP_V2_SEEN(in_dev)) { spin_lock_bh(&im->lock); - igmp_start_timer(im, IGMP_Initial_Report_Delay); + igmp_start_timer(im, IGMP_INITIAL_REPORT_DELAY); spin_unlock_bh(&im->lock); return; } @@ -1540,7 +1540,7 @@ static struct in_device *ip_mc_find_dev(struct net *net, struct ip_mreqn *imr) int sysctl_igmp_max_memberships __read_mostly = IP_MAX_MEMBERSHIPS; int sysctl_igmp_max_msf __read_mostly = IP_MAX_MSF; #ifdef CONFIG_IP_MULTICAST -int sysctl_igmp_qrv __read_mostly = IGMP_Query_Robustness_Variable; +int sysctl_igmp_qrv __read_mostly = IGMP_QUERY_ROBUSTNESS_VARIABLE; #endif static int ip_mc_del1_src(struct ip_mc_list *pmc, int sfmode, @@ -2686,11 +2686,7 @@ static int igmp_mcf_seq_show(struct seq_file *seq, void *v) struct igmp_mcf_iter_state *state = igmp_mcf_seq_private(seq); if (v == SEQ_START_TOKEN) { - seq_printf(seq, - "%3s %6s " - "%10s %10s %6s %6s\n", "Idx", - "Device", "MCA", - "SRC", "INC", "EXC"); + seq_puts(seq, "Idx Device MCA SRC INC EXC\n"); } else { seq_printf(seq, "%3d %6.6s 0x%08x " diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c index 19419b60cb37..e7920352646a 100644 --- a/net/ipv4/inet_fragment.c +++ b/net/ipv4/inet_fragment.c @@ -458,6 +458,6 @@ void inet_frag_maybe_warn_overflow(struct inet_frag_queue *q, ". Dropping fragment.\n"; if (PTR_ERR(q) == -ENOBUFS) - LIMIT_NETDEBUG(KERN_WARNING "%s%s", prefix, msg); + net_dbg_ratelimited("%s%s", prefix, msg); } EXPORT_SYMBOL(inet_frag_maybe_warn_overflow); diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index 2811cc18701a..e5b6d0ddcb58 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -80,7 +80,7 @@ struct ipq { struct inet_peer *peer; }; -static inline u8 ip4_frag_ecn(u8 tos) +static u8 ip4_frag_ecn(u8 tos) { return 1 << (tos & INET_ECN_MASK); } @@ -148,7 +148,7 @@ static void ip4_frag_init(struct inet_frag_queue *q, const void *a) inet_getpeer_v4(net->ipv4.peers, arg->iph->saddr, 1) : NULL; } -static __inline__ void ip4_frag_free(struct inet_frag_queue *q) +static void ip4_frag_free(struct inet_frag_queue *q) { struct ipq *qp; @@ -160,7 +160,7 @@ static __inline__ void ip4_frag_free(struct inet_frag_queue *q) /* Destruction primitives. */ -static __inline__ void ipq_put(struct ipq *ipq) +static void ipq_put(struct ipq *ipq) { inet_frag_put(&ipq->q, &ip4_frags); } @@ -236,7 +236,7 @@ out: /* Find the correct entry in the "incomplete datagrams" queue for * this IP datagram, and create new one, if nothing is found. 
*/ -static inline struct ipq *ip_find(struct net *net, struct iphdr *iph, u32 user) +static struct ipq *ip_find(struct net *net, struct iphdr *iph, u32 user) { struct inet_frag_queue *q; struct ip4_create_arg arg; @@ -256,7 +256,7 @@ static inline struct ipq *ip_find(struct net *net, struct iphdr *iph, u32 user) } /* Is the fragment too far ahead to be part of ipq? */ -static inline int ip_frag_too_far(struct ipq *qp) +static int ip_frag_too_far(struct ipq *qp) { struct inet_peer *peer = qp->peer; unsigned int max = sysctl_ipfrag_max_dist; @@ -618,8 +618,7 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev, return 0; out_nomem: - LIMIT_NETDEBUG(KERN_ERR pr_fmt("queue_glue: no memory for gluing queue %p\n"), - qp); + net_dbg_ratelimited("queue_glue: no memory for gluing queue %p\n", qp); err = -ENOMEM; goto out_fail; out_oversize: @@ -795,16 +794,16 @@ static void __init ip4_frags_ctl_register(void) register_net_sysctl(&init_net, "net/ipv4", ip4_frags_ctl_table); } #else -static inline int ip4_frags_ns_ctl_register(struct net *net) +static int ip4_frags_ns_ctl_register(struct net *net) { return 0; } -static inline void ip4_frags_ns_ctl_unregister(struct net *net) +static void ip4_frags_ns_ctl_unregister(struct net *net) { } -static inline void __init ip4_frags_ctl_register(void) +static void __init ip4_frags_ctl_register(void) { } #endif diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 12055fdbe716..ac8491245e5b 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -789,7 +789,7 @@ static int ipgre_fill_info(struct sk_buff *skb, const struct net_device *dev) nla_put_u16(skb, IFLA_GRE_ENCAP_DPORT, t->encap.dport) || nla_put_u16(skb, IFLA_GRE_ENCAP_FLAGS, - t->encap.dport)) + t->encap.flags)) goto nla_put_failure; return 0; diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index bc6471d4abcd..b50861b22b6b 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -662,12 +662,10 @@ slow_path: if (len < left) { len &= ~7; } - /* - * Allocate buffer. 
- */ - if ((skb2 = alloc_skb(len+hlen+ll_rs, GFP_ATOMIC)) == NULL) { - NETDEBUG(KERN_INFO "IP: frag: no memory for new fragment!\n"); + /* Allocate buffer */ + skb2 = alloc_skb(len + hlen + ll_rs, GFP_ATOMIC); + if (!skb2) { err = -ENOMEM; goto fail; } @@ -754,14 +752,16 @@ EXPORT_SYMBOL(ip_fragment); int ip_generic_getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb) { - struct iovec *iov = from; + struct msghdr *msg = from; if (skb->ip_summed == CHECKSUM_PARTIAL) { - if (memcpy_fromiovecend(to, iov, offset, len) < 0) + /* XXX: stripping const */ + if (memcpy_fromiovecend(to, (struct iovec *)msg->msg_iter.iov, offset, len) < 0) return -EFAULT; } else { __wsum csum = 0; - if (csum_partial_copy_fromiovecend(to, iov, offset, len, &csum) < 0) + /* XXX: stripping const */ + if (csum_partial_copy_fromiovecend(to, (struct iovec *)msg->msg_iter.iov, offset, len, &csum) < 0) return -EFAULT; skb->csum = csum_block_add(skb->csum, csum, odd); } diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index 9daf2177dc00..8a89c738b7a3 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -192,7 +192,7 @@ int ip_cmsg_send(struct net *net, struct msghdr *msg, struct ipcm_cookie *ipc, int err, val; struct cmsghdr *cmsg; - for (cmsg = CMSG_FIRSTHDR(msg); cmsg; cmsg = CMSG_NXTHDR(msg, cmsg)) { + for_each_cmsghdr(cmsg, msg) { if (!CMSG_OK(msg, cmsg)) return -EINVAL; #if IS_ENABLED(CONFIG_IPV6) @@ -399,6 +399,22 @@ void ip_local_error(struct sock *sk, int err, __be32 daddr, __be16 port, u32 inf kfree_skb(skb); } +static bool ipv4_pktinfo_prepare_errqueue(const struct sock *sk, + const struct sk_buff *skb, + int ee_origin) +{ + struct in_pktinfo *info = PKTINFO_SKB_CB(skb); + + if ((ee_origin != SO_EE_ORIGIN_TIMESTAMPING) || + (!(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_CMSG)) || + (!skb->dev)) + return false; + + info->ipi_spec_dst.s_addr = ip_hdr(skb)->saddr; + info->ipi_ifindex = skb->dev->ifindex; + return true; +} + /* * Handle MSG_ERRQUEUE */ @@ -414,6 +430,8 @@ int ip_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len) int err; int copied; + WARN_ON_ONCE(sk->sk_family == AF_INET6); + err = -EAGAIN; skb = sock_dequeue_err_skb(sk); if (skb == NULL) @@ -424,7 +442,7 @@ int ip_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len) msg->msg_flags |= MSG_TRUNC; copied = len; } - err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); + err = skb_copy_datagram_msg(skb, 0, msg, copied); if (err) goto out_free_skb; @@ -444,7 +462,9 @@ int ip_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len) memcpy(&errhdr.ee, &serr->ee, sizeof(struct sock_extended_err)); sin = &errhdr.offender; sin->sin_family = AF_UNSPEC; - if (serr->ee.ee_origin == SO_EE_ORIGIN_ICMP) { + + if (serr->ee.ee_origin == SO_EE_ORIGIN_ICMP || + ipv4_pktinfo_prepare_errqueue(sk, skb, serr->ee.ee_origin)) { struct inet_sock *inet = inet_sk(sk); sin->sin_family = AF_INET; @@ -1049,7 +1069,7 @@ e_inval: } /** - * ipv4_pktinfo_prepare - transfert some info from rtable to skb + * ipv4_pktinfo_prepare - transfer some info from rtable to skb * @sk: socket * @skb: buffer * diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c index 0bb8e141eacc..63e745aadab6 100644 --- a/net/ipv4/ip_tunnel.c +++ b/net/ipv4/ip_tunnel.c @@ -56,7 +56,6 @@ #include <net/netns/generic.h> #include <net/rtnetlink.h> #include <net/udp.h> -#include <net/gue.h> #if IS_ENABLED(CONFIG_IPV6) #include <net/ipv6.h> @@ -491,18 +490,51 @@ EXPORT_SYMBOL_GPL(ip_tunnel_rcv); static int 
ip_encap_hlen(struct ip_tunnel_encap *e) { - switch (e->type) { - case TUNNEL_ENCAP_NONE: + const struct ip_tunnel_encap_ops *ops; + int hlen = -EINVAL; + + if (e->type == TUNNEL_ENCAP_NONE) return 0; - case TUNNEL_ENCAP_FOU: - return sizeof(struct udphdr); - case TUNNEL_ENCAP_GUE: - return sizeof(struct udphdr) + sizeof(struct guehdr); - default: + + if (e->type >= MAX_IPTUN_ENCAP_OPS) return -EINVAL; - } + + rcu_read_lock(); + ops = rcu_dereference(iptun_encaps[e->type]); + if (likely(ops && ops->encap_hlen)) + hlen = ops->encap_hlen(e); + rcu_read_unlock(); + + return hlen; } +const struct ip_tunnel_encap_ops __rcu * + iptun_encaps[MAX_IPTUN_ENCAP_OPS] __read_mostly; + +int ip_tunnel_encap_add_ops(const struct ip_tunnel_encap_ops *ops, + unsigned int num) +{ + return !cmpxchg((const struct ip_tunnel_encap_ops **) + &iptun_encaps[num], + NULL, ops) ? 0 : -1; +} +EXPORT_SYMBOL(ip_tunnel_encap_add_ops); + +int ip_tunnel_encap_del_ops(const struct ip_tunnel_encap_ops *ops, + unsigned int num) +{ + int ret; + + ret = (cmpxchg((const struct ip_tunnel_encap_ops **) + &iptun_encaps[num], + ops, NULL) == ops) ? 0 : -1; + + synchronize_net(); + + return ret; +} +EXPORT_SYMBOL(ip_tunnel_encap_del_ops); + int ip_tunnel_encap_setup(struct ip_tunnel *t, struct ip_tunnel_encap *ipencap) { @@ -526,63 +558,22 @@ int ip_tunnel_encap_setup(struct ip_tunnel *t, } EXPORT_SYMBOL_GPL(ip_tunnel_encap_setup); -static int fou_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e, - size_t hdr_len, u8 *protocol, struct flowi4 *fl4) -{ - struct udphdr *uh; - __be16 sport; - bool csum = !!(e->flags & TUNNEL_ENCAP_FLAG_CSUM); - int type = csum ? SKB_GSO_UDP_TUNNEL_CSUM : SKB_GSO_UDP_TUNNEL; - - skb = iptunnel_handle_offloads(skb, csum, type); - - if (IS_ERR(skb)) - return PTR_ERR(skb); - - /* Get length and hash before making space in skb */ - - sport = e->sport ? : udp_flow_src_port(dev_net(skb->dev), - skb, 0, 0, false); - - skb_push(skb, hdr_len); - - skb_reset_transport_header(skb); - uh = udp_hdr(skb); - - if (e->type == TUNNEL_ENCAP_GUE) { - struct guehdr *guehdr = (struct guehdr *)&uh[1]; - - guehdr->version = 0; - guehdr->hlen = 0; - guehdr->flags = 0; - guehdr->next_hdr = *protocol; - } - - uh->dest = e->dport; - uh->source = sport; - uh->len = htons(skb->len); - uh->check = 0; - udp_set_csum(!(e->flags & TUNNEL_ENCAP_FLAG_CSUM), skb, - fl4->saddr, fl4->daddr, skb->len); - - *protocol = IPPROTO_UDP; - - return 0; -} - int ip_tunnel_encap(struct sk_buff *skb, struct ip_tunnel *t, u8 *protocol, struct flowi4 *fl4) { - switch (t->encap.type) { - case TUNNEL_ENCAP_NONE: + const struct ip_tunnel_encap_ops *ops; + int ret = -EINVAL; + + if (t->encap.type == TUNNEL_ENCAP_NONE) return 0; - case TUNNEL_ENCAP_FOU: - case TUNNEL_ENCAP_GUE: - return fou_build_header(skb, &t->encap, t->encap_hlen, - protocol, fl4); - default: - return -EINVAL; - } + + rcu_read_lock(); + ops = rcu_dereference(iptun_encaps[t->encap.type]); + if (likely(ops && ops->build_header)) + ret = ops->build_header(skb, &t->encap, protocol, fl4); + rcu_read_unlock(); + + return ret; } EXPORT_SYMBOL(ip_tunnel_encap); diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c index 648fa1490ea7..7fa18bc7e47f 100644 --- a/net/ipv4/ipconfig.c +++ b/net/ipv4/ipconfig.c @@ -115,7 +115,7 @@ */ int ic_set_manually __initdata = 0; /* IPconfig parameters set manually */ -static int ic_enable __initdata = 0; /* IP config enabled? */ +static int ic_enable __initdata; /* IP config enabled? 
*/ /* Protocol choice */ int ic_proto_enabled __initdata = 0 @@ -130,7 +130,7 @@ int ic_proto_enabled __initdata = 0 #endif ; -static int ic_host_name_set __initdata = 0; /* Host name set by us? */ +static int ic_host_name_set __initdata; /* Host name set by us? */ __be32 ic_myaddr = NONE; /* My IP address */ static __be32 ic_netmask = NONE; /* Netmask for local subnet */ @@ -160,17 +160,17 @@ static u8 ic_domain[64]; /* DNS (not NIS) domain name */ static char user_dev_name[IFNAMSIZ] __initdata = { 0, }; /* Protocols supported by available interfaces */ -static int ic_proto_have_if __initdata = 0; +static int ic_proto_have_if __initdata; /* MTU for boot device */ -static int ic_dev_mtu __initdata = 0; +static int ic_dev_mtu __initdata; #ifdef IPCONFIG_DYNAMIC static DEFINE_SPINLOCK(ic_recv_lock); -static volatile int ic_got_reply __initdata = 0; /* Proto(s) that replied */ +static volatile int ic_got_reply __initdata; /* Proto(s) that replied */ #endif #ifdef IPCONFIG_DHCP -static int ic_dhcp_msgtype __initdata = 0; /* DHCP msg type received */ +static int ic_dhcp_msgtype __initdata; /* DHCP msg type received */ #endif @@ -186,8 +186,8 @@ struct ic_device { __be32 xid; }; -static struct ic_device *ic_first_dev __initdata = NULL;/* List of open device */ -static struct net_device *ic_dev __initdata = NULL; /* Selected device */ +static struct ic_device *ic_first_dev __initdata; /* List of open device */ +static struct net_device *ic_dev __initdata; /* Selected device */ static bool __init ic_is_init_dev(struct net_device *dev) { @@ -498,7 +498,7 @@ ic_rarp_recv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt struct arphdr *rarp; unsigned char *rarp_ptr; __be32 sip, tip; - unsigned char *sha, *tha; /* s for "source", t for "target" */ + unsigned char *tha; /* t for "target" */ struct ic_device *d; if (!net_eq(dev_net(dev), &init_net)) @@ -549,7 +549,6 @@ ic_rarp_recv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt goto drop_unlock; /* should never happen */ /* Extract variable-width fields */ - sha = rarp_ptr; rarp_ptr += dev->addr_len; memcpy(&sip, rarp_ptr, 4); rarp_ptr += 4; diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c index 37096d64730e..40403114f00a 100644 --- a/net/ipv4/ipip.c +++ b/net/ipv4/ipip.c @@ -465,7 +465,7 @@ static int ipip_fill_info(struct sk_buff *skb, const struct net_device *dev) nla_put_u16(skb, IFLA_IPTUN_ENCAP_DPORT, tunnel->encap.dport) || nla_put_u16(skb, IFLA_IPTUN_ENCAP_FLAGS, - tunnel->encap.dport)) + tunnel->encap.flags)) goto nla_put_failure; return 0; diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig index 4c019d5c3f57..59f883d9cadf 100644 --- a/net/ipv4/netfilter/Kconfig +++ b/net/ipv4/netfilter/Kconfig @@ -113,6 +113,15 @@ config NFT_MASQ_IPV4 This is the expression that provides IPv4 masquerading support for nf_tables. +config NFT_REDIR_IPV4 + tristate "IPv4 redirect support for nf_tables" + depends on NF_TABLES_IPV4 + depends on NFT_REDIR + select NF_NAT_REDIRECT + help + This is the expression that provides IPv4 redirect support for + nf_tables. 
+ config NF_NAT_SNMP_BASIC tristate "Basic SNMP-ALG support" depends on NF_CONNTRACK_SNMP diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile index f4cef5af0969..7fe6c703528f 100644 --- a/net/ipv4/netfilter/Makefile +++ b/net/ipv4/netfilter/Makefile @@ -40,6 +40,7 @@ obj-$(CONFIG_NFT_CHAIN_ROUTE_IPV4) += nft_chain_route_ipv4.o obj-$(CONFIG_NFT_CHAIN_NAT_IPV4) += nft_chain_nat_ipv4.o obj-$(CONFIG_NFT_REJECT_IPV4) += nft_reject_ipv4.o obj-$(CONFIG_NFT_MASQ_IPV4) += nft_masq_ipv4.o +obj-$(CONFIG_NFT_REDIR_IPV4) += nft_redir_ipv4.o obj-$(CONFIG_NF_TABLES_ARP) += nf_tables_arp.o # generic IP tables diff --git a/net/ipv4/netfilter/nf_log_arp.c b/net/ipv4/netfilter/nf_log_arp.c index ccfc78db12ee..d059182c1466 100644 --- a/net/ipv4/netfilter/nf_log_arp.c +++ b/net/ipv4/netfilter/nf_log_arp.c @@ -10,6 +10,7 @@ * it under the terms of the GNU General Public License version 2 as * published by the Free Software Foundation. */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> #include <linux/spinlock.h> @@ -74,12 +75,12 @@ static void dump_arp_packet(struct nf_log_buf *m, ap->mac_src, ap->ip_src, ap->mac_dst, ap->ip_dst); } -void nf_log_arp_packet(struct net *net, u_int8_t pf, - unsigned int hooknum, const struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - const struct nf_loginfo *loginfo, - const char *prefix) +static void nf_log_arp_packet(struct net *net, u_int8_t pf, + unsigned int hooknum, const struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + const struct nf_loginfo *loginfo, + const char *prefix) { struct nf_log_buf *m; @@ -130,8 +131,17 @@ static int __init nf_log_arp_init(void) if (ret < 0) return ret; - nf_log_register(NFPROTO_ARP, &nf_arp_logger); + ret = nf_log_register(NFPROTO_ARP, &nf_arp_logger); + if (ret < 0) { + pr_err("failed to register logger\n"); + goto err1; + } + return 0; + +err1: + unregister_pernet_subsys(&nf_log_arp_net_ops); + return ret; } static void __exit nf_log_arp_exit(void) diff --git a/net/ipv4/netfilter/nf_log_ipv4.c b/net/ipv4/netfilter/nf_log_ipv4.c index 078bdca1b607..75101980eeee 100644 --- a/net/ipv4/netfilter/nf_log_ipv4.c +++ b/net/ipv4/netfilter/nf_log_ipv4.c @@ -5,6 +5,7 @@ * it under the terms of the GNU General Public License version 2 as * published by the Free Software Foundation. 
*/ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> #include <linux/spinlock.h> @@ -366,8 +367,17 @@ static int __init nf_log_ipv4_init(void) if (ret < 0) return ret; - nf_log_register(NFPROTO_IPV4, &nf_ip_logger); + ret = nf_log_register(NFPROTO_IPV4, &nf_ip_logger); + if (ret < 0) { + pr_err("failed to register logger\n"); + goto err1; + } + return 0; + +err1: + unregister_pernet_subsys(&nf_log_ipv4_net_ops); + return ret; } static void __exit nf_log_ipv4_exit(void) diff --git a/net/ipv4/netfilter/nf_reject_ipv4.c b/net/ipv4/netfilter/nf_reject_ipv4.c index 1baaa83dfe5c..536da7bc598a 100644 --- a/net/ipv4/netfilter/nf_reject_ipv4.c +++ b/net/ipv4/netfilter/nf_reject_ipv4.c @@ -11,6 +11,7 @@ #include <net/tcp.h> #include <net/route.h> #include <net/dst.h> +#include <net/netfilter/ipv4/nf_reject.h> #include <linux/netfilter_ipv4.h> #include <net/netfilter/ipv4/nf_reject.h> diff --git a/net/ipv4/netfilter/nft_redir_ipv4.c b/net/ipv4/netfilter/nft_redir_ipv4.c new file mode 100644 index 000000000000..ff2d23d8c87a --- /dev/null +++ b/net/ipv4/netfilter/nft_redir_ipv4.c @@ -0,0 +1,77 @@ +/* + * Copyright (c) 2014 Arturo Borrero Gonzalez <arturo.borrero.glez@gmail.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/netlink.h> +#include <linux/netfilter.h> +#include <linux/netfilter/nf_tables.h> +#include <net/netfilter/nf_tables.h> +#include <net/netfilter/nf_nat.h> +#include <net/netfilter/nf_nat_redirect.h> +#include <net/netfilter/nft_redir.h> + +static void nft_redir_ipv4_eval(const struct nft_expr *expr, + struct nft_data data[NFT_REG_MAX + 1], + const struct nft_pktinfo *pkt) +{ + struct nft_redir *priv = nft_expr_priv(expr); + struct nf_nat_ipv4_multi_range_compat mr; + unsigned int verdict; + + memset(&mr, 0, sizeof(mr)); + if (priv->sreg_proto_min) { + mr.range[0].min.all = (__force __be16) + data[priv->sreg_proto_min].data[0]; + mr.range[0].max.all = (__force __be16) + data[priv->sreg_proto_max].data[0]; + mr.range[0].flags |= NF_NAT_RANGE_PROTO_SPECIFIED; + } + + mr.range[0].flags |= priv->flags; + + verdict = nf_nat_redirect_ipv4(pkt->skb, &mr, pkt->ops->hooknum); + data[NFT_REG_VERDICT].verdict = verdict; +} + +static struct nft_expr_type nft_redir_ipv4_type; +static const struct nft_expr_ops nft_redir_ipv4_ops = { + .type = &nft_redir_ipv4_type, + .size = NFT_EXPR_SIZE(sizeof(struct nft_redir)), + .eval = nft_redir_ipv4_eval, + .init = nft_redir_init, + .dump = nft_redir_dump, + .validate = nft_redir_validate, +}; + +static struct nft_expr_type nft_redir_ipv4_type __read_mostly = { + .family = NFPROTO_IPV4, + .name = "redir", + .ops = &nft_redir_ipv4_ops, + .policy = nft_redir_policy, + .maxattr = NFTA_REDIR_MAX, + .owner = THIS_MODULE, +}; + +static int __init nft_redir_ipv4_module_init(void) +{ + return nft_register_expr(&nft_redir_ipv4_type); +} + +static void __exit nft_redir_ipv4_module_exit(void) +{ + nft_unregister_expr(&nft_redir_ipv4_type); +} + +module_init(nft_redir_ipv4_module_init); +module_exit(nft_redir_ipv4_module_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Arturo Borrero Gonzalez <arturo.borrero.glez@gmail.com>"); +MODULE_ALIAS_NFT_AF_EXPR(AF_INET, "redir"); diff --git a/net/ipv4/netfilter/nft_reject_ipv4.c b/net/ipv4/netfilter/nft_reject_ipv4.c index ed33299c56d1..d729542bd1b7 100644 --- 
a/net/ipv4/netfilter/nft_reject_ipv4.c +++ b/net/ipv4/netfilter/nft_reject_ipv4.c @@ -19,9 +19,9 @@ #include <net/netfilter/ipv4/nf_reject.h> #include <net/netfilter/nft_reject.h> -void nft_reject_ipv4_eval(const struct nft_expr *expr, - struct nft_data data[NFT_REG_MAX + 1], - const struct nft_pktinfo *pkt) +static void nft_reject_ipv4_eval(const struct nft_expr *expr, + struct nft_data data[NFT_REG_MAX + 1], + const struct nft_pktinfo *pkt) { struct nft_reject *priv = nft_expr_priv(expr); @@ -36,7 +36,6 @@ void nft_reject_ipv4_eval(const struct nft_expr *expr, data[NFT_REG_VERDICT].verdict = NF_DROP; } -EXPORT_SYMBOL_GPL(nft_reject_ipv4_eval); static struct nft_expr_type nft_reject_ipv4_type; static const struct nft_expr_ops nft_reject_ipv4_ops = { diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c index 5d740cccf69e..c0d82f78d364 100644 --- a/net/ipv4/ping.c +++ b/net/ipv4/ping.c @@ -662,7 +662,7 @@ int ping_common_sendmsg(int family, struct msghdr *msg, size_t len, * Fetch the ICMP header provided by the userland. * iovec is modified! The ICMP header is consumed. */ - if (memcpy_fromiovec(user_icmph, msg->msg_iov, icmph_len)) + if (memcpy_from_msg(user_icmph, msg, icmph_len)) return -EFAULT; if (family == AF_INET) { @@ -811,7 +811,8 @@ back_from_confirm: pfh.icmph.checksum = 0; pfh.icmph.un.echo.id = inet->inet_sport; pfh.icmph.un.echo.sequence = user_icmph.un.echo.sequence; - pfh.iov = msg->msg_iov; + /* XXX: stripping const */ + pfh.iov = (struct iovec *)msg->msg_iter.iov; pfh.wcheck = 0; pfh.family = AF_INET; @@ -869,7 +870,7 @@ int ping_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, } /* Don't bother checking the checksum */ - err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); + err = skb_copy_datagram_msg(skb, 0, msg, copied); if (err) goto done; @@ -949,7 +950,7 @@ EXPORT_SYMBOL_GPL(ping_queue_rcv_skb); * All we need to do is get the socket. */ -void ping_rcv(struct sk_buff *skb) +bool ping_rcv(struct sk_buff *skb) { struct sock *sk; struct net *net = dev_net(skb->dev); @@ -968,11 +969,11 @@ void ping_rcv(struct sk_buff *skb) pr_debug("rcv on socket %p\n", sk); ping_queue_rcv_skb(sk, skb_get(skb)); sock_put(sk); - return; + return true; } pr_debug("no socket, dropping\n"); - /* We're called from icmp_rcv(). kfree_skb() is done there. 
*/ + return false; } EXPORT_SYMBOL_GPL(ping_rcv); diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index 8e3eb39f84e7..8f9cd200ce20 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -181,6 +181,7 @@ static const struct snmp_mib snmp4_udp_list[] = { SNMP_MIB_ITEM("RcvbufErrors", UDP_MIB_RCVBUFERRORS), SNMP_MIB_ITEM("SndbufErrors", UDP_MIB_SNDBUFERRORS), SNMP_MIB_ITEM("InCsumErrors", UDP_MIB_CSUMERRORS), + SNMP_MIB_ITEM("IgnoredMulti", UDP_MIB_IGNOREDMULTI), SNMP_MIB_SENTINEL }; @@ -287,6 +288,10 @@ static const struct snmp_mib snmp4_net_list[] = { SNMP_MIB_ITEM("TCPWantZeroWindowAdv", LINUX_MIB_TCPWANTZEROWINDOWADV), SNMP_MIB_ITEM("TCPSynRetrans", LINUX_MIB_TCPSYNRETRANS), SNMP_MIB_ITEM("TCPOrigDataSent", LINUX_MIB_TCPORIGDATASENT), + SNMP_MIB_ITEM("TCPHystartTrainDetect", LINUX_MIB_TCPHYSTARTTRAINDETECT), + SNMP_MIB_ITEM("TCPHystartTrainCwnd", LINUX_MIB_TCPHYSTARTTRAINCWND), + SNMP_MIB_ITEM("TCPHystartDelayDetect", LINUX_MIB_TCPHYSTARTDELAYDETECT), + SNMP_MIB_ITEM("TCPHystartDelayCwnd", LINUX_MIB_TCPHYSTARTDELAYCWND), SNMP_MIB_SENTINEL }; @@ -296,12 +301,12 @@ static void icmpmsg_put_line(struct seq_file *seq, unsigned long *vals, int j; if (count) { - seq_printf(seq, "\nIcmpMsg:"); + seq_puts(seq, "\nIcmpMsg:"); for (j = 0; j < count; ++j) seq_printf(seq, " %sType%u", type[j] & 0x100 ? "Out" : "In", type[j] & 0xff); - seq_printf(seq, "\nIcmpMsg:"); + seq_puts(seq, "\nIcmpMsg:"); for (j = 0; j < count; ++j) seq_printf(seq, " %lu", vals[j]); } @@ -342,7 +347,7 @@ static void icmp_put(struct seq_file *seq) seq_puts(seq, "\nIcmp: InMsgs InErrors InCsumErrors"); for (i = 0; icmpmibmap[i].name != NULL; i++) seq_printf(seq, " In%s", icmpmibmap[i].name); - seq_printf(seq, " OutMsgs OutErrors"); + seq_puts(seq, " OutMsgs OutErrors"); for (i = 0; icmpmibmap[i].name != NULL; i++) seq_printf(seq, " Out%s", icmpmibmap[i].name); seq_printf(seq, "\nIcmp: %lu %lu %lu", diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index 739db3100c23..0bb68df5055d 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -79,6 +79,16 @@ #include <linux/netfilter.h> #include <linux/netfilter_ipv4.h> #include <linux/compat.h> +#include <linux/uio.h> + +struct raw_frag_vec { + struct msghdr *msg; + union { + struct icmphdr icmph; + char c[1]; + } hdr; + int hlen; +}; static struct raw_hashinfo raw_v4_hashinfo = { .lock = __RW_LOCK_UNLOCKED(raw_v4_hashinfo.lock), @@ -420,53 +430,57 @@ error: return err; } -static int raw_probe_proto_opt(struct flowi4 *fl4, struct msghdr *msg) +static int raw_probe_proto_opt(struct raw_frag_vec *rfv, struct flowi4 *fl4) { - struct iovec *iov; - u8 __user *type = NULL; - u8 __user *code = NULL; - int probed = 0; - unsigned int i; + int err; - if (!msg->msg_iov) + if (fl4->flowi4_proto != IPPROTO_ICMP) return 0; - for (i = 0; i < msg->msg_iovlen; i++) { - iov = &msg->msg_iov[i]; - if (!iov) - continue; - - switch (fl4->flowi4_proto) { - case IPPROTO_ICMP: - /* check if one-byte field is readable or not. */ - if (iov->iov_base && iov->iov_len < 1) - break; - - if (!type) { - type = iov->iov_base; - /* check if code field is readable or not. */ - if (iov->iov_len > 1) - code = type + 1; - } else if (!code) - code = iov->iov_base; - - if (type && code) { - if (get_user(fl4->fl4_icmp_type, type) || - get_user(fl4->fl4_icmp_code, code)) - return -EFAULT; - probed = 1; - } - break; - default: - probed = 1; - break; - } - if (probed) - break; - } + /* We only need the first two bytes. 
*/ + rfv->hlen = 2; + + err = memcpy_from_msg(rfv->hdr.c, rfv->msg, rfv->hlen); + if (err) + return err; + + fl4->fl4_icmp_type = rfv->hdr.icmph.type; + fl4->fl4_icmp_code = rfv->hdr.icmph.code; + return 0; } +static int raw_getfrag(void *from, char *to, int offset, int len, int odd, + struct sk_buff *skb) +{ + struct raw_frag_vec *rfv = from; + + if (offset < rfv->hlen) { + int copy = min(rfv->hlen - offset, len); + + if (skb->ip_summed == CHECKSUM_PARTIAL) + memcpy(to, rfv->hdr.c + offset, copy); + else + skb->csum = csum_block_add( + skb->csum, + csum_partial_copy_nocheck(rfv->hdr.c + offset, + to, copy, 0), + odd); + + odd = 0; + offset += copy; + to += copy; + len -= copy; + + if (!len) + return 0; + } + + offset -= rfv->hlen; + + return ip_generic_getfrag(rfv->msg, to, offset, len, odd, skb); +} + static int raw_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, size_t len) { @@ -480,6 +494,7 @@ static int raw_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, u8 tos; int err; struct ip_options_data opt_copy; + struct raw_frag_vec rfv; err = -EMSGSIZE; if (len > 0xFFFF) @@ -585,7 +600,10 @@ static int raw_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, daddr, saddr, 0, 0); if (!inet->hdrincl) { - err = raw_probe_proto_opt(&fl4, msg); + rfv.msg = msg; + rfv.hlen = 0; + + err = raw_probe_proto_opt(&rfv, &fl4); if (err) goto done; } @@ -607,7 +625,8 @@ static int raw_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, back_from_confirm: if (inet->hdrincl) - err = raw_send_hdrinc(sk, &fl4, msg->msg_iov, len, + /* XXX: stripping const */ + err = raw_send_hdrinc(sk, &fl4, (struct iovec *)msg->msg_iter.iov, len, &rt, msg->msg_flags); else { @@ -616,8 +635,8 @@ back_from_confirm: if (!ipc.addr) ipc.addr = fl4.daddr; lock_sock(sk); - err = ip_append_data(sk, &fl4, ip_generic_getfrag, - msg->msg_iov, len, 0, + err = ip_append_data(sk, &fl4, raw_getfrag, + &rfv, len, 0, &ipc, &rt, msg->msg_flags); if (err) ip_flush_pending_frames(sk); @@ -718,7 +737,7 @@ static int raw_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, copied = len; } - err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); + err = skb_copy_datagram_msg(skb, 0, msg, copied); if (err) goto done; diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index 32b98d0207b4..45fe60c5238e 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c @@ -19,10 +19,6 @@ #include <net/tcp.h> #include <net/route.h> -/* Timestamps: lowest bits store TCP options */ -#define TSBITS 6 -#define TSMASK (((__u32)1 << TSBITS) - 1) - extern int sysctl_tcp_syncookies; static u32 syncookie_secret[2][16-4+SHA_DIGEST_WORDS] __read_mostly; @@ -30,6 +26,30 @@ static u32 syncookie_secret[2][16-4+SHA_DIGEST_WORDS] __read_mostly; #define COOKIEBITS 24 /* Upper bits store count */ #define COOKIEMASK (((__u32)1 << COOKIEBITS) - 1) +/* TCP Timestamp: 6 lowest bits of timestamp sent in the cookie SYN-ACK + * stores TCP options: + * + * MSB LSB + * | 31 ... 6 | 5 | 4 | 3 2 1 0 | + * | Timestamp | ECN | SACK | WScale | + * + * When we receive a valid cookie-ACK, we look at the echoed tsval (if + * any) to figure out which TCP options we should use for the rebuilt + * connection. + * + * A WScale setting of '0xf' (which is an invalid scaling value) + * means that original syn did not include the TCP window scaling option. 
+ */ +#define TS_OPT_WSCALE_MASK 0xf +#define TS_OPT_SACK BIT(4) +#define TS_OPT_ECN BIT(5) +/* There is no TS_OPT_TIMESTAMP: + * if ACK contains timestamp option, we already know it was + * requested/supported by the syn/synack exchange. + */ +#define TSBITS 6 +#define TSMASK (((__u32)1 << TSBITS) - 1) + static DEFINE_PER_CPU(__u32 [16 + 5 + SHA_WORKSPACE_WORDS], ipv4_cookie_scratch); @@ -67,9 +87,11 @@ __u32 cookie_init_timestamp(struct request_sock *req) ireq = inet_rsk(req); - options = ireq->wscale_ok ? ireq->snd_wscale : 0xf; - options |= ireq->sack_ok << 4; - options |= ireq->ecn_ok << 5; + options = ireq->wscale_ok ? ireq->snd_wscale : TS_OPT_WSCALE_MASK; + if (ireq->sack_ok) + options |= TS_OPT_SACK; + if (ireq->ecn_ok) + options |= TS_OPT_ECN; ts = ts_now & ~TSMASK; ts |= options; @@ -219,16 +241,13 @@ static inline struct sock *get_cookie_sock(struct sock *sk, struct sk_buff *skb, * additional tcp options in the timestamp. * This extracts these options from the timestamp echo. * - * The lowest 4 bits store snd_wscale. - * next 2 bits indicate SACK and ECN support. - * - * return false if we decode an option that should not be. + * return false if we decode a tcp option that is disabled + * on the host. */ -bool cookie_check_timestamp(struct tcp_options_received *tcp_opt, - struct net *net, bool *ecn_ok) +bool cookie_timestamp_decode(struct tcp_options_received *tcp_opt) { /* echoed timestamp, lowest bits contain options */ - u32 options = tcp_opt->rcv_tsecr & TSMASK; + u32 options = tcp_opt->rcv_tsecr; if (!tcp_opt->saw_tstamp) { tcp_clear_options(tcp_opt); @@ -238,22 +257,35 @@ bool cookie_check_timestamp(struct tcp_options_received *tcp_opt, if (!sysctl_tcp_timestamps) return false; - tcp_opt->sack_ok = (options & (1 << 4)) ? TCP_SACK_SEEN : 0; - *ecn_ok = (options >> 5) & 1; - if (*ecn_ok && !net->ipv4.sysctl_tcp_ecn) - return false; + tcp_opt->sack_ok = (options & TS_OPT_SACK) ? 
TCP_SACK_SEEN : 0; if (tcp_opt->sack_ok && !sysctl_tcp_sack) return false; - if ((options & 0xf) == 0xf) + if ((options & TS_OPT_WSCALE_MASK) == TS_OPT_WSCALE_MASK) return true; /* no window scaling */ tcp_opt->wscale_ok = 1; - tcp_opt->snd_wscale = options & 0xf; + tcp_opt->snd_wscale = options & TS_OPT_WSCALE_MASK; + return sysctl_tcp_window_scaling != 0; } -EXPORT_SYMBOL(cookie_check_timestamp); +EXPORT_SYMBOL(cookie_timestamp_decode); + +bool cookie_ecn_ok(const struct tcp_options_received *tcp_opt, + const struct net *net, const struct dst_entry *dst) +{ + bool ecn_ok = tcp_opt->rcv_tsecr & TS_OPT_ECN; + + if (!ecn_ok) + return false; + + if (net->ipv4.sysctl_tcp_ecn) + return true; + + return dst_feature(dst, RTAX_FEATURE_ECN); +} +EXPORT_SYMBOL(cookie_ecn_ok); struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb) { @@ -269,14 +301,16 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb) int mss; struct rtable *rt; __u8 rcv_wscale; - bool ecn_ok = false; struct flowi4 fl4; if (!sysctl_tcp_syncookies || !th->ack || th->rst) goto out; - if (tcp_synq_no_recent_overflow(sk) || - (mss = __cookie_v4_check(ip_hdr(skb), th, cookie)) == 0) { + if (tcp_synq_no_recent_overflow(sk)) + goto out; + + mss = __cookie_v4_check(ip_hdr(skb), th, cookie); + if (mss == 0) { NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SYNCOOKIESFAILED); goto out; } @@ -287,7 +321,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb) memset(&tcp_opt, 0, sizeof(tcp_opt)); tcp_parse_options(skb, &tcp_opt, 0, NULL); - if (!cookie_check_timestamp(&tcp_opt, sock_net(sk), &ecn_ok)) + if (!cookie_timestamp_decode(&tcp_opt)) goto out; ret = NULL; @@ -305,7 +339,6 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb) ireq->ir_loc_addr = ip_hdr(skb)->daddr; ireq->ir_rmt_addr = ip_hdr(skb)->saddr; ireq->ir_mark = inet_request_mark(sk, skb); - ireq->ecn_ok = ecn_ok; ireq->snd_wscale = tcp_opt.snd_wscale; ireq->sack_ok = tcp_opt.sack_ok; ireq->wscale_ok = tcp_opt.wscale_ok; @@ -354,6 +387,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb) dst_metric(&rt->dst, RTAX_INITRWND)); ireq->rcv_wscale = rcv_wscale; + ireq->ecn_ok = cookie_ecn_ok(&tcp_opt, sock_net(sk), &rt->dst); ret = get_cookie_sock(sk, skb, req, &rt->dst); /* ip_queue_xmit() depends on our flow being setup diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index b3c53c8b331e..e0ee384a448f 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -496,6 +496,13 @@ static struct ctl_table ipv4_table[] = { .proc_handler = proc_dointvec }, { + .procname = "tcp_max_reordering", + .data = &sysctl_tcp_max_reordering, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, + { .procname = "tcp_dsack", .data = &sysctl_tcp_dsack, .maxlen = sizeof(int), diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 38c2bcb8dd5d..3075723c729b 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -835,47 +835,29 @@ static unsigned int tcp_xmit_size_goal(struct sock *sk, u32 mss_now, int large_allowed) { struct tcp_sock *tp = tcp_sk(sk); - u32 xmit_size_goal, old_size_goal; - - xmit_size_goal = mss_now; - - if (large_allowed && sk_can_gso(sk)) { - u32 gso_size, hlen; - - /* Maybe we should/could use sk->sk_prot->max_header here ? */ - hlen = inet_csk(sk)->icsk_af_ops->net_header_len + - inet_csk(sk)->icsk_ext_hdr_len + - tp->tcp_header_len; - - /* Goal is to send at least one packet per ms, - * not one big TSO packet every 100 ms. 
- * This preserves ACK clocking and is consistent - * with tcp_tso_should_defer() heuristic. - */ - gso_size = sk->sk_pacing_rate / (2 * MSEC_PER_SEC); - gso_size = max_t(u32, gso_size, - sysctl_tcp_min_tso_segs * mss_now); - - xmit_size_goal = min_t(u32, gso_size, - sk->sk_gso_max_size - 1 - hlen); - - xmit_size_goal = tcp_bound_to_half_wnd(tp, xmit_size_goal); - - /* We try hard to avoid divides here */ - old_size_goal = tp->xmit_size_goal_segs * mss_now; - - if (likely(old_size_goal <= xmit_size_goal && - old_size_goal + mss_now > xmit_size_goal)) { - xmit_size_goal = old_size_goal; - } else { - tp->xmit_size_goal_segs = - min_t(u16, xmit_size_goal / mss_now, - sk->sk_gso_max_segs); - xmit_size_goal = tp->xmit_size_goal_segs * mss_now; - } + u32 new_size_goal, size_goal, hlen; + + if (!large_allowed || !sk_can_gso(sk)) + return mss_now; + + /* Maybe we should/could use sk->sk_prot->max_header here ? */ + hlen = inet_csk(sk)->icsk_af_ops->net_header_len + + inet_csk(sk)->icsk_ext_hdr_len + + tp->tcp_header_len; + + new_size_goal = sk->sk_gso_max_size - 1 - hlen; + new_size_goal = tcp_bound_to_half_wnd(tp, new_size_goal); + + /* We try hard to avoid divides here */ + size_goal = tp->gso_segs * mss_now; + if (unlikely(new_size_goal < size_goal || + new_size_goal >= size_goal + mss_now)) { + tp->gso_segs = min_t(u16, new_size_goal / mss_now, + sk->sk_gso_max_segs); + size_goal = tp->gso_segs * mss_now; } - return max(xmit_size_goal, mss_now); + return max(size_goal, mss_now); } static int tcp_send_mss(struct sock *sk, int *size_goal, int flags) @@ -1085,7 +1067,7 @@ static int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg, int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, size_t size) { - struct iovec *iov; + const struct iovec *iov; struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; int iovlen, flags, err, copied = 0; @@ -1136,8 +1118,8 @@ int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, mss_now = tcp_send_mss(sk, &size_goal, flags); /* Ok commence sending. */ - iovlen = msg->msg_iovlen; - iov = msg->msg_iov; + iovlen = msg->msg_iter.nr_segs; + iov = msg->msg_iter.iov; copied = 0; err = -EPIPE; @@ -1349,7 +1331,7 @@ static int tcp_recv_urg(struct sock *sk, struct msghdr *msg, int len, int flags) if (len > 0) { if (!(flags & MSG_TRUNC)) - err = memcpy_toiovec(msg->msg_iov, &c, 1); + err = memcpy_to_msg(msg, &c, 1); len = 1; } else msg->msg_flags |= MSG_TRUNC; @@ -1377,7 +1359,7 @@ static int tcp_peek_sndq(struct sock *sk, struct msghdr *msg, int len) /* XXX -- need to support SO_PEEK_OFF */ skb_queue_walk(&sk->sk_write_queue, skb) { - err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, skb->len); + err = skb_copy_datagram_msg(skb, 0, msg, skb->len); if (err) break; @@ -1729,7 +1711,7 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, if (!user_recv && !(flags & (MSG_TRUNC | MSG_PEEK))) { user_recv = current; tp->ucopy.task = user_recv; - tp->ucopy.iov = msg->msg_iov; + tp->ucopy.msg = msg; } tp->ucopy.len = len; @@ -1833,8 +1815,7 @@ do_prequeue: } if (!(flags & MSG_TRUNC)) { - err = skb_copy_datagram_iovec(skb, offset, - msg->msg_iov, used); + err = skb_copy_datagram_msg(skb, offset, msg, used); if (err) { /* Exception. Bailout! 
*/ if (!copied) diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c index b1c5970d47a1..27ead0dd16bc 100644 --- a/net/ipv4/tcp_cong.c +++ b/net/ipv4/tcp_cong.c @@ -1,5 +1,5 @@ /* - * Plugable TCP congestion control support and newReno + * Pluggable TCP congestion control support and newReno * congestion control. * Based on ideas from I/O scheduler support and Web100. * diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c index 20de0118c98e..6b6002416a73 100644 --- a/net/ipv4/tcp_cubic.c +++ b/net/ipv4/tcp_cubic.c @@ -363,16 +363,28 @@ static void hystart_update(struct sock *sk, u32 delay) struct tcp_sock *tp = tcp_sk(sk); struct bictcp *ca = inet_csk_ca(sk); - if (!(ca->found & hystart_detect)) { + if (ca->found & hystart_detect) + return; + + if (hystart_detect & HYSTART_ACK_TRAIN) { u32 now = bictcp_clock(); /* first detection parameter - ack-train detection */ if ((s32)(now - ca->last_ack) <= hystart_ack_delta) { ca->last_ack = now; - if ((s32)(now - ca->round_start) > ca->delay_min >> 4) + if ((s32)(now - ca->round_start) > ca->delay_min >> 4) { ca->found |= HYSTART_ACK_TRAIN; + NET_INC_STATS_BH(sock_net(sk), + LINUX_MIB_TCPHYSTARTTRAINDETECT); + NET_ADD_STATS_BH(sock_net(sk), + LINUX_MIB_TCPHYSTARTTRAINCWND, + tp->snd_cwnd); + tp->snd_ssthresh = tp->snd_cwnd; + } } + } + if (hystart_detect & HYSTART_DELAY) { /* obtain the minimum delay of more than sampling packets */ if (ca->sample_cnt < HYSTART_MIN_SAMPLES) { if (ca->curr_rtt == 0 || ca->curr_rtt > delay) @@ -381,15 +393,16 @@ static void hystart_update(struct sock *sk, u32 delay) ca->sample_cnt++; } else { if (ca->curr_rtt > ca->delay_min + - HYSTART_DELAY_THRESH(ca->delay_min>>4)) + HYSTART_DELAY_THRESH(ca->delay_min >> 3)) { ca->found |= HYSTART_DELAY; + NET_INC_STATS_BH(sock_net(sk), + LINUX_MIB_TCPHYSTARTDELAYDETECT); + NET_ADD_STATS_BH(sock_net(sk), + LINUX_MIB_TCPHYSTARTDELAYCWND, + tp->snd_cwnd); + tp->snd_ssthresh = tp->snd_cwnd; + } } - /* - * Either one of two conditions are met, - * we exit from slow start immediately. - */ - if (ca->found & hystart_detect) - tp->snd_ssthresh = tp->snd_cwnd; } } diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index d107ee246a1d..075ab4d5af5e 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -81,6 +81,7 @@ int sysctl_tcp_window_scaling __read_mostly = 1; int sysctl_tcp_sack __read_mostly = 1; int sysctl_tcp_fack __read_mostly = 1; int sysctl_tcp_reordering __read_mostly = TCP_FASTRETRANS_THRESH; +int sysctl_tcp_max_reordering __read_mostly = 300; EXPORT_SYMBOL(sysctl_tcp_reordering); int sysctl_tcp_dsack __read_mostly = 1; int sysctl_tcp_app_win __read_mostly = 31; @@ -833,7 +834,7 @@ static void tcp_update_reordering(struct sock *sk, const int metric, if (metric > tp->reordering) { int mib_idx; - tp->reordering = min(TCP_MAX_REORDERING, metric); + tp->reordering = min(sysctl_tcp_max_reordering, metric); /* This exciting event is worth to be remembered. 
8) */ if (ts) @@ -4367,7 +4368,7 @@ int tcp_send_rcvq(struct sock *sk, struct msghdr *msg, size_t size) if (tcp_try_rmem_schedule(sk, skb, skb->truesize)) goto err_free; - if (memcpy_fromiovec(skb_put(skb, size), msg->msg_iov, size)) + if (memcpy_from_msg(skb_put(skb, size), msg, size)) goto err_free; TCP_SKB_CB(skb)->seq = tcp_sk(sk)->rcv_nxt; @@ -4420,7 +4421,7 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) __set_current_state(TASK_RUNNING); local_bh_enable(); - if (!skb_copy_datagram_iovec(skb, 0, tp->ucopy.iov, chunk)) { + if (!skb_copy_datagram_msg(skb, 0, tp->ucopy.msg, chunk)) { tp->ucopy.len -= chunk; tp->copied_seq += chunk; eaten = (chunk == skb->len); @@ -4940,10 +4941,9 @@ static int tcp_copy_to_iovec(struct sock *sk, struct sk_buff *skb, int hlen) local_bh_enable(); if (skb_csum_unnecessary(skb)) - err = skb_copy_datagram_iovec(skb, hlen, tp->ucopy.iov, chunk); + err = skb_copy_datagram_msg(skb, hlen, tp->ucopy.msg, chunk); else - err = skb_copy_and_csum_datagram_iovec(skb, hlen, - tp->ucopy.iov); + err = skb_copy_and_csum_datagram_msg(skb, hlen, tp->ucopy.msg); if (!err) { tp->ucopy.len -= chunk; @@ -5030,7 +5030,7 @@ static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb, /* step 3: check security and precedence [ignored] */ /* step 4: Check for a SYN - * RFC 5691 4.2 : Send a challenge ack + * RFC 5961 4.2 : Send a challenge ack */ if (th->syn) { syn_challenge: @@ -5853,12 +5853,12 @@ static inline void pr_drop_req(struct request_sock *req, __u16 port, int family) struct inet_request_sock *ireq = inet_rsk(req); if (family == AF_INET) - LIMIT_NETDEBUG(KERN_DEBUG pr_fmt("drop open request from %pI4/%u\n"), - &ireq->ir_rmt_addr, port); + net_dbg_ratelimited("drop open request from %pI4/%u\n", + &ireq->ir_rmt_addr, port); #if IS_ENABLED(CONFIG_IPV6) else if (family == AF_INET6) - LIMIT_NETDEBUG(KERN_DEBUG pr_fmt("drop open request from %pI6/%u\n"), - &ireq->ir_v6_rmt_addr, port); + net_dbg_ratelimited("drop open request from %pI6/%u\n", + &ireq->ir_v6_rmt_addr, port); #endif } @@ -5867,7 +5867,7 @@ static inline void pr_drop_req(struct request_sock *req, __u16 port, int family) * If we receive a SYN packet with these bits set, it means a * network is playing bad games with TOS bits. In order to * avoid possible false congestion notifications, we disable - * TCP ECN negociation. + * TCP ECN negotiation. * * Exception: tcp_ca wants ECN. 
This is required for DCTCP * congestion control; it requires setting ECT on all packets, @@ -5877,20 +5877,22 @@ static inline void pr_drop_req(struct request_sock *req, __u16 port, int family) */ static void tcp_ecn_create_request(struct request_sock *req, const struct sk_buff *skb, - const struct sock *listen_sk) + const struct sock *listen_sk, + const struct dst_entry *dst) { const struct tcphdr *th = tcp_hdr(skb); const struct net *net = sock_net(listen_sk); bool th_ecn = th->ece && th->cwr; - bool ect, need_ecn; + bool ect, need_ecn, ecn_ok; if (!th_ecn) return; ect = !INET_ECN_is_not_ect(TCP_SKB_CB(skb)->ip_dsfield); need_ecn = tcp_ca_needs_ecn(listen_sk); + ecn_ok = net->ipv4.sysctl_tcp_ecn || dst_feature(dst, RTAX_FEATURE_ECN); - if (!ect && !need_ecn && net->ipv4.sysctl_tcp_ecn) + if (!ect && !need_ecn && ecn_ok) inet_rsk(req)->ecn_ok = 1; else if (ect && need_ecn) inet_rsk(req)->ecn_ok = 1; @@ -5955,13 +5957,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, if (security_inet_conn_request(sk, skb, req)) goto drop_and_free; - if (!want_cookie || tmp_opt.tstamp_ok) - tcp_ecn_create_request(req, skb, sk); - - if (want_cookie) { - isn = cookie_init_sequence(af_ops, sk, skb, &req->mss); - req->cookie_ts = tmp_opt.tstamp_ok; - } else if (!isn) { + if (!want_cookie && !isn) { /* VJ's idea. We save last timestamp seen * from the destination in peer table, when entering * state TIME-WAIT, and check against it before @@ -6009,6 +6005,15 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, goto drop_and_free; } + tcp_ecn_create_request(req, skb, sk, dst); + + if (want_cookie) { + isn = cookie_init_sequence(af_ops, sk, skb, &req->mss); + req->cookie_ts = tmp_opt.tstamp_ok; + if (!tmp_opt.tstamp_ok) + inet_rsk(req)->ecn_ok = 0; + } + tcp_rsk(req)->snt_isn = isn; tcp_openreq_init_rwin(req, sk, dst); fastopen = !want_cookie && diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 147be2024290..a3f72d7fc06c 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -623,6 +623,7 @@ static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb) arg.iov[0].iov_base = (unsigned char *)&rep; arg.iov[0].iov_len = sizeof(rep.th); + net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev); #ifdef CONFIG_TCP_MD5SIG hash_location = tcp_parse_md5sig_option(th); if (!sk && hash_location) { @@ -633,7 +634,7 @@ static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb) * Incoming packet is checked with md5 hash with finding key, * no RST generated if md5 hash doesn't match. 
*/ - sk1 = __inet_lookup_listener(dev_net(skb_dst(skb)->dev), + sk1 = __inet_lookup_listener(net, &tcp_hashinfo, ip_hdr(skb)->saddr, th->source, ip_hdr(skb)->daddr, ntohs(th->source), inet_iif(skb)); @@ -681,7 +682,6 @@ static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb) if (sk) arg.bound_dev_if = sk->sk_bound_dev_if; - net = dev_net(skb_dst(skb)->dev); arg.tos = ip_hdr(skb)->tos; ip_send_unicast_reply(net, skb, &TCP_SKB_CB(skb)->header.h4.opt, ip_hdr(skb)->saddr, ip_hdr(skb)->daddr, @@ -1432,6 +1432,7 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb) struct dst_entry *dst = sk->sk_rx_dst; sock_rps_save_rxhash(sk, skb); + sk_mark_napi_id(sk, skb); if (dst) { if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif || dst->ops->check(dst, 0) == NULL) { @@ -1453,6 +1454,7 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb) if (nsk != sk) { sock_rps_save_rxhash(nsk, skb); + sk_mark_napi_id(sk, skb); if (tcp_child_process(sk, nsk, skb)) { rsk = nsk; goto reset; @@ -1664,7 +1666,7 @@ process: if (sk_filter(sk, skb)) goto discard_and_relse; - sk_mark_napi_id(sk, skb); + sk_incoming_cpu_update(sk); skb->dev = NULL; bh_lock_sock_nested(sk); diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c index 5b90f2f447a5..9d7930ba8e0f 100644 --- a/net/ipv4/tcp_offload.c +++ b/net/ipv4/tcp_offload.c @@ -94,9 +94,9 @@ struct sk_buff *tcp_gso_segment(struct sk_buff *skb, SKB_GSO_GRE_CSUM | SKB_GSO_IPIP | SKB_GSO_SIT | - SKB_GSO_MPLS | SKB_GSO_UDP_TUNNEL | SKB_GSO_UDP_TUNNEL_CSUM | + SKB_GSO_TUNNEL_REMCSUM | 0) || !(type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6)))) goto out; diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index a3d453b94747..7f18262e2326 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -333,10 +333,19 @@ static void tcp_ecn_send_synack(struct sock *sk, struct sk_buff *skb) static void tcp_ecn_send_syn(struct sock *sk, struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); + bool use_ecn = sock_net(sk)->ipv4.sysctl_tcp_ecn == 1 || + tcp_ca_needs_ecn(sk); + + if (!use_ecn) { + const struct dst_entry *dst = __sk_dst_get(sk); + + if (dst && dst_feature(dst, RTAX_FEATURE_ECN)) + use_ecn = true; + } tp->ecn_flags = 0; - if (sock_net(sk)->ipv4.sysctl_tcp_ecn == 1 || - tcp_ca_needs_ecn(sk)) { + + if (use_ecn) { TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_ECE | TCPHDR_CWR; tp->ecn_flags = TCP_ECN_OK; if (tcp_ca_needs_ecn(sk)) @@ -1515,6 +1524,27 @@ static bool tcp_nagle_check(bool partial, const struct tcp_sock *tp, ((nonagle & TCP_NAGLE_CORK) || (!nonagle && tp->packets_out && tcp_minshall_check(tp))); } + +/* Return how many segs we'd like on a TSO packet, + * to send one TSO packet per ms + */ +static u32 tcp_tso_autosize(const struct sock *sk, unsigned int mss_now) +{ + u32 bytes, segs; + + bytes = min(sk->sk_pacing_rate >> 10, + sk->sk_gso_max_size - 1 - MAX_TCP_HEADER); + + /* Goal is to send at least one packet per ms, + * not one big TSO packet every 100 ms. + * This preserves ACK clocking and is consistent + * with tcp_tso_should_defer() heuristic. 
+ */ + segs = max_t(u32, bytes / mss_now, sysctl_tcp_min_tso_segs); + + return min_t(u32, segs, sk->sk_gso_max_segs); +} + /* Returns the portion of skb which can be sent right away */ static unsigned int tcp_mss_split_point(const struct sock *sk, const struct sk_buff *skb, @@ -1553,7 +1583,7 @@ static unsigned int tcp_mss_split_point(const struct sock *sk, static inline unsigned int tcp_cwnd_test(const struct tcp_sock *tp, const struct sk_buff *skb) { - u32 in_flight, cwnd; + u32 in_flight, cwnd, halfcwnd; /* Don't be strict about the congestion window for the final FIN. */ if ((TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) && @@ -1562,10 +1592,14 @@ static inline unsigned int tcp_cwnd_test(const struct tcp_sock *tp, in_flight = tcp_packets_in_flight(tp); cwnd = tp->snd_cwnd; - if (in_flight < cwnd) - return (cwnd - in_flight); + if (in_flight >= cwnd) + return 0; - return 0; + /* For better scheduling, ensure we have at least + * 2 GSO packets in flight. + */ + halfcwnd = max(cwnd >> 1, 1U); + return min(halfcwnd, cwnd - in_flight); } /* Initialize TSO state of a skb. @@ -1718,7 +1752,7 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len, * This algorithm is from John Heffner. */ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb, - bool *is_cwnd_limited) + bool *is_cwnd_limited, u32 max_segs) { struct tcp_sock *tp = tcp_sk(sk); const struct inet_connection_sock *icsk = inet_csk(sk); @@ -1748,8 +1782,7 @@ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb, limit = min(send_win, cong_win); /* If a full-sized TSO skb can be sent, do it. */ - if (limit >= min_t(unsigned int, sk->sk_gso_max_size, - tp->xmit_size_goal_segs * tp->mss_cache)) + if (limit >= max_segs * tp->mss_cache) goto send_now; /* Middle in queue won't get any more data, full sendable already? */ @@ -1946,6 +1979,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, int cwnd_quota; int result; bool is_cwnd_limited = false; + u32 max_segs; sent_pkts = 0; @@ -1959,6 +1993,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, } } + max_segs = tcp_tso_autosize(sk, mss_now); while ((skb = tcp_send_head(sk))) { unsigned int limit; @@ -1991,10 +2026,23 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, break; } else { if (!push_one && - tcp_tso_should_defer(sk, skb, &is_cwnd_limited)) + tcp_tso_should_defer(sk, skb, &is_cwnd_limited, + max_segs)) break; } + limit = mss_now; + if (tso_segs > 1 && !tcp_urg_mode(tp)) + limit = tcp_mss_split_point(sk, skb, mss_now, + min_t(unsigned int, + cwnd_quota, + max_segs), + nonagle); + + if (skb->len > limit && + unlikely(tso_fragment(sk, skb, limit, mss_now, gfp))) + break; + /* TCP Small Queues : * Control number of packets in qdisc/devices to two packets / or ~1 ms. * This allows for : @@ -2005,8 +2053,8 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, * of queued bytes to ensure line rate. 
* One example is wifi aggregation (802.11 AMPDU) */ - limit = max_t(unsigned int, sysctl_tcp_limit_output_bytes, - sk->sk_pacing_rate >> 10); + limit = max(2 * skb->truesize, sk->sk_pacing_rate >> 10); + limit = min_t(u32, limit, sysctl_tcp_limit_output_bytes); if (atomic_read(&sk->sk_wmem_alloc) > limit) { set_bit(TSQ_THROTTLED, &tp->tsq_flags); @@ -2019,18 +2067,6 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, break; } - limit = mss_now; - if (tso_segs > 1 && !tcp_urg_mode(tp)) - limit = tcp_mss_split_point(sk, skb, mss_now, - min_t(unsigned int, - cwnd_quota, - sk->sk_gso_max_segs), - nonagle); - - if (skb->len > limit && - unlikely(tso_fragment(sk, skb, limit, mss_now, gfp))) - break; - if (unlikely(tcp_transmit_skb(sk, skb, 1, gfp))) break; @@ -2998,9 +3034,9 @@ static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn) { struct tcp_sock *tp = tcp_sk(sk); struct tcp_fastopen_request *fo = tp->fastopen_req; - int syn_loss = 0, space, i, err = 0, iovlen = fo->data->msg_iovlen; - struct sk_buff *syn_data = NULL, *data; + int syn_loss = 0, space, err = 0; unsigned long last_syn_loss = 0; + struct sk_buff *syn_data; tp->rx_opt.mss_clamp = tp->advmss; /* If MSS is not cached */ tcp_fastopen_cache_get(sk, &tp->rx_opt.mss_clamp, &fo->cookie, @@ -3031,48 +3067,40 @@ static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn) /* limit to order-0 allocations */ space = min_t(size_t, space, SKB_MAX_HEAD(MAX_TCP_HEADER)); - syn_data = skb_copy_expand(syn, MAX_TCP_HEADER, space, - sk->sk_allocation); - if (syn_data == NULL) + syn_data = sk_stream_alloc_skb(sk, space, sk->sk_allocation); + if (!syn_data) + goto fallback; + syn_data->ip_summed = CHECKSUM_PARTIAL; + memcpy(syn_data->cb, syn->cb, sizeof(syn->cb)); + if (unlikely(memcpy_fromiovecend(skb_put(syn_data, space), + fo->data->msg_iter.iov, 0, space))) { + kfree_skb(syn_data); goto fallback; + } - for (i = 0; i < iovlen && syn_data->len < space; ++i) { - struct iovec *iov = &fo->data->msg_iov[i]; - unsigned char __user *from = iov->iov_base; - int len = iov->iov_len; + /* No more data pending in inet_wait_for_connect() */ + if (space == fo->size) + fo->data = NULL; + fo->copied = space; - if (syn_data->len + len > space) - len = space - syn_data->len; - else if (i + 1 == iovlen) - /* No more data pending in inet_wait_for_connect() */ - fo->data = NULL; + tcp_connect_queue_skb(sk, syn_data); - if (skb_add_data(syn_data, from, len)) - goto fallback; - } + err = tcp_transmit_skb(sk, syn_data, 1, sk->sk_allocation); - /* Queue a data-only packet after the regular SYN for retransmission */ - data = pskb_copy(syn_data, sk->sk_allocation); - if (data == NULL) - goto fallback; - TCP_SKB_CB(data)->seq++; - TCP_SKB_CB(data)->tcp_flags &= ~TCPHDR_SYN; - TCP_SKB_CB(data)->tcp_flags = (TCPHDR_ACK|TCPHDR_PSH); - tcp_connect_queue_skb(sk, data); - fo->copied = data->len; - - /* syn_data is about to be sent, we need to take current time stamps - * for the packets that are in write queue : SYN packet and DATA - */ - skb_mstamp_get(&syn->skb_mstamp); - data->skb_mstamp = syn->skb_mstamp; + syn->skb_mstamp = syn_data->skb_mstamp; - if (tcp_transmit_skb(sk, syn_data, 0, sk->sk_allocation) == 0) { + /* Now full SYN+DATA was cloned and sent (or not), + * remove the SYN from the original skb (syn_data) + * we keep in write queue in case of a retransmit, as we + * also have the SYN packet (with no data) in the same queue. 
+ */ + TCP_SKB_CB(syn_data)->seq++; + TCP_SKB_CB(syn_data)->tcp_flags = TCPHDR_ACK | TCPHDR_PSH; + if (!err) { tp->syn_data = (fo->copied > 0); NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPORIGDATASENT); goto done; } - syn_data = NULL; fallback: /* Send a regular SYN with Fast Open cookie request option */ @@ -3081,7 +3109,6 @@ fallback: err = tcp_transmit_skb(sk, syn, 1, sk->sk_allocation); if (err) tp->syn_fastopen = 0; - kfree_skb(syn_data); done: fo->cookie.len = -1; /* Exclude Fast Open option for SYN retries */ return err; @@ -3101,13 +3128,10 @@ int tcp_connect(struct sock *sk) return 0; } - buff = alloc_skb_fclone(MAX_TCP_HEADER + 15, sk->sk_allocation); - if (unlikely(buff == NULL)) + buff = sk_stream_alloc_skb(sk, 0, sk->sk_allocation); + if (unlikely(!buff)) return -ENOBUFS; - /* Reserve space for headers. */ - skb_reserve(buff, MAX_TCP_HEADER); - tcp_init_nondata_skb(buff, tp->write_seq++, TCPHDR_SYN); tp->retrans_stamp = tcp_time_stamp; tcp_connect_queue_skb(sk, buff); diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 9b21ae8b2e31..1829c7fbc77e 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -374,17 +374,19 @@ void tcp_retransmit_timer(struct sock *sk) */ struct inet_sock *inet = inet_sk(sk); if (sk->sk_family == AF_INET) { - LIMIT_NETDEBUG(KERN_DEBUG pr_fmt("Peer %pI4:%u/%u unexpectedly shrunk window %u:%u (repaired)\n"), - &inet->inet_daddr, - ntohs(inet->inet_dport), inet->inet_num, - tp->snd_una, tp->snd_nxt); + net_dbg_ratelimited("Peer %pI4:%u/%u unexpectedly shrunk window %u:%u (repaired)\n", + &inet->inet_daddr, + ntohs(inet->inet_dport), + inet->inet_num, + tp->snd_una, tp->snd_nxt); } #if IS_ENABLED(CONFIG_IPV6) else if (sk->sk_family == AF_INET6) { - LIMIT_NETDEBUG(KERN_DEBUG pr_fmt("Peer %pI6:%u/%u unexpectedly shrunk window %u:%u (repaired)\n"), - &sk->sk_v6_daddr, - ntohs(inet->inet_dport), inet->inet_num, - tp->snd_una, tp->snd_nxt); + net_dbg_ratelimited("Peer %pI6:%u/%u unexpectedly shrunk window %u:%u (repaired)\n", + &sk->sk_v6_daddr, + ntohs(inet->inet_dport), + inet->inet_num, + tp->snd_una, tp->snd_nxt); } #endif if (tcp_time_stamp - tp->rcv_tstamp > TCP_RTO_MAX) { diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index cd0db5471bb5..13b4dcf86ef6 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -144,7 +144,7 @@ static int udp_lib_lport_inuse(struct net *net, __u16 num, struct hlist_nulls_node *node; kuid_t uid = sock_i_uid(sk); - sk_nulls_for_each(sk2, node, &hslot->head) + sk_nulls_for_each(sk2, node, &hslot->head) { if (net_eq(sock_net(sk2), net) && sk2 != sk && (bitmap || udp_sk(sk2)->udp_port_hash == num) && @@ -152,14 +152,13 @@ static int udp_lib_lport_inuse(struct net *net, __u16 num, (!sk2->sk_bound_dev_if || !sk->sk_bound_dev_if || sk2->sk_bound_dev_if == sk->sk_bound_dev_if) && (!sk2->sk_reuseport || !sk->sk_reuseport || - !uid_eq(uid, sock_i_uid(sk2))) && - (*saddr_comp)(sk, sk2)) { - if (bitmap) - __set_bit(udp_sk(sk2)->udp_port_hash >> log, - bitmap); - else + !uid_eq(uid, sock_i_uid(sk2))) && + saddr_comp(sk, sk2)) { + if (!bitmap) return 1; + __set_bit(udp_sk(sk2)->udp_port_hash >> log, bitmap); } + } return 0; } @@ -168,10 +167,10 @@ static int udp_lib_lport_inuse(struct net *net, __u16 num, * can insert/delete a socket with local_port == num */ static int udp_lib_lport_inuse2(struct net *net, __u16 num, - struct udp_hslot *hslot2, - struct sock *sk, - int (*saddr_comp)(const struct sock *sk1, - const struct sock *sk2)) + struct udp_hslot *hslot2, + struct sock *sk, + int (*saddr_comp)(const struct sock *sk1, + const 
struct sock *sk2)) { struct sock *sk2; struct hlist_nulls_node *node; @@ -179,7 +178,7 @@ static int udp_lib_lport_inuse2(struct net *net, __u16 num, int res = 0; spin_lock(&hslot2->lock); - udp_portaddr_for_each_entry(sk2, node, &hslot2->head) + udp_portaddr_for_each_entry(sk2, node, &hslot2->head) { if (net_eq(sock_net(sk2), net) && sk2 != sk && (udp_sk(sk2)->udp_port_hash == num) && @@ -187,11 +186,12 @@ static int udp_lib_lport_inuse2(struct net *net, __u16 num, (!sk2->sk_bound_dev_if || !sk->sk_bound_dev_if || sk2->sk_bound_dev_if == sk->sk_bound_dev_if) && (!sk2->sk_reuseport || !sk->sk_reuseport || - !uid_eq(uid, sock_i_uid(sk2))) && - (*saddr_comp)(sk, sk2)) { + !uid_eq(uid, sock_i_uid(sk2))) && + saddr_comp(sk, sk2)) { res = 1; break; } + } spin_unlock(&hslot2->lock); return res; } @@ -206,8 +206,8 @@ static int udp_lib_lport_inuse2(struct net *net, __u16 num, * with NULL address */ int udp_lib_get_port(struct sock *sk, unsigned short snum, - int (*saddr_comp)(const struct sock *sk1, - const struct sock *sk2), + int (*saddr_comp)(const struct sock *sk1, + const struct sock *sk2), unsigned int hash2_nulladdr) { struct udp_hslot *hslot, *hslot2; @@ -336,38 +336,45 @@ int udp_v4_get_port(struct sock *sk, unsigned short snum) return udp_lib_get_port(sk, snum, ipv4_rcv_saddr_equal, hash2_nulladdr); } -static inline int compute_score(struct sock *sk, struct net *net, __be32 saddr, - unsigned short hnum, - __be16 sport, __be32 daddr, __be16 dport, int dif) +static inline int compute_score(struct sock *sk, struct net *net, + __be32 saddr, unsigned short hnum, __be16 sport, + __be32 daddr, __be16 dport, int dif) { - int score = -1; + int score; + struct inet_sock *inet; - if (net_eq(sock_net(sk), net) && udp_sk(sk)->udp_port_hash == hnum && - !ipv6_only_sock(sk)) { - struct inet_sock *inet = inet_sk(sk); + if (!net_eq(sock_net(sk), net) || + udp_sk(sk)->udp_port_hash != hnum || + ipv6_only_sock(sk)) + return -1; - score = (sk->sk_family == PF_INET ? 2 : 1); - if (inet->inet_rcv_saddr) { - if (inet->inet_rcv_saddr != daddr) - return -1; - score += 4; - } - if (inet->inet_daddr) { - if (inet->inet_daddr != saddr) - return -1; - score += 4; - } - if (inet->inet_dport) { - if (inet->inet_dport != sport) - return -1; - score += 4; - } - if (sk->sk_bound_dev_if) { - if (sk->sk_bound_dev_if != dif) - return -1; - score += 4; - } + score = (sk->sk_family == PF_INET) ? 2 : 1; + inet = inet_sk(sk); + + if (inet->inet_rcv_saddr) { + if (inet->inet_rcv_saddr != daddr) + return -1; + score += 4; + } + + if (inet->inet_daddr) { + if (inet->inet_daddr != saddr) + return -1; + score += 4; + } + + if (inet->inet_dport) { + if (inet->inet_dport != sport) + return -1; + score += 4; + } + + if (sk->sk_bound_dev_if) { + if (sk->sk_bound_dev_if != dif) + return -1; + score += 4; } + return score; } @@ -378,33 +385,39 @@ static inline int compute_score2(struct sock *sk, struct net *net, __be32 saddr, __be16 sport, __be32 daddr, unsigned int hnum, int dif) { - int score = -1; + int score; + struct inet_sock *inet; + + if (!net_eq(sock_net(sk), net) || + ipv6_only_sock(sk)) + return -1; - if (net_eq(sock_net(sk), net) && !ipv6_only_sock(sk)) { - struct inet_sock *inet = inet_sk(sk); + inet = inet_sk(sk); - if (inet->inet_rcv_saddr != daddr) + if (inet->inet_rcv_saddr != daddr || + inet->inet_num != hnum) + return -1; + + score = (sk->sk_family == PF_INET) ? 
2 : 1; + + if (inet->inet_daddr) { + if (inet->inet_daddr != saddr) return -1; - if (inet->inet_num != hnum) + score += 4; + } + + if (inet->inet_dport) { + if (inet->inet_dport != sport) return -1; + score += 4; + } - score = (sk->sk_family == PF_INET ? 2 : 1); - if (inet->inet_daddr) { - if (inet->inet_daddr != saddr) - return -1; - score += 4; - } - if (inet->inet_dport) { - if (inet->inet_dport != sport) - return -1; - score += 4; - } - if (sk->sk_bound_dev_if) { - if (sk->sk_bound_dev_if != dif) - return -1; - score += 4; - } + if (sk->sk_bound_dev_if) { + if (sk->sk_bound_dev_if != dif) + return -1; + score += 4; } + return score; } @@ -1036,7 +1049,7 @@ back_from_confirm: /* Lockless fast path for the non-corking case. */ if (!corkreq) { - skb = ip_make_skb(sk, fl4, getfrag, msg->msg_iov, ulen, + skb = ip_make_skb(sk, fl4, getfrag, msg, ulen, sizeof(struct udphdr), &ipc, &rt, msg->msg_flags); err = PTR_ERR(skb); @@ -1051,7 +1064,7 @@ back_from_confirm: /* ... which is an evident application bug. --ANK */ release_sock(sk); - LIMIT_NETDEBUG(KERN_DEBUG pr_fmt("cork app bug 2\n")); + net_dbg_ratelimited("cork app bug 2\n"); err = -EINVAL; goto out; } @@ -1067,7 +1080,7 @@ back_from_confirm: do_append_data: up->len += ulen; - err = ip_append_data(sk, fl4, getfrag, msg->msg_iov, ulen, + err = ip_append_data(sk, fl4, getfrag, msg, ulen, sizeof(struct udphdr), &ipc, &rt, corkreq ? msg->msg_flags|MSG_MORE : msg->msg_flags); if (err) @@ -1133,7 +1146,7 @@ int udp_sendpage(struct sock *sk, struct page *page, int offset, if (unlikely(!up->pending)) { release_sock(sk); - LIMIT_NETDEBUG(KERN_DEBUG pr_fmt("udp cork app bug 3\n")); + net_dbg_ratelimited("udp cork app bug 3\n"); return -EINVAL; } @@ -1281,12 +1294,11 @@ try_again: } if (skb_csum_unnecessary(skb)) - err = skb_copy_datagram_iovec(skb, sizeof(struct udphdr), - msg->msg_iov, copied); + err = skb_copy_datagram_msg(skb, sizeof(struct udphdr), + msg, copied); else { - err = skb_copy_and_csum_datagram_iovec(skb, - sizeof(struct udphdr), - msg->msg_iov); + err = skb_copy_and_csum_datagram_msg(skb, sizeof(struct udphdr), + msg); if (err == -EINVAL) goto csum_copy_err; @@ -1445,6 +1457,7 @@ static int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) if (inet_sk(sk)->inet_daddr) { sock_rps_save_rxhash(sk, skb); sk_mark_napi_id(sk, skb); + sk_incoming_cpu_update(sk); } rc = sock_queue_rcv_skb(sk, skb); @@ -1546,8 +1559,8 @@ int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) * provided by the application." */ if (up->pcrlen == 0) { /* full coverage was set */ - LIMIT_NETDEBUG(KERN_WARNING "UDPLite: partial coverage %d while full coverage %d requested\n", - UDP_SKB_CB(skb)->cscov, skb->len); + net_dbg_ratelimited("UDPLite: partial coverage %d while full coverage %d requested\n", + UDP_SKB_CB(skb)->cscov, skb->len); goto drop; } /* The next case involves violating the min. coverage requested @@ -1557,8 +1570,8 @@ int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) * Therefore the above ...()->partial_cov statement is essential. 
*/ if (UDP_SKB_CB(skb)->cscov < up->pcrlen) { - LIMIT_NETDEBUG(KERN_WARNING "UDPLite: coverage %d too small, need min %d\n", - UDP_SKB_CB(skb)->cscov, up->pcrlen); + net_dbg_ratelimited("UDPLite: coverage %d too small, need min %d\n", + UDP_SKB_CB(skb)->cscov, up->pcrlen); goto drop; } } @@ -1647,7 +1660,8 @@ static void udp_sk_rx_dst_set(struct sock *sk, struct dst_entry *dst) static int __udp4_lib_mcast_deliver(struct net *net, struct sk_buff *skb, struct udphdr *uh, __be32 saddr, __be32 daddr, - struct udp_table *udptable) + struct udp_table *udptable, + int proto) { struct sock *sk, *stack[256 / sizeof(struct sock *)]; struct hlist_nulls_node *node; @@ -1656,6 +1670,7 @@ static int __udp4_lib_mcast_deliver(struct net *net, struct sk_buff *skb, int dif = skb->dev->ifindex; unsigned int count = 0, offset = offsetof(typeof(*sk), sk_nulls_node); unsigned int hash2 = 0, hash2_any = 0, use_hash2 = (hslot->count > 10); + bool inner_flushed = false; if (use_hash2) { hash2_any = udp4_portaddr_hash(net, htonl(INADDR_ANY), hnum) & @@ -1674,6 +1689,7 @@ start_lookup: dif, hnum)) { if (unlikely(count == ARRAY_SIZE(stack))) { flush_stack(stack, count, skb, ~0); + inner_flushed = true; count = 0; } stack[count++] = sk; @@ -1695,7 +1711,10 @@ start_lookup: if (count) { flush_stack(stack, count, skb, count - 1); } else { - kfree_skb(skb); + if (!inner_flushed) + UDP_INC_STATS_BH(net, UDP_MIB_IGNOREDMULTI, + proto == IPPROTO_UDPLITE); + consume_skb(skb); } return 0; } @@ -1777,14 +1796,13 @@ int __udp4_lib_rcv(struct sk_buff *skb, struct udp_table *udptable, if (ret > 0) return -ret; return 0; - } else { - if (rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST)) - return __udp4_lib_mcast_deliver(net, skb, uh, - saddr, daddr, udptable); - - sk = __udp4_lib_lookup_skb(skb, uh->source, uh->dest, udptable); } + if (rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST)) + return __udp4_lib_mcast_deliver(net, skb, uh, + saddr, daddr, udptable, proto); + + sk = __udp4_lib_lookup_skb(skb, uh->source, uh->dest, udptable); if (sk != NULL) { int ret; @@ -1822,11 +1840,11 @@ int __udp4_lib_rcv(struct sk_buff *skb, struct udp_table *udptable, return 0; short_packet: - LIMIT_NETDEBUG(KERN_DEBUG "UDP%s: short packet: From %pI4:%u %d/%d to %pI4:%u\n", - proto == IPPROTO_UDPLITE ? "Lite" : "", - &saddr, ntohs(uh->source), - ulen, skb->len, - &daddr, ntohs(uh->dest)); + net_dbg_ratelimited("UDP%s: short packet: From %pI4:%u %d/%d to %pI4:%u\n", + proto == IPPROTO_UDPLITE ? "Lite" : "", + &saddr, ntohs(uh->source), + ulen, skb->len, + &daddr, ntohs(uh->dest)); goto drop; csum_error: @@ -1834,10 +1852,10 @@ csum_error: * RFC1122: OK. Discards the bad packet silently (as far as * the network is concerned, anyway) as per 4.1.3.4 (MUST). */ - LIMIT_NETDEBUG(KERN_DEBUG "UDP%s: bad checksum. From %pI4:%u to %pI4:%u ulen %d\n", - proto == IPPROTO_UDPLITE ? "Lite" : "", - &saddr, ntohs(uh->source), &daddr, ntohs(uh->dest), - ulen); + net_dbg_ratelimited("UDP%s: bad checksum. From %pI4:%u to %pI4:%u ulen %d\n", + proto == IPPROTO_UDPLITE ? 
"Lite" : "", + &saddr, ntohs(uh->source), &daddr, ntohs(uh->dest), + ulen); UDP_INC_STATS_BH(net, UDP_MIB_CSUMERRORS, proto == IPPROTO_UDPLITE); drop: UDP_INC_STATS_BH(net, UDP_MIB_INERRORS, proto == IPPROTO_UDPLITE); @@ -2027,7 +2045,7 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname, } else { up->corkflag = 0; lock_sock(sk); - (*push_pending_frames)(sk); + push_pending_frames(sk); release_sock(sk); } break; diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c index 6480cea7aa53..d3e537ef6b7f 100644 --- a/net/ipv4/udp_offload.c +++ b/net/ipv4/udp_offload.c @@ -29,7 +29,7 @@ static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb, netdev_features_t features, struct sk_buff *(*gso_inner_segment)(struct sk_buff *skb, netdev_features_t features), - __be16 new_protocol) + __be16 new_protocol, bool is_ipv6) { struct sk_buff *segs = ERR_PTR(-EINVAL); u16 mac_offset = skb->mac_header; @@ -39,7 +39,10 @@ static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb, netdev_features_t enc_features; int udp_offset, outer_hlen; unsigned int oldlen; - bool need_csum; + bool need_csum = !!(skb_shinfo(skb)->gso_type & + SKB_GSO_UDP_TUNNEL_CSUM); + bool remcsum = !!(skb_shinfo(skb)->gso_type & SKB_GSO_TUNNEL_REMCSUM); + bool offload_csum = false, dont_encap = (need_csum || remcsum); oldlen = (u16)~skb->len; @@ -52,10 +55,13 @@ static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb, skb_set_network_header(skb, skb_inner_network_offset(skb)); skb->mac_len = skb_inner_network_offset(skb); skb->protocol = new_protocol; + skb->encap_hdr_csum = need_csum; + skb->remcsum_offload = remcsum; - need_csum = !!(skb_shinfo(skb)->gso_type & SKB_GSO_UDP_TUNNEL_CSUM); - if (need_csum) - skb->encap_hdr_csum = 1; + /* Try to offload checksum if possible */ + offload_csum = !!(need_csum && + (skb->dev->features & + (is_ipv6 ? NETIF_F_V6_CSUM : NETIF_F_V4_CSUM))); /* segment inner packet. */ enc_features = skb->dev->hw_enc_features & features; @@ -72,11 +78,21 @@ static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb, do { struct udphdr *uh; int len; - - skb_reset_inner_headers(skb); - skb->encapsulation = 1; + __be32 delta; + + if (dont_encap) { + skb->encapsulation = 0; + skb->ip_summed = CHECKSUM_NONE; + } else { + /* Only set up inner headers if we might be offloading + * inner checksum. + */ + skb_reset_inner_headers(skb); + skb->encapsulation = 1; + } skb->mac_len = mac_len; + skb->protocol = protocol; skb_push(skb, outer_hlen); skb_reset_mac_header(skb); @@ -86,19 +102,36 @@ static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb, uh = udp_hdr(skb); uh->len = htons(len); - if (need_csum) { - __be32 delta = htonl(oldlen + len); + if (!need_csum) + continue; - uh->check = ~csum_fold((__force __wsum) - ((__force u32)uh->check + - (__force u32)delta)); + delta = htonl(oldlen + len); + + uh->check = ~csum_fold((__force __wsum) + ((__force u32)uh->check + + (__force u32)delta)); + if (offload_csum) { + skb->ip_summed = CHECKSUM_PARTIAL; + skb->csum_start = skb_transport_header(skb) - skb->head; + skb->csum_offset = offsetof(struct udphdr, check); + } else if (remcsum) { + /* Need to calculate checksum from scratch, + * inner checksums are never when doing + * remote_checksum_offload. 
+ */ + + skb->csum = skb_checksum(skb, udp_offset, + skb->len - udp_offset, + 0); + uh->check = csum_fold(skb->csum); + if (uh->check == 0) + uh->check = CSUM_MANGLED_0; + } else { uh->check = gso_make_checksum(skb, ~uh->check); if (uh->check == 0) uh->check = CSUM_MANGLED_0; } - - skb->protocol = protocol; } while ((skb = skb->next)); out: return segs; @@ -134,7 +167,7 @@ struct sk_buff *skb_udp_tunnel_segment(struct sk_buff *skb, } segs = __skb_udp_tunnel_segment(skb, features, gso_inner_segment, - protocol); + protocol, is_ipv6); out_unlock: rcu_read_unlock(); @@ -172,9 +205,9 @@ static struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb, if (unlikely(type & ~(SKB_GSO_UDP | SKB_GSO_DODGY | SKB_GSO_UDP_TUNNEL | SKB_GSO_UDP_TUNNEL_CSUM | + SKB_GSO_TUNNEL_REMCSUM | SKB_GSO_IPIP | - SKB_GSO_GRE | SKB_GSO_GRE_CSUM | - SKB_GSO_MPLS) || + SKB_GSO_GRE | SKB_GSO_GRE_CSUM) || !(type & (SKB_GSO_UDP)))) goto out; |
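The syncookies.c hunks above pack the client's advertised TCP options into the low TSBITS bits of the timestamp that gets echoed back, and split recovery into cookie_timestamp_decode() and cookie_ecn_ok(). The following is a minimal user-space sketch of just that bit packing, reusing the TS_OPT_* constants from the hunk; the helper names cookie_ts_encode()/cookie_ts_decode() and the printf harness are illustrative only, and the sysctl and route-metric checks that gate SACK, window scaling and ECN in the kernel are omitted.

/* Standalone sketch (not kernel code) of the syncookie timestamp-option
 * encoding added above: options live in the low TSBITS bits of the
 * timestamp we send and are recovered from tsecr on the returning ACK.
 */
#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>

#define TS_OPT_WSCALE_MASK 0xfu        /* low 4 bits: snd_wscale, 0xf = none */
#define TS_OPT_SACK        (1u << 4)
#define TS_OPT_ECN         (1u << 5)
#define TSBITS             6
#define TSMASK             (((uint32_t)1 << TSBITS) - 1)

/* Encode: overwrite the low TSBITS bits of the current timestamp. */
static uint32_t cookie_ts_encode(uint32_t ts_now, bool wscale_ok,
                                 unsigned int snd_wscale, bool sack_ok,
                                 bool ecn_ok)
{
        uint32_t options = wscale_ok ? (snd_wscale & TS_OPT_WSCALE_MASK)
                                     : TS_OPT_WSCALE_MASK;

        if (sack_ok)
                options |= TS_OPT_SACK;
        if (ecn_ok)
                options |= TS_OPT_ECN;

        return (ts_now & ~TSMASK) | options;
}

/* Decode: the echoed tsecr carries the options in its low bits. */
static void cookie_ts_decode(uint32_t tsecr, bool *wscale_ok,
                             unsigned int *snd_wscale, bool *sack_ok,
                             bool *ecn_ok)
{
        *sack_ok = !!(tsecr & TS_OPT_SACK);
        *ecn_ok = !!(tsecr & TS_OPT_ECN);
        *wscale_ok = (tsecr & TS_OPT_WSCALE_MASK) != TS_OPT_WSCALE_MASK;
        *snd_wscale = *wscale_ok ? (tsecr & TS_OPT_WSCALE_MASK) : 0;
}

int main(void)
{
        uint32_t ts = cookie_ts_encode(0x12345678u, true, 7, true, false);
        bool wscale_ok, sack_ok, ecn_ok;
        unsigned int wscale;

        cookie_ts_decode(ts, &wscale_ok, &wscale, &sack_ok, &ecn_ok);
        printf("ts=0x%08x wscale_ok=%d wscale=%u sack=%d ecn=%d\n",
               ts, wscale_ok, wscale, sack_ok, ecn_ok);
        return 0;
}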
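The tcp_input.c and syncookies.c hunks above also extend ECN negotiation so that, besides the tcp_ecn sysctl, a per-route RTAX_FEATURE_ECN metric can enable it, while a congestion control that needs ECN (such as DCTCP) only accepts it when the SYN itself was ECT-marked. Below is a self-contained sketch of that decision with every input reduced to a boolean; ecn_request_ok() is an illustrative name rather than a kernel function, and in the kernel these inputs come from the TCP header, the IP DSCP/ECN field, the listener socket and the dst entry.

/* Standalone sketch (not kernel code) of the ECN grant decision for a
 * passive open, mirroring tcp_ecn_create_request()/cookie_ecn_ok() above.
 */
#include <stdbool.h>
#include <stdio.h>

static bool ecn_request_ok(bool syn_ece, bool syn_cwr, bool syn_was_ect,
                           bool ca_needs_ecn, bool sysctl_tcp_ecn,
                           bool route_feature_ecn)
{
        bool th_ecn = syn_ece && syn_cwr;
        bool ecn_ok = sysctl_tcp_ecn || route_feature_ecn;

        if (!th_ecn)
                return false;

        if (!syn_was_ect && !ca_needs_ecn && ecn_ok)
                return true;    /* classic RFC 3168 negotiation */
        if (syn_was_ect && ca_needs_ecn)
                return true;    /* CA wants ECN on all packets, e.g. DCTCP */
        return false;
}

int main(void)
{
        /* tcp_ecn sysctl off, but the route carries the ECN feature metric */
        printf("ecn_ok=%d\n",
               ecn_request_ok(true, true, false, false, false, true));
        return 0;
}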
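Finally, the tcp_output.c hunk above replaces the old per-write size-goal logic with tcp_tso_autosize(), which sizes each TSO burst to roughly one millisecond of the socket's pacing rate, clamped between tcp_min_tso_segs and the device GSO limits. Here is a user-space sketch of that arithmetic under stated assumptions: the MAX_TCP_HEADER value and the default of 2 for tcp_min_tso_segs are guesses made only so the example is runnable, and tso_autosize() is an illustrative name.

/* Standalone sketch (not kernel code) of the TSO autosizing heuristic:
 * one TSO burst should cover about one millisecond of the pacing rate.
 */
#include <stdio.h>
#include <stdint.h>

#define MAX_TCP_HEADER          (128 + 64)      /* assumption for the demo */
#define MIN_TSO_SEGS_DEFAULT    2               /* assumed sysctl default */

static uint32_t min_u32(uint32_t a, uint32_t b) { return a < b ? a : b; }
static uint32_t max_u32(uint32_t a, uint32_t b) { return a > b ? a : b; }

static uint32_t tso_autosize(uint64_t pacing_rate,      /* bytes per second */
                             uint32_t mss_now,
                             uint32_t gso_max_size,
                             uint32_t gso_max_segs,
                             uint32_t min_tso_segs)
{
        /* pacing_rate >> 10 approximates one millisecond's byte budget */
        uint32_t bytes = min_u32((uint32_t)(pacing_rate >> 10),
                                 gso_max_size - 1 - MAX_TCP_HEADER);
        uint32_t segs = max_u32(bytes / mss_now, min_tso_segs);

        return min_u32(segs, gso_max_segs);
}

int main(void)
{
        /* e.g. 100 Mbit/s pacing, 1448-byte MSS, typical 64KB/64-seg limits */
        uint32_t segs = tso_autosize(100000000ull / 8, 1448, 65536, 64,
                                     MIN_TSO_SEGS_DEFAULT);

        printf("TSO burst: %u segments (~%u bytes)\n", segs, segs * 1448);
        return 0;
}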