diff options
Diffstat (limited to 'net/ipv4')
52 files changed, 1073 insertions, 461 deletions
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 35913fb77dc8..09d78d4a3cff 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -245,29 +245,6 @@ out: } EXPORT_SYMBOL(inet_listen); -u32 inet_ehash_secret __read_mostly; -EXPORT_SYMBOL(inet_ehash_secret); - -u32 ipv6_hash_secret __read_mostly; -EXPORT_SYMBOL(ipv6_hash_secret); - -/* - * inet_ehash_secret must be set exactly once, and to a non nul value - * ipv6_hash_secret must be set exactly once. - */ -void build_ehash_secret(void) -{ - u32 rnd; - - do { - get_random_bytes(&rnd, sizeof(rnd)); - } while (rnd == 0); - - if (cmpxchg(&inet_ehash_secret, 0, rnd) == 0) - get_random_bytes(&ipv6_hash_secret, sizeof(ipv6_hash_secret)); -} -EXPORT_SYMBOL(build_ehash_secret); - /* * Create an inet socket. */ @@ -284,10 +261,6 @@ static int inet_create(struct net *net, struct socket *sock, int protocol, int try_loading_module = 0; int err; - if (unlikely(!inet_ehash_secret)) - if (sock->type != SOCK_RAW && sock->type != SOCK_DGRAM) - build_ehash_secret(); - sock->state = SS_UNCONNECTED; /* Look for the requested type/protocol pair. */ @@ -1254,36 +1227,36 @@ static int inet_gso_send_check(struct sk_buff *skb) if (ihl < sizeof(*iph)) goto out; + proto = iph->protocol; + + /* Warning: after this point, iph might be no longer valid */ if (unlikely(!pskb_may_pull(skb, ihl))) goto out; - __skb_pull(skb, ihl); + skb_reset_transport_header(skb); - iph = ip_hdr(skb); - proto = iph->protocol; err = -EPROTONOSUPPORT; - rcu_read_lock(); ops = rcu_dereference(inet_offloads[proto]); if (likely(ops && ops->callbacks.gso_send_check)) err = ops->callbacks.gso_send_check(skb); - rcu_read_unlock(); out: return err; } static struct sk_buff *inet_gso_segment(struct sk_buff *skb, - netdev_features_t features) + netdev_features_t features) { struct sk_buff *segs = ERR_PTR(-EINVAL); const struct net_offload *ops; + unsigned int offset = 0; + bool udpfrag, encap; struct iphdr *iph; int proto; + int nhoff; int ihl; int id; - unsigned int offset = 0; - bool tunnel; if (unlikely(skb_shinfo(skb)->gso_type & ~(SKB_GSO_TCPV4 | @@ -1291,12 +1264,16 @@ static struct sk_buff *inet_gso_segment(struct sk_buff *skb, SKB_GSO_DODGY | SKB_GSO_TCP_ECN | SKB_GSO_GRE | + SKB_GSO_IPIP | + SKB_GSO_SIT | SKB_GSO_TCPV6 | SKB_GSO_UDP_TUNNEL | SKB_GSO_MPLS | 0))) goto out; + skb_reset_network_header(skb); + nhoff = skb_network_header(skb) - skb_mac_header(skb); if (unlikely(!pskb_may_pull(skb, sizeof(*iph)))) goto out; @@ -1305,42 +1282,48 @@ static struct sk_buff *inet_gso_segment(struct sk_buff *skb, if (ihl < sizeof(*iph)) goto out; + id = ntohs(iph->id); + proto = iph->protocol; + + /* Warning: after this point, iph might be no longer valid */ if (unlikely(!pskb_may_pull(skb, ihl))) goto out; + __skb_pull(skb, ihl); - tunnel = !!skb->encapsulation; + encap = SKB_GSO_CB(skb)->encap_level > 0; + if (encap) + features = skb->dev->hw_enc_features & netif_skb_features(skb); + SKB_GSO_CB(skb)->encap_level += ihl; - __skb_pull(skb, ihl); skb_reset_transport_header(skb); - iph = ip_hdr(skb); - id = ntohs(iph->id); - proto = iph->protocol; + segs = ERR_PTR(-EPROTONOSUPPORT); - rcu_read_lock(); ops = rcu_dereference(inet_offloads[proto]); if (likely(ops && ops->callbacks.gso_segment)) segs = ops->callbacks.gso_segment(skb, features); - rcu_read_unlock(); if (IS_ERR_OR_NULL(segs)) goto out; + udpfrag = !!skb->encapsulation && proto == IPPROTO_UDP; skb = segs; do { - iph = ip_hdr(skb); - if (!tunnel && proto == IPPROTO_UDP) { + iph = (struct iphdr *)(skb_mac_header(skb) + nhoff); + if (udpfrag) { iph->id = htons(id); iph->frag_off = htons(offset >> 3); if (skb->next != NULL) iph->frag_off |= htons(IP_MF); - offset += (skb->len - skb->mac_len - iph->ihl * 4); - } else { + offset += skb->len - nhoff - ihl; + } else { iph->id = htons(id++); } - iph->tot_len = htons(skb->len - skb->mac_len); - iph->check = 0; - iph->check = ip_fast_csum(skb_network_header(skb), iph->ihl); + iph->tot_len = htons(skb->len - nhoff); + ip_send_check(iph); + if (encap) + skb_reset_inner_headers(skb); + skb->network_header = (u8 *)iph - skb->head; } while ((skb = skb->next)); out: @@ -1647,6 +1630,13 @@ static struct packet_offload ip_packet_offload __read_mostly = { }, }; +static const struct net_offload ipip_offload = { + .callbacks = { + .gso_send_check = inet_gso_send_check, + .gso_segment = inet_gso_segment, + }, +}; + static int __init ipv4_offload_init(void) { /* @@ -1658,6 +1648,7 @@ static int __init ipv4_offload_init(void) pr_crit("%s: Cannot add TCP protocol offload\n", __func__); dev_add_offload(&ip_packet_offload); + inet_add_offload(&ipip_offload, IPPROTO_IPIP); return 0; } @@ -1706,8 +1697,6 @@ static int __init inet_init(void) ip_static_sysctl_init(); #endif - tcp_prot.sysctl_mem = init_net.ipv4.sysctl_tcp_mem; - /* * Add all the base protocols. */ diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c index 109ee89f123e..7785b28061ac 100644 --- a/net/ipv4/esp4.c +++ b/net/ipv4/esp4.c @@ -121,7 +121,6 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb) struct aead_givcrypt_request *req; struct scatterlist *sg; struct scatterlist *asg; - struct esp_data *esp; struct sk_buff *trailer; void *tmp; u8 *iv; @@ -139,8 +138,7 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb) /* skb is pure payload to encrypt */ - esp = x->data; - aead = esp->aead; + aead = x->data; alen = crypto_aead_authsize(aead); tfclen = 0; @@ -154,8 +152,6 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb) } blksize = ALIGN(crypto_aead_blocksize(aead), 4); clen = ALIGN(skb->len + 2 + tfclen, blksize); - if (esp->padlen) - clen = ALIGN(clen, esp->padlen); plen = clen - skb->len - tfclen; err = skb_cow_data(skb, tfclen + plen + alen, &trailer); @@ -280,8 +276,7 @@ static int esp_input_done2(struct sk_buff *skb, int err) { const struct iphdr *iph; struct xfrm_state *x = xfrm_input_state(skb); - struct esp_data *esp = x->data; - struct crypto_aead *aead = esp->aead; + struct crypto_aead *aead = x->data; int alen = crypto_aead_authsize(aead); int hlen = sizeof(struct ip_esp_hdr) + crypto_aead_ivsize(aead); int elen = skb->len - hlen; @@ -376,8 +371,7 @@ static void esp_input_done(struct crypto_async_request *base, int err) static int esp_input(struct xfrm_state *x, struct sk_buff *skb) { struct ip_esp_hdr *esph; - struct esp_data *esp = x->data; - struct crypto_aead *aead = esp->aead; + struct crypto_aead *aead = x->data; struct aead_request *req; struct sk_buff *trailer; int elen = skb->len - sizeof(*esph) - crypto_aead_ivsize(aead); @@ -459,9 +453,8 @@ out: static u32 esp4_get_mtu(struct xfrm_state *x, int mtu) { - struct esp_data *esp = x->data; - u32 blksize = ALIGN(crypto_aead_blocksize(esp->aead), 4); - u32 align = max_t(u32, blksize, esp->padlen); + struct crypto_aead *aead = x->data; + u32 blksize = ALIGN(crypto_aead_blocksize(aead), 4); unsigned int net_adj; switch (x->props.mode) { @@ -476,8 +469,8 @@ static u32 esp4_get_mtu(struct xfrm_state *x, int mtu) BUG(); } - return ((mtu - x->props.header_len - crypto_aead_authsize(esp->aead) - - net_adj) & ~(align - 1)) + net_adj - 2; + return ((mtu - x->props.header_len - crypto_aead_authsize(aead) - + net_adj) & ~(blksize - 1)) + net_adj - 2; } static void esp4_err(struct sk_buff *skb, u32 info) @@ -511,18 +504,16 @@ static void esp4_err(struct sk_buff *skb, u32 info) static void esp_destroy(struct xfrm_state *x) { - struct esp_data *esp = x->data; + struct crypto_aead *aead = x->data; - if (!esp) + if (!aead) return; - crypto_free_aead(esp->aead); - kfree(esp); + crypto_free_aead(aead); } static int esp_init_aead(struct xfrm_state *x) { - struct esp_data *esp = x->data; struct crypto_aead *aead; int err; @@ -531,7 +522,7 @@ static int esp_init_aead(struct xfrm_state *x) if (IS_ERR(aead)) goto error; - esp->aead = aead; + x->data = aead; err = crypto_aead_setkey(aead, x->aead->alg_key, (x->aead->alg_key_len + 7) / 8); @@ -548,7 +539,6 @@ error: static int esp_init_authenc(struct xfrm_state *x) { - struct esp_data *esp = x->data; struct crypto_aead *aead; struct crypto_authenc_key_param *param; struct rtattr *rta; @@ -583,7 +573,7 @@ static int esp_init_authenc(struct xfrm_state *x) if (IS_ERR(aead)) goto error; - esp->aead = aead; + x->data = aead; keylen = (x->aalg ? (x->aalg->alg_key_len + 7) / 8 : 0) + (x->ealg->alg_key_len + 7) / 8 + RTA_SPACE(sizeof(*param)); @@ -638,16 +628,11 @@ error: static int esp_init_state(struct xfrm_state *x) { - struct esp_data *esp; struct crypto_aead *aead; u32 align; int err; - esp = kzalloc(sizeof(*esp), GFP_KERNEL); - if (esp == NULL) - return -ENOMEM; - - x->data = esp; + x->data = NULL; if (x->aead) err = esp_init_aead(x); @@ -657,9 +642,7 @@ static int esp_init_state(struct xfrm_state *x) if (err) goto error; - aead = esp->aead; - - esp->padlen = 0; + aead = x->data; x->props.header_len = sizeof(struct ip_esp_hdr) + crypto_aead_ivsize(aead); @@ -683,9 +666,7 @@ static int esp_init_state(struct xfrm_state *x) } align = ALIGN(crypto_aead_blocksize(aead), 4); - if (esp->padlen) - align = max_t(u32, align, esp->padlen); - x->props.trailer_len = align + 1 + crypto_aead_authsize(esp->aead); + x->props.trailer_len = align + 1 + crypto_aead_authsize(aead); error: return err; diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index b3f627ac4ed8..d846304b7b89 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -933,7 +933,6 @@ static void nl_fib_lookup(struct fib_result_nl *frn, struct fib_table *tb) local_bh_disable(); frn->tb_id = tb->tb_id; - rcu_read_lock(); frn->err = fib_table_lookup(tb, &fl4, &res, FIB_LOOKUP_NOREF); if (!frn->err) { @@ -942,7 +941,6 @@ static void nl_fib_lookup(struct fib_result_nl *frn, struct fib_table *tb) frn->type = res.type; frn->scope = res.scope; } - rcu_read_unlock(); local_bh_enable(); } } diff --git a/net/ipv4/fib_lookup.h b/net/ipv4/fib_lookup.h index af0f14aba169..388d113fd289 100644 --- a/net/ipv4/fib_lookup.h +++ b/net/ipv4/fib_lookup.h @@ -24,21 +24,17 @@ static inline void fib_alias_accessed(struct fib_alias *fa) } /* Exported by fib_semantics.c */ -extern void fib_release_info(struct fib_info *); -extern struct fib_info *fib_create_info(struct fib_config *cfg); -extern int fib_nh_match(struct fib_config *cfg, struct fib_info *fi); -extern int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event, - u32 tb_id, u8 type, __be32 dst, - int dst_len, u8 tos, struct fib_info *fi, - unsigned int); -extern void rtmsg_fib(int event, __be32 key, struct fib_alias *fa, - int dst_len, u32 tb_id, struct nl_info *info, - unsigned int nlm_flags); -extern struct fib_alias *fib_find_alias(struct list_head *fah, - u8 tos, u32 prio); -extern int fib_detect_death(struct fib_info *fi, int order, - struct fib_info **last_resort, - int *last_idx, int dflt); +void fib_release_info(struct fib_info *); +struct fib_info *fib_create_info(struct fib_config *cfg); +int fib_nh_match(struct fib_config *cfg, struct fib_info *fi); +int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event, u32 tb_id, + u8 type, __be32 dst, int dst_len, u8 tos, struct fib_info *fi, + unsigned int); +void rtmsg_fib(int event, __be32 key, struct fib_alias *fa, int dst_len, + u32 tb_id, const struct nl_info *info, unsigned int nlm_flags); +struct fib_alias *fib_find_alias(struct list_head *fah, u8 tos, u32 prio); +int fib_detect_death(struct fib_info *fi, int order, + struct fib_info **last_resort, int *last_idx, int dflt); static inline void fib_result_assign(struct fib_result *res, struct fib_info *fi) diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index d5dbca5ecf62..e63f47a4e651 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -380,7 +380,7 @@ static inline size_t fib_nlmsg_size(struct fib_info *fi) } void rtmsg_fib(int event, __be32 key, struct fib_alias *fa, - int dst_len, u32 tb_id, struct nl_info *info, + int dst_len, u32 tb_id, const struct nl_info *info, unsigned int nlm_flags) { struct sk_buff *skb; diff --git a/net/ipv4/gre_demux.c b/net/ipv4/gre_demux.c index 736c9fc3ef93..5893e99e8299 100644 --- a/net/ipv4/gre_demux.c +++ b/net/ipv4/gre_demux.c @@ -93,35 +93,6 @@ void gre_build_header(struct sk_buff *skb, const struct tnl_ptk_info *tpi, } EXPORT_SYMBOL_GPL(gre_build_header); -struct sk_buff *gre_handle_offloads(struct sk_buff *skb, bool gre_csum) -{ - int err; - - if (likely(!skb->encapsulation)) { - skb_reset_inner_headers(skb); - skb->encapsulation = 1; - } - - if (skb_is_gso(skb)) { - err = skb_unclone(skb, GFP_ATOMIC); - if (unlikely(err)) - goto error; - skb_shinfo(skb)->gso_type |= SKB_GSO_GRE; - return skb; - } else if (skb->ip_summed == CHECKSUM_PARTIAL && gre_csum) { - err = skb_checksum_help(skb); - if (unlikely(err)) - goto error; - } else if (skb->ip_summed != CHECKSUM_PARTIAL) - skb->ip_summed = CHECKSUM_NONE; - - return skb; -error: - kfree_skb(skb); - return ERR_PTR(err); -} -EXPORT_SYMBOL_GPL(gre_handle_offloads); - static __sum16 check_checksum(struct sk_buff *skb) { __sum16 csum = 0; diff --git a/net/ipv4/gre_offload.c b/net/ipv4/gre_offload.c index 55e6bfb3a289..e5d436188464 100644 --- a/net/ipv4/gre_offload.c +++ b/net/ipv4/gre_offload.c @@ -39,7 +39,8 @@ static struct sk_buff *gre_gso_segment(struct sk_buff *skb, SKB_GSO_UDP | SKB_GSO_DODGY | SKB_GSO_TCP_ECN | - SKB_GSO_GRE))) + SKB_GSO_GRE | + SKB_GSO_IPIP))) goto out; if (unlikely(!pskb_may_pull(skb, sizeof(*greh)))) diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index 41e1c3ea8b51..56a964a553d2 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -336,12 +336,9 @@ int inet_diag_dump_one_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *in_s err = 0; out: - if (sk) { - if (sk->sk_state == TCP_TIME_WAIT) - inet_twsk_put((struct inet_timewait_sock *)sk); - else - sock_put(sk); - } + if (sk) + sock_gen_put(sk); + out_nosk: return err; } diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c index c5313a9c019b..bb075fc9a14f 100644 --- a/net/ipv4/inet_fragment.c +++ b/net/ipv4/inet_fragment.c @@ -93,9 +93,6 @@ void inet_frags_init(struct inet_frags *f) } rwlock_init(&f->lock); - f->rnd = (u32) ((totalram_pages ^ (totalram_pages >> 7)) ^ - (jiffies ^ (jiffies >> 6))); - setup_timer(&f->secret_timer, inet_frag_secret_rebuild, (unsigned long)f); f->secret_timer.expires = jiffies + f->secret_interval; diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index a4b66bbe4f21..8b9cf279450d 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -24,6 +24,31 @@ #include <net/secure_seq.h> #include <net/ip.h> +static unsigned int inet_ehashfn(struct net *net, const __be32 laddr, + const __u16 lport, const __be32 faddr, + const __be16 fport) +{ + static u32 inet_ehash_secret __read_mostly; + + net_get_random_once(&inet_ehash_secret, sizeof(inet_ehash_secret)); + + return __inet_ehashfn(laddr, lport, faddr, fport, + inet_ehash_secret + net_hash_mix(net)); +} + + +static unsigned int inet_sk_ehashfn(const struct sock *sk) +{ + const struct inet_sock *inet = inet_sk(sk); + const __be32 laddr = inet->inet_rcv_saddr; + const __u16 lport = inet->inet_num; + const __be32 faddr = inet->inet_daddr; + const __be16 fport = inet->inet_dport; + struct net *net = sock_net(sk); + + return inet_ehashfn(net, laddr, lport, faddr, fport); +} + /* * Allocate and initialize a new local port bind bucket. * The bindhash mutex for snum's hash chain must be held here. diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index b66910aaef4d..2481993a4970 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -106,6 +106,7 @@ struct ip4_create_arg { static unsigned int ipqhashfn(__be16 id, __be32 saddr, __be32 daddr, u8 prot) { + net_get_random_once(&ip4_frags.rnd, sizeof(ip4_frags.rnd)); return jhash_3words((__force u32)id << 16 | prot, (__force u32)saddr, (__force u32)daddr, ip4_frags.rnd) & (INETFRAGS_HASHSZ - 1); diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 7d8357bb2ba6..51be64e18e32 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -772,15 +772,20 @@ static inline int ip_ufo_append_data(struct sock *sk, /* initialize protocol header pointer */ skb->transport_header = skb->network_header + fragheaderlen; - skb->ip_summed = CHECKSUM_PARTIAL; skb->csum = 0; - /* specify the length of each IP datagram fragment */ - skb_shinfo(skb)->gso_size = maxfraglen - fragheaderlen; - skb_shinfo(skb)->gso_type = SKB_GSO_UDP; + __skb_queue_tail(queue, skb); + } else if (skb_is_gso(skb)) { + goto append; } + skb->ip_summed = CHECKSUM_PARTIAL; + /* specify the length of each IP datagram fragment */ + skb_shinfo(skb)->gso_size = maxfraglen - fragheaderlen; + skb_shinfo(skb)->gso_type = SKB_GSO_UDP; + +append: return skb_append_datato_frags(sk, skb, getfrag, from, (length - transhdrlen)); } @@ -805,7 +810,7 @@ static int __ip_append_data(struct sock *sk, int copy; int err; int offset = 0; - unsigned int maxfraglen, fragheaderlen; + unsigned int maxfraglen, fragheaderlen, maxnonfragsize; int csummode = CHECKSUM_NONE; struct rtable *rt = (struct rtable *)cork->dst; @@ -818,8 +823,10 @@ static int __ip_append_data(struct sock *sk, fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0); maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen; + maxnonfragsize = (inet->pmtudisc >= IP_PMTUDISC_DO) ? + mtu : 0xFFFF; - if (cork->length + length > 0xFFFF - fragheaderlen) { + if (cork->length + length > maxnonfragsize - fragheaderlen) { ip_local_error(sk, EMSGSIZE, fl4->daddr, inet->inet_dport, mtu-exthdrlen); return -EMSGSIZE; @@ -1117,7 +1124,7 @@ ssize_t ip_append_page(struct sock *sk, struct flowi4 *fl4, struct page *page, int mtu; int len; int err; - unsigned int maxfraglen, fragheaderlen, fraggap; + unsigned int maxfraglen, fragheaderlen, fraggap, maxnonfragsize; if (inet->hdrincl) return -EPERM; @@ -1141,8 +1148,10 @@ ssize_t ip_append_page(struct sock *sk, struct flowi4 *fl4, struct page *page, fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0); maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen; + maxnonfragsize = (inet->pmtudisc >= IP_PMTUDISC_DO) ? + mtu : 0xFFFF; - if (cork->length + size > 0xFFFF - fragheaderlen) { + if (cork->length + size > maxnonfragsize - fragheaderlen) { ip_local_error(sk, EMSGSIZE, fl4->daddr, inet->inet_dport, mtu); return -EMSGSIZE; } diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c index c31e3ad98ef2..42ffbc8d65c6 100644 --- a/net/ipv4/ip_tunnel_core.c +++ b/net/ipv4/ip_tunnel_core.c @@ -116,3 +116,36 @@ int iptunnel_pull_header(struct sk_buff *skb, int hdr_len, __be16 inner_proto) return 0; } EXPORT_SYMBOL_GPL(iptunnel_pull_header); + +struct sk_buff *iptunnel_handle_offloads(struct sk_buff *skb, + bool csum_help, + int gso_type_mask) +{ + int err; + + if (likely(!skb->encapsulation)) { + skb_reset_inner_headers(skb); + skb->encapsulation = 1; + } + + if (skb_is_gso(skb)) { + err = skb_unclone(skb, GFP_ATOMIC); + if (unlikely(err)) + goto error; + skb_shinfo(skb)->gso_type |= gso_type_mask; + return skb; + } + + if (skb->ip_summed == CHECKSUM_PARTIAL && csum_help) { + err = skb_checksum_help(skb); + if (unlikely(err)) + goto error; + } else if (skb->ip_summed != CHECKSUM_PARTIAL) + skb->ip_summed = CHECKSUM_NONE; + + return skb; +error: + kfree_skb(skb); + return ERR_PTR(err); +} +EXPORT_SYMBOL_GPL(iptunnel_handle_offloads); diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c index 91f69bc883fe..5d9c845d288a 100644 --- a/net/ipv4/ip_vti.c +++ b/net/ipv4/ip_vti.c @@ -61,8 +61,17 @@ static int vti_rcv(struct sk_buff *skb) iph->saddr, iph->daddr, 0); if (tunnel != NULL) { struct pcpu_tstats *tstats; + u32 oldmark = skb->mark; + int ret; - if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) + + /* temporarily mark the skb with the tunnel o_key, to + * only match policies with this mark. + */ + skb->mark = be32_to_cpu(tunnel->parms.o_key); + ret = xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb); + skb->mark = oldmark; + if (!ret) return -1; tstats = this_cpu_ptr(tunnel->dev->tstats); @@ -71,7 +80,6 @@ static int vti_rcv(struct sk_buff *skb) tstats->rx_bytes += skb->len; u64_stats_update_end(&tstats->syncp); - skb->mark = 0; secpath_reset(skb); skb->dev = tunnel->dev; return 1; @@ -103,7 +111,7 @@ static netdev_tx_t vti_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) memset(&fl4, 0, sizeof(fl4)); flowi4_init_output(&fl4, tunnel->parms.link, - be32_to_cpu(tunnel->parms.i_key), RT_TOS(tos), + be32_to_cpu(tunnel->parms.o_key), RT_TOS(tos), RT_SCOPE_UNIVERSE, IPPROTO_IPIP, 0, dst, tiph->saddr, 0, 0); diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c index 7f80fb4b82d3..fe3e9f7f1f0b 100644 --- a/net/ipv4/ipip.c +++ b/net/ipv4/ipip.c @@ -220,17 +220,17 @@ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) if (unlikely(skb->protocol != htons(ETH_P_IP))) goto tx_error; - if (likely(!skb->encapsulation)) { - skb_reset_inner_headers(skb); - skb->encapsulation = 1; - } + skb = iptunnel_handle_offloads(skb, false, SKB_GSO_IPIP); + if (IS_ERR(skb)) + goto out; ip_tunnel_xmit(skb, dev, tiph, tiph->protocol); return NETDEV_TX_OK; tx_error: - dev->stats.tx_errors++; dev_kfree_skb(skb); +out: + dev->stats.tx_errors++; return NETDEV_TX_OK; } @@ -275,6 +275,7 @@ static const struct net_device_ops ipip_netdev_ops = { #define IPIP_FEATURES (NETIF_F_SG | \ NETIF_F_FRAGLIST | \ NETIF_F_HIGHDMA | \ + NETIF_F_GSO_SOFTWARE | \ NETIF_F_HW_CSUM) static void ipip_tunnel_setup(struct net_device *dev) diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig index 1657e39b291f..40d56073cd19 100644 --- a/net/ipv4/netfilter/Kconfig +++ b/net/ipv4/netfilter/Kconfig @@ -36,6 +36,27 @@ config NF_CONNTRACK_PROC_COMPAT If unsure, say Y. +config NF_TABLES_IPV4 + depends on NF_TABLES + tristate "IPv4 nf_tables support" + +config NFT_REJECT_IPV4 + depends on NF_TABLES_IPV4 + tristate "nf_tables IPv4 reject support" + +config NFT_CHAIN_ROUTE_IPV4 + depends on NF_TABLES_IPV4 + tristate "IPv4 nf_tables route chain support" + +config NFT_CHAIN_NAT_IPV4 + depends on NF_TABLES_IPV4 + depends on NF_NAT_IPV4 && NFT_NAT + tristate "IPv4 nf_tables nat chain support" + +config NF_TABLES_ARP + depends on NF_TABLES + tristate "ARP nf_tables support" + config IP_NF_IPTABLES tristate "IP tables support (required for filtering/masq/NAT)" default m if NETFILTER_ADVANCED=n diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile index 3622b248b6dd..19df72b7ba88 100644 --- a/net/ipv4/netfilter/Makefile +++ b/net/ipv4/netfilter/Makefile @@ -27,6 +27,12 @@ obj-$(CONFIG_NF_NAT_SNMP_BASIC) += nf_nat_snmp_basic.o # NAT protocols (nf_nat) obj-$(CONFIG_NF_NAT_PROTO_GRE) += nf_nat_proto_gre.o +obj-$(CONFIG_NF_TABLES_IPV4) += nf_tables_ipv4.o +obj-$(CONFIG_NFT_REJECT_IPV4) += nft_reject_ipv4.o +obj-$(CONFIG_NFT_CHAIN_ROUTE_IPV4) += nft_chain_route_ipv4.o +obj-$(CONFIG_NFT_CHAIN_NAT_IPV4) += nft_chain_nat_ipv4.o +obj-$(CONFIG_NF_TABLES_ARP) += nf_tables_arp.o + # generic IP tables obj-$(CONFIG_IP_NF_IPTABLES) += ip_tables.o diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c index 85a4f21aac1a..59da7cde0724 100644 --- a/net/ipv4/netfilter/arp_tables.c +++ b/net/ipv4/netfilter/arp_tables.c @@ -271,6 +271,11 @@ unsigned int arpt_do_table(struct sk_buff *skb, local_bh_disable(); addend = xt_write_recseq_begin(); private = table->private; + /* + * Ensure we load private-> members after we've fetched the base + * pointer. + */ + smp_read_barrier_depends(); table_base = private->entries[smp_processor_id()]; e = get_entry(table_base, private->hook_entry[hook]); diff --git a/net/ipv4/netfilter/arptable_filter.c b/net/ipv4/netfilter/arptable_filter.c index a865f6f94013..802ddecb30b8 100644 --- a/net/ipv4/netfilter/arptable_filter.c +++ b/net/ipv4/netfilter/arptable_filter.c @@ -27,13 +27,14 @@ static const struct xt_table packet_filter = { /* The work comes in here from netfilter.c */ static unsigned int -arptable_filter_hook(unsigned int hook, struct sk_buff *skb, +arptable_filter_hook(const struct nf_hook_ops *ops, struct sk_buff *skb, const struct net_device *in, const struct net_device *out, int (*okfn)(struct sk_buff *)) { const struct net *net = dev_net((in != NULL) ? in : out); - return arpt_do_table(skb, hook, in, out, net->ipv4.arptable_filter); + return arpt_do_table(skb, ops->hooknum, in, out, + net->ipv4.arptable_filter); } static struct nf_hook_ops *arpfilter_ops __read_mostly; diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index d23118d95ff9..718dfbd30cbe 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -327,6 +327,11 @@ ipt_do_table(struct sk_buff *skb, addend = xt_write_recseq_begin(); private = table->private; cpu = smp_processor_id(); + /* + * Ensure we load private-> members after we've fetched the base + * pointer. + */ + smp_read_barrier_depends(); table_base = private->entries[cpu]; jumpstack = (struct ipt_entry **)private->jumpstack[cpu]; stackptr = per_cpu_ptr(private->stackptr, cpu); diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c index ecd808a93b63..2510c02c2d21 100644 --- a/net/ipv4/netfilter/ipt_CLUSTERIP.c +++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c @@ -495,7 +495,7 @@ static void arp_print(struct arp_payload *payload) #endif static unsigned int -arp_mangle(unsigned int hook, +arp_mangle(const struct nf_hook_ops *ops, struct sk_buff *skb, const struct net_device *in, const struct net_device *out, diff --git a/net/ipv4/netfilter/ipt_SYNPROXY.c b/net/ipv4/netfilter/ipt_SYNPROXY.c index b6346bf2fde3..01cffeaa0085 100644 --- a/net/ipv4/netfilter/ipt_SYNPROXY.c +++ b/net/ipv4/netfilter/ipt_SYNPROXY.c @@ -297,7 +297,7 @@ synproxy_tg4(struct sk_buff *skb, const struct xt_action_param *par) return XT_CONTINUE; } -static unsigned int ipv4_synproxy_hook(unsigned int hooknum, +static unsigned int ipv4_synproxy_hook(const struct nf_hook_ops *ops, struct sk_buff *skb, const struct net_device *in, const struct net_device *out, diff --git a/net/ipv4/netfilter/ipt_ULOG.c b/net/ipv4/netfilter/ipt_ULOG.c index cbc22158af49..9cb993cd224b 100644 --- a/net/ipv4/netfilter/ipt_ULOG.c +++ b/net/ipv4/netfilter/ipt_ULOG.c @@ -220,6 +220,7 @@ static void ipt_ulog_packet(struct net *net, ub->qlen++; pm = nlmsg_data(nlh); + memset(pm, 0, sizeof(*pm)); /* We might not have a timestamp, get one */ if (skb->tstamp.tv64 == 0) @@ -238,8 +239,6 @@ static void ipt_ulog_packet(struct net *net, } else if (loginfo->prefix[0] != '\0') strncpy(pm->prefix, loginfo->prefix, sizeof(pm->prefix)); - else - *(pm->prefix) = '\0'; if (in && in->hard_header_len > 0 && skb->mac_header != skb->network_header && @@ -251,13 +250,9 @@ static void ipt_ulog_packet(struct net *net, if (in) strncpy(pm->indev_name, in->name, sizeof(pm->indev_name)); - else - pm->indev_name[0] = '\0'; if (out) strncpy(pm->outdev_name, out->name, sizeof(pm->outdev_name)); - else - pm->outdev_name[0] = '\0'; /* copy_len <= skb->len, so can't fail. */ if (skb_copy_bits(skb, 0, pm->payload, copy_len) < 0) diff --git a/net/ipv4/netfilter/iptable_filter.c b/net/ipv4/netfilter/iptable_filter.c index 50af5b45c050..e08a74a243a8 100644 --- a/net/ipv4/netfilter/iptable_filter.c +++ b/net/ipv4/netfilter/iptable_filter.c @@ -33,20 +33,21 @@ static const struct xt_table packet_filter = { }; static unsigned int -iptable_filter_hook(unsigned int hook, struct sk_buff *skb, +iptable_filter_hook(const struct nf_hook_ops *ops, struct sk_buff *skb, const struct net_device *in, const struct net_device *out, int (*okfn)(struct sk_buff *)) { const struct net *net; - if (hook == NF_INET_LOCAL_OUT && + if (ops->hooknum == NF_INET_LOCAL_OUT && (skb->len < sizeof(struct iphdr) || ip_hdrlen(skb) < sizeof(struct iphdr))) /* root is playing with raw sockets. */ return NF_ACCEPT; net = dev_net((in != NULL) ? in : out); - return ipt_do_table(skb, hook, in, out, net->ipv4.iptable_filter); + return ipt_do_table(skb, ops->hooknum, in, out, + net->ipv4.iptable_filter); } static struct nf_hook_ops *filter_ops __read_mostly; diff --git a/net/ipv4/netfilter/iptable_mangle.c b/net/ipv4/netfilter/iptable_mangle.c index 0d8cd82e0fad..6a5079c34bb3 100644 --- a/net/ipv4/netfilter/iptable_mangle.c +++ b/net/ipv4/netfilter/iptable_mangle.c @@ -79,19 +79,19 @@ ipt_mangle_out(struct sk_buff *skb, const struct net_device *out) /* The work comes in here from netfilter.c. */ static unsigned int -iptable_mangle_hook(unsigned int hook, +iptable_mangle_hook(const struct nf_hook_ops *ops, struct sk_buff *skb, const struct net_device *in, const struct net_device *out, int (*okfn)(struct sk_buff *)) { - if (hook == NF_INET_LOCAL_OUT) + if (ops->hooknum == NF_INET_LOCAL_OUT) return ipt_mangle_out(skb, out); - if (hook == NF_INET_POST_ROUTING) - return ipt_do_table(skb, hook, in, out, + if (ops->hooknum == NF_INET_POST_ROUTING) + return ipt_do_table(skb, ops->hooknum, in, out, dev_net(out)->ipv4.iptable_mangle); /* PREROUTING/INPUT/FORWARD: */ - return ipt_do_table(skb, hook, in, out, + return ipt_do_table(skb, ops->hooknum, in, out, dev_net(in)->ipv4.iptable_mangle); } diff --git a/net/ipv4/netfilter/iptable_nat.c b/net/ipv4/netfilter/iptable_nat.c index 683bfaffed65..ee2886126e3d 100644 --- a/net/ipv4/netfilter/iptable_nat.c +++ b/net/ipv4/netfilter/iptable_nat.c @@ -61,7 +61,7 @@ static unsigned int nf_nat_rule_find(struct sk_buff *skb, unsigned int hooknum, } static unsigned int -nf_nat_ipv4_fn(unsigned int hooknum, +nf_nat_ipv4_fn(const struct nf_hook_ops *ops, struct sk_buff *skb, const struct net_device *in, const struct net_device *out, @@ -71,7 +71,7 @@ nf_nat_ipv4_fn(unsigned int hooknum, enum ip_conntrack_info ctinfo; struct nf_conn_nat *nat; /* maniptype == SRC for postrouting. */ - enum nf_nat_manip_type maniptype = HOOK2MANIP(hooknum); + enum nf_nat_manip_type maniptype = HOOK2MANIP(ops->hooknum); /* We never see fragments: conntrack defrags on pre-routing * and local-out, and nf_nat_out protects post-routing. @@ -108,7 +108,7 @@ nf_nat_ipv4_fn(unsigned int hooknum, case IP_CT_RELATED_REPLY: if (ip_hdr(skb)->protocol == IPPROTO_ICMP) { if (!nf_nat_icmp_reply_translation(skb, ct, ctinfo, - hooknum)) + ops->hooknum)) return NF_DROP; else return NF_ACCEPT; @@ -121,14 +121,14 @@ nf_nat_ipv4_fn(unsigned int hooknum, if (!nf_nat_initialized(ct, maniptype)) { unsigned int ret; - ret = nf_nat_rule_find(skb, hooknum, in, out, ct); + ret = nf_nat_rule_find(skb, ops->hooknum, in, out, ct); if (ret != NF_ACCEPT) return ret; } else { pr_debug("Already setup manip %s for ct %p\n", maniptype == NF_NAT_MANIP_SRC ? "SRC" : "DST", ct); - if (nf_nat_oif_changed(hooknum, ctinfo, nat, out)) + if (nf_nat_oif_changed(ops->hooknum, ctinfo, nat, out)) goto oif_changed; } break; @@ -137,11 +137,11 @@ nf_nat_ipv4_fn(unsigned int hooknum, /* ESTABLISHED */ NF_CT_ASSERT(ctinfo == IP_CT_ESTABLISHED || ctinfo == IP_CT_ESTABLISHED_REPLY); - if (nf_nat_oif_changed(hooknum, ctinfo, nat, out)) + if (nf_nat_oif_changed(ops->hooknum, ctinfo, nat, out)) goto oif_changed; } - return nf_nat_packet(ct, ctinfo, hooknum, skb); + return nf_nat_packet(ct, ctinfo, ops->hooknum, skb); oif_changed: nf_ct_kill_acct(ct, ctinfo, skb); @@ -149,7 +149,7 @@ oif_changed: } static unsigned int -nf_nat_ipv4_in(unsigned int hooknum, +nf_nat_ipv4_in(const struct nf_hook_ops *ops, struct sk_buff *skb, const struct net_device *in, const struct net_device *out, @@ -158,7 +158,7 @@ nf_nat_ipv4_in(unsigned int hooknum, unsigned int ret; __be32 daddr = ip_hdr(skb)->daddr; - ret = nf_nat_ipv4_fn(hooknum, skb, in, out, okfn); + ret = nf_nat_ipv4_fn(ops, skb, in, out, okfn); if (ret != NF_DROP && ret != NF_STOLEN && daddr != ip_hdr(skb)->daddr) skb_dst_drop(skb); @@ -167,7 +167,7 @@ nf_nat_ipv4_in(unsigned int hooknum, } static unsigned int -nf_nat_ipv4_out(unsigned int hooknum, +nf_nat_ipv4_out(const struct nf_hook_ops *ops, struct sk_buff *skb, const struct net_device *in, const struct net_device *out, @@ -185,7 +185,7 @@ nf_nat_ipv4_out(unsigned int hooknum, ip_hdrlen(skb) < sizeof(struct iphdr)) return NF_ACCEPT; - ret = nf_nat_ipv4_fn(hooknum, skb, in, out, okfn); + ret = nf_nat_ipv4_fn(ops, skb, in, out, okfn); #ifdef CONFIG_XFRM if (ret != NF_DROP && ret != NF_STOLEN && !(IPCB(skb)->flags & IPSKB_XFRM_TRANSFORMED) && @@ -207,7 +207,7 @@ nf_nat_ipv4_out(unsigned int hooknum, } static unsigned int -nf_nat_ipv4_local_fn(unsigned int hooknum, +nf_nat_ipv4_local_fn(const struct nf_hook_ops *ops, struct sk_buff *skb, const struct net_device *in, const struct net_device *out, @@ -223,7 +223,7 @@ nf_nat_ipv4_local_fn(unsigned int hooknum, ip_hdrlen(skb) < sizeof(struct iphdr)) return NF_ACCEPT; - ret = nf_nat_ipv4_fn(hooknum, skb, in, out, okfn); + ret = nf_nat_ipv4_fn(ops, skb, in, out, okfn); if (ret != NF_DROP && ret != NF_STOLEN && (ct = nf_ct_get(skb, &ctinfo)) != NULL) { enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); diff --git a/net/ipv4/netfilter/iptable_raw.c b/net/ipv4/netfilter/iptable_raw.c index 1f82aea11df6..b2f7e8f98316 100644 --- a/net/ipv4/netfilter/iptable_raw.c +++ b/net/ipv4/netfilter/iptable_raw.c @@ -20,20 +20,20 @@ static const struct xt_table packet_raw = { /* The work comes in here from netfilter.c. */ static unsigned int -iptable_raw_hook(unsigned int hook, struct sk_buff *skb, +iptable_raw_hook(const struct nf_hook_ops *ops, struct sk_buff *skb, const struct net_device *in, const struct net_device *out, int (*okfn)(struct sk_buff *)) { const struct net *net; - if (hook == NF_INET_LOCAL_OUT && + if (ops->hooknum == NF_INET_LOCAL_OUT && (skb->len < sizeof(struct iphdr) || ip_hdrlen(skb) < sizeof(struct iphdr))) /* root is playing with raw sockets. */ return NF_ACCEPT; net = dev_net((in != NULL) ? in : out); - return ipt_do_table(skb, hook, in, out, net->ipv4.iptable_raw); + return ipt_do_table(skb, ops->hooknum, in, out, net->ipv4.iptable_raw); } static struct nf_hook_ops *rawtable_ops __read_mostly; diff --git a/net/ipv4/netfilter/iptable_security.c b/net/ipv4/netfilter/iptable_security.c index f867a8d38bf7..c86647ed2078 100644 --- a/net/ipv4/netfilter/iptable_security.c +++ b/net/ipv4/netfilter/iptable_security.c @@ -37,21 +37,22 @@ static const struct xt_table security_table = { }; static unsigned int -iptable_security_hook(unsigned int hook, struct sk_buff *skb, +iptable_security_hook(const struct nf_hook_ops *ops, struct sk_buff *skb, const struct net_device *in, const struct net_device *out, int (*okfn)(struct sk_buff *)) { const struct net *net; - if (hook == NF_INET_LOCAL_OUT && + if (ops->hooknum == NF_INET_LOCAL_OUT && (skb->len < sizeof(struct iphdr) || ip_hdrlen(skb) < sizeof(struct iphdr))) /* Somebody is playing with raw sockets. */ return NF_ACCEPT; net = dev_net((in != NULL) ? in : out); - return ipt_do_table(skb, hook, in, out, net->ipv4.iptable_security); + return ipt_do_table(skb, ops->hooknum, in, out, + net->ipv4.iptable_security); } static struct nf_hook_ops *sectbl_ops __read_mostly; diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c index 86f5b34a4ed1..ecd8bec411c9 100644 --- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c +++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c @@ -92,7 +92,7 @@ static int ipv4_get_l4proto(const struct sk_buff *skb, unsigned int nhoff, return NF_ACCEPT; } -static unsigned int ipv4_helper(unsigned int hooknum, +static unsigned int ipv4_helper(const struct nf_hook_ops *ops, struct sk_buff *skb, const struct net_device *in, const struct net_device *out, @@ -121,7 +121,7 @@ static unsigned int ipv4_helper(unsigned int hooknum, ct, ctinfo); } -static unsigned int ipv4_confirm(unsigned int hooknum, +static unsigned int ipv4_confirm(const struct nf_hook_ops *ops, struct sk_buff *skb, const struct net_device *in, const struct net_device *out, @@ -147,16 +147,16 @@ out: return nf_conntrack_confirm(skb); } -static unsigned int ipv4_conntrack_in(unsigned int hooknum, +static unsigned int ipv4_conntrack_in(const struct nf_hook_ops *ops, struct sk_buff *skb, const struct net_device *in, const struct net_device *out, int (*okfn)(struct sk_buff *)) { - return nf_conntrack_in(dev_net(in), PF_INET, hooknum, skb); + return nf_conntrack_in(dev_net(in), PF_INET, ops->hooknum, skb); } -static unsigned int ipv4_conntrack_local(unsigned int hooknum, +static unsigned int ipv4_conntrack_local(const struct nf_hook_ops *ops, struct sk_buff *skb, const struct net_device *in, const struct net_device *out, @@ -166,7 +166,7 @@ static unsigned int ipv4_conntrack_local(unsigned int hooknum, if (skb->len < sizeof(struct iphdr) || ip_hdrlen(skb) < sizeof(struct iphdr)) return NF_ACCEPT; - return nf_conntrack_in(dev_net(out), PF_INET, hooknum, skb); + return nf_conntrack_in(dev_net(out), PF_INET, ops->hooknum, skb); } /* Connection tracking may drop packets, but never alters them, so diff --git a/net/ipv4/netfilter/nf_defrag_ipv4.c b/net/ipv4/netfilter/nf_defrag_ipv4.c index 742815518b0f..12e13bd82b5b 100644 --- a/net/ipv4/netfilter/nf_defrag_ipv4.c +++ b/net/ipv4/netfilter/nf_defrag_ipv4.c @@ -60,7 +60,7 @@ static enum ip_defrag_users nf_ct_defrag_user(unsigned int hooknum, return IP_DEFRAG_CONNTRACK_OUT + zone; } -static unsigned int ipv4_conntrack_defrag(unsigned int hooknum, +static unsigned int ipv4_conntrack_defrag(const struct nf_hook_ops *ops, struct sk_buff *skb, const struct net_device *in, const struct net_device *out, @@ -83,7 +83,9 @@ static unsigned int ipv4_conntrack_defrag(unsigned int hooknum, #endif /* Gather fragments. */ if (ip_is_fragment(ip_hdr(skb))) { - enum ip_defrag_users user = nf_ct_defrag_user(hooknum, skb); + enum ip_defrag_users user = + nf_ct_defrag_user(ops->hooknum, skb); + if (nf_ct_ipv4_gather_frags(skb, user)) return NF_STOLEN; } diff --git a/net/ipv4/netfilter/nf_tables_arp.c b/net/ipv4/netfilter/nf_tables_arp.c new file mode 100644 index 000000000000..3e67ef1c676f --- /dev/null +++ b/net/ipv4/netfilter/nf_tables_arp.c @@ -0,0 +1,102 @@ +/* + * Copyright (c) 2008-2010 Patrick McHardy <kaber@trash.net> + * Copyright (c) 2013 Pablo Neira Ayuso <pablo@netfilter.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * Development of this code funded by Astaro AG (http://www.astaro.com/) + */ + +#include <linux/module.h> +#include <linux/init.h> +#include <linux/netfilter_arp.h> +#include <net/netfilter/nf_tables.h> + +static struct nft_af_info nft_af_arp __read_mostly = { + .family = NFPROTO_ARP, + .nhooks = NF_ARP_NUMHOOKS, + .owner = THIS_MODULE, +}; + +static int nf_tables_arp_init_net(struct net *net) +{ + net->nft.arp = kmalloc(sizeof(struct nft_af_info), GFP_KERNEL); + if (net->nft.arp== NULL) + return -ENOMEM; + + memcpy(net->nft.arp, &nft_af_arp, sizeof(nft_af_arp)); + + if (nft_register_afinfo(net, net->nft.arp) < 0) + goto err; + + return 0; +err: + kfree(net->nft.arp); + return -ENOMEM; +} + +static void nf_tables_arp_exit_net(struct net *net) +{ + nft_unregister_afinfo(net->nft.arp); + kfree(net->nft.arp); +} + +static struct pernet_operations nf_tables_arp_net_ops = { + .init = nf_tables_arp_init_net, + .exit = nf_tables_arp_exit_net, +}; + +static unsigned int +nft_do_chain_arp(const struct nf_hook_ops *ops, + struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + struct nft_pktinfo pkt; + + nft_set_pktinfo(&pkt, ops, skb, in, out); + + return nft_do_chain_pktinfo(&pkt, ops); +} + +static struct nf_chain_type filter_arp = { + .family = NFPROTO_ARP, + .name = "filter", + .type = NFT_CHAIN_T_DEFAULT, + .hook_mask = (1 << NF_ARP_IN) | + (1 << NF_ARP_OUT) | + (1 << NF_ARP_FORWARD), + .fn = { + [NF_ARP_IN] = nft_do_chain_arp, + [NF_ARP_OUT] = nft_do_chain_arp, + [NF_ARP_FORWARD] = nft_do_chain_arp, + }, +}; + +static int __init nf_tables_arp_init(void) +{ + int ret; + + nft_register_chain_type(&filter_arp); + ret = register_pernet_subsys(&nf_tables_arp_net_ops); + if (ret < 0) + nft_unregister_chain_type(&filter_arp); + + return ret; +} + +static void __exit nf_tables_arp_exit(void) +{ + unregister_pernet_subsys(&nf_tables_arp_net_ops); + nft_unregister_chain_type(&filter_arp); +} + +module_init(nf_tables_arp_init); +module_exit(nf_tables_arp_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); +MODULE_ALIAS_NFT_FAMILY(3); /* NFPROTO_ARP */ diff --git a/net/ipv4/netfilter/nf_tables_ipv4.c b/net/ipv4/netfilter/nf_tables_ipv4.c new file mode 100644 index 000000000000..8f7536be1322 --- /dev/null +++ b/net/ipv4/netfilter/nf_tables_ipv4.c @@ -0,0 +1,128 @@ +/* + * Copyright (c) 2008 Patrick McHardy <kaber@trash.net> + * Copyright (c) 2012-2013 Pablo Neira Ayuso <pablo@netfilter.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * Development of this code funded by Astaro AG (http://www.astaro.com/) + */ + +#include <linux/init.h> +#include <linux/module.h> +#include <linux/ip.h> +#include <linux/netfilter_ipv4.h> +#include <net/netfilter/nf_tables.h> +#include <net/net_namespace.h> +#include <net/ip.h> +#include <net/net_namespace.h> +#include <net/netfilter/nf_tables_ipv4.h> + +static unsigned int nft_ipv4_output(const struct nf_hook_ops *ops, + struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + struct nft_pktinfo pkt; + + if (unlikely(skb->len < sizeof(struct iphdr) || + ip_hdr(skb)->ihl < sizeof(struct iphdr) / 4)) { + if (net_ratelimit()) + pr_info("nf_tables_ipv4: ignoring short SOCK_RAW " + "packet\n"); + return NF_ACCEPT; + } + nft_set_pktinfo_ipv4(&pkt, ops, skb, in, out); + + return nft_do_chain_pktinfo(&pkt, ops); +} + +static struct nft_af_info nft_af_ipv4 __read_mostly = { + .family = NFPROTO_IPV4, + .nhooks = NF_INET_NUMHOOKS, + .owner = THIS_MODULE, + .hooks = { + [NF_INET_LOCAL_OUT] = nft_ipv4_output, + }, +}; + +static int nf_tables_ipv4_init_net(struct net *net) +{ + net->nft.ipv4 = kmalloc(sizeof(struct nft_af_info), GFP_KERNEL); + if (net->nft.ipv4 == NULL) + return -ENOMEM; + + memcpy(net->nft.ipv4, &nft_af_ipv4, sizeof(nft_af_ipv4)); + + if (nft_register_afinfo(net, net->nft.ipv4) < 0) + goto err; + + return 0; +err: + kfree(net->nft.ipv4); + return -ENOMEM; +} + +static void nf_tables_ipv4_exit_net(struct net *net) +{ + nft_unregister_afinfo(net->nft.ipv4); + kfree(net->nft.ipv4); +} + +static struct pernet_operations nf_tables_ipv4_net_ops = { + .init = nf_tables_ipv4_init_net, + .exit = nf_tables_ipv4_exit_net, +}; + +static unsigned int +nft_do_chain_ipv4(const struct nf_hook_ops *ops, + struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + struct nft_pktinfo pkt; + + nft_set_pktinfo_ipv4(&pkt, ops, skb, in, out); + + return nft_do_chain_pktinfo(&pkt, ops); +} + +static struct nf_chain_type filter_ipv4 = { + .family = NFPROTO_IPV4, + .name = "filter", + .type = NFT_CHAIN_T_DEFAULT, + .hook_mask = (1 << NF_INET_LOCAL_IN) | + (1 << NF_INET_LOCAL_OUT) | + (1 << NF_INET_FORWARD) | + (1 << NF_INET_PRE_ROUTING) | + (1 << NF_INET_POST_ROUTING), + .fn = { + [NF_INET_LOCAL_IN] = nft_do_chain_ipv4, + [NF_INET_LOCAL_OUT] = nft_ipv4_output, + [NF_INET_FORWARD] = nft_do_chain_ipv4, + [NF_INET_PRE_ROUTING] = nft_do_chain_ipv4, + [NF_INET_POST_ROUTING] = nft_do_chain_ipv4, + }, +}; + +static int __init nf_tables_ipv4_init(void) +{ + nft_register_chain_type(&filter_ipv4); + return register_pernet_subsys(&nf_tables_ipv4_net_ops); +} + +static void __exit nf_tables_ipv4_exit(void) +{ + unregister_pernet_subsys(&nf_tables_ipv4_net_ops); + nft_unregister_chain_type(&filter_ipv4); +} + +module_init(nf_tables_ipv4_init); +module_exit(nf_tables_ipv4_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); +MODULE_ALIAS_NFT_FAMILY(AF_INET); diff --git a/net/ipv4/netfilter/nft_chain_nat_ipv4.c b/net/ipv4/netfilter/nft_chain_nat_ipv4.c new file mode 100644 index 000000000000..cf2c792cd971 --- /dev/null +++ b/net/ipv4/netfilter/nft_chain_nat_ipv4.c @@ -0,0 +1,205 @@ +/* + * Copyright (c) 2008-2009 Patrick McHardy <kaber@trash.net> + * Copyright (c) 2012 Pablo Neira Ayuso <pablo@netfilter.org> + * Copyright (c) 2012 Intel Corporation + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * Development of this code funded by Astaro AG (http://www.astaro.com/) + */ + +#include <linux/module.h> +#include <linux/init.h> +#include <linux/list.h> +#include <linux/skbuff.h> +#include <linux/ip.h> +#include <linux/netfilter.h> +#include <linux/netfilter_ipv4.h> +#include <linux/netfilter/nf_tables.h> +#include <net/netfilter/nf_conntrack.h> +#include <net/netfilter/nf_nat.h> +#include <net/netfilter/nf_nat_core.h> +#include <net/netfilter/nf_tables.h> +#include <net/netfilter/nf_tables_ipv4.h> +#include <net/netfilter/nf_nat_l3proto.h> +#include <net/ip.h> + +/* + * NAT chains + */ + +static unsigned int nf_nat_fn(const struct nf_hook_ops *ops, + struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + enum ip_conntrack_info ctinfo; + struct nf_conn *ct = nf_ct_get(skb, &ctinfo); + struct nf_conn_nat *nat; + enum nf_nat_manip_type maniptype = HOOK2MANIP(ops->hooknum); + struct nft_pktinfo pkt; + unsigned int ret; + + if (ct == NULL || nf_ct_is_untracked(ct)) + return NF_ACCEPT; + + NF_CT_ASSERT(!(ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET))); + + nat = nfct_nat(ct); + if (nat == NULL) { + /* Conntrack module was loaded late, can't add extension. */ + if (nf_ct_is_confirmed(ct)) + return NF_ACCEPT; + nat = nf_ct_ext_add(ct, NF_CT_EXT_NAT, GFP_ATOMIC); + if (nat == NULL) + return NF_ACCEPT; + } + + switch (ctinfo) { + case IP_CT_RELATED: + case IP_CT_RELATED + IP_CT_IS_REPLY: + if (ip_hdr(skb)->protocol == IPPROTO_ICMP) { + if (!nf_nat_icmp_reply_translation(skb, ct, ctinfo, + ops->hooknum)) + return NF_DROP; + else + return NF_ACCEPT; + } + /* Fall through */ + case IP_CT_NEW: + if (nf_nat_initialized(ct, maniptype)) + break; + + nft_set_pktinfo_ipv4(&pkt, ops, skb, in, out); + + ret = nft_do_chain_pktinfo(&pkt, ops); + if (ret != NF_ACCEPT) + return ret; + if (!nf_nat_initialized(ct, maniptype)) { + ret = nf_nat_alloc_null_binding(ct, ops->hooknum); + if (ret != NF_ACCEPT) + return ret; + } + default: + break; + } + + return nf_nat_packet(ct, ctinfo, ops->hooknum, skb); +} + +static unsigned int nf_nat_prerouting(const struct nf_hook_ops *ops, + struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + __be32 daddr = ip_hdr(skb)->daddr; + unsigned int ret; + + ret = nf_nat_fn(ops, skb, in, out, okfn); + if (ret != NF_DROP && ret != NF_STOLEN && + ip_hdr(skb)->daddr != daddr) { + skb_dst_drop(skb); + } + return ret; +} + +static unsigned int nf_nat_postrouting(const struct nf_hook_ops *ops, + struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + enum ip_conntrack_info ctinfo __maybe_unused; + const struct nf_conn *ct __maybe_unused; + unsigned int ret; + + ret = nf_nat_fn(ops, skb, in, out, okfn); +#ifdef CONFIG_XFRM + if (ret != NF_DROP && ret != NF_STOLEN && + (ct = nf_ct_get(skb, &ctinfo)) != NULL) { + enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); + + if (ct->tuplehash[dir].tuple.src.u3.ip != + ct->tuplehash[!dir].tuple.dst.u3.ip || + ct->tuplehash[dir].tuple.src.u.all != + ct->tuplehash[!dir].tuple.dst.u.all) + return nf_xfrm_me_harder(skb, AF_INET) == 0 ? + ret : NF_DROP; + } +#endif + return ret; +} + +static unsigned int nf_nat_output(const struct nf_hook_ops *ops, + struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + enum ip_conntrack_info ctinfo; + const struct nf_conn *ct; + unsigned int ret; + + ret = nf_nat_fn(ops, skb, in, out, okfn); + if (ret != NF_DROP && ret != NF_STOLEN && + (ct = nf_ct_get(skb, &ctinfo)) != NULL) { + enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); + + if (ct->tuplehash[dir].tuple.dst.u3.ip != + ct->tuplehash[!dir].tuple.src.u3.ip) { + if (ip_route_me_harder(skb, RTN_UNSPEC)) + ret = NF_DROP; + } +#ifdef CONFIG_XFRM + else if (ct->tuplehash[dir].tuple.dst.u.all != + ct->tuplehash[!dir].tuple.src.u.all) + if (nf_xfrm_me_harder(skb, AF_INET)) + ret = NF_DROP; +#endif + } + return ret; +} + +static struct nf_chain_type nft_chain_nat_ipv4 = { + .family = NFPROTO_IPV4, + .name = "nat", + .type = NFT_CHAIN_T_NAT, + .hook_mask = (1 << NF_INET_PRE_ROUTING) | + (1 << NF_INET_POST_ROUTING) | + (1 << NF_INET_LOCAL_OUT) | + (1 << NF_INET_LOCAL_IN), + .fn = { + [NF_INET_PRE_ROUTING] = nf_nat_prerouting, + [NF_INET_POST_ROUTING] = nf_nat_postrouting, + [NF_INET_LOCAL_OUT] = nf_nat_output, + [NF_INET_LOCAL_IN] = nf_nat_fn, + }, + .me = THIS_MODULE, +}; + +static int __init nft_chain_nat_init(void) +{ + int err; + + err = nft_register_chain_type(&nft_chain_nat_ipv4); + if (err < 0) + return err; + + return 0; +} + +static void __exit nft_chain_nat_exit(void) +{ + nft_unregister_chain_type(&nft_chain_nat_ipv4); +} + +module_init(nft_chain_nat_init); +module_exit(nft_chain_nat_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); +MODULE_ALIAS_NFT_CHAIN(AF_INET, "nat"); diff --git a/net/ipv4/netfilter/nft_chain_route_ipv4.c b/net/ipv4/netfilter/nft_chain_route_ipv4.c new file mode 100644 index 000000000000..4e6bf9a3d7aa --- /dev/null +++ b/net/ipv4/netfilter/nft_chain_route_ipv4.c @@ -0,0 +1,90 @@ +/* + * Copyright (c) 2008 Patrick McHardy <kaber@trash.net> + * Copyright (c) 2012 Pablo Neira Ayuso <pablo@netfilter.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/module.h> +#include <linux/init.h> +#include <linux/list.h> +#include <linux/skbuff.h> +#include <linux/netlink.h> +#include <linux/netfilter.h> +#include <linux/netfilter_ipv4.h> +#include <linux/netfilter/nfnetlink.h> +#include <linux/netfilter/nf_tables.h> +#include <net/netfilter/nf_tables.h> +#include <net/netfilter/nf_tables_ipv4.h> +#include <net/route.h> +#include <net/ip.h> + +static unsigned int nf_route_table_hook(const struct nf_hook_ops *ops, + struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + unsigned int ret; + struct nft_pktinfo pkt; + u32 mark; + __be32 saddr, daddr; + u_int8_t tos; + const struct iphdr *iph; + + /* root is playing with raw sockets. */ + if (skb->len < sizeof(struct iphdr) || + ip_hdrlen(skb) < sizeof(struct iphdr)) + return NF_ACCEPT; + + nft_set_pktinfo_ipv4(&pkt, ops, skb, in, out); + + mark = skb->mark; + iph = ip_hdr(skb); + saddr = iph->saddr; + daddr = iph->daddr; + tos = iph->tos; + + ret = nft_do_chain_pktinfo(&pkt, ops); + if (ret != NF_DROP && ret != NF_QUEUE) { + iph = ip_hdr(skb); + + if (iph->saddr != saddr || + iph->daddr != daddr || + skb->mark != mark || + iph->tos != tos) + if (ip_route_me_harder(skb, RTN_UNSPEC)) + ret = NF_DROP; + } + return ret; +} + +static struct nf_chain_type nft_chain_route_ipv4 = { + .family = NFPROTO_IPV4, + .name = "route", + .type = NFT_CHAIN_T_ROUTE, + .hook_mask = (1 << NF_INET_LOCAL_OUT), + .fn = { + [NF_INET_LOCAL_OUT] = nf_route_table_hook, + }, + .me = THIS_MODULE, +}; + +static int __init nft_chain_route_init(void) +{ + return nft_register_chain_type(&nft_chain_route_ipv4); +} + +static void __exit nft_chain_route_exit(void) +{ + nft_unregister_chain_type(&nft_chain_route_ipv4); +} + +module_init(nft_chain_route_init); +module_exit(nft_chain_route_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); +MODULE_ALIAS_NFT_CHAIN(AF_INET, "route"); diff --git a/net/ipv4/netfilter/nft_reject_ipv4.c b/net/ipv4/netfilter/nft_reject_ipv4.c new file mode 100644 index 000000000000..fff5ba1a33b7 --- /dev/null +++ b/net/ipv4/netfilter/nft_reject_ipv4.c @@ -0,0 +1,123 @@ +/* + * Copyright (c) 2008-2009 Patrick McHardy <kaber@trash.net> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * Development of this code funded by Astaro AG (http://www.astaro.com/) + */ + +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/netlink.h> +#include <linux/netfilter.h> +#include <linux/netfilter/nf_tables.h> +#include <net/netfilter/nf_tables.h> +#include <net/icmp.h> + +struct nft_reject { + enum nft_reject_types type:8; + u8 icmp_code; +}; + +static void nft_reject_eval(const struct nft_expr *expr, + struct nft_data data[NFT_REG_MAX + 1], + const struct nft_pktinfo *pkt) +{ + struct nft_reject *priv = nft_expr_priv(expr); + + switch (priv->type) { + case NFT_REJECT_ICMP_UNREACH: + icmp_send(pkt->skb, ICMP_DEST_UNREACH, priv->icmp_code, 0); + break; + case NFT_REJECT_TCP_RST: + break; + } + + data[NFT_REG_VERDICT].verdict = NF_DROP; +} + +static const struct nla_policy nft_reject_policy[NFTA_REJECT_MAX + 1] = { + [NFTA_REJECT_TYPE] = { .type = NLA_U32 }, + [NFTA_REJECT_ICMP_CODE] = { .type = NLA_U8 }, +}; + +static int nft_reject_init(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nlattr * const tb[]) +{ + struct nft_reject *priv = nft_expr_priv(expr); + + if (tb[NFTA_REJECT_TYPE] == NULL) + return -EINVAL; + + priv->type = ntohl(nla_get_be32(tb[NFTA_REJECT_TYPE])); + switch (priv->type) { + case NFT_REJECT_ICMP_UNREACH: + if (tb[NFTA_REJECT_ICMP_CODE] == NULL) + return -EINVAL; + priv->icmp_code = nla_get_u8(tb[NFTA_REJECT_ICMP_CODE]); + case NFT_REJECT_TCP_RST: + break; + default: + return -EINVAL; + } + + return 0; +} + +static int nft_reject_dump(struct sk_buff *skb, const struct nft_expr *expr) +{ + const struct nft_reject *priv = nft_expr_priv(expr); + + if (nla_put_be32(skb, NFTA_REJECT_TYPE, priv->type)) + goto nla_put_failure; + + switch (priv->type) { + case NFT_REJECT_ICMP_UNREACH: + if (nla_put_u8(skb, NFTA_REJECT_ICMP_CODE, priv->icmp_code)) + goto nla_put_failure; + break; + } + + return 0; + +nla_put_failure: + return -1; +} + +static struct nft_expr_type nft_reject_type; +static const struct nft_expr_ops nft_reject_ops = { + .type = &nft_reject_type, + .size = NFT_EXPR_SIZE(sizeof(struct nft_reject)), + .eval = nft_reject_eval, + .init = nft_reject_init, + .dump = nft_reject_dump, +}; + +static struct nft_expr_type nft_reject_type __read_mostly = { + .name = "reject", + .ops = &nft_reject_ops, + .policy = nft_reject_policy, + .maxattr = NFTA_REJECT_MAX, + .owner = THIS_MODULE, +}; + +static int __init nft_reject_module_init(void) +{ + return nft_register_expr(&nft_reject_type); +} + +static void __exit nft_reject_module_exit(void) +{ + nft_unregister_expr(&nft_reject_type); +} + +module_init(nft_reject_module_init); +module_exit(nft_reject_module_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); +MODULE_ALIAS_NFT_EXPR("reject"); diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 6011615e810d..d2d325382b13 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -295,7 +295,7 @@ static int rt_cpu_seq_show(struct seq_file *seq, void *v) seq_printf(seq,"%08x %08x %08x %08x %08x %08x %08x %08x " " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n", dst_entries_get_slow(&ipv4_dst_ops), - st->in_hit, + 0, /* st->in_hit */ st->in_slow_tot, st->in_slow_mc, st->in_no_route, @@ -303,16 +303,16 @@ static int rt_cpu_seq_show(struct seq_file *seq, void *v) st->in_martian_dst, st->in_martian_src, - st->out_hit, + 0, /* st->out_hit */ st->out_slow_tot, st->out_slow_mc, - st->gc_total, - st->gc_ignored, - st->gc_goal_miss, - st->gc_dst_overflow, - st->in_hlist_search, - st->out_hlist_search + 0, /* st->gc_total */ + 0, /* st->gc_ignored */ + 0, /* st->gc_goal_miss */ + 0, /* st->gc_dst_overflow */ + 0, /* st->in_hlist_search */ + 0 /* st->out_hlist_search */ ); return 0; } diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index 3b64c59b4109..b95331e6c077 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c @@ -25,15 +25,7 @@ extern int sysctl_tcp_syncookies; -__u32 syncookie_secret[2][16-4+SHA_DIGEST_WORDS]; -EXPORT_SYMBOL(syncookie_secret); - -static __init int init_syncookies(void) -{ - get_random_bytes(syncookie_secret, sizeof(syncookie_secret)); - return 0; -} -__initcall(init_syncookies); +static u32 syncookie_secret[2][16-4+SHA_DIGEST_WORDS]; #define COOKIEBITS 24 /* Upper bits store count */ #define COOKIEMASK (((__u32)1 << COOKIEBITS) - 1) @@ -44,8 +36,11 @@ static DEFINE_PER_CPU(__u32 [16 + 5 + SHA_WORKSPACE_WORDS], static u32 cookie_hash(__be32 saddr, __be32 daddr, __be16 sport, __be16 dport, u32 count, int c) { - __u32 *tmp = __get_cpu_var(ipv4_cookie_scratch); + __u32 *tmp; + + net_get_random_once(syncookie_secret, sizeof(syncookie_secret)); + tmp = __get_cpu_var(ipv4_cookie_scratch); memcpy(tmp + 4, syncookie_secret[c], sizeof(syncookie_secret[c])); tmp[0] = (__force u32)saddr; tmp[1] = (__force u32)daddr; diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index c08f096d46b5..d5b1390eebbe 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -200,49 +200,6 @@ static int proc_allowed_congestion_control(struct ctl_table *ctl, return ret; } -static int ipv4_tcp_mem(struct ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) -{ - int ret; - unsigned long vec[3]; - struct net *net = current->nsproxy->net_ns; -#ifdef CONFIG_MEMCG_KMEM - struct mem_cgroup *memcg; -#endif - - struct ctl_table tmp = { - .data = &vec, - .maxlen = sizeof(vec), - .mode = ctl->mode, - }; - - if (!write) { - ctl->data = &net->ipv4.sysctl_tcp_mem; - return proc_doulongvec_minmax(ctl, write, buffer, lenp, ppos); - } - - ret = proc_doulongvec_minmax(&tmp, write, buffer, lenp, ppos); - if (ret) - return ret; - -#ifdef CONFIG_MEMCG_KMEM - rcu_read_lock(); - memcg = mem_cgroup_from_task(current); - - tcp_prot_mem(memcg, vec[0], 0); - tcp_prot_mem(memcg, vec[1], 1); - tcp_prot_mem(memcg, vec[2], 2); - rcu_read_unlock(); -#endif - - net->ipv4.sysctl_tcp_mem[0] = vec[0]; - net->ipv4.sysctl_tcp_mem[1] = vec[1]; - net->ipv4.sysctl_tcp_mem[2] = vec[2]; - - return 0; -} - static int proc_tcp_fastopen_key(struct ctl_table *ctl, int write, void __user *buffer, size_t *lenp, loff_t *ppos) @@ -274,6 +231,11 @@ static int proc_tcp_fastopen_key(struct ctl_table *ctl, int write, ret = -EINVAL; goto bad_key; } + /* Generate a dummy secret but don't publish it. This + * is needed so we don't regenerate a new key on the + * first invocation of tcp_fastopen_cookie_gen + */ + tcp_fastopen_init_key_once(false); tcp_fastopen_reset_cipher(user_key, TCP_FASTOPEN_KEY_LENGTH); } @@ -552,6 +514,13 @@ static struct ctl_table ipv4_table[] = { .proc_handler = proc_dointvec }, { + .procname = "tcp_mem", + .maxlen = sizeof(sysctl_tcp_mem), + .data = &sysctl_tcp_mem, + .mode = 0644, + .proc_handler = proc_doulongvec_minmax, + }, + { .procname = "tcp_wmem", .data = &sysctl_tcp_wmem, .maxlen = sizeof(sysctl_tcp_wmem), @@ -860,12 +829,6 @@ static struct ctl_table ipv4_net_table[] = { .mode = 0644, .proc_handler = ipv4_local_port_range, }, - { - .procname = "tcp_mem", - .maxlen = sizeof(init_net.ipv4.sysctl_tcp_mem), - .mode = 0644, - .proc_handler = ipv4_tcp_mem, - }, { } }; @@ -875,32 +838,15 @@ static __net_init int ipv4_sysctl_init_net(struct net *net) table = ipv4_net_table; if (!net_eq(net, &init_net)) { + int i; + table = kmemdup(table, sizeof(ipv4_net_table), GFP_KERNEL); if (table == NULL) goto err_alloc; - table[0].data = - &net->ipv4.sysctl_icmp_echo_ignore_all; - table[1].data = - &net->ipv4.sysctl_icmp_echo_ignore_broadcasts; - table[2].data = - &net->ipv4.sysctl_icmp_ignore_bogus_error_responses; - table[3].data = - &net->ipv4.sysctl_icmp_errors_use_inbound_ifaddr; - table[4].data = - &net->ipv4.sysctl_icmp_ratelimit; - table[5].data = - &net->ipv4.sysctl_icmp_ratemask; - table[6].data = - &net->ipv4.sysctl_ping_group_range; - table[7].data = - &net->ipv4.sysctl_tcp_ecn; - table[8].data = - &net->ipv4.sysctl_local_ports.range; - - /* Don't export sysctls to unprivileged users */ - if (net->user_ns != &init_user_ns) - table[0].procname = NULL; + /* Update the variables to point into the current struct net */ + for (i = 0; i < ARRAY_SIZE(ipv4_net_table) - 1; i++) + table[i].data += (void *)net - (void *)&init_net; } /* @@ -917,8 +863,6 @@ static __net_init int ipv4_sysctl_init_net(struct net *net) net->ipv4.sysctl_local_ports.range[0] = 32768; net->ipv4.sysctl_local_ports.range[1] = 61000; - tcp_init_mem(net); - net->ipv4.ipv4_hdr = register_net_sysctl(net, "net/ipv4", table); if (net->ipv4.ipv4_hdr == NULL) goto err_reg; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index be4b161802e8..8e8529d3c8c9 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -288,9 +288,11 @@ int sysctl_tcp_min_tso_segs __read_mostly = 2; struct percpu_counter tcp_orphan_count; EXPORT_SYMBOL_GPL(tcp_orphan_count); +long sysctl_tcp_mem[3] __read_mostly; int sysctl_tcp_wmem[3] __read_mostly; int sysctl_tcp_rmem[3] __read_mostly; +EXPORT_SYMBOL(sysctl_tcp_mem); EXPORT_SYMBOL(sysctl_tcp_rmem); EXPORT_SYMBOL(sysctl_tcp_wmem); @@ -3097,13 +3099,13 @@ static int __init set_thash_entries(char *str) } __setup("thash_entries=", set_thash_entries); -void tcp_init_mem(struct net *net) +static void tcp_init_mem(void) { unsigned long limit = nr_free_buffer_pages() / 8; limit = max(limit, 128UL); - net->ipv4.sysctl_tcp_mem[0] = limit / 4 * 3; - net->ipv4.sysctl_tcp_mem[1] = limit; - net->ipv4.sysctl_tcp_mem[2] = net->ipv4.sysctl_tcp_mem[0] * 2; + sysctl_tcp_mem[0] = limit / 4 * 3; + sysctl_tcp_mem[1] = limit; + sysctl_tcp_mem[2] = sysctl_tcp_mem[0] * 2; } void __init tcp_init(void) @@ -3165,7 +3167,7 @@ void __init tcp_init(void) sysctl_tcp_max_orphans = cnt / 2; sysctl_max_syn_backlog = max(128, cnt / 256); - tcp_init_mem(&init_net); + tcp_init_mem(); /* Set per-socket limits to no more than 1/128 the pressure threshold */ limit = nr_free_buffer_pages() << (PAGE_SHIFT - 7); max_wshare = min(4UL*1024*1024, limit); diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c index ab7bd35bb312..766032b4a6c3 100644 --- a/net/ipv4/tcp_fastopen.c +++ b/net/ipv4/tcp_fastopen.c @@ -14,6 +14,20 @@ struct tcp_fastopen_context __rcu *tcp_fastopen_ctx; static DEFINE_SPINLOCK(tcp_fastopen_ctx_lock); +void tcp_fastopen_init_key_once(bool publish) +{ + static u8 key[TCP_FASTOPEN_KEY_LENGTH]; + + /* tcp_fastopen_reset_cipher publishes the new context + * atomically, so we allow this race happening here. + * + * All call sites of tcp_fastopen_cookie_gen also check + * for a valid cookie, so this is an acceptable risk. + */ + if (net_get_random_once(key, sizeof(key)) && publish) + tcp_fastopen_reset_cipher(key, sizeof(key)); +} + static void tcp_fastopen_ctx_free(struct rcu_head *head) { struct tcp_fastopen_context *ctx = @@ -70,6 +84,8 @@ void tcp_fastopen_cookie_gen(__be32 src, __be32 dst, __be32 path[4] = { src, dst, 0, 0 }; struct tcp_fastopen_context *ctx; + tcp_fastopen_init_key_once(true); + rcu_read_lock(); ctx = rcu_dereference(tcp_fastopen_ctx); if (ctx) { @@ -78,14 +94,3 @@ void tcp_fastopen_cookie_gen(__be32 src, __be32 dst, } rcu_read_unlock(); } - -static int __init tcp_fastopen_init(void) -{ - __u8 key[TCP_FASTOPEN_KEY_LENGTH]; - - get_random_bytes(key, sizeof(key)); - tcp_fastopen_reset_cipher(key, sizeof(key)); - return 0; -} - -late_initcall(tcp_fastopen_init); diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index eb651a069a6c..63095b218b4a 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -2903,7 +2903,8 @@ static inline bool tcp_ack_update_rtt(struct sock *sk, const int flag, * left edge of the send window. * See draft-ietf-tcplw-high-performance-00, section 3.3. */ - if (seq_rtt < 0 && tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr) + if (seq_rtt < 0 && tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr && + flag & FLAG_ACKED) seq_rtt = tcp_time_stamp - tp->rx_opt.rcv_tsecr; if (seq_rtt < 0) @@ -2918,14 +2919,19 @@ static inline bool tcp_ack_update_rtt(struct sock *sk, const int flag, } /* Compute time elapsed between (last) SYNACK and the ACK completing 3WHS. */ -static void tcp_synack_rtt_meas(struct sock *sk, struct request_sock *req) +static void tcp_synack_rtt_meas(struct sock *sk, const u32 synack_stamp) { struct tcp_sock *tp = tcp_sk(sk); s32 seq_rtt = -1; - if (tp->lsndtime && !tp->total_retrans) - seq_rtt = tcp_time_stamp - tp->lsndtime; - tcp_ack_update_rtt(sk, FLAG_SYN_ACKED, seq_rtt, -1); + if (synack_stamp && !tp->total_retrans) + seq_rtt = tcp_time_stamp - synack_stamp; + + /* If the ACK acks both the SYNACK and the (Fast Open'd) data packets + * sent in SYN_RECV, SYNACK RTT is the smooth RTT computed in tcp_ack() + */ + if (!tp->srtt) + tcp_ack_update_rtt(sk, FLAG_SYN_ACKED, seq_rtt, -1); } static void tcp_cong_avoid(struct sock *sk, u32 ack, u32 in_flight) @@ -3028,6 +3034,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, s32 seq_rtt = -1; s32 ca_seq_rtt = -1; ktime_t last_ackt = net_invalid_timestamp(); + bool rtt_update; while ((skb = tcp_write_queue_head(sk)) && skb != tcp_send_head(sk)) { struct tcp_skb_cb *scb = TCP_SKB_CB(skb); @@ -3104,14 +3111,13 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, if (skb && (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) flag |= FLAG_SACK_RENEGING; - if (tcp_ack_update_rtt(sk, flag, seq_rtt, sack_rtt) || - (flag & FLAG_ACKED)) - tcp_rearm_rto(sk); + rtt_update = tcp_ack_update_rtt(sk, flag, seq_rtt, sack_rtt); if (flag & FLAG_ACKED) { const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops; + tcp_rearm_rto(sk); if (unlikely(icsk->icsk_mtup.probe_size && !after(tp->mtu_probe.probe_seq_end, tp->snd_una))) { tcp_mtup_probe_success(sk); @@ -3150,6 +3156,13 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, ca_ops->pkts_acked(sk, pkts_acked, rtt_us); } + } else if (skb && rtt_update && sack_rtt >= 0 && + sack_rtt > (s32)(now - TCP_SKB_CB(skb)->when)) { + /* Do not re-arm RTO if the sack RTT is measured from data sent + * after when the head was last (re)transmitted. Otherwise the + * timeout may continue to extend in loss recovery. + */ + tcp_rearm_rto(sk); } #if FASTRETRANS_DEBUG > 0 @@ -3338,7 +3351,7 @@ static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag) tcp_init_cwnd_reduction(sk, true); tcp_set_ca_state(sk, TCP_CA_CWR); tcp_end_cwnd_reduction(sk); - tcp_set_ca_state(sk, TCP_CA_Open); + tcp_try_keep_open(sk); NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPLOSSPROBERECOVERY); } @@ -5626,6 +5639,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, struct request_sock *req; int queued = 0; bool acceptable; + u32 synack_stamp; tp->rx_opt.saw_tstamp = 0; @@ -5708,9 +5722,11 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, * so release it. */ if (req) { + synack_stamp = tcp_rsk(req)->snt_synack; tp->total_retrans = req->num_retrans; reqsk_fastopen_remove(sk, req, false); } else { + synack_stamp = tp->lsndtime; /* Make sure socket is routed, for correct metrics. */ icsk->icsk_af_ops->rebuild_header(sk); tcp_init_congestion_control(sk); @@ -5733,7 +5749,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, tp->snd_una = TCP_SKB_CB(skb)->ack_seq; tp->snd_wnd = ntohs(th->window) << tp->rx_opt.snd_wscale; tcp_init_wl(tp, TCP_SKB_CB(skb)->seq); - tcp_synack_rtt_meas(sk, req); + tcp_synack_rtt_meas(sk, synack_stamp); if (tp->rx_opt.tstamp_ok) tp->advmss -= TCPOLEN_TSTAMP_ALIGNED; @@ -5751,6 +5767,8 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, } else tcp_init_metrics(sk); + tcp_update_pacing_rate(sk); + /* Prevent spurious tcp_cwnd_restart() on first data packet */ tp->lsndtime = tcp_time_stamp; diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 114d1b748cbb..300ab2c93f29 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2749,6 +2749,7 @@ struct proto tcp_prot = { .orphan_count = &tcp_orphan_count, .memory_allocated = &tcp_memory_allocated, .memory_pressure = &tcp_memory_pressure, + .sysctl_mem = sysctl_tcp_mem, .sysctl_wmem = sysctl_tcp_wmem, .sysctl_rmem = sysctl_tcp_rmem, .max_header = MAX_TCP_HEADER, diff --git a/net/ipv4/tcp_memcontrol.c b/net/ipv4/tcp_memcontrol.c index 559d4ae6ebf4..03e9154f7e68 100644 --- a/net/ipv4/tcp_memcontrol.c +++ b/net/ipv4/tcp_memcontrol.c @@ -6,15 +6,10 @@ #include <linux/memcontrol.h> #include <linux/module.h> -static inline struct tcp_memcontrol *tcp_from_cgproto(struct cg_proto *cg_proto) -{ - return container_of(cg_proto, struct tcp_memcontrol, cg_proto); -} - static void memcg_tcp_enter_memory_pressure(struct sock *sk) { if (sk->sk_cgrp->memory_pressure) - *sk->sk_cgrp->memory_pressure = 1; + sk->sk_cgrp->memory_pressure = 1; } EXPORT_SYMBOL(memcg_tcp_enter_memory_pressure); @@ -27,34 +22,24 @@ int tcp_init_cgroup(struct mem_cgroup *memcg, struct cgroup_subsys *ss) */ struct res_counter *res_parent = NULL; struct cg_proto *cg_proto, *parent_cg; - struct tcp_memcontrol *tcp; struct mem_cgroup *parent = parent_mem_cgroup(memcg); - struct net *net = current->nsproxy->net_ns; cg_proto = tcp_prot.proto_cgroup(memcg); if (!cg_proto) return 0; - tcp = tcp_from_cgproto(cg_proto); - - tcp->tcp_prot_mem[0] = net->ipv4.sysctl_tcp_mem[0]; - tcp->tcp_prot_mem[1] = net->ipv4.sysctl_tcp_mem[1]; - tcp->tcp_prot_mem[2] = net->ipv4.sysctl_tcp_mem[2]; - tcp->tcp_memory_pressure = 0; + cg_proto->sysctl_mem[0] = sysctl_tcp_mem[0]; + cg_proto->sysctl_mem[1] = sysctl_tcp_mem[1]; + cg_proto->sysctl_mem[2] = sysctl_tcp_mem[2]; + cg_proto->memory_pressure = 0; + cg_proto->memcg = memcg; parent_cg = tcp_prot.proto_cgroup(parent); if (parent_cg) - res_parent = parent_cg->memory_allocated; - - res_counter_init(&tcp->tcp_memory_allocated, res_parent); - percpu_counter_init(&tcp->tcp_sockets_allocated, 0); + res_parent = &parent_cg->memory_allocated; - cg_proto->enter_memory_pressure = memcg_tcp_enter_memory_pressure; - cg_proto->memory_pressure = &tcp->tcp_memory_pressure; - cg_proto->sysctl_mem = tcp->tcp_prot_mem; - cg_proto->memory_allocated = &tcp->tcp_memory_allocated; - cg_proto->sockets_allocated = &tcp->tcp_sockets_allocated; - cg_proto->memcg = memcg; + res_counter_init(&cg_proto->memory_allocated, res_parent); + percpu_counter_init(&cg_proto->sockets_allocated, 0); return 0; } @@ -63,21 +48,17 @@ EXPORT_SYMBOL(tcp_init_cgroup); void tcp_destroy_cgroup(struct mem_cgroup *memcg) { struct cg_proto *cg_proto; - struct tcp_memcontrol *tcp; cg_proto = tcp_prot.proto_cgroup(memcg); if (!cg_proto) return; - tcp = tcp_from_cgproto(cg_proto); - percpu_counter_destroy(&tcp->tcp_sockets_allocated); + percpu_counter_destroy(&cg_proto->sockets_allocated); } EXPORT_SYMBOL(tcp_destroy_cgroup); static int tcp_update_limit(struct mem_cgroup *memcg, u64 val) { - struct net *net = current->nsproxy->net_ns; - struct tcp_memcontrol *tcp; struct cg_proto *cg_proto; u64 old_lim; int i; @@ -90,16 +71,14 @@ static int tcp_update_limit(struct mem_cgroup *memcg, u64 val) if (val > RES_COUNTER_MAX) val = RES_COUNTER_MAX; - tcp = tcp_from_cgproto(cg_proto); - - old_lim = res_counter_read_u64(&tcp->tcp_memory_allocated, RES_LIMIT); - ret = res_counter_set_limit(&tcp->tcp_memory_allocated, val); + old_lim = res_counter_read_u64(&cg_proto->memory_allocated, RES_LIMIT); + ret = res_counter_set_limit(&cg_proto->memory_allocated, val); if (ret) return ret; for (i = 0; i < 3; i++) - tcp->tcp_prot_mem[i] = min_t(long, val >> PAGE_SHIFT, - net->ipv4.sysctl_tcp_mem[i]); + cg_proto->sysctl_mem[i] = min_t(long, val >> PAGE_SHIFT, + sysctl_tcp_mem[i]); if (val == RES_COUNTER_MAX) clear_bit(MEMCG_SOCK_ACTIVE, &cg_proto->flags); @@ -156,28 +135,24 @@ static int tcp_cgroup_write(struct cgroup_subsys_state *css, struct cftype *cft, static u64 tcp_read_stat(struct mem_cgroup *memcg, int type, u64 default_val) { - struct tcp_memcontrol *tcp; struct cg_proto *cg_proto; cg_proto = tcp_prot.proto_cgroup(memcg); if (!cg_proto) return default_val; - tcp = tcp_from_cgproto(cg_proto); - return res_counter_read_u64(&tcp->tcp_memory_allocated, type); + return res_counter_read_u64(&cg_proto->memory_allocated, type); } static u64 tcp_read_usage(struct mem_cgroup *memcg) { - struct tcp_memcontrol *tcp; struct cg_proto *cg_proto; cg_proto = tcp_prot.proto_cgroup(memcg); if (!cg_proto) return atomic_long_read(&tcp_memory_allocated) << PAGE_SHIFT; - tcp = tcp_from_cgproto(cg_proto); - return res_counter_read_u64(&tcp->tcp_memory_allocated, RES_USAGE); + return res_counter_read_u64(&cg_proto->memory_allocated, RES_USAGE); } static u64 tcp_cgroup_read(struct cgroup_subsys_state *css, struct cftype *cft) @@ -205,54 +180,25 @@ static u64 tcp_cgroup_read(struct cgroup_subsys_state *css, struct cftype *cft) static int tcp_cgroup_reset(struct cgroup_subsys_state *css, unsigned int event) { struct mem_cgroup *memcg; - struct tcp_memcontrol *tcp; struct cg_proto *cg_proto; memcg = mem_cgroup_from_css(css); cg_proto = tcp_prot.proto_cgroup(memcg); if (!cg_proto) return 0; - tcp = tcp_from_cgproto(cg_proto); switch (event) { case RES_MAX_USAGE: - res_counter_reset_max(&tcp->tcp_memory_allocated); + res_counter_reset_max(&cg_proto->memory_allocated); break; case RES_FAILCNT: - res_counter_reset_failcnt(&tcp->tcp_memory_allocated); + res_counter_reset_failcnt(&cg_proto->memory_allocated); break; } return 0; } -unsigned long long tcp_max_memory(const struct mem_cgroup *memcg) -{ - struct tcp_memcontrol *tcp; - struct cg_proto *cg_proto; - - cg_proto = tcp_prot.proto_cgroup((struct mem_cgroup *)memcg); - if (!cg_proto) - return 0; - - tcp = tcp_from_cgproto(cg_proto); - return res_counter_read_u64(&tcp->tcp_memory_allocated, RES_LIMIT); -} - -void tcp_prot_mem(struct mem_cgroup *memcg, long val, int idx) -{ - struct tcp_memcontrol *tcp; - struct cg_proto *cg_proto; - - cg_proto = tcp_prot.proto_cgroup(memcg); - if (!cg_proto) - return; - - tcp = tcp_from_cgproto(cg_proto); - - tcp->tcp_prot_mem[idx] = val; -} - static struct cftype tcp_files[] = { { .name = "kmem.tcp.limit_in_bytes", diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c index 4a2a84110dfb..2ab09cbae74d 100644 --- a/net/ipv4/tcp_metrics.c +++ b/net/ipv4/tcp_metrics.c @@ -671,8 +671,9 @@ void tcp_fastopen_cache_set(struct sock *sk, u16 mss, struct tcp_fastopen_metrics *tfom = &tm->tcpm_fastopen; write_seqlock_bh(&fastopen_seqlock); - tfom->mss = mss; - if (cookie->len > 0) + if (mss) + tfom->mss = mss; + if (cookie && cookie->len > 0) tfom->cookie = *cookie; if (syn_lost) { ++tfom->syn_loss; diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c index 3a7525e6c086..a2b68a108eae 100644 --- a/net/ipv4/tcp_offload.c +++ b/net/ipv4/tcp_offload.c @@ -14,10 +14,11 @@ #include <net/tcp.h> #include <net/protocol.h> -struct sk_buff *tcp_tso_segment(struct sk_buff *skb, +struct sk_buff *tcp_gso_segment(struct sk_buff *skb, netdev_features_t features) { struct sk_buff *segs = ERR_PTR(-EINVAL); + unsigned int sum_truesize = 0; struct tcphdr *th; unsigned int thlen; unsigned int seq; @@ -56,6 +57,8 @@ struct sk_buff *tcp_tso_segment(struct sk_buff *skb, SKB_GSO_TCP_ECN | SKB_GSO_TCPV6 | SKB_GSO_GRE | + SKB_GSO_IPIP | + SKB_GSO_SIT | SKB_GSO_MPLS | SKB_GSO_UDP_TUNNEL | 0) || @@ -102,13 +105,7 @@ struct sk_buff *tcp_tso_segment(struct sk_buff *skb, if (copy_destructor) { skb->destructor = gso_skb->destructor; skb->sk = gso_skb->sk; - /* {tcp|sock}_wfree() use exact truesize accounting : - * sum(skb->truesize) MUST be exactly be gso_skb->truesize - * So we account mss bytes of 'true size' for each segment. - * The last segment will contain the remaining. - */ - skb->truesize = mss; - gso_skb->truesize -= mss; + sum_truesize += skb->truesize; } skb = skb->next; th = tcp_hdr(skb); @@ -125,7 +122,9 @@ struct sk_buff *tcp_tso_segment(struct sk_buff *skb, if (copy_destructor) { swap(gso_skb->sk, skb->sk); swap(gso_skb->destructor, skb->destructor); - swap(gso_skb->truesize, skb->truesize); + sum_truesize += skb->truesize; + atomic_add(sum_truesize - gso_skb->truesize, + &skb->sk->sk_wmem_alloc); } delta = htonl(oldlen + (skb_tail_pointer(skb) - @@ -139,7 +138,7 @@ struct sk_buff *tcp_tso_segment(struct sk_buff *skb, out: return segs; } -EXPORT_SYMBOL(tcp_tso_segment); +EXPORT_SYMBOL(tcp_gso_segment); struct sk_buff **tcp_gro_receive(struct sk_buff **head, struct sk_buff *skb) { @@ -320,7 +319,7 @@ static int tcp4_gro_complete(struct sk_buff *skb) static const struct net_offload tcpv4_offload = { .callbacks = { .gso_send_check = tcp_v4_gso_send_check, - .gso_segment = tcp_tso_segment, + .gso_segment = tcp_gso_segment, .gro_receive = tcp4_gro_receive, .gro_complete = tcp4_gro_complete, }, diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 2822ad021a48..672854664ff5 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -850,15 +850,15 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, BUG_ON(!skb || !tcp_skb_pcount(skb)); - /* If congestion control is doing timestamping, we must - * take such a timestamp before we potentially clone/copy. - */ - if (icsk->icsk_ca_ops->flags & TCP_CONG_RTT_STAMP) - __net_timestamp(skb); - - if (likely(clone_it)) { + if (clone_it) { const struct sk_buff *fclone = skb + 1; + /* If congestion control is doing timestamping, we must + * take such a timestamp before we potentially clone/copy. + */ + if (icsk->icsk_ca_ops->flags & TCP_CONG_RTT_STAMP) + __net_timestamp(skb); + if (unlikely(skb->fclone == SKB_FCLONE_ORIG && fclone->fclone == SKB_FCLONE_CLONE)) NET_INC_STATS_BH(sock_net(sk), @@ -986,8 +986,10 @@ static void tcp_queue_skb(struct sock *sk, struct sk_buff *skb) static void tcp_set_skb_tso_segs(const struct sock *sk, struct sk_buff *skb, unsigned int mss_now) { - if (skb->len <= mss_now || !sk_can_gso(sk) || - skb->ip_summed == CHECKSUM_NONE) { + /* Make sure we own this skb before messing gso_size/gso_segs */ + WARN_ON_ONCE(skb_cloned(skb)); + + if (skb->len <= mss_now || skb->ip_summed == CHECKSUM_NONE) { /* Avoid the costly divide in the normal * non-TSO case. */ @@ -1067,9 +1069,7 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len, if (nsize < 0) nsize = 0; - if (skb_cloned(skb) && - skb_is_nonlinear(skb) && - pskb_expand_head(skb, 0, 0, GFP_ATOMIC)) + if (skb_unclone(skb, GFP_ATOMIC)) return -ENOMEM; /* Get a new skb... force flag on. */ @@ -2344,6 +2344,8 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) int oldpcount = tcp_skb_pcount(skb); if (unlikely(oldpcount > 1)) { + if (skb_unclone(skb, GFP_ATOMIC)) + return -ENOMEM; tcp_init_tso_segs(sk, skb, cur_mss); tcp_adjust_pcount(sk, skb, oldpcount - tcp_skb_pcount(skb)); } @@ -2351,21 +2353,6 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) tcp_retrans_try_collapse(sk, skb, cur_mss); - /* Some Solaris stacks overoptimize and ignore the FIN on a - * retransmit when old data is attached. So strip it off - * since it is cheap to do so and saves bytes on the network. - */ - if (skb->len > 0 && - (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) && - tp->snd_una == (TCP_SKB_CB(skb)->end_seq - 1)) { - if (!pskb_trim(skb, 0)) { - /* Reuse, even though it does some unnecessary work */ - tcp_init_nondata_skb(skb, TCP_SKB_CB(skb)->end_seq - 1, - TCP_SKB_CB(skb)->tcp_flags); - skb->ip_summed = CHECKSUM_NONE; - } - } - /* Make a copy, if the first transmission SKB clone we made * is still in somebody's hands, else make a clone. */ diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index af07b5b23ebf..64f0354c84c7 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -156,12 +156,16 @@ static bool retransmits_timed_out(struct sock *sk, static int tcp_write_timeout(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); + struct tcp_sock *tp = tcp_sk(sk); int retry_until; bool do_reset, syn_set = false; if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) { - if (icsk->icsk_retransmits) + if (icsk->icsk_retransmits) { dst_negative_advice(sk); + if (tp->syn_fastopen || tp->syn_data) + tcp_fastopen_cache_set(sk, 0, NULL, true); + } retry_until = icsk->icsk_syn_retries ? : sysctl_tcp_syn_retries; syn_set = true; } else { diff --git a/net/ipv4/tcp_vegas.h b/net/ipv4/tcp_vegas.h index 6c0eea2f8249..0531b99d8637 100644 --- a/net/ipv4/tcp_vegas.h +++ b/net/ipv4/tcp_vegas.h @@ -15,10 +15,10 @@ struct vegas { u32 baseRTT; /* the min of all Vegas RTT measurements seen (in usec) */ }; -extern void tcp_vegas_init(struct sock *sk); -extern void tcp_vegas_state(struct sock *sk, u8 ca_state); -extern void tcp_vegas_pkts_acked(struct sock *sk, u32 cnt, s32 rtt_us); -extern void tcp_vegas_cwnd_event(struct sock *sk, enum tcp_ca_event event); -extern void tcp_vegas_get_info(struct sock *sk, u32 ext, struct sk_buff *skb); +void tcp_vegas_init(struct sock *sk); +void tcp_vegas_state(struct sock *sk, u8 ca_state); +void tcp_vegas_pkts_acked(struct sock *sk, u32 cnt, s32 rtt_us); +void tcp_vegas_cwnd_event(struct sock *sk, enum tcp_ca_event event); +void tcp_vegas_get_info(struct sock *sk, u32 ext, struct sk_buff *skb); #endif /* __TCP_VEGAS_H */ diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 9f27bb800607..89909dd730dd 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -407,6 +407,18 @@ static inline int compute_score2(struct sock *sk, struct net *net, return score; } +static unsigned int udp_ehashfn(struct net *net, const __be32 laddr, + const __u16 lport, const __be32 faddr, + const __be16 fport) +{ + static u32 udp_ehash_secret __read_mostly; + + net_get_random_once(&udp_ehash_secret, sizeof(udp_ehash_secret)); + + return __inet_ehashfn(laddr, lport, faddr, fport, + udp_ehash_secret + net_hash_mix(net)); +} + /* called with read_rcu_lock() */ static struct sock *udp4_lib_lookup2(struct net *net, @@ -430,8 +442,8 @@ begin: badness = score; reuseport = sk->sk_reuseport; if (reuseport) { - hash = inet_ehashfn(net, daddr, hnum, - saddr, sport); + hash = udp_ehashfn(net, daddr, hnum, + saddr, sport); matches = 1; } } else if (score == badness && reuseport) { @@ -511,8 +523,8 @@ begin: badness = score; reuseport = sk->sk_reuseport; if (reuseport) { - hash = inet_ehashfn(net, daddr, hnum, - saddr, sport); + hash = udp_ehashfn(net, daddr, hnum, + saddr, sport); matches = 1; } } else if (score == badness && reuseport) { diff --git a/net/ipv4/udp_impl.h b/net/ipv4/udp_impl.h index 5a681e298b90..f3c27899f62b 100644 --- a/net/ipv4/udp_impl.h +++ b/net/ipv4/udp_impl.h @@ -5,30 +5,30 @@ #include <net/protocol.h> #include <net/inet_common.h> -extern int __udp4_lib_rcv(struct sk_buff *, struct udp_table *, int ); -extern void __udp4_lib_err(struct sk_buff *, u32, struct udp_table *); +int __udp4_lib_rcv(struct sk_buff *, struct udp_table *, int); +void __udp4_lib_err(struct sk_buff *, u32, struct udp_table *); -extern int udp_v4_get_port(struct sock *sk, unsigned short snum); +int udp_v4_get_port(struct sock *sk, unsigned short snum); -extern int udp_setsockopt(struct sock *sk, int level, int optname, - char __user *optval, unsigned int optlen); -extern int udp_getsockopt(struct sock *sk, int level, int optname, - char __user *optval, int __user *optlen); +int udp_setsockopt(struct sock *sk, int level, int optname, + char __user *optval, unsigned int optlen); +int udp_getsockopt(struct sock *sk, int level, int optname, + char __user *optval, int __user *optlen); #ifdef CONFIG_COMPAT -extern int compat_udp_setsockopt(struct sock *sk, int level, int optname, - char __user *optval, unsigned int optlen); -extern int compat_udp_getsockopt(struct sock *sk, int level, int optname, - char __user *optval, int __user *optlen); +int compat_udp_setsockopt(struct sock *sk, int level, int optname, + char __user *optval, unsigned int optlen); +int compat_udp_getsockopt(struct sock *sk, int level, int optname, + char __user *optval, int __user *optlen); #endif -extern int udp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, - size_t len, int noblock, int flags, int *addr_len); -extern int udp_sendpage(struct sock *sk, struct page *page, int offset, - size_t size, int flags); -extern int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb); -extern void udp_destroy_sock(struct sock *sk); +int udp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, + size_t len, int noblock, int flags, int *addr_len); +int udp_sendpage(struct sock *sk, struct page *page, int offset, size_t size, + int flags); +int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb); +void udp_destroy_sock(struct sock *sk); #ifdef CONFIG_PROC_FS -extern int udp4_seq_show(struct seq_file *seq, void *v); +int udp4_seq_show(struct seq_file *seq, void *v); #endif #endif /* _UDP4_IMPL_H */ diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c index f35eccaa855e..83206de2bc76 100644 --- a/net/ipv4/udp_offload.c +++ b/net/ipv4/udp_offload.c @@ -52,6 +52,7 @@ static struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb, if (unlikely(type & ~(SKB_GSO_UDP | SKB_GSO_DODGY | SKB_GSO_UDP_TUNNEL | + SKB_GSO_IPIP | SKB_GSO_GRE | SKB_GSO_MPLS) || !(type & (SKB_GSO_UDP)))) goto out; diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c index 9a459be24af7..e1a63930a967 100644 --- a/net/ipv4/xfrm4_policy.c +++ b/net/ipv4/xfrm4_policy.c @@ -104,9 +104,14 @@ _decode_session4(struct sk_buff *skb, struct flowi *fl, int reverse) const struct iphdr *iph = ip_hdr(skb); u8 *xprth = skb_network_header(skb) + iph->ihl * 4; struct flowi4 *fl4 = &fl->u.ip4; + int oif = 0; + + if (skb_dst(skb)) + oif = skb_dst(skb)->dev->ifindex; memset(fl4, 0, sizeof(struct flowi4)); fl4->flowi4_mark = skb->mark; + fl4->flowi4_oif = reverse ? skb->skb_iif : oif; if (!ip_is_fragment(iph)) { switch (iph->protocol) { @@ -235,7 +240,7 @@ static struct dst_ops xfrm4_dst_ops = { .destroy = xfrm4_dst_destroy, .ifdown = xfrm4_dst_ifdown, .local_out = __ip_local_out, - .gc_thresh = 1024, + .gc_thresh = 32768, }; static struct xfrm_policy_afinfo xfrm4_policy_afinfo = { |