diff options
Diffstat (limited to 'net/ipv4')
49 files changed, 810 insertions, 778 deletions
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile index c6c8ad1d4b6d..47a0a6649a9d 100644 --- a/net/ipv4/Makefile +++ b/net/ipv4/Makefile @@ -43,7 +43,6 @@ obj-$(CONFIG_INET_DIAG) += inet_diag.o obj-$(CONFIG_INET_TCP_DIAG) += tcp_diag.o obj-$(CONFIG_INET_UDP_DIAG) += udp_diag.o obj-$(CONFIG_INET_RAW_DIAG) += raw_diag.o -obj-$(CONFIG_NET_TCPPROBE) += tcp_probe.o obj-$(CONFIG_TCP_CONG_BBR) += tcp_bbr.o obj-$(CONFIG_TCP_CONG_BIC) += tcp_bic.o obj-$(CONFIG_TCP_CONG_CDG) += tcp_cdg.o diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index f00499a46927..54cccdd8b1e3 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -121,6 +121,7 @@ #endif #include <net/l3mdev.h> +#include <trace/events/sock.h> /* The inetsw table contains everything that inet_create needs to * build a new socket. @@ -789,7 +790,8 @@ int inet_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, int addr_len = 0; int err; - sock_rps_record_flow(sk); + if (likely(!(flags & MSG_ERRQUEUE))) + sock_rps_record_flow(sk); err = sk->sk_prot->recvmsg(sk, msg, size, flags & MSG_DONTWAIT, flags & ~MSG_DONTWAIT, &addr_len); @@ -1220,6 +1222,19 @@ int inet_sk_rebuild_header(struct sock *sk) } EXPORT_SYMBOL(inet_sk_rebuild_header); +void inet_sk_set_state(struct sock *sk, int state) +{ + trace_inet_sock_set_state(sk, sk->sk_state, state); + sk->sk_state = state; +} +EXPORT_SYMBOL(inet_sk_set_state); + +void inet_sk_state_store(struct sock *sk, int newstate) +{ + trace_inet_sock_set_state(sk, sk->sk_state, newstate); + smp_store_release(&sk->sk_state, newstate); +} + struct sk_buff *inet_gso_segment(struct sk_buff *skb, netdev_features_t features) { diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index 6c231b43974d..f28f06c91ead 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -1425,7 +1425,6 @@ static int arp_seq_open(struct inode *inode, struct file *file) } static const struct file_operations arp_seq_fops = { - .owner = THIS_MODULE, .open = arp_seq_open, .read = seq_read, .llseek = seq_lseek, diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c index 61fe6e4d23fc..296d0b956bfe 100644 --- a/net/ipv4/esp4.c +++ b/net/ipv4/esp4.c @@ -121,14 +121,32 @@ static void esp_ssg_unref(struct xfrm_state *x, void *tmp) static void esp_output_done(struct crypto_async_request *base, int err) { struct sk_buff *skb = base->data; + struct xfrm_offload *xo = xfrm_offload(skb); void *tmp; - struct dst_entry *dst = skb_dst(skb); - struct xfrm_state *x = dst->xfrm; + struct xfrm_state *x; + + if (xo && (xo->flags & XFRM_DEV_RESUME)) + x = skb->sp->xvec[skb->sp->len - 1]; + else + x = skb_dst(skb)->xfrm; tmp = ESP_SKB_CB(skb)->tmp; esp_ssg_unref(x, tmp); kfree(tmp); - xfrm_output_resume(skb, err); + + if (xo && (xo->flags & XFRM_DEV_RESUME)) { + if (err) { + XFRM_INC_STATS(xs_net(x), LINUX_MIB_XFRMOUTSTATEPROTOERROR); + kfree_skb(skb); + return; + } + + skb_push(skb, skb->data - skb_mac_header(skb)); + secpath_reset(skb); + xfrm_dev_resume(skb); + } else { + xfrm_output_resume(skb, err); + } } /* Move ESP header back into place. */ @@ -825,17 +843,13 @@ static int esp_init_aead(struct xfrm_state *x) char aead_name[CRYPTO_MAX_ALG_NAME]; struct crypto_aead *aead; int err; - u32 mask = 0; err = -ENAMETOOLONG; if (snprintf(aead_name, CRYPTO_MAX_ALG_NAME, "%s(%s)", x->geniv, x->aead->alg_name) >= CRYPTO_MAX_ALG_NAME) goto error; - if (x->xso.offload_handle) - mask |= CRYPTO_ALG_ASYNC; - - aead = crypto_alloc_aead(aead_name, 0, mask); + aead = crypto_alloc_aead(aead_name, 0, 0); err = PTR_ERR(aead); if (IS_ERR(aead)) goto error; @@ -865,7 +879,6 @@ static int esp_init_authenc(struct xfrm_state *x) char authenc_name[CRYPTO_MAX_ALG_NAME]; unsigned int keylen; int err; - u32 mask = 0; err = -EINVAL; if (!x->ealg) @@ -891,10 +904,7 @@ static int esp_init_authenc(struct xfrm_state *x) goto error; } - if (x->xso.offload_handle) - mask |= CRYPTO_ALG_ASYNC; - - aead = crypto_alloc_aead(authenc_name, 0, mask); + aead = crypto_alloc_aead(authenc_name, 0, 0); err = PTR_ERR(aead); if (IS_ERR(aead)) goto error; diff --git a/net/ipv4/esp4_offload.c b/net/ipv4/esp4_offload.c index b1338e576d00..32fbd9ba3609 100644 --- a/net/ipv4/esp4_offload.c +++ b/net/ipv4/esp4_offload.c @@ -109,75 +109,36 @@ static void esp4_gso_encap(struct xfrm_state *x, struct sk_buff *skb) static struct sk_buff *esp4_gso_segment(struct sk_buff *skb, netdev_features_t features) { - __u32 seq; - int err = 0; - struct sk_buff *skb2; struct xfrm_state *x; struct ip_esp_hdr *esph; struct crypto_aead *aead; - struct sk_buff *segs = ERR_PTR(-EINVAL); netdev_features_t esp_features = features; struct xfrm_offload *xo = xfrm_offload(skb); if (!xo) - goto out; - - seq = xo->seq.low; + return ERR_PTR(-EINVAL); x = skb->sp->xvec[skb->sp->len - 1]; aead = x->data; esph = ip_esp_hdr(skb); if (esph->spi != x->id.spi) - goto out; + return ERR_PTR(-EINVAL); if (!pskb_may_pull(skb, sizeof(*esph) + crypto_aead_ivsize(aead))) - goto out; + return ERR_PTR(-EINVAL); __skb_pull(skb, sizeof(*esph) + crypto_aead_ivsize(aead)); skb->encap_hdr_csum = 1; - if (!(features & NETIF_F_HW_ESP)) + if (!(features & NETIF_F_HW_ESP) || !x->xso.offload_handle || + (x->xso.dev != skb->dev)) esp_features = features & ~(NETIF_F_SG | NETIF_F_CSUM_MASK); - segs = x->outer_mode->gso_segment(x, skb, esp_features); - if (IS_ERR_OR_NULL(segs)) - goto out; - - __skb_pull(skb, skb->data - skb_mac_header(skb)); - - skb2 = segs; - do { - struct sk_buff *nskb = skb2->next; - - xo = xfrm_offload(skb2); - xo->flags |= XFRM_GSO_SEGMENT; - xo->seq.low = seq; - xo->seq.hi = xfrm_replay_seqhi(x, seq); + xo->flags |= XFRM_GSO_SEGMENT; - if(!(features & NETIF_F_HW_ESP)) - xo->flags |= CRYPTO_FALLBACK; - - x->outer_mode->xmit(x, skb2); - - err = x->type_offload->xmit(x, skb2, esp_features); - if (err) { - kfree_skb_list(segs); - return ERR_PTR(err); - } - - if (!skb_is_gso(skb2)) - seq++; - else - seq += skb_shinfo(skb2)->gso_segs; - - skb_push(skb2, skb2->mac_len); - skb2 = nskb; - } while (skb2); - -out: - return segs; + return x->outer_mode->gso_segment(x, skb, esp_features); } static int esp_input_tail(struct xfrm_state *x, struct sk_buff *skb) @@ -204,6 +165,7 @@ static int esp_xmit(struct xfrm_state *x, struct sk_buff *skb, netdev_features_ struct crypto_aead *aead; struct esp_info esp; bool hw_offload = true; + __u32 seq; esp.inplace = true; @@ -242,23 +204,30 @@ static int esp_xmit(struct xfrm_state *x, struct sk_buff *skb, netdev_features_ return esp.nfrags; } + seq = xo->seq.low; + esph = esp.esph; esph->spi = x->id.spi; skb_push(skb, -skb_network_offset(skb)); if (xo->flags & XFRM_GSO_SEGMENT) { - esph->seq_no = htonl(xo->seq.low); - } else { - ip_hdr(skb)->tot_len = htons(skb->len); - ip_send_check(ip_hdr(skb)); + esph->seq_no = htonl(seq); + + if (!skb_is_gso(skb)) + xo->seq.low++; + else + xo->seq.low += skb_shinfo(skb)->gso_segs; } + esp.seqno = cpu_to_be64(seq + ((u64)xo->seq.hi << 32)); + + ip_hdr(skb)->tot_len = htons(skb->len); + ip_send_check(ip_hdr(skb)); + if (hw_offload) return 0; - esp.seqno = cpu_to_be64(xo->seq.low + ((u64)xo->seq.hi << 32)); - err = esp_output_tail(x, skb, &esp); if (err) return err; diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index 5ddc4aefff12..5530cd6fdbc7 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -2334,7 +2334,6 @@ static int fib_triestat_seq_open(struct inode *inode, struct file *file) } static const struct file_operations fib_triestat_fops = { - .owner = THIS_MODULE, .open = fib_triestat_seq_open, .read = seq_read, .llseek = seq_lseek, @@ -2521,7 +2520,6 @@ static int fib_trie_seq_open(struct inode *inode, struct file *file) } static const struct file_operations fib_trie_fops = { - .owner = THIS_MODULE, .open = fib_trie_seq_open, .read = seq_read, .llseek = seq_lseek, @@ -2715,7 +2713,6 @@ static int fib_route_seq_open(struct inode *inode, struct file *file) } static const struct file_operations fib_route_fops = { - .owner = THIS_MODULE, .open = fib_route_seq_open, .read = seq_read, .llseek = seq_lseek, diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index 726f6b608274..02f00be12bb0 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -2832,7 +2832,6 @@ static int igmp_mc_seq_open(struct inode *inode, struct file *file) } static const struct file_operations igmp_mc_seq_fops = { - .owner = THIS_MODULE, .open = igmp_mc_seq_open, .read = seq_read, .llseek = seq_lseek, @@ -2979,7 +2978,6 @@ static int igmp_mcf_seq_open(struct inode *inode, struct file *file) } static const struct file_operations igmp_mcf_seq_fops = { - .owner = THIS_MODULE, .open = igmp_mcf_seq_open, .read = seq_read, .llseek = seq_lseek, diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 4ca46dc08e63..12410ec6f7f7 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -685,7 +685,7 @@ static void reqsk_timer_handler(struct timer_list *t) int max_retries, thresh; u8 defer_accept; - if (sk_state_load(sk_listener) != TCP_LISTEN) + if (inet_sk_state_load(sk_listener) != TCP_LISTEN) goto drop; max_retries = icsk->icsk_syn_retries ? : net->ipv4.sysctl_tcp_synack_retries; @@ -783,7 +783,7 @@ struct sock *inet_csk_clone_lock(const struct sock *sk, if (newsk) { struct inet_connection_sock *newicsk = inet_csk(newsk); - newsk->sk_state = TCP_SYN_RECV; + inet_sk_set_state(newsk, TCP_SYN_RECV); newicsk->icsk_bind_hash = NULL; inet_sk(newsk)->inet_dport = inet_rsk(req)->ir_rmt_port; @@ -877,7 +877,7 @@ int inet_csk_listen_start(struct sock *sk, int backlog) * It is OK, because this socket enters to hash table only * after validation is complete. */ - sk_state_store(sk, TCP_LISTEN); + inet_sk_state_store(sk, TCP_LISTEN); if (!sk->sk_prot->get_port(sk, inet->inet_num)) { inet->inet_sport = htons(inet->inet_num); @@ -888,7 +888,7 @@ int inet_csk_listen_start(struct sock *sk, int backlog) return 0; } - sk->sk_state = TCP_CLOSE; + inet_sk_set_state(sk, TCP_CLOSE); return err; } EXPORT_SYMBOL_GPL(inet_csk_listen_start); diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index c9c35b61a027..a383f299ce24 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -564,12 +564,18 @@ static int inet_diag_bc_run(const struct nlattr *_bc, case INET_DIAG_BC_JMP: yes = 0; break; + case INET_DIAG_BC_S_EQ: + yes = entry->sport == op[1].no; + break; case INET_DIAG_BC_S_GE: yes = entry->sport >= op[1].no; break; case INET_DIAG_BC_S_LE: yes = entry->sport <= op[1].no; break; + case INET_DIAG_BC_D_EQ: + yes = entry->dport == op[1].no; + break; case INET_DIAG_BC_D_GE: yes = entry->dport >= op[1].no; break; @@ -802,8 +808,10 @@ static int inet_diag_bc_audit(const struct nlattr *attr, if (!valid_devcond(bc, len, &min_len)) return -EINVAL; break; + case INET_DIAG_BC_S_EQ: case INET_DIAG_BC_S_GE: case INET_DIAG_BC_S_LE: + case INET_DIAG_BC_D_EQ: case INET_DIAG_BC_D_GE: case INET_DIAG_BC_D_LE: if (!valid_port_comparison(bc, len, &min_len)) diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index e7d15fb0d94d..37b7da0b975d 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -19,6 +19,7 @@ #include <linux/slab.h> #include <linux/wait.h> #include <linux/vmalloc.h> +#include <linux/bootmem.h> #include <net/addrconf.h> #include <net/inet_connection_sock.h> @@ -168,6 +169,60 @@ int __inet_inherit_port(const struct sock *sk, struct sock *child) } EXPORT_SYMBOL_GPL(__inet_inherit_port); +static struct inet_listen_hashbucket * +inet_lhash2_bucket_sk(struct inet_hashinfo *h, struct sock *sk) +{ + u32 hash; + +#if IS_ENABLED(CONFIG_IPV6) + if (sk->sk_family == AF_INET6) + hash = ipv6_portaddr_hash(sock_net(sk), + &sk->sk_v6_rcv_saddr, + inet_sk(sk)->inet_num); + else +#endif + hash = ipv4_portaddr_hash(sock_net(sk), + inet_sk(sk)->inet_rcv_saddr, + inet_sk(sk)->inet_num); + return inet_lhash2_bucket(h, hash); +} + +static void inet_hash2(struct inet_hashinfo *h, struct sock *sk) +{ + struct inet_listen_hashbucket *ilb2; + + if (!h->lhash2) + return; + + ilb2 = inet_lhash2_bucket_sk(h, sk); + + spin_lock(&ilb2->lock); + if (sk->sk_reuseport && sk->sk_family == AF_INET6) + hlist_add_tail_rcu(&inet_csk(sk)->icsk_listen_portaddr_node, + &ilb2->head); + else + hlist_add_head_rcu(&inet_csk(sk)->icsk_listen_portaddr_node, + &ilb2->head); + ilb2->count++; + spin_unlock(&ilb2->lock); +} + +static void inet_unhash2(struct inet_hashinfo *h, struct sock *sk) +{ + struct inet_listen_hashbucket *ilb2; + + if (!h->lhash2 || + WARN_ON_ONCE(hlist_unhashed(&inet_csk(sk)->icsk_listen_portaddr_node))) + return; + + ilb2 = inet_lhash2_bucket_sk(h, sk); + + spin_lock(&ilb2->lock); + hlist_del_init_rcu(&inet_csk(sk)->icsk_listen_portaddr_node); + ilb2->count--; + spin_unlock(&ilb2->lock); +} + static inline int compute_score(struct sock *sk, struct net *net, const unsigned short hnum, const __be32 daddr, const int dif, const int sdif, bool exact_dif) @@ -207,6 +262,40 @@ static inline int compute_score(struct sock *sk, struct net *net, */ /* called with rcu_read_lock() : No refcount taken on the socket */ +static struct sock *inet_lhash2_lookup(struct net *net, + struct inet_listen_hashbucket *ilb2, + struct sk_buff *skb, int doff, + const __be32 saddr, __be16 sport, + const __be32 daddr, const unsigned short hnum, + const int dif, const int sdif) +{ + bool exact_dif = inet_exact_dif_match(net, skb); + struct inet_connection_sock *icsk; + struct sock *sk, *result = NULL; + int score, hiscore = 0; + u32 phash = 0; + + inet_lhash2_for_each_icsk_rcu(icsk, &ilb2->head) { + sk = (struct sock *)icsk; + score = compute_score(sk, net, hnum, daddr, + dif, sdif, exact_dif); + if (score > hiscore) { + if (sk->sk_reuseport) { + phash = inet_ehashfn(net, daddr, hnum, + saddr, sport); + result = reuseport_select_sock(sk, phash, + skb, doff); + if (result) + return result; + } + result = sk; + hiscore = score; + } + } + + return result; +} + struct sock *__inet_lookup_listener(struct net *net, struct inet_hashinfo *hashinfo, struct sk_buff *skb, int doff, @@ -216,32 +305,57 @@ struct sock *__inet_lookup_listener(struct net *net, { unsigned int hash = inet_lhashfn(net, hnum); struct inet_listen_hashbucket *ilb = &hashinfo->listening_hash[hash]; - int score, hiscore = 0, matches = 0, reuseport = 0; bool exact_dif = inet_exact_dif_match(net, skb); + struct inet_listen_hashbucket *ilb2; struct sock *sk, *result = NULL; + int score, hiscore = 0; + unsigned int hash2; u32 phash = 0; + if (ilb->count <= 10 || !hashinfo->lhash2) + goto port_lookup; + + /* Too many sk in the ilb bucket (which is hashed by port alone). + * Try lhash2 (which is hashed by port and addr) instead. + */ + + hash2 = ipv4_portaddr_hash(net, daddr, hnum); + ilb2 = inet_lhash2_bucket(hashinfo, hash2); + if (ilb2->count > ilb->count) + goto port_lookup; + + result = inet_lhash2_lookup(net, ilb2, skb, doff, + saddr, sport, daddr, hnum, + dif, sdif); + if (result) + return result; + + /* Lookup lhash2 with INADDR_ANY */ + + hash2 = ipv4_portaddr_hash(net, htonl(INADDR_ANY), hnum); + ilb2 = inet_lhash2_bucket(hashinfo, hash2); + if (ilb2->count > ilb->count) + goto port_lookup; + + return inet_lhash2_lookup(net, ilb2, skb, doff, + saddr, sport, daddr, hnum, + dif, sdif); + +port_lookup: sk_for_each_rcu(sk, &ilb->head) { score = compute_score(sk, net, hnum, daddr, dif, sdif, exact_dif); if (score > hiscore) { - reuseport = sk->sk_reuseport; - if (reuseport) { + if (sk->sk_reuseport) { phash = inet_ehashfn(net, daddr, hnum, saddr, sport); result = reuseport_select_sock(sk, phash, skb, doff); if (result) return result; - matches = 1; } result = sk; hiscore = score; - } else if (score == hiscore && reuseport) { - matches++; - if (reciprocal_scale(phash, matches) == 0) - result = sk; - phash = next_pseudo_random32(phash); } } return result; @@ -430,7 +544,7 @@ bool inet_ehash_nolisten(struct sock *sk, struct sock *osk) sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); } else { percpu_counter_inc(sk->sk_prot->orphan_count); - sk->sk_state = TCP_CLOSE; + inet_sk_set_state(sk, TCP_CLOSE); sock_set_flag(sk, SOCK_DEAD); inet_csk_destroy_sock(sk); } @@ -483,6 +597,8 @@ int __inet_hash(struct sock *sk, struct sock *osk) hlist_add_tail_rcu(&sk->sk_node, &ilb->head); else hlist_add_head_rcu(&sk->sk_node, &ilb->head); + inet_hash2(hashinfo, sk); + ilb->count++; sock_set_flag(sk, SOCK_RCU_FREE); sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); unlock: @@ -509,28 +625,35 @@ EXPORT_SYMBOL_GPL(inet_hash); void inet_unhash(struct sock *sk) { struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo; + struct inet_listen_hashbucket *ilb; spinlock_t *lock; bool listener = false; - int done; if (sk_unhashed(sk)) return; if (sk->sk_state == TCP_LISTEN) { - lock = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)].lock; + ilb = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)]; + lock = &ilb->lock; listener = true; } else { lock = inet_ehash_lockp(hashinfo, sk->sk_hash); } spin_lock_bh(lock); + if (sk_unhashed(sk)) + goto unlock; + if (rcu_access_pointer(sk->sk_reuseport_cb)) reuseport_detach_sock(sk); - if (listener) - done = __sk_del_node_init(sk); - else - done = __sk_nulls_del_node_init_rcu(sk); - if (done) - sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); + if (listener) { + inet_unhash2(hashinfo, sk); + __sk_del_node_init(sk); + ilb->count--; + } else { + __sk_nulls_del_node_init_rcu(sk); + } + sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); +unlock: spin_unlock_bh(lock); } EXPORT_SYMBOL_GPL(inet_unhash); @@ -665,10 +788,37 @@ void inet_hashinfo_init(struct inet_hashinfo *h) for (i = 0; i < INET_LHTABLE_SIZE; i++) { spin_lock_init(&h->listening_hash[i].lock); INIT_HLIST_HEAD(&h->listening_hash[i].head); + h->listening_hash[i].count = 0; } + + h->lhash2 = NULL; } EXPORT_SYMBOL_GPL(inet_hashinfo_init); +void __init inet_hashinfo2_init(struct inet_hashinfo *h, const char *name, + unsigned long numentries, int scale, + unsigned long low_limit, + unsigned long high_limit) +{ + unsigned int i; + + h->lhash2 = alloc_large_system_hash(name, + sizeof(*h->lhash2), + numentries, + scale, + 0, + NULL, + &h->lhash2_mask, + low_limit, + high_limit); + + for (i = 0; i <= h->lhash2_mask; i++) { + spin_lock_init(&h->lhash2[i].lock); + INIT_HLIST_HEAD(&h->lhash2[i].head); + h->lhash2[i].count = 0; + } +} + int inet_ehash_locks_alloc(struct inet_hashinfo *hashinfo) { unsigned int locksz = sizeof(spinlock_t); diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c index b563e0c46bac..c3ea4906d237 100644 --- a/net/ipv4/inet_timewait_sock.c +++ b/net/ipv4/inet_timewait_sock.c @@ -97,7 +97,7 @@ static void inet_twsk_add_bind_node(struct inet_timewait_sock *tw, * Essentially we whip up a timewait bucket, copy the relevant info into it * from the SK, and mess with hash chains and list linkage. */ -void __inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk, +void inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk, struct inet_hashinfo *hashinfo) { const struct inet_sock *inet = inet_sk(sk); @@ -119,18 +119,6 @@ void __inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk, spin_lock(lock); - /* - * Step 2: Hash TW into tcp ehash chain. - * Notes : - * - tw_refcnt is set to 4 because : - * - We have one reference from bhash chain. - * - We have one reference from ehash chain. - * - We have one reference from timer. - * - One reference for ourself (our caller will release it). - * We can use atomic_set() because prior spin_lock()/spin_unlock() - * committed into memory all tw fields. - */ - refcount_set(&tw->tw_refcnt, 4); inet_twsk_add_node_rcu(tw, &ehead->chain); /* Step 3: Remove SK from hash chain */ @@ -138,8 +126,19 @@ void __inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk, sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); spin_unlock(lock); + + /* tw_refcnt is set to 3 because we have : + * - one reference for bhash chain. + * - one reference for ehash chain. + * - one reference for timer. + * We can use atomic_set() because prior spin_lock()/spin_unlock() + * committed into memory all tw fields. + * Also note that after this point, we lost our implicit reference + * so we are not allowed to use tw anymore. + */ + refcount_set(&tw->tw_refcnt, 3); } -EXPORT_SYMBOL_GPL(__inet_twsk_hashdance); +EXPORT_SYMBOL_GPL(inet_twsk_hashdance); static void tw_timer_handler(struct timer_list *t) { @@ -271,14 +270,14 @@ restart: continue; tw = inet_twsk(sk); if ((tw->tw_family != family) || - atomic_read(&twsk_net(tw)->count)) + refcount_read(&twsk_net(tw)->count)) continue; if (unlikely(!refcount_inc_not_zero(&tw->tw_refcnt))) continue; if (unlikely((tw->tw_family != family) || - atomic_read(&twsk_net(tw)->count))) { + refcount_read(&twsk_net(tw)->count))) { inet_twsk_put(tw); goto restart; } diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 45ffd3d045d2..b61f2285816d 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -114,7 +114,8 @@ MODULE_PARM_DESC(log_ecn_error, "Log packets received with corrupted ECN"); static struct rtnl_link_ops ipgre_link_ops __read_mostly; static int ipgre_tunnel_init(struct net_device *dev); static void erspan_build_header(struct sk_buff *skb, - __be32 id, u32 index, bool truncate); + __be32 id, u32 index, + bool truncate, bool is_ipv4); static unsigned int ipgre_net_id __read_mostly; static unsigned int gre_tap_net_id __read_mostly; @@ -255,34 +256,43 @@ static int erspan_rcv(struct sk_buff *skb, struct tnl_ptk_info *tpi, { struct net *net = dev_net(skb->dev); struct metadata_dst *tun_dst = NULL; + struct erspan_base_hdr *ershdr; + struct erspan_metadata *pkt_md; struct ip_tunnel_net *itn; struct ip_tunnel *tunnel; - struct erspanhdr *ershdr; const struct iphdr *iph; - __be32 index; + int ver; int len; itn = net_generic(net, erspan_net_id); len = gre_hdr_len + sizeof(*ershdr); + /* Check based hdr len */ if (unlikely(!pskb_may_pull(skb, len))) return PACKET_REJECT; iph = ip_hdr(skb); - ershdr = (struct erspanhdr *)(skb->data + gre_hdr_len); + ershdr = (struct erspan_base_hdr *)(skb->data + gre_hdr_len); + ver = (ntohs(ershdr->ver_vlan) & VER_MASK) >> VER_OFFSET; /* The original GRE header does not have key field, * Use ERSPAN 10-bit session ID as key. */ tpi->key = cpu_to_be32(ntohs(ershdr->session_id) & ID_MASK); - index = ershdr->md.index; tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, tpi->flags | TUNNEL_KEY, iph->saddr, iph->daddr, tpi->key); if (tunnel) { + len = gre_hdr_len + erspan_hdr_len(ver); + if (unlikely(!pskb_may_pull(skb, len))) + return PACKET_REJECT; + + ershdr = (struct erspan_base_hdr *)(skb->data + gre_hdr_len); + pkt_md = (struct erspan_metadata *)(ershdr + 1); + if (__iptunnel_pull_header(skb, - gre_hdr_len + sizeof(*ershdr), + len, htons(ETH_P_TEB), false, false) < 0) goto drop; @@ -303,15 +313,27 @@ static int erspan_rcv(struct sk_buff *skb, struct tnl_ptk_info *tpi, return PACKET_REJECT; md = ip_tunnel_info_opts(&tun_dst->u.tun_info); - if (!md) - return PACKET_REJECT; + memcpy(md, pkt_md, sizeof(*md)); + md->version = ver; - md->index = index; info = &tun_dst->u.tun_info; info->key.tun_flags |= TUNNEL_ERSPAN_OPT; info->options_len = sizeof(*md); } else { - tunnel->index = ntohl(index); + tunnel->erspan_ver = ver; + if (ver == 1) { + tunnel->index = ntohl(pkt_md->u.index); + } else { + u16 md2_flags; + u16 dir, hwid; + + md2_flags = ntohs(pkt_md->u.md2.flags); + dir = (md2_flags & DIR_MASK) >> DIR_OFFSET; + hwid = (md2_flags & HWID_MASK) >> HWID_OFFSET; + tunnel->dir = dir; + tunnel->hwid = hwid; + } + } skb_reset_mac_header(skb); @@ -405,14 +427,17 @@ static int gre_rcv(struct sk_buff *skb) if (hdr_len < 0) goto drop; - if (unlikely(tpi.proto == htons(ETH_P_ERSPAN))) { + if (unlikely(tpi.proto == htons(ETH_P_ERSPAN) || + tpi.proto == htons(ETH_P_ERSPAN2))) { if (erspan_rcv(skb, &tpi, hdr_len) == PACKET_RCVD) return 0; + goto out; } if (ipgre_rcv(skb, &tpi, hdr_len) == PACKET_RCVD) return 0; +out: icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0); drop: kfree_skb(skb); @@ -560,6 +585,7 @@ static void erspan_fb_xmit(struct sk_buff *skb, struct net_device *dev, bool truncate = false; struct flowi4 fl; int tunnel_hlen; + int version; __be16 df; tun_info = skb_tunnel_info(skb); @@ -568,9 +594,13 @@ static void erspan_fb_xmit(struct sk_buff *skb, struct net_device *dev, goto err_free_skb; key = &tun_info->key; + md = ip_tunnel_info_opts(tun_info); + if (!md) + goto err_free_rt; /* ERSPAN has fixed 8 byte GRE header */ - tunnel_hlen = 8 + sizeof(struct erspanhdr); + version = md->version; + tunnel_hlen = 8 + erspan_hdr_len(version); rt = prepare_fb_xmit(skb, dev, &fl, tunnel_hlen); if (!rt) @@ -584,12 +614,23 @@ static void erspan_fb_xmit(struct sk_buff *skb, struct net_device *dev, truncate = true; } - md = ip_tunnel_info_opts(tun_info); - if (!md) - goto err_free_rt; + if (version == 1) { + erspan_build_header(skb, tunnel_id_to_key32(key->tun_id), + ntohl(md->u.index), truncate, true); + } else if (version == 2) { + u16 md2_flags; + u8 direction; + u16 hwid; - erspan_build_header(skb, tunnel_id_to_key32(key->tun_id), - ntohl(md->index), truncate); + md2_flags = ntohs(md->u.md2.flags); + direction = (md2_flags & DIR_MASK) >> DIR_OFFSET; + hwid = (md2_flags & HWID_MASK) >> HWID_OFFSET; + + erspan_build_header_v2(skb, tunnel_id_to_key32(key->tun_id), + direction, hwid, truncate, true); + } else { + goto err_free_rt; + } gre_build_header(skb, 8, TUNNEL_SEQ, htons(ETH_P_ERSPAN), 0, htonl(tunnel->o_seqno++)); @@ -668,52 +709,6 @@ free_skb: return NETDEV_TX_OK; } -static inline u8 tos_to_cos(u8 tos) -{ - u8 dscp, cos; - - dscp = tos >> 2; - cos = dscp >> 3; - return cos; -} - -static void erspan_build_header(struct sk_buff *skb, - __be32 id, u32 index, bool truncate) -{ - struct iphdr *iphdr = ip_hdr(skb); - struct ethhdr *eth = eth_hdr(skb); - enum erspan_encap_type enc_type; - struct erspanhdr *ershdr; - struct qtag_prefix { - __be16 eth_type; - __be16 tci; - } *qp; - u16 vlan_tci = 0; - - enc_type = ERSPAN_ENCAP_NOVLAN; - - /* If mirrored packet has vlan tag, extract tci and - * perserve vlan header in the mirrored frame. - */ - if (eth->h_proto == htons(ETH_P_8021Q)) { - qp = (struct qtag_prefix *)(skb->data + 2 * ETH_ALEN); - vlan_tci = ntohs(qp->tci); - enc_type = ERSPAN_ENCAP_INFRAME; - } - - skb_push(skb, sizeof(*ershdr)); - ershdr = (struct erspanhdr *)skb->data; - memset(ershdr, 0, sizeof(*ershdr)); - - ershdr->ver_vlan = htons((vlan_tci & VLAN_MASK) | - (ERSPAN_VERSION << VER_OFFSET)); - ershdr->session_id = htons((u16)(ntohl(id) & ID_MASK) | - ((tos_to_cos(iphdr->tos) << COS_OFFSET) & COS_MASK) | - (enc_type << EN_OFFSET & EN_MASK) | - ((truncate << T_OFFSET) & T_MASK)); - ershdr->md.index = htonl(index & INDEX_MASK); -} - static netdev_tx_t erspan_xmit(struct sk_buff *skb, struct net_device *dev) { @@ -737,7 +732,14 @@ static netdev_tx_t erspan_xmit(struct sk_buff *skb, } /* Push ERSPAN header */ - erspan_build_header(skb, tunnel->parms.o_key, tunnel->index, truncate); + if (tunnel->erspan_ver == 1) + erspan_build_header(skb, tunnel->parms.o_key, tunnel->index, + truncate, true); + else + erspan_build_header_v2(skb, tunnel->parms.o_key, + tunnel->dir, tunnel->hwid, + truncate, true); + tunnel->parms.o_flags &= ~TUNNEL_KEY; __gre_xmit(skb, dev, &tunnel->parms.iph, htons(ETH_P_ERSPAN)); return NETDEV_TX_OK; @@ -1209,13 +1211,32 @@ static int ipgre_netlink_parms(struct net_device *dev, if (data[IFLA_GRE_FWMARK]) *fwmark = nla_get_u32(data[IFLA_GRE_FWMARK]); - if (data[IFLA_GRE_ERSPAN_INDEX]) { - t->index = nla_get_u32(data[IFLA_GRE_ERSPAN_INDEX]); + if (data[IFLA_GRE_ERSPAN_VER]) { + t->erspan_ver = nla_get_u8(data[IFLA_GRE_ERSPAN_VER]); - if (t->index & ~INDEX_MASK) + if (t->erspan_ver != 1 && t->erspan_ver != 2) return -EINVAL; } + if (t->erspan_ver == 1) { + if (data[IFLA_GRE_ERSPAN_INDEX]) { + t->index = nla_get_u32(data[IFLA_GRE_ERSPAN_INDEX]); + if (t->index & ~INDEX_MASK) + return -EINVAL; + } + } else if (t->erspan_ver == 2) { + if (data[IFLA_GRE_ERSPAN_DIR]) { + t->dir = nla_get_u8(data[IFLA_GRE_ERSPAN_DIR]); + if (t->dir & ~(DIR_MASK >> DIR_OFFSET)) + return -EINVAL; + } + if (data[IFLA_GRE_ERSPAN_HWID]) { + t->hwid = nla_get_u16(data[IFLA_GRE_ERSPAN_HWID]); + if (t->hwid & ~(HWID_MASK >> HWID_OFFSET)) + return -EINVAL; + } + } + return 0; } @@ -1282,7 +1303,7 @@ static int erspan_tunnel_init(struct net_device *dev) tunnel->tun_hlen = 8; tunnel->parms.iph.protocol = IPPROTO_GRE; tunnel->hlen = tunnel->tun_hlen + tunnel->encap_hlen + - sizeof(struct erspanhdr); + erspan_hdr_len(tunnel->erspan_ver); t_hlen = tunnel->hlen + sizeof(struct iphdr); dev->needed_headroom = LL_MAX_HEADER + t_hlen + 4; @@ -1413,6 +1434,12 @@ static size_t ipgre_get_size(const struct net_device *dev) nla_total_size(4) + /* IFLA_GRE_ERSPAN_INDEX */ nla_total_size(4) + + /* IFLA_GRE_ERSPAN_VER */ + nla_total_size(1) + + /* IFLA_GRE_ERSPAN_DIR */ + nla_total_size(1) + + /* IFLA_GRE_ERSPAN_HWID */ + nla_total_size(2) + 0; } @@ -1455,9 +1482,18 @@ static int ipgre_fill_info(struct sk_buff *skb, const struct net_device *dev) goto nla_put_failure; } - if (t->index) + if (nla_put_u8(skb, IFLA_GRE_ERSPAN_VER, t->erspan_ver)) + goto nla_put_failure; + + if (t->erspan_ver == 1) { if (nla_put_u32(skb, IFLA_GRE_ERSPAN_INDEX, t->index)) goto nla_put_failure; + } else if (t->erspan_ver == 2) { + if (nla_put_u8(skb, IFLA_GRE_ERSPAN_DIR, t->dir)) + goto nla_put_failure; + if (nla_put_u16(skb, IFLA_GRE_ERSPAN_HWID, t->hwid)) + goto nla_put_failure; + } return 0; @@ -1493,6 +1529,9 @@ static const struct nla_policy ipgre_policy[IFLA_GRE_MAX + 1] = { [IFLA_GRE_IGNORE_DF] = { .type = NLA_U8 }, [IFLA_GRE_FWMARK] = { .type = NLA_U32 }, [IFLA_GRE_ERSPAN_INDEX] = { .type = NLA_U32 }, + [IFLA_GRE_ERSPAN_VER] = { .type = NLA_U8 }, + [IFLA_GRE_ERSPAN_DIR] = { .type = NLA_U8 }, + [IFLA_GRE_ERSPAN_HWID] = { .type = NLA_U16 }, }; static struct rtnl_link_ops ipgre_link_ops __read_mostly = { diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c index abdebca848c9..e9e488e72900 100644 --- a/net/ipv4/ipconfig.c +++ b/net/ipv4/ipconfig.c @@ -1322,7 +1322,6 @@ static int pnp_seq_open(struct inode *indoe, struct file *file) } static const struct file_operations pnp_seq_fops = { - .owner = THIS_MODULE, .open = pnp_seq_open, .read = seq_read, .llseek = seq_lseek, diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index fd5f19c988e4..a819fab45d17 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -3045,7 +3045,6 @@ static int ipmr_vif_open(struct inode *inode, struct file *file) } static const struct file_operations ipmr_vif_fops = { - .owner = THIS_MODULE, .open = ipmr_vif_open, .read = seq_read, .llseek = seq_lseek, @@ -3198,7 +3197,6 @@ static int ipmr_mfc_open(struct inode *inode, struct file *file) } static const struct file_operations ipmr_mfc_fops = { - .owner = THIS_MODULE, .open = ipmr_mfc_open, .read = seq_read, .llseek = seq_lseek, diff --git a/net/ipv4/netfilter.c b/net/ipv4/netfilter.c index c0cc6aa8cfaa..e6774ccb7731 100644 --- a/net/ipv4/netfilter.c +++ b/net/ipv4/netfilter.c @@ -80,35 +80,7 @@ int ip_route_me_harder(struct net *net, struct sk_buff *skb, unsigned int addr_t } EXPORT_SYMBOL(ip_route_me_harder); -/* - * Extra routing may needed on local out, as the QUEUE target never - * returns control to the table. - */ - -struct ip_rt_info { - __be32 daddr; - __be32 saddr; - u_int8_t tos; - u_int32_t mark; -}; - -static void nf_ip_saveroute(const struct sk_buff *skb, - struct nf_queue_entry *entry) -{ - struct ip_rt_info *rt_info = nf_queue_entry_reroute(entry); - - if (entry->state.hook == NF_INET_LOCAL_OUT) { - const struct iphdr *iph = ip_hdr(skb); - - rt_info->tos = iph->tos; - rt_info->daddr = iph->daddr; - rt_info->saddr = iph->saddr; - rt_info->mark = skb->mark; - } -} - -static int nf_ip_reroute(struct net *net, struct sk_buff *skb, - const struct nf_queue_entry *entry) +int nf_ip_reroute(struct sk_buff *skb, const struct nf_queue_entry *entry) { const struct ip_rt_info *rt_info = nf_queue_entry_reroute(entry); @@ -119,10 +91,12 @@ static int nf_ip_reroute(struct net *net, struct sk_buff *skb, skb->mark == rt_info->mark && iph->daddr == rt_info->daddr && iph->saddr == rt_info->saddr)) - return ip_route_me_harder(net, skb, RTN_UNSPEC); + return ip_route_me_harder(entry->state.net, skb, + RTN_UNSPEC); } return 0; } +EXPORT_SYMBOL_GPL(nf_ip_reroute); __sum16 nf_ip_checksum(struct sk_buff *skb, unsigned int hook, unsigned int dataoff, u_int8_t protocol) @@ -155,9 +129,9 @@ __sum16 nf_ip_checksum(struct sk_buff *skb, unsigned int hook, } EXPORT_SYMBOL(nf_ip_checksum); -static __sum16 nf_ip_checksum_partial(struct sk_buff *skb, unsigned int hook, - unsigned int dataoff, unsigned int len, - u_int8_t protocol) +__sum16 nf_ip_checksum_partial(struct sk_buff *skb, unsigned int hook, + unsigned int dataoff, unsigned int len, + u_int8_t protocol) { const struct iphdr *iph = ip_hdr(skb); __sum16 csum = 0; @@ -175,9 +149,10 @@ static __sum16 nf_ip_checksum_partial(struct sk_buff *skb, unsigned int hook, } return csum; } +EXPORT_SYMBOL_GPL(nf_ip_checksum_partial); -static int nf_ip_route(struct net *net, struct dst_entry **dst, - struct flowi *fl, bool strict __always_unused) +int nf_ip_route(struct net *net, struct dst_entry **dst, struct flowi *fl, + bool strict __always_unused) { struct rtable *rt = ip_route_output_key(net, &fl->u.ip4); if (IS_ERR(rt)) @@ -185,19 +160,4 @@ static int nf_ip_route(struct net *net, struct dst_entry **dst, *dst = &rt->dst; return 0; } - -static const struct nf_afinfo nf_ip_afinfo = { - .family = AF_INET, - .checksum = nf_ip_checksum, - .checksum_partial = nf_ip_checksum_partial, - .route = nf_ip_route, - .saveroute = nf_ip_saveroute, - .reroute = nf_ip_reroute, - .route_key_size = sizeof(struct ip_rt_info), -}; - -static int __init ipv4_netfilter_init(void) -{ - return nf_register_afinfo(&nf_ip_afinfo); -} -subsys_initcall(ipv4_netfilter_init); +EXPORT_SYMBOL_GPL(nf_ip_route); diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig index c11eb1744ab1..7d5d444964aa 100644 --- a/net/ipv4/netfilter/Kconfig +++ b/net/ipv4/netfilter/Kconfig @@ -72,11 +72,20 @@ endif # NF_TABLES_IPV4 config NF_TABLES_ARP tristate "ARP nf_tables support" + select NETFILTER_FAMILY_ARP help This option enables the ARP support for nf_tables. endif # NF_TABLES +config NF_FLOW_TABLE_IPV4 + select NF_FLOW_TABLE + tristate "Netfilter flow table IPv4 module" + help + This option adds the flow table IPv4 support. + + To compile it as a module, choose M here. + config NF_DUP_IPV4 tristate "Netfilter IPv4 packet duplication to alternate destination" depends on !NF_CONNTRACK || NF_CONNTRACK @@ -392,6 +401,7 @@ endif # IP_NF_IPTABLES config IP_NF_ARPTABLES tristate "ARP tables support" select NETFILTER_XTABLES + select NETFILTER_FAMILY_ARP depends on NETFILTER_ADVANCED help arptables is a general, extensible packet identification framework. diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile index adcdae358365..8bb1f0c7a375 100644 --- a/net/ipv4/netfilter/Makefile +++ b/net/ipv4/netfilter/Makefile @@ -43,6 +43,9 @@ obj-$(CONFIG_NFT_REDIR_IPV4) += nft_redir_ipv4.o obj-$(CONFIG_NFT_DUP_IPV4) += nft_dup_ipv4.o obj-$(CONFIG_NF_TABLES_ARP) += nf_tables_arp.o +# flow table support +obj-$(CONFIG_NF_FLOW_TABLE_IPV4) += nf_flow_table_ipv4.o + # generic IP tables obj-$(CONFIG_IP_NF_IPTABLES) += ip_tables.o diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c index 0c3c944a7b72..bf8a5340f15e 100644 --- a/net/ipv4/netfilter/arp_tables.c +++ b/net/ipv4/netfilter/arp_tables.c @@ -810,9 +810,8 @@ static int get_info(struct net *net, void __user *user, if (compat) xt_compat_lock(NFPROTO_ARP); #endif - t = try_then_request_module(xt_find_table_lock(net, NFPROTO_ARP, name), - "arptable_%s", name); - if (t) { + t = xt_request_find_table_lock(net, NFPROTO_ARP, name); + if (!IS_ERR(t)) { struct arpt_getinfo info; const struct xt_table_info *private = t->private; #ifdef CONFIG_COMPAT @@ -841,7 +840,7 @@ static int get_info(struct net *net, void __user *user, xt_table_unlock(t); module_put(t->me); } else - ret = -ENOENT; + ret = PTR_ERR(t); #ifdef CONFIG_COMPAT if (compat) xt_compat_unlock(NFPROTO_ARP); @@ -866,7 +865,7 @@ static int get_entries(struct net *net, struct arpt_get_entries __user *uptr, get.name[sizeof(get.name) - 1] = '\0'; t = xt_find_table_lock(net, NFPROTO_ARP, get.name); - if (t) { + if (!IS_ERR(t)) { const struct xt_table_info *private = t->private; if (get.size == private->size) @@ -878,7 +877,7 @@ static int get_entries(struct net *net, struct arpt_get_entries __user *uptr, module_put(t->me); xt_table_unlock(t); } else - ret = -ENOENT; + ret = PTR_ERR(t); return ret; } @@ -903,10 +902,9 @@ static int __do_replace(struct net *net, const char *name, goto out; } - t = try_then_request_module(xt_find_table_lock(net, NFPROTO_ARP, name), - "arptable_%s", name); - if (!t) { - ret = -ENOENT; + t = xt_request_find_table_lock(net, NFPROTO_ARP, name); + if (IS_ERR(t)) { + ret = PTR_ERR(t); goto free_newinfo_counters_untrans; } @@ -1020,8 +1018,8 @@ static int do_add_counters(struct net *net, const void __user *user, return PTR_ERR(paddc); t = xt_find_table_lock(net, NFPROTO_ARP, tmp.name); - if (!t) { - ret = -ENOENT; + if (IS_ERR(t)) { + ret = PTR_ERR(t); goto free; } @@ -1408,7 +1406,7 @@ static int compat_get_entries(struct net *net, xt_compat_lock(NFPROTO_ARP); t = xt_find_table_lock(net, NFPROTO_ARP, get.name); - if (t) { + if (!IS_ERR(t)) { const struct xt_table_info *private = t->private; struct xt_table_info info; @@ -1423,7 +1421,7 @@ static int compat_get_entries(struct net *net, module_put(t->me); xt_table_unlock(t); } else - ret = -ENOENT; + ret = PTR_ERR(t); xt_compat_unlock(NFPROTO_ARP); return ret; diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index 2e0d339028bb..0b975aa2d363 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -973,9 +973,8 @@ static int get_info(struct net *net, void __user *user, if (compat) xt_compat_lock(AF_INET); #endif - t = try_then_request_module(xt_find_table_lock(net, AF_INET, name), - "iptable_%s", name); - if (t) { + t = xt_request_find_table_lock(net, AF_INET, name); + if (!IS_ERR(t)) { struct ipt_getinfo info; const struct xt_table_info *private = t->private; #ifdef CONFIG_COMPAT @@ -1005,7 +1004,7 @@ static int get_info(struct net *net, void __user *user, xt_table_unlock(t); module_put(t->me); } else - ret = -ENOENT; + ret = PTR_ERR(t); #ifdef CONFIG_COMPAT if (compat) xt_compat_unlock(AF_INET); @@ -1030,7 +1029,7 @@ get_entries(struct net *net, struct ipt_get_entries __user *uptr, get.name[sizeof(get.name) - 1] = '\0'; t = xt_find_table_lock(net, AF_INET, get.name); - if (t) { + if (!IS_ERR(t)) { const struct xt_table_info *private = t->private; if (get.size == private->size) ret = copy_entries_to_user(private->size, @@ -1041,7 +1040,7 @@ get_entries(struct net *net, struct ipt_get_entries __user *uptr, module_put(t->me); xt_table_unlock(t); } else - ret = -ENOENT; + ret = PTR_ERR(t); return ret; } @@ -1064,10 +1063,9 @@ __do_replace(struct net *net, const char *name, unsigned int valid_hooks, goto out; } - t = try_then_request_module(xt_find_table_lock(net, AF_INET, name), - "iptable_%s", name); - if (!t) { - ret = -ENOENT; + t = xt_request_find_table_lock(net, AF_INET, name); + if (IS_ERR(t)) { + ret = PTR_ERR(t); goto free_newinfo_counters_untrans; } @@ -1181,8 +1179,8 @@ do_add_counters(struct net *net, const void __user *user, return PTR_ERR(paddc); t = xt_find_table_lock(net, AF_INET, tmp.name); - if (!t) { - ret = -ENOENT; + if (IS_ERR(t)) { + ret = PTR_ERR(t); goto free; } @@ -1625,7 +1623,7 @@ compat_get_entries(struct net *net, struct compat_ipt_get_entries __user *uptr, xt_compat_lock(AF_INET); t = xt_find_table_lock(net, AF_INET, get.name); - if (t) { + if (!IS_ERR(t)) { const struct xt_table_info *private = t->private; struct xt_table_info info; ret = compat_table_info(private, &info); @@ -1639,7 +1637,7 @@ compat_get_entries(struct net *net, struct compat_ipt_get_entries __user *uptr, module_put(t->me); xt_table_unlock(t); } else - ret = -ENOENT; + ret = PTR_ERR(t); xt_compat_unlock(AF_INET); return ret; diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c index 69060e3abe85..c29a6ca6c6d6 100644 --- a/net/ipv4/netfilter/ipt_CLUSTERIP.c +++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c @@ -776,7 +776,6 @@ static ssize_t clusterip_proc_write(struct file *file, const char __user *input, } static const struct file_operations clusterip_proc_fops = { - .owner = THIS_MODULE, .open = clusterip_proc_open, .read = seq_read, .write = clusterip_proc_write, diff --git a/net/ipv4/netfilter/iptable_filter.c b/net/ipv4/netfilter/iptable_filter.c index 7667f223d7f8..9ac92ea7b93c 100644 --- a/net/ipv4/netfilter/iptable_filter.c +++ b/net/ipv4/netfilter/iptable_filter.c @@ -38,12 +38,6 @@ static unsigned int iptable_filter_hook(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { - if (state->hook == NF_INET_LOCAL_OUT && - (skb->len < sizeof(struct iphdr) || - ip_hdrlen(skb) < sizeof(struct iphdr))) - /* root is playing with raw sockets. */ - return NF_ACCEPT; - return ipt_do_table(skb, state, state->net->ipv4.iptable_filter); } diff --git a/net/ipv4/netfilter/iptable_mangle.c b/net/ipv4/netfilter/iptable_mangle.c index aebdb337fd7e..dea138ca8925 100644 --- a/net/ipv4/netfilter/iptable_mangle.c +++ b/net/ipv4/netfilter/iptable_mangle.c @@ -49,11 +49,6 @@ ipt_mangle_out(struct sk_buff *skb, const struct nf_hook_state *state) u_int32_t mark; int err; - /* root is playing with raw sockets. */ - if (skb->len < sizeof(struct iphdr) || - ip_hdrlen(skb) < sizeof(struct iphdr)) - return NF_ACCEPT; - /* Save things which could affect route */ mark = skb->mark; iph = ip_hdr(skb); diff --git a/net/ipv4/netfilter/iptable_nat.c b/net/ipv4/netfilter/iptable_nat.c index a1a07b338ccf..0f7255cc65ee 100644 --- a/net/ipv4/netfilter/iptable_nat.c +++ b/net/ipv4/netfilter/iptable_nat.c @@ -72,6 +72,7 @@ static const struct nf_hook_ops nf_nat_ipv4_ops[] = { { .hook = iptable_nat_ipv4_in, .pf = NFPROTO_IPV4, + .nat_hook = true, .hooknum = NF_INET_PRE_ROUTING, .priority = NF_IP_PRI_NAT_DST, }, @@ -79,6 +80,7 @@ static const struct nf_hook_ops nf_nat_ipv4_ops[] = { { .hook = iptable_nat_ipv4_out, .pf = NFPROTO_IPV4, + .nat_hook = true, .hooknum = NF_INET_POST_ROUTING, .priority = NF_IP_PRI_NAT_SRC, }, @@ -86,6 +88,7 @@ static const struct nf_hook_ops nf_nat_ipv4_ops[] = { { .hook = iptable_nat_ipv4_local_fn, .pf = NFPROTO_IPV4, + .nat_hook = true, .hooknum = NF_INET_LOCAL_OUT, .priority = NF_IP_PRI_NAT_DST, }, @@ -93,6 +96,7 @@ static const struct nf_hook_ops nf_nat_ipv4_ops[] = { { .hook = iptable_nat_ipv4_fn, .pf = NFPROTO_IPV4, + .nat_hook = true, .hooknum = NF_INET_LOCAL_IN, .priority = NF_IP_PRI_NAT_SRC, }, diff --git a/net/ipv4/netfilter/iptable_raw.c b/net/ipv4/netfilter/iptable_raw.c index 2642ecd2645c..a869d1fea7d9 100644 --- a/net/ipv4/netfilter/iptable_raw.c +++ b/net/ipv4/netfilter/iptable_raw.c @@ -26,12 +26,6 @@ static unsigned int iptable_raw_hook(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { - if (state->hook == NF_INET_LOCAL_OUT && - (skb->len < sizeof(struct iphdr) || - ip_hdrlen(skb) < sizeof(struct iphdr))) - /* root is playing with raw sockets. */ - return NF_ACCEPT; - return ipt_do_table(skb, state, state->net->ipv4.iptable_raw); } diff --git a/net/ipv4/netfilter/iptable_security.c b/net/ipv4/netfilter/iptable_security.c index ff226596e4b5..e5379fe57b64 100644 --- a/net/ipv4/netfilter/iptable_security.c +++ b/net/ipv4/netfilter/iptable_security.c @@ -43,12 +43,6 @@ static unsigned int iptable_security_hook(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { - if (state->hook == NF_INET_LOCAL_OUT && - (skb->len < sizeof(struct iphdr) || - ip_hdrlen(skb) < sizeof(struct iphdr))) - /* Somebody is playing with raw sockets. */ - return NF_ACCEPT; - return ipt_do_table(skb, state, state->net->ipv4.iptable_security); } diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c index 89af9d88ca21..de213a397ea8 100644 --- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c +++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c @@ -154,11 +154,6 @@ static unsigned int ipv4_conntrack_local(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { - /* root is playing with raw sockets. */ - if (skb->len < sizeof(struct iphdr) || - ip_hdrlen(skb) < sizeof(struct iphdr)) - return NF_ACCEPT; - if (ip_is_fragment(ip_hdr(skb))) /* IP_NODEFRAG setsockopt set */ return NF_ACCEPT; @@ -368,7 +363,7 @@ MODULE_ALIAS("nf_conntrack-" __stringify(AF_INET)); MODULE_ALIAS("ip_conntrack"); MODULE_LICENSE("GPL"); -static struct nf_conntrack_l4proto *builtin_l4proto4[] = { +static const struct nf_conntrack_l4proto * const builtin_l4proto4[] = { &nf_conntrack_l4proto_tcp4, &nf_conntrack_l4proto_udp4, &nf_conntrack_l4proto_icmp, diff --git a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c index 1849fedd9b81..5c15beafa711 100644 --- a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c +++ b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c @@ -22,7 +22,7 @@ #include <net/netfilter/nf_conntrack_zones.h> #include <net/netfilter/nf_log.h> -static unsigned int nf_ct_icmp_timeout __read_mostly = 30*HZ; +static const unsigned int nf_ct_icmp_timeout = 30*HZ; static inline struct nf_icmp_net *icmp_pernet(struct net *net) { @@ -351,7 +351,7 @@ static struct nf_proto_net *icmp_get_net_proto(struct net *net) return &net->ct.nf_ct_proto.icmp.pn; } -struct nf_conntrack_l4proto nf_conntrack_l4proto_icmp __read_mostly = +const struct nf_conntrack_l4proto nf_conntrack_l4proto_icmp = { .l3proto = PF_INET, .l4proto = IPPROTO_ICMP, diff --git a/net/ipv4/netfilter/nf_flow_table_ipv4.c b/net/ipv4/netfilter/nf_flow_table_ipv4.c new file mode 100644 index 000000000000..b2d01eb25f2c --- /dev/null +++ b/net/ipv4/netfilter/nf_flow_table_ipv4.c @@ -0,0 +1,284 @@ +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/netfilter.h> +#include <linux/rhashtable.h> +#include <linux/ip.h> +#include <linux/netdevice.h> +#include <net/ip.h> +#include <net/neighbour.h> +#include <net/netfilter/nf_flow_table.h> +#include <net/netfilter/nf_tables.h> +/* For layer 4 checksum field offset. */ +#include <linux/tcp.h> +#include <linux/udp.h> + +static int nf_flow_nat_ip_tcp(struct sk_buff *skb, unsigned int thoff, + __be32 addr, __be32 new_addr) +{ + struct tcphdr *tcph; + + if (!pskb_may_pull(skb, thoff + sizeof(*tcph)) || + skb_try_make_writable(skb, thoff + sizeof(*tcph))) + return -1; + + tcph = (void *)(skb_network_header(skb) + thoff); + inet_proto_csum_replace4(&tcph->check, skb, addr, new_addr, true); + + return 0; +} + +static int nf_flow_nat_ip_udp(struct sk_buff *skb, unsigned int thoff, + __be32 addr, __be32 new_addr) +{ + struct udphdr *udph; + + if (!pskb_may_pull(skb, thoff + sizeof(*udph)) || + skb_try_make_writable(skb, thoff + sizeof(*udph))) + return -1; + + udph = (void *)(skb_network_header(skb) + thoff); + if (udph->check || skb->ip_summed == CHECKSUM_PARTIAL) { + inet_proto_csum_replace4(&udph->check, skb, addr, + new_addr, true); + if (!udph->check) + udph->check = CSUM_MANGLED_0; + } + + return 0; +} + +static int nf_flow_nat_ip_l4proto(struct sk_buff *skb, struct iphdr *iph, + unsigned int thoff, __be32 addr, + __be32 new_addr) +{ + switch (iph->protocol) { + case IPPROTO_TCP: + if (nf_flow_nat_ip_tcp(skb, thoff, addr, new_addr) < 0) + return NF_DROP; + break; + case IPPROTO_UDP: + if (nf_flow_nat_ip_udp(skb, thoff, addr, new_addr) < 0) + return NF_DROP; + break; + } + + return 0; +} + +static int nf_flow_snat_ip(const struct flow_offload *flow, struct sk_buff *skb, + struct iphdr *iph, unsigned int thoff, + enum flow_offload_tuple_dir dir) +{ + __be32 addr, new_addr; + + switch (dir) { + case FLOW_OFFLOAD_DIR_ORIGINAL: + addr = iph->saddr; + new_addr = flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dst_v4.s_addr; + iph->saddr = new_addr; + break; + case FLOW_OFFLOAD_DIR_REPLY: + addr = iph->daddr; + new_addr = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.src_v4.s_addr; + iph->daddr = new_addr; + break; + default: + return -1; + } + csum_replace4(&iph->check, addr, new_addr); + + return nf_flow_nat_ip_l4proto(skb, iph, thoff, addr, new_addr); +} + +static int nf_flow_dnat_ip(const struct flow_offload *flow, struct sk_buff *skb, + struct iphdr *iph, unsigned int thoff, + enum flow_offload_tuple_dir dir) +{ + __be32 addr, new_addr; + + switch (dir) { + case FLOW_OFFLOAD_DIR_ORIGINAL: + addr = iph->daddr; + new_addr = flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.src_v4.s_addr; + iph->daddr = new_addr; + break; + case FLOW_OFFLOAD_DIR_REPLY: + addr = iph->saddr; + new_addr = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dst_v4.s_addr; + iph->saddr = new_addr; + break; + default: + return -1; + } + + return nf_flow_nat_ip_l4proto(skb, iph, thoff, addr, new_addr); +} + +static int nf_flow_nat_ip(const struct flow_offload *flow, struct sk_buff *skb, + enum flow_offload_tuple_dir dir) +{ + struct iphdr *iph = ip_hdr(skb); + unsigned int thoff = iph->ihl * 4; + + if (flow->flags & FLOW_OFFLOAD_SNAT && + (nf_flow_snat_port(flow, skb, thoff, iph->protocol, dir) < 0 || + nf_flow_snat_ip(flow, skb, iph, thoff, dir) < 0)) + return -1; + if (flow->flags & FLOW_OFFLOAD_DNAT && + (nf_flow_dnat_port(flow, skb, thoff, iph->protocol, dir) < 0 || + nf_flow_dnat_ip(flow, skb, iph, thoff, dir) < 0)) + return -1; + + return 0; +} + +static bool ip_has_options(unsigned int thoff) +{ + return thoff != sizeof(struct iphdr); +} + +static int nf_flow_tuple_ip(struct sk_buff *skb, const struct net_device *dev, + struct flow_offload_tuple *tuple) +{ + struct flow_ports *ports; + unsigned int thoff; + struct iphdr *iph; + + if (!pskb_may_pull(skb, sizeof(*iph))) + return -1; + + iph = ip_hdr(skb); + thoff = iph->ihl * 4; + + if (ip_is_fragment(iph) || + unlikely(ip_has_options(thoff))) + return -1; + + if (iph->protocol != IPPROTO_TCP && + iph->protocol != IPPROTO_UDP) + return -1; + + thoff = iph->ihl * 4; + if (!pskb_may_pull(skb, thoff + sizeof(*ports))) + return -1; + + ports = (struct flow_ports *)(skb_network_header(skb) + thoff); + + tuple->src_v4.s_addr = iph->saddr; + tuple->dst_v4.s_addr = iph->daddr; + tuple->src_port = ports->source; + tuple->dst_port = ports->dest; + tuple->l3proto = AF_INET; + tuple->l4proto = iph->protocol; + tuple->iifidx = dev->ifindex; + + return 0; +} + +/* Based on ip_exceeds_mtu(). */ +static bool __nf_flow_exceeds_mtu(const struct sk_buff *skb, unsigned int mtu) +{ + if (skb->len <= mtu) + return false; + + if ((ip_hdr(skb)->frag_off & htons(IP_DF)) == 0) + return false; + + if (skb_is_gso(skb) && skb_gso_validate_mtu(skb, mtu)) + return false; + + return true; +} + +static bool nf_flow_exceeds_mtu(struct sk_buff *skb, const struct rtable *rt) +{ + u32 mtu; + + mtu = ip_dst_mtu_maybe_forward(&rt->dst, true); + if (__nf_flow_exceeds_mtu(skb, mtu)) + return true; + + return false; +} + +unsigned int +nf_flow_offload_ip_hook(void *priv, struct sk_buff *skb, + const struct nf_hook_state *state) +{ + struct flow_offload_tuple_rhash *tuplehash; + struct nf_flowtable *flow_table = priv; + struct flow_offload_tuple tuple = {}; + enum flow_offload_tuple_dir dir; + struct flow_offload *flow; + struct net_device *outdev; + const struct rtable *rt; + struct iphdr *iph; + __be32 nexthop; + + if (skb->protocol != htons(ETH_P_IP)) + return NF_ACCEPT; + + if (nf_flow_tuple_ip(skb, state->in, &tuple) < 0) + return NF_ACCEPT; + + tuplehash = flow_offload_lookup(flow_table, &tuple); + if (tuplehash == NULL) + return NF_ACCEPT; + + outdev = dev_get_by_index_rcu(state->net, tuplehash->tuple.oifidx); + if (!outdev) + return NF_ACCEPT; + + dir = tuplehash->tuple.dir; + flow = container_of(tuplehash, struct flow_offload, tuplehash[dir]); + + rt = (const struct rtable *)flow->tuplehash[dir].tuple.dst_cache; + if (unlikely(nf_flow_exceeds_mtu(skb, rt))) + return NF_ACCEPT; + + if (skb_try_make_writable(skb, sizeof(*iph))) + return NF_DROP; + + if (flow->flags & (FLOW_OFFLOAD_SNAT | FLOW_OFFLOAD_DNAT) && + nf_flow_nat_ip(flow, skb, dir) < 0) + return NF_DROP; + + flow->timeout = (u32)jiffies + NF_FLOW_TIMEOUT; + iph = ip_hdr(skb); + ip_decrease_ttl(iph); + + skb->dev = outdev; + nexthop = rt_nexthop(rt, flow->tuplehash[!dir].tuple.src_v4.s_addr); + neigh_xmit(NEIGH_ARP_TABLE, outdev, &nexthop, skb); + + return NF_STOLEN; +} +EXPORT_SYMBOL_GPL(nf_flow_offload_ip_hook); + +static struct nf_flowtable_type flowtable_ipv4 = { + .family = NFPROTO_IPV4, + .params = &nf_flow_offload_rhash_params, + .gc = nf_flow_offload_work_gc, + .hook = nf_flow_offload_ip_hook, + .owner = THIS_MODULE, +}; + +static int __init nf_flow_ipv4_module_init(void) +{ + nft_register_flowtable_type(&flowtable_ipv4); + + return 0; +} + +static void __exit nf_flow_ipv4_module_exit(void) +{ + nft_unregister_flowtable_type(&flowtable_ipv4); +} + +module_init(nf_flow_ipv4_module_init); +module_exit(nf_flow_ipv4_module_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>"); +MODULE_ALIAS_NF_FLOWTABLE(AF_INET); diff --git a/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c b/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c index 0443ca4120b0..f7ff6a364d7b 100644 --- a/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c +++ b/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c @@ -356,11 +356,6 @@ nf_nat_ipv4_out(void *priv, struct sk_buff *skb, #endif unsigned int ret; - /* root is playing with raw sockets. */ - if (skb->len < sizeof(struct iphdr) || - ip_hdrlen(skb) < sizeof(struct iphdr)) - return NF_ACCEPT; - ret = nf_nat_ipv4_fn(priv, skb, state, do_chain); #ifdef CONFIG_XFRM if (ret != NF_DROP && ret != NF_STOLEN && @@ -396,11 +391,6 @@ nf_nat_ipv4_local_fn(void *priv, struct sk_buff *skb, unsigned int ret; int err; - /* root is playing with raw sockets. */ - if (skb->len < sizeof(struct iphdr) || - ip_hdrlen(skb) < sizeof(struct iphdr)) - return NF_ACCEPT; - ret = nf_nat_ipv4_fn(priv, skb, state, do_chain); if (ret != NF_DROP && ret != NF_STOLEN && (ct = nf_ct_get(skb, &ctinfo)) != NULL) { diff --git a/net/ipv4/netfilter/nf_tables_arp.c b/net/ipv4/netfilter/nf_tables_arp.c index 4bbc273b45e8..f84c17763f6f 100644 --- a/net/ipv4/netfilter/nf_tables_arp.c +++ b/net/ipv4/netfilter/nf_tables_arp.c @@ -21,7 +21,8 @@ nft_do_chain_arp(void *priv, { struct nft_pktinfo pkt; - nft_set_pktinfo_unspec(&pkt, skb, state); + nft_set_pktinfo(&pkt, skb, state); + nft_set_pktinfo_unspec(&pkt, skb); return nft_do_chain(&pkt, priv); } @@ -30,12 +31,6 @@ static struct nft_af_info nft_af_arp __read_mostly = { .family = NFPROTO_ARP, .nhooks = NF_ARP_NUMHOOKS, .owner = THIS_MODULE, - .nops = 1, - .hooks = { - [NF_ARP_IN] = nft_do_chain_arp, - [NF_ARP_OUT] = nft_do_chain_arp, - [NF_ARP_FORWARD] = nft_do_chain_arp, - }, }; static int nf_tables_arp_init_net(struct net *net) @@ -73,6 +68,10 @@ static const struct nf_chain_type filter_arp = { .owner = THIS_MODULE, .hook_mask = (1 << NF_ARP_IN) | (1 << NF_ARP_OUT), + .hooks = { + [NF_ARP_IN] = nft_do_chain_arp, + [NF_ARP_OUT] = nft_do_chain_arp, + }, }; static int __init nf_tables_arp_init(void) diff --git a/net/ipv4/netfilter/nf_tables_ipv4.c b/net/ipv4/netfilter/nf_tables_ipv4.c index 2840a29b2e04..f4675253f1e6 100644 --- a/net/ipv4/netfilter/nf_tables_ipv4.c +++ b/net/ipv4/netfilter/nf_tables_ipv4.c @@ -24,40 +24,17 @@ static unsigned int nft_do_chain_ipv4(void *priv, { struct nft_pktinfo pkt; - nft_set_pktinfo_ipv4(&pkt, skb, state); + nft_set_pktinfo(&pkt, skb, state); + nft_set_pktinfo_ipv4(&pkt, skb); return nft_do_chain(&pkt, priv); } -static unsigned int nft_ipv4_output(void *priv, - struct sk_buff *skb, - const struct nf_hook_state *state) -{ - if (unlikely(skb->len < sizeof(struct iphdr) || - ip_hdr(skb)->ihl < sizeof(struct iphdr) / 4)) { - if (net_ratelimit()) - pr_info("nf_tables_ipv4: ignoring short SOCK_RAW " - "packet\n"); - return NF_ACCEPT; - } - - return nft_do_chain_ipv4(priv, skb, state); -} - -struct nft_af_info nft_af_ipv4 __read_mostly = { +static struct nft_af_info nft_af_ipv4 __read_mostly = { .family = NFPROTO_IPV4, .nhooks = NF_INET_NUMHOOKS, .owner = THIS_MODULE, - .nops = 1, - .hooks = { - [NF_INET_LOCAL_IN] = nft_do_chain_ipv4, - [NF_INET_LOCAL_OUT] = nft_ipv4_output, - [NF_INET_FORWARD] = nft_do_chain_ipv4, - [NF_INET_PRE_ROUTING] = nft_do_chain_ipv4, - [NF_INET_POST_ROUTING] = nft_do_chain_ipv4, - }, }; -EXPORT_SYMBOL_GPL(nft_af_ipv4); static int nf_tables_ipv4_init_net(struct net *net) { @@ -97,6 +74,13 @@ static const struct nf_chain_type filter_ipv4 = { (1 << NF_INET_FORWARD) | (1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_POST_ROUTING), + .hooks = { + [NF_INET_LOCAL_IN] = nft_do_chain_ipv4, + [NF_INET_LOCAL_OUT] = nft_do_chain_ipv4, + [NF_INET_FORWARD] = nft_do_chain_ipv4, + [NF_INET_PRE_ROUTING] = nft_do_chain_ipv4, + [NF_INET_POST_ROUTING] = nft_do_chain_ipv4, + }, }; static int __init nf_tables_ipv4_init(void) diff --git a/net/ipv4/netfilter/nft_chain_nat_ipv4.c b/net/ipv4/netfilter/nft_chain_nat_ipv4.c index f5c66a7a4bf2..f2a490981594 100644 --- a/net/ipv4/netfilter/nft_chain_nat_ipv4.c +++ b/net/ipv4/netfilter/nft_chain_nat_ipv4.c @@ -33,7 +33,8 @@ static unsigned int nft_nat_do_chain(void *priv, { struct nft_pktinfo pkt; - nft_set_pktinfo_ipv4(&pkt, skb, state); + nft_set_pktinfo(&pkt, skb, state); + nft_set_pktinfo_ipv4(&pkt, skb); return nft_do_chain(&pkt, priv); } diff --git a/net/ipv4/netfilter/nft_chain_route_ipv4.c b/net/ipv4/netfilter/nft_chain_route_ipv4.c index 30493beb611a..d965c225b9f6 100644 --- a/net/ipv4/netfilter/nft_chain_route_ipv4.c +++ b/net/ipv4/netfilter/nft_chain_route_ipv4.c @@ -33,12 +33,8 @@ static unsigned int nf_route_table_hook(void *priv, const struct iphdr *iph; int err; - /* root is playing with raw sockets. */ - if (skb->len < sizeof(struct iphdr) || - ip_hdrlen(skb) < sizeof(struct iphdr)) - return NF_ACCEPT; - - nft_set_pktinfo_ipv4(&pkt, skb, state); + nft_set_pktinfo(&pkt, skb, state); + nft_set_pktinfo_ipv4(&pkt, skb); mark = skb->mark; iph = ip_hdr(skb); diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index 9f37c4727861..dc5edc8f7564 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -83,7 +83,6 @@ static int sockstat_seq_open(struct inode *inode, struct file *file) } static const struct file_operations sockstat_seq_fops = { - .owner = THIS_MODULE, .open = sockstat_seq_open, .read = seq_read, .llseek = seq_lseek, @@ -467,7 +466,6 @@ static int snmp_seq_open(struct inode *inode, struct file *file) } static const struct file_operations snmp_seq_fops = { - .owner = THIS_MODULE, .open = snmp_seq_open, .read = seq_read, .llseek = seq_lseek, @@ -515,7 +513,6 @@ static int netstat_seq_open(struct inode *inode, struct file *file) } static const struct file_operations netstat_seq_fops = { - .owner = THIS_MODULE, .open = netstat_seq_open, .read = seq_read, .llseek = seq_lseek, diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index 5e570aa9e43b..136544b36a46 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -1119,7 +1119,6 @@ static int raw_v4_seq_open(struct inode *inode, struct file *file) } static const struct file_operations raw_seq_fops = { - .owner = THIS_MODULE, .open = raw_v4_seq_open, .read = seq_read, .llseek = seq_lseek, diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 4e153b23bcec..49cc1c1df1ba 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -240,7 +240,6 @@ static int rt_cache_seq_open(struct inode *inode, struct file *file) } static const struct file_operations rt_cache_seq_fops = { - .owner = THIS_MODULE, .open = rt_cache_seq_open, .read = seq_read, .llseek = seq_lseek, @@ -331,7 +330,6 @@ static int rt_cpu_seq_open(struct inode *inode, struct file *file) } static const struct file_operations rt_cpu_seq_fops = { - .owner = THIS_MODULE, .open = rt_cpu_seq_open, .read = seq_read, .llseek = seq_lseek, @@ -369,7 +367,6 @@ static int rt_acct_proc_open(struct inode *inode, struct file *file) } static const struct file_operations rt_acct_proc_fops = { - .owner = THIS_MODULE, .open = rt_acct_proc_open, .read = seq_read, .llseek = seq_lseek, @@ -1106,7 +1103,7 @@ void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu) new = true; } - __ip_rt_update_pmtu((struct rtable *) rt->dst.path, &fl4, mtu); + __ip_rt_update_pmtu((struct rtable *) xfrm_dst_path(&rt->dst), &fl4, mtu); if (!dst_check(&rt->dst, 0)) { if (new) diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index f08eebe60446..d7cf861bf699 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -283,8 +283,6 @@ #include <asm/ioctls.h> #include <net/busy_poll.h> -#include <trace/events/tcp.h> - struct percpu_counter tcp_orphan_count; EXPORT_SYMBOL_GPL(tcp_orphan_count); @@ -500,11 +498,9 @@ unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait) const struct tcp_sock *tp = tcp_sk(sk); int state; - sock_rps_record_flow(sk); - sock_poll_wait(file, sk_sleep(sk), wait); - state = sk_state_load(sk); + state = inet_sk_state_load(sk); if (state == TCP_LISTEN) return inet_csk_listen_poll(sk); @@ -1106,12 +1102,15 @@ static int linear_payload_sz(bool first_skb) return 0; } -static int select_size(const struct sock *sk, bool sg, bool first_skb) +static int select_size(const struct sock *sk, bool sg, bool first_skb, bool zc) { const struct tcp_sock *tp = tcp_sk(sk); int tmp = tp->mss_cache; if (sg) { + if (zc) + return 0; + if (sk_can_gso(sk)) { tmp = linear_payload_sz(first_skb); } else { @@ -1188,7 +1187,7 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size) int flags, err, copied = 0; int mss_now = 0, size_goal, copied_syn = 0; bool process_backlog = false; - bool sg; + bool sg, zc = false; long timeo; flags = msg->msg_flags; @@ -1206,7 +1205,8 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size) goto out_err; } - if (!(sk_check_csum_caps(sk) && sk->sk_route_caps & NETIF_F_SG)) + zc = sk_check_csum_caps(sk) && sk->sk_route_caps & NETIF_F_SG; + if (!zc) uarg->zerocopy = 0; } @@ -1283,6 +1283,7 @@ restart: if (copy <= 0 || !tcp_skb_can_collapse_to(skb)) { bool first_skb; + int linear; new_segment: /* Allocate new segment. If the interface is SG, @@ -1296,9 +1297,8 @@ new_segment: goto restart; } first_skb = tcp_rtx_and_write_queues_empty(sk); - skb = sk_stream_alloc_skb(sk, - select_size(sk, sg, first_skb), - sk->sk_allocation, + linear = select_size(sk, sg, first_skb, zc); + skb = sk_stream_alloc_skb(sk, linear, sk->sk_allocation, first_skb); if (!skb) goto wait_for_memory; @@ -1327,13 +1327,13 @@ new_segment: copy = msg_data_left(msg); /* Where to copy to? */ - if (skb_availroom(skb) > 0) { + if (skb_availroom(skb) > 0 && !zc) { /* We have some space in skb head. Superb! */ copy = min_t(int, copy, skb_availroom(skb)); err = skb_add_data_nocache(sk, skb, &msg->msg_iter, copy); if (err) goto do_fault; - } else if (!uarg || !uarg->zerocopy) { + } else if (!zc) { bool merge = true; int i = skb_shinfo(skb)->nr_frags; struct page_frag *pfrag = sk_page_frag(sk); @@ -1373,8 +1373,10 @@ new_segment: pfrag->offset += copy; } else { err = skb_zerocopy_iter_stream(sk, skb, msg, copy, uarg); - if (err == -EMSGSIZE || err == -EEXIST) + if (err == -EMSGSIZE || err == -EEXIST) { + tcp_mark_push(tp, skb); goto new_segment; + } if (err < 0) goto do_error; copy = err; @@ -1731,8 +1733,8 @@ static void tcp_update_recv_tstamps(struct sk_buff *skb, } /* Similar to __sock_recv_timestamp, but does not require an skb */ -void tcp_recv_timestamp(struct msghdr *msg, const struct sock *sk, - struct scm_timestamping *tss) +static void tcp_recv_timestamp(struct msghdr *msg, const struct sock *sk, + struct scm_timestamping *tss) { struct timeval tv; bool has_timestamping = false; @@ -2040,8 +2042,6 @@ void tcp_set_state(struct sock *sk, int state) { int oldstate = sk->sk_state; - trace_tcp_set_state(sk, oldstate, state); - switch (state) { case TCP_ESTABLISHED: if (oldstate != TCP_ESTABLISHED) @@ -2065,7 +2065,7 @@ void tcp_set_state(struct sock *sk, int state) /* Change state AFTER socket is unhashed to avoid closed * socket sitting in hash tables. */ - sk_state_store(sk, state); + inet_sk_state_store(sk, state); #ifdef STATE_TRACE SOCK_DEBUG(sk, "TCP sk=%p, State %s -> %s\n", sk, statename[oldstate], statename[state]); @@ -2920,7 +2920,7 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) if (sk->sk_type != SOCK_STREAM) return; - info->tcpi_state = sk_state_load(sk); + info->tcpi_state = inet_sk_state_load(sk); /* Report meaningful fields for all TCP states, including listeners */ rate = READ_ONCE(sk->sk_pacing_rate); @@ -3578,6 +3578,9 @@ void __init tcp_init(void) percpu_counter_init(&tcp_sockets_allocated, 0, GFP_KERNEL); percpu_counter_init(&tcp_orphan_count, 0, GFP_KERNEL); inet_hashinfo_init(&tcp_hashinfo); + inet_hashinfo2_init(&tcp_hashinfo, "tcp_listen_portaddr_hash", + thash_entries, 21, /* one slot per 2 MB*/ + 0, 64 * 1024); tcp_hashinfo.bind_bucket_cachep = kmem_cache_create("tcp_bind_bucket", sizeof(struct inet_bind_bucket), 0, diff --git a/net/ipv4/tcp_diag.c b/net/ipv4/tcp_diag.c index abbf0edcf6c2..81148f7a2323 100644 --- a/net/ipv4/tcp_diag.c +++ b/net/ipv4/tcp_diag.c @@ -24,7 +24,7 @@ static void tcp_diag_get_info(struct sock *sk, struct inet_diag_msg *r, { struct tcp_info *info = _info; - if (sk_state_load(sk) == TCP_LISTEN) { + if (inet_sk_state_load(sk) == TCP_LISTEN) { r->idiag_rqueue = sk->sk_ack_backlog; r->idiag_wqueue = sk->sk_max_ack_backlog; } else if (sk->sk_type == SOCK_STREAM) { diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c index 78c192ee03a4..018a48477355 100644 --- a/net/ipv4/tcp_fastopen.c +++ b/net/ipv4/tcp_fastopen.c @@ -379,18 +379,9 @@ fastopen: bool tcp_fastopen_cookie_check(struct sock *sk, u16 *mss, struct tcp_fastopen_cookie *cookie) { - unsigned long last_syn_loss = 0; const struct dst_entry *dst; - int syn_loss = 0; - tcp_fastopen_cache_get(sk, mss, cookie, &syn_loss, &last_syn_loss); - - /* Recurring FO SYN losses: no cookie or data in SYN */ - if (syn_loss > 1 && - time_before(jiffies, last_syn_loss + (60*HZ << syn_loss))) { - cookie->len = -1; - return false; - } + tcp_fastopen_cache_get(sk, mss, cookie); /* Firewall blackhole issue check */ if (tcp_fastopen_active_should_disable(sk)) { @@ -448,6 +439,8 @@ EXPORT_SYMBOL(tcp_fastopen_defer_connect); * following circumstances: * 1. client side TFO socket receives out of order FIN * 2. client side TFO socket receives out of order RST + * 3. client side TFO socket has timed out three times consecutively during + * or after handshake * We disable active side TFO globally for 1hr at first. Then if it * happens again, we disable it for 2h, then 4h, 8h, ... * And we reset the timeout back to 1hr when we see a successful active @@ -524,3 +517,20 @@ void tcp_fastopen_active_disable_ofo_check(struct sock *sk) dst_release(dst); } } + +void tcp_fastopen_active_detect_blackhole(struct sock *sk, bool expired) +{ + u32 timeouts = inet_csk(sk)->icsk_retransmits; + struct tcp_sock *tp = tcp_sk(sk); + + /* Broken middle-boxes may black-hole Fast Open connection during or + * even after the handshake. Be extremely conservative and pause + * Fast Open globally after hitting the third consecutive timeout or + * exceeding the configured timeout limit. + */ + if ((tp->syn_fastopen || tp->syn_data || tp->syn_data_acked) && + (timeouts == 2 || (timeouts < 2 && expired))) { + tcp_fastopen_active_disable(sk); + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFASTOPENACTIVEFAIL); + } +} diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 45f750e85714..ff71b18d9682 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -578,8 +578,8 @@ static inline void tcp_rcv_rtt_measure_ts(struct sock *sk, void tcp_rcv_space_adjust(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); + u32 copied; int time; - int copied; tcp_mstamp_refresh(tp); time = tcp_stamp_us_delta(tp->tcp_mstamp, tp->rcvq_space.time); @@ -602,38 +602,31 @@ void tcp_rcv_space_adjust(struct sock *sk) if (sock_net(sk)->ipv4.sysctl_tcp_moderate_rcvbuf && !(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) { - int rcvwin, rcvmem, rcvbuf; + int rcvmem, rcvbuf; + u64 rcvwin, grow; /* minimal window to cope with packet losses, assuming * steady state. Add some cushion because of small variations. */ - rcvwin = (copied << 1) + 16 * tp->advmss; + rcvwin = ((u64)copied << 1) + 16 * tp->advmss; - /* If rate increased by 25%, - * assume slow start, rcvwin = 3 * copied - * If rate increased by 50%, - * assume sender can use 2x growth, rcvwin = 4 * copied - */ - if (copied >= - tp->rcvq_space.space + (tp->rcvq_space.space >> 2)) { - if (copied >= - tp->rcvq_space.space + (tp->rcvq_space.space >> 1)) - rcvwin <<= 1; - else - rcvwin += (rcvwin >> 1); - } + /* Accommodate for sender rate increase (eg. slow start) */ + grow = rcvwin * (copied - tp->rcvq_space.space); + do_div(grow, tp->rcvq_space.space); + rcvwin += (grow << 1); rcvmem = SKB_TRUESIZE(tp->advmss + MAX_TCP_HEADER); while (tcp_win_from_space(sk, rcvmem) < tp->advmss) rcvmem += 128; - rcvbuf = min(rcvwin / tp->advmss * rcvmem, - sock_net(sk)->ipv4.sysctl_tcp_rmem[2]); + do_div(rcvwin, tp->advmss); + rcvbuf = min_t(u64, rcvwin * rcvmem, + sock_net(sk)->ipv4.sysctl_tcp_rmem[2]); if (rcvbuf > sk->sk_rcvbuf) { sk->sk_rcvbuf = rcvbuf; /* Make the window clamp follow along. */ - tp->window_clamp = rcvwin; + tp->window_clamp = tcp_win_from_space(sk, rcvbuf); } } tp->rcvq_space.space = copied; @@ -5306,6 +5299,9 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb, unsigned int len = skb->len; struct tcp_sock *tp = tcp_sk(sk); + /* TCP congestion window tracking */ + trace_tcp_probe(sk, skb); + tcp_mstamp_refresh(tp); if (unlikely(!sk->sk_rx_dst)) inet_csk(sk)->icsk_af_ops->sk_rx_dst_set(sk, skb); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 94e28350f420..95738aa0d8a6 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1911,7 +1911,7 @@ void tcp_v4_destroy_sock(struct sock *sk) /* Clean up the MD5 key list, if any */ if (tp->md5sig_info) { tcp_clear_md5_list(sk); - kfree_rcu(tp->md5sig_info, rcu); + kfree_rcu(rcu_dereference_protected(tp->md5sig_info, 1), rcu); tp->md5sig_info = NULL; } #endif @@ -2281,7 +2281,7 @@ static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i) timer_expires = jiffies; } - state = sk_state_load(sk); + state = inet_sk_state_load(sk); if (state == TCP_LISTEN) rx_queue = sk->sk_ack_backlog; else @@ -2358,7 +2358,6 @@ out: } static const struct file_operations tcp_afinfo_seq_fops = { - .owner = THIS_MODULE, .open = tcp_seq_open, .read = seq_read, .llseek = seq_lseek, diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c index 7097f92d16e5..03b51cdcc731 100644 --- a/net/ipv4/tcp_metrics.c +++ b/net/ipv4/tcp_metrics.c @@ -546,8 +546,7 @@ bool tcp_peer_is_proven(struct request_sock *req, struct dst_entry *dst) static DEFINE_SEQLOCK(fastopen_seqlock); void tcp_fastopen_cache_get(struct sock *sk, u16 *mss, - struct tcp_fastopen_cookie *cookie, - int *syn_loss, unsigned long *last_syn_loss) + struct tcp_fastopen_cookie *cookie) { struct tcp_metrics_block *tm; @@ -564,8 +563,6 @@ void tcp_fastopen_cache_get(struct sock *sk, u16 *mss, *cookie = tfom->cookie; if (cookie->len <= 0 && tfom->try_exp == 1) cookie->exp = true; - *syn_loss = tfom->syn_loss; - *last_syn_loss = *syn_loss ? tfom->last_syn_loss : 0; } while (read_seqretry(&fastopen_seqlock, seq)); } rcu_read_unlock(); @@ -895,7 +892,7 @@ static void tcp_metrics_flush_all(struct net *net) pp = &hb->chain; for (tm = deref_locked(*pp); tm; tm = deref_locked(*pp)) { match = net ? net_eq(tm_net(tm), net) : - !atomic_read(&tm_net(tm)->count); + !refcount_read(&tm_net(tm)->count); if (match) { *pp = tm->tcpm_next; kfree_rcu(tm, rcu_head); diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index b079b619b60c..a8384b0c11f8 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -316,9 +316,10 @@ void tcp_time_wait(struct sock *sk, int state, int timeo) */ local_bh_disable(); inet_twsk_schedule(tw, timeo); - /* Linkage updates. */ - __inet_twsk_hashdance(tw, sk, &tcp_hashinfo); - inet_twsk_put(tw); + /* Linkage updates. + * Note that access to tw after this point is illegal. + */ + inet_twsk_hashdance(tw, sk, &tcp_hashinfo); local_bh_enable(); } else { /* Sorry, if we're out of memory, just CLOSE this diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index a4d214c7b506..95461f02ac9a 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -1944,7 +1944,8 @@ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb, in_flight = tcp_packets_in_flight(tp); - BUG_ON(tcp_skb_pcount(skb) <= 1 || (tp->snd_cwnd <= in_flight)); + BUG_ON(tcp_skb_pcount(skb) <= 1); + BUG_ON(tp->snd_cwnd <= in_flight); send_win = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq; @@ -2414,15 +2415,12 @@ bool tcp_schedule_loss_probe(struct sock *sk, bool advancing_rto) early_retrans = sock_net(sk)->ipv4.sysctl_tcp_early_retrans; /* Schedule a loss probe in 2*RTT for SACK capable connections - * in Open state, that are either limited by cwnd or application. + * not in loss recovery, that are either limited by cwnd or application. */ if ((early_retrans != 3 && early_retrans != 4) || !tp->packets_out || !tcp_is_sack(tp) || - icsk->icsk_ca_state != TCP_CA_Open) - return false; - - if ((tp->snd_cwnd > tcp_packets_in_flight(tp)) && - !tcp_write_queue_empty(sk)) + (icsk->icsk_ca_state != TCP_CA_Open && + icsk->icsk_ca_state != TCP_CA_CWR)) return false; /* Probe timeout is 2*rtt. Add minimum RTO to account diff --git a/net/ipv4/tcp_probe.c b/net/ipv4/tcp_probe.c deleted file mode 100644 index 697f4c67b2e3..000000000000 --- a/net/ipv4/tcp_probe.c +++ /dev/null @@ -1,301 +0,0 @@ -/* - * tcpprobe - Observe the TCP flow with kprobes. - * - * The idea for this came from Werner Almesberger's umlsim - * Copyright (C) 2004, Stephen Hemminger <shemminger@osdl.org> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. - */ - -#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt - -#include <linux/kernel.h> -#include <linux/kprobes.h> -#include <linux/socket.h> -#include <linux/tcp.h> -#include <linux/slab.h> -#include <linux/proc_fs.h> -#include <linux/module.h> -#include <linux/ktime.h> -#include <linux/time.h> -#include <net/net_namespace.h> - -#include <net/tcp.h> - -MODULE_AUTHOR("Stephen Hemminger <shemminger@linux-foundation.org>"); -MODULE_DESCRIPTION("TCP cwnd snooper"); -MODULE_LICENSE("GPL"); -MODULE_VERSION("1.1"); - -static int port __read_mostly; -MODULE_PARM_DESC(port, "Port to match (0=all)"); -module_param(port, int, 0); - -static unsigned int bufsize __read_mostly = 4096; -MODULE_PARM_DESC(bufsize, "Log buffer size in packets (4096)"); -module_param(bufsize, uint, 0); - -static unsigned int fwmark __read_mostly; -MODULE_PARM_DESC(fwmark, "skb mark to match (0=no mark)"); -module_param(fwmark, uint, 0); - -static int full __read_mostly; -MODULE_PARM_DESC(full, "Full log (1=every ack packet received, 0=only cwnd changes)"); -module_param(full, int, 0); - -static const char procname[] = "tcpprobe"; - -struct tcp_log { - ktime_t tstamp; - union { - struct sockaddr raw; - struct sockaddr_in v4; - struct sockaddr_in6 v6; - } src, dst; - u16 length; - u32 snd_nxt; - u32 snd_una; - u32 snd_wnd; - u32 rcv_wnd; - u32 snd_cwnd; - u32 ssthresh; - u32 srtt; -}; - -static struct { - spinlock_t lock; - wait_queue_head_t wait; - ktime_t start; - u32 lastcwnd; - - unsigned long head, tail; - struct tcp_log *log; -} tcp_probe; - -static inline int tcp_probe_used(void) -{ - return (tcp_probe.head - tcp_probe.tail) & (bufsize - 1); -} - -static inline int tcp_probe_avail(void) -{ - return bufsize - tcp_probe_used() - 1; -} - -#define tcp_probe_copy_fl_to_si4(inet, si4, mem) \ - do { \ - si4.sin_family = AF_INET; \ - si4.sin_port = inet->inet_##mem##port; \ - si4.sin_addr.s_addr = inet->inet_##mem##addr; \ - } while (0) \ - -/* - * Hook inserted to be called before each receive packet. - * Note: arguments must match tcp_rcv_established()! - */ -static void jtcp_rcv_established(struct sock *sk, struct sk_buff *skb, - const struct tcphdr *th) -{ - unsigned int len = skb->len; - const struct tcp_sock *tp = tcp_sk(sk); - const struct inet_sock *inet = inet_sk(sk); - - /* Only update if port or skb mark matches */ - if (((port == 0 && fwmark == 0) || - ntohs(inet->inet_dport) == port || - ntohs(inet->inet_sport) == port || - (fwmark > 0 && skb->mark == fwmark)) && - (full || tp->snd_cwnd != tcp_probe.lastcwnd)) { - - spin_lock(&tcp_probe.lock); - /* If log fills, just silently drop */ - if (tcp_probe_avail() > 1) { - struct tcp_log *p = tcp_probe.log + tcp_probe.head; - - p->tstamp = ktime_get(); - switch (sk->sk_family) { - case AF_INET: - tcp_probe_copy_fl_to_si4(inet, p->src.v4, s); - tcp_probe_copy_fl_to_si4(inet, p->dst.v4, d); - break; - case AF_INET6: - memset(&p->src.v6, 0, sizeof(p->src.v6)); - memset(&p->dst.v6, 0, sizeof(p->dst.v6)); -#if IS_ENABLED(CONFIG_IPV6) - p->src.v6.sin6_family = AF_INET6; - p->src.v6.sin6_port = inet->inet_sport; - p->src.v6.sin6_addr = inet6_sk(sk)->saddr; - - p->dst.v6.sin6_family = AF_INET6; - p->dst.v6.sin6_port = inet->inet_dport; - p->dst.v6.sin6_addr = sk->sk_v6_daddr; -#endif - break; - default: - BUG(); - } - - p->length = len; - p->snd_nxt = tp->snd_nxt; - p->snd_una = tp->snd_una; - p->snd_cwnd = tp->snd_cwnd; - p->snd_wnd = tp->snd_wnd; - p->rcv_wnd = tp->rcv_wnd; - p->ssthresh = tcp_current_ssthresh(sk); - p->srtt = tp->srtt_us >> 3; - - tcp_probe.head = (tcp_probe.head + 1) & (bufsize - 1); - } - tcp_probe.lastcwnd = tp->snd_cwnd; - spin_unlock(&tcp_probe.lock); - - wake_up(&tcp_probe.wait); - } - - jprobe_return(); -} - -static struct jprobe tcp_jprobe = { - .kp = { - .symbol_name = "tcp_rcv_established", - }, - .entry = jtcp_rcv_established, -}; - -static int tcpprobe_open(struct inode *inode, struct file *file) -{ - /* Reset (empty) log */ - spin_lock_bh(&tcp_probe.lock); - tcp_probe.head = tcp_probe.tail = 0; - tcp_probe.start = ktime_get(); - spin_unlock_bh(&tcp_probe.lock); - - return 0; -} - -static int tcpprobe_sprint(char *tbuf, int n) -{ - const struct tcp_log *p - = tcp_probe.log + tcp_probe.tail; - struct timespec64 ts - = ktime_to_timespec64(ktime_sub(p->tstamp, tcp_probe.start)); - - return scnprintf(tbuf, n, - "%lu.%09lu %pISpc %pISpc %d %#x %#x %u %u %u %u %u\n", - (unsigned long)ts.tv_sec, - (unsigned long)ts.tv_nsec, - &p->src, &p->dst, p->length, p->snd_nxt, p->snd_una, - p->snd_cwnd, p->ssthresh, p->snd_wnd, p->srtt, p->rcv_wnd); -} - -static ssize_t tcpprobe_read(struct file *file, char __user *buf, - size_t len, loff_t *ppos) -{ - int error = 0; - size_t cnt = 0; - - if (!buf) - return -EINVAL; - - while (cnt < len) { - char tbuf[256]; - int width; - - /* Wait for data in buffer */ - error = wait_event_interruptible(tcp_probe.wait, - tcp_probe_used() > 0); - if (error) - break; - - spin_lock_bh(&tcp_probe.lock); - if (tcp_probe.head == tcp_probe.tail) { - /* multiple readers race? */ - spin_unlock_bh(&tcp_probe.lock); - continue; - } - - width = tcpprobe_sprint(tbuf, sizeof(tbuf)); - - if (cnt + width < len) - tcp_probe.tail = (tcp_probe.tail + 1) & (bufsize - 1); - - spin_unlock_bh(&tcp_probe.lock); - - /* if record greater than space available - return partial buffer (so far) */ - if (cnt + width >= len) - break; - - if (copy_to_user(buf + cnt, tbuf, width)) - return -EFAULT; - cnt += width; - } - - return cnt == 0 ? error : cnt; -} - -static const struct file_operations tcpprobe_fops = { - .owner = THIS_MODULE, - .open = tcpprobe_open, - .read = tcpprobe_read, - .llseek = noop_llseek, -}; - -static __init int tcpprobe_init(void) -{ - int ret = -ENOMEM; - - /* Warning: if the function signature of tcp_rcv_established, - * has been changed, you also have to change the signature of - * jtcp_rcv_established, otherwise you end up right here! - */ - BUILD_BUG_ON(__same_type(tcp_rcv_established, - jtcp_rcv_established) == 0); - - init_waitqueue_head(&tcp_probe.wait); - spin_lock_init(&tcp_probe.lock); - - if (bufsize == 0) - return -EINVAL; - - bufsize = roundup_pow_of_two(bufsize); - tcp_probe.log = kcalloc(bufsize, sizeof(struct tcp_log), GFP_KERNEL); - if (!tcp_probe.log) - goto err0; - - if (!proc_create(procname, S_IRUSR, init_net.proc_net, &tcpprobe_fops)) - goto err0; - - ret = register_jprobe(&tcp_jprobe); - if (ret) - goto err1; - - pr_info("probe registered (port=%d/fwmark=%u) bufsize=%u\n", - port, fwmark, bufsize); - return 0; - err1: - remove_proc_entry(procname, init_net.proc_net); - err0: - kfree(tcp_probe.log); - return ret; -} -module_init(tcpprobe_init); - -static __exit void tcpprobe_exit(void) -{ - remove_proc_entry(procname, init_net.proc_net); - unregister_jprobe(&tcp_jprobe); - kfree(tcp_probe.log); -} -module_exit(tcpprobe_exit); diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 968fda198376..6db3124cdbda 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -183,11 +183,6 @@ static int tcp_write_timeout(struct sock *sk) if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) { if (icsk->icsk_retransmits) { dst_negative_advice(sk); - if (tp->syn_fastopen || tp->syn_data) - tcp_fastopen_cache_set(sk, 0, NULL, true, 0); - if (tp->syn_data && icsk->icsk_retransmits == 1) - NET_INC_STATS(sock_net(sk), - LINUX_MIB_TCPFASTOPENACTIVEFAIL); } else if (!tp->syn_data && !tp->syn_fastopen) { sk_rethink_txhash(sk); } @@ -195,17 +190,6 @@ static int tcp_write_timeout(struct sock *sk) expired = icsk->icsk_retransmits >= retry_until; } else { if (retransmits_timed_out(sk, net->ipv4.sysctl_tcp_retries1, 0)) { - /* Some middle-boxes may black-hole Fast Open _after_ - * the handshake. Therefore we conservatively disable - * Fast Open on this path on recurring timeouts after - * successful Fast Open. - */ - if (tp->syn_data_acked) { - tcp_fastopen_cache_set(sk, 0, NULL, true, 0); - if (icsk->icsk_retransmits == net->ipv4.sysctl_tcp_retries1) - NET_INC_STATS(sock_net(sk), - LINUX_MIB_TCPFASTOPENACTIVEFAIL); - } /* Black hole detection */ tcp_mtu_probing(icsk, sk); @@ -228,6 +212,7 @@ static int tcp_write_timeout(struct sock *sk) expired = retransmits_timed_out(sk, retry_until, icsk->icsk_user_timeout); } + tcp_fastopen_active_detect_blackhole(sk, expired); if (expired) { /* Has it gone just too far? */ tcp_write_err(sk); diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index e4ff25c947c5..853321555a4e 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -357,18 +357,12 @@ fail: } EXPORT_SYMBOL(udp_lib_get_port); -static u32 udp4_portaddr_hash(const struct net *net, __be32 saddr, - unsigned int port) -{ - return jhash_1word((__force u32)saddr, net_hash_mix(net)) ^ port; -} - int udp_v4_get_port(struct sock *sk, unsigned short snum) { unsigned int hash2_nulladdr = - udp4_portaddr_hash(sock_net(sk), htonl(INADDR_ANY), snum); + ipv4_portaddr_hash(sock_net(sk), htonl(INADDR_ANY), snum); unsigned int hash2_partial = - udp4_portaddr_hash(sock_net(sk), inet_sk(sk)->inet_rcv_saddr, 0); + ipv4_portaddr_hash(sock_net(sk), inet_sk(sk)->inet_rcv_saddr, 0); /* precompute partial secondary hash */ udp_sk(sk)->udp_portaddr_hash = hash2_partial; @@ -445,7 +439,7 @@ static struct sock *udp4_lib_lookup2(struct net *net, struct sk_buff *skb) { struct sock *sk, *result; - int score, badness, matches = 0, reuseport = 0; + int score, badness; u32 hash = 0; result = NULL; @@ -454,23 +448,16 @@ static struct sock *udp4_lib_lookup2(struct net *net, score = compute_score(sk, net, saddr, sport, daddr, hnum, dif, sdif, exact_dif); if (score > badness) { - reuseport = sk->sk_reuseport; - if (reuseport) { + if (sk->sk_reuseport) { hash = udp_ehashfn(net, daddr, hnum, saddr, sport); result = reuseport_select_sock(sk, hash, skb, sizeof(struct udphdr)); if (result) return result; - matches = 1; } badness = score; result = sk; - } else if (score == badness && reuseport) { - matches++; - if (reciprocal_scale(hash, matches) == 0) - result = sk; - hash = next_pseudo_random32(hash); } } return result; @@ -488,11 +475,11 @@ struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr, unsigned int hash2, slot2, slot = udp_hashfn(net, hnum, udptable->mask); struct udp_hslot *hslot2, *hslot = &udptable->hash[slot]; bool exact_dif = udp_lib_exact_dif_match(net, skb); - int score, badness, matches = 0, reuseport = 0; + int score, badness; u32 hash = 0; if (hslot->count > 10) { - hash2 = udp4_portaddr_hash(net, daddr, hnum); + hash2 = ipv4_portaddr_hash(net, daddr, hnum); slot2 = hash2 & udptable->mask; hslot2 = &udptable->hash2[slot2]; if (hslot->count < hslot2->count) @@ -503,7 +490,7 @@ struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr, exact_dif, hslot2, skb); if (!result) { unsigned int old_slot2 = slot2; - hash2 = udp4_portaddr_hash(net, htonl(INADDR_ANY), hnum); + hash2 = ipv4_portaddr_hash(net, htonl(INADDR_ANY), hnum); slot2 = hash2 & udptable->mask; /* avoid searching the same slot again. */ if (unlikely(slot2 == old_slot2)) @@ -526,23 +513,16 @@ begin: score = compute_score(sk, net, saddr, sport, daddr, hnum, dif, sdif, exact_dif); if (score > badness) { - reuseport = sk->sk_reuseport; - if (reuseport) { + if (sk->sk_reuseport) { hash = udp_ehashfn(net, daddr, hnum, saddr, sport); result = reuseport_select_sock(sk, hash, skb, sizeof(struct udphdr)); if (result) return result; - matches = 1; } result = sk; badness = score; - } else if (score == badness && reuseport) { - matches++; - if (reciprocal_scale(hash, matches) == 0) - result = sk; - hash = next_pseudo_random32(hash); } } return result; @@ -1775,7 +1755,7 @@ EXPORT_SYMBOL(udp_lib_rehash); static void udp_v4_rehash(struct sock *sk) { - u16 new_hash = udp4_portaddr_hash(sock_net(sk), + u16 new_hash = ipv4_portaddr_hash(sock_net(sk), inet_sk(sk)->inet_rcv_saddr, inet_sk(sk)->inet_num); udp_lib_rehash(sk, new_hash); @@ -1966,9 +1946,9 @@ static int __udp4_lib_mcast_deliver(struct net *net, struct sk_buff *skb, struct sk_buff *nskb; if (use_hash2) { - hash2_any = udp4_portaddr_hash(net, htonl(INADDR_ANY), hnum) & + hash2_any = ipv4_portaddr_hash(net, htonl(INADDR_ANY), hnum) & udptable->mask; - hash2 = udp4_portaddr_hash(net, daddr, hnum) & udptable->mask; + hash2 = ipv4_portaddr_hash(net, daddr, hnum) & udptable->mask; start_lookup: hslot = &udptable->hash2[hash2]; offset = offsetof(typeof(*sk), __sk_common.skc_portaddr_node); @@ -2200,7 +2180,7 @@ static struct sock *__udp4_lib_demux_lookup(struct net *net, int dif, int sdif) { unsigned short hnum = ntohs(loc_port); - unsigned int hash2 = udp4_portaddr_hash(net, loc_addr, hnum); + unsigned int hash2 = ipv4_portaddr_hash(net, loc_addr, hnum); unsigned int slot2 = hash2 & udp_table.mask; struct udp_hslot *hslot2 = &udp_table.hash2[slot2]; INET_ADDR_COOKIE(acookie, rmt_addr, loc_addr); @@ -2510,8 +2490,6 @@ unsigned int udp_poll(struct file *file, struct socket *sock, poll_table *wait) if (!skb_queue_empty(&udp_sk(sk)->reader_queue)) mask |= POLLIN | POLLRDNORM; - sock_rps_record_flow(sk); - /* Check for false positives due to checksum errors */ if ((mask & POLLRDNORM) && !(file->f_flags & O_NONBLOCK) && !(sk->sk_shutdown & RCV_SHUTDOWN) && first_packet_length(sk) == -1) @@ -2736,7 +2714,6 @@ int udp4_seq_show(struct seq_file *seq, void *v) } static const struct file_operations udp_afinfo_seq_fops = { - .owner = THIS_MODULE, .open = udp_seq_open, .read = seq_read, .llseek = seq_lseek, diff --git a/net/ipv4/udplite.c b/net/ipv4/udplite.c index 59f10fe9782e..f96614e9b9a5 100644 --- a/net/ipv4/udplite.c +++ b/net/ipv4/udplite.c @@ -75,7 +75,6 @@ static struct inet_protosw udplite4_protosw = { #ifdef CONFIG_PROC_FS static const struct file_operations udplite_afinfo_seq_fops = { - .owner = THIS_MODULE, .open = udp_seq_open, .read = seq_read, .llseek = seq_lseek, diff --git a/net/ipv4/xfrm4_mode_tunnel.c b/net/ipv4/xfrm4_mode_tunnel.c index e6265e2c274e..8affc6d83d58 100644 --- a/net/ipv4/xfrm4_mode_tunnel.c +++ b/net/ipv4/xfrm4_mode_tunnel.c @@ -62,7 +62,7 @@ static int xfrm4_mode_tunnel_output(struct xfrm_state *x, struct sk_buff *skb) top_iph->frag_off = (flags & XFRM_STATE_NOPMTUDISC) ? 0 : (XFRM_MODE_SKB_CB(skb)->frag_off & htons(IP_DF)); - top_iph->ttl = ip4_dst_hoplimit(dst->child); + top_iph->ttl = ip4_dst_hoplimit(xfrm_dst_child(dst)); top_iph->saddr = x->props.saddr.a4; top_iph->daddr = x->id.daddr.a4; @@ -105,18 +105,15 @@ static struct sk_buff *xfrm4_mode_tunnel_gso_segment(struct xfrm_state *x, { __skb_push(skb, skb->mac_len); return skb_mac_gso_segment(skb, features); - } static void xfrm4_mode_tunnel_xmit(struct xfrm_state *x, struct sk_buff *skb) { struct xfrm_offload *xo = xfrm_offload(skb); - if (xo->flags & XFRM_GSO_SEGMENT) { - skb->network_header = skb->network_header - x->props.header_len; + if (xo->flags & XFRM_GSO_SEGMENT) skb->transport_header = skb->network_header + sizeof(struct iphdr); - } skb_reset_mac_len(skb); pskb_pull(skb, skb->mac_len + x->props.header_len); |