Diffstat (limited to 'net/ipv4')
70 files changed, 2198 insertions, 566 deletions
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index 300b06888fdf..6e7baaf814c6 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -430,6 +430,14 @@ config INET_UDP_DIAG
 	  Support for UDP socket monitoring interface used by the ss tool.
 	  If unsure, say Y.
 
+config INET_RAW_DIAG
+	tristate "RAW: socket monitoring interface"
+	depends on INET_DIAG && (IPV6 || IPV6=n)
+	default n
+	---help---
+	  Support for RAW socket monitoring interface used by the ss tool.
+	  If unsure, say Y.
+
 config INET_DIAG_DESTROY
 	bool "INET: allow privileged process to administratively close sockets"
 	depends on INET_DIAG
@@ -715,6 +723,7 @@ config DEFAULT_TCP_CONG
 	default "reno" if DEFAULT_RENO
 	default "dctcp" if DEFAULT_DCTCP
 	default "cdg" if DEFAULT_CDG
+	default "bbr" if DEFAULT_BBR
 	default "cubic"
 
 config TCP_MD5SIG
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index bc6a6c8b9bcd..48af58a5686e 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -40,6 +40,7 @@ obj-$(CONFIG_NETFILTER) += netfilter.o netfilter/
 obj-$(CONFIG_INET_DIAG) += inet_diag.o
 obj-$(CONFIG_INET_TCP_DIAG) += tcp_diag.o
 obj-$(CONFIG_INET_UDP_DIAG) += udp_diag.o
+obj-$(CONFIG_INET_RAW_DIAG) += raw_diag.o
 obj-$(CONFIG_NET_TCPPROBE) += tcp_probe.o
 obj-$(CONFIG_TCP_CONG_BBR) += tcp_bbr.o
 obj-$(CONFIG_TCP_CONG_BIC) += tcp_bic.o
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 1effc986739e..1830e6f0e9cc 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -374,8 +374,18 @@ lookup_protocol:
 
 	if (sk->sk_prot->init) {
 		err = sk->sk_prot->init(sk);
-		if (err)
+		if (err) {
+			sk_common_release(sk);
+			goto out;
+		}
+	}
+
+	if (!kern) {
+		err = BPF_CGROUP_RUN_PROG_INET_SOCK(sk);
+		if (err) {
 			sk_common_release(sk);
+			goto out;
+		}
 	}
 out:
 	return err;
@@ -533,9 +543,9 @@ EXPORT_SYMBOL(inet_dgram_connect);
 
 static long inet_wait_for_connect(struct sock *sk, long timeo, int writebias)
 {
-	DEFINE_WAIT(wait);
+	DEFINE_WAIT_FUNC(wait, woken_wake_function);
 
-	prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
+	add_wait_queue(sk_sleep(sk), &wait);
 	sk->sk_write_pending += writebias;
 
 	/* Basic assumption: if someone sets sk->sk_err, he _must_
@@ -545,13 +555,12 @@ static long inet_wait_for_connect(struct sock *sk, long timeo, int writebias)
 	 */
 	while ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) {
 		release_sock(sk);
-		timeo = schedule_timeout(timeo);
+		timeo = wait_woken(&wait, TASK_INTERRUPTIBLE, timeo);
 		lock_sock(sk);
 		if (signal_pending(current) || !timeo)
 			break;
-		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
 	}
-	finish_wait(sk_sleep(sk), &wait);
+	remove_wait_queue(sk_sleep(sk), &wait);
 	sk->sk_write_pending -= writebias;
 	return timeo;
 }
@@ -1234,7 +1243,7 @@ struct sk_buff *inet_gso_segment(struct sk_buff *skb,
 		fixedid = !!(skb_shinfo(skb)->gso_type & SKB_GSO_TCP_FIXEDID);
 
 		/* fixed ID is invalid if DF bit is not set */
-		if (fixedid && !(iph->frag_off & htons(IP_DF)))
+		if (fixedid && !(ip_hdr(skb)->frag_off & htons(IP_DF)))
 			goto out;
 	}
@@ -1391,7 +1400,7 @@ struct sk_buff **inet_gro_receive(struct sk_buff **head, struct sk_buff *skb)
 	skb_gro_pull(skb, sizeof(*iph));
 	skb_set_transport_header(skb, skb_gro_offset(skb));
 
-	pp = ops->callbacks.gro_receive(head, skb);
+	pp = call_gro_receive(ops->callbacks.gro_receive, head, skb);
 
 out_unlock:
 	rcu_read_unlock();
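The inet_wait_for_connect() hunk above converts an open-coded prepare_to_wait()/schedule_timeout() loop to wait_woken(). woken_wake_function() records the wakeup in the wait entry itself, so a connect completion that fires after release_sock() but before the task actually sleeps is no longer lost. A minimal sketch of the generic pattern (not part of this diff; condition_met() is a stand-in for the TCPF_SYN_SENT/TCPF_SYN_RECV state test done in the real function):

	DEFINE_WAIT_FUNC(wait, woken_wake_function);

	add_wait_queue(sk_sleep(sk), &wait);
	while (!condition_met(sk)) {
		/* wait_woken() returns immediately if a wakeup was
		 * recorded (WQ_FLAG_WOKEN) since the last iteration,
		 * closing the check-then-sleep race.
		 */
		timeo = wait_woken(&wait, TASK_INTERRUPTIBLE, timeo);
		if (signal_pending(current) || !timeo)
			break;
	}
	remove_wait_queue(sk_sleep(sk), &wait);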
diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c
index d95631d09248..20fb25e3027b 100644
--- a/net/ipv4/esp4.c
+++ b/net/ipv4/esp4.c
@@ -476,7 +476,7 @@ static int esp_input(struct xfrm_state *x, struct sk_buff *skb)
 		esph = (void *)skb_push(skb, 4);
 		*seqhi = esph->spi;
 		esph->spi = esph->seq_no;
-		esph->seq_no = htonl(XFRM_SKB_CB(skb)->seq.input.hi);
+		esph->seq_no = XFRM_SKB_CB(skb)->seq.input.hi;
 		aead_request_set_callback(req, 0, esp_input_done_esn, skb);
 	}
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index c3b80478226e..dbad5a1c161a 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -151,7 +151,7 @@ static void fib_replace_table(struct net *net, struct fib_table *old,
 
 int fib_unmerge(struct net *net)
 {
-	struct fib_table *old, *new;
+	struct fib_table *old, *new, *main_table;
 
 	/* attempt to fetch local table if it has been allocated */
 	old = fib_get_table(net, RT_TABLE_LOCAL);
@@ -162,11 +162,21 @@ int fib_unmerge(struct net *net)
 	if (!new)
 		return -ENOMEM;
 
+	/* table is already unmerged */
+	if (new == old)
+		return 0;
+
 	/* replace merged table with clean table */
-	if (new != old) {
-		fib_replace_table(net, old, new);
-		fib_free_table(old);
-	}
+	fib_replace_table(net, old, new);
+	fib_free_table(old);
+
+	/* attempt to fetch main table if it has been allocated */
+	main_table = fib_get_table(net, RT_TABLE_MAIN);
+	if (!main_table)
+		return 0;
+
+	/* flush local entries from main table */
+	fib_table_flush_external(main_table);
 
 	return 0;
 }
@@ -610,6 +620,7 @@ const struct nla_policy rtm_ipv4_policy[RTA_MAX + 1] = {
 	[RTA_FLOW]		= { .type = NLA_U32 },
 	[RTA_ENCAP_TYPE]	= { .type = NLA_U16 },
 	[RTA_ENCAP]		= { .type = NLA_NESTED },
+	[RTA_UID]		= { .type = NLA_U32 },
 };
 
 static int rtm_to_fib_config(struct net *net, struct sk_buff *skb,
@@ -1208,6 +1219,8 @@ static int __net_init ip_fib_net_init(struct net *net)
 	int err;
 	size_t size = sizeof(struct hlist_head) * FIB_TABLE_HASHSZ;
 
+	net->ipv4.fib_seq = 0;
+
 	/* Avoid false sharing : Use at least a full cache line */
 	size = max_t(size_t, size, L1_CACHE_BYTES);
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index 388d3e21629b..c1bc1e92de0e 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -234,6 +234,7 @@ void free_fib_info(struct fib_info *fi)
 #endif
 	call_rcu(&fi->rcu, free_fib_info_rcu);
 }
+EXPORT_SYMBOL_GPL(free_fib_info);
 
 void fib_release_info(struct fib_info *fi)
 {
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 31cef3602585..1b0e7d1f5217 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -84,25 +84,114 @@
 #include <trace/events/fib.h>
 #include "fib_lookup.h"
 
-static BLOCKING_NOTIFIER_HEAD(fib_chain);
+static unsigned int fib_seq_sum(void)
+{
+	unsigned int fib_seq = 0;
+	struct net *net;
+
+	rtnl_lock();
+	for_each_net(net)
+		fib_seq += net->ipv4.fib_seq;
+	rtnl_unlock();
+
+	return fib_seq;
+}
+
+static ATOMIC_NOTIFIER_HEAD(fib_chain);
 
-int register_fib_notifier(struct notifier_block *nb)
+static int call_fib_notifier(struct notifier_block *nb, struct net *net,
+			     enum fib_event_type event_type,
+			     struct fib_notifier_info *info)
 {
-	return blocking_notifier_chain_register(&fib_chain, nb);
+	info->net = net;
+	return nb->notifier_call(nb, event_type, info);
+}
+
+static void fib_rules_notify(struct net *net, struct notifier_block *nb,
+			     enum fib_event_type event_type)
+{
+#ifdef CONFIG_IP_MULTIPLE_TABLES
+	struct fib_notifier_info info;
+
+	if (net->ipv4.fib_has_custom_rules)
+		call_fib_notifier(nb, net, event_type, &info);
+#endif
+}
+
+static void fib_notify(struct net *net, struct notifier_block *nb,
+		       enum fib_event_type event_type);
+
+static int call_fib_entry_notifier(struct notifier_block *nb, struct net *net,
+				   enum fib_event_type event_type, u32 dst,
+				   int dst_len, struct fib_info *fi,
+				   u8 tos, u8 type, u32 tb_id, u32 nlflags)
+{
+	struct fib_entry_notifier_info info = {
+		.dst = dst,
+		.dst_len = dst_len,
+		.fi = fi,
+		.tos = tos,
+		.type = type,
+		.tb_id = tb_id,
+		.nlflags = nlflags,
+	};
+	return call_fib_notifier(nb, net, event_type, &info.info);
+}
+
+static bool fib_dump_is_consistent(struct notifier_block *nb,
+				   void (*cb)(struct notifier_block *nb),
+				   unsigned int fib_seq)
+{
+	atomic_notifier_chain_register(&fib_chain, nb);
+	if (fib_seq == fib_seq_sum())
+		return true;
+	atomic_notifier_chain_unregister(&fib_chain, nb);
+	if (cb)
+		cb(nb);
+	return false;
+}
+
+#define FIB_DUMP_MAX_RETRIES 5
+int register_fib_notifier(struct notifier_block *nb,
+			  void (*cb)(struct notifier_block *nb))
+{
+	int retries = 0;
+
+	do {
+		unsigned int fib_seq = fib_seq_sum();
+		struct net *net;
+
+		/* Mutex semantics guarantee that every change done to
+		 * FIB tries before we read the change sequence counter
+		 * is now visible to us.
+		 */
+		rcu_read_lock();
+		for_each_net_rcu(net) {
+			fib_rules_notify(net, nb, FIB_EVENT_RULE_ADD);
+			fib_notify(net, nb, FIB_EVENT_ENTRY_ADD);
+		}
+		rcu_read_unlock();
+
+		if (fib_dump_is_consistent(nb, cb, fib_seq))
+			return 0;
+	} while (++retries < FIB_DUMP_MAX_RETRIES);
+
+	return -EBUSY;
 }
 EXPORT_SYMBOL(register_fib_notifier);
 
 int unregister_fib_notifier(struct notifier_block *nb)
 {
-	return blocking_notifier_chain_unregister(&fib_chain, nb);
+	return atomic_notifier_chain_unregister(&fib_chain, nb);
 }
 EXPORT_SYMBOL(unregister_fib_notifier);
 
 int call_fib_notifiers(struct net *net, enum fib_event_type event_type,
 		       struct fib_notifier_info *info)
 {
+	net->ipv4.fib_seq++;
 	info->net = net;
-	return blocking_notifier_call_chain(&fib_chain, event_type, info);
+	return atomic_notifier_call_chain(&fib_chain, event_type, info);
 }
 
 static int call_fib_entry_notifiers(struct net *net,
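register_fib_notifier() now does more than chain registration: because the chain has become atomic and a registrant typically wants the full current FIB state, it first replays FIB_EVENT_RULE_ADD/FIB_EVENT_ENTRY_ADD for everything already present, then registers and compares fib_seq_sum() to detect changes that raced with the dump, retrying up to FIB_DUMP_MAX_RETRIES times before returning -EBUSY. A hypothetical caller, e.g. a driver mirroring routes to hardware (a sketch against the API added above, not code from this diff):

	static int my_fib_event(struct notifier_block *nb,
				unsigned long event, void *ptr)
	{
		struct fib_notifier_info *info = ptr;
		struct fib_entry_notifier_info *fen;

		switch (event) {
		case FIB_EVENT_ENTRY_ADD:
			fen = container_of(info, struct fib_entry_notifier_info,
					   info);
			/* mirror fen->dst/fen->dst_len/fen->fi to hardware */
			break;
		case FIB_EVENT_ENTRY_DEL:
			/* remove the route from hardware */
			break;
		}
		return NOTIFY_DONE;
	}

	static void my_fib_dump_flush(struct notifier_block *nb)
	{
		/* dump raced with FIB changes: discard partial state */
	}

	static struct notifier_block my_fib_nb = {
		.notifier_call = my_fib_event,
	};

	err = register_fib_notifier(&my_fib_nb, my_fib_dump_flush);
	/* err == -EBUSY: the FIB kept changing through all retries */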
@@ -719,6 +808,13 @@ static unsigned char update_suffix(struct key_vector *tn)
 {
 	unsigned char slen = tn->pos;
 	unsigned long stride, i;
+	unsigned char slen_max;
+
+	/* only vector 0 can have a suffix length greater than or equal to
+	 * tn->pos + tn->bits, the second highest node will have a suffix
+	 * length at most of tn->pos + tn->bits - 1
+	 */
+	slen_max = min_t(unsigned char, tn->pos + tn->bits - 1, tn->slen);
 
 	/* search though the list of children looking for nodes that might
 	 * have a suffix greater than the one we currently have.  This is
@@ -736,12 +832,8 @@ static unsigned char update_suffix(struct key_vector *tn)
 			slen = n->slen;
 		i &= ~(stride - 1);
 
-		/* if slen covers all but the last bit we can stop here
-		 * there will be nothing longer than that since only node
-		 * 0 and 1 << (bits - 1) could have that as their suffix
-		 * length.
-		 */
-		if ((slen + 1) >= (tn->pos + tn->bits))
+		/* stop searching if we have hit the maximum possible value */
+		if (slen >= slen_max)
 			break;
 	}
@@ -913,39 +1005,27 @@ static struct key_vector *resize(struct trie *t, struct key_vector *tn)
 		return collapse(t, tn);
 
 	/* update parent in case halve failed */
-	tp = node_parent(tn);
-
-	/* Return if at least one deflate was run */
-	if (max_work != MAX_WORK)
-		return tp;
-
-	/* push the suffix length to the parent node */
-	if (tn->slen > tn->pos) {
-		unsigned char slen = update_suffix(tn);
-
-		if (slen > tp->slen)
-			tp->slen = slen;
-	}
-
-	return tp;
+	return node_parent(tn);
 }
 
-static void leaf_pull_suffix(struct key_vector *tp, struct key_vector *l)
+static void node_pull_suffix(struct key_vector *tn, unsigned char slen)
 {
-	while ((tp->slen > tp->pos) && (tp->slen > l->slen)) {
-		if (update_suffix(tp) > l->slen)
+	unsigned char node_slen = tn->slen;
+
+	while ((node_slen > tn->pos) && (node_slen > slen)) {
+		slen = update_suffix(tn);
+		if (node_slen == slen)
 			break;
-		tp = node_parent(tp);
+
+		tn = node_parent(tn);
+		node_slen = tn->slen;
 	}
 }
 
-static void leaf_push_suffix(struct key_vector *tn, struct key_vector *l)
+static void node_push_suffix(struct key_vector *tn, unsigned char slen)
 {
-	/* if this is a new leaf then tn will be NULL and we can sort
-	 * out parent suffix lengths as a part of trie_rebalance
-	 */
-	while (tn->slen < l->slen) {
-		tn->slen = l->slen;
+	while (tn->slen < slen) {
+		tn->slen = slen;
 		tn = node_parent(tn);
 	}
 }
@@ -1066,6 +1146,7 @@ static int fib_insert_node(struct trie *t, struct key_vector *tp,
 	}
 
 	/* Case 3: n is NULL, and will just insert a new leaf */
+	node_push_suffix(tp, new->fa_slen);
 	NODE_INIT_PARENT(l, tp);
 	put_child_root(tp, key, l);
 	trie_rebalance(t, tp);
@@ -1107,7 +1188,7 @@ static int fib_insert_alias(struct trie *t, struct key_vector *tp,
 	/* if we added to the tail node then we need to update slen */
 	if (l->slen < new->fa_slen) {
 		l->slen = new->fa_slen;
-		leaf_push_suffix(tp, l);
+		node_push_suffix(tp, new->fa_slen);
 	}
 
 	return 0;
@@ -1499,6 +1580,8 @@ static void fib_remove_alias(struct trie *t, struct key_vector *tp,
 	 * out parent suffix lengths as a part of trie_rebalance
 	 */
 	if (hlist_empty(&l->leaf)) {
+		if (tp->slen == l->slen)
+			node_pull_suffix(tp, tp->pos);
 		put_child_root(tp, l->key, NULL);
 		node_free(l);
 		trie_rebalance(t, tp);
@@ -1511,7 +1594,7 @@ static void fib_remove_alias(struct trie *t, struct key_vector *tp,
 
 	/* update the trie with the latest suffix length */
 	l->slen = fa->fa_slen;
-	leaf_pull_suffix(tp, l);
+	node_pull_suffix(tp, fa->fa_slen);
 }
 
 /* Caller must hold RTNL. */
@@ -1743,8 +1826,10 @@ struct fib_table *fib_trie_unmerge(struct fib_table *oldtb)
 		local_l = fib_find_node(lt, &local_tp, l->key);
 
 		if (fib_insert_alias(lt, local_tp, local_l, new_fa,
-				     NULL, l->key))
+				     NULL, l->key)) {
+			kmem_cache_free(fn_alias_kmem, new_fa);
 			goto out;
+		}
 	}
 
 	/* stop loop if key wrapped back to 0 */
@@ -1760,6 +1845,75 @@ out:
 	return NULL;
 }
 
+/* Caller must hold RTNL */
+void fib_table_flush_external(struct fib_table *tb)
+{
+	struct trie *t = (struct trie *)tb->tb_data;
+	struct key_vector *pn = t->kv;
+	unsigned long cindex = 1;
+	struct hlist_node *tmp;
+	struct fib_alias *fa;
+
+	/* walk trie in reverse order */
+	for (;;) {
+		unsigned char slen = 0;
+		struct key_vector *n;
+
+		if (!(cindex--)) {
+			t_key pkey = pn->key;
+
+			/* cannot resize the trie vector */
+			if (IS_TRIE(pn))
+				break;
+
+			/* update the suffix to address pulled leaves */
+			if (pn->slen > pn->pos)
+				update_suffix(pn);
+
+			/* resize completed node */
+			pn = resize(t, pn);
+			cindex = get_index(pkey, pn);
+
+			continue;
+		}
+
+		/* grab the next available node */
+		n = get_child(pn, cindex);
+		if (!n)
+			continue;
+
+		if (IS_TNODE(n)) {
+			/* record pn and cindex for leaf walking */
+			pn = n;
+			cindex = 1ul << n->bits;
+
+			continue;
+		}
+
+		hlist_for_each_entry_safe(fa, tmp, &n->leaf, fa_list) {
+			/* if alias was cloned to local then we just
+			 * need to remove the local copy from main
+			 */
+			if (tb->tb_id != fa->tb_id) {
+				hlist_del_rcu(&fa->fa_list);
+				alias_free_mem_rcu(fa);
+				continue;
+			}
+
+			/* record local slen */
+			slen = fa->fa_slen;
+		}
+
+		/* update leaf slen */
+		n->slen = slen;
+
+		if (hlist_empty(&n->leaf)) {
+			put_child_root(pn, n->key, NULL);
+			node_free(n);
+		}
+	}
+}
+
 /* Caller must hold RTNL. */
 int fib_table_flush(struct net *net, struct fib_table *tb)
 {
@@ -1782,6 +1936,10 @@ int fib_table_flush(struct net *net, struct fib_table *tb)
 			if (IS_TRIE(pn))
 				break;
 
+			/* update the suffix to address pulled leaves */
+			if (pn->slen > pn->pos)
+				update_suffix(pn);
+
 			/* resize completed node */
 			pn = resize(t, pn);
 			cindex = get_index(pkey, pn);
@@ -1834,6 +1992,62 @@ int fib_table_flush(struct net *net, struct fib_table *tb)
 	return found;
 }
 
+static void fib_leaf_notify(struct net *net, struct key_vector *l,
+			    struct fib_table *tb, struct notifier_block *nb,
+			    enum fib_event_type event_type)
+{
+	struct fib_alias *fa;
+
+	hlist_for_each_entry_rcu(fa, &l->leaf, fa_list) {
+		struct fib_info *fi = fa->fa_info;
+
+		if (!fi)
+			continue;
+
+		/* local and main table can share the same trie,
+		 * so don't notify twice for the same entry.
+		 */
+		if (tb->tb_id != fa->tb_id)
+			continue;
+
+		call_fib_entry_notifier(nb, net, event_type, l->key,
+					KEYLENGTH - fa->fa_slen, fi, fa->fa_tos,
+					fa->fa_type, fa->tb_id, 0);
+	}
+}
+
+static void fib_table_notify(struct net *net, struct fib_table *tb,
+			     struct notifier_block *nb,
+			     enum fib_event_type event_type)
+{
+	struct trie *t = (struct trie *)tb->tb_data;
+	struct key_vector *l, *tp = t->kv;
+	t_key key = 0;
+
+	while ((l = leaf_walk_rcu(&tp, key)) != NULL) {
+		fib_leaf_notify(net, l, tb, nb, event_type);
+
+		key = l->key + 1;
+		/* stop in case of wrap around */
+		if (key < l->key)
+			break;
+	}
+}
+
+static void fib_notify(struct net *net, struct notifier_block *nb,
+		       enum fib_event_type event_type)
+{
+	unsigned int h;
+
+	for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
+		struct hlist_head *head = &net->ipv4.fib_table_hash[h];
+		struct fib_table *tb;
+
+		hlist_for_each_entry_rcu(tb, head, tb_hlist)
+			fib_table_notify(net, tb, nb, event_type);
+	}
+}
+
 static void __trie_free_rcu(struct rcu_head *head)
 {
 	struct fib_table *tb = container_of(head, struct fib_table, rcu);
@@ -2413,22 +2627,19 @@ static struct key_vector *fib_route_get_idx(struct fib_route_iter *iter,
 	struct key_vector *l, **tp = &iter->tnode;
 	t_key key;
 
-	/* use cache location of next-to-find key */
+	/* use cached location of previously found key */
 	if (iter->pos > 0 && pos >= iter->pos) {
-		pos -= iter->pos;
 		key = iter->key;
 	} else {
-		iter->pos = 0;
+		iter->pos = 1;
 		key = 0;
 	}
 
-	while ((l = leaf_walk_rcu(tp, key)) != NULL) {
+	pos -= iter->pos;
+
+	while ((l = leaf_walk_rcu(tp, key)) && (pos-- > 0)) {
 		key = l->key + 1;
 		iter->pos++;
-
-		if (--pos <= 0)
-			break;
-
 		l = NULL;
 
 		/* handle unlikely case of a key wrap */
@@ -2437,7 +2648,7 @@ static struct key_vector *fib_route_get_idx(struct fib_route_iter *iter,
 	}
 
 	if (l)
-		iter->key = key;	/* remember it */
+		iter->key = l->key;	/* remember it */
 	else
 		iter->pos = 0;	/* forget it */
@@ -2465,7 +2676,7 @@ static void *fib_route_seq_start(struct seq_file *seq, loff_t *pos)
 		return fib_route_get_idx(iter, *pos);
 
 	iter->pos = 0;
-	iter->key = 0;
+	iter->key = KEY_MAX;
 
 	return SEQ_START_TOKEN;
 }
@@ -2474,7 +2685,7 @@ static void *fib_route_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 {
 	struct fib_route_iter *iter = seq->private;
 	struct key_vector *l = NULL;
-	t_key key = iter->key;
+	t_key key = iter->key + 1;
 
 	++*pos;
 
@@ -2483,7 +2694,7 @@ static void *fib_route_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 	l = leaf_walk_rcu(&iter->tnode, key);
 
 	if (l) {
-		iter->key = l->key + 1;
+		iter->key = l->key;
 		iter->pos++;
 	} else {
 		iter->pos = 0;
diff --git a/net/ipv4/fou.c b/net/ipv4/fou.c
index cf50f7e2b012..805f6607f8d9 100644
--- a/net/ipv4/fou.c
+++ b/net/ipv4/fou.c
@@ -249,7 +249,7 @@ static struct sk_buff **fou_gro_receive(struct sock *sk,
 	if (!ops || !ops->callbacks.gro_receive)
 		goto out_unlock;
 
-	pp = ops->callbacks.gro_receive(head, skb);
+	pp = call_gro_receive(ops->callbacks.gro_receive, head, skb);
 
 out_unlock:
 	rcu_read_unlock();
@@ -441,7 +441,7 @@ next_proto:
 	if (WARN_ON_ONCE(!ops || !ops->callbacks.gro_receive))
 		goto out_unlock;
 
-	pp = ops->callbacks.gro_receive(head, skb);
+	pp = call_gro_receive(ops->callbacks.gro_receive, head, skb);
 	flush = 0;
 
 out_unlock:
@@ -622,14 +622,7 @@ static int fou_destroy(struct net *net, struct fou_cfg *cfg)
 	return err;
 }
 
-static struct genl_family fou_nl_family = {
-	.id		= GENL_ID_GENERATE,
-	.hdrsize	= 0,
-	.name		= FOU_GENL_NAME,
-	.version	= FOU_GENL_VERSION,
-	.maxattr	= FOU_ATTR_MAX,
-	.netnsok	= true,
-};
+static struct genl_family fou_nl_family;
 
 static const struct nla_policy fou_nl_policy[FOU_ATTR_MAX + 1] = {
 	[FOU_ATTR_PORT]		= { .type = NLA_U16, },
@@ -831,6 +824,17 @@ static const struct genl_ops fou_nl_ops[] = {
 	},
 };
 
+static struct genl_family fou_nl_family __ro_after_init = {
+	.hdrsize	= 0,
+	.name		= FOU_GENL_NAME,
+	.version	= FOU_GENL_VERSION,
+	.maxattr	= FOU_ATTR_MAX,
+	.netnsok	= true,
+	.module		= THIS_MODULE,
+	.ops		= fou_nl_ops,
+	.n_ops		= ARRAY_SIZE(fou_nl_ops),
+};
+
 size_t fou_encap_hlen(struct ip_tunnel_encap *e)
 {
 	return sizeof(struct udphdr);
@@ -1086,8 +1090,7 @@ static int __init fou_init(void)
 	if (ret)
 		goto exit;
 
-	ret = genl_register_family_with_ops(&fou_nl_family,
-					    fou_nl_ops);
+	ret = genl_register_family(&fou_nl_family);
 	if (ret < 0)
 		goto unregister;
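The fou.c hunks track the generic netlink core rework in this series: families no longer request GENL_ID_GENERATE and call genl_register_family_with_ops(); instead the genl_family object statically carries .module, .ops and .n_ops and is passed to genl_register_family(), which assigns the id. Note that fou_nl_family is now only forward-declared before the command handlers and defined after fou_nl_ops, since the family references the ops array. The shape of such a conversion, with hypothetical names:

	static const struct genl_ops my_nl_ops[] = {
		{ .cmd = MY_CMD_GET, .doit = my_cmd_get, },
	};

	static struct genl_family my_nl_family __ro_after_init = {
		.name		= MY_GENL_NAME,
		.version	= MY_GENL_VERSION,
		.maxattr	= MY_ATTR_MAX,
		.netnsok	= true,
		.module		= THIS_MODULE,
		.ops		= my_nl_ops,
		.n_ops		= ARRAY_SIZE(my_nl_ops),
	};

	static int __init my_init(void)
	{
		/* the family id is chosen by the core at registration */
		return genl_register_family(&my_nl_family);
	}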
diff --git a/net/ipv4/gre_offload.c b/net/ipv4/gre_offload.c
index 96e0efecefa6..d5cac99170b1 100644
--- a/net/ipv4/gre_offload.c
+++ b/net/ipv4/gre_offload.c
@@ -229,7 +229,7 @@ static struct sk_buff **gre_gro_receive(struct sk_buff **head,
 	/* Adjusted NAPI_GRO_CB(skb)->csum after skb_gro_pull()*/
 	skb_gro_postpull_rcsum(skb, greh, grehlen);
 
-	pp = ptype->callbacks.gro_receive(head, skb);
+	pp = call_gro_receive(ptype->callbacks.gro_receive, head, skb);
 	flush = 0;
 
 out_unlock:
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index 38abe70e595f..f79d7a8ab1c6 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -425,6 +425,7 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb)
 	fl4.daddr = daddr;
 	fl4.saddr = saddr;
 	fl4.flowi4_mark = mark;
+	fl4.flowi4_uid = sock_net_uid(net, NULL);
 	fl4.flowi4_tos = RT_TOS(ip_hdr(skb)->tos);
 	fl4.flowi4_proto = IPPROTO_ICMP;
 	fl4.flowi4_oif = l3mdev_master_ifindex(skb->dev);
@@ -473,11 +474,12 @@ static struct rtable *icmp_route_lookup(struct net *net,
 			param->replyopts.opt.opt.faddr : iph->saddr);
 	fl4->saddr = saddr;
 	fl4->flowi4_mark = mark;
+	fl4->flowi4_uid = sock_net_uid(net, NULL);
 	fl4->flowi4_tos = RT_TOS(tos);
 	fl4->flowi4_proto = IPPROTO_ICMP;
 	fl4->fl4_icmp_type = type;
 	fl4->fl4_icmp_code = code;
-	fl4->flowi4_oif = l3mdev_master_ifindex(skb_in->dev);
+	fl4->flowi4_oif = l3mdev_master_ifindex(skb_dst(skb_in)->dev);
 
 	security_skb_classify_flow(skb_in, flowi4_to_flowi(fl4));
 	rt = __ip_route_output_key_hash(net, fl4,
@@ -502,7 +504,7 @@ static struct rtable *icmp_route_lookup(struct net *net,
 	if (err)
 		goto relookup_failed;
 
-	if (inet_addr_type_dev_table(net, skb_in->dev,
+	if (inet_addr_type_dev_table(net, skb_dst(skb_in)->dev,
 				     fl4_dec.saddr) == RTN_LOCAL) {
 		rt2 = __ip_route_output_key(net, &fl4_dec);
 		if (IS_ERR(rt2))
@@ -1045,12 +1047,12 @@ int icmp_rcv(struct sk_buff *skb)
 
 	if (success)  {
 		consume_skb(skb);
-		return 0;
+		return NET_RX_SUCCESS;
 	}
 
 drop:
 	kfree_skb(skb);
-	return 0;
+	return NET_RX_DROP;
 csum_error:
 	__ICMP_INC_STATS(net, ICMP_MIB_CSUMERRORS);
 error:
diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
index 606cc3e85d2b..15db786d50ed 100644
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -162,7 +162,7 @@ static int unsolicited_report_interval(struct in_device *in_dev)
 }
 
 static void igmpv3_add_delrec(struct in_device *in_dev, struct ip_mc_list *im);
-static void igmpv3_del_delrec(struct in_device *in_dev, __be32 multiaddr);
+static void igmpv3_del_delrec(struct in_device *in_dev, struct ip_mc_list *im);
 static void igmpv3_clear_delrec(struct in_device *in_dev);
 static int sf_setstate(struct ip_mc_list *pmc);
 static void sf_markstate(struct ip_mc_list *pmc);
@@ -1130,10 +1130,15 @@ static void igmpv3_add_delrec(struct in_device *in_dev, struct ip_mc_list *im)
 	spin_unlock_bh(&in_dev->mc_tomb_lock);
 }
 
-static void igmpv3_del_delrec(struct in_device *in_dev, __be32 multiaddr)
+/*
+ * restore ip_mc_list deleted records
+ */
+static void igmpv3_del_delrec(struct in_device *in_dev, struct ip_mc_list *im)
 {
 	struct ip_mc_list *pmc, *pmc_prev;
-	struct ip_sf_list *psf, *psf_next;
+	struct ip_sf_list *psf;
+	struct net *net = dev_net(in_dev->dev);
+	__be32 multiaddr = im->multiaddr;
 
 	spin_lock_bh(&in_dev->mc_tomb_lock);
 	pmc_prev = NULL;
@@ -1149,16 +1154,26 @@ static void igmpv3_del_delrec(struct in_device *in_dev, __be32 multiaddr)
 			in_dev->mc_tomb = pmc->next;
 	}
 	spin_unlock_bh(&in_dev->mc_tomb_lock);
+
+	spin_lock_bh(&im->lock);
 	if (pmc) {
-		for (psf = pmc->tomb; psf; psf = psf_next) {
-			psf_next = psf->sf_next;
-			kfree(psf);
+		im->interface = pmc->interface;
+		im->crcount = in_dev->mr_qrv ?: net->ipv4.sysctl_igmp_qrv;
+		im->sfmode = pmc->sfmode;
+		if (pmc->sfmode == MCAST_INCLUDE) {
+			im->tomb = pmc->tomb;
+			im->sources = pmc->sources;
+			for (psf = im->sources; psf; psf = psf->sf_next)
+				psf->sf_crcount = im->crcount;
 		}
 		in_dev_put(pmc->interface);
-		kfree(pmc);
 	}
+	spin_unlock_bh(&im->lock);
 }
 
+/*
+ * flush ip_mc_list deleted records
+ */
 static void igmpv3_clear_delrec(struct in_device *in_dev)
 {
 	struct ip_mc_list *pmc, *nextpmc;
@@ -1366,7 +1381,7 @@ void ip_mc_inc_group(struct in_device *in_dev, __be32 addr)
 	ip_mc_hash_add(in_dev, im);
 
 #ifdef CONFIG_IP_MULTICAST
-	igmpv3_del_delrec(in_dev, im->multiaddr);
+	igmpv3_del_delrec(in_dev, im);
 #endif
 	igmp_group_added(im);
 	if (!in_dev->dead)
@@ -1626,8 +1641,12 @@ void ip_mc_remap(struct in_device *in_dev)
 
 	ASSERT_RTNL();
 
-	for_each_pmc_rtnl(in_dev, pmc)
+	for_each_pmc_rtnl(in_dev, pmc) {
+#ifdef CONFIG_IP_MULTICAST
+		igmpv3_del_delrec(in_dev, pmc);
+#endif
 		igmp_group_added(pmc);
+	}
 }
 
 /* Device going down */
@@ -1648,7 +1667,6 @@ void ip_mc_down(struct in_device *in_dev)
 	in_dev->mr_gq_running = 0;
 	if (del_timer(&in_dev->mr_gq_timer))
 		__in_dev_put(in_dev);
-	igmpv3_clear_delrec(in_dev);
 #endif
 
 	ip_mc_dec_group(in_dev, IGMP_ALL_HOSTS);
@@ -1688,8 +1706,12 @@ void ip_mc_up(struct in_device *in_dev)
 #endif
 	ip_mc_inc_group(in_dev, IGMP_ALL_HOSTS);
 
-	for_each_pmc_rtnl(in_dev, pmc)
+	for_each_pmc_rtnl(in_dev, pmc) {
+#ifdef CONFIG_IP_MULTICAST
+		igmpv3_del_delrec(in_dev, pmc);
+#endif
 		igmp_group_added(pmc);
+	}
 }
 
 /*
@@ -1704,13 +1726,13 @@ void ip_mc_destroy_dev(struct in_device *in_dev)
 	/* Deactivate timers */
 	ip_mc_down(in_dev);
+#ifdef CONFIG_IP_MULTICAST
+	igmpv3_clear_delrec(in_dev);
+#endif
 
 	while ((i = rtnl_dereference(in_dev->mc_list)) != NULL) {
 		in_dev->mc_list = i->next_rcu;
 		in_dev->mc_count--;
-
-		/* We've dropped the groups in ip_mc_down already */
-		ip_mc_clear_src(i);
 		ip_ma_put(i);
 	}
 }
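The flowi4_uid assignments in the icmp.c hunks above, and the sk->sk_uid / arg->uid plumbing in inet_connection_sock.c and ip_output.c below, belong to the UID-based routing work: the owning user of a socket becomes part of the flow key so that ip rules can select routing tables per UID. Reduced to its core, a route lookup now carries (illustrative sketch, not a hunk from this diff):

	struct flowi4 fl4 = {};

	fl4.daddr = daddr;
	fl4.flowi4_mark = mark;
	/* sock_net_uid() maps a NULL sk (e.g. ICMP errors generated by
	 * the stack itself) to the netns overflow UID.
	 */
	fl4.flowi4_uid = sock_net_uid(net, sk);
	rt = ip_route_output_key(net, &fl4);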
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index 61a9deec2993..d5d3ead0a6c3 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -415,7 +415,7 @@ struct dst_entry *inet_csk_route_req(const struct sock *sk,
 			   sk->sk_protocol, inet_sk_flowi_flags(sk),
 			   (opt && opt->opt.srr) ? opt->opt.faddr : ireq->ir_rmt_addr,
 			   ireq->ir_loc_addr, ireq->ir_rmt_port,
-			   htons(ireq->ir_num));
+			   htons(ireq->ir_num), sk->sk_uid);
 	security_req_classify_flow(req, flowi4_to_flowi(fl4));
 	rt = ip_route_output_flow(net, fl4, sk);
 	if (IS_ERR(rt))
@@ -452,7 +452,7 @@ struct dst_entry *inet_csk_route_child_sock(const struct sock *sk,
 			   sk->sk_protocol, inet_sk_flowi_flags(sk),
 			   (opt && opt->opt.srr) ? opt->opt.faddr : ireq->ir_rmt_addr,
 			   ireq->ir_loc_addr, ireq->ir_rmt_port,
-			   htons(ireq->ir_num));
+			   htons(ireq->ir_num), sk->sk_uid);
 	security_req_classify_flow(req, flowi4_to_flowi(fl4));
 	rt = ip_route_output_flow(net, fl4, sk);
 	if (IS_ERR(rt))
diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
index e4d16fc5bbb3..4dea33e5f295 100644
--- a/net/ipv4/inet_diag.c
+++ b/net/ipv4/inet_diag.c
@@ -200,6 +200,15 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk,
 	if (sock_diag_put_meminfo(sk, skb, INET_DIAG_SKMEMINFO))
 		goto errout;
 
+	/*
+	 * RAW sockets might have user-defined protocols assigned,
+	 * so report the one supplied on socket creation.
+	 */
+	if (sk->sk_type == SOCK_RAW) {
+		if (nla_put_u8(skb, INET_DIAG_PROTOCOL, sk->sk_protocol))
+			goto errout;
+	}
+
 	if (!icsk) {
 		handler->idiag_get_info(sk, r, NULL);
 		goto out;
@@ -852,10 +861,11 @@ void inet_diag_dump_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *skb,
 			 struct netlink_callback *cb,
 			 const struct inet_diag_req_v2 *r, struct nlattr *bc)
 {
+	bool net_admin = netlink_net_capable(cb->skb, CAP_NET_ADMIN);
 	struct net *net = sock_net(skb->sk);
-	int i, num, s_i, s_num;
 	u32 idiag_states = r->idiag_states;
-	bool net_admin = netlink_net_capable(cb->skb, CAP_NET_ADMIN);
+	int i, num, s_i, s_num;
+	struct sock *sk;
 
 	if (idiag_states & TCPF_SYN_RECV)
 		idiag_states |= TCPF_NEW_SYN_RECV;
@@ -863,16 +873,15 @@ void inet_diag_dump_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *skb,
 	s_num = num = cb->args[2];
 
 	if (cb->args[0] == 0) {
-		if (!(idiag_states & TCPF_LISTEN))
+		if (!(idiag_states & TCPF_LISTEN) || r->id.idiag_dport)
 			goto skip_listen_ht;
 
 		for (i = s_i; i < INET_LHTABLE_SIZE; i++) {
 			struct inet_listen_hashbucket *ilb;
-			struct sock *sk;
 
 			num = 0;
 			ilb = &hashinfo->listening_hash[i];
-			spin_lock_bh(&ilb->lock);
+			spin_lock(&ilb->lock);
 			sk_for_each(sk, &ilb->head) {
 				struct inet_sock *inet = inet_sk(sk);
 
@@ -892,26 +901,18 @@ void inet_diag_dump_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *skb,
 				    r->id.idiag_sport)
 					goto next_listen;
 
-				if (r->id.idiag_dport ||
-				    cb->args[3] > 0)
-					goto next_listen;
-
 				if (inet_csk_diag_dump(sk, skb, cb, r,
 						       bc, net_admin) < 0) {
-					spin_unlock_bh(&ilb->lock);
+					spin_unlock(&ilb->lock);
 					goto done;
 				}
 
 next_listen:
-				cb->args[3] = 0;
-				cb->args[4] = 0;
 				++num;
 			}
-			spin_unlock_bh(&ilb->lock);
+			spin_unlock(&ilb->lock);
 
 			s_num = 0;
-			cb->args[3] = 0;
-			cb->args[4] = 0;
 		}
 skip_listen_ht:
 		cb->args[0] = 1;
@@ -921,13 +922,14 @@ skip_listen_ht:
 	if (!(idiag_states & ~TCPF_LISTEN))
 		goto out;
 
+#define SKARR_SZ 16
 	for (i = s_i; i <= hashinfo->ehash_mask; i++) {
 		struct inet_ehash_bucket *head = &hashinfo->ehash[i];
 		spinlock_t *lock = inet_ehash_lockp(hashinfo, i);
 		struct hlist_nulls_node *node;
-		struct sock *sk;
-
-		num = 0;
+		struct sock *sk_arr[SKARR_SZ];
+		int num_arr[SKARR_SZ];
+		int idx, accum, res;
 
 		if (hlist_nulls_empty(&head->chain))
 			continue;
@@ -935,9 +937,12 @@ skip_listen_ht:
 		if (i > s_i)
 			s_num = 0;
 
+next_chunk:
+		num = 0;
+		accum = 0;
 		spin_lock_bh(lock);
 		sk_nulls_for_each(sk, node, &head->chain) {
-			int state, res;
+			int state;
 
 			if (!net_eq(sock_net(sk), net))
 				continue;
@@ -961,21 +966,35 @@ skip_listen_ht:
 			if (!inet_diag_bc_sk(bc, sk))
 				goto next_normal;
 
-			res = sk_diag_fill(sk, skb, r,
+			sock_hold(sk);
+			num_arr[accum] = num;
+			sk_arr[accum] = sk;
+			if (++accum == SKARR_SZ)
+				break;
+next_normal:
+			++num;
+		}
+		spin_unlock_bh(lock);
+		res = 0;
+		for (idx = 0; idx < accum; idx++) {
+			if (res >= 0) {
+				res = sk_diag_fill(sk_arr[idx], skb, r,
 					   sk_user_ns(NETLINK_CB(cb->skb).sk),
 					   NETLINK_CB(cb->skb).portid,
 					   cb->nlh->nlmsg_seq, NLM_F_MULTI,
 					   cb->nlh, net_admin);
-			if (res < 0) {
-				spin_unlock_bh(lock);
-				goto done;
+				if (res < 0)
+					num = num_arr[idx];
 			}
-next_normal:
-			++num;
+			sock_gen_put(sk_arr[idx]);
 		}
-
-		spin_unlock_bh(lock);
+		if (res < 0)
+			break;
 		cond_resched();
+		if (accum == SKARR_SZ) {
+			s_num = num + 1;
+			goto next_chunk;
+		}
 	}
 
 done:
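The inet_diag_dump_icsk() rework above stops building netlink messages while holding the ehash bucket lock. Per pass, up to SKARR_SZ matching sockets are grabbed with sock_hold() and remembered together with their position; the lock is then dropped, sk_diag_fill() runs on the stashed sockets, and the walk resumes from the saved position. The general shape of the technique (simplified; matches_filter() and fill_message() are hypothetical stand-ins):

	struct sock *sk_arr[SKARR_SZ];
	int accum = 0, idx;

	spin_lock_bh(lock);
	sk_nulls_for_each(sk, node, &head->chain) {
		if (!matches_filter(sk))
			continue;
		sock_hold(sk);			/* keep sk valid past unlock */
		sk_arr[accum++] = sk;
		if (accum == SKARR_SZ)
			break;			/* chunk full, resume later */
	}
	spin_unlock_bh(lock);

	for (idx = 0; idx < accum; idx++) {
		fill_message(sk_arr[idx]);	/* no bucket lock held here */
		sock_gen_put(sk_arr[idx]);
	}

This keeps lock hold times bounded even when a filter matches many sockets in a single chain.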
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index 77c20a489218..ca97835bfec4 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -25,6 +25,7 @@
 #include <net/inet_hashtables.h>
 #include <net/secure_seq.h>
 #include <net/ip.h>
+#include <net/tcp.h>
 #include <net/sock_reuseport.h>
 
 static u32 inet_ehashfn(const struct net *net, const __be32 laddr,
@@ -172,7 +173,7 @@ EXPORT_SYMBOL_GPL(__inet_inherit_port);
 
 static inline int compute_score(struct sock *sk, struct net *net,
 				const unsigned short hnum, const __be32 daddr,
-				const int dif)
+				const int dif, bool exact_dif)
 {
 	int score = -1;
 	struct inet_sock *inet = inet_sk(sk);
@@ -186,7 +187,7 @@ static inline int compute_score(struct sock *sk, struct net *net,
 			return -1;
 		score += 4;
 	}
-	if (sk->sk_bound_dev_if) {
+	if (sk->sk_bound_dev_if || exact_dif) {
 		if (sk->sk_bound_dev_if != dif)
 			return -1;
 		score += 4;
@@ -215,11 +216,12 @@ struct sock *__inet_lookup_listener(struct net *net,
 	unsigned int hash = inet_lhashfn(net, hnum);
 	struct inet_listen_hashbucket *ilb = &hashinfo->listening_hash[hash];
 	int score, hiscore = 0, matches = 0, reuseport = 0;
+	bool exact_dif = inet_exact_dif_match(net, skb);
 	struct sock *sk, *result = NULL;
 	u32 phash = 0;
 
 	sk_for_each_rcu(sk, &ilb->head) {
-		score = compute_score(sk, net, hnum, daddr, dif);
+		score = compute_score(sk, net, hnum, daddr, dif, exact_dif);
 		if (score > hiscore) {
 			reuseport = sk->sk_reuseport;
 			if (reuseport) {
diff --git a/net/ipv4/ip_forward.c b/net/ipv4/ip_forward.c
index 8b4ffd216839..9f0a7b96646f 100644
--- a/net/ipv4/ip_forward.c
+++ b/net/ipv4/ip_forward.c
@@ -117,7 +117,7 @@ int ip_forward(struct sk_buff *skb)
 	if (opt->is_strictroute && rt->rt_uses_gateway)
 		goto sr_failed;
 
-	IPCB(skb)->flags |= IPSKB_FORWARDED | IPSKB_FRAG_SEGS;
+	IPCB(skb)->flags |= IPSKB_FORWARDED;
 	mtu = ip_dst_mtu_maybe_forward(&rt->dst, true);
 	if (ip_exceeds_mtu(skb, mtu)) {
 		IP_INC_STATS(net, IPSTATS_MIB_FRAGFAILS);
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index 576f705d8180..78fd62048335 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -113,8 +113,8 @@ MODULE_PARM_DESC(log_ecn_error, "Log packets received with corrupted ECN");
 
 static struct rtnl_link_ops ipgre_link_ops __read_mostly;
 static int ipgre_tunnel_init(struct net_device *dev);
-static int ipgre_net_id __read_mostly;
-static int gre_tap_net_id __read_mostly;
+static unsigned int ipgre_net_id __read_mostly;
+static unsigned int gre_tap_net_id __read_mostly;
 
 static void ipgre_err(struct sk_buff *skb, u32 info,
 		      const struct tnl_ptk_info *tpi)
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 05d105832bdb..9ffc2625cddd 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -74,6 +74,7 @@
 #include <net/checksum.h>
 #include <net/inetpeer.h>
 #include <net/lwtunnel.h>
+#include <linux/bpf-cgroup.h>
 #include <linux/igmp.h>
 #include <linux/netfilter_ipv4.h>
 #include <linux/netfilter_bridge.h>
@@ -107,6 +108,8 @@ int __ip_local_out(struct net *net, struct sock *sk, struct sk_buff *skb)
 	if (unlikely(!skb))
 		return 0;
 
+	skb->protocol = htons(ETH_P_IP);
+
 	return nf_hook(NFPROTO_IPV4, NF_INET_LOCAL_OUT,
 		       net, sk, skb, NULL, skb_dst(skb)->dev,
 		       dst_output);
@@ -239,19 +242,23 @@ static int ip_finish_output_gso(struct net *net, struct sock *sk,
 	struct sk_buff *segs;
 	int ret = 0;
 
-	/* common case: fragmentation of segments is not allowed,
-	 * or seglen is <= mtu
+	/* common case: seglen is <= mtu
 	 */
-	if (((IPCB(skb)->flags & IPSKB_FRAG_SEGS) == 0) ||
-	    skb_gso_validate_mtu(skb, mtu))
+	if (skb_gso_validate_mtu(skb, mtu))
 		return ip_finish_output2(net, sk, skb);
 
-	/* Slowpath - GSO segment length is exceeding the dst MTU.
+	/* Slowpath - GSO segment length exceeds the egress MTU.
 	 *
-	 * This can happen in two cases:
-	 * 1) TCP GRO packet, DF bit not set
-	 * 2) skb arrived via virtio-net, we thus get TSO/GSO skbs directly
-	 * from host network stack.
+	 * This can happen in several cases:
+	 *  - Forwarding of a TCP GRO skb, when DF flag is not set.
+	 *  - Forwarding of an skb that arrived on a virtualization interface
+	 *    (virtio-net/vhost/tap) with TSO/GSO size set by other network
+	 *    stack.
+	 *  - Local GSO skb transmitted on an NETIF_F_TSO tunnel stacked over an
+	 *    interface with a smaller MTU.
+	 *  - Arriving GRO skb (or GSO skb in a virtualized environment) that is
+	 *    bridged to a NETIF_F_TSO tunnel stacked over an interface with an
+	 *    insufficent MTU.
 	 */
 	features = netif_skb_features(skb);
 	BUILD_BUG_ON(sizeof(*IPCB(skb)) > SKB_SGO_CB_OFFSET);
@@ -281,6 +288,13 @@ static int ip_finish_output_gso(struct net *net, struct sock *sk,
 static int ip_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
 {
 	unsigned int mtu;
+	int ret;
+
+	ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);
+	if (ret) {
+		kfree_skb(skb);
+		return ret;
+	}
 
 #if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
 	/* Policy lookup after SNAT yielded a new policy */
@@ -299,6 +313,20 @@ static int ip_finish_output(struct net *net, struct sock *sk, struct sk_buff *sk
 	return ip_finish_output2(net, sk, skb);
 }
 
+static int ip_mc_finish_output(struct net *net, struct sock *sk,
+			       struct sk_buff *skb)
+{
+	int ret;
+
+	ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);
+	if (ret) {
+		kfree_skb(skb);
+		return ret;
+	}
+
+	return dev_loopback_xmit(net, sk, skb);
+}
+
 int ip_mc_output(struct net *net, struct sock *sk, struct sk_buff *skb)
 {
 	struct rtable *rt = skb_rtable(skb);
@@ -336,7 +364,7 @@ int ip_mc_output(struct net *net, struct sock *sk, struct sk_buff *skb)
 			if (newskb)
 				NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING,
 					net, sk, newskb, NULL, newskb->dev,
-					dev_loopback_xmit);
+					ip_mc_finish_output);
 		}
 
 		/* Multicasts with ttl 0 must not go beyond the host */
@@ -352,7 +380,7 @@ int ip_mc_output(struct net *net, struct sock *sk, struct sk_buff *skb)
 		if (newskb)
 			NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING,
 				net, sk, newskb, NULL, newskb->dev,
-				dev_loopback_xmit);
+				ip_mc_finish_output);
 	}
 
 	return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING,
@@ -538,7 +566,6 @@ int ip_do_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
 {
 	struct iphdr *iph;
 	int ptr;
-	struct net_device *dev;
 	struct sk_buff *skb2;
 	unsigned int mtu, hlen, left, len, ll_rs;
 	int offset;
@@ -546,8 +573,6 @@ int ip_do_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
 	struct rtable *rt = skb_rtable(skb);
 	int err = 0;
 
-	dev = rt->dst.dev;
-
 	/* for offloaded checksums cleanup checksum before fragmentation */
 	if (skb->ip_summed == CHECKSUM_PARTIAL &&
 	    (err = skb_checksum_help(skb)))
@@ -580,7 +605,7 @@ int ip_do_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
 	 */
 	if (skb_has_frag_list(skb)) {
 		struct sk_buff *frag, *frag2;
-		int first_len = skb_pagelen(skb);
+		unsigned int first_len = skb_pagelen(skb);
 
 		if (first_len - hlen > mtu ||
 		    ((first_len - hlen) & 7) ||
@@ -1582,7 +1607,8 @@ void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb,
 	}
 
 	oif = arg->bound_dev_if;
-	oif = oif ? : skb->skb_iif;
+	if (!oif && netif_index_is_l3_master(net, skb->skb_iif))
+		oif = skb->skb_iif;
 
 	flowi4_init_output(&fl4, oif,
 			   IP4_REPLY_MARK(net, skb->mark),
@@ -1590,7 +1616,8 @@ void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb,
 			   RT_SCOPE_UNIVERSE, ip_hdr(skb)->protocol,
 			   ip_reply_arg_flowi_flags(arg),
 			   daddr, saddr,
-			   tcp_hdr(skb)->source, tcp_hdr(skb)->dest);
+			   tcp_hdr(skb)->source, tcp_hdr(skb)->dest,
+			   arg->uid);
 	security_skb_classify_flow(skb, flowi4_to_flowi(&fl4));
 	rt = ip_route_output_key(net, &fl4);
 	if (IS_ERR(rt))
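With BPF_CGROUP_RUN_PROG_INET_EGRESS in ip_finish_output() (and in the new ip_mc_finish_output() for looped-back multicast copies), a BPF_PROG_TYPE_CGROUP_SKB program attached to the sending task's cgroup sees every IPv4 egress skb after netfilter POST_ROUTING. A minimal program of that type, in restricted C (illustrative only; SEC() is the conventional loader annotation from the kernel samples, and the verdict convention is 1 = allow, 0 = drop):

	SEC("cgroup/skb")
	int ipv4_egress_filter(struct __sk_buff *skb)
	{
		/* toy policy: drop anything shorter than an IPv4 header */
		if (skb->len < 20)
			return 0;	/* dropped; sender sees an error */
		return 1;		/* allowed */
	}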
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index af4919792b6a..8b13881ed064 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -97,8 +97,19 @@ static void ip_cmsg_recv_retopts(struct msghdr *msg, struct sk_buff *skb)
 	put_cmsg(msg, SOL_IP, IP_RETOPTS, opt->optlen, opt->__data);
 }
 
+static void ip_cmsg_recv_fragsize(struct msghdr *msg, struct sk_buff *skb)
+{
+	int val;
+
+	if (IPCB(skb)->frag_max_size == 0)
+		return;
+
+	val = IPCB(skb)->frag_max_size;
+	put_cmsg(msg, SOL_IP, IP_RECVFRAGSIZE, sizeof(val), &val);
+}
+
 static void ip_cmsg_recv_checksum(struct msghdr *msg, struct sk_buff *skb,
-				  int offset)
+				  int tlen, int offset)
 {
 	__wsum csum = skb->csum;
 
@@ -106,8 +117,9 @@ static void ip_cmsg_recv_checksum(struct msghdr *msg, struct sk_buff *skb,
 		return;
 
 	if (offset != 0)
-		csum = csum_sub(csum, csum_partial(skb_transport_header(skb),
-						   offset, 0));
+		csum = csum_sub(csum,
+				csum_partial(skb_transport_header(skb) + tlen,
+					     offset, 0));
 
 	put_cmsg(msg, SOL_IP, IP_CHECKSUM, sizeof(__wsum), &csum);
 }
@@ -152,10 +164,10 @@ static void ip_cmsg_recv_dstaddr(struct msghdr *msg, struct sk_buff *skb)
 	put_cmsg(msg, SOL_IP, IP_ORIGDSTADDR, sizeof(sin), &sin);
 }
 
-void ip_cmsg_recv_offset(struct msghdr *msg, struct sk_buff *skb,
-			 int offset)
+void ip_cmsg_recv_offset(struct msghdr *msg, struct sock *sk,
+			 struct sk_buff *skb, int tlen, int offset)
 {
-	struct inet_sock *inet = inet_sk(skb->sk);
+	struct inet_sock *inet = inet_sk(sk);
 	unsigned int flags = inet->cmsg_flags;
 
 	/* Ordered by supposed usage frequency */
@@ -216,7 +228,10 @@ void ip_cmsg_recv_offset(struct msghdr *msg, struct sk_buff *skb,
 	}
 
 	if (flags & IP_CMSG_CHECKSUM)
-		ip_cmsg_recv_checksum(msg, skb, offset);
+		ip_cmsg_recv_checksum(msg, skb, tlen, offset);
+
+	if (flags & IP_CMSG_RECVFRAGSIZE)
+		ip_cmsg_recv_fragsize(msg, skb);
 }
 EXPORT_SYMBOL(ip_cmsg_recv_offset);
 
@@ -613,6 +628,7 @@ static int do_ip_setsockopt(struct sock *sk, int level,
 	case IP_MULTICAST_LOOP:
 	case IP_RECVORIGDSTADDR:
 	case IP_CHECKSUM:
+	case IP_RECVFRAGSIZE:
 		if (optlen >= sizeof(int)) {
 			if (get_user(val, (int __user *) optval))
 				return -EFAULT;
@@ -725,6 +741,14 @@ static int do_ip_setsockopt(struct sock *sk, int level,
 			}
 		}
 		break;
+	case IP_RECVFRAGSIZE:
+		if (sk->sk_type != SOCK_RAW && sk->sk_type != SOCK_DGRAM)
+			goto e_inval;
+		if (val)
+			inet->cmsg_flags |= IP_CMSG_RECVFRAGSIZE;
+		else
+			inet->cmsg_flags &= ~IP_CMSG_RECVFRAGSIZE;
+		break;
 	case IP_TOS:	/* This sets both TOS and Precedence */
 		if (sk->sk_type == SOCK_STREAM) {
 			val &= ~INET_ECN_MASK;
@@ -1356,6 +1380,9 @@ static int do_ip_getsockopt(struct sock *sk, int level, int optname,
 	case IP_CHECKSUM:
 		val = (inet->cmsg_flags & IP_CMSG_CHECKSUM) != 0;
 		break;
+	case IP_RECVFRAGSIZE:
+		val = (inet->cmsg_flags & IP_CMSG_RECVFRAGSIZE) != 0;
+		break;
 	case IP_TOS:
 		val = inet->tos;
 		break;
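IP_RECVFRAGSIZE is a new SOL_IP ancillary message (RAW and DGRAM sockets only) reporting IPCB(skb)->frag_max_size, i.e. the largest fragment that made up a locally reassembled datagram; unfragmented traffic generates no cmsg. Hypothetical userspace consumption:

	int on = 1;
	char data[2048], cbuf[CMSG_SPACE(sizeof(int))];
	struct iovec iov = { .iov_base = data, .iov_len = sizeof(data) };
	struct msghdr msg = {
		.msg_iov = &iov, .msg_iovlen = 1,
		.msg_control = cbuf, .msg_controllen = sizeof(cbuf),
	};
	struct cmsghdr *c;

	setsockopt(fd, SOL_IP, IP_RECVFRAGSIZE, &on, sizeof(on));
	if (recvmsg(fd, &msg, 0) < 0)
		return;
	for (c = CMSG_FIRSTHDR(&msg); c; c = CMSG_NXTHDR(&msg, c))
		if (c->cmsg_level == SOL_IP && c->cmsg_type == IP_RECVFRAGSIZE)
			printf("reassembled, max fragment %d bytes\n",
			       *(int *)CMSG_DATA(c));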
diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c
index 5719d6ba0824..823abaef006b 100644
--- a/net/ipv4/ip_tunnel.c
+++ b/net/ipv4/ip_tunnel.c
@@ -358,6 +358,7 @@ static struct ip_tunnel *ip_tunnel_create(struct net *net,
 {
 	struct ip_tunnel *nt;
 	struct net_device *dev;
+	int t_hlen;
 
 	BUG_ON(!itn->fb_tunnel_dev);
 	dev = __ip_tunnel_create(net, itn->fb_tunnel_dev->rtnl_link_ops, parms);
@@ -367,6 +368,9 @@ static struct ip_tunnel *ip_tunnel_create(struct net *net,
 	dev->mtu = ip_tunnel_bind_dev(dev);
 
 	nt = netdev_priv(dev);
+	t_hlen = nt->hlen + sizeof(struct iphdr);
+	dev->min_mtu = ETH_MIN_MTU;
+	dev->max_mtu = 0xFFF8 - dev->hard_header_len - t_hlen;
 	ip_tunnel_add(itn, nt);
 	return nt;
 }
@@ -929,7 +933,7 @@ int __ip_tunnel_change_mtu(struct net_device *dev, int new_mtu, bool strict)
 	int t_hlen = tunnel->hlen + sizeof(struct iphdr);
 	int max_mtu = 0xFFF8 - dev->hard_header_len - t_hlen;
 
-	if (new_mtu < 68)
+	if (new_mtu < ETH_MIN_MTU)
 		return -EINVAL;
 
 	if (new_mtu > max_mtu) {
@@ -990,7 +994,7 @@ int ip_tunnel_get_iflink(const struct net_device *dev)
 }
 EXPORT_SYMBOL(ip_tunnel_get_iflink);
 
-int ip_tunnel_init_net(struct net *net, int ip_tnl_net_id,
+int ip_tunnel_init_net(struct net *net, unsigned int ip_tnl_net_id,
 		       struct rtnl_link_ops *ops, char *devname)
 {
 	struct ip_tunnel_net *itn = net_generic(net, ip_tnl_net_id);
@@ -1192,7 +1196,7 @@ void ip_tunnel_uninit(struct net_device *dev)
 EXPORT_SYMBOL_GPL(ip_tunnel_uninit);
 
 /* Do least required initialization, rest of init is done in tunnel_init call */
-void ip_tunnel_setup(struct net_device *dev, int net_id)
+void ip_tunnel_setup(struct net_device *dev, unsigned int net_id)
 {
 	struct ip_tunnel *tunnel = netdev_priv(dev);
 
 	tunnel->ip_tnl_net_id = net_id;
diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c
index 777bc1883870..fed3d29f9eb3 100644
--- a/net/ipv4/ip_tunnel_core.c
+++ b/net/ipv4/ip_tunnel_core.c
@@ -63,7 +63,6 @@ void iptunnel_xmit(struct sock *sk, struct rtable *rt, struct sk_buff *skb,
 	int pkt_len = skb->len - skb_inner_network_offset(skb);
 	struct net *net = dev_net(rt->dst.dev);
 	struct net_device *dev = skb->dev;
-	int skb_iif = skb->skb_iif;
 	struct iphdr *iph;
 	int err;
 
@@ -73,16 +72,6 @@ void iptunnel_xmit(struct sock *sk, struct rtable *rt, struct sk_buff *skb,
 	skb_dst_set(skb, &rt->dst);
 	memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
 
-	if (skb_iif && !(df & htons(IP_DF))) {
-		/* Arrived from an ingress interface, got encapsulated, with
-		 * fragmentation of encapulating frames allowed.
-		 * If skb is gso, the resulting encapsulated network segments
-		 * may exceed dst mtu.
-		 * Allow IP Fragmentation of segments.
-		 */
-		IPCB(skb)->flags |= IPSKB_FRAG_SEGS;
-	}
-
 	/* Push down and install the IP header. */
 	skb_push(skb, sizeof(struct iphdr));
 	skb_reset_network_header(skb);
diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c
index 5d7944f394d9..8b14f1404c8f 100644
--- a/net/ipv4/ip_vti.c
+++ b/net/ipv4/ip_vti.c
@@ -46,7 +46,7 @@
 
 static struct rtnl_link_ops vti_link_ops __read_mostly;
 
-static int vti_net_id __read_mostly;
+static unsigned int vti_net_id __read_mostly;
 static int vti_tunnel_init(struct net_device *dev);
 
 static int vti_input(struct sk_buff *skb, int nexthdr, __be32 spi,
diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c
index c9392589c415..79489f017854 100644
--- a/net/ipv4/ipip.c
+++ b/net/ipv4/ipip.c
@@ -121,7 +121,7 @@ static bool log_ecn_error = true;
 module_param(log_ecn_error, bool, 0644);
 MODULE_PARM_DESC(log_ecn_error, "Log packets received with corrupted ECN");
 
-static int ipip_net_id __read_mostly;
+static unsigned int ipip_net_id __read_mostly;
 
 static int ipip_tunnel_init(struct net_device *dev);
 static struct rtnl_link_ops ipip_link_ops __read_mostly;
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index 5f006e13de56..665505d86b12 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -137,6 +137,9 @@ static int ipmr_fib_lookup(struct net *net, struct flowi4 *flp4,
 		.flags = FIB_LOOKUP_NOREF,
 	};
 
+	/* update flow if oif or iif point to device enslaved to l3mdev */
+	l3mdev_update_flow(net, flowi4_to_flowi(flp4));
+
 	err = fib_rules_lookup(net->ipv4.mr_rules_ops,
 			       flowi4_to_flowi(flp4), 0, &arg);
 	if (err < 0)
@@ -163,7 +166,9 @@ static int ipmr_rule_action(struct fib_rule *rule, struct flowi *flp,
 		return -EINVAL;
 	}
 
-	mrt = ipmr_get_table(rule->fr_net, rule->table);
+	arg->table = fib_rule_get_table(rule, arg);
+
+	mrt = ipmr_get_table(rule->fr_net, arg->table);
 	if (!mrt)
 		return -EAGAIN;
 	res->mrt = mrt;
@@ -1749,7 +1754,7 @@ static void ipmr_queue_xmit(struct net *net, struct mr_table *mrt,
 		vif->dev->stats.tx_bytes += skb->len;
 	}
 
-	IPCB(skb)->flags |= IPSKB_FORWARDED | IPSKB_FRAG_SEGS;
+	IPCB(skb)->flags |= IPSKB_FORWARDED;
 
 	/* RFC1584 teaches, that DVMRP/PIM router must deliver packets locally
 	 * not only before forwarding, but after forwarding on all output
@@ -1809,6 +1814,12 @@ static void ip_mr_forward(struct net *net, struct mr_table *mrt,
 
 	/* Wrong interface: drop packet and (maybe) send PIM assert. */
 	if (mrt->vif_table[vif].dev != skb->dev) {
+		struct net_device *mdev;
+
+		mdev = l3mdev_master_dev_rcu(mrt->vif_table[vif].dev);
+		if (mdev == skb->dev)
+			goto forward;
+
 		if (rt_is_output_route(skb_rtable(skb))) {
 			/* It is our own packet, looped back.
 			 * Very complicated situation...
@@ -2053,7 +2064,7 @@ static int pim_rcv(struct sk_buff *skb)
 		goto drop;
 
 	pim = (struct pimreghdr *)skb_transport_header(skb);
-	if (pim->type != ((PIM_VERSION << 4) | (PIM_REGISTER)) ||
+	if (pim->type != ((PIM_VERSION << 4) | (PIM_TYPE_REGISTER)) ||
 	    (pim->flags & PIM_NULL_REGISTER) ||
 	    (ip_compute_csum((void *)pim, sizeof(*pim)) != 0 &&
 	     csum_fold(skb_checksum(skb, 0, skb->len, 0))))
diff --git a/net/ipv4/netfilter.c b/net/ipv4/netfilter.c
index c3776ff6749f..b3cc1335adbc 100644
--- a/net/ipv4/netfilter.c
+++ b/net/ipv4/netfilter.c
@@ -24,10 +24,11 @@ int ip_route_me_harder(struct net *net, struct sk_buff *skb, unsigned int addr_t
 	struct flowi4 fl4 = {};
 	__be32 saddr = iph->saddr;
 	__u8 flags = skb->sk ? inet_sk_flowi_flags(skb->sk) : 0;
+	struct net_device *dev = skb_dst(skb)->dev;
 	unsigned int hh_len;
 
 	if (addr_type == RTN_UNSPEC)
-		addr_type = inet_addr_type(net, saddr);
+		addr_type = inet_addr_type_dev_table(net, dev, saddr);
 	if (addr_type == RTN_LOCAL || addr_type == RTN_UNICAST)
 		flags |= FLOWI_FLAG_ANYSRC;
 	else
@@ -40,6 +41,8 @@ int ip_route_me_harder(struct net *net, struct sk_buff *skb, unsigned int addr_t
 	fl4.saddr = saddr;
 	fl4.flowi4_tos = RT_TOS(iph->tos);
 	fl4.flowi4_oif = skb->sk ? skb->sk->sk_bound_dev_if : 0;
+	if (!fl4.flowi4_oif)
+		fl4.flowi4_oif = l3mdev_master_ifindex(dev);
 	fl4.flowi4_mark = skb->mark;
 	fl4.flowi4_flags = flags;
 	rt = ip_route_output_key(net, &fl4);
diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig
index d613309e3e5d..c11eb1744ab1 100644
--- a/net/ipv4/netfilter/Kconfig
+++ b/net/ipv4/netfilter/Kconfig
@@ -25,6 +25,12 @@ config NF_CONNTRACK_IPV4
 
 	  To compile it as a module, choose M here.  If unsure, say N.
 
+config NF_SOCKET_IPV4
+	tristate "IPv4 socket lookup support"
+	help
+	  This option enables the IPv4 socket lookup infrastructure. This
+	  is required by the iptables socket match.
+
 if NF_TABLES
 
 config NF_TABLES_IPV4
@@ -54,6 +60,14 @@ config NFT_DUP_IPV4
 	help
 	  This module enables IPv4 packet duplication support for nf_tables.
 
+config NFT_FIB_IPV4
+	select NFT_FIB
+	tristate "nf_tables fib / ip route lookup support"
+	help
+	  This module enables IPv4 FIB lookups, e.g. for reverse path filtering.
+	  It also allows query of the FIB for the route type, e.g. local, unicast,
+	  multicast or blackhole.
+
 endif # NF_TABLES_IPV4
 
 config NF_TABLES_ARP
diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile
index 853328f8fd05..f462fee66ac8 100644
--- a/net/ipv4/netfilter/Makefile
+++ b/net/ipv4/netfilter/Makefile
@@ -14,6 +14,8 @@ obj-$(CONFIG_NF_NAT_IPV4) += nf_nat_ipv4.o
 # defrag
 obj-$(CONFIG_NF_DEFRAG_IPV4) += nf_defrag_ipv4.o
 
+obj-$(CONFIG_NF_SOCKET_IPV4) += nf_socket_ipv4.o
+
 # logging
 obj-$(CONFIG_NF_LOG_ARP) += nf_log_arp.o
 obj-$(CONFIG_NF_LOG_IPV4) += nf_log_ipv4.o
@@ -34,6 +36,7 @@ obj-$(CONFIG_NF_TABLES_IPV4) += nf_tables_ipv4.o
 obj-$(CONFIG_NFT_CHAIN_ROUTE_IPV4) += nft_chain_route_ipv4.o
 obj-$(CONFIG_NFT_CHAIN_NAT_IPV4) += nft_chain_nat_ipv4.o
 obj-$(CONFIG_NFT_REJECT_IPV4) += nft_reject_ipv4.o
+obj-$(CONFIG_NFT_FIB_IPV4) += nft_fib_ipv4.o
 obj-$(CONFIG_NFT_MASQ_IPV4) += nft_masq_ipv4.o
 obj-$(CONFIG_NFT_REDIR_IPV4) += nft_redir_ipv4.o
 obj-$(CONFIG_NFT_DUP_IPV4) += nft_dup_ipv4.o
diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c
index b31df597fd37..1258a9ab62ef 100644
--- a/net/ipv4/netfilter/arp_tables.c
+++ b/net/ipv4/netfilter/arp_tables.c
@@ -217,11 +217,7 @@ unsigned int arpt_do_table(struct sk_buff *skb,
 	 */
 	e = get_entry(table_base, private->hook_entry[hook]);
 
-	acpar.net     = state->net;
-	acpar.in      = state->in;
-	acpar.out     = state->out;
-	acpar.hooknum = hook;
-	acpar.family  = NFPROTO_ARP;
+	acpar.state   = state;
 	acpar.hotdrop = false;
 
 	arp = arp_hdr(skb);
@@ -415,17 +411,15 @@ static inline int check_target(struct arpt_entry *e, const char *name)
 }
 
 static inline int
-find_check_entry(struct arpt_entry *e, const char *name, unsigned int size)
+find_check_entry(struct arpt_entry *e, const char *name, unsigned int size,
+		 struct xt_percpu_counter_alloc_state *alloc_state)
 {
 	struct xt_entry_target *t;
 	struct xt_target *target;
-	unsigned long pcnt;
 	int ret;
 
-	pcnt = xt_percpu_counter_alloc();
-	if (IS_ERR_VALUE(pcnt))
+	if (!xt_percpu_counter_alloc(alloc_state, &e->counters))
 		return -ENOMEM;
-	e->counters.pcnt = pcnt;
 
 	t = arpt_get_target(e);
 	target = xt_request_find_target(NFPROTO_ARP, t->u.user.name,
@@ -443,7 +437,7 @@ find_check_entry(struct arpt_entry *e, const char *name, unsigned int size)
 err:
 	module_put(t->u.kernel.target->me);
 out:
-	xt_percpu_counter_free(e->counters.pcnt);
+	xt_percpu_counter_free(&e->counters);
 
 	return ret;
 }
@@ -523,7 +517,7 @@ static inline void cleanup_entry(struct arpt_entry *e)
 	if (par.target->destroy != NULL)
 		par.target->destroy(&par);
 	module_put(par.target->me);
-	xt_percpu_counter_free(e->counters.pcnt);
+	xt_percpu_counter_free(&e->counters);
 }
 
 /* Checks and translates the user-supplied table segment (held in
@@ -532,6 +526,7 @@ static inline void cleanup_entry(struct arpt_entry *e)
 static int translate_table(struct xt_table_info *newinfo, void *entry0,
 			   const struct arpt_replace *repl)
 {
+	struct xt_percpu_counter_alloc_state alloc_state = { 0 };
 	struct arpt_entry *iter;
 	unsigned int *offsets;
 	unsigned int i;
@@ -594,7 +589,8 @@ static int translate_table(struct xt_table_info *newinfo, void *entry0,
 	/* Finally, each sanity check must pass */
 	i = 0;
 	xt_entry_foreach(iter, entry0, newinfo->size) {
-		ret = find_check_entry(iter, repl->name, repl->size);
+		ret = find_check_entry(iter, repl->name, repl->size,
+				       &alloc_state);
 		if (ret != 0)
 			break;
 		++i;
@@ -809,7 +805,7 @@ static int get_info(struct net *net, void __user *user,
 #endif
 	t = try_then_request_module(xt_find_table_lock(net, NFPROTO_ARP, name),
 				    "arptable_%s", name);
-	if (!IS_ERR_OR_NULL(t)) {
+	if (t) {
 		struct arpt_getinfo info;
 		const struct xt_table_info *private = t->private;
 #ifdef CONFIG_COMPAT
@@ -838,7 +834,7 @@ static int get_info(struct net *net, void __user *user,
 		xt_table_unlock(t);
 		module_put(t->me);
 	} else
-		ret = t ? PTR_ERR(t) : -ENOENT;
+		ret = -ENOENT;
 #ifdef CONFIG_COMPAT
 	if (compat)
 		xt_compat_unlock(NFPROTO_ARP);
@@ -863,7 +859,7 @@ static int get_entries(struct net *net, struct arpt_get_entries __user *uptr,
 	get.name[sizeof(get.name) - 1] = '\0';
 
 	t = xt_find_table_lock(net, NFPROTO_ARP, get.name);
-	if (!IS_ERR_OR_NULL(t)) {
+	if (t) {
 		const struct xt_table_info *private = t->private;
 
 		if (get.size == private->size)
@@ -875,7 +871,7 @@ static int get_entries(struct net *net, struct arpt_get_entries __user *uptr,
 		module_put(t->me);
 		xt_table_unlock(t);
 	} else
-		ret = t ? PTR_ERR(t) : -ENOENT;
+		ret = -ENOENT;
 
 	return ret;
 }
@@ -902,8 +898,8 @@ static int __do_replace(struct net *net, const char *name,
 
 	t = try_then_request_module(xt_find_table_lock(net, NFPROTO_ARP, name),
 				    "arptable_%s", name);
-	if (IS_ERR_OR_NULL(t)) {
-		ret = t ? PTR_ERR(t) : -ENOENT;
+	if (!t) {
+		ret = -ENOENT;
 		goto free_newinfo_counters_untrans;
 	}
@@ -1018,8 +1014,8 @@ static int do_add_counters(struct net *net, const void __user *user,
 		return PTR_ERR(paddc);
 
 	t = xt_find_table_lock(net, NFPROTO_ARP, tmp.name);
-	if (IS_ERR_OR_NULL(t)) {
-		ret = t ? PTR_ERR(t) : -ENOENT;
+	if (!t) {
+		ret = -ENOENT;
 		goto free;
 	}
@@ -1201,8 +1197,8 @@ static int translate_compat_table(struct xt_table_info **pinfo,
 
 	newinfo->number = compatr->num_entries;
 	for (i = 0; i < NF_ARP_NUMHOOKS; i++) {
-		newinfo->hook_entry[i] = info->hook_entry[i];
-		newinfo->underflow[i] = info->underflow[i];
+		newinfo->hook_entry[i] = compatr->hook_entry[i];
+		newinfo->underflow[i] = compatr->underflow[i];
 	}
 	entry1 = newinfo->entries;
 	pos = entry1;
@@ -1408,7 +1404,7 @@ static int compat_get_entries(struct net *net,
 
 	xt_compat_lock(NFPROTO_ARP);
 	t = xt_find_table_lock(net, NFPROTO_ARP, get.name);
-	if (!IS_ERR_OR_NULL(t)) {
+	if (t) {
 		const struct xt_table_info *private = t->private;
 		struct xt_table_info info;
 
@@ -1423,7 +1419,7 @@ static int compat_get_entries(struct net *net,
 		module_put(t->me);
 		xt_table_unlock(t);
 	} else
-		ret = t ? PTR_ERR(t) : -ENOENT;
+		ret = -ENOENT;
 
 	xt_compat_unlock(NFPROTO_ARP);
 	return ret;
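The xt_percpu_counter_alloc() change visible in find_check_entry() above (and mirrored in ip_tables.c below) replaces one alloc_percpu() call per rule with batched allocation: translate_table() owns an xt_percpu_counter_alloc_state, and each rule is handed a slice of a larger per-cpu block until the block is exhausted, which matters for rulesets with tens of thousands of entries. The batching idea, sketched with hypothetical types (not the exact xt implementation):

	#define BLOCK_SIZE PAGE_SIZE

	struct batch_state {
		void __percpu *mem;	/* current block, NULL = none yet */
		unsigned int   off;	/* next free offset within it */
	};

	static bool batch_alloc(struct batch_state *st, unsigned long *slot)
	{
		if (!st->mem) {
			st->mem = __alloc_percpu(BLOCK_SIZE, sizeof(u64));
			if (!st->mem)
				return false;
			st->off = 0;
		}
		*slot = (unsigned long)st->mem + st->off;
		st->off += 2 * sizeof(u64);	/* packet + byte counter */
		if (st->off >= BLOCK_SIZE)
			st->mem = NULL;		/* start a new block next call */
		return true;
	}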
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index 7c00ce90adb8..308b456723f0 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -261,11 +261,7 @@ ipt_do_table(struct sk_buff *skb,
 	acpar.fragoff = ntohs(ip->frag_off) & IP_OFFSET;
 	acpar.thoff   = ip_hdrlen(skb);
 	acpar.hotdrop = false;
-	acpar.net     = state->net;
-	acpar.in      = state->in;
-	acpar.out     = state->out;
-	acpar.family  = NFPROTO_IPV4;
-	acpar.hooknum = hook;
+	acpar.state   = state;
 
 	IP_NF_ASSERT(table->valid_hooks & (1 << hook));
 	local_bh_disable();
@@ -535,7 +531,8 @@ static int check_target(struct ipt_entry *e, struct net *net, const char *name)
 
 static int
 find_check_entry(struct ipt_entry *e, struct net *net, const char *name,
-		 unsigned int size)
+		 unsigned int size,
+		 struct xt_percpu_counter_alloc_state *alloc_state)
 {
 	struct xt_entry_target *t;
 	struct xt_target *target;
 	int ret;
 	unsigned int j;
 	struct xt_mtchk_param mtpar;
 	struct xt_entry_match *ematch;
-	unsigned long pcnt;
 
-	pcnt = xt_percpu_counter_alloc();
-	if (IS_ERR_VALUE(pcnt))
+	if (!xt_percpu_counter_alloc(alloc_state, &e->counters))
 		return -ENOMEM;
-	e->counters.pcnt = pcnt;
 
 	j = 0;
 	mtpar.net	= net;
@@ -586,7 +580,7 @@ find_check_entry(struct ipt_entry *e, struct net *net, const char *name,
 		cleanup_match(ematch, net);
 	}
 
-	xt_percpu_counter_free(e->counters.pcnt);
+	xt_percpu_counter_free(&e->counters);
 
 	return ret;
 }
@@ -674,7 +668,7 @@ cleanup_entry(struct ipt_entry *e, struct net *net)
 	if (par.target->destroy != NULL)
 		par.target->destroy(&par);
 	module_put(par.target->me);
-	xt_percpu_counter_free(e->counters.pcnt);
+	xt_percpu_counter_free(&e->counters);
 }
 
 /* Checks and translates the user-supplied table segment (held in
@@ -683,6 +677,7 @@ static int
 translate_table(struct net *net, struct xt_table_info *newinfo, void *entry0,
 		const struct ipt_replace *repl)
 {
+	struct xt_percpu_counter_alloc_state alloc_state = { 0 };
 	struct ipt_entry *iter;
 	unsigned int *offsets;
 	unsigned int i;
@@ -742,7 +737,8 @@ translate_table(struct net *net, struct xt_table_info *newinfo, void *entry0,
 	/* Finally, each sanity check must pass */
 	i = 0;
 	xt_entry_foreach(iter, entry0, newinfo->size) {
-		ret = find_check_entry(iter, net, repl->name, repl->size);
+		ret = find_check_entry(iter, net, repl->name, repl->size,
+				       &alloc_state);
 		if (ret != 0)
 			break;
 		++i;
@@ -977,7 +973,7 @@ static int get_info(struct net *net, void __user *user,
 #endif
 	t = try_then_request_module(xt_find_table_lock(net, AF_INET, name),
 				    "iptable_%s", name);
-	if (!IS_ERR_OR_NULL(t)) {
+	if (t) {
 		struct ipt_getinfo info;
 		const struct xt_table_info *private = t->private;
 #ifdef CONFIG_COMPAT
@@ -1007,7 +1003,7 @@ static int get_info(struct net *net, void __user *user,
 		xt_table_unlock(t);
 		module_put(t->me);
 	} else
-		ret = t ? PTR_ERR(t) : -ENOENT;
+		ret = -ENOENT;
 #ifdef CONFIG_COMPAT
 	if (compat)
 		xt_compat_unlock(AF_INET);
@@ -1032,7 +1028,7 @@ get_entries(struct net *net, struct ipt_get_entries __user *uptr,
 	get.name[sizeof(get.name) - 1] = '\0';
 
 	t = xt_find_table_lock(net, AF_INET, get.name);
-	if (!IS_ERR_OR_NULL(t)) {
+	if (t) {
 		const struct xt_table_info *private = t->private;
 		if (get.size == private->size)
 			ret = copy_entries_to_user(private->size,
@@ -1043,7 +1039,7 @@ get_entries(struct net *net, struct ipt_get_entries __user *uptr,
 		module_put(t->me);
 		xt_table_unlock(t);
 	} else
-		ret = t ? PTR_ERR(t) : -ENOENT;
+		ret = -ENOENT;
 
 	return ret;
 }
@@ -1068,8 +1064,8 @@ __do_replace(struct net *net, const char *name, unsigned int valid_hooks,
 
 	t = try_then_request_module(xt_find_table_lock(net, AF_INET, name),
 				    "iptable_%s", name);
-	if (IS_ERR_OR_NULL(t)) {
-		ret = t ? PTR_ERR(t) : -ENOENT;
+	if (!t) {
+		ret = -ENOENT;
 		goto free_newinfo_counters_untrans;
 	}
@@ -1184,8 +1180,8 @@ do_add_counters(struct net *net, const void __user *user,
 		return PTR_ERR(paddc);
 
 	t = xt_find_table_lock(net, AF_INET, tmp.name);
-	if (IS_ERR_OR_NULL(t)) {
-		ret = t ? PTR_ERR(t) : -ENOENT;
+	if (!t) {
+		ret = -ENOENT;
 		goto free;
 	}
@@ -1630,7 +1626,7 @@ compat_get_entries(struct net *net, struct compat_ipt_get_entries __user *uptr,
 
 	xt_compat_lock(AF_INET);
 	t = xt_find_table_lock(net, AF_INET, get.name);
-	if (!IS_ERR_OR_NULL(t)) {
+	if (t) {
 		const struct xt_table_info *private = t->private;
 		struct xt_table_info info;
 		ret = compat_table_info(private, &info);
@@ -1644,7 +1640,7 @@ compat_get_entries(struct net *net, struct compat_ipt_get_entries __user *uptr,
 		module_put(t->me);
 		xt_table_unlock(t);
 	} else
-		ret = t ? PTR_ERR(t) : -ENOENT;
+		ret = -ENOENT;
 
 	xt_compat_unlock(AF_INET);
 	return ret;
diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c
index 4a9e6db9df8d..21db00d0362b 100644
--- a/net/ipv4/netfilter/ipt_CLUSTERIP.c
+++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c
@@ -62,7 +62,7 @@ struct clusterip_config {
 static const struct file_operations clusterip_proc_fops;
 #endif
 
-static int clusterip_net_id __read_mostly;
+static unsigned int clusterip_net_id __read_mostly;
 
 struct clusterip_net {
 	struct list_head configs;
@@ -419,7 +419,7 @@ static int clusterip_tg_check(const struct xt_tgchk_param *par)
 	}
 	cipinfo->config = config;
 
-	ret = nf_ct_l3proto_try_module_get(par->family);
+	ret = nf_ct_netns_get(par->net, par->family);
 	if (ret < 0)
 		pr_info("cannot load conntrack support for proto=%u\n",
 			par->family);
@@ -444,7 +444,7 @@ static void clusterip_tg_destroy(const struct xt_tgdtor_param *par)
 
 	clusterip_config_put(cipinfo->config);
 
-	nf_ct_l3proto_module_put(par->family);
+	nf_ct_netns_get(par->net, par->family);
 }
 
 #ifdef CONFIG_COMPAT
diff --git a/net/ipv4/netfilter/ipt_MASQUERADE.c b/net/ipv4/netfilter/ipt_MASQUERADE.c
index da7f02a0b868..a03e4e7ef5f9 100644
--- a/net/ipv4/netfilter/ipt_MASQUERADE.c
+++ b/net/ipv4/netfilter/ipt_MASQUERADE.c
@@ -41,7 +41,7 @@ static int masquerade_tg_check(const struct xt_tgchk_param *par)
 		pr_debug("bad rangesize %u\n", mr->rangesize);
 		return -EINVAL;
 	}
-	return 0;
+	return nf_ct_netns_get(par->net, par->family);
 }
 
 static unsigned int
@@ -55,7 +55,13 @@ masquerade_tg(struct sk_buff *skb, const struct xt_action_param *par)
 	range.min_proto = mr->range[0].min;
 	range.max_proto = mr->range[0].max;
 
-	return nf_nat_masquerade_ipv4(skb, par->hooknum, &range, par->out);
+	return nf_nat_masquerade_ipv4(skb, xt_hooknum(par), &range,
+				      xt_out(par));
+}
+
+static void masquerade_tg_destroy(const struct xt_tgdtor_param *par)
+{
+	nf_ct_netns_put(par->net, par->family);
 }
 
 static struct xt_target masquerade_tg_reg __read_mostly = {
@@ -66,6 +72,7 @@ static struct xt_target masquerade_tg_reg __read_mostly = {
 	.table		= "nat",
 	.hooks		= 1 << NF_INET_POST_ROUTING,
 	.checkentry	= masquerade_tg_check,
+	.destroy	= masquerade_tg_destroy,
 	.me		= THIS_MODULE,
 };
PTR_ERR(t) : -ENOENT; + ret = -ENOENT; xt_compat_unlock(AF_INET); return ret; diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c index 4a9e6db9df8d..21db00d0362b 100644 --- a/net/ipv4/netfilter/ipt_CLUSTERIP.c +++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c @@ -62,7 +62,7 @@ struct clusterip_config { static const struct file_operations clusterip_proc_fops; #endif -static int clusterip_net_id __read_mostly; +static unsigned int clusterip_net_id __read_mostly; struct clusterip_net { struct list_head configs; @@ -419,7 +419,7 @@ static int clusterip_tg_check(const struct xt_tgchk_param *par) } cipinfo->config = config; - ret = nf_ct_l3proto_try_module_get(par->family); + ret = nf_ct_netns_get(par->net, par->family); if (ret < 0) pr_info("cannot load conntrack support for proto=%u\n", par->family); @@ -444,7 +444,7 @@ static void clusterip_tg_destroy(const struct xt_tgdtor_param *par) clusterip_config_put(cipinfo->config); - nf_ct_l3proto_module_put(par->family); + nf_ct_netns_put(par->net, par->family); } #ifdef CONFIG_COMPAT diff --git a/net/ipv4/netfilter/ipt_MASQUERADE.c b/net/ipv4/netfilter/ipt_MASQUERADE.c index da7f02a0b868..a03e4e7ef5f9 100644 --- a/net/ipv4/netfilter/ipt_MASQUERADE.c +++ b/net/ipv4/netfilter/ipt_MASQUERADE.c @@ -41,7 +41,7 @@ static int masquerade_tg_check(const struct xt_tgchk_param *par) pr_debug("bad rangesize %u\n", mr->rangesize); return -EINVAL; } - return 0; + return nf_ct_netns_get(par->net, par->family); } static unsigned int @@ -55,7 +55,13 @@ masquerade_tg(struct sk_buff *skb, const struct xt_action_param *par) range.min_proto = mr->range[0].min; range.max_proto = mr->range[0].max; - return nf_nat_masquerade_ipv4(skb, par->hooknum, &range, par->out); + return nf_nat_masquerade_ipv4(skb, xt_hooknum(par), &range, + xt_out(par)); +} + +static void masquerade_tg_destroy(const struct xt_tgdtor_param *par) +{ + nf_ct_netns_put(par->net, par->family); } static struct xt_target masquerade_tg_reg __read_mostly = { @@ -66,6 +72,7 @@ static struct xt_target masquerade_tg_reg __read_mostly = { .table = "nat", .hooks = 1 << NF_INET_POST_ROUTING, .checkentry = masquerade_tg_check, + .destroy = masquerade_tg_destroy, .me = THIS_MODULE, }; diff --git a/net/ipv4/netfilter/ipt_REJECT.c b/net/ipv4/netfilter/ipt_REJECT.c index 1d16c0f28df0..8bd0d7b26632 100644 --- a/net/ipv4/netfilter/ipt_REJECT.c +++ b/net/ipv4/netfilter/ipt_REJECT.c @@ -34,7 +34,7 @@ static unsigned int reject_tg(struct sk_buff *skb, const struct xt_action_param *par) { const struct ipt_reject_info *reject = par->targinfo; - int hook = par->hooknum; + int hook = xt_hooknum(par); switch (reject->with) { case IPT_ICMP_NET_UNREACHABLE: @@ -59,7 +59,7 @@ reject_tg(struct sk_buff *skb, const struct xt_action_param *par) nf_send_unreach(skb, ICMP_PKT_FILTERED, hook); break; case IPT_TCP_RESET: - nf_send_reset(par->net, skb, hook); + nf_send_reset(xt_net(par), skb, hook); case IPT_ICMP_ECHOREPLY: /* Doesn't happen. 
*/ break; diff --git a/net/ipv4/netfilter/ipt_SYNPROXY.c b/net/ipv4/netfilter/ipt_SYNPROXY.c index db5b87509446..30c0de53e254 100644 --- a/net/ipv4/netfilter/ipt_SYNPROXY.c +++ b/net/ipv4/netfilter/ipt_SYNPROXY.c @@ -263,12 +263,12 @@ static unsigned int synproxy_tg4(struct sk_buff *skb, const struct xt_action_param *par) { const struct xt_synproxy_info *info = par->targinfo; - struct net *net = par->net; + struct net *net = xt_net(par); struct synproxy_net *snet = synproxy_pernet(net); struct synproxy_options opts = {}; struct tcphdr *th, _th; - if (nf_ip_checksum(skb, par->hooknum, par->thoff, IPPROTO_TCP)) + if (nf_ip_checksum(skb, xt_hooknum(par), par->thoff, IPPROTO_TCP)) return NF_DROP; th = skb_header_pointer(skb, par->thoff, sizeof(_th), &_th); @@ -418,12 +418,12 @@ static int synproxy_tg4_check(const struct xt_tgchk_param *par) e->ip.invflags & XT_INV_PROTO) return -EINVAL; - return nf_ct_l3proto_try_module_get(par->family); + return nf_ct_netns_get(par->net, par->family); } static void synproxy_tg4_destroy(const struct xt_tgdtor_param *par) { - nf_ct_l3proto_module_put(par->family); + nf_ct_netns_put(par->net, par->family); } static struct xt_target synproxy_tg4_reg __read_mostly = { diff --git a/net/ipv4/netfilter/ipt_rpfilter.c b/net/ipv4/netfilter/ipt_rpfilter.c index 78cc64eddfc1..f273098e48fd 100644 --- a/net/ipv4/netfilter/ipt_rpfilter.c +++ b/net/ipv4/netfilter/ipt_rpfilter.c @@ -83,10 +83,12 @@ static bool rpfilter_mt(const struct sk_buff *skb, struct xt_action_param *par) return true ^ invert; iph = ip_hdr(skb); - if (ipv4_is_multicast(iph->daddr)) { - if (ipv4_is_zeronet(iph->saddr)) - return ipv4_is_local_multicast(iph->daddr) ^ invert; + if (ipv4_is_zeronet(iph->saddr)) { + if (ipv4_is_lbcast(iph->daddr) || + ipv4_is_local_multicast(iph->daddr)) + return true ^ invert; } + flow.flowi4_iif = LOOPBACK_IFINDEX; flow.daddr = iph->saddr; flow.saddr = rpfilter_get_saddr(iph->daddr); @@ -95,7 +97,7 @@ static bool rpfilter_mt(const struct sk_buff *skb, struct xt_action_param *par) flow.flowi4_tos = RT_TOS(iph->tos); flow.flowi4_scope = RT_SCOPE_UNIVERSE; - return rpfilter_lookup_reverse(par->net, &flow, par->in, info->flags) ^ invert; + return rpfilter_lookup_reverse(xt_net(par), &flow, xt_in(par), info->flags) ^ invert; } static int rpfilter_check(const struct xt_mtchk_param *par) diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c index 713c09a74b90..fcfd071f4705 100644 --- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c +++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c @@ -31,6 +31,13 @@ #include <net/netfilter/ipv4/nf_defrag_ipv4.h> #include <net/netfilter/nf_log.h> +static int conntrack4_net_id __read_mostly; +static DEFINE_MUTEX(register_ipv4_hooks); + +struct conntrack4_net { + unsigned int users; +}; + static bool ipv4_pkt_to_tuple(const struct sk_buff *skb, unsigned int nhoff, struct nf_conntrack_tuple *tuple) { @@ -307,9 +314,42 @@ static struct nf_sockopt_ops so_getorigdst = { .owner = THIS_MODULE, }; -static int ipv4_init_net(struct net *net) +static int ipv4_hooks_register(struct net *net) { - return 0; + struct conntrack4_net *cnet = net_generic(net, conntrack4_net_id); + int err = 0; + + mutex_lock(&register_ipv4_hooks); + + cnet->users++; + if (cnet->users > 1) + goto out_unlock; + + err = nf_defrag_ipv4_enable(net); + if (err) { + cnet->users = 0; + goto out_unlock; + } + + err = nf_register_net_hooks(net, ipv4_conntrack_ops, + ARRAY_SIZE(ipv4_conntrack_ops)); + + if (err) + cnet->users = 0; + 
out_unlock: + mutex_unlock(&register_ipv4_hooks); + return err; +} + +static void ipv4_hooks_unregister(struct net *net) +{ + struct conntrack4_net *cnet = net_generic(net, conntrack4_net_id); + + mutex_lock(&register_ipv4_hooks); + if (cnet->users && (--cnet->users == 0)) + nf_unregister_net_hooks(net, ipv4_conntrack_ops, + ARRAY_SIZE(ipv4_conntrack_ops)); + mutex_unlock(&register_ipv4_hooks); } struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv4 __read_mostly = { @@ -325,7 +365,8 @@ struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv4 __read_mostly = { .nlattr_to_tuple = ipv4_nlattr_to_tuple, .nla_policy = ipv4_nla_policy, #endif - .init_net = ipv4_init_net, + .net_ns_get = ipv4_hooks_register, + .net_ns_put = ipv4_hooks_unregister, .me = THIS_MODULE, }; @@ -336,52 +377,50 @@ MODULE_ALIAS("nf_conntrack-" __stringify(AF_INET)); MODULE_ALIAS("ip_conntrack"); MODULE_LICENSE("GPL"); +static struct nf_conntrack_l4proto *builtin_l4proto4[] = { + &nf_conntrack_l4proto_tcp4, + &nf_conntrack_l4proto_udp4, + &nf_conntrack_l4proto_icmp, +#ifdef CONFIG_NF_CT_PROTO_DCCP + &nf_conntrack_l4proto_dccp4, +#endif +#ifdef CONFIG_NF_CT_PROTO_SCTP + &nf_conntrack_l4proto_sctp4, +#endif +#ifdef CONFIG_NF_CT_PROTO_UDPLITE + &nf_conntrack_l4proto_udplite4, +#endif +}; + static int ipv4_net_init(struct net *net) { int ret = 0; - ret = nf_ct_l4proto_pernet_register(net, &nf_conntrack_l4proto_tcp4); - if (ret < 0) { - pr_err("nf_conntrack_tcp4: pernet registration failed\n"); - goto out_tcp; - } - ret = nf_ct_l4proto_pernet_register(net, &nf_conntrack_l4proto_udp4); - if (ret < 0) { - pr_err("nf_conntrack_udp4: pernet registration failed\n"); - goto out_udp; - } - ret = nf_ct_l4proto_pernet_register(net, &nf_conntrack_l4proto_icmp); - if (ret < 0) { - pr_err("nf_conntrack_icmp4: pernet registration failed\n"); - goto out_icmp; - } + ret = nf_ct_l4proto_pernet_register(net, builtin_l4proto4, + ARRAY_SIZE(builtin_l4proto4)); + if (ret < 0) + return ret; ret = nf_ct_l3proto_pernet_register(net, &nf_conntrack_l3proto_ipv4); if (ret < 0) { pr_err("nf_conntrack_ipv4: pernet registration failed\n"); - goto out_ipv4; + nf_ct_l4proto_pernet_unregister(net, builtin_l4proto4, + ARRAY_SIZE(builtin_l4proto4)); } - return 0; -out_ipv4: - nf_ct_l4proto_pernet_unregister(net, &nf_conntrack_l4proto_icmp); -out_icmp: - nf_ct_l4proto_pernet_unregister(net, &nf_conntrack_l4proto_udp4); -out_udp: - nf_ct_l4proto_pernet_unregister(net, &nf_conntrack_l4proto_tcp4); -out_tcp: return ret; } static void ipv4_net_exit(struct net *net) { nf_ct_l3proto_pernet_unregister(net, &nf_conntrack_l3proto_ipv4); - nf_ct_l4proto_pernet_unregister(net, &nf_conntrack_l4proto_icmp); - nf_ct_l4proto_pernet_unregister(net, &nf_conntrack_l4proto_udp4); - nf_ct_l4proto_pernet_unregister(net, &nf_conntrack_l4proto_tcp4); + nf_ct_l4proto_pernet_unregister(net, builtin_l4proto4, + ARRAY_SIZE(builtin_l4proto4)); } static struct pernet_operations ipv4_net_ops = { .init = ipv4_net_init, .exit = ipv4_net_exit, + .id = &conntrack4_net_id, + .size = sizeof(struct conntrack4_net), }; static int __init nf_conntrack_l3proto_ipv4_init(void) @@ -389,7 +428,6 @@ static int __init nf_conntrack_l3proto_ipv4_init(void) int ret = 0; need_conntrack(); - nf_defrag_ipv4_enable(); ret = nf_register_sockopt(&so_getorigdst); if (ret < 0) { @@ -403,46 +441,21 @@ static int __init nf_conntrack_l3proto_ipv4_init(void) goto cleanup_sockopt; } - ret = nf_register_hooks(ipv4_conntrack_ops, - ARRAY_SIZE(ipv4_conntrack_ops)); - if (ret < 0) { - pr_err("nf_conntrack_ipv4: can't register hooks.\n"); 
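The ipv4_hooks_register()/ipv4_hooks_unregister() pair added above is a first-user-registers, last-user-unregisters scheme: a per-netns user count protected by a mutex, rolled back to zero when registration fails. A distilled sketch of the pattern follows; resource_on()/resource_off() are hypothetical stand-ins for nf_register_net_hooks()/nf_unregister_net_hooks(), not kernel APIs:

static DEFINE_MUTEX(users_mutex);
static unsigned int users;

static int resource_on(void);	/* assumed helper, e.g. register hooks */
static void resource_off(void);	/* assumed helper, e.g. unregister hooks */

static int resource_get(void)
{
	int err = 0;

	mutex_lock(&users_mutex);
	if (users++ == 0) {
		err = resource_on();	/* first user: set up */
		if (err)
			users = 0;	/* roll back on failure */
	}
	mutex_unlock(&users_mutex);
	return err;
}

static void resource_put(void)
{
	mutex_lock(&users_mutex);
	if (users && --users == 0)
		resource_off();		/* last user: tear down */
	mutex_unlock(&users_mutex);
}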
+ ret = nf_ct_l4proto_register(builtin_l4proto4, + ARRAY_SIZE(builtin_l4proto4)); + if (ret < 0) goto cleanup_pernet; - } - - ret = nf_ct_l4proto_register(&nf_conntrack_l4proto_tcp4); - if (ret < 0) { - pr_err("nf_conntrack_ipv4: can't register tcp4 proto.\n"); - goto cleanup_hooks; - } - - ret = nf_ct_l4proto_register(&nf_conntrack_l4proto_udp4); - if (ret < 0) { - pr_err("nf_conntrack_ipv4: can't register udp4 proto.\n"); - goto cleanup_tcp4; - } - - ret = nf_ct_l4proto_register(&nf_conntrack_l4proto_icmp); - if (ret < 0) { - pr_err("nf_conntrack_ipv4: can't register icmpv4 proto.\n"); - goto cleanup_udp4; - } ret = nf_ct_l3proto_register(&nf_conntrack_l3proto_ipv4); if (ret < 0) { pr_err("nf_conntrack_ipv4: can't register ipv4 proto.\n"); - goto cleanup_icmpv4; + goto cleanup_l4proto; } return ret; - cleanup_icmpv4: - nf_ct_l4proto_unregister(&nf_conntrack_l4proto_icmp); - cleanup_udp4: - nf_ct_l4proto_unregister(&nf_conntrack_l4proto_udp4); - cleanup_tcp4: - nf_ct_l4proto_unregister(&nf_conntrack_l4proto_tcp4); - cleanup_hooks: - nf_unregister_hooks(ipv4_conntrack_ops, ARRAY_SIZE(ipv4_conntrack_ops)); +cleanup_l4proto: + nf_ct_l4proto_unregister(builtin_l4proto4, + ARRAY_SIZE(builtin_l4proto4)); cleanup_pernet: unregister_pernet_subsys(&ipv4_net_ops); cleanup_sockopt: @@ -454,10 +467,8 @@ static void __exit nf_conntrack_l3proto_ipv4_fini(void) { synchronize_net(); nf_ct_l3proto_unregister(&nf_conntrack_l3proto_ipv4); - nf_ct_l4proto_unregister(&nf_conntrack_l4proto_icmp); - nf_ct_l4proto_unregister(&nf_conntrack_l4proto_udp4); - nf_ct_l4proto_unregister(&nf_conntrack_l4proto_tcp4); - nf_unregister_hooks(ipv4_conntrack_ops, ARRAY_SIZE(ipv4_conntrack_ops)); + nf_ct_l4proto_unregister(builtin_l4proto4, + ARRAY_SIZE(builtin_l4proto4)); unregister_pernet_subsys(&ipv4_net_ops); nf_unregister_sockopt(&so_getorigdst); } diff --git a/net/ipv4/netfilter/nf_defrag_ipv4.c b/net/ipv4/netfilter/nf_defrag_ipv4.c index d88da36b383c..49bd6a54404f 100644 --- a/net/ipv4/netfilter/nf_defrag_ipv4.c +++ b/net/ipv4/netfilter/nf_defrag_ipv4.c @@ -11,6 +11,7 @@ #include <linux/netfilter.h> #include <linux/module.h> #include <linux/skbuff.h> +#include <net/netns/generic.h> #include <net/route.h> #include <net/ip.h> @@ -22,6 +23,8 @@ #endif #include <net/netfilter/nf_conntrack_zones.h> +static DEFINE_MUTEX(defrag4_mutex); + static int nf_ct_ipv4_gather_frags(struct net *net, struct sk_buff *skb, u_int32_t user) { @@ -102,18 +105,50 @@ static struct nf_hook_ops ipv4_defrag_ops[] = { }, }; +static void __net_exit defrag4_net_exit(struct net *net) +{ + if (net->nf.defrag_ipv4) { + nf_unregister_net_hooks(net, ipv4_defrag_ops, + ARRAY_SIZE(ipv4_defrag_ops)); + net->nf.defrag_ipv4 = false; + } +} + +static struct pernet_operations defrag4_net_ops = { + .exit = defrag4_net_exit, +}; + static int __init nf_defrag_init(void) { - return nf_register_hooks(ipv4_defrag_ops, ARRAY_SIZE(ipv4_defrag_ops)); + return register_pernet_subsys(&defrag4_net_ops); } static void __exit nf_defrag_fini(void) { - nf_unregister_hooks(ipv4_defrag_ops, ARRAY_SIZE(ipv4_defrag_ops)); + unregister_pernet_subsys(&defrag4_net_ops); } -void nf_defrag_ipv4_enable(void) +int nf_defrag_ipv4_enable(struct net *net) { + int err = 0; + + might_sleep(); + + if (net->nf.defrag_ipv4) + return 0; + + mutex_lock(&defrag4_mutex); + if (net->nf.defrag_ipv4) + goto out_unlock; + + err = nf_register_net_hooks(net, ipv4_defrag_ops, + ARRAY_SIZE(ipv4_defrag_ops)); + if (err == 0) + net->nf.defrag_ipv4 = true; + + out_unlock: + mutex_unlock(&defrag4_mutex); + 
return err; } EXPORT_SYMBOL_GPL(nf_defrag_ipv4_enable); diff --git a/net/ipv4/netfilter/nf_socket_ipv4.c b/net/ipv4/netfilter/nf_socket_ipv4.c new file mode 100644 index 000000000000..a83d558e1aae --- /dev/null +++ b/net/ipv4/netfilter/nf_socket_ipv4.c @@ -0,0 +1,163 @@ +/* + * Copyright (C) 2007-2008 BalaBit IT Ltd. + * Author: Krisztian Kovacs + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include <linux/module.h> +#include <linux/skbuff.h> +#include <net/tcp.h> +#include <net/udp.h> +#include <net/icmp.h> +#include <net/sock.h> +#include <net/inet_sock.h> +#include <net/netfilter/nf_socket.h> +#if IS_ENABLED(CONFIG_NF_CONNTRACK) +#include <net/netfilter/nf_conntrack.h> +#endif + +static int +extract_icmp4_fields(const struct sk_buff *skb, u8 *protocol, + __be32 *raddr, __be32 *laddr, + __be16 *rport, __be16 *lport) +{ + unsigned int outside_hdrlen = ip_hdrlen(skb); + struct iphdr *inside_iph, _inside_iph; + struct icmphdr *icmph, _icmph; + __be16 *ports, _ports[2]; + + icmph = skb_header_pointer(skb, outside_hdrlen, + sizeof(_icmph), &_icmph); + if (icmph == NULL) + return 1; + + switch (icmph->type) { + case ICMP_DEST_UNREACH: + case ICMP_SOURCE_QUENCH: + case ICMP_REDIRECT: + case ICMP_TIME_EXCEEDED: + case ICMP_PARAMETERPROB: + break; + default: + return 1; + } + + inside_iph = skb_header_pointer(skb, outside_hdrlen + + sizeof(struct icmphdr), + sizeof(_inside_iph), &_inside_iph); + if (inside_iph == NULL) + return 1; + + if (inside_iph->protocol != IPPROTO_TCP && + inside_iph->protocol != IPPROTO_UDP) + return 1; + + ports = skb_header_pointer(skb, outside_hdrlen + + sizeof(struct icmphdr) + + (inside_iph->ihl << 2), + sizeof(_ports), &_ports); + if (ports == NULL) + return 1; + + /* the inside IP packet is the one quoted from our side, thus + * its saddr is the local address */ + *protocol = inside_iph->protocol; + *laddr = inside_iph->saddr; + *lport = ports[0]; + *raddr = inside_iph->daddr; + *rport = ports[1]; + + return 0; +} + +static struct sock * +nf_socket_get_sock_v4(struct net *net, struct sk_buff *skb, const int doff, + const u8 protocol, + const __be32 saddr, const __be32 daddr, + const __be16 sport, const __be16 dport, + const struct net_device *in) +{ + switch (protocol) { + case IPPROTO_TCP: + return inet_lookup(net, &tcp_hashinfo, skb, doff, + saddr, sport, daddr, dport, + in->ifindex); + case IPPROTO_UDP: + return udp4_lib_lookup(net, saddr, sport, daddr, dport, + in->ifindex); + } + return NULL; +} + +struct sock *nf_sk_lookup_slow_v4(struct net *net, const struct sk_buff *skb, + const struct net_device *indev) +{ + __be32 uninitialized_var(daddr), uninitialized_var(saddr); + __be16 uninitialized_var(dport), uninitialized_var(sport); + const struct iphdr *iph = ip_hdr(skb); + struct sk_buff *data_skb = NULL; + u8 uninitialized_var(protocol); +#if IS_ENABLED(CONFIG_NF_CONNTRACK) + enum ip_conntrack_info ctinfo; + struct nf_conn const *ct; +#endif + int doff = 0; + + if (iph->protocol == IPPROTO_UDP || iph->protocol == IPPROTO_TCP) { + struct udphdr _hdr, *hp; + + hp = skb_header_pointer(skb, ip_hdrlen(skb), + sizeof(_hdr), &_hdr); + if (hp == NULL) + return NULL; + + protocol = iph->protocol; + saddr = iph->saddr; + sport = hp->source; + daddr = iph->daddr; + dport = hp->dest; + data_skb = (struct sk_buff *)skb; + doff = iph->protocol == IPPROTO_TCP ? 
+ ip_hdrlen(skb) + __tcp_hdrlen((struct tcphdr *)hp) : + ip_hdrlen(skb) + sizeof(*hp); + + } else if (iph->protocol == IPPROTO_ICMP) { + if (extract_icmp4_fields(skb, &protocol, &saddr, &daddr, + &sport, &dport)) + return NULL; + } else { + return NULL; + } + +#if IS_ENABLED(CONFIG_NF_CONNTRACK) + /* Do the lookup with the original socket address in + * case this is a reply packet of an established + * SNAT-ted connection. + */ + ct = nf_ct_get(skb, &ctinfo); + if (ct && !nf_ct_is_untracked(ct) && + ((iph->protocol != IPPROTO_ICMP && + ctinfo == IP_CT_ESTABLISHED_REPLY) || + (iph->protocol == IPPROTO_ICMP && + ctinfo == IP_CT_RELATED_REPLY)) && + (ct->status & IPS_SRC_NAT_DONE)) { + + daddr = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u3.ip; + dport = (iph->protocol == IPPROTO_TCP) ? + ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u.tcp.port : + ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u.udp.port; + } +#endif + + return nf_socket_get_sock_v4(net, data_skb, doff, protocol, saddr, + daddr, sport, dport, indev); +} +EXPORT_SYMBOL_GPL(nf_sk_lookup_slow_v4); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Krisztian Kovacs, Balazs Scheidler"); +MODULE_DESCRIPTION("Netfilter IPv4 socket lookup infrastructure"); diff --git a/net/ipv4/netfilter/nft_dup_ipv4.c b/net/ipv4/netfilter/nft_dup_ipv4.c index bf855e64fc45..0af3d8df70dd 100644 --- a/net/ipv4/netfilter/nft_dup_ipv4.c +++ b/net/ipv4/netfilter/nft_dup_ipv4.c @@ -28,9 +28,9 @@ static void nft_dup_ipv4_eval(const struct nft_expr *expr, struct in_addr gw = { .s_addr = (__force __be32)regs->data[priv->sreg_addr], }; - int oif = regs->data[priv->sreg_dev]; + int oif = priv->sreg_dev ? regs->data[priv->sreg_dev] : -1; - nf_dup_ipv4(pkt->net, pkt->skb, pkt->hook, &gw, oif); + nf_dup_ipv4(nft_net(pkt), pkt->skb, nft_hook(pkt), &gw, oif); } static int nft_dup_ipv4_init(const struct nft_ctx *ctx, @@ -59,7 +59,9 @@ static int nft_dup_ipv4_dump(struct sk_buff *skb, const struct nft_expr *expr) { struct nft_dup_ipv4 *priv = nft_expr_priv(expr); - if (nft_dump_register(skb, NFTA_DUP_SREG_ADDR, priv->sreg_addr) || + if (nft_dump_register(skb, NFTA_DUP_SREG_ADDR, priv->sreg_addr)) + goto nla_put_failure; + if (priv->sreg_dev && nft_dump_register(skb, NFTA_DUP_SREG_DEV, priv->sreg_dev)) goto nla_put_failure; diff --git a/net/ipv4/netfilter/nft_fib_ipv4.c b/net/ipv4/netfilter/nft_fib_ipv4.c new file mode 100644 index 000000000000..965b1a161369 --- /dev/null +++ b/net/ipv4/netfilter/nft_fib_ipv4.c @@ -0,0 +1,241 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/netlink.h> +#include <linux/netfilter.h> +#include <linux/netfilter/nf_tables.h> +#include <net/netfilter/nf_tables_core.h> +#include <net/netfilter/nf_tables.h> +#include <net/netfilter/nft_fib.h> + +#include <net/ip_fib.h> +#include <net/route.h> + +/* don't try to find route from mcast/bcast/zeronet */ +static __be32 get_saddr(__be32 addr) +{ + if (ipv4_is_multicast(addr) || ipv4_is_lbcast(addr) || + ipv4_is_zeronet(addr)) + return 0; + return addr; +} + +static bool fib4_is_local(const struct sk_buff *skb) +{ + const struct rtable *rt = skb_rtable(skb); + + return rt && (rt->rt_flags & RTCF_LOCAL); +} + +#define DSCP_BITS 0xfc + +void nft_fib4_eval_type(const struct nft_expr *expr, struct nft_regs *regs, + const struct nft_pktinfo *pkt) +{ + const struct nft_fib *priv = nft_expr_priv(expr); + u32 *dst = &regs->data[priv->dreg]; + const struct net_device *dev = NULL; + const struct iphdr *iph; + __be32 addr; + + if (priv->flags & NFTA_FIB_F_IIF) + dev = nft_in(pkt); + else if (priv->flags & NFTA_FIB_F_OIF) + dev = nft_out(pkt); + + iph = ip_hdr(pkt->skb); + if (priv->flags & NFTA_FIB_F_DADDR) + addr = iph->daddr; + else + addr = iph->saddr; + + *dst = inet_dev_addr_type(nft_net(pkt), dev, addr); +} +EXPORT_SYMBOL_GPL(nft_fib4_eval_type); + +static int get_ifindex(const struct net_device *dev) +{ + return dev ? dev->ifindex : 0; +} + +void nft_fib4_eval(const struct nft_expr *expr, struct nft_regs *regs, + const struct nft_pktinfo *pkt) +{ + const struct nft_fib *priv = nft_expr_priv(expr); + u32 *dest = &regs->data[priv->dreg]; + const struct iphdr *iph; + struct fib_result res; + struct flowi4 fl4 = { + .flowi4_scope = RT_SCOPE_UNIVERSE, + .flowi4_iif = LOOPBACK_IFINDEX, + }; + const struct net_device *oif; + struct net_device *found; +#ifdef CONFIG_IP_ROUTE_MULTIPATH + int i; +#endif + + /* + * Do not set flowi4_oif, it restricts results (for example, asking + * for oif 3 will get an RTN_UNICAST result even if the daddr exists + * on another interface). + * + * Search the results for the desired output interface instead. 
+ */ + if (priv->flags & NFTA_FIB_F_OIF) + oif = nft_out(pkt); + else if (priv->flags & NFTA_FIB_F_IIF) + oif = nft_in(pkt); + else + oif = NULL; + + if (nft_hook(pkt) == NF_INET_PRE_ROUTING && fib4_is_local(pkt->skb)) { + nft_fib_store_result(dest, priv->result, pkt, LOOPBACK_IFINDEX); + return; + } + + iph = ip_hdr(pkt->skb); + if (ipv4_is_zeronet(iph->saddr)) { + if (ipv4_is_lbcast(iph->daddr) || + ipv4_is_local_multicast(iph->daddr)) { + nft_fib_store_result(dest, priv->result, pkt, + get_ifindex(pkt->skb->dev)); + return; + } + } + + if (priv->flags & NFTA_FIB_F_MARK) + fl4.flowi4_mark = pkt->skb->mark; + + fl4.flowi4_tos = iph->tos & DSCP_BITS; + + if (priv->flags & NFTA_FIB_F_DADDR) { + fl4.daddr = iph->daddr; + fl4.saddr = get_saddr(iph->saddr); + } else { + fl4.daddr = iph->saddr; + fl4.saddr = get_saddr(iph->daddr); + } + + *dest = 0; + + if (fib_lookup(nft_net(pkt), &fl4, &res, FIB_LOOKUP_IGNORE_LINKSTATE)) + return; + + switch (res.type) { + case RTN_UNICAST: + break; + case RTN_LOCAL: /* should not appear here, see fib4_is_local() above */ + return; + default: + break; + } + + if (!oif) { + found = FIB_RES_DEV(res); + goto ok; + } + +#ifdef CONFIG_IP_ROUTE_MULTIPATH + for (i = 0; i < res.fi->fib_nhs; i++) { + struct fib_nh *nh = &res.fi->fib_nh[i]; + + if (nh->nh_dev == oif) { + found = nh->nh_dev; + goto ok; + } + } + return; +#else + found = FIB_RES_DEV(res); + if (found != oif) + return; +#endif +ok: + switch (priv->result) { + case NFT_FIB_RESULT_OIF: + *dest = found->ifindex; + break; + case NFT_FIB_RESULT_OIFNAME: + strncpy((char *)dest, found->name, IFNAMSIZ); + break; + default: + WARN_ON_ONCE(1); + break; + } +} +EXPORT_SYMBOL_GPL(nft_fib4_eval); + +static struct nft_expr_type nft_fib4_type; + +static const struct nft_expr_ops nft_fib4_type_ops = { + .type = &nft_fib4_type, + .size = NFT_EXPR_SIZE(sizeof(struct nft_fib)), + .eval = nft_fib4_eval_type, + .init = nft_fib_init, + .dump = nft_fib_dump, + .validate = nft_fib_validate, +}; + +static const struct nft_expr_ops nft_fib4_ops = { + .type = &nft_fib4_type, + .size = NFT_EXPR_SIZE(sizeof(struct nft_fib)), + .eval = nft_fib4_eval, + .init = nft_fib_init, + .dump = nft_fib_dump, + .validate = nft_fib_validate, +}; + +static const struct nft_expr_ops * +nft_fib4_select_ops(const struct nft_ctx *ctx, + const struct nlattr * const tb[]) +{ + enum nft_fib_result result; + + if (!tb[NFTA_FIB_RESULT]) + return ERR_PTR(-EINVAL); + + result = ntohl(nla_get_be32(tb[NFTA_FIB_RESULT])); + + switch (result) { + case NFT_FIB_RESULT_OIF: + return &nft_fib4_ops; + case NFT_FIB_RESULT_OIFNAME: + return &nft_fib4_ops; + case NFT_FIB_RESULT_ADDRTYPE: + return &nft_fib4_type_ops; + default: + return ERR_PTR(-EOPNOTSUPP); + } +} + +static struct nft_expr_type nft_fib4_type __read_mostly = { + .name = "fib", + .select_ops = &nft_fib4_select_ops, + .policy = nft_fib_policy, + .maxattr = NFTA_FIB_MAX, + .family = NFPROTO_IPV4, + .owner = THIS_MODULE, +}; + +static int __init nft_fib4_module_init(void) +{ + return nft_register_expr(&nft_fib4_type); +} + +static void __exit nft_fib4_module_exit(void) +{ + nft_unregister_expr(&nft_fib4_type); +} + +module_init(nft_fib4_module_init); +module_exit(nft_fib4_module_exit); +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Florian Westphal <fw@strlen.de>"); +MODULE_ALIAS_NFT_AF_EXPR(2, "fib"); diff --git a/net/ipv4/netfilter/nft_masq_ipv4.c b/net/ipv4/netfilter/nft_masq_ipv4.c index 51ced81b616c..a0ea8aad1bf1 100644 --- a/net/ipv4/netfilter/nft_masq_ipv4.c +++ b/net/ipv4/netfilter/nft_masq_ipv4.c @@ -1,5 
+1,5 @@ /* - * Copyright (c) 2014 Arturo Borrero Gonzalez <arturo.borrero.glez@gmail.com> + * Copyright (c) 2014 Arturo Borrero Gonzalez <arturo@debian.org> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -31,8 +31,14 @@ static void nft_masq_ipv4_eval(const struct nft_expr *expr, range.max_proto.all = *(__be16 *)&regs->data[priv->sreg_proto_max]; } - regs->verdict.code = nf_nat_masquerade_ipv4(pkt->skb, pkt->hook, - &range, pkt->out); + regs->verdict.code = nf_nat_masquerade_ipv4(pkt->skb, nft_hook(pkt), + &range, nft_out(pkt)); +} + +static void +nft_masq_ipv4_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr) +{ + nf_ct_netns_put(ctx->net, NFPROTO_IPV4); } static struct nft_expr_type nft_masq_ipv4_type; @@ -41,6 +47,7 @@ static const struct nft_expr_ops nft_masq_ipv4_ops = { .size = NFT_EXPR_SIZE(sizeof(struct nft_masq)), .eval = nft_masq_ipv4_eval, .init = nft_masq_init, + .destroy = nft_masq_ipv4_destroy, .dump = nft_masq_dump, .validate = nft_masq_validate, }; @@ -77,5 +84,5 @@ module_init(nft_masq_ipv4_module_init); module_exit(nft_masq_ipv4_module_exit); MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Arturo Borrero Gonzalez <arturo.borrero.glez@gmail.com>"); +MODULE_AUTHOR("Arturo Borrero Gonzalez <arturo@debian.org>"); MODULE_ALIAS_NFT_AF_EXPR(AF_INET, "masq"); diff --git a/net/ipv4/netfilter/nft_redir_ipv4.c b/net/ipv4/netfilter/nft_redir_ipv4.c index c09d4381427e..1650ed23c15d 100644 --- a/net/ipv4/netfilter/nft_redir_ipv4.c +++ b/net/ipv4/netfilter/nft_redir_ipv4.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2014 Arturo Borrero Gonzalez <arturo.borrero.glez@gmail.com> + * Copyright (c) 2014 Arturo Borrero Gonzalez <arturo@debian.org> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -35,8 +35,13 @@ static void nft_redir_ipv4_eval(const struct nft_expr *expr, mr.range[0].flags |= priv->flags; - regs->verdict.code = nf_nat_redirect_ipv4(pkt->skb, &mr, - pkt->hook); + regs->verdict.code = nf_nat_redirect_ipv4(pkt->skb, &mr, nft_hook(pkt)); +} + +static void +nft_redir_ipv4_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr) +{ + nf_ct_netns_put(ctx->net, NFPROTO_IPV4); } static struct nft_expr_type nft_redir_ipv4_type; @@ -45,6 +50,7 @@ static const struct nft_expr_ops nft_redir_ipv4_ops = { .size = NFT_EXPR_SIZE(sizeof(struct nft_redir)), .eval = nft_redir_ipv4_eval, .init = nft_redir_init, + .destroy = nft_redir_ipv4_destroy, .dump = nft_redir_dump, .validate = nft_redir_validate, }; @@ -72,5 +78,5 @@ module_init(nft_redir_ipv4_module_init); module_exit(nft_redir_ipv4_module_exit); MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Arturo Borrero Gonzalez <arturo.borrero.glez@gmail.com>"); +MODULE_AUTHOR("Arturo Borrero Gonzalez <arturo@debian.org>"); MODULE_ALIAS_NFT_AF_EXPR(AF_INET, "redir"); diff --git a/net/ipv4/netfilter/nft_reject_ipv4.c b/net/ipv4/netfilter/nft_reject_ipv4.c index 2c2553b9026c..517ce93699de 100644 --- a/net/ipv4/netfilter/nft_reject_ipv4.c +++ b/net/ipv4/netfilter/nft_reject_ipv4.c @@ -27,10 +27,10 @@ static void nft_reject_ipv4_eval(const struct nft_expr *expr, switch (priv->type) { case NFT_REJECT_ICMP_UNREACH: - nf_send_unreach(pkt->skb, priv->icmp_code, pkt->hook); + nf_send_unreach(pkt->skb, priv->icmp_code, nft_hook(pkt)); break; case NFT_REJECT_TCP_RST: - nf_send_reset(pkt->net, pkt->skb, pkt->hook); + nf_send_reset(nft_net(pkt), pkt->skb, nft_hook(pkt)); break; 
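Two recurring themes of this series meet in the hunks above: nf_tables expressions now reach packet state through the nft_net()/nft_hook()/nft_in()/nft_out() accessors rather than dereferencing pkt fields directly, and any expression that pins conntrack for a netns must release it on teardown. A hedged sketch of the paired setup/teardown for a hypothetical expression, mirroring the destroy callbacks added to nft_masq_ipv4 and nft_redir_ipv4:

static int example_init(const struct nft_ctx *ctx,
			const struct nft_expr *expr,
			const struct nlattr * const tb[])
{
	/* pin conntrack support for this netns while the rule exists */
	return nf_ct_netns_get(ctx->net, NFPROTO_IPV4);
}

static void example_destroy(const struct nft_ctx *ctx,
			    const struct nft_expr *expr)
{
	/* drop the reference taken in example_init() */
	nf_ct_netns_put(ctx->net, NFPROTO_IPV4);
}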
default: break; diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c index 7cf7d6e380c2..5b2635e69a92 100644 --- a/net/ipv4/ping.c +++ b/net/ipv4/ping.c @@ -657,6 +657,10 @@ int ping_common_sendmsg(int family, struct msghdr *msg, size_t len, if (len > 0xFFFF) return -EMSGSIZE; + /* Must have at least a full ICMP header. */ + if (len < icmph_len) + return -EINVAL; + /* * Check the flags. */ @@ -789,7 +793,8 @@ static int ping_v4_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) flowi4_init_output(&fl4, ipc.oif, sk->sk_mark, tos, RT_SCOPE_UNIVERSE, sk->sk_protocol, - inet_sk_flowi_flags(sk), faddr, saddr, 0, 0); + inet_sk_flowi_flags(sk), faddr, saddr, 0, 0, + sk->sk_uid); security_sk_classify_flow(sk, flowi4_to_flowi(&fl4)); rt = ip_route_output_flow(net, &fl4, sk); @@ -994,7 +999,7 @@ struct proto ping_prot = { .init = ping_init_sock, .close = ping_close, .connect = ip4_datagram_connect, - .disconnect = udp_disconnect, + .disconnect = __udp_disconnect, .setsockopt = ip_setsockopt, .getsockopt = ip_getsockopt, .sendmsg = ping_v4_sendmsg, diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index 90a85c955872..2300fae11b22 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -89,9 +89,10 @@ struct raw_frag_vec { int hlen; }; -static struct raw_hashinfo raw_v4_hashinfo = { +struct raw_hashinfo raw_v4_hashinfo = { .lock = __RW_LOCK_UNLOCKED(raw_v4_hashinfo.lock), }; +EXPORT_SYMBOL_GPL(raw_v4_hashinfo); int raw_hash_sk(struct sock *sk) { @@ -120,7 +121,7 @@ void raw_unhash_sk(struct sock *sk) } EXPORT_SYMBOL_GPL(raw_unhash_sk); -static struct sock *__raw_v4_lookup(struct net *net, struct sock *sk, +struct sock *__raw_v4_lookup(struct net *net, struct sock *sk, unsigned short num, __be32 raddr, __be32 laddr, int dif) { sk_for_each_from(sk) { @@ -136,6 +137,7 @@ static struct sock *__raw_v4_lookup(struct net *net, struct sock *sk, found: return sk; } +EXPORT_SYMBOL_GPL(__raw_v4_lookup); /* * 0 - deliver @@ -604,7 +606,7 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol, inet_sk_flowi_flags(sk) | (inet->hdrincl ? FLOWI_FLAG_KNOWN_NH : 0), - daddr, saddr, 0, 0); + daddr, saddr, 0, 0, sk->sk_uid); if (!inet->hdrincl) { rfv.msg = msg; @@ -693,12 +695,20 @@ static int raw_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len) { struct inet_sock *inet = inet_sk(sk); struct sockaddr_in *addr = (struct sockaddr_in *) uaddr; + u32 tb_id = RT_TABLE_LOCAL; int ret = -EINVAL; int chk_addr_ret; if (sk->sk_state != TCP_CLOSE || addr_len < sizeof(struct sockaddr_in)) goto out; - chk_addr_ret = inet_addr_type(sock_net(sk), addr->sin_addr.s_addr); + + if (sk->sk_bound_dev_if) + tb_id = l3mdev_fib_table_by_index(sock_net(sk), + sk->sk_bound_dev_if) ? 
: tb_id; + + chk_addr_ret = inet_addr_type_table(sock_net(sk), addr->sin_addr.s_addr, + tb_id); + ret = -EADDRNOTAVAIL; if (addr->sin_addr.s_addr && chk_addr_ret != RTN_LOCAL && chk_addr_ret != RTN_MULTICAST && chk_addr_ret != RTN_BROADCAST) @@ -912,13 +922,27 @@ static int compat_raw_ioctl(struct sock *sk, unsigned int cmd, unsigned long arg } #endif +int raw_abort(struct sock *sk, int err) +{ + lock_sock(sk); + + sk->sk_err = err; + sk->sk_error_report(sk); + __udp_disconnect(sk, 0); + + release_sock(sk); + + return 0; +} +EXPORT_SYMBOL_GPL(raw_abort); + struct proto raw_prot = { .name = "RAW", .owner = THIS_MODULE, .close = raw_close, .destroy = raw_destroy, .connect = ip4_datagram_connect, - .disconnect = udp_disconnect, + .disconnect = __udp_disconnect, .ioctl = raw_ioctl, .init = raw_init, .setsockopt = raw_setsockopt, @@ -937,6 +961,7 @@ struct proto raw_prot = { .compat_getsockopt = compat_raw_getsockopt, .compat_ioctl = compat_raw_ioctl, #endif + .diag_destroy = raw_abort, }; #ifdef CONFIG_PROC_FS diff --git a/net/ipv4/raw_diag.c b/net/ipv4/raw_diag.c new file mode 100644 index 000000000000..e1a51ca68d23 --- /dev/null +++ b/net/ipv4/raw_diag.c @@ -0,0 +1,266 @@ +#include <linux/module.h> + +#include <linux/inet_diag.h> +#include <linux/sock_diag.h> + +#include <net/inet_sock.h> +#include <net/raw.h> +#include <net/rawv6.h> + +#ifdef pr_fmt +# undef pr_fmt +#endif + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +static struct raw_hashinfo * +raw_get_hashinfo(const struct inet_diag_req_v2 *r) +{ + if (r->sdiag_family == AF_INET) { + return &raw_v4_hashinfo; +#if IS_ENABLED(CONFIG_IPV6) + } else if (r->sdiag_family == AF_INET6) { + return &raw_v6_hashinfo; +#endif + } else { + pr_warn_once("Unexpected inet family %d\n", + r->sdiag_family); + WARN_ON_ONCE(1); + return ERR_PTR(-EINVAL); + } +} + +/* + * Because we must not break the user API, we can't simply + * rename the @pad field in struct inet_diag_req_v2; instead, + * use a helper to figure it out. + */ + +static struct sock *raw_lookup(struct net *net, struct sock *from, + const struct inet_diag_req_v2 *req) +{ + struct inet_diag_req_raw *r = (void *)req; + struct sock *sk = NULL; + + if (r->sdiag_family == AF_INET) + sk = __raw_v4_lookup(net, from, r->sdiag_raw_protocol, + r->id.idiag_dst[0], + r->id.idiag_src[0], + r->id.idiag_if); +#if IS_ENABLED(CONFIG_IPV6) + else + sk = __raw_v6_lookup(net, from, r->sdiag_raw_protocol, + (const struct in6_addr *)r->id.idiag_src, + (const struct in6_addr *)r->id.idiag_dst, + r->id.idiag_if); +#endif + return sk; +} + +static struct sock *raw_sock_get(struct net *net, const struct inet_diag_req_v2 *r) +{ + struct raw_hashinfo *hashinfo = raw_get_hashinfo(r); + struct sock *sk = NULL, *s; + int slot; + + if (IS_ERR(hashinfo)) + return ERR_CAST(hashinfo); + + read_lock(&hashinfo->lock); + for (slot = 0; slot < RAW_HTABLE_SIZE; slot++) { + sk_for_each(s, &hashinfo->ht[slot]) { + sk = raw_lookup(net, s, r); + if (sk) { + /* + * Grab it and keep until we fill + * the diag message to be reported, so + * the caller should call sock_put() afterwards. + * We can do that because we're holding + * hashinfo->lock here. + */ + sock_hold(sk); + goto out_unlock; + } + } + } +out_unlock: + read_unlock(&hashinfo->lock); + + return sk ? 
sk : ERR_PTR(-ENOENT); +} + +static int raw_diag_dump_one(struct sk_buff *in_skb, + const struct nlmsghdr *nlh, + const struct inet_diag_req_v2 *r) +{ + struct net *net = sock_net(in_skb->sk); + struct sk_buff *rep; + struct sock *sk; + int err; + + sk = raw_sock_get(net, r); + if (IS_ERR(sk)) + return PTR_ERR(sk); + + rep = nlmsg_new(sizeof(struct inet_diag_msg) + + sizeof(struct inet_diag_meminfo) + 64, + GFP_KERNEL); + if (!rep) { + sock_put(sk); + return -ENOMEM; + } + + err = inet_sk_diag_fill(sk, NULL, rep, r, + sk_user_ns(NETLINK_CB(in_skb).sk), + NETLINK_CB(in_skb).portid, + nlh->nlmsg_seq, 0, nlh, + netlink_net_capable(in_skb, CAP_NET_ADMIN)); + sock_put(sk); + + if (err < 0) { + kfree_skb(rep); + return err; + } + + err = netlink_unicast(net->diag_nlsk, rep, + NETLINK_CB(in_skb).portid, + MSG_DONTWAIT); + if (err > 0) + err = 0; + return err; +} + +static int sk_diag_dump(struct sock *sk, struct sk_buff *skb, + struct netlink_callback *cb, + const struct inet_diag_req_v2 *r, + struct nlattr *bc, bool net_admin) +{ + if (!inet_diag_bc_sk(bc, sk)) + return 0; + + return inet_sk_diag_fill(sk, NULL, skb, r, + sk_user_ns(NETLINK_CB(cb->skb).sk), + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, NLM_F_MULTI, + cb->nlh, net_admin); +} + +static void raw_diag_dump(struct sk_buff *skb, struct netlink_callback *cb, + const struct inet_diag_req_v2 *r, struct nlattr *bc) +{ + bool net_admin = netlink_net_capable(cb->skb, CAP_NET_ADMIN); + struct raw_hashinfo *hashinfo = raw_get_hashinfo(r); + struct net *net = sock_net(skb->sk); + int num, s_num, slot, s_slot; + struct sock *sk = NULL; + + if (IS_ERR(hashinfo)) + return; + + s_slot = cb->args[0]; + num = s_num = cb->args[1]; + + read_lock(&hashinfo->lock); + for (slot = s_slot; slot < RAW_HTABLE_SIZE; s_num = 0, slot++) { + num = 0; + + sk_for_each(sk, &hashinfo->ht[slot]) { + struct inet_sock *inet = inet_sk(sk); + + if (!net_eq(sock_net(sk), net)) + continue; + if (num < s_num) + goto next; + if (sk->sk_family != r->sdiag_family) + goto next; + if (r->id.idiag_sport != inet->inet_sport && + r->id.idiag_sport) + goto next; + if (r->id.idiag_dport != inet->inet_dport && + r->id.idiag_dport) + goto next; + if (sk_diag_dump(sk, skb, cb, r, bc, net_admin) < 0) + goto out_unlock; +next: + num++; + } + } + +out_unlock: + read_unlock(&hashinfo->lock); + + cb->args[0] = slot; + cb->args[1] = num; +} + +static void raw_diag_get_info(struct sock *sk, struct inet_diag_msg *r, + void *info) +{ + r->idiag_rqueue = sk_rmem_alloc_get(sk); + r->idiag_wqueue = sk_wmem_alloc_get(sk); +} + +#ifdef CONFIG_INET_DIAG_DESTROY +static int raw_diag_destroy(struct sk_buff *in_skb, + const struct inet_diag_req_v2 *r) +{ + struct net *net = sock_net(in_skb->sk); + struct sock *sk; + int err; + + sk = raw_sock_get(net, r); + if (IS_ERR(sk)) + return PTR_ERR(sk); + err = sock_diag_destroy(sk, ECONNABORTED); + sock_put(sk); + return err; +} +#endif + +static const struct inet_diag_handler raw_diag_handler = { + .dump = raw_diag_dump, + .dump_one = raw_diag_dump_one, + .idiag_get_info = raw_diag_get_info, + .idiag_type = IPPROTO_RAW, + .idiag_info_size = 0, +#ifdef CONFIG_INET_DIAG_DESTROY + .destroy = raw_diag_destroy, +#endif +}; + +static void __always_unused __check_inet_diag_req_raw(void) +{ + /* + * Make sure the two structures are identical, + * except the @pad field. 
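+ * Userspace reuses that byte to carry the protocol to match + * (sdiag_raw_protocol in struct inet_diag_req_raw), so the two + * layouts must stay byte-for-byte compatible.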
+ */ +#define __offset_mismatch(m1, m2) \ + (offsetof(struct inet_diag_req_v2, m1) != \ + offsetof(struct inet_diag_req_raw, m2)) + + BUILD_BUG_ON(sizeof(struct inet_diag_req_v2) != + sizeof(struct inet_diag_req_raw)); + BUILD_BUG_ON(__offset_mismatch(sdiag_family, sdiag_family)); + BUILD_BUG_ON(__offset_mismatch(sdiag_protocol, sdiag_protocol)); + BUILD_BUG_ON(__offset_mismatch(idiag_ext, idiag_ext)); + BUILD_BUG_ON(__offset_mismatch(pad, sdiag_raw_protocol)); + BUILD_BUG_ON(__offset_mismatch(idiag_states, idiag_states)); + BUILD_BUG_ON(__offset_mismatch(id, id)); +#undef __offset_mismatch +} + +static int __init raw_diag_init(void) +{ + return inet_diag_register(&raw_diag_handler); +} + +static void __exit raw_diag_exit(void) +{ + inet_diag_unregister(&raw_diag_handler); +} + +module_init(raw_diag_init); +module_exit(raw_diag_exit); +MODULE_LICENSE("GPL"); +MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 2-255 /* AF_INET - IPPROTO_RAW */); +MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 10-255 /* AF_INET6 - IPPROTO_RAW */); diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 62d4d90c1389..fa5c037227cb 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -507,7 +507,8 @@ void __ip_select_ident(struct net *net, struct iphdr *iph, int segs) } EXPORT_SYMBOL(__ip_select_ident); -static void __build_flow_key(struct flowi4 *fl4, const struct sock *sk, +static void __build_flow_key(const struct net *net, struct flowi4 *fl4, + const struct sock *sk, const struct iphdr *iph, int oif, u8 tos, u8 prot, u32 mark, int flow_flags) @@ -523,19 +524,21 @@ static void __build_flow_key(struct flowi4 *fl4, const struct sock *sk, flowi4_init_output(fl4, oif, mark, tos, RT_SCOPE_UNIVERSE, prot, flow_flags, - iph->daddr, iph->saddr, 0, 0); + iph->daddr, iph->saddr, 0, 0, + sock_net_uid(net, sk)); } static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb, const struct sock *sk) { + const struct net *net = dev_net(skb->dev); const struct iphdr *iph = ip_hdr(skb); int oif = skb->dev->ifindex; u8 tos = RT_TOS(iph->tos); u8 prot = iph->protocol; u32 mark = skb->mark; - __build_flow_key(fl4, sk, iph, oif, tos, prot, mark, 0); + __build_flow_key(net, fl4, sk, iph, oif, tos, prot, mark, 0); } static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk) @@ -552,7 +555,7 @@ static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk) RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE, inet->hdrincl ? 
IPPROTO_RAW : sk->sk_protocol, inet_sk_flowi_flags(sk), - daddr, inet->inet_saddr, 0, 0); + daddr, inet->inet_saddr, 0, 0, sk->sk_uid); rcu_read_unlock(); } @@ -753,7 +756,9 @@ static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flow goto reject_redirect; } - n = ipv4_neigh_lookup(&rt->dst, NULL, &new_gw); + n = __ipv4_neigh_lookup(rt->dst.dev, new_gw); + if (!n) + n = neigh_create(&arp_tbl, &new_gw, rt->dst.dev); if (!IS_ERR(n)) { if (!(n->nud_state & NUD_VALID)) { neigh_event_send(n, NULL); @@ -800,7 +805,7 @@ static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buf rt = (struct rtable *) dst; - __build_flow_key(&fl4, sk, iph, oif, tos, prot, mark, 0); + __build_flow_key(sock_net(sk), &fl4, sk, iph, oif, tos, prot, mark, 0); __ip_do_redirect(rt, skb, &fl4, true); } @@ -1018,7 +1023,7 @@ void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu, if (!mark) mark = IP4_REPLY_MARK(net, skb->mark); - __build_flow_key(&fl4, NULL, iph, oif, + __build_flow_key(net, &fl4, NULL, iph, oif, RT_TOS(iph->tos), protocol, mark, flow_flags); rt = __ip_route_output_key(net, &fl4); if (!IS_ERR(rt)) { @@ -1034,7 +1039,7 @@ static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu) struct flowi4 fl4; struct rtable *rt; - __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0); + __build_flow_key(sock_net(sk), &fl4, sk, iph, 0, 0, 0, 0, 0); if (!fl4.flowi4_mark) fl4.flowi4_mark = IP4_REPLY_MARK(sock_net(sk), skb->mark); @@ -1053,6 +1058,7 @@ void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu) struct rtable *rt; struct dst_entry *odst = NULL; bool new = false; + struct net *net = sock_net(sk); bh_lock_sock(sk); @@ -1066,7 +1072,7 @@ void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu) goto out; } - __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0); + __build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0); rt = (struct rtable *)odst; if (odst->obsolete && !odst->ops->check(odst, 0)) { @@ -1106,7 +1112,7 @@ void ipv4_redirect(struct sk_buff *skb, struct net *net, struct flowi4 fl4; struct rtable *rt; - __build_flow_key(&fl4, NULL, iph, oif, + __build_flow_key(net, &fl4, NULL, iph, oif, RT_TOS(iph->tos), protocol, mark, flow_flags); rt = __ip_route_output_key(net, &fl4); if (!IS_ERR(rt)) { @@ -1121,9 +1127,10 @@ void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk) const struct iphdr *iph = (const struct iphdr *) skb->data; struct flowi4 fl4; struct rtable *rt; + struct net *net = sock_net(sk); - __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0); - rt = __ip_route_output_key(sock_net(sk), &fl4); + __build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0); + rt = __ip_route_output_key(net, &fl4); if (!IS_ERR(rt)) { __ip_do_redirect(rt, skb, &fl4, false); ip_rt_put(rt); @@ -1596,6 +1603,19 @@ static void ip_del_fnhe(struct fib_nh *nh, __be32 daddr) spin_unlock_bh(&fnhe_lock); } +static void set_lwt_redirect(struct rtable *rth) +{ + if (lwtunnel_output_redirect(rth->dst.lwtstate)) { + rth->dst.lwtstate->orig_output = rth->dst.output; + rth->dst.output = lwtunnel_output; + } + + if (lwtunnel_input_redirect(rth->dst.lwtstate)) { + rth->dst.lwtstate->orig_input = rth->dst.input; + rth->dst.input = lwtunnel_input; + } +} + /* called in rcu_read_lock() section */ static int __mkroute_input(struct sk_buff *skb, const struct fib_result *res, @@ -1685,14 +1705,7 @@ rt_cache: rth->dst.input = ip_forward; rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag); - if (lwtunnel_output_redirect(rth->dst.lwtstate)) { - 
rth->dst.lwtstate->orig_output = rth->dst.output; - rth->dst.output = lwtunnel_output; - } - if (lwtunnel_input_redirect(rth->dst.lwtstate)) { - rth->dst.lwtstate->orig_input = rth->dst.input; - rth->dst.input = lwtunnel_input; - } + set_lwt_redirect(rth); skb_dst_set(skb, &rth->dst); out: err = 0; @@ -1919,8 +1932,18 @@ local_input: rth->dst.error= -err; rth->rt_flags &= ~RTCF_LOCAL; } + if (do_cache) { - if (unlikely(!rt_cache_route(&FIB_RES_NH(res), rth))) { + struct fib_nh *nh = &FIB_RES_NH(res); + + rth->dst.lwtstate = lwtstate_get(nh->nh_lwtstate); + if (lwtunnel_input_redirect(rth->dst.lwtstate)) { + WARN_ON(rth->dst.input == lwtunnel_input); + rth->dst.lwtstate->orig_input = rth->dst.input; + rth->dst.input = lwtunnel_input; + } + + if (unlikely(!rt_cache_route(nh, rth))) { rth->dst.flags |= DST_NOCACHE; rt_add_uncached_list(rth); } @@ -1980,25 +2003,35 @@ int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr, */ if (ipv4_is_multicast(daddr)) { struct in_device *in_dev = __in_dev_get_rcu(dev); + int our = 0; + + if (in_dev) + our = ip_check_mc_rcu(in_dev, daddr, saddr, + ip_hdr(skb)->protocol); + + /* check l3 master if no match yet */ + if ((!in_dev || !our) && netif_is_l3_slave(dev)) { + struct in_device *l3_in_dev; - if (in_dev) { - int our = ip_check_mc_rcu(in_dev, daddr, saddr, - ip_hdr(skb)->protocol); - if (our + l3_in_dev = __in_dev_get_rcu(skb->dev); + if (l3_in_dev) + our = ip_check_mc_rcu(l3_in_dev, daddr, saddr, + ip_hdr(skb)->protocol); + } + + res = -EINVAL; + if (our #ifdef CONFIG_IP_MROUTE - || - (!ipv4_is_local_multicast(daddr) && - IN_DEV_MFORWARD(in_dev)) + || + (!ipv4_is_local_multicast(daddr) && + IN_DEV_MFORWARD(in_dev)) #endif - ) { - int res = ip_route_input_mc(skb, daddr, saddr, - tos, dev, our); - rcu_read_unlock(); - return res; - } + ) { + res = ip_route_input_mc(skb, daddr, saddr, + tos, dev, our); } rcu_read_unlock(); - return -EINVAL; + return res; } res = ip_route_input_slow(skb, daddr, saddr, tos, dev); rcu_read_unlock(); @@ -2138,8 +2171,7 @@ add: } rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0); - if (lwtunnel_output_redirect(rth->dst.lwtstate)) - rth->dst.output = lwtunnel_output; + set_lwt_redirect(rth); return rth; } @@ -2266,7 +2298,8 @@ struct rtable *__ip_route_output_key_hash(struct net *net, struct flowi4 *fl4, res.fi = NULL; res.table = NULL; if (fl4->flowi4_oif && - !netif_index_is_l3_master(net, fl4->flowi4_oif)) { + (ipv4_is_multicast(fl4->daddr) || + !netif_index_is_l3_master(net, fl4->flowi4_oif))) { /* Apparently, routing tables are wrong. Assume, that the destination is on link. @@ -2493,6 +2526,11 @@ static int rt_fill_info(struct net *net, __be32 dst, __be32 src, u32 table_id, nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark)) goto nla_put_failure; + if (!uid_eq(fl4->flowi4_uid, INVALID_UID) && + nla_put_u32(skb, RTA_UID, + from_kuid_munged(current_user_ns(), fl4->flowi4_uid))) + goto nla_put_failure; + error = rt->dst.error; if (rt_is_input_route(rt)) { @@ -2545,6 +2583,7 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh) int mark; struct sk_buff *skb; u32 table_id = RT_TABLE_MAIN; + kuid_t uid; err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy); if (err < 0) @@ -2572,6 +2611,10 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh) dst = tb[RTA_DST] ? nla_get_in_addr(tb[RTA_DST]) : 0; iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0; mark = tb[RTA_MARK] ? 
nla_get_u32(tb[RTA_MARK]) : 0; + if (tb[RTA_UID]) + uid = make_kuid(current_user_ns(), nla_get_u32(tb[RTA_UID])); + else + uid = (iif ? INVALID_UID : current_uid()); memset(&fl4, 0, sizeof(fl4)); fl4.daddr = dst; @@ -2579,6 +2622,7 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh) fl4.flowi4_tos = rtm->rtm_tos; fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0; fl4.flowi4_mark = mark; + fl4.flowi4_uid = uid; if (iif) { struct net_device *dev; diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index e3c4043c27de..3e88467d70ee 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c @@ -334,6 +334,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb) treq = tcp_rsk(req); treq->rcv_isn = ntohl(th->seq) - 1; treq->snt_isn = cookie; + treq->ts_off = 0; req->mss = mss; ireq->ir_num = ntohs(th->dest); ireq->ir_rmt_port = th->source; @@ -372,7 +373,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb) RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE, IPPROTO_TCP, inet_sk_flowi_flags(sk), opt->srr ? opt->faddr : ireq->ir_rmt_addr, - ireq->ir_loc_addr, th->source, th->dest); + ireq->ir_loc_addr, th->source, th->dest, sk->sk_uid); security_req_classify_flow(req, flowi4_to_flowi(&fl4)); rt = ip_route_output_key(sock_net(sk), &fl4); if (IS_ERR(rt)) { diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 1cb67de106fe..80bc36b25de2 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -96,11 +96,11 @@ static void inet_get_ping_group_range_table(struct ctl_table *table, kgid_t *low container_of(table->data, struct net, ipv4.ping_group_range.range); unsigned int seq; do { - seq = read_seqbegin(&net->ipv4.ip_local_ports.lock); + seq = read_seqbegin(&net->ipv4.ping_group_range.lock); *low = data[0]; *high = data[1]; - } while (read_seqretry(&net->ipv4.ip_local_ports.lock, seq)); + } while (read_seqretry(&net->ipv4.ping_group_range.lock, seq)); } /* Update system visible IP port range */ @@ -109,10 +109,10 @@ static void set_ping_group_range(struct ctl_table *table, kgid_t low, kgid_t hig kgid_t *data = table->data; struct net *net = container_of(table->data, struct net, ipv4.ping_group_range.range); - write_seqlock(&net->ipv4.ip_local_ports.lock); + write_seqlock(&net->ipv4.ping_group_range.lock); data[0] = low; data[1] = high; - write_sequnlock(&net->ipv4.ip_local_ports.lock); + write_sequnlock(&net->ipv4.ping_group_range.lock); } /* Validate changes from /proc interface. 
*/ diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 3251fe71f39f..1ef3165114ba 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -279,7 +279,6 @@ #include <asm/uaccess.h> #include <asm/ioctls.h> -#include <asm/unaligned.h> #include <net/busy_poll.h> int sysctl_tcp_min_tso_segs __read_mostly = 2; @@ -405,7 +404,6 @@ void tcp_init_sock(struct sock *sk) tp->snd_ssthresh = TCP_INFINITE_SSTHRESH; tp->snd_cwnd_clamp = ~0; tp->mss_cache = TCP_MSS_DEFAULT; - u64_stats_init(&tp->syncp); tp->reordering = sock_net(sk)->ipv4.sysctl_tcp_reordering; tcp_enable_early_retrans(tp); @@ -665,9 +663,9 @@ static void tcp_push(struct sock *sk, int flags, int mss_now, if (tcp_should_autocork(sk, skb, size_goal)) { /* avoid atomic op if TSQ_THROTTLED bit is already set */ - if (!test_bit(TSQ_THROTTLED, &tp->tsq_flags)) { + if (!test_bit(TSQ_THROTTLED, &sk->sk_tsq_flags)) { NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPAUTOCORKING); - set_bit(TSQ_THROTTLED, &tp->tsq_flags); + set_bit(TSQ_THROTTLED, &sk->sk_tsq_flags); } /* It is possible TX completion already happened * before we set TSQ_THROTTLED. @@ -998,8 +996,11 @@ do_error: goto out; out_err: /* make sure we wake any epoll edge trigger waiter */ - if (unlikely(skb_queue_len(&sk->sk_write_queue) == 0 && err == -EAGAIN)) + if (unlikely(skb_queue_len(&sk->sk_write_queue) == 0 && + err == -EAGAIN)) { sk->sk_write_space(sk); + tcp_chrono_stop(sk, TCP_CHRONO_SNDBUF_LIMITED); + } return sk_stream_error(sk, flags, err); } @@ -1164,7 +1165,7 @@ restart: err = -EPIPE; if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN)) - goto out_err; + goto do_error; sg = !!(sk->sk_route_caps & NETIF_F_SG); @@ -1241,7 +1242,7 @@ new_segment: if (!skb_can_coalesce(skb, i, pfrag->page, pfrag->offset)) { - if (i == sysctl_max_skb_frags || !sg) { + if (i >= sysctl_max_skb_frags || !sg) { tcp_mark_push(tp, skb); goto new_segment; } @@ -1333,8 +1334,11 @@ do_error: out_err: err = sk_stream_error(sk, flags, err); /* make sure we wake any epoll edge trigger waiter */ - if (unlikely(skb_queue_len(&sk->sk_write_queue) == 0 && err == -EAGAIN)) + if (unlikely(skb_queue_len(&sk->sk_write_queue) == 0 && + err == -EAGAIN)) { sk->sk_write_space(sk); + tcp_chrono_stop(sk, TCP_CHRONO_SNDBUF_LIMITED); + } release_sock(sk); return err; } @@ -2302,7 +2306,7 @@ EXPORT_SYMBOL(tcp_disconnect); static inline bool tcp_can_repair_sock(const struct sock *sk) { return ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN) && - ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_ESTABLISHED)); + (sk->sk_state != TCP_LISTEN); } static int tcp_repair_set_window(struct tcp_sock *tp, char __user *optbuf, int len) @@ -2704,15 +2708,33 @@ int compat_tcp_setsockopt(struct sock *sk, int level, int optname, EXPORT_SYMBOL(compat_tcp_setsockopt); #endif +static void tcp_get_info_chrono_stats(const struct tcp_sock *tp, + struct tcp_info *info) +{ + u64 stats[__TCP_CHRONO_MAX], total = 0; + enum tcp_chrono i; + + for (i = TCP_CHRONO_BUSY; i < __TCP_CHRONO_MAX; ++i) { + stats[i] = tp->chrono_stat[i - 1]; + if (i == tp->chrono_type) + stats[i] += tcp_time_stamp - tp->chrono_start; + stats[i] *= USEC_PER_SEC / HZ; + total += stats[i]; + } + + info->tcpi_busy_time = total; + info->tcpi_rwnd_limited = stats[TCP_CHRONO_RWND_LIMITED]; + info->tcpi_sndbuf_limited = stats[TCP_CHRONO_SNDBUF_LIMITED]; +} + /* Return information about state of tcp endpoint in API format. 
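For TCP_LISTEN sockets only a subset of the fields is meaningful; tcpi_unacked and tcpi_sacked are reused to report the accept queue length and its maximum backlog, see the listener aliasing below. 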
*/ void tcp_get_info(struct sock *sk, struct tcp_info *info) { const struct tcp_sock *tp = tcp_sk(sk); /* iff sk_type == SOCK_STREAM */ const struct inet_connection_sock *icsk = inet_csk(sk); u32 now = tcp_time_stamp, intv; - unsigned int start; - int notsent_bytes; u64 rate64; + bool slow; u32 rate; memset(info, 0, sizeof(*info)); @@ -2721,6 +2743,27 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) info->tcpi_state = sk_state_load(sk); + /* Report meaningful fields for all TCP states, including listeners */ + rate = READ_ONCE(sk->sk_pacing_rate); + rate64 = rate != ~0U ? rate : ~0ULL; + info->tcpi_pacing_rate = rate64; + + rate = READ_ONCE(sk->sk_max_pacing_rate); + rate64 = rate != ~0U ? rate : ~0ULL; + info->tcpi_max_pacing_rate = rate64; + + info->tcpi_reordering = tp->reordering; + info->tcpi_snd_cwnd = tp->snd_cwnd; + + if (info->tcpi_state == TCP_LISTEN) { + /* listeners aliased fields : + * tcpi_unacked -> Number of children ready for accept() + * tcpi_sacked -> max backlog + */ + info->tcpi_unacked = sk->sk_ack_backlog; + info->tcpi_sacked = sk->sk_max_ack_backlog; + return; + } info->tcpi_ca_state = icsk->icsk_ca_state; info->tcpi_retransmits = icsk->icsk_retransmits; info->tcpi_probes = icsk->icsk_probes_out; @@ -2748,13 +2791,9 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) info->tcpi_snd_mss = tp->mss_cache; info->tcpi_rcv_mss = icsk->icsk_ack.rcv_mss; - if (info->tcpi_state == TCP_LISTEN) { - info->tcpi_unacked = sk->sk_ack_backlog; - info->tcpi_sacked = sk->sk_max_ack_backlog; - } else { - info->tcpi_unacked = tp->packets_out; - info->tcpi_sacked = tp->sacked_out; - } + info->tcpi_unacked = tp->packets_out; + info->tcpi_sacked = tp->sacked_out; + info->tcpi_lost = tp->lost_out; info->tcpi_retrans = tp->retrans_out; info->tcpi_fackets = tp->fackets_out; @@ -2768,34 +2807,25 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) info->tcpi_rtt = tp->srtt_us >> 3; info->tcpi_rttvar = tp->mdev_us >> 2; info->tcpi_snd_ssthresh = tp->snd_ssthresh; - info->tcpi_snd_cwnd = tp->snd_cwnd; info->tcpi_advmss = tp->advmss; - info->tcpi_reordering = tp->reordering; info->tcpi_rcv_rtt = jiffies_to_usecs(tp->rcv_rtt_est.rtt)>>3; info->tcpi_rcv_space = tp->rcvq_space.space; info->tcpi_total_retrans = tp->total_retrans; - rate = READ_ONCE(sk->sk_pacing_rate); - rate64 = rate != ~0U ? rate : ~0ULL; - put_unaligned(rate64, &info->tcpi_pacing_rate); + slow = lock_sock_fast(sk); - rate = READ_ONCE(sk->sk_max_pacing_rate); - rate64 = rate != ~0U ? 
rate : ~0ULL; - put_unaligned(rate64, &info->tcpi_max_pacing_rate); + info->tcpi_bytes_acked = tp->bytes_acked; + info->tcpi_bytes_received = tp->bytes_received; + info->tcpi_notsent_bytes = max_t(int, 0, tp->write_seq - tp->snd_nxt); + tcp_get_info_chrono_stats(tp, info); + + unlock_sock_fast(sk, slow); - do { - start = u64_stats_fetch_begin_irq(&tp->syncp); - put_unaligned(tp->bytes_acked, &info->tcpi_bytes_acked); - put_unaligned(tp->bytes_received, &info->tcpi_bytes_received); - } while (u64_stats_fetch_retry_irq(&tp->syncp, start)); info->tcpi_segs_out = tp->segs_out; info->tcpi_segs_in = tp->segs_in; - notsent_bytes = READ_ONCE(tp->write_seq) - READ_ONCE(tp->snd_nxt); - info->tcpi_notsent_bytes = max(0, notsent_bytes); - info->tcpi_min_rtt = tcp_min_rtt(tp); info->tcpi_data_segs_in = tp->data_segs_in; info->tcpi_data_segs_out = tp->data_segs_out; @@ -2806,11 +2836,31 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) if (rate && intv) { rate64 = (u64)rate * tp->mss_cache * USEC_PER_SEC; do_div(rate64, intv); - put_unaligned(rate64, &info->tcpi_delivery_rate); + info->tcpi_delivery_rate = rate64; } } EXPORT_SYMBOL_GPL(tcp_get_info); +struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk) +{ + const struct tcp_sock *tp = tcp_sk(sk); + struct sk_buff *stats; + struct tcp_info info; + + stats = alloc_skb(3 * nla_total_size_64bit(sizeof(u64)), GFP_ATOMIC); + if (!stats) + return NULL; + + tcp_get_info_chrono_stats(tp, &info); + nla_put_u64_64bit(stats, TCP_NLA_BUSY, + info.tcpi_busy_time, TCP_NLA_PAD); + nla_put_u64_64bit(stats, TCP_NLA_RWND_LIMITED, + info.tcpi_rwnd_limited, TCP_NLA_PAD); + nla_put_u64_64bit(stats, TCP_NLA_SNDBUF_LIMITED, + info.tcpi_sndbuf_limited, TCP_NLA_PAD); + return stats; +} + static int do_tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen) { diff --git a/net/ipv4/tcp_bbr.c b/net/ipv4/tcp_bbr.c index 0ea66c2c9344..b89bce4c721e 100644 --- a/net/ipv4/tcp_bbr.c +++ b/net/ipv4/tcp_bbr.c @@ -14,6 +14,36 @@ * observed, or adjust the sending rate if it estimates there is a * traffic policer, in order to keep the drop rate reasonable. * + * Here is a state transition diagram for BBR: + * + * | + * V + * +---> STARTUP ----+ + * | | | + * | V | + * | DRAIN ----+ + * | | | + * | V | + * +---> PROBE_BW ----+ + * | ^ | | + * | | | | + * | +----+ | + * | | + * +---- PROBE_RTT <--+ + * + * A BBR flow starts in STARTUP, and ramps up its sending rate quickly. + * When it estimates the pipe is full, it enters DRAIN to drain the queue. + * In steady state a BBR flow only uses PROBE_BW and PROBE_RTT. + * A long-lived BBR flow spends the vast majority of its time remaining + * (repeatedly) in PROBE_BW, fully probing and utilizing the pipe's bandwidth + * in a fair manner, with a small, bounded queue. *If* a flow has been + * continuously sending for the entire min_rtt window, and hasn't seen an RTT + * sample that matches or decreases its min_rtt estimate for 10 seconds, then + * it briefly enters PROBE_RTT to cut inflight to a minimum value to re-probe + * the path's two-way propagation delay (min_rtt). When exiting PROBE_RTT, if + * we estimated that we reached the full bw of the pipe then we enter PROBE_BW; + * otherwise we enter STARTUP to try to fill the pipe. + * * BBR is described in detail in: * "BBR: Congestion-Based Congestion Control", * Neal Cardwell, Yuchung Cheng, C. 
Stephen Gunn, Soheil Hassas Yeganeh, @@ -51,7 +81,7 @@ enum bbr_mode { BBR_STARTUP, /* ramp up sending rate rapidly to fill pipe */ BBR_DRAIN, /* drain any queue created during startup */ BBR_PROBE_BW, /* discover, share bw: pace around estimated bw */ - BBR_PROBE_RTT, /* cut cwnd to min to probe min_rtt */ + BBR_PROBE_RTT, /* cut inflight to min to probe min_rtt */ }; /* BBR congestion control block */ diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c index 1294af4e0127..79c4817abc94 100644 --- a/net/ipv4/tcp_cong.c +++ b/net/ipv4/tcp_cong.c @@ -68,8 +68,9 @@ int tcp_register_congestion_control(struct tcp_congestion_ops *ca) { int ret = 0; - /* all algorithms must implement ssthresh and cong_avoid ops */ - if (!ca->ssthresh || !(ca->cong_avoid || ca->cong_control)) { + /* all algorithms must implement these */ + if (!ca->ssthresh || !ca->undo_cwnd || + !(ca->cong_avoid || ca->cong_control)) { pr_err("%s does not implement required ops\n", ca->name); return -EINVAL; } @@ -200,8 +201,10 @@ static void tcp_reinit_congestion_control(struct sock *sk, icsk->icsk_ca_ops = ca; icsk->icsk_ca_setsockopt = 1; - if (sk->sk_state != TCP_CLOSE) + if (sk->sk_state != TCP_CLOSE) { + memset(icsk->icsk_ca_priv, 0, sizeof(icsk->icsk_ca_priv)); tcp_init_congestion_control(sk); + } } /* Manage refcounts on socket close. */ @@ -441,10 +444,19 @@ u32 tcp_reno_ssthresh(struct sock *sk) } EXPORT_SYMBOL_GPL(tcp_reno_ssthresh); +u32 tcp_reno_undo_cwnd(struct sock *sk) +{ + const struct tcp_sock *tp = tcp_sk(sk); + + return max(tp->snd_cwnd, tp->snd_ssthresh << 1); +} +EXPORT_SYMBOL_GPL(tcp_reno_undo_cwnd); + struct tcp_congestion_ops tcp_reno = { .flags = TCP_CONG_NON_RESTRICTED, .name = "reno", .owner = THIS_MODULE, .ssthresh = tcp_reno_ssthresh, .cong_avoid = tcp_reno_cong_avoid, + .undo_cwnd = tcp_reno_undo_cwnd, }; diff --git a/net/ipv4/tcp_dctcp.c b/net/ipv4/tcp_dctcp.c index 10d728b6804c..5f5e5936760e 100644 --- a/net/ipv4/tcp_dctcp.c +++ b/net/ipv4/tcp_dctcp.c @@ -56,6 +56,7 @@ struct dctcp { u32 next_seq; u32 ce_state; u32 delayed_ack_reserved; + u32 loss_cwnd; }; static unsigned int dctcp_shift_g __read_mostly = 4; /* g = 1/2^4 */ @@ -96,6 +97,7 @@ static void dctcp_init(struct sock *sk) ca->dctcp_alpha = min(dctcp_alpha_on_init, DCTCP_MAX_ALPHA); ca->delayed_ack_reserved = 0; + ca->loss_cwnd = 0; ca->ce_state = 0; dctcp_reset(tp, ca); @@ -111,9 +113,10 @@ static void dctcp_init(struct sock *sk) static u32 dctcp_ssthresh(struct sock *sk) { - const struct dctcp *ca = inet_csk_ca(sk); + struct dctcp *ca = inet_csk_ca(sk); struct tcp_sock *tp = tcp_sk(sk); + ca->loss_cwnd = tp->snd_cwnd; return max(tp->snd_cwnd - ((tp->snd_cwnd * ca->dctcp_alpha) >> 11U), 2U); } @@ -308,12 +311,20 @@ static size_t dctcp_get_info(struct sock *sk, u32 ext, int *attr, return 0; } +static u32 dctcp_cwnd_undo(struct sock *sk) +{ + const struct dctcp *ca = inet_csk_ca(sk); + + return max(tcp_sk(sk)->snd_cwnd, ca->loss_cwnd); +} + static struct tcp_congestion_ops dctcp __read_mostly = { .init = dctcp_init, .in_ack_event = dctcp_update_alpha, .cwnd_event = dctcp_cwnd_event, .ssthresh = dctcp_ssthresh, .cong_avoid = tcp_reno_cong_avoid, + .undo_cwnd = dctcp_cwnd_undo, .set_state = dctcp_state, .get_info = dctcp_get_info, .flags = TCP_CONG_NEEDS_ECN, @@ -324,6 +335,7 @@ static struct tcp_congestion_ops dctcp __read_mostly = { static struct tcp_congestion_ops dctcp_reno __read_mostly = { .ssthresh = tcp_reno_ssthresh, .cong_avoid = tcp_reno_cong_avoid, + .undo_cwnd = tcp_reno_undo_cwnd, .get_info = dctcp_get_info, .owner = 
THIS_MODULE, .name = "dctcp-reno", diff --git a/net/ipv4/tcp_highspeed.c b/net/ipv4/tcp_highspeed.c index db7842495a64..6d9879e93648 100644 --- a/net/ipv4/tcp_highspeed.c +++ b/net/ipv4/tcp_highspeed.c @@ -94,6 +94,7 @@ static const struct hstcp_aimd_val { struct hstcp { u32 ai; + u32 loss_cwnd; }; static void hstcp_init(struct sock *sk) @@ -150,16 +151,24 @@ static void hstcp_cong_avoid(struct sock *sk, u32 ack, u32 acked) static u32 hstcp_ssthresh(struct sock *sk) { const struct tcp_sock *tp = tcp_sk(sk); - const struct hstcp *ca = inet_csk_ca(sk); + struct hstcp *ca = inet_csk_ca(sk); + ca->loss_cwnd = tp->snd_cwnd; /* Do multiplicative decrease */ return max(tp->snd_cwnd - ((tp->snd_cwnd * hstcp_aimd_vals[ca->ai].md) >> 8), 2U); } +static u32 hstcp_cwnd_undo(struct sock *sk) +{ + const struct hstcp *ca = inet_csk_ca(sk); + + return max(tcp_sk(sk)->snd_cwnd, ca->loss_cwnd); +} + static struct tcp_congestion_ops tcp_highspeed __read_mostly = { .init = hstcp_init, .ssthresh = hstcp_ssthresh, + .undo_cwnd = hstcp_cwnd_undo, .cong_avoid = hstcp_cong_avoid, .owner = THIS_MODULE, diff --git a/net/ipv4/tcp_hybla.c b/net/ipv4/tcp_hybla.c index 083831e359df..0f7175c3338e 100644 --- a/net/ipv4/tcp_hybla.c +++ b/net/ipv4/tcp_hybla.c @@ -166,6 +166,7 @@ static void hybla_cong_avoid(struct sock *sk, u32 ack, u32 acked) static struct tcp_congestion_ops tcp_hybla __read_mostly = { .init = hybla_init, .ssthresh = tcp_reno_ssthresh, + .undo_cwnd = tcp_reno_undo_cwnd, .cong_avoid = hybla_cong_avoid, .set_state = hybla_state, diff --git a/net/ipv4/tcp_illinois.c b/net/ipv4/tcp_illinois.c index c8e6d86be114..60352ff4f5a8 100644 --- a/net/ipv4/tcp_illinois.c +++ b/net/ipv4/tcp_illinois.c @@ -48,6 +48,7 @@ struct illinois { u32 end_seq; /* right edge of current RTT */ u32 alpha; /* Additive increase */ u32 beta; /* Multiplicative decrease */ + u32 loss_cwnd; /* cwnd on loss */ u16 acked; /* # packets acked by current ACK */ u8 rtt_above; /* average rtt has gone above threshold */ u8 rtt_low; /* # of rtts measurements below threshold */ @@ -296,10 +297,18 @@ static u32 tcp_illinois_ssthresh(struct sock *sk) struct tcp_sock *tp = tcp_sk(sk); struct illinois *ca = inet_csk_ca(sk); + ca->loss_cwnd = tp->snd_cwnd; /* Multiplicative decrease */ return max(tp->snd_cwnd - ((tp->snd_cwnd * ca->beta) >> BETA_SHIFT), 2U); } +static u32 tcp_illinois_cwnd_undo(struct sock *sk) +{ + const struct illinois *ca = inet_csk_ca(sk); + + return max(tcp_sk(sk)->snd_cwnd, ca->loss_cwnd); +} + /* Extract info for Tcp socket info provided via netlink. 
*/ static size_t tcp_illinois_info(struct sock *sk, u32 ext, int *attr, union tcp_cc_info *info) @@ -327,6 +336,7 @@ static size_t tcp_illinois_info(struct sock *sk, u32 ext, int *attr, static struct tcp_congestion_ops tcp_illinois __read_mostly = { .init = tcp_illinois_init, .ssthresh = tcp_illinois_ssthresh, + .undo_cwnd = tcp_illinois_cwnd_undo, .cong_avoid = tcp_illinois_cong_avoid, .set_state = tcp_illinois_state, .get_info = tcp_illinois_info, diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index a27b9c0e27c0..6c790754ae3e 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -85,6 +85,7 @@ int sysctl_tcp_dsack __read_mostly = 1; int sysctl_tcp_app_win __read_mostly = 31; int sysctl_tcp_adv_win_scale __read_mostly = 1; EXPORT_SYMBOL(sysctl_tcp_adv_win_scale); +EXPORT_SYMBOL(sysctl_tcp_timestamps); /* rfc5961 challenge ack rate limiting */ int sysctl_tcp_challenge_ack_limit = 1000; @@ -128,6 +129,23 @@ int sysctl_tcp_invalid_ratelimit __read_mostly = HZ/2; #define REXMIT_LOST 1 /* retransmit packets marked lost */ #define REXMIT_NEW 2 /* FRTO-style transmit of unsent/new packets */ +static void tcp_gro_dev_warn(struct sock *sk, const struct sk_buff *skb) +{ + static bool __once __read_mostly; + + if (!__once) { + struct net_device *dev; + + __once = true; + + rcu_read_lock(); + dev = dev_get_by_index_rcu(sock_net(sk), skb->skb_iif); + pr_warn("%s: Driver has suspect GRO implementation, TCP performance may be compromised.\n", + dev ? dev->name : "Unknown driver"); + rcu_read_unlock(); + } +} + /* Adapt the MSS value used to make delayed ack decision to the * real world. */ @@ -144,7 +162,10 @@ static void tcp_measure_rcv_mss(struct sock *sk, const struct sk_buff *skb) */ len = skb_shinfo(skb)->gso_size ? : skb->len; if (len >= icsk->icsk_ack.rcv_mss) { - icsk->icsk_ack.rcv_mss = len; + icsk->icsk_ack.rcv_mss = min_t(unsigned int, len, + tcp_sk(sk)->advmss); + if (unlikely(icsk->icsk_ack.rcv_mss != len)) + tcp_gro_dev_warn(sk, skb); } else { /* Otherwise, we make more careful check taking into account, * that SACKs block is variable. 
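The congestion-control hunks above all follow one pattern: because tcp_register_congestion_control() now rejects modules without an undo_cwnd op, each module snapshots snd_cwnd in its ssthresh() hook and restores the larger of the saved and current window on undo. Below is a minimal sketch of that pattern; "mycc" and everything in it is a hypothetical module written for illustration, not code from this patch.

#include <linux/module.h>
#include <net/tcp.h>

/* per-socket private state, stored in icsk_ca_priv */
struct mycc {
	u32 loss_cwnd;	/* cwnd snapshot taken when loss is detected */
};

static u32 mycc_ssthresh(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct mycc *ca = inet_csk_ca(sk);

	ca->loss_cwnd = tp->snd_cwnd;		/* remember cwnd before backing off */
	return max(tp->snd_cwnd >> 1U, 2U);	/* halve it, floor of two segments */
}

static u32 mycc_undo_cwnd(struct sock *sk)
{
	const struct mycc *ca = inet_csk_ca(sk);

	/* the loss turned out to be spurious: restore the bigger window */
	return max(tcp_sk(sk)->snd_cwnd, ca->loss_cwnd);
}

static struct tcp_congestion_ops mycc_ops __read_mostly = {
	.ssthresh	= mycc_ssthresh,
	.undo_cwnd	= mycc_undo_cwnd,	/* now mandatory */
	.cong_avoid	= tcp_reno_cong_avoid,
	.owner		= THIS_MODULE,
	.name		= "mycc",
};

static int __init mycc_module_init(void)
{
	BUILD_BUG_ON(sizeof(struct mycc) > ICSK_CA_PRIV_SIZE);
	return tcp_register_congestion_control(&mycc_ops);
}

static void __exit mycc_module_exit(void)
{
	tcp_unregister_congestion_control(&mycc_ops);
}

module_init(mycc_module_init);
module_exit(mycc_module_exit);
MODULE_LICENSE("GPL");

Modules that keep no private state of their own (hybla, vegas, westwood, lp in the hunks here) can instead simply point .undo_cwnd at the new tcp_reno_undo_cwnd().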
@@ -2394,10 +2415,7 @@ static void tcp_undo_cwnd_reduction(struct sock *sk, bool unmark_loss) if (tp->prior_ssthresh) { const struct inet_connection_sock *icsk = inet_csk(sk); - if (icsk->icsk_ca_ops->undo_cwnd) - tp->snd_cwnd = icsk->icsk_ca_ops->undo_cwnd(sk); - else - tp->snd_cwnd = max(tp->snd_cwnd, tp->snd_ssthresh << 1); + tp->snd_cwnd = icsk->icsk_ca_ops->undo_cwnd(sk); if (tp->prior_ssthresh > tp->snd_ssthresh) { tp->snd_ssthresh = tp->prior_ssthresh; @@ -3181,6 +3199,9 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, tp->lost_skb_hint = NULL; } + if (!skb) + tcp_chrono_stop(sk, TCP_CHRONO_BUSY); + if (likely(between(tp->snd_up, prior_snd_una, tp->snd_una))) tp->snd_up = tp->snd_una; @@ -3351,9 +3372,7 @@ static void tcp_snd_una_update(struct tcp_sock *tp, u32 ack) u32 delta = ack - tp->snd_una; sock_owned_by_me((struct sock *)tp); - u64_stats_update_begin_raw(&tp->syncp); tp->bytes_acked += delta; - u64_stats_update_end_raw(&tp->syncp); tp->snd_una = ack; } @@ -3363,9 +3382,7 @@ static void tcp_rcv_nxt_update(struct tcp_sock *tp, u32 seq) u32 delta = seq - tp->rcv_nxt; sock_owned_by_me((struct sock *)tp); - u64_stats_update_begin_raw(&tp->syncp); tp->bytes_received += delta; - u64_stats_update_end_raw(&tp->syncp); tp->rcv_nxt = seq; } @@ -5063,8 +5080,11 @@ static void tcp_check_space(struct sock *sk) /* pairs with tcp_poll() */ smp_mb__after_atomic(); if (sk->sk_socket && - test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) + test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) { tcp_new_space(sk); + if (!test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) + tcp_chrono_stop(sk, TCP_CHRONO_SNDBUF_LIMITED); + } } } @@ -6298,13 +6318,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, goto drop; } - - /* Accept backlog is full. If we have already queued enough - * of warm entries in syn queue, drop request. It is better than - * clogging syn queue with openreqs with exponentially increasing - * timeout. - */ - if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1) { + if (sk_acceptq_is_full(sk)) { NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS); goto drop; } @@ -6314,6 +6328,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, goto drop; tcp_rsk(req)->af_specific = af_ops; + tcp_rsk(req)->ts_off = 0; tcp_clear_options(&tmp_opt); tmp_opt.mss_clamp = af_ops->mss_clamp; @@ -6335,6 +6350,9 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, if (security_inet_conn_request(sk, skb, req)) goto drop_and_free; + if (isn && tmp_opt.tstamp_ok) + af_ops->init_seq(skb, &tcp_rsk(req)->ts_off); + if (!want_cookie && !isn) { /* VJ's idea. 
We save last timestamp seen * from the destination in peer table, when entering @@ -6375,7 +6393,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, goto drop_and_release; } - isn = af_ops->init_seq(skb); + isn = af_ops->init_seq(skb, &tcp_rsk(req)->ts_off); } if (!dst) { dst = af_ops->route_req(sk, &fl, req, NULL); @@ -6387,6 +6405,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, if (want_cookie) { isn = cookie_init_sequence(af_ops, sk, skb, &req->mss); + tcp_rsk(req)->ts_off = 0; req->cookie_ts = tmp_opt.tstamp_ok; if (!tmp_opt.tstamp_ok) inet_rsk(req)->ecn_ok = 0; diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index bd5e8d10893f..30d81f533ada 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -86,7 +86,6 @@ int sysctl_tcp_tw_reuse __read_mostly; int sysctl_tcp_low_latency __read_mostly; -EXPORT_SYMBOL(sysctl_tcp_low_latency); #ifdef CONFIG_TCP_MD5SIG static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key, @@ -96,12 +95,12 @@ static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key, struct inet_hashinfo tcp_hashinfo; EXPORT_SYMBOL(tcp_hashinfo); -static __u32 tcp_v4_init_sequence(const struct sk_buff *skb) +static u32 tcp_v4_init_sequence(const struct sk_buff *skb, u32 *tsoff) { return secure_tcp_sequence_number(ip_hdr(skb)->daddr, ip_hdr(skb)->saddr, tcp_hdr(skb)->dest, - tcp_hdr(skb)->source); + tcp_hdr(skb)->source, tsoff); } int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp) @@ -238,7 +237,8 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) tp->write_seq = secure_tcp_sequence_number(inet->inet_saddr, inet->inet_daddr, inet->inet_sport, - usin->sin_port); + usin->sin_port, + &tp->tsoffset); inet->inet_id = tp->write_seq ^ jiffies; @@ -443,7 +443,7 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info) if (!sock_owned_by_user(sk)) { tcp_v4_mtu_reduced(sk); } else { - if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &tp->tsq_flags)) + if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags)) sock_hold(sk); } goto out; @@ -692,6 +692,7 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb) offsetof(struct inet_timewait_sock, tw_bound_dev_if)); arg.tos = ip_hdr(skb)->tos; + arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL); local_bh_disable(); ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk), skb, &TCP_SKB_CB(skb)->header.h4.opt, @@ -712,7 +713,7 @@ out: outside socket context is ugly, certainly. What can I do? */ -static void tcp_v4_send_ack(struct net *net, +static void tcp_v4_send_ack(const struct sock *sk, struct sk_buff *skb, u32 seq, u32 ack, u32 win, u32 tsval, u32 tsecr, int oif, struct tcp_md5sig_key *key, @@ -727,6 +728,7 @@ static void tcp_v4_send_ack(struct net *net, #endif ]; } rep; + struct net *net = sock_net(sk); struct ip_reply_arg arg; memset(&rep.th, 0, sizeof(struct tcphdr)); @@ -776,6 +778,7 @@ static void tcp_v4_send_ack(struct net *net, if (oif) arg.bound_dev_if = oif; arg.tos = tos; + arg.uid = sock_net_uid(net, sk_fullsock(sk) ? 
sk : NULL); local_bh_disable(); ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk), skb, &TCP_SKB_CB(skb)->header.h4.opt, @@ -791,7 +794,7 @@ static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb) struct inet_timewait_sock *tw = inet_twsk(sk); struct tcp_timewait_sock *tcptw = tcp_twsk(sk); - tcp_v4_send_ack(sock_net(sk), skb, + tcp_v4_send_ack(sk, skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt, tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale, tcp_time_stamp + tcptw->tw_ts_offset, @@ -819,10 +822,10 @@ static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb, * exception of <SYN> segments, MUST be right-shifted by * Rcv.Wind.Shift bits: */ - tcp_v4_send_ack(sock_net(sk), skb, seq, + tcp_v4_send_ack(sk, skb, seq, tcp_rsk(req)->rcv_nxt, req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale, - tcp_time_stamp, + tcp_time_stamp + tcp_rsk(req)->ts_off, req->ts_recent, 0, tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->daddr, @@ -1565,6 +1568,21 @@ bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb) } EXPORT_SYMBOL(tcp_add_backlog); +int tcp_filter(struct sock *sk, struct sk_buff *skb) +{ + struct tcphdr *th = (struct tcphdr *)skb->data; + unsigned int eaten = skb->len; + int err; + + err = sk_filter_trim_cap(sk, skb, th->doff * 4); + if (!err) { + eaten -= skb->len; + TCP_SKB_CB(skb)->end_seq -= eaten; + } + return err; +} +EXPORT_SYMBOL(tcp_filter); + /* * From tcp_input.c */ @@ -1677,8 +1695,10 @@ process: nf_reset(skb); - if (sk_filter(sk, skb)) + if (tcp_filter(sk, skb)) goto discard_and_relse; + th = (const struct tcphdr *)skb->data; + iph = ip_hdr(skb); skb->dev = NULL; @@ -1887,13 +1907,12 @@ static void *listening_get_next(struct seq_file *seq, void *cur) struct tcp_iter_state *st = seq->private; struct net *net = seq_file_net(seq); struct inet_listen_hashbucket *ilb; - struct inet_connection_sock *icsk; struct sock *sk = cur; if (!sk) { get_head: ilb = &tcp_hashinfo.listening_hash[st->bucket]; - spin_lock_bh(&ilb->lock); + spin_lock(&ilb->lock); sk = sk_head(&ilb->head); st->offset = 0; goto get_sk; @@ -1909,9 +1928,8 @@ get_sk: continue; if (sk->sk_family == st->family) return sk; - icsk = inet_csk(sk); } - spin_unlock_bh(&ilb->lock); + spin_unlock(&ilb->lock); st->offset = 0; if (++st->bucket < INET_LHTABLE_SIZE) goto get_head; @@ -2119,7 +2137,7 @@ static void tcp_seq_stop(struct seq_file *seq, void *v) switch (st->state) { case TCP_SEQ_STATE_LISTENING: if (v != SEQ_START_TOKEN) - spin_unlock_bh(&tcp_hashinfo.listening_hash[st->bucket].lock); + spin_unlock(&tcp_hashinfo.listening_hash[st->bucket].lock); break; case TCP_SEQ_STATE_ESTABLISHED: if (v) diff --git a/net/ipv4/tcp_lp.c b/net/ipv4/tcp_lp.c index c67ece1390c2..046fd3910873 100644 --- a/net/ipv4/tcp_lp.c +++ b/net/ipv4/tcp_lp.c @@ -316,6 +316,7 @@ static void tcp_lp_pkts_acked(struct sock *sk, const struct ack_sample *sample) static struct tcp_congestion_ops tcp_lp __read_mostly = { .init = tcp_lp_init, .ssthresh = tcp_reno_ssthresh, + .undo_cwnd = tcp_reno_undo_cwnd, .cong_avoid = tcp_lp_cong_avoid, .pkts_acked = tcp_lp_pkts_acked, diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c index bf1f3b2b29d1..d46f4d5b1c62 100644 --- a/net/ipv4/tcp_metrics.c +++ b/net/ipv4/tcp_metrics.c @@ -742,14 +742,7 @@ void tcp_fastopen_cache_set(struct sock *sk, u16 mss, rcu_read_unlock(); } -static struct genl_family tcp_metrics_nl_family = { - .id = GENL_ID_GENERATE, - .hdrsize = 0, - .name = TCP_METRICS_GENL_NAME, - .version = TCP_METRICS_GENL_VERSION, - .maxattr = TCP_METRICS_ATTR_MAX, - .netnsok = 
true, -}; +static struct genl_family tcp_metrics_nl_family; static const struct nla_policy tcp_metrics_nl_policy[TCP_METRICS_ATTR_MAX + 1] = { [TCP_METRICS_ATTR_ADDR_IPV4] = { .type = NLA_U32, }, @@ -1116,6 +1109,17 @@ static const struct genl_ops tcp_metrics_nl_ops[] = { }, }; +static struct genl_family tcp_metrics_nl_family __ro_after_init = { + .hdrsize = 0, + .name = TCP_METRICS_GENL_NAME, + .version = TCP_METRICS_GENL_VERSION, + .maxattr = TCP_METRICS_ATTR_MAX, + .netnsok = true, + .module = THIS_MODULE, + .ops = tcp_metrics_nl_ops, + .n_ops = ARRAY_SIZE(tcp_metrics_nl_ops), +}; + static unsigned int tcpmhash_entries; static int __init set_tcpmhash_entries(char *str) { @@ -1179,8 +1183,7 @@ void __init tcp_metrics_init(void) if (ret < 0) panic("Could not allocate the tcp_metrics hash table\n"); - ret = genl_register_family_with_ops(&tcp_metrics_nl_family, - tcp_metrics_nl_ops); + ret = genl_register_family(&tcp_metrics_nl_family); if (ret < 0) panic("Could not register tcp_metrics generic netlink\n"); } diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 6234ebaa7db1..28ce5ee831f5 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -532,7 +532,7 @@ struct sock *tcp_create_openreq_child(const struct sock *sk, newtp->rx_opt.ts_recent_stamp = 0; newtp->tcp_header_len = sizeof(struct tcphdr); } - newtp->tsoffset = 0; + newtp->tsoffset = treq->ts_off; #ifdef CONFIG_TCP_MD5SIG newtp->md5sig_info = NULL; /*XXX*/ if (newtp->af_specific->md5_lookup(sk, newsk)) @@ -581,6 +581,8 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, if (tmp_opt.saw_tstamp) { tmp_opt.ts_recent = req->ts_recent; + if (tmp_opt.rcv_tsecr) + tmp_opt.rcv_tsecr -= tcp_rsk(req)->ts_off; /* We do not store true stamp, but it is not required, * it can be estimated (approximately) * from another data. 
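The ts_off plumbing threaded through the hunks above and below maintains one invariant: secure_tcp_sequence_number() now derives a per-connection timestamp offset alongside the ISN, every TSval put on the wire is shifted by that offset, and every echoed TSecr is un-shifted before it feeds RTT or PAWS logic (which is exactly what the tcp_check_req() change does). A hedged sketch of the arithmetic, with hypothetical helper names that are not part of this patch:

/* Illustrative only, not code from this patch. */
static inline u32 tsval_on_wire(u32 local_ts, u32 ts_off)
{
	return local_ts + ts_off;	/* as in tcp_synack_options() below */
}

static inline u32 tsecr_to_local(u32 rcv_tsecr, u32 ts_off)
{
	return rcv_tsecr - ts_off;	/* as in tcp_check_req() above */
}

/* Round trip: with local_ts == 1000 and ts_off == 5000 we emit TSval 6000;
 * the peer echoes 6000 back, and 6000 - ts_off recovers 1000, so RTT
 * sampling is unchanged while the raw on-wire timestamps no longer expose
 * one global clock across all destinations.
 */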
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 896e9dfbdb5c..b45101f3d2bd 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -640,7 +640,7 @@ static unsigned int tcp_synack_options(struct request_sock *req, } if (likely(ireq->tstamp_ok)) { opts->options |= OPTION_TS; - opts->tsval = tcp_skb_timestamp(skb); + opts->tsval = tcp_skb_timestamp(skb) + tcp_rsk(req)->ts_off; opts->tsecr = req->ts_recent; remaining -= TCPOLEN_TSTAMP_ALIGNED; } @@ -769,25 +769,26 @@ static void tcp_tasklet_func(unsigned long data) list_del(&tp->tsq_node); sk = (struct sock *)tp; - bh_lock_sock(sk); - - if (!sock_owned_by_user(sk)) { - tcp_tsq_handler(sk); - } else { - /* defer the work to tcp_release_cb() */ - set_bit(TCP_TSQ_DEFERRED, &tp->tsq_flags); + clear_bit(TSQ_QUEUED, &sk->sk_tsq_flags); + + if (!sk->sk_lock.owned && + test_bit(TCP_TSQ_DEFERRED, &sk->sk_tsq_flags)) { + bh_lock_sock(sk); + if (!sock_owned_by_user(sk)) { + clear_bit(TCP_TSQ_DEFERRED, &sk->sk_tsq_flags); + tcp_tsq_handler(sk); + } + bh_unlock_sock(sk); } - bh_unlock_sock(sk); - clear_bit(TSQ_QUEUED, &tp->tsq_flags); sk_free(sk); } } -#define TCP_DEFERRED_ALL ((1UL << TCP_TSQ_DEFERRED) | \ - (1UL << TCP_WRITE_TIMER_DEFERRED) | \ - (1UL << TCP_DELACK_TIMER_DEFERRED) | \ - (1UL << TCP_MTU_REDUCED_DEFERRED)) +#define TCP_DEFERRED_ALL (TCPF_TSQ_DEFERRED | \ + TCPF_WRITE_TIMER_DEFERRED | \ + TCPF_DELACK_TIMER_DEFERRED | \ + TCPF_MTU_REDUCED_DEFERRED) /** * tcp_release_cb - tcp release_sock() callback * @sk: socket @@ -797,18 +798,17 @@ static void tcp_tasklet_func(unsigned long data) */ void tcp_release_cb(struct sock *sk) { - struct tcp_sock *tp = tcp_sk(sk); unsigned long flags, nflags; /* perform an atomic operation only if at least one flag is set */ do { - flags = tp->tsq_flags; + flags = sk->sk_tsq_flags; if (!(flags & TCP_DEFERRED_ALL)) return; nflags = flags & ~TCP_DEFERRED_ALL; - } while (cmpxchg(&tp->tsq_flags, flags, nflags) != flags); + } while (cmpxchg(&sk->sk_tsq_flags, flags, nflags) != flags); - if (flags & (1UL << TCP_TSQ_DEFERRED)) + if (flags & TCPF_TSQ_DEFERRED) tcp_tsq_handler(sk); /* Here begins the tricky part : @@ -822,15 +822,15 @@ void tcp_release_cb(struct sock *sk) */ sock_release_ownership(sk); - if (flags & (1UL << TCP_WRITE_TIMER_DEFERRED)) { + if (flags & TCPF_WRITE_TIMER_DEFERRED) { tcp_write_timer_handler(sk); __sock_put(sk); } - if (flags & (1UL << TCP_DELACK_TIMER_DEFERRED)) { + if (flags & TCPF_DELACK_TIMER_DEFERRED) { tcp_delack_timer_handler(sk); __sock_put(sk); } - if (flags & (1UL << TCP_MTU_REDUCED_DEFERRED)) { + if (flags & TCPF_MTU_REDUCED_DEFERRED) { inet_csk(sk)->icsk_af_ops->mtu_reduced(sk); __sock_put(sk); } @@ -860,6 +860,7 @@ void tcp_wfree(struct sk_buff *skb) { struct sock *sk = skb->sk; struct tcp_sock *tp = tcp_sk(sk); + unsigned long flags, nval, oval; int wmem; /* Keep one reference on sk_wmem_alloc. 
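The reworked tcp_release_cb() above claims all pending deferred-work bits in one atomic step: read the flag word, bail out cheaply if nothing is set, otherwise cmpxchg the cleared value back in and run the handlers for whatever was claimed. A user-space analogue of that claim loop using C11 atomics; the flag names are made up for the example:

#include <stdatomic.h>
#include <stdio.h>

#define WORK_A   (1UL << 0)
#define WORK_B   (1UL << 1)
#define WORK_ALL (WORK_A | WORK_B)

static _Atomic unsigned long deferred_flags;

static void release_cb(void)
{
	unsigned long flags = atomic_load(&deferred_flags);

	do {
		/* fast path: avoid the atomic RMW when nothing is deferred */
		if (!(flags & WORK_ALL))
			return;
		/* on CAS failure, 'flags' is refreshed with the current value */
	} while (!atomic_compare_exchange_weak(&deferred_flags, &flags,
					       flags & ~WORK_ALL));

	/* we now own every bit that was set in 'flags'; run the handlers */
	if (flags & WORK_A)
		puts("running deferred work A");
	if (flags & WORK_B)
		puts("running deferred work B");
}

int main(void)
{
	atomic_fetch_or(&deferred_flags, WORK_A | WORK_B);
	release_cb();	/* claims both bits, prints both messages */
	return 0;
}

The same shape appears in the tcp_wfree() hunk that follows, where the cmpxchg additionally sets TSQF_QUEUED and TCPF_TSQ_DEFERRED so that only the first writer queues the socket on the per-cpu tasklet list.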
@@ -877,16 +878,25 @@ void tcp_wfree(struct sk_buff *skb) if (wmem >= SKB_TRUESIZE(1) && this_cpu_ksoftirqd() == current) goto out; - if (test_and_clear_bit(TSQ_THROTTLED, &tp->tsq_flags) && - !test_and_set_bit(TSQ_QUEUED, &tp->tsq_flags)) { - unsigned long flags; + for (oval = READ_ONCE(sk->sk_tsq_flags);; oval = nval) { struct tsq_tasklet *tsq; + bool empty; + + if (!(oval & TSQF_THROTTLED) || (oval & TSQF_QUEUED)) + goto out; + + nval = (oval & ~TSQF_THROTTLED) | TSQF_QUEUED | TCPF_TSQ_DEFERRED; + nval = cmpxchg(&sk->sk_tsq_flags, oval, nval); + if (nval != oval) + continue; /* queue this socket to tasklet queue */ local_irq_save(flags); tsq = this_cpu_ptr(&tsq_tasklet); + empty = list_empty(&tsq->head); list_add(&tp->tsq_node, &tsq->head); - tasklet_schedule(&tsq->tasklet); + if (empty) + tasklet_schedule(&tsq->tasklet); local_irq_restore(flags); return; } @@ -1514,6 +1524,18 @@ static void tcp_cwnd_validate(struct sock *sk, bool is_cwnd_limited) if (sysctl_tcp_slow_start_after_idle && (s32)(tcp_time_stamp - tp->snd_cwnd_stamp) >= inet_csk(sk)->icsk_rto) tcp_cwnd_application_limited(sk); + + /* The following conditions together indicate the starvation + * is caused by insufficient sender buffer: + * 1) just sent some data (see tcp_write_xmit) + * 2) not cwnd limited (this else condition) + * 3) no more data to send (null tcp_send_head ) + * 4) application is hitting buffer limit (SOCK_NOSPACE) + */ + if (!tcp_send_head(sk) && sk->sk_socket && + test_bit(SOCK_NOSPACE, &sk->sk_socket->flags) && + (1 << sk->sk_state) & (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) + tcp_chrono_start(sk, TCP_CHRONO_SNDBUF_LIMITED); } } @@ -1910,26 +1932,26 @@ static inline void tcp_mtu_check_reprobe(struct sock *sk) */ static int tcp_mtu_probe(struct sock *sk) { - struct tcp_sock *tp = tcp_sk(sk); struct inet_connection_sock *icsk = inet_csk(sk); + struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb, *nskb, *next; struct net *net = sock_net(sk); - int len; int probe_size; int size_needed; - int copy; + int copy, len; int mss_now; int interval; /* Not currently probing/verifying, * not in recovery, * have enough cwnd, and - * not SACKing (the variable headers throw things off) */ - if (!icsk->icsk_mtup.enabled || - icsk->icsk_mtup.probe_size || - inet_csk(sk)->icsk_ca_state != TCP_CA_Open || - tp->snd_cwnd < 11 || - tp->rx_opt.num_sacks || tp->rx_opt.dsack) + * not SACKing (the variable headers throw things off) + */ + if (likely(!icsk->icsk_mtup.enabled || + icsk->icsk_mtup.probe_size || + inet_csk(sk)->icsk_ca_state != TCP_CA_Open || + tp->snd_cwnd < 11 || + tp->rx_opt.num_sacks || tp->rx_opt.dsack)) return -1; /* Use binary search for probe_size between tcp_mss_base, @@ -2069,7 +2091,16 @@ static bool tcp_small_queue_check(struct sock *sk, const struct sk_buff *skb, limit <<= factor; if (atomic_read(&sk->sk_wmem_alloc) > limit) { - set_bit(TSQ_THROTTLED, &tcp_sk(sk)->tsq_flags); + /* Always send the 1st or 2nd skb in write queue. + * No need to wait for TX completion to call us back, + * after softirq/tasklet schedule. + * This helps when TX completions are delayed too much. + */ + if (skb == sk->sk_write_queue.next || + skb->prev == sk->sk_write_queue.next) + return false; + + set_bit(TSQ_THROTTLED, &sk->sk_tsq_flags); /* It is possible TX completion already happened * before we set TSQ_THROTTLED, so we must * test again the condition. 
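The TCP_CHRONO_* calls sprinkled through these hunks (tcp_chrono_start()/tcp_chrono_stop(), with tcp_chrono_set() added in the next hunk) implement a small exclusive stopwatch: the connection is in exactly one tracked state at a time, and on every transition the elapsed jiffies are credited to the state being left. A compact model of the bookkeeping, illustrative rather than the kernel code itself:

enum chrono { CH_UNSPEC, CH_BUSY, CH_RWND_LIMITED, CH_SNDBUF_LIMITED, CH_MAX };

struct chrono_acct {
	unsigned int stat[CH_MAX - 1];	/* time accumulated per tracked state */
	unsigned int start;		/* timestamp of the last transition */
	enum chrono type;		/* state currently being timed */
};

static void chrono_set(struct chrono_acct *c, enum chrono new_type,
		       unsigned int now)
{
	if (c->type > CH_UNSPEC)	/* close out the state being left */
		c->stat[c->type - 1] += now - c->start;
	c->start = now;
	c->type = new_type;
}

Priority is encoded in the enum order: tcp_chrono_start() only switches to a higher-valued ("more interesting") state, and tcp_chrono_stop() falls back to BUSY while data is still queued, so rwnd- or sndbuf-limited time is never double counted against plain busy time.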
@@ -2081,6 +2112,47 @@ static bool tcp_small_queue_check(struct sock *sk, const struct sk_buff *skb, return false; } +static void tcp_chrono_set(struct tcp_sock *tp, const enum tcp_chrono new) +{ + const u32 now = tcp_time_stamp; + + if (tp->chrono_type > TCP_CHRONO_UNSPEC) + tp->chrono_stat[tp->chrono_type - 1] += now - tp->chrono_start; + tp->chrono_start = now; + tp->chrono_type = new; +} + +void tcp_chrono_start(struct sock *sk, const enum tcp_chrono type) +{ + struct tcp_sock *tp = tcp_sk(sk); + + /* If there are multiple conditions worthy of tracking in a + * chronograph then the highest priority enum takes precedence + * over the other conditions. So that if something "more interesting" + * starts happening, stop the previous chrono and start a new one. + */ + if (type > tp->chrono_type) + tcp_chrono_set(tp, type); +} + +void tcp_chrono_stop(struct sock *sk, const enum tcp_chrono type) +{ + struct tcp_sock *tp = tcp_sk(sk); + + + /* There are multiple conditions worthy of tracking in a + * chronograph, so that the highest priority enum takes + * precedence over the other conditions (see tcp_chrono_start). + * If a condition stops, we only stop chrono tracking if + * it's the "most interesting" or current chrono we are + * tracking and starts busy chrono if we have pending data. + */ + if (tcp_write_queue_empty(sk)) + tcp_chrono_set(tp, TCP_CHRONO_UNSPEC); + else if (type == tp->chrono_type) + tcp_chrono_set(tp, TCP_CHRONO_BUSY); +} + /* This routine writes packets to the network. It advances the * send_head. This happens as incoming acks open up the remote * window for us. @@ -2103,7 +2175,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, unsigned int tso_segs, sent_pkts; int cwnd_quota; int result; - bool is_cwnd_limited = false; + bool is_cwnd_limited = false, is_rwnd_limited = false; u32 max_segs; sent_pkts = 0; @@ -2140,8 +2212,10 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, break; } - if (unlikely(!tcp_snd_wnd_test(tp, skb, mss_now))) + if (unlikely(!tcp_snd_wnd_test(tp, skb, mss_now))) { + is_rwnd_limited = true; break; + } if (tso_segs == 1) { if (unlikely(!tcp_nagle_test(tp, skb, mss_now, @@ -2167,6 +2241,8 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, unlikely(tso_fragment(sk, skb, limit, mss_now, gfp))) break; + if (test_bit(TCP_TSQ_DEFERRED, &sk->sk_tsq_flags)) + clear_bit(TCP_TSQ_DEFERRED, &sk->sk_tsq_flags); if (tcp_small_queue_check(sk, skb, 0)) break; @@ -2186,6 +2262,11 @@ repair: break; } + if (is_rwnd_limited) + tcp_chrono_start(sk, TCP_CHRONO_RWND_LIMITED); + else + tcp_chrono_stop(sk, TCP_CHRONO_RWND_LIMITED); + if (likely(sent_pkts)) { if (tcp_in_cwnd_reduction(sk)) tp->prr_out += sent_pkts; @@ -2514,7 +2595,7 @@ void tcp_skb_collapse_tstamp(struct sk_buff *skb, } /* Collapses two adjacent SKB's during retransmission. 
*/ -static void tcp_collapse_retrans(struct sock *sk, struct sk_buff *skb) +static bool tcp_collapse_retrans(struct sock *sk, struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *next_skb = tcp_write_queue_next(sk, skb); @@ -2525,13 +2606,17 @@ static void tcp_collapse_retrans(struct sock *sk, struct sk_buff *skb) BUG_ON(tcp_skb_pcount(skb) != 1 || tcp_skb_pcount(next_skb) != 1); + if (next_skb_size) { + if (next_skb_size <= skb_availroom(skb)) + skb_copy_bits(next_skb, 0, skb_put(skb, next_skb_size), + next_skb_size); + else if (!skb_shift(skb, next_skb, next_skb_size)) + return false; + } tcp_highest_sack_combine(sk, next_skb, skb); tcp_unlink_write_queue(next_skb, sk); - skb_copy_from_linear_data(next_skb, skb_put(skb, next_skb_size), - next_skb_size); - if (next_skb->ip_summed == CHECKSUM_PARTIAL) skb->ip_summed = CHECKSUM_PARTIAL; @@ -2560,6 +2645,7 @@ static void tcp_collapse_retrans(struct sock *sk, struct sk_buff *skb) tcp_skb_collapse_tstamp(skb, next_skb); sk_wmem_free_skb(sk, next_skb); + return true; } /* Check if coalescing SKBs is legal. */ @@ -2567,14 +2653,11 @@ static bool tcp_can_collapse(const struct sock *sk, const struct sk_buff *skb) { if (tcp_skb_pcount(skb) > 1) return false; - /* TODO: SACK collapsing could be used to remove this condition */ - if (skb_shinfo(skb)->nr_frags != 0) - return false; if (skb_cloned(skb)) return false; if (skb == tcp_send_head(sk)) return false; - /* Some heurestics for collapsing over SACK'd could be invented */ + /* Some heuristics for collapsing over SACK'd could be invented */ if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED) return false; @@ -2612,16 +2695,12 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *to, if (space < 0) break; - /* Punt if not enough space exists in the first SKB for - * the data in the second - */ - if (skb->len > skb_availroom(to)) - break; if (after(TCP_SKB_CB(skb)->end_seq, tcp_wnd_end(tp))) break; - tcp_collapse_retrans(sk, to); + if (!tcp_collapse_retrans(sk, to)) + break; } } @@ -3300,6 +3379,8 @@ static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn) fo->copied = space; tcp_connect_queue_skb(sk, syn_data); + if (syn_data->len) + tcp_chrono_start(sk, TCP_CHRONO_BUSY); err = tcp_transmit_skb(sk, syn_data, 1, sk->sk_allocation); @@ -3464,8 +3545,6 @@ void tcp_send_ack(struct sock *sk) /* We do not want pure acks influencing TCP Small Queues or fq/pacing * too much. * SKB_TRUESIZE(max(1 .. 
66, MAX_TCP_HEADER)) is unfortunately ~784 - * We also avoid tcp_wfree() overhead (cache line miss accessing - * tp->tsq_flags) by using regular sock_wfree() */ skb_set_tcp_pure_ack(buff); diff --git a/net/ipv4/tcp_scalable.c b/net/ipv4/tcp_scalable.c index bf5ea9e9bbc1..f2123075ce6e 100644 --- a/net/ipv4/tcp_scalable.c +++ b/net/ipv4/tcp_scalable.c @@ -15,6 +15,10 @@ #define TCP_SCALABLE_AI_CNT 50U #define TCP_SCALABLE_MD_SCALE 3 +struct scalable { + u32 loss_cwnd; +}; + static void tcp_scalable_cong_avoid(struct sock *sk, u32 ack, u32 acked) { struct tcp_sock *tp = tcp_sk(sk); @@ -32,12 +36,23 @@ static void tcp_scalable_cong_avoid(struct sock *sk, u32 ack, u32 acked) static u32 tcp_scalable_ssthresh(struct sock *sk) { const struct tcp_sock *tp = tcp_sk(sk); + struct scalable *ca = inet_csk_ca(sk); + + ca->loss_cwnd = tp->snd_cwnd; return max(tp->snd_cwnd - (tp->snd_cwnd>>TCP_SCALABLE_MD_SCALE), 2U); } +static u32 tcp_scalable_cwnd_undo(struct sock *sk) +{ + const struct scalable *ca = inet_csk_ca(sk); + + return max(tcp_sk(sk)->snd_cwnd, ca->loss_cwnd); +} + static struct tcp_congestion_ops tcp_scalable __read_mostly = { .ssthresh = tcp_scalable_ssthresh, + .undo_cwnd = tcp_scalable_cwnd_undo, .cong_avoid = tcp_scalable_cong_avoid, .owner = THIS_MODULE, diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 3ea1cf804748..3705075f42c3 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -310,7 +310,7 @@ static void tcp_delack_timer(unsigned long data) inet_csk(sk)->icsk_ack.blocked = 1; __NET_INC_STATS(sock_net(sk), LINUX_MIB_DELAYEDACKLOCKED); /* delegate our work to tcp_release_cb() */ - if (!test_and_set_bit(TCP_DELACK_TIMER_DEFERRED, &tcp_sk(sk)->tsq_flags)) + if (!test_and_set_bit(TCP_DELACK_TIMER_DEFERRED, &sk->sk_tsq_flags)) sock_hold(sk); } bh_unlock_sock(sk); @@ -592,7 +592,7 @@ static void tcp_write_timer(unsigned long data) tcp_write_timer_handler(sk); } else { /* delegate our work to tcp_release_cb() */ - if (!test_and_set_bit(TCP_WRITE_TIMER_DEFERRED, &tcp_sk(sk)->tsq_flags)) + if (!test_and_set_bit(TCP_WRITE_TIMER_DEFERRED, &sk->sk_tsq_flags)) sock_hold(sk); } bh_unlock_sock(sk); diff --git a/net/ipv4/tcp_vegas.c b/net/ipv4/tcp_vegas.c index 4c4bac1b5eab..218cfcc77650 100644 --- a/net/ipv4/tcp_vegas.c +++ b/net/ipv4/tcp_vegas.c @@ -307,6 +307,7 @@ EXPORT_SYMBOL_GPL(tcp_vegas_get_info); static struct tcp_congestion_ops tcp_vegas __read_mostly = { .init = tcp_vegas_init, .ssthresh = tcp_reno_ssthresh, + .undo_cwnd = tcp_reno_undo_cwnd, .cong_avoid = tcp_vegas_cong_avoid, .pkts_acked = tcp_vegas_pkts_acked, .set_state = tcp_vegas_state, diff --git a/net/ipv4/tcp_veno.c b/net/ipv4/tcp_veno.c index 40171e163cff..76005d4b8dfc 100644 --- a/net/ipv4/tcp_veno.c +++ b/net/ipv4/tcp_veno.c @@ -30,6 +30,7 @@ struct veno { u32 basertt; /* the min of all Veno rtt measurements seen (in usec) */ u32 inc; /* decide whether to increase cwnd */ u32 diff; /* calculate the diff rate */ + u32 loss_cwnd; /* cwnd when loss occurred */ }; /* There are several situations when we must "re-start" Veno: @@ -193,6 +194,7 @@ static u32 tcp_veno_ssthresh(struct sock *sk) const struct tcp_sock *tp = tcp_sk(sk); struct veno *veno = inet_csk_ca(sk); + veno->loss_cwnd = tp->snd_cwnd; if (veno->diff < beta) /* in "non-congestive state", cut cwnd by 1/5 */ return max(tp->snd_cwnd * 4 / 5, 2U); @@ -201,9 +203,17 @@ static u32 tcp_veno_ssthresh(struct sock *sk) return max(tp->snd_cwnd >> 1U, 2U); } +static u32 tcp_veno_cwnd_undo(struct sock *sk) +{ + const struct veno *veno = inet_csk_ca(sk); + 
+ return max(tcp_sk(sk)->snd_cwnd, veno->loss_cwnd); +} + static struct tcp_congestion_ops tcp_veno __read_mostly = { .init = tcp_veno_init, .ssthresh = tcp_veno_ssthresh, + .undo_cwnd = tcp_veno_cwnd_undo, .cong_avoid = tcp_veno_cong_avoid, .pkts_acked = tcp_veno_pkts_acked, .set_state = tcp_veno_state, diff --git a/net/ipv4/tcp_westwood.c b/net/ipv4/tcp_westwood.c index 4b03a2e2a050..fed66dc0e0f5 100644 --- a/net/ipv4/tcp_westwood.c +++ b/net/ipv4/tcp_westwood.c @@ -278,6 +278,7 @@ static struct tcp_congestion_ops tcp_westwood __read_mostly = { .init = tcp_westwood_init, .ssthresh = tcp_reno_ssthresh, .cong_avoid = tcp_reno_cong_avoid, + .undo_cwnd = tcp_reno_undo_cwnd, .cwnd_event = tcp_westwood_event, .in_ack_event = tcp_westwood_ack, .get_info = tcp_westwood_info, diff --git a/net/ipv4/tcp_yeah.c b/net/ipv4/tcp_yeah.c index 9c5fc973267f..e6ff99c4bd3b 100644 --- a/net/ipv4/tcp_yeah.c +++ b/net/ipv4/tcp_yeah.c @@ -37,6 +37,7 @@ struct yeah { u32 fast_count; u32 pkts_acked; + u32 loss_cwnd; }; static void tcp_yeah_init(struct sock *sk) @@ -219,13 +220,22 @@ static u32 tcp_yeah_ssthresh(struct sock *sk) yeah->fast_count = 0; yeah->reno_count = max(yeah->reno_count>>1, 2U); + yeah->loss_cwnd = tp->snd_cwnd; return max_t(int, tp->snd_cwnd - reduction, 2); } +static u32 tcp_yeah_cwnd_undo(struct sock *sk) +{ + const struct yeah *yeah = inet_csk_ca(sk); + + return max(tcp_sk(sk)->snd_cwnd, yeah->loss_cwnd); +} + static struct tcp_congestion_ops tcp_yeah __read_mostly = { .init = tcp_yeah_init, .ssthresh = tcp_yeah_ssthresh, + .undo_cwnd = tcp_yeah_cwnd_undo, .cong_avoid = tcp_yeah_cong_avoid, .set_state = tcp_vegas_state, .cwnd_event = tcp_vegas_cwnd_event, diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 7d96dc2d3d08..9ca279b130d5 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -580,7 +580,8 @@ EXPORT_SYMBOL_GPL(udp4_lib_lookup_skb); * Does increment socket refcount. */ #if IS_ENABLED(CONFIG_NETFILTER_XT_MATCH_SOCKET) || \ - IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TPROXY) + IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TPROXY) || \ + IS_ENABLED(CONFIG_NF_SOCKET_IPV4) struct sock *udp4_lib_lookup(struct net *net, __be32 saddr, __be16 sport, __be32 daddr, __be16 dport, int dif) { @@ -1019,7 +1020,8 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) flowi4_init_output(fl4, ipc.oif, sk->sk_mark, tos, RT_SCOPE_UNIVERSE, sk->sk_protocol, flow_flags, - faddr, saddr, dport, inet->inet_sport); + faddr, saddr, dport, inet->inet_sport, + sk->sk_uid); security_sk_classify_flow(sk, flowi4_to_flowi(fl4)); rt = ip_route_output_flow(net, fl4, sk); @@ -1172,6 +1174,181 @@ out: return ret; } +/* fully reclaim rmem/fwd memory allocated for skb */ +static void udp_rmem_release(struct sock *sk, int size, int partial) +{ + struct udp_sock *up = udp_sk(sk); + int amt; + + if (likely(partial)) { + up->forward_deficit += size; + size = up->forward_deficit; + if (size < (sk->sk_rcvbuf >> 2) && + !skb_queue_empty(&sk->sk_receive_queue)) + return; + } else { + size += up->forward_deficit; + } + up->forward_deficit = 0; + + sk->sk_forward_alloc += size; + amt = (sk->sk_forward_alloc - partial) & ~(SK_MEM_QUANTUM - 1); + sk->sk_forward_alloc -= amt; + + if (amt) + __sk_mem_reduce_allocated(sk, amt >> SK_MEM_QUANTUM_SHIFT); + + atomic_sub(size, &sk->sk_rmem_alloc); +} + +/* Note: called with sk_receive_queue.lock held. + * Instead of using skb->truesize here, find a copy of it in skb->dev_scratch + * This avoids a cache line miss while receive_queue lock is held. 
+ * Look at __udp_enqueue_schedule_skb() to find where this copy is done. + */ +void udp_skb_destructor(struct sock *sk, struct sk_buff *skb) +{ + udp_rmem_release(sk, skb->dev_scratch, 1); +} +EXPORT_SYMBOL(udp_skb_destructor); + +/* Idea of busylocks is to let producers grab an extra spinlock + * to relieve pressure on the receive_queue spinlock shared by consumer. + * Under flood, this means that only one producer can be in line + * trying to acquire the receive_queue spinlock. + * These busylock can be allocated on a per cpu manner, instead of a + * per socket one (that would consume a cache line per socket) + */ +static int udp_busylocks_log __read_mostly; +static spinlock_t *udp_busylocks __read_mostly; + +static spinlock_t *busylock_acquire(void *ptr) +{ + spinlock_t *busy; + + busy = udp_busylocks + hash_ptr(ptr, udp_busylocks_log); + spin_lock(busy); + return busy; +} + +static void busylock_release(spinlock_t *busy) +{ + if (busy) + spin_unlock(busy); +} + +int __udp_enqueue_schedule_skb(struct sock *sk, struct sk_buff *skb) +{ + struct sk_buff_head *list = &sk->sk_receive_queue; + int rmem, delta, amt, err = -ENOMEM; + spinlock_t *busy = NULL; + int size; + + /* try to avoid the costly atomic add/sub pair when the receive + * queue is full; always allow at least a packet + */ + rmem = atomic_read(&sk->sk_rmem_alloc); + if (rmem > sk->sk_rcvbuf) + goto drop; + + /* Under mem pressure, it might be helpful to help udp_recvmsg() + * having linear skbs : + * - Reduce memory overhead and thus increase receive queue capacity + * - Less cache line misses at copyout() time + * - Less work at consume_skb() (less alien page frag freeing) + */ + if (rmem > (sk->sk_rcvbuf >> 1)) { + skb_condense(skb); + + busy = busylock_acquire(sk); + } + size = skb->truesize; + /* Copy skb->truesize into skb->dev_scratch to avoid a cache line miss + * in udp_skb_destructor() + */ + skb->dev_scratch = size; + + /* we drop only if the receive buf is full and the receive + * queue contains some other skb + */ + rmem = atomic_add_return(size, &sk->sk_rmem_alloc); + if (rmem > (size + sk->sk_rcvbuf)) + goto uncharge_drop; + + spin_lock(&list->lock); + if (size >= sk->sk_forward_alloc) { + amt = sk_mem_pages(size); + delta = amt << SK_MEM_QUANTUM_SHIFT; + if (!__sk_mem_raise_allocated(sk, delta, amt, SK_MEM_RECV)) { + err = -ENOBUFS; + spin_unlock(&list->lock); + goto uncharge_drop; + } + + sk->sk_forward_alloc += delta; + } + + sk->sk_forward_alloc -= size; + + /* no need to setup a destructor, we will explicitly release the + * forward allocated memory on dequeue + */ + sock_skb_set_dropcount(sk, skb); + + __skb_queue_tail(list, skb); + spin_unlock(&list->lock); + + if (!sock_flag(sk, SOCK_DEAD)) + sk->sk_data_ready(sk); + + busylock_release(busy); + return 0; + +uncharge_drop: + atomic_sub(skb->truesize, &sk->sk_rmem_alloc); + +drop: + atomic_inc(&sk->sk_drops); + busylock_release(busy); + return err; +} +EXPORT_SYMBOL_GPL(__udp_enqueue_schedule_skb); + +void udp_destruct_sock(struct sock *sk) +{ + /* reclaim completely the forward allocated memory */ + unsigned int total = 0; + struct sk_buff *skb; + + while ((skb = __skb_dequeue(&sk->sk_receive_queue)) != NULL) { + total += skb->truesize; + kfree_skb(skb); + } + udp_rmem_release(sk, total, 0); + + inet_sock_destruct(sk); +} +EXPORT_SYMBOL_GPL(udp_destruct_sock); + +int udp_init_sock(struct sock *sk) +{ + sk->sk_destruct = udp_destruct_sock; + return 0; +} +EXPORT_SYMBOL_GPL(udp_init_sock); + +void skb_consume_udp(struct sock *sk, struct sk_buff *skb, int 
len) +{ + if (unlikely(READ_ONCE(sk->sk_peek_off) >= 0)) { + bool slow = lock_sock_fast(sk); + + sk_peek_offset_bwd(sk, len); + unlock_sock_fast(sk, slow); + } + consume_skb(skb); +} +EXPORT_SYMBOL_GPL(skb_consume_udp); + /** * first_packet_length - return length of first packet in receive queue * @sk: socket @@ -1181,12 +1358,11 @@ out: */ static int first_packet_length(struct sock *sk) { - struct sk_buff_head list_kill, *rcvq = &sk->sk_receive_queue; + struct sk_buff_head *rcvq = &sk->sk_receive_queue; struct sk_buff *skb; + int total = 0; int res; - __skb_queue_head_init(&list_kill); - spin_lock_bh(&rcvq->lock); while ((skb = skb_peek(rcvq)) != NULL && udp_lib_checksum_complete(skb)) { @@ -1196,18 +1372,13 @@ static int first_packet_length(struct sock *sk) IS_UDPLITE(sk)); atomic_inc(&sk->sk_drops); __skb_unlink(skb, rcvq); - __skb_queue_tail(&list_kill, skb); + total += skb->truesize; + kfree_skb(skb); } res = skb ? skb->len : -1; + if (total) + udp_rmem_release(sk, total, 1); spin_unlock_bh(&rcvq->lock); - - if (!skb_queue_empty(&list_kill)) { - bool slow = lock_sock_fast(sk); - - __skb_queue_purge(&list_kill); - sk_mem_reclaim_partial(sk); - unlock_sock_fast(sk, slow); - } return res; } @@ -1256,15 +1427,13 @@ int udp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int noblock, int err; int is_udplite = IS_UDPLITE(sk); bool checksum_valid = false; - bool slow; if (flags & MSG_ERRQUEUE) return ip_recv_error(sk, msg, len, addr_len); try_again: peeking = off = sk_peek_offset(sk, flags); - skb = __skb_recv_datagram(sk, flags | (noblock ? MSG_DONTWAIT : 0), - &peeked, &off, &err); + skb = __skb_recv_udp(sk, flags, noblock, &peeked, &off, &err); if (!skb) return err; @@ -1281,7 +1450,8 @@ try_again: * coverage checksum (UDP-Lite), do it before the copy. */ - if (copied < ulen || UDP_SKB_CB(skb)->partial_cov || peeking) { + if (copied < ulen || peeking || + (is_udplite && UDP_SKB_CB(skb)->partial_cov)) { checksum_valid = !udp_lib_checksum_complete(skb); if (!checksum_valid) goto csum_copy_err; @@ -1297,13 +1467,12 @@ try_again: } if (unlikely(err)) { - trace_kfree_skb(skb, udp_recvmsg); if (!peeked) { atomic_inc(&sk->sk_drops); UDP_INC_STATS(sock_net(sk), UDP_MIB_INERRORS, is_udplite); } - skb_free_datagram_locked(sk, skb); + kfree_skb(skb); return err; } @@ -1322,22 +1491,21 @@ try_again: *addr_len = sizeof(*sin); } if (inet->cmsg_flags) - ip_cmsg_recv_offset(msg, skb, sizeof(struct udphdr) + off); + ip_cmsg_recv_offset(msg, sk, skb, sizeof(struct udphdr), off); err = copied; if (flags & MSG_TRUNC) err = ulen; - __skb_free_datagram_locked(sk, skb, peeking ? -err : err); + skb_consume_udp(sk, skb, peeking ? 
-err : err); return err; csum_copy_err: - slow = lock_sock_fast(sk); - if (!skb_kill_datagram(sk, skb, flags)) { + if (!__sk_queue_drop_skb(sk, skb, flags)) { UDP_INC_STATS(sock_net(sk), UDP_MIB_CSUMERRORS, is_udplite); UDP_INC_STATS(sock_net(sk), UDP_MIB_INERRORS, is_udplite); } - unlock_sock_fast(sk, slow); + kfree_skb(skb); /* starting over for a new packet, but check if we need to yield */ cond_resched(); @@ -1345,7 +1513,7 @@ csum_copy_err: goto try_again; } -int udp_disconnect(struct sock *sk, int flags) +int __udp_disconnect(struct sock *sk, int flags) { struct inet_sock *inet = inet_sk(sk); /* @@ -1367,6 +1535,15 @@ int udp_disconnect(struct sock *sk, int flags) sk_dst_reset(sk); return 0; } +EXPORT_SYMBOL(__udp_disconnect); + +int udp_disconnect(struct sock *sk, int flags) +{ + lock_sock(sk); + __udp_disconnect(sk, flags); + release_sock(sk); + return 0; +} EXPORT_SYMBOL(udp_disconnect); void udp_lib_unhash(struct sock *sk) @@ -1446,7 +1623,7 @@ static void udp_v4_rehash(struct sock *sk) udp_lib_rehash(sk, new_hash); } -static int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) +int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) { int rc; @@ -1454,9 +1631,11 @@ static int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) sock_rps_save_rxhash(sk, skb); sk_mark_napi_id(sk, skb); sk_incoming_cpu_update(sk); + } else { + sk_mark_napi_id_once(sk, skb); } - rc = __sock_queue_rcv_skb(sk, skb); + rc = __udp_enqueue_schedule_skb(sk, skb); if (rc < 0) { int is_udplite = IS_UDPLITE(sk); @@ -1471,7 +1650,6 @@ static int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) } return 0; - } static struct static_key udp_encap_needed __read_mostly; @@ -1493,7 +1671,6 @@ EXPORT_SYMBOL(udp_encap_enable); int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) { struct udp_sock *up = udp_sk(sk); - int rc; int is_udplite = IS_UDPLITE(sk); /* @@ -1580,25 +1757,9 @@ int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) goto drop; udp_csum_pull_header(skb); - if (sk_rcvqueues_full(sk, sk->sk_rcvbuf)) { - __UDP_INC_STATS(sock_net(sk), UDP_MIB_RCVBUFERRORS, - is_udplite); - goto drop; - } - - rc = 0; ipv4_pktinfo_prepare(sk, skb); - bh_lock_sock(sk); - if (!sock_owned_by_user(sk)) - rc = __udp_queue_rcv_skb(sk, skb); - else if (sk_add_backlog(sk, skb, sk->sk_rcvbuf)) { - bh_unlock_sock(sk); - goto drop; - } - bh_unlock_sock(sk); - - return rc; + return __udp_queue_rcv_skb(sk, skb); csum_error: __UDP_INC_STATS(sock_net(sk), UDP_MIB_CSUMERRORS, is_udplite); @@ -1643,10 +1804,10 @@ static int __udp4_lib_mcast_deliver(struct net *net, struct sk_buff *skb, if (use_hash2) { hash2_any = udp4_portaddr_hash(net, htonl(INADDR_ANY), hnum) & - udp_table.mask; - hash2 = udp4_portaddr_hash(net, daddr, hnum) & udp_table.mask; + udptable->mask; + hash2 = udp4_portaddr_hash(net, daddr, hnum) & udptable->mask; start_lookup: - hslot = &udp_table.hash2[hash2]; + hslot = &udptable->hash2[hash2]; offset = offsetof(typeof(*sk), __sk_common.skc_portaddr_node); } @@ -2193,7 +2354,7 @@ int udp_abort(struct sock *sk, int err) sk->sk_err = err; sk->sk_error_report(sk); - udp_disconnect(sk, 0); + __udp_disconnect(sk, 0); release_sock(sk); @@ -2208,13 +2369,13 @@ struct proto udp_prot = { .connect = ip4_datagram_connect, .disconnect = udp_disconnect, .ioctl = udp_ioctl, + .init = udp_init_sock, .destroy = udp_destroy_sock, .setsockopt = udp_setsockopt, .getsockopt = udp_getsockopt, .sendmsg = udp_sendmsg, .recvmsg = udp_recvmsg, .sendpage = udp_sendpage, - .backlog_rcv = __udp_queue_rcv_skb, 
.release_cb = ip4_datagram_release_cb, .hash = udp_lib_hash, .unhash = udp_lib_unhash, @@ -2503,6 +2664,7 @@ EXPORT_SYMBOL(udp_flow_hashrnd); void __init udp_init(void) { unsigned long limit; + unsigned int i; udp_table_init(&udp_table, "UDP"); limit = nr_free_buffer_pages() / 8; @@ -2513,4 +2675,13 @@ void __init udp_init(void) sysctl_udp_rmem_min = SK_MEM_QUANTUM; sysctl_udp_wmem_min = SK_MEM_QUANTUM; + + /* 16 spinlocks per cpu */ + udp_busylocks_log = ilog2(nr_cpu_ids) + 4; + udp_busylocks = kmalloc(sizeof(spinlock_t) << udp_busylocks_log, + GFP_KERNEL); + if (!udp_busylocks) + panic("UDP: failed to alloc udp_busylocks\n"); + for (i = 0; i < (1U << udp_busylocks_log); i++) + spin_lock_init(udp_busylocks + i); } diff --git a/net/ipv4/udp_impl.h b/net/ipv4/udp_impl.h index 7e0fe4bdd967..feb50a16398d 100644 --- a/net/ipv4/udp_impl.h +++ b/net/ipv4/udp_impl.h @@ -25,7 +25,7 @@ int udp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int noblock, int flags, int *addr_len); int udp_sendpage(struct sock *sk, struct page *page, int offset, size_t size, int flags); -int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb); +int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb); void udp_destroy_sock(struct sock *sk); #ifdef CONFIG_PROC_FS diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c index f9333c963607..b2be1d9757ef 100644 --- a/net/ipv4/udp_offload.c +++ b/net/ipv4/udp_offload.c @@ -295,7 +295,7 @@ unflush: skb_gro_pull(skb, sizeof(struct udphdr)); /* pull encapsulating udp header */ skb_gro_postpull_rcsum(skb, uh, sizeof(struct udphdr)); - pp = udp_sk(sk)->gro_receive(sk, head, skb); + pp = call_gro_receive_sk(udp_sk(sk)->gro_receive, sk, head, skb); out_unlock: rcu_read_unlock(); diff --git a/net/ipv4/udplite.c b/net/ipv4/udplite.c index af817158d830..59f10fe9782e 100644 --- a/net/ipv4/udplite.c +++ b/net/ipv4/udplite.c @@ -50,10 +50,11 @@ struct proto udplite_prot = { .sendmsg = udp_sendmsg, .recvmsg = udp_recvmsg, .sendpage = udp_sendpage, - .backlog_rcv = udp_queue_rcv_skb, .hash = udp_lib_hash, .unhash = udp_lib_unhash, .get_port = udp_v4_get_port, + .memory_allocated = &udp_memory_allocated, + .sysctl_mem = sysctl_udp_mem, .obj_size = sizeof(struct udp_sock), .h.udp_table = &udplite_table, #ifdef CONFIG_COMPAT |
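One design note on the UDP receive-path rework above: under a flood, producers on many cpus all target one socket's receive-queue spinlock. The busylock scheme takes a second, hash-selected lock first, so at most one producer per socket is ever spinning on the queue lock itself, while traffic to different sockets usually hashes to different busylocks. A user-space analogue follows, with an ordinary multiplicative pointer hash standing in for the kernel's hash_ptr(); it is a sketch, not kernel code:

#include <pthread.h>
#include <stdint.h>

#define BUSYLOCKS_LOG 6			/* 64 locks; the kernel sizes this from nr_cpu_ids */
#define BUSYLOCKS_NR  (1U << BUSYLOCKS_LOG)

static pthread_mutex_t busylocks[BUSYLOCKS_NR];

static void busylocks_init(void)
{
	for (unsigned int i = 0; i < BUSYLOCKS_NR; i++)
		pthread_mutex_init(&busylocks[i], NULL);
}

static pthread_mutex_t *busylock_acquire(const void *ptr)
{
	/* spread sockets across the lock array by hashing their address */
	uint64_t h = (uint64_t)(uintptr_t)ptr * 0x9E3779B97F4A7C15ULL;
	pthread_mutex_t *busy = &busylocks[h >> (64 - BUSYLOCKS_LOG)];

	pthread_mutex_lock(busy);
	return busy;
}

static void busylock_release(pthread_mutex_t *busy)
{
	if (busy)
		pthread_mutex_unlock(busy);
}

Because the locks are shared per hash bucket rather than allocated per socket, the scheme costs one cache line per lock for the whole system instead of one per socket, at the price of occasional false sharing between unrelated sockets.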