diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2019-07-11 20:55:49 +0300 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2019-07-11 20:55:49 +0300 |
commit | 237f83dfbe668443b5e31c3c7576125871cca674 (patch) | |
tree | 11848a8d0aa414a1d3ce2024e181071b1d9dea08 /net/ipv6 | |
parent | 8f6ccf6159aed1f04c6d179f61f6fb2691261e84 (diff) | |
parent | 1ff2f0fa450ea4e4f87793d9ed513098ec6e12be (diff) | |
download | linux-237f83dfbe668443b5e31c3c7576125871cca674.tar.xz |
Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next
Pull networking updates from David Miller:
"Some highlights from this development cycle:
1) Big refactoring of ipv6 route and neigh handling to support
nexthop objects configurable as units from userspace. From David
Ahern.
2) Convert explored_states in BPF verifier into a hash table,
significantly decreased state held for programs with bpf2bpf
calls, from Alexei Starovoitov.
3) Implement bpf_send_signal() helper, from Yonghong Song.
4) Various classifier enhancements to mvpp2 driver, from Maxime
Chevallier.
5) Add aRFS support to hns3 driver, from Jian Shen.
6) Fix use after free in inet frags by allocating fqdirs dynamically
and reworking how rhashtable dismantle occurs, from Eric Dumazet.
7) Add act_ctinfo packet classifier action, from Kevin
Darbyshire-Bryant.
8) Add TFO key backup infrastructure, from Jason Baron.
9) Remove several old and unused ISDN drivers, from Arnd Bergmann.
10) Add devlink notifications for flash update status to mlxsw driver,
from Jiri Pirko.
11) Lots of kTLS offload infrastructure fixes, from Jakub Kicinski.
12) Add support for mv88e6250 DSA chips, from Rasmus Villemoes.
13) Various enhancements to ipv6 flow label handling, from Eric
Dumazet and Willem de Bruijn.
14) Support TLS offload in nfp driver, from Jakub Kicinski, Dirk van
der Merwe, and others.
15) Various improvements to axienet driver including converting it to
phylink, from Robert Hancock.
16) Add PTP support to sja1105 DSA driver, from Vladimir Oltean.
17) Add mqprio qdisc offload support to dpaa2-eth, from Ioana
Radulescu.
18) Add devlink health reporting to mlx5, from Moshe Shemesh.
19) Convert stmmac over to phylink, from Jose Abreu.
20) Add PTP PHC (Physical Hardware Clock) support to mlxsw, from
Shalom Toledo.
21) Add nftables SYNPROXY support, from Fernando Fernandez Mancera.
22) Convert tcp_fastopen over to use SipHash, from Ard Biesheuvel.
23) Track spill/fill of constants in BPF verifier, from Alexei
Starovoitov.
24) Support bounded loops in BPF, from Alexei Starovoitov.
25) Various page_pool API fixes and improvements, from Jesper Dangaard
Brouer.
26) Just like ipv4, support ref-countless ipv6 route handling. From
Wei Wang.
27) Support VLAN offloading in aquantia driver, from Igor Russkikh.
28) Add AF_XDP zero-copy support to mlx5, from Maxim Mikityanskiy.
29) Add flower GRE encap/decap support to nfp driver, from Pieter
Jansen van Vuuren.
30) Protect against stack overflow when using act_mirred, from John
Hurley.
31) Allow devmap map lookups from eBPF, from Toke Høiland-Jørgensen.
32) Use page_pool API in netsec driver, Ilias Apalodimas.
33) Add Google gve network driver, from Catherine Sullivan.
34) More indirect call avoidance, from Paolo Abeni.
35) Add kTLS TX HW offload support to mlx5, from Tariq Toukan.
36) Add XDP_REDIRECT support to bnxt_en, from Andy Gospodarek.
37) Add MPLS manipulation actions to TC, from John Hurley.
38) Add sending a packet to connection tracking from TC actions, and
then allow flower classifier matching on conntrack state. From
Paul Blakey.
39) Netfilter hw offload support, from Pablo Neira Ayuso"
* git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next: (2080 commits)
net/mlx5e: Return in default case statement in tx_post_resync_params
mlx5: Return -EINVAL when WARN_ON_ONCE triggers in mlx5e_tls_resync().
net: dsa: add support for BRIDGE_MROUTER attribute
pkt_sched: Include const.h
net: netsec: remove static declaration for netsec_set_tx_de()
net: netsec: remove superfluous if statement
netfilter: nf_tables: add hardware offload support
net: flow_offload: rename tc_cls_flower_offload to flow_cls_offload
net: flow_offload: add flow_block_cb_is_busy() and use it
net: sched: remove tcf block API
drivers: net: use flow block API
net: sched: use flow block API
net: flow_offload: add flow_block_cb_{priv, incref, decref}()
net: flow_offload: add list handling functions
net: flow_offload: add flow_block_cb_alloc() and flow_block_cb_free()
net: flow_offload: rename TCF_BLOCK_BINDER_TYPE_* to FLOW_BLOCK_BINDER_TYPE_*
net: flow_offload: rename TC_BLOCK_{UN}BIND to FLOW_BLOCK_{UN}BIND
net: flow_offload: add flow_block_cb_setup_simple()
net: hisilicon: Add an tx_desc to adapt HI13X1_GMAC
net: hisilicon: Add an rx_desc to adapt HI13X1_GMAC
...
Diffstat (limited to 'net/ipv6')
-rw-r--r-- | net/ipv6/addrconf.c | 19 | ||||
-rw-r--r-- | net/ipv6/addrconf_core.c | 6 | ||||
-rw-r--r-- | net/ipv6/af_inet6.c | 46 | ||||
-rw-r--r-- | net/ipv6/ah6.c | 4 | ||||
-rw-r--r-- | net/ipv6/esp6.c | 23 | ||||
-rw-r--r-- | net/ipv6/esp6_offload.c | 4 | ||||
-rw-r--r-- | net/ipv6/fib6_rules.c | 12 | ||||
-rw-r--r-- | net/ipv6/icmp.c | 7 | ||||
-rw-r--r-- | net/ipv6/inet6_hashtables.c | 2 | ||||
-rw-r--r-- | net/ipv6/ip6_fib.c | 214 | ||||
-rw-r--r-- | net/ipv6/ip6_flowlabel.c | 27 | ||||
-rw-r--r-- | net/ipv6/ip6_output.c | 340 | ||||
-rw-r--r-- | net/ipv6/ipcomp6.c | 3 | ||||
-rw-r--r-- | net/ipv6/mip6.c | 6 | ||||
-rw-r--r-- | net/ipv6/ndisc.c | 11 | ||||
-rw-r--r-- | net/ipv6/netfilter.c | 129 | ||||
-rw-r--r-- | net/ipv6/netfilter/ip6t_SYNPROXY.c | 420 | ||||
-rw-r--r-- | net/ipv6/netfilter/ip6table_raw.c | 2 | ||||
-rw-r--r-- | net/ipv6/netfilter/nf_conntrack_reasm.c | 53 | ||||
-rw-r--r-- | net/ipv6/proc.c | 4 | ||||
-rw-r--r-- | net/ipv6/raw.c | 4 | ||||
-rw-r--r-- | net/ipv6/reassembly.c | 52 | ||||
-rw-r--r-- | net/ipv6/route.c | 1471 | ||||
-rw-r--r-- | net/ipv6/sysctl_net_ipv6.c | 5 | ||||
-rw-r--r-- | net/ipv6/tcp_ipv6.c | 31 | ||||
-rw-r--r-- | net/ipv6/udp.c | 33 | ||||
-rw-r--r-- | net/ipv6/xfrm6_state.c | 137 |
27 files changed, 1830 insertions, 1235 deletions
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 081bb517e40d..521e3203e83a 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -2417,9 +2417,13 @@ static struct fib6_info *addrconf_get_prefix_route(const struct in6_addr *pfx, goto out; for_each_fib6_node_rt_rcu(fn) { - if (rt->fib6_nh.fib_nh_dev->ifindex != dev->ifindex) + /* prefix routes only use builtin fib6_nh */ + if (rt->nh) continue; - if (no_gw && rt->fib6_nh.fib_nh_gw_family) + + if (rt->fib6_nh->fib_nh_dev->ifindex != dev->ifindex) + continue; + if (no_gw && rt->fib6_nh->fib_nh_gw_family) continue; if ((rt->fib6_flags & flags) != flags) continue; @@ -3123,11 +3127,9 @@ static void sit_add_v4_addrs(struct inet6_dev *idev) struct in_device *in_dev = __in_dev_get_rtnl(dev); if (in_dev && (dev->flags & IFF_UP)) { struct in_ifaddr *ifa; - int flag = scope; - for (ifa = in_dev->ifa_list; ifa; ifa = ifa->ifa_next) { - + in_dev_for_each_ifa_rtnl(ifa, in_dev) { addr.s6_addr32[3] = ifa->ifa_local; if (ifa->ifa_scope == RT_SCOPE_LINK) @@ -6350,16 +6352,17 @@ void addrconf_disable_policy_idev(struct inet6_dev *idev, int val) list_for_each_entry(ifa, &idev->addr_list, if_list) { spin_lock(&ifa->lock); if (ifa->rt) { - struct fib6_info *rt = ifa->rt; + /* host routes only use builtin fib6_nh */ + struct fib6_nh *nh = ifa->rt->fib6_nh; int cpu; rcu_read_lock(); ifa->rt->dst_nopolicy = val ? true : false; - if (rt->rt6i_pcpu) { + if (nh->rt6i_pcpu) { for_each_possible_cpu(cpu) { struct rt6_info **rtp; - rtp = per_cpu_ptr(rt->rt6i_pcpu, cpu); + rtp = per_cpu_ptr(nh->rt6i_pcpu, cpu); addrconf_set_nopolicy(*rtp, val); } } diff --git a/net/ipv6/addrconf_core.c b/net/ipv6/addrconf_core.c index 5b1246635e02..783f3c1466da 100644 --- a/net/ipv6/addrconf_core.c +++ b/net/ipv6/addrconf_core.c @@ -183,6 +183,11 @@ static int eafnosupport_fib6_nh_init(struct net *net, struct fib6_nh *fib6_nh, return -EAFNOSUPPORT; } +static int eafnosupport_ip6_del_rt(struct net *net, struct fib6_info *rt) +{ + return -EAFNOSUPPORT; +} + const struct ipv6_stub *ipv6_stub __read_mostly = &(struct ipv6_stub) { .ipv6_dst_lookup = eafnosupport_ipv6_dst_lookup, .ipv6_route_input = eafnosupport_ipv6_route_input, @@ -192,6 +197,7 @@ const struct ipv6_stub *ipv6_stub __read_mostly = &(struct ipv6_stub) { .fib6_select_path = eafnosupport_fib6_select_path, .ip6_mtu_from_fib6 = eafnosupport_ip6_mtu_from_fib6, .fib6_nh_init = eafnosupport_fib6_nh_init, + .ip6_del_rt = eafnosupport_ip6_del_rt, }; EXPORT_SYMBOL_GPL(ipv6_stub); diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index 5352708b7b2d..ef37e0574f54 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -208,7 +208,7 @@ lookup_protocol: np->mc_loop = 1; np->mc_all = 1; np->pmtudisc = IPV6_PMTUDISC_WANT; - np->repflow = net->ipv6.sysctl.flowlabel_reflect; + np->repflow = net->ipv6.sysctl.flowlabel_reflect & FLOWLABEL_REFLECT_ESTABLISHED; sk->sk_ipv6only = net->ipv6.sysctl.bindv6only; /* Init the ipv4 part of the socket since we can have sockets @@ -564,6 +564,39 @@ int inet6_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) } EXPORT_SYMBOL(inet6_ioctl); +INDIRECT_CALLABLE_DECLARE(int udpv6_sendmsg(struct sock *, struct msghdr *, + size_t)); +int inet6_sendmsg(struct socket *sock, struct msghdr *msg, size_t size) +{ + struct sock *sk = sock->sk; + + if (unlikely(inet_send_prepare(sk))) + return -EAGAIN; + + return INDIRECT_CALL_2(sk->sk_prot->sendmsg, tcp_sendmsg, udpv6_sendmsg, + sk, msg, size); +} + +INDIRECT_CALLABLE_DECLARE(int udpv6_recvmsg(struct sock *, struct msghdr *, + size_t, int, int, int *)); +int inet6_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, + int flags) +{ + struct sock *sk = sock->sk; + int addr_len = 0; + int err; + + if (likely(!(flags & MSG_ERRQUEUE))) + sock_rps_record_flow(sk); + + err = INDIRECT_CALL_2(sk->sk_prot->recvmsg, tcp_recvmsg, udpv6_recvmsg, + sk, msg, size, flags & MSG_DONTWAIT, + flags & ~MSG_DONTWAIT, &addr_len); + if (err >= 0) + msg->msg_namelen = addr_len; + return err; +} + const struct proto_ops inet6_stream_ops = { .family = PF_INET6, .owner = THIS_MODULE, @@ -580,8 +613,8 @@ const struct proto_ops inet6_stream_ops = { .shutdown = inet_shutdown, /* ok */ .setsockopt = sock_common_setsockopt, /* ok */ .getsockopt = sock_common_getsockopt, /* ok */ - .sendmsg = inet_sendmsg, /* ok */ - .recvmsg = inet_recvmsg, /* ok */ + .sendmsg = inet6_sendmsg, /* retpoline's sake */ + .recvmsg = inet6_recvmsg, /* retpoline's sake */ #ifdef CONFIG_MMU .mmap = tcp_mmap, #endif @@ -614,8 +647,8 @@ const struct proto_ops inet6_dgram_ops = { .shutdown = inet_shutdown, /* ok */ .setsockopt = sock_common_setsockopt, /* ok */ .getsockopt = sock_common_getsockopt, /* ok */ - .sendmsg = inet_sendmsg, /* ok */ - .recvmsg = inet_recvmsg, /* ok */ + .sendmsg = inet6_sendmsg, /* retpoline's sake */ + .recvmsg = inet6_recvmsg, /* retpoline's sake */ .mmap = sock_no_mmap, .sendpage = sock_no_sendpage, .set_peek_off = sk_set_peek_off, @@ -922,6 +955,9 @@ static const struct ipv6_stub ipv6_stub_impl = { .ip6_mtu_from_fib6 = ip6_mtu_from_fib6, .fib6_nh_init = fib6_nh_init, .fib6_nh_release = fib6_nh_release, + .fib6_update_sernum = fib6_update_sernum_stub, + .fib6_rt_update = fib6_rt_update, + .ip6_del_rt = ip6_del_rt, .udpv6_encap_enable = udpv6_encap_enable, .ndisc_send_na = ndisc_send_na, .nd_tbl = &nd_tbl, diff --git a/net/ipv6/ah6.c b/net/ipv6/ah6.c index 68b9e92e469e..25e1172fd1c3 100644 --- a/net/ipv6/ah6.c +++ b/net/ipv6/ah6.c @@ -793,9 +793,7 @@ static void __exit ah6_fini(void) if (xfrm6_protocol_deregister(&ah6_protocol, IPPROTO_AH) < 0) pr_info("%s: can't remove protocol\n", __func__); - if (xfrm_unregister_type(&ah6_type, AF_INET6) < 0) - pr_info("%s: can't remove xfrm type\n", __func__); - + xfrm_unregister_type(&ah6_type, AF_INET6); } module_init(ah6_init); diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c index ae6a739c5f52..a3b403ba8f8f 100644 --- a/net/ipv6/esp6.c +++ b/net/ipv6/esp6.c @@ -41,8 +41,6 @@ struct esp_skb_cb { #define ESP_SKB_CB(__skb) ((struct esp_skb_cb *)&((__skb)->cb[0])) -static u32 esp6_get_mtu(struct xfrm_state *x, int mtu); - /* * Allocate an AEAD request structure with extra space for SG and IV. * @@ -447,7 +445,7 @@ static int esp6_output(struct xfrm_state *x, struct sk_buff *skb) struct xfrm_dst *dst = (struct xfrm_dst *)skb_dst(skb); u32 padto; - padto = min(x->tfcpad, esp6_get_mtu(x, dst->child_mtu_cached)); + padto = min(x->tfcpad, xfrm_state_mtu(x, dst->child_mtu_cached)); if (skb->len < padto) esp.tfclen = padto - skb->len; } @@ -687,21 +685,6 @@ out: return ret; } -static u32 esp6_get_mtu(struct xfrm_state *x, int mtu) -{ - struct crypto_aead *aead = x->data; - u32 blksize = ALIGN(crypto_aead_blocksize(aead), 4); - unsigned int net_adj; - - if (x->props.mode != XFRM_MODE_TUNNEL) - net_adj = sizeof(struct ipv6hdr); - else - net_adj = 0; - - return ((mtu - x->props.header_len - crypto_aead_authsize(aead) - - net_adj) & ~(blksize - 1)) + net_adj - 2; -} - static int esp6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, u8 type, u8 code, int offset, __be32 info) { @@ -919,7 +902,6 @@ static const struct xfrm_type esp6_type = { .flags = XFRM_TYPE_REPLAY_PROT, .init_state = esp6_init_state, .destructor = esp6_destroy, - .get_mtu = esp6_get_mtu, .input = esp6_input, .output = esp6_output, .hdr_offset = xfrm6_find_1stfragopt, @@ -951,8 +933,7 @@ static void __exit esp6_fini(void) { if (xfrm6_protocol_deregister(&esp6_protocol, IPPROTO_ESP) < 0) pr_info("%s: can't remove protocol\n", __func__); - if (xfrm_unregister_type(&esp6_type, AF_INET6) < 0) - pr_info("%s: can't remove xfrm type\n", __func__); + xfrm_unregister_type(&esp6_type, AF_INET6); } module_init(esp6_init); diff --git a/net/ipv6/esp6_offload.c b/net/ipv6/esp6_offload.c index d0d8528b294a..e31626ffccd1 100644 --- a/net/ipv6/esp6_offload.c +++ b/net/ipv6/esp6_offload.c @@ -336,9 +336,7 @@ static int __init esp6_offload_init(void) static void __exit esp6_offload_exit(void) { - if (xfrm_unregister_type_offload(&esp6_type_offload, AF_INET6) < 0) - pr_info("%s: can't remove xfrm type offload\n", __func__); - + xfrm_unregister_type_offload(&esp6_type_offload, AF_INET6); inet6_del_offload(&esp6_offload, IPPROTO_ESP); } diff --git a/net/ipv6/fib6_rules.c b/net/ipv6/fib6_rules.c index bcfae13409b5..d22b6c140f23 100644 --- a/net/ipv6/fib6_rules.c +++ b/net/ipv6/fib6_rules.c @@ -113,14 +113,15 @@ struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6, rt = lookup(net, net->ipv6.fib6_local_tbl, fl6, skb, flags); if (rt != net->ipv6.ip6_null_entry && rt->dst.error != -EAGAIN) return &rt->dst; - ip6_rt_put(rt); + ip6_rt_put_flags(rt, flags); rt = lookup(net, net->ipv6.fib6_main_tbl, fl6, skb, flags); if (rt->dst.error != -EAGAIN) return &rt->dst; - ip6_rt_put(rt); + ip6_rt_put_flags(rt, flags); } - dst_hold(&net->ipv6.ip6_null_entry->dst); + if (!(flags & RT6_LOOKUP_F_DST_NOREF)) + dst_hold(&net->ipv6.ip6_null_entry->dst); return &net->ipv6.ip6_null_entry->dst; } @@ -237,13 +238,14 @@ static int __fib6_rule_action(struct fib_rule *rule, struct flowi *flp, goto out; } again: - ip6_rt_put(rt); + ip6_rt_put_flags(rt, flags); err = -EAGAIN; rt = NULL; goto out; discard_pkt: - dst_hold(&rt->dst); + if (!(flags & RT6_LOOKUP_F_DST_NOREF)) + dst_hold(&rt->dst); out: res->rt6 = rt; return err; diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c index 375b4b4f9bf5..62c997201970 100644 --- a/net/ipv6/icmp.c +++ b/net/ipv6/icmp.c @@ -75,9 +75,9 @@ * * On SMP we have one ICMP socket per-cpu. */ -static inline struct sock *icmpv6_sk(struct net *net) +static struct sock *icmpv6_sk(struct net *net) { - return *this_cpu_ptr(net->ipv6.icmp_sk); + return this_cpu_read(*net->ipv6.icmp_sk); } static int icmpv6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, @@ -703,6 +703,9 @@ static void icmpv6_echo_reply(struct sk_buff *skb) tmp_hdr.icmp6_type = ICMPV6_ECHO_REPLY; memset(&fl6, 0, sizeof(fl6)); + if (net->ipv6.sysctl.flowlabel_reflect & FLOWLABEL_REFLECT_ICMPV6_ECHO_REPLIES) + fl6.flowlabel = ip6_flowlabel(ipv6_hdr(skb)); + fl6.flowi6_proto = IPPROTO_ICMPV6; fl6.daddr = ipv6_hdr(skb)->saddr; if (saddr) diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c index b2a55f300318..cf60fae9533b 100644 --- a/net/ipv6/inet6_hashtables.c +++ b/net/ipv6/inet6_hashtables.c @@ -174,7 +174,7 @@ struct sock *inet6_lookup_listener(struct net *net, saddr, sport, &in6addr_any, hnum, dif, sdif); done: - if (unlikely(IS_ERR(result))) + if (IS_ERR(result)) return NULL; return result; } diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 9180c8b6f764..49884f96232b 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -143,20 +143,19 @@ static __be32 addr_bit_set(const void *token, int fn_bit) addr[fn_bit >> 5]; } -struct fib6_info *fib6_info_alloc(gfp_t gfp_flags) +struct fib6_info *fib6_info_alloc(gfp_t gfp_flags, bool with_fib6_nh) { struct fib6_info *f6i; + size_t sz = sizeof(*f6i); - f6i = kzalloc(sizeof(*f6i), gfp_flags); + if (with_fib6_nh) + sz += sizeof(struct fib6_nh); + + f6i = kzalloc(sz, gfp_flags); if (!f6i) return NULL; - f6i->rt6i_pcpu = alloc_percpu_gfp(struct rt6_info *, gfp_flags); - if (!f6i->rt6i_pcpu) { - kfree(f6i); - return NULL; - } - + /* fib6_siblings is a union with nh_list, so this initializes both */ INIT_LIST_HEAD(&f6i->fib6_siblings); refcount_set(&f6i->fib6_ref, 1); @@ -166,36 +165,15 @@ struct fib6_info *fib6_info_alloc(gfp_t gfp_flags) void fib6_info_destroy_rcu(struct rcu_head *head) { struct fib6_info *f6i = container_of(head, struct fib6_info, rcu); - struct rt6_exception_bucket *bucket; WARN_ON(f6i->fib6_node); - bucket = rcu_dereference_protected(f6i->rt6i_exception_bucket, 1); - kfree(bucket); - - if (f6i->rt6i_pcpu) { - int cpu; - - for_each_possible_cpu(cpu) { - struct rt6_info **ppcpu_rt; - struct rt6_info *pcpu_rt; - - ppcpu_rt = per_cpu_ptr(f6i->rt6i_pcpu, cpu); - pcpu_rt = *ppcpu_rt; - if (pcpu_rt) { - dst_dev_put(&pcpu_rt->dst); - dst_release(&pcpu_rt->dst); - *ppcpu_rt = NULL; - } - } - - free_percpu(f6i->rt6i_pcpu); - } - - fib6_nh_release(&f6i->fib6_nh); + if (f6i->nh) + nexthop_put(f6i->nh); + else + fib6_nh_release(f6i->fib6_nh); ip_fib_metrics_put(f6i->fib6_metrics); - kfree(f6i); } EXPORT_SYMBOL_GPL(fib6_info_destroy_rcu); @@ -338,9 +316,10 @@ struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6, rt = lookup(net, net->ipv6.fib6_main_tbl, fl6, skb, flags); if (rt->dst.error == -EAGAIN) { - ip6_rt_put(rt); + ip6_rt_put_flags(rt, flags); rt = net->ipv6.ip6_null_entry; - dst_hold(&rt->dst); + if (!(flags | RT6_LOOKUP_F_DST_NOREF)) + dst_hold(&rt->dst); } return &rt->dst; @@ -389,14 +368,30 @@ static int call_fib6_entry_notifier(struct notifier_block *nb, struct net *net, return call_fib6_notifier(nb, net, event_type, &info.info); } -static int call_fib6_entry_notifiers(struct net *net, - enum fib_event_type event_type, - struct fib6_info *rt, - struct netlink_ext_ack *extack) +int call_fib6_entry_notifiers(struct net *net, + enum fib_event_type event_type, + struct fib6_info *rt, + struct netlink_ext_ack *extack) +{ + struct fib6_entry_notifier_info info = { + .info.extack = extack, + .rt = rt, + }; + + rt->fib6_table->fib_seq++; + return call_fib6_notifiers(net, event_type, &info.info); +} + +int call_fib6_multipath_entry_notifiers(struct net *net, + enum fib_event_type event_type, + struct fib6_info *rt, + unsigned int nsiblings, + struct netlink_ext_ack *extack) { struct fib6_entry_notifier_info info = { .info.extack = extack, .rt = rt, + .nsiblings = nsiblings, }; rt->fib6_table->fib_seq++; @@ -469,12 +464,19 @@ static int fib6_dump_node(struct fib6_walker *w) struct fib6_info *rt; for_each_fib6_walker_rt(w) { - res = rt6_dump_route(rt, w->args); - if (res < 0) { + res = rt6_dump_route(rt, w->args, w->skip_in_node); + if (res >= 0) { /* Frame is full, suspend walking */ w->leaf = rt; + + /* We'll restart from this node, so if some routes were + * already dumped, skip them next time. + */ + w->skip_in_node += res; + return 1; } + w->skip_in_node = 0; /* Multipath routes are dumped in one route with the * RTA_MULTIPATH attribute. Jump 'rt' to point to the @@ -526,6 +528,7 @@ static int fib6_dump_table(struct fib6_table *table, struct sk_buff *skb, if (cb->args[4] == 0) { w->count = 0; w->skip = 0; + w->skip_in_node = 0; spin_lock_bh(&table->tb6_lock); res = fib6_walk(net, w); @@ -541,6 +544,7 @@ static int fib6_dump_table(struct fib6_table *table, struct sk_buff *skb, w->state = FWS_INIT; w->node = w->root; w->skip = w->count; + w->skip_in_node = 0; } else w->skip = 0; @@ -558,9 +562,10 @@ static int fib6_dump_table(struct fib6_table *table, struct sk_buff *skb, static int inet6_dump_fib(struct sk_buff *skb, struct netlink_callback *cb) { + struct rt6_rtnl_dump_arg arg = { .filter.dump_exceptions = true, + .filter.dump_routes = true }; const struct nlmsghdr *nlh = cb->nlh; struct net *net = sock_net(skb->sk); - struct rt6_rtnl_dump_arg arg = {}; unsigned int h, s_h; unsigned int e = 0, s_e; struct fib6_walker *w; @@ -577,13 +582,10 @@ static int inet6_dump_fib(struct sk_buff *skb, struct netlink_callback *cb) } else if (nlmsg_len(nlh) >= sizeof(struct rtmsg)) { struct rtmsg *rtm = nlmsg_data(nlh); - arg.filter.flags = rtm->rtm_flags & (RTM_F_PREFIX|RTM_F_CLONED); + if (rtm->rtm_flags & RTM_F_PREFIX) + arg.filter.flags = RTM_F_PREFIX; } - /* fib entries are never clones */ - if (arg.filter.flags & RTM_F_CLONED) - goto out; - w = (void *)cb->args[2]; if (!w) { /* New dump: @@ -895,16 +897,14 @@ insert_above: return ln; } -static void fib6_drop_pcpu_from(struct fib6_info *f6i, - const struct fib6_table *table) +static void __fib6_drop_pcpu_from(struct fib6_nh *fib6_nh, + const struct fib6_info *match, + const struct fib6_table *table) { int cpu; - /* Make sure rt6_make_pcpu_route() wont add other percpu routes - * while we are cleaning them here. - */ - f6i->fib6_destroying = 1; - mb(); /* paired with the cmpxchg() in rt6_make_pcpu_route() */ + if (!fib6_nh->rt6i_pcpu) + return; /* release the reference to this fib entry from * all of its cached pcpu routes @@ -913,9 +913,15 @@ static void fib6_drop_pcpu_from(struct fib6_info *f6i, struct rt6_info **ppcpu_rt; struct rt6_info *pcpu_rt; - ppcpu_rt = per_cpu_ptr(f6i->rt6i_pcpu, cpu); + ppcpu_rt = per_cpu_ptr(fib6_nh->rt6i_pcpu, cpu); pcpu_rt = *ppcpu_rt; - if (pcpu_rt) { + + /* only dropping the 'from' reference if the cached route + * is using 'match'. The cached pcpu_rt->from only changes + * from a fib6_info to NULL (ip6_dst_destroy); it can never + * change from one fib6_info reference to another + */ + if (pcpu_rt && rcu_access_pointer(pcpu_rt->from) == match) { struct fib6_info *from; from = xchg((__force struct fib6_info **)&pcpu_rt->from, NULL); @@ -924,13 +930,53 @@ static void fib6_drop_pcpu_from(struct fib6_info *f6i, } } +struct fib6_nh_pcpu_arg { + struct fib6_info *from; + const struct fib6_table *table; +}; + +static int fib6_nh_drop_pcpu_from(struct fib6_nh *nh, void *_arg) +{ + struct fib6_nh_pcpu_arg *arg = _arg; + + __fib6_drop_pcpu_from(nh, arg->from, arg->table); + return 0; +} + +static void fib6_drop_pcpu_from(struct fib6_info *f6i, + const struct fib6_table *table) +{ + /* Make sure rt6_make_pcpu_route() wont add other percpu routes + * while we are cleaning them here. + */ + f6i->fib6_destroying = 1; + mb(); /* paired with the cmpxchg() in rt6_make_pcpu_route() */ + + if (f6i->nh) { + struct fib6_nh_pcpu_arg arg = { + .from = f6i, + .table = table + }; + + nexthop_for_each_fib6_nh(f6i->nh, fib6_nh_drop_pcpu_from, + &arg); + } else { + struct fib6_nh *fib6_nh; + + fib6_nh = f6i->fib6_nh; + __fib6_drop_pcpu_from(fib6_nh, f6i, table); + } +} + static void fib6_purge_rt(struct fib6_info *rt, struct fib6_node *fn, struct net *net) { struct fib6_table *table = rt->fib6_table; - if (rt->rt6i_pcpu) - fib6_drop_pcpu_from(rt, table); + fib6_drop_pcpu_from(rt, table); + + if (rt->nh && !list_empty(&rt->nh_list)) + list_del_init(&rt->nh_list); if (refcount_read(&rt->fib6_ref) != 1) { /* This route is used as dummy address holder in some split @@ -1101,11 +1147,13 @@ next_iter: add: nlflags |= NLM_F_CREATE; - err = call_fib6_entry_notifiers(info->nl_net, - FIB_EVENT_ENTRY_ADD, - rt, extack); - if (err) - return err; + if (!info->skip_notify_kernel) { + err = call_fib6_entry_notifiers(info->nl_net, + FIB_EVENT_ENTRY_ADD, + rt, extack); + if (err) + return err; + } rcu_assign_pointer(rt->fib6_next, iter); fib6_info_hold(rt); @@ -1130,11 +1178,13 @@ add: return -ENOENT; } - err = call_fib6_entry_notifiers(info->nl_net, - FIB_EVENT_ENTRY_REPLACE, - rt, extack); - if (err) - return err; + if (!info->skip_notify_kernel) { + err = call_fib6_entry_notifiers(info->nl_net, + FIB_EVENT_ENTRY_REPLACE, + rt, extack); + if (err) + return err; + } fib6_info_hold(rt); rcu_assign_pointer(rt->fib6_node, fn); @@ -1218,6 +1268,14 @@ void fib6_update_sernum_upto_root(struct net *net, struct fib6_info *rt) __fib6_update_sernum_upto_root(rt, fib6_new_sernum(net)); } +/* allow ipv4 to update sernum via ipv6_stub */ +void fib6_update_sernum_stub(struct net *net, struct fib6_info *f6i) +{ + spin_lock_bh(&f6i->fib6_table->tb6_lock); + fib6_update_sernum_upto_root(net, f6i); + spin_unlock_bh(&f6i->fib6_table->tb6_lock); +} + /* * Add routing information to the routing tree. * <destination addr>/<source addr> @@ -1331,6 +1389,8 @@ int fib6_add(struct fib6_node *root, struct fib6_info *rt, err = fib6_add_rt2node(fn, rt, info, extack); if (!err) { + if (rt->nh) + list_add(&rt->nh_list, &rt->nh->f6i_list); __fib6_update_sernum_upto_root(rt, sernum); fib6_start_gc(info->nl_net, rt); } @@ -1536,7 +1596,8 @@ static struct fib6_node *fib6_locate_1(struct fib6_node *root, if (plen == fn->fn_bit) return fn; - prev = fn; + if (fn->fn_flags & RTN_RTINFO) + prev = fn; next: /* @@ -1807,9 +1868,11 @@ static void fib6_del_route(struct fib6_table *table, struct fib6_node *fn, fib6_purge_rt(rt, fn, net); - call_fib6_entry_notifiers(net, FIB_EVENT_ENTRY_DEL, rt, NULL); + if (!info->skip_notify_kernel) + call_fib6_entry_notifiers(net, FIB_EVENT_ENTRY_DEL, rt, NULL); if (!info->skip_notify) inet6_rt_notify(RTM_DELROUTE, rt, info, 0); + fib6_info_release(rt); } @@ -2041,6 +2104,7 @@ static void fib6_clean_tree(struct net *net, struct fib6_node *root, c.w.func = fib6_clean_node; c.w.count = 0; c.w.skip = 0; + c.w.skip_in_node = 0; c.func = func; c.sernum = sernum; c.arg = arg; @@ -2292,9 +2356,13 @@ static int ipv6_route_seq_show(struct seq_file *seq, void *v) { struct fib6_info *rt = v; struct ipv6_route_iter *iter = seq->private; + struct fib6_nh *fib6_nh = rt->fib6_nh; unsigned int flags = rt->fib6_flags; const struct net_device *dev; + if (rt->nh) + fib6_nh = nexthop_fib6_nh(rt->nh); + seq_printf(seq, "%pi6 %02x ", &rt->fib6_dst.addr, rt->fib6_dst.plen); #ifdef CONFIG_IPV6_SUBTREES @@ -2302,14 +2370,14 @@ static int ipv6_route_seq_show(struct seq_file *seq, void *v) #else seq_puts(seq, "00000000000000000000000000000000 00 "); #endif - if (rt->fib6_nh.fib_nh_gw_family) { + if (fib6_nh->fib_nh_gw_family) { flags |= RTF_GATEWAY; - seq_printf(seq, "%pi6", &rt->fib6_nh.fib_nh_gw6); + seq_printf(seq, "%pi6", &fib6_nh->fib_nh_gw6); } else { seq_puts(seq, "00000000000000000000000000000000"); } - dev = rt->fib6_nh.fib_nh_dev; + dev = fib6_nh->fib_nh_dev; seq_printf(seq, " %08x %08x %08x %08x %8s\n", rt->fib6_metric, refcount_read(&rt->fib6_ref), 0, flags, dev ? dev->name : ""); diff --git a/net/ipv6/ip6_flowlabel.c b/net/ipv6/ip6_flowlabel.c index 545e339b8c4f..ad284b1fd308 100644 --- a/net/ipv6/ip6_flowlabel.c +++ b/net/ipv6/ip6_flowlabel.c @@ -17,6 +17,7 @@ #include <linux/slab.h> #include <linux/export.h> #include <linux/pid_namespace.h> +#include <linux/jump_label_ratelimit.h> #include <net/net_namespace.h> #include <net/sock.h> @@ -53,6 +54,9 @@ static DEFINE_SPINLOCK(ip6_fl_lock); static DEFINE_SPINLOCK(ip6_sk_fl_lock); +DEFINE_STATIC_KEY_DEFERRED_FALSE(ipv6_flowlabel_exclusive, HZ); +EXPORT_SYMBOL(ipv6_flowlabel_exclusive); + #define for_each_fl_rcu(hash, fl) \ for (fl = rcu_dereference_bh(fl_ht[(hash)]); \ fl != NULL; \ @@ -90,6 +94,13 @@ static struct ip6_flowlabel *fl_lookup(struct net *net, __be32 label) return fl; } +static bool fl_shared_exclusive(struct ip6_flowlabel *fl) +{ + return fl->share == IPV6_FL_S_EXCL || + fl->share == IPV6_FL_S_PROCESS || + fl->share == IPV6_FL_S_USER; +} + static void fl_free_rcu(struct rcu_head *head) { struct ip6_flowlabel *fl = container_of(head, struct ip6_flowlabel, rcu); @@ -103,8 +114,13 @@ static void fl_free_rcu(struct rcu_head *head) static void fl_free(struct ip6_flowlabel *fl) { - if (fl) - call_rcu(&fl->rcu, fl_free_rcu); + if (!fl) + return; + + if (fl_shared_exclusive(fl) || fl->opt) + static_branch_slow_dec_deferred(&ipv6_flowlabel_exclusive); + + call_rcu(&fl->rcu, fl_free_rcu); } static void fl_release(struct ip6_flowlabel *fl) @@ -240,7 +256,7 @@ static struct ip6_flowlabel *fl_intern(struct net *net, /* Socket flowlabel lists */ -struct ip6_flowlabel *fl6_sock_lookup(struct sock *sk, __be32 label) +struct ip6_flowlabel *__fl6_sock_lookup(struct sock *sk, __be32 label) { struct ipv6_fl_socklist *sfl; struct ipv6_pinfo *np = inet6_sk(sk); @@ -260,7 +276,7 @@ struct ip6_flowlabel *fl6_sock_lookup(struct sock *sk, __be32 label) rcu_read_unlock_bh(); return NULL; } -EXPORT_SYMBOL_GPL(fl6_sock_lookup); +EXPORT_SYMBOL_GPL(__fl6_sock_lookup); void fl6_free_socklist(struct sock *sk) { @@ -419,6 +435,8 @@ fl_create(struct net *net, struct sock *sk, struct in6_flowlabel_req *freq, } fl->dst = freq->flr_dst; atomic_set(&fl->users, 1); + if (fl_shared_exclusive(fl) || fl->opt) + static_branch_deferred_inc(&ipv6_flowlabel_exclusive); switch (fl->share) { case IPV6_FL_S_EXCL: case IPV6_FL_S_ANY: @@ -854,6 +872,7 @@ int ip6_flowlabel_init(void) void ip6_flowlabel_cleanup(void) { + static_key_deferred_flush(&ipv6_flowlabel_exclusive); del_timer(&ip6_fl_gc_timer); unregister_pernet_subsys(&ip6_flowlabel_net_ops); } diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 21efcd02f337..8e49fd62eea9 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -124,16 +124,8 @@ static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff * return -EINVAL; } -static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb) +static int __ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb) { - int ret; - - ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb); - if (ret) { - kfree_skb(skb); - return ret; - } - #if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM) /* Policy lookup after SNAT yielded a new policy */ if (skb_dst(skb)->xfrm) { @@ -150,6 +142,22 @@ static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *s return ip6_finish_output2(net, sk, skb); } +static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb) +{ + int ret; + + ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb); + switch (ret) { + case NET_XMIT_SUCCESS: + return __ip6_finish_output(net, sk, skb); + case NET_XMIT_CN: + return __ip6_finish_output(net, sk, skb) ? : ret; + default: + kfree_skb(skb); + return ret; + } +} + int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb) { struct net_device *dev = skb_dst(skb)->dev; @@ -588,6 +596,169 @@ static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from) skb_copy_secmark(to, from); } +int ip6_fraglist_init(struct sk_buff *skb, unsigned int hlen, u8 *prevhdr, + u8 nexthdr, __be32 frag_id, + struct ip6_fraglist_iter *iter) +{ + unsigned int first_len; + struct frag_hdr *fh; + + /* BUILD HEADER */ + *prevhdr = NEXTHDR_FRAGMENT; + iter->tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC); + if (!iter->tmp_hdr) + return -ENOMEM; + + iter->frag = skb_shinfo(skb)->frag_list; + skb_frag_list_init(skb); + + iter->offset = 0; + iter->hlen = hlen; + iter->frag_id = frag_id; + iter->nexthdr = nexthdr; + + __skb_pull(skb, hlen); + fh = __skb_push(skb, sizeof(struct frag_hdr)); + __skb_push(skb, hlen); + skb_reset_network_header(skb); + memcpy(skb_network_header(skb), iter->tmp_hdr, hlen); + + fh->nexthdr = nexthdr; + fh->reserved = 0; + fh->frag_off = htons(IP6_MF); + fh->identification = frag_id; + + first_len = skb_pagelen(skb); + skb->data_len = first_len - skb_headlen(skb); + skb->len = first_len; + ipv6_hdr(skb)->payload_len = htons(first_len - sizeof(struct ipv6hdr)); + + return 0; +} +EXPORT_SYMBOL(ip6_fraglist_init); + +void ip6_fraglist_prepare(struct sk_buff *skb, + struct ip6_fraglist_iter *iter) +{ + struct sk_buff *frag = iter->frag; + unsigned int hlen = iter->hlen; + struct frag_hdr *fh; + + frag->ip_summed = CHECKSUM_NONE; + skb_reset_transport_header(frag); + fh = __skb_push(frag, sizeof(struct frag_hdr)); + __skb_push(frag, hlen); + skb_reset_network_header(frag); + memcpy(skb_network_header(frag), iter->tmp_hdr, hlen); + iter->offset += skb->len - hlen - sizeof(struct frag_hdr); + fh->nexthdr = iter->nexthdr; + fh->reserved = 0; + fh->frag_off = htons(iter->offset); + if (frag->next) + fh->frag_off |= htons(IP6_MF); + fh->identification = iter->frag_id; + ipv6_hdr(frag)->payload_len = htons(frag->len - sizeof(struct ipv6hdr)); + ip6_copy_metadata(frag, skb); +} +EXPORT_SYMBOL(ip6_fraglist_prepare); + +void ip6_frag_init(struct sk_buff *skb, unsigned int hlen, unsigned int mtu, + unsigned short needed_tailroom, int hdr_room, u8 *prevhdr, + u8 nexthdr, __be32 frag_id, struct ip6_frag_state *state) +{ + state->prevhdr = prevhdr; + state->nexthdr = nexthdr; + state->frag_id = frag_id; + + state->hlen = hlen; + state->mtu = mtu; + + state->left = skb->len - hlen; /* Space per frame */ + state->ptr = hlen; /* Where to start from */ + + state->hroom = hdr_room; + state->troom = needed_tailroom; + + state->offset = 0; +} +EXPORT_SYMBOL(ip6_frag_init); + +struct sk_buff *ip6_frag_next(struct sk_buff *skb, struct ip6_frag_state *state) +{ + u8 *prevhdr = state->prevhdr, *fragnexthdr_offset; + struct sk_buff *frag; + struct frag_hdr *fh; + unsigned int len; + + len = state->left; + /* IF: it doesn't fit, use 'mtu' - the data space left */ + if (len > state->mtu) + len = state->mtu; + /* IF: we are not sending up to and including the packet end + then align the next start on an eight byte boundary */ + if (len < state->left) + len &= ~7; + + /* Allocate buffer */ + frag = alloc_skb(len + state->hlen + sizeof(struct frag_hdr) + + state->hroom + state->troom, GFP_ATOMIC); + if (!frag) + return ERR_PTR(-ENOMEM); + + /* + * Set up data on packet + */ + + ip6_copy_metadata(frag, skb); + skb_reserve(frag, state->hroom); + skb_put(frag, len + state->hlen + sizeof(struct frag_hdr)); + skb_reset_network_header(frag); + fh = (struct frag_hdr *)(skb_network_header(frag) + state->hlen); + frag->transport_header = (frag->network_header + state->hlen + + sizeof(struct frag_hdr)); + + /* + * Charge the memory for the fragment to any owner + * it might possess + */ + if (skb->sk) + skb_set_owner_w(frag, skb->sk); + + /* + * Copy the packet header into the new buffer. + */ + skb_copy_from_linear_data(skb, skb_network_header(frag), state->hlen); + + fragnexthdr_offset = skb_network_header(frag); + fragnexthdr_offset += prevhdr - skb_network_header(skb); + *fragnexthdr_offset = NEXTHDR_FRAGMENT; + + /* + * Build fragment header. + */ + fh->nexthdr = state->nexthdr; + fh->reserved = 0; + fh->identification = state->frag_id; + + /* + * Copy a block of the IP datagram. + */ + BUG_ON(skb_copy_bits(skb, state->ptr, skb_transport_header(frag), + len)); + state->left -= len; + + fh->frag_off = htons(state->offset); + if (state->left > 0) + fh->frag_off |= htons(IP6_MF); + ipv6_hdr(frag)->payload_len = htons(frag->len - sizeof(struct ipv6hdr)); + + state->ptr += len; + state->offset += len; + + return frag; +} +EXPORT_SYMBOL(ip6_frag_next); + int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, int (*output)(struct net *, struct sock *, struct sk_buff *)) { @@ -595,12 +766,10 @@ int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, struct rt6_info *rt = (struct rt6_info *)skb_dst(skb); struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ? inet6_sk(skb->sk) : NULL; - struct ipv6hdr *tmp_hdr; - struct frag_hdr *fh; - unsigned int mtu, hlen, left, len, nexthdr_offset; - int hroom, troom; + struct ip6_frag_state state; + unsigned int mtu, hlen, nexthdr_offset; + int hroom, err = 0; __be32 frag_id; - int ptr, offset = 0, err = 0; u8 *prevhdr, nexthdr = 0; err = ip6_find_1stfragopt(skb, &prevhdr); @@ -647,6 +816,7 @@ int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, hroom = LL_RESERVED_SPACE(rt->dst.dev); if (skb_has_frag_list(skb)) { unsigned int first_len = skb_pagelen(skb); + struct ip6_fraglist_iter iter; struct sk_buff *frag2; if (first_len - hlen > mtu || @@ -674,74 +844,29 @@ int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, skb->truesize -= frag->truesize; } - err = 0; - offset = 0; - /* BUILD HEADER */ - - *prevhdr = NEXTHDR_FRAGMENT; - tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC); - if (!tmp_hdr) { - err = -ENOMEM; + err = ip6_fraglist_init(skb, hlen, prevhdr, nexthdr, frag_id, + &iter); + if (err < 0) goto fail; - } - frag = skb_shinfo(skb)->frag_list; - skb_frag_list_init(skb); - - __skb_pull(skb, hlen); - fh = __skb_push(skb, sizeof(struct frag_hdr)); - __skb_push(skb, hlen); - skb_reset_network_header(skb); - memcpy(skb_network_header(skb), tmp_hdr, hlen); - - fh->nexthdr = nexthdr; - fh->reserved = 0; - fh->frag_off = htons(IP6_MF); - fh->identification = frag_id; - - first_len = skb_pagelen(skb); - skb->data_len = first_len - skb_headlen(skb); - skb->len = first_len; - ipv6_hdr(skb)->payload_len = htons(first_len - - sizeof(struct ipv6hdr)); for (;;) { /* Prepare header of the next frame, * before previous one went down. */ - if (frag) { - frag->ip_summed = CHECKSUM_NONE; - skb_reset_transport_header(frag); - fh = __skb_push(frag, sizeof(struct frag_hdr)); - __skb_push(frag, hlen); - skb_reset_network_header(frag); - memcpy(skb_network_header(frag), tmp_hdr, - hlen); - offset += skb->len - hlen - sizeof(struct frag_hdr); - fh->nexthdr = nexthdr; - fh->reserved = 0; - fh->frag_off = htons(offset); - if (frag->next) - fh->frag_off |= htons(IP6_MF); - fh->identification = frag_id; - ipv6_hdr(frag)->payload_len = - htons(frag->len - - sizeof(struct ipv6hdr)); - ip6_copy_metadata(frag, skb); - } + if (iter.frag) + ip6_fraglist_prepare(skb, &iter); err = output(net, sk, skb); if (!err) IP6_INC_STATS(net, ip6_dst_idev(&rt->dst), IPSTATS_MIB_FRAGCREATES); - if (err || !frag) + if (err || !iter.frag) break; - skb = frag; - frag = skb->next; - skb_mark_not_on_list(skb); + skb = ip6_fraglist_next(&iter); } - kfree(tmp_hdr); + kfree(iter.tmp_hdr); if (err == 0) { IP6_INC_STATS(net, ip6_dst_idev(&rt->dst), @@ -749,7 +874,7 @@ int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, return 0; } - kfree_skb_list(frag); + kfree_skb_list(iter.frag); IP6_INC_STATS(net, ip6_dst_idev(&rt->dst), IPSTATS_MIB_FRAGFAILS); @@ -766,91 +891,26 @@ slow_path_clean: } slow_path: - left = skb->len - hlen; /* Space per frame */ - ptr = hlen; /* Where to start from */ - /* * Fragment the datagram. */ - troom = rt->dst.dev->needed_tailroom; + ip6_frag_init(skb, hlen, mtu, rt->dst.dev->needed_tailroom, + LL_RESERVED_SPACE(rt->dst.dev), prevhdr, nexthdr, frag_id, + &state); /* * Keep copying data until we run out. */ - while (left > 0) { - u8 *fragnexthdr_offset; - - len = left; - /* IF: it doesn't fit, use 'mtu' - the data space left */ - if (len > mtu) - len = mtu; - /* IF: we are not sending up to and including the packet end - then align the next start on an eight byte boundary */ - if (len < left) { - len &= ~7; - } - /* Allocate buffer */ - frag = alloc_skb(len + hlen + sizeof(struct frag_hdr) + - hroom + troom, GFP_ATOMIC); - if (!frag) { - err = -ENOMEM; + while (state.left > 0) { + frag = ip6_frag_next(skb, &state); + if (IS_ERR(frag)) { + err = PTR_ERR(frag); goto fail; } /* - * Set up data on packet - */ - - ip6_copy_metadata(frag, skb); - skb_reserve(frag, hroom); - skb_put(frag, len + hlen + sizeof(struct frag_hdr)); - skb_reset_network_header(frag); - fh = (struct frag_hdr *)(skb_network_header(frag) + hlen); - frag->transport_header = (frag->network_header + hlen + - sizeof(struct frag_hdr)); - - /* - * Charge the memory for the fragment to any owner - * it might possess - */ - if (skb->sk) - skb_set_owner_w(frag, skb->sk); - - /* - * Copy the packet header into the new buffer. - */ - skb_copy_from_linear_data(skb, skb_network_header(frag), hlen); - - fragnexthdr_offset = skb_network_header(frag); - fragnexthdr_offset += prevhdr - skb_network_header(skb); - *fragnexthdr_offset = NEXTHDR_FRAGMENT; - - /* - * Build fragment header. - */ - fh->nexthdr = nexthdr; - fh->reserved = 0; - fh->identification = frag_id; - - /* - * Copy a block of the IP datagram. - */ - BUG_ON(skb_copy_bits(skb, ptr, skb_transport_header(frag), - len)); - left -= len; - - fh->frag_off = htons(offset); - if (left > 0) - fh->frag_off |= htons(IP6_MF); - ipv6_hdr(frag)->payload_len = htons(frag->len - - sizeof(struct ipv6hdr)); - - ptr += len; - offset += len; - - /* * Put this fragment into the sending queue. */ err = output(net, sk, frag); diff --git a/net/ipv6/ipcomp6.c b/net/ipv6/ipcomp6.c index 51fd33294c7c..3752bd3e92ce 100644 --- a/net/ipv6/ipcomp6.c +++ b/net/ipv6/ipcomp6.c @@ -206,8 +206,7 @@ static void __exit ipcomp6_fini(void) { if (xfrm6_protocol_deregister(&ipcomp6_protocol, IPPROTO_COMP) < 0) pr_info("%s: can't remove protocol\n", __func__); - if (xfrm_unregister_type(&ipcomp6_type, AF_INET6) < 0) - pr_info("%s: can't remove xfrm type\n", __func__); + xfrm_unregister_type(&ipcomp6_type, AF_INET6); } module_init(ipcomp6_init); diff --git a/net/ipv6/mip6.c b/net/ipv6/mip6.c index 91801432878c..878fcec14949 100644 --- a/net/ipv6/mip6.c +++ b/net/ipv6/mip6.c @@ -499,10 +499,8 @@ static void __exit mip6_fini(void) { if (rawv6_mh_filter_unregister(mip6_mh_filter) < 0) pr_info("%s: can't remove rawv6 mh filter\n", __func__); - if (xfrm_unregister_type(&mip6_rthdr_type, AF_INET6) < 0) - pr_info("%s: can't remove xfrm type(rthdr)\n", __func__); - if (xfrm_unregister_type(&mip6_destopt_type, AF_INET6) < 0) - pr_info("%s: can't remove xfrm type(destopt)\n", __func__); + xfrm_unregister_type(&mip6_rthdr_type, AF_INET6); + xfrm_unregister_type(&mip6_destopt_type, AF_INET6); } module_init(mip6_init); diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index 09dd2edfb868..083cc1c94cd3 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -1285,12 +1285,11 @@ static void ndisc_router_discovery(struct sk_buff *skb) !in6_dev->cnf.accept_ra_rtr_pref) pref = ICMPV6_ROUTER_PREF_MEDIUM; #endif - + /* routes added from RAs do not use nexthop objects */ rt = rt6_get_dflt_router(net, &ipv6_hdr(skb)->saddr, skb->dev); - if (rt) { - neigh = ip6_neigh_lookup(&rt->fib6_nh.fib_nh_gw6, - rt->fib6_nh.fib_nh_dev, NULL, + neigh = ip6_neigh_lookup(&rt->fib6_nh->fib_nh_gw6, + rt->fib6_nh->fib_nh_dev, NULL, &ipv6_hdr(skb)->saddr); if (!neigh) { ND_PRINTK(0, err, @@ -1319,8 +1318,8 @@ static void ndisc_router_discovery(struct sk_buff *skb) return; } - neigh = ip6_neigh_lookup(&rt->fib6_nh.fib_nh_gw6, - rt->fib6_nh.fib_nh_dev, NULL, + neigh = ip6_neigh_lookup(&rt->fib6_nh->fib_nh_gw6, + rt->fib6_nh->fib_nh_dev, NULL, &ipv6_hdr(skb)->saddr); if (!neigh) { ND_PRINTK(0, err, diff --git a/net/ipv6/netfilter.c b/net/ipv6/netfilter.c index 1240ccd57f39..61819ed858b1 100644 --- a/net/ipv6/netfilter.c +++ b/net/ipv6/netfilter.c @@ -16,6 +16,9 @@ #include <net/ip6_route.h> #include <net/xfrm.h> #include <net/netfilter/nf_queue.h> +#include <net/netfilter/nf_conntrack_bridge.h> +#include <net/netfilter/ipv6/nf_defrag_ipv6.h> +#include "../bridge/br_private.h" int ip6_route_me_harder(struct net *net, struct sk_buff *skb) { @@ -109,16 +112,142 @@ int __nf_ip6_route(struct net *net, struct dst_entry **dst, } EXPORT_SYMBOL_GPL(__nf_ip6_route); +int br_ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, + struct nf_ct_bridge_frag_data *data, + int (*output)(struct net *, struct sock *sk, + const struct nf_ct_bridge_frag_data *data, + struct sk_buff *)) +{ + int frag_max_size = BR_INPUT_SKB_CB(skb)->frag_max_size; + struct ip6_frag_state state; + u8 *prevhdr, nexthdr = 0; + unsigned int mtu, hlen; + int hroom, err = 0; + __be32 frag_id; + + err = ip6_find_1stfragopt(skb, &prevhdr); + if (err < 0) + goto blackhole; + hlen = err; + nexthdr = *prevhdr; + + mtu = skb->dev->mtu; + if (frag_max_size > mtu || + frag_max_size < IPV6_MIN_MTU) + goto blackhole; + + mtu = frag_max_size; + if (mtu < hlen + sizeof(struct frag_hdr) + 8) + goto blackhole; + mtu -= hlen + sizeof(struct frag_hdr); + + frag_id = ipv6_select_ident(net, &ipv6_hdr(skb)->daddr, + &ipv6_hdr(skb)->saddr); + + if (skb->ip_summed == CHECKSUM_PARTIAL && + (err = skb_checksum_help(skb))) + goto blackhole; + + hroom = LL_RESERVED_SPACE(skb->dev); + if (skb_has_frag_list(skb)) { + unsigned int first_len = skb_pagelen(skb); + struct ip6_fraglist_iter iter; + struct sk_buff *frag2; + + if (first_len - hlen > mtu || + skb_headroom(skb) < (hroom + sizeof(struct frag_hdr))) + goto blackhole; + + if (skb_cloned(skb)) + goto slow_path; + + skb_walk_frags(skb, frag2) { + if (frag2->len > mtu || + skb_headroom(frag2) < (hlen + hroom + sizeof(struct frag_hdr))) + goto blackhole; + + /* Partially cloned skb? */ + if (skb_shared(frag2)) + goto slow_path; + } + + err = ip6_fraglist_init(skb, hlen, prevhdr, nexthdr, frag_id, + &iter); + if (err < 0) + goto blackhole; + + for (;;) { + /* Prepare header of the next frame, + * before previous one went down. + */ + if (iter.frag) + ip6_fraglist_prepare(skb, &iter); + + err = output(net, sk, data, skb); + if (err || !iter.frag) + break; + + skb = ip6_fraglist_next(&iter); + } + + kfree(iter.tmp_hdr); + if (!err) + return 0; + + kfree_skb_list(iter.frag); + return err; + } +slow_path: + /* This is a linearized skbuff, the original geometry is lost for us. + * This may also be a clone skbuff, we could preserve the geometry for + * the copies but probably not worth the effort. + */ + ip6_frag_init(skb, hlen, mtu, skb->dev->needed_tailroom, + LL_RESERVED_SPACE(skb->dev), prevhdr, nexthdr, frag_id, + &state); + + while (state.left > 0) { + struct sk_buff *skb2; + + skb2 = ip6_frag_next(skb, &state); + if (IS_ERR(skb2)) { + err = PTR_ERR(skb2); + goto blackhole; + } + + err = output(net, sk, data, skb2); + if (err) + goto blackhole; + } + consume_skb(skb); + return err; + +blackhole: + kfree_skb(skb); + return 0; +} +EXPORT_SYMBOL_GPL(br_ip6_fragment); + static const struct nf_ipv6_ops ipv6ops = { #if IS_MODULE(CONFIG_IPV6) .chk_addr = ipv6_chk_addr, .route_me_harder = ip6_route_me_harder, .dev_get_saddr = ipv6_dev_get_saddr, .route = __nf_ip6_route, +#if IS_ENABLED(CONFIG_SYN_COOKIES) + .cookie_init_sequence = __cookie_v6_init_sequence, + .cookie_v6_check = __cookie_v6_check, +#endif #endif .route_input = ip6_route_input, .fragment = ip6_fragment, .reroute = nf_ip6_reroute, +#if IS_MODULE(CONFIG_IPV6) && IS_ENABLED(CONFIG_NF_DEFRAG_IPV6) + .br_defrag = nf_ct_frag6_gather, +#endif +#if IS_MODULE(CONFIG_IPV6) + .br_fragment = br_ip6_fragment, +#endif }; int __init ipv6_netfilter_init(void) diff --git a/net/ipv6/netfilter/ip6t_SYNPROXY.c b/net/ipv6/netfilter/ip6t_SYNPROXY.c index 41325d517478..e77ea1ed5edd 100644 --- a/net/ipv6/netfilter/ip6t_SYNPROXY.c +++ b/net/ipv6/netfilter/ip6t_SYNPROXY.c @@ -3,272 +3,11 @@ * Copyright (c) 2013 Patrick McHardy <kaber@trash.net> */ -#include <linux/module.h> -#include <linux/skbuff.h> -#include <net/ip6_checksum.h> -#include <net/ip6_route.h> -#include <net/tcp.h> - #include <linux/netfilter_ipv6/ip6_tables.h> #include <linux/netfilter/x_tables.h> #include <linux/netfilter/xt_SYNPROXY.h> -#include <net/netfilter/nf_conntrack.h> -#include <net/netfilter/nf_conntrack_seqadj.h> -#include <net/netfilter/nf_conntrack_synproxy.h> -#include <net/netfilter/nf_conntrack_ecache.h> - -static struct ipv6hdr * -synproxy_build_ip(struct net *net, struct sk_buff *skb, - const struct in6_addr *saddr, - const struct in6_addr *daddr) -{ - struct ipv6hdr *iph; - - skb_reset_network_header(skb); - iph = skb_put(skb, sizeof(*iph)); - ip6_flow_hdr(iph, 0, 0); - iph->hop_limit = net->ipv6.devconf_all->hop_limit; - iph->nexthdr = IPPROTO_TCP; - iph->saddr = *saddr; - iph->daddr = *daddr; - - return iph; -} - -static void -synproxy_send_tcp(struct net *net, - const struct sk_buff *skb, struct sk_buff *nskb, - struct nf_conntrack *nfct, enum ip_conntrack_info ctinfo, - struct ipv6hdr *niph, struct tcphdr *nth, - unsigned int tcp_hdr_size) -{ - struct dst_entry *dst; - struct flowi6 fl6; - - nth->check = ~tcp_v6_check(tcp_hdr_size, &niph->saddr, &niph->daddr, 0); - nskb->ip_summed = CHECKSUM_PARTIAL; - nskb->csum_start = (unsigned char *)nth - nskb->head; - nskb->csum_offset = offsetof(struct tcphdr, check); - - memset(&fl6, 0, sizeof(fl6)); - fl6.flowi6_proto = IPPROTO_TCP; - fl6.saddr = niph->saddr; - fl6.daddr = niph->daddr; - fl6.fl6_sport = nth->source; - fl6.fl6_dport = nth->dest; - security_skb_classify_flow((struct sk_buff *)skb, flowi6_to_flowi(&fl6)); - dst = ip6_route_output(net, NULL, &fl6); - if (dst->error) { - dst_release(dst); - goto free_nskb; - } - dst = xfrm_lookup(net, dst, flowi6_to_flowi(&fl6), NULL, 0); - if (IS_ERR(dst)) - goto free_nskb; - - skb_dst_set(nskb, dst); - - if (nfct) { - nf_ct_set(nskb, (struct nf_conn *)nfct, ctinfo); - nf_conntrack_get(nfct); - } - - ip6_local_out(net, nskb->sk, nskb); - return; - -free_nskb: - kfree_skb(nskb); -} - -static void -synproxy_send_client_synack(struct net *net, - const struct sk_buff *skb, const struct tcphdr *th, - const struct synproxy_options *opts) -{ - struct sk_buff *nskb; - struct ipv6hdr *iph, *niph; - struct tcphdr *nth; - unsigned int tcp_hdr_size; - u16 mss = opts->mss; - - iph = ipv6_hdr(skb); - - tcp_hdr_size = sizeof(*nth) + synproxy_options_size(opts); - nskb = alloc_skb(sizeof(*niph) + tcp_hdr_size + MAX_TCP_HEADER, - GFP_ATOMIC); - if (nskb == NULL) - return; - skb_reserve(nskb, MAX_TCP_HEADER); - - niph = synproxy_build_ip(net, nskb, &iph->daddr, &iph->saddr); - - skb_reset_transport_header(nskb); - nth = skb_put(nskb, tcp_hdr_size); - nth->source = th->dest; - nth->dest = th->source; - nth->seq = htonl(__cookie_v6_init_sequence(iph, th, &mss)); - nth->ack_seq = htonl(ntohl(th->seq) + 1); - tcp_flag_word(nth) = TCP_FLAG_SYN | TCP_FLAG_ACK; - if (opts->options & XT_SYNPROXY_OPT_ECN) - tcp_flag_word(nth) |= TCP_FLAG_ECE; - nth->doff = tcp_hdr_size / 4; - nth->window = 0; - nth->check = 0; - nth->urg_ptr = 0; - - synproxy_build_options(nth, opts); - - synproxy_send_tcp(net, skb, nskb, skb_nfct(skb), - IP_CT_ESTABLISHED_REPLY, niph, nth, tcp_hdr_size); -} -static void -synproxy_send_server_syn(struct net *net, - const struct sk_buff *skb, const struct tcphdr *th, - const struct synproxy_options *opts, u32 recv_seq) -{ - struct synproxy_net *snet = synproxy_pernet(net); - struct sk_buff *nskb; - struct ipv6hdr *iph, *niph; - struct tcphdr *nth; - unsigned int tcp_hdr_size; - - iph = ipv6_hdr(skb); - - tcp_hdr_size = sizeof(*nth) + synproxy_options_size(opts); - nskb = alloc_skb(sizeof(*niph) + tcp_hdr_size + MAX_TCP_HEADER, - GFP_ATOMIC); - if (nskb == NULL) - return; - skb_reserve(nskb, MAX_TCP_HEADER); - - niph = synproxy_build_ip(net, nskb, &iph->saddr, &iph->daddr); - - skb_reset_transport_header(nskb); - nth = skb_put(nskb, tcp_hdr_size); - nth->source = th->source; - nth->dest = th->dest; - nth->seq = htonl(recv_seq - 1); - /* ack_seq is used to relay our ISN to the synproxy hook to initialize - * sequence number translation once a connection tracking entry exists. - */ - nth->ack_seq = htonl(ntohl(th->ack_seq) - 1); - tcp_flag_word(nth) = TCP_FLAG_SYN; - if (opts->options & XT_SYNPROXY_OPT_ECN) - tcp_flag_word(nth) |= TCP_FLAG_ECE | TCP_FLAG_CWR; - nth->doff = tcp_hdr_size / 4; - nth->window = th->window; - nth->check = 0; - nth->urg_ptr = 0; - - synproxy_build_options(nth, opts); - - synproxy_send_tcp(net, skb, nskb, &snet->tmpl->ct_general, IP_CT_NEW, - niph, nth, tcp_hdr_size); -} - -static void -synproxy_send_server_ack(struct net *net, - const struct ip_ct_tcp *state, - const struct sk_buff *skb, const struct tcphdr *th, - const struct synproxy_options *opts) -{ - struct sk_buff *nskb; - struct ipv6hdr *iph, *niph; - struct tcphdr *nth; - unsigned int tcp_hdr_size; - - iph = ipv6_hdr(skb); - - tcp_hdr_size = sizeof(*nth) + synproxy_options_size(opts); - nskb = alloc_skb(sizeof(*niph) + tcp_hdr_size + MAX_TCP_HEADER, - GFP_ATOMIC); - if (nskb == NULL) - return; - skb_reserve(nskb, MAX_TCP_HEADER); - - niph = synproxy_build_ip(net, nskb, &iph->daddr, &iph->saddr); - - skb_reset_transport_header(nskb); - nth = skb_put(nskb, tcp_hdr_size); - nth->source = th->dest; - nth->dest = th->source; - nth->seq = htonl(ntohl(th->ack_seq)); - nth->ack_seq = htonl(ntohl(th->seq) + 1); - tcp_flag_word(nth) = TCP_FLAG_ACK; - nth->doff = tcp_hdr_size / 4; - nth->window = htons(state->seen[IP_CT_DIR_ORIGINAL].td_maxwin); - nth->check = 0; - nth->urg_ptr = 0; - - synproxy_build_options(nth, opts); - - synproxy_send_tcp(net, skb, nskb, NULL, 0, niph, nth, tcp_hdr_size); -} - -static void -synproxy_send_client_ack(struct net *net, - const struct sk_buff *skb, const struct tcphdr *th, - const struct synproxy_options *opts) -{ - struct sk_buff *nskb; - struct ipv6hdr *iph, *niph; - struct tcphdr *nth; - unsigned int tcp_hdr_size; - - iph = ipv6_hdr(skb); - - tcp_hdr_size = sizeof(*nth) + synproxy_options_size(opts); - nskb = alloc_skb(sizeof(*niph) + tcp_hdr_size + MAX_TCP_HEADER, - GFP_ATOMIC); - if (nskb == NULL) - return; - skb_reserve(nskb, MAX_TCP_HEADER); - - niph = synproxy_build_ip(net, nskb, &iph->saddr, &iph->daddr); - - skb_reset_transport_header(nskb); - nth = skb_put(nskb, tcp_hdr_size); - nth->source = th->source; - nth->dest = th->dest; - nth->seq = htonl(ntohl(th->seq) + 1); - nth->ack_seq = th->ack_seq; - tcp_flag_word(nth) = TCP_FLAG_ACK; - nth->doff = tcp_hdr_size / 4; - nth->window = htons(ntohs(th->window) >> opts->wscale); - nth->check = 0; - nth->urg_ptr = 0; - - synproxy_build_options(nth, opts); - - synproxy_send_tcp(net, skb, nskb, skb_nfct(skb), - IP_CT_ESTABLISHED_REPLY, niph, nth, tcp_hdr_size); -} - -static bool -synproxy_recv_client_ack(struct net *net, - const struct sk_buff *skb, const struct tcphdr *th, - struct synproxy_options *opts, u32 recv_seq) -{ - struct synproxy_net *snet = synproxy_pernet(net); - int mss; - - mss = __cookie_v6_check(ipv6_hdr(skb), th, ntohl(th->ack_seq) - 1); - if (mss == 0) { - this_cpu_inc(snet->stats->cookie_invalid); - return false; - } - - this_cpu_inc(snet->stats->cookie_valid); - opts->mss = mss; - opts->options |= XT_SYNPROXY_OPT_MSS; - - if (opts->options & XT_SYNPROXY_OPT_TIMESTAMP) - synproxy_check_timestamp_cookie(opts); - - synproxy_send_server_syn(net, skb, th, opts, recv_seq); - return true; -} +#include <net/netfilter/nf_synproxy.h> static unsigned int synproxy_tg6(struct sk_buff *skb, const struct xt_action_param *par) @@ -304,13 +43,14 @@ synproxy_tg6(struct sk_buff *skb, const struct xt_action_param *par) XT_SYNPROXY_OPT_SACK_PERM | XT_SYNPROXY_OPT_ECN); - synproxy_send_client_synack(net, skb, th, &opts); + synproxy_send_client_synack_ipv6(net, skb, th, &opts); consume_skb(skb); return NF_STOLEN; } else if (th->ack && !(th->fin || th->rst || th->syn)) { /* ACK from client */ - if (synproxy_recv_client_ack(net, skb, th, &opts, ntohl(th->seq))) { + if (synproxy_recv_client_ack_ipv6(net, skb, th, &opts, + ntohl(th->seq))) { consume_skb(skb); return NF_STOLEN; } else { @@ -321,141 +61,6 @@ synproxy_tg6(struct sk_buff *skb, const struct xt_action_param *par) return XT_CONTINUE; } -static unsigned int ipv6_synproxy_hook(void *priv, - struct sk_buff *skb, - const struct nf_hook_state *nhs) -{ - struct net *net = nhs->net; - struct synproxy_net *snet = synproxy_pernet(net); - enum ip_conntrack_info ctinfo; - struct nf_conn *ct; - struct nf_conn_synproxy *synproxy; - struct synproxy_options opts = {}; - const struct ip_ct_tcp *state; - struct tcphdr *th, _th; - __be16 frag_off; - u8 nexthdr; - int thoff; - - ct = nf_ct_get(skb, &ctinfo); - if (ct == NULL) - return NF_ACCEPT; - - synproxy = nfct_synproxy(ct); - if (synproxy == NULL) - return NF_ACCEPT; - - if (nf_is_loopback_packet(skb)) - return NF_ACCEPT; - - nexthdr = ipv6_hdr(skb)->nexthdr; - thoff = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &nexthdr, - &frag_off); - if (thoff < 0 || nexthdr != IPPROTO_TCP) - return NF_ACCEPT; - - th = skb_header_pointer(skb, thoff, sizeof(_th), &_th); - if (th == NULL) - return NF_DROP; - - state = &ct->proto.tcp; - switch (state->state) { - case TCP_CONNTRACK_CLOSE: - if (th->rst && !test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) { - nf_ct_seqadj_init(ct, ctinfo, synproxy->isn - - ntohl(th->seq) + 1); - break; - } - - if (!th->syn || th->ack || - CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL) - break; - - /* Reopened connection - reset the sequence number and timestamp - * adjustments, they will get initialized once the connection is - * reestablished. - */ - nf_ct_seqadj_init(ct, ctinfo, 0); - synproxy->tsoff = 0; - this_cpu_inc(snet->stats->conn_reopened); - - /* fall through */ - case TCP_CONNTRACK_SYN_SENT: - if (!synproxy_parse_options(skb, thoff, th, &opts)) - return NF_DROP; - - if (!th->syn && th->ack && - CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL) { - /* Keep-Alives are sent with SEG.SEQ = SND.NXT-1, - * therefore we need to add 1 to make the SYN sequence - * number match the one of first SYN. - */ - if (synproxy_recv_client_ack(net, skb, th, &opts, - ntohl(th->seq) + 1)) { - this_cpu_inc(snet->stats->cookie_retrans); - consume_skb(skb); - return NF_STOLEN; - } else { - return NF_DROP; - } - } - - synproxy->isn = ntohl(th->ack_seq); - if (opts.options & XT_SYNPROXY_OPT_TIMESTAMP) - synproxy->its = opts.tsecr; - - nf_conntrack_event_cache(IPCT_SYNPROXY, ct); - break; - case TCP_CONNTRACK_SYN_RECV: - if (!th->syn || !th->ack) - break; - - if (!synproxy_parse_options(skb, thoff, th, &opts)) - return NF_DROP; - - if (opts.options & XT_SYNPROXY_OPT_TIMESTAMP) { - synproxy->tsoff = opts.tsval - synproxy->its; - nf_conntrack_event_cache(IPCT_SYNPROXY, ct); - } - - opts.options &= ~(XT_SYNPROXY_OPT_MSS | - XT_SYNPROXY_OPT_WSCALE | - XT_SYNPROXY_OPT_SACK_PERM); - - swap(opts.tsval, opts.tsecr); - synproxy_send_server_ack(net, state, skb, th, &opts); - - nf_ct_seqadj_init(ct, ctinfo, synproxy->isn - ntohl(th->seq)); - nf_conntrack_event_cache(IPCT_SEQADJ, ct); - - swap(opts.tsval, opts.tsecr); - synproxy_send_client_ack(net, skb, th, &opts); - - consume_skb(skb); - return NF_STOLEN; - default: - break; - } - - synproxy_tstamp_adjust(skb, thoff, th, ct, ctinfo, synproxy); - return NF_ACCEPT; -} - -static const struct nf_hook_ops ipv6_synproxy_ops[] = { - { - .hook = ipv6_synproxy_hook, - .pf = NFPROTO_IPV6, - .hooknum = NF_INET_LOCAL_IN, - .priority = NF_IP_PRI_CONNTRACK_CONFIRM - 1, - }, - { - .hook = ipv6_synproxy_hook, - .pf = NFPROTO_IPV6, - .hooknum = NF_INET_POST_ROUTING, - .priority = NF_IP_PRI_CONNTRACK_CONFIRM - 1, - }, -}; - static int synproxy_tg6_check(const struct xt_tgchk_param *par) { struct synproxy_net *snet = synproxy_pernet(par->net); @@ -471,16 +76,12 @@ static int synproxy_tg6_check(const struct xt_tgchk_param *par) if (err) return err; - if (snet->hook_ref6 == 0) { - err = nf_register_net_hooks(par->net, ipv6_synproxy_ops, - ARRAY_SIZE(ipv6_synproxy_ops)); - if (err) { - nf_ct_netns_put(par->net, par->family); - return err; - } + err = nf_synproxy_ipv6_init(snet, par->net); + if (err) { + nf_ct_netns_put(par->net, par->family); + return err; } - snet->hook_ref6++; return err; } @@ -488,10 +89,7 @@ static void synproxy_tg6_destroy(const struct xt_tgdtor_param *par) { struct synproxy_net *snet = synproxy_pernet(par->net); - snet->hook_ref6--; - if (snet->hook_ref6 == 0) - nf_unregister_net_hooks(par->net, ipv6_synproxy_ops, - ARRAY_SIZE(ipv6_synproxy_ops)); + nf_synproxy_ipv6_fini(snet, par->net); nf_ct_netns_put(par->net, par->family); } diff --git a/net/ipv6/netfilter/ip6table_raw.c b/net/ipv6/netfilter/ip6table_raw.c index 3f7d4691c423..a22100b1cf2c 100644 --- a/net/ipv6/netfilter/ip6table_raw.c +++ b/net/ipv6/netfilter/ip6table_raw.c @@ -2,7 +2,7 @@ /* * IPv6 raw table, a port of the IPv4 raw table to IPv6 * - * Copyright (C) 2003 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> + * Copyright (C) 2003 Jozsef Kadlecsik <kadlec@netfilter.org> */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c index 84322ce81d70..398e1df41406 100644 --- a/net/ipv6/netfilter/nf_conntrack_reasm.c +++ b/net/ipv6/netfilter/nf_conntrack_reasm.c @@ -54,26 +54,21 @@ static struct inet_frags nf_frags; static struct ctl_table nf_ct_frag6_sysctl_table[] = { { .procname = "nf_conntrack_frag6_timeout", - .data = &init_net.nf_frag.frags.timeout, .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, { .procname = "nf_conntrack_frag6_low_thresh", - .data = &init_net.nf_frag.frags.low_thresh, .maxlen = sizeof(unsigned long), .mode = 0644, .proc_handler = proc_doulongvec_minmax, - .extra2 = &init_net.nf_frag.frags.high_thresh }, { .procname = "nf_conntrack_frag6_high_thresh", - .data = &init_net.nf_frag.frags.high_thresh, .maxlen = sizeof(unsigned long), .mode = 0644, .proc_handler = proc_doulongvec_minmax, - .extra1 = &init_net.nf_frag.frags.low_thresh }, { } }; @@ -89,15 +84,15 @@ static int nf_ct_frag6_sysctl_register(struct net *net) GFP_KERNEL); if (table == NULL) goto err_alloc; - - table[0].data = &net->nf_frag.frags.timeout; - table[1].data = &net->nf_frag.frags.low_thresh; - table[1].extra2 = &net->nf_frag.frags.high_thresh; - table[2].data = &net->nf_frag.frags.high_thresh; - table[2].extra1 = &net->nf_frag.frags.low_thresh; - table[2].extra2 = &init_net.nf_frag.frags.high_thresh; } + table[0].data = &net->nf_frag.fqdir->timeout; + table[1].data = &net->nf_frag.fqdir->low_thresh; + table[1].extra2 = &net->nf_frag.fqdir->high_thresh; + table[2].data = &net->nf_frag.fqdir->high_thresh; + table[2].extra1 = &net->nf_frag.fqdir->low_thresh; + table[2].extra2 = &init_net.nf_frag.fqdir->high_thresh; + hdr = register_net_sysctl(net, "net/netfilter", table); if (hdr == NULL) goto err_reg; @@ -144,12 +139,10 @@ static void nf_ct_frag6_expire(struct timer_list *t) { struct inet_frag_queue *frag = from_timer(frag, t, timer); struct frag_queue *fq; - struct net *net; fq = container_of(frag, struct frag_queue, q); - net = container_of(fq->q.net, struct net, nf_frag.frags); - ip6frag_expire_frag_queue(net, fq); + ip6frag_expire_frag_queue(fq->q.fqdir->net, fq); } /* Creation primitives. */ @@ -165,7 +158,7 @@ static struct frag_queue *fq_find(struct net *net, __be32 id, u32 user, }; struct inet_frag_queue *q; - q = inet_frag_find(&net->nf_frag.frags, &key); + q = inet_frag_find(net->nf_frag.fqdir, &key); if (!q) return NULL; @@ -278,7 +271,7 @@ static int nf_ct_frag6_queue(struct frag_queue *fq, struct sk_buff *skb, fq->ecn |= ecn; if (payload_len > fq->q.max_size) fq->q.max_size = payload_len; - add_frag_mem_limit(fq->q.net, skb->truesize); + add_frag_mem_limit(fq->q.fqdir, skb->truesize); /* The first fragment. * nhoffset is obtained from the first fragment, of course. @@ -494,29 +487,35 @@ static int nf_ct_net_init(struct net *net) { int res; - net->nf_frag.frags.high_thresh = IPV6_FRAG_HIGH_THRESH; - net->nf_frag.frags.low_thresh = IPV6_FRAG_LOW_THRESH; - net->nf_frag.frags.timeout = IPV6_FRAG_TIMEOUT; - net->nf_frag.frags.f = &nf_frags; - - res = inet_frags_init_net(&net->nf_frag.frags); + res = fqdir_init(&net->nf_frag.fqdir, &nf_frags, net); if (res < 0) return res; + + net->nf_frag.fqdir->high_thresh = IPV6_FRAG_HIGH_THRESH; + net->nf_frag.fqdir->low_thresh = IPV6_FRAG_LOW_THRESH; + net->nf_frag.fqdir->timeout = IPV6_FRAG_TIMEOUT; + res = nf_ct_frag6_sysctl_register(net); if (res < 0) - inet_frags_exit_net(&net->nf_frag.frags); + fqdir_exit(net->nf_frag.fqdir); return res; } +static void nf_ct_net_pre_exit(struct net *net) +{ + fqdir_pre_exit(net->nf_frag.fqdir); +} + static void nf_ct_net_exit(struct net *net) { nf_ct_frags6_sysctl_unregister(net); - inet_frags_exit_net(&net->nf_frag.frags); + fqdir_exit(net->nf_frag.fqdir); } static struct pernet_operations nf_ct_net_ops = { - .init = nf_ct_net_init, - .exit = nf_ct_net_exit, + .init = nf_ct_net_init, + .pre_exit = nf_ct_net_pre_exit, + .exit = nf_ct_net_exit, }; static const struct rhashtable_params nfct_rhash_params = { diff --git a/net/ipv6/proc.c b/net/ipv6/proc.c index 4a8da679866e..bbff3e02e302 100644 --- a/net/ipv6/proc.c +++ b/net/ipv6/proc.c @@ -44,8 +44,8 @@ static int sockstat6_seq_show(struct seq_file *seq, void *v) seq_printf(seq, "RAW6: inuse %d\n", sock_prot_inuse_get(net, &rawv6_prot)); seq_printf(seq, "FRAG6: inuse %u memory %lu\n", - atomic_read(&net->ipv6.frags.rhashtable.nelems), - frag_mem_limit(&net->ipv6.frags)); + atomic_read(&net->ipv6.fqdir->rhashtable.nelems), + frag_mem_limit(net->ipv6.fqdir)); return 0; } diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index 70693bc7ad9d..8a6131991e38 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -834,7 +834,7 @@ static int rawv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) fl6.flowlabel = sin6->sin6_flowinfo&IPV6_FLOWINFO_MASK; if (fl6.flowlabel&IPV6_FLOWLABEL_MASK) { flowlabel = fl6_sock_lookup(sk, fl6.flowlabel); - if (!flowlabel) + if (IS_ERR(flowlabel)) return -EINVAL; } } @@ -876,7 +876,7 @@ static int rawv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) } if ((fl6.flowlabel&IPV6_FLOWLABEL_MASK) && !flowlabel) { flowlabel = fl6_sock_lookup(sk, fl6.flowlabel); - if (!flowlabel) + if (IS_ERR(flowlabel)) return -EINVAL; } if (!(opt->opt_nflen|opt->opt_flen)) diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c index b2b2c0c38b87..ca05b16f1bb9 100644 --- a/net/ipv6/reassembly.c +++ b/net/ipv6/reassembly.c @@ -72,12 +72,10 @@ static void ip6_frag_expire(struct timer_list *t) { struct inet_frag_queue *frag = from_timer(frag, t, timer); struct frag_queue *fq; - struct net *net; fq = container_of(frag, struct frag_queue, q); - net = container_of(fq->q.net, struct net, ipv6.frags); - ip6frag_expire_frag_queue(net, fq); + ip6frag_expire_frag_queue(fq->q.fqdir->net, fq); } static struct frag_queue * @@ -96,7 +94,7 @@ fq_find(struct net *net, __be32 id, const struct ipv6hdr *hdr, int iif) IPV6_ADDR_LINKLOCAL))) key.iif = 0; - q = inet_frag_find(&net->ipv6.frags, &key); + q = inet_frag_find(net->ipv6.fqdir, &key); if (!q) return NULL; @@ -196,7 +194,7 @@ static int ip6_frag_queue(struct frag_queue *fq, struct sk_buff *skb, fq->q.stamp = skb->tstamp; fq->q.meat += skb->len; fq->ecn |= ecn; - add_frag_mem_limit(fq->q.net, skb->truesize); + add_frag_mem_limit(fq->q.fqdir, skb->truesize); fragsize = -skb_network_offset(skb) + skb->len; if (fragsize > fq->q.max_size) @@ -250,7 +248,7 @@ err: static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *skb, struct sk_buff *prev_tail, struct net_device *dev) { - struct net *net = container_of(fq->q.net, struct net, ipv6.frags); + struct net *net = fq->q.fqdir->net; unsigned int nhoff; void *reasm_data; int payload_len; @@ -397,23 +395,18 @@ static const struct inet6_protocol frag_protocol = { static struct ctl_table ip6_frags_ns_ctl_table[] = { { .procname = "ip6frag_high_thresh", - .data = &init_net.ipv6.frags.high_thresh, .maxlen = sizeof(unsigned long), .mode = 0644, .proc_handler = proc_doulongvec_minmax, - .extra1 = &init_net.ipv6.frags.low_thresh }, { .procname = "ip6frag_low_thresh", - .data = &init_net.ipv6.frags.low_thresh, .maxlen = sizeof(unsigned long), .mode = 0644, .proc_handler = proc_doulongvec_minmax, - .extra2 = &init_net.ipv6.frags.high_thresh }, { .procname = "ip6frag_time", - .data = &init_net.ipv6.frags.timeout, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, @@ -445,12 +438,12 @@ static int __net_init ip6_frags_ns_sysctl_register(struct net *net) if (!table) goto err_alloc; - table[0].data = &net->ipv6.frags.high_thresh; - table[0].extra1 = &net->ipv6.frags.low_thresh; - table[1].data = &net->ipv6.frags.low_thresh; - table[1].extra2 = &net->ipv6.frags.high_thresh; - table[2].data = &net->ipv6.frags.timeout; } + table[0].data = &net->ipv6.fqdir->high_thresh; + table[0].extra1 = &net->ipv6.fqdir->low_thresh; + table[1].data = &net->ipv6.fqdir->low_thresh; + table[1].extra2 = &net->ipv6.fqdir->high_thresh; + table[2].data = &net->ipv6.fqdir->timeout; hdr = register_net_sysctl(net, "net/ipv6", table); if (!hdr) @@ -513,30 +506,35 @@ static int __net_init ipv6_frags_init_net(struct net *net) { int res; - net->ipv6.frags.high_thresh = IPV6_FRAG_HIGH_THRESH; - net->ipv6.frags.low_thresh = IPV6_FRAG_LOW_THRESH; - net->ipv6.frags.timeout = IPV6_FRAG_TIMEOUT; - net->ipv6.frags.f = &ip6_frags; - - res = inet_frags_init_net(&net->ipv6.frags); + res = fqdir_init(&net->ipv6.fqdir, &ip6_frags, net); if (res < 0) return res; + net->ipv6.fqdir->high_thresh = IPV6_FRAG_HIGH_THRESH; + net->ipv6.fqdir->low_thresh = IPV6_FRAG_LOW_THRESH; + net->ipv6.fqdir->timeout = IPV6_FRAG_TIMEOUT; + res = ip6_frags_ns_sysctl_register(net); if (res < 0) - inet_frags_exit_net(&net->ipv6.frags); + fqdir_exit(net->ipv6.fqdir); return res; } +static void __net_exit ipv6_frags_pre_exit_net(struct net *net) +{ + fqdir_pre_exit(net->ipv6.fqdir); +} + static void __net_exit ipv6_frags_exit_net(struct net *net) { ip6_frags_ns_sysctl_unregister(net); - inet_frags_exit_net(&net->ipv6.frags); + fqdir_exit(net->ipv6.fqdir); } static struct pernet_operations ip6_frags_ops = { - .init = ipv6_frags_init_net, - .exit = ipv6_frags_exit_net, + .init = ipv6_frags_init_net, + .pre_exit = ipv6_frags_pre_exit_net, + .exit = ipv6_frags_exit_net, }; static const struct rhashtable_params ip6_rhash_params = { @@ -587,8 +585,8 @@ err_protocol: void ipv6_frag_exit(void) { - inet_frags_fini(&ip6_frags); ip6_frags_sysctl_unregister(); unregister_pernet_subsys(&ip6_frags_ops); inet6_del_protocol(&frag_protocol, IPPROTO_FRAGMENT); + inet_frags_fini(&ip6_frags); } diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 97a843cf164c..4d2e6b31a8d6 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -100,7 +100,7 @@ static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb); static int rt6_score_route(const struct fib6_nh *nh, u32 fib6_flags, int oif, int strict); -static size_t rt6_nlmsg_size(struct fib6_info *rt); +static size_t rt6_nlmsg_size(struct fib6_info *f6i); static int rt6_fill_node(struct net *net, struct sk_buff *skb, struct fib6_info *rt, struct dst_entry *dst, struct in6_addr *dest, struct in6_addr *src, @@ -176,7 +176,7 @@ static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev) } if (rt_dev == dev) { - rt->dst.dev = loopback_dev; + rt->dst.dev = blackhole_netdev; dev_hold(rt->dst.dev); dev_put(rt_dev); } @@ -429,21 +429,27 @@ void fib6_select_path(const struct net *net, struct fib6_result *res, struct fib6_info *sibling, *next_sibling; struct fib6_info *match = res->f6i; - if (!match->fib6_nsiblings || have_oif_match) + if ((!match->fib6_nsiblings && !match->nh) || have_oif_match) goto out; /* We might have already computed the hash for ICMPv6 errors. In such * case it will always be non-zero. Otherwise now is the time to do it. */ - if (!fl6->mp_hash) + if (!fl6->mp_hash && + (!match->nh || nexthop_is_multipath(match->nh))) fl6->mp_hash = rt6_multipath_hash(net, fl6, skb, NULL); - if (fl6->mp_hash <= atomic_read(&match->fib6_nh.fib_nh_upper_bound)) + if (unlikely(match->nh)) { + nexthop_path_fib6_result(res, fl6->mp_hash); + return; + } + + if (fl6->mp_hash <= atomic_read(&match->fib6_nh->fib_nh_upper_bound)) goto out; list_for_each_entry_safe(sibling, next_sibling, &match->fib6_siblings, fib6_siblings) { - const struct fib6_nh *nh = &sibling->fib6_nh; + const struct fib6_nh *nh = sibling->fib6_nh; int nh_upper_bound; nh_upper_bound = atomic_read(&nh->fib_nh_upper_bound); @@ -457,7 +463,7 @@ void fib6_select_path(const struct net *net, struct fib6_result *res, out: res->f6i = match; - res->nh = &match->fib6_nh; + res->nh = match->fib6_nh; } /* @@ -485,6 +491,45 @@ static bool __rt6_device_match(struct net *net, const struct fib6_nh *nh, return false; } +struct fib6_nh_dm_arg { + struct net *net; + const struct in6_addr *saddr; + int oif; + int flags; + struct fib6_nh *nh; +}; + +static int __rt6_nh_dev_match(struct fib6_nh *nh, void *_arg) +{ + struct fib6_nh_dm_arg *arg = _arg; + + arg->nh = nh; + return __rt6_device_match(arg->net, nh, arg->saddr, arg->oif, + arg->flags); +} + +/* returns fib6_nh from nexthop or NULL */ +static struct fib6_nh *rt6_nh_dev_match(struct net *net, struct nexthop *nh, + struct fib6_result *res, + const struct in6_addr *saddr, + int oif, int flags) +{ + struct fib6_nh_dm_arg arg = { + .net = net, + .saddr = saddr, + .oif = oif, + .flags = flags, + }; + + if (nexthop_is_blackhole(nh)) + return NULL; + + if (nexthop_for_each_fib6_nh(nh, __rt6_nh_dev_match, &arg)) + return arg.nh; + + return NULL; +} + static void rt6_device_match(struct net *net, struct fib6_result *res, const struct in6_addr *saddr, int oif, int flags) { @@ -493,14 +538,31 @@ static void rt6_device_match(struct net *net, struct fib6_result *res, struct fib6_nh *nh; if (!oif && ipv6_addr_any(saddr)) { - nh = &f6i->fib6_nh; + if (unlikely(f6i->nh)) { + nh = nexthop_fib6_nh(f6i->nh); + if (nexthop_is_blackhole(f6i->nh)) + goto out_blackhole; + } else { + nh = f6i->fib6_nh; + } if (!(nh->fib_nh_flags & RTNH_F_DEAD)) goto out; } for (spf6i = f6i; spf6i; spf6i = rcu_dereference(spf6i->fib6_next)) { - nh = &spf6i->fib6_nh; - if (__rt6_device_match(net, nh, saddr, oif, flags)) { + bool matched = false; + + if (unlikely(spf6i->nh)) { + nh = rt6_nh_dev_match(net, spf6i->nh, res, saddr, + oif, flags); + if (nh) + matched = true; + } else { + nh = spf6i->fib6_nh; + if (__rt6_device_match(net, nh, saddr, oif, flags)) + matched = true; + } + if (matched) { res->f6i = spf6i; goto out; } @@ -508,19 +570,32 @@ static void rt6_device_match(struct net *net, struct fib6_result *res, if (oif && flags & RT6_LOOKUP_F_IFACE) { res->f6i = net->ipv6.fib6_null_entry; - nh = &res->f6i->fib6_nh; + nh = res->f6i->fib6_nh; goto out; } - nh = &f6i->fib6_nh; + if (unlikely(f6i->nh)) { + nh = nexthop_fib6_nh(f6i->nh); + if (nexthop_is_blackhole(f6i->nh)) + goto out_blackhole; + } else { + nh = f6i->fib6_nh; + } + if (nh->fib_nh_flags & RTNH_F_DEAD) { res->f6i = net->ipv6.fib6_null_entry; - nh = &res->f6i->fib6_nh; + nh = res->f6i->fib6_nh; } out: res->nh = nh; res->fib6_type = res->f6i->fib6_type; res->fib6_flags = res->f6i->fib6_flags; + return; + +out_blackhole: + res->fib6_flags |= RTF_REJECT; + res->fib6_type = RTN_BLACKHOLE; + res->nh = nh; } #ifdef CONFIG_IPV6_ROUTER_PREF @@ -691,6 +766,24 @@ out: return rc; } +struct fib6_nh_frl_arg { + u32 flags; + int oif; + int strict; + int *mpri; + bool *do_rr; + struct fib6_nh *nh; +}; + +static int rt6_nh_find_match(struct fib6_nh *nh, void *_arg) +{ + struct fib6_nh_frl_arg *arg = _arg; + + arg->nh = nh; + return find_match(nh, arg->flags, arg->oif, arg->strict, + arg->mpri, arg->do_rr); +} + static void __find_rr_leaf(struct fib6_info *f6i_start, struct fib6_info *nomatch, u32 metric, struct fib6_result *res, struct fib6_info **cont, @@ -701,6 +794,7 @@ static void __find_rr_leaf(struct fib6_info *f6i_start, for (f6i = f6i_start; f6i && f6i != nomatch; f6i = rcu_dereference(f6i->fib6_next)) { + bool matched = false; struct fib6_nh *nh; if (cont && f6i->fib6_metric != metric) { @@ -711,8 +805,34 @@ static void __find_rr_leaf(struct fib6_info *f6i_start, if (fib6_check_expired(f6i)) continue; - nh = &f6i->fib6_nh; - if (find_match(nh, f6i->fib6_flags, oif, strict, mpri, do_rr)) { + if (unlikely(f6i->nh)) { + struct fib6_nh_frl_arg arg = { + .flags = f6i->fib6_flags, + .oif = oif, + .strict = strict, + .mpri = mpri, + .do_rr = do_rr + }; + + if (nexthop_is_blackhole(f6i->nh)) { + res->fib6_flags = RTF_REJECT; + res->fib6_type = RTN_BLACKHOLE; + res->f6i = f6i; + res->nh = nexthop_fib6_nh(f6i->nh); + return; + } + if (nexthop_for_each_fib6_nh(f6i->nh, rt6_nh_find_match, + &arg)) { + matched = true; + nh = arg.nh; + } + } else { + nh = f6i->fib6_nh; + if (find_match(nh, f6i->fib6_flags, oif, strict, + mpri, do_rr)) + matched = true; + } + if (matched) { res->f6i = f6i; res->nh = nh; res->fib6_flags = f6i->fib6_flags; @@ -793,7 +913,7 @@ static void rt6_select(struct net *net, struct fib6_node *fn, int oif, out: if (!res->f6i) { res->f6i = net->ipv6.fib6_null_entry; - res->nh = &res->f6i->fib6_nh; + res->nh = res->f6i->fib6_nh; res->fib6_flags = res->f6i->fib6_flags; res->fib6_type = res->f6i->fib6_type; } @@ -1114,6 +1234,8 @@ restart: rt = net->ipv6.ip6_null_entry; dst_hold(&rt->dst); goto out; + } else if (res.fib6_flags & RTF_REJECT) { + goto do_create; } fib6_select_path(net, &res, fl6, fl6->flowi6_oif, @@ -1125,6 +1247,7 @@ restart: if (ip6_hold_safe(net, &rt)) dst_use_noref(&rt->dst, jiffies); } else { +do_create: rt = ip6_create_rt_rcu(&res); } @@ -1265,13 +1388,9 @@ static struct rt6_info *ip6_rt_pcpu_alloc(const struct fib6_result *res) /* It should be called with rcu_read_lock() acquired */ static struct rt6_info *rt6_get_pcpu_route(const struct fib6_result *res) { - struct rt6_info *pcpu_rt, **p; - - p = this_cpu_ptr(res->f6i->rt6i_pcpu); - pcpu_rt = *p; + struct rt6_info *pcpu_rt; - if (pcpu_rt) - ip6_hold_safe(NULL, &pcpu_rt); + pcpu_rt = this_cpu_read(*res->nh->rt6i_pcpu); return pcpu_rt; } @@ -1282,13 +1401,10 @@ static struct rt6_info *rt6_make_pcpu_route(struct net *net, struct rt6_info *pcpu_rt, *prev, **p; pcpu_rt = ip6_rt_pcpu_alloc(res); - if (!pcpu_rt) { - dst_hold(&net->ipv6.ip6_null_entry->dst); - return net->ipv6.ip6_null_entry; - } + if (!pcpu_rt) + return NULL; - dst_hold(&pcpu_rt->dst); - p = this_cpu_ptr(res->f6i->rt6i_pcpu); + p = this_cpu_ptr(res->nh->rt6i_pcpu); prev = cmpxchg(p, NULL, pcpu_rt); BUG_ON(prev); @@ -1458,25 +1574,74 @@ static unsigned int fib6_mtu(const struct fib6_result *res) return mtu - lwtunnel_headroom(nh->fib_nh_lws, mtu); } +#define FIB6_EXCEPTION_BUCKET_FLUSHED 0x1UL + +/* used when the flushed bit is not relevant, only access to the bucket + * (ie., all bucket users except rt6_insert_exception); + * + * called under rcu lock; sometimes called with rt6_exception_lock held + */ +static +struct rt6_exception_bucket *fib6_nh_get_excptn_bucket(const struct fib6_nh *nh, + spinlock_t *lock) +{ + struct rt6_exception_bucket *bucket; + + if (lock) + bucket = rcu_dereference_protected(nh->rt6i_exception_bucket, + lockdep_is_held(lock)); + else + bucket = rcu_dereference(nh->rt6i_exception_bucket); + + /* remove bucket flushed bit if set */ + if (bucket) { + unsigned long p = (unsigned long)bucket; + + p &= ~FIB6_EXCEPTION_BUCKET_FLUSHED; + bucket = (struct rt6_exception_bucket *)p; + } + + return bucket; +} + +static bool fib6_nh_excptn_bucket_flushed(struct rt6_exception_bucket *bucket) +{ + unsigned long p = (unsigned long)bucket; + + return !!(p & FIB6_EXCEPTION_BUCKET_FLUSHED); +} + +/* called with rt6_exception_lock held */ +static void fib6_nh_excptn_bucket_set_flushed(struct fib6_nh *nh, + spinlock_t *lock) +{ + struct rt6_exception_bucket *bucket; + unsigned long p; + + bucket = rcu_dereference_protected(nh->rt6i_exception_bucket, + lockdep_is_held(lock)); + + p = (unsigned long)bucket; + p |= FIB6_EXCEPTION_BUCKET_FLUSHED; + bucket = (struct rt6_exception_bucket *)p; + rcu_assign_pointer(nh->rt6i_exception_bucket, bucket); +} + static int rt6_insert_exception(struct rt6_info *nrt, const struct fib6_result *res) { struct net *net = dev_net(nrt->dst.dev); struct rt6_exception_bucket *bucket; + struct fib6_info *f6i = res->f6i; struct in6_addr *src_key = NULL; struct rt6_exception *rt6_ex; - struct fib6_info *f6i = res->f6i; + struct fib6_nh *nh = res->nh; int err = 0; spin_lock_bh(&rt6_exception_lock); - if (f6i->exception_bucket_flushed) { - err = -EINVAL; - goto out; - } - - bucket = rcu_dereference_protected(f6i->rt6i_exception_bucket, - lockdep_is_held(&rt6_exception_lock)); + bucket = rcu_dereference_protected(nh->rt6i_exception_bucket, + lockdep_is_held(&rt6_exception_lock)); if (!bucket) { bucket = kcalloc(FIB6_EXCEPTION_BUCKET_SIZE, sizeof(*bucket), GFP_ATOMIC); @@ -1484,7 +1649,10 @@ static int rt6_insert_exception(struct rt6_info *nrt, err = -ENOMEM; goto out; } - rcu_assign_pointer(f6i->rt6i_exception_bucket, bucket); + rcu_assign_pointer(nh->rt6i_exception_bucket, bucket); + } else if (fib6_nh_excptn_bucket_flushed(bucket)) { + err = -EINVAL; + goto out; } #ifdef CONFIG_IPV6_SUBTREES @@ -1539,7 +1707,7 @@ out: return err; } -void rt6_flush_exceptions(struct fib6_info *rt) +static void fib6_nh_flush_exceptions(struct fib6_nh *nh, struct fib6_info *from) { struct rt6_exception_bucket *bucket; struct rt6_exception *rt6_ex; @@ -1547,25 +1715,46 @@ void rt6_flush_exceptions(struct fib6_info *rt) int i; spin_lock_bh(&rt6_exception_lock); - /* Prevent rt6_insert_exception() to recreate the bucket list */ - rt->exception_bucket_flushed = 1; - bucket = rcu_dereference_protected(rt->rt6i_exception_bucket, - lockdep_is_held(&rt6_exception_lock)); + bucket = fib6_nh_get_excptn_bucket(nh, &rt6_exception_lock); if (!bucket) goto out; + /* Prevent rt6_insert_exception() to recreate the bucket list */ + if (!from) + fib6_nh_excptn_bucket_set_flushed(nh, &rt6_exception_lock); + for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) { - hlist_for_each_entry_safe(rt6_ex, tmp, &bucket->chain, hlist) - rt6_remove_exception(bucket, rt6_ex); - WARN_ON_ONCE(bucket->depth); + hlist_for_each_entry_safe(rt6_ex, tmp, &bucket->chain, hlist) { + if (!from || + rcu_access_pointer(rt6_ex->rt6i->from) == from) + rt6_remove_exception(bucket, rt6_ex); + } + WARN_ON_ONCE(!from && bucket->depth); bucket++; } - out: spin_unlock_bh(&rt6_exception_lock); } +static int rt6_nh_flush_exceptions(struct fib6_nh *nh, void *arg) +{ + struct fib6_info *f6i = arg; + + fib6_nh_flush_exceptions(nh, f6i); + + return 0; +} + +void rt6_flush_exceptions(struct fib6_info *f6i) +{ + if (f6i->nh) + nexthop_for_each_fib6_nh(f6i->nh, rt6_nh_flush_exceptions, + f6i); + else + fib6_nh_flush_exceptions(f6i->fib6_nh, f6i); +} + /* Find cached rt in the hash table inside passed in rt * Caller has to hold rcu_read_lock() */ @@ -1594,7 +1783,7 @@ static struct rt6_info *rt6_find_cached_rt(const struct fib6_result *res, src_key = saddr; find_ex: #endif - bucket = rcu_dereference(res->f6i->rt6i_exception_bucket); + bucket = fib6_nh_get_excptn_bucket(res->nh, NULL); rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key); if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i)) @@ -1612,25 +1801,20 @@ find_ex: } /* Remove the passed in cached rt from the hash table that contains it */ -static int rt6_remove_exception_rt(struct rt6_info *rt) +static int fib6_nh_remove_exception(const struct fib6_nh *nh, int plen, + const struct rt6_info *rt) { + const struct in6_addr *src_key = NULL; struct rt6_exception_bucket *bucket; - struct in6_addr *src_key = NULL; struct rt6_exception *rt6_ex; - struct fib6_info *from; int err; - from = rcu_dereference(rt->from); - if (!from || - !(rt->rt6i_flags & RTF_CACHE)) - return -EINVAL; - - if (!rcu_access_pointer(from->rt6i_exception_bucket)) + if (!rcu_access_pointer(nh->rt6i_exception_bucket)) return -ENOENT; spin_lock_bh(&rt6_exception_lock); - bucket = rcu_dereference_protected(from->rt6i_exception_bucket, - lockdep_is_held(&rt6_exception_lock)); + bucket = fib6_nh_get_excptn_bucket(nh, &rt6_exception_lock); + #ifdef CONFIG_IPV6_SUBTREES /* rt6i_src.plen != 0 indicates 'from' is in subtree * and exception table is indexed by a hash of @@ -1638,7 +1822,7 @@ static int rt6_remove_exception_rt(struct rt6_info *rt) * Otherwise, the exception table is indexed by * a hash of only rt6i_dst. */ - if (from->fib6_src.plen) + if (plen) src_key = &rt->rt6i_src.addr; #endif rt6_ex = __rt6_find_exception_spinlock(&bucket, @@ -1655,23 +1839,60 @@ static int rt6_remove_exception_rt(struct rt6_info *rt) return err; } -/* Find rt6_ex which contains the passed in rt cache and - * refresh its stamp - */ -static void rt6_update_exception_stamp_rt(struct rt6_info *rt) +struct fib6_nh_excptn_arg { + struct rt6_info *rt; + int plen; +}; + +static int rt6_nh_remove_exception_rt(struct fib6_nh *nh, void *_arg) +{ + struct fib6_nh_excptn_arg *arg = _arg; + int err; + + err = fib6_nh_remove_exception(nh, arg->plen, arg->rt); + if (err == 0) + return 1; + + return 0; +} + +static int rt6_remove_exception_rt(struct rt6_info *rt) { - struct rt6_exception_bucket *bucket; - struct in6_addr *src_key = NULL; - struct rt6_exception *rt6_ex; struct fib6_info *from; - rcu_read_lock(); from = rcu_dereference(rt->from); if (!from || !(rt->rt6i_flags & RTF_CACHE)) - goto unlock; + return -EINVAL; - bucket = rcu_dereference(from->rt6i_exception_bucket); + if (from->nh) { + struct fib6_nh_excptn_arg arg = { + .rt = rt, + .plen = from->fib6_src.plen + }; + int rc; + + /* rc = 1 means an entry was found */ + rc = nexthop_for_each_fib6_nh(from->nh, + rt6_nh_remove_exception_rt, + &arg); + return rc ? 0 : -ENOENT; + } + return fib6_nh_remove_exception(from->fib6_nh, + from->fib6_src.plen, rt); +} + +/* Find rt6_ex which contains the passed in rt cache and + * refresh its stamp + */ +static void fib6_nh_update_exception(const struct fib6_nh *nh, int plen, + const struct rt6_info *rt) +{ + const struct in6_addr *src_key = NULL; + struct rt6_exception_bucket *bucket; + struct rt6_exception *rt6_ex; + + bucket = fib6_nh_get_excptn_bucket(nh, NULL); #ifdef CONFIG_IPV6_SUBTREES /* rt6i_src.plen != 0 indicates 'from' is in subtree * and exception table is indexed by a hash of @@ -1679,15 +1900,63 @@ static void rt6_update_exception_stamp_rt(struct rt6_info *rt) * Otherwise, the exception table is indexed by * a hash of only rt6i_dst. */ - if (from->fib6_src.plen) + if (plen) src_key = &rt->rt6i_src.addr; #endif - rt6_ex = __rt6_find_exception_rcu(&bucket, - &rt->rt6i_dst.addr, - src_key); + rt6_ex = __rt6_find_exception_rcu(&bucket, &rt->rt6i_dst.addr, src_key); if (rt6_ex) rt6_ex->stamp = jiffies; +} + +struct fib6_nh_match_arg { + const struct net_device *dev; + const struct in6_addr *gw; + struct fib6_nh *match; +}; + +/* determine if fib6_nh has given device and gateway */ +static int fib6_nh_find_match(struct fib6_nh *nh, void *_arg) +{ + struct fib6_nh_match_arg *arg = _arg; + + if (arg->dev != nh->fib_nh_dev || + (arg->gw && !nh->fib_nh_gw_family) || + (!arg->gw && nh->fib_nh_gw_family) || + (arg->gw && !ipv6_addr_equal(arg->gw, &nh->fib_nh_gw6))) + return 0; + + arg->match = nh; + + /* found a match, break the loop */ + return 1; +} + +static void rt6_update_exception_stamp_rt(struct rt6_info *rt) +{ + struct fib6_info *from; + struct fib6_nh *fib6_nh; + + rcu_read_lock(); + + from = rcu_dereference(rt->from); + if (!from || !(rt->rt6i_flags & RTF_CACHE)) + goto unlock; + if (from->nh) { + struct fib6_nh_match_arg arg = { + .dev = rt->dst.dev, + .gw = &rt->rt6i_gateway, + }; + + nexthop_for_each_fib6_nh(from->nh, fib6_nh_find_match, &arg); + + if (!arg.match) + return; + fib6_nh = arg.match; + } else { + fib6_nh = from->fib6_nh; + } + fib6_nh_update_exception(fib6_nh, from->fib6_src.plen, rt); unlock: rcu_read_unlock(); } @@ -1715,15 +1984,13 @@ static bool rt6_mtu_change_route_allowed(struct inet6_dev *idev, } static void rt6_exceptions_update_pmtu(struct inet6_dev *idev, - struct fib6_info *rt, int mtu) + const struct fib6_nh *nh, int mtu) { struct rt6_exception_bucket *bucket; struct rt6_exception *rt6_ex; int i; - bucket = rcu_dereference_protected(rt->rt6i_exception_bucket, - lockdep_is_held(&rt6_exception_lock)); - + bucket = fib6_nh_get_excptn_bucket(nh, &rt6_exception_lock); if (!bucket) return; @@ -1745,21 +2012,19 @@ static void rt6_exceptions_update_pmtu(struct inet6_dev *idev, #define RTF_CACHE_GATEWAY (RTF_GATEWAY | RTF_CACHE) -static void rt6_exceptions_clean_tohost(struct fib6_info *rt, - struct in6_addr *gateway) +static void fib6_nh_exceptions_clean_tohost(const struct fib6_nh *nh, + const struct in6_addr *gateway) { struct rt6_exception_bucket *bucket; struct rt6_exception *rt6_ex; struct hlist_node *tmp; int i; - if (!rcu_access_pointer(rt->rt6i_exception_bucket)) + if (!rcu_access_pointer(nh->rt6i_exception_bucket)) return; spin_lock_bh(&rt6_exception_lock); - bucket = rcu_dereference_protected(rt->rt6i_exception_bucket, - lockdep_is_held(&rt6_exception_lock)); - + bucket = fib6_nh_get_excptn_bucket(nh, &rt6_exception_lock); if (bucket) { for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) { hlist_for_each_entry_safe(rt6_ex, tmp, @@ -1824,23 +2089,21 @@ static void rt6_age_examine_exception(struct rt6_exception_bucket *bucket, gc_args->more++; } -void rt6_age_exceptions(struct fib6_info *rt, - struct fib6_gc_args *gc_args, - unsigned long now) +static void fib6_nh_age_exceptions(const struct fib6_nh *nh, + struct fib6_gc_args *gc_args, + unsigned long now) { struct rt6_exception_bucket *bucket; struct rt6_exception *rt6_ex; struct hlist_node *tmp; int i; - if (!rcu_access_pointer(rt->rt6i_exception_bucket)) + if (!rcu_access_pointer(nh->rt6i_exception_bucket)) return; rcu_read_lock_bh(); spin_lock(&rt6_exception_lock); - bucket = rcu_dereference_protected(rt->rt6i_exception_bucket, - lockdep_is_held(&rt6_exception_lock)); - + bucket = fib6_nh_get_excptn_bucket(nh, &rt6_exception_lock); if (bucket) { for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) { hlist_for_each_entry_safe(rt6_ex, tmp, @@ -1855,6 +2118,36 @@ void rt6_age_exceptions(struct fib6_info *rt, rcu_read_unlock_bh(); } +struct fib6_nh_age_excptn_arg { + struct fib6_gc_args *gc_args; + unsigned long now; +}; + +static int rt6_nh_age_exceptions(struct fib6_nh *nh, void *_arg) +{ + struct fib6_nh_age_excptn_arg *arg = _arg; + + fib6_nh_age_exceptions(nh, arg->gc_args, arg->now); + return 0; +} + +void rt6_age_exceptions(struct fib6_info *f6i, + struct fib6_gc_args *gc_args, + unsigned long now) +{ + if (f6i->nh) { + struct fib6_nh_age_excptn_arg arg = { + .gc_args = gc_args, + .now = now + }; + + nexthop_for_each_fib6_nh(f6i->nh, rt6_nh_age_exceptions, + &arg); + } else { + fib6_nh_age_exceptions(f6i->fib6_nh, gc_args, now); + } +} + /* must be called with rcu lock held */ int fib6_table_lookup(struct net *net, struct fib6_table *table, int oif, struct flowi6 *fl6, struct fib6_result *res, int strict) @@ -1891,9 +2184,12 @@ struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, const struct sk_buff *skb, int flags) { struct fib6_result res = {}; - struct rt6_info *rt; + struct rt6_info *rt = NULL; int strict = 0; + WARN_ON_ONCE((flags & RT6_LOOKUP_F_DST_NOREF) && + !rcu_read_lock_held()); + strict |= flags & RT6_LOOKUP_F_IFACE; strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE; if (net->ipv6.devconf_all->forwarding == 0) @@ -1902,23 +2198,15 @@ struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, rcu_read_lock(); fib6_table_lookup(net, table, oif, fl6, &res, strict); - if (res.f6i == net->ipv6.fib6_null_entry) { - rt = net->ipv6.ip6_null_entry; - rcu_read_unlock(); - dst_hold(&rt->dst); - return rt; - } + if (res.f6i == net->ipv6.fib6_null_entry) + goto out; fib6_select_path(net, &res, fl6, oif, false, skb, strict); /*Search through exception table */ rt = rt6_find_cached_rt(&res, &fl6->daddr, &fl6->saddr); if (rt) { - if (ip6_hold_safe(net, &rt)) - dst_use_noref(&rt->dst, jiffies); - - rcu_read_unlock(); - return rt; + goto out; } else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) && !res.nh->fib_nh_gw_family)) { /* Create a RTF_CACHE clone which will not be @@ -1926,40 +2214,38 @@ struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, * the daddr in the skb during the neighbor look-up is different * from the fl6->daddr used to look-up route here. */ - struct rt6_info *uncached_rt; - - uncached_rt = ip6_rt_cache_alloc(&res, &fl6->daddr, NULL); + rt = ip6_rt_cache_alloc(&res, &fl6->daddr, NULL); - rcu_read_unlock(); - - if (uncached_rt) { - /* Uncached_rt's refcnt is taken during ip6_rt_cache_alloc() - * No need for another dst_hold() + if (rt) { + /* 1 refcnt is taken during ip6_rt_cache_alloc(). + * As rt6_uncached_list_add() does not consume refcnt, + * this refcnt is always returned to the caller even + * if caller sets RT6_LOOKUP_F_DST_NOREF flag. */ - rt6_uncached_list_add(uncached_rt); + rt6_uncached_list_add(rt); atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache); - } else { - uncached_rt = net->ipv6.ip6_null_entry; - dst_hold(&uncached_rt->dst); - } + rcu_read_unlock(); - return uncached_rt; + return rt; + } } else { /* Get a percpu copy */ - - struct rt6_info *pcpu_rt; - local_bh_disable(); - pcpu_rt = rt6_get_pcpu_route(&res); + rt = rt6_get_pcpu_route(&res); - if (!pcpu_rt) - pcpu_rt = rt6_make_pcpu_route(net, &res); + if (!rt) + rt = rt6_make_pcpu_route(net, &res); local_bh_enable(); - rcu_read_unlock(); - - return pcpu_rt; } +out: + if (!rt) + rt = net->ipv6.ip6_null_entry; + if (!(flags & RT6_LOOKUP_F_DST_NOREF)) + ip6_hold_safe(net, &rt); + rcu_read_unlock(); + + return rt; } EXPORT_SYMBOL_GPL(ip6_pol_route); @@ -2084,17 +2370,54 @@ u32 rt6_multipath_hash(const struct net *net, const struct flowi6 *fl6, hash_keys.basic.ip_proto = fl6->flowi6_proto; } break; + case 2: + memset(&hash_keys, 0, sizeof(hash_keys)); + hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; + if (skb) { + struct flow_keys keys; + + if (!flkeys) { + skb_flow_dissect_flow_keys(skb, &keys, 0); + flkeys = &keys; + } + + /* Inner can be v4 or v6 */ + if (flkeys->control.addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS) { + hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; + hash_keys.addrs.v4addrs.src = flkeys->addrs.v4addrs.src; + hash_keys.addrs.v4addrs.dst = flkeys->addrs.v4addrs.dst; + } else if (flkeys->control.addr_type == FLOW_DISSECTOR_KEY_IPV6_ADDRS) { + hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; + hash_keys.addrs.v6addrs.src = flkeys->addrs.v6addrs.src; + hash_keys.addrs.v6addrs.dst = flkeys->addrs.v6addrs.dst; + hash_keys.tags.flow_label = flkeys->tags.flow_label; + hash_keys.basic.ip_proto = flkeys->basic.ip_proto; + } else { + /* Same as case 0 */ + hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; + ip6_multipath_l3_keys(skb, &hash_keys, flkeys); + } + } else { + /* Same as case 0 */ + hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; + hash_keys.addrs.v6addrs.src = fl6->saddr; + hash_keys.addrs.v6addrs.dst = fl6->daddr; + hash_keys.tags.flow_label = (__force u32)flowi6_get_flowlabel(fl6); + hash_keys.basic.ip_proto = fl6->flowi6_proto; + } + break; } mhash = flow_hash_from_keys(&hash_keys); return mhash >> 1; } +/* Called with rcu held */ void ip6_route_input(struct sk_buff *skb) { const struct ipv6hdr *iph = ipv6_hdr(skb); struct net *net = dev_net(skb->dev); - int flags = RT6_LOOKUP_F_HAS_SADDR; + int flags = RT6_LOOKUP_F_HAS_SADDR | RT6_LOOKUP_F_DST_NOREF; struct ip_tunnel_info *tun_info; struct flowi6 fl6 = { .flowi6_iif = skb->dev->ifindex, @@ -2116,8 +2439,8 @@ void ip6_route_input(struct sk_buff *skb) if (unlikely(fl6.flowi6_proto == IPPROTO_ICMPV6)) fl6.mp_hash = rt6_multipath_hash(net, &fl6, skb, flkeys); skb_dst_drop(skb); - skb_dst_set(skb, - ip6_route_input_lookup(net, skb->dev, &fl6, skb, flags)); + skb_dst_set_noref(skb, ip6_route_input_lookup(net, skb->dev, + &fl6, skb, flags)); } static struct rt6_info *ip6_pol_route_output(struct net *net, @@ -2129,8 +2452,9 @@ static struct rt6_info *ip6_pol_route_output(struct net *net, return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, skb, flags); } -struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk, - struct flowi6 *fl6, int flags) +struct dst_entry *ip6_route_output_flags_noref(struct net *net, + const struct sock *sk, + struct flowi6 *fl6, int flags) { bool any_src; @@ -2138,6 +2462,7 @@ struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk, (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL)) { struct dst_entry *dst; + /* This function does not take refcnt on the dst */ dst = l3mdev_link_scope_lookup(net, fl6); if (dst) return dst; @@ -2145,6 +2470,7 @@ struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk, fl6->flowi6_iif = LOOPBACK_IFINDEX; + flags |= RT6_LOOKUP_F_DST_NOREF; any_src = ipv6_addr_any(&fl6->saddr); if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr) || (fl6->flowi6_oif && any_src)) @@ -2157,6 +2483,28 @@ struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk, return fib6_rule_lookup(net, fl6, NULL, flags, ip6_pol_route_output); } +EXPORT_SYMBOL_GPL(ip6_route_output_flags_noref); + +struct dst_entry *ip6_route_output_flags(struct net *net, + const struct sock *sk, + struct flowi6 *fl6, + int flags) +{ + struct dst_entry *dst; + struct rt6_info *rt6; + + rcu_read_lock(); + dst = ip6_route_output_flags_noref(net, sk, fl6, flags); + rt6 = (struct rt6_info *)dst; + /* For dst cached in uncached_list, refcnt is already taken. */ + if (list_empty(&rt6->rt6i_uncached) && !dst_hold_safe(dst)) { + dst = &net->ipv6.ip6_null_entry->dst; + dst_hold(dst); + } + rcu_read_unlock(); + + return dst; +} EXPORT_SYMBOL_GPL(ip6_route_output_flags); struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig) @@ -2381,10 +2729,31 @@ static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk, rcu_read_unlock(); return; } - res.nh = &res.f6i->fib6_nh; res.fib6_flags = res.f6i->fib6_flags; res.fib6_type = res.f6i->fib6_type; + if (res.f6i->nh) { + struct fib6_nh_match_arg arg = { + .dev = dst->dev, + .gw = &rt6->rt6i_gateway, + }; + + nexthop_for_each_fib6_nh(res.f6i->nh, + fib6_nh_find_match, &arg); + + /* fib6_info uses a nexthop that does not have fib6_nh + * using the dst->dev + gw. Should be impossible. + */ + if (!arg.match) { + rcu_read_unlock(); + return; + } + + res.nh = arg.match; + } else { + res.nh = res.f6i->fib6_nh; + } + nrt6 = ip6_rt_cache_alloc(&res, daddr, saddr); if (nrt6) { rt6_do_update_pmtu(nrt6, mtu); @@ -2491,6 +2860,21 @@ static bool ip6_redirect_nh_match(const struct fib6_result *res, return true; } +struct fib6_nh_rd_arg { + struct fib6_result *res; + struct flowi6 *fl6; + const struct in6_addr *gw; + struct rt6_info **ret; +}; + +static int fib6_nh_redirect_match(struct fib6_nh *nh, void *_arg) +{ + struct fib6_nh_rd_arg *arg = _arg; + + arg->res->nh = nh; + return ip6_redirect_nh_match(arg->res, arg->fl6, arg->gw, arg->ret); +} + /* Handle redirects */ struct ip6rd_flowi { struct flowi6 fl6; @@ -2506,6 +2890,12 @@ static struct rt6_info *__ip6_route_redirect(struct net *net, struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6; struct rt6_info *ret = NULL; struct fib6_result res = {}; + struct fib6_nh_rd_arg arg = { + .res = &res, + .fl6 = fl6, + .gw = &rdfl->gateway, + .ret = &ret + }; struct fib6_info *rt; struct fib6_node *fn; @@ -2530,14 +2920,24 @@ static struct rt6_info *__ip6_route_redirect(struct net *net, restart: for_each_fib6_node_rt_rcu(fn) { res.f6i = rt; - res.nh = &rt->fib6_nh; - if (fib6_check_expired(rt)) continue; if (rt->fib6_flags & RTF_REJECT) break; - if (ip6_redirect_nh_match(&res, fl6, &rdfl->gateway, &ret)) - goto out; + if (unlikely(rt->nh)) { + if (nexthop_is_blackhole(rt->nh)) + continue; + /* on match, res->nh is filled in and potentially ret */ + if (nexthop_for_each_fib6_nh(rt->nh, + fib6_nh_redirect_match, + &arg)) + goto out; + } else { + res.nh = rt->fib6_nh; + if (ip6_redirect_nh_match(&res, fl6, &rdfl->gateway, + &ret)) + goto out; + } } if (!rt) @@ -2554,7 +2954,7 @@ restart: } res.f6i = rt; - res.nh = &rt->fib6_nh; + res.nh = rt->fib6_nh; out: if (ret) { ip6_hold_safe(net, &ret); @@ -2781,10 +3181,9 @@ out: return entries > rt_max_size; } -static struct rt6_info *ip6_nh_lookup_table(struct net *net, - struct fib6_config *cfg, - const struct in6_addr *gw_addr, - u32 tbid, int flags) +static int ip6_nh_lookup_table(struct net *net, struct fib6_config *cfg, + const struct in6_addr *gw_addr, u32 tbid, + int flags, struct fib6_result *res) { struct flowi6 fl6 = { .flowi6_oif = cfg->fc_ifindex, @@ -2792,25 +3191,23 @@ static struct rt6_info *ip6_nh_lookup_table(struct net *net, .saddr = cfg->fc_prefsrc, }; struct fib6_table *table; - struct rt6_info *rt; + int err; table = fib6_get_table(net, tbid); if (!table) - return NULL; + return -EINVAL; if (!ipv6_addr_any(&cfg->fc_prefsrc)) flags |= RT6_LOOKUP_F_HAS_SADDR; flags |= RT6_LOOKUP_F_IGNORE_LINKSTATE; - rt = ip6_pol_route(net, table, cfg->fc_ifindex, &fl6, NULL, flags); - /* if table lookup failed, fall back to full lookup */ - if (rt == net->ipv6.ip6_null_entry) { - ip6_rt_put(rt); - rt = NULL; - } + err = fib6_table_lookup(net, table, cfg->fc_ifindex, &fl6, res, flags); + if (!err && res->f6i != net->ipv6.fib6_null_entry) + fib6_select_path(net, res, &fl6, cfg->fc_ifindex, + cfg->fc_ifindex != 0, NULL, flags); - return rt; + return err; } static int ip6_route_check_nh_onlink(struct net *net, @@ -2818,29 +3215,19 @@ static int ip6_route_check_nh_onlink(struct net *net, const struct net_device *dev, struct netlink_ext_ack *extack) { - u32 tbid = l3mdev_fib_table(dev) ? : RT_TABLE_MAIN; + u32 tbid = l3mdev_fib_table_rcu(dev) ? : RT_TABLE_MAIN; const struct in6_addr *gw_addr = &cfg->fc_gateway; - u32 flags = RTF_LOCAL | RTF_ANYCAST | RTF_REJECT; - struct fib6_info *from; - struct rt6_info *grt; + struct fib6_result res = {}; int err; - err = 0; - grt = ip6_nh_lookup_table(net, cfg, gw_addr, tbid, 0); - if (grt) { - rcu_read_lock(); - from = rcu_dereference(grt->from); - if (!grt->dst.error && - /* ignore match if it is the default route */ - from && !ipv6_addr_any(&from->fib6_dst.addr) && - (grt->rt6i_flags & flags || dev != grt->dst.dev)) { - NL_SET_ERR_MSG(extack, - "Nexthop has invalid gateway or device mismatch"); - err = -EINVAL; - } - rcu_read_unlock(); - - ip6_rt_put(grt); + err = ip6_nh_lookup_table(net, cfg, gw_addr, tbid, 0, &res); + if (!err && !(res.fib6_flags & RTF_REJECT) && + /* ignore match if it is the default route */ + !ipv6_addr_any(&res.f6i->fib6_dst.addr) && + (res.fib6_type != RTN_UNICAST || dev != res.nh->fib_nh_dev)) { + NL_SET_ERR_MSG(extack, + "Nexthop has invalid gateway or device mismatch"); + err = -EINVAL; } return err; @@ -2853,47 +3240,50 @@ static int ip6_route_check_nh(struct net *net, { const struct in6_addr *gw_addr = &cfg->fc_gateway; struct net_device *dev = _dev ? *_dev : NULL; - struct rt6_info *grt = NULL; + int flags = RT6_LOOKUP_F_IFACE; + struct fib6_result res = {}; int err = -EHOSTUNREACH; if (cfg->fc_table) { - int flags = RT6_LOOKUP_F_IFACE; - - grt = ip6_nh_lookup_table(net, cfg, gw_addr, - cfg->fc_table, flags); - if (grt) { - if (grt->rt6i_flags & RTF_GATEWAY || - (dev && dev != grt->dst.dev)) { - ip6_rt_put(grt); - grt = NULL; - } - } + err = ip6_nh_lookup_table(net, cfg, gw_addr, + cfg->fc_table, flags, &res); + /* gw_addr can not require a gateway or resolve to a reject + * route. If a device is given, it must match the result. + */ + if (err || res.fib6_flags & RTF_REJECT || + res.nh->fib_nh_gw_family || + (dev && dev != res.nh->fib_nh_dev)) + err = -EHOSTUNREACH; } - if (!grt) - grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, NULL, 1); + if (err < 0) { + struct flowi6 fl6 = { + .flowi6_oif = cfg->fc_ifindex, + .daddr = *gw_addr, + }; - if (!grt) - goto out; + err = fib6_lookup(net, cfg->fc_ifindex, &fl6, &res, flags); + if (err || res.fib6_flags & RTF_REJECT || + res.nh->fib_nh_gw_family) + err = -EHOSTUNREACH; + if (err) + return err; + + fib6_select_path(net, &res, &fl6, cfg->fc_ifindex, + cfg->fc_ifindex != 0, NULL, flags); + } + + err = 0; if (dev) { - if (dev != grt->dst.dev) { - ip6_rt_put(grt); - goto out; - } + if (dev != res.nh->fib_nh_dev) + err = -EHOSTUNREACH; } else { - *_dev = dev = grt->dst.dev; - *idev = grt->rt6i_idev; + *_dev = dev = res.nh->fib_nh_dev; dev_hold(dev); - in6_dev_hold(grt->rt6i_idev); + *idev = in6_dev_get(dev); } - if (!(grt->rt6i_flags & RTF_GATEWAY)) - err = 0; - - ip6_rt_put(grt); - -out: return err; } @@ -2934,11 +3324,15 @@ static int ip6_validate_gw(struct net *net, struct fib6_config *cfg, goto out; } + rcu_read_lock(); + if (cfg->fc_flags & RTNH_F_ONLINK) err = ip6_route_check_nh_onlink(net, cfg, dev, extack); else err = ip6_route_check_nh(net, cfg, _dev, idev); + rcu_read_unlock(); + if (err) goto out; } @@ -3039,7 +3433,7 @@ int fib6_nh_init(struct net *net, struct fib6_nh *fib6_nh, goto out; } } - goto set_dev; + goto pcpu_alloc; } if (cfg->fc_flags & RTF_GATEWAY) { @@ -3075,7 +3469,14 @@ int fib6_nh_init(struct net *net, struct fib6_nh *fib6_nh, cfg->fc_encap_type, cfg, gfp_flags, extack); if (err) goto out; -set_dev: + +pcpu_alloc: + fib6_nh->rt6i_pcpu = alloc_percpu_gfp(struct rt6_info *, gfp_flags); + if (!fib6_nh->rt6i_pcpu) { + err = -ENOMEM; + goto out; + } + fib6_nh->fib_nh_dev = dev; fib6_nh->fib_nh_oif = dev->ifindex; err = 0; @@ -3095,6 +3496,38 @@ out: void fib6_nh_release(struct fib6_nh *fib6_nh) { + struct rt6_exception_bucket *bucket; + + rcu_read_lock(); + + fib6_nh_flush_exceptions(fib6_nh, NULL); + bucket = fib6_nh_get_excptn_bucket(fib6_nh, NULL); + if (bucket) { + rcu_assign_pointer(fib6_nh->rt6i_exception_bucket, NULL); + kfree(bucket); + } + + rcu_read_unlock(); + + if (fib6_nh->rt6i_pcpu) { + int cpu; + + for_each_possible_cpu(cpu) { + struct rt6_info **ppcpu_rt; + struct rt6_info *pcpu_rt; + + ppcpu_rt = per_cpu_ptr(fib6_nh->rt6i_pcpu, cpu); + pcpu_rt = *ppcpu_rt; + if (pcpu_rt) { + dst_dev_put(&pcpu_rt->dst); + dst_release(&pcpu_rt->dst); + *ppcpu_rt = NULL; + } + } + + free_percpu(fib6_nh->rt6i_pcpu); + } + fib_nh_common_release(&fib6_nh->nh_common); } @@ -3104,7 +3537,9 @@ static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg, { struct net *net = cfg->fc_nlinfo.nl_net; struct fib6_info *rt = NULL; + struct nexthop *nh = NULL; struct fib6_table *table; + struct fib6_nh *fib6_nh; int err = -EINVAL; int addr_type; @@ -3140,6 +3575,16 @@ static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg, goto out; } #endif + if (cfg->fc_nh_id) { + nh = nexthop_find_by_id(net, cfg->fc_nh_id); + if (!nh) { + NL_SET_ERR_MSG(extack, "Nexthop id does not exist"); + goto out; + } + err = fib6_check_nexthop(nh, cfg, extack); + if (err) + goto out; + } err = -ENOBUFS; if (cfg->fc_nlinfo.nlh && @@ -3157,7 +3602,7 @@ static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg, goto out; err = -ENOMEM; - rt = fib6_info_alloc(gfp_flags); + rt = fib6_info_alloc(gfp_flags, !nh); if (!rt) goto out; @@ -3197,19 +3642,35 @@ static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg, ipv6_addr_prefix(&rt->fib6_src.addr, &cfg->fc_src, cfg->fc_src_len); rt->fib6_src.plen = cfg->fc_src_len; #endif - err = fib6_nh_init(net, &rt->fib6_nh, cfg, gfp_flags, extack); - if (err) - goto out; + if (nh) { + if (!nexthop_get(nh)) { + NL_SET_ERR_MSG(extack, "Nexthop has been deleted"); + goto out; + } + if (rt->fib6_src.plen) { + NL_SET_ERR_MSG(extack, "Nexthops can not be used with source routing"); + goto out; + } + rt->nh = nh; + fib6_nh = nexthop_fib6_nh(rt->nh); + } else { + err = fib6_nh_init(net, rt->fib6_nh, cfg, gfp_flags, extack); + if (err) + goto out; - /* We cannot add true routes via loopback here, - * they would result in kernel looping; promote them to reject routes - */ - addr_type = ipv6_addr_type(&cfg->fc_dst); - if (fib6_is_reject(cfg->fc_flags, rt->fib6_nh.fib_nh_dev, addr_type)) - rt->fib6_flags = RTF_REJECT | RTF_NONEXTHOP; + fib6_nh = rt->fib6_nh; + + /* We cannot add true routes via loopback here, they would + * result in kernel looping; promote them to reject routes + */ + addr_type = ipv6_addr_type(&cfg->fc_dst); + if (fib6_is_reject(cfg->fc_flags, rt->fib6_nh->fib_nh_dev, + addr_type)) + rt->fib6_flags = RTF_REJECT | RTF_NONEXTHOP; + } if (!ipv6_addr_any(&cfg->fc_prefsrc)) { - struct net_device *dev = fib6_info_nh_dev(rt); + struct net_device *dev = fib6_nh->fib_nh_dev; if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) { NL_SET_ERR_MSG(extack, "Invalid source address"); @@ -3301,6 +3762,12 @@ static int __ip6_del_rt_siblings(struct fib6_info *rt, struct fib6_config *cfg) info->skip_notify = 1; } + info->skip_notify_kernel = 1; + call_fib6_multipath_entry_notifiers(net, + FIB_EVENT_ENTRY_DEL, + rt, + rt->fib6_nsiblings, + NULL); list_for_each_entry_safe(sibling, next_sibling, &rt->fib6_siblings, fib6_siblings) { @@ -3323,7 +3790,7 @@ out_put: return err; } -static int ip6_del_cached_rt(struct rt6_info *rt, struct fib6_config *cfg) +static int __ip6_del_cached_rt(struct rt6_info *rt, struct fib6_config *cfg) { int rc = -ESRCH; @@ -3339,10 +3806,49 @@ out: return rc; } +static int ip6_del_cached_rt(struct fib6_config *cfg, struct fib6_info *rt, + struct fib6_nh *nh) +{ + struct fib6_result res = { + .f6i = rt, + .nh = nh, + }; + struct rt6_info *rt_cache; + + rt_cache = rt6_find_cached_rt(&res, &cfg->fc_dst, &cfg->fc_src); + if (rt_cache) + return __ip6_del_cached_rt(rt_cache, cfg); + + return 0; +} + +struct fib6_nh_del_cached_rt_arg { + struct fib6_config *cfg; + struct fib6_info *f6i; +}; + +static int fib6_nh_del_cached_rt(struct fib6_nh *nh, void *_arg) +{ + struct fib6_nh_del_cached_rt_arg *arg = _arg; + int rc; + + rc = ip6_del_cached_rt(arg->cfg, arg->f6i, nh); + return rc != -ESRCH ? rc : 0; +} + +static int ip6_del_cached_rt_nh(struct fib6_config *cfg, struct fib6_info *f6i) +{ + struct fib6_nh_del_cached_rt_arg arg = { + .cfg = cfg, + .f6i = f6i + }; + + return nexthop_for_each_fib6_nh(f6i->nh, fib6_nh_del_cached_rt, &arg); +} + static int ip6_route_del(struct fib6_config *cfg, struct netlink_ext_ack *extack) { - struct rt6_info *rt_cache; struct fib6_table *table; struct fib6_info *rt; struct fib6_node *fn; @@ -3365,26 +3871,45 @@ static int ip6_route_del(struct fib6_config *cfg, for_each_fib6_node_rt_rcu(fn) { struct fib6_nh *nh; + if (rt->nh && cfg->fc_nh_id && + rt->nh->id != cfg->fc_nh_id) + continue; + if (cfg->fc_flags & RTF_CACHE) { - struct fib6_result res = { - .f6i = rt, - }; - int rc; - - rt_cache = rt6_find_cached_rt(&res, - &cfg->fc_dst, - &cfg->fc_src); - if (rt_cache) { - rc = ip6_del_cached_rt(rt_cache, cfg); - if (rc != -ESRCH) { - rcu_read_unlock(); - return rc; - } + int rc = 0; + + if (rt->nh) { + rc = ip6_del_cached_rt_nh(cfg, rt); + } else if (cfg->fc_nh_id) { + continue; + } else { + nh = rt->fib6_nh; + rc = ip6_del_cached_rt(cfg, rt, nh); + } + if (rc != -ESRCH) { + rcu_read_unlock(); + return rc; } continue; } - nh = &rt->fib6_nh; + if (cfg->fc_metric && cfg->fc_metric != rt->fib6_metric) + continue; + if (cfg->fc_protocol && + cfg->fc_protocol != rt->fib6_protocol) + continue; + + if (rt->nh) { + if (!fib6_info_hold_safe(rt)) + continue; + rcu_read_unlock(); + + return __ip6_del_rt(rt, &cfg->fc_nlinfo); + } + if (cfg->fc_nh_id) + continue; + + nh = rt->fib6_nh; if (cfg->fc_ifindex && (!nh->fib_nh_dev || nh->fib_nh_dev->ifindex != cfg->fc_ifindex)) @@ -3392,10 +3917,6 @@ static int ip6_route_del(struct fib6_config *cfg, if (cfg->fc_flags & RTF_GATEWAY && !ipv6_addr_equal(&cfg->fc_gateway, &nh->fib_nh_gw6)) continue; - if (cfg->fc_metric && cfg->fc_metric != rt->fib6_metric) - continue; - if (cfg->fc_protocol && cfg->fc_protocol != rt->fib6_protocol) - continue; if (!fib6_info_hold_safe(rt)) continue; rcu_read_unlock(); @@ -3506,7 +4027,25 @@ static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_bu if (!res.f6i) goto out; - res.nh = &res.f6i->fib6_nh; + if (res.f6i->nh) { + struct fib6_nh_match_arg arg = { + .dev = dst->dev, + .gw = &rt->rt6i_gateway, + }; + + nexthop_for_each_fib6_nh(res.f6i->nh, + fib6_nh_find_match, &arg); + + /* fib6_info uses a nexthop that does not have fib6_nh + * using the dst->dev. Should be impossible + */ + if (!arg.match) + goto out; + res.nh = arg.match; + } else { + res.nh = res.f6i->fib6_nh; + } + res.fib6_flags = res.f6i->fib6_flags; res.fib6_type = res.f6i->fib6_type; nrt = ip6_rt_cache_alloc(&res, &msg->dest, NULL); @@ -3558,12 +4097,15 @@ static struct fib6_info *rt6_get_route_info(struct net *net, goto out; for_each_fib6_node_rt_rcu(fn) { - if (rt->fib6_nh.fib_nh_dev->ifindex != ifindex) + /* these routes do not use nexthops */ + if (rt->nh) + continue; + if (rt->fib6_nh->fib_nh_dev->ifindex != ifindex) continue; if (!(rt->fib6_flags & RTF_ROUTEINFO) || - !rt->fib6_nh.fib_nh_gw_family) + !rt->fib6_nh->fib_nh_gw_family) continue; - if (!ipv6_addr_equal(&rt->fib6_nh.fib_nh_gw6, gwaddr)) + if (!ipv6_addr_equal(&rt->fib6_nh->fib_nh_gw6, gwaddr)) continue; if (!fib6_info_hold_safe(rt)) continue; @@ -3621,8 +4163,13 @@ struct fib6_info *rt6_get_dflt_router(struct net *net, rcu_read_lock(); for_each_fib6_node_rt_rcu(&table->tb6_root) { - struct fib6_nh *nh = &rt->fib6_nh; + struct fib6_nh *nh; + + /* RA routes do not use nexthops */ + if (rt->nh) + continue; + nh = rt->fib6_nh; if (dev == nh->fib_nh_dev && ((rt->fib6_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) && ipv6_addr_equal(&nh->fib_nh_gw6, addr)) @@ -3873,7 +4420,8 @@ static int fib6_remove_prefsrc(struct fib6_info *rt, void *arg) struct net *net = ((struct arg_dev_net_ip *)arg)->net; struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr; - if (((void *)rt->fib6_nh.fib_nh_dev == dev || !dev) && + if (!rt->nh && + ((void *)rt->fib6_nh->fib_nh_dev == dev || !dev) && rt != net->ipv6.fib6_null_entry && ipv6_addr_equal(addr, &rt->fib6_prefsrc.addr)) { spin_lock_bh(&rt6_exception_lock); @@ -3901,18 +4449,22 @@ void rt6_remove_prefsrc(struct inet6_ifaddr *ifp) static int fib6_clean_tohost(struct fib6_info *rt, void *arg) { struct in6_addr *gateway = (struct in6_addr *)arg; + struct fib6_nh *nh; + /* RA routes do not use nexthops */ + if (rt->nh) + return 0; + + nh = rt->fib6_nh; if (((rt->fib6_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) && - rt->fib6_nh.fib_nh_gw_family && - ipv6_addr_equal(gateway, &rt->fib6_nh.fib_nh_gw6)) { + nh->fib_nh_gw_family && ipv6_addr_equal(gateway, &nh->fib_nh_gw6)) return -1; - } /* Further clean up cached routes in exception table. * This is needed because cached route may have a different * gateway than its 'parent' in the case of an ip redirect. */ - rt6_exceptions_clean_tohost(rt, gateway); + fib6_nh_exceptions_clean_tohost(nh, gateway); return 0; } @@ -3950,11 +4502,12 @@ static struct fib6_info *rt6_multipath_first_sibling(const struct fib6_info *rt) return NULL; } +/* only called for fib entries with builtin fib6_nh */ static bool rt6_is_dead(const struct fib6_info *rt) { - if (rt->fib6_nh.fib_nh_flags & RTNH_F_DEAD || - (rt->fib6_nh.fib_nh_flags & RTNH_F_LINKDOWN && - ip6_ignore_linkdown(rt->fib6_nh.fib_nh_dev))) + if (rt->fib6_nh->fib_nh_flags & RTNH_F_DEAD || + (rt->fib6_nh->fib_nh_flags & RTNH_F_LINKDOWN && + ip6_ignore_linkdown(rt->fib6_nh->fib_nh_dev))) return true; return false; @@ -3966,11 +4519,11 @@ static int rt6_multipath_total_weight(const struct fib6_info *rt) int total = 0; if (!rt6_is_dead(rt)) - total += rt->fib6_nh.fib_nh_weight; + total += rt->fib6_nh->fib_nh_weight; list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) { if (!rt6_is_dead(iter)) - total += iter->fib6_nh.fib_nh_weight; + total += iter->fib6_nh->fib_nh_weight; } return total; @@ -3981,11 +4534,11 @@ static void rt6_upper_bound_set(struct fib6_info *rt, int *weight, int total) int upper_bound = -1; if (!rt6_is_dead(rt)) { - *weight += rt->fib6_nh.fib_nh_weight; + *weight += rt->fib6_nh->fib_nh_weight; upper_bound = DIV_ROUND_CLOSEST_ULL((u64) (*weight) << 31, total) - 1; } - atomic_set(&rt->fib6_nh.fib_nh_upper_bound, upper_bound); + atomic_set(&rt->fib6_nh->fib_nh_upper_bound, upper_bound); } static void rt6_multipath_upper_bound_set(struct fib6_info *rt, int total) @@ -4028,9 +4581,9 @@ static int fib6_ifup(struct fib6_info *rt, void *p_arg) const struct arg_netdev_event *arg = p_arg; struct net *net = dev_net(arg->dev); - if (rt != net->ipv6.fib6_null_entry && - rt->fib6_nh.fib_nh_dev == arg->dev) { - rt->fib6_nh.fib_nh_flags &= ~arg->nh_flags; + if (rt != net->ipv6.fib6_null_entry && !rt->nh && + rt->fib6_nh->fib_nh_dev == arg->dev) { + rt->fib6_nh->fib_nh_flags &= ~arg->nh_flags; fib6_update_sernum_upto_root(net, rt); rt6_multipath_rebalance(rt); } @@ -4053,15 +4606,16 @@ void rt6_sync_up(struct net_device *dev, unsigned char nh_flags) fib6_clean_all(dev_net(dev), fib6_ifup, &arg); } +/* only called for fib entries with inline fib6_nh */ static bool rt6_multipath_uses_dev(const struct fib6_info *rt, const struct net_device *dev) { struct fib6_info *iter; - if (rt->fib6_nh.fib_nh_dev == dev) + if (rt->fib6_nh->fib_nh_dev == dev) return true; list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) - if (iter->fib6_nh.fib_nh_dev == dev) + if (iter->fib6_nh->fib_nh_dev == dev) return true; return false; @@ -4082,12 +4636,12 @@ static unsigned int rt6_multipath_dead_count(const struct fib6_info *rt, struct fib6_info *iter; unsigned int dead = 0; - if (rt->fib6_nh.fib_nh_dev == down_dev || - rt->fib6_nh.fib_nh_flags & RTNH_F_DEAD) + if (rt->fib6_nh->fib_nh_dev == down_dev || + rt->fib6_nh->fib_nh_flags & RTNH_F_DEAD) dead++; list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) - if (iter->fib6_nh.fib_nh_dev == down_dev || - iter->fib6_nh.fib_nh_flags & RTNH_F_DEAD) + if (iter->fib6_nh->fib_nh_dev == down_dev || + iter->fib6_nh->fib_nh_flags & RTNH_F_DEAD) dead++; return dead; @@ -4099,11 +4653,11 @@ static void rt6_multipath_nh_flags_set(struct fib6_info *rt, { struct fib6_info *iter; - if (rt->fib6_nh.fib_nh_dev == dev) - rt->fib6_nh.fib_nh_flags |= nh_flags; + if (rt->fib6_nh->fib_nh_dev == dev) + rt->fib6_nh->fib_nh_flags |= nh_flags; list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) - if (iter->fib6_nh.fib_nh_dev == dev) - iter->fib6_nh.fib_nh_flags |= nh_flags; + if (iter->fib6_nh->fib_nh_dev == dev) + iter->fib6_nh->fib_nh_flags |= nh_flags; } /* called with write lock held for table with rt */ @@ -4113,17 +4667,17 @@ static int fib6_ifdown(struct fib6_info *rt, void *p_arg) const struct net_device *dev = arg->dev; struct net *net = dev_net(dev); - if (rt == net->ipv6.fib6_null_entry) + if (rt == net->ipv6.fib6_null_entry || rt->nh) return 0; switch (arg->event) { case NETDEV_UNREGISTER: - return rt->fib6_nh.fib_nh_dev == dev ? -1 : 0; + return rt->fib6_nh->fib_nh_dev == dev ? -1 : 0; case NETDEV_DOWN: if (rt->should_flush) return -1; if (!rt->fib6_nsiblings) - return rt->fib6_nh.fib_nh_dev == dev ? -1 : 0; + return rt->fib6_nh->fib_nh_dev == dev ? -1 : 0; if (rt6_multipath_uses_dev(rt, dev)) { unsigned int count; @@ -4139,10 +4693,10 @@ static int fib6_ifdown(struct fib6_info *rt, void *p_arg) } return -2; case NETDEV_CHANGE: - if (rt->fib6_nh.fib_nh_dev != dev || + if (rt->fib6_nh->fib_nh_dev != dev || rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) break; - rt->fib6_nh.fib_nh_flags |= RTNH_F_LINKDOWN; + rt->fib6_nh->fib_nh_flags |= RTNH_F_LINKDOWN; rt6_multipath_rebalance(rt); break; } @@ -4176,9 +4730,36 @@ void rt6_disable_ip(struct net_device *dev, unsigned long event) struct rt6_mtu_change_arg { struct net_device *dev; unsigned int mtu; + struct fib6_info *f6i; }; -static int rt6_mtu_change_route(struct fib6_info *rt, void *p_arg) +static int fib6_nh_mtu_change(struct fib6_nh *nh, void *_arg) +{ + struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *)_arg; + struct fib6_info *f6i = arg->f6i; + + /* For administrative MTU increase, there is no way to discover + * IPv6 PMTU increase, so PMTU increase should be updated here. + * Since RFC 1981 doesn't include administrative MTU increase + * update PMTU increase is a MUST. (i.e. jumbo frame) + */ + if (nh->fib_nh_dev == arg->dev) { + struct inet6_dev *idev = __in6_dev_get(arg->dev); + u32 mtu = f6i->fib6_pmtu; + + if (mtu >= arg->mtu || + (mtu < arg->mtu && mtu == idev->cnf.mtu6)) + fib6_metric_set(f6i, RTAX_MTU, arg->mtu); + + spin_lock_bh(&rt6_exception_lock); + rt6_exceptions_update_pmtu(idev, nh, arg->mtu); + spin_unlock_bh(&rt6_exception_lock); + } + + return 0; +} + +static int rt6_mtu_change_route(struct fib6_info *f6i, void *p_arg) { struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg; struct inet6_dev *idev; @@ -4193,24 +4774,17 @@ static int rt6_mtu_change_route(struct fib6_info *rt, void *p_arg) if (!idev) return 0; - /* For administrative MTU increase, there is no way to discover - IPv6 PMTU increase, so PMTU increase should be updated here. - Since RFC 1981 doesn't include administrative MTU increase - update PMTU increase is a MUST. (i.e. jumbo frame) - */ - if (rt->fib6_nh.fib_nh_dev == arg->dev && - !fib6_metric_locked(rt, RTAX_MTU)) { - u32 mtu = rt->fib6_pmtu; - - if (mtu >= arg->mtu || - (mtu < arg->mtu && mtu == idev->cnf.mtu6)) - fib6_metric_set(rt, RTAX_MTU, arg->mtu); + if (fib6_metric_locked(f6i, RTAX_MTU)) + return 0; - spin_lock_bh(&rt6_exception_lock); - rt6_exceptions_update_pmtu(idev, rt, arg->mtu); - spin_unlock_bh(&rt6_exception_lock); + arg->f6i = f6i; + if (f6i->nh) { + /* fib6_nh_mtu_change only returns 0, so this is safe */ + return nexthop_for_each_fib6_nh(f6i->nh, fib6_nh_mtu_change, + arg); } - return 0; + + return fib6_nh_mtu_change(f6i->fib6_nh, arg); } void rt6_mtu_change(struct net_device *dev, unsigned int mtu) @@ -4224,6 +4798,7 @@ void rt6_mtu_change(struct net_device *dev, unsigned int mtu) } static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = { + [RTA_UNSPEC] = { .strict_start_type = RTA_DPORT + 1 }, [RTA_GATEWAY] = { .len = sizeof(struct in6_addr) }, [RTA_PREFSRC] = { .len = sizeof(struct in6_addr) }, [RTA_OIF] = { .type = NLA_U32 }, @@ -4241,6 +4816,7 @@ static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = { [RTA_IP_PROTO] = { .type = NLA_U8 }, [RTA_SPORT] = { .type = NLA_U16 }, [RTA_DPORT] = { .type = NLA_U16 }, + [RTA_NH_ID] = { .type = NLA_U32 }, }; static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh, @@ -4287,6 +4863,16 @@ static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh, cfg->fc_flags |= (rtm->rtm_flags & RTNH_F_ONLINK); + if (tb[RTA_NH_ID]) { + if (tb[RTA_GATEWAY] || tb[RTA_OIF] || + tb[RTA_MULTIPATH] || tb[RTA_ENCAP]) { + NL_SET_ERR_MSG(extack, + "Nexthop specification and nexthop id are mutually exclusive"); + goto errout; + } + cfg->fc_nh_id = nla_get_u32(tb[RTA_NH_ID]); + } + if (tb[RTA_GATEWAY]) { cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]); cfg->fc_flags |= RTF_GATEWAY; @@ -4430,6 +5016,7 @@ static int ip6_route_multipath_add(struct fib6_config *cfg, { struct fib6_info *rt_notif = NULL, *rt_last = NULL; struct nl_info *info = &cfg->fc_nlinfo; + enum fib_event_type event_type; struct fib6_config r_cfg; struct rtnexthop *rtnh; struct fib6_info *rt; @@ -4489,7 +5076,7 @@ static int ip6_route_multipath_add(struct fib6_config *cfg, goto cleanup; } - rt->fib6_nh.fib_nh_weight = rtnh->rtnh_hops + 1; + rt->fib6_nh->fib_nh_weight = rtnh->rtnh_hops + 1; err = ip6_route_info_append(info->nl_net, &rt6_nh_list, rt, &r_cfg); @@ -4501,12 +5088,23 @@ static int ip6_route_multipath_add(struct fib6_config *cfg, rtnh = rtnh_next(rtnh, &remaining); } + if (list_empty(&rt6_nh_list)) { + NL_SET_ERR_MSG(extack, + "Invalid nexthop configuration - no valid nexthops"); + return -EINVAL; + } + /* for add and replace send one notification with all nexthops. * Skip the notification in fib6_add_rt2node and send one with * the full route when done */ info->skip_notify = 1; + /* For add and replace, send one notification with all nexthops. For + * append, send one notification with all appended nexthops. + */ + info->skip_notify_kernel = 1; + err_nh = NULL; list_for_each_entry(nh, &rt6_nh_list, next) { err = __ip6_ins_rt(nh->fib6_info, info, extack); @@ -4543,6 +5141,15 @@ static int ip6_route_multipath_add(struct fib6_config *cfg, nhn++; } + event_type = replace ? FIB_EVENT_ENTRY_REPLACE : FIB_EVENT_ENTRY_ADD; + err = call_fib6_multipath_entry_notifiers(info->nl_net, event_type, + rt_notif, nhn - 1, extack); + if (err) { + /* Delete all the siblings that were just added */ + err_nh = NULL; + goto add_errout; + } + /* success ... tell user about new route */ ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags); goto cleanup; @@ -4621,6 +5228,12 @@ static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh, if (err < 0) return err; + if (cfg.fc_nh_id && + !nexthop_find_by_id(sock_net(skb->sk), cfg.fc_nh_id)) { + NL_SET_ERR_MSG(extack, "Nexthop id does not exist"); + return -EINVAL; + } + if (cfg.fc_mp) return ip6_route_multipath_del(&cfg, extack); else { @@ -4648,17 +5261,46 @@ static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh, return ip6_route_add(&cfg, GFP_KERNEL, extack); } -static size_t rt6_nlmsg_size(struct fib6_info *rt) +/* add the overhead of this fib6_nh to nexthop_len */ +static int rt6_nh_nlmsg_size(struct fib6_nh *nh, void *arg) { - int nexthop_len = 0; + int *nexthop_len = arg; - if (rt->fib6_nsiblings) { - nexthop_len = nla_total_size(0) /* RTA_MULTIPATH */ - + NLA_ALIGN(sizeof(struct rtnexthop)) - + nla_total_size(16) /* RTA_GATEWAY */ - + lwtunnel_get_encap_size(rt->fib6_nh.fib_nh_lws); + *nexthop_len += nla_total_size(0) /* RTA_MULTIPATH */ + + NLA_ALIGN(sizeof(struct rtnexthop)) + + nla_total_size(16); /* RTA_GATEWAY */ - nexthop_len *= rt->fib6_nsiblings; + if (nh->fib_nh_lws) { + /* RTA_ENCAP_TYPE */ + *nexthop_len += lwtunnel_get_encap_size(nh->fib_nh_lws); + /* RTA_ENCAP */ + *nexthop_len += nla_total_size(2); + } + + return 0; +} + +static size_t rt6_nlmsg_size(struct fib6_info *f6i) +{ + int nexthop_len; + + if (f6i->nh) { + nexthop_len = nla_total_size(4); /* RTA_NH_ID */ + nexthop_for_each_fib6_nh(f6i->nh, rt6_nh_nlmsg_size, + &nexthop_len); + } else { + struct fib6_nh *nh = f6i->fib6_nh; + + nexthop_len = 0; + if (f6i->fib6_nsiblings) { + nexthop_len = nla_total_size(0) /* RTA_MULTIPATH */ + + NLA_ALIGN(sizeof(struct rtnexthop)) + + nla_total_size(16) /* RTA_GATEWAY */ + + lwtunnel_get_encap_size(nh->fib_nh_lws); + + nexthop_len *= f6i->fib6_nsiblings; + } + nexthop_len += lwtunnel_get_encap_size(nh->fib_nh_lws); } return NLMSG_ALIGN(sizeof(struct rtmsg)) @@ -4674,10 +5316,38 @@ static size_t rt6_nlmsg_size(struct fib6_info *rt) + nla_total_size(sizeof(struct rta_cacheinfo)) + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */ + nla_total_size(1) /* RTA_PREF */ - + lwtunnel_get_encap_size(rt->fib6_nh.fib_nh_lws) + nexthop_len; } +static int rt6_fill_node_nexthop(struct sk_buff *skb, struct nexthop *nh, + unsigned char *flags) +{ + if (nexthop_is_multipath(nh)) { + struct nlattr *mp; + + mp = nla_nest_start(skb, RTA_MULTIPATH); + if (!mp) + goto nla_put_failure; + + if (nexthop_mpath_fill_node(skb, nh)) + goto nla_put_failure; + + nla_nest_end(skb, mp); + } else { + struct fib6_nh *fib6_nh; + + fib6_nh = nexthop_fib6_nh(nh); + if (fib_nexthop_info(skb, &fib6_nh->nh_common, + flags, false) < 0) + goto nla_put_failure; + } + + return 0; + +nla_put_failure: + return -EMSGSIZE; +} + static int rt6_fill_node(struct net *net, struct sk_buff *skb, struct fib6_info *rt, struct dst_entry *dst, struct in6_addr *dest, struct in6_addr *src, @@ -4687,6 +5357,7 @@ static int rt6_fill_node(struct net *net, struct sk_buff *skb, struct rt6_info *rt6 = (struct rt6_info *)dst; struct rt6key *rt6_dst, *rt6_src; u32 *pmetrics, table, rt6_flags; + unsigned char nh_flags = 0; struct nlmsghdr *nlh; struct rtmsg *rtm; long expires = 0; @@ -4794,22 +5465,31 @@ static int rt6_fill_node(struct net *net, struct sk_buff *skb, if (!mp) goto nla_put_failure; - if (fib_add_nexthop(skb, &rt->fib6_nh.nh_common, - rt->fib6_nh.fib_nh_weight) < 0) + if (fib_add_nexthop(skb, &rt->fib6_nh->nh_common, + rt->fib6_nh->fib_nh_weight) < 0) goto nla_put_failure; list_for_each_entry_safe(sibling, next_sibling, &rt->fib6_siblings, fib6_siblings) { - if (fib_add_nexthop(skb, &sibling->fib6_nh.nh_common, - sibling->fib6_nh.fib_nh_weight) < 0) + if (fib_add_nexthop(skb, &sibling->fib6_nh->nh_common, + sibling->fib6_nh->fib_nh_weight) < 0) goto nla_put_failure; } nla_nest_end(skb, mp); - } else { - unsigned char nh_flags = 0; + } else if (rt->nh) { + if (nla_put_u32(skb, RTA_NH_ID, rt->nh->id)) + goto nla_put_failure; - if (fib_nexthop_info(skb, &rt->fib6_nh.nh_common, + if (nexthop_is_blackhole(rt->nh)) + rtm->rtm_type = RTN_BLACKHOLE; + + if (rt6_fill_node_nexthop(skb, rt->nh, &nh_flags) < 0) + goto nla_put_failure; + + rtm->rtm_flags |= nh_flags; + } else { + if (fib_nexthop_info(skb, &rt->fib6_nh->nh_common, &nh_flags, false) < 0) goto nla_put_failure; @@ -4836,10 +5516,28 @@ nla_put_failure: return -EMSGSIZE; } +static int fib6_info_nh_uses_dev(struct fib6_nh *nh, void *arg) +{ + const struct net_device *dev = arg; + + if (nh->fib_nh_dev == dev) + return 1; + + return 0; +} + static bool fib6_info_uses_dev(const struct fib6_info *f6i, const struct net_device *dev) { - if (f6i->fib6_nh.fib_nh_dev == dev) + if (f6i->nh) { + struct net_device *_dev = (struct net_device *)dev; + + return !!nexthop_for_each_fib6_nh(f6i->nh, + fib6_info_nh_uses_dev, + _dev); + } + + if (f6i->fib6_nh->fib_nh_dev == dev) return true; if (f6i->fib6_nsiblings) { @@ -4847,7 +5545,7 @@ static bool fib6_info_uses_dev(const struct fib6_info *f6i, list_for_each_entry_safe(sibling, next_sibling, &f6i->fib6_siblings, fib6_siblings) { - if (sibling->fib6_nh.fib_nh_dev == dev) + if (sibling->fib6_nh->fib_nh_dev == dev) return true; } } @@ -4855,33 +5553,131 @@ static bool fib6_info_uses_dev(const struct fib6_info *f6i, return false; } -int rt6_dump_route(struct fib6_info *rt, void *p_arg) +struct fib6_nh_exception_dump_walker { + struct rt6_rtnl_dump_arg *dump; + struct fib6_info *rt; + unsigned int flags; + unsigned int skip; + unsigned int count; +}; + +static int rt6_nh_dump_exceptions(struct fib6_nh *nh, void *arg) +{ + struct fib6_nh_exception_dump_walker *w = arg; + struct rt6_rtnl_dump_arg *dump = w->dump; + struct rt6_exception_bucket *bucket; + struct rt6_exception *rt6_ex; + int i, err; + + bucket = fib6_nh_get_excptn_bucket(nh, NULL); + if (!bucket) + return 0; + + for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) { + hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) { + if (w->skip) { + w->skip--; + continue; + } + + /* Expiration of entries doesn't bump sernum, insertion + * does. Removal is triggered by insertion, so we can + * rely on the fact that if entries change between two + * partial dumps, this node is scanned again completely, + * see rt6_insert_exception() and fib6_dump_table(). + * + * Count expired entries we go through as handled + * entries that we'll skip next time, in case of partial + * node dump. Otherwise, if entries expire meanwhile, + * we'll skip the wrong amount. + */ + if (rt6_check_expired(rt6_ex->rt6i)) { + w->count++; + continue; + } + + err = rt6_fill_node(dump->net, dump->skb, w->rt, + &rt6_ex->rt6i->dst, NULL, NULL, 0, + RTM_NEWROUTE, + NETLINK_CB(dump->cb->skb).portid, + dump->cb->nlh->nlmsg_seq, w->flags); + if (err) + return err; + + w->count++; + } + bucket++; + } + + return 0; +} + +/* Return -1 if done with node, number of handled routes on partial dump */ +int rt6_dump_route(struct fib6_info *rt, void *p_arg, unsigned int skip) { struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg; struct fib_dump_filter *filter = &arg->filter; unsigned int flags = NLM_F_MULTI; struct net *net = arg->net; + int count = 0; if (rt == net->ipv6.fib6_null_entry) - return 0; + return -1; if ((filter->flags & RTM_F_PREFIX) && !(rt->fib6_flags & RTF_PREFIX_RT)) { /* success since this is not a prefix route */ - return 1; + return -1; } - if (filter->filter_set) { - if ((filter->rt_type && rt->fib6_type != filter->rt_type) || - (filter->dev && !fib6_info_uses_dev(rt, filter->dev)) || - (filter->protocol && rt->fib6_protocol != filter->protocol)) { - return 1; - } + if (filter->filter_set && + ((filter->rt_type && rt->fib6_type != filter->rt_type) || + (filter->dev && !fib6_info_uses_dev(rt, filter->dev)) || + (filter->protocol && rt->fib6_protocol != filter->protocol))) { + return -1; + } + + if (filter->filter_set || + !filter->dump_routes || !filter->dump_exceptions) { flags |= NLM_F_DUMP_FILTERED; } - return rt6_fill_node(net, arg->skb, rt, NULL, NULL, NULL, 0, - RTM_NEWROUTE, NETLINK_CB(arg->cb->skb).portid, - arg->cb->nlh->nlmsg_seq, flags); + if (filter->dump_routes) { + if (skip) { + skip--; + } else { + if (rt6_fill_node(net, arg->skb, rt, NULL, NULL, NULL, + 0, RTM_NEWROUTE, + NETLINK_CB(arg->cb->skb).portid, + arg->cb->nlh->nlmsg_seq, flags)) { + return 0; + } + count++; + } + } + + if (filter->dump_exceptions) { + struct fib6_nh_exception_dump_walker w = { .dump = arg, + .rt = rt, + .flags = flags, + .skip = skip, + .count = 0 }; + int err; + + rcu_read_lock(); + if (rt->nh) { + err = nexthop_for_each_fib6_nh(rt->nh, + rt6_nh_dump_exceptions, + &w); + } else { + err = rt6_nh_dump_exceptions(rt->fib6_nh, &w); + } + rcu_read_unlock(); + + if (err) + return count += w.count; + } + + return -1; } static int inet6_rtm_valid_getroute_req(struct sk_buff *skb, @@ -5126,6 +5922,38 @@ errout: rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err); } +void fib6_rt_update(struct net *net, struct fib6_info *rt, + struct nl_info *info) +{ + u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0; + struct sk_buff *skb; + int err = -ENOBUFS; + + /* call_fib6_entry_notifiers will be removed when in-kernel notifier + * is implemented and supported for nexthop objects + */ + call_fib6_entry_notifiers(net, FIB_EVENT_ENTRY_REPLACE, rt, NULL); + + skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any()); + if (!skb) + goto errout; + + err = rt6_fill_node(net, skb, rt, NULL, NULL, NULL, 0, + RTM_NEWROUTE, info->portid, seq, NLM_F_REPLACE); + if (err < 0) { + /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */ + WARN_ON(err == -EMSGSIZE); + kfree_skb(skb); + goto errout; + } + rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE, + info->nlh, gfp_any()); + return; +errout: + if (err < 0) + rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err); +} + static int ip6_route_dev_notify(struct notifier_block *this, unsigned long event, void *ptr) { @@ -5136,7 +5964,7 @@ static int ip6_route_dev_notify(struct notifier_block *this, return NOTIFY_OK; if (event == NETDEV_REGISTER) { - net->ipv6.fib6_null_entry->fib6_nh.fib_nh_dev = dev; + net->ipv6.fib6_null_entry->fib6_nh->fib_nh_dev = dev; net->ipv6.ip6_null_entry->dst.dev = dev; net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev); #ifdef CONFIG_IPV6_MULTIPLE_TABLES @@ -5330,11 +6158,11 @@ static int __net_init ip6_route_net_init(struct net *net) if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0) goto out_ip6_dst_ops; - net->ipv6.fib6_null_entry = kmemdup(&fib6_null_entry_template, - sizeof(*net->ipv6.fib6_null_entry), - GFP_KERNEL); + net->ipv6.fib6_null_entry = fib6_info_alloc(GFP_KERNEL, true); if (!net->ipv6.fib6_null_entry) goto out_ip6_dst_entries; + memcpy(net->ipv6.fib6_null_entry, &fib6_null_entry_template, + sizeof(*net->ipv6.fib6_null_entry)); net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template, sizeof(*net->ipv6.ip6_null_entry), @@ -5344,6 +6172,7 @@ static int __net_init ip6_route_net_init(struct net *net) net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops; dst_init_metrics(&net->ipv6.ip6_null_entry->dst, ip6_template_metrics, true); + INIT_LIST_HEAD(&net->ipv6.ip6_null_entry->rt6i_uncached); #ifdef CONFIG_IPV6_MULTIPLE_TABLES net->ipv6.fib6_has_custom_rules = false; @@ -5355,6 +6184,7 @@ static int __net_init ip6_route_net_init(struct net *net) net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops; dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst, ip6_template_metrics, true); + INIT_LIST_HEAD(&net->ipv6.ip6_prohibit_entry->rt6i_uncached); net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template, sizeof(*net->ipv6.ip6_blk_hole_entry), @@ -5364,6 +6194,7 @@ static int __net_init ip6_route_net_init(struct net *net) net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops; dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst, ip6_template_metrics, true); + INIT_LIST_HEAD(&net->ipv6.ip6_blk_hole_entry->rt6i_uncached); #endif net->ipv6.sysctl.flush_delay = 0; @@ -5471,7 +6302,7 @@ void __init ip6_route_init_special_entries(void) /* Registering of the loopback is done before this portion of code, * the loopback reference in rt6_info will not be taken, do it * manually for init_net */ - init_net.ipv6.fib6_null_entry->fib6_nh.fib_nh_dev = init_net.loopback_dev; + init_net.ipv6.fib6_null_entry->fib6_nh->fib_nh_dev = init_net.loopback_dev; init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev; init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev); #ifdef CONFIG_IPV6_MULTIPLE_TABLES diff --git a/net/ipv6/sysctl_net_ipv6.c b/net/ipv6/sysctl_net_ipv6.c index e15cd37024fd..dc4c91e0bfb8 100644 --- a/net/ipv6/sysctl_net_ipv6.c +++ b/net/ipv6/sysctl_net_ipv6.c @@ -23,6 +23,7 @@ static int zero; static int one = 1; +static int flowlabel_reflect_max = 0x7; static int auto_flowlabels_min; static int auto_flowlabels_max = IP6_AUTO_FLOW_LABEL_MAX; @@ -113,7 +114,9 @@ static struct ctl_table ipv6_table_template[] = { .data = &init_net.ipv6.sysctl.flowlabel_reflect, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &flowlabel_reflect_max, }, { .procname = "max_dst_opts_number", diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 7a14ea37d2df..d56a9019a0fe 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -171,7 +171,7 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, if (fl6.flowlabel&IPV6_FLOWLABEL_MASK) { struct ip6_flowlabel *flowlabel; flowlabel = fl6_sock_lookup(sk, fl6.flowlabel); - if (!flowlabel) + if (IS_ERR(flowlabel)) return -EINVAL; fl6_sock_release(flowlabel); } @@ -883,9 +883,17 @@ static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32 fl6.flowi6_oif = oif; } - if (sk) - mark = (sk->sk_state == TCP_TIME_WAIT) ? - inet_twsk(sk)->tw_mark : sk->sk_mark; + if (sk) { + if (sk->sk_state == TCP_TIME_WAIT) { + mark = inet_twsk(sk)->tw_mark; + /* autoflowlabel relies on buff->hash */ + skb_set_hash(buff, inet_twsk(sk)->tw_txhash, + PKT_HASH_TYPE_L4); + } else { + mark = sk->sk_mark; + } + buff->tstamp = tcp_transmit_time(sk); + } fl6.flowi6_mark = IP6_REPLY_MARK(net, skb->mark) ?: mark; fl6.fl6_dport = t1->dest; fl6.fl6_sport = t1->source; @@ -912,15 +920,17 @@ static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32 static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb) { const struct tcphdr *th = tcp_hdr(skb); + struct ipv6hdr *ipv6h = ipv6_hdr(skb); u32 seq = 0, ack_seq = 0; struct tcp_md5sig_key *key = NULL; #ifdef CONFIG_TCP_MD5SIG const __u8 *hash_location = NULL; - struct ipv6hdr *ipv6h = ipv6_hdr(skb); unsigned char newhash[16]; int genhash; struct sock *sk1 = NULL; #endif + __be32 label = 0; + struct net *net; int oif = 0; if (th->rst) @@ -932,6 +942,7 @@ static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb) if (!sk && !ipv6_unicast_destination(skb)) return; + net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev); #ifdef CONFIG_TCP_MD5SIG rcu_read_lock(); hash_location = tcp_parse_md5sig_option(th); @@ -945,7 +956,7 @@ static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb) * Incoming packet is checked with md5 hash with finding key, * no RST generated if md5 hash doesn't match. */ - sk1 = inet6_lookup_listener(dev_net(skb_dst(skb)->dev), + sk1 = inet6_lookup_listener(net, &tcp_hashinfo, NULL, 0, &ipv6h->saddr, th->source, &ipv6h->daddr, @@ -975,9 +986,15 @@ static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb) oif = sk->sk_bound_dev_if; if (sk_fullsock(sk)) trace_tcp_send_reset(sk, skb); + if (sk->sk_state == TCP_TIME_WAIT) + label = cpu_to_be32(inet_twsk(sk)->tw_flowlabel); + } else { + if (net->ipv6.sysctl.flowlabel_reflect & FLOWLABEL_REFLECT_TCP_RESET) + label = ip6_flowlabel(ipv6h); } - tcp_v6_send_response(sk, skb, seq, ack_seq, 0, 0, 0, oif, key, 1, 0, 0); + tcp_v6_send_response(sk, skb, seq, ack_seq, 0, 0, 0, oif, key, 1, 0, + label); #ifdef CONFIG_TCP_MD5SIG out: diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 70b01bd95022..827fe7385078 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -54,16 +54,6 @@ #include <trace/events/skb.h> #include "udp_impl.h" -static bool udp6_lib_exact_dif_match(struct net *net, struct sk_buff *skb) -{ -#if defined(CONFIG_NET_L3_MASTER_DEV) - if (!net->ipv4.sysctl_udp_l3mdev_accept && - skb && ipv6_l3mdev_skb(IP6CB(skb)->flags)) - return true; -#endif - return false; -} - static u32 udp6_ehashfn(const struct net *net, const struct in6_addr *laddr, const u16 lport, @@ -111,7 +101,7 @@ void udp_v6_rehash(struct sock *sk) static int compute_score(struct sock *sk, struct net *net, const struct in6_addr *saddr, __be16 sport, const struct in6_addr *daddr, unsigned short hnum, - int dif, int sdif, bool exact_dif) + int dif, int sdif) { int score; struct inet_sock *inet; @@ -155,8 +145,8 @@ static int compute_score(struct sock *sk, struct net *net, static struct sock *udp6_lib_lookup2(struct net *net, const struct in6_addr *saddr, __be16 sport, const struct in6_addr *daddr, unsigned int hnum, - int dif, int sdif, bool exact_dif, - struct udp_hslot *hslot2, struct sk_buff *skb) + int dif, int sdif, struct udp_hslot *hslot2, + struct sk_buff *skb) { struct sock *sk, *result; int score, badness; @@ -166,7 +156,7 @@ static struct sock *udp6_lib_lookup2(struct net *net, badness = -1; udp_portaddr_for_each_entry_rcu(sk, &hslot2->head) { score = compute_score(sk, net, saddr, sport, - daddr, hnum, dif, sdif, exact_dif); + daddr, hnum, dif, sdif); if (score > badness) { if (sk->sk_reuseport) { hash = udp6_ehashfn(net, daddr, hnum, @@ -195,14 +185,13 @@ struct sock *__udp6_lib_lookup(struct net *net, unsigned int hash2, slot2; struct udp_hslot *hslot2; struct sock *result; - bool exact_dif = udp6_lib_exact_dif_match(net, skb); hash2 = ipv6_portaddr_hash(net, daddr, hnum); slot2 = hash2 & udptable->mask; hslot2 = &udptable->hash2[slot2]; result = udp6_lib_lookup2(net, saddr, sport, - daddr, hnum, dif, sdif, exact_dif, + daddr, hnum, dif, sdif, hslot2, skb); if (!result) { hash2 = ipv6_portaddr_hash(net, &in6addr_any, hnum); @@ -212,10 +201,9 @@ struct sock *__udp6_lib_lookup(struct net *net, result = udp6_lib_lookup2(net, saddr, sport, &in6addr_any, hnum, dif, sdif, - exact_dif, hslot2, - skb); + hslot2, skb); } - if (unlikely(IS_ERR(result))) + if (IS_ERR(result)) return NULL; return result; } @@ -838,8 +826,7 @@ static int udp6_unicast_rcv_skb(struct sock *sk, struct sk_buff *skb, int ret; if (inet_get_convert_csum(sk) && uh->check && !IS_UDPLITE(sk)) - skb_checksum_try_convert(skb, IPPROTO_UDP, uh->check, - ip6_compute_pseudo); + skb_checksum_try_convert(skb, IPPROTO_UDP, ip6_compute_pseudo); ret = udpv6_queue_rcv_skb(sk, skb); @@ -1332,7 +1319,7 @@ do_udp_sendmsg: fl6.flowlabel = sin6->sin6_flowinfo&IPV6_FLOWINFO_MASK; if (fl6.flowlabel&IPV6_FLOWLABEL_MASK) { flowlabel = fl6_sock_lookup(sk, fl6.flowlabel); - if (!flowlabel) + if (IS_ERR(flowlabel)) return -EINVAL; } } @@ -1384,7 +1371,7 @@ do_udp_sendmsg: } if ((fl6.flowlabel&IPV6_FLOWLABEL_MASK) && !flowlabel) { flowlabel = fl6_sock_lookup(sk, fl6.flowlabel); - if (!flowlabel) + if (IS_ERR(flowlabel)) return -EINVAL; } if (!(opt->opt_nflen|opt->opt_flen)) diff --git a/net/ipv6/xfrm6_state.c b/net/ipv6/xfrm6_state.c index 5bdca3d5d6b7..78daadecbdef 100644 --- a/net/ipv6/xfrm6_state.c +++ b/net/ipv6/xfrm6_state.c @@ -21,137 +21,6 @@ #include <net/ipv6.h> #include <net/addrconf.h> -static void -__xfrm6_init_tempsel(struct xfrm_selector *sel, const struct flowi *fl) -{ - const struct flowi6 *fl6 = &fl->u.ip6; - - /* Initialize temporary selector matching only - * to current session. */ - *(struct in6_addr *)&sel->daddr = fl6->daddr; - *(struct in6_addr *)&sel->saddr = fl6->saddr; - sel->dport = xfrm_flowi_dport(fl, &fl6->uli); - sel->dport_mask = htons(0xffff); - sel->sport = xfrm_flowi_sport(fl, &fl6->uli); - sel->sport_mask = htons(0xffff); - sel->family = AF_INET6; - sel->prefixlen_d = 128; - sel->prefixlen_s = 128; - sel->proto = fl6->flowi6_proto; - sel->ifindex = fl6->flowi6_oif; -} - -static void -xfrm6_init_temprop(struct xfrm_state *x, const struct xfrm_tmpl *tmpl, - const xfrm_address_t *daddr, const xfrm_address_t *saddr) -{ - x->id = tmpl->id; - if (ipv6_addr_any((struct in6_addr *)&x->id.daddr)) - memcpy(&x->id.daddr, daddr, sizeof(x->sel.daddr)); - memcpy(&x->props.saddr, &tmpl->saddr, sizeof(x->props.saddr)); - if (ipv6_addr_any((struct in6_addr *)&x->props.saddr)) - memcpy(&x->props.saddr, saddr, sizeof(x->props.saddr)); - x->props.mode = tmpl->mode; - x->props.reqid = tmpl->reqid; - x->props.family = AF_INET6; -} - -/* distribution counting sort function for xfrm_state and xfrm_tmpl */ -static int -__xfrm6_sort(void **dst, void **src, int n, int (*cmp)(void *p), int maxclass) -{ - int count[XFRM_MAX_DEPTH] = { }; - int class[XFRM_MAX_DEPTH]; - int i; - - for (i = 0; i < n; i++) { - int c; - class[i] = c = cmp(src[i]); - count[c]++; - } - - for (i = 2; i < maxclass; i++) - count[i] += count[i - 1]; - - for (i = 0; i < n; i++) { - dst[count[class[i] - 1]++] = src[i]; - src[i] = NULL; - } - - return 0; -} - -/* - * Rule for xfrm_state: - * - * rule 1: select IPsec transport except AH - * rule 2: select MIPv6 RO or inbound trigger - * rule 3: select IPsec transport AH - * rule 4: select IPsec tunnel - * rule 5: others - */ -static int __xfrm6_state_sort_cmp(void *p) -{ - struct xfrm_state *v = p; - - switch (v->props.mode) { - case XFRM_MODE_TRANSPORT: - if (v->id.proto != IPPROTO_AH) - return 1; - else - return 3; -#if IS_ENABLED(CONFIG_IPV6_MIP6) - case XFRM_MODE_ROUTEOPTIMIZATION: - case XFRM_MODE_IN_TRIGGER: - return 2; -#endif - case XFRM_MODE_TUNNEL: - case XFRM_MODE_BEET: - return 4; - } - return 5; -} - -static int -__xfrm6_state_sort(struct xfrm_state **dst, struct xfrm_state **src, int n) -{ - return __xfrm6_sort((void **)dst, (void **)src, n, - __xfrm6_state_sort_cmp, 6); -} - -/* - * Rule for xfrm_tmpl: - * - * rule 1: select IPsec transport - * rule 2: select MIPv6 RO or inbound trigger - * rule 3: select IPsec tunnel - * rule 4: others - */ -static int __xfrm6_tmpl_sort_cmp(void *p) -{ - struct xfrm_tmpl *v = p; - switch (v->mode) { - case XFRM_MODE_TRANSPORT: - return 1; -#if IS_ENABLED(CONFIG_IPV6_MIP6) - case XFRM_MODE_ROUTEOPTIMIZATION: - case XFRM_MODE_IN_TRIGGER: - return 2; -#endif - case XFRM_MODE_TUNNEL: - case XFRM_MODE_BEET: - return 3; - } - return 4; -} - -static int -__xfrm6_tmpl_sort(struct xfrm_tmpl **dst, struct xfrm_tmpl **src, int n) -{ - return __xfrm6_sort((void **)dst, (void **)src, n, - __xfrm6_tmpl_sort_cmp, 5); -} - int xfrm6_extract_header(struct sk_buff *skb) { struct ipv6hdr *iph = ipv6_hdr(skb); @@ -171,12 +40,6 @@ int xfrm6_extract_header(struct sk_buff *skb) static struct xfrm_state_afinfo xfrm6_state_afinfo = { .family = AF_INET6, .proto = IPPROTO_IPV6, - .eth_proto = htons(ETH_P_IPV6), - .owner = THIS_MODULE, - .init_tempsel = __xfrm6_init_tempsel, - .init_temprop = xfrm6_init_temprop, - .tmpl_sort = __xfrm6_tmpl_sort, - .state_sort = __xfrm6_state_sort, .output = xfrm6_output, .output_finish = xfrm6_output_finish, .extract_input = xfrm6_extract_input, |